Update functions.py
Browse files- functions.py +18 -17
functions.py
CHANGED
|
@@ -97,7 +97,7 @@ prompt = ChatPromptTemplate.from_messages(messages)
|
|
| 97 |
|
| 98 |
###################### Functions #######################################################################################
|
| 99 |
|
| 100 |
-
@st.
|
| 101 |
def load_models():
|
| 102 |
|
| 103 |
'''Load and cache all the models to be used'''
|
|
@@ -116,20 +116,20 @@ def load_models():
|
|
| 116 |
|
| 117 |
return sent_pipe, sum_pipe, ner_pipe, cross_encoder, kg_model, kg_tokenizer, emb_tokenizer, sbert
|
| 118 |
|
| 119 |
-
@st.
|
| 120 |
def load_asr_model(asr_model_name):
|
| 121 |
asr_model = whisper.load_model(asr_model_name)
|
| 122 |
|
| 123 |
return asr_model
|
| 124 |
|
| 125 |
-
@st.
|
| 126 |
def load_whisper_api(audio):
|
| 127 |
file = open(audio, "rb")
|
| 128 |
transcript = openai.Audio.translate("whisper-1", file)
|
| 129 |
|
| 130 |
return transcript
|
| 131 |
|
| 132 |
-
@st.
|
| 133 |
def process_corpus(corpus, title, embedding_model, chunk_size=1000, overlap=50):
|
| 134 |
|
| 135 |
'''Process text for Semantic Search'''
|
|
@@ -144,7 +144,7 @@ def process_corpus(corpus, title, embedding_model, chunk_size=1000, overlap=50):
|
|
| 144 |
|
| 145 |
return docsearch
|
| 146 |
|
| 147 |
-
@st.
|
| 148 |
def chunk_and_preprocess_text(text,thresh=500):
|
| 149 |
|
| 150 |
"""Chunk text longer than n tokens for summarization"""
|
|
@@ -170,7 +170,7 @@ def chunk_and_preprocess_text(text,thresh=500):
|
|
| 170 |
|
| 171 |
return chunks
|
| 172 |
|
| 173 |
-
@st.
|
| 174 |
def gen_embeddings(embedding_model):
|
| 175 |
|
| 176 |
'''Generate embeddings for given model'''
|
|
@@ -187,7 +187,7 @@ def gen_embeddings(embedding_model):
|
|
| 187 |
|
| 188 |
return embeddings
|
| 189 |
|
| 190 |
-
@st.
|
| 191 |
def embed_text(query,title,embedding_model,_docsearch):
|
| 192 |
|
| 193 |
'''Embed text and generate semantic search scores'''
|
|
@@ -212,12 +212,12 @@ def embed_text(query,title,embedding_model,_docsearch):
|
|
| 212 |
|
| 213 |
return answer
|
| 214 |
|
| 215 |
-
@st.
|
| 216 |
def get_spacy():
|
| 217 |
nlp = en_core_web_lg.load()
|
| 218 |
return nlp
|
| 219 |
|
| 220 |
-
@st.
|
| 221 |
def inference(link, upload, _asr_model):
|
| 222 |
'''Convert Youtube video or Audio upload to text'''
|
| 223 |
|
|
@@ -307,7 +307,7 @@ def inference(link, upload, _asr_model):
|
|
| 307 |
return results['text'], yt.title
|
| 308 |
|
| 309 |
|
| 310 |
-
@st.
|
| 311 |
def sentiment_pipe(earnings_text):
|
| 312 |
'''Determine the sentiment of the text'''
|
| 313 |
|
|
@@ -316,7 +316,7 @@ def sentiment_pipe(earnings_text):
|
|
| 316 |
|
| 317 |
return earnings_sentiment, earnings_sentences
|
| 318 |
|
| 319 |
-
@st.
|
| 320 |
def summarize_text(text_to_summarize,max_len,min_len):
|
| 321 |
'''Summarize text with HF model'''
|
| 322 |
|
|
@@ -329,7 +329,7 @@ def summarize_text(text_to_summarize,max_len,min_len):
|
|
| 329 |
|
| 330 |
return summarized_text
|
| 331 |
|
| 332 |
-
@st.
|
| 333 |
def clean_text(text):
|
| 334 |
'''Clean all text'''
|
| 335 |
|
|
@@ -341,7 +341,7 @@ def clean_text(text):
|
|
| 341 |
|
| 342 |
return text
|
| 343 |
|
| 344 |
-
@st.
|
| 345 |
def chunk_long_text(text,threshold,window_size=3,stride=2):
|
| 346 |
'''Preprocess text and chunk for sentiment analysis'''
|
| 347 |
|
|
@@ -378,7 +378,7 @@ def summary_downloader(raw_text):
|
|
| 378 |
href = f'<a href="data:file/txt;base64,{b64}" download="{new_filename}">Click to Download!!</a>'
|
| 379 |
st.markdown(href,unsafe_allow_html=True)
|
| 380 |
|
| 381 |
-
@st.
|
| 382 |
def get_all_entities_per_sentence(text):
|
| 383 |
doc = nlp(''.join(text))
|
| 384 |
|
|
@@ -401,12 +401,12 @@ def get_all_entities_per_sentence(text):
|
|
| 401 |
|
| 402 |
return entities_all_sentences
|
| 403 |
|
| 404 |
-
@st.
|
| 405 |
def get_all_entities(text):
|
| 406 |
all_entities_per_sentence = get_all_entities_per_sentence(text)
|
| 407 |
return list(itertools.chain.from_iterable(all_entities_per_sentence))
|
| 408 |
|
| 409 |
-
@st.
|
| 410 |
def get_and_compare_entities(article_content,summary_output):
|
| 411 |
|
| 412 |
all_entities_per_sentence = get_all_entities_per_sentence(article_content)
|
|
@@ -454,7 +454,7 @@ def get_and_compare_entities(article_content,summary_output):
|
|
| 454 |
|
| 455 |
return matched_entities, unmatched_entities
|
| 456 |
|
| 457 |
-
@st.
|
| 458 |
def highlight_entities(article_content,summary_output):
|
| 459 |
|
| 460 |
markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
|
|
@@ -506,6 +506,7 @@ def fin_ext(text):
|
|
| 506 |
|
| 507 |
## Knowledge Graphs code
|
| 508 |
|
|
|
|
| 509 |
def extract_relations_from_model_output(text):
|
| 510 |
relations = []
|
| 511 |
relation, subject, relation, object_ = '', '', '', ''
|
|
|
|
| 97 |
|
| 98 |
###################### Functions #######################################################################################
|
| 99 |
|
| 100 |
+
@st.cache_resource
|
| 101 |
def load_models():
|
| 102 |
|
| 103 |
'''Load and cache all the models to be used'''
|
|
|
|
| 116 |
|
| 117 |
return sent_pipe, sum_pipe, ner_pipe, cross_encoder, kg_model, kg_tokenizer, emb_tokenizer, sbert
|
| 118 |
|
| 119 |
+
@st.cache_resource
def load_asr_model(asr_model_name):
    '''Load the Whisper model named by asr_model_name and return it.

    The st.cache_resource decorator wraps this loader; the returned object is
    whatever whisper.load_model produces for the given name.
    '''
    return whisper.load_model(asr_model_name)
|
| 124 |
|
| 125 |
+
@st.cache_data
def load_whisper_api(audio):
    '''Send an audio file to the OpenAI "whisper-1" model and return the response.

    audio: path of the audio file; it is opened in binary mode.

    Returns whatever openai.Audio.translate returns for the file.
    '''
    # Use a context manager so the file handle is closed even if the API call
    # raises; the original left the handle open.
    with open(audio, "rb") as audio_file:
        transcript = openai.Audio.translate("whisper-1", audio_file)

    return transcript
|
| 131 |
|
| 132 |
+
@st.cache_data
|
| 133 |
def process_corpus(corpus, title, embedding_model, chunk_size=1000, overlap=50):
|
| 134 |
|
| 135 |
'''Process text for Semantic Search'''
|
|
|
|
| 144 |
|
| 145 |
return docsearch
|
| 146 |
|
| 147 |
+
@st.cache_data
|
| 148 |
def chunk_and_preprocess_text(text,thresh=500):
|
| 149 |
|
| 150 |
"""Chunk text longer than n tokens for summarization"""
|
|
|
|
| 170 |
|
| 171 |
return chunks
|
| 172 |
|
| 173 |
+
@st.cache_resource
|
| 174 |
def gen_embeddings(embedding_model):
|
| 175 |
|
| 176 |
'''Generate embeddings for given model'''
|
|
|
|
| 187 |
|
| 188 |
return embeddings
|
| 189 |
|
| 190 |
+
@st.cache_data
|
| 191 |
def embed_text(query,title,embedding_model,_docsearch):
|
| 192 |
|
| 193 |
'''Embed text and generate semantic search scores'''
|
|
|
|
| 212 |
|
| 213 |
return answer
|
| 214 |
|
| 215 |
+
@st.cache_resource
def get_spacy():
    '''Load the en_core_web_lg model and return the loaded object.'''
    return en_core_web_lg.load()
|
| 219 |
|
| 220 |
+
@st.cache_data
|
| 221 |
def inference(link, upload, _asr_model):
|
| 222 |
'''Convert Youtube video or Audio upload to text'''
|
| 223 |
|
|
|
|
| 307 |
return results['text'], yt.title
|
| 308 |
|
| 309 |
|
| 310 |
+
@st.cache_data
|
| 311 |
def sentiment_pipe(earnings_text):
|
| 312 |
'''Determine the sentiment of the text'''
|
| 313 |
|
|
|
|
| 316 |
|
| 317 |
return earnings_sentiment, earnings_sentences
|
| 318 |
|
| 319 |
+
@st.cache_data
|
| 320 |
def summarize_text(text_to_summarize,max_len,min_len):
|
| 321 |
'''Summarize text with HF model'''
|
| 322 |
|
|
|
|
| 329 |
|
| 330 |
return summarized_text
|
| 331 |
|
| 332 |
+
@st.cache_data
|
| 333 |
def clean_text(text):
|
| 334 |
'''Clean all text'''
|
| 335 |
|
|
|
|
| 341 |
|
| 342 |
return text
|
| 343 |
|
| 344 |
+
@st.cache_data
|
| 345 |
def chunk_long_text(text,threshold,window_size=3,stride=2):
|
| 346 |
'''Preprocess text and chunk for sentiment analysis'''
|
| 347 |
|
|
|
|
| 378 |
href = f'<a href="data:file/txt;base64,{b64}" download="{new_filename}">Click to Download!!</a>'
|
| 379 |
st.markdown(href,unsafe_allow_html=True)
|
| 380 |
|
| 381 |
+
@st.cache_data
|
| 382 |
def get_all_entities_per_sentence(text):
|
| 383 |
doc = nlp(''.join(text))
|
| 384 |
|
|
|
|
| 401 |
|
| 402 |
return entities_all_sentences
|
| 403 |
|
| 404 |
+
@st.cache_data
def get_all_entities(text):
    '''Return one flat list of the entities found in text.

    Calls get_all_entities_per_sentence(text) and flattens its result by one
    level, preserving order.
    '''
    per_sentence = get_all_entities_per_sentence(text)
    return [entity for sentence_entities in per_sentence for entity in sentence_entities]
|
| 408 |
|
| 409 |
+
@st.cache_data
|
| 410 |
def get_and_compare_entities(article_content,summary_output):
|
| 411 |
|
| 412 |
all_entities_per_sentence = get_all_entities_per_sentence(article_content)
|
|
|
|
| 454 |
|
| 455 |
return matched_entities, unmatched_entities
|
| 456 |
|
| 457 |
+
@st.cache_data
|
| 458 |
def highlight_entities(article_content,summary_output):
|
| 459 |
|
| 460 |
markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
|
|
|
|
| 506 |
|
| 507 |
## Knowledge Graphs code
|
| 508 |
|
| 509 |
+
@st.cache_data
|
| 510 |
def extract_relations_from_model_output(text):
|
| 511 |
relations = []
|
| 512 |
relation, subject, relation, object_ = '', '', '', ''
|