import pandas as pd
import datetime
import gradio as gr
import os
# Shows the demo data format in the fine-tuning tab (this appears to describe display_table below).
def move_to(move, model_ans):
    """Return the ID, question and answer labels for one row of a model-answer sheet.

    Args:
        move: Row number to show. It is converted with ``int()`` and used as
            a label on the sheet's default index, so rows are numbered from 0.
        model_ans: File name of the workbook inside the ``model_ans`` folder.

    Returns:
        A list of three ``gr.Label`` components: ID, question and answer.

    Raises:
        KeyError: If the workbook has no rows or lacks an ``id``,
            ``question`` or ``answer`` column.
    """
    df_temp = pd.read_excel(os.path.join("model_ans", str(model_ans)))
    total = len(df_temp)
    position = int(move)

    # Validate *before* indexing: an out-of-range row is reported to the user
    # and the view falls back to the first row instead of raising KeyError.
    if not 0 <= position < total:
        gr.Info(f"Number of questions: {total}")
        position = 0

    # Fetch the row once rather than once per column.
    row = df_temp.loc[position]
    return [
        gr.Label(value=str(int(row["id"])), label="ID"),
        gr.Label(value=row["question"], label="Question"),
        gr.Label(value=row["answer"], label="Answer"),
    ]
def display_table(path=r"data/demo_table_data.xlsx"):
    """Render the first two rows of an Excel sheet as an HTML table.

    Args:
        path: Location of the workbook to preview.

    Returns:
        str: The HTML produced by ``DataFrame.to_html(index=False)`` for the
        first two rows, with a newline before and after it.
    """
    preview = pd.read_excel(path).head(2)
    html_table = preview.to_html(index=False)
    # The original literal was a single-quoted f-string broken across three
    # physical lines, which does not parse; this is the same text on one line.
    # NOTE(review): any wrapper markup that once surrounded the table cannot
    # be recovered from this file - confirm against the UI.
    return f"\n{html_table}\n"
def current_time():
    """Return the current local date and time as ``YYYY_MM_DD_HH_MM_SS``."""
    stamp_format = "%Y_%m_%d_%H_%M_%S"
    now = datetime.datetime.now()
    return now.strftime(stamp_format)
# This function is used in human evaluation.
def random_ques_ans2():
    """Pick a random question from ``data/existing_dataset.xlsx``.

    Returns:
        tuple: ``(question, "")`` - the ``question`` cell of a randomly chosen
        row, paired with an empty answer string.

    Raises:
        ValueError: If the dataset contains no rows.
    """
    import random

    import pandas as pd

    df = pd.read_excel(r"data/existing_dataset.xlsx")
    if df.empty:
        raise ValueError("data/existing_dataset.xlsx contains no questions")

    # randrange() excludes its upper bound. randint(0, len(df)) could return
    # len(df), which is one past the last row and raised KeyError.
    row_label = random.randrange(len(df))
    question = df.loc[row_label]["question"]
    return question, ""
def score_report_bar():
    """Build a table of average ratings, one row per workbook in ``score_report``.

    Each ``.xlsx``/``.xls`` file in the folder is read and the mean of its
    ``rating`` column is reported next to a display name derived from the
    file name. Other files are ignored, and workbooks without rows are
    skipped because no average can be computed for them.

    The display name keeps only the lowercase ASCII letters of the file stem,
    drops the phrases ``model ans`` and ``finetuned``, and collapses the
    remaining whitespace to single spaces with no leading or trailing space.

    Returns:
        pandas.DataFrame: Columns ``Model Name`` and ``Average Rating``. The
        frame is empty, but still has both columns, when nothing qualifies.
    """
    import os

    path = "score_report"
    rows = []
    # Sorted so the resulting table has a reproducible row order.
    for file_name in sorted(os.listdir(path)):
        stem, extension = os.path.splitext(file_name)
        if extension.lower() not in (".xlsx", ".xls"):
            continue

        # Every character that is not a lowercase letter becomes a separator.
        label = "".join(ch if "a" <= ch <= "z" else " " for ch in stem)
        for noise in ("model ans", "finetuned"):
            label = label.replace(noise, "")
        # Collapse the separator runs left behind by digits and punctuation.
        label = " ".join(label.split())

        df_temp = pd.read_excel(os.path.join(path, file_name))
        if len(df_temp) == 0:
            continue
        rating = sum(df_temp["rating"]) / len(df_temp)
        rows.append({
            "Model Name": label,
            "Average Rating": rating,
        })

    return pd.DataFrame(rows, columns=["Model Name", "Average Rating"])
def parse_data(link, progress):
    """Crawl outward from ``link`` and save the text of the pages found as a .docx.

    Links are discovered for roughly five seconds, following every ``<a href>``
    that starts with ``http``. The text of each discovered page is then loaded
    with ``WebBaseLoader``, the pieces are joined, cleaned and written as one
    paragraph to a Word document inside ``rag_data``.

    Args:
        link: Start URL of the crawl. It is always part of the result set.
        progress: Object exposing ``tqdm(iterable)``, used to report progress
            while page texts are loaded (presumably a ``gr.Progress`` -
            confirm against the caller).

    Returns:
        None. The result is the saved file ``rag_data/<name>.docx``, where
        ``<name>`` is ``link`` with every run of characters other than
        letters, digits, ``.``, ``_`` and ``-`` replaced by ``_``.
    """
    import re
    import time

    import requests
    from bs4 import BeautifulSoup
    from docx import Document
    from langchain_community.document_loaders import WebBaseLoader

    crawl_seconds = 5
    request_timeout = 10  # seconds; keeps one stalled server from blocking the crawl
    started = time.time()

    def out_of_time():
        return (time.time() - started) >= crawl_seconds

    def get_links(url):
        """Return the absolute http(s) links found on ``url`` ([] on failure)."""
        try:
            response = requests.get(url, timeout=request_timeout)
        except requests.RequestException as exc:
            # One unreachable page must not abort the whole crawl.
            print("Could not fetch", url, exc)
            return []
        soup = BeautifulSoup(response.text, 'lxml')
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        return [href for href in hrefs if href is not None and href.startswith('http')]

    def page_text(url):
        """Return the text content of the first document loaded from ``url``."""
        loader = WebBaseLoader(f"{url}")
        return loader.load()[0].page_content

    # Depth-first crawl with an explicit stack. ``visited`` stops pages that
    # link to each other from being fetched again and again until time is up.
    found = {link}
    visited = set()
    pending = [link]
    while pending and not out_of_time():
        url = pending.pop()
        if url in visited:
            continue
        visited.add(url)
        links = get_links(url)
        found.update(links)
        # Reversed so the first link on the page is the next one explored.
        pending.extend(reversed(links))

    all_data = []
    # Sorted so the document content has a reproducible order.
    for url in progress.tqdm(sorted(found)):
        print("Link: ", url)
        try:
            all_data.append(page_text(url))
        except Exception as exc:
            # Best effort: skip pages that fail to load, but say why.
            print("pass", exc)
            continue

    # Collapse every run of newlines into a single blank line.
    all_data2 = re.sub(r'\n+', '\n\n', "\n".join(all_data))
    # Drop characters outside the listed ranges (presumably those the .docx
    # XML cannot store - confirm).
    all_data2 = re.sub(u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]+', '', all_data2)

    document = Document()
    document.add_paragraph(all_data2)

    # A raw URL contains ':' and '/', so it cannot be used as a file name.
    safe_name = re.sub(r'[^A-Za-z0-9._-]+', '_', str(link)).strip('_') or "site"
    os.makedirs("rag_data", exist_ok=True)
    document.save(os.path.join("rag_data", f"{safe_name}.docx"))
    print("Finished!!")
    return
def all_contri_ans(id, ques):
    """Collect every contributed answer to ``ques`` from the ``save_ques_ans`` folder.

    All ``.xlsx``/``.xls`` workbooks in the folder are read and combined; each
    row whose ``question`` cell equals ``ques`` contributes its ``answer``.

    Args:
        id: Unused; kept so existing callers continue to work.
        ques: Question text to look up (exact match).

    Returns:
        list: The matching answers in file/row order, or a one-element list
        holding a "not available" message when nothing matches or the folder
        contains no workbooks.
    """
    folder_path = 'save_ques_ans'
    not_available = ["This question's answer is not available."]

    data_frames = [
        pd.read_excel(os.path.join(folder_path, filename))
        for filename in os.listdir(folder_path)
        if filename.endswith((".xlsx", ".xls"))
    ]
    # pd.concat raises ValueError on an empty list, so bail out first.
    if not data_frames:
        return not_available

    df_hum = pd.concat(data_frames, ignore_index=True)
    answers = [
        answer
        for question, answer in zip(df_hum['question'], df_hum['answer'])
        if question == ques
    ]
    return answers or not_available
import json
import os
def save_params_to_file(model_name, embedding_name, splitter_type_dropdown, chunk_size_slider,
                        chunk_overlap_slider, separator_textbox, max_tokens_slider, filename="params.txt"):
    """Write the given settings as JSON to ``filename`` and to ``deploy/params.txt``.

    Args:
        model_name: Stored under the key ``model_name``.
        embedding_name: Stored under the key ``embedding_name``.
        splitter_type_dropdown: Stored under ``splitter_type_dropdown``.
        chunk_size_slider: Stored under ``chunk_size_slider``.
        chunk_overlap_slider: Stored under ``chunk_overlap_slider``.
        separator_textbox: Stored under ``separator_textbox``.
        max_tokens_slider: Stored under ``max_tokens_slider``.
        filename: Path of the primary JSON file.

    Returns:
        None. Both files are overwritten with the same JSON object.
    """
    params = {
        "model_name": model_name,
        "embedding_name": embedding_name,
        "splitter_type_dropdown": splitter_type_dropdown,
        "chunk_size_slider": chunk_size_slider,
        "chunk_overlap_slider": chunk_overlap_slider,
        "separator_textbox": separator_textbox,
        "max_tokens_slider": max_tokens_slider,
    }

    deploy_copy = os.path.join("deploy", "params.txt")
    # Create the deploy folder on first use so the second write cannot fail
    # with FileNotFoundError.
    os.makedirs(os.path.dirname(deploy_copy), exist_ok=True)

    for target in (filename, deploy_copy):
        with open(target, 'w') as f:
            json.dump(params, f)
def load_params_from_file(filename="params.txt"):
    """Read the JSON settings stored in ``filename``.

    Returns:
        The decoded JSON content, or ``None`` when ``filename`` does not exist.
    """
    # Guard clause: nothing has been saved yet.
    if not os.path.exists(filename):
        return None

    with open(filename, 'r') as handle:
        return json.load(handle)