from time import time

import pandas as pd
from datasets import Dataset
from torch import bfloat16
# On newer LangChain releases this class lives in langchain_community.llms.
from langchain import HuggingFacePipeline
from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig, pipeline)
def load_model_and_pipeline(model_id, temperature=0):
    # Quantize the model to 4-bit NF4 (with nested quantization) and run the
    # compute in bfloat16, so large models fit in limited GPU memory.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=bfloat16,
    )
    time_1 = time()
    model_config = AutoConfig.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        config=model_config,
        quantization_config=bnb_config,
        device_map='auto',  # let accelerate place layers on the available GPU(s)/CPU
    )
    time_2 = time()
    print(f"Prepare model, tokenizer: {round(time_2 - time_1, 3)} sec.")
    time_1 = time()
    # The model is already quantized and placed by device_map='auto', so the
    # pipeline needs no extra dtype or device arguments.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        do_sample=True,
        top_k=30,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    time_2 = time()
    print(f"Prepare pipeline: {round(time_2 - time_1, 3)} sec.")
    llm = HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature': temperature})
    return tokenizer, model, llm
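
# Example usage (a minimal sketch, not part of the original pipeline: the model
# id below is an assumption -- any causal LM on the Hub that fits your GPU works):
# tokenizer, model, llm = load_model_and_pipeline("meta-llama/Llama-2-7b-chat-hf")
# print(llm("Explain retrieval-augmented generation in one sentence."))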
def calculate_rag_metrics(model_ques_ans_gen, llm_model, embedding_model="BAAI/bge-base-en-v1.5"):
    from ragas import evaluate
    from ragas.metrics import (answer_correctness, answer_relevancy, answer_similarity,
                               context_precision, context_recall, faithfulness)
    # On newer LangChain releases this class lives in langchain_community.embeddings.
    from langchain.embeddings import HuggingFaceEmbeddings

    # Build the evaluation records from the generated question/answer pairs.
    # No retrieved contexts are tracked here, so 'contexts' stays empty and the
    # context-based metrics (faithfulness, recall, precision) will reflect that.
    # Note: recent ragas versions expect the gold answer under 'reference';
    # older ones used 'ground_truth'.
    data_samples = {
        'question': [item['question'] for item in model_ques_ans_gen],
        'answer': [item['answer'] for item in model_ques_ans_gen],
        'contexts': [[''] for item in model_ques_ans_gen],
        'reference': [item['ground_truths'] for item in model_ques_ans_gen],
    }
    # Convert the dictionary to a DataFrame, then to a HuggingFace Dataset.
    rag_df = pd.DataFrame(data_samples)
    rag_eval_dataset = Dataset.from_pandas(rag_df)
    # Metrics to compute.
    metrics = [
        answer_correctness, answer_similarity,
        answer_relevancy, faithfulness,
        context_recall, context_precision,
    ]
    # ragas expects an embeddings object, not a model name, so wrap the name
    # in a LangChain embeddings class before evaluating.
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
    result = evaluate(
        rag_eval_dataset,
        metrics=metrics,
        llm=llm_model,
        embeddings=embeddings,
    )
    return result.to_pandas()