import os
import torch
from time import time
from torch import cuda, bfloat16
from langchain import HuggingFacePipeline
from transformers import (AutoTokenizer, AutoModelForCausalLM, T5Tokenizer, AutoConfig,
                          T5ForConditionalGeneration, pipeline, BitsAndBytesConfig)

def load_model_and_pipeline(model_id, temperature=0):
    device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

    # 4-bit quantization settings: NF4 weights with double quantization and
    # bfloat16 compute, so the model fits on a single GPU.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=bfloat16
    )

    # Load config, tokenizer and the quantized model, timing the step.
    time_1 = time()
    model_config = AutoConfig.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        config=model_config,
        quantization_config=bnb_config,
        device_map='auto',
    )
    time_2 = time()
    print(f"Prepare model, tokenizer: {round(time_2 - time_1, 3)} sec.")

    # Wrap the model in a text-generation pipeline.
    time_1 = time()
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.float16,
        device_map="auto",
        max_new_tokens=512,
        do_sample=True,
        top_k=30,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id
    )
    time_2 = time()
    print(f"Prepare pipeline: {round(time_2 - time_1, 3)} sec.")

    # Expose the pipeline to LangChain so it can be used as an LLM.
    llm = HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature': temperature})
    return tokenizer, model, llm
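
# Example usage (illustrative sketch, not part of the original code; the model id
# below is an assumption -- any causal chat model on the Hugging Face Hub that you
# have access to should work):
#
#   tokenizer, model, llm = load_model_and_pipeline("meta-llama/Llama-2-7b-chat-hf")
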
import pandas as pd
from datasets import Dataset
def calculate_rag_metrics(model_ques_ans_gen, llm_model, embedding_model="BAAI/bge-base-en-v1.5"):
    from ragas import evaluate
    from ragas.metrics import (faithfulness, answer_correctness, answer_similarity,
                               answer_relevancy, context_recall, context_precision)
    from langchain.embeddings import HuggingFaceEmbeddings

    # Build the evaluation records; contexts are left empty here and should be
    # filled with the retrieved passages when evaluating a full RAG pipeline.
    data_samples = {
        'question': [item['question'] for item in model_ques_ans_gen],
        'answer': [item['answer'] for item in model_ques_ans_gen],
        'contexts': [[''] for item in model_ques_ans_gen],
        'reference': [item['ground_truths'] for item in model_ques_ans_gen]
    }

    rag_df = pd.DataFrame(data_samples)
    rag_eval_dataset = Dataset.from_pandas(rag_df)

    metrics = [
        answer_correctness, answer_similarity,
        answer_relevancy, faithfulness,
        context_recall, context_precision
    ]

    # ragas expects an embeddings object rather than a bare model name, so wrap
    # the Hugging Face model id in a LangChain embeddings instance.
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

    result = evaluate(
        rag_eval_dataset,
        metrics=metrics,
        llm=llm_model,
        embeddings=embeddings
    )

    return result.to_pandas()
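
# Usage sketch (illustrative, not part of the original code). The model id and the
# single question/answer record below are assumptions made up for the example; in
# practice model_ques_ans_gen would hold the questions, model answers and ground
# truths produced by your RAG pipeline over an evaluation set.
tokenizer, model, llm = load_model_and_pipeline("meta-llama/Llama-2-7b-chat-hf")  # assumed model id

eval_records = [
    {
        "question": "What does RAG stand for?",
        "answer": "RAG stands for retrieval-augmented generation.",
        "ground_truths": "Retrieval-augmented generation.",
    },
]

rag_scores_df = calculate_rag_metrics(eval_records, llm_model=llm)
print(rag_scores_df)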