from time import time

import pandas as pd
from datasets import Dataset
from torch import bfloat16
# On newer LangChain releases this class lives in langchain_community.llms.
from langchain import HuggingFacePipeline
from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig, pipeline)
def load_model_and_pipeline(model_id, temperature=0):
    # Quantize the model to 4-bit NF4 (with nested quantization) and run the
    # compute in bfloat16, so large models fit in limited GPU memory.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=bfloat16,
    )
    time_1 = time()
    model_config = AutoConfig.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        config=model_config,
        quantization_config=bnb_config,
        device_map='auto',  # let accelerate place layers on the available GPU(s)/CPU
    )
    time_2 = time()
    print(f"Prepare model, tokenizer: {round(time_2 - time_1, 3)} sec.")
    time_1 = time()
    # The model is already quantized and placed by device_map='auto', so the
    # pipeline needs no extra dtype or device arguments.
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        do_sample=True,
        top_k=30,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    time_2 = time()
    print(f"Prepare pipeline: {round(time_2 - time_1, 3)} sec.")
    llm = HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature': temperature})
    return tokenizer, model, llm
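
# Example usage (a minimal sketch, not part of the original pipeline: the model
# id below is an assumption -- any causal LM on the Hub that fits your GPU works):
# tokenizer, model, llm = load_model_and_pipeline("meta-llama/Llama-2-7b-chat-hf")
# print(llm("Explain retrieval-augmented generation in one sentence."))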
def calculate_rag_metrics(model_ques_ans_gen, llm_model, embedding_model="BAAI/bge-base-en-v1.5"):
    from ragas import evaluate
    from ragas.metrics import (answer_correctness, answer_relevancy, answer_similarity,
                               context_precision, context_recall, faithfulness)
    # On newer LangChain releases this class lives in langchain_community.embeddings.
    from langchain.embeddings import HuggingFaceEmbeddings

    # Build the evaluation records from the generated question/answer pairs.
    # No retrieved contexts are tracked here, so 'contexts' stays empty and the
    # context-based metrics (faithfulness, recall, precision) will reflect that.
    # Note: recent ragas versions expect the gold answer under 'reference';
    # older ones used 'ground_truth'.
    data_samples = {
        'question': [item['question'] for item in model_ques_ans_gen],
        'answer': [item['answer'] for item in model_ques_ans_gen],
        'contexts': [[''] for item in model_ques_ans_gen],
        'reference': [item['ground_truths'] for item in model_ques_ans_gen],
    }
    # Convert the dictionary to a DataFrame, then to a HuggingFace Dataset.
    rag_df = pd.DataFrame(data_samples)
    rag_eval_dataset = Dataset.from_pandas(rag_df)
    # Metrics to compute.
    metrics = [
        answer_correctness, answer_similarity,
        answer_relevancy, faithfulness,
        context_recall, context_precision,
    ]
    # ragas expects an embeddings object, not a model name, so wrap the name
    # in a LangChain embeddings class before evaluating.
    embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
    result = evaluate(
        rag_eval_dataset,
        metrics=metrics,
        llm=llm_model,
        embeddings=embeddings,
    )
    return result.to_pandas()