import os
import glob

import torch
import pandas as pd
import transformers
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GenerationConfig
from trl import SFTTrainer
from pynvml import *

class phi_trainer:

    def formatted_text(self, x, tokenizer):
        # Render one question/answer row with the model's chat template:
        # a user turn carrying the instruction plus the question, followed by
        # an assistant turn carrying the reference answer.
        temp = [
            {"role": "user", "content": """You are a helpful chatbot, help users by answering their queries.
Question: """ + x["question"]},
            {"role": "assistant", "content": x["answer"]}
        ]
        return tokenizer.apply_chat_template(temp, add_generation_prompt=False, tokenize=False)
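
    # For reference, the rendered training text looks roughly like the sketch
    # below; the exact special tokens come from the tokenizer's bundled chat
    # template, so treat this as illustrative rather than exact:
    #   <|user|>
    #   You are a helpful chatbot, ... Question: <question><|end|>
    #   <|assistant|>
    #   <answer><|end|>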

    def phi_finetune(self, lr, epoch, batch_size, gradient_accumulation, quantization, lora_r, lora_alpha, lora_dropout):
        base_model = "microsoft/Phi-3-mini-4k-instruct"

        from datetime import datetime
        # Timestamped output directories: one for the LoRA adapter, one for the merged model.
        lora_output = f'models/{quantization}_Phi_lora_{datetime.now().strftime("%Y_%m_%d_%H_%M_%S")}'
        full_output = f'models/{quantization}_Phi_full_{datetime.now().strftime("%Y_%m_%d_%H_%M_%S")}'

        DEVICE = 'cuda'

        # Tokenizer used to render the chat template for the training data.
        tokenizer = AutoTokenizer.from_pretrained(base_model)
        tokenizer.padding_side = 'right'

        # Load question/answer pairs and build the "text" column that SFTTrainer trains on.
        data_location = r"data/finetune_data.xlsx"
        data_df = pd.read_excel(data_location)
        data_df["text"] = data_df[["question", "answer"]].apply(lambda x: self.formatted_text(x, tokenizer), axis=1)
        print(data_df.iloc[0])
        dataset = Dataset.from_pandas(data_df)

        # Default to 8-bit loading; replaced below when 4-bit quantization is requested.
        bnb_config = BitsAndBytesConfig(
            load_in_8bit=True,
        )
        if quantization == 4:
            print("*" * 10, ": 4-bit quantization")
            # NF4 with double quantization and bfloat16 compute, the usual QLoRA setup.
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
            )
        model = AutoModelForCausalLM.from_pretrained(
            base_model,
            quantization_config=bnb_config,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )
        # The KV cache is incompatible with gradient checkpointing, so disable it
        # and enable checkpointing to cut activation memory during training.
        model.config.use_cache = False
        model.config.pretraining_tp = 1
        model.gradient_checkpointing_enable()

        # Reload the tokenizer with trust_remote_code and give it a pad token for right-padding.
        tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
        tokenizer.padding_side = 'right'
        tokenizer.pad_token = tokenizer.eos_token
        from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

        # LoRA adapters on all attention and MLP projection layers.
        config = LoraConfig(
            r=lora_r if lora_r else 16,
            lora_alpha=lora_alpha if lora_alpha else 32,
            target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
            lora_dropout=lora_dropout if lora_dropout else 0.05,
            bias="none",
            task_type="CAUSAL_LM",
        )

        # Cast norms/embeddings for stable k-bit training, then wrap the model with the adapters.
        model = prepare_model_for_kbit_training(model)
        model = get_peft_model(model, config)
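
        # Optional sanity check using PEFT's built-in report: print how many
        # parameters the LoRA adapters actually train versus the frozen base.
        model.print_trainable_parameters()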
        # Training hyperparameters, falling back to defaults when not supplied.
        MAXLEN = 512
        BATCH_SIZE = batch_size if batch_size else 4
        GRAD_ACC = gradient_accumulation if gradient_accumulation else 4
        OPTIMIZER = 'paged_adamw_8bit'
        LR = lr if lr else 5e-06
        training_config = transformers.TrainingArguments(
            per_device_train_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=GRAD_ACC,
            optim=OPTIMIZER,
            learning_rate=LR,
            bf16=True,  # match the bfloat16 dtype the model was loaded with
            logging_steps=10,
            num_train_epochs=epoch if epoch else 2,
            output_dir=lora_output,
            remove_unused_columns=True,
        )
        # Standard causal-LM collator (labels mirror the inputs, no masked-LM objective).
        data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
        # SFTTrainer consumes the pre-rendered "text" column; passing tokenizer,
        # max_seq_length, and dataset_text_field directly assumes a TRL release
        # that still accepts them as constructor arguments.
        trainer = SFTTrainer(
            model=model,
            train_dataset=dataset,
            tokenizer=tokenizer,
            data_collator=data_collator,
            args=training_config,
            max_seq_length=MAXLEN,
            dataset_text_field="text",
        )

        trainer.train()
        print("*" * 10, ": fine-tuning finished")
        trainer.save_model(lora_output)
        # Merge the LoRA adapter back into a full-precision copy of the base model
        # so the result can be loaded without PEFT.
        from peft import PeftConfig
        config = PeftConfig.from_pretrained(lora_output)

        model = transformers.AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, trust_remote_code=True)

        from peft import PeftModel
        model = PeftModel.from_pretrained(model, lora_output)

        tokenizer = transformers.AutoTokenizer.from_pretrained(config.base_model_name_or_path)
        merged_model = model.merge_and_unload()

        merged_model.save_pretrained(full_output)
        tokenizer.save_pretrained(full_output)
        print("*" * 10, ": merged model saved")