| | import os
|
| | import torch
|
| | import json
|
| | from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, TrainerCallback
|
| | from datasets import Dataset
|
| | import matplotlib.pyplot as plt
|
| |
|
| |
|
# Hugging Face access token placeholder.
# setdefault() only applies the placeholder when HF_TOKEN is not already
# exported, so a real token supplied by the shell is never overwritten.
# Do not commit a real token here; export HF_TOKEN in the environment instead.
os.environ.setdefault("HF_TOKEN", "hf_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
|
| |
|
| |
|
# Everything in this script runs on the CPU.
device = torch.device("cpu")

# Checkpoint identifier and the local directory used as the download cache.
model_name = "Salesforce/codegen-350M-multi"
local_model_path = "./codegen_model"

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=local_model_path)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Load the weights in full precision and place the model on the CPU device.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    cache_dir=local_model_path,
)
model.to(device)
|
| |
|
| |
|
# Load the fine-tuning examples from a JSON Lines file (one JSON object per line).
dataset_path = "./custom_dataset.jsonl"
with open(dataset_path, "r", encoding="utf-8") as f:
    # Blank or whitespace-only lines (e.g. a trailing empty line) are skipped;
    # json.loads("") would otherwise raise JSONDecodeError and abort the run.
    data = [json.loads(record) for record in map(str.strip, f) if record]
dataset = Dataset.from_list(data)
|
| |
|
| |
|
def tokenize_function(examples):
    """Tokenize a batch of prompt/code pairs.

    Each pair is joined into one text: the prompt, a newline, then the code.
    The texts are passed to the module-level ``tokenizer`` with truncation
    enabled and padding to ``max_length=128``.

    Args:
        examples: Batch mapping with parallel "prompt" and "code" sequences.

    Returns:
        The tokenizer's output for the whole batch.
    """
    texts = []
    for prompt_text, code_text in zip(examples["prompt"], examples["code"]):
        texts.append(f"{prompt_text}\n{code_text}")
    return tokenizer(texts, truncation=True, padding="max_length", max_length=128)
|
| |
|
# Tokenize in batches and drop the raw text columns once they have been consumed.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["prompt", "code"],
)

# mlm=False: collate for causal language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)
|
| |
|
| |
|
training_args = TrainingArguments(
    # Output location and checkpoint retention.
    output_dir="./finetuned_codegen",
    overwrite_output_dir=True,
    save_steps=500,
    save_total_limit=2,
    # Optimisation schedule: batch size 1 with 4 accumulation steps.
    num_train_epochs=5,
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    # Emit a log record (picked up by callbacks) every 10 steps.
    logging_steps=10,
    # CPU-only, full-precision run; pinned memory disabled.
    no_cuda=True,
    fp16=False,
    dataloader_pin_memory=False,
)
|
| |
|
| |
|
class LossCallback(TrainerCallback):
    """Trainer callback that records the logged training loss over time."""

    def __init__(self):
        # Parallel lists: steps[i] is the global step at which losses[i] was logged.
        self.steps = []
        self.losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record the loss and current global step when a log entry has a loss.

        Log events that are empty or carry no "loss" key are ignored.
        """
        if not logs or "loss" not in logs:
            return
        self.losses.append(logs["loss"])
        self.steps.append(state.global_step)
|
| |
|
# Collects (step, loss) pairs during training for the loss curve.
loss_callback = LossCallback()

trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
    callbacks=[loss_callback],
)

print("Starting fine-tuning...")
trainer.train()

# Save the model first, then the tokenizer, into the same output directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./finetuned_codegen")
|
| |
|
| |
|
# Plot the loss values recorded by the callback against their global steps.
recorded_steps = loss_callback.steps
recorded_losses = loss_callback.losses

plt.plot(recorded_steps, recorded_losses, label="Training Loss")
plt.title("Fine-Tuning Loss Curve")
plt.ylabel("Loss")
plt.xlabel("Steps")
plt.legend()

# Write the figure to disk before showing it.
plt.savefig("./finetuned_codegen/loss_plot.png")
plt.show()

print(
    "Fine-tuning completed. Model saved to ./finetuned_codegen. "
    "Loss plot saved to ./finetuned_codegen/loss_plot.png"
)
|
| |
|
| |
|
# Smoke-test the fine-tuned model by sampling a completion for each prompt.
print("\nTesting fine-tuned model...")
prompts = [
    "Write a Python program to print 'Hello, guys how are you!'",
]

# Training may have left the model in train mode; switch to evaluation mode so
# train-only behaviour (e.g. dropout) cannot affect the generated samples.
model.eval()

for prompt in prompts:
    # Encode the prompt as PyTorch tensors on the same device as the model.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    ).to(device)
    # Sample one sequence; pad_token_id is set to the EOS token id explicitly.
    outputs = model.generate(
        **inputs,
        max_length=200,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Prompt: {prompt}\nGenerated Code:\n{generated_code}\n{'-'*50}")