Project Logging with Weights and Biases (WandB)#
from workshop_utils import display_pdf
display_pdf("Slides_part9.pdf")
# Import libraries
import wandb
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
import torch
wandb.login()
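wandb.login() prompts for an API key when run interactively. For non-interactive runs (batch jobs, CI), the key can be supplied via the environment instead; a minimal sketch, assuming the key is stored in the WANDB_API_KEY environment variable:
# Non-interactive login sketch: wandb picks up WANDB_API_KEY automatically,
# but the key can also be passed explicitly.
import os
if os.environ.get("WANDB_API_KEY"):
    wandb.login(key=os.environ["WANDB_API_KEY"])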
LoRA fine-tuning with W&B logging#
import os
import random
import time
import wandb
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
    TrainerCallback,
    logging
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
import torch
# Initialize WandB
wandb.init(project="llms_finetune", job_type="training")
# Access configuration
config = wandb.config
config.model_name = "Qwen/Qwen2.5-0.5B-Instruct"
config.dataset_name = "HuggingFaceH4/MATH-500"
config.lora_r = 8
config.lora_alpha = 32
config.lora_dropout = 0.05
config.per_device_train_batch_size = 8
config.learning_rate = 1e-4
config.num_train_epochs = 2
config.fp16 = True
config.bf16 = False
config.save_steps = 10
config.eval_steps = 10
config.save_total_limit = 1
config.optim = "paged_adamw_32bit"
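The same configuration can also be passed as a single dict at init time; shown here only as an alternative pattern (the attribute-style assignments above are what the rest of this notebook uses):
# Alternative (not executed here): provide the config dict directly to wandb.init, e.g.
# wandb.init(project="llms_finetune", job_type="training",
#            config={"model_name": "Qwen/Qwen2.5-0.5B-Instruct", "lora_r": 8, "learning_rate": 1e-4})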
Prepare data (just as in previous notebooks)#
# Load the dataset
ds = load_dataset(config.dataset_name)
train_val_dataset = ds["test"].train_test_split(test_size=0.1)
train_dataset = train_val_dataset["train"]
eval_dataset = train_val_dataset["test"]
# Load the model and tokenizer
model_name = config.model_name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto"
)
# The model may not have a pad token set by default, so set it (using the EOS token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
from workshop_utils import tokenize_and_mask, tokenize_for_generation, generate_and_print, data_collator
data_collator_fn = lambda features: data_collator(features, tokenizer=tokenizer)
# Map the formatting function over the dataset.
train_dataset_tokenized = train_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})
eval_dataset_tokenized = eval_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})
# Get a sample dataset so we can examine model generations before and after training
sample_dataset = eval_dataset.select(range(3))
sample_dataset_tokenized = sample_dataset.map(tokenize_for_generation, batched=False, fn_kwargs={"tokenizer": tokenizer})
train_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
eval_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
sample_dataset_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
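To make before/after comparisons easier in the W&B UI, the raw sample prompts can also be logged as a table; a minimal sketch, assuming the MATH-500 records expose "problem" and "solution" fields:
# Log the held-out sample prompts to W&B for later inspection
# (column names assume the MATH-500 schema with "problem" and "solution" fields).
sample_table = wandb.Table(columns=["problem", "solution"])
for row in sample_dataset:
    sample_table.add_data(row["problem"], row["solution"])
wandb.log({"sample_prompts": sample_table})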
from peft import LoraConfig, get_peft_model, TaskType
# Define LoRA Config
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    lora_dropout=config.lora_dropout,
    target_modules=["q_proj", "v_proj"] # Replace with the target modules of your model
)
# Add LoRA adapter to the model
model = get_peft_model(model, lora_config)
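It is worth confirming how small the trainable-parameter footprint is after attaching the adapter:
# Report the number of trainable (LoRA) parameters vs. the full model
model.print_trainable_parameters()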
Perform training (while logging to W&B)#
from transformers import Trainer, TrainingArguments
# Define training arguments
training_args = TrainingArguments(
    output_dir="./qwen-lora-math", # Output directory
    num_train_epochs=config.num_train_epochs, # Number of training epochs
    per_device_train_batch_size=config.per_device_train_batch_size, # Batch size per device during training
    optim=config.optim, # Paged AdamW optimizer (provided by bitsandbytes); Trainer may also require accelerate: pip install accelerate -U
    save_steps=config.save_steps, # Save a checkpoint every X update steps
    eval_steps=config.eval_steps, # Evaluate every X update steps
    eval_strategy="steps", # Evaluation strategy
    save_total_limit=config.save_total_limit, # Limit the total number of checkpoints
    load_best_model_at_end=True, # Load the best model when training finishes (default is False)
    logging_steps=10, # Log every X update steps
    learning_rate=config.learning_rate, # Learning rate
    fp16=config.fp16, # Use fp16 mixed-precision training
    bf16=config.bf16, # Use bfloat16 training
    report_to="wandb" # Send training and evaluation metrics to W&B
)
# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=eval_dataset_tokenized,
    data_collator=data_collator_fn, # Custom collator that pads inputs and labels
)
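TrainerCallback is imported above but not used. As an illustrative sketch (not part of the original setup), a small callback can push derived metrics, such as eval perplexity, to W&B after each evaluation:
import math

class PerplexityCallback(TrainerCallback):
    """Illustrative callback: log eval perplexity to W&B after each evaluation."""
    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if metrics and "eval_loss" in metrics:
            wandb.log({"eval/perplexity": math.exp(metrics["eval_loss"])})

# Optional: register it before calling trainer.train()
# trainer.add_callback(PerplexityCallback())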
# Train the model
trainer.train()
# Finish the WandB run
wandb.finish()
Run a WandB Sweep#
Define a sweep configuration with hyperparameters to tune. Use wandb.sweep() to create a sweep and wandb.agent() to run the sweep agent, optimizing the hyperparameters.
# Set up the config we'll use for each run during the sweep
config_defaults = {
    "model_name": "Qwen/Qwen2.5-0.5B-Instruct",
    "dataset_name": "HuggingFaceH4/MATH-500",
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "num_train_epochs": 2,
    "fp16": True,
    "bf16": False,
    "save_steps": 20,
    "eval_steps": 20,
    "eval_strategy": "steps",
    "save_total_limit": 1,
    "optim": "paged_adamw_32bit"
}
# Define a sweep configuration with hyperparameters to tune
sweep_config = {
    "method": "random", # Random search; other options include "grid" and "bayes"
    "metric": {
        "name": "eval/loss",
        "goal": "minimize"
    },
    "parameters": {
        "learning_rate": {
            "values": [1e-5, 1e-4, 1e-3]
        },
        "batch_size": {
            "values": [4, 8]
        },
        "lora_r": {
            "values": [4, 8, 16]
        }
    }
}
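Random search could be swapped for "bayes", and W&B sweeps can also terminate weak runs early. A hedged sketch (the hyperband values are illustrative, not tuned):
# Optional: stop clearly under-performing runs early.
# sweep_config["early_terminate"] = {"type": "hyperband", "min_iter": 3}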
# Create a sweep
sweep_id = wandb.sweep(sweep_config, project="llm-finetuning")
# Define the training function
def train():
    # Initialize a new wandb run
    wandb.init(config=config_defaults)
    config = wandb.config
    # Load the dataset
    ds = load_dataset(config.dataset_name)
    train_val_dataset = ds["test"].train_test_split(test_size=0.1)
    train_dataset = train_val_dataset["train"]
    eval_dataset = train_val_dataset["test"]
    # Load the model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    model = AutoModelForCausalLM.from_pretrained(config.model_name, device_map="auto")
    # The model may not have a pad token set by default, so set it (using the EOS token)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    from workshop_utils import tokenize_and_mask, tokenize_for_generation, generate_and_print, data_collator
    data_collator_fn = lambda features: data_collator(features, tokenizer=tokenizer)
    # Map the formatting function over the dataset.
    train_dataset_tokenized = train_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})
    eval_dataset_tokenized = eval_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})
    # Get a sample dataset so we can examine model generations before and after training
    sample_dataset = eval_dataset.select(range(3))
    sample_dataset_tokenized = sample_dataset.map(tokenize_for_generation, batched=False, fn_kwargs={"tokenizer": tokenizer})
    train_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
    eval_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
    sample_dataset_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
    from peft import LoraConfig, get_peft_model, TaskType
    # Define LoRA Config
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=config.lora_r,
        lora_alpha=config.lora_alpha,
        lora_dropout=config.lora_dropout,
        target_modules=["q_proj", "v_proj"] # Replace with the target modules of your model
    )
    # Add LoRA adapter to the model
    model = get_peft_model(model, lora_config)
    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=config.num_train_epochs,
        per_device_train_batch_size=config.batch_size,
        save_steps=config.save_steps,
        eval_steps=config.eval_steps,
        eval_strategy=config.eval_strategy,
        save_total_limit=1,
        learning_rate=config.learning_rate,
        fp16=config.fp16,
        bf16=config.bf16,
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
        report_to="wandb",
    )
    # Create Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset_tokenized,
        eval_dataset=eval_dataset_tokenized,
        tokenizer=tokenizer,
        data_collator=data_collator_fn,
    )
    # Train the model
    trainer.train()
    # Save the model
    # model.save_pretrained(f"./model_{wandb.run.id}")
    # tokenizer.save_pretrained(f"./model_{wandb.run.id}")
    # # Log model checkpoint as artifact
    # artifact = wandb.Artifact(f"model_{wandb.run.id}", type="model")
    # artifact.add_dir(f"./model_{wandb.run.id}")
    # wandb.log_artifact(artifact)
    # Finish the WandB run
    wandb.finish()
# Run the sweep agent
wandb.agent(sweep_id, function=train, count=3)
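Agents can also be launched from a shell, which makes it easy to parallelize the sweep across GPUs or machines; the entity/project path below is a placeholder:
# Equivalent CLI form (run in a terminal, one agent per GPU/machine):
#   wandb agent <entity>/<project>/<sweep_id>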