Project Logging with Weights & Biases (W&B)#

from workshop_utils import display_pdf

display_pdf("Slides_part9.pdf")
# Import libraries
import wandb
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
import torch
wandb.login()
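
If you prefer not to authenticate interactively (for example on a cluster), W&B also reads the API key from the WANDB_API_KEY environment variable. A minimal sketch; the key below is a placeholder, not a real value:

import os

# Non-interactive alternative to the login prompt; replace the placeholder with your key
os.environ["WANDB_API_KEY"] = "<your-api-key>"
wandb.login()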

Fine-tune with LoRA and W&B logging#

import os
import random
import time
import wandb
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
    TrainerCallback,
    logging
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
import torch
# Initialize WandB
wandb.init(project="llms_finetune", job_type="training")

# Access configuration
config = wandb.config
config.model_name = "Qwen/Qwen2.5-0.5B-Instruct"
config.dataset_name = "HuggingFaceH4/MATH-500"
config.lora_r = 8
config.lora_alpha = 32
config.lora_dropout = 0.05
config.per_device_train_batch_size = 8
config.learning_rate = 1e-4
config.num_train_epochs = 2
config.fp16 = True
config.bf16 = False
config.save_steps = 10
config.eval_steps = 10
config.save_total_limit = 1
config.optim = "paged_adamw_32bit"

Prepare data (just as in previous notebooks)#

# Load the dataset
ds = load_dataset(config.dataset_name)
train_val_dataset = ds["test"].train_test_split(test_size=0.1)
train_dataset = train_val_dataset["train"]
eval_dataset = train_val_dataset["test"]

# Load the model and tokenizer
model_name = config.model_name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto"
)

# The model may not have a pad token set by default, so set it (using the EOS token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

from workshop_utils import tokenize_and_mask, tokenize_for_generation, generate_and_print, data_collator

data_collator_fn = lambda features: data_collator(features, tokenizer=tokenizer)

# Map the formatting function over the dataset.
train_dataset_tokenized = train_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})
eval_dataset_tokenized = eval_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})

# Get a sample dataset so we can examine model generations before and after training
sample_dataset = eval_dataset.select(range(3))
sample_dataset_tokenized = sample_dataset.map(tokenize_for_generation, batched=False, fn_kwargs={"tokenizer": tokenizer})

train_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
eval_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
sample_dataset_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
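
Before training, it is worth sanity-checking one tokenized example. The sketch below assumes the usual convention that tokenize_and_mask marks prompt positions in labels with -100 so they are ignored by the loss; adjust if workshop_utils uses a different scheme:

# Inspect one training example: length, number of masked label positions, decoded start of the prompt
sample = train_dataset_tokenized[0]
print("sequence length:", len(sample["input_ids"]))
print("masked label positions:", int((sample["labels"] == -100).sum()))
print(tokenizer.decode(sample["input_ids"][:50]))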

from peft import LoraConfig, get_peft_model, TaskType

# Define LoRA Config
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    inference_mode=False, 
    r=config.lora_r, 
    lora_alpha=config.lora_alpha, 
    lora_dropout=config.lora_dropout,
    target_modules=["q_proj", "v_proj"] # Attention projection layers to adapt; adjust these names for other architectures
)

# Add LoRA adapter to the model
model = get_peft_model(model, lora_config)
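
A quick check that only the adapter weights are trainable; print_trainable_parameters() is provided by the PEFT model wrapper and reports trainable vs. total parameter counts:

# Confirm that LoRA leaves the base model frozen and trains only a small fraction of parameters
model.print_trainable_parameters()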

Perform training (while logging to W&B)#

from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="./qwen-lora-math",          # Output directory
    num_train_epochs=config.num_train_epochs,              # Number of training epochs
    per_device_train_batch_size=config.per_device_train_batch_size,   # Batch size per device during training
    optim=config.optim,        # Optimizer; you may need to install accelerate: pip install accelerate -U
    save_steps=config.save_steps,                   # Save checkpoint every X updates steps
    eval_steps=config.eval_steps,                   # Evaluate every X updates steps
    eval_strategy="steps",           # Evaluation strategy
    save_total_limit=config.save_total_limit,              # Limit the total amount of checkpoints
    load_best_model_at_end=True,     # Load the best checkpoint (by eval loss) when training finishes
    logging_steps=10,                # Log every X updates steps
    learning_rate=config.learning_rate,              # Learning rate
    fp16=config.fp16,                       # Use mixed precision training
    bf16=config.bf16,                      # Use bfloat16 training
    report_to="wandb"
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=eval_dataset_tokenized,
    data_collator=data_collator_fn,     # Data collator defined above
)

# Train the model
trainer.train()

# Finish the WandB run
wandb.finish()
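
Scalar metrics (train/eval loss, learning rate) are logged automatically via report_to="wandb". If you also want to inspect model outputs in the W&B UI, you can log generations for the small sample set to a wandb.Table before the wandb.finish() call above. A sketch, assuming the model fits on a single device so model.device is well defined:

# Log a few post-training generations to W&B (run this before wandb.finish())
table = wandb.Table(columns=["prompt", "generation"])
for example in sample_dataset_tokenized:
    input_ids = example["input_ids"].unsqueeze(0).to(model.device)
    attention_mask = example["attention_mask"].unsqueeze(0).to(model.device)
    with torch.no_grad():
        output = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=128)
    prompt_text = tokenizer.decode(example["input_ids"], skip_special_tokens=True)
    completion = tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
    table.add_data(prompt_text, completion)
wandb.log({"sample_generations": table})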

Run a WandB Sweep#

Define a sweep configuration with the hyperparameters to tune, create the sweep with wandb.sweep(), and launch wandb.agent() to run training repeatedly, sampling a new hyperparameter combination for each run.

# Set up the config we'll use for each run during the sweep

config_defaults = {
    "model_name": "Qwen/Qwen2.5-0.5B-Instruct",
    "dataset_name": "HuggingFaceH4/MATH-500",
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "num_train_epochs": 2,
    "fp16": True,
    "bf16": False,
    "save_steps": 20,
    "eval_steps": 20,
    "eval_strategy": "steps",
    "save_total_limit": 1,
    "epochs": 2,
    "optim": "paged_adamw_32bit"
}
# Define a sweep configuration with hyperparameters to tune
sweep_config = {
    "method": "random",  # Random search; other options include "grid", "bayesian", etc.
    "metric": {
        "name": "eval/loss",
        "goal": "minimize"
    },
    "parameters": {
        "learning_rate": {
            "values": [1e-5, 1e-4, 1e-3]
        },
        "batch_size": {
            "values": [4, 8]
        },
        "lora_r": {
            "values": [4, 8, 16]
        }
    }
}

# Create a sweep
sweep_id = wandb.sweep(sweep_config, project="llm-finetuning")

# Define the training function
def train():
    # Initialize a new wandb run
    wandb.init(config=config_defaults)
    config = wandb.config
    
    # Load the dataset
    ds = load_dataset(wandb.config.dataset_name)
    train_val_dataset = ds["test"].train_test_split(test_size=0.1)
    train_dataset = train_val_dataset["train"]
    eval_dataset = train_val_dataset["test"]
    
    # Load the model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    model = AutoModelForCausalLM.from_pretrained(config.model_name, device_map="auto")

    # The model may not have a pad token set by default, so set it (using the EOS token)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    from workshop_utils import tokenize_and_mask, tokenize_for_generation, generate_and_print, data_collator
    
    data_collator_fn = lambda features: data_collator(features, tokenizer=tokenizer)
    
    # Map the formatting function over the dataset.
    train_dataset_tokenized = train_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})
    eval_dataset_tokenized = eval_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})
    
    # Get a sample dataset so we can examine model generations before and after training
    sample_dataset = eval_dataset.select(range(3))
    sample_dataset_tokenized = sample_dataset.map(tokenize_for_generation, batched=False, fn_kwargs={"tokenizer": tokenizer})
    
    train_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
    eval_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
    sample_dataset_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
    
    from peft import LoraConfig, get_peft_model, TaskType
    
    # Define LoRA Config
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM, 
        inference_mode=False, 
        r=config.lora_r, 
        lora_alpha=config.lora_alpha, 
        lora_dropout=config.lora_dropout,
        target_modules=["q_proj", "v_proj"] # Attention projection layers to adapt; adjust these names for other architectures
    )
    
    # Add LoRA adapter to the model
    model = get_peft_model(model, lora_config)
    
    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=config.epochs,
        per_device_train_batch_size=config.batch_size,
        save_steps=config.save_steps,
        eval_steps=config.eval_steps,
        eval_strategy=config.eval_strategy,
        save_total_limit=1,
        learning_rate=config.learning_rate,
        fp16=config.fp16,
        bf16=config.bf16,
        logging_dir="./logs",
        logging_steps=10,
        load_best_model_at_end=True,
        report_to="wandb",
    )
    
    
    # Create Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset_tokenized,
        eval_dataset=eval_dataset_tokenized,
        tokenizer=tokenizer,
        data_collator=data_collator_fn,
    )
    
    # Train the model
    trainer.train()
    
    # Save the model
    # model.save_pretrained(f"./model_{wandb.run.id}")
    # tokenizer.save_pretrained(f"./model_{wandb.run.id}")
    
    # # Log model checkpoint as artifact
    # artifact = wandb.Artifact(f"model_{wandb.run.id}", type="model")
    # artifact.add_dir(f"./model_{wandb.run.id}")
    # wandb.log_artifact(artifact)
    
    # Finish the WandB run
    wandb.finish()

# Run the sweep agent
wandb.agent(sweep_id, function=train, count=3)
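
After the agent finishes, you can query the sweep for its best run with the public W&B API. A sketch; "<your-entity>" is a placeholder for your W&B user or team name:

# Retrieve the best run of the sweep (lowest eval/loss) via the public API
api = wandb.Api()
sweep = api.sweep(f"<your-entity>/llm-finetuning/{sweep_id}")
best_run = sweep.best_run()
print("Best run:", best_run.name)
print("Best eval/loss:", best_run.summary.get("eval/loss"))
print("Hyperparameters:", {k: v for k, v in best_run.config.items() if not k.startswith("_")})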