Full fine-tune

Before jumping in, let’s look at some of the dangers of fine-tuning.

from workshop_utils import display_pdf

display_pdf("Slides_part6.pdf")

Now, let’s take a brief look at different learning paradigms, and then turn to the kinds of model updates that can be used to implement any of them.

display_pdf("Slides_part7.pdf")

Let’s implement a full fine-tune.

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
)
import torch

# Load the dataset
ds = load_dataset("HuggingFaceH4/MATH-500")

# This dataset only has a test split, so for demonstration purposes we use that,
# splitting it into training and validation sets.
train_val_dataset = ds["test"].train_test_split(test_size=0.1)
train_dataset = train_val_dataset["train"]
eval_dataset = train_val_dataset["test"]

# Load the model and tokenizer
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map={'':0} # "auto"
)

# The model may not have a pad token set by default, so set it (using the EOS token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
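# Optional sanity checks (not strictly needed): confirm where the weights ended up,
# their approximate memory footprint, and which token will be used for padding.
print(next(model.parameters()).device)                 # expect cuda:0 given device_map={'': 0}
print(f"{model.get_memory_footprint() / 1e9:.2f} GB")  # parameter memory in GB
print(tokenizer.pad_token, tokenizer.pad_token_id)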
from workshop_utils import tokenize_and_mask, tokenize_for_generation, generate_and_print, data_collator

# Bind the tokenizer to the collator so it can be called with just a list of features
data_collator_fn = lambda features: data_collator(features, tokenizer=tokenizer)
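The data_collator helper itself comes from workshop_utils, so its implementation isn’t shown here. For intuition, a collator for this kind of causal-LM fine-tuning usually right-pads each batch to its longest sequence: input_ids with the pad token, attention_mask with 0, and labels with -100 so padded positions are ignored by the loss. The sketch below is illustrative only (the name example_data_collator is made up) and may differ from the workshop’s version.

import torch

# Illustrative sketch only -- the actual collator is provided by workshop_utils.
def example_data_collator(features, tokenizer):
    # Right-pad every sequence in the batch to the length of the longest one.
    max_len = max(len(f["input_ids"]) for f in features)
    input_ids, attention_mask, labels = [], [], []
    for f in features:
        ids = list(map(int, f["input_ids"]))
        mask = list(map(int, f["attention_mask"]))
        labs = list(map(int, f["labels"]))
        pad = max_len - len(ids)
        input_ids.append(ids + [tokenizer.pad_token_id] * pad)
        attention_mask.append(mask + [0] * pad)  # 0 = padding position
        labels.append(labs + [-100] * pad)       # -100 = ignored by the cross-entropy loss
    return {
        "input_ids": torch.tensor(input_ids),
        "attention_mask": torch.tensor(attention_mask),
        "labels": torch.tensor(labels),
    }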
# Map the formatting function over the dataset.
# This applies the formatting function to each example in the dataset.
# The result is that we have a dataset where each math problem is formatted as a prompt for the model,
# and the solution is formatted as a response that the model should generate.
# Each example is also tokenized
# (If your dataset is large you might use batched=True; here we keep it simple.)
train_dataset_tokenized = train_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})
eval_dataset_tokenized = eval_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})

# Get a sample dataset so we can examine model generations before and after training
sample_dataset = eval_dataset.select(range(3))
sample_dataset_tokenized = sample_dataset.map(tokenize_for_generation, batched=False, fn_kwargs={"tokenizer": tokenizer})

train_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
eval_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
sample_dataset_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
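Likewise, tokenize_and_mask is a workshop helper whose implementation isn’t shown. The idea, as the comments above describe, is to format the problem as a chat prompt, append the solution, tokenize the result, and set the labels for the prompt tokens to -100 so that only the solution tokens contribute to the loss. Here is a rough, illustrative sketch (the name example_tokenize_and_mask is made up, and problem/solution are the MATH-500 field names); the workshop’s version may differ in detail.

# Rough sketch of a tokenize-and-mask helper (illustrative only).
def example_tokenize_and_mask(example, tokenizer, max_length=1024):
    # Format the math problem as a chat prompt...
    prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": example["problem"]}],
        tokenize=False,
        add_generation_prompt=True,
    )
    # ...and append the worked solution as the text the model should learn to generate.
    full_text = prompt + example["solution"] + tokenizer.eos_token

    prompt_ids = tokenizer(prompt, add_special_tokens=False)["input_ids"]
    full = tokenizer(full_text, add_special_tokens=False, truncation=True, max_length=max_length)

    # Mask the prompt tokens with -100 so the loss is computed only on the solution tokens.
    labels = list(full["input_ids"])
    labels[: len(prompt_ids)] = [-100] * min(len(prompt_ids), len(labels))
    full["labels"] = labels
    return full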
!module list

If the code cell below gives you the error MissingCUDAException: CUDA_HOME does not exist, unable to compile CUDA op(s), then you need to load the cuda system module. When requesting an OpenOnDemand job, enter cuda in the field labeled “List of modules to be loaded, separate by an empty space”. The code cell just above this one will then show that both anaconda3 and cuda are loaded.
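If you want to check from inside Python whether the GPU and the CUDA toolkit are visible before launching training, a quick optional diagnostic:

import os
import torch

print("CUDA available:", torch.cuda.is_available())  # True if PyTorch can see a GPU
print("CUDA_HOME:", os.environ.get("CUDA_HOME"))      # typically set once the cuda module is loaded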

from transformers import Trainer, TrainingArguments

# Set up training arguments.
training_args = TrainingArguments(
    output_dir="./qwen-finetuned-math",
    per_device_train_batch_size=4,  # Adjust as needed
    num_train_epochs=2,
    logging_steps=20,
    save_steps=20,
    fp16=True,  # Use mixed precision if supported.
    eval_strategy="steps",  # Evaluate every eval_steps
    eval_steps=20,  # Evaluate every x steps
    save_total_limit=1, # Only save one checkpoint
    load_best_model_at_end=True, # Load the best model at the end of training
    report_to="none"
)
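# (Optional, not part of the original setup.) Because eval_strategy and
# load_best_model_at_end are enabled above, early stopping is easy to add:
# pass callbacks=[EarlyStoppingCallback(early_stopping_patience=3)] to the
# Trainer below (EarlyStoppingCallback is imported from transformers).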

# Set up the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=eval_dataset_tokenized,
    data_collator=data_collator_fn,
)

# Start fine-tuning.
trainer.train()
# Save the model and tokenizer
model_path = "./qwen-finetuned-math-final"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
# Load the saved model
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "./qwen-finetuned-math-final"
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Generate and print model outputs after training
generate_and_print(sample_dataset, sample_dataset_tokenized, model, tokenizer)
import gc
import torch

# Clear CUDA cache
torch.cuda.empty_cache()
# Garbage collection
gc.collect()
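To see what the cleanup actually freed, you can optionally query PyTorch’s memory allocator. Note that empty_cache() only releases cached blocks back to the driver; tensors that are still referenced from Python (such as the loaded model) remain allocated.

print(f"Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
print(f"Reserved:  {torch.cuda.memory_reserved() / 1e9:.2f} GB")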

The Trainer class takes care of a lot of details under the hood. If you’d rather handle those details yourself, you can skip the Trainer and write the training logic directly.

from torch.utils.data import DataLoader
from tqdm import tqdm

# Define your dataloaders using the custom data_collator to pad variable-length sequences
train_dataloader = DataLoader(train_dataset_tokenized, batch_size=2, shuffle=True, collate_fn=data_collator_fn)
eval_dataloader = DataLoader(eval_dataset_tokenized, batch_size=2, shuffle=False, collate_fn=data_collator_fn)

# Define your optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training loop
num_epochs = 2
device = model.device

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    total_loss = 0

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        # Move batch tensors to the right device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} - Average training loss: {avg_loss:.4f}")

    # Evaluation loop (optional)
    model.eval()  # Set the model to evaluation mode
    eval_loss = 0
    with torch.no_grad():  # Disable gradient calculation during evaluation
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            # Move batch tensors to the right device
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch)
            loss = outputs.loss
            eval_loss += loss.item()

    avg_eval_loss = eval_loss / len(eval_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} - Average evaluation loss: {avg_eval_loss:.4f}")

print("Training complete!")