Parameter-efficient Fine-tuning (PEFT)#
from workshop_utils import display_pdf
display_pdf("Slides_part8.pdf")
LoRA#
Now let’s use LoRA to train the model. Notice how many fewer trainable parameters there are compared to a full fine-tune, and how much faster training runs.
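Before running the real thing, here is a minimal sketch of the idea behind LoRA (an illustration only, not how the peft library implements it internally): the pretrained weight matrix stays frozen, and only a low-rank update is trained, so a d-by-d layer contributes roughly 2*d*r trainable values instead of d*d.
import torch
import torch.nn as nn

class LoRALinearSketch(nn.Module):
    """Illustrative LoRA layer: frozen base weight plus a trainable low-rank update."""
    def __init__(self, base: nn.Linear, r: int = 8, alpha: int = 32):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False                                  # freeze the pretrained weights
        self.lora_A = nn.Linear(base.in_features, r, bias=False)     # down-projection (d -> r)
        self.lora_B = nn.Linear(r, base.out_features, bias=False)    # up-projection (r -> d)
        nn.init.zeros_(self.lora_B.weight)                           # the update starts at zero
        self.scaling = alpha / r

    def forward(self, x):
        # Output = frozen base layer + scaled low-rank correction
        return self.base(x) + self.scaling * self.lora_B(self.lora_A(x))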
from datasets import load_dataset
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
Trainer,
TrainingArguments,
)
import torch
# Load the dataset
ds = load_dataset("HuggingFaceH4/MATH-500")
# The dataset only ships a "test" split, so for demonstration purposes we carve a
# training and validation set out of it (fixed seed so the split is reproducible
# and matches the QLoRA section below).
train_val_dataset = ds["test"].train_test_split(test_size=0.1, seed=42)
train_dataset = train_val_dataset["train"]
eval_dataset = train_val_dataset["test"]
# Load the model and tokenizer
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
model_name,
device_map="auto"
)
# The model may not have a pad token set by default, so set it (using the EOS token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
from workshop_utils import tokenize_and_mask, tokenize_for_generation, generate_and_print, data_collator
data_collator_fn = lambda features: data_collator(features, tokenizer=tokenizer)
# Map the tokenization function over the dataset: each math problem is formatted as a
# prompt, the solution becomes the response the model should learn to generate, and the
# result is tokenized (with the prompt tokens masked out of the labels, so only the
# solution contributes to the loss). A rough sketch of such a function is shown after
# the data prep below.
# (If your dataset is large you might use batched=True; here we keep it simple.)
train_dataset_tokenized = train_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})
eval_dataset_tokenized = eval_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})
# Get a sample dataset so we can examine model generations before and after training
sample_dataset = eval_dataset.select(range(3))
sample_dataset_tokenized = sample_dataset.map(tokenize_for_generation, batched=False, fn_kwargs={"tokenizer": tokenizer})
train_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
eval_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
sample_dataset_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
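The workshop_utils helpers aren’t shown here. Purely for reference, a prompt-masking tokenizer along these lines could look like the hypothetical sketch below; it assumes the MATH-500 "problem" and "solution" fields, and the actual tokenize_and_mask implementation may differ.
# Hypothetical sketch of a prompt-masking tokenizer (not the workshop_utils code):
# prompt tokens get label -100 so that only the solution tokens contribute to the loss.
def tokenize_and_mask_sketch(example, tokenizer, max_length=1024):
    prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": example["problem"]}],
        tokenize=False,
        add_generation_prompt=True,
    )
    full_text = prompt + example["solution"] + tokenizer.eos_token
    prompt_ids = tokenizer(prompt, add_special_tokens=False)["input_ids"]
    full_enc = tokenizer(full_text, add_special_tokens=False, truncation=True, max_length=max_length)
    labels = list(full_enc["input_ids"])
    labels[: len(prompt_ids)] = [-100] * len(prompt_ids)   # mask out the prompt
    return {
        "input_ids": full_enc["input_ids"],
        "attention_mask": full_enc["attention_mask"],
        "labels": labels,
    }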
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_params = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        all_params += num_params
        if param.requires_grad:
            trainable_params += num_params
    print(f"trainable params: {trainable_params:,d} || all params: {all_params:,d} || trainable%: {100 * trainable_params / all_params:.2f}%")
print_trainable_parameters(model)
from peft import LoraConfig, get_peft_model, TaskType
# Define LoRA Config
lora_config = LoraConfig(
task_type=TaskType.CAUSAL_LM,
inference_mode=False,
r=8,
lora_alpha=32,
lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"]  # Attention query/value projections; adjust to your model's module names (see the check below)
)
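If you’re not sure which names to pass as target_modules, one quick way is to list the model’s linear projection layers before wrapping it. This is a small sketch using the base model already loaded above:
# List candidate module names for target_modules (run on the base model,
# before wrapping it with get_peft_model)
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Linear) and name.endswith("proj"):
        print(name)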
# Add LoRA adapter to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
from transformers import Trainer, TrainingArguments
# Define training arguments
training_args = TrainingArguments(
output_dir="./qwen-lora-math", # Output directory
num_train_epochs=3, # Number of training epochs
per_device_train_batch_size=8, # Batch size per device during training
gradient_accumulation_steps=1, # Number of updates steps to accumulate before performing a backward/update pass
    optim="paged_adamw_32bit",            # Paged AdamW optimizer (requires bitsandbytes: pip install bitsandbytes)
save_steps=20, # Save checkpoint every X updates steps
eval_steps=20, # Evaluate every X updates steps
eval_strategy="steps", # Evaluation strategy
save_total_limit=1, # Limit the total amount of checkpoints
    load_best_model_at_end=True,          # Reload the best checkpoint (lowest eval loss) when training finishes
logging_steps=10, # Log every X updates steps
learning_rate=1e-4, # Learning rate
weight_decay=0.001, # Weight decay
fp16=True, # Use mixed precision training
    bf16=False,                           # Set to True (and fp16=False) on hardware with bfloat16 support
max_grad_norm=0.3, # Gradient clipping max norm
    max_steps=-1,                         # If > 0: total number of training steps to perform (overrides num_train_epochs)
warmup_ratio=0.03, # Linear warmup over warmup_ratio fraction of the total number of training steps.
group_by_length=True, # Group sequences of roughly the same length together for more efficient training
lr_scheduler_type="cosine", # Learning rate scheduler type
    report_to="none",                     # Don't report metrics to external trackers (e.g. W&B, TensorBoard)
)
# Create Trainer instance
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset_tokenized,
eval_dataset=eval_dataset_tokenized,
    data_collator=data_collator_fn,  # Custom collator from workshop_utils (batches and pads the examples)
)
# Train the model
trainer.train()
# Save the LoRA adapter (save_pretrained on a PeftModel stores only the adapter weights)
output_dir = "./qwen-lora-math-final"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
# Reload the base model and attach the trained LoRA adapter
from peft import PeftModel
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", device_map="auto")
model = PeftModel.from_pretrained(model, output_dir)
# Generate and print model outputs after training
generate_and_print(sample_dataset, sample_dataset_tokenized, model, tokenizer)
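Optionally, the trained adapter can be folded back into the base weights with merge_and_unload(), which returns a plain transformers model that needs no peft dependency at inference time. The output directory below is just an example path:
# Merge the LoRA update into the base weights for standalone deployment
merged_model = model.merge_and_unload()
merged_model.save_pretrained("./qwen-lora-math-merged")   # example output path
tokenizer.save_pretrained("./qwen-lora-math-merged")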
QLoRA#
Now let’s try QLoRA. The base model’s weights are loaded in 4-bit precision, which sharply reduces its memory footprint, so you can fine-tune a larger model on the same hardware; the LoRA adapter weights themselves are still trained in higher precision. We’ll check the actual footprint right after loading the model.
from datasets import load_dataset
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
Trainer,
TrainingArguments,
)
import torch
from transformers import BitsAndBytesConfig
# Load the dataset
ds = load_dataset("HuggingFaceH4/MATH-500")
# Re-create the same train/validation split as before (same seed), so the tokenized
# datasets from the LoRA section can be reused below.
train_val_dataset = ds["test"].train_test_split(test_size=0.1, seed=42)
train_dataset = train_val_dataset["train"]
eval_dataset = train_val_dataset["test"]
# Configuration for bitsandbytes (QLoRA)
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16, # Or torch.bfloat16 if supported
)
# Load the model and tokenizer
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
model_name,
quantization_config=bnb_config,
device_map="auto"
)
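As a quick sanity check of what quantization buys us, we can ask the model for its approximate memory footprint. With 4-bit weights, a 0.5B-parameter model should come in at roughly a quarter of its fp16 size, plus some overhead for the layers kept in higher precision:
# Approximate memory used by the 4-bit quantized model (in GB)
print(f"Quantized model footprint: {model.get_memory_footprint() / 1e9:.2f} GB")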
# The model may not have a pad token set by default, so set it (using the EOS token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
# Prepare the quantized model for training: freezes the base weights and upcasts a few
# layers (e.g. layer norms) to full precision for numerical stability
model = prepare_model_for_kbit_training(model)
# Configure LoRA
lora_config = LoraConfig(
r=8,
lora_alpha=32,
lora_dropout=0.05,
bias="none",
task_type=TaskType.CAUSAL_LM,
inference_mode=False,
target_modules=["q_proj", "v_proj"] # Adjust for your model
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# Define training arguments
training_args = TrainingArguments(
output_dir="./qwen-qlora-math", # Output directory
num_train_epochs=3, # Number of training epochs
per_device_train_batch_size=8, # Batch size per device during training
gradient_accumulation_steps=1, # Number of updates steps to accumulate before performing a backward/update pass
    optim="paged_adamw_32bit",            # Paged AdamW optimizer (requires bitsandbytes: pip install bitsandbytes)
save_steps=20, # Save checkpoint every X updates steps
eval_steps=20, # Evaluate every X updates steps
eval_strategy="steps", # Evaluation strategy
save_total_limit=1, # Limit the total amount of checkpoints
    load_best_model_at_end=True,          # Reload the best checkpoint (lowest eval loss) when training finishes
logging_steps=10, # Log every X updates steps
learning_rate=1e-4, # Learning rate
weight_decay=0.001, # Weight decay
fp16=True, # Use mixed precision training
    bf16=False,                           # Set to True (and fp16=False) on hardware with bfloat16 support
max_grad_norm=0.3, # Gradient clipping max norm
    max_steps=-1,                         # If > 0: total number of training steps to perform (overrides num_train_epochs)
warmup_ratio=0.03, # Linear warmup over warmup_ratio fraction of the total number of training steps.
group_by_length=True, # Group sequences of roughly the same length together for more efficient training
lr_scheduler_type="cosine", # Learning rate scheduler type
    report_to="none",                     # Don't report metrics to external trackers (e.g. W&B, TensorBoard)
)
# Create Trainer instance
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset_tokenized,
eval_dataset=eval_dataset_tokenized,
    data_collator=data_collator_fn,  # Custom collator from workshop_utils (batches and pads the examples)
)
# Train the model
trainer.train()
# Save the QLoRA adapter (only the adapter weights are stored)
output_dir = "./qwen-qlora-math-final"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
# Reload the quantized base model and attach the trained adapter
from peft import PeftModel
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", device_map="auto", quantization_config=bnb_config)
model = PeftModel.from_pretrained(model, output_dir)
# Generate and print model outputs after training
generate_and_print(sample_dataset, sample_dataset_tokenized, model, tokenizer)
Prompt tuning#
The last PEFT method we’ll use is prompt tuning: the model’s weights stay frozen, and the only trainable parameters are a small set of continuous “virtual token” embeddings that are prepended to every prompt.
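To see why this is so cheap, note that the only trainable tensor is a [num_virtual_tokens, hidden_size] embedding matrix. A quick back-of-the-envelope check (the count reported by print_trainable_parameters below should match, or come very close):
# Expected trainable-parameter count for prompt tuning: num_virtual_tokens * hidden_size
from transformers import AutoConfig
cfg = AutoConfig.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
print(20 * cfg.hidden_size)   # 20 virtual tokens' worth of embeddings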
from peft import PromptTuningConfig, get_peft_model, TaskType
# Define Prompt Tuning Config
prompt_tuning_config = PromptTuningConfig(
task_type=TaskType.CAUSAL_LM,
    num_virtual_tokens=20,  # Number of trainable virtual tokens prepended to each input
prompt_tuning_init="TEXT",
prompt_tuning_init_text="Solve the following math problem:",
tokenizer_name_or_path=model_name,
)
# Load a fresh copy of the base model and add the prompt tuning adapter
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", device_map="auto")
model = get_peft_model(model, prompt_tuning_config)
model.print_trainable_parameters()
# Training Arguments (example - adjust as needed)
training_args = TrainingArguments(
output_dir="./qwen-prompt-tuning-math",
num_train_epochs=3,
per_device_train_batch_size=8,
gradient_accumulation_steps=1,
optim="paged_adamw_32bit",
save_steps=20,
eval_steps=20,
    eval_strategy="steps",
save_total_limit=1,
load_best_model_at_end=True,
logging_steps=10,
    learning_rate=5e-3,  # Prompt tuning typically uses a higher learning rate than LoRA
weight_decay=0.001,
fp16=True,
bf16=False,
max_grad_norm=0.3,
max_steps=-1,
warmup_ratio=0.03,
group_by_length=True,
lr_scheduler_type="cosine",
report_to="none",
)
# Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset_tokenized,
eval_dataset=eval_dataset_tokenized,
data_collator=data_collator_fn,
)
trainer.train()
# Save the prompt-tuning adapter
output_dir = "./qwen-prompt-tuning-math-final"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
# Reload the base model and attach the trained prompt-tuning adapter
from peft import PeftModel
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", device_map="auto")
model = PeftModel.from_pretrained(model, output_dir)
# Generate and print model outputs after training
generate_and_print(sample_dataset, sample_dataset_tokenized, model, tokenizer)
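A convenient property of adapter-based PEFT is that one frozen base model can host several adapters. As a closing sketch (assuming the LoRA and QLoRA adapter directories saved above exist), you can attach both and switch between them by name:
# Attach multiple adapters to one base model and select the active one by name
from peft import PeftModel
from transformers import AutoModelForCausalLM
base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", device_map="auto")
multi = PeftModel.from_pretrained(base, "./qwen-lora-math-final", adapter_name="lora")
multi.load_adapter("./qwen-qlora-math-final", adapter_name="qlora")
multi.set_adapter("lora")   # make the LoRA adapter the active one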