Parameter-efficient Fine-tuning (PEFT)#
from workshop_utils import display_pdf
display_pdf("Slides_part8.pdf")
LoRA#
Now let’s use LoRA to train the model. Notice how many fewer trainable parameters there are compared to a full fine-tune, and how much faster training runs.
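Before running the real thing, here is a minimal sketch of the idea behind LoRA (an illustration only, not how the peft library implements it internally): the pretrained weight matrix stays frozen, and only a low-rank update is trained, so a d-by-d layer contributes roughly 2*d*r trainable values instead of d*d.
import torch
import torch.nn as nn

class LoRALinearSketch(nn.Module):
    """Illustrative LoRA layer: frozen base weight plus a trainable low-rank update."""
    def __init__(self, base: nn.Linear, r: int = 8, alpha: int = 32):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False                                  # freeze the pretrained weights
        self.lora_A = nn.Linear(base.in_features, r, bias=False)     # down-projection (d -> r)
        self.lora_B = nn.Linear(r, base.out_features, bias=False)    # up-projection (r -> d)
        nn.init.zeros_(self.lora_B.weight)                           # the update starts at zero
        self.scaling = alpha / r

    def forward(self, x):
        # Output = frozen base layer + scaled low-rank correction
        return self.base(x) + self.scaling * self.lora_B(self.lora_A(x))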
from datasets import load_dataset
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
Trainer,
TrainingArguments,
)
import torch
# Load the dataset
ds = load_dataset("HuggingFaceH4/MATH-500")
# The dataset only ships a "test" split, so for demonstration purposes we carve a
# training and validation set out of it (fixed seed so the split is reproducible
# and matches the QLoRA section below).
train_val_dataset = ds["test"].train_test_split(test_size=0.1, seed=42)
train_dataset = train_val_dataset["train"]
eval_dataset = train_val_dataset["test"]
# Load the model and tokenizer
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
model_name,
device_map="auto"
)
# The model may not have a pad token set by default, so set it (using the EOS token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
from workshop_utils import tokenize_and_mask, tokenize_for_generation, generate_and_print, data_collator
data_collator_fn = lambda features: data_collator(features, tokenizer=tokenizer)
# Map the tokenization function over the dataset: each math problem is formatted as a
# prompt, the solution becomes the response the model should learn to generate, and the
# result is tokenized (with the prompt tokens masked out of the labels, so only the
# solution contributes to the loss). A rough sketch of such a function is shown after
# the data prep below.
# (If your dataset is large you might use batched=True; here we keep it simple.)
train_dataset_tokenized = train_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})
eval_dataset_tokenized = eval_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})
# Get a sample dataset so we can examine model generations before and after training
sample_dataset = eval_dataset.select(range(3))
sample_dataset_tokenized = sample_dataset.map(tokenize_for_generation, batched=False, fn_kwargs={"tokenizer": tokenizer})
train_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
eval_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
sample_dataset_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
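The workshop_utils helpers aren’t shown here. Purely for reference, a prompt-masking tokenizer along these lines could look like the hypothetical sketch below; it assumes the MATH-500 "problem" and "solution" fields, and the actual tokenize_and_mask implementation may differ.
# Hypothetical sketch of a prompt-masking tokenizer (not the workshop_utils code):
# prompt tokens get label -100 so that only the solution tokens contribute to the loss.
def tokenize_and_mask_sketch(example, tokenizer, max_length=1024):
    prompt = tokenizer.apply_chat_template(
        [{"role": "user", "content": example["problem"]}],
        tokenize=False,
        add_generation_prompt=True,
    )
    full_text = prompt + example["solution"] + tokenizer.eos_token
    prompt_ids = tokenizer(prompt, add_special_tokens=False)["input_ids"]
    full_enc = tokenizer(full_text, add_special_tokens=False, truncation=True, max_length=max_length)
    labels = list(full_enc["input_ids"])
    labels[: len(prompt_ids)] = [-100] * len(prompt_ids)   # mask out the prompt
    return {
        "input_ids": full_enc["input_ids"],
        "attention_mask": full_enc["attention_mask"],
        "labels": labels,
    }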
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_params = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        all_params += num_params
        if param.requires_grad:
            trainable_params += num_params
    print(f"trainable params: {trainable_params:,d} || all params: {all_params:,d} || trainable%: {100 * trainable_params / all_params:.2f}%")
print_trainable_parameters(model)
from peft import LoraConfig, get_peft_model, TaskType
# Define LoRA Config
lora_config = LoraConfig(
task_type=TaskType.CAUSAL_LM,
inference_mode=False,
r=8,
lora_alpha=32,
lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"]  # Attention query/value projections; adjust to your model's module names (see the check below)
)
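If you’re not sure which names to pass as target_modules, one quick way is to list the model’s linear projection layers before wrapping it. This is a small sketch using the base model already loaded above:
# List candidate module names for target_modules (run on the base model,
# before wrapping it with get_peft_model)
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Linear) and name.endswith("proj"):
        print(name)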
# Add LoRA adapter to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
from transformers import Trainer, TrainingArguments
# Define training arguments
training_args = TrainingArguments(
output_dir="./qwen-lora-math", # Output directory
num_train_epochs=3, # Number of training epochs
per_device_train_batch_size=8, # Batch size per device during training
gradient_accumulation_steps=1, # Number of updates steps to accumulate before performing a backward/update pass
    optim="paged_adamw_32bit",            # Paged AdamW optimizer (requires bitsandbytes: pip install bitsandbytes)
save_steps=20, # Save checkpoint every X updates steps
eval_steps=20, # Evaluate every X updates steps
eval_strategy="steps", # Evaluation strategy
save_total_limit=1, # Limit the total amount of checkpoints
    load_best_model_at_end=True,          # Reload the best checkpoint (lowest eval loss) when training finishes
logging_steps=10, # Log every X updates steps
learning_rate=1e-4, # Learning rate
weight_decay=0.001, # Weight decay
fp16=True, # Use mixed precision training
    bf16=False,                           # Set to True (and fp16=False) on hardware with bfloat16 support
max_grad_norm=0.3, # Gradient clipping max norm
    max_steps=-1,                         # If > 0: total number of training steps to perform (overrides num_train_epochs)
warmup_ratio=0.03, # Linear warmup over warmup_ratio fraction of the total number of training steps.
group_by_length=True, # Group sequences of roughly the same length together for more efficient training
lr_scheduler_type="cosine", # Learning rate scheduler type
    report_to="none",                     # Don't report metrics to external trackers (e.g. W&B, TensorBoard)
)
# Create Trainer instance
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset_tokenized,
eval_dataset=eval_dataset_tokenized,
    data_collator=data_collator_fn,  # Custom collator from workshop_utils (batches and pads the examples)
)
# Train the model
trainer.train()
# Save the LoRA adapter (save_pretrained on a PeftModel stores only the adapter weights)
output_dir = "./qwen-lora-math-final"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
# Reload the base model and attach the trained LoRA adapter
from peft import PeftModel
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", device_map="auto")
model = PeftModel.from_pretrained(model, output_dir)
# Generate and print model outputs after training
generate_and_print(sample_dataset, sample_dataset_tokenized, model, tokenizer)
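Optionally, the trained adapter can be folded back into the base weights with merge_and_unload(), which returns a plain transformers model that needs no peft dependency at inference time. The output directory below is just an example path:
# Merge the LoRA update into the base weights for standalone deployment
merged_model = model.merge_and_unload()
merged_model.save_pretrained("./qwen-lora-math-merged")   # example output path
tokenizer.save_pretrained("./qwen-lora-math-merged")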
QLoRA#
Now let’s try QLoRA. The base model’s weights are loaded in 4-bit precision, which sharply reduces its memory footprint, so you can fine-tune a larger model on the same hardware; the LoRA adapter weights themselves are still trained in higher precision. We’ll check the actual footprint right after loading the model.
from datasets import load_dataset
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
Trainer,
TrainingArguments,
)
import torch
from transformers import BitsAndBytesConfig
# Load the dataset
ds = load_dataset("HuggingFaceH4/MATH-500")
# Re-create the same train/validation split as before (same seed), so the tokenized
# datasets from the LoRA section can be reused below.
train_val_dataset = ds["test"].train_test_split(test_size=0.1, seed=42)
train_dataset = train_val_dataset["train"]
eval_dataset = train_val_dataset["test"]
# Configuration for bitsandbytes (QLoRA)
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16, # Or torch.bfloat16 if supported
)
# Load the model and tokenizer
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
model_name,
quantization_config=bnb_config,
device_map="auto"
)
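As a quick sanity check of what quantization buys us, we can ask the model for its approximate memory footprint. With 4-bit weights, a 0.5B-parameter model should come in at roughly a quarter of its fp16 size, plus some overhead for the layers kept in higher precision:
# Approximate memory used by the 4-bit quantized model (in GB)
print(f"Quantized model footprint: {model.get_memory_footprint() / 1e9:.2f} GB")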
# The model may not have a pad token set by default, so set it (using the EOS token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
# Prepare the quantized model for training: freezes the base weights and upcasts a few
# layers (e.g. layer norms) to full precision for numerical stability
model = prepare_model_for_kbit_training(model)
# Configure LoRA
lora_config = LoraConfig(
r=8,
lora_alpha=32,
lora_dropout=0.05,
bias="none",
task_type=TaskType.CAUSAL_LM,
inference_mode=False,
target_modules=["q_proj", "v_proj"] # Adjust for your model
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# Define training arguments
training_args = TrainingArguments(
output_dir="./qwen-qlora-math", # Output directory
num_train_epochs=3, # Number of training epochs
per_device_train_batch_size=8, # Batch size per device during training
gradient_accumulation_steps=1, # Number of updates steps to accumulate before performing a backward/update pass
    optim="paged_adamw_32bit",            # Paged AdamW optimizer (requires bitsandbytes: pip install bitsandbytes)
save_steps=20, # Save checkpoint every X updates steps
eval_steps=20, # Evaluate every X updates steps
eval_strategy="steps", # Evaluation strategy
save_total_limit=1, # Limit the total amount of checkpoints
    load_best_model_at_end=True,          # Reload the best checkpoint (lowest eval loss) when training finishes
logging_steps=10, # Log every X updates steps
learning_rate=1e-4, # Learning rate
weight_decay=0.001, # Weight decay
fp16=True, # Use mixed precision training
    bf16=False,                           # Set to True (and fp16=False) on hardware with bfloat16 support
max_grad_norm=0.3, # Gradient clipping max norm
    max_steps=-1,                         # If > 0: total number of training steps to perform (overrides num_train_epochs)
warmup_ratio=0.03, # Linear warmup over warmup_ratio fraction of the total number of training steps.
group_by_length=True, # Group sequences of roughly the same length together for more efficient training
lr_scheduler_type="cosine", # Learning rate scheduler type
    report_to="none",                     # Don't report metrics to external trackers (e.g. W&B, TensorBoard)
)
# Create Trainer instance
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset_tokenized,
eval_dataset=eval_dataset_tokenized,
    data_collator=data_collator_fn,  # Custom collator from workshop_utils (batches and pads the examples)
)
# Train the model
trainer.train()
# Save the QLoRA adapter (only the adapter weights are stored)
output_dir = "./qwen-qlora-math-final"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
# Reload the quantized base model and attach the trained adapter
from peft import PeftModel
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", device_map="auto", quantization_config=bnb_config)
model = PeftModel.from_pretrained(model, output_dir)
# Generate and print model outputs after training
generate_and_print(sample_dataset, sample_dataset_tokenized, model, tokenizer)
Prompt tuning#
The last PEFT method we’ll use is prompt tuning: the model’s weights stay frozen, and the only trainable parameters are a small set of continuous “virtual token” embeddings that are prepended to every prompt.
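To see why this is so cheap, note that the only trainable tensor is a [num_virtual_tokens, hidden_size] embedding matrix. A quick back-of-the-envelope check (the count reported by print_trainable_parameters below should match, or come very close):
# Expected trainable-parameter count for prompt tuning: num_virtual_tokens * hidden_size
from transformers import AutoConfig
cfg = AutoConfig.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
print(20 * cfg.hidden_size)   # 20 virtual tokens' worth of embeddings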
from peft import PromptTuningConfig, get_peft_model, TaskType
# Define Prompt Tuning Config
prompt_tuning_config = PromptTuningConfig(
task_type=TaskType.CAUSAL_LM,
    num_virtual_tokens=20,  # Number of trainable virtual tokens prepended to each input
prompt_tuning_init="TEXT",
prompt_tuning_init_text="Solve the following math problem:",
tokenizer_name_or_path=model_name,
)
# Load a fresh copy of the base model and add the prompt tuning adapter
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", device_map="auto")
model = get_peft_model(model, prompt_tuning_config)
model.print_trainable_parameters()
# Training Arguments (example - adjust as needed)
training_args = TrainingArguments(
output_dir="./qwen-prompt-tuning-math",
num_train_epochs=3,
per_device_train_batch_size=8,
gradient_accumulation_steps=1,
optim="paged_adamw_32bit",
save_steps=20,
eval_steps=20,
    eval_strategy="steps",
save_total_limit=1,
load_best_model_at_end=True,
logging_steps=10,
    learning_rate=5e-3,  # Prompt tuning typically uses a higher learning rate than LoRA
weight_decay=0.001,
fp16=True,
bf16=False,
max_grad_norm=0.3,
max_steps=-1,
warmup_ratio=0.03,
group_by_length=True,
lr_scheduler_type="cosine",
report_to="none",
)
# Trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset_tokenized,
eval_dataset=eval_dataset_tokenized,
data_collator=data_collator_fn,
)
trainer.train()
# Save the prompt-tuning adapter
output_dir = "./qwen-prompt-tuning-math-final"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
# Reload the base model and attach the trained prompt-tuning adapter
from peft import PeftModel
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", device_map="auto")
model = PeftModel.from_pretrained(model, output_dir)
# Generate and print model outputs after training
generate_and_print(sample_dataset, sample_dataset_tokenized, model, tokenizer)
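A convenient property of adapter-based PEFT is that one frozen base model can host several adapters. As a closing sketch (assuming the LoRA and QLoRA adapter directories saved above exist), you can attach both and switch between them by name:
# Attach multiple adapters to one base model and select the active one by name
from peft import PeftModel
from transformers import AutoModelForCausalLM
base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", device_map="auto")
multi = PeftModel.from_pretrained(base, "./qwen-lora-math-final", adapter_name="lora")
multi.load_adapter("./qwen-qlora-math-final", adapter_name="qlora")
multi.set_adapter("lora")   # make the LoRA adapter the active one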