Efficiency and using multiple GPUs
from workshop_utils import display_pdf
display_pdf("Slides_part10.pdf")
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
import os
# Check for CUDA availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# If multiple GPUs are available, get the count
if device == "cuda":
num_gpus = torch.cuda.device_count()
print(f"Number of GPUs available: {num_gpus}")
else:
num_gpus = 0
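# A quick look at each visible GPU helps when deciding how a model will be sharded.
# This is a minimal sketch using standard torch.cuda calls; the reported memory
# figures depend entirely on your hardware.
for i in range(num_gpus):
    props = torch.cuda.get_device_properties(i)
    print(f"GPU {i}: {props.name}, {props.total_memory / 1e9:.1f} GB total memory")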
import sys
class FilteredStream:
    def __init__(self, stream, filter_str):
        self.stream = stream
        self.filter_str = filter_str

    def write(self, data):
        # Only write data if it does not contain the unwanted substring.
        if self.filter_str not in data:
            self.stream.write(data)

    def flush(self):
        self.stream.flush()
# Replace sys.stderr with our filtered stream.
sys.stderr = FilteredStream(sys.stderr, "cannot find -laio")
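# The wrapper keeps a handle to the original stream, so the filter can be undone
# later if needed (a small convenience, not required for training):
# sys.stderr = sys.stderr.stream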
# Load the dataset (MATH-500 ships only a "test" split, so we carve a validation set out of it)
ds = load_dataset("HuggingFaceH4/MATH-500")
train_val_dataset = ds["test"].train_test_split(test_size=0.1)
train_dataset = train_val_dataset["train"]
eval_dataset = train_val_dataset["test"]
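# Sanity-check the split sizes and peek at one example. The field name "problem"
# is assumed here from the MATH-500 schema; adjust the key if it differs.
print(f"Train examples: {len(train_dataset)}, eval examples: {len(eval_dataset)}")
print(train_dataset[0]["problem"][:200])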
# Load the model and tokenizer
model_name = "Qwen/Qwen2.5-0.5B-Instruct" # "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B" # "Qwen/Qwen2.5-32B-Instruct" #
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto"  # Spread layers across available GPUs; use {'': 0} to pin everything to GPU 0
)
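# With device_map="auto", each layer is assigned to a device by Accelerate.
# Inspecting the resulting map shows how the model was sharded across the
# available GPUs (transformers sets this attribute whenever a device map is used).
print(model.hf_device_map)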
# The model may not have a pad token set by default, so set it (using the EOS token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
from workshop_utils import tokenize_and_mask, tokenize_for_generation, generate_and_print, data_collator
data_collator_fn = lambda features: data_collator(features, tokenizer=tokenizer)
# Map the formatting function over the dataset.
train_dataset_tokenized = train_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})
eval_dataset_tokenized = eval_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})
# Get a sample dataset so we can examine model generations before and after training
sample_dataset = eval_dataset.select(range(3))
sample_dataset_tokenized = sample_dataset.map(tokenize_for_generation, batched=False, fn_kwargs={"tokenizer": tokenizer})
train_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
eval_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
sample_dataset_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
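# Decode one tokenized training example to confirm that the prompt/response
# formatting produced by tokenize_and_mask looks as expected (a quick sanity
# check, nothing more).
print(tokenizer.decode(train_dataset_tokenized[0]["input_ids"]))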
# Define LoRA Config
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,                                  # Rank of the low-rank update matrices
    lora_alpha=32,                        # Scaling factor applied to the LoRA update
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"]   # Attention projections to adapt; replace with the target modules of your model
)
# Add LoRA adapter to the model
model = get_peft_model(model, lora_config)
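# With the LoRA adapter attached, only a small fraction of the weights is
# trainable. PEFT models expose a helper that reports this directly.
model.print_trainable_parameters()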
training_args = TrainingArguments(
    output_dir="./qwen-lora-math-dp",     # Output directory
    num_train_epochs=1,                   # Number of training epochs
    per_device_train_batch_size=1,        # Batch size per device
    gradient_accumulation_steps=1,        # Number of update steps to accumulate before performing a backward/update pass
    optim="paged_adamw_32bit",            # Paged AdamW optimizer (requires bitsandbytes: pip install bitsandbytes)
    save_steps=20,                        # Save a checkpoint every X update steps
    eval_steps=20,                        # Evaluate every X update steps
    eval_strategy="steps",                # Evaluation strategy
    save_total_limit=1,                   # Limit the total number of checkpoints kept on disk
    load_best_model_at_end=True,          # Load the best checkpoint when training finishes
    logging_steps=10,                     # Log every X update steps
    learning_rate=1e-4,                   # Learning rate
    weight_decay=0.001,                   # Weight decay
    fp16=True,                            # Use fp16 mixed-precision training
    bf16=False,                           # Prefer bf16=True (and fp16=False) on GPUs with bfloat16 support, e.g. Ampere or newer
    max_grad_norm=0.3,                    # Gradient clipping max norm
    max_steps=-1,                         # If > 0, sets the total number of training steps and overrides num_train_epochs
    warmup_ratio=0.03,                    # Linear warmup over this fraction of the total training steps
    group_by_length=True,                 # Group sequences of similar length for more efficient batching
    lr_scheduler_type="cosine",           # Learning rate scheduler type
    report_to="none",                     # Disable reporting to external trackers (e.g. W&B, TensorBoard)
)
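# Rough effective batch size: with device_map="auto" the model is sharded across
# GPUs (model parallelism), so a single process feeds all batches. If you instead
# launch one process per GPU (e.g. with torchrun) for data parallelism, multiply
# by the number of processes. This is only an illustrative calculation.
effective_batch = training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
print(f"Effective batch size per optimizer step (single process): {effective_batch}")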
# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=eval_dataset_tokenized,
    data_collator=data_collator_fn,       # Custom collator from workshop_utils
)
# Train the model
trainer.train()
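# After training it is common to save just the LoRA adapter weights and to compare
# generations on the held-out samples. save_pretrained on a PEFT model stores only
# the adapter; the directory name below is just an example. generate_and_print comes
# from workshop_utils, so its exact signature is assumed here.
model.save_pretrained("./qwen-lora-math-dp/adapter")
# generate_and_print(model, tokenizer, sample_dataset_tokenized)  # signature assumed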