Efficiency and using multiple GPUs

from workshop_utils import display_pdf

display_pdf("Slides_part10.pdf")
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
import os

# Check for CUDA availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# If multiple GPUs are available, get the count
if device == "cuda":
    num_gpus = torch.cuda.device_count()
    print(f"Number of GPUs available: {num_gpus}")
else:
    num_gpus = 0
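
As a quick sanity check, it can help to list each visible GPU and its memory before deciding how to shard or parallelize the model. This is a minimal sketch using only standard torch.cuda calls:

# Optional: print the name and total memory of each visible GPU
for i in range(num_gpus):
    props = torch.cuda.get_device_properties(i)
    print(f"GPU {i}: {props.name}, {props.total_memory / 1024**3:.1f} GiB")
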
import sys

class FilteredStream:
    def __init__(self, stream, filter_str):
        self.stream = stream
        self.filter_str = filter_str

    def write(self, data):
        # Only write data if it does not contain the unwanted substring.
        if self.filter_str not in data:
            self.stream.write(data)

    def flush(self):
        self.stream.flush()

# Replace sys.stderr with our filtered stream. This suppresses the harmless
# "cannot find -laio" linker warning (typically emitted when DeepSpeed probes for libaio).
sys.stderr = FilteredStream(sys.stderr, "cannot find -laio")
# Load the dataset (MATH-500 ships only a "test" split, so we carve a validation set out of it)
ds = load_dataset("HuggingFaceH4/MATH-500")
train_val_dataset = ds["test"].train_test_split(test_size=0.1)
train_dataset = train_val_dataset["train"]
eval_dataset = train_val_dataset["test"]

# Load the model and tokenizer
model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # alternatives: "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", "Qwen/Qwen2.5-32B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto"  # alternatively {'': 0} to place the entire model on GPU 0
)
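
With device_map="auto", Accelerate decides where each submodule is placed; the resulting layout is exposed on the model as hf_device_map. An optional check (the attribute is only set when the model was loaded with a device_map):

# Optional: inspect how the model was sharded across devices
if hasattr(model, "hf_device_map"):
    print(model.hf_device_map)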

# The model may not have a pad token set by default, so set it (using the EOS token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

from workshop_utils import tokenize_and_mask, tokenize_for_generation, generate_and_print, data_collator

data_collator_fn = lambda features: data_collator(features, tokenizer=tokenizer)

# Map the formatting function over the dataset.
train_dataset_tokenized = train_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})
eval_dataset_tokenized = eval_dataset.map(tokenize_and_mask, batched=False, fn_kwargs={"tokenizer": tokenizer})

# Get a sample dataset so we can examine model generations before and after training
sample_dataset = eval_dataset.select(range(3))
sample_dataset_tokenized = sample_dataset.map(tokenize_for_generation, batched=False, fn_kwargs={"tokenizer": tokenizer})

train_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
eval_dataset_tokenized.set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])
sample_dataset_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask"])
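
Before training, it is worth decoding one tokenized example to confirm that prompt tokens are excluded from the loss. This sketch assumes tokenize_and_mask follows the standard convention of setting masked label positions to -100:

# Optional: decode one training example and count loss-contributing tokens
example = train_dataset_tokenized[0]
print(tokenizer.decode(example["input_ids"]))
num_target_tokens = (example["labels"] != -100).sum().item()
print(f"Tokens contributing to the loss: {num_target_tokens}")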

from peft import LoraConfig, get_peft_model, TaskType

# Define LoRA Config
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, 
    inference_mode=False, 
    r=8, 
    lora_alpha=32, 
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"] # Attention query/value projections; adjust to match your model's module names
)

# Add LoRA adapter to the model
model = get_peft_model(model, lora_config)
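
PEFT models expose print_trainable_parameters(), which reports how small the set of LoRA-trained parameters is relative to the full model:

# Show the fraction of parameters that LoRA makes trainable
model.print_trainable_parameters()
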
training_args = TrainingArguments(
    output_dir="./qwen-lora-math-dp", # Output directory
    num_train_epochs=1,              # Number of training epochs
    per_device_train_batch_size=1,   # Batch size per GPU
    gradient_accumulation_steps=1,   # Number of update steps to accumulate before performing a backward/update pass
    optim="paged_adamw_32bit",       # Paged AdamW optimizer (requires bitsandbytes; the Trainer itself needs accelerate)
    save_steps=20,                   # Save a checkpoint every X update steps
    eval_steps=20,                   # Evaluate every X update steps
    eval_strategy="steps",           # Evaluation strategy
    save_total_limit=1,              # Limit the total number of checkpoints
    load_best_model_at_end=True,     # Load the best model when training finishes (default is False)
    logging_steps=10,                # Log every X update steps
    learning_rate=1e-4,              # Learning rate
    weight_decay=0.001,              # Weight decay
    fp16=True,                       # Use mixed-precision (float16) training
    bf16=False,                      # Use bfloat16 training
    max_grad_norm=0.3,               # Gradient clipping max norm
    max_steps=-1,                    # If > 0: total number of training steps to perform (overrides num_train_epochs)
    warmup_ratio=0.03,               # Linear warmup over this fraction of the total number of training steps
    group_by_length=True,            # Group sequences of similar length together for more efficient batching
    lr_scheduler_type="cosine",      # Learning rate scheduler type
    report_to="none",                # Disable reporting to external experiment trackers
)
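
Under the Trainer's data parallelism, the effective batch size is per_device_train_batch_size × number of GPUs × gradient_accumulation_steps. A small sketch to make that explicit for the settings above (it assumes all visible GPUs are used):

# Effective batch size under data parallelism
effective_batch_size = (
    training_args.per_device_train_batch_size
    * max(num_gpus, 1)
    * training_args.gradient_accumulation_steps
)
print(f"Effective batch size: {effective_batch_size}")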

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=eval_dataset_tokenized,
    data_collator=data_collator_fn,     # Collator from workshop_utils that batches and pads the tokenized examples
)

# Train the model
trainer.train()
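
After training, you would typically run a final evaluation and save just the LoRA adapter weights, which are only a few megabytes rather than the full model. A minimal sketch; the output path is chosen here as an example:

# Final evaluation and saving of the LoRA adapter only
metrics = trainer.evaluate()
print(metrics)
model.save_pretrained("./qwen-lora-math-dp/adapter")  # example path; saves only the adapter weights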