Alternatives to fine-tuning#

from workshop_utils import display_pdf

display_pdf("Slides_part1.pdf")

Prompt engineering#

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Set random seeds for reproducibility
from transformers import set_seed
SEED = 355
torch.manual_seed(SEED)
set_seed(SEED)

model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# Load the tokenizer and model, placing the model on the available device(s)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

input_text = "1. If I have 23 apples and I give 7 to my friend and sell the rest for $1.33 each, how much money do I have?"
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(model.device)
output = model.generate(input_ids, max_new_tokens=200, num_return_sequences=5, pad_token_id=tokenizer.eos_token_id, do_sample=True)

for i, sample_output in enumerate(output):
    print(f"########\nOUTPUT {i}:\n {tokenizer.decode(sample_output[len(input_ids[0]):], skip_special_tokens=True)}\n")

First, let’s make sure we’re using the appropriate chat template for this model.
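
As a quick sanity check, we can render the chat template as plain text (passing tokenize=False returns the formatted string) to see the special tokens the model expects around the system and user turns. The preview_messages name below is just for illustration:

# Render the chat template without tokenizing to inspect the formatted prompt
preview_messages = [{"role": "system", "content": "Answer the user's question."},
                    {"role": "user", "content": input_text}]
print(tokenizer.apply_chat_template(preview_messages, add_generation_prompt=True, tokenize=False))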

torch.manual_seed(SEED)
set_seed(SEED)

input_messages = [{"role":"system", "content":"Answer the user's question."},
                  {"role":"user", "content":input_text}]
input_ids = tokenizer.apply_chat_template(input_messages, add_generation_prompt=True,return_tensors="pt").to(model.device)
output = model.generate(input_ids, max_new_tokens=200, num_return_sequences=5, pad_token_id=tokenizer.eos_token_id, do_sample=True)

for i, sample_output in enumerate(output):
    print(f"########\nOUTPUT {i}:\n {tokenizer.decode(sample_output[len(input_ids[0]):], skip_special_tokens=True)}\n")

Now, let’s make our prompt more explicit, detailed, and clear.

torch.manual_seed(SEED)
set_seed(SEED)

input_messages = [{"role":"system", "content":"Answer the user's question. Think step-by-step and double-check your arithmetic. Enclose your final answer in delimiters like `<answer>{your answer here}</answer>`."},
                  {"role":"user", "content":input_text}]
input_ids = tokenizer.apply_chat_template(input_messages, add_generation_prompt=True,return_tensors="pt").to(model.device)
output = model.generate(input_ids, max_new_tokens=300, num_return_sequences=5, pad_token_id=tokenizer.eos_token_id, do_sample=True)

for i, sample_output in enumerate(output):
    print(f"########\nOUTPUT {i}:\n {tokenizer.decode(sample_output[len(input_ids[0]):], skip_special_tokens=True)}\n")
display_pdf("Slides_part2.pdf")

Few-shot learning#

# Examples for FSL
example_1_question = "If you have 41 pizzas and each pizza has 8 slices, and you sell the slices for 2.15 each, how much money do you make?"
example_1_answer = """Let's solve this step by step, breaking down arithmetic steps to make them easier:

Calculate the total number of slices: 41 * 8 = (40 * 8) + (1 * 8) = 320 + 8 = 328 slices
Calculate the total money made: 328 * 2.15 = (300 * 2.15) + (20 * 2.15) + (8 * 2.15) = ((300 * 2) + (300 * 0.15)) + ((20 * 2) + (20 * 0.15)) + ((8 * 2) + (8 * 0.15)) = (600 + 45) + (40 + 3) + (16 + 1.2) = 645 + 43 + 17.2 = 705.2
<answer>705.2</answer>"""
example_2_question = "If Sandy has 61 acorns, gives 7 to Bob, and then sells the remaining acorns for $1.20 per acorn, how much money does she make?"
example_2_answer = """Let's solve this step by step, breaking down arithmetic steps to make them easier:

Calculate the acorns left after giving some to Bob: 61 - 7 = 54 acorns
Calculate the total money made: 54 * 1.20 = (50 * 1.20) + (4 * 1.20) = ((50 * 1) + (50 * 0.20)) + ((4 * 1) + (4 * 0.20)) = (50 + 10) + (4 + 0.8) = 60 + 4.8 = 64.8
<answer>64.8</answer>"""
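
These examples are shown to the model verbatim, so it is worth double-checking their arithmetic before relying on them; a quick verification in plain Python (rounded to two decimals to sidestep floating-point noise):

# Sanity-check the few-shot answers
print(round(41 * 8 * 2.15, 2))     # example 1: 328 slices at $2.15 -> 705.2
print(round((61 - 7) * 1.20, 2))   # example 2: 54 acorns at $1.20 -> 64.8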
torch.manual_seed(SEED)
set_seed(SEED)

input_messages = [{"role":"system", "content":"Answer the user's question. Think step-by-step and double-check your arithmetic. Enclose your final answer in delimiters like `<answer>{your answer here}</answer>`."},
                    {"role":"user", "content":example_1_question},
                    {"role":"assistant", "content":example_1_answer},
                    {"role":"user", "content":example_2_question},
                    {"role":"assistant", "content":example_2_answer},
                    {"role":"user", "content":input_text}]
input_ids = tokenizer.apply_chat_template(input_messages, add_generation_prompt=True,return_tensors="pt").to(model.device)
output = model.generate(input_ids, max_new_tokens=200, num_return_sequences=5, pad_token_id=tokenizer.eos_token_id, do_sample=True)

for i, sample_output in enumerate(output):
    print(f"########\nOUTPUT {i}:\n {tokenizer.decode(sample_output[len(input_ids[0]):], skip_special_tokens=True)}\n")
from workshop_utils import display_pdf

display_pdf("Slides_part3.pdf")

RAG example#

We’ll build a simple RAG system that answers questions by consulting the Canadian tax code.

from langchain_community.document_loaders import UnstructuredXMLLoader

# This file contains the Canadian tax code.
loader = UnstructuredXMLLoader(
    "./can_tax.xml",
)

docs = loader.load()
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=8192, chunk_overlap=1024)

chunked_docs = splitter.split_documents(docs)

Let’s look at an example of a chunk from our database of the tax code.

print(F"NUMBER OF CHUNKS: {len(chunked_docs)}\n\n")
print(chunked_docs[1].page_content)

Now let’s use an embedding model to create vector representations of each chunk of the tax code.

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Load a sentence-embedding model for encoding the chunks
model_name = "nomic-ai/modernbert-embed-base"
embeddings = HuggingFaceEmbeddings(model_name=model_name)
embeddings.client.to('cuda')  # Explicitly move the underlying SentenceTransformer to GPU

# Create a FAISS vectorstore from the chunked documents using these embeddings
vectorstore = FAISS.from_documents(chunked_docs, embeddings)
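
Embedding every chunk takes a little time, so it can be worth persisting the index and reloading it in later sessions. A sketch using FAISS’s save_local/load_local; the folder name is arbitrary, and depending on your LangChain version load_local may require allow_dangerous_deserialization=True:

# Persist the index so the tax code doesn't need to be re-embedded every session
vectorstore.save_local("tax_code_index")
# Later, reload it with the same embedding model:
# vectorstore = FAISS.load_local("tax_code_index", embeddings, allow_dangerous_deserialization=True)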

Let’s use the embedding model to get a vector representation of a sample query, and then let’s see what the two most relevant chunks of the tax code are to that query.

# Perform a similarity search on a query 
query = "I'm a schoolteacher, can I deduct expenses for my classroom?"
results = vectorstore.similarity_search(query, k=2)  # Get top 2 similar documents

# Print the results
for i,result in enumerate(results):
    print(f"RESULT {i}:\n{result.page_content}\n\n")
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={'k': 2}
)

Now let’s connect the retriever to a generative LLM, so that the model can answer a user’s query while consulting the most relevant chunks of the Canadian tax code.

system_prompt = "Give the below context from the Canadian tax code, answer the user's question. Make sure that your answer is based solely in the context provided below.\n\n## Context:\n{context}"
user_prompt = "I'm a schoolteacher, can I deduct expenses for my classroom?"
context_docs = '/n'.join([doc.page_content for doc in retriever.get_relevant_documents(query)])

prompt_template = [{"role":"system", "content":system_prompt.format(context=context_docs)}, {"role":"user", "content":user_prompt}]

input_ids = tokenizer.apply_chat_template(prompt_template, add_generation_prompt=True,return_tensors="pt").to(model.device)
output = model.generate(input_ids, max_new_tokens=200, num_return_sequences=5, pad_token_id=tokenizer.eos_token_id, do_sample=True)

for i, sample_output in enumerate(output):
    print(f"########\nOUTPUT {i}:\n {tokenizer.decode(sample_output[len(input_ids[0]):], skip_special_tokens=True)}\n")
display_pdf("Slides_part4.pdf")