Alternatives to fine-tuning#
from workshop_utils import display_pdf
display_pdf("Slides_part1.pdf")
Prompt engineering#
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Set random seeds for reproducibility
from transformers import set_seed
SEED = 355
torch.manual_seed(SEED)
set_seed(SEED)
model = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelForCausalLM.from_pretrained(model, device_map="auto")
input_text = "1. If I have 23 apples and I give 7 to my friend and sell the rest for $1.33 each, how much money do I have?"
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(model.device)
output = model.generate(input_ids, max_new_tokens=200, num_return_sequences=5, pad_token_id=tokenizer.eos_token_id, do_sample=True)
for i, sample_output in enumerate(output):
    print(f"########\nOUTPUT {i}:\n {tokenizer.decode(sample_output[len(input_ids[0]):], skip_special_tokens=True)}\n")
First, let’s make sure we’re using the appropriate chat template for this model.
torch.manual_seed(SEED)
set_seed(SEED)
input_messages = [{"role":"system", "content":"Answer the user's question."},
{"role":"user", "content":input_text}]
input_ids = tokenizer.apply_chat_template(input_messages, add_generation_prompt=True,return_tensors="pt").to(model.device)
output = model.generate(input_ids, max_new_tokens=200, num_return_sequences=5, pad_token_id=tokenizer.eos_token_id, do_sample=True)
for i, sample_output in enumerate(output):
    print(f"########\nOUTPUT {i}:\n {tokenizer.decode(sample_output[len(input_ids[0]):], skip_special_tokens=True)}\n")
Now, let’s make our prompt more explicit, detailed and clear.
torch.manual_seed(SEED)
set_seed(SEED)
input_messages = [{"role":"system", "content":"Answer the user's question. Think step-by-step and double-check your arithmetic. Enclose your final answer in delimiters like `<answer>{your answer here}</answer>`."},
{"role":"user", "content":input_text}]
input_ids = tokenizer.apply_chat_template(input_messages, add_generation_prompt=True,return_tensors="pt").to(model.device)
output = model.generate(input_ids, max_new_tokens=300, num_return_sequences=5, pad_token_id=tokenizer.eos_token_id, do_sample=True)
for i, sample_output in enumerate(output):
    print(f"########\nOUTPUT {i}:\n {tokenizer.decode(sample_output[len(input_ids[0]):], skip_special_tokens=True)}\n")
display_pdf("Slides_part2.pdf")
Few-shot learning#
# Examples for FSL
example_1_question = "If you have 41 pizzas and each pizza has 8 slices, and you sell the slices for 2.15 each, how much money do you make?"
example_1_answer = """Let's solve this step by step, breaking down arithmetic steps to make them easier:
Calculate the total number of slices: 41 * 8 = (40 * 8) + (1 * 8) = 320 + 8 = 328 slices
Calculate the total money made: 328 * 2.15 = (300 * 2.15) + (20 * 2.15) + (8 * 2.15) = ((300 * 2) + (300 * 0.15)) + ((20 * 2) + (20 * 0.15)) + ((8 * 2) + (8 * 0.15)) = (600 + 45) + (40 + 3) + (16 + 1.2) = 645 + 43 + 17.2 = 705.2
<answer>705.2</answer>"""
example_2_question = "If Sandy has 61 acorns, gives 7 to Bob, and then sells the remaining acorns for $1.20 per acorn, how much money does she make?"
example_2_answer = """Let's solve this step by step, breaking down arithmetic steps to make them easier:
Calculate the acorns left after giving some to Bob: 61 - 7 = 54 acorns
Calculate the total money made: 54 * 1.20 = (50 * 1.20) + (4 * 1.20) = ((50 * 1) + (50 * 0.20)) + ((4 * 1) + (4 * 0.20)) = (50 + 10) + (4 + 0.8) = 60 + 4.8 = 64.8
<answer>64.8</answer>"""
torch.manual_seed(SEED)
set_seed(SEED)
input_messages = [{"role":"system", "content":"Answer the user's question. Think step-by-step and double-check your arithmetic. Enclose your final answer in delimiters like `<answer>{your answer here}</answer>`."},
{"role":"user", "content":example_1_question},
{"role":"assistant", "content":example_1_answer},
{"role":"user", "content":example_2_question},
{"role":"assistant", "content":example_2_answer},
{"role":"user", "content":input_text}]
input_ids = tokenizer.apply_chat_template(input_messages, add_generation_prompt=True,return_tensors="pt").to(model.device)
output = model.generate(input_ids, max_new_tokens=200, num_return_sequences=5, pad_token_id=tokenizer.eos_token_id, do_sample=True)
for i, sample_output in enumerate(output):
    print(f"########\nOUTPUT {i}:\n {tokenizer.decode(sample_output[len(input_ids[0]):], skip_special_tokens=True)}\n")
from workshop_utils import display_pdf
display_pdf("Slides_part3.pdf")
RAG example#
We’ll build a simple RAG system that answers questions by consulting the Canadian tax code.
from langchain_community.document_loaders import UnstructuredXMLLoader
# This file contains the Canadian tax code.
loader = UnstructuredXMLLoader("./can_tax.xml")
docs = loader.load()
from langchain.text_splitter import RecursiveCharacterTextSplitter
splitter = RecursiveCharacterTextSplitter(chunk_size=8192, chunk_overlap=1024)
chunked_docs = splitter.split_documents(docs)
Let’s look at an example of a chunk from our database of the tax code.
print(F"NUMBER OF CHUNKS: {len(chunked_docs)}\n\n")
print(chunked_docs[1].page_content)
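To get a feel for how evenly the splitter worked, we can also summarize the chunk lengths (a quick diagnostic using character counts):
# Summarize chunk sizes (in characters) to confirm they respect chunk_size.
lengths = [len(doc.page_content) for doc in chunked_docs]
print(f"min: {min(lengths)}, max: {max(lengths)}, mean: {sum(lengths) / len(lengths):.0f}")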
Now let’s use an embedding model to create vector representations of each chunk of the tax code.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# Load a Sentence Transformers embedding model
model_name = "nomic-ai/modernbert-embed-base"
embeddings = HuggingFaceEmbeddings(model_name=model_name)
embeddings.client.to('cuda')  # Explicitly move the underlying SentenceTransformer to GPU
# Create a FAISS vectorstore using the Sentence Transformer embeddings
vectorstore = FAISS.from_documents(chunked_docs, embeddings)
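Embedding a large corpus can take a while, so it is often worth persisting the index to disk and reloading it in later sessions (a hedged sketch using FAISS's save_local/load_local; the folder name is our own choice):
# Persist the index so we don't have to re-embed the tax code every session.
vectorstore.save_local("can_tax_faiss_index")
# Later, reload it with the same embedding model; reloading pickled data
# requires opting in explicitly.
vectorstore = FAISS.load_local("can_tax_faiss_index", embeddings, allow_dangerous_deserialization=True)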
Let’s use the embedding model to get a vector representation of a sample query, and then let’s see what the two most relevant chunks of the tax code are to that query.
# Perform a similarity search on a query
query = "I'm a schoolteacher, can I deduct expenses for my classroom?"
results = vectorstore.similarity_search(query, k=2) # Get top 2 similar documents
# Print the results
for i, result in enumerate(results):
    print(f"RESULT {i}:\n{result.page_content}\n\n")
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={'k': 2}
)
Now let’s connect what we did above to a generative LLM, so that the generative LLM can answer a user’s query while also examining the most relevant chunks of the Canadian tax code.
system_prompt = "Give the below context from the Canadian tax code, answer the user's question. Make sure that your answer is based solely in the context provided below.\n\n## Context:\n{context}"
user_prompt = "I'm a schoolteacher, can I deduct expenses for my classroom?"
context_docs = '\n'.join([doc.page_content for doc in retriever.get_relevant_documents(user_prompt)])
prompt_template = [{"role":"system", "content":system_prompt.format(context=context_docs)}, {"role":"user", "content":user_prompt}]
input_ids = tokenizer.apply_chat_template(prompt_template, add_generation_prompt=True,return_tensors="pt").to(model.device)
output = model.generate(input_ids, max_new_tokens=200, num_return_sequences=5, pad_token_id=tokenizer.eos_token_id, do_sample=True)
for i, sample_output in enumerate(output):
    print(f"########\nOUTPUT {i}:\n {tokenizer.decode(sample_output[len(input_ids[0]):], skip_special_tokens=True)}\n")
display_pdf("Slides_part4.pdf")