This model is a fine-tuned version of the base model meta-llama/Llama-3.2-11B-Vision-Instruct, trained with LoRA on the qa_with_chat_template_250201.csv dataset, which combines materials from the FEM courses of Prof. Krishna Garikipati.
Compared with TOMMI-0.3, TOMMI-0.35 uses the same hyperparameters and the full dataset (without student-asked QA pairs), with the maximum token length increased from 500 to 700.
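For orientation, the sketch below shows how such a LoRA adapter can be configured with PEFT. The rank, alpha, and target modules are illustrative placeholders rather than the values used for TOMMI-0.35, and it assumes the 700-token budget refers to the maximum sequence length applied when tokenizing training examples.

import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, PreTrainedTokenizerFast

base = "meta-llama/Llama-3.2-11B-Vision-Instruct"
tokenizer = PreTrainedTokenizerFast.from_pretrained(base)
model = AutoModelForCausalLM.from_pretrained(base, torch_dtype=torch.bfloat16, device_map="auto")

# Illustrative LoRA hyperparameters (not the actual TOMMI-0.35 settings)
lora_cfg = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()

# The increased context budget: truncate training examples at 700 tokens
batch = tokenizer("Sample question from the FEM QA dataset ...",
                  truncation=True, max_length=700, return_tensors="pt")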
To run the following code on Expanse, you should request at least two V100 GPUs.
#!/bin/bash
# python 3.10 + cuda 11.8.0
export MKL_NUM_THREADS=1
export NUMEXPR_NUM_THREADS=1
export OPENBLAS_NUM_THREADS=1
export OMP_NUM_THREADS=1
conda clean -a -y # conda for traditional and reliable setup
mamba clean -a -y # mamba for smart and efficient setup
pip install --upgrade pip
# cuda, gcc/g++, torch
conda install cuda -c nvidia/label/cuda-11.8.0 -y
pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu118
pip install torchao==0.7.0 --index-url https://download.pytorch.org/whl/cu118
# deepspeed
mamba install gcc gxx -c conda-forge -y # ensure GCC/G++ > 9.0 for DeepSpeed's ninja JIT compilation
pip install deepspeed==0.15.4
# bitsandbytes
pip install setuptools
mamba install bitsandbytes=0.45.0 -c conda-forge --no-deps -y
pip install psutil
# add the following to your .bashrc or running scripts
#export BNB_CUDA_VERSION=118
#export CUDA_HOME=$CONDA_PREFIX
#export LD_LIBRARY_PATH="$CONDA_PREFIX/lib:$LD_LIBRARY_PATH"
# trl, accelerate, peft
pip install trl
pip install accelerate peft optuna optuna_integration datasets
# other dependencies
pip install scikit-learn pexpect
pip install wandb plotly # takes a while
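Once the environment is built, a quick sanity check (a minimal sketch; versions correspond to the pins above) confirms that the CUDA 11.8 build of PyTorch, the requested V100s, and the key libraries are all visible:

import torch, deepspeed, bitsandbytes, peft, trl

print("torch:", torch.__version__, "| CUDA build:", torch.version.cuda)
print("GPUs visible:", torch.cuda.device_count())  # expect at least 2 on an Expanse V100 allocation
for i in range(torch.cuda.device_count()):
    print(" -", torch.cuda.get_device_name(i))
print("deepspeed:", deepspeed.__version__)
print("bitsandbytes:", bitsandbytes.__version__)
print("peft:", peft.__version__, "| trl:", trl.__version__)

The script below then loads the TOMMI adapter on top of the base model and runs an interactive chat loop.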
from peft import PeftModel
import time
import torch
from transformers import PreTrainedTokenizerFast, AutoModelForCausalLM
class Conversation:
def __init__(self,
model,
tokenizer,
device,
system=""):
self.model = model
self.tokenizer = tokenizer
self.device = device
self.message = []
if system:
self.message.append({"role": "system", "content": system})
def get_prompt(self):
prompt = '<|begin_of_text|>'
# Include the system message if it exists
for msg in self.message:
role = msg['role']
content = msg['content']
prompt += f"<|start_header_id|>{role}<|end_header_id|>{content}<|eot_id|>"
# Append the assistant's role header to prompt for the next response
prompt += "<|start_header_id|>assistant<|end_header_id|>"
return prompt
def generate(self,
user_input,
temp=0.7,
max_new_tokens=1024,
top_k=50,
top_p=0.95):
# Add the user's input to the conversation history
self.message.append({"role": "user", "content": user_input})
# Generate the prompt
prompt = self.get_prompt()
# Tokenize the prompt
inputs = self.tokenizer(prompt,
return_tensors="pt",
truncation=True,
max_length=2048).to(self.device)
# inputs = {k: v.to(device) for k, v in inputs.items()}
if self.tokenizer.eos_token_id is None:
            # Fall back to the Llama 3 end-of-turn token ('</s>' does not exist in this tokenizer)
            self.tokenizer.eos_token_id = self.tokenizer.convert_tokens_to_ids('<|eot_id|>')
if self.tokenizer.pad_token_id is None:
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
print(f"EOS Token ID: {self.tokenizer.eos_token_id}")
print(f"PAD Token ID: {self.tokenizer.pad_token_id}")
# Generate the response
with torch.no_grad():
outputs = self.model.generate(
**inputs,
max_new_tokens=max_new_tokens,
do_sample=True,
temperature=temp,
top_k=top_k,
top_p=top_p,
pad_token_id=self.tokenizer.eos_token_id,
# eos_token_id=self.tokenizer.convert_tokens_to_ids('<|eot_id|>'),
eos_token_id=self.tokenizer.eos_token_id,
)
# Decode the generated tokens
generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=False)
# Extract the assistant's response
assistant_response = self.extract_assistant_response(prompt, generated_text)
# Append the assistant's response to the conversation
self.message.append({'role': 'assistant', 'content': assistant_response})
return assistant_response
def extract_assistant_response(self, prompt, generated_text):
        # Llama keeps generating after the submitted prompt; this method extracts
        # only the newly generated output, stripped of special tokens
# Remove the prompt from the generated text
response_text = generated_text[len(prompt):]
# Split at the end-of-turn token
if '<|eot_id|>' in response_text:
assistant_response = response_text.split('<|eot_id|>')[0]
else:
assistant_response = response_text
# Remove special token at the end and leading or trailing whitespaces
assistant_response = assistant_response.replace('<|end_header_id|>', '')
assistant_response = assistant_response.strip()
return assistant_response
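# For reference, get_prompt() assembles a single Llama 3-style string such as
# (line breaks added here only for readability; the actual prompt has none):
#   <|begin_of_text|><|start_header_id|>system<|end_header_id|>You are an expert professor ...<|eot_id|>
#   <|start_header_id|>user<|end_header_id|>What is the weak form of the problem?<|eot_id|>
#   <|start_header_id|>assistant<|end_header_id|>
# The trailing assistant header cues the model to write the next reply, and
# extract_assistant_response() then cuts that reply at the first <|eot_id|>.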
if __name__ == "__main__":
base_model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
peft_model_name = "my-ai-university/TOMMI-0.3"
    tokenizer = PreTrainedTokenizerFast.from_pretrained(
        base_model_name,
        return_tensors="pt")
tokenizer.pad_token = "<|reserved_special_token_5|>"
base_model = AutoModelForCausalLM.from_pretrained(
base_model_name,
torch_dtype=torch.bfloat16,
device_map="auto")
model = PeftModel.from_pretrained(base_model, peft_model_name)
model = model.merge_and_unload() # Optional: Merge adapter with base model for faster inference
# Initialize the conversation object
system_message = 'You are an expert professor who replies in a helpful way.'
conv = Conversation(
model,
tokenizer,
model.device,
system_message)
# Run the conversation loop
print("Starting conversation ...")
input_text = ""
    while input_text.lower() != "exit":
        input_text = input("Enter your prompt (type 'exit' to quit): ")
        if input_text.lower() == "exit":
            break  # stop before sending the exit command to the model
        start_time = time.time()
        response = conv.generate(input_text)
end_time = time.time()
print(response)
print(f"Response time: {end_time - start_time:.2f} seconds")
# Save the conversation to a file
with open("./conversation.txt", "w") as f:
f.write(str(conv.message))
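If only one V100 is available (32 GB on Expanse GPU nodes), a possible alternative, sketched below under that assumption and not part of the workflow above, is to load the base model in 4-bit with bitsandbytes before attaching the TOMMI adapter. The adapter is left un-merged to avoid rounding issues when merging into quantized weights, and float16 is used for compute because V100s have no native bfloat16 support.

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, PreTrainedTokenizerFast

base_model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
peft_model_name = "my-ai-university/TOMMI-0.3"

# 4-bit NF4 quantization so the 11B base model fits on a single 32 GB V100
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # V100 (Volta) has no native bfloat16
)

tokenizer = PreTrainedTokenizerFast.from_pretrained(base_model_name)
tokenizer.pad_token = "<|reserved_special_token_5|>"
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto")
model = PeftModel.from_pretrained(base_model, peft_model_name)  # adapter stays un-merged

# Reuse the Conversation class defined above
conv = Conversation(model, tokenizer, model.device,
                    "You are an expert professor who replies in a helpful way.")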