Fine-tuning and validating LLMs

Fine-tuning language models customizes them for specific tasks or domains, enhancing performance and yielding more accurate, contextually relevant outputs. In this tutorial, we show how to apply Low-Rank Adaptation of Large Language Models (LoRA) to fine-tune a language model using Covalent Cloud, Transformers, and PEFT. To demonstrate the benefits of fine-tuning, we show how the model improves on a summarization task by learning from a dataset of dialogue summaries. All computations are executed on Covalent Cloud for a total expense of ~$0.75.
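As a quick refresher, LoRA freezes the pretrained weights and learns a small low-rank update on top of them. The sketch below is purely illustrative (it is not the PEFT internals, and the dimensions are made up): the effective weight is W + BA, where A and B together hold far fewer parameters than W.

import torch

d, r = 512, 16                  # hypothetical hidden size and LoRA rank
W = torch.randn(d, d)           # frozen pretrained weight (never trained)
A = torch.randn(r, d) * 0.01    # trainable low-rank factor (small random init)
B = torch.zeros(d, r)           # trainable low-rank factor (zero init, so the update starts at zero)

x = torch.randn(1, d)
y = x @ (W + B @ A).T           # effective weight is W + BA
print(y.shape)                  # torch.Size([1, 512])
print(f"full: {W.numel()} params, LoRA: {A.numel() + B.numel()} params")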

Getting started

The tutorial requires Covalent, Covalent Cloud, and several Hugging Face libraries (transformers, datasets, peft, evaluate, accelerate), plus rouge-score, tensorboardX, and bitsandbytes. We install them from the requirements_finetune.txt file using pip.

with open("./requirements_finetune.txt", "r") as file:
    for line in file:
        print(line.rstrip())
datasets==2.16.1
transformers==4.37.2
torch==2.2.0
rouge-score
peft==0.8.2
tensorboardX
evaluate
accelerate
bitsandbytes

Uncomment and run the following cell to install the libraries.

# Installing required packages
# !pip install -r ./requirements_finetune.txt

Import everything necessary to create and launch the workflow.

import covalent as ct
import covalent_cloud as cc
from datasets import load_dataset, load_from_disk
from evaluate import load
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import (
    LoraConfig, get_peft_model,
    prepare_model_for_int8_training, TaskType
)
from peft import PeftModel, PeftConfig
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from tqdm import tqdm
from pathlib import Path
import torch
import numpy as np

After creating a local Python environment, we create one in Covalent Cloud. See more on creating Covalent Cloud environments here.

cc.save_api_key("YOUR-API-KEY")
cc.create_env(
    name="finetune-samsum",
    conda={
        "dependencies": ["python=3.10"]
    }, pip=[
        "datasets==2.16.1", "transformers==4.37.2",
        "torch==2.2.0", "rouge-score", "peft==0.8.2",
        "tensorboardX", "evaluate", "accelerate", "bitsandbytes"
    ],
    wait=True
)
VOLUME_NAME = "finetune"

Using Covalent Cloud resource allocation, we define two executors: one for CPU-only tasks and one with a GPU. The GPU executor is reserved for the fine-tuning and evaluation steps.

cpu_executor = cc.CloudExecutor(
    env="finetune-samsum", num_cpus=2,
    memory="8GB", time_limit="1 hours"
)
gpu_executor = cc.CloudExecutor(
    env="finetune-samsum", num_cpus=2, num_gpus=1,
    gpu_type="t4", memory="8GB", time_limit="1 hours"
)

Data Loading and Preprocessing

To fine-tune our language model, we load the samsum dataset, which contains roughly 16k messenger-style conversations paired with human-written summaries.

@ct.electron(executor=cpu_executor)
def load_data():
    # Load the dataset from the Hugging Face hub
    parent_folder = Path("/volumes") / VOLUME_NAME / "datasets"
    ds_cache_dir = parent_folder / "cache"
    save_dir = parent_folder / "samsum"

    ds_cache_dir.mkdir(exist_ok=True, parents=True)
    save_dir.mkdir(exist_ok=True, parents=True)

    dataset = load_dataset("samsum", cache_dir=ds_cache_dir)

    # Save the dataset to the volume so downstream electrons can load it by path
    dataset.save_to_disk(save_dir)
    return save_dir
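If you want to see what the raw data looks like before dispatching anything, you can peek at an example locally (a quick local check; dialogue and summary are the actual samsum column names):

from datasets import load_dataset

ds = load_dataset("samsum")
print(ds)                         # train / validation / test splits
example = ds["train"][0]
print(example["dialogue"][:200])  # a short messenger-style conversation
print(example["summary"])         # its human-written summary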

We build a tokenizer for the dataset and then immediately apply it to the text columns dialogue and summary. The dialogue tokens become the model inputs, while the summary tokens become the target labels.

@ct.electron(executor=cpu_executor)
def load_tokenizer(model_id="google/flan-t5-small"):
    # Load the tokenizer of FLAN-T5-small
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Save the tokenizer to the volume
    tokenizer_folder = Path("/volumes") / VOLUME_NAME / "tokenizer"
    tokenizer_folder.mkdir(exist_ok=True, parents=True)

    tokenizer.save_pretrained(tokenizer_folder)
    return tokenizer_folder


@ct.electron(executor=cpu_executor)
def preprocess_dataset(
    dataset_path, tokenizer_path, max_source_length=255, max_target_length=90
):
    dataset = load_from_disk(dataset_path)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    def preprocess_function(sample):
        inputs = [f"summarize: {item}" for item in sample["dialogue"]]
        model_inputs = tokenizer(inputs, max_length=max_source_length, padding="max_length", truncation=True)
        labels = tokenizer(sample["summary"], max_length=max_target_length, padding="max_length", truncation=True)
        # Replace pad token ids in the labels with -100 so they are ignored by the loss
        labels["input_ids"] = [[lab if lab != tokenizer.pad_token_id else -100 for lab in label] for label in labels["input_ids"]]
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    dataset_transformed = dataset.map(
        preprocess_function, batched=True,
        remove_columns=["dialogue", "summary", "id"]
    )
    preprocessed_dataset_path = Path("/volumes") / VOLUME_NAME / "preprocessed"
    # Save the tokenized dataset to disk
    dataset_transformed.save_to_disk(preprocessed_dataset_path)
    return preprocessed_dataset_path
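To make the -100 masking concrete, here is a minimal local sketch (assuming the same flan-t5-small tokenizer) of what happens to one padded label sequence:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("google/flan-t5-small")
label_ids = tok("Jack and May will drink cocktails later.",
                max_length=16, padding="max_length", truncation=True)["input_ids"]
masked = [t if t != tok.pad_token_id else -100 for t in label_ids]
print(label_ids)  # trailing pad token ids (0 for T5)
print(masked)     # same ids, with pads replaced by -100 so the loss skips them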

Fine-tuning a Pretrained Model

Next, we define a PEFT LoRA configuration and load the pretrained base model.

@ct.electron(executor=cpu_executor)
def load_model_for_finetuning(model_id="google/flan-t5-small"):
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q", "v"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.SEQ_2_SEQ_LM
    )
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    # prepare the int-8 model for training
    model = prepare_model_for_int8_training(model)
    # add the LoRA adapter
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # save the model to the volume
    model_folder = Path("/volumes") / VOLUME_NAME / "pretrained"
    model_folder.mkdir(exist_ok=True, parents=True)
    model.save_pretrained(model_folder)
    return model_folder

Using the fine-tuning configuration and the pretrained model, we train on the loaded and preprocessed dataset: the train split is used for fine-tuning, while the test split is reserved for evaluating performance. Once trained, the model and tokenizer are saved to disk.

@ct.electron(executor=gpu_executor)
def train_model(model_path, tokenizer_path, dataset_path):
    # load the preprocessed dataset
    train_dataset = load_from_disk(dataset_path)['train']

    # we want to ignore the tokenizer pad token in the loss
    label_pad_token_id = -100
    # output directory
    output_dir = Path("/volumes") / VOLUME_NAME / "finetuned"

    # load the tokenizer and the LoRA-wrapped model from disk
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    config = PeftConfig.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        config.base_model_name_or_path, load_in_8bit=True
    )
    model = PeftModel.from_pretrained(model, model_path, is_trainable=True)

    # Data collator
    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=label_pad_token_id,
        pad_to_multiple_of=8
    )
    # Define training args
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        auto_find_batch_size=True,
        learning_rate=1e-3,  # higher learning rate
        num_train_epochs=5,
        logging_dir=f"{output_dir}/logs",
        logging_strategy="steps",
        logging_steps=500,
        save_strategy="no",
        report_to="tensorboard",
    )

    # Create Trainer instance
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )
    model.config.use_cache = False  # caching is incompatible with training
    trainer.train()

    # save the fine-tuned adapter and tokenizer
    model_folder = Path("/volumes") / VOLUME_NAME / "finetuned"
    trainer.model.save_pretrained(model_folder)
    tokenizer.save_pretrained(model_folder)
    return model_folder

Evaluation

Finally, after successfully training the model, we evaluate summarization performance by computing the ROUGE score of the generated summaries against the references. The fine-tuned model is loaded, switched into inference mode, and run over the examples of the samsum test split.

@ct.electron(executor=gpu_executor)
def evaluate_peft_model(dataset_path, model_folder, max_target_length=50):
    test_dataset = load_from_disk(dataset_path)['test']
    # Load the PEFT configuration, base model, and tokenizer
    config = PeftConfig.from_pretrained(model_folder)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        config.base_model_name_or_path, load_in_8bit=True
    )
    tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
    # Attach the LoRA adapter and switch to inference mode
    model = PeftModel.from_pretrained(model, model_folder)
    model.eval()

    def evaluate_example(example):
        # generate a summary for one tokenized dialogue
        input_ids = torch.tensor(example["input_ids"], dtype=torch.int32).unsqueeze(0)
        outputs = model.generate(
            input_ids=input_ids, do_sample=True,
            top_p=0.9, max_new_tokens=max_target_length
        )
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Replace -100 in the labels, as the tokenizer cannot decode them
        labels = np.where(
            np.array(example['labels']) != -100, np.array(example['labels']), tokenizer.pad_token_id
        )
        reference = tokenizer.decode(labels, skip_special_tokens=True)
        return prediction, reference

    # Generate predictions and references
    predictions, references = zip(*(evaluate_example(example) for example in tqdm(test_dataset)))

    # Evaluate using the ROUGE metric
    rouge_metric = load("rouge")
    rouge_score = rouge_metric.compute(predictions=predictions, references=references, use_stemmer=True)
    return rouge_score, list(zip(predictions, references))
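As a sanity check on the metric itself, ROUGE can also be computed standalone on a single prediction/reference pair (a toy example using the same evaluate and rouge-score packages from the requirements file):

from evaluate import load

rouge = load("rouge")
scores = rouge.compute(
    predictions=["Jack and May will drink a drink later."],
    references=["Jack and May will drink cocktails later."],
    use_stemmer=True,
)
print(scores)  # dict of rouge1 / rouge2 / rougeL / rougeLsum scores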

Covalent Workflow

We put everything together by defining a workflow that uses the above electrons.

@ct.lattice(workflow_executor=cpu_executor, executor=cpu_executor)
def workflow():
    dataset_path = load_data()
    tokenizer_path = load_tokenizer()
    tokenized_dataset_path = preprocess_dataset(dataset_path, tokenizer_path)
    pretrained_model_path = load_model_for_finetuning()
    rouge_before, predictions_and_references_before = evaluate_peft_model(
        tokenized_dataset_path, pretrained_model_path
    )
    finetuned_model_folder = train_model(
        pretrained_model_path, tokenizer_path, tokenized_dataset_path
    )
    rouge_after, predictions_and_references_after = evaluate_peft_model(
        tokenized_dataset_path, finetuned_model_folder
    )
    return {
        "rouge_before": rouge_before,
        "rouge_after": rouge_after,
        "predictions_and_references_before": predictions_and_references_before,
        "predictions_and_references_after": predictions_and_references_after
    }

We may then execute the workflow by running:

volume = cc.volume(VOLUME_NAME)
dispatch_id = cc.dispatch(workflow, volume=volume)()

Looking at the Covalent Cloud UI, the workflow should have the following graph:

finetuneworkflow.jpg

Finally, we show the results of the workflow and how fine-tuning helps produce more accurate summaries.

result = cc.get_result(dispatch_id, wait=True)
result.result.load()
result_dict = result.result.value

rouge_average_before = np.round(np.average(list(result_dict['rouge_before'].values())), 2)
rouge_average_after = np.round(np.average(list(result_dict['rouge_after'].values())), 2)
difference = np.round((rouge_average_after - rouge_average_before), 2) * 100
print(
    f'Rouge score with pretrained model was {rouge_average_before} and after finetuning is {rouge_average_after}, difference is {difference}%'
)
Rouge score with pretrained model was 0.28 and after finetuning is 0.3, difference is 2.0%

Differences in the summaries before and after fine-tuning can be seen in the following examples.

| Before fine-tuning | After fine-tuning | Gold summary |
| --- | --- | --- |
| Scott's friends are doing fun things and Ethan and Toby will share the news. | Scott and Toby are going to spend some time together. | Ethan, Toby and Marshall are making fun of Scott. |
| Jane told Anne that Mark was lying to Anne, but he was also lied about his passport. She and Jane thought he was a 20-year-old. | Anne has lied to Mark about the theft of his passport. | Mark lied to Anne about his age. Mark is 40. |
| Myah cannot see the phone number yet. She writes the text. | Selah can't find the phone number on his phone. Selah can't see the phone number and will be free for next week. | Selah called a person that did not pick up. |
| May drinks cocktail with May. May is trying to get a drink and Jack will make a drink with her. | Jack and May will drink a drink later. | Jack and May will drink cocktails later. |
| Janice's son is asking for a hamster for his birthday but she doesn't want to sell it. | Janice's son has asked her to buy him a hamster for his birthday. | Martina advises against getting a hamster. |

Conclusion

In this guide, we combined Covalent and Hugging Face libraries to fine-tune a language model on conversations and their summaries, all within the Covalent Cloud platform. We demonstrated the simplicity of transitioning between CPU and GPU devices and of persisting results with volumes. Executing the provided workflow on t4 GPU nodes as listed, fine-tuning for 5 epochs, costs a total of ~$0.75.

The full code can be found below:

Full Code
import covalent as ct
import covalent_cloud as cc
from datasets import load_dataset, load_from_disk
from evaluate import load
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import (
    LoraConfig, get_peft_model,
    prepare_model_for_int8_training, TaskType
)
from peft import PeftModel, PeftConfig
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from tqdm import tqdm
from pathlib import Path
import torch
import numpy as np
import pandas as pd


cc.save_api_key("YOUR-API-KEY")
cc.create_env(
    name="finetune-samsum-acc",
    conda={
        "dependencies": ["python=3.10"]
    }, pip=[
        "datasets==2.16.1", "transformers==4.37.2",
        "torch==2.2.0", "rouge-score", "peft==0.8.2",
        "tensorboardX", "evaluate", "accelerate", "bitsandbytes"
    ],
    wait=True
)
cpu_executor = cc.CloudExecutor(
    env="finetune-samsum-acc", num_cpus=2,
    memory="8GB", time_limit="01:00:00"
)
gpu_executor = cc.CloudExecutor(
    env="finetune-samsum-acc", num_cpus=2, num_gpus=1,
    gpu_type="t4", memory="8GB", time_limit="01:00:00"
)
VOLUME_NAME = "finetune"


@ct.electron(executor=cpu_executor)
def load_data():
    # Load the dataset from the hub
    parent_folder = Path("/volumes") / VOLUME_NAME / "datasets"
    ds_cache_dir = parent_folder / "cache"
    save_dir = parent_folder / "samsum"

    ds_cache_dir.mkdir(exist_ok=True, parents=True)
    save_dir.mkdir(exist_ok=True, parents=True)

    dataset = load_dataset("samsum", cache_dir=ds_cache_dir)

    # save the dataset to disk
    dataset.save_to_disk(save_dir)
    return save_dir


@ct.electron(executor=cpu_executor)
def load_tokenizer(model_id="google/flan-t5-small"):
    # Load the tokenizer of FLAN-T5-small
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    # Save the tokenizer to the volume
    tokenizer_folder = Path("/volumes") / VOLUME_NAME / "tokenizer"
    tokenizer_folder.mkdir(exist_ok=True, parents=True)

    tokenizer.save_pretrained(tokenizer_folder)
    return tokenizer_folder


@ct.electron(executor=cpu_executor)
def preprocess_dataset(
    dataset_path, tokenizer_path, max_source_length=255, max_target_length=90
):
    dataset = load_from_disk(dataset_path)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    def preprocess_function(sample):
        inputs = [f"summarize: {item}" for item in sample["dialogue"]]
        model_inputs = tokenizer(inputs, max_length=max_source_length, padding="max_length", truncation=True)
        labels = tokenizer(sample["summary"], max_length=max_target_length, padding="max_length", truncation=True)
        # Replace pad token ids in the labels with -100 so they are ignored by the loss
        labels["input_ids"] = [[lab if lab != tokenizer.pad_token_id else -100 for lab in label] for label in labels["input_ids"]]
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    dataset_transformed = dataset.map(
        preprocess_function, batched=True,
        remove_columns=["dialogue", "summary", "id"]
    )
    preprocessed_dataset_path = Path("/volumes") / VOLUME_NAME / "preprocessed"
    # save the tokenized dataset to disk
    dataset_transformed.save_to_disk(preprocessed_dataset_path)
    return preprocessed_dataset_path


@ct.electron(executor=cpu_executor)
def load_model_for_finetuning(model_id="google/flan-t5-small"):
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q", "v"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.SEQ_2_SEQ_LM
    )
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    # prepare the int-8 model for training
    model = prepare_model_for_int8_training(model)
    # add the LoRA adapter
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # save the model to the volume
    model_folder = Path("/volumes") / VOLUME_NAME / "pretrained"
    model_folder.mkdir(exist_ok=True, parents=True)
    model.save_pretrained(model_folder)
    return model_folder


@ct.electron(executor=gpu_executor)
def train_model(model_path, tokenizer_path, dataset_path):
    # load the preprocessed dataset
    train_dataset = load_from_disk(dataset_path)['train']

    # we want to ignore the tokenizer pad token in the loss
    label_pad_token_id = -100
    # output directory
    output_dir = Path("/volumes") / VOLUME_NAME / "finetuned"

    # load the tokenizer and the LoRA-wrapped model from disk
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    config = PeftConfig.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        config.base_model_name_or_path, load_in_8bit=True
    )
    model = PeftModel.from_pretrained(model, model_path, is_trainable=True)

    # Data collator
    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=label_pad_token_id,
        pad_to_multiple_of=8
    )
    # Define training args
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        auto_find_batch_size=True,
        learning_rate=1e-3,  # higher learning rate
        num_train_epochs=5,
        logging_dir=f"{output_dir}/logs",
        logging_strategy="steps",
        logging_steps=500,
        save_strategy="no",
        report_to="tensorboard",
    )

    # Create Trainer instance
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )
    model.config.use_cache = False  # caching is incompatible with training
    trainer.train()

    # save the fine-tuned adapter and tokenizer
    model_folder = Path("/volumes") / VOLUME_NAME / "finetuned"
    trainer.model.save_pretrained(model_folder)
    tokenizer.save_pretrained(model_folder)
    return model_folder


@ct.electron(executor=gpu_executor)
def evaluate_peft_model(dataset_path, model_folder, max_target_length=50):
    test_dataset = load_from_disk(dataset_path)['test']
    # Load configurations, model, and tokenizer
    config = PeftConfig.from_pretrained(model_folder)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        config.base_model_name_or_path, load_in_8bit=True
    )
    tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
    model = PeftModel.from_pretrained(model, model_folder)
    model.eval()

    def evaluate_example(example):
        input_ids = torch.tensor(example["input_ids"], dtype=torch.int32).unsqueeze(0)
        outputs = model.generate(
            input_ids=input_ids, do_sample=True,
            top_p=0.9, max_new_tokens=max_target_length
        )
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Replace -100 in the labels, as the tokenizer cannot decode them
        labels = np.where(
            np.array(example['labels']) != -100, np.array(example['labels']), tokenizer.pad_token_id
        )
        reference = tokenizer.decode(labels, skip_special_tokens=True)
        return prediction, reference

    # Generate predictions and references
    predictions, references = zip(*(evaluate_example(example) for example in tqdm(test_dataset)))

    # Evaluate using the ROUGE metric
    rouge_metric = load("rouge")
    rouge_score = rouge_metric.compute(predictions=predictions, references=references, use_stemmer=True)
    return rouge_score, list(zip(predictions, references))


@ct.lattice(workflow_executor=cpu_executor, executor=cpu_executor)
def workflow():
    dataset_path = load_data()
    tokenizer_path = load_tokenizer()
    tokenized_dataset_path = preprocess_dataset(dataset_path, tokenizer_path)
    pretrained_model_path = load_model_for_finetuning()
    rouge_before, predictions_and_references_before = evaluate_peft_model(
        tokenized_dataset_path, pretrained_model_path,
    )
    finetuned_model_folder = train_model(
        pretrained_model_path, tokenizer_path, tokenized_dataset_path
    )
    rouge_after, predictions_and_references_after = evaluate_peft_model(
        tokenized_dataset_path, finetuned_model_folder
    )
    return {
        "rouge_before": rouge_before,
        "rouge_after": rouge_after,
        "predictions_and_references_before": predictions_and_references_before,
        "predictions_and_references_after": predictions_and_references_after
    }

volume = cc.volume(VOLUME_NAME)
dispatch_id = cc.dispatch(workflow, volume=volume)()
result = cc.get_result(dispatch_id, wait=True)
result.result.load()
result_dict = result.result.value

rouge_average_before = np.round(np.average(list(result_dict['rouge_before'].values())), 2)
rouge_average_after = np.round(np.average(list(result_dict['rouge_after'].values())), 2)
difference = np.round((rouge_average_after - rouge_average_before), 2) * 100
print(
    f'Rouge score with pretrained model was {rouge_average_before} and after finetuning is {rouge_average_after}, difference is {difference}%'
)
before_examples, gold_examples = zip(*result_dict['predictions_and_references_before'])
after_examples, gold_examples = zip(*result_dict['predictions_and_references_after'])

data = []
for before, after, gold in zip(before_examples, after_examples, gold_examples):
    if before != after and len(gold) < 50:
        data.append((before, after, gold))

pd.DataFrame(data, columns=['before finetuning', 'after finetuning', 'gold']).head(5)