Top-level code put in functions + pep8 #1

Open
wants to merge 8 commits into main
231 changes: 117 additions & 114 deletions llm_multilabel_clf/tutorial.py
@@ -1,4 +1,3 @@
import os
import random
import functools
import csv
@@ -22,12 +21,6 @@
)


def tokenize_examples(examples, tokenizer):
tokenized_inputs = tokenizer(examples['text'])
tokenized_inputs['labels'] = examples['labels']
return tokenized_inputs


# define custom batch preprocessor
def collate_fn(batch, tokenizer):
dict_keys = ['input_ids', 'attention_mask', 'labels']
@@ -45,9 +38,9 @@ def collate_fn(batch, tokenizer):
# define which metrics to compute for evaluation
def compute_metrics(p):
predictions, labels = p
f1_micro = f1_score(labels, predictions > 0, average = 'micro')
f1_macro = f1_score(labels, predictions > 0, average = 'macro')
f1_weighted = f1_score(labels, predictions > 0, average = 'weighted')
f1_micro = f1_score(labels, predictions > 0, average='micro')
f1_macro = f1_score(labels, predictions > 0, average='macro')
f1_weighted = f1_score(labels, predictions > 0, average='weighted')
return {
'f1_micro': f1_micro,
'f1_macro': f1_macro,
@@ -61,123 +54,133 @@ class CustomTrainer(Trainer):
def __init__(self, label_weights, **kwargs):
super().__init__(**kwargs)
self.label_weights = label_weights

def compute_loss(self, model, inputs, return_outputs=False):
labels = inputs.pop("labels")

# forward pass
outputs = model(**inputs)
logits = outputs.get("logits")

# compute custom loss
loss = F.binary_cross_entropy_with_logits(logits, labels.to(torch.float32), pos_weight=self.label_weights)
return (loss, outputs) if return_outputs else loss


# set random seed
random.seed(0)

# load data
with open('train.csv', newline='') as csvfile:
data = list(csv.reader(csvfile, delimiter=','))
header_row = data.pop(0)

# shuffle data
random.shuffle(data)

# reshape
idx, text, labels = list(zip(*[(int(row[0]), f'Title: {row[1].strip()}\n\nAbstract: {row[2].strip()}', row[3:]) for row in data]))
labels = np.array(labels, dtype=int)

# create label weights
label_weights = 1 - labels.sum(axis=0) / labels.sum()

# stratified train test split for multilabel ds
row_ids = np.arange(len(labels))
train_idx, y_train, val_idx, y_val = iterative_train_test_split(row_ids[:,np.newaxis], labels, test_size = 0.1)
x_train = [text[i] for i in train_idx.flatten()]
x_val = [text[i] for i in val_idx.flatten()]

# create hf dataset
ds = DatasetDict({
'train': Dataset.from_dict({'text': x_train, 'labels': y_train}),
'val': Dataset.from_dict({'text': x_val, 'labels': y_val})
})

# model name
model_name = 'mistralai/Mistral-7B-v0.1'

# preprocess dataset with tokenizer
def tokenize_examples(examples, tokenizer):
tokenized_inputs = tokenizer(examples['text'])
tokenized_inputs['labels'] = examples['labels']
return tokenized_inputs

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenized_ds = ds.map(functools.partial(tokenize_examples, tokenizer=tokenizer), batched=True)
tokenized_ds = tokenized_ds.with_format('torch')

# quantization config
quantization_config = BitsAndBytesConfig(
load_in_4bit = True, # enable 4-bit quantization
bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
bnb_4bit_use_double_quant = True, # quantize quantized weights //insert xzibit meme
bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for ML
)
def load_data():
with open('train.csv', newline='') as csvfile:
data = list(csv.reader(csvfile, delimiter=','))
_ = data.pop(0) # drop the header row

# shuffle data
random.shuffle(data)

# reshape
idx, text, labels = list(zip(*[(int(row[0]), f'Title: {row[1].strip()}\n\nAbstract: {row[2].strip()}', row[3:])
for row in data]))
labels = np.array(labels, dtype=int)

# stratified train test split for multilabel dataset
row_ids = np.arange(len(labels))
train_idx, y_train, val_idx, y_val = iterative_train_test_split(row_ids[:, np.newaxis], labels, test_size=0.1)
x_train = [text[i] for i in train_idx.flatten()]
x_val = [text[i] for i in val_idx.flatten()]

# create hf dataset
dataset = DatasetDict({
'train': Dataset.from_dict({'text': x_train, 'labels': y_train}),
'val': Dataset.from_dict({'text': x_val, 'labels': y_val})
})
return dataset


def main(output_dir, dataset=None, model_name='mistralai/Mistral-7B-v0.1'):
# set random seed
random.seed(0)

if dataset is None:
dataset = load_data()

# preprocess dataset with tokenizer
def tokenize_examples(examples, tokenizer):
tokenized_inputs = tokenizer(examples['text'])
tokenized_inputs['labels'] = examples['labels']
return tokenized_inputs

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenized_ds = dataset.map(functools.partial(tokenize_examples, tokenizer=tokenizer), batched=True)
tokenized_ds = tokenized_ds.with_format('torch')

# create label weights
labels = tokenized_ds['train']['labels']
label_weights = 1 - labels.sum(axis=0) / labels.sum()

# quantization config
quantization_config = BitsAndBytesConfig(
load_in_4bit=True, # enable 4-bit quantization
bnb_4bit_quant_type='nf4', # information theoretically optimal dtype for normally distributed weights
bnb_4bit_use_double_quant=True, # quantize quantized weights //insert xzibit meme
bnb_4bit_compute_dtype=torch.bfloat16 # optimized fp format for ML
)

# lora config
lora_config = LoraConfig(
r = 16, # the dimension of the low-rank matrices
lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
lora_dropout = 0.05, # dropout probability of the LoRA layers
bias = 'none', # whether to train bias weights, set to 'none' for attention layers
task_type = 'SEQ_CLS'
)
# lora config
lora_config = LoraConfig(
r=16, # the dimension of the low-rank matrices
lora_alpha=8, # scaling factor for LoRA activations vs pre-trained weight activations
target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
lora_dropout=0.05, # dropout probability of the LoRA layers
bias='none', # whether to train bias weights, set to 'none' for attention layers
task_type='SEQ_CLS'
)

# load model
model = AutoModelForSequenceClassification.from_pretrained(
model_name,
quantization_config=quantization_config,
num_labels=labels.shape[1]
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.config.pad_token_id = tokenizer.pad_token_id

# define training args
training_args = TrainingArguments(
output_dir = 'multilabel_classification',
learning_rate = 1e-4,
per_device_train_batch_size = 8, # tested with 16gb gpu ram
per_device_eval_batch_size = 8,
num_train_epochs = 10,
weight_decay = 0.01,
evaluation_strategy = 'epoch',
save_strategy = 'epoch',
load_best_model_at_end = True
)
# load model
model = AutoModelForSequenceClassification.from_pretrained(
model_name,
quantization_config=quantization_config,
num_labels=tokenized_ds['train']['labels'].shape[1]
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.config.pad_token_id = tokenizer.pad_token_id

# define training args
training_args = TrainingArguments(
output_dir=output_dir,
learning_rate=1e-4,
per_device_train_batch_size=8, # tested with 16gb gpu ram
per_device_eval_batch_size=8,
num_train_epochs=10,
weight_decay=0.01,
eval_strategy='epoch',
save_strategy='epoch',
load_best_model_at_end=True
)

# train
trainer = CustomTrainer(
model=model,
args=training_args,
train_dataset=tokenized_ds['train'],
eval_dataset=tokenized_ds['val'],
tokenizer=tokenizer,
data_collator=functools.partial(collate_fn, tokenizer=tokenizer),
compute_metrics=compute_metrics,
label_weights=torch.tensor(label_weights, device=model.device)
)

trainer.train()

# save model
peft_model_id = 'multilabel_mistral'
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

# train
trainer = CustomTrainer(
model = model,
args = training_args,
train_dataset = tokenized_ds['train'],
eval_dataset = tokenized_ds['val'],
tokenizer = tokenizer,
data_collator = functools.partial(collate_fn, tokenizer=tokenizer),
compute_metrics = compute_metrics,
label_weights = torch.tensor(label_weights, device=model.device)
)

trainer.train()
if __name__ == "__main__":
main(output_dir='multilabel_classification')

# save model
peft_model_id = 'multilabel_mistral'
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

# load model
peft_model_id = 'multilabel_mistral'
model = AutoModelForSequenceClassification.from_pretrained(peft_model_id)
def load_model():
peft_model_id = 'multilabel_mistral'
return AutoModelForSequenceClassification.from_pretrained(peft_model_id)
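

For context, a minimal inference sketch that builds on the load pattern above (it is not part of the diff). The predict() helper and the example input are illustrative assumptions; thresholding raw logits at 0 is equivalent to sigmoid(logit) > 0.5, which matches the predictions > 0 convention used in compute_metrics.

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

def predict(texts, peft_model_id='multilabel_mistral'):
    # Hypothetical helper (not in this PR): reuses the tokenizer and adapter directory saved by main().
    tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
    model = AutoModelForSequenceClassification.from_pretrained(peft_model_id)
    model.eval()
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        logits = model(**inputs).logits
    # logits > 0 corresponds to sigmoid(logits) > 0.5, the same threshold compute_metrics applies.
    return (logits > 0).int()

# Illustrative call, using the same 'Title: ...\n\nAbstract: ...' format the tutorial builds:
# preds = predict(['Title: Example paper\n\nAbstract: ...'])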