Top-level code put in functions + pep8 #1

Open
wants to merge 8 commits into main
231 changes: 117 additions & 114 deletions llm_multilabel_clf/tutorial.py
@@ -1,4 +1,3 @@
import os
import random
import functools
import csv
@@ -22,12 +21,6 @@
)


def tokenize_examples(examples, tokenizer):
tokenized_inputs = tokenizer(examples['text'])
tokenized_inputs['labels'] = examples['labels']
return tokenized_inputs


# define custom batch preprocessor
def collate_fn(batch, tokenizer):
dict_keys = ['input_ids', 'attention_mask', 'labels']
@@ -45,9 +38,9 @@ def collate_fn(batch, tokenizer):
# define which metrics to compute for evaluation
def compute_metrics(p):
predictions, labels = p
f1_micro = f1_score(labels, predictions > 0, average = 'micro')
f1_macro = f1_score(labels, predictions > 0, average = 'macro')
f1_weighted = f1_score(labels, predictions > 0, average = 'weighted')
f1_micro = f1_score(labels, predictions > 0, average='micro')
f1_macro = f1_score(labels, predictions > 0, average='macro')
f1_weighted = f1_score(labels, predictions > 0, average='weighted')
return {
'f1_micro': f1_micro,
'f1_macro': f1_macro,
@@ -61,123 +54,133 @@ class CustomTrainer(Trainer):
def __init__(self, label_weights, **kwargs):
super().__init__(**kwargs)
self.label_weights = label_weights

def compute_loss(self, model, inputs, return_outputs=False):
labels = inputs.pop("labels")

# forward pass
outputs = model(**inputs)
logits = outputs.get("logits")

# compute custom loss
loss = F.binary_cross_entropy_with_logits(logits, labels.to(torch.float32), pos_weight=self.label_weights)
return (loss, outputs) if return_outputs else loss


# set random seed
random.seed(0)

# load data
with open('train.csv', newline='') as csvfile:
data = list(csv.reader(csvfile, delimiter=','))
header_row = data.pop(0)

# shuffle data
random.shuffle(data)

# reshape
idx, text, labels = list(zip(*[(int(row[0]), f'Title: {row[1].strip()}\n\nAbstract: {row[2].strip()}', row[3:]) for row in data]))
labels = np.array(labels, dtype=int)

# create label weights
label_weights = 1 - labels.sum(axis=0) / labels.sum()

# stratified train test split for multilabel ds
row_ids = np.arange(len(labels))
train_idx, y_train, val_idx, y_val = iterative_train_test_split(row_ids[:,np.newaxis], labels, test_size = 0.1)
x_train = [text[i] for i in train_idx.flatten()]
x_val = [text[i] for i in val_idx.flatten()]

# create hf dataset
ds = DatasetDict({
'train': Dataset.from_dict({'text': x_train, 'labels': y_train}),
'val': Dataset.from_dict({'text': x_val, 'labels': y_val})
})

# model name
model_name = 'mistralai/Mistral-7B-v0.1'

# preprocess dataset with tokenizer
def tokenize_examples(examples, tokenizer):
tokenized_inputs = tokenizer(examples['text'])
tokenized_inputs['labels'] = examples['labels']
return tokenized_inputs

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenized_ds = ds.map(functools.partial(tokenize_examples, tokenizer=tokenizer), batched=True)
tokenized_ds = tokenized_ds.with_format('torch')

# quantization config
quantization_config = BitsAndBytesConfig(
load_in_4bit = True, # enable 4-bit quantization
bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
bnb_4bit_use_double_quant = True, # quantize quantized weights //insert xzibit meme
bnb_4bit_compute_dtype = torch.bfloat16 # optimized fp format for ML
)
def load_data():
with open('train.csv', newline='') as csvfile:
data = list(csv.reader(csvfile, delimiter=','))
_ = data.pop(0) # drop the header row

# shuffle data
random.shuffle(data)

# reshape
idx, text, labels = list(zip(*[(int(row[0]), f'Title: {row[1].strip()}\n\nAbstract: {row[2].strip()}', row[3:])
for row in data]))
labels = np.array(labels, dtype=int)

# stratified train test split for multilabel dataset
row_ids = np.arange(len(labels))
train_idx, y_train, val_idx, y_val = iterative_train_test_split(row_ids[:, np.newaxis], labels, test_size=0.1)
x_train = [text[i] for i in train_idx.flatten()]
x_val = [text[i] for i in val_idx.flatten()]

# create hf dataset
dataset = DatasetDict({
'train': Dataset.from_dict({'text': x_train, 'labels': y_train}),
'val': Dataset.from_dict({'text': x_val, 'labels': y_val})
})
return dataset


def main(output_dir, dataset=None, model_name='mistralai/Mistral-7B-v0.1'):
# set random seed
random.seed(0)

if dataset is None:
dataset = load_data()

# preprocess dataset with tokenizer
def tokenize_examples(examples, tokenizer):
tokenized_inputs = tokenizer(examples['text'])
tokenized_inputs['labels'] = examples['labels']
return tokenized_inputs

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenized_ds = dataset.map(functools.partial(tokenize_examples, tokenizer=tokenizer), batched=True)
tokenized_ds = tokenized_ds.with_format('torch')

# create label weights
labels = tokenized_ds['train']['labels']
label_weights = 1 - labels.sum(axis=0) / labels.sum()

# quantization config
quantization_config = BitsAndBytesConfig(
load_in_4bit=True, # enable 4-bit quantization
bnb_4bit_quant_type='nf4', # information theoretically optimal dtype for normally distributed weights
bnb_4bit_use_double_quant=True, # quantize quantized weights //insert xzibit meme
bnb_4bit_compute_dtype=torch.bfloat16 # optimized fp format for ML
)

# lora config
lora_config = LoraConfig(
r = 16, # the dimension of the low-rank matrices
lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
lora_dropout = 0.05, # dropout probability of the LoRA layers
bias = 'none', # whether to train bias weights, set to 'none' for attention layers
task_type = 'SEQ_CLS'
)
# lora config
lora_config = LoraConfig(
r=16, # the dimension of the low-rank matrices
lora_alpha=8, # scaling factor for LoRA activations vs pre-trained weight activations
target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
lora_dropout=0.05, # dropout probability of the LoRA layers
bias='none', # whether to train bias weights, set to 'none' for attention layers
task_type='SEQ_CLS'
)

# load model
model = AutoModelForSequenceClassification.from_pretrained(
model_name,
quantization_config=quantization_config,
num_labels=labels.shape[1]
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.config.pad_token_id = tokenizer.pad_token_id

# define training args
training_args = TrainingArguments(
output_dir = 'multilabel_classification',
learning_rate = 1e-4,
per_device_train_batch_size = 8, # tested with 16gb gpu ram
per_device_eval_batch_size = 8,
num_train_epochs = 10,
weight_decay = 0.01,
evaluation_strategy = 'epoch',
save_strategy = 'epoch',
load_best_model_at_end = True
)
# load model
model = AutoModelForSequenceClassification.from_pretrained(
model_name,
quantization_config=quantization_config,
num_labels=tokenized_ds['train']['labels'].shape[1]
)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.config.pad_token_id = tokenizer.pad_token_id

# define training args
training_args = TrainingArguments(
output_dir=output_dir,
learning_rate=1e-4,
per_device_train_batch_size=8, # tested with 16gb gpu ram
per_device_eval_batch_size=8,
num_train_epochs=10,
weight_decay=0.01,
eval_strategy='epoch',
save_strategy='epoch',
load_best_model_at_end=True
)

# train
trainer = CustomTrainer(
model=model,
args=training_args,
train_dataset=tokenized_ds['train'],
eval_dataset=tokenized_ds['val'],
tokenizer=tokenizer,
data_collator=functools.partial(collate_fn, tokenizer=tokenizer),
compute_metrics=compute_metrics,
label_weights=torch.tensor(label_weights, device=model.device)
)

trainer.train()

# save model
peft_model_id = 'multilabel_mistral'
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

# train
trainer = CustomTrainer(
model = model,
args = training_args,
train_dataset = tokenized_ds['train'],
eval_dataset = tokenized_ds['val'],
tokenizer = tokenizer,
data_collator = functools.partial(collate_fn, tokenizer=tokenizer),
compute_metrics = compute_metrics,
label_weights = torch.tensor(label_weights, device=model.device)
)

trainer.train()
if __name__ == "__main__":
main(output_dir='multilabel_classification')

# save model
peft_model_id = 'multilabel_mistral'
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)

# load model
peft_model_id = 'multilabel_mistral'
model = AutoModelForSequenceClassification.from_pretrained(peft_model_id)
def load_model():
peft_model_id = 'multilabel_mistral'
return AutoModelForSequenceClassification.from_pretrained(peft_model_id)
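

For context, a minimal inference sketch that builds on the load pattern above (it is not part of the diff). The predict() helper and the example input are illustrative assumptions; thresholding raw logits at 0 is equivalent to sigmoid(logit) > 0.5, which matches the predictions > 0 convention used in compute_metrics.

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

def predict(texts, peft_model_id='multilabel_mistral'):
    # Hypothetical helper (not in this PR): reuses the tokenizer and adapter directory saved by main().
    tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
    model = AutoModelForSequenceClassification.from_pretrained(peft_model_id)
    model.eval()
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        logits = model(**inputs).logits
    # logits > 0 corresponds to sigmoid(logits) > 0.5, the same threshold compute_metrics applies.
    return (logits > 0).int()

# Illustrative call, using the same 'Title: ...\n\nAbstract: ...' format the tutorial builds:
# preds = predict(['Title: Example paper\n\nAbstract: ...'])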