Merge pull request #80 from tanishq-ids/changes
Changes
tanishq-ids authored Nov 7, 2024
2 parents 2492f1f + cf7fa3d commit 59ee612
Showing 7 changed files with 190 additions and 183 deletions.
@@ -1,4 +1,5 @@
import typer
import os
from .train_kpi_detection import (
train_kpi_detection,
check_output_dir,
@@ -43,6 +44,7 @@ def fine_tune_qna(
output_dir: str = typer.Argument(
..., help="Directory to save the fine-tuned model."
),
export_model_name: str = typer.Argument(..., help="Name of the model to export."),
save_steps: int = typer.Argument(
..., help="Number of steps between saving model checkpoints."
),
@@ -59,10 +61,13 @@ def fine_tune_qna(
batch_size=batch_size,
learning_rate=learning_rate,
output_dir=output_dir,
export_model_name=export_model_name,
save_steps=save_steps,
)

typer.echo(f"Model '{model_name}' trained and saved successfully at {output_dir}")
saved_model_path = os.path.join(output_dir, export_model_name)
typer.echo(
f"Model '{model_name}' is trained and saved successfully at {saved_model_path}"
)


@kpi_detection_app.command("inference")
@@ -76,6 +81,7 @@ def inference_qna(
model_path: str = typer.Argument(
..., help="Path to the pre-trained model directory OR name on huggingface."
),
batch_size: int = typer.Argument(16, help="The batch size for inference."),
):
"""Perform inference using a pre-trained model on a dataset of kpis and contexts, saving an output Excel file."""
try:
@@ -86,6 +92,7 @@ def inference_qna(
data_file_path=data_file_path,
output_path=output_path,
model_path=model_path,
batch_size=batch_size,
)

typer.echo("Inference completed successfully!")
@@ -3,11 +3,12 @@
"""

import os
import torch
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import torch
from transformers import pipeline, AutoConfig
from transformers.pipelines import QuestionAnsweringPipeline


def resolve_model_path(model_path: str):
@@ -48,29 +49,32 @@ def validate_path_exists(path: str, which_path: str):
raise ValueError(f"{which_path}: {path} does not exist.")


def get_inference_kpi_detection(question: str, context: str, model_path: str, device):
def get_batch_inference_kpi_detection(
questions, contexts, question_answerer: QuestionAnsweringPipeline, batch_size
):
"""
Performs kpi-detection inference using a specified model.
Perform batch inference using the question-answering pipeline.
Args:
question (str): The question to be answered.
context (str): The context in which to find the answer.
model_path (str): Path to the pre-trained model to be used for inference.
questions (list): List of questions.
contexts (list): List of contexts.
question_answerer (QuestionAnsweringPipeline): The question-answering pipeline.
batch_size (int): The batch size for inference.
Returns:
tuple: A tuple containing:
- answer (str): The predicted answer.
- score (float): The confidence score of the prediction.
- start (int): The start position of the answer in the context.
- end (int): The end position of the answer in the context.
list of dict: List of dictionaries containing answers, scores, and positions.
"""
question_answerer = pipeline("question-answering", model=model_path, device=device)
result = question_answerer(question=question, context=context)
return result["answer"], result["score"], result["start"], result["end"]
results = question_answerer(
    question=questions, context=contexts, batch_size=batch_size
)
return results


def run_full_inference_kpi_detection(
data_file_path: str, output_path: str, model_path: str
data_file_path: str,
output_path: str,
model_path: str,
batch_size: int,
):
"""
Runs full inference on a dataset of questions and contexts, and saves the results.
@@ -80,6 +84,7 @@ def run_full_inference_kpi_detection(
The dataset should have columns 'question' and 'context'.
output_path (str): Path to the directory where the output Excel file will be saved.
model_path (str): Path to the pre-trained model to be used for inference.
batch_size (int): The batch size for inference.
Returns:
None: The function saves the resulting DataFrame to an Excel file and prints a success message.
@@ -90,7 +95,6 @@ def run_full_inference_kpi_detection(

data = pd.read_csv(data_file_path)

# Dynamically detect the device: CUDA, MPS, or CPU
if torch.cuda.is_available():
device = torch.device("cuda") # Use NVIDIA GPU
print("Using CUDA GPU")
@@ -101,19 +105,38 @@ def run_full_inference_kpi_detection(
device = torch.device("cpu") # Fallback to CPU
print("Using CPU")

result = []
for _, row in tqdm(data.iterrows(), total=data.shape[0], desc="Processing Rows"):
question = row["question"]
context = row["context"]
answer, score, start, end = get_inference_kpi_detection(
question, context, model_path, device
)
result.append(
{"predicted_answer": answer, "start": start, "end": end, "score": score}
)
# Initialize the question-answering pipeline
question_answerer = pipeline("question-answering", model=model_path, device=device)

df = pd.DataFrame(result)
results = []

# Process in batches
for start_idx in tqdm(
range(0, data.shape[0], batch_size), desc="Processing Batches"
):
end_idx = min(start_idx + batch_size, data.shape[0])
batch_questions = data["question"].iloc[start_idx:end_idx].tolist()
batch_contexts = data["context"].iloc[start_idx:end_idx].tolist()

# Perform batch inference
batch_results = get_batch_inference_kpi_detection(
questions=batch_questions,
contexts=batch_contexts,
question_answerer=question_answerer,
batch_size=batch_size,
)

for result in batch_results:
results.append(
{
"predicted_answer": result["answer"],
"start": result["start"],
"end": result["end"],
"score": result["score"],
}
)

df = pd.DataFrame(results)
combined_df = pd.concat([data, df], axis=1)

file_name = Path(output_path) / "output.xlsx"
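The batched flow above relies on a Hugging Face pipeline accepting lists plus an internal batch_size; passing lists returns one result dict per question/context pair. A self-contained sketch of the pattern (the checkpoint and example rows are placeholders):

    import torch
    from transformers import pipeline

    # Stand-in extractive QA checkpoint; any SQuAD-style model works.
    device = 0 if torch.cuda.is_available() else -1  # pipelines also accept an int device index
    qa = pipeline("question-answering", model="distilbert-base-cased-distilled-squad", device=device)

    questions = ["What was the revenue in 2023?", "How many employees are there?"]
    contexts = ["Revenue in 2023 was 5 million USD.", "The company employs 250 people."]

    # Lists in, list of dicts out: one {'answer', 'score', 'start', 'end'} per pair.
    results = qa(question=questions, context=contexts, batch_size=16)
    for r in results:
        print(r["answer"], round(r["score"], 3), r["start"], r["end"])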
@@ -34,7 +34,6 @@
)
import torch
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split


@@ -92,6 +91,7 @@ def train_kpi_detection(
batch_size,
learning_rate,
output_dir,
export_model_name,
save_steps,
):
"""
@@ -105,6 +105,7 @@ def train_kpi_detection(
batch_size (int): Batch size for training.
learning_rate (float): Learning rate for training.
output_dir (str): Directory where the model will be saved during training.
export_model_name (str): The name under which to export the trained model.
save_steps (int): Number of steps before saving the model during training.
"""
# Load the data
@@ -229,8 +230,7 @@ def preprocess_function(examples, max_length):
data_collator = DefaultDataCollator()

# Get the current timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
saved_model_path = os.path.join(output_dir, f"{model_name}_{timestamp}")
saved_model_path = os.path.join(output_dir, export_model_name)
os.makedirs(saved_model_path, exist_ok=True)

checkpoint_dir = os.path.join(saved_model_path, "checkpoints")
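The practical effect of dropping the timestamp is that reruns now overwrite one predictable directory instead of accumulating timestamped copies; the caller picks the name. A quick illustration of the before/after path logic (all values are placeholders):

    import os
    from datetime import datetime

    output_dir = "models"
    model_name = "distilbert-base-uncased"  # placeholder
    export_model_name = "kpi-qna-v1"        # placeholder

    # Before: a fresh timestamped directory on every run.
    old_path = os.path.join(output_dir, f"{model_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}")
    # After: a stable, caller-chosen directory.
    new_path = os.path.join(output_dir, export_model_name)
    print(old_path)
    print(new_path)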
@@ -1,4 +1,5 @@
import typer
import os
from .fine_tune import (
check_csv_columns,
check_output_dir,
@@ -43,6 +44,7 @@ def fine_tune(
output_dir: str = typer.Argument(
..., help="Directory to save the fine-tuned model."
),
export_model_name: str = typer.Argument(..., help="Name of the model to export."),
save_steps: int = typer.Argument(
..., help="Number of steps between saving model checkpoints."
),
@@ -60,10 +62,13 @@ def fine_tune(
batch_size=batch_size,
learning_rate=learning_rate,
output_dir=output_dir,
export_model_name=export_model_name,
save_steps=save_steps,
)

typer.echo(f"Model '{model_name}' trained and saved successfully at {output_dir}")
saved_model_path = os.path.join(output_dir, export_model_name)
typer.echo(
f"Model '{model_name}' is trained and saved successfully at {saved_model_path}"
)


@relevance_detector_app.command("inference")
@@ -81,6 +86,7 @@ def inference(
tokenizer_path: str = typer.Argument(
..., help="Path to the tokenizer directory OR name on huggingface."
),
batch_size: int = typer.Argument(16, help="Batch size to process the rows"),
threshold: float = typer.Argument(
0.5, help="Threshold value for prediction confidence."
),
@@ -97,6 +103,7 @@ def inference(
output_path=output_path,
model_path=model_path,
tokenizer_path=tokenizer_path,
batch_size=batch_size,
threshold=threshold,
)

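The relevance-side inference module is not part of this diff, but the new batch_size argument implies the same chunked traversal of input rows used above for KPI detection. A generic sketch of that idea (the helper and column names are assumptions, not the project's API):

    import pandas as pd

    def iter_batches(df: pd.DataFrame, batch_size: int):
        """Yield successive slices of at most batch_size rows."""
        for start in range(0, len(df), batch_size):
            yield df.iloc[start:start + batch_size]

    data = pd.DataFrame({"text": ["a", "b", "c", "d", "e"]})
    for batch in iter_batches(data, batch_size=2):
        print(batch["text"].tolist())  # ['a', 'b'], then ['c', 'd'], then ['e']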
@@ -10,6 +10,7 @@
"""

import os
import shutil
import pandas as pd
import torch
from transformers import (
@@ -150,6 +151,7 @@ def fine_tune_model(
batch_size,
learning_rate,
output_dir,
export_model_name,
save_steps,
):
"""
@@ -164,6 +166,7 @@ def fine_tune_model(
batch_size (int): Batch size for training.
learning_rate (float): Learning rate for training.
output_dir (str): Directory where the model will be saved during training.
export_model_name (str): The name under which to export the trained model.
save_steps (int): Number of steps before saving the model during training.
"""
# Load your dataset into a pandas DataFrame
@@ -210,11 +213,14 @@ def fine_tune_model(
device,
)

saved_model_path = os.path.join(output_dir, "saved_model")
saved_model_path = os.path.join(output_dir, export_model_name)
os.makedirs(saved_model_path, exist_ok=True)

checkpoint_dir = os.path.join(saved_model_path, "checkpoints")
os.makedirs(checkpoint_dir, exist_ok=True)

training_args = TrainingArguments(
output_dir=saved_model_path,
output_dir=checkpoint_dir,
evaluation_strategy="epoch", # Evaluate at the end of each epoch
logging_dir="./logs", # Directory for logs
logging_steps=10, # Log every 10 steps
@@ -228,7 +234,7 @@
save_strategy="epoch",
load_best_model_at_end=True,
metric_for_best_model="eval_loss",
greater_is_better=True,
greater_is_better=False,
save_total_limit=1,
)

@@ -243,6 +249,12 @@ def fine_tune_model(
# Start Training
trainer.train()

# Save the final trained model and config
trainer.save_model(saved_model_path)

# Save the tokenizer manually
tokenizer.save_pretrained(saved_model_path)

# Evaluate the model
eval_result = trainer.evaluate(eval_dataset)
print("Evaluation results:")
@@ -269,3 +281,8 @@ def fine_tune_model(
print(f"Input: {tokenizer.decode(input_ids, skip_special_tokens=True)}")
print(f"True Label: {true_label}, Predicted Label: {predicted_label}")
print("\n")

# Clean up the intermediate checkpoints now that the final model and tokenizer are saved
try:
    shutil.rmtree(checkpoint_dir)
except OSError:
    pass
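Among the training changes above, flipping greater_is_better to False is the substantive bug fix: the tracked metric is a loss, so with True, load_best_model_at_end would have reloaded the worst checkpoint. A minimal sketch of the corrected pairing (the other argument values are placeholders):

    from transformers import TrainingArguments

    # When ranking checkpoints by eval loss, smaller is better.
    args = TrainingArguments(
        output_dir="checkpoints",          # placeholder
        evaluation_strategy="epoch",
        save_strategy="epoch",             # must match the evaluation strategy
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,           # eval_loss: lower wins
        save_total_limit=1,
    )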
2 changed files not shown.
