diff --git a/src/osc_transformer_based_extractor/kpi_detection/cli_kpi_detection.py b/src/osc_transformer_based_extractor/kpi_detection/cli_kpi_detection.py
index 8cae7d4..e9344d1 100644
--- a/src/osc_transformer_based_extractor/kpi_detection/cli_kpi_detection.py
+++ b/src/osc_transformer_based_extractor/kpi_detection/cli_kpi_detection.py
@@ -1,4 +1,5 @@
 import typer
+import os
 from .train_kpi_detection import (
     train_kpi_detection,
     check_output_dir,
@@ -43,6 +44,7 @@ def fine_tune_qna(
     output_dir: str = typer.Argument(
         ..., help="Directory to save the fine-tuned model."
     ),
+    export_model_name: str = typer.Argument(..., help="Name of the model to export."),
     save_steps: int = typer.Argument(
         ..., help="Number of steps between saving model checkpoints."
     ),
@@ -59,10 +61,13 @@
         batch_size=batch_size,
         learning_rate=learning_rate,
         output_dir=output_dir,
+        export_model_name=export_model_name,
         save_steps=save_steps,
     )
-
-    typer.echo(f"Model '{model_name}' trained and saved successfully at {output_dir}")
+    saved_model_path = os.path.join(output_dir, export_model_name)
+    typer.echo(
+        f"Model '{model_name}' trained and saved successfully at {saved_model_path}"
+    )
 
 
 @kpi_detection_app.command("inference")
@@ -76,6 +81,7 @@ def inference_qna(
     model_path: str = typer.Argument(
         ..., help="Path to the pre-trained model directory OR name on huggingface."
     ),
+    batch_size: int = typer.Argument(16, help="The batch size for inference."),
 ):
     """Perform inference using a pre-trained model on a dataset of kpis and contexts, saving an output Excel file."""
     try:
@@ -86,6 +92,7 @@
             data_file_path=data_file_path,
             output_path=output_path,
             model_path=model_path,
+            batch_size=batch_size,
         )
 
         typer.echo("Inference completed successfully!")
diff --git a/src/osc_transformer_based_extractor/kpi_detection/inference_kpi_detection.py b/src/osc_transformer_based_extractor/kpi_detection/inference_kpi_detection.py
index 5389b67..b311d4f 100644
--- a/src/osc_transformer_based_extractor/kpi_detection/inference_kpi_detection.py
+++ b/src/osc_transformer_based_extractor/kpi_detection/inference_kpi_detection.py
@@ -3,11 +3,12 @@
 """
 
 import os
+import torch
 import pandas as pd
 from pathlib import Path
 from tqdm import tqdm
-import torch
 from transformers import pipeline, AutoConfig
+from transformers.pipelines import QuestionAnsweringPipeline
 
 
 def resolve_model_path(model_path: str):
@@ -48,29 +49,32 @@ def validate_path_exists(path: str, which_path: str):
         raise ValueError(f"{which_path}: {path} does not exist.")
 
 
-def get_inference_kpi_detection(question: str, context: str, model_path: str, device):
+def get_batch_inference_kpi_detection(
+    questions, contexts, question_answerer: QuestionAnsweringPipeline, batch_size
+):
     """
-    Performs kpi-detection inference using a specified model.
+    Perform batch inference using the question-answering pipeline.
 
     Args:
-        question (str): The question to be answered.
-        context (str): The context in which to find the answer.
-        model_path (str): Path to the pre-trained model to be used for inference.
+        questions (list): List of questions.
+        contexts (list): List of contexts.
+        question_answerer (QuestionAnsweringPipeline): The question-answering pipeline.
+        batch_size (int): The batch size for inference.
 
     Returns:
-        tuple: A tuple containing:
-            - answer (str): The predicted answer.
-            - score (float): The confidence score of the prediction.
-            - start (int): The start position of the answer in the context.
-            - end (int): The end position of the answer in the context.
+        list of dict: List of dictionaries containing answers, scores, and positions.
     """
-    question_answerer = pipeline("question-answering", model=model_path, device=device)
-    result = question_answerer(question=question, context=context)
-    return result["answer"], result["score"], result["start"], result["end"]
+    results = question_answerer(
+        question=questions, context=contexts, batch_size=batch_size
+    )
+    return results
 
 
 def run_full_inference_kpi_detection(
-    data_file_path: str, output_path: str, model_path: str
+    data_file_path: str,
+    output_path: str,
+    model_path: str,
+    batch_size: int,
 ):
     """
     Runs full inference on a dataset of questions and contexts, and saves the results.
@@ -80,6 +84,7 @@
             The dataset should have columns 'question' and 'context'.
         output_path (str): Path to the directory where the output Excel file will be saved.
         model_path (str): Path to the pre-trained model to be used for inference.
+        batch_size (int): The batch size for inference.
 
     Returns:
         None: The function saves the resulting DataFrame to an Excel file and prints a success message.
@@ -90,7 +95,6 @@
 
     data = pd.read_csv(data_file_path)
 
-    # Dynamically detect the device: CUDA, MPS, or CPU
     if torch.cuda.is_available():
         device = torch.device("cuda")  # Use NVIDIA GPU
         print("Using CUDA GPU")
@@ -101,19 +105,38 @@
         device = torch.device("cpu")  # Fallback to CPU
         print("Using CPU")
 
-    result = []
-    for _, row in tqdm(data.iterrows(), total=data.shape[0], desc="Processing Rows"):
-        question = row["question"]
-        context = row["context"]
-        answer, score, start, end = get_inference_kpi_detection(
-            question, context, model_path, device
-        )
-        result.append(
-            {"predicted_answer": answer, "start": start, "end": end, "score": score}
-        )
+    # Initialize the question-answering pipeline
+    question_answerer = pipeline("question-answering", model=model_path, device=device)
 
-    df = pd.DataFrame(result)
+    results = []
+
+    # Process in batches
+    for start_idx in tqdm(
+        range(0, data.shape[0], batch_size), desc="Processing Batches"
+    ):
+        end_idx = min(start_idx + batch_size, data.shape[0])
+        batch_questions = data["question"].iloc[start_idx:end_idx].tolist()
+        batch_contexts = data["context"].iloc[start_idx:end_idx].tolist()
+
+        # Perform batch inference
+        batch_results = get_batch_inference_kpi_detection(
+            questions=batch_questions,
+            contexts=batch_contexts,
+            question_answerer=question_answerer,
+            batch_size=batch_size,
+        )
+        for result in batch_results:
+            results.append(
+                {
+                    "predicted_answer": result["answer"],
+                    "start": result["start"],
+                    "end": result["end"],
+                    "score": result["score"],
+                }
+            )
+
+    df = pd.DataFrame(results)
     combined_df = pd.concat([data, df], axis=1)
 
     file_name = Path(output_path) / "output.xlsx"
diff --git a/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py b/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py
index 414cb7c..16c334c 100644
--- a/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py
+++ b/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py
@@ -34,7 +34,6 @@
 )
 import torch
 import numpy as np
-from datetime import datetime
 from sklearn.model_selection import train_test_split
 
 
@@ -92,6 +91,7 @@ def train_kpi_detection(
     batch_size,
     learning_rate,
     output_dir,
+    export_model_name,
     save_steps,
 ):
     """
@@ -105,6 +105,7 @@
         batch_size (int): Batch size for training.
         learning_rate (float): Learning rate for trainig
         output_dir (str): Directory where the model will be saved during training.
+        export_model_name (str): Name under which the trained model will be exported.
         save_steps (int): Number of steps before saving the model during training.
     """
     # Load the data
@@ -229,8 +230,7 @@ def preprocess_function(examples, max_length):
     data_collator = DefaultDataCollator()
 
     # Get the current timestamp
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    saved_model_path = os.path.join(output_dir, f"{model_name}_{timestamp}")
+    saved_model_path = os.path.join(output_dir, export_model_name)
     os.makedirs(saved_model_path, exist_ok=True)
 
     checkpoint_dir = os.path.join(saved_model_path, "checkpoints")
diff --git a/src/osc_transformer_based_extractor/relevance_detector/cli_relevance_detector.py b/src/osc_transformer_based_extractor/relevance_detector/cli_relevance_detector.py
index 639f8c1..60ca152 100644
--- a/src/osc_transformer_based_extractor/relevance_detector/cli_relevance_detector.py
+++ b/src/osc_transformer_based_extractor/relevance_detector/cli_relevance_detector.py
@@ -1,4 +1,5 @@
 import typer
+import os
 from .fine_tune import (
     check_csv_columns,
     check_output_dir,
@@ -43,6 +44,7 @@ def fine_tune(
     output_dir: str = typer.Argument(
         ..., help="Directory to save the fine-tuned model."
     ),
+    export_model_name: str = typer.Argument(..., help="Name of the model to export."),
     save_steps: int = typer.Argument(
         ..., help="Number of steps between saving model checkpoints."
     ),
@@ -60,10 +62,13 @@
         batch_size=batch_size,
         learning_rate=learning_rate,
         output_dir=output_dir,
+        export_model_name=export_model_name,
         save_steps=save_steps,
     )
-
-    typer.echo(f"Model '{model_name}' trained and saved successfully at {output_dir}")
+    saved_model_path = os.path.join(output_dir, export_model_name)
+    typer.echo(
+        f"Model '{model_name}' trained and saved successfully at {saved_model_path}"
+    )
 
 
 @relevance_detector_app.command("inference")
@@ -81,6 +86,7 @@ def inference(
     tokenizer_path: str = typer.Argument(
         ..., help="Path to the tokenizer directory OR name on huggingface."
     ),
+    batch_size: int = typer.Argument(16, help="Batch size used to process the rows."),
     threshold: float = typer.Argument(
         0.5, help="Threshold value for prediction confidence."
     ),
@@ -97,6 +103,7 @@
             output_path=output_path,
             model_path=model_path,
             tokenizer_path=tokenizer_path,
+            batch_size=batch_size,
             threshold=threshold,
         )
diff --git a/src/osc_transformer_based_extractor/relevance_detector/fine_tune.py b/src/osc_transformer_based_extractor/relevance_detector/fine_tune.py
index 4ecca64..5b85193 100644
--- a/src/osc_transformer_based_extractor/relevance_detector/fine_tune.py
+++ b/src/osc_transformer_based_extractor/relevance_detector/fine_tune.py
@@ -10,6 +10,7 @@
 """
 
 import os
+import shutil
 import pandas as pd
 import torch
 from transformers import (
@@ -150,6 +151,7 @@ def fine_tune_model(
     batch_size,
     learning_rate,
     output_dir,
+    export_model_name,
     save_steps,
 ):
     """
@@ -164,6 +166,7 @@
         batch_size (int): Batch size for training.
         learning_rate (float): Learning rate for trainig
         output_dir (str): Directory where the model will be saved during training.
+        export_model_name (str): Name under which the trained model will be exported.
         save_steps (int): Number of steps before saving the model during training.
""" # Load your dataset into a pandas DataFrame @@ -210,11 +213,14 @@ def fine_tune_model( device, ) - saved_model_path = os.path.join(output_dir, "saved_model") + saved_model_path = os.path.join(output_dir, export_model_name) + os.makedirs(saved_model_path, exist_ok=True) + + checkpoint_dir = os.path.join(saved_model_path, "checkpoints") os.makedirs(saved_model_path, exist_ok=True) training_args = TrainingArguments( - output_dir=saved_model_path, + output_dir=checkpoint_dir, evaluation_strategy="epoch", # Evaluate at the end of each epoch logging_dir="./logs", # Directory for logs logging_steps=10, # Log every 10 steps @@ -228,7 +234,7 @@ def fine_tune_model( save_strategy="epoch", load_best_model_at_end=True, metric_for_best_model="eval_loss", - greater_is_better=True, + greater_is_better=False, save_total_limit=1, ) @@ -243,6 +249,12 @@ def fine_tune_model( # Start Training trainer.train() + # Save the final trained model and config + trainer.save_model(saved_model_path) + + # Save the tokenizer manually + tokenizer.save_pretrained(saved_model_path) + # Evaluate the model eval_result = trainer.evaluate(eval_dataset) print("Evaluation results:") @@ -269,3 +281,8 @@ def fine_tune_model( print(f"Input: {tokenizer.decode(input_ids, skip_special_tokens=True)}") print(f"True Label: {true_label}, Predicted Label: {predicted_label}") print("\n") + + try: + shutil.rmtree(checkpoint_dir) + except OSError: + pass diff --git a/src/osc_transformer_based_extractor/relevance_detector/inference.py b/src/osc_transformer_based_extractor/relevance_detector/inference.py index 73b32a5..6e007dc 100644 --- a/src/osc_transformer_based_extractor/relevance_detector/inference.py +++ b/src/osc_transformer_based_extractor/relevance_detector/inference.py @@ -1,8 +1,3 @@ -"""This module contains utility functions for performing inference using pre-trained sequence classification models.""" - -# Module: inference -# Author: Tanishq-ids - import os import torch import pandas as pd @@ -12,30 +7,6 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig -def resolve_model_path(model_path: str): - """ - Resolves whether the given `model_path` is a Hugging Face model name or a local system path. - - - If the `model_path` refers to a Hugging Face model (e.g., "bert-base-uncased"), the function will return the - model name as a string. - - If the `model_path` refers to a valid local system path, the function will convert it into a `Path` object. - - If neither, the function raises a `ValueError`. - """ - - # Check if it's a local path - if os.path.exists(model_path): - return Path(model_path) - - # Check if it's a Hugging Face model name - try: - AutoConfig.from_pretrained(model_path) - return model_path # It's a Hugging Face model name, return as string - except Exception: - raise ValueError( - f"{model_path} is neither a valid Hugging Face model nor a local file path." - ) - - def validate_path_exists(path: str, which_path: str): """ Validate if the given path exists. @@ -50,54 +21,36 @@ def validate_path_exists(path: str, which_path: str): raise ValueError(f"{which_path}: {path} does not exist.") -def get_inference( - question: str, context: str, model_path: str, tokenizer_path: str, threshold: float -): +def resolve_model_path(model_path: str): """ - Perform inference using a pre-trained sequence classification model. - - Parameters: - question (str): The question for inference. - context (str): The context to be analyzed. 
-        model_path (str): Path to the pre-trained model directory OR name on huggingface.
-        tokenizer_path (str): Path to the tokenizer directory OR name on huggingface.
-        threshold (float): The threshold for the inference score.
-
-    Returns:
-        int: Predicted label ID (0 or 1).
-        float: class probability
+    Resolves whether the given `model_path` is a Hugging Face model name or a local system path.
     """
-    model_path = str(Path(model_path))
-    tokenizer_path = str(Path(tokenizer_path))
-
-    # Dynamically detect the device: CUDA, MPS, or CPU
-    if torch.cuda.is_available():
-        device = torch.device("cuda")  # Use NVIDIA GPU
-        print("Using CUDA GPU")
-    elif torch.backends.mps.is_available():
-        device = torch.device("mps")  # Use Apple Silicon GPU (MPS)
-        print("Using MPS (Apple Silicon GPU)")
-    else:
-        device = torch.device("cpu")  # Fallback to CPU
-        print("Using CPU")
-
-    print(f"Using device: {device}")  # Print device to confirm
+    if os.path.exists(model_path):
+        return Path(model_path)
+    try:
+        AutoConfig.from_pretrained(model_path)
+        return model_path
+    except Exception:
+        raise ValueError(
+            f"{model_path} is neither a valid Hugging Face model nor a local file path."
+        )
 
-    # Load model and tokenizer
-    model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
 
-    # Tokenize inputs
+def get_batch_inference(questions, contexts, model, tokenizer, device, threshold):
+    """
+    Perform batch inference using the model and tokenizer.
+    """
+    # Tokenize the batch of questions and contexts
     inputs = tokenizer(
-        question,
-        context,
+        questions,
+        contexts,
         return_tensors="pt",
         truncation=True,
         padding=True,
         max_length=512,
     )
 
-    # Move tokenized inputs to the same device as the model
+    # Move inputs to the device
     inputs = {key: val.to(device) for key, val in inputs.items()}
 
     # Forward pass
@@ -108,11 +61,10 @@
     probabilities = torch.softmax(outputs.logits, dim=1)
 
     # Probability of the positive class (label 1)
-    positive_class_prob = probabilities[0, 1].item()
+    positive_class_probs = probabilities[:, 1].cpu().numpy()
+    labels = (positive_class_probs >= threshold).astype(int)
 
-    label = 1 if positive_class_prob >= threshold else 0
-
-    return label, positive_class_prob
+    return labels, positive_class_probs
 
 
 def run_full_inference(
@@ -121,35 +73,38 @@
     output_path: str,
     model_path: str,
     tokenizer_path: str,
+    batch_size: int,
     threshold: float,
 ):
     """
     Perform inference on JSON files in a specified folder and save the results to Excel files.
-
-    This function reads JSON files from a specified folder, processes the data, merges it with a KPI mapping,
-    performs inference using a specified model and tokenizer, and saves the results to Excel files.
-
-    Args:
-        json_folder_path (str): Path to the folder containing JSON files to process.
-        kpi_mapping_path (str): Path to the CSV file containing KPI mappings.
-        output_path (str): Path to the folder where the output Excel files will be saved.
-        model_path (str): Path to the model used for inference (local or Huggingface).
-        tokenizer_path (str): Path to the tokenizer used for inference (local or Huggingface).
-        threshold (float): Threshold value for the inference process.
-
-    Returns:
-        None
-
-    Raises:
-        Exception: If there is an error reading or processing the JSON files, or saving the Excel files.
""" kpi_mapping_path = str(Path(kpi_mapping_path)) json_folder_path = str(Path(json_folder_path)) output_path = str(Path(output_path)) + + # Resolve model and tokenizer paths model_path = resolve_model_path(model_path) tokenizer_path = resolve_model_path(tokenizer_path) - # Read the KPI mapping outside the loop + # Load model and tokenizer once + # Dynamically detect the device: CUDA, MPS, or CPU + if torch.cuda.is_available(): + device = torch.device("cuda") # Use NVIDIA GPU + print("Using CUDA GPU") + elif torch.backends.mps.is_available(): + device = torch.device("mps") # Use Apple Silicon GPU (MPS) + print("Using MPS (Apple Silicon GPU)") + else: + device = torch.device("cpu") # Fallback to CPU + print("Using CPU") + + print(f"Using device: {device}") # Print device to confirm + model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True) + model.eval() # Set model to evaluation mode + + # Read the KPI mapping kpi_mapping = pd.read_csv(kpi_mapping_path) kpi_mapping = kpi_mapping[["kpi_id", "question"]] @@ -189,19 +144,18 @@ def run_full_inference( labels = [] probs = [] - # Iterate over the rows of the DataFrame and perform inference - for _, row in tqdm(merged_df.iterrows(), total=merged_df.shape[0]): - question = row["question"] - context = row["paragraph"] - label, prob = get_inference( - question=question, - context=context, - model_path=model_path, - tokenizer_path=tokenizer_path, - threshold=threshold, + # Perform inference in batches + for start_idx in tqdm(range(0, merged_df.shape[0], batch_size)): + end_idx = min(start_idx + batch_size, merged_df.shape[0]) + batch_questions = merged_df["question"].iloc[start_idx:end_idx].tolist() + batch_contexts = merged_df["paragraph"].iloc[start_idx:end_idx].tolist() + + batch_labels, batch_probs = get_batch_inference( + batch_questions, batch_contexts, model, tokenizer, device, threshold ) - labels.append(label) - probs.append(prob) + + labels.extend(batch_labels) + probs.extend(batch_probs) # Add results to the DataFrame merged_df["paragraph_relevance_flag"] = labels diff --git a/tests/osc_transformer_based_extractor/relevance_detector/test_inference.py b/tests/osc_transformer_based_extractor/relevance_detector/test_inference.py index d32c90f..733e7bb 100644 --- a/tests/osc_transformer_based_extractor/relevance_detector/test_inference.py +++ b/tests/osc_transformer_based_extractor/relevance_detector/test_inference.py @@ -9,10 +9,11 @@ from unittest.mock import patch, mock_open, MagicMock from pathlib import Path import pandas as pd +import numpy as np import torch import pytest from osc_transformer_based_extractor.relevance_detector.inference import ( - get_inference, + get_batch_inference, run_full_inference, ) @@ -33,54 +34,43 @@ @patch( "osc_transformer_based_extractor.relevance_detector.inference.AutoTokenizer.from_pretrained" ) -def test_get_inference(mock_tokenizer, mock_model): - """Test the get_inference function for inference correctness.""" +def test_get_batch_inference(): + """Test the get_batch_inference function for inference correctness.""" + # Mock tokenizer and model tokenizer_mock = MagicMock() model_mock = MagicMock() - mock_tokenizer.return_value = tokenizer_mock - mock_model.return_value = model_mock - # Configure the tokenizer mock - tokenizer_mock.encode_plus.return_value = { - "input_ids": torch.tensor([[101, 102]]), - "attention_mask": torch.tensor([[1, 1]]), + # Configure the tokenizer mock for batch inputs + 
+        "input_ids": torch.tensor([[101, 102], [101, 103]]),
+        "attention_mask": torch.tensor([[1, 1], [1, 1]]),
     }
 
     # Configure the model mock to return logits tensor
     model_output_mock = MagicMock()
-    model_output_mock.logits = torch.tensor([[0.1, 0.9]])
+    model_output_mock.logits = torch.tensor([[0.1, 0.9], [0.7, 0.3]])
     model_mock.return_value = model_output_mock
 
-    # Dummy question and context
-    question = "What is the capital of France?"
-    context = "Paris is the capital of France."
+    # Dummy questions and contexts
+    questions = ["What is the capital of France?", "What is the capital of Germany?"]
+    contexts = ["Paris is the capital of France.", "Berlin is the capital of Germany."]
 
     # Test inference
-    predicted_label_id, class_prob = get_inference(
-        question, context, model_path_valid, tokenizer_path_valid, threshold=0.7
+    labels, positive_class_probs = get_batch_inference(
+        questions, contexts, model_mock, tokenizer_mock, device="cpu", threshold=0.7
     )
 
     # Assertions
-    assert isinstance(predicted_label_id, int)
-    assert isinstance(class_prob, float)
-
-    # Test different inputs
-    tokenizer_mock.encode_plus.return_value = {
-        "input_ids": torch.tensor([[101, 103]]),
-        "attention_mask": torch.tensor([[1, 1]]),
-    }
-    model_output_mock.logits = torch.tensor([[0.7, 0.3]])
-    predicted_label_id, class_prob = get_inference(
-        "What is the capital of Germany?",
-        "Berlin is the capital of Germany.",
-        model_path_valid,
-        tokenizer_path_valid,
-        threshold=0.7,
+    assert isinstance(labels, np.ndarray) and np.issubdtype(labels.dtype, np.integer)
+    assert (
+        isinstance(positive_class_probs, np.ndarray)
+        and np.issubdtype(positive_class_probs.dtype, np.floating)
     )
-    assert isinstance(predicted_label_id, int)
-    assert isinstance(class_prob, float)
+    # Check correct shapes and values
+    assert labels.shape[0] == len(questions)
+    assert positive_class_probs.shape[0] == len(questions)
 
 
 @pytest.fixture
@@ -135,7 +125,7 @@ def sample_merged_dataframe(sample_dataframe, sample_kpi_mapping):
 @patch("pandas.DataFrame.to_excel")
 def test_run_full_inference(
     mock_to_excel,
-    mock_get_inference,
+    mock_get_batch_inference,
     mock_listdir,
     mock_read_csv,
     mock_open,
@@ -149,32 +139,41 @@
     output_path = "output_folder"
     model_path = "model_path"
    tokenizer_path = "tokenizer_path"
+    batch_size = 2
     threshold = 0.5
 
+    # Set up mock returns
     mock_read_csv.return_value = sample_kpi_mapping
     mock_listdir.return_value = ["test_file.json"]
     mock_open.return_value.read = json.dumps(sample_json_data)
-    mock_get_inference.return_value = (1, 0.95)
+    mock_get_batch_inference.return_value = (
+        [1, 0],
+        [0.95, 0.3],
+    )  # mock labels and probabilities
 
     with patch("json.load", return_value=sample_json_data):
         with patch("pandas.DataFrame.merge", return_value=sample_merged_dataframe):
-            run_full_inference(
-                folder_path,
-                kpi_mapping_path,
-                output_path,
-                model_path,
-                tokenizer_path,
-                threshold,
-            )
+            with patch("pandas.DataFrame.to_excel", mock_to_excel):
+                run_full_inference(
+                    folder_path,
+                    kpi_mapping_path,
+                    output_path,
+                    model_path,
+                    tokenizer_path,
+                    batch_size,
+                    threshold,
+                )
 
+    # Assertions
     mock_read_csv.assert_called_once_with(kpi_mapping_path)
     mock_listdir.assert_called_once_with(folder_path)
     mock_open.assert_called_once_with(Path(folder_path) / "test_file.json", "r")
-    assert mock_get_inference.call_count == len(sample_merged_dataframe)
+    # Check if batch inference was called with expected count
+    assert (
+        mock_get_batch_inference.call_count
+        == (len(sample_merged_dataframe) + batch_size - 1) // batch_size
+    )
     assert mock_to_excel.call_count == 1
 
     output_file_path = Path(output_path) / "test_file.xlsx"
     mock_to_excel.assert_called_once_with(output_file_path, index=False)
-
-    # Ensure no files were created
-    mock_open().write.assert_not_called()