diff --git a/src/osc_transformer_based_extractor/kpi_detection/cli_kpi_detection.py b/src/osc_transformer_based_extractor/kpi_detection/cli_kpi_detection.py
index 8cae7d4..e9344d1 100644
--- a/src/osc_transformer_based_extractor/kpi_detection/cli_kpi_detection.py
+++ b/src/osc_transformer_based_extractor/kpi_detection/cli_kpi_detection.py
@@ -1,4 +1,5 @@
 import typer
+import os
 from .train_kpi_detection import (
     train_kpi_detection,
     check_output_dir,
@@ -43,6 +44,7 @@ def fine_tune_qna(
     output_dir: str = typer.Argument(
         ..., help="Directory to save the fine-tuned model."
     ),
+    export_model_name: str = typer.Argument(..., help="Name of the model to export."),
     save_steps: int = typer.Argument(
         ..., help="Number of steps between saving model checkpoints."
     ),
@@ -59,10 +61,13 @@
         batch_size=batch_size,
         learning_rate=learning_rate,
         output_dir=output_dir,
+        export_model_name=export_model_name,
         save_steps=save_steps,
     )
-
-    typer.echo(f"Model '{model_name}' trained and saved successfully at {output_dir}")
+    saved_model_path = os.path.join(output_dir, export_model_name)
+    typer.echo(
+        f"Model '{model_name}' trained and saved successfully at {saved_model_path}"
+    )
 
 
 @kpi_detection_app.command("inference")
@@ -76,6 +81,7 @@ def inference_qna(
     model_path: str = typer.Argument(
         ..., help="Path to the pre-trained model directory OR name on huggingface."
     ),
+    batch_size: int = typer.Argument(16, help="The batch size for inference."),
 ):
     """Perform inference using a pre-trained model on a dataset of kpis and contexts, saving an output Excel file."""
     try:
@@ -86,6 +92,7 @@
             data_file_path=data_file_path,
             output_path=output_path,
             model_path=model_path,
+            batch_size=batch_size,
         )
 
         typer.echo("Inference completed successfully!")
diff --git a/src/osc_transformer_based_extractor/kpi_detection/inference_kpi_detection.py b/src/osc_transformer_based_extractor/kpi_detection/inference_kpi_detection.py
index 5389b67..b311d4f 100644
--- a/src/osc_transformer_based_extractor/kpi_detection/inference_kpi_detection.py
+++ b/src/osc_transformer_based_extractor/kpi_detection/inference_kpi_detection.py
@@ -3,11 +3,12 @@
 """
 
 import os
+import torch
 import pandas as pd
 from pathlib import Path
 from tqdm import tqdm
-import torch
 from transformers import pipeline, AutoConfig
+from transformers.pipelines import QuestionAnsweringPipeline
 
 
 def resolve_model_path(model_path: str):
@@ -48,29 +49,32 @@ def validate_path_exists(path: str, which_path: str):
         raise ValueError(f"{which_path}: {path} does not exist.")
 
 
-def get_inference_kpi_detection(question: str, context: str, model_path: str, device):
+def get_batch_inference_kpi_detection(
+    questions, contexts, question_answerer: QuestionAnsweringPipeline, batch_size
+):
     """
-    Performs kpi-detection inference using a specified model.
+    Perform batch inference using the question-answering pipeline.
 
     Args:
-        question (str): The question to be answered.
-        context (str): The context in which to find the answer.
-        model_path (str): Path to the pre-trained model to be used for inference.
+        questions (list): List of questions.
+        contexts (list): List of contexts.
+        question_answerer (QuestionAnsweringPipeline): The question-answering pipeline.
+        batch_size (int): The batch size for inference.
 
     Returns:
-        tuple: A tuple containing:
-            - answer (str): The predicted answer.
-            - score (float): The confidence score of the prediction.
-            - start (int): The start position of the answer in the context.
-            - end (int): The end position of the answer in the context.
+        list of dict: List of dictionaries containing answers, scores, and positions.
     """
-    question_answerer = pipeline("question-answering", model=model_path, device=device)
-    result = question_answerer(question=question, context=context)
-    return result["answer"], result["score"], result["start"], result["end"]
+    results = question_answerer(
+        question=questions, context=contexts, batch_size=batch_size
+    )
+    return results
 
 
 def run_full_inference_kpi_detection(
-    data_file_path: str, output_path: str, model_path: str
+    data_file_path: str,
+    output_path: str,
+    model_path: str,
+    batch_size: int,
 ):
     """
     Runs full inference on a dataset of questions and contexts, and saves the results.
@@ -80,6 +84,7 @@
             The dataset should have columns 'question' and 'context'.
         output_path (str): Path to the directory where the output Excel file will be saved.
         model_path (str): Path to the pre-trained model to be used for inference.
+        batch_size (int): The batch size for inference.
 
     Returns:
         None: The function saves the resulting DataFrame to an Excel file and prints a success message.
@@ -90,7 +95,6 @@
 
     data = pd.read_csv(data_file_path)
 
-    # Dynamically detect the device: CUDA, MPS, or CPU
     if torch.cuda.is_available():
         device = torch.device("cuda")  # Use NVIDIA GPU
         print("Using CUDA GPU")
@@ -101,19 +105,38 @@
         device = torch.device("cpu")  # Fallback to CPU
         print("Using CPU")
 
-    result = []
-    for _, row in tqdm(data.iterrows(), total=data.shape[0], desc="Processing Rows"):
-        question = row["question"]
-        context = row["context"]
-        answer, score, start, end = get_inference_kpi_detection(
-            question, context, model_path, device
-        )
-        result.append(
-            {"predicted_answer": answer, "start": start, "end": end, "score": score}
-        )
+    # Initialize the question-answering pipeline
+    question_answerer = pipeline("question-answering", model=model_path, device=device)
 
-    df = pd.DataFrame(result)
+    results = []
+
+    # Process in batches
+    for start_idx in tqdm(
+        range(0, data.shape[0], batch_size), desc="Processing Batches"
+    ):
+        end_idx = min(start_idx + batch_size, data.shape[0])
+        batch_questions = data["question"].iloc[start_idx:end_idx].tolist()
+        batch_contexts = data["context"].iloc[start_idx:end_idx].tolist()
+
+        # Perform batch inference
+        batch_results = get_batch_inference_kpi_detection(
+            questions=batch_questions,
+            contexts=batch_contexts,
+            question_answerer=question_answerer,
+            batch_size=batch_size,
+        )
+        for result in batch_results:
+            results.append(
+                {
+                    "predicted_answer": result["answer"],
+                    "start": result["start"],
+                    "end": result["end"],
+                    "score": result["score"],
+                }
+            )
+
+    df = pd.DataFrame(results)
     combined_df = pd.concat([data, df], axis=1)
 
     file_name = Path(output_path) / "output.xlsx"
diff --git a/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py b/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py
index 414cb7c..16c334c 100644
--- a/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py
+++ b/src/osc_transformer_based_extractor/kpi_detection/train_kpi_detection.py
@@ -34,7 +34,6 @@
 )
 import torch
 import numpy as np
-from datetime import datetime
 from sklearn.model_selection import train_test_split
 
 
@@ -92,6 +91,7 @@ def train_kpi_detection(
     batch_size,
     learning_rate,
     output_dir,
+    export_model_name,
     save_steps,
 ):
     """
@@ -105,6 +105,7 @@
         batch_size (int): Batch size for training.
         learning_rate (float): Learning rate for trainig
         output_dir (str): Directory where the model will be saved during training.
+        export_model_name (str): Name under which the trained model will be exported.
         save_steps (int): Number of steps before saving the model during training.
     """
     # Load the data
@@ -229,8 +230,7 @@ def preprocess_function(examples, max_length):
     data_collator = DefaultDataCollator()
 
     # Get the current timestamp
-    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    saved_model_path = os.path.join(output_dir, f"{model_name}_{timestamp}")
+    saved_model_path = os.path.join(output_dir, export_model_name)
     os.makedirs(saved_model_path, exist_ok=True)
 
     checkpoint_dir = os.path.join(saved_model_path, "checkpoints")
diff --git a/src/osc_transformer_based_extractor/relevance_detector/cli_relevance_detector.py b/src/osc_transformer_based_extractor/relevance_detector/cli_relevance_detector.py
index 639f8c1..60ca152 100644
--- a/src/osc_transformer_based_extractor/relevance_detector/cli_relevance_detector.py
+++ b/src/osc_transformer_based_extractor/relevance_detector/cli_relevance_detector.py
@@ -1,4 +1,5 @@
 import typer
+import os
 from .fine_tune import (
     check_csv_columns,
     check_output_dir,
@@ -43,6 +44,7 @@ def fine_tune(
     output_dir: str = typer.Argument(
         ..., help="Directory to save the fine-tuned model."
     ),
+    export_model_name: str = typer.Argument(..., help="Name of the model to export."),
     save_steps: int = typer.Argument(
         ..., help="Number of steps between saving model checkpoints."
     ),
@@ -60,10 +62,13 @@
         batch_size=batch_size,
         learning_rate=learning_rate,
         output_dir=output_dir,
+        export_model_name=export_model_name,
         save_steps=save_steps,
     )
-
-    typer.echo(f"Model '{model_name}' trained and saved successfully at {output_dir}")
+    saved_model_path = os.path.join(output_dir, export_model_name)
+    typer.echo(
+        f"Model '{model_name}' trained and saved successfully at {saved_model_path}"
+    )
 
 
 @relevance_detector_app.command("inference")
@@ -81,6 +86,7 @@ def inference(
     tokenizer_path: str = typer.Argument(
         ..., help="Path to the tokenizer directory OR name on huggingface."
     ),
+    batch_size: int = typer.Argument(16, help="Batch size used to process the rows."),
     threshold: float = typer.Argument(
         0.5, help="Threshold value for prediction confidence."
     ),
@@ -97,6 +103,7 @@
             output_path=output_path,
             model_path=model_path,
             tokenizer_path=tokenizer_path,
+            batch_size=batch_size,
             threshold=threshold,
         )
diff --git a/src/osc_transformer_based_extractor/relevance_detector/fine_tune.py b/src/osc_transformer_based_extractor/relevance_detector/fine_tune.py
index 4ecca64..5b85193 100644
--- a/src/osc_transformer_based_extractor/relevance_detector/fine_tune.py
+++ b/src/osc_transformer_based_extractor/relevance_detector/fine_tune.py
@@ -10,6 +10,7 @@
 """
 
 import os
+import shutil
 import pandas as pd
 import torch
 from transformers import (
@@ -150,6 +151,7 @@ def fine_tune_model(
     batch_size,
     learning_rate,
     output_dir,
+    export_model_name,
     save_steps,
 ):
     """
@@ -164,6 +166,7 @@
         batch_size (int): Batch size for training.
         learning_rate (float): Learning rate for trainig
         output_dir (str): Directory where the model will be saved during training.
+        export_model_name (str): Name under which the trained model will be exported.
         save_steps (int): Number of steps before saving the model during training.
""" # Load your dataset into a pandas DataFrame @@ -210,11 +213,14 @@ def fine_tune_model( device, ) - saved_model_path = os.path.join(output_dir, "saved_model") + saved_model_path = os.path.join(output_dir, export_model_name) + os.makedirs(saved_model_path, exist_ok=True) + + checkpoint_dir = os.path.join(saved_model_path, "checkpoints") os.makedirs(saved_model_path, exist_ok=True) training_args = TrainingArguments( - output_dir=saved_model_path, + output_dir=checkpoint_dir, evaluation_strategy="epoch", # Evaluate at the end of each epoch logging_dir="./logs", # Directory for logs logging_steps=10, # Log every 10 steps @@ -228,7 +234,7 @@ def fine_tune_model( save_strategy="epoch", load_best_model_at_end=True, metric_for_best_model="eval_loss", - greater_is_better=True, + greater_is_better=False, save_total_limit=1, ) @@ -243,6 +249,12 @@ def fine_tune_model( # Start Training trainer.train() + # Save the final trained model and config + trainer.save_model(saved_model_path) + + # Save the tokenizer manually + tokenizer.save_pretrained(saved_model_path) + # Evaluate the model eval_result = trainer.evaluate(eval_dataset) print("Evaluation results:") @@ -269,3 +281,8 @@ def fine_tune_model( print(f"Input: {tokenizer.decode(input_ids, skip_special_tokens=True)}") print(f"True Label: {true_label}, Predicted Label: {predicted_label}") print("\n") + + try: + shutil.rmtree(checkpoint_dir) + except OSError: + pass diff --git a/src/osc_transformer_based_extractor/relevance_detector/inference.py b/src/osc_transformer_based_extractor/relevance_detector/inference.py index 73b32a5..6e007dc 100644 --- a/src/osc_transformer_based_extractor/relevance_detector/inference.py +++ b/src/osc_transformer_based_extractor/relevance_detector/inference.py @@ -1,8 +1,3 @@ -"""This module contains utility functions for performing inference using pre-trained sequence classification models.""" - -# Module: inference -# Author: Tanishq-ids - import os import torch import pandas as pd @@ -12,30 +7,6 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig -def resolve_model_path(model_path: str): - """ - Resolves whether the given `model_path` is a Hugging Face model name or a local system path. - - - If the `model_path` refers to a Hugging Face model (e.g., "bert-base-uncased"), the function will return the - model name as a string. - - If the `model_path` refers to a valid local system path, the function will convert it into a `Path` object. - - If neither, the function raises a `ValueError`. - """ - - # Check if it's a local path - if os.path.exists(model_path): - return Path(model_path) - - # Check if it's a Hugging Face model name - try: - AutoConfig.from_pretrained(model_path) - return model_path # It's a Hugging Face model name, return as string - except Exception: - raise ValueError( - f"{model_path} is neither a valid Hugging Face model nor a local file path." - ) - - def validate_path_exists(path: str, which_path: str): """ Validate if the given path exists. @@ -50,54 +21,36 @@ def validate_path_exists(path: str, which_path: str): raise ValueError(f"{which_path}: {path} does not exist.") -def get_inference( - question: str, context: str, model_path: str, tokenizer_path: str, threshold: float -): +def resolve_model_path(model_path: str): """ - Perform inference using a pre-trained sequence classification model. - - Parameters: - question (str): The question for inference. - context (str): The context to be analyzed. 
-        model_path (str): Path to the pre-trained model directory OR name on huggingface.
-        tokenizer_path (str): Path to the tokenizer directory OR name on huggingface.
-        threshold (float): The threshold for the inference score.
-
-    Returns:
-        int: Predicted label ID (0 or 1).
-        float: class probability
+    Resolves whether the given `model_path` is a Hugging Face model name or a local system path.
     """
-    model_path = str(Path(model_path))
-    tokenizer_path = str(Path(tokenizer_path))
-
-    # Dynamically detect the device: CUDA, MPS, or CPU
-    if torch.cuda.is_available():
-        device = torch.device("cuda")  # Use NVIDIA GPU
-        print("Using CUDA GPU")
-    elif torch.backends.mps.is_available():
-        device = torch.device("mps")  # Use Apple Silicon GPU (MPS)
-        print("Using MPS (Apple Silicon GPU)")
-    else:
-        device = torch.device("cpu")  # Fallback to CPU
-        print("Using CPU")
-
-    print(f"Using device: {device}")  # Print device to confirm
+    if os.path.exists(model_path):
+        return Path(model_path)
+    try:
+        AutoConfig.from_pretrained(model_path)
+        return model_path
+    except Exception:
+        raise ValueError(
+            f"{model_path} is neither a valid Hugging Face model nor a local file path."
+        )
 
-    # Load model and tokenizer
-    model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
 
-    # Tokenize inputs
+def get_batch_inference(questions, contexts, model, tokenizer, device, threshold):
+    """
+    Perform batch inference using the model and tokenizer.
+    """
+    # Tokenize the batch of questions and contexts
     inputs = tokenizer(
-        question,
-        context,
+        questions,
+        contexts,
         return_tensors="pt",
         truncation=True,
         padding=True,
         max_length=512,
     )
 
-    # Move tokenized inputs to the same device as the model
+    # Move inputs to the device
     inputs = {key: val.to(device) for key, val in inputs.items()}
 
     # Forward pass
@@ -108,11 +61,10 @@
     probabilities = torch.softmax(outputs.logits, dim=1)
 
     # Probability of the positive class (label 1)
-    positive_class_prob = probabilities[0, 1].item()
+    positive_class_probs = probabilities[:, 1].cpu().numpy()
+    labels = (positive_class_probs >= threshold).astype(int)
 
-    label = 1 if positive_class_prob >= threshold else 0
-
-    return label, positive_class_prob
+    return labels, positive_class_probs
 
 
 def run_full_inference(
@@ -121,35 +73,38 @@
     output_path: str,
     model_path: str,
     tokenizer_path: str,
+    batch_size: int,
     threshold: float,
 ):
     """
     Perform inference on JSON files in a specified folder and save the results to Excel files.
-
-    This function reads JSON files from a specified folder, processes the data, merges it with a KPI mapping,
-    performs inference using a specified model and tokenizer, and saves the results to Excel files.
-
-    Args:
-        json_folder_path (str): Path to the folder containing JSON files to process.
-        kpi_mapping_path (str): Path to the CSV file containing KPI mappings.
-        output_path (str): Path to the folder where the output Excel files will be saved.
-        model_path (str): Path to the model used for inference (local or Huggingface).
-        tokenizer_path (str): Path to the tokenizer used for inference (local or Huggingface).
-        threshold (float): Threshold value for the inference process.
-
-    Returns:
-        None
-
-    Raises:
-        Exception: If there is an error reading or processing the JSON files, or saving the Excel files.
""" kpi_mapping_path = str(Path(kpi_mapping_path)) json_folder_path = str(Path(json_folder_path)) output_path = str(Path(output_path)) + + # Resolve model and tokenizer paths model_path = resolve_model_path(model_path) tokenizer_path = resolve_model_path(tokenizer_path) - # Read the KPI mapping outside the loop + # Load model and tokenizer once + # Dynamically detect the device: CUDA, MPS, or CPU + if torch.cuda.is_available(): + device = torch.device("cuda") # Use NVIDIA GPU + print("Using CUDA GPU") + elif torch.backends.mps.is_available(): + device = torch.device("mps") # Use Apple Silicon GPU (MPS) + print("Using MPS (Apple Silicon GPU)") + else: + device = torch.device("cpu") # Fallback to CPU + print("Using CPU") + + print(f"Using device: {device}") # Print device to confirm + model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device) + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True) + model.eval() # Set model to evaluation mode + + # Read the KPI mapping kpi_mapping = pd.read_csv(kpi_mapping_path) kpi_mapping = kpi_mapping[["kpi_id", "question"]] @@ -189,19 +144,18 @@ def run_full_inference( labels = [] probs = [] - # Iterate over the rows of the DataFrame and perform inference - for _, row in tqdm(merged_df.iterrows(), total=merged_df.shape[0]): - question = row["question"] - context = row["paragraph"] - label, prob = get_inference( - question=question, - context=context, - model_path=model_path, - tokenizer_path=tokenizer_path, - threshold=threshold, + # Perform inference in batches + for start_idx in tqdm(range(0, merged_df.shape[0], batch_size)): + end_idx = min(start_idx + batch_size, merged_df.shape[0]) + batch_questions = merged_df["question"].iloc[start_idx:end_idx].tolist() + batch_contexts = merged_df["paragraph"].iloc[start_idx:end_idx].tolist() + + batch_labels, batch_probs = get_batch_inference( + batch_questions, batch_contexts, model, tokenizer, device, threshold ) - labels.append(label) - probs.append(prob) + + labels.extend(batch_labels) + probs.extend(batch_probs) # Add results to the DataFrame merged_df["paragraph_relevance_flag"] = labels diff --git a/tests/osc_transformer_based_extractor/relevance_detector/test_inference.py b/tests/osc_transformer_based_extractor/relevance_detector/test_inference.py index d32c90f..733e7bb 100644 --- a/tests/osc_transformer_based_extractor/relevance_detector/test_inference.py +++ b/tests/osc_transformer_based_extractor/relevance_detector/test_inference.py @@ -9,10 +9,11 @@ from unittest.mock import patch, mock_open, MagicMock from pathlib import Path import pandas as pd +import numpy as np import torch import pytest from osc_transformer_based_extractor.relevance_detector.inference import ( - get_inference, + get_batch_inference, run_full_inference, ) @@ -33,54 +34,43 @@ @patch( "osc_transformer_based_extractor.relevance_detector.inference.AutoTokenizer.from_pretrained" ) -def test_get_inference(mock_tokenizer, mock_model): - """Test the get_inference function for inference correctness.""" +def test_get_batch_inference(): + """Test the get_batch_inference function for inference correctness.""" + # Mock tokenizer and model tokenizer_mock = MagicMock() model_mock = MagicMock() - mock_tokenizer.return_value = tokenizer_mock - mock_model.return_value = model_mock - # Configure the tokenizer mock - tokenizer_mock.encode_plus.return_value = { - "input_ids": torch.tensor([[101, 102]]), - "attention_mask": torch.tensor([[1, 1]]), + # Configure the tokenizer mock for batch inputs + 
+        "input_ids": torch.tensor([[101, 102], [101, 103]]),
+        "attention_mask": torch.tensor([[1, 1], [1, 1]]),
     }
 
     # Configure the model mock to return logits tensor
     model_output_mock = MagicMock()
-    model_output_mock.logits = torch.tensor([[0.1, 0.9]])
+    model_output_mock.logits = torch.tensor([[0.1, 0.9], [0.7, 0.3]])
     model_mock.return_value = model_output_mock
 
-    # Dummy question and context
-    question = "What is the capital of France?"
-    context = "Paris is the capital of France."
+    # Dummy questions and contexts
+    questions = ["What is the capital of France?", "What is the capital of Germany?"]
+    contexts = ["Paris is the capital of France.", "Berlin is the capital of Germany."]
 
     # Test inference
-    predicted_label_id, class_prob = get_inference(
-        question, context, model_path_valid, tokenizer_path_valid, threshold=0.7
+    labels, positive_class_probs = get_batch_inference(
+        questions, contexts, model_mock, tokenizer_mock, device="cpu", threshold=0.7
     )
 
     # Assertions
-    assert isinstance(predicted_label_id, int)
-    assert isinstance(class_prob, float)
-
-    # Test different inputs
-    tokenizer_mock.encode_plus.return_value = {
-        "input_ids": torch.tensor([[101, 103]]),
-        "attention_mask": torch.tensor([[1, 1]]),
-    }
-    model_output_mock.logits = torch.tensor([[0.7, 0.3]])
-    predicted_label_id, class_prob = get_inference(
-        "What is the capital of Germany?",
-        "Berlin is the capital of Germany.",
-        model_path_valid,
-        tokenizer_path_valid,
-        threshold=0.7,
+    assert isinstance(labels, np.ndarray) and np.issubdtype(labels.dtype, np.integer)
+    assert (
+        isinstance(positive_class_probs, np.ndarray)
+        and np.issubdtype(positive_class_probs.dtype, np.floating)
     )
-    assert isinstance(predicted_label_id, int)
-    assert isinstance(class_prob, float)
+    # Check correct shapes and values
+    assert labels.shape[0] == len(questions)
+    assert positive_class_probs.shape[0] == len(questions)
 
 
 @pytest.fixture
@@ -135,7 +125,7 @@ def sample_merged_dataframe(sample_dataframe, sample_kpi_mapping):
 @patch("pandas.DataFrame.to_excel")
 def test_run_full_inference(
     mock_to_excel,
-    mock_get_inference,
+    mock_get_batch_inference,
     mock_listdir,
     mock_read_csv,
     mock_open,
@@ -149,32 +139,41 @@
     output_path = "output_folder"
     model_path = "model_path"
    tokenizer_path = "tokenizer_path"
+    batch_size = 2
     threshold = 0.5
 
+    # Set up mock returns
     mock_read_csv.return_value = sample_kpi_mapping
     mock_listdir.return_value = ["test_file.json"]
     mock_open.return_value.read = json.dumps(sample_json_data)
-    mock_get_inference.return_value = (1, 0.95)
+    mock_get_batch_inference.return_value = (
+        [1, 0],
+        [0.95, 0.3],
+    )  # mock labels and probabilities
 
     with patch("json.load", return_value=sample_json_data):
         with patch("pandas.DataFrame.merge", return_value=sample_merged_dataframe):
-            run_full_inference(
-                folder_path,
-                kpi_mapping_path,
-                output_path,
-                model_path,
-                tokenizer_path,
-                threshold,
-            )
+            with patch("pandas.DataFrame.to_excel", mock_to_excel):
+                run_full_inference(
+                    folder_path,
+                    kpi_mapping_path,
+                    output_path,
+                    model_path,
+                    tokenizer_path,
+                    batch_size,
+                    threshold,
+                )
 
+    # Assertions
     mock_read_csv.assert_called_once_with(kpi_mapping_path)
     mock_listdir.assert_called_once_with(folder_path)
     mock_open.assert_called_once_with(Path(folder_path) / "test_file.json", "r")
-    assert mock_get_inference.call_count == len(sample_merged_dataframe)
+    # Check if batch inference was called with expected count
+    assert (
+        mock_get_batch_inference.call_count
+        == (len(sample_merged_dataframe) + batch_size - 1) // batch_size
+    )
     assert mock_to_excel.call_count == 1
 
     output_file_path = Path(output_path) / "test_file.xlsx"
     mock_to_excel.assert_called_once_with(output_file_path, index=False)
-
-    # Ensure no files were created
-    mock_open().write.assert_not_called()