Refactor codebase (#2)
xingjian-zhang authored Jun 18, 2024
1 parent b329f77 commit 7fd1fdd
Showing 15 changed files with 154 additions and 157 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pycodestyle.yml
@@ -1,6 +1,6 @@
name: Pycodestyle

-on: [pull_request, workflow_dispatch, push]
+on: [pull_request, workflow_dispatch]

jobs:
build:
2 changes: 1 addition & 1 deletion .github/workflows/pydocstyle.yml
@@ -1,6 +1,6 @@
name: Pydocstyle

-on: [pull_request, workflow_dispatch, push]
+on: [pull_request, workflow_dispatch]

jobs:
build:
2 changes: 1 addition & 1 deletion .github/workflows/pylint.yml
@@ -1,6 +1,6 @@
name: Pylint

-on: [pull_request, workflow_dispatch, push]
+on: [pull_request, workflow_dispatch]

jobs:
build:
27 changes: 15 additions & 12 deletions benchmark/aspect_prediction/README.md
@@ -10,7 +10,7 @@ python benchmark/aspect_prediction/task.py --model <model> --prompt <prompt_styl

where:

-- `<model>` is chosen from `gpt-35-turbo`, `gpt-4`, `mistral-8x7b`.
+- `<model>` is chosen from `gpt-35-turbo`, `gpt-4`, `mixtral-8x7b`.
- `<prompt_style>` is chosen from `zero-shot`, `few-shot`, `chain-of-thought`, `few-shot-cot`.

> We provide the benchmark output through a Dropbox link
@@ -37,23 +37,23 @@ python benchmark/aspect_prediction/eval.py --model_output_dir benchmark/aspect_p

## Adding a Custom Model to MASSW/API

-To extend the functionality of MASSW by adding custom API scripts for additional models, follow these guidelines. This will allow your model to integrate seamlessly with the existing framework used for aspect prediction and evaluation.
+To extend the functionality of MASSW by adding custom model scripts for additional models, follow these guidelines. This will allow your model to integrate seamlessly with the existing framework used for aspect prediction and evaluation.

#### 1. **Location for API Scripts**

-Place your custom API scripts in the `massw/api` directory. This should be similar in structure and design to the existing scripts:
+Place your custom model scripts in the `massw/models` directory. This should be similar in structure and design to the existing scripts:

-- `massw/api/api_gpt.py`
-- `massw/api/api_mistral.py`
+- `massw/models/gpt_azure.py`
+- `massw/models/mixtral_azure.py`

#### 2. **Required Functions**

Each API script must include two essential functions:

-- **`prompts_to_raw_output_<model_name>`**: This function processes prompts and generates raw outputs.
+- **`prompts_to_raw_output`**: This function processes prompts and generates raw outputs.

```python
-def prompts_to_raw_output_<model_name>(messages: List[Tuple[str, str]], **other_arguments) -> pd.DataFrame:
+def prompts_to_raw_output(messages: List[Tuple[str, str]], **other_arguments) -> pd.DataFrame:
"""
Process prompts to generate raw outputs.
@@ -67,16 +67,16 @@ def prompts_to_raw_output_<model_name>(messages: List[Tuple[str, str]], **other_
pass
```

-- **`raw_output_to_dict_<model_name>`**: This function parses raw outputs into a dictionary format.
+- **`raw_output_to_dict`**: This function parses raw outputs into a dictionary format.

```python
-def raw_output_to_dict_<model_name>(output_path: str) -> Dict[str, str]:
+def raw_output_to_dict(output_path: str) -> Dict[str, str]:
"""
Convert raw outputs into a dictionary mapping from paper ID to output.
Parameters:
- output_path (str): The file path to the output directory where the results are stored.
Returns:
- Dict[str, str]: A dictionary mapping each paper ID to its corresponding output.
"""
    pass
```

@@ -85,4 +85,7 @@ def prompts_to_raw_output_<model_name>(messages: List[Tuple[str, str]], **other_
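As a concrete illustration of the two required functions, here is a minimal sketch of a hypothetical `massw/models/my_model.py`. The module name, the `call_my_model` placeholder, and the TSV layout with `pid` and `output` columns are assumptions for illustration, not part of this commit:

```python
"""Hypothetical massw/models/my_model.py -- an illustrative sketch only."""
from typing import Dict, List, Tuple

import pandas as pd


def call_my_model(message) -> str:
    """Placeholder inference call; swap in your backend's real API."""
    return "model response"


def prompts_to_raw_output(messages: List[Tuple[str, str]],
                          **other_arguments) -> pd.DataFrame:
    """Process prompts to generate raw outputs, one row per paper ID."""
    rows = []
    for pid, message in messages:
        rows.append({"pid": pid, "output": call_my_model(message)})
    return pd.DataFrame(rows)


def raw_output_to_dict(output_path: str) -> Dict[str, str]:
    """Convert raw outputs into a dict mapping paper ID to output.

    Assumes the results were saved as a TSV with `pid` and `output`
    columns, mirroring the existing GPT/Mixtral scripts.
    """
    df = pd.read_csv(output_path, sep="\t")
    return dict(zip(df["pid"].astype(str), df["output"]))
```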

#### 3. **Modify the Task Processing Function**

-Update the `process_task` function in `benchmark/aspect_prediction/task.py` to handle your custom model by calling your new API functions. Additionally, adapt the `postprocess_output` function in `benchmark/aspect_observer/eval.py` to support the evaluation of your model's outputs.
+Update the `process_task` function in `benchmark/aspect_prediction/task.py` to
+handle your custom model. Additionally, adapt the `postprocess_output` function
+in `benchmark/aspect_observer/eval.py` to support the evaluation of your
+model's outputs.
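For step 3, the dispatch inside `process_task` might grow a branch like the sketch below; the `my-model` name and `massw.models.my_model` module are hypothetical, while the `mixtral-8x7b` and GPT branches mirror the refactored `task.py` in this commit:

```python
# Sketch of the model dispatch inside process_task (task.py).
# The "my-model" branch and massw.models.my_model module are hypothetical.
from massw.models import gpt_azure, mixtral_azure


def dispatch_model(model, messages, tpm=None):
    """Route a batch of (pid, message) tuples to the matching backend."""
    if model == "mixtral-8x7b":
        return mixtral_azure.prompts_to_raw_output(messages)
    if model in ("gpt-35-turbo", "gpt-4"):
        return gpt_azure.prompts_to_raw_output(messages, model, tpm)
    if model == "my-model":  # hypothetical custom backend
        from massw.models import my_model
        return my_model.prompts_to_raw_output(messages)
    raise ValueError(f"Model {model} not supported.")
```

Remember to also add the new name to `MODEL_CHOICES` in `benchmark/aspect_prediction/utils.py` so the argument check in `main()` accepts it; then something like `python benchmark/aspect_prediction/task.py --model my-model --prompt zero-shot` would exercise the new backend.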
20 changes: 10 additions & 10 deletions benchmark/aspect_prediction/eval.py
@@ -1,14 +1,14 @@
"""Evaluate experiment results based on the model generated output (file)."""
-import pandas as pd
-from massw.metrics import compute_metrics, flatten_metrics
-from massw.api.api_gpt import raw_output_to_dict_gpt
-from massw.api.api_mistral import raw_output_to_dict_mistral
-from utils import postprocess_cot, TASK_NAMES, TASK2GT

import argparse
import json
import sys

import nest_asyncio
+import pandas as pd
+from utils import TASK2GT, TASK_NAMES, postprocess_cot
+
+from massw.metrics import compute_metrics, flatten_metrics
+from massw.models import gpt_azure, mixtral_azure

sys.path.append("..")
nest_asyncio.apply()
@@ -40,9 +40,9 @@ def postprocess_output(model_output_dir,
model_path = f"{model_output_dir}/{task_name}.tsv"

if model_type == "gpt":
-        id2predictions = raw_output_to_dict_gpt(model_path)
-    elif model_type == "mistral":
-        id2predictions = raw_output_to_dict_mistral(model_path)
+        id2predictions = gpt_azure.raw_output_to_dict(model_path)
+    elif model_type == "mixtral":
+        id2predictions = mixtral_azure.raw_output_to_dict(model_path)
else:
raise ValueError(f"Model type {model_type} not supported.")

@@ -80,7 +80,7 @@ def main():
help="Used COT.",
)
args = parser.parse_args()
-    model_type = "gpt" if "gpt" in args.model_output_dir else "mistral"
+    model_type = "gpt" if "gpt" in args.model_output_dir else "mixtral"

results = postprocess_output(
args.model_output_dir,
122 changes: 57 additions & 65 deletions benchmark/aspect_prediction/task.py
@@ -6,22 +6,18 @@
of test data asynchronously.
"""

-from massw.api.api_gpt import prompts_to_raw_output_gpt
-from massw.api.api_mistral import prompts_to_raw_output_mistral
-
-from prompts import (
-    future_work_recommendation, idea_generation, method_recommendation,
-    outcome_prediction, predict_title, SYSTEM_PROMPT
-)
-from utils import (
-    allow_self_signed_https, load_examples,
-    MODEL_CHOICES, PROMPT_CHOICES, save_results
-)

import argparse
import os
import sys

import jsonlines as jl
+from prompts import (SYSTEM_PROMPT, future_work_recommendation,
+                     idea_generation, method_recommendation,
+                     outcome_prediction, predict_title)
+from utils import (MODEL_CHOICES, PROMPT_CHOICES, allow_self_signed_https,
+                   load_examples, save_results)
+
+from massw.models import gpt_azure, mixtral_azure

sys.path.append("../..")

@@ -33,23 +29,32 @@
def prepare_messages(model, task_name, prompt_type, main_prompt):
"""Prepare the messages based on the task and prompt type."""
messages = [{"role": "system", "content": SYSTEM_PROMPT}]
-    if model == "mistral-8x7b":
+    if model == "mixtral-8x7b":
format_instruction = "\nNever use double quotes in your output. \
Use single quotes instead.\n"
-        messages = [
-            {"role": "user", "content": SYSTEM_PROMPT + format_instruction},
-            {"role": "assistant", "content": "I got it. \
-Please give me further instructions!"}
-        ]

+        messages = [{
+            "role": "user",
+            "content": SYSTEM_PROMPT + format_instruction
+        }, {
+            "role":
+            "assistant",
+            "content":
+            "I got it. \
+Please give me further instructions!"
+        }]

if prompt_type in ["few-shot", "few-shot-cot"]:
examples = few_shot_examples if prompt_type == "few-shot"\
else cot_examples
for example in examples.get(task_name, []):
-            messages.extend([
-                {"role": "user", "content": example["user"]},
-                {"role": "assistant", "content": example["assistant"]}
-            ])
+            messages.extend([{
+                "role": "user",
+                "content": example["user"]
+            }, {
+                "role": "assistant",
+                "content": example["assistant"]
+            }])

if prompt_type == "chain-of-thought":
main_prompt += "Let's think step by step. \
@@ -67,19 +72,16 @@ def process_task(generate_prompt_fn, test_cases, task_name, **kwargs):
messages = []
for entry in test_cases:
main_prompt, _ = generate_prompt_fn(entry)
-        message = prepare_messages(kwargs['model'],
-                                   task_name,
-                                   kwargs['prompt_type'],
-                                   main_prompt)
+        message = prepare_messages(kwargs['model'], task_name,
+                                   kwargs['prompt_type'], main_prompt)
messages.append((entry['pid'], message))

model = kwargs['model']
-    if model == "mistral-8x7b":
-        chat_results = prompts_to_raw_output_mistral(messages)
+    if model == "mixtral-8x7b":
+        chat_results = mixtral_azure.prompts_to_raw_output(messages)
elif model in ["gpt-35-turbo", "gpt-4"]:
-        chat_results = prompts_to_raw_output_gpt(messages,
-                                                 model,
-                                                 kwargs.get('tpm'))
+        chat_results = gpt_azure.prompts_to_raw_output(messages, model,
+                                                       kwargs.get('tpm'))
else:
raise ValueError(f"Model {model} not supported. \
You can modify the code here \
@@ -95,18 +97,10 @@ def main():
parser.add_argument("--test_data",
type=str,
default="data/benchmark_0531.jsonl")
parser.add_argument("--output_dir",
type=str,
default=False)
parser.add_argument("--model",
type=str,
default="gpt-35-turbo")
parser.add_argument("--prompt",
type=str,
default="zero-shot")
parser.add_argument("--num_samples",
type=int,
default=5)
parser.add_argument("--output_dir", type=str, default=False)
parser.add_argument("--model", type=str, default="gpt-35-turbo")
parser.add_argument("--prompt", type=str, default="zero-shot")
parser.add_argument("--num_samples", type=int, default=5)
args = parser.parse_args()

if args.model not in MODEL_CHOICES:
@@ -118,38 +112,36 @@
Choose from {PROMPT_CHOICES}")

if not args.output_dir:
args.output_dir = os.path.join("benchmark",
"aspect_prediction",
args.output_dir = os.path.join("benchmark", "aspect_prediction",
"outputs",
f"{args.model}_{args.prompt}")

# Load test data
with jl.open(args.test_data) as file:
-        test_data = [record for record, _ in
-                     zip(file, range(args.num_samples))]
+        test_data = [
+            record for record, _ in zip(file, range(args.num_samples))
+        ]

-    tasks = [
-        ("idea_generation", idea_generation),
-        ("method_recommendation", method_recommendation),
-        ("outcome_prediction", outcome_prediction),
-        ("future_work_recommendation", future_work_recommendation),
-        ("title_prediction", predict_title)
-    ]
+    tasks = [("idea_generation", idea_generation),
+             ("method_recommendation", method_recommendation),
+             ("outcome_prediction", outcome_prediction),
+             ("future_work_recommendation", future_work_recommendation),
+             ("title_prediction", predict_title)]

tokens_per_minute = {"gpt-35-turbo": 40000,
"gpt-4": 10000,
"mistral-8x7b": None}
tokens_per_minute = {
"gpt-35-turbo": 40000,
"gpt-4": 10000,
"mixtral-8x7b": None
}

for task_name, generate_prompt_fn in tasks:
print(f"Processing task: {task_name}")
-        chat_results = process_task(
-            generate_prompt_fn,
-            test_data,
-            task_name,
-            model=args.model,
-            prompt_type=args.prompt,
-            tpm=tokens_per_minute[args.model]
-        )
+        chat_results = process_task(generate_prompt_fn,
+                                    test_data,
+                                    task_name,
+                                    model=args.model,
+                                    prompt_type=args.prompt,
+                                    tpm=tokens_per_minute[args.model])
print(f"{chat_results = }")
save_results(chat_results, args.output_dir, task_name)

4 changes: 2 additions & 2 deletions benchmark/aspect_prediction/utils.py
@@ -1,9 +1,9 @@
"""This script includes utility functions for benchmarking scripts."""
-import ssl
import json
import os
+import ssl

-MODEL_CHOICES = ["gpt-35-turbo", "gpt-4", "mistral-8x7b"]
+MODEL_CHOICES = ["gpt-35-turbo", "gpt-4", "mixtral-8x7b"]
PROMPT_CHOICES = ["zero-shot", "few-shot", "chain-of-thought", "few-shot-cot"]


8 changes: 4 additions & 4 deletions massw/data.py
@@ -1,12 +1,12 @@
"""Data loading and processing functions."""

+import os
from dataclasses import dataclass
-import numpy as np
+from typing import List, Union

-import os
-import pandas as pd
import jsonlines as jl
-from typing import List, Union
+import numpy as np
+import pandas as pd

from massw.download import download_dataset

5 changes: 3 additions & 2 deletions massw/metrics.py
@@ -3,14 +3,15 @@
See example usage in the `__main__` block at the end of the file.
"""

-import json
import asyncio
+import json
from typing import List, Union

import evaluate
import numpy as np
from sentence_transformers import SentenceTransformer
-from massw.api.api_gpt import AzureConfig, Batch
+
+from massw.models.gpt_azure import AzureConfig, Batch

LLM_SIM_PROMPT = """
You are an expert in Computer Science with a specialization in text analysis,
File renamed without changes.
