Commit
fix code/doc/lint style for benchmark, massw, except gpt_api
Jn-Huang committed Jun 14, 2024
1 parent 321845c commit 6244220
Showing 6 changed files with 55 additions and 93 deletions.
10 changes: 8 additions & 2 deletions benchmark/aspect_prediction/task.py
@@ -100,17 +100,23 @@ def main():
default=False)
parser.add_argument("--model",
type=str,
choices=MODEL_CHOICES,
default="gpt-35-turbo")
parser.add_argument("--prompt",
type=str,
choices=PROMPT_CHOICES,
default="zero-shot")
parser.add_argument("--num_samples",
type=int,
default=5)
args = parser.parse_args()

if args.model not in MODEL_CHOICES:
raise ValueError(f"Model {args.model} not supported. \
Choose from {MODEL_CHOICES}")

if args.prompt not in PROMPT_CHOICES:
raise ValueError(f"Prompt type {args.prompt} not supported. \
Choose from {PROMPT_CHOICES}")

if not args.output_dir:
args.output_dir = os.path.join("benchmark",
"aspect_prediction",
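The hunk above appears to replace the argparse choices constraints with explicit membership checks after parsing. Below is a minimal, self-contained sketch of that validation pattern; the MODEL_CHOICES and PROMPT_CHOICES lists here are placeholder values, not the ones defined in the repository.

# Minimal sketch of the post-parse validation pattern used in task.py.
# MODEL_CHOICES / PROMPT_CHOICES are hypothetical placeholder values.
import argparse

MODEL_CHOICES = ["gpt-35-turbo", "gpt-4"]                       # assumed values
PROMPT_CHOICES = ["zero-shot", "few-shot", "chain-of-thought"]  # assumed values

parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, default="gpt-35-turbo")
parser.add_argument("--prompt", type=str, default="zero-shot")
args = parser.parse_args()

# Validate after parsing so the error message can list the allowed values.
if args.model not in MODEL_CHOICES:
    raise ValueError(f"Model {args.model} not supported. "
                     f"Choose from {MODEL_CHOICES}")
if args.prompt not in PROMPT_CHOICES:
    raise ValueError(f"Prompt type {args.prompt} not supported. "
                     f"Choose from {PROMPT_CHOICES}")

Passing choices=MODEL_CHOICES directly to add_argument would let argparse reject bad values on its own; the explicit checks mainly keep the error message and the allowed values visible in one place.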
1 change: 1 addition & 0 deletions massw/__init__.py
@@ -0,0 +1 @@
"""Init file for massw package."""
2 changes: 1 addition & 1 deletion massw/api/api_gpt.py
@@ -594,4 +594,4 @@ async def add(self,
await self.__queue.put(request)

self.__totals.queued += 1
self.log(f"QUEUED | {model}")
self.log(f"QUEUED | {model}")
17 changes: 9 additions & 8 deletions massw/api/api_mistral.py
@@ -8,9 +8,7 @@


def prompts_to_raw_output_mistral(messages):
"""
Process prompts using the specified Mistral model endpoint and return the results.
"""
"""Process prompts using the specified Mistral model endpoint."""
final_results = pd.DataFrame(columns=['pid', 'output'])

url = os.environ.get("MISTRAL_API_URL")
@@ -25,14 +23,15 @@ def prompts_to_raw_output_mistral(messages):

for pid, msg in messages:
response_df = handle_mistral_model(url, headers, msg, {"pid": pid})
final_results = pd.concat([final_results, response_df], ignore_index=True)
final_results = pd.concat([final_results,
response_df], ignore_index=True)

return final_results


def raw_output_to_dict_mistral(model_path: str) -> Dict[str, str]:
"""
Load and convert raw output from the Mistral model into a dictionary mapping pid to output.
Load and convert raw output from the Mistral model into a dictionary.
Args:
model_path (str): Path to the model output CSV file.
Expand All @@ -42,9 +41,9 @@ def raw_output_to_dict_mistral(model_path: str) -> Dict[str, str]:
"""
output_dict = {}
task_output = pd.read_csv(model_path, sep="\t",
converters={'result': lambda x: json.loads(x) if x else None})
converters={'result': lambda x: json.loads(x)
if x else None})
for _, row in task_output.iterrows():
# processed_output = row['result'] if 'result' in row and row['result'] else ""
output = row["output"]
output_dict[row['pid']] = output
return output_dict
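For context on the converters argument wrapped above: it applies json.loads to the result column only when the cell is non-empty, so blank cells come back as None instead of raising a decode error. A standalone sketch of that read pattern, using an in-memory TSV with made-up rows rather than a real model output file:

# Sketch of the tab-separated read with a JSON converter; the sample
# rows below are invented for illustration only.
import io
import json
import pandas as pd

sample_tsv = 'pid\toutput\tresult\np1\thello\t{"score": 9}\np2\tworld\t\n'

task_output = pd.read_csv(
    io.StringIO(sample_tsv), sep="\t",
    converters={"result": lambda x: json.loads(x) if x else None})

output_dict = {row["pid"]: row["output"] for _, row in task_output.iterrows()}
print(output_dict)  # {'p1': 'hello', 'p2': 'world'}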
@@ -74,7 +73,9 @@ def handle_mistral_model(url, headers, messages, entry):
print(f"{req = }")
with urllib.request.urlopen(req) as response:
result_json = json.loads(response.read())
output_df = output_df.append({"pid": entry["pid"], "output": result_json}, ignore_index=True)
output_df = output_df.append({"pid": entry["pid"],
"output": result_json},
ignore_index=True)
break
except urllib.error.HTTPError as error:
print(f"The request failed with status code: {error.code}")
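One caveat about the re-wrapped output_df.append call above: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so that line only runs on older pandas releases. A hedged sketch of the equivalent single-row append with pd.concat; the entry and result_json values are invented stand-ins:

# Appending one result row without DataFrame.append (removed in pandas 2.0).
# The pid and result payload below are illustrative only.
import pandas as pd

output_df = pd.DataFrame(columns=["pid", "output"])
entry = {"pid": "p1"}                        # assumed shape
result_json = {"choices": [{"text": "ok"}]}  # assumed shape

row = pd.DataFrame([{"pid": entry["pid"], "output": result_json}])
output_df = pd.concat([output_df, row], ignore_index=True)
print(output_df)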
5 changes: 3 additions & 2 deletions massw/download.py
@@ -24,15 +24,16 @@ def download_dataset(version="v1"):
files = urls[version]
except KeyError as e:
raise ValueError(
f"Invalid version: {version}. Choose from {list(urls.keys())}") from e
f"Invalid version: {version}.\
Choose from {list(urls.keys())}") from e
for filename, url in files.items():
print(f"Downloading {filename}...")
# Constructing the output path
out_path = os.path.join(PROJECT_DIR, "data", filename)
wget.download(url, out=out_path, bar=bar_progress)


def bar_progress(current, total):
def bar_progress(current, total, width=80):
"""Display a progress bar for the download."""
progress_message = f"Downloading: {current / total * 100:.0f}% \
[{current} / {total}] bytes"
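On the bar_progress signature change above: as far as I can tell, the wget package invokes its bar callback with three positional arguments (current, total, width), so the added width=80 keeps the callback compatible; a two-argument version would raise a TypeError mid-download. A small runnable sketch with made-up byte counts:

# Sketch of a progress callback compatible with how the wget package
# appears to invoke it: bar(current, total, width).
import sys

def bar_progress(current, total, width=80):
    """Print download progress; current and total are byte counts."""
    message = f"Downloading: {current / total * 100:.0f}% [{current} / {total}] bytes"
    sys.stdout.write("\r" + message)
    sys.stdout.flush()

# Direct call with invented byte counts, mimicking wget's invocation.
bar_progress(512_000, 1_024_000, width=80)

In download_dataset it is wired up as wget.download(url, out=out_path, bar=bar_progress), as shown in the hunk above.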
113 changes: 33 additions & 80 deletions massw/metrics.py
@@ -19,20 +19,26 @@
aspect (Context, Key Idea, Method, Outcome, or Projected Impact) of the same
paper.
For each pair of summarizations, classify the aspect, and assign a similarity score
on a scale from 1 to 10, where 1 indicates completely dissimilar and 10
For each pair of summarizations, classify the aspect,
and assign a similarity score on a scale from 1 to 10,
where 1 indicates completely dissimilar and 10
indicates identical content. Before scoring, include a brief justification for
your score. You should output your results in JSON format as shown in the example.
your score.
You should output your results in JSON format as shown in the example.
Example Input:
Input 1: The experiments demonstrated a 20% increase in efficiency, confirming the proposed model's effectiveness.
Input 2: Results show that the new model outperforms existing ones by improving efficiency by approximately 20%.
Input 1: The experiments demonstrated a 20% increase in efficiency,
confirming the proposed model's effectiveness.
Input 2: Results show that the new model outperforms existing ones
by improving efficiency by approximately 20%.
Example JSON Output:
{
"aspect": "Outcome",
"score": 9,
"justification": "Both texts describe similar measurable improvements in efficiency, closely aligning in their depiction of the model's effectiveness."
"justification": "Both texts describe similar measurable improvements
in efficiency, closely aligning in their depiction
of the model's effectiveness."
}
"""

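Since the prompt above asks the model for a JSON object with "aspect", "score", and "justification" fields, a small sketch of parsing and sanity-checking such a reply is shown below; raw_reply is a fabricated model response, not real output from the evaluator.

# Sketch: parsing the JSON reply format requested by the prompt above.
# raw_reply is a made-up model response used only for illustration.
import json

raw_reply = '''{
    "aspect": "Outcome",
    "score": 9,
    "justification": "Both texts describe similar efficiency gains."
}'''

reply = json.loads(raw_reply)
assert reply["aspect"] in {"Context", "Key Idea", "Method",
                           "Outcome", "Projected Impact"}
assert 1 <= int(reply["score"]) <= 10
print(reply["score"], "-", reply["justification"])
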
@@ -41,6 +47,7 @@ class LLMSimilarity:
"""Evaluate the similarity between two texts using a language model."""

def __init__(self, model_name: str = "gpt-4"):
"""Initialize the language model similarity evaluator."""
assert model_name in ["gpt-4", "gpt-35-turbo"]
self.model_name = model_name
self.tpm = {"gpt-4": 4000, "gpt-35-turbo": 40000}[model_name]
@@ -109,6 +116,7 @@ class CosineSimilarity:
"""Compute cosine similarity between two ordered list of texts."""

def __init__(self):
"""Initialize the SentenceTransformer model."""
self.encoder = SentenceTransformer(
'intfloat/multilingual-e5-large-instruct')

@@ -217,15 +225,20 @@ def compute(
nahit = NAHit()




def compute_metrics(predictions: List[str],
references: List[List[str]],
metric_names=None):
"""Compute cosine similarity, ROUGE, BLEU, METEOR, and BERTScore metrics."""
"""Compute cosine similarity, ROUGE, BLEU, METEOR, and BERTScore."""
if metric_names is None:
metric_names = [
"cosine", "rouge", "bleu", "meteor", "bleurt", "bertscore", "nahit", "llm_sim"
"cosine",
"rouge",
"bleu",
"meteor",
"bleurt",
"bertscore",
"nahit",
"llm_sim"
]
metrics = {}
if "nahit" in metric_names:
@@ -280,88 +293,28 @@ def compute_metrics(predictions: List[str],
grouped_references = list(zip(*references))
scores = []
for grouped_reference in grouped_references:
score = metric_computation_functions[metric_name].compute(
predictions=predictions,
references=grouped_reference,
)
score = \
metric_computation_functions[metric_name].compute(
predictions=predictions,
references=grouped_reference)
scores.append(score["scores"])
scores = np.array(scores) # (num_refs, num_preds)
score = np.mean(np.max(scores, axis=0))
else:
score = metric_computation_functions[metric_name].compute(
predictions=predictions,
references=references,
)
references=references)
score = np.mean(score["scores"])
metrics[metric_name] = {"bleurt": score}
else:
metrics[metric_name] = metric_computation_functions[metric_name].compute(
predictions=predictions,
references=references
metrics[metric_name] = \
metric_computation_functions[metric_name].compute(
predictions=predictions,
references=references
)

return metrics

# if "cosine" in metric_names:
# metrics["cosine"] = cs.compute(
# predictions=predictions,
# references=references,
# )
# if "llm_sim" in metric_names:
# metrics["llm_sim"] = llm_sim.compute(
# predictions=predictions,
# references=references,
# )
# if "rouge" in metric_names:
# metrics["rouge"] = rouge.compute(
# predictions=predictions,
# references=references,
# )
# if "bleu" in metric_names:
# metrics["bleu"] = bleu.compute(
# predictions=predictions,
# references=references,
# )
# if "meteor" in metric_names:
# metrics["meteor"] = meteor.compute(
# predictions=predictions,
# references=references,
# )
# if "bertscore" in metric_names:
# score = bertscore.compute(
# predictions=predictions,
# references=references,
# lang="en",
# )
# metrics["bertscore"] = {
# "precision": np.array(score["precision"]).mean(),
# "recall": np.array(score["recall"]).mean(),
# "f1": np.array(score["f1"]).mean(),
# }
# if "bleurt" in metric_names:
# # Compute the maximum BLEURT score for each prediction
# if isinstance(references[0], list):
# grouped_references = list(zip(*references))
# scores = []
# for grouped_reference in grouped_references:
# score = bleurt.compute(
# predictions=predictions,
# references=grouped_reference,
# )
# scores.append(score["scores"])
# scores = np.array(scores) # (num_refs, num_preds)
# score = np.mean(np.max(scores, axis=0))
# else:
# score = bleurt.compute(
# predictions=predictions,
# references=references,
# )
# score = np.mean(score["scores"])
# metrics["bleurt"] = {
# "bleurt": score,
# }
# return metrics


def flatten_metrics(metric_dict: dict):
"""Flatten the metric dictionary for easy display."""
@@ -399,7 +352,7 @@ def flatten_metrics(metric_dict: dict):
if __name__ == "__main__":
predictions_demo = ["The cat sat on the mat.", "The dog ate my homework."]
references_demo = [["The cat sat on the mat.", "The cat sat on the desk."],
["The dog ate my homework.", "The dog ate my lunch."]]
["The dog ate my homework.", "The dog ate my lunch."]]

# Compute metrics
metrics_demo = compute_metrics(predictions=predictions_demo,
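For the BLEURT branch of compute_metrics above, where each prediction can have several references, the scores are stacked into a (num_refs, num_preds) array, the best reference is taken per prediction, and the per-prediction maxima are averaged. A tiny numpy-only sketch of that aggregation with invented score values:

# Sketch of the max-over-references, mean-over-predictions aggregation
# used for multi-reference scoring; the score values are invented.
import numpy as np

# One row per reference set, one column per prediction.
scores = np.array([
    [0.91, 0.40],  # scores against reference set 1
    [0.85, 0.72],  # scores against reference set 2
])
per_prediction_best = np.max(scores, axis=0)  # array([0.91, 0.72])
final_score = float(np.mean(per_prediction_best))
print(final_score)  # 0.815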
