Commit
fix code/doc/lint style for benchmark, massw, except gpt_api
Jn-Huang committed Jun 14, 2024
1 parent 321845c commit 6244220
Showing 6 changed files with 55 additions and 93 deletions.
10 changes: 8 additions & 2 deletions benchmark/aspect_prediction/task.py
@@ -100,17 +100,23 @@ def main():
default=False)
parser.add_argument("--model",
type=str,
choices=MODEL_CHOICES,
default="gpt-35-turbo")
parser.add_argument("--prompt",
type=str,
choices=PROMPT_CHOICES,
default="zero-shot")
parser.add_argument("--num_samples",
type=int,
default=5)
args = parser.parse_args()

if args.model not in MODEL_CHOICES:
raise ValueError(f"Model {args.model} not supported. \
Choose from {MODEL_CHOICES}")

if args.prompt not in PROMPT_CHOICES:
raise ValueError(f"Prompt type {args.prompt} not supported. \
Choose from {PROMPT_CHOICES}")

if not args.output_dir:
args.output_dir = os.path.join("benchmark",
"aspect_prediction",
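The hunk above appears to replace the argparse choices constraints with explicit membership checks after parsing. Below is a minimal, self-contained sketch of that validation pattern; the MODEL_CHOICES and PROMPT_CHOICES lists here are placeholder values, not the ones defined in the repository.

# Minimal sketch of the post-parse validation pattern used in task.py.
# MODEL_CHOICES / PROMPT_CHOICES are hypothetical placeholder values.
import argparse

MODEL_CHOICES = ["gpt-35-turbo", "gpt-4"]                       # assumed values
PROMPT_CHOICES = ["zero-shot", "few-shot", "chain-of-thought"]  # assumed values

parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, default="gpt-35-turbo")
parser.add_argument("--prompt", type=str, default="zero-shot")
args = parser.parse_args()

# Validate after parsing so the error message can list the allowed values.
if args.model not in MODEL_CHOICES:
    raise ValueError(f"Model {args.model} not supported. "
                     f"Choose from {MODEL_CHOICES}")
if args.prompt not in PROMPT_CHOICES:
    raise ValueError(f"Prompt type {args.prompt} not supported. "
                     f"Choose from {PROMPT_CHOICES}")

Passing choices=MODEL_CHOICES directly to add_argument would let argparse reject bad values on its own; the explicit checks mainly keep the error message and the allowed values visible in one place.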
1 change: 1 addition & 0 deletions massw/__init__.py
@@ -0,0 +1 @@
"""Init file for massw package."""
2 changes: 1 addition & 1 deletion massw/api/api_gpt.py
@@ -594,4 +594,4 @@ async def add(self,
await self.__queue.put(request)

self.__totals.queued += 1
self.log(f"QUEUED | {model}")
self.log(f"QUEUED | {model}")
17 changes: 9 additions & 8 deletions massw/api/api_mistral.py
@@ -8,9 +8,7 @@


def prompts_to_raw_output_mistral(messages):
"""
Process prompts using the specified Mistral model endpoint and return the results.
"""
"""Process prompts using the specified Mistral model endpoint."""
final_results = pd.DataFrame(columns=['pid', 'output'])

url = os.environ.get("MISTRAL_API_URL")
@@ -25,14 +23,15 @@ def prompts_to_raw_output_mistral(messages):

for pid, msg in messages:
response_df = handle_mistral_model(url, headers, msg, {"pid": pid})
final_results = pd.concat([final_results, response_df], ignore_index=True)
final_results = pd.concat([final_results,
response_df], ignore_index=True)

return final_results


def raw_output_to_dict_mistral(model_path: str) -> Dict[str, str]:
"""
Load and convert raw output from the Mistral model into a dictionary mapping pid to output.
Load and convert raw output from the Mistral model into a dictionary.
Args:
model_path (str): Path to the model output CSV file.
Expand All @@ -42,9 +41,9 @@ def raw_output_to_dict_mistral(model_path: str) -> Dict[str, str]:
"""
output_dict = {}
task_output = pd.read_csv(model_path, sep="\t",
converters={'result': lambda x: json.loads(x) if x else None})
converters={'result': lambda x: json.loads(x)
if x else None})
for _, row in task_output.iterrows():
# processed_output = row['result'] if 'result' in row and row['result'] else ""
output = row["output"]
output_dict[row['pid']] = output
return output_dict
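For context on the converters argument wrapped above: it applies json.loads to the result column only when the cell is non-empty, so blank cells come back as None instead of raising a decode error. A standalone sketch of that read pattern, using an in-memory TSV with made-up rows rather than a real model output file:

# Sketch of the tab-separated read with a JSON converter; the sample
# rows below are invented for illustration only.
import io
import json
import pandas as pd

sample_tsv = 'pid\toutput\tresult\np1\thello\t{"score": 9}\np2\tworld\t\n'

task_output = pd.read_csv(
    io.StringIO(sample_tsv), sep="\t",
    converters={"result": lambda x: json.loads(x) if x else None})

output_dict = {row["pid"]: row["output"] for _, row in task_output.iterrows()}
print(output_dict)  # {'p1': 'hello', 'p2': 'world'}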
@@ -74,7 +73,9 @@ def handle_mistral_model(url, headers, messages, entry):
print(f"{req = }")
with urllib.request.urlopen(req) as response:
result_json = json.loads(response.read())
output_df = output_df.append({"pid": entry["pid"], "output": result_json}, ignore_index=True)
output_df = output_df.append({"pid": entry["pid"],
"output": result_json},
ignore_index=True)
break
except urllib.error.HTTPError as error:
print(f"The request failed with status code: {error.code}")
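One caveat about the re-wrapped output_df.append call above: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so that line only runs on older pandas releases. A hedged sketch of the equivalent single-row append with pd.concat; the entry and result_json values are invented stand-ins:

# Appending one result row without DataFrame.append (removed in pandas 2.0).
# The pid and result payload below are illustrative only.
import pandas as pd

output_df = pd.DataFrame(columns=["pid", "output"])
entry = {"pid": "p1"}                        # assumed shape
result_json = {"choices": [{"text": "ok"}]}  # assumed shape

row = pd.DataFrame([{"pid": entry["pid"], "output": result_json}])
output_df = pd.concat([output_df, row], ignore_index=True)
print(output_df)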
5 changes: 3 additions & 2 deletions massw/download.py
@@ -24,15 +24,16 @@ def download_dataset(version="v1"):
files = urls[version]
except KeyError as e:
raise ValueError(
f"Invalid version: {version}. Choose from {list(urls.keys())}") from e
f"Invalid version: {version}.\
Choose from {list(urls.keys())}") from e
for filename, url in files.items():
print(f"Downloading {filename}...")
# Constructing the output path
out_path = os.path.join(PROJECT_DIR, "data", filename)
wget.download(url, out=out_path, bar=bar_progress)


def bar_progress(current, total):
def bar_progress(current, total, width=80):
"""Display a progress bar for the download."""
progress_message = f"Downloading: {current / total * 100:.0f}% \
[{current} / {total}] bytes"
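On the bar_progress signature change above: as far as I can tell, the wget package invokes its bar callback with three positional arguments (current, total, width), so the added width=80 keeps the callback compatible; a two-argument version would raise a TypeError mid-download. A small runnable sketch with made-up byte counts:

# Sketch of a progress callback compatible with how the wget package
# appears to invoke it: bar(current, total, width).
import sys

def bar_progress(current, total, width=80):
    """Print download progress; current and total are byte counts."""
    message = f"Downloading: {current / total * 100:.0f}% [{current} / {total}] bytes"
    sys.stdout.write("\r" + message)
    sys.stdout.flush()

# Direct call with invented byte counts, mimicking wget's invocation.
bar_progress(512_000, 1_024_000, width=80)

In download_dataset it is wired up as wget.download(url, out=out_path, bar=bar_progress), as shown in the hunk above.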
113 changes: 33 additions & 80 deletions massw/metrics.py
@@ -19,20 +19,26 @@
aspect (Context, Key Idea, Method, Outcome, or Projected Impact) of the same
paper.
For each pair of summarizations, classify the aspect, and assign a similarity score
on a scale from 1 to 10, where 1 indicates completely dissimilar and 10
For each pair of summarizations, classify the aspect,
and assign a similarity score on a scale from 1 to 10,
where 1 indicates completely dissimilar and 10
indicates identical content. Before scoring, include a brief justification for
your score. You should output your results in JSON format as shown in the example.
your score.
You should output your results in JSON format as shown in the example.
Example Input:
Input 1: The experiments demonstrated a 20% increase in efficiency, confirming the proposed model's effectiveness.
Input 2: Results show that the new model outperforms existing ones by improving efficiency by approximately 20%.
Input 1: The experiments demonstrated a 20% increase in efficiency,
confirming the proposed model's effectiveness.
Input 2: Results show that the new model outperforms existing ones
by improving efficiency by approximately 20%.
Example JSON Output:
{
"aspect": "Outcome",
"score": 9,
"justification": "Both texts describe similar measurable improvements in efficiency, closely aligning in their depiction of the model's effectiveness."
"justification": "Both texts describe similar measurable improvements
in efficiency, closely aligning in their depiction
of the model's effectiveness."
}
"""

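Since the prompt above asks the model for a JSON object with "aspect", "score", and "justification" fields, a small sketch of parsing and sanity-checking such a reply is shown below; raw_reply is a fabricated model response, not real output from the evaluator.

# Sketch: parsing the JSON reply format requested by the prompt above.
# raw_reply is a made-up model response used only for illustration.
import json

raw_reply = '''{
    "aspect": "Outcome",
    "score": 9,
    "justification": "Both texts describe similar efficiency gains."
}'''

reply = json.loads(raw_reply)
assert reply["aspect"] in {"Context", "Key Idea", "Method",
                           "Outcome", "Projected Impact"}
assert 1 <= int(reply["score"]) <= 10
print(reply["score"], "-", reply["justification"])
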
@@ -41,6 +47,7 @@ class LLMSimilarity:
"""Evaluate the similarity between two texts using a language model."""

def __init__(self, model_name: str = "gpt-4"):
"""Initialize the language model similarity evaluator."""
assert model_name in ["gpt-4", "gpt-35-turbo"]
self.model_name = model_name
self.tpm = {"gpt-4": 4000, "gpt-35-turbo": 40000}[model_name]
@@ -109,6 +116,7 @@ class CosineSimilarity:
"""Compute cosine similarity between two ordered list of texts."""

def __init__(self):
"""Initialize the SentenceTransformer model."""
self.encoder = SentenceTransformer(
'intfloat/multilingual-e5-large-instruct')

@@ -217,15 +225,20 @@ def compute(
nahit = NAHit()




def compute_metrics(predictions: List[str],
references: List[List[str]],
metric_names=None):
"""Compute cosine similarity, ROUGE, BLEU, METEOR, and BERTScore metrics."""
"""Compute cosine similarity, ROUGE, BLEU, METEOR, and BERTScore."""
if metric_names is None:
metric_names = [
"cosine", "rouge", "bleu", "meteor", "bleurt", "bertscore", "nahit", "llm_sim"
"cosine",
"rouge",
"bleu",
"meteor",
"bleurt",
"bertscore",
"nahit",
"llm_sim"
]
metrics = {}
if "nahit" in metric_names:
@@ -280,88 +293,28 @@ def compute_metrics(predictions: List[str],
grouped_references = list(zip(*references))
scores = []
for grouped_reference in grouped_references:
score = metric_computation_functions[metric_name].compute(
predictions=predictions,
references=grouped_reference,
)
score = \
metric_computation_functions[metric_name].compute(
predictions=predictions,
references=grouped_reference)
scores.append(score["scores"])
scores = np.array(scores) # (num_refs, num_preds)
score = np.mean(np.max(scores, axis=0))
else:
score = metric_computation_functions[metric_name].compute(
predictions=predictions,
references=references,
)
references=references)
score = np.mean(score["scores"])
metrics[metric_name] = {"bleurt": score}
else:
metrics[metric_name] = metric_computation_functions[metric_name].compute(
predictions=predictions,
references=references
metrics[metric_name] = \
metric_computation_functions[metric_name].compute(
predictions=predictions,
references=references
)

return metrics

# if "cosine" in metric_names:
# metrics["cosine"] = cs.compute(
# predictions=predictions,
# references=references,
# )
# if "llm_sim" in metric_names:
# metrics["llm_sim"] = llm_sim.compute(
# predictions=predictions,
# references=references,
# )
# if "rouge" in metric_names:
# metrics["rouge"] = rouge.compute(
# predictions=predictions,
# references=references,
# )
# if "bleu" in metric_names:
# metrics["bleu"] = bleu.compute(
# predictions=predictions,
# references=references,
# )
# if "meteor" in metric_names:
# metrics["meteor"] = meteor.compute(
# predictions=predictions,
# references=references,
# )
# if "bertscore" in metric_names:
# score = bertscore.compute(
# predictions=predictions,
# references=references,
# lang="en",
# )
# metrics["bertscore"] = {
# "precision": np.array(score["precision"]).mean(),
# "recall": np.array(score["recall"]).mean(),
# "f1": np.array(score["f1"]).mean(),
# }
# if "bleurt" in metric_names:
# # Compute the maximum BLEURT score for each prediction
# if isinstance(references[0], list):
# grouped_references = list(zip(*references))
# scores = []
# for grouped_reference in grouped_references:
# score = bleurt.compute(
# predictions=predictions,
# references=grouped_reference,
# )
# scores.append(score["scores"])
# scores = np.array(scores) # (num_refs, num_preds)
# score = np.mean(np.max(scores, axis=0))
# else:
# score = bleurt.compute(
# predictions=predictions,
# references=references,
# )
# score = np.mean(score["scores"])
# metrics["bleurt"] = {
# "bleurt": score,
# }
# return metrics


def flatten_metrics(metric_dict: dict):
"""Flatten the metric dictionary for easy display."""
@@ -399,7 +352,7 @@ def flatten_metrics(metric_dict: dict):
if __name__ == "__main__":
predictions_demo = ["The cat sat on the mat.", "The dog ate my homework."]
references_demo = [["The cat sat on the mat.", "The cat sat on the desk."],
["The dog ate my homework.", "The dog ate my lunch."]]
["The dog ate my homework.", "The dog ate my lunch."]]

# Compute metrics
metrics_demo = compute_metrics(predictions=predictions_demo,
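For the BLEURT branch of compute_metrics above, where each prediction can have several references, the scores are stacked into a (num_refs, num_preds) array, the best reference is taken per prediction, and the per-prediction maxima are averaged. A tiny numpy-only sketch of that aggregation with invented score values:

# Sketch of the max-over-references, mean-over-predictions aggregation
# used for multi-reference scoring; the score values are invented.
import numpy as np

# One row per reference set, one column per prediction.
scores = np.array([
    [0.91, 0.40],  # scores against reference set 1
    [0.85, 0.72],  # scores against reference set 2
])
per_prediction_best = np.max(scores, axis=0)  # array([0.91, 0.72])
final_score = float(np.mean(per_prediction_best))
print(final_score)  # 0.815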
