Skip to content

Commit

Permalink
Merge pull request #207 from princeton-nlp/improvement/fix-run-report…
Browse files Browse the repository at this point in the history
…-instance_ids

Make `run_report` more intuitive when using `instance_ids` filter during evaluation
  • Loading branch information
carlosejimenez authored Aug 17, 2024
2 parents f85bbc2 + 312b914 commit d8586ed
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 26 deletions.
18 changes: 4 additions & 14 deletions swebench/harness/run_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,17 +306,8 @@ def get_dataset_from_preds(
dataset_ids = {i[KEY_INSTANCE_ID] for i in dataset}

if instance_ids:
# check that all instance IDs are in the dataset
instance_ids = set(instance_ids)
if instance_ids - dataset_ids:
raise ValueError(
(
"Some instance IDs not found in dataset!"
f"\nMissing IDs:\n{' '.join(instance_ids - dataset_ids)}"
)
)
# check that all instance IDs have predictions
missing_preds = instance_ids - set(predictions.keys())
missing_preds = set(instance_ids) - set(predictions.keys())
if missing_preds:
print(f"Warning: Missing predictions for {len(missing_preds)} instance IDs.")

Expand All @@ -329,9 +320,7 @@ def get_dataset_from_preds(
f"\nMissing IDs:\n{' '.join(prediction_ids - dataset_ids)}"
)
)

if instance_ids:
# filter dataset to just the instance IDs
dataset = [i for i in dataset if i[KEY_INSTANCE_ID] in instance_ids]

# check which instance IDs have already been run
Expand Down Expand Up @@ -437,8 +426,9 @@ def make_run_report(
unstopped_containers.add(container.name)

# print final report
dataset_ids = {i[KEY_INSTANCE_ID] for i in full_dataset}
print(f"Total instances: {len(full_dataset)}")
print(f"Instances submitted: {len(predictions)}")
print(f"Instances submitted: {len(set(predictions.keys()) & dataset_ids)}")
print(f"Instances completed: {len(completed_ids)}")
print(f"Instances incomplete: {len(incomplete_ids)}")
print(f"Instances resolved: {len(resolved_ids)}")
Expand Down Expand Up @@ -532,7 +522,7 @@ def main(

# get dataset from predictions
dataset = get_dataset_from_preds(dataset_name, split, instance_ids, predictions, run_id)
full_dataset = load_swebench_dataset(dataset_name, split)
full_dataset = load_swebench_dataset(dataset_name, split, instance_ids)
existing_images = list_images(client)
print(f"Running {len(dataset)} unevaluated instances...")
if not dataset:
Expand Down
36 changes: 24 additions & 12 deletions swebench/harness/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,40 @@
MAP_REPO_TO_REQS_PATHS,
NON_TEST_EXTS,
SWE_BENCH_URL_RAW,
KEY_INSTANCE_ID,
)

load_dotenv()


def load_swebench_dataset(name="princeton-nlp/SWE-bench", split="test", instance_ids=None) -> list[SWEbenchInstance]:
    """
    Load SWE-bench dataset from Hugging Face Datasets or a local .json/.jsonl file.

    Args:
        name: Hugging Face dataset name (convenience aliases such as "lite"
            are accepted) or a path to a local .json/.jsonl file.
        split: dataset split to load when reading from Hugging Face Datasets
            (ignored for local files).
        instance_ids: optional iterable of instance IDs; when given, the
            returned list is filtered to exactly those instances.

    Returns:
        List of SWEbenchInstance records.

    Raises:
        ValueError: if any requested instance ID is not present in the dataset.
    """
    # Normalize to a set once, both for fast membership tests and set difference.
    if instance_ids:
        instance_ids = set(instance_ids)
    if name.endswith(".json"):
        # Local .json file: a single JSON array of instances.
        dataset = json.loads(Path(name).read_text())
        dataset_ids = {instance[KEY_INSTANCE_ID] for instance in dataset}
    elif name.endswith(".jsonl"):
        # Local .jsonl file: JSON Lines — one JSON object per line. Parsing
        # the whole file with a single json.loads would raise for >1 record.
        dataset = [
            json.loads(line)
            for line in Path(name).read_text().splitlines()
            if line.strip()
        ]
        dataset_ids = {instance[KEY_INSTANCE_ID] for instance in dataset}
    else:
        # Load from Hugging Face Datasets; map convenience aliases first.
        if name.lower() in {"swe-bench", "swebench", "swe_bench"}:
            name = "princeton-nlp/SWE-bench"
        elif name.lower() in {"swe-bench-lite", "swebench-lite", "swe_bench_lite", "swe-bench_lite", "lite"}:
            name = "princeton-nlp/SWE-bench_Lite"
        dataset = cast(Dataset, load_dataset(name, split=split))
        dataset_ids = {instance[KEY_INSTANCE_ID] for instance in dataset}
    if instance_ids:
        # Fail loudly if the caller requested IDs the dataset does not contain,
        # then narrow the dataset to exactly the requested instances.
        if instance_ids - dataset_ids:
            raise ValueError(
                (
                    "Some instance IDs not found in dataset!"
                    f"\nMissing IDs:\n{' '.join(instance_ids - dataset_ids)}"
                )
            )
        dataset = [instance for instance in dataset if instance[KEY_INSTANCE_ID] in instance_ids]
    return [cast(SWEbenchInstance, instance) for instance in dataset]


Expand Down

0 comments on commit d8586ed

Please sign in to comment.