diff --git a/lite/examples/object-detection.ipynb b/lite/examples/object-detection.ipynb index 5fb06d7a0..50189079f 100644 --- a/lite/examples/object-detection.ipynb +++ b/lite/examples/object-detection.ipynb @@ -959,7 +959,7 @@ "id": "98edc4dd", "metadata": {}, "source": [ - "### Hallucinations" + "### Unmatched Predictions" ] }, { @@ -1764,7 +1764,7 @@ } ], "source": [ - "metric.value[\"hallucinations\"]" + "metric.value[\"unmatched_predictions\"]" ] }, { @@ -1773,7 +1773,7 @@ "id": "415335e4", "metadata": {}, "source": [ - "### Ground Truths Missing Predictions" + "### Unmatched Ground Truths" ] }, { @@ -2995,7 +2995,7 @@ } ], "source": [ - "metric.value[\"missing_predictions\"]" + "metric.value[\"unmatched_ground_truths\"]" ] } ], diff --git a/lite/examples/tabular_classification.ipynb b/lite/examples/tabular_classification.ipynb index ed06e48c2..db468ee8e 100644 --- a/lite/examples/tabular_classification.ipynb +++ b/lite/examples/tabular_classification.ipynb @@ -605,7 +605,7 @@ } ], "source": [ - "cm.value[\"missing_predictions\"]" + "cm.value[\"unmatched_ground_truths\"]" ] } ], diff --git a/lite/tests/classification/test_confusion_matrix.py b/lite/tests/classification/test_confusion_matrix.py index e4a98758e..f42c3dafc 100644 --- a/lite/tests/classification/test_confusion_matrix.py +++ b/lite/tests/classification/test_confusion_matrix.py @@ -40,7 +40,7 @@ def test_compute_confusion_matrix(): score_thresholds = np.array([0.25, 0.75], dtype=np.float64) - confusion_matrix, missing_predictions = compute_confusion_matrix( + confusion_matrix, unmatched_ground_truths = compute_confusion_matrix( data=data, label_metadata=label_metadata, score_thresholds=score_thresholds, @@ -74,15 +74,15 @@ def test_compute_confusion_matrix(): ) ).all() - assert missing_predictions.shape == (2, 4, 1) + assert unmatched_ground_truths.shape == (2, 4, 1) assert ( # score >= 0.25 - missing_predictions[0, :, 0] + unmatched_ground_truths[0, :, 0] == np.array([-1.0, -1.0, -1.0, -1.0]) 
).all() assert ( # score >= 0.75 - missing_predictions[1, :, 0] + unmatched_ground_truths[1, :, 0] == np.array([-1.0, -1.0, -1.0, 1.0]) ).all() @@ -144,7 +144,7 @@ def test_confusion_matrix_basic(basic_classifications: list[Classification]): } }, }, - "missing_predictions": {}, + "unmatched_ground_truths": {}, }, "parameters": { "score_threshold": 0.25, @@ -166,7 +166,7 @@ def test_confusion_matrix_basic(basic_classifications: list[Classification]): }, } }, - "missing_predictions": { + "unmatched_ground_truths": { "3": {"count": 1, "examples": [{"datum": "uid2"}]} }, }, @@ -179,7 +179,7 @@ def test_confusion_matrix_basic(basic_classifications: list[Classification]): for m in actual_metrics: _filter_elements_with_zero_count( cm=m["value"]["confusion_matrix"], - mp=m["value"]["missing_predictions"], + mp=m["value"]["unmatched_ground_truths"], ) assert m in expected_metrics for m in expected_metrics: @@ -212,7 +212,7 @@ def test_confusion_matrix_unit( "1": {"1": {"count": 1, "examples": []}}, "2": {"1": {"count": 2, "examples": []}}, }, - "missing_predictions": {}, + "unmatched_ground_truths": {}, }, "parameters": { "score_threshold": 0.5, @@ -223,7 +223,7 @@ def test_confusion_matrix_unit( for m in actual_metrics: _filter_elements_with_zero_count( cm=m["value"]["confusion_matrix"], - mp=m["value"]["missing_predictions"], + mp=m["value"]["unmatched_ground_truths"], ) assert m in expected_metrics for m in expected_metrics: @@ -282,7 +282,7 @@ def test_confusion_matrix_with_animal_example( } }, }, - "missing_predictions": { + "unmatched_ground_truths": { "dog": {"count": 1, "examples": [{"datum": "uid5"}]} }, }, @@ -295,7 +295,7 @@ def test_confusion_matrix_with_animal_example( for m in actual_metrics: _filter_elements_with_zero_count( cm=m["value"]["confusion_matrix"], - mp=m["value"]["missing_predictions"], + mp=m["value"]["unmatched_ground_truths"], ) assert m in expected_metrics for m in expected_metrics: @@ -356,7 +356,7 @@ def 
test_confusion_matrix_with_color_example( } }, }, - "missing_predictions": { + "unmatched_ground_truths": { "red": {"count": 1, "examples": [{"datum": "uid2"}]} }, }, @@ -369,7 +369,7 @@ def test_confusion_matrix_with_color_example( for m in actual_metrics: _filter_elements_with_zero_count( cm=m["value"]["confusion_matrix"], - mp=m["value"]["missing_predictions"], + mp=m["value"]["unmatched_ground_truths"], ) assert m in expected_metrics for m in expected_metrics: @@ -438,7 +438,7 @@ def test_confusion_matrix_multiclass( } }, }, - "missing_predictions": {}, + "unmatched_ground_truths": {}, }, "parameters": { "score_threshold": 0.05, @@ -466,7 +466,7 @@ def test_confusion_matrix_multiclass( } }, }, - "missing_predictions": { + "unmatched_ground_truths": { "cat": { "count": 2, "examples": [{"datum": "uid0"}, {"datum": "uid2"}], @@ -483,7 +483,7 @@ def test_confusion_matrix_multiclass( "type": "ConfusionMatrix", "value": { "confusion_matrix": {}, - "missing_predictions": { + "unmatched_ground_truths": { "cat": { "count": 2, "examples": [{"datum": "uid0"}, {"datum": "uid2"}], @@ -504,7 +504,7 @@ def test_confusion_matrix_multiclass( for m in actual_metrics: _filter_elements_with_zero_count( cm=m["value"]["confusion_matrix"], - mp=m["value"]["missing_predictions"], + mp=m["value"]["unmatched_ground_truths"], ) assert m in expected_metrics for m in expected_metrics: @@ -560,7 +560,7 @@ def test_confusion_matrix_without_hardmax_animal_example( }, } }, - "missing_predictions": {}, + "unmatched_ground_truths": {}, }, "parameters": { "score_threshold": 0.05, @@ -580,7 +580,7 @@ def test_confusion_matrix_without_hardmax_animal_example( } } }, - "missing_predictions": {}, + "unmatched_ground_truths": {}, }, "parameters": { "score_threshold": 0.4, @@ -591,7 +591,7 @@ def test_confusion_matrix_without_hardmax_animal_example( "type": "ConfusionMatrix", "value": { "confusion_matrix": {}, - "missing_predictions": { + "unmatched_ground_truths": { "ant": { "count": 1, "examples": [ 
@@ -611,7 +611,7 @@ def test_confusion_matrix_without_hardmax_animal_example( for m in actual_metrics: _filter_elements_with_zero_count( cm=m["value"]["confusion_matrix"], - mp=m["value"]["missing_predictions"], + mp=m["value"]["unmatched_ground_truths"], ) assert m in expected_metrics for m in expected_metrics: diff --git a/lite/tests/classification/test_dataloader.py b/lite/tests/classification/test_dataloader.py index 26dee6562..4fa2ae842 100644 --- a/lite/tests/classification/test_dataloader.py +++ b/lite/tests/classification/test_dataloader.py @@ -8,7 +8,7 @@ def test_no_data(): loader.finalize() -def test_missing_predictions( +def test_unmatched_ground_truths( classifications_no_predictions: list[Classification], ): loader = DataLoader() diff --git a/lite/tests/object_detection/conftest.py b/lite/tests/object_detection/conftest.py index ed32b6434..2eec09766 100644 --- a/lite/tests/object_detection/conftest.py +++ b/lite/tests/object_detection/conftest.py @@ -704,7 +704,7 @@ def false_negatives_two_images_one_only_with_different_class_high_confidence_of_ @pytest.fixture -def detections_fp_hallucination_edge_case() -> list[Detection]: +def detections_fp_unmatched_prediction_edge_case() -> list[Detection]: return [ Detection( uid="uid1", @@ -1093,7 +1093,7 @@ def detections_for_detailed_counting( xmax=rect4[1], ymin=rect4[2], ymax=rect4[3], - labels=["hallucination"], + labels=["no_overlap"], scores=[0.1], ), ], diff --git a/lite/tests/object_detection/test_accuracy.py b/lite/tests/object_detection/test_accuracy.py index 16d3be78a..f62dd8f2f 100644 --- a/lite/tests/object_detection/test_accuracy.py +++ b/lite/tests/object_detection/test_accuracy.py @@ -95,9 +95,9 @@ def test_accuracy_metrics_first_class( groundtruths datum uid1 box 1 - label v1 - tp - box 3 - label v2 - fn missing prediction + box 3 - label v2 - fn unmatched ground truths datum uid2 - box 2 - label v1 - fn missing prediction + box 2 - label v1 - fn unmatched ground truths predictions datum uid1 
@@ -176,7 +176,7 @@ def test_accuracy_metrics_second_class( groundtruths datum uid1 - box 3 - label v2 - fn missing prediction + box 3 - label v2 - fn unmatched ground truths datum uid2 none predictions diff --git a/lite/tests/object_detection/test_average_precision.py b/lite/tests/object_detection/test_average_precision.py index 947b2f403..68c7dc24c 100644 --- a/lite/tests/object_detection/test_average_precision.py +++ b/lite/tests/object_detection/test_average_precision.py @@ -72,7 +72,7 @@ def test_ap_metrics_first_class( datum uid1 box 1 - label v1 - tp datum uid2 - box 2 - label v1 - fn missing prediction + box 2 - label v1 - fn unmatched ground truths predictions datum uid1 @@ -192,7 +192,7 @@ def test_ap_metrics_second_class( groundtruths datum uid1 - box 3 - label v2 - fn missing prediction + box 3 - label v2 - fn unmatched ground truths datum uid2 none predictions diff --git a/lite/tests/object_detection/test_average_recall.py b/lite/tests/object_detection/test_average_recall.py index 3d3472c27..2bccef4e4 100644 --- a/lite/tests/object_detection/test_average_recall.py +++ b/lite/tests/object_detection/test_average_recall.py @@ -86,7 +86,7 @@ def test_ar_metrics_first_class( datum uid1 box 1 - label v1 - tp datum uid2 - box 2 - label v1 - fn missing prediction + box 2 - label v1 - fn unmatched ground truths predictions datum uid1 @@ -198,7 +198,7 @@ def test_ar_metrics_second_class( groundtruths datum uid1 - box 3 - label v2 - fn missing prediction + box 3 - label v2 - fn unmatched ground truths datum uid2 none predictions diff --git a/lite/tests/object_detection/test_confusion_matrix.py b/lite/tests/object_detection/test_confusion_matrix.py index 85e29e481..d2668327d 100644 --- a/lite/tests/object_detection/test_confusion_matrix.py +++ b/lite/tests/object_detection/test_confusion_matrix.py @@ -35,8 +35,8 @@ def _test_compute_confusion_matrix( ( confusion_matrix, - hallucinations, - missing_predictions, + unmatched_predictions, + unmatched_ground_truths, ) 
= compute_confusion_matrix( data=sorted_pairs, label_metadata=label_metadata, @@ -52,37 +52,37 @@ def _test_compute_confusion_matrix( 2, 1 + n_examples * 4, ) # iou, score, gt label, pd label, metrics - assert hallucinations.shape == ( + assert unmatched_predictions.shape == ( 1, 100, 2, 1 + n_examples * 3, ) # iou, score, pd label, metrics - assert missing_predictions.shape == ( + assert unmatched_ground_truths.shape == ( 1, 100, 2, 1 + n_examples * 2, ) # iou, score, gt label, metrics - return (confusion_matrix, hallucinations, missing_predictions) + return (confusion_matrix, unmatched_predictions, unmatched_ground_truths) def test_compute_confusion_matrix(): ( confusion_matrix, - hallucinations, - missing_predictions, + unmatched_predictions, + unmatched_ground_truths, ) = _test_compute_confusion_matrix(n_examples=0) """ @ iou=0.5, score<0.1 3x tp 1x fp misclassification - 1x fp hallucination + 1x fp unmatched prediction 0x fn misclassification - 1x fn missing prediction + 1x fn unmatched ground truth """ indices = slice(10) @@ -98,25 +98,26 @@ def test_compute_confusion_matrix(): hal_pd0 = np.array([1.0]) hal_pd1 = np.array([-1.0]) - expected_hallucinations = np.array([hal_pd0, hal_pd1]) + expected_unmatched_predictions = np.array([hal_pd0, hal_pd1]) assert np.isclose( - hallucinations[0, indices, :, :], expected_hallucinations + unmatched_predictions[0, indices, :, :], expected_unmatched_predictions ).all() misprd_gt0 = np.array([1.0]) misprd_gt1 = np.array([-1.0]) - expected_missing_predictions = np.array([misprd_gt0, misprd_gt1]) + expected_unmatched_ground_truths = np.array([misprd_gt0, misprd_gt1]) assert np.isclose( - missing_predictions[0, indices, :, :], expected_missing_predictions + unmatched_ground_truths[0, indices, :, :], + expected_unmatched_ground_truths, ).all() """ @ iou=0.5, 0.1 <= score < 0.65 1x tp 1x fp misclassification - 1x fp hallucination + 1x fp unmatched prediction 1x fn misclassification - 3x fn missing prediction + 3x fn unmatched 
ground truth """ indices = slice(10, 65) @@ -132,25 +133,26 @@ def test_compute_confusion_matrix(): hal_pd0 = np.array([1.0]) hal_pd1 = np.array([-1.0]) - expected_hallucinations = np.array([hal_pd0, hal_pd1]) + expected_unmatched_predictions = np.array([hal_pd0, hal_pd1]) assert np.isclose( - hallucinations[0, indices, :, :], expected_hallucinations + unmatched_predictions[0, indices, :, :], expected_unmatched_predictions ).all() misprd_gt0 = np.array([3.0]) misprd_gt1 = np.array([-1.0]) - expected_missing_predictions = np.array([misprd_gt0, misprd_gt1]) + expected_unmatched_ground_truths = np.array([misprd_gt0, misprd_gt1]) assert np.isclose( - missing_predictions[0, indices, :, :], expected_missing_predictions + unmatched_ground_truths[0, indices, :, :], + expected_unmatched_ground_truths, ).all() """ @ iou=0.5, 0.65 <= score < 0.9 1x tp 1x fp misclassification - 0x fp hallucination + 0x fp unmatched prediction 1x fn misclassification - 3x fn missing prediction + 3x fn unmatched ground truth """ indices = slice(65, 90) @@ -166,25 +168,26 @@ def test_compute_confusion_matrix(): hal_pd0 = np.array([-1.0]) hal_pd1 = np.array([-1.0]) - expected_hallucinations = np.array([hal_pd0, hal_pd1]) + expected_unmatched_predictions = np.array([hal_pd0, hal_pd1]) assert np.isclose( - hallucinations[0, indices, :, :], expected_hallucinations + unmatched_predictions[0, indices, :, :], expected_unmatched_predictions ).all() misprd_gt0 = np.array([3.0]) misprd_gt1 = np.array([-1.0]) - expected_missing_predictions = np.array([misprd_gt0, misprd_gt1]) + expected_unmatched_ground_truths = np.array([misprd_gt0, misprd_gt1]) assert np.isclose( - missing_predictions[0, indices, :, :], expected_missing_predictions + unmatched_ground_truths[0, indices, :, :], + expected_unmatched_ground_truths, ).all() """ @ iou=0.5, score>=0.9 0x tp 0x fp misclassification - 0x fp hallucination + 0x fp unmatched prediction 0x fn misclassification - 4x fn missing prediction + 4x fn unmatched ground truth 
""" indices = slice(90, None) @@ -200,16 +203,17 @@ def test_compute_confusion_matrix(): hal_pd0 = np.array([-1.0]) hal_pd1 = np.array([-1.0]) - expected_hallucinations = np.array([hal_pd0, hal_pd1]) + expected_unmatched_predictions = np.array([hal_pd0, hal_pd1]) assert np.isclose( - hallucinations[0, indices, :, :], expected_hallucinations + unmatched_predictions[0, indices, :, :], expected_unmatched_predictions ).all() misprd_gt0 = np.array([4.0]) misprd_gt1 = np.array([1.0]) - expected_missing_predictions = np.array([misprd_gt0, misprd_gt1]) + expected_unmatched_ground_truths = np.array([misprd_gt0, misprd_gt1]) assert np.isclose( - missing_predictions[0, indices, :, :], expected_missing_predictions + unmatched_ground_truths[0, indices, :, :], + expected_unmatched_ground_truths, ).all() @@ -217,17 +221,17 @@ def test_compute_confusion_matrix_with_examples(): ( confusion_matrix, - hallucinations, - missing_predictions, + unmatched_predictions, + unmatched_ground_truths, ) = _test_compute_confusion_matrix(n_examples=2) """ @ iou=0.5, score<0.1 3x tp 1x fp misclassification - 1x fp hallucination + 1x fp unmatched prediction 0x fn misclassification - 1x fn missing prediction + 1x fn unmatched ground truth """ indices = slice(10) @@ -249,26 +253,27 @@ def test_compute_confusion_matrix_with_examples(): # total count, datum 0, pd 0, score 0, datum 1, pd 1, score 1 hal_pd0 = np.array([1.0, 2.0, 4.0, 0.65, -1.0, -1.0, -1.0]) hal_pd1 = np.array([-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]) - expected_hallucinations = np.array([hal_pd0, hal_pd1]) + expected_unmatched_predictions = np.array([hal_pd0, hal_pd1]) assert np.isclose( - hallucinations[0, indices, :, :], expected_hallucinations + unmatched_predictions[0, indices, :, :], expected_unmatched_predictions ).all() # total count, datum 0, gt 0, datum1, gt 1 misprd_gt0 = np.array([1.0, 4.0, 5.0, -1.0, -1.0]) misprd_gt1 = np.array([-1.0, -1.0, -1.0, -1.0, -1.0]) - expected_missing_predictions = np.array([misprd_gt0, 
misprd_gt1]) + expected_unmatched_ground_truths = np.array([misprd_gt0, misprd_gt1]) assert np.isclose( - missing_predictions[0, indices, :, :], expected_missing_predictions + unmatched_ground_truths[0, indices, :, :], + expected_unmatched_ground_truths, ).all() """ @ iou=0.5, 0.1 <= score < 0.65 1x tp 1x fp misclassification - 1x fp hallucination + 1x fp unmatched prediction 1x fn misclassification - 2x fn missing prediction + 2x fn unmatched ground truth """ indices = slice(10, 65) @@ -290,26 +295,27 @@ def test_compute_confusion_matrix_with_examples(): # total count, datum 0, pd 0, score 0, datum 1, pd 1, score 1 hal_pd0 = np.array([1.0, 2.0, 4.0, 0.65, -1.0, -1.0, -1.0]) hal_pd1 = np.array([-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]) - expected_hallucinations = np.array([hal_pd0, hal_pd1]) + expected_unmatched_predictions = np.array([hal_pd0, hal_pd1]) assert np.isclose( - hallucinations[0, indices, :, :], expected_hallucinations + unmatched_predictions[0, indices, :, :], expected_unmatched_predictions ).all() # total count, datum 0, gt 0, datum1, gt 1 misprd_gt0 = np.array([3.0, 1.0, 2.0, 3.0, 4.0]) misprd_gt1 = np.array([-1.0, -1.0, -1.0, -1.0, -1.0]) - expected_missing_predictions = np.array([misprd_gt0, misprd_gt1]) + expected_unmatched_ground_truths = np.array([misprd_gt0, misprd_gt1]) assert np.isclose( - missing_predictions[0, indices, :, :], expected_missing_predictions + unmatched_ground_truths[0, indices, :, :], + expected_unmatched_ground_truths, ).all() """ @ iou=0.5, 0.65 <= score < 0.9 1x tp 1x fp misclassification - 0x fp hallucination + 0x fp unmatched prediction 1x fn misclassification - 2x fn missing prediction + 2x fn unmatched ground truth """ indices = slice(65, 90) @@ -331,26 +337,27 @@ def test_compute_confusion_matrix_with_examples(): # total count, datum 0, pd 0, score 0, datum 1, pd 1, score 1 hal_pd0 = np.array([-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]) hal_pd1 = np.array([-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]) - 
expected_hallucinations = np.array([hal_pd0, hal_pd1]) + expected_unmatched_predictions = np.array([hal_pd0, hal_pd1]) assert np.isclose( - hallucinations[0, indices, :, :], expected_hallucinations + unmatched_predictions[0, indices, :, :], expected_unmatched_predictions ).all() # total count, datum 0, gt 0, datum1, gt 1 misprd_gt0 = np.array([3.0, 1.0, 2.0, 3.0, 4.0]) misprd_gt1 = np.array([-1.0, -1.0, -1.0, -1.0, -1.0]) - expected_missing_predictions = np.array([misprd_gt0, misprd_gt1]) + expected_unmatched_ground_truths = np.array([misprd_gt0, misprd_gt1]) assert np.isclose( - missing_predictions[0, indices, :, :], expected_missing_predictions + unmatched_ground_truths[0, indices, :, :], + expected_unmatched_ground_truths, ).all() """ @ iou=0.5, score>=0.9 0x tp 0x fp misclassification - 0x fp hallucination + 0x fp unmatched prediction 0x fn misclassification - 4x fn missing prediction + 4x fn unmatched ground truth """ indices = slice(90, None) @@ -376,17 +383,18 @@ def test_compute_confusion_matrix_with_examples(): # total count, datum 0, pd 0, score 0, datum 1, pd 1, score 1 hal_pd0 = np.array([-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]) hal_pd1 = np.array([-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0]) - expected_hallucinations = np.array([hal_pd0, hal_pd1]) + expected_unmatched_predictions = np.array([hal_pd0, hal_pd1]) assert np.isclose( - hallucinations[0, indices, :, :], expected_hallucinations + unmatched_predictions[0, indices, :, :], expected_unmatched_predictions ).all() # total count, datum 0, gt 0, datum1, gt 1 misprd_gt0 = np.array([4.0, 0.0, 0.0, 1.0, 2.0]) misprd_gt1 = np.array([1.0, 1.0, 1.0, -1.0, -1.0]) - expected_missing_predictions = np.array([misprd_gt0, misprd_gt1]) + expected_unmatched_ground_truths = np.array([misprd_gt0, misprd_gt1]) assert np.isclose( - missing_predictions[0, indices, :, :], expected_missing_predictions + unmatched_ground_truths[0, indices, :, :], + expected_unmatched_ground_truths, ).all() @@ -424,7 +432,7 @@ def 
test_confusion_matrix( assert evaluator.ignored_prediction_labels == [ "not_v2", - "hallucination", + "no_overlap", ] assert evaluator.missing_prediction_labels == [ "missed_detection", @@ -467,7 +475,7 @@ def test_confusion_matrix( } } }, - "hallucinations": { + "unmatched_predictions": { "not_v2": { "count": 1, "examples": [ @@ -478,7 +486,7 @@ def test_confusion_matrix( } ], }, - "hallucination": { + "no_overlap": { "count": 1, "examples": [ { @@ -499,7 +507,7 @@ def test_confusion_matrix( ], }, }, - "missing_predictions": { + "unmatched_ground_truths": { "missed_detection": { "count": 1, "examples": [ @@ -544,7 +552,7 @@ def test_confusion_matrix( } } }, - "hallucinations": { + "unmatched_predictions": { "not_v2": { "count": 1, "examples": [ @@ -566,7 +574,7 @@ def test_confusion_matrix( ], }, }, - "missing_predictions": { + "unmatched_ground_truths": { "missed_detection": { "count": 1, "examples": [ @@ -611,7 +619,7 @@ def test_confusion_matrix( } } }, - "hallucinations": { + "unmatched_predictions": { "low_iou": { "count": 1, "examples": [ @@ -623,7 +631,7 @@ def test_confusion_matrix( ], } }, - "missing_predictions": { + "unmatched_ground_truths": { "missed_detection": { "count": 1, "examples": [ @@ -668,7 +676,7 @@ def test_confusion_matrix( } } }, - "hallucinations": { + "unmatched_predictions": { "low_iou": { "count": 1, "examples": [ @@ -680,7 +688,7 @@ def test_confusion_matrix( ], } }, - "missing_predictions": { + "unmatched_ground_truths": { "missed_detection": { "count": 1, "examples": [ @@ -711,8 +719,8 @@ def test_confusion_matrix( "type": "ConfusionMatrix", "value": { "confusion_matrix": {}, - "hallucinations": {}, - "missing_predictions": { + "unmatched_predictions": {}, + "unmatched_ground_truths": { "v1": { "count": 1, "examples": [ @@ -749,8 +757,8 @@ def test_confusion_matrix( "type": "ConfusionMatrix", "value": { "confusion_matrix": {}, - "hallucinations": {}, - "missing_predictions": { + "unmatched_predictions": {}, + 
"unmatched_ground_truths": { "v1": { "count": 1, "examples": [ @@ -787,8 +795,8 @@ def test_confusion_matrix( for m in actual_metrics: _filter_out_zero_counts( m["value"]["confusion_matrix"], - m["value"]["hallucinations"], - m["value"]["missing_predictions"], + m["value"]["unmatched_predictions"], + m["value"]["unmatched_ground_truths"], ) assert m in expected_metrics for m in expected_metrics: @@ -835,8 +843,8 @@ def test_confusion_matrix( } }, }, - "hallucinations": { - "hallucination": { + "unmatched_predictions": { + "no_overlap": { "count": 1, "examples": [ { @@ -857,7 +865,7 @@ def test_confusion_matrix( ], }, }, - "missing_predictions": { + "unmatched_ground_truths": { "missed_detection": { "count": 1, "examples": [ @@ -912,7 +920,7 @@ def test_confusion_matrix( } }, }, - "hallucinations": { + "unmatched_predictions": { "low_iou": { "count": 1, "examples": [ @@ -924,7 +932,7 @@ def test_confusion_matrix( ], } }, - "missing_predictions": { + "unmatched_ground_truths": { "missed_detection": { "count": 1, "examples": [ @@ -966,7 +974,7 @@ def test_confusion_matrix( } } }, - "hallucinations": { + "unmatched_predictions": { "low_iou": { "count": 1, "examples": [ @@ -978,7 +986,7 @@ def test_confusion_matrix( ], } }, - "missing_predictions": { + "unmatched_ground_truths": { "missed_detection": { "count": 1, "examples": [ @@ -1026,7 +1034,7 @@ def test_confusion_matrix( } } }, - "hallucinations": { + "unmatched_predictions": { "low_iou": { "count": 1, "examples": [ @@ -1038,7 +1046,7 @@ def test_confusion_matrix( ], } }, - "missing_predictions": { + "unmatched_ground_truths": { "missed_detection": { "count": 1, "examples": [ @@ -1072,8 +1080,8 @@ def test_confusion_matrix( "type": "ConfusionMatrix", "value": { "confusion_matrix": {}, - "hallucinations": {}, - "missing_predictions": { + "unmatched_predictions": {}, + "unmatched_ground_truths": { "v1": { "count": 1, "examples": [ @@ -1116,8 +1124,8 @@ def test_confusion_matrix( "type": "ConfusionMatrix", "value": { 
"confusion_matrix": {}, - "hallucinations": {}, - "missing_predictions": { + "unmatched_predictions": {}, + "unmatched_ground_truths": { "v1": { "count": 1, "examples": [ @@ -1161,8 +1169,8 @@ def test_confusion_matrix( for m in actual_metrics: _filter_out_zero_counts( m["value"]["confusion_matrix"], - m["value"]["hallucinations"], - m["value"]["missing_predictions"], + m["value"]["unmatched_predictions"], + m["value"]["unmatched_ground_truths"], ) assert m in expected_metrics for m in expected_metrics: @@ -1210,8 +1218,10 @@ def test_confusion_matrix_using_torch_metrics_example( "0": {"0": {"count": 5, "examples": []}}, "49": {"49": {"count": 9, "examples": []}}, }, - "hallucinations": {}, - "missing_predictions": {"49": {"count": 1, "examples": []}}, + "unmatched_predictions": {}, + "unmatched_ground_truths": { + "49": {"count": 1, "examples": []} + }, }, "parameters": { "score_threshold": 0.05, @@ -1232,8 +1242,8 @@ def test_confusion_matrix_using_torch_metrics_example( "0": {"0": {"count": 5, "examples": []}}, "49": {"49": {"count": 6, "examples": []}}, }, - "hallucinations": {}, - "missing_predictions": { + "unmatched_predictions": {}, + "unmatched_ground_truths": { "4": {"count": 1, "examples": []}, "49": {"count": 4, "examples": []}, }, @@ -1253,8 +1263,8 @@ def test_confusion_matrix_using_torch_metrics_example( "0": {"0": {"count": 4, "examples": []}}, "49": {"49": {"count": 4, "examples": []}}, }, - "hallucinations": {}, - "missing_predictions": { + "unmatched_predictions": {}, + "unmatched_ground_truths": { "4": {"count": 1, "examples": []}, "2": {"count": 1, "examples": []}, "1": {"count": 1, "examples": []}, @@ -1276,8 +1286,8 @@ def test_confusion_matrix_using_torch_metrics_example( "0": {"0": {"count": 3, "examples": []}}, "49": {"49": {"count": 3, "examples": []}}, }, - "hallucinations": {}, - "missing_predictions": { + "unmatched_predictions": {}, + "unmatched_ground_truths": { "4": {"count": 2, "examples": []}, "2": {"count": 1, "examples": []}, 
"1": {"count": 1, "examples": []}, @@ -1298,8 +1308,8 @@ def test_confusion_matrix_using_torch_metrics_example( "0": {"0": {"count": 2, "examples": []}}, "49": {"49": {"count": 2, "examples": []}}, }, - "hallucinations": {}, - "missing_predictions": { + "unmatched_predictions": {}, + "unmatched_ground_truths": { "4": {"count": 2, "examples": []}, "2": {"count": 2, "examples": []}, "1": {"count": 1, "examples": []}, @@ -1320,8 +1330,8 @@ def test_confusion_matrix_using_torch_metrics_example( "0": {"0": {"count": 2, "examples": []}}, "49": {"49": {"count": 1, "examples": []}}, }, - "hallucinations": {}, - "missing_predictions": { + "unmatched_predictions": {}, + "unmatched_ground_truths": { "4": {"count": 2, "examples": []}, "2": {"count": 2, "examples": []}, "1": {"count": 1, "examples": []}, @@ -1342,8 +1352,8 @@ def test_confusion_matrix_using_torch_metrics_example( "0": {"0": {"count": 1, "examples": []}}, "49": {"49": {"count": 1, "examples": []}}, }, - "hallucinations": {}, - "missing_predictions": { + "unmatched_predictions": {}, + "unmatched_ground_truths": { "4": {"count": 2, "examples": []}, "2": {"count": 2, "examples": []}, "1": {"count": 1, "examples": []}, @@ -1361,8 +1371,8 @@ def test_confusion_matrix_using_torch_metrics_example( "type": "ConfusionMatrix", "value": { "confusion_matrix": {"0": {"0": {"count": 1, "examples": []}}}, - "hallucinations": {}, - "missing_predictions": { + "unmatched_predictions": {}, + "unmatched_ground_truths": { "4": {"count": 2, "examples": []}, "2": {"count": 2, "examples": []}, "1": {"count": 1, "examples": []}, @@ -1384,14 +1394,14 @@ def test_confusion_matrix_using_torch_metrics_example( "0": {"0": {"count": 1, "examples": []}}, "49": {"49": {"count": 2, "examples": []}}, }, - "hallucinations": { + "unmatched_predictions": { "4": {"count": 2, "examples": []}, "3": {"count": 1, "examples": []}, "1": {"count": 1, "examples": []}, "0": {"count": 4, "examples": []}, "49": {"count": 7, "examples": []}, }, - 
"missing_predictions": { + "unmatched_ground_truths": { "4": {"count": 2, "examples": []}, "2": {"count": 1, "examples": []}, "1": {"count": 1, "examples": []}, @@ -1413,14 +1423,14 @@ def test_confusion_matrix_using_torch_metrics_example( "0": {"0": {"count": 1, "examples": []}}, "49": {"49": {"count": 2, "examples": []}}, }, - "hallucinations": { + "unmatched_predictions": { "4": {"count": 1, "examples": []}, "3": {"count": 1, "examples": []}, "1": {"count": 1, "examples": []}, "0": {"count": 4, "examples": []}, "49": {"count": 4, "examples": []}, }, - "missing_predictions": { + "unmatched_ground_truths": { "4": {"count": 2, "examples": []}, "2": {"count": 1, "examples": []}, "1": {"count": 1, "examples": []}, @@ -1441,12 +1451,12 @@ def test_confusion_matrix_using_torch_metrics_example( "2": {"2": {"count": 1, "examples": []}}, "49": {"49": {"count": 2, "examples": []}}, }, - "hallucinations": { + "unmatched_predictions": { "4": {"count": 1, "examples": []}, "0": {"count": 4, "examples": []}, "49": {"count": 2, "examples": []}, }, - "missing_predictions": { + "unmatched_ground_truths": { "4": {"count": 2, "examples": []}, "2": {"count": 1, "examples": []}, "1": {"count": 1, "examples": []}, @@ -1467,11 +1477,11 @@ def test_confusion_matrix_using_torch_metrics_example( "2": {"2": {"count": 1, "examples": []}}, "49": {"49": {"count": 1, "examples": []}}, }, - "hallucinations": { + "unmatched_predictions": { "0": {"count": 3, "examples": []}, "49": {"count": 2, "examples": []}, }, - "missing_predictions": { + "unmatched_ground_truths": { "4": {"count": 2, "examples": []}, "2": {"count": 1, "examples": []}, "1": {"count": 1, "examples": []}, @@ -1491,11 +1501,11 @@ def test_confusion_matrix_using_torch_metrics_example( "confusion_matrix": { "49": {"49": {"count": 1, "examples": []}} }, - "hallucinations": { + "unmatched_predictions": { "0": {"count": 2, "examples": []}, "49": {"count": 1, "examples": []}, }, - "missing_predictions": { + "unmatched_ground_truths": { 
"4": {"count": 2, "examples": []}, "2": {"count": 2, "examples": []}, "1": {"count": 1, "examples": []}, @@ -1515,8 +1525,8 @@ def test_confusion_matrix_using_torch_metrics_example( "confusion_matrix": { "49": {"49": {"count": 1, "examples": []}} }, - "hallucinations": {"0": {"count": 2, "examples": []}}, - "missing_predictions": { + "unmatched_predictions": {"0": {"count": 2, "examples": []}}, + "unmatched_ground_truths": { "4": {"count": 2, "examples": []}, "2": {"count": 2, "examples": []}, "1": {"count": 1, "examples": []}, @@ -1536,8 +1546,8 @@ def test_confusion_matrix_using_torch_metrics_example( "confusion_matrix": { "49": {"49": {"count": 1, "examples": []}} }, - "hallucinations": {"0": {"count": 1, "examples": []}}, - "missing_predictions": { + "unmatched_predictions": {"0": {"count": 1, "examples": []}}, + "unmatched_ground_truths": { "4": {"count": 2, "examples": []}, "2": {"count": 2, "examples": []}, "1": {"count": 1, "examples": []}, @@ -1555,8 +1565,8 @@ def test_confusion_matrix_using_torch_metrics_example( "type": "ConfusionMatrix", "value": { "confusion_matrix": {}, - "hallucinations": {"0": {"count": 1, "examples": []}}, - "missing_predictions": { + "unmatched_predictions": {"0": {"count": 1, "examples": []}}, + "unmatched_ground_truths": { "4": {"count": 2, "examples": []}, "2": {"count": 2, "examples": []}, "1": {"count": 1, "examples": []}, @@ -1574,20 +1584,20 @@ def test_confusion_matrix_using_torch_metrics_example( for m in actual_metrics: _filter_out_zero_counts( m["value"]["confusion_matrix"], - m["value"]["hallucinations"], - m["value"]["missing_predictions"], + m["value"]["unmatched_predictions"], + m["value"]["unmatched_ground_truths"], ) assert m in expected_metrics for m in expected_metrics: assert m in actual_metrics -def test_confusion_matrix_fp_hallucination_edge_case( - detections_fp_hallucination_edge_case: list[Detection], +def test_confusion_matrix_fp_unmatched_prediction_edge_case( + 
detections_fp_unmatched_prediction_edge_case: list[Detection], ): loader = DataLoader() - loader.add_bounding_boxes(detections_fp_hallucination_edge_case) + loader.add_bounding_boxes(detections_fp_unmatched_prediction_edge_case) evaluator = loader.finalize() assert evaluator.ignored_prediction_labels == [] @@ -1635,7 +1645,7 @@ def test_confusion_matrix_fp_hallucination_edge_case( } } }, - "hallucinations": { + "unmatched_predictions": { "v1": { "count": 1, "examples": [ @@ -1652,7 +1662,7 @@ def test_confusion_matrix_fp_hallucination_edge_case( ], } }, - "missing_predictions": { + "unmatched_ground_truths": { "v1": { "count": 1, "examples": [ @@ -1679,8 +1689,8 @@ def test_confusion_matrix_fp_hallucination_edge_case( "type": "ConfusionMatrix", "value": { "confusion_matrix": {}, - "hallucinations": {}, - "missing_predictions": { + "unmatched_predictions": {}, + "unmatched_ground_truths": { "v1": { "count": 2, "examples": [ @@ -1707,8 +1717,8 @@ def test_confusion_matrix_fp_hallucination_edge_case( for m in actual_metrics: _filter_out_zero_counts( m["value"]["confusion_matrix"], - m["value"]["hallucinations"], - m["value"]["missing_predictions"], + m["value"]["unmatched_predictions"], + m["value"]["unmatched_ground_truths"], ) assert m in expected_metrics for m in expected_metrics: @@ -1763,11 +1773,11 @@ def test_confusion_matrix_ranked_pair_ordering( "label1": {"label1": {"count": 1, "examples": []}}, "label2": {"label2": {"count": 1, "examples": []}}, }, - "hallucinations": { + "unmatched_predictions": { "label3": {"count": 1, "examples": []}, "label4": {"count": 1, "examples": []}, }, - "missing_predictions": { + "unmatched_ground_truths": { "label3": {"count": 1, "examples": []} }, }, @@ -1781,8 +1791,8 @@ def test_confusion_matrix_ranked_pair_ordering( for m in actual_metrics: _filter_out_zero_counts( m["value"]["confusion_matrix"], - m["value"]["hallucinations"], - m["value"]["missing_predictions"], + m["value"]["unmatched_predictions"], + 
m["value"]["unmatched_ground_truths"], ) assert m in expected_metrics for m in expected_metrics: diff --git a/lite/tests/object_detection/test_counts.py b/lite/tests/object_detection/test_counts.py index c267e3b82..2ddc55167 100644 --- a/lite/tests/object_detection/test_counts.py +++ b/lite/tests/object_detection/test_counts.py @@ -12,7 +12,7 @@ def test_counts_metrics_first_class( datum uid1 box 1 - label v1 - tp datum uid2 - box 2 - label v1 - fn missing prediction + box 2 - label v1 - fn unmatched ground truths predictions datum uid1 @@ -112,7 +112,7 @@ def test_counts_metrics_second_class( groundtruths datum uid1 - box 3 - label v2 - fn missing prediction + box 3 - label v2 - fn unmatched ground truths datum uid2 none predictions diff --git a/lite/tests/object_detection/test_f1.py b/lite/tests/object_detection/test_f1.py index 078581559..20b38deca 100644 --- a/lite/tests/object_detection/test_f1.py +++ b/lite/tests/object_detection/test_f1.py @@ -49,9 +49,9 @@ def test_f1_metrics_first_class( groundtruths datum uid1 box 1 - label v1 - tp - box 3 - label v2 - fn missing prediction + box 3 - label v2 - fn unmatched ground truths datum uid2 - box 2 - label v1 - fn missing prediction + box 2 - label v1 - fn unmatched ground truths predictions datum uid1 @@ -134,7 +134,7 @@ def test_f1_metrics_second_class( groundtruths datum uid1 - box 3 - label v2 - fn missing prediction + box 3 - label v2 - fn unmatched ground truths datum uid2 none predictions diff --git a/lite/tests/object_detection/test_filtering.py b/lite/tests/object_detection/test_filtering.py index 4d0daa4e8..33b707bec 100644 --- a/lite/tests/object_detection/test_filtering.py +++ b/lite/tests/object_detection/test_filtering.py @@ -68,7 +68,7 @@ def test_filtering_one_detection(one_detection: list[Detection]): groundtruths datum uid1 box 1 - label v1 - tp - box 3 - label v2 - fn missing prediction + box 3 - label v2 - fn unmatched ground truths predictions datum uid1 @@ -184,7 +184,7 @@ def 
test_filtering_two_detections(two_detections: list[Detection]): groundtruths datum uid1 box 1 - label v1 - tp - box 3 - label v2 - fn missing prediction + box 3 - label v2 - fn unmatched ground truths datum uid2 box 2 - label v1 - fn misclassification @@ -305,12 +305,12 @@ def test_filtering_four_detections(four_detections: list[Detection]): groundtruths datum uid1 box 1 - label v1 - tp - box 3 - label v2 - fn missing prediction + box 3 - label v2 - fn unmatched ground truths datum uid2 box 2 - label v1 - fn misclassification datum uid3 box 1 - label v1 - tp - box 3 - label v2 - fn missing prediction + box 3 - label v2 - fn unmatched ground truths datum uid4 box 2 - label v1 - fn misclassification @@ -443,12 +443,12 @@ def test_filtering_all_detections(four_detections: list[Detection]): groundtruths datum uid1 box 1 - label v1 - tp - box 3 - label v2 - fn missing prediction + box 3 - label v2 - fn unmatched ground truths datum uid2 box 2 - label v1 - fn misclassification datum uid3 box 1 - label v1 - tp - box 3 - label v2 - fn missing prediction + box 3 - label v2 - fn unmatched ground truths datum uid4 box 2 - label v1 - fn misclassification diff --git a/lite/tests/object_detection/test_precision.py b/lite/tests/object_detection/test_precision.py index 0a86d4e5e..9b6ae6868 100644 --- a/lite/tests/object_detection/test_precision.py +++ b/lite/tests/object_detection/test_precision.py @@ -49,9 +49,9 @@ def test_precision_metrics_first_class( groundtruths datum uid1 box 1 - label v1 - tp - box 3 - label v2 - fn missing prediction + box 3 - label v2 - fn unmatched ground truths datum uid2 - box 2 - label v1 - fn missing prediction + box 2 - label v1 - fn unmatched ground truths predictions datum uid1 @@ -134,7 +134,7 @@ def test_precision_metrics_second_class( groundtruths datum uid1 - box 3 - label v2 - fn missing prediction + box 3 - label v2 - fn unmatched ground truths datum uid2 none predictions diff --git a/lite/tests/object_detection/test_recall.py 
b/lite/tests/object_detection/test_recall.py index 662aa00e2..4e210b4d5 100644 --- a/lite/tests/object_detection/test_recall.py +++ b/lite/tests/object_detection/test_recall.py @@ -50,7 +50,7 @@ def test_recall_metrics_first_class( datum uid1 box 1 - label v1 - tp datum uid2 - box 2 - label v1 - fn missing prediction + box 2 - label v1 - fn unmatched ground truths predictions datum uid1 @@ -133,7 +133,7 @@ def test_recall_metrics_second_class( groundtruths datum uid1 - box 3 - label v2 - fn missing prediction + box 3 - label v2 - fn unmatched ground truths datum uid2 none predictions diff --git a/lite/tests/semantic_segmentation/test_confusion_matrix.py b/lite/tests/semantic_segmentation/test_confusion_matrix.py index 2ad2afdd5..5b4178b52 100644 --- a/lite/tests/semantic_segmentation/test_confusion_matrix.py +++ b/lite/tests/semantic_segmentation/test_confusion_matrix.py @@ -25,11 +25,11 @@ def test_confusion_matrix_basic_segmentations( "v1": {"v1": {"iou": 0.5}, "v2": {"iou": 0.0}}, "v2": {"v1": {"iou": 0.0}, "v2": {"iou": 0.5}}, }, - "hallucinations": { + "unmatched_predictions": { "v1": {"ratio": 0.0}, "v2": {"ratio": 0.5}, }, - "missing_predictions": { + "unmatched_ground_truths": { "v1": {"ratio": 0.5}, "v2": {"ratio": 0.0}, }, @@ -71,13 +71,13 @@ def test_confusion_matrix_segmentations_from_boxes( }, }, }, - "hallucinations": { + "unmatched_predictions": { "v1": {"ratio": 5000 / 10000}, # 50% overlap "v2": { "ratio": 4999 / 5000 }, # overlaps 1 pixel out of 5000 predictions }, - "missing_predictions": { + "unmatched_ground_truths": { "v1": {"ratio": 5000 / 10000}, "v2": { "ratio": 14999 / 15000 diff --git a/lite/valor_lite/classification/computation.py b/lite/valor_lite/classification/computation.py index 04a6fd2af..4a463a74d 100644 --- a/lite/valor_lite/classification/computation.py +++ b/lite/valor_lite/classification/computation.py @@ -282,7 +282,7 @@ def compute_confusion_matrix( NDArray[np.float64] Confusion matrix. 
NDArray[np.int32] - Ground truths with missing predictions. + Unmatched Ground Truths. """ n_labels = label_metadata.shape[0] @@ -292,7 +292,7 @@ def compute_confusion_matrix( (n_scores, n_labels, n_labels, 2 * n_examples + 1), dtype=np.float32, ) - missing_predictions = -1 * np.ones( + unmatched_ground_truths = -1 * np.ones( (n_scores, n_labels, n_examples + 1), dtype=np.int32, ) @@ -339,7 +339,7 @@ def compute_confusion_matrix( score_idx, misclf_labels[:, 0], misclf_labels[:, 1], 0 ] = misclf_counts - missing_predictions[score_idx, misprd_labels, 0] = misprd_counts + unmatched_ground_truths[score_idx, misprd_labels, 0] = misprd_counts if n_examples > 0: for label_idx in range(n_labels): @@ -375,16 +375,16 @@ def compute_confusion_matrix( 1 : 2 * misclf_label_examples.shape[0] + 1, ] = misclf_label_examples[:, [0, 3]].flatten() - # missing prediction examples + # unmatched ground truth examples mask_misprd_label = misprd_examples[:, 1] == label_idx if misprd_examples.size > 0: misprd_label_examples = misprd_examples[mask_misprd_label][ :n_examples ] - missing_predictions[ + unmatched_ground_truths[ score_idx, label_idx, 1 : misprd_label_examples.shape[0] + 1, ] = misprd_label_examples[:, 0].flatten() - return confusion_matrix, missing_predictions + return confusion_matrix, unmatched_ground_truths diff --git a/lite/valor_lite/classification/metric.py b/lite/valor_lite/classification/metric.py index cce5e8a52..3810d48dd 100644 --- a/lite/valor_lite/classification/metric.py +++ b/lite/valor_lite/classification/metric.py @@ -321,7 +321,7 @@ def confusion_matrix( ], ], ], - missing_predictions: dict[ + unmatched_ground_truths: dict[ str, # ground truth label value dict[ str, # either `count` or `examples` @@ -335,8 +335,8 @@ def confusion_matrix( The confusion matrix and related metrics for the classification task. 
This class encapsulates detailed information about the model's performance, including correct - predictions, misclassifications, hallucinations (false positives), and missing predictions - (false negatives). It provides counts and examples for each category to facilitate in-depth analysis. + predictions, misclassifications, unmatched predictions (subset of false positives), and unmatched ground truths + (subset of false negatives). It provides counts and examples for each category to facilitate in-depth analysis. Confusion Matrix Structure: { @@ -358,7 +358,7 @@ def confusion_matrix( ... } - Missing Prediction Structure: + Unmatched Ground Truths Structure: { ground_truth_label: { 'count': int, @@ -379,7 +379,7 @@ def confusion_matrix( A nested dictionary where the first key is the ground truth label value, the second key is the prediction label value, and the innermost dictionary contains either a `count` or a list of `examples`. Each example includes the datum UID and prediction score. - missing_predictions : dict + unmatched_ground_truths : dict A dictionary where each key is a ground truth label value for which the model failed to predict (false negatives). The value is a dictionary containing either a `count` or a list of `examples`. Each example includes the datum UID. 
@@ -396,7 +396,7 @@ def confusion_matrix( type=MetricType.ConfusionMatrix.value, value={ "confusion_matrix": confusion_matrix, - "missing_predictions": missing_predictions, + "unmatched_ground_truths": unmatched_ground_truths, }, parameters={ "score_threshold": score_threshold, diff --git a/lite/valor_lite/classification/utilities.py b/lite/valor_lite/classification/utilities.py index a82c5d330..86faf0c70 100644 --- a/lite/valor_lite/classification/utilities.py +++ b/lite/valor_lite/classification/utilities.py @@ -153,20 +153,20 @@ def _unpack_confusion_matrix_value( } -def _unpack_missing_predictions_value( - missing_predictions: NDArray[np.int32], +def _unpack_unmatched_ground_truths_value( + unmatched_ground_truths: NDArray[np.int32], number_of_labels: int, number_of_examples: int, index_to_uid: dict[int, str], index_to_label: dict[int, str], ) -> dict[str, dict[str, int | list[dict[str, str]]]]: """ - Unpacks a numpy array of missing prediction counts and examples. + Unpacks a numpy array of unmatched ground truth counts and examples. 
""" datum_idx = ( lambda gt_label_idx, example_idx: int( # noqa: E731 - lambda fn - missing_predictions[ + unmatched_ground_truths[ gt_label_idx, example_idx + 1, ] @@ -176,7 +176,7 @@ def _unpack_missing_predictions_value( return { index_to_label[gt_label_idx]: { "count": max( - int(missing_predictions[gt_label_idx, 0]), + int(unmatched_ground_truths[gt_label_idx, 0]), 0, ), "examples": [ @@ -197,7 +197,7 @@ def unpack_confusion_matrix_into_metric_list( index_to_label: dict[int, str], ) -> list[Metric]: - (confusion_matrix, missing_predictions) = results + (confusion_matrix, unmatched_ground_truths) = results n_scores, n_labels, _, _ = confusion_matrix.shape return [ Metric.confusion_matrix( @@ -210,8 +210,10 @@ def unpack_confusion_matrix_into_metric_list( index_to_label=index_to_label, index_to_uid=index_to_uid, ), - missing_predictions=_unpack_missing_predictions_value( - missing_predictions=missing_predictions[score_idx, :, :], + unmatched_ground_truths=_unpack_unmatched_ground_truths_value( + unmatched_ground_truths=unmatched_ground_truths[ + score_idx, :, : + ], number_of_labels=n_labels, number_of_examples=number_of_examples, index_to_label=index_to_label, diff --git a/lite/valor_lite/object_detection/computation.py b/lite/valor_lite/object_detection/computation.py index 3f21c8e1c..6e0e9163f 100644 --- a/lite/valor_lite/object_detection/computation.py +++ b/lite/valor_lite/object_detection/computation.py @@ -669,9 +669,9 @@ def compute_confusion_matrix( NDArray[np.float64] Confusion matrix. NDArray[np.float64] - Hallucinations. + Unmatched Predictions. NDArray[np.int32] - Missing Predictions. + Unmatched Ground Truths. 
""" n_labels = label_metadata.shape[0] @@ -683,12 +683,12 @@ def compute_confusion_matrix( (n_ious, n_scores, n_labels, n_labels, 4 * n_examples + 1), dtype=np.float32, ) - hallucinations = -1 * np.ones( + unmatched_predictions = -1 * np.ones( # (datum idx, pd idx, pd score) * n_examples + count (n_ious, n_scores, n_labels, 3 * n_examples + 1), dtype=np.float32, ) - missing_predictions = -1 * np.ones( + unmatched_ground_truths = -1 * np.ones( # (datum idx, gt idx) * n_examples + count (n_ious, n_scores, n_labels, 2 * n_examples + 1), dtype=np.int32, @@ -793,7 +793,7 @@ def compute_confusion_matrix( data[mask_misclf], unique_idx=[0, 1, 2, 4, 5], label_idx=[3, 4] ) - # count hallucinations + # count unmatched predictions ( halluc_examples, halluc_labels, @@ -802,7 +802,7 @@ def compute_confusion_matrix( data[mask_halluc], unique_idx=[0, 2, 5], label_idx=2 ) - # count missing predictions + # count unmatched ground truths ( misprd_examples, misprd_labels, @@ -822,13 +822,13 @@ def compute_confusion_matrix( misclf_labels[:, 1], 0, ] = misclf_counts - hallucinations[ + unmatched_predictions[ iou_idx, score_idx, halluc_labels, 0, ] = halluc_counts - missing_predictions[ + unmatched_ground_truths[ iou_idx, score_idx, misprd_labels, @@ -877,26 +877,26 @@ def compute_confusion_matrix( :, [0, 1, 2, 6] ].flatten() - # hallucination examples + # unmatched prediction examples mask_halluc_label = halluc_examples[:, 5] == label_idx if mask_halluc_label.sum() > 0: halluc_label_examples = halluc_examples[ mask_halluc_label ][:n_examples] - hallucinations[ + unmatched_predictions[ iou_idx, score_idx, label_idx, 1 : 3 * halluc_label_examples.shape[0] + 1, ] = halluc_label_examples[:, [0, 2, 6]].flatten() - # missing prediction examples + # unmatched ground truth examples mask_misprd_label = misprd_examples[:, 4] == label_idx if misprd_examples.size > 0: misprd_label_examples = misprd_examples[ mask_misprd_label ][:n_examples] - missing_predictions[ + unmatched_ground_truths[ iou_idx, 
score_idx, label_idx, @@ -905,6 +905,6 @@ return ( confusion_matrix, - hallucinations, - missing_predictions, + unmatched_predictions, + unmatched_ground_truths, ) diff --git a/lite/valor_lite/object_detection/metric.py b/lite/valor_lite/object_detection/metric.py index d8a589cd1..171a72861 100644 --- a/lite/valor_lite/object_detection/metric.py +++ b/lite/valor_lite/object_detection/metric.py @@ -619,7 +619,7 @@ def confusion_matrix( ], ], ], - hallucinations: dict[ + unmatched_predictions: dict[ str, # prediction label value dict[ str, # either `count` or `examples` @@ -636,7 +636,7 @@ def confusion_matrix( ], ], ], - missing_predictions: dict[ + unmatched_ground_truths: dict[ str, # ground truth label value dict[ str, # either `count` or `examples` @@ -660,8 +660,8 @@ def confusion_matrix( Confusion matrix for object detection tasks. This class encapsulates detailed information about the model's performance, including correct - predictions, misclassifications, hallucinations (false positives), and missing predictions - (false negatives). It provides counts and examples for each category to facilitate in-depth analysis. + predictions, misclassifications, unmatched predictions (subset of false positives), and unmatched ground truths + (subset of false negatives). It provides counts and examples for each category to facilitate in-depth analysis. Confusion Matrix Format: { @@ -683,7 +683,7 @@ def confusion_matrix( ... } - Hallucinations Format: + Unmatched Predictions Format: { : { 'count': int, @@ -699,7 +699,7 @@ def confusion_matrix( ... } - Missing Prediction Format: + Unmatched Ground Truths Format: { : { 'count': int, @@ -721,13 +721,13 @@ def confusion_matrix( is the prediction label value, and the innermost dictionary contains either a `count` or a list of `examples`. Each example includes the datum UID, ground truth bounding box, predicted bounding box, and prediction scores.
- hallucinations : dict + unmatched_predictions : dict A dictionary where each key is a prediction label value with no corresponding ground truth - (false positives). The value is a dictionary containing either a `count` or a list of + (subset of false positives). The value is a dictionary containing either a `count` or a list of `examples`. Each example includes the datum UID, predicted bounding box, and prediction score. - missing_predictions : dict + unmatched_ground_truths : dict A dictionary where each key is a ground truth label value for which the model failed to predict - (false negatives). The value is a dictionary containing either a `count` or a list of `examples`. + (subset of false negatives). The value is a dictionary containing either a `count` or a list of `examples`. Each example includes the datum UID and ground truth bounding box. score_threshold : float The confidence score threshold used to filter predictions. @@ -744,8 +744,8 @@ def confusion_matrix( type=MetricType.ConfusionMatrix.value, value={ "confusion_matrix": confusion_matrix, - "hallucinations": hallucinations, - "missing_predictions": missing_predictions, + "unmatched_predictions": unmatched_predictions, + "unmatched_ground_truths": unmatched_ground_truths, }, parameters={ "score_threshold": score_threshold, diff --git a/lite/valor_lite/object_detection/utilities.py b/lite/valor_lite/object_detection/utilities.py index 9e9de43a4..dde4b33e6 100644 --- a/lite/valor_lite/object_detection/utilities.py +++ b/lite/valor_lite/object_detection/utilities.py @@ -321,8 +321,8 @@ def _unpack_confusion_matrix_value( } -def _unpack_hallucinations_value( - hallucinations: NDArray[np.float64], +def _unpack_unmatched_predictions_value( + unmatched_predictions: NDArray[np.float64], number_of_labels: int, number_of_examples: int, index_to_uid: dict[int, str], @@ -336,12 +336,12 @@ def _unpack_hallucinations_value( ], ]: """ - Unpacks a numpy array of hallucination counts and examples. 
+ Unpacks a numpy array of unmatched prediction counts and examples. """ datum_idx = ( lambda pd_label_idx, example_idx: int( # noqa: E731 - lambda fn - hallucinations[ + unmatched_predictions[ pd_label_idx, example_idx * 3 + 1, ] @@ -350,7 +350,7 @@ def _unpack_hallucinations_value( prediction_idx = ( lambda pd_label_idx, example_idx: int( # noqa: E731 - lambda fn - hallucinations[ + unmatched_predictions[ pd_label_idx, example_idx * 3 + 2, ] @@ -359,7 +359,7 @@ def _unpack_hallucinations_value( score_idx = ( lambda pd_label_idx, example_idx: float( # noqa: E731 - lambda fn - hallucinations[ + unmatched_predictions[ pd_label_idx, example_idx * 3 + 3, ] @@ -369,7 +369,7 @@ def _unpack_hallucinations_value( return { index_to_label[pd_label_idx]: { "count": max( - int(hallucinations[pd_label_idx, 0]), + int(unmatched_predictions[pd_label_idx, 0]), 0, ), "examples": [ @@ -392,8 +392,8 @@ def _unpack_hallucinations_value( -def _unpack_missing_predictions_value( - missing_predictions: NDArray[np.int32], +def _unpack_unmatched_ground_truths_value( + unmatched_ground_truths: NDArray[np.int32], number_of_labels: int, number_of_examples: int, index_to_uid: dict[int, str], @@ -401,12 +401,12 @@ def _unpack_missing_predictions_value( groundtruth_examples: dict[int, NDArray[np.float16]], ) -> dict[str, dict[str, int | list[dict[str, str | dict[str, float]]]]]: """ - Unpacks a numpy array of missing prediction counts and examples. + Unpacks a numpy array of unmatched ground truth counts and examples.
""" datum_idx = ( lambda gt_label_idx, example_idx: int( # noqa: E731 - lambda fn - missing_predictions[ + unmatched_ground_truths[ gt_label_idx, example_idx * 2 + 1, ] @@ -415,7 +415,7 @@ def _unpack_missing_predictions_value( groundtruth_idx = ( lambda gt_label_idx, example_idx: int( # noqa: E731 - lambda fn - missing_predictions[ + unmatched_ground_truths[ gt_label_idx, example_idx * 2 + 2, ] @@ -425,7 +425,7 @@ def _unpack_missing_predictions_value( return { index_to_label[gt_label_idx]: { "count": max( - int(missing_predictions[gt_label_idx, 0]), + int(unmatched_ground_truths[gt_label_idx, 0]), 0, ), "examples": [ @@ -463,8 +463,8 @@ def unpack_confusion_matrix_into_metric_list( ) -> list[Metric]: ( confusion_matrix, - hallucinations, - missing_predictions, + unmatched_predictions, + unmatched_ground_truths, ) = results n_labels = len(index_to_label) return [ @@ -481,16 +481,18 @@ def unpack_confusion_matrix_into_metric_list( groundtruth_examples=groundtruth_examples, prediction_examples=prediction_examples, ), - hallucinations=_unpack_hallucinations_value( - hallucinations=hallucinations[iou_idx, score_idx, :, :], + unmatched_predictions=_unpack_unmatched_predictions_value( + unmatched_predictions=unmatched_predictions[ + iou_idx, score_idx, :, : + ], number_of_labels=n_labels, number_of_examples=number_of_examples, index_to_label=index_to_label, index_to_uid=index_to_uid, prediction_examples=prediction_examples, ), - missing_predictions=_unpack_missing_predictions_value( - missing_predictions=missing_predictions[ + unmatched_ground_truths=_unpack_unmatched_ground_truths_value( + unmatched_ground_truths=unmatched_ground_truths[ iou_idx, score_idx, :, : ], number_of_labels=n_labels, diff --git a/lite/valor_lite/semantic_segmentation/computation.py b/lite/valor_lite/semantic_segmentation/computation.py index 807c2bde6..25c43a180 100644 --- a/lite/valor_lite/semantic_segmentation/computation.py +++ b/lite/valor_lite/semantic_segmentation/computation.py @@ -98,9 
+98,9 @@ def compute_metrics( NDArray[np.float64] Confusion matrix containing IOU values. NDArray[np.float64] - Hallucination ratios. + Unmatched prediction ratios. NDArray[np.float64] - Missing prediction ratios. + Unmatched ground truth ratios. """ n_labels = label_metadata.shape[0] gt_counts = label_metadata[:, 0] @@ -108,7 +108,7 @@ def compute_metrics( counts = data.sum(axis=0) - # compute iou, missing_predictions and hallucinations + # compute iou, unmatched ground truths and unmatched predictions intersection_ = counts[1:, 1:] union_ = ( gt_counts[:, np.newaxis] + pd_counts[np.newaxis, :] - intersection_ @@ -122,20 +122,20 @@ def compute_metrics( out=ious, ) - hallucination_ratio = np.zeros((n_labels), dtype=np.float64) + unmatched_prediction_ratio = np.zeros((n_labels), dtype=np.float64) np.divide( counts[0, 1:], pd_counts, where=pd_counts > 1e-9, - out=hallucination_ratio, + out=unmatched_prediction_ratio, ) - missing_prediction_ratio = np.zeros((n_labels), dtype=np.float64) + unmatched_ground_truth_ratio = np.zeros((n_labels), dtype=np.float64) np.divide( counts[1:, 0], gt_counts, where=gt_counts > 1e-9, - out=missing_prediction_ratio, + out=unmatched_ground_truth_ratio, ) # compute precision, recall, f1 @@ -168,6 +168,6 @@ def compute_metrics( f1_score, accuracy, ious, - hallucination_ratio, - missing_prediction_ratio, + unmatched_prediction_ratio, + unmatched_ground_truth_ratio, ) diff --git a/lite/valor_lite/semantic_segmentation/metric.py b/lite/valor_lite/semantic_segmentation/metric.py index 509d1f424..949ca02c4 100644 --- a/lite/valor_lite/semantic_segmentation/metric.py +++ b/lite/valor_lite/semantic_segmentation/metric.py @@ -209,11 +209,11 @@ def confusion_matrix( dict[str, float], # iou ], ], - hallucinations: dict[ + unmatched_predictions: dict[ str, # prediction label value dict[str, float], # pixel ratio ], - missing_predictions: dict[ + unmatched_ground_truths: dict[ str, # ground truth label value dict[str, float], # pixel ratio ], @@ 
-222,8 +222,8 @@ def confusion_matrix( The confusion matrix and related metrics for semantic segmentation tasks. This class encapsulates detailed information about the model's performance, including correct - predictions, misclassifications, hallucinations (false positives), and missing predictions - (false negatives). It provides counts for each category to facilitate in-depth analysis. + predictions, misclassifications, unmatched predictions (subset of false positives), and unmatched ground truths + (subset of false negatives). It provides counts for each category to facilitate in-depth analysis. Confusion Matrix Format: { @@ -234,14 +234,14 @@ def confusion_matrix( }, } - Hallucinations Format: + Unmatched Predictions Format: { : { 'iou': , }, } - Missing Predictions Format: + Unmatched Ground Truths Format: { : { 'iou': , @@ -253,10 +253,10 @@ def confusion_matrix( confusion_matrix : dict Nested dictionaries representing the Intersection over Union (IOU) scores for each ground truth label and prediction label pair. - hallucinations : dict + unmatched_predictions : dict Dictionary representing the pixel ratios for predicted labels that do not correspond to any ground truth labels (false positives). - missing_predictions : dict + unmatched_ground_truths : dict Dictionary representing the pixel ratios for ground truth labels that were not predicted (false negatives). 
@@ -268,8 +268,8 @@ def confusion_matrix( type=MetricType.ConfusionMatrix.value, value={ "confusion_matrix": confusion_matrix, - "hallucinations": hallucinations, - "missing_predictions": missing_predictions, + "unmatched_predictions": unmatched_predictions, + "unmatched_ground_truths": unmatched_ground_truths, }, parameters={}, ) diff --git a/lite/valor_lite/semantic_segmentation/utilities.py b/lite/valor_lite/semantic_segmentation/utilities.py index 577e974f5..452ad5b4a 100644 --- a/lite/valor_lite/semantic_segmentation/utilities.py +++ b/lite/valor_lite/semantic_segmentation/utilities.py @@ -18,8 +18,8 @@ def unpack_precision_recall_iou_into_metric_lists( f1_score, accuracy, ious, - hallucination_ratios, - missing_prediction_ratios, + unmatched_prediction_ratios, + unmatched_ground_truth_ratios, ) = results metrics = defaultdict(list) @@ -43,16 +43,16 @@ def unpack_precision_recall_iou_into_metric_lists( for gt_label_idx in range(n_labels) if label_metadata[gt_label_idx, 0] > 0 }, - hallucinations={ + unmatched_predictions={ index_to_label[pd_label_idx]: { - "ratio": float(hallucination_ratios[pd_label_idx]) + "ratio": float(unmatched_prediction_ratios[pd_label_idx]) } for pd_label_idx in range(n_labels) if label_metadata[pd_label_idx, 0] > 0 }, - missing_predictions={ + unmatched_ground_truths={ index_to_label[gt_label_idx]: { - "ratio": float(missing_prediction_ratios[gt_label_idx]) + "ratio": float(unmatched_ground_truth_ratios[gt_label_idx]) } for gt_label_idx in range(n_labels) if label_metadata[gt_label_idx, 0] > 0