From a4bad6b7bdabb80592fe23d6d46590a1f8470c7a Mon Sep 17 00:00:00 2001
From: Gguidini
Date: Tue, 2 Jan 2024 17:09:07 -0300
Subject: [PATCH] feat+fix: preserve test collection order

We run into issues with test ordering with some frequency in the ATS step
in CI. This is because ATS scrambles the test order, which uncovers hidden
dependencies between tests. There is some value in doing that, but fixing
these issues is annoying and disrupts the workflow more than it helps.
Scrambling the test order needs to be deliberate, not accidental.

These changes make sure that we preserve the order in which pytest
collected the tests when deciding the order to run them. Notice that we
can still run into ordering issues if a test that depends on another test
doesn't get picked to run.
---
 codecov_cli/commands/labelanalysis.py         |  43 ++++----
 codecov_cli/runners/pytest_standard_runner.py |  25 ++---
 codecov_cli/runners/types.py                  |  28 +++++
 tests/commands/test_invoke_labelanalysis.py   | 101 +++++++++++++++---
 tests/runners/test_pytest_standard_runner.py  |  19 +++-
 5 files changed, 158 insertions(+), 58 deletions(-)

diff --git a/codecov_cli/commands/labelanalysis.py b/codecov_cli/commands/labelanalysis.py
index a2eda197..4d825179 100644
--- a/codecov_cli/commands/labelanalysis.py
+++ b/codecov_cli/commands/labelanalysis.py
@@ -192,7 +192,7 @@ def label_analysis(
         runner.process_labelanalysis_result(request_result)
     else:
         _dry_run_output(
-            LabelAnalysisRequestResult(request_result),
+            request_result,
             runner,
             dry_run_format,
             # It's possible that the task had processing errors and fallback to all tests
@@ -287,16 +287,14 @@ def _potentially_calculate_absent_labels(
     global_level_labels_set = set(request_result.get("global_level_labels", []))
     final_result = LabelAnalysisRequestResult(
         {
-            "present_report_labels": sorted(
+            "present_report_labels": list(
                 present_report_labels_set & requested_labels_set
             ),
-            "present_diff_labels": sorted(
+            "present_diff_labels": list(
                 present_diff_labels_set & requested_labels_set
             ),
-            "absent_labels": sorted(
-                requested_labels_set - present_report_labels_set
-            ),
-            "global_level_labels": sorted(
+            "absent_labels": list(requested_labels_set - present_report_labels_set),
+            "global_level_labels": list(
                 global_level_labels_set & requested_labels_set
             ),
         }
     )
@@ -312,6 +310,10 @@
             )
         ),
     )
+    # The requested labels are important because they preserve the collection
+    # order of the testing tool. Running tests out of order might surface
+    # errors in the tests.
+    final_result["collected_labels_in_order"] = requested_labels
     return final_result
@@ -369,15 +371,15 @@ def _send_labelanalysis_request(payload, url, token_header):
 
 
 def _dry_run_json_output(
-    labels_to_run: set,
-    labels_to_skip: set,
+    labels_to_run: List[str],
+    labels_to_skip: List[str],
     runner_options: List[str],
     fallback_reason: str = None,
 ) -> None:
     output_as_dict = dict(
         runner_options=runner_options,
-        ats_tests_to_run=sorted(labels_to_run),
-        ats_tests_to_skip=sorted(labels_to_skip),
+        ats_tests_to_run=labels_to_run,
+        ats_tests_to_skip=labels_to_skip,
         ats_fallback_reason=fallback_reason,
     )
     # ⚠️ DON'T use logger
@@ -386,8 +388,8 @@ def _dry_run_list_output(
-    labels_to_run: set,
-    labels_to_skip: set,
+    labels_to_run: List[str],
+    labels_to_skip: List[str],
     runner_options: List[str],
     fallback_reason: str = None,
 ) -> None:
     if fallback_reason:
         logger.warning(f"label-analysis didn't run correctly. Error: {fallback_reason}")
     to_run_line = " ".join(
-        sorted(map(lambda l: f"'{l}'", runner_options))
-        + sorted(map(lambda l: f"'{l}'", labels_to_run))
+        list(map(lambda l: f"'{l}'", runner_options))
+        + list(map(lambda l: f"'{l}'", labels_to_run))
     )
     to_skip_line = " ".join(
-        sorted(map(lambda l: f"'{l}'", runner_options))
-        + sorted(map(lambda l: f"'{l}'", labels_to_skip))
+        list(map(lambda l: f"'{l}'", runner_options))
+        + list(map(lambda l: f"'{l}'", labels_to_skip))
     )
     # ⚠️ DON'T use logger
     # logger goes to stderr, we want it in stdout
@@ -417,10 +419,8 @@ def _dry_run_output(
     # failed at some point. So it was not a completely successful task.
     fallback_reason: str = None,
 ):
-    labels_to_run = set(
-        result.absent_labels + result.global_level_labels + result.present_diff_labels
-    )
-    labels_to_skip = set(result.present_report_labels) - labels_to_run
+    labels_to_run = result.get_tests_to_run_in_collection_order()
+    labels_to_skip = result.get_tests_to_skip_in_collection_order()
 
     format_lookup = {
         "json": _dry_run_json_output,
@@ -451,6 +451,7 @@ def _fallback_to_collected_labels(
             "absent_labels": collected_labels,
             "present_diff_labels": [],
             "global_level_labels": [],
+            "collected_labels_in_order": collected_labels,
         }
     )
     if not dry_run:
diff --git a/codecov_cli/runners/pytest_standard_runner.py b/codecov_cli/runners/pytest_standard_runner.py
index 0ab327cf..d48f081a 100644
--- a/codecov_cli/runners/pytest_standard_runner.py
+++ b/codecov_cli/runners/pytest_standard_runner.py
@@ -140,35 +140,26 @@ def process_labelanalysis_result(self, result: LabelAnalysisRequestResult):
             f"--cov={self.params.coverage_root}",
             "--cov-context=test",
         ] + self.params.execute_tests_options
-        all_labels = set(
-            result.absent_labels
-            + result.present_diff_labels
-            + result.global_level_labels
-        )
-        skipped_tests = set(result.present_report_labels) - all_labels
-        if skipped_tests:
+        tests_to_run = result.get_tests_to_run_in_collection_order()
+        tests_to_skip = result.get_tests_to_skip_in_collection_order()
+        if tests_to_skip:
             logger.info(
                 "Some tests are being skipped. (run in verbose mode to get list of tests skipped)",
                 extra=dict(
-                    extra_log_attributes=dict(skipped_tests_count=len(skipped_tests))
+                    extra_log_attributes=dict(skipped_tests_count=len(tests_to_skip))
                 ),
             )
             logger.debug(
                 "List of skipped tests",
-                extra=dict(
-                    extra_log_attributes=dict(skipped_tests=sorted(skipped_tests))
-                ),
+                extra=dict(extra_log_attributes=dict(skipped_tests=tests_to_skip)),
             )
-        if len(all_labels) == 0:
-            all_labels = [random.choice(result.present_report_labels)]
+        if len(tests_to_run) == 0:
+            tests_to_run = [random.choice(result.present_report_labels)]
             logger.info(
                 "All tests are being skipped. Selected random label to run",
-                extra=dict(extra_log_attributes=dict(selected_label=all_labels[0])),
+                extra=dict(extra_log_attributes=dict(selected_label=tests_to_run[0])),
             )
-        tests_to_run = [
-            label.split("[")[0] if "[" in label else label for label in all_labels
-        ]
         command_array = default_options + tests_to_run
         logger.info(
             "Running tests. (run in verbose mode to get list of tests executed)"
diff --git a/codecov_cli/runners/types.py b/codecov_cli/runners/types.py
index f18a562e..d46f5275 100644
--- a/codecov_cli/runners/types.py
+++ b/codecov_cli/runners/types.py
@@ -21,6 +21,34 @@ def present_diff_labels(self) -> List[str]:
     def global_level_labels(self) -> List[str]:
         return self.get("global_level_labels", [])
 
+    @property
+    def collected_labels_in_order(self) -> List[str]:
+        """The list of collected labels, in the order returned by the testing tool.
+        This is a superset of all other lists.
+        """
+        return self.get("collected_labels_in_order", [])
+
+    def get_tests_to_run_in_collection_order(self) -> List[str]:
+        labels_to_run = set(
+            self.absent_labels + self.global_level_labels + self.present_diff_labels
+        )
+        output = []
+        for test_name in self.collected_labels_in_order:
+            if test_name in labels_to_run:
+                output.append(test_name)
+        return output
+
+    def get_tests_to_skip_in_collection_order(self) -> List[str]:
+        labels_to_run = set(
+            self.absent_labels + self.global_level_labels + self.present_diff_labels
+        )
+        labels_to_skip = set(self.present_report_labels) - labels_to_run
+        output = []
+        for test_name in self.collected_labels_in_order:
+            if test_name in labels_to_skip:
+                output.append(test_name)
+        return output
+
 
 class LabelAnalysisRunnerInterface(object):
     params: Dict = None
diff --git a/tests/commands/test_invoke_labelanalysis.py b/tests/commands/test_invoke_labelanalysis.py
index 230e5b12..cfd41201 100644
--- a/tests/commands/test_invoke_labelanalysis.py
+++ b/tests/commands/test_invoke_labelanalysis.py
@@ -67,24 +67,87 @@ def test_potentially_calculate_labels_recalculate(self):
                 "label_1",
                 "label_2",
                 "label_3",
+                "label_5",
                 "label_old",
                 "label_older",
             ],
             "absent_labels": [],
-            "present_diff_labels": ["label_2", "label_3", "label_old"],
+            "present_diff_labels": ["label_5", "label_3", "label_old"],
             "global_level_labels": ["label_1", "label_older"],
         }
-        collected_labels = ["label_1", "label_2", "label_3", "label_4"]
+        collected_labels = [
+            "label_2",
+            "label_3",
+            "label_6",
+            "label_1",
+            "label_4",
+            "label_5",
+        ]
         expected = {
-            "present_diff_labels": ["label_2", "label_3"],
+            "present_diff_labels": ["label_3", "label_5"],
             "global_level_labels": ["label_1"],
-            "absent_labels": ["label_4"],
-            "present_report_labels": ["label_1", "label_2", "label_3"],
+            "absent_labels": ["label_6", "label_4"],
+            "present_report_labels": ["label_2", "label_3", "label_1", "label_5"],
         }
-        assert (
-            _potentially_calculate_absent_labels(request_result, collected_labels)
-            == expected
-        )
+        result = _potentially_calculate_absent_labels(request_result, collected_labels)
+        assert result.get_tests_to_run_in_collection_order() == [
+            "label_3",
+            "label_6",
+            "label_1",
+            "label_4",
+            "label_5",
+        ]
+        assert result.get_tests_to_skip_in_collection_order() == ["label_2"]
+
+    def test_potentially_calculate_labels_recalculate(self):
+        request_result = {
+            "present_report_labels": [
+                "label_1",
+                "label_2",
+                "label_3",
+                "label_5",
+                "label_old",
+                "label_older",
+            ],
+            "absent_labels": [],
+            "present_diff_labels": ["label_5", "label_3", "label_old"],
+            "global_level_labels": ["label_1", "label_older"],
+        }
+        collected_labels = [
+            "label_2",
+            "label_3",
+            "label_6",
+            "label_1",
+            "label_4",
+            "label_5",
+        ]
+        expected = {
+            "present_diff_labels": ["label_3", "label_5"],
+            "global_level_labels": ["label_1"],
+            "absent_labels": ["label_4", "label_6"],
+            "present_report_labels": ["label_1", "label_2", "label_3", "label_5"],
+        }
+        res = _potentially_calculate_absent_labels(request_result, collected_labels)
+        # It's tricky to assert correctness because the ordering is not guaranteed.
+        # So we sort them and test the individual lists.
+        # Plus we test the ordering that actually matters.
+        assert sorted(res.absent_labels) == expected["absent_labels"]
+        assert sorted(res.global_level_labels) == expected["global_level_labels"]
+        assert sorted(res.present_diff_labels) == expected["present_diff_labels"]
+        assert sorted(res.present_report_labels) == expected["present_report_labels"]
+        # This is the only list that needs to actually be ordered
+        assert res.collected_labels_in_order == collected_labels
+        # And the ordering that matters most
+        assert res.get_tests_to_run_in_collection_order() == [
+            "label_3",
+            "label_6",
+            "label_1",
+            "label_4",
+            "label_5",
+        ]
+        assert res.get_tests_to_skip_in_collection_order() == [
+            "label_2",
+        ]
 
     def test_send_label_analysis_bad_payload(self):
         payload = {
@@ -274,7 +337,7 @@ def test_invoke_label_analysis(
         assert result.exit_code == 0
         mock_get_runner.assert_called()
         fake_runner.process_labelanalysis_result.assert_called_with(
-            label_analysis_result
+            {**label_analysis_result, "collected_labels_in_order": collected_labels}
         )
         print(result.output)
 
@@ -341,7 +404,7 @@ def test_invoke_label_analysis_dry_run(
         )
         assert json.loads(result.stdout) == {
             "runner_options": ["--labels"],
-            "ats_tests_to_run": ["test_absent", "test_global", "test_in_diff"],
+            "ats_tests_to_run": ["test_absent", "test_in_diff", "test_global"],
             "ats_tests_to_skip": ["test_present"],
             "ats_fallback_reason": ats_fallback_reason,
         }
@@ -401,7 +464,7 @@ def test_invoke_label_analysis_dry_run_pytest_format(
         assert result.exit_code == 0
         assert (
             result.stdout
-            == "TESTS_TO_RUN='--labels' 'test_absent' 'test_global' 'test_in_diff'\nTESTS_TO_SKIP='--labels' 'test_present'\n"
+            == "TESTS_TO_RUN='--labels' 'test_absent' 'test_in_diff' 'test_global'\nTESTS_TO_SKIP='--labels' 'test_present'\n"
         )
 
     def test_fallback_to_collected_labels(self, mocker):
@@ -413,6 +476,7 @@
                 "absent_labels": collected_labels,
                 "present_diff_labels": [],
                 "global_level_labels": [],
+                "collected_labels_in_order": collected_labels,
             }
         )
         _fallback_to_collected_labels(collected_labels, mock_runner)
@@ -457,6 +521,7 @@ def test_fallback_collected_labels_covecov_500_error(
                 "absent_labels": collected_labels,
                 "present_diff_labels": [],
                 "global_level_labels": [],
+                "collected_labels_in_order": collected_labels,
             }
         )
         print(result.output)
@@ -494,7 +559,7 @@ def test_fallback_collected_labels_covecov_500_error_dry_run(
         # Dry run format defaults to json
         assert json.loads(result.stdout) == {
             "runner_options": ["--labels"],
-            "ats_tests_to_run": sorted(collected_labels),
+            "ats_tests_to_run": collected_labels,
             "ats_tests_to_skip": [],
             "ats_fallback_reason": "codecov_unavailable",
         }
@@ -554,6 +619,7 @@ def test_fallback_collected_labels_codecov_error_processing_label_analysis(
                 "absent_labels": collected_labels,
                 "present_diff_labels": [],
                 "global_level_labels": [],
+                "collected_labels_in_order": collected_labels,
             }
         )
         print(result.output)
@@ -612,7 +678,7 @@ def test_fallback_collected_labels_codecov_error_processing_label_analysis_dry_r
         # Dry run format defaults to json
         assert json.loads(result.stdout) == {
             "runner_options": ["--labels"],
-            "ats_tests_to_run": sorted(collected_labels),
+            "ats_tests_to_run": collected_labels,
             "ats_tests_to_skip": [],
             "ats_fallback_reason": "test_list_processing_failed",
         }
@@ -670,6 +736,7 @@ def test_fallback_collected_labels_codecov_max_wait_time_exceeded(
                 "absent_labels": collected_labels,
                 "present_diff_labels": [],
                 "global_level_labels": [],
+                "collected_labels_in_order": collected_labels,
             }
         )
 
@@ -720,13 +787,13 @@ def test_fallback_collected_labels_codecov_max_wait_time_exceeded_dry_run(
         mock_get_runner.assert_called()
         fake_runner.process_labelanalysis_result.assert_not_called()
         # Dry run format defaults to json
+        assert result.exit_code == 0
         assert json.loads(result.stdout) == {
             "runner_options": ["--labels"],
-            "ats_tests_to_run": sorted(collected_labels),
+            "ats_tests_to_run": collected_labels,
             "ats_tests_to_skip": [],
             "ats_fallback_reason": "max_wait_time_exceeded",
         }
-        assert result.exit_code == 0
 
     def test_first_labelanalysis_request_fails_but_second_works(
         self, get_labelanalysis_deps, mocker, use_verbose_option
@@ -778,6 +845,6 @@ def test_first_labelanalysis_request_fails_but_second_works(
         assert result.exit_code == 0
         mock_get_runner.assert_called()
         fake_runner.process_labelanalysis_result.assert_called_with(
-            label_analysis_result
+            {**label_analysis_result, "collected_labels_in_order": collected_labels}
         )
         print(result.output)
diff --git a/tests/runners/test_pytest_standard_runner.py b/tests/runners/test_pytest_standard_runner.py
index aadbcd3f..53e539d6 100644
--- a/tests/runners/test_pytest_standard_runner.py
+++ b/tests/runners/test_pytest_standard_runner.py
@@ -1,16 +1,14 @@
 from subprocess import CalledProcessError
-from unittest.mock import MagicMock, call, patch
+from unittest.mock import MagicMock, patch
 
 import click
 import pytest
-from pytest import ExitCode
 
 from codecov_cli.runners.pytest_standard_runner import (
     PytestStandardRunner,
     PytestStandardRunnerConfigParams,
 )
 from codecov_cli.runners.pytest_standard_runner import logger as runner_logger
-from codecov_cli.runners.pytest_standard_runner import stdout as pyrunner_stdout
 from codecov_cli.runners.types import LabelAnalysisRequestResult
 
@@ -203,6 +201,11 @@ def test_process_label_analysis_result(self, mocker):
             "absent_labels": ["test_absent"],
             "present_diff_labels": ["test_in_diff"],
             "global_level_labels": ["test_global"],
+            "collected_labels_in_order": [
+                "test_absent",
+                "test_global",
+                "test_in_diff",
+            ],
         }
 
         mock_execute = mocker.patch.object(PytestStandardRunner, "_execute_pytest")
@@ -229,6 +232,11 @@ def test_process_label_analysis_result_diff_coverage_root(self, mocker):
             "absent_labels": ["test_absent"],
             "present_diff_labels": ["test_in_diff"],
             "global_level_labels": ["test_global"],
+            "collected_labels_in_order": [
+                "test_absent",
+                "test_global",
+                "test_in_diff",
+            ],
         }
 
         mock_execute = mocker.patch.object(PytestStandardRunner, "_execute_pytest")
@@ -257,6 +265,11 @@ def test_process_label_analysis_result_with_options(self, mocker):
             "absent_labels": ["test_absent"],
             "present_diff_labels": ["test_in_diff"],
             "global_level_labels": ["test_global"],
+            "collected_labels_in_order": [
+                "test_absent",
+                "test_global",
+                "test_in_diff",
+            ],
         }
         mock_execute = mocker.patch.object(PytestStandardRunner, "_execute_pytest")
         mock_warning = mocker.patch.object(runner_logger, "warning")