From cff5aa211665c43d154ba9d69ba7affd8d5007fe Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Tue, 14 Feb 2023 11:35:47 -0800
Subject: [PATCH 1/2] Only fetch alerts belonging to the target repo and branch

---
 torchci/scripts/check_alerts.py | 100 ++++++++++++++++++++++----------
 1 file changed, 68 insertions(+), 32 deletions(-)

diff --git a/torchci/scripts/check_alerts.py b/torchci/scripts/check_alerts.py
index 9e68087831..7a3965bbbc 100755
--- a/torchci/scripts/check_alerts.py
+++ b/torchci/scripts/check_alerts.py
@@ -7,9 +7,9 @@
 from datetime import datetime, timedelta
 from difflib import SequenceMatcher
 from typing import Any, Dict, List, Set, Tuple
-from setuptools import distutils  # type: ignore[import]
 
 import requests
+from setuptools import distutils  # type: ignore[import]
 
 ALL_SKIPPED_THRESHOLD = 100
 SIMILARITY_THRESHOLD = 0.75
@@ -172,7 +172,9 @@ def __repr__(self) -> str:
         return f"jobName: {self.job_name}"
 
 
-def fetch_alerts(alert_repo: str, labels: List[str]) -> List[Any]:
+def fetch_alerts(
+    repo: str, branch: str, alert_repo: str, labels: List[str]
+) -> List[Any]:
     try:
         variables = {"owner": REPO_OWNER, "name": alert_repo, "labels": labels}
         r = requests.post(
@@ -181,8 +183,14 @@ def fetch_alerts(alert_repo: str, labels: List[str]) -> List[Any]:
             headers=headers,
         )
         r.raise_for_status()
+
         data = json.loads(r.text)
-        return data["data"]["repository"]["issues"]["nodes"]
+        # Return only alerts belonging to the target repo and branch
+        return list(filter(
+            lambda alert: f"Recurrently Failing Jobs on {repo} {branch}"
+            in alert["title"],
+            data["data"]["repository"]["issues"]["nodes"],
+        ))
     except Exception as e:
         raise RuntimeError("Error fetching alerts", e)
 
@@ -211,7 +219,9 @@ def generate_failed_job_hud_link(failed_job: JobStatus) -> str:
     return f"[{failed_job.job_name}]({hud_link})"
 
 
-def generate_failed_job_issue(repo: str, branch: str, failed_jobs: List[JobStatus]) -> Any:
+def generate_failed_job_issue(
+    repo: str, branch: str, failed_jobs: List[JobStatus]
+) -> Any:
     failed_jobs.sort(key=lambda status: status.job_name)
     issue = {}
     issue[
@@ -220,7 +230,9 @@ def generate_failed_job_issue(repo: str, branch: str, failed_jobs: List[JobStatu
     body = "Within the last 50 commits, there are the following failures on the master branch of pytorch: \n"
     for job in failed_jobs:
         failing_sha = job.failure_chain[-1]["sha"]
-        body += f"- {generate_failed_job_hud_link(job)} failed consecutively starting with "
+        body += (
+            f"- {generate_failed_job_hud_link(job)} failed consecutively starting with "
+        )
         body += f"commit [{failing_sha}](https://hud.pytorch.org/commit/{repo}/{failing_sha})"
         body += "\n\n"
 
@@ -275,7 +287,9 @@ def generate_no_flaky_tests_issue() -> Any:
     return issue
 
 
-def update_issue(issue: Dict, old_issue: Any, update_comment: str, dry_run: bool) -> None:
+def update_issue(
+    issue: Dict, old_issue: Any, update_comment: str, dry_run: bool
+) -> None:
     print(f"Updating issue {issue} with content:{os.linesep}{update_comment}")
     if dry_run:
         print(f"NOTE: Dry run, not doing any real work")
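Note: the filter added to fetch_alerts above keys strictly on the issue title, so it only works as long as generated alert titles keep embedding "Recurrently Failing Jobs on <repo> <branch>". A minimal sketch of the intended behavior (standalone, with made-up alert data; filter_by_target is a hypothetical helper, not part of the patch):

    # Keep an alert only when its title names the target repo and branch,
    # mirroring the lambda added in fetch_alerts.
    def filter_by_target(alerts, repo, branch):
        needle = f"Recurrently Failing Jobs on {repo} {branch}"
        return [alert for alert in alerts if needle in alert["title"]]

    alerts = [
        {"title": "[Pytorch] There are 2 Recurrently Failing Jobs on pytorch/pytorch master"},
        {"title": "[Pytorch] There are 5 Recurrently Failing Jobs on pytorch/pytorch nightly"},
    ]
    assert len(filter_by_target(alerts, "pytorch/pytorch", "master")) == 1
    assert filter_by_target(alerts, "pytorch/builder", "main") == []

Title-based matching is somewhat fragile if the title format ever changes; the unit test added in patch 2 pins the current format.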
@@ -455,46 +469,66 @@ def handle_flaky_tests_alert(existing_alerts: List[Dict]) -> Dict:
 # filter job names that don't match the regex
 def filter_job_names(job_names: List[str], job_name_regex: str) -> List[str]:
     if job_name_regex:
-        return [job_name for job_name in job_names if re.match(job_name_regex, job_name)]
+        return [
+            job_name for job_name in job_names if re.match(job_name_regex, job_name)
+        ]
     return job_names
 
 
-def check_for_recurrently_failing_jobs_alert(repo: str, branch: str, job_name_regex: str, dry_run: bool):
+def check_for_recurrently_failing_jobs_alert(
+    repo: str, branch: str, job_name_regex: str, dry_run: bool
+):
     job_names, sha_grid = fetch_hud_data(repo=repo, branch=branch)
     filtered_job_names = set(filter_job_names(job_names, job_name_regex))
-    (jobs_to_alert_on, flaky_jobs) = classify_jobs(job_names, sha_grid, filtered_job_names)
+    (jobs_to_alert_on, flaky_jobs) = classify_jobs(
+        job_names, sha_grid, filtered_job_names
+    )
 
     # Fetch alerts
-    existing_alerts = fetch_alerts(TEST_INFRA_REPO_NAME, PYTORCH_ALERT_LABEL)
+    existing_alerts = fetch_alerts(
+        repo=repo,
+        branch=branch,
+        alert_repo=TEST_INFRA_REPO_NAME,
+        labels=PYTORCH_ALERT_LABEL,
+    )
 
     # Auto-clear any existing alerts if the current status is green
     if len(jobs_to_alert_on) == 0 or trunk_is_green(sha_grid):
-        print("Didn't find anything to alert on.")
+        print(f"Didn't find anything to alert on for {repo} {branch}")
         clear_alerts(existing_alerts)
         return
 
-    # Find the existing alert for recurrently failing jobs (if any).
-    # We're going to update the existing alert if possible instead of filing a new one.
-    existing_issue = None
-    for alert in existing_alerts:
-        if f"Recurrently Failing Jobs on {repo} {branch}" in alert["title"]:
-            existing_issue = alert
-            break
-    # Create a new alert if no alerts are active or edit the original one if there's a new update
-    if existing_issue:
+    # In the current design, there should be at most one alert issue per repo and branch
+    assert len(existing_alerts) <= 1
+
+    if existing_alerts:
+        # New update, edit the current active alert
+        existing_issue = existing_alerts[0]
         update_comment = gen_update_comment(existing_issue["body"], jobs_to_alert_on)
+
         if update_comment:
-            new_issue = generate_failed_job_issue(repo=repo, branch=branch, failed_jobs=jobs_to_alert_on)
+            new_issue = generate_failed_job_issue(
+                repo=repo, branch=branch, failed_jobs=jobs_to_alert_on
+            )
             update_issue(new_issue, existing_issue, update_comment, dry_run=dry_run)
         else:
-            print("No new updates. Not updating any alerts.")
+            print(f"No new change. Not updating any alert for {repo} {branch}")
     else:
-        create_issue(generate_failed_job_issue(repo=repo, branch=branch, failed_jobs=jobs_to_alert_on), dry_run=dry_run)
+        # No active alert exists, create a new one
+        create_issue(
+            generate_failed_job_issue(
+                repo=repo, branch=branch, failed_jobs=jobs_to_alert_on
+            ),
+            dry_run=dry_run,
+        )
 
 
-def check_for_no_flaky_tests_alert():
+def check_for_no_flaky_tests_alert(repo: str, branch: str):
     existing_no_flaky_tests_alerts = fetch_alerts(
-        TEST_INFRA_REPO_NAME, NO_FLAKY_TESTS_LABEL
+        repo=repo,
+        branch=branch,
+        alert_repo=TEST_INFRA_REPO_NAME,
+        labels=NO_FLAKY_TESTS_LABEL,
    )
     handle_flaky_tests_alert(existing_no_flaky_tests_alerts)
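Note: the new assert encodes the invariant that fetch_alerts, having already narrowed results to one repo/branch pair, returns at most one open alert. If a stale duplicate issue ever matches the same pair, the script now fails fast instead of updating an arbitrary one of them. A small sketch of that failure mode (assumed data, not from the patch):

    # Two open alerts matching the same repo/branch trip the invariant.
    existing_alerts = [
        {"number": 3763, "title": "Recurrently Failing Jobs on pytorch/pytorch master"},
        {"number": 9999, "title": "Recurrently Failing Jobs on pytorch/pytorch master"},
    ]
    try:
        assert len(existing_alerts) <= 1
    except AssertionError:
        # An operator would need to close the duplicate issue by hand and re-run.
        print("duplicate alert issues found; close one before re-running")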
@@ -505,41 +539,43 @@ def parse_args() -> argparse.Namespace:
         "--repo",
         help="Repository to do checks for",
         type=str,
-        default=os.getenv("REPO_TO_CHECK", "pytorch/pytorch")
+        default=os.getenv("REPO_TO_CHECK", "pytorch/pytorch"),
     )
     parser.add_argument(
         "--branch",
         help="Branch to do checks for",
         type=str,
-        default=os.getenv("BRANCH_TO_CHECK", "master")
+        default=os.getenv("BRANCH_TO_CHECK", "master"),
     )
     parser.add_argument(
         "--job-name-regex",
         help="Consider only job names matching given regex (if omitted, all jobs are matched)",
         type=str,
-        default=os.getenv("JOB_NAME_REGEX", "")
+        default=os.getenv("JOB_NAME_REGEX", ""),
     )
     parser.add_argument(
         "--with-flaky-test-alert",
         help="Run this script with the flaky test alerting",
         type=distutils.util.strtobool,
-        default=os.getenv("WITH_FLAKY_TEST_ALERT", "NO")
+        default=os.getenv("WITH_FLAKY_TEST_ALERT", "NO"),
     )
     parser.add_argument(
         "--dry-run",
         help="Whether or not to actually post issues",
         type=distutils.util.strtobool,
-        default=os.getenv("DRY_RUN", "YES")
+        default=os.getenv("DRY_RUN", "YES"),
     )
     return parser.parse_args()
 
 
 def main():
     args = parse_args()
-    check_for_recurrently_failing_jobs_alert(args.repo, args.branch, args.job_name_regex, args.dry_run)
+    check_for_recurrently_failing_jobs_alert(
+        args.repo, args.branch, args.job_name_regex, args.dry_run
+    )
     # TODO: Fill out dry run for flaky test alerting, not going to do in one PR
     if args.with_flaky_test_alert:
-        check_for_no_flaky_tests_alert()
+        check_for_no_flaky_tests_alert(args.repo, args.branch)
 
 
 if __name__ == "__main__":

From bb2e6bf2fee49e018e0e6a4b4784cbfe94f110ea Mon Sep 17 00:00:00 2001
From: Huy Do
Date: Tue, 14 Feb 2023 13:26:43 -0800
Subject: [PATCH 2/2] Add unit test for fetch_alerts

---
 torchci/scripts/check_alerts.py      |  13 +--
 torchci/scripts/test_check_alerts.py | 114 +++++++++++++++++++++++----
 2 files changed, 106 insertions(+), 21 deletions(-)

diff --git a/torchci/scripts/check_alerts.py b/torchci/scripts/check_alerts.py
index 7a3965bbbc..867711fce9 100755
--- a/torchci/scripts/check_alerts.py
+++ b/torchci/scripts/check_alerts.py
@@ -186,11 +186,14 @@ def fetch_alerts(
 
         data = json.loads(r.text)
         # Return only alerts belonging to the target repo and branch
-        return list(filter(
-            lambda alert: f"Recurrently Failing Jobs on {repo} {branch}"
-            in alert["title"],
-            data["data"]["repository"]["issues"]["nodes"],
-        ))
+        return [
+            item
+            for item in filter(
+                lambda alert: f"Recurrently Failing Jobs on {repo} {branch}"
+                in alert["title"],
+                data["data"]["repository"]["issues"]["nodes"],
+            )
+        ]
     except Exception as e:
         raise RuntimeError("Error fetching alerts", e)
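Note: the rewrite from list(filter(...)) to a list comprehension wrapping filter is purely stylistic (presumably to satisfy the repo's formatter); both forms build the same list. A quick standalone check with sample data:

    nodes = [
        {"title": "Recurrently Failing Jobs on pytorch/pytorch master"},
        {"title": "Recurrently Failing Jobs on pytorch/pytorch nightly"},
    ]

    def pred(alert):
        return "pytorch/pytorch master" in alert["title"]

    # Both expressions yield the single matching node.
    assert [item for item in filter(pred, nodes)] == list(filter(pred, nodes))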
diff --git a/torchci/scripts/test_check_alerts.py b/torchci/scripts/test_check_alerts.py
index 8c6a9393c0..d208026557 100644
--- a/torchci/scripts/test_check_alerts.py
+++ b/torchci/scripts/test_check_alerts.py
@@ -1,13 +1,17 @@
+import json
 from datetime import datetime
 from unittest import main, TestCase
 from unittest.mock import patch
 
 from check_alerts import (
+    fetch_alerts,
     filter_job_names,
     gen_update_comment,
     generate_no_flaky_tests_issue,
     handle_flaky_tests_alert,
     JobStatus,
+    PYTORCH_ALERT_LABEL,
+    TEST_INFRA_REPO_NAME,
 )
 
 
@@ -129,34 +133,112 @@ def test_handle_flaky_tests_alert(
 
     # test filter job names
     def test_job_filter(self):
-        job_names = ["pytorch_linux_xenial_py3_6_gcc5_4_test", "pytorch_linux_xenial_py3_6_gcc5_4_test2"]
+        job_names = [
+            "pytorch_linux_xenial_py3_6_gcc5_4_test",
+            "pytorch_linux_xenial_py3_6_gcc5_4_test2",
+        ]
         self.assertListEqual(
             filter_job_names(job_names, ""),
             job_names,
-            "empty regex should match all jobs"
-        )
-        self.assertListEqual(
-            filter_job_names(job_names, ".*"),
-            job_names
-        )
-        self.assertListEqual(
-            filter_job_names(job_names, ".*xenial.*"),
-            job_names
+            "empty regex should match all jobs",
         )
+        self.assertListEqual(filter_job_names(job_names, ".*"), job_names)
+        self.assertListEqual(filter_job_names(job_names, ".*xenial.*"), job_names)
         self.assertListEqual(
             filter_job_names(job_names, ".*xenial.*test2"),
-            ["pytorch_linux_xenial_py3_6_gcc5_4_test2"]
-        )
-        self.assertListEqual(
-            filter_job_names(job_names, ".*xenial.*test3"),
-            []
+            ["pytorch_linux_xenial_py3_6_gcc5_4_test2"],
         )
+        self.assertListEqual(filter_job_names(job_names, ".*xenial.*test3"), [])
         self.assertRaises(
             Exception,
             lambda: filter_job_names(job_names, "["),
-            msg="malformed regex should throw exception"
+            msg="malformed regex should throw exception",
         )
 
+    def mock_fetch_alerts(*args, **kwargs):
+        """
+        Return the mock JSON response when trying to fetch all existing alerts
+        """
+
+        class MockResponse:
+            def __init__(self, json_data, status_code):
+                self.text = json_data
+                self.status_code = status_code
+
+            def raise_for_status(self):
+                pass
+
+        response = {
+            "data": {
+                "repository": {
+                    "issues": {
+                        "nodes": [
+                            {
+                                "title": "[Pytorch] There are 3 Recurrently Failing Jobs on pytorch/pytorch master",
+                                "closed": False,
+                                "number": 3763,
+                                "body": "",
+                                "comments": {"nodes": []},
+                            },
+                            {
+                                "title": "[Pytorch] There are 3 Recurrently Failing Jobs on pytorch/pytorch nightly",
+                                "closed": False,
+                                "number": 3764,
+                                "body": "",
+                                "comments": {"nodes": []},
+                            },
+                        ]
+                    }
+                }
+            }
+        }
+        return MockResponse(json.dumps(response), 200)
+
+    @patch("requests.post", side_effect=mock_fetch_alerts)
+    def test_fetch_alert(self, mocked_alerts):
+        cases = [
+            {
+                "repo": "pytorch/builder",
+                "branch": "main",
+                "expected": [],
+            },
+            {
+                "repo": "pytorch/pytorch",
+                "branch": "master",
+                "expected": [
+                    {
+                        "title": "[Pytorch] There are 3 Recurrently Failing Jobs on pytorch/pytorch master",
+                        "closed": False,
+                        "number": 3763,
+                        "body": "",
+                        "comments": {"nodes": []},
+                    },
+                ],
+            },
+            {
+                "repo": "pytorch/pytorch",
+                "branch": "nightly",
+                "expected": [
+                    {
+                        "title": "[Pytorch] There are 3 Recurrently Failing Jobs on pytorch/pytorch nightly",
+                        "closed": False,
+                        "number": 3764,
+                        "body": "",
+                        "comments": {"nodes": []},
+                    },
+                ],
+            },
+        ]
+
+        for case in cases:
+            alerts = fetch_alerts(
+                repo=case["repo"],
+                branch=case["branch"],
+                alert_repo=TEST_INFRA_REPO_NAME,
+                labels=PYTORCH_ALERT_LABEL,
+            )
+            self.assertListEqual(alerts, case["expected"])
+
 
 if __name__ == "__main__":
     main()
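Note: the test works because unittest.mock invokes side_effect with the same arguments the code under test passes to requests.post and uses its return value as the call's result, so fetch_alerts exercises its real JSON parsing and title filtering against canned data without any network traffic. The same pattern in a standalone form (assumes the requests package is installed, which check_alerts already requires):

    import json
    from unittest.mock import patch

    class FakeResponse:
        # Mirror only the attributes check_alerts touches: .text and raise_for_status().
        def __init__(self, payload):
            self.text = json.dumps(payload)

        def raise_for_status(self):
            pass

    def fake_post(*args, **kwargs):
        return FakeResponse({"data": {"repository": {"issues": {"nodes": []}}}})

    with patch("requests.post", side_effect=fake_post):
        import requests

        r = requests.post("https://api.github.com/graphql")
        r.raise_for_status()
        assert json.loads(r.text)["data"]["repository"]["issues"]["nodes"] == []

The new test can also be run on its own from torchci/scripts with the standard runner: python -m unittest test_check_alerts.py -k test_fetch_alert -v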