From 0245c3f79dba02dbf31735379064fc08b0044fd1 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Tue, 17 Sep 2024 11:10:32 +0800 Subject: [PATCH] scripts: Improve diversity of changes --- scripts/filter_pr_changes.py | 41 +++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/scripts/filter_pr_changes.py b/scripts/filter_pr_changes.py index 6821e3cdced..8a49e547ea8 100755 --- a/scripts/filter_pr_changes.py +++ b/scripts/filter_pr_changes.py @@ -2,31 +2,52 @@ import subprocess import os +import heapq max_diff_per_file = 500 max_diff_total = 15000 max_file_total = 200 +trivial_penalty = 200 +diversity_penalty_inc = 30 stats = subprocess.check_output(['git', 'diff', '--numstat']).decode().splitlines() -diffs = [] -# TODO: maximize diff diversity -diff_pattern = set() +diffs = dict() for line in stats: add, sub, file = line.removesuffix('\n').split() count = int(add)+int(sub) if count > max_diff_per_file: continue - key = (add, sub) - if key in diff_pattern: - continue - diff_pattern.add(key) - diffs.append((file, count)) -diffs.sort(key=lambda x: x[1]) + if add == sub: + count += trivial_penalty + proj = os.path.basename(os.path.dirname(os.path.dirname(file))) + diff_list = diffs.get(proj, list()) + diff_list.append((count, file, proj, int(add), int(sub))) + diffs[proj] = diff_list + +diff_heap = [] +for list in diffs.values(): + list.sort(key=lambda x: x[0]) + diff_heap.append(list.pop(0)) +heapq.heapify(diff_heap) +diversity_penalty = dict() +diff_pattern = set() file_count = 0 diff_count = 0 +while len(diff_heap) != 0: + cnt, file, proj, add, sub = heapq.heappop(diff_heap) + proj_list = diffs[proj] + if len(proj_list) != 0: + diversity_penalty[proj] = diversity_penalty.get(proj, 0) + diversity_penalty_inc + cnt2, file2, proj2, add2, sub2 = proj_list.pop(0) + cnt2 += diversity_penalty[proj] + heapq.heappush(diff_heap, (cnt2, file2, proj2, add2, sub2)) -for file, count in diffs: + key = (add, sub) + if key in diff_pattern: + continue + diff_pattern.add(key) + count = add + sub if file_count < max_file_total and diff_count + count <= max_diff_total: file_count += 1 diff_count += count