From 6d2284e5b795728ea8ad268e16f1f5a5b7d2e92a Mon Sep 17 00:00:00 2001 From: Adam Tornhill Date: Thu, 20 Oct 2022 17:49:08 +0200 Subject: [PATCH 1/5] Rename test suite to clarify that it's end to end --- ...me_based_grouper_test.clj => time_based_end_to_end_test.clj} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename test/code_maat/app/{time_based_grouper_test.clj => time_based_end_to_end_test.clj} (97%) diff --git a/test/code_maat/app/time_based_grouper_test.clj b/test/code_maat/app/time_based_end_to_end_test.clj similarity index 97% rename from test/code_maat/app/time_based_grouper_test.clj rename to test/code_maat/app/time_based_end_to_end_test.clj index 00da1fe4..05e7073b 100644 --- a/test/code_maat/app/time_based_grouper_test.clj +++ b/test/code_maat/app/time_based_end_to_end_test.clj @@ -1,7 +1,7 @@ ;;; Copyright (C) 2014 Adam Tornhill ;;; -(ns code-maat.app.time-based-grouper-test +(ns code-maat.app.time-based-end-to-end-test (:require [code-maat.app.app :as app]) (:use [clojure.test] [code-maat.tools.test-tools])) From 0323ec473c0a3058e3e9b60a11903977427bc8b8 Mon Sep 17 00:00:00 2001 From: Adam Tornhill Date: Mon, 24 Oct 2022 12:40:11 +0200 Subject: [PATCH 2/5] Support a custom temporal period for grouping commits into logical changesets --- project.clj | 1 + src/code_maat/app/app.clj | 7 +- src/code_maat/app/time_based_grouper.clj | 134 ++++++++++++++---- .../code_maat/app/time_based_grouper_test.clj | 49 +++++++ 4 files changed, 164 insertions(+), 27 deletions(-) create mode 100644 test/code_maat/app/time_based_grouper_test.clj diff --git a/project.clj b/project.clj index edc5e1dc..c31872f1 100644 --- a/project.clj +++ b/project.clj @@ -15,6 +15,7 @@ [clj-time "0.9.0"] [org.clojure/math.numeric-tower "0.0.4"] [org.clojure/math.combinatorics "0.1.1"] + [medley "1.4.0"] [semantic-csv "0.2.1-alpha1"] [instaparse "1.4.1"]] :main code-maat.cmd-line diff --git a/src/code_maat/app/app.clj b/src/code_maat/app/app.clj index 52182e43..057354a1 100644 --- a/src/code_maat/app/app.clj +++ b/src/code_maat/app/app.clj @@ -176,9 +176,10 @@ the given temporal period. Allows the user to treat all commits during one day as a single, logical change set. NOTE: will probably not work with author's analyses!!!" - [options commits] - (if-let [time-period (:temporal-period options)] - (time-grouper/run commits time-period) + [{:keys [temporal-period] :as options} + commits] + (if temporal-period + (time-grouper/by-time-period commits options) commits)) (defn- aggregate-authors-in-teams diff --git a/src/code_maat/app/time_based_grouper.clj b/src/code_maat/app/time_based_grouper.clj index 0bed59ac..e1e92a7f 100644 --- a/src/code_maat/app/time_based_grouper.clj +++ b/src/code_maat/app/time_based_grouper.clj @@ -1,7 +1,12 @@ ;;; Copyright (C) 2014 Adam Tornhill ;;; -(ns code-maat.app.time-based-grouper) +(ns code-maat.app.time-based-grouper + (:require [clj-time.core :as t] + [clj-time.format :as tf] + [clj-time.periodic :as time-period] + [clj-time.core :as tc] + [medley.core :as m])) ;;; Sometimes we'd like to use a different temporal window than ;;; the commit. For example, when multiple teams are involved @@ -11,30 +16,111 @@ ;;; To remove these biases we use this module to re-group all ;;; changes according to a given time window before analysis. ;;; -;;; LIMITATION: At the moment we only support grouping commits that -;;; occour within the same day. This is because I could implement -;;; that aggregation easily. I plan to extend Code Maat with -;;; support for arbitrary temporal windows. - -(defn- date-as-commit-id - [commit] - (let [date (:date commit)] - (update-in commit [:rev] (fn [_old] date)))) - -(defn- throw-on-invalid - [time-period] - (when (not (= "1" time-period)) ; Let's support more in the future... - (throw - (IllegalArgumentException. - (str "Invalid time-period: the current version only supports one (1) day"))))) - -(defn run +;;; Grouping commits by time involves a sliding window over the +;;; original commits. This means that logically, the same physical commit +;;; can be counted multiple times since it overlaps with several slides +;;; of the window. This works well for change coupling but not hotspots. +;;; Hence, the validation ensures it's a supported analysis before +;;; applying the filter. + +(defn- string->date + [s] + (tf/parse (tf/formatters :year-month-day) s)) + +(defn date->string + [d] + (tf/unparse (tf/formatters :year-month-day) d)) + +(defn- date-of + [cs] + (some-> cs first :date string->date)) + +(defn- daily-dates-between + "Create a range of DateTime objects where each date represens one day." + [start end] + (let [feeding-range (time-period/periodic-seq start (tc/days 1)) + end-condition-date (tc/plus end (tc/days 1)) + full-range? (fn [current-date] (t/before? current-date end-condition-date))] + (take-while full-range? feeding-range))) + +(defn- pad-commits-to-complete-time-series + "There are probably many days which don't have any commits. + This functions pads up those days with empty commit sets. That way, we can + partition over the sequence and easily create the sliding window commit set." + [commits] + (let [commits-ascending (sort-by :date commits) + first-commit-date (date-of commits-ascending) + last-commit-date (date-of (reverse commits-ascending)) + commits-on-non-active-days []] + (reduce (fn [acc date-in-range] + (let [as-date (date->string date-in-range) + commits-on-day (get acc as-date commits-on-non-active-days)] + (assoc acc as-date commits-on-day))) + (group-by :date commits) + (daily-dates-between first-commit-date last-commit-date)))) + +(defn- drop-date-key + "We used group-by to get commits by date. Now, drop the key so that + only the commits remain." + [grouped-commits] + (map second grouped-commits)) + +(defn- remove-empty-windows + "Not all dates have commit activity." + [commits-within-sliding-windows] + (remove (fn [cs] + (every? empty? cs)) + commits-within-sliding-windows)) + +(defn- adjust-revision-to + "The edge case is that the same file should only be included once, so + let's filter out duplicates." + [new-rev cs] + (->> cs + (map (fn [c] + (assoc c :rev new-rev))) + (m/distinct-by :entity))) + +(defn- combine-commits-to-logical-changesets + [commits-within-sliding-windows] + (mapcat (fn [commits-in-window] + (let [cs (reduce (partial into) commits-in-window) + latest-day (->> cs (sort-by :date) reverse first :date)] + (adjust-revision-to latest-day cs))) + commits-within-sliding-windows)) + +(defn- combine-sliding-commits + "After partitioning commits according to the sliding window, we + need to deliver a flat sequence where each commit group in the window + represents a logical commitset." + [commits-within-sliding-windows] + (->> commits-within-sliding-windows + remove-empty-windows + combine-commits-to-logical-changesets)) + +(defn- commits->sliding-window-seq + [time-period cs] + (->> cs + pad-commits-to-complete-time-series + (sort-by first) + drop-date-key + (partition time-period 1) + combine-sliding-commits)) + +(defn- validated-time-period-from + [{:keys [temporal-period] :as _options}] + (if (re-matches #"\d+" temporal-period) + (int (Double/parseDouble temporal-period)) + (throw (IllegalArgumentException. + (str "Invalid time-period: the given value '" temporal-period "' is not an integer."))))) + +(defn by-time-period "Alright, this is a hack: we just set the commit ID to the current date. That makes the rest of the analyses treat our faked grouping as beloning to the same change set." - ([raw-data] - (run raw-data "1")) - ([raw-data time-period] - (throw-on-invalid time-period) - (map date-as-commit-id raw-data))) + [cs options] + (let [time-period (validated-time-period-from options)] + (if (seq cs) + (commits->sliding-window-seq time-period cs) + cs))) diff --git a/test/code_maat/app/time_based_grouper_test.clj b/test/code_maat/app/time_based_grouper_test.clj new file mode 100644 index 00000000..67e96c98 --- /dev/null +++ b/test/code_maat/app/time_based_grouper_test.clj @@ -0,0 +1,49 @@ +(ns code-maat.app.time-based-grouper-test + (:require [code-maat.app.time-based-grouper :as grouper]) + (:use [clojure.test])) + +(deftest commits-by-day + (testing "Expect a non-modifying operation" + (let [input-commits [{:entity "A" :rev 1 :date "2022-10-20"} + {:entity "B" :rev 2 :date "2022-10-20"}]] + (is (= [{:date "2022-10-20" :entity "A" :rev "2022-10-20"} + {:date "2022-10-20" :entity "B" :rev "2022-10-20"}] + (grouper/by-time-period input-commits {:temporal-period "1"})))))) + +(deftest multiple-days-give-a-rolling-dataset + (let [input-commits [{:entity "A" :rev 1 :date "2022-10-20"} + {:entity "B" :rev 2 :date "2022-10-20"} + + {:entity "B" :rev 3 :date "2022-10-19"} ; double entry, two B's when looking at last two days + {:entity "D" :rev 3 :date "2022-10-19"} + + {:entity "C" :rev 4 :date "2022-10-18"} + {:entity "D" :rev 4 :date "2022-10-18"} + + {:entity "D" :rev 5 :date "2022-10-15"}]] ; a gap in days between the commits + (is (= [ + ; Only commits on 2022-10-15, not on subsequent day: + {:date "2022-10-15" :entity "D" :rev "2022-10-15"} + + ; 17-18th + {:date "2022-10-18" :entity "C" :rev "2022-10-18"} + {:date "2022-10-18" :entity "D" :rev "2022-10-18"} + + ; 18-19th + {:date "2022-10-18" :entity "C" :rev "2022-10-19"} + {:date "2022-10-18" :entity "D" :rev "2022-10-19"} + {:date "2022-10-19" :entity "B" :rev "2022-10-19"} + + ; 19-20th + {:date "2022-10-19" :entity "B" :rev "2022-10-20"} + {:date "2022-10-19" :entity "D" :rev "2022-10-20"} + {:date "2022-10-20" :entity "A" :rev "2022-10-20"}] + (grouper/by-time-period input-commits {:temporal-period "2"}))))) + +(deftest edge-cases + (testing "Works on an empty input sequence, ie. no commits" + (is (= [] + (grouper/by-time-period [] {:temporal-period "2"})))) + (testing "Works on a single commit" + (is (= [{:date "2022-10-19" :entity "B" :rev "2022-10-19"}] + (grouper/by-time-period [{:entity "B" :rev 3 :date "2022-10-19"}] {:temporal-period "1"}))))) From 8f6af084356b11f4c71f40ecb3d92a1f5c65d9a5 Mon Sep 17 00:00:00 2001 From: Adam Tornhill Date: Mon, 24 Oct 2022 12:42:26 +0200 Subject: [PATCH 3/5] Update the end-to-end test now that we support arbitrary time periods --- test/code_maat/app/time_based_end_to_end_test.clj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/code_maat/app/time_based_end_to_end_test.clj b/test/code_maat/app/time_based_end_to_end_test.clj index 05e7073b..14883964 100644 --- a/test/code_maat/app/time_based_end_to_end_test.clj +++ b/test/code_maat/app/time_based_end_to_end_test.clj @@ -36,7 +36,7 @@ "entity,coupled,degree,average-revs\n/Infrastrucure/Network/Connection.cs,/Presentation/Status/ClientPresenter.cs,100,1\n/Infrastrucure/Network/Connection.cs,/Infrastrucure/Network/TcpConnection.cs,100,1\n/Infrastrucure/Network/TcpConnection.cs,/Presentation/Status/ClientPresenter.cs,100,1\n"))) (def ^:const options-with-invalid-time-period - (merge csv-options {:temporal-period "2"})) + (merge csv-options {:temporal-period "not a number"})) (deftest throws-on-unsupported-time-periods "We hope to support more options in the future." From 946e5be97fd66f6fb60efacaa8f5941f66f79a07 Mon Sep 17 00:00:00 2001 From: Adam Tornhill Date: Mon, 24 Oct 2022 12:51:53 +0200 Subject: [PATCH 4/5] Update the command line options for the change in temporal-period --- src/code_maat/cmd_line.clj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/code_maat/cmd_line.clj b/src/code_maat/cmd_line.clj index fa4aef98..0299aea2 100644 --- a/src/code_maat/cmd_line.clj +++ b/src/code_maat/cmd_line.clj @@ -33,7 +33,7 @@ :default 30 :parse-fn #(Integer/parseInt %)] ["-e" "--expression-to-match MATCH-EXPRESSION" "A regex to match against commit messages. Used with -messages analyses"] ["-t" "--temporal-period TEMPORAL-PERIOD" - "Instructs Code Maat to consider all commits during the same day as a single, logical commit"] + "Used for coupling analyses. Instructs Code Maat to consider all commits during the rolling temporal period as a single, logical commit set"] ["-d" "--age-time-now AGE-TIME_NOW" "Specify a date as YYYY-MM-dd that counts as time zero when doing a code age analysis"] ["-h" "--help"]]) From 48292293652912af17835e1f31190ff720882dc2 Mon Sep 17 00:00:00 2001 From: Adam Tornhill Date: Mon, 24 Oct 2022 17:16:59 +0200 Subject: [PATCH 5/5] Refactor: put a new on the complex behavior for organizing commits into sliding windows --- src/code_maat/app/time_based_grouper.clj | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/code_maat/app/time_based_grouper.clj b/src/code_maat/app/time_based_grouper.clj index e1e92a7f..a1895343 100644 --- a/src/code_maat/app/time_based_grouper.clj +++ b/src/code_maat/app/time_based_grouper.clj @@ -98,13 +98,18 @@ remove-empty-windows combine-commits-to-logical-changesets)) +(defn- partition-commits-into-sliding-periods-of + [time-period padded-cs] + (->> padded-cs + (sort-by first) + drop-date-key + (partition time-period 1))) + (defn- commits->sliding-window-seq [time-period cs] (->> cs pad-commits-to-complete-time-series - (sort-by first) - drop-date-key - (partition time-period 1) + (partition-commits-into-sliding-periods-of time-period) combine-sliding-commits)) (defn- validated-time-period-from