diff --git a/project.clj b/project.clj index edc5e1dc..c31872f1 100644 --- a/project.clj +++ b/project.clj @@ -15,6 +15,7 @@ [clj-time "0.9.0"] [org.clojure/math.numeric-tower "0.0.4"] [org.clojure/math.combinatorics "0.1.1"] + [medley "1.4.0"] [semantic-csv "0.2.1-alpha1"] [instaparse "1.4.1"]] :main code-maat.cmd-line diff --git a/src/code_maat/app/app.clj b/src/code_maat/app/app.clj index 52182e43..057354a1 100644 --- a/src/code_maat/app/app.clj +++ b/src/code_maat/app/app.clj @@ -176,9 +176,10 @@ the given temporal period. Allows the user to treat all commits during one day as a single, logical change set. NOTE: will probably not work with author's analyses!!!" - [options commits] - (if-let [time-period (:temporal-period options)] - (time-grouper/run commits time-period) + [{:keys [temporal-period] :as options} + commits] + (if temporal-period + (time-grouper/by-time-period commits options) commits)) (defn- aggregate-authors-in-teams diff --git a/src/code_maat/app/time_based_grouper.clj b/src/code_maat/app/time_based_grouper.clj index 0bed59ac..a1895343 100644 --- a/src/code_maat/app/time_based_grouper.clj +++ b/src/code_maat/app/time_based_grouper.clj @@ -1,7 +1,12 @@ ;;; Copyright (C) 2014 Adam Tornhill ;;; -(ns code-maat.app.time-based-grouper) +(ns code-maat.app.time-based-grouper + (:require [clj-time.core :as t] + [clj-time.format :as tf] + [clj-time.periodic :as time-period] + [clj-time.core :as tc] + [medley.core :as m])) ;;; Sometimes we'd like to use a different temporal window than ;;; the commit. For example, when multiple teams are involved @@ -11,30 +16,116 @@ ;;; To remove these biases we use this module to re-group all ;;; changes according to a given time window before analysis. ;;; -;;; LIMITATION: At the moment we only support grouping commits that -;;; occour within the same day. This is because I could implement -;;; that aggregation easily. I plan to extend Code Maat with -;;; support for arbitrary temporal windows. - -(defn- date-as-commit-id - [commit] - (let [date (:date commit)] - (update-in commit [:rev] (fn [_old] date)))) - -(defn- throw-on-invalid - [time-period] - (when (not (= "1" time-period)) ; Let's support more in the future... - (throw - (IllegalArgumentException. - (str "Invalid time-period: the current version only supports one (1) day"))))) - -(defn run +;;; Grouping commits by time involves a sliding window over the +;;; original commits. This means that logically, the same physical commit +;;; can be counted multiple times since it overlaps with several slides +;;; of the window. This works well for change coupling but not hotspots. +;;; Hence, the validation ensures it's a supported analysis before +;;; applying the filter. + +(defn- string->date + [s] + (tf/parse (tf/formatters :year-month-day) s)) + +(defn date->string + [d] + (tf/unparse (tf/formatters :year-month-day) d)) + +(defn- date-of + [cs] + (some-> cs first :date string->date)) + +(defn- daily-dates-between + "Create a range of DateTime objects where each date represens one day." + [start end] + (let [feeding-range (time-period/periodic-seq start (tc/days 1)) + end-condition-date (tc/plus end (tc/days 1)) + full-range? (fn [current-date] (t/before? current-date end-condition-date))] + (take-while full-range? feeding-range))) + +(defn- pad-commits-to-complete-time-series + "There are probably many days which don't have any commits. + This functions pads up those days with empty commit sets. That way, we can + partition over the sequence and easily create the sliding window commit set." + [commits] + (let [commits-ascending (sort-by :date commits) + first-commit-date (date-of commits-ascending) + last-commit-date (date-of (reverse commits-ascending)) + commits-on-non-active-days []] + (reduce (fn [acc date-in-range] + (let [as-date (date->string date-in-range) + commits-on-day (get acc as-date commits-on-non-active-days)] + (assoc acc as-date commits-on-day))) + (group-by :date commits) + (daily-dates-between first-commit-date last-commit-date)))) + +(defn- drop-date-key + "We used group-by to get commits by date. Now, drop the key so that + only the commits remain." + [grouped-commits] + (map second grouped-commits)) + +(defn- remove-empty-windows + "Not all dates have commit activity." + [commits-within-sliding-windows] + (remove (fn [cs] + (every? empty? cs)) + commits-within-sliding-windows)) + +(defn- adjust-revision-to + "The edge case is that the same file should only be included once, so + let's filter out duplicates." + [new-rev cs] + (->> cs + (map (fn [c] + (assoc c :rev new-rev))) + (m/distinct-by :entity))) + +(defn- combine-commits-to-logical-changesets + [commits-within-sliding-windows] + (mapcat (fn [commits-in-window] + (let [cs (reduce (partial into) commits-in-window) + latest-day (->> cs (sort-by :date) reverse first :date)] + (adjust-revision-to latest-day cs))) + commits-within-sliding-windows)) + +(defn- combine-sliding-commits + "After partitioning commits according to the sliding window, we + need to deliver a flat sequence where each commit group in the window + represents a logical commitset." + [commits-within-sliding-windows] + (->> commits-within-sliding-windows + remove-empty-windows + combine-commits-to-logical-changesets)) + +(defn- partition-commits-into-sliding-periods-of + [time-period padded-cs] + (->> padded-cs + (sort-by first) + drop-date-key + (partition time-period 1))) + +(defn- commits->sliding-window-seq + [time-period cs] + (->> cs + pad-commits-to-complete-time-series + (partition-commits-into-sliding-periods-of time-period) + combine-sliding-commits)) + +(defn- validated-time-period-from + [{:keys [temporal-period] :as _options}] + (if (re-matches #"\d+" temporal-period) + (int (Double/parseDouble temporal-period)) + (throw (IllegalArgumentException. + (str "Invalid time-period: the given value '" temporal-period "' is not an integer."))))) + +(defn by-time-period "Alright, this is a hack: we just set the commit ID to the current date. That makes the rest of the analyses treat our faked grouping as beloning to the same change set." - ([raw-data] - (run raw-data "1")) - ([raw-data time-period] - (throw-on-invalid time-period) - (map date-as-commit-id raw-data))) + [cs options] + (let [time-period (validated-time-period-from options)] + (if (seq cs) + (commits->sliding-window-seq time-period cs) + cs))) diff --git a/src/code_maat/cmd_line.clj b/src/code_maat/cmd_line.clj index fa4aef98..0299aea2 100644 --- a/src/code_maat/cmd_line.clj +++ b/src/code_maat/cmd_line.clj @@ -33,7 +33,7 @@ :default 30 :parse-fn #(Integer/parseInt %)] ["-e" "--expression-to-match MATCH-EXPRESSION" "A regex to match against commit messages. Used with -messages analyses"] ["-t" "--temporal-period TEMPORAL-PERIOD" - "Instructs Code Maat to consider all commits during the same day as a single, logical commit"] + "Used for coupling analyses. Instructs Code Maat to consider all commits during the rolling temporal period as a single, logical commit set"] ["-d" "--age-time-now AGE-TIME_NOW" "Specify a date as YYYY-MM-dd that counts as time zero when doing a code age analysis"] ["-h" "--help"]]) diff --git a/test/code_maat/app/time_based_end_to_end_test.clj b/test/code_maat/app/time_based_end_to_end_test.clj new file mode 100644 index 00000000..14883964 --- /dev/null +++ b/test/code_maat/app/time_based_end_to_end_test.clj @@ -0,0 +1,44 @@ +;;; Copyright (C) 2014 Adam Tornhill +;;; + +(ns code-maat.app.time-based-end-to-end-test + (:require [code-maat.app.app :as app]) + (:use [clojure.test] + [code-maat.tools.test-tools])) + +;;; End-to-end tests to simulate a time-based analysis. +;;; +;;; The test data contains two commits done the same day. +;;; With the default options we'll treat them as separate. +;;; In a time-base analysis we consider them as a logical +;;; part of the same work. + +(def ^:const log-file "./test/code_maat/app/day_coupled_entities_git.txt") + +(def ^:const csv-options + {:version-control "git" + :analysis "coupling" + :min-revs 1 + :min-shared-revs 1 + :min-coupling 10 + :max-coupling 100 + :max-changeset-size 10}) + +(def ^:const csv-options-for-time-based + (merge csv-options {:temporal-period "1"})) + +(deftest only-calculates-coupling-within-same-commit-by-default + (is (= (run-with-str-output log-file csv-options) + "entity,coupled,degree,average-revs\n/Infrastrucure/Network/Connection.cs,/Presentation/Status/ClientPresenter.cs,100,1\n"))) + +(deftest calculates-coupling-within-same-day + (is (= (run-with-str-output log-file csv-options-for-time-based) + "entity,coupled,degree,average-revs\n/Infrastrucure/Network/Connection.cs,/Presentation/Status/ClientPresenter.cs,100,1\n/Infrastrucure/Network/Connection.cs,/Infrastrucure/Network/TcpConnection.cs,100,1\n/Infrastrucure/Network/TcpConnection.cs,/Presentation/Status/ClientPresenter.cs,100,1\n"))) + +(def ^:const options-with-invalid-time-period + (merge csv-options {:temporal-period "not a number"})) + +(deftest throws-on-unsupported-time-periods + "We hope to support more options in the future." + (is (thrown? IllegalArgumentException + (run-with-str-output log-file options-with-invalid-time-period)))) diff --git a/test/code_maat/app/time_based_grouper_test.clj b/test/code_maat/app/time_based_grouper_test.clj index 00da1fe4..67e96c98 100644 --- a/test/code_maat/app/time_based_grouper_test.clj +++ b/test/code_maat/app/time_based_grouper_test.clj @@ -1,44 +1,49 @@ -;;; Copyright (C) 2014 Adam Tornhill -;;; - (ns code-maat.app.time-based-grouper-test - (:require [code-maat.app.app :as app]) - (:use [clojure.test] - [code-maat.tools.test-tools])) - -;;; End-to-end tests to simulate a time-based analysis. -;;; -;;; The test data contains two commits done the same day. -;;; With the default options we'll treat them as separate. -;;; In a time-base analysis we consider them as a logical -;;; part of the same work. - -(def ^:const log-file "./test/code_maat/app/day_coupled_entities_git.txt") - -(def ^:const csv-options - {:version-control "git" - :analysis "coupling" - :min-revs 1 - :min-shared-revs 1 - :min-coupling 10 - :max-coupling 100 - :max-changeset-size 10}) - -(def ^:const csv-options-for-time-based - (merge csv-options {:temporal-period "1"})) - -(deftest only-calculates-coupling-within-same-commit-by-default - (is (= (run-with-str-output log-file csv-options) - "entity,coupled,degree,average-revs\n/Infrastrucure/Network/Connection.cs,/Presentation/Status/ClientPresenter.cs,100,1\n"))) - -(deftest calculates-coupling-within-same-day - (is (= (run-with-str-output log-file csv-options-for-time-based) - "entity,coupled,degree,average-revs\n/Infrastrucure/Network/Connection.cs,/Presentation/Status/ClientPresenter.cs,100,1\n/Infrastrucure/Network/Connection.cs,/Infrastrucure/Network/TcpConnection.cs,100,1\n/Infrastrucure/Network/TcpConnection.cs,/Presentation/Status/ClientPresenter.cs,100,1\n"))) - -(def ^:const options-with-invalid-time-period - (merge csv-options {:temporal-period "2"})) - -(deftest throws-on-unsupported-time-periods - "We hope to support more options in the future." - (is (thrown? IllegalArgumentException - (run-with-str-output log-file options-with-invalid-time-period)))) + (:require [code-maat.app.time-based-grouper :as grouper]) + (:use [clojure.test])) + +(deftest commits-by-day + (testing "Expect a non-modifying operation" + (let [input-commits [{:entity "A" :rev 1 :date "2022-10-20"} + {:entity "B" :rev 2 :date "2022-10-20"}]] + (is (= [{:date "2022-10-20" :entity "A" :rev "2022-10-20"} + {:date "2022-10-20" :entity "B" :rev "2022-10-20"}] + (grouper/by-time-period input-commits {:temporal-period "1"})))))) + +(deftest multiple-days-give-a-rolling-dataset + (let [input-commits [{:entity "A" :rev 1 :date "2022-10-20"} + {:entity "B" :rev 2 :date "2022-10-20"} + + {:entity "B" :rev 3 :date "2022-10-19"} ; double entry, two B's when looking at last two days + {:entity "D" :rev 3 :date "2022-10-19"} + + {:entity "C" :rev 4 :date "2022-10-18"} + {:entity "D" :rev 4 :date "2022-10-18"} + + {:entity "D" :rev 5 :date "2022-10-15"}]] ; a gap in days between the commits + (is (= [ + ; Only commits on 2022-10-15, not on subsequent day: + {:date "2022-10-15" :entity "D" :rev "2022-10-15"} + + ; 17-18th + {:date "2022-10-18" :entity "C" :rev "2022-10-18"} + {:date "2022-10-18" :entity "D" :rev "2022-10-18"} + + ; 18-19th + {:date "2022-10-18" :entity "C" :rev "2022-10-19"} + {:date "2022-10-18" :entity "D" :rev "2022-10-19"} + {:date "2022-10-19" :entity "B" :rev "2022-10-19"} + + ; 19-20th + {:date "2022-10-19" :entity "B" :rev "2022-10-20"} + {:date "2022-10-19" :entity "D" :rev "2022-10-20"} + {:date "2022-10-20" :entity "A" :rev "2022-10-20"}] + (grouper/by-time-period input-commits {:temporal-period "2"}))))) + +(deftest edge-cases + (testing "Works on an empty input sequence, ie. no commits" + (is (= [] + (grouper/by-time-period [] {:temporal-period "2"})))) + (testing "Works on a single commit" + (is (= [{:date "2022-10-19" :entity "B" :rev "2022-10-19"}] + (grouper/by-time-period [{:entity "B" :rev 3 :date "2022-10-19"}] {:temporal-period "1"})))))