From 0a63e54742a5774f2a5ef34805b5d11572e69edf Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Mon, 29 Apr 2024 08:55:03 -0400 Subject: [PATCH 01/18] survey: stub in new experimental `git-survey` command Start work on a new `git survey` command to scan the repository for monorepo performance and scaling problems. The goal is to measure the various known "dimensions of scale" and serve as a foundation for adding additional measurements as we learn more about Git monorepo scaling problems. Results will be logged to the console and to Trace2. The initial goal is to complement the scanning and analysis performed by the GO-based `git-sizer` (https://github.com/github/git-sizer) tool. It is hoped that by creating a builtin command, we may be able to take advantage of internal Git data structures and code that is not accessible from GO to gain further insight into potential scaling problems. Signed-off-by: Jeff Hostetler --- .gitignore | 1 + Documentation/git-survey.txt | 36 ++++++++++++++++++++++ Makefile | 1 + builtin.h | 1 + builtin/survey.c | 58 ++++++++++++++++++++++++++++++++++++ command-list.txt | 1 + git.c | 1 + 7 files changed, 99 insertions(+) create mode 100644 Documentation/git-survey.txt create mode 100644 builtin/survey.c diff --git a/.gitignore b/.gitignore index bf97276163b19b..8f82ba30d5a17a 100644 --- a/.gitignore +++ b/.gitignore @@ -164,6 +164,7 @@ /git-submodule /git-submodule--helper /git-subtree +/git-survey /git-svn /git-switch /git-symbolic-ref diff --git a/Documentation/git-survey.txt b/Documentation/git-survey.txt new file mode 100644 index 00000000000000..cdd1ec4358b8bb --- /dev/null +++ b/Documentation/git-survey.txt @@ -0,0 +1,36 @@ +git-survey(1) +============= + +NAME +---- +git-survey - EXPERIMENTAL: Measure various repository dimensions of scale + +SYNOPSIS +-------- +[verse] +(EXPERIMENTAL!) `git survey` + +DESCRIPTION +----------- + +Survey the repository and measure various dimensions of scale. + +As repositories grow to "monorepo" size, certain data shapes can cause +performance problems. `git-survey` attempts to measure and report on +known problem areas. + +OPTIONS +------- + +--progress:: + Show progress. This is automatically enabled when interactive. + +OUTPUT +------ + +By default, `git survey` will print information about the repository in a +human-readable format that includes overviews and tables. 
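+
+Progress, when enabled, is reported on the standard error stream, so the
+report written to standard output can be redirected without interleaving.
+The progress default can also be set with the `survey.progress`
+configuration variable.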
+ +GIT +--- +Part of the linkgit:git[1] suite diff --git a/Makefile b/Makefile index 522c8a612c9882..13a73d99585ce4 100644 --- a/Makefile +++ b/Makefile @@ -1309,6 +1309,7 @@ BUILTIN_OBJS += builtin/sparse-checkout.o BUILTIN_OBJS += builtin/stash.o BUILTIN_OBJS += builtin/stripspace.o BUILTIN_OBJS += builtin/submodule--helper.o +BUILTIN_OBJS += builtin/survey.o BUILTIN_OBJS += builtin/symbolic-ref.o BUILTIN_OBJS += builtin/tag.o BUILTIN_OBJS += builtin/unpack-file.o diff --git a/builtin.h b/builtin.h index 14fa0171607b17..8ab1623b32679c 100644 --- a/builtin.h +++ b/builtin.h @@ -238,6 +238,7 @@ int cmd_status(int argc, const char **argv, const char *prefix); int cmd_stash(int argc, const char **argv, const char *prefix); int cmd_stripspace(int argc, const char **argv, const char *prefix); int cmd_submodule__helper(int argc, const char **argv, const char *prefix); +int cmd_survey(int argc, const char **argv, const char *prefix); int cmd_switch(int argc, const char **argv, const char *prefix); int cmd_symbolic_ref(int argc, const char **argv, const char *prefix); int cmd_tag(int argc, const char **argv, const char *prefix); diff --git a/builtin/survey.c b/builtin/survey.c new file mode 100644 index 00000000000000..730ad9e4552048 --- /dev/null +++ b/builtin/survey.c @@ -0,0 +1,58 @@ +#include "builtin.h" +#include "config.h" +#include "parse-options.h" + +static const char * const survey_usage[] = { + N_("(EXPERIMENTAL!) git survey "), + NULL, +}; + +struct survey_opts { + int verbose; + int show_progress; +}; + +static struct survey_opts survey_opts = { + .verbose = 0, + .show_progress = -1, /* defaults to isatty(2) */ +}; + +static struct option survey_options[] = { + OPT__VERBOSE(&survey_opts.verbose, N_("verbose output")), + OPT_BOOL(0, "progress", &survey_opts.show_progress, N_("show progress")), + OPT_END(), +}; + +static int survey_load_config_cb(const char *var, const char *value, + const struct config_context *ctx, void *pvoid) +{ + if (!strcmp(var, "survey.verbose")) { + survey_opts.verbose = git_config_bool(var, value); + return 0; + } + if (!strcmp(var, "survey.progress")) { + survey_opts.show_progress = git_config_bool(var, value); + return 0; + } + + return git_default_config(var, value, ctx, pvoid); +} + +static void survey_load_config(void) +{ + git_config(survey_load_config_cb, NULL); +} + +int cmd_survey(int argc, const char **argv, const char *prefix) +{ + survey_load_config(); + + argc = parse_options(argc, argv, prefix, survey_options, survey_usage, 0); + + prepare_repo_settings(the_repository); + + if (survey_opts.show_progress < 0) + survey_opts.show_progress = isatty(2); + + return 0; +} diff --git a/command-list.txt b/command-list.txt index e0bb87b3b5c278..d389561a5f1161 100644 --- a/command-list.txt +++ b/command-list.txt @@ -186,6 +186,7 @@ git-stash mainporcelain git-status mainporcelain info git-stripspace purehelpers git-submodule mainporcelain +git-survey mainporcelain git-svn foreignscminterface git-switch mainporcelain history git-symbolic-ref plumbingmanipulators diff --git a/git.c b/git.c index e35af9b0e5e976..7c70fff6218d90 100644 --- a/git.c +++ b/git.c @@ -622,6 +622,7 @@ static struct cmd_struct commands[] = { { "status", cmd_status, RUN_SETUP | NEED_WORK_TREE }, { "stripspace", cmd_stripspace }, { "submodule--helper", cmd_submodule__helper, RUN_SETUP }, + { "survey", cmd_survey, RUN_SETUP }, { "switch", cmd_switch, RUN_SETUP | NEED_WORK_TREE }, { "symbolic-ref", cmd_symbolic_ref, RUN_SETUP }, { "tag", cmd_tag, RUN_SETUP | DELAY_PAGER_CONFIG }, From 
8b66a8babbc77f76479950f9759df09fa1cc71bb Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Mon, 29 Apr 2024 09:51:34 -0400 Subject: [PATCH 02/18] survey: add command line opts to select references By default we will scan all references in "refs/heads/", "refs/tags/" and "refs/remotes/". Add command line opts let the use ask for all refs or a subset of them and to include a detached HEAD. Signed-off-by: Jeff Hostetler --- Documentation/git-survey.txt | 34 +++++++++++++ builtin/survey.c | 99 ++++++++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+) diff --git a/Documentation/git-survey.txt b/Documentation/git-survey.txt index cdd1ec4358b8bb..c648ef704e3806 100644 --- a/Documentation/git-survey.txt +++ b/Documentation/git-survey.txt @@ -19,12 +19,46 @@ As repositories grow to "monorepo" size, certain data shapes can cause performance problems. `git-survey` attempts to measure and report on known problem areas. +Ref Selection and Reachable Objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In this first analysis phase, `git survey` will iterate over the set of +requested branches, tags, and other refs and treewalk over all of the +reachable commits, trees, and blobs and generate various statistics. + OPTIONS ------- --progress:: Show progress. This is automatically enabled when interactive. +Ref Selection +~~~~~~~~~~~~~ + +The following options control the set of refs that `git survey` will examine. +By default, `git survey` will look at tags, local branches, and remote refs. +If any of the following options are given, the default set is cleared and +only refs for the given options are added. + +--all-refs:: + Use all refs. This includes local branches, tags, remote refs, + notes, and stashes. This option overrides all of the following. + +--branches:: + Add local branches (`refs/heads/`) to the set. + +--tags:: + Add tags (`refs/tags/`) to the set. + +--remotes:: + Add remote branches (`refs/remote/`) to the set. + +--detached:: + Add HEAD to the set. + +--other:: + Add notes (`refs/notes/`) and stashes (`refs/stash/`) to the set. + OUTPUT ------ diff --git a/builtin/survey.c b/builtin/survey.c index 730ad9e4552048..443af35954d590 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -7,19 +7,117 @@ static const char * const survey_usage[] = { NULL, }; +struct survey_refs_wanted { + int want_all_refs; /* special override */ + + int want_branches; + int want_tags; + int want_remotes; + int want_detached; + int want_other; /* see FILTER_REFS_OTHERS -- refs/notes/, refs/stash/ */ +}; + +/* + * The set of refs that we will search if the user doesn't select + * any on the command line. + */ +static struct survey_refs_wanted refs_if_unspecified = { + .want_all_refs = 0, + + .want_branches = 1, + .want_tags = 1, + .want_remotes = 1, + .want_detached = 0, + .want_other = 0, +}; + struct survey_opts { int verbose; int show_progress; + struct survey_refs_wanted refs; }; static struct survey_opts survey_opts = { .verbose = 0, .show_progress = -1, /* defaults to isatty(2) */ + + .refs.want_all_refs = -1, + + .refs.want_branches = -1, /* default these to undefined */ + .refs.want_tags = -1, + .refs.want_remotes = -1, + .refs.want_detached = -1, + .refs.want_other = -1, }; +/* + * After parsing the command line arguments, figure out which refs we + * should scan. + * + * If ANY were given in positive sense, then we ONLY include them and + * do not use the builtin values. 
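+ *
+ * For example, `--tags` alone selects only "refs/tags/", while
+ * `--tags --branches` selects "refs/tags/" and "refs/heads/".
+ * With no selection options at all, the `refs_if_unspecified`
+ * defaults (branches, tags, and remotes) apply.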
+ */ +static void fixup_refs_wanted(void) +{ + struct survey_refs_wanted *rw = &survey_opts.refs; + + /* + * `--all-refs` overrides and enables everything. + */ + if (rw->want_all_refs == 1) { + rw->want_branches = 1; + rw->want_tags = 1; + rw->want_remotes = 1; + rw->want_detached = 1; + rw->want_other = 1; + return; + } + + /* + * If none of the `--` were given, we assume all + * of the builtin unspecified values. + */ + if (rw->want_branches == -1 && + rw->want_tags == -1 && + rw->want_remotes == -1 && + rw->want_detached == -1 && + rw->want_other == -1) { + *rw = refs_if_unspecified; + return; + } + + /* + * Since we only allow positive boolean values on the command + * line, we will only have true values where they specified + * a `--`. + * + * So anything that still has an unspecified value should be + * set to false. + */ + if (rw->want_branches == -1) + rw->want_branches = 0; + if (rw->want_tags == -1) + rw->want_tags = 0; + if (rw->want_remotes == -1) + rw->want_remotes = 0; + if (rw->want_detached == -1) + rw->want_detached = 0; + if (rw->want_other == -1) + rw->want_other = 0; +} + static struct option survey_options[] = { OPT__VERBOSE(&survey_opts.verbose, N_("verbose output")), OPT_BOOL(0, "progress", &survey_opts.show_progress, N_("show progress")), + + OPT_BOOL_F(0, "all-refs", &survey_opts.refs.want_all_refs, N_("include all refs"), PARSE_OPT_NONEG), + + OPT_BOOL_F(0, "branches", &survey_opts.refs.want_branches, N_("include branches"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "tags", &survey_opts.refs.want_tags, N_("include tags"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "remotes", &survey_opts.refs.want_remotes, N_("include all remotes refs"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "detached", &survey_opts.refs.want_detached, N_("include detached HEAD"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "other", &survey_opts.refs.want_other, N_("include notes and stashes"), PARSE_OPT_NONEG), + OPT_END(), }; @@ -53,6 +151,7 @@ int cmd_survey(int argc, const char **argv, const char *prefix) if (survey_opts.show_progress < 0) survey_opts.show_progress = isatty(2); + fixup_refs_wanted(); return 0; } From 3e4359389c070be2dbae321a447f6b2b1091d807 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Mon, 29 Apr 2024 10:38:17 -0400 Subject: [PATCH 03/18] survey: collect the set of requested refs Collect the set of requested branches, tags, and etc into a ref_array and collect the set of requested patterns into a strvec. Signed-off-by: Jeff Hostetler --- builtin/survey.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/builtin/survey.c b/builtin/survey.c index 443af35954d590..9d7faa71ebc75d 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -1,12 +1,19 @@ #include "builtin.h" #include "config.h" #include "parse-options.h" +#include "progress.h" +#include "ref-filter.h" +#include "strvec.h" +#include "trace2.h" static const char * const survey_usage[] = { N_("(EXPERIMENTAL!) git survey "), NULL, }; +static struct progress *survey_progress = NULL; +static uint64_t survey_progress_total = 0; + struct survey_refs_wanted { int want_all_refs; /* special override */ @@ -17,6 +24,8 @@ struct survey_refs_wanted { int want_other; /* see FILTER_REFS_OTHERS -- refs/notes/, refs/stash/ */ }; +static struct strvec survey_vec_refs_wanted = STRVEC_INIT; + /* * The set of refs that we will search if the user doesn't select * any on the command line. 
@@ -141,6 +150,83 @@ static void survey_load_config(void) git_config(survey_load_config_cb, NULL); } +static void do_load_refs(struct ref_array *ref_array) +{ + struct ref_filter filter = REF_FILTER_INIT; + struct ref_sorting *sorting; + struct string_list sorting_options = STRING_LIST_INIT_DUP; + + string_list_append(&sorting_options, "objectname"); + sorting = ref_sorting_options(&sorting_options); + + if (survey_opts.refs.want_detached) + strvec_push(&survey_vec_refs_wanted, "HEAD"); + + if (survey_opts.refs.want_all_refs) { + strvec_push(&survey_vec_refs_wanted, "refs/"); + } else { + if (survey_opts.refs.want_branches) + strvec_push(&survey_vec_refs_wanted, "refs/heads/"); + if (survey_opts.refs.want_tags) + strvec_push(&survey_vec_refs_wanted, "refs/tags/"); + if (survey_opts.refs.want_remotes) + strvec_push(&survey_vec_refs_wanted, "refs/remotes/"); + if (survey_opts.refs.want_other) { + strvec_push(&survey_vec_refs_wanted, "refs/notes/"); + strvec_push(&survey_vec_refs_wanted, "refs/stash/"); + } + } + + filter.name_patterns = survey_vec_refs_wanted.v; + filter.ignore_case = 0; + filter.match_as_path = 1; + + if (survey_opts.show_progress) { + survey_progress_total = 0; + survey_progress = start_progress(_("Scanning refs..."), 0); + } + + filter_refs(ref_array, &filter, FILTER_REFS_KIND_MASK); + + if (survey_opts.show_progress) { + survey_progress_total = ref_array->nr; + display_progress(survey_progress, survey_progress_total); + } + + ref_array_sort(sorting, ref_array); + + if (survey_opts.show_progress) + stop_progress(&survey_progress); + + ref_filter_clear(&filter); + ref_sorting_release(sorting); +} + +/* + * The REFS phase: + * + * Load the set of requested refs and assess them for scalablity problems. + * Use that set to start a treewalk to all reachable objects and assess + * them. + * + * This data will give us insights into the repository itself (the number + * of refs, the size and shape of the DAG, the number and size of the + * objects). + * + * Theoretically, this data is independent of the on-disk representation + * (e.g. independent of packing concerns). + */ +static void survey_phase_refs(struct repository *r) +{ + struct ref_array ref_array = { 0 }; + + trace2_region_enter("survey", "phase/refs", the_repository); + do_load_refs(&ref_array); + trace2_region_leave("survey", "phase/refs", the_repository); + + ref_array_clear(&ref_array); +} + int cmd_survey(int argc, const char **argv, const char *prefix) { survey_load_config(); @@ -153,5 +239,9 @@ int cmd_survey(int argc, const char **argv, const char *prefix) survey_opts.show_progress = isatty(2); fixup_refs_wanted(); + survey_phase_refs(the_repository); + + strvec_clear(&survey_vec_refs_wanted); + return 0; } From f984f7c156d047f08153fc52e69faf9116b10011 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Mon, 29 Apr 2024 12:47:27 -0400 Subject: [PATCH 04/18] survey: calculate stats on refs and print results Calculate stats on the set of refs. This includes the number of branches, plain and annotated tags, remotes, and etc. Calculate the number of packed vs loose refs. Calculate the size of the set of refnames. Print results on the console in JSON format. Add Trace2 logging of the results as a data_json event. 
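The refs are also grouped by prefix ("refs/heads/", "refs/tags/",
"refs/remotes/<nick>/", "refs/notes/", and so on) in a strintmap so that
a per-class count can be reported. To make that grouping concrete, here
is a small standalone sketch of the bucket key (illustrative only, not
part of the patch; the real code uses a strbuf and the ref-filter kind
classification):

    #include <stdio.h>
    #include <string.h>

    /*
     * Illustrative sketch: the grouping key is "refs/remotes/<nick>/"
     * for remote-tracking refs, "refs/<namespace>/" for other refs, and
     * the refname itself otherwise (e.g. a detached "HEAD").
     */
    static void refs_class(const char *refname, char *key, size_t keylen)
    {
        size_t skip = strlen("refs/");
        const char *slash;
        size_t len;

        if (strncmp(refname, "refs/", skip)) {
            snprintf(key, keylen, "%s", refname);
            return;
        }
        if (!strncmp(refname, "refs/remotes/", strlen("refs/remotes/")))
            skip = strlen("refs/remotes/");

        slash = strchr(refname + skip, '/');
        len = slash ? (size_t)(slash - refname) + 1 : strlen(refname);
        if (len >= keylen)
            len = keylen - 1;
        memcpy(key, refname, len);
        key[len] = '\0';
    }

    int main(void)
    {
        char key[64];

        refs_class("refs/remotes/origin/main", key, sizeof(key));
        printf("%s\n", key); /* prints "refs/remotes/origin/" */
        refs_class("refs/notes/commits", key, sizeof(key));
        printf("%s\n", key); /* prints "refs/notes/" */
        return 0;
    }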
Signed-off-by: Jeff Hostetler --- builtin/survey.c | 345 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 344 insertions(+), 1 deletion(-) diff --git a/builtin/survey.c b/builtin/survey.c index 9d7faa71ebc75d..8e1a0e4e871246 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -1,8 +1,13 @@ #include "builtin.h" #include "config.h" +#include "json-writer.h" +#include "object-store.h" #include "parse-options.h" #include "progress.h" #include "ref-filter.h" +#include "refs.h" +#include "strbuf.h" +#include "strmap.h" #include "strvec.h" #include "trace2.h" @@ -50,7 +55,7 @@ static struct survey_opts survey_opts = { .verbose = 0, .show_progress = -1, /* defaults to isatty(2) */ - .refs.want_all_refs = -1, + .refs.want_all_refs = 0, .refs.want_branches = -1, /* default these to undefined */ .refs.want_tags = -1, @@ -150,6 +155,43 @@ static void survey_load_config(void) git_config(survey_load_config_cb, NULL); } +/* + * Stats on the set of refs that we found. + */ +struct survey_stats_refs { + uint32_t cnt_total; + uint32_t cnt_lightweight_tags; + uint32_t cnt_annotated_tags; + uint32_t cnt_branches; + uint32_t cnt_remotes; + uint32_t cnt_detached; + uint32_t cnt_other; + + uint32_t cnt_symref; + + uint32_t cnt_packed; + uint32_t cnt_loose; + + /* + * Measure the length of the refnames. We can look for + * potential platform limits. The partial sums may help us + * estimate the size of a haves/wants conversation, since each + * refname and a SHA must be transmitted. + */ + size_t len_max_local_refname; + size_t len_sum_local_refnames; + size_t len_max_remote_refname; + size_t len_sum_remote_refnames; + + struct strintmap refsmap; +}; + +struct survey_stats { + struct survey_stats_refs refs; +}; + +static struct survey_stats survey_stats = { 0 }; + static void do_load_refs(struct ref_array *ref_array) { struct ref_filter filter = REF_FILTER_INIT; @@ -202,6 +244,178 @@ static void do_load_refs(struct ref_array *ref_array) ref_sorting_release(sorting); } +/* + * If we want this type of ref, increment counters and return 1. + */ +static int maybe_count_ref(struct repository *r, struct ref_array_item *p) +{ + struct survey_refs_wanted *rw = &survey_opts.refs; + struct survey_stats_refs *prs = &survey_stats.refs; + struct object_id peeled; + + /* + * Classify the ref using the `kind` value. Note that + * p->kind was populated by `ref_kind_from_refname()` + * based strictly on the refname. This only knows about + * the basic stock categories and returns FILTER_REFS_OTHERS + * for notes, stashes, and any custom namespaces (like + * "refs/prefetch/"). + */ + switch (p->kind) { + case FILTER_REFS_TAGS: + if (rw->want_all_refs || rw->want_tags) { + /* + * NEEDSWORK: Both types of tags have the same + * "refs/tags/" prefix. Do we want to count them + * in separate buckets in the refsmap? + */ + strintmap_incr(&prs->refsmap, "refs/tags/", 1); + + if (!peel_iterated_oid(r, &p->objectname, &peeled)) + prs->cnt_annotated_tags++; + else + prs->cnt_lightweight_tags++; + + return 1; + } + return 0; + + case FILTER_REFS_BRANCHES: + if (rw->want_all_refs || rw->want_branches) { + strintmap_incr(&prs->refsmap, "refs/heads/", 1); + + prs->cnt_branches++; + return 1; + } + return 0; + + case FILTER_REFS_REMOTES: + if (rw->want_all_refs || rw->want_remotes) { + /* + * For the refsmap, group them by the "refs/remotes//". + * For example: + * "refs/remotes/origin/..." 
+ */ + if (starts_with(p->refname, "refs/remotes/")) { + struct strbuf buf = STRBUF_INIT; + int begin = strlen("refs/remotes/"); + size_t j; + + strbuf_addstr(&buf, p->refname); + for (j = begin; j < buf.len; j++) { + if (buf.buf[j] == '/') { + strbuf_setlen(&buf, j+1); + break; + } + } + strintmap_incr(&prs->refsmap, buf.buf, 1); + strbuf_release(&buf); + } + + prs->cnt_remotes++; + return 1; + } + return 0; + + case FILTER_REFS_OTHERS: + if (rw->want_all_refs || rw->want_other) { + /* + * For the refsmap, group them by their "refs//". + * For example: + * "refs/notes/..." + * "refs/stash/..." + * "refs//..." + */ + if (starts_with(p->refname, "refs/")) { + struct strbuf buf = STRBUF_INIT; + int begin = strlen("refs/"); + size_t j; + + strbuf_addstr(&buf, p->refname); + for (j = begin; j < buf.len; j++) { + if (buf.buf[j] == '/') { + strbuf_setlen(&buf, j+1); + break; + } + } + strintmap_incr(&prs->refsmap, buf.buf, 1); + strbuf_release(&buf); + } + + prs->cnt_other++; + return 1; + } + return 0; + + case FILTER_REFS_DETACHED_HEAD: + if (rw->want_all_refs || rw->want_detached) { + strintmap_incr(&prs->refsmap, p->refname, 1); + + prs->cnt_detached++; + return 1; + } + return 0; + + default: + if (rw->want_all_refs) { + strintmap_incr(&prs->refsmap, p->refname, 1); /* probably "HEAD" */ + + return 1; + } + return 0; + } +} + +/* + * Calculate stats on the set of refs that we found. + */ +static void do_calc_stats_refs(struct repository *r, struct ref_array *ref_array) +{ + struct survey_stats_refs *prs = &survey_stats.refs; + int k; + + strintmap_init(&prs->refsmap, 0); + + for (k = 0; k < ref_array->nr; k++) { + struct ref_array_item *p = ref_array->items[k]; + size_t len; + + if (!maybe_count_ref(r, p)) + continue; + + prs->cnt_total++; + + /* + * SymRefs are somewhat orthogonal to the above + * classification (e.g. "HEAD" --> detached + * and "refs/remotes/origin/HEAD" --> remote) so + * our totals will already include them. + */ + if (p->flag & REF_ISSYMREF) + prs->cnt_symref++; + + /* + * Where/how is the ref stored in GITDIR. 
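+	 * A "loose" ref is an individual file under "$GIT_DIR/refs/",
+	 * while a "packed" ref is a row in the "$GIT_DIR/packed-refs" file.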
+ */ + if (p->flag & REF_ISPACKED) + prs->cnt_packed++; + else + prs->cnt_loose++; + + len = strlen(p->refname); + + if (p->kind == FILTER_REFS_REMOTES) { + prs->len_sum_remote_refnames += len; + if (len > prs->len_max_remote_refname) + prs->len_max_remote_refname = len; + } else { + prs->len_sum_local_refnames += len; + if (len > prs->len_max_local_refname) + prs->len_max_local_refname = len; + } + } +} + /* * The REFS phase: * @@ -224,9 +438,135 @@ static void survey_phase_refs(struct repository *r) do_load_refs(&ref_array); trace2_region_leave("survey", "phase/refs", the_repository); + trace2_region_enter("survey", "phase/calcstats", the_repository); + do_calc_stats_refs(r, &ref_array); + trace2_region_leave("survey", "phase/calcstats", the_repository); + ref_array_clear(&ref_array); } +static void json_refs_section(struct json_writer *jw_top, int pretty, int want_trace2) +{ + struct survey_stats_refs *prs = &survey_stats.refs; + struct json_writer jw_refs = JSON_WRITER_INIT; + int k; + + jw_object_begin(&jw_refs, pretty); + { + jw_object_intmax(&jw_refs, "count", prs->cnt_total); + + jw_object_inline_begin_object(&jw_refs, "count_by_type"); + { + if (survey_opts.refs.want_branches) + jw_object_intmax(&jw_refs, "branches", prs->cnt_branches); + if (survey_opts.refs.want_tags) { + jw_object_intmax(&jw_refs, "lightweight_tags", prs->cnt_lightweight_tags); + jw_object_intmax(&jw_refs, "annotated_tags", prs->cnt_annotated_tags); + } + if (survey_opts.refs.want_remotes) + jw_object_intmax(&jw_refs, "remotes", prs->cnt_remotes); + if (survey_opts.refs.want_detached) + jw_object_intmax(&jw_refs, "detached", prs->cnt_detached); + if (survey_opts.refs.want_other) + jw_object_intmax(&jw_refs, "other", prs->cnt_other); + + /* + * SymRefs are somewhat orthogonal to + * the above classification + * (e.g. "HEAD" --> detached and + * "refs/remotes/origin/HEAD" --> + * remote) so the above classified + * counts will already include them, + * but it is less confusing to display + * them here than to create a whole + * new section. 
+ */ + if (prs->cnt_symref) + jw_object_intmax(&jw_refs, "symrefs", prs->cnt_symref); + } + jw_end(&jw_refs); + + jw_object_inline_begin_object(&jw_refs, "count_by_storage"); + { + jw_object_intmax(&jw_refs, "loose_refs", prs->cnt_loose); + jw_object_intmax(&jw_refs, "packed_refs", prs->cnt_packed); + } + jw_end(&jw_refs); + + jw_object_inline_begin_object(&jw_refs, "refname_length"); + { + if (prs->len_sum_local_refnames) { + jw_object_intmax(&jw_refs, "max_local", prs->len_max_local_refname); + jw_object_intmax(&jw_refs, "sum_local", prs->len_sum_local_refnames); + } + if (prs->len_sum_remote_refnames) { + jw_object_intmax(&jw_refs, "max_remote", prs->len_max_remote_refname); + jw_object_intmax(&jw_refs, "sum_remote", prs->len_sum_remote_refnames); + } + } + jw_end(&jw_refs); + + jw_object_inline_begin_array(&jw_refs, "requested"); + { + for (k = 0; k < survey_vec_refs_wanted.nr; k++) + jw_array_string(&jw_refs, survey_vec_refs_wanted.v[k]); + } + jw_end(&jw_refs); + + jw_object_inline_begin_array(&jw_refs, "count_by_class"); + { + struct hashmap_iter iter; + struct strmap_entry *entry; + + strintmap_for_each_entry(&prs->refsmap, &iter, entry) { + const char *key = entry->key; + intptr_t count = (intptr_t)entry->value; + int value = count; + jw_array_inline_begin_object(&jw_refs); + { + jw_object_string(&jw_refs, "class", key); + jw_object_intmax(&jw_refs, "count", value); + } + jw_end(&jw_refs); + } + } + jw_end(&jw_refs); + } + jw_end(&jw_refs); + + if (jw_top) + jw_object_sub_jw(jw_top, "refs", &jw_refs); + + if (want_trace2) + trace2_data_json("survey", the_repository, "refs", &jw_refs); + + jw_release(&jw_refs); +} + +static void survey_print_json(void) +{ + struct json_writer jw_top = JSON_WRITER_INIT; + int pretty = 1; + + jw_object_begin(&jw_top, pretty); + { + json_refs_section(&jw_top, pretty, 0); + } + jw_end(&jw_top); + + printf("%s\n", jw_top.json.buf); + + jw_release(&jw_top); +} + +static void survey_emit_trace2(void) +{ + if (!trace2_is_enabled()) + return; + + json_refs_section(NULL, 0, 1); +} + int cmd_survey(int argc, const char **argv, const char *prefix) { survey_load_config(); @@ -241,6 +581,9 @@ int cmd_survey(int argc, const char **argv, const char *prefix) survey_phase_refs(the_repository); + survey_emit_trace2(); + survey_print_json(); + strvec_clear(&survey_vec_refs_wanted); return 0; From c9e2ad6a6fa5eb9e54fbf02b2206abb1278a04ce Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Mon, 29 Apr 2024 13:21:49 -0400 Subject: [PATCH 05/18] survey: stub in treewalk of reachable commits and objects Add treewalk on the commits and objects reachable from the set of refs. This commit sets up the treewalk, but only stubs in the traverse callbacks. We'll actually look at the commit and object data in the next commit. Signed-off-by: Jeff Hostetler --- builtin/survey.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/builtin/survey.c b/builtin/survey.c index 8e1a0e4e871246..2b5e21fa4786f7 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -1,15 +1,20 @@ #include "builtin.h" #include "config.h" +#include "environment.h" #include "json-writer.h" +#include "list-objects.h" +#include "object-name.h" #include "object-store.h" #include "parse-options.h" #include "progress.h" #include "ref-filter.h" #include "refs.h" +#include "revision.h" #include "strbuf.h" #include "strmap.h" #include "strvec.h" #include "trace2.h" +#include "tree-walk.h" static const char * const survey_usage[] = { N_("(EXPERIMENTAL!) 
git survey "), @@ -244,6 +249,93 @@ static void do_load_refs(struct ref_array *ref_array) ref_sorting_release(sorting); } +/* + * Populate a "rev_info" with the OIDs of the REFS of interest. + * The treewalk will start from all of those starting points + * and walk backwards in the DAG to get the set of all reachable + * objects from those starting points. + */ +static void load_rev_info(struct rev_info *rev_info, + struct ref_array *ref_array) +{ + unsigned int add_flags = 0; + int k; + + for (k = 0; k < ref_array->nr; k++) { + struct ref_array_item *p = ref_array->items[k]; + struct object_id peeled; + + switch (p->kind) { + case FILTER_REFS_TAGS: + if (!peel_iterated_oid(rev_info->repo, &p->objectname, &peeled)) + add_pending_oid(rev_info, NULL, &peeled, add_flags); + else + add_pending_oid(rev_info, NULL, &p->objectname, add_flags); + break; + case FILTER_REFS_BRANCHES: + add_pending_oid(rev_info, NULL, &p->objectname, add_flags); + break; + case FILTER_REFS_REMOTES: + add_pending_oid(rev_info, NULL, &p->objectname, add_flags); + break; + case FILTER_REFS_OTHERS: + /* + * This may be a note, stash, or custom namespace branch. + */ + add_pending_oid(rev_info, NULL, &p->objectname, add_flags); + break; + case FILTER_REFS_DETACHED_HEAD: + add_pending_oid(rev_info, NULL, &p->objectname, add_flags); + break; + default: + break; + } + } +} + +static void traverse_commit_cb(struct commit *commit, void *data) +{ + if ((++survey_progress_total % 1000) == 0) + display_progress(survey_progress, survey_progress_total); +} + +static void traverse_object_cb(struct object *obj, const char *name, void *data) +{ + if ((++survey_progress_total % 1000) == 0) + display_progress(survey_progress, survey_progress_total); +} + +/* + * Treewalk all of the commits and objects reachable from the + * set of refs. + */ +static void do_treewalk_reachable(struct ref_array *ref_array) +{ + struct rev_info rev_info = REV_INFO_INIT; + + repo_init_revisions(the_repository, &rev_info, NULL); + rev_info.tree_objects = 1; + rev_info.blob_objects = 1; + load_rev_info(&rev_info, ref_array); + if (prepare_revision_walk(&rev_info)) + die(_("revision walk setup failed")); + + if (survey_opts.show_progress) { + survey_progress_total = 0; + survey_progress = start_progress(_("Walking reachable objects..."), 0); + } + + traverse_commit_list(&rev_info, + traverse_commit_cb, + traverse_object_cb, + NULL); + + if (survey_opts.show_progress) + stop_progress(&survey_progress); + + release_revisions(&rev_info); +} + /* * If we want this type of ref, increment counters and return 1. */ @@ -438,6 +530,10 @@ static void survey_phase_refs(struct repository *r) do_load_refs(&ref_array); trace2_region_leave("survey", "phase/refs", the_repository); + trace2_region_enter("survey", "phase/treewalk", the_repository); + do_treewalk_reachable(&ref_array); + trace2_region_leave("survey", "phase/treewalk", the_repository); + trace2_region_enter("survey", "phase/calcstats", the_repository); do_calc_stats_refs(r, &ref_array); trace2_region_leave("survey", "phase/calcstats", the_repository); From 72491d74d8573805957ad509124a54ee83bd0cdd Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Mon, 29 Apr 2024 15:40:00 -0400 Subject: [PATCH 06/18] survey: add traverse callback for commits Add callback to handle commit objects during the treewalk. Count the number of commits and group them by the number of parents. 
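Object sizes are also bucketed on a log16 ("hex") scale so the histogram
stays meaningful for values ranging from a few bytes to gigabytes, with a
finer-grained base-4 variant (qbin()) used for tree entry counts. A
self-contained sketch of the log16 binning (it mirrors the hbin() helper
added below; the sample sizes and the tiny main() are purely
illustrative):

    #include <stdio.h>

    /*
     * Mirror of the log16 binning used below: bin 0 holds sizes 0..15,
     * bin 1 holds 16..255, and bin k holds 16^k .. 16^(k+1)-1.
     */
    static int hbin(unsigned long value)
    {
        int k;

        for (k = 0; k < (int)(sizeof(unsigned long) * 2); k++) {
            if ((value & ~0xFUL) == 0)
                return k;
            value >>= 4;
        }
        return 0;
    }

    int main(void)
    {
        /* 100 bytes, 4 KiB, 1 MiB, 1 GiB */
        unsigned long sizes[] = { 100, 4096, 1048576, 1073741824 };
        int i;

        for (i = 0; i < 4; i++)
            printf("%lu bytes -> H%d\n", sizes[i], hbin(sizes[i]));
        return 0; /* prints bins H1, H3, H5, and H7 */
    }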
Signed-off-by: Jeff Hostetler --- builtin/survey.c | 474 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 473 insertions(+), 1 deletion(-) diff --git a/builtin/survey.c b/builtin/survey.c index 2b5e21fa4786f7..fa3ae48821e563 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -1,6 +1,7 @@ #include "builtin.h" #include "config.h" #include "environment.h" +#include "hex.h" #include "json-writer.h" #include "list-objects.h" #include "object-name.h" @@ -14,6 +15,7 @@ #include "strmap.h" #include "strvec.h" #include "trace2.h" +#include "tree.h" #include "tree-walk.h" static const char * const survey_usage[] = { @@ -191,8 +193,162 @@ struct survey_stats_refs { struct strintmap refsmap; }; +/* + * HBIN -- hex binning (histogram bucketing). + * + * We create histograms for various counts and sums. Since we have a + * wide range of values (objects range in size from 1 to 4G bytes), a + * linear bucketing is not interesting. Instead, lets use a + * log16()-based bucketing. This gives us a better spread on the low + * and middle range and a coarse bucketing on the high end. + * + * The idea here is that it doesn't matter if you have n 1GB blobs or + * n/2 1GB blobs and n/2 1.5GB blobs -- either way you have a scaling + * problem that we want to report on. + */ +#define HBIN_LEN (sizeof(unsigned long) * 2) +#define HBIN_MASK (0xF) +#define HBIN_SHIFT (4) + +static int hbin(unsigned long value) +{ + int k; + + for (k = 0; k < HBIN_LEN; k++) { + if ((value & ~(HBIN_MASK)) == 0) + return k; + value >>= HBIN_SHIFT; + } + + return 0; /* should not happen */ +} + +/* + * QBIN -- base4 binning (histogram bucketing). + * + * This is the same idea as the above, but we want better granularity + * in the low end and don't expect as many large values. + */ +#define QBIN_LEN (sizeof(unsigned long) * 4) +#define QBIN_MASK (0x3) +#define QBIN_SHIFT (2) + +static int qbin(unsigned long value) +{ + int k; + + for (k = 0; k < QBIN_LEN; k++) { + if ((value & ~(QBIN_MASK)) == 0) + return k; + value >>= (QBIN_SHIFT); + } + + return 0; /* should not happen */ +} + +/* + * histogram bin for objects. + */ +struct obj_hist_bin { + uint64_t sum_size; /* sum(object_size) for all objects in this bin */ + uint64_t sum_disk_size; /* sum(on_disk_size) for all objects in this bin */ + uint32_t cnt_seen; /* number seen in this bin */ +}; + +static void incr_obj_hist_bin(struct obj_hist_bin *pbin, + unsigned long object_length, + off_t disk_sizep) +{ + pbin->sum_size += object_length; + pbin->sum_disk_size += disk_sizep; + pbin->cnt_seen++; +} + +/* + * Common fields for any type of object. + */ +struct survey_stats_base_object { + uint32_t cnt_seen; + + uint32_t cnt_missing; /* we may have a partial clone. */ + + /* + * Number of objects grouped by where they are stored on disk. + * This is a function of how the ODB is packed. + */ + uint32_t cnt_cached; /* see oi.whence */ + uint32_t cnt_loose; /* see oi.whence */ + uint32_t cnt_packed; /* see oi.whence */ + uint32_t cnt_dbcached; /* see oi.whence */ + + uint64_t sum_size; /* sum(object_size) */ + uint64_t sum_disk_size; /* sum(disk_size) */ + + /* + * A histogram of the count of objects, the observed size, and + * the on-disk size grouped by the observed size. + */ + struct obj_hist_bin size_hbin[HBIN_LEN]; +}; + +/* + * PBIN -- parent vector binning (histogram bucketing). + * + * We create a histogram based upon the number of parents + * in a commit. This is a simple linear vector. It starts + * at zero for "initial" commits. 
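+ * A root commit is therefore counted in bin 0, an ordinary
+ * single-parent commit in bin 1, and a merge in bin 2 or higher.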
+ * + * If a commit has more parents, just put it in the last bin. + */ +#define PBIN_VEC_LEN (17) + +struct survey_stats_commits { + struct survey_stats_base_object base; + + /* + * Count of commits with k parents. + */ + uint32_t parent_cnt_pbin[PBIN_VEC_LEN]; +}; + +/* + * Stats for reachable trees. + */ +struct survey_stats_trees { + struct survey_stats_base_object base; + + /* + * In the following, nr_entries refers to the number of files or + * subdirectories in a tree. We are interested in how wide the + * tree is and if the repo has gigantic directories. + */ + uint64_t max_entries; /* max(nr_entries) -- the width of the largest tree */ + + /* + * Computing the sum of the number of entries across all trees + * is probably not that interesting. + */ + uint64_t sum_entries; /* sum(nr_entries) -- sum across all trees */ + + /* + * A histogram of the count of trees, the observed size, and + * the on-disk size grouped by the number of entries in the tree. + */ + struct obj_hist_bin entry_qbin[QBIN_LEN]; +}; + +/* + * Stats for reachable blobs. + */ +struct survey_stats_blobs { + struct survey_stats_base_object base; +}; + struct survey_stats { - struct survey_stats_refs refs; + struct survey_stats_refs refs; + struct survey_stats_commits commits; + struct survey_stats_trees trees; + struct survey_stats_blobs blobs; }; static struct survey_stats survey_stats = { 0 }; @@ -293,16 +449,134 @@ static void load_rev_info(struct rev_info *rev_info, } } +static int fill_in_base_object(struct survey_stats_base_object *base, + struct object *object, + enum object_type type_expected, + unsigned long *p_object_length, + off_t *p_disk_sizep) +{ + struct object_info oi = OBJECT_INFO_INIT; + unsigned oi_flags = OBJECT_INFO_FOR_PREFETCH; + unsigned long object_length = 0; + off_t disk_sizep = 0; + enum object_type type; + int hb; + + base->cnt_seen++; + + oi.typep = &type; + oi.sizep = &object_length; + oi.disk_sizep = &disk_sizep; + + if (oid_object_info_extended(the_repository, &object->oid, &oi, oi_flags) < 0 || + type != type_expected) { + base->cnt_missing++; + return 1; + } + + switch (oi.whence) { + case OI_CACHED: + base->cnt_cached++; + break; + case OI_LOOSE: + base->cnt_loose++; + break; + case OI_PACKED: + base->cnt_packed++; + break; + case OI_DBCACHED: + base->cnt_dbcached++; + break; + default: + break; + } + + base->sum_size += object_length; + base->sum_disk_size += disk_sizep; + + hb = hbin(object_length); + incr_obj_hist_bin(&base->size_hbin[hb], object_length, disk_sizep); + + if (p_object_length) + *p_object_length = object_length; + if (p_disk_sizep) + *p_disk_sizep = disk_sizep; + + return 0; +} + static void traverse_commit_cb(struct commit *commit, void *data) { + struct survey_stats_commits *psc = &survey_stats.commits; + unsigned k; + if ((++survey_progress_total % 1000) == 0) display_progress(survey_progress, survey_progress_total); + + fill_in_base_object(&psc->base, &commit->object, OBJ_COMMIT, NULL, NULL); + + k = commit_list_count(commit->parents); + if (k >= PBIN_VEC_LEN) + k = PBIN_VEC_LEN - 1; + + psc->parent_cnt_pbin[k]++; +} + +static void traverse_object_cb_tree(struct object *obj) +{ + struct survey_stats_trees *pst = &survey_stats.trees; + unsigned long object_length; + off_t disk_sizep; + struct tree_desc desc; + struct name_entry entry; + struct tree *tree; + int nr_entries; + int qb; + + if (fill_in_base_object(&pst->base, obj, OBJ_TREE, &object_length, &disk_sizep)) + return; + + tree = lookup_tree(the_repository, &obj->oid); + if (!tree) + return; + 
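+	/*
+	 * Walk the tree's entries just to count them; the individual
+	 * entries are not otherwise examined here.
+	 */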
init_tree_desc(&desc, &obj->oid, tree->buffer, tree->size); + nr_entries = 0; + while (tree_entry(&desc, &entry)) + nr_entries++; + + pst->sum_entries += nr_entries; + + if (nr_entries > pst->max_entries) + pst->max_entries = nr_entries; + + qb = qbin(nr_entries); + incr_obj_hist_bin(&pst->entry_qbin[qb], object_length, disk_sizep); +} + +static void traverse_object_cb_blob(struct object *obj) +{ + struct survey_stats_blobs *psb = &survey_stats.blobs; + + fill_in_base_object(&psb->base, obj, OBJ_BLOB, NULL, NULL); } static void traverse_object_cb(struct object *obj, const char *name, void *data) { if ((++survey_progress_total % 1000) == 0) display_progress(survey_progress, survey_progress_total); + + switch (obj->type) { + case OBJ_TREE: + traverse_object_cb_tree(obj); + return; + case OBJ_BLOB: + traverse_object_cb_blob(obj); + return; + case OBJ_TAG: /* ignore -- counted when loading REFS */ + case OBJ_COMMIT: /* ignore/bug -- seen in the other callback */ + default: /* ignore/bug -- unknown type */ + return; + } } /* @@ -639,6 +913,198 @@ static void json_refs_section(struct json_writer *jw_top, int pretty, int want_t jw_release(&jw_refs); } +#define JW_OBJ_INT_NZ(jw, key, value) do { if (value) jw_object_intmax((jw), (key), (value)); } while (0) + +static void write_qbin_json(struct json_writer *jw, const char *label, + struct obj_hist_bin qbin[QBIN_LEN]) +{ + struct strbuf buf = STRBUF_INIT; + uint32_t lower = 0; + uint32_t upper = QBIN_MASK; + int k; + + jw_object_inline_begin_object(jw, label); + { + for (k = 0; k < QBIN_LEN; k++) { + struct obj_hist_bin *p = &qbin[k]; + uint32_t lower_k = lower; + uint32_t upper_k = upper; + + lower = upper+1; + upper = (upper << QBIN_SHIFT) + QBIN_MASK; + + if (!p->cnt_seen) + continue; + + strbuf_reset(&buf); + strbuf_addf(&buf, "Q%02d", k); + jw_object_inline_begin_object(jw, buf.buf); + { + jw_object_intmax(jw, "count", p->cnt_seen); + jw_object_intmax(jw, "sum_size", p->sum_size); + jw_object_intmax(jw, "sum_disk_size", p->sum_disk_size); + + /* maybe only include these in verbose mode */ + jw_object_intmax(jw, "qbin_lower", lower_k); + jw_object_intmax(jw, "qbin_upper", upper_k); + } + jw_end(jw); + } + } + jw_end(jw); + + strbuf_release(&buf); +} + +static void write_hbin_json(struct json_writer *jw, const char *label, + struct obj_hist_bin hbin[HBIN_LEN]) +{ + struct strbuf buf = STRBUF_INIT; + uint32_t lower = 0; + uint32_t upper = HBIN_MASK; + int k; + + jw_object_inline_begin_object(jw, label); + { + for (k = 0; k < HBIN_LEN; k++) { + struct obj_hist_bin *p = &hbin[k]; + uint32_t lower_k = lower; + uint32_t upper_k = upper; + + lower = upper+1; + upper = (upper << HBIN_SHIFT) + HBIN_MASK; + + if (!p->cnt_seen) + continue; + + strbuf_reset(&buf); + strbuf_addf(&buf, "H%d", k); + jw_object_inline_begin_object(jw, buf.buf); + { + jw_object_intmax(jw, "count", p->cnt_seen); + jw_object_intmax(jw, "sum_size", p->sum_size); + jw_object_intmax(jw, "sum_disk_size", p->sum_disk_size); + + /* maybe only include these in verbose mode */ + jw_object_intmax(jw, "hbin_lower", lower_k); + jw_object_intmax(jw, "hbin_upper", upper_k); + } + jw_end(jw); + } + } + jw_end(jw); + + strbuf_release(&buf); +} + +static void write_base_object_json(struct json_writer *jw, + struct survey_stats_base_object *base) +{ + jw_object_intmax(jw, "count", base->cnt_seen); + + jw_object_intmax(jw, "sum_size", base->sum_size); + jw_object_intmax(jw, "sum_disk_size", base->sum_disk_size); + + jw_object_inline_begin_object(jw, "count_by_whence"); + { + /* + * Missing is not 
technically a "whence" value, but + * we don't need to clutter up the results with that + * distinction. + */ + JW_OBJ_INT_NZ(jw, "missing", base->cnt_missing); + + JW_OBJ_INT_NZ(jw, "cached", base->cnt_cached); + JW_OBJ_INT_NZ(jw, "loose", base->cnt_loose); + JW_OBJ_INT_NZ(jw, "packed", base->cnt_packed); + JW_OBJ_INT_NZ(jw, "dbcached", base->cnt_dbcached); + } + jw_end(jw); + + write_hbin_json(jw, "dist_by_size", base->size_hbin); +} + +static void json_commits_section(struct json_writer *jw_top, int pretty, int want_trace2) +{ + struct survey_stats_commits *psc = &survey_stats.commits; + struct json_writer jw_commits = JSON_WRITER_INIT; + + jw_object_begin(&jw_commits, pretty); + { + write_base_object_json(&jw_commits, &psc->base); + + jw_object_inline_begin_object(&jw_commits, "count_by_nr_parents"); + { + struct strbuf parent_key = STRBUF_INIT; + int k; + + for (k = 0; k < PBIN_VEC_LEN; k++) + if (psc->parent_cnt_pbin[k]) { + strbuf_reset(&parent_key); + strbuf_addf(&parent_key, "P%02d", k); + jw_object_intmax(&jw_commits, parent_key.buf, psc->parent_cnt_pbin[k]); + } + + strbuf_release(&parent_key); + } + jw_end(&jw_commits); + } + jw_end(&jw_commits); + + if (jw_top) + jw_object_sub_jw(jw_top, "commits", &jw_commits); + + if (want_trace2) + trace2_data_json("survey", the_repository, "commits", &jw_commits); + + jw_release(&jw_commits); +} + +static void json_trees_section(struct json_writer *jw_top, int pretty, int want_trace2) +{ + struct survey_stats_trees *pst = &survey_stats.trees; + struct json_writer jw_trees = JSON_WRITER_INIT; + + jw_object_begin(&jw_trees, pretty); + { + write_base_object_json(&jw_trees, &pst->base); + + jw_object_intmax(&jw_trees, "max_entries", pst->max_entries); + jw_object_intmax(&jw_trees, "sum_entries", pst->sum_entries); + + write_qbin_json(&jw_trees, "dist_by_nr_entries", pst->entry_qbin); + } + jw_end(&jw_trees); + + if (jw_top) + jw_object_sub_jw(jw_top, "trees", &jw_trees); + + if (want_trace2) + trace2_data_json("survey", the_repository, "trees", &jw_trees); + + jw_release(&jw_trees); +} + +static void json_blobs_section(struct json_writer *jw_top, int pretty, int want_trace2) +{ + struct survey_stats_blobs *psb = &survey_stats.blobs; + struct json_writer jw_blobs = JSON_WRITER_INIT; + + jw_object_begin(&jw_blobs, pretty); + { + write_base_object_json(&jw_blobs, &psb->base); + } + jw_end(&jw_blobs); + + if (jw_top) + jw_object_sub_jw(jw_top, "blobs", &jw_blobs); + + if (want_trace2) + trace2_data_json("survey", the_repository, "blobs", &jw_blobs); + + jw_release(&jw_blobs); +} + static void survey_print_json(void) { struct json_writer jw_top = JSON_WRITER_INIT; @@ -647,6 +1113,9 @@ static void survey_print_json(void) jw_object_begin(&jw_top, pretty); { json_refs_section(&jw_top, pretty, 0); + json_commits_section(&jw_top, pretty, 0); + json_trees_section(&jw_top, pretty, 0); + json_blobs_section(&jw_top, pretty, 0); } jw_end(&jw_top); @@ -661,6 +1130,9 @@ static void survey_emit_trace2(void) return; json_refs_section(NULL, 0, 1); + json_commits_section(NULL, 0, 1); + json_trees_section(NULL, 0, 1); + json_blobs_section(NULL, 0, 1); } int cmd_survey(int argc, const char **argv, const char *prefix) From 79f0a6bc3a3cf7fe26d95131083f5eb9b9ac694e Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Wed, 1 May 2024 12:56:38 -0400 Subject: [PATCH 07/18] survey: add vector of largest objects for various scaling dimensions Create `struct large_item` and `struct large_item_vec` to capture the n largest commits, trees, and blobs under various scaling 
dimensions, such as size in bytes, number of commit parents, or number of entries in a tree. Each of these have a command line option to set them independently. Signed-off-by: Jeff Hostetler --- Documentation/config.txt | 2 + Documentation/config/survey.txt | 36 +++++ Documentation/git-survey.txt | 31 ++++ builtin/survey.c | 245 ++++++++++++++++++++++++++++++-- 4 files changed, 304 insertions(+), 10 deletions(-) create mode 100644 Documentation/config/survey.txt diff --git a/Documentation/config.txt b/Documentation/config.txt index fedfaf30cd0d8b..939cc1387992f8 100644 --- a/Documentation/config.txt +++ b/Documentation/config.txt @@ -536,6 +536,8 @@ include::config/status.txt[] include::config/submodule.txt[] +include::config/survey.txt[] + include::config/tag.txt[] include::config/tar.txt[] diff --git a/Documentation/config/survey.txt b/Documentation/config/survey.txt new file mode 100644 index 00000000000000..672e7890ed2f79 --- /dev/null +++ b/Documentation/config/survey.txt @@ -0,0 +1,36 @@ +survey.progress:: + Boolean to show/hide progress information. Defaults to + true when interactive (stderr is bound to a TTY). + +survey.showBlobSizes:: + A non-negative integer value. Requests details on the + largest file blobs by size in bytes. Provides a default + value for `--blob-sizes=` in linkgit:git-survey[1]. + +survey.showCommitParents:: + A non-negative integer value. Requests details on the + commits with the most number of parents. Provides a default + value for `--commit-parents=` in linkgit:git-survey[1]. + +survey.showCommitSizes:: + A non-negative integer value. Requests details on the + largest commits by size in bytes. Generally, these are the + commits with the largest commit messages. Provides a default + value for `--commit-sizes=` in linkgit:git-survey[1]. + +survey.showTreeEntries:: + A non-negative integer value. Requests details on the + trees (directories) with the most number of entries (files + and subdirectories). Provides a default value for + `--tree-entries=` in linkgit:git-survey[1]. + +survey.showTreeSizes:: + A non-negative integer value. Requests details on the + largest trees (directories) by size in bytes. This will + set will usually be equal to the `survey.showTreeEntries` + set, but may be skewed by very long file or subdirectory + entry names. Provides a default value for + `--tree-sizes=` in linkgit:git-survey[1]. + +survey.verbose:: + Boolean to show/hide verbose output. Default to false. diff --git a/Documentation/git-survey.txt b/Documentation/git-survey.txt index c648ef704e3806..771a063efdc594 100644 --- a/Documentation/git-survey.txt +++ b/Documentation/git-survey.txt @@ -59,12 +59,43 @@ only refs for the given options are added. --other:: Add notes (`refs/notes/`) and stashes (`refs/stash/`) to the set. +Large Item Selection +~~~~~~~~~~~~~~~~~~~~ + +The following options control the optional display of large items under +various dimensions of scale. The OID of the largest `n` objects will be +displayed in reverse sorted order. For each, `n` defaults to 10. + +--commit-parents:: + Shows the OIDs of the commits with the most parent commits. + +--commit-sizes:: + Shows the OIDs of the largest commits by size in bytes. This is + usually the ones with the largest commit messages. + +--tree-entries:: + Shows the OIDs of the trees with the most number of entries. These + are the directories with the most number of files or subdirectories. + +--tree-sizes:: + Shows the OIDs of the largest trees by size in bytes. 
This set + will usually be the same as the vector of number of entries unless + skewed by very long entry names. + +--blob-sizes:: + Shows the OIDs of the largest blobs by size in bytes. + OUTPUT ------ By default, `git survey` will print information about the repository in a human-readable format that includes overviews and tables. +CONFIGURATION +------------- + +include::config/survey.txt[] + GIT --- Part of the linkgit:git[1] suite diff --git a/builtin/survey.c b/builtin/survey.c index fa3ae48821e563..cfface7948884d 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -55,13 +55,36 @@ static struct survey_refs_wanted refs_if_unspecified = { struct survey_opts { int verbose; int show_progress; + + int show_largest_commits_by_nr_parents; + int show_largest_commits_by_size_bytes; + + int show_largest_trees_by_nr_entries; + int show_largest_trees_by_size_bytes; + + int show_largest_blobs_by_size_bytes; + struct survey_refs_wanted refs; }; +#define DEFAULT_SHOW_LARGEST_VALUE (10) + static struct survey_opts survey_opts = { .verbose = 0, .show_progress = -1, /* defaults to isatty(2) */ + /* + * Show the largest `n` objects for some scaling dimension. + * We allow each to be requested independently. + */ + .show_largest_commits_by_nr_parents = DEFAULT_SHOW_LARGEST_VALUE, + .show_largest_commits_by_size_bytes = DEFAULT_SHOW_LARGEST_VALUE, + + .show_largest_trees_by_nr_entries = DEFAULT_SHOW_LARGEST_VALUE, + .show_largest_trees_by_size_bytes = DEFAULT_SHOW_LARGEST_VALUE, + + .show_largest_blobs_by_size_bytes = DEFAULT_SHOW_LARGEST_VALUE, + .refs.want_all_refs = 0, .refs.want_branches = -1, /* default these to undefined */ @@ -139,6 +162,14 @@ static struct option survey_options[] = { OPT_BOOL_F(0, "detached", &survey_opts.refs.want_detached, N_("include detached HEAD"), PARSE_OPT_NONEG), OPT_BOOL_F(0, "other", &survey_opts.refs.want_other, N_("include notes and stashes"), PARSE_OPT_NONEG), + OPT_INTEGER_F(0, "commit-parents", &survey_opts.show_largest_commits_by_nr_parents, N_("show N largest commits by parent count"), PARSE_OPT_NONEG), + OPT_INTEGER_F(0, "commit-sizes", &survey_opts.show_largest_commits_by_size_bytes, N_("show N largest commits by size in bytes"), PARSE_OPT_NONEG), + + OPT_INTEGER_F(0, "tree-entries", &survey_opts.show_largest_trees_by_nr_entries, N_("show N largest trees by entry count"), PARSE_OPT_NONEG), + OPT_INTEGER_F(0, "tree-sizes", &survey_opts.show_largest_trees_by_size_bytes, N_("show N largest trees by size in bytes"), PARSE_OPT_NONEG), + + OPT_INTEGER_F(0, "blob-sizes", &survey_opts.show_largest_blobs_by_size_bytes, N_("show N largest blobs by size in bytes"), PARSE_OPT_NONEG), + OPT_END(), }; @@ -154,6 +185,29 @@ static int survey_load_config_cb(const char *var, const char *value, return 0; } + if (!strcmp(var, "survey.showcommitparents")) { + survey_opts.show_largest_commits_by_nr_parents = git_config_ulong(var, value, ctx->kvi); + return 0; + } + if (!strcmp(var, "survey.showcommitsizes")) { + survey_opts.show_largest_commits_by_size_bytes = git_config_ulong(var, value, ctx->kvi); + return 0; + } + + if (!strcmp(var, "survey.showtreeentries")) { + survey_opts.show_largest_trees_by_nr_entries = git_config_ulong(var, value, ctx->kvi); + return 0; + } + if (!strcmp(var, "survey.showtreesizes")) { + survey_opts.show_largest_trees_by_size_bytes = git_config_ulong(var, value, ctx->kvi); + return 0; + } + + if (!strcmp(var, "survey.showblobsizes")) { + survey_opts.show_largest_blobs_by_size_bytes = git_config_ulong(var, value, ctx->kvi); + return 0; + } + return 
git_default_config(var, value, ctx, pvoid); } @@ -264,6 +318,84 @@ static void incr_obj_hist_bin(struct obj_hist_bin *pbin, pbin->cnt_seen++; } +/* + * Remember the largest n objects for some scaling dimension. This + * could be the observed object size or number of entries in a tree. + * We'll use this to generate a sorted vector in the output for that + * dimension. + */ +struct large_item { + uint64_t size; + struct object_id oid; +}; + +struct large_item_vec { + char *dimension_label; + char *item_label; + uint64_t nr_items; + struct large_item items[FLEX_ARRAY]; /* nr_items */ +}; + +static struct large_item_vec *alloc_large_item_vec(const char *dimension_label, + const char *item_label, + uint64_t nr_items) +{ + struct large_item_vec *vec; + size_t flex_len = nr_items * sizeof(struct large_item); + + if (!nr_items) + return NULL; + + vec = xcalloc(1, (sizeof(struct large_item_vec) + flex_len)); + vec->dimension_label = strdup(dimension_label); + vec->item_label = strdup(item_label); + vec->nr_items = nr_items; + + return vec; +} + +static void free_large_item_vec(struct large_item_vec *vec) +{ + free(vec->dimension_label); + free(vec->item_label); + free(vec); +} + +static void maybe_insert_large_item(struct large_item_vec *vec, + uint64_t size, + struct object_id *oid) +{ + size_t rest_len; + size_t k; + + if (!vec || !vec->nr_items) + return; + + /* + * Since the odds an object being among the largest n + * is small, shortcut and see if it is smaller than + * the smallest one in our set and quickly reject it. + */ + if (size < vec->items[vec->nr_items - 1].size) + return; + + for (k = 0; k < vec->nr_items; k++) { + if (size < vec->items[k].size) + continue; + + /* push items[k..] down one and insert it here */ + + rest_len = (vec->nr_items - k - 1) * sizeof(struct large_item); + if (rest_len) + memmove(&vec->items[k + 1], &vec->items[k], rest_len); + + memset(&vec->items[k], 0, sizeof(struct large_item)); + vec->items[k].size = size; + oidcpy(&vec->items[k].oid, oid); + return; + } +} + /* * Common fields for any type of object. */ @@ -309,6 +441,9 @@ struct survey_stats_commits { * Count of commits with k parents. */ uint32_t parent_cnt_pbin[PBIN_VEC_LEN]; + + struct large_item_vec *vec_largest_by_nr_parents; + struct large_item_vec *vec_largest_by_size_bytes; }; /* @@ -318,11 +453,18 @@ struct survey_stats_trees { struct survey_stats_base_object base; /* - * In the following, nr_entries refers to the number of files or - * subdirectories in a tree. We are interested in how wide the - * tree is and if the repo has gigantic directories. + * Keep a vector of the trees with the most number of entries. + * This gives us a feel for the width of a tree when there are + * gigantic directories. + */ + struct large_item_vec *vec_largest_by_nr_entries; + + /* + * Keep a vector of the trees with the largest size in bytes. + * The contents of this may or may not match items in the other + * vector, since entryname length can alter the results. */ - uint64_t max_entries; /* max(nr_entries) -- the width of the largest tree */ + struct large_item_vec *vec_largest_by_size_bytes; /* * Computing the sum of the number of entries across all trees @@ -342,6 +484,11 @@ struct survey_stats_trees { */ struct survey_stats_blobs { struct survey_stats_base_object base; + + /* + * Remember the OIDs of the largest n blobs. 
+ */ + struct large_item_vec *vec_largest_by_size_bytes; }; struct survey_stats { @@ -508,17 +655,21 @@ static int fill_in_base_object(struct survey_stats_base_object *base, static void traverse_commit_cb(struct commit *commit, void *data) { struct survey_stats_commits *psc = &survey_stats.commits; + unsigned long object_length; unsigned k; if ((++survey_progress_total % 1000) == 0) display_progress(survey_progress, survey_progress_total); - fill_in_base_object(&psc->base, &commit->object, OBJ_COMMIT, NULL, NULL); + fill_in_base_object(&psc->base, &commit->object, OBJ_COMMIT, &object_length, NULL); k = commit_list_count(commit->parents); + + maybe_insert_large_item(psc->vec_largest_by_nr_parents, k, &commit->object.oid); + maybe_insert_large_item(psc->vec_largest_by_size_bytes, object_length, &commit->object.oid); + if (k >= PBIN_VEC_LEN) k = PBIN_VEC_LEN - 1; - psc->parent_cnt_pbin[k]++; } @@ -546,8 +697,8 @@ static void traverse_object_cb_tree(struct object *obj) pst->sum_entries += nr_entries; - if (nr_entries > pst->max_entries) - pst->max_entries = nr_entries; + maybe_insert_large_item(pst->vec_largest_by_nr_entries, nr_entries, &obj->oid); + maybe_insert_large_item(pst->vec_largest_by_size_bytes, object_length, &obj->oid); qb = qbin(nr_entries); incr_obj_hist_bin(&pst->entry_qbin[qb], object_length, disk_sizep); @@ -556,8 +707,11 @@ static void traverse_object_cb_tree(struct object *obj) static void traverse_object_cb_blob(struct object *obj) { struct survey_stats_blobs *psb = &survey_stats.blobs; + unsigned long object_length; - fill_in_base_object(&psb->base, obj, OBJ_BLOB, NULL, NULL); + fill_in_base_object(&psb->base, obj, OBJ_BLOB, &object_length, NULL); + + maybe_insert_large_item(psb->vec_largest_by_size_bytes, object_length, &obj->oid); } static void traverse_object_cb(struct object *obj, const char *name, void *data) @@ -1024,6 +1178,32 @@ static void write_base_object_json(struct json_writer *jw, write_hbin_json(jw, "dist_by_size", base->size_hbin); } +static void write_large_item_vec_json(struct json_writer *jw, + struct large_item_vec *vec) +{ + if (!vec || !vec->nr_items) + return; + + jw_object_inline_begin_array(jw, vec->dimension_label); + { + int k; + + for (k = 0; k < vec->nr_items; k++) { + struct large_item *pk = &vec->items[k]; + if (is_null_oid(&pk->oid)) + break; + + jw_array_inline_begin_object(jw); + { + jw_object_intmax(jw, vec->item_label, pk->size); + jw_object_string(jw, "oid", oid_to_hex(&pk->oid)); + } + jw_end(jw); + } + } + jw_end(jw); +} + static void json_commits_section(struct json_writer *jw_top, int pretty, int want_trace2) { struct survey_stats_commits *psc = &survey_stats.commits; @@ -1033,6 +1213,9 @@ static void json_commits_section(struct json_writer *jw_top, int pretty, int wan { write_base_object_json(&jw_commits, &psc->base); + write_large_item_vec_json(&jw_commits, psc->vec_largest_by_nr_parents); + write_large_item_vec_json(&jw_commits, psc->vec_largest_by_size_bytes); + jw_object_inline_begin_object(&jw_commits, "count_by_nr_parents"); { struct strbuf parent_key = STRBUF_INIT; @@ -1069,9 +1252,11 @@ static void json_trees_section(struct json_writer *jw_top, int pretty, int want_ { write_base_object_json(&jw_trees, &pst->base); - jw_object_intmax(&jw_trees, "max_entries", pst->max_entries); jw_object_intmax(&jw_trees, "sum_entries", pst->sum_entries); + write_large_item_vec_json(&jw_trees, pst->vec_largest_by_nr_entries); + write_large_item_vec_json(&jw_trees, pst->vec_largest_by_size_bytes); + write_qbin_json(&jw_trees, 
"dist_by_nr_entries", pst->entry_qbin); } jw_end(&jw_trees); @@ -1093,6 +1278,8 @@ static void json_blobs_section(struct json_writer *jw_top, int pretty, int want_ jw_object_begin(&jw_blobs, pretty); { write_base_object_json(&jw_blobs, &psb->base); + + write_large_item_vec_json(&jw_blobs, psb->vec_largest_by_size_bytes); } jw_end(&jw_blobs); @@ -1147,12 +1334,50 @@ int cmd_survey(int argc, const char **argv, const char *prefix) survey_opts.show_progress = isatty(2); fixup_refs_wanted(); + if (survey_opts.show_largest_commits_by_nr_parents) + survey_stats.commits.vec_largest_by_nr_parents = + alloc_large_item_vec( + "largest_commits_by_nr_parents", + "nr_parents", + survey_opts.show_largest_commits_by_nr_parents); + if (survey_opts.show_largest_commits_by_size_bytes) + survey_stats.commits.vec_largest_by_size_bytes = + alloc_large_item_vec( + "largest_commits_by_size_bytes", + "size", + survey_opts.show_largest_commits_by_size_bytes); + + if (survey_opts.show_largest_trees_by_nr_entries) + survey_stats.trees.vec_largest_by_nr_entries = + alloc_large_item_vec( + "largest_trees_by_nr_entries", + "nr_entries", + survey_opts.show_largest_trees_by_nr_entries); + if (survey_opts.show_largest_trees_by_size_bytes) + survey_stats.trees.vec_largest_by_size_bytes = + alloc_large_item_vec( + "largest_trees_by_size_bytes", + "size", + survey_opts.show_largest_trees_by_size_bytes); + + if (survey_opts.show_largest_blobs_by_size_bytes) + survey_stats.blobs.vec_largest_by_size_bytes = + alloc_large_item_vec( + "largest_blobs_by_size_bytes", + "size", + survey_opts.show_largest_blobs_by_size_bytes); + survey_phase_refs(the_repository); survey_emit_trace2(); survey_print_json(); strvec_clear(&survey_vec_refs_wanted); + free_large_item_vec(survey_stats.commits.vec_largest_by_nr_parents); + free_large_item_vec(survey_stats.commits.vec_largest_by_size_bytes); + free_large_item_vec(survey_stats.trees.vec_largest_by_nr_entries); + free_large_item_vec(survey_stats.trees.vec_largest_by_size_bytes); + free_large_item_vec(survey_stats.blobs.vec_largest_by_size_bytes); return 0; } From ab094c405db33dd23817276aa69e5dfb24326685 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Wed, 15 May 2024 15:56:36 -0400 Subject: [PATCH 08/18] survey: add pathname of blob or tree to large_item_vec Include the pathname of each blob or tree in the large_item_vec to help identify the file or directory associated with the OID and size information. This pathname is computed during the treewalk, so it reflects the first observed pathname seen for that OID during the traversal over all of the refs. Since the file or directory could have moved (without being modified), there may be multiple "correct" pathnames for a particular OID. Since we do not control the ref traversal order, we should consider it to be a "suggested pathname" for the OID. 
Signed-off-by: Jeff Hostetler --- builtin/survey.c | 54 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 43 insertions(+), 11 deletions(-) diff --git a/builtin/survey.c b/builtin/survey.c index cfface7948884d..aa4a43e78ba503 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -327,6 +327,7 @@ static void incr_obj_hist_bin(struct obj_hist_bin *pbin, struct large_item { uint64_t size; struct object_id oid; + struct strbuf *name; }; struct large_item_vec { @@ -342,6 +343,7 @@ static struct large_item_vec *alloc_large_item_vec(const char *dimension_label, { struct large_item_vec *vec; size_t flex_len = nr_items * sizeof(struct large_item); + size_t k; if (!nr_items) return NULL; @@ -351,11 +353,24 @@ static struct large_item_vec *alloc_large_item_vec(const char *dimension_label, vec->item_label = strdup(item_label); vec->nr_items = nr_items; + for (k = 0; k < nr_items; k++) { + struct strbuf *p = xcalloc(1, sizeof(struct strbuf)); + strbuf_init(p, 0); + vec->items[k].name = p; + } + return vec; } static void free_large_item_vec(struct large_item_vec *vec) { + size_t k; + + for (k = 0; k < vec->nr_items; k++) { + strbuf_release(vec->items[k].name); + free(vec->items[k].name); + } + free(vec->dimension_label); free(vec->item_label); free(vec); @@ -363,8 +378,10 @@ static void free_large_item_vec(struct large_item_vec *vec) static void maybe_insert_large_item(struct large_item_vec *vec, uint64_t size, - struct object_id *oid) + struct object_id *oid, + const char *name) { + struct strbuf *pbuf_temp; size_t rest_len; size_t k; @@ -383,7 +400,17 @@ static void maybe_insert_large_item(struct large_item_vec *vec, if (size < vec->items[k].size) continue; - /* push items[k..] down one and insert it here */ + /* + * The last large_item in the vector is about to be + * overwritten by the previous one during the shift. + * Steal its allocated strbuf and reuse it. + */ + pbuf_temp = vec->items[vec->nr_items - 1].name; + strbuf_reset(pbuf_temp); + if (name && *name) + strbuf_addstr(pbuf_temp, name); + + /* push items[k..] 
down one and insert data for this item here */ rest_len = (vec->nr_items - k - 1) * sizeof(struct large_item); if (rest_len) @@ -392,6 +419,9 @@ static void maybe_insert_large_item(struct large_item_vec *vec, memset(&vec->items[k], 0, sizeof(struct large_item)); vec->items[k].size = size; oidcpy(&vec->items[k].oid, oid); + + vec->items[k].name = pbuf_temp; + return; } } @@ -665,15 +695,15 @@ static void traverse_commit_cb(struct commit *commit, void *data) k = commit_list_count(commit->parents); - maybe_insert_large_item(psc->vec_largest_by_nr_parents, k, &commit->object.oid); - maybe_insert_large_item(psc->vec_largest_by_size_bytes, object_length, &commit->object.oid); + maybe_insert_large_item(psc->vec_largest_by_nr_parents, k, &commit->object.oid, NULL); + maybe_insert_large_item(psc->vec_largest_by_size_bytes, object_length, &commit->object.oid, NULL); if (k >= PBIN_VEC_LEN) k = PBIN_VEC_LEN - 1; psc->parent_cnt_pbin[k]++; } -static void traverse_object_cb_tree(struct object *obj) +static void traverse_object_cb_tree(struct object *obj, const char *name) { struct survey_stats_trees *pst = &survey_stats.trees; unsigned long object_length; @@ -697,21 +727,21 @@ static void traverse_object_cb_tree(struct object *obj) pst->sum_entries += nr_entries; - maybe_insert_large_item(pst->vec_largest_by_nr_entries, nr_entries, &obj->oid); - maybe_insert_large_item(pst->vec_largest_by_size_bytes, object_length, &obj->oid); + maybe_insert_large_item(pst->vec_largest_by_nr_entries, nr_entries, &obj->oid, name); + maybe_insert_large_item(pst->vec_largest_by_size_bytes, object_length, &obj->oid, name); qb = qbin(nr_entries); incr_obj_hist_bin(&pst->entry_qbin[qb], object_length, disk_sizep); } -static void traverse_object_cb_blob(struct object *obj) +static void traverse_object_cb_blob(struct object *obj, const char *name) { struct survey_stats_blobs *psb = &survey_stats.blobs; unsigned long object_length; fill_in_base_object(&psb->base, obj, OBJ_BLOB, &object_length, NULL); - maybe_insert_large_item(psb->vec_largest_by_size_bytes, object_length, &obj->oid); + maybe_insert_large_item(psb->vec_largest_by_size_bytes, object_length, &obj->oid, name); } static void traverse_object_cb(struct object *obj, const char *name, void *data) @@ -721,10 +751,10 @@ static void traverse_object_cb(struct object *obj, const char *name, void *data) switch (obj->type) { case OBJ_TREE: - traverse_object_cb_tree(obj); + traverse_object_cb_tree(obj, name); return; case OBJ_BLOB: - traverse_object_cb_blob(obj); + traverse_object_cb_blob(obj, name); return; case OBJ_TAG: /* ignore -- counted when loading REFS */ case OBJ_COMMIT: /* ignore/bug -- seen in the other callback */ @@ -1197,6 +1227,8 @@ static void write_large_item_vec_json(struct json_writer *jw, { jw_object_intmax(jw, vec->item_label, pk->size); jw_object_string(jw, "oid", oid_to_hex(&pk->oid)); + if (pk->name->len) + jw_object_string(jw, "name", pk->name->buf); } jw_end(jw); } From 9ea4cce60b43124c22acd47560c0be16e4635b6d Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Wed, 15 May 2024 17:44:41 -0400 Subject: [PATCH 09/18] survey: add commit-oid to large_item detail Signed-off-by: Jeff Hostetler --- builtin/survey.c | 59 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 53 insertions(+), 6 deletions(-) diff --git a/builtin/survey.c b/builtin/survey.c index aa4a43e78ba503..d403e039104ca5 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -327,7 +327,21 @@ static void incr_obj_hist_bin(struct obj_hist_bin *pbin, struct large_item { uint64_t 
size; struct object_id oid; + + /* + * For blobs and trees the name field is the pathname of the + * file or directory. Root trees will have a zero-length + * name. The name field is not currenly used for commits. + */ struct strbuf *name; + + /* + * For blobs and trees remember the transient commit from + * the treewalk so that we can say that this large item + * first appeared in this commit (relative to the treewalk + * order). + */ + struct object_id containing_commit_oid; }; struct large_item_vec { @@ -379,7 +393,8 @@ static void free_large_item_vec(struct large_item_vec *vec) static void maybe_insert_large_item(struct large_item_vec *vec, uint64_t size, struct object_id *oid, - const char *name) + const char *name, + const struct object_id *containing_commit_oid) { struct strbuf *pbuf_temp; size_t rest_len; @@ -419,6 +434,7 @@ static void maybe_insert_large_item(struct large_item_vec *vec, memset(&vec->items[k], 0, sizeof(struct large_item)); vec->items[k].size = size; oidcpy(&vec->items[k].oid, oid); + oidcpy(&vec->items[k].containing_commit_oid, containing_commit_oid); vec->items[k].name = pbuf_temp; @@ -682,6 +698,14 @@ static int fill_in_base_object(struct survey_stats_base_object *base, return 0; } +/* + * Transient OID of the commit currently being visited + * during the treewalk. We can use this to create the + * : pair when a notable large file was + * created, for example. + */ +static struct object_id treewalk_transient_commit_oid; + static void traverse_commit_cb(struct commit *commit, void *data) { struct survey_stats_commits *psc = &survey_stats.commits; @@ -691,12 +715,23 @@ static void traverse_commit_cb(struct commit *commit, void *data) if ((++survey_progress_total % 1000) == 0) display_progress(survey_progress, survey_progress_total); + oidcpy(&treewalk_transient_commit_oid, &commit->object.oid); + fill_in_base_object(&psc->base, &commit->object, OBJ_COMMIT, &object_length, NULL); k = commit_list_count(commit->parents); - maybe_insert_large_item(psc->vec_largest_by_nr_parents, k, &commit->object.oid, NULL); - maybe_insert_large_item(psc->vec_largest_by_size_bytes, object_length, &commit->object.oid, NULL); + /* + * Send the commit-oid as both the OID and the CONTAINING-COMMIT-OID. + * This is somewhat redundant, but lets us later do `git name-rev` + * using the containing-oid in a consistent fashion. 
+ */ + maybe_insert_large_item(psc->vec_largest_by_nr_parents, k, + &commit->object.oid, NULL, + &commit->object.oid); + maybe_insert_large_item(psc->vec_largest_by_size_bytes, object_length, + &commit->object.oid, NULL, + &commit->object.oid); if (k >= PBIN_VEC_LEN) k = PBIN_VEC_LEN - 1; @@ -727,8 +762,12 @@ static void traverse_object_cb_tree(struct object *obj, const char *name) pst->sum_entries += nr_entries; - maybe_insert_large_item(pst->vec_largest_by_nr_entries, nr_entries, &obj->oid, name); - maybe_insert_large_item(pst->vec_largest_by_size_bytes, object_length, &obj->oid, name); + maybe_insert_large_item(pst->vec_largest_by_nr_entries, nr_entries, + &obj->oid, name, + &treewalk_transient_commit_oid); + maybe_insert_large_item(pst->vec_largest_by_size_bytes, object_length, + &obj->oid, name, + &treewalk_transient_commit_oid); qb = qbin(nr_entries); incr_obj_hist_bin(&pst->entry_qbin[qb], object_length, disk_sizep); @@ -741,7 +780,9 @@ static void traverse_object_cb_blob(struct object *obj, const char *name) fill_in_base_object(&psb->base, obj, OBJ_BLOB, &object_length, NULL); - maybe_insert_large_item(psb->vec_largest_by_size_bytes, object_length, &obj->oid, name); + maybe_insert_large_item(psb->vec_largest_by_size_bytes, object_length, + &obj->oid, name, + &treewalk_transient_commit_oid); } static void traverse_object_cb(struct object *obj, const char *name, void *data) @@ -774,6 +815,7 @@ static void do_treewalk_reachable(struct ref_array *ref_array) repo_init_revisions(the_repository, &rev_info, NULL); rev_info.tree_objects = 1; rev_info.blob_objects = 1; + rev_info.tree_blobs_in_commit_order = 1; load_rev_info(&rev_info, ref_array); if (prepare_revision_walk(&rev_info)) die(_("revision walk setup failed")); @@ -783,10 +825,12 @@ static void do_treewalk_reachable(struct ref_array *ref_array) survey_progress = start_progress(_("Walking reachable objects..."), 0); } + oidcpy(&treewalk_transient_commit_oid, null_oid()); traverse_commit_list(&rev_info, traverse_commit_cb, traverse_object_cb, NULL); + oidcpy(&treewalk_transient_commit_oid, null_oid()); if (survey_opts.show_progress) stop_progress(&survey_progress); @@ -1229,6 +1273,9 @@ static void write_large_item_vec_json(struct json_writer *jw, jw_object_string(jw, "oid", oid_to_hex(&pk->oid)); if (pk->name->len) jw_object_string(jw, "name", pk->name->buf); + if (!is_null_oid(&pk->containing_commit_oid)) + jw_object_string(jw, "commit_oid", + oid_to_hex(&pk->containing_commit_oid)); } jw_end(jw); } From 5330029f74e82bcf66093e6eee92b5e0cb58994b Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Mon, 20 May 2024 17:23:39 -0400 Subject: [PATCH 10/18] survey: add commit name-rev lookup to each large_item Signed-off-by: Jeff Hostetler --- builtin/survey.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/builtin/survey.c b/builtin/survey.c index d403e039104ca5..9cc1d0e23f0d00 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -11,6 +11,7 @@ #include "ref-filter.h" #include "refs.h" #include "revision.h" +#include "run-command.h" #include "strbuf.h" #include "strmap.h" #include "strvec.h" @@ -342,6 +343,12 @@ struct large_item { * order). */ struct object_id containing_commit_oid; + + /* + * Lookup `containing_commit_oid` using `git name-rev`. + * Lazy allocate this post-treewalk. 
+ */ + struct strbuf *name_rev; }; struct large_item_vec { @@ -383,6 +390,11 @@ static void free_large_item_vec(struct large_item_vec *vec) for (k = 0; k < vec->nr_items; k++) { strbuf_release(vec->items[k].name); free(vec->items[k].name); + + if (vec->items[k].name_rev) { + strbuf_release(vec->items[k].name_rev); + free(vec->items[k].name_rev); + } } free(vec->dimension_label); @@ -419,6 +431,9 @@ static void maybe_insert_large_item(struct large_item_vec *vec, * The last large_item in the vector is about to be * overwritten by the previous one during the shift. * Steal its allocated strbuf and reuse it. + * + * We can ignore .name_rev because it will not be + * allocated until after the treewalk. */ pbuf_temp = vec->items[vec->nr_items - 1].name; strbuf_reset(pbuf_temp); @@ -442,6 +457,54 @@ static void maybe_insert_large_item(struct large_item_vec *vec, } } +/* + * Try to run `git name-rev` on each of the containing-commit-oid's + * in this large-item-vec to get a pretty name for each OID. Silently + * ignore errors if it fails because this info is nice to have but not + * essential. + */ +static void large_item_vec_lookup_name_rev(struct large_item_vec *vec) +{ + struct child_process cp = CHILD_PROCESS_INIT; + struct strbuf in = STRBUF_INIT; + struct strbuf out = STRBUF_INIT; + const char *line; + size_t k; + + if (!vec || !vec->nr_items) + return; + + survey_progress_total += vec->nr_items; + display_progress(survey_progress, survey_progress_total); + + for (k = 0; k < vec->nr_items; k++) + strbuf_addf(&in, "%s\n", oid_to_hex(&vec->items[k].containing_commit_oid)); + + cp.git_cmd = 1; + strvec_pushl(&cp.args, "name-rev", "--name-only", "--annotate-stdin", NULL); + if (pipe_command(&cp, in.buf, in.len, &out, 0, NULL, 0)) { + strbuf_release(&in); + strbuf_release(&out); + return; + } + + line = out.buf; + k = 0; + while (*line) { + const char *eol = strchrnul(line, '\n'); + + vec->items[k].name_rev = xcalloc(1, sizeof(struct strbuf)); + strbuf_init(vec->items[k].name_rev, 0); + strbuf_add(vec->items[k].name_rev, line, (eol - line)); + + line = eol + 1; + k++; + } + + strbuf_release(&in); + strbuf_release(&out); +} + /* * Common fields for any type of object. 
*/ @@ -1010,6 +1073,25 @@ static void do_calc_stats_refs(struct repository *r, struct ref_array *ref_array } } +static void do_lookup_name_rev(void) +{ + if (survey_opts.show_progress) { + survey_progress_total = 0; + survey_progress = start_progress(_("Resolving name-revs..."), 0); + } + + large_item_vec_lookup_name_rev(survey_stats.commits.vec_largest_by_nr_parents); + large_item_vec_lookup_name_rev(survey_stats.commits.vec_largest_by_size_bytes); + + large_item_vec_lookup_name_rev(survey_stats.trees.vec_largest_by_nr_entries); + large_item_vec_lookup_name_rev(survey_stats.trees.vec_largest_by_size_bytes); + + large_item_vec_lookup_name_rev(survey_stats.blobs.vec_largest_by_size_bytes); + + if (survey_opts.show_progress) + stop_progress(&survey_progress); +} + /* * The REFS phase: * @@ -1040,6 +1122,10 @@ static void survey_phase_refs(struct repository *r) do_calc_stats_refs(r, &ref_array); trace2_region_leave("survey", "phase/calcstats", the_repository); + trace2_region_enter("survey", "phase/namerev", the_repository); + do_lookup_name_rev(); + trace2_region_enter("survey", "phase/namerev", the_repository); + ref_array_clear(&ref_array); } @@ -1276,6 +1362,9 @@ static void write_large_item_vec_json(struct json_writer *jw, if (!is_null_oid(&pk->containing_commit_oid)) jw_object_string(jw, "commit_oid", oid_to_hex(&pk->containing_commit_oid)); + if (pk->name_rev->len) + jw_object_string(jw, "name_rev", + pk->name_rev->buf); } jw_end(jw); } From 0d9f6ae78927b5824ce1c66e9351c4367c7169e5 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Tue, 21 May 2024 13:44:07 -0400 Subject: [PATCH 11/18] survey: add --json option and setup for pretty output Signed-off-by: Jeff Hostetler --- Documentation/git-survey.txt | 3 +++ builtin/survey.c | 20 +++++++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/Documentation/git-survey.txt b/Documentation/git-survey.txt index 771a063efdc594..0e42fb32ac2964 100644 --- a/Documentation/git-survey.txt +++ b/Documentation/git-survey.txt @@ -32,6 +32,9 @@ OPTIONS --progress:: Show progress. This is automatically enabled when interactive. +--json:: + Print results in JSON rather than in a human-friendly format. + Ref Selection ~~~~~~~~~~~~~ diff --git a/builtin/survey.c b/builtin/survey.c index 9cc1d0e23f0d00..8632c974ebc3b1 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -56,6 +56,7 @@ static struct survey_refs_wanted refs_if_unspecified = { struct survey_opts { int verbose; int show_progress; + int show_json; int show_largest_commits_by_nr_parents; int show_largest_commits_by_size_bytes; @@ -73,6 +74,7 @@ struct survey_opts { static struct survey_opts survey_opts = { .verbose = 0, .show_progress = -1, /* defaults to isatty(2) */ + .show_json = 0, /* defaults to pretty */ /* * Show the largest `n` objects for some scaling dimension. 
@@ -154,6 +156,7 @@ static void fixup_refs_wanted(void) static struct option survey_options[] = { OPT__VERBOSE(&survey_opts.verbose, N_("verbose output")), OPT_BOOL(0, "progress", &survey_opts.show_progress, N_("show progress")), + OPT_BOOL(0, "json", &survey_opts.show_json, N_("report stats in JSON")), OPT_BOOL_F(0, "all-refs", &survey_opts.refs.want_all_refs, N_("include all refs"), PARSE_OPT_NONEG), @@ -185,6 +188,10 @@ static int survey_load_config_cb(const char *var, const char *value, survey_opts.show_progress = git_config_bool(var, value); return 0; } + if (!strcmp(var, "survey.json")) { + survey_opts.show_json = git_config_bool(var, value); + return 0; + } if (!strcmp(var, "survey.showcommitparents")) { survey_opts.show_largest_commits_by_nr_parents = git_config_ulong(var, value, ctx->kvi); @@ -1490,6 +1497,14 @@ static void survey_emit_trace2(void) json_blobs_section(NULL, 0, 1); } +/* + * Print all of the stats that we have collected in a more pretty format. + */ +static void survey_print_results_pretty(void) +{ + printf("TODO....\n"); +} + int cmd_survey(int argc, const char **argv, const char *prefix) { survey_load_config(); @@ -1538,7 +1553,10 @@ int cmd_survey(int argc, const char **argv, const char *prefix) survey_phase_refs(the_repository); survey_emit_trace2(); - survey_print_json(); + if (survey_opts.show_json) + survey_print_json(); + else + survey_print_results_pretty(); strvec_clear(&survey_vec_refs_wanted); free_large_item_vec(survey_stats.commits.vec_largest_by_nr_parents); From 50d22037857fa083f119deaedecd023e044a5cec Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Tue, 21 May 2024 15:47:58 -0400 Subject: [PATCH 12/18] survey: add pretty printing of stats Signed-off-by: Jeff Hostetler --- builtin/survey.c | 788 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 742 insertions(+), 46 deletions(-) diff --git a/builtin/survey.c b/builtin/survey.c index 8632c974ebc3b1..d2e53885fd5820 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -338,8 +338,9 @@ struct large_item { /* * For blobs and trees the name field is the pathname of the - * file or directory. Root trees will have a zero-length - * name. The name field is not currenly used for commits. + * file or directory (as reported by the treewalk). Root trees + * are reported with a zero-length name, but we'll fix them up. + * The name field is not currenly used for commits. 
*/ struct strbuf *name; @@ -358,16 +359,24 @@ struct large_item { struct strbuf *name_rev; }; +struct large_item_vec_labels { + const char *dimension; + const char *item; +}; + struct large_item_vec { - char *dimension_label; - char *item_label; + const struct large_item_vec_labels *labels_json; + const struct large_item_vec_labels *labels_pretty; uint64_t nr_items; + enum object_type type; struct large_item items[FLEX_ARRAY]; /* nr_items */ }; -static struct large_item_vec *alloc_large_item_vec(const char *dimension_label, - const char *item_label, - uint64_t nr_items) +static struct large_item_vec *alloc_large_item_vec( + const struct large_item_vec_labels *labels_json, + const struct large_item_vec_labels *labels_pretty, + uint64_t nr_items, + enum object_type type) { struct large_item_vec *vec; size_t flex_len = nr_items * sizeof(struct large_item); @@ -377,9 +386,10 @@ static struct large_item_vec *alloc_large_item_vec(const char *dimension_label, return NULL; vec = xcalloc(1, (sizeof(struct large_item_vec) + flex_len)); - vec->dimension_label = strdup(dimension_label); - vec->item_label = strdup(item_label); + vec->labels_json = labels_json; + vec->labels_pretty = labels_pretty; vec->nr_items = nr_items; + vec->type = type; for (k = 0; k < nr_items; k++) { struct strbuf *p = xcalloc(1, sizeof(struct strbuf)); @@ -404,8 +414,6 @@ static void free_large_item_vec(struct large_item_vec *vec) } } - free(vec->dimension_label); - free(vec->item_label); free(vec); } @@ -446,6 +454,15 @@ static void maybe_insert_large_item(struct large_item_vec *vec, strbuf_reset(pbuf_temp); if (name && *name) strbuf_addstr(pbuf_temp, name); + else if (vec->type == OBJ_TREE) { + /* + * NEEDSWORK: Would it be better to wait and create + * a name of the form "^{tree}" after the + * treewalk is finished? + */ + strbuf_addf(pbuf_temp, "%s^{tree}", + oid_to_hex(containing_commit_oid)); + } /* push items[k..] 
down one and insert data for this item here */ @@ -616,6 +633,95 @@ struct survey_stats { static struct survey_stats survey_stats = { 0 }; +static void alloc_commit_by_parents(void) +{ + static struct large_item_vec_labels json = { + .dimension = "largest_commits_by_nr_parents", + .item = "nr_parents", + }; + static struct large_item_vec_labels pretty = { + .dimension = "Largest Commits by Number of Parents", + .item = "Parents", + }; + + if (survey_opts.show_largest_commits_by_nr_parents) + survey_stats.commits.vec_largest_by_nr_parents = + alloc_large_item_vec(&json, &pretty, + survey_opts.show_largest_commits_by_nr_parents, + OBJ_COMMIT); +} + +static void alloc_commit_by_size(void) { + static struct large_item_vec_labels json = { + .dimension = "largest_commits_by_size_bytes", + .item = "size", + }; + static struct large_item_vec_labels pretty = { + .dimension = "Largest Commits by Size in Bytes", + .item = "Size", + }; + + if (survey_opts.show_largest_commits_by_size_bytes) + survey_stats.commits.vec_largest_by_size_bytes = + alloc_large_item_vec(&json, &pretty, + survey_opts.show_largest_commits_by_size_bytes, + OBJ_COMMIT); +} + +static void alloc_tree_by_entries(void) +{ + static struct large_item_vec_labels json = { + .dimension = "largest_trees_by_nr_entries", + .item = "nr_entries", + }; + static struct large_item_vec_labels pretty = { + .dimension = "Largest Trees by Number of Entries", + .item = "Entries", + }; + + if (survey_opts.show_largest_trees_by_nr_entries) + survey_stats.trees.vec_largest_by_nr_entries = + alloc_large_item_vec(&json, &pretty, + survey_opts.show_largest_trees_by_nr_entries, + OBJ_TREE); +} + +static void alloc_tree_by_size(void) +{ + static struct large_item_vec_labels json = { + .dimension = "largest_trees_by_size_bytes", + .item = "size", + }; + static struct large_item_vec_labels pretty = { + .dimension = "Largest Trees by Size in Bytes", + .item = "Size", + }; + + if (survey_opts.show_largest_trees_by_size_bytes) + survey_stats.trees.vec_largest_by_size_bytes = + alloc_large_item_vec(&json, &pretty, + survey_opts.show_largest_trees_by_size_bytes, + OBJ_TREE); +} + +static void alloc_blob_by_size(void) +{ + static struct large_item_vec_labels json = { + .dimension = "largest_blobs_by_size_bytes", + .item = "size", + }; + static struct large_item_vec_labels pretty = { + .dimension = "Largest Blobs by Size in Bytes", + .item = "Size", + }; + + if (survey_opts.show_largest_blobs_by_size_bytes) + survey_stats.blobs.vec_largest_by_size_bytes = + alloc_large_item_vec(&json, &pretty, + survey_opts.show_largest_blobs_by_size_bytes, + OBJ_BLOB); +} + static void do_load_refs(struct ref_array *ref_array) { struct ref_filter filter = REF_FILTER_INIT; @@ -1351,7 +1457,7 @@ static void write_large_item_vec_json(struct json_writer *jw, if (!vec || !vec->nr_items) return; - jw_object_inline_begin_array(jw, vec->dimension_label); + jw_object_inline_begin_array(jw, vec->labels_json->dimension); { int k; @@ -1362,7 +1468,7 @@ static void write_large_item_vec_json(struct json_writer *jw, jw_array_inline_begin_object(jw); { - jw_object_intmax(jw, vec->item_label, pk->size); + jw_object_intmax(jw, vec->labels_json->item, pk->size); jw_object_string(jw, "oid", oid_to_hex(&pk->oid)); if (pk->name->len) jw_object_string(jw, "name", pk->name->buf); @@ -1497,12 +1603,629 @@ static void survey_emit_trace2(void) json_blobs_section(NULL, 0, 1); } +static void fmt_txt_line(struct strbuf *buf, int indent, const char *txt) +{ + if (indent) + strbuf_addchars(buf, ' ', indent); + + 
strbuf_addstr(buf, txt); + + strbuf_addch(buf, '\n'); +} + +static void fmt_txt_pair_ui64(struct strbuf *buf, + int indent, + const char *label, + uint64_t value) +{ + int column0 = 62; + + if (indent) + strbuf_addchars(buf, ' ', indent); + + strbuf_addf(buf, "%-*s : %14"PRIu64, + column0 - indent, label, + value); + + strbuf_addch(buf, '\n'); +} + +static void fmt_size_tbl_caption(struct strbuf *buf, + int indent, + const char *caption) +{ + strbuf_addch(buf, '\n'); + fmt_txt_line(buf, indent, caption); +} + +static void fmt_size_tbl_hdr(struct strbuf *buf, + int indent, + const char *bucket_hdr, + const char *count_hdr, + const char *size_hdr, + const char *disk_size_hdr) +{ + int column0 = 28; + + if (indent) + strbuf_addchars(buf, ' ', indent); + + strbuf_addf(buf, "%-*s | %14s | %14s | %14s", + column0 - indent, bucket_hdr, + count_hdr, size_hdr, disk_size_hdr); + + strbuf_addch(buf, '\n'); +} + +static void fmt_size_tbl_hr(struct strbuf *buf, + int indent) +{ + int column0 = 28; + + if (indent) + strbuf_addchars(buf, ' ', indent); + + strbuf_addchars(buf, '-', column0 - indent); + strbuf_addstr(buf, "-+-"); + strbuf_addchars(buf, '-', 14); + strbuf_addstr(buf, "-+-"); + strbuf_addchars(buf, '-', 14); + strbuf_addstr(buf, "-+-"); + strbuf_addchars(buf, '-', 14); + + strbuf_addch(buf, '\n'); +} + +static void fmt_size_tbl_row(struct strbuf *buf, + int indent, + const char *bucket, + uint64_t count, + uint64_t size, + uint64_t disk_size) +{ + int column0 = 28; + + if (indent) + strbuf_addchars(buf, ' ', indent); + + strbuf_addf(buf, "%-*s | %14"PRIu64" | %14"PRIu64" | %14"PRIu64, + column0 - indent, bucket, count, size, disk_size); + + strbuf_addch(buf, '\n'); +} + +static void fmt_qbin(struct strbuf *buf, + int indent, const char *title_caption, + const char *bucket_hdr, + struct obj_hist_bin qbin[QBIN_LEN]) +{ + struct strbuf bucket = STRBUF_INIT; + uint64_t lower = 0; + uint64_t upper = QBIN_MASK; + int k; + + fmt_size_tbl_caption(buf, indent, title_caption); + fmt_size_tbl_hr(buf, indent); + fmt_size_tbl_hdr(buf, indent, bucket_hdr, "Count", "Size", "Disk Size"); + fmt_size_tbl_hr(buf, indent); + + for (k = 0; k < QBIN_LEN; k++) { + struct obj_hist_bin *p = &qbin[k]; + uint64_t lower_k = lower; + uint64_t upper_k = upper; + + lower = upper+1; + upper = (upper << QBIN_SHIFT) + QBIN_MASK; + + if (!p->cnt_seen) + continue; + + strbuf_reset(&bucket); + strbuf_addf(&bucket, "%"PRIu64"..%"PRIu64, lower_k, upper_k); + + fmt_size_tbl_row(buf, indent, bucket.buf, + p->cnt_seen, p->sum_size, p->sum_disk_size); + } + fmt_size_tbl_hr(buf, indent); + + strbuf_release(&bucket); +} + +static void fmt_hbin(struct strbuf *buf, + int indent, const char *title_caption, + const char *bucket_hdr, + struct obj_hist_bin hbin[HBIN_LEN]) +{ + struct strbuf bucket = STRBUF_INIT; + uint64_t lower = 0; + uint64_t upper = HBIN_MASK; + int k; + + fmt_size_tbl_caption(buf, indent, title_caption); + fmt_size_tbl_hr(buf, indent); + fmt_size_tbl_hdr(buf, indent, bucket_hdr, "Count", "Size", "Disk Size"); + fmt_size_tbl_hr(buf, indent); + + for (k = 0; k < HBIN_LEN; k++) { + struct obj_hist_bin *p = &hbin[k]; + uint64_t lower_k = lower; + uint64_t upper_k = upper; + + lower = upper+1; + upper = (upper << HBIN_SHIFT) + HBIN_MASK; + + if (!p->cnt_seen) + continue; + + strbuf_reset(&bucket); + strbuf_addf(&bucket, "%"PRIu64"..%"PRIu64, lower_k, upper_k); + + fmt_size_tbl_row(buf, indent, bucket.buf, + p->cnt_seen, p->sum_size, p->sum_disk_size); + } + fmt_size_tbl_hr(buf, indent); + + strbuf_release(&bucket); +} + 
+static void fmt_pbin_hdr(struct strbuf *buf, + int indent, + const char *bucket, + const char *count) +{ + int column0 = 28; + + if (indent) + strbuf_addchars(buf, ' ', indent); + + strbuf_addf(buf, "%-*s | %14s", + column0 - indent, bucket, + count); + + strbuf_addch(buf, '\n'); +} + +static void fmt_pbin_hr(struct strbuf *buf, + int indent) +{ + int column0 = 28; + + if (indent) + strbuf_addchars(buf, ' ', indent); + + strbuf_addchars(buf, '-', column0 - indent); + strbuf_addstr(buf, "-+-"); + strbuf_addchars(buf, '-', 14); + + strbuf_addch(buf, '\n'); +} + +static void fmt_pbin_row(struct strbuf *buf, + int indent, + int nr, + int count) +{ + struct strbuf bucket = STRBUF_INIT; + int column0 = 28; + + if (indent) + strbuf_addchars(buf, ' ', indent); + + strbuf_addf(&bucket, "%2d", nr); + strbuf_addf(buf, "%-*s | %14d", + column0 - indent, bucket.buf, + count); + + strbuf_addch(buf, '\n'); + strbuf_release(&bucket); +} + +static void fmt_base_object(struct strbuf *buf, + int indent, + struct survey_stats_base_object *base) +{ + int indent1 = indent + 4; + + fmt_txt_pair_ui64(buf, indent, "Total Count", base->cnt_seen); + + strbuf_addch(buf, '\n'); + fmt_txt_line(buf, indent, "Count by Storage Location"); + if (base->cnt_missing) + fmt_txt_pair_ui64(buf, indent1, "Missing", base->cnt_missing); + if (base->cnt_cached) + fmt_txt_pair_ui64(buf, indent1, "Cached", base->cnt_cached); + if (base->cnt_loose) + fmt_txt_pair_ui64(buf, indent1, "Loose", base->cnt_loose); + if (base->cnt_packed) + fmt_txt_pair_ui64(buf, indent1, "Packed", base->cnt_packed); + if (base->cnt_dbcached) + fmt_txt_pair_ui64(buf, indent1, "DBCached", base->cnt_dbcached); + + strbuf_addch(buf, '\n'); + fmt_txt_pair_ui64(buf, indent, "Total Size in Bytes", base->sum_size); + fmt_txt_pair_ui64(buf, indent, "Total Disk Size in Bytes", base->sum_disk_size); + + fmt_hbin(buf, indent, "Histogram by Size in Bytes", "Byte Range", base->size_hbin); +} + +static void fmt_large_item_hdr(struct strbuf *buf, + int indent, + int name_length, + int name_rev_length, + const char *item_hdr_label) +{ + int column0 = the_hash_algo->hexsz; + + if (indent) + strbuf_addchars(buf, ' ', indent); + + strbuf_addf(buf, "%-*s | %14s", column0, "OID", item_hdr_label); + if (name_length) + strbuf_addf(buf, " | %-*s", name_length, "Name"); + strbuf_addf(buf, " | %-*s", name_rev_length, "Name Rev"); + + strbuf_addch(buf, '\n'); +} + +static void fmt_large_item_hr(struct strbuf *buf, + int indent, + int name_length, + int name_rev_length) +{ + int column0 = the_hash_algo->hexsz; + + if (indent) + strbuf_addchars(buf, ' ', indent); + + strbuf_addchars(buf, '-', column0); + strbuf_addstr(buf, "-+-"); + strbuf_addchars(buf, '-', 14); + if (name_length) { + strbuf_addstr(buf, "-+-"); + strbuf_addchars(buf, '-', name_length); + } + strbuf_addstr(buf, "-+-"); + strbuf_addchars(buf, '-', name_rev_length); + + strbuf_addch(buf, '\n'); +} + +static void fmt_large_item_row(struct strbuf *buf, + int indent, + int name_length, + int name_rev_length, + struct large_item *pitem) +{ + int column0 = the_hash_algo->hexsz; + + if (indent) + strbuf_addchars(buf, ' ', indent); + + strbuf_addf(buf, "%-*s | %14"PRIu64, + column0, oid_to_hex(&pitem->oid), + pitem->size); + if (name_length) + strbuf_addf(buf, " | %-*s", name_length, + (pitem->name ? 
pitem->name->buf: "")); + strbuf_addf(buf, " | %-*s", name_rev_length, pitem->name_rev->buf); + + strbuf_addch(buf, '\n'); +} + +static void fmt_large_item_vec(struct strbuf *buf, + int indent, + struct large_item_vec *pvec) +{ + int name_length = 0; + int name_rev_length = 10; + int k; + + if (pvec->type != OBJ_COMMIT) { + /* Add "Name" column for trees and blobs. */ + for (k = 0; k < pvec->nr_items; k++) + if (pvec->items[k].name && pvec->items[k].name->len > name_length) + name_length = pvec->items[k].name->len; + if (name_length) + if (name_length < 4) /* strlen("Name") */ + name_length = 4; + } + + for (k = 0; k < pvec->nr_items; k++) { + struct large_item *pk = &pvec->items[k]; + if (pk->name_rev->len > name_rev_length) + name_rev_length = pk->name_rev->len; + } + + strbuf_addch(buf, '\n'); + fmt_txt_line(buf, indent, pvec->labels_pretty->dimension); + fmt_large_item_hr(buf, indent, name_length, name_rev_length); + fmt_large_item_hdr(buf, indent, name_length, name_rev_length, pvec->labels_pretty->item); + fmt_large_item_hr(buf, indent, name_length, name_rev_length); + + for (k = 0; k < pvec->nr_items; k++) { + struct large_item *pk = &pvec->items[k]; + if (is_null_oid(&pk->oid)) + break; + + fmt_large_item_row(buf, indent, name_length, name_rev_length, pk); + } + + fmt_large_item_hr(buf, indent, name_length, name_rev_length); +} + +static void pretty_print_survey_hdr(void) +{ + struct strbuf buf = STRBUF_INIT; + int indent = 0; + int k; + + const char *intro[] = { + "", + "===============================================================================", + "Git Survey Results", + "===============================================================================", + "", + NULL + }; + + k = 0; + while (intro[k]) + fmt_txt_line(&buf, indent, intro[k++]); + + /* + * NEEDSWORK: Consider adding information about the repo pathname, + * the date, command line args, git version, etc. 
+ */ + + fwrite(buf.buf, 1, buf.len, stdout); + strbuf_release(&buf); +} + +static void pretty_print_overview(int indent) +{ + struct survey_stats_refs *prs = &survey_stats.refs; + struct survey_stats_commits *psc = &survey_stats.commits; + struct survey_stats_trees *pst = &survey_stats.trees; + struct survey_stats_blobs *psb = &survey_stats.blobs; + struct strbuf buf = STRBUF_INIT; + int indent1 = indent + 4; + int indent2 = indent + 8; + int k; + + const char *intro[] = { + "", + "OVERVIEW", + "-------------------------------------------------------------------------------", + "", + NULL + }; + + k = 0; + while (intro[k]) + fmt_txt_line(&buf, indent, intro[k++]); + + fmt_txt_pair_ui64(&buf, indent1, "Total Number of Refs", prs->cnt_total); + + fmt_size_tbl_caption(&buf, indent1, "Overview by Object Type"); + + fmt_size_tbl_hr(&buf, indent1); + fmt_size_tbl_hdr(&buf, indent1, "Type", "Count", "Size", "Disk Size"); + fmt_size_tbl_hr(&buf, indent1); + + fmt_size_tbl_row(&buf, indent2, "Commits", psc->base.cnt_seen, psc->base.sum_size, psc->base.sum_disk_size); + fmt_size_tbl_row(&buf, indent2, "Trees", pst->base.cnt_seen, pst->base.sum_size, pst->base.sum_disk_size); + fmt_size_tbl_row(&buf, indent2, "Blobs", psb->base.cnt_seen, psb->base.sum_size, psb->base.sum_disk_size); + + fmt_size_tbl_hr(&buf, indent1); + fmt_size_tbl_row(&buf, indent1, "Total", + psc->base.cnt_seen + pst->base.cnt_seen + psb->base.cnt_seen, + psc->base.sum_size + pst->base.sum_size + psb->base.sum_size, + psc->base.sum_disk_size + pst->base.sum_disk_size + psb->base.sum_disk_size); + fmt_size_tbl_hr(&buf, indent1); + + strbuf_addch(&buf, '\n'); + fwrite(buf.buf, 1, buf.len, stdout); + strbuf_release(&buf); +} + +/* + * Pretty print information on the set of REFS that we examined. 
+ */ +static void pretty_print_refs(int indent) +{ + struct survey_refs_wanted *prw = &survey_opts.refs; + struct survey_stats_refs *prs = &survey_stats.refs; + struct strbuf buf = STRBUF_INIT; + int indent1 = indent + 4; + int indent2 = indent + 8; + int indent3 = indent + 12; + int k; + + const char *intro[] = { + "", + "REFS", + "-------------------------------------------------------------------------------", + "", + NULL + }; + + k = 0; + while (intro[k]) + fmt_txt_line(&buf, indent, intro[k++]); + + fmt_txt_pair_ui64(&buf, indent1, "Total Number of Refs", prs->cnt_total); + + strbuf_addch(&buf, '\n'); + fmt_txt_line(&buf, indent1, "Reference Count by Type"); + + if (prw->want_remotes && prs->cnt_remotes) + fmt_txt_pair_ui64(&buf, indent2, "Remote Tracking Branches", prs->cnt_remotes); + + if (prw->want_branches && prs->cnt_branches) + fmt_txt_pair_ui64(&buf, indent2, "Branches", prs->cnt_branches); + if (prw->want_tags && prs->cnt_lightweight_tags) + fmt_txt_pair_ui64(&buf, indent2, "Tags (Lightweight)", prs->cnt_lightweight_tags); + if (prw->want_tags && prs->cnt_annotated_tags) + fmt_txt_pair_ui64(&buf, indent2, "Tags (Annotated)", prs->cnt_annotated_tags); + if (prw->want_detached && prs->cnt_detached) + fmt_txt_pair_ui64(&buf, indent2, "Detached", prs->cnt_detached); + if (prw->want_other && prs->cnt_other) + fmt_txt_pair_ui64(&buf, indent2, "Other (Notes and Stashes)", prs->cnt_other); + + if (prs->cnt_symref) + fmt_txt_pair_ui64(&buf, indent2, "Symbolic Refs (like 'HEAD')", prs->cnt_symref); + + strbuf_addch(&buf, '\n'); + fmt_txt_pair_ui64(&buf, indent1, "Reference Count by Class", strintmap_get_size(&prs->refsmap)); + { + struct hashmap_iter iter; + struct strmap_entry *entry; + + strintmap_for_each_entry(&prs->refsmap, &iter, entry) { + const char *key = entry->key; + intptr_t count = (intptr_t)entry->value; + int value = count; + + fmt_txt_pair_ui64(&buf, indent2, key, value); + } + } + + strbuf_addch(&buf, '\n'); + fmt_txt_line(&buf, indent1, "Reference Count by Storage Location"); + fmt_txt_pair_ui64(&buf, indent2, "Loose", prs->cnt_loose); + fmt_txt_pair_ui64(&buf, indent2, "Packed", prs->cnt_packed); + + strbuf_addch(&buf, '\n'); + fmt_txt_line(&buf, indent1, "String Length of Refnames"); + if (prs->len_sum_remote_refnames) { + fmt_txt_line(&buf, indent2, "Remote Refs"); + fmt_txt_pair_ui64(&buf, indent3, "Max", prs->len_max_remote_refname); + fmt_txt_pair_ui64(&buf, indent3, "Sum", prs->len_sum_remote_refnames); + } + if (prs->len_sum_local_refnames) { + fmt_txt_line(&buf, indent2, "Local Refs"); + fmt_txt_pair_ui64(&buf, indent3, "Max", prs->len_max_local_refname); + fmt_txt_pair_ui64(&buf, indent3, "Sum", prs->len_sum_local_refnames); + } + + strbuf_addch(&buf, '\n'); + fwrite(buf.buf, 1, buf.len, stdout); + strbuf_release(&buf); +} + +static void pretty_print_commits(int indent) +{ + struct survey_stats_commits *psc = &survey_stats.commits; + struct survey_stats_base_object *base = &psc->base; + struct strbuf buf = STRBUF_INIT; + int indent1 = indent + 4; + int k; + + const char *intro[] = { + "", + "COMMITS", + "-------------------------------------------------------------------------------", + "", + NULL + }; + + k = 0; + while (intro[k]) + fmt_txt_line(&buf, indent, intro[k++]); + + fmt_base_object(&buf, indent1, base); + + fmt_large_item_vec(&buf, indent1, psc->vec_largest_by_size_bytes); + + strbuf_addch(&buf, '\n'); + fmt_txt_line(&buf, indent1, "Histogram by Number of Parents"); + fmt_pbin_hr(&buf, indent1); + fmt_pbin_hdr(&buf, indent1, "Parents", "Count"); 
+ fmt_pbin_hr(&buf, indent1); + for (k = 0; k < PBIN_VEC_LEN; k++) + if (psc->parent_cnt_pbin[k]) + fmt_pbin_row(&buf, indent1, k, psc->parent_cnt_pbin[k]); + fmt_pbin_hr(&buf, indent1); + + fmt_large_item_vec(&buf, indent1, psc->vec_largest_by_nr_parents); + + strbuf_addch(&buf, '\n'); + fwrite(buf.buf, 1, buf.len, stdout); + strbuf_release(&buf); +} + +static void pretty_print_trees(int indent) +{ + struct survey_stats_trees *pst = &survey_stats.trees; + struct survey_stats_base_object *base = &pst->base; + struct strbuf buf = STRBUF_INIT; + int indent1 = indent + 4; + int k; + + const char *intro[] = { + "", + "TREES", + "-------------------------------------------------------------------------------", + "", + NULL + }; + + k = 0; + while (intro[k]) + fmt_txt_line(&buf, indent, intro[k++]); + + fmt_base_object(&buf, indent1, base); + + fmt_large_item_vec(&buf, indent1, pst->vec_largest_by_size_bytes); + + fmt_qbin(&buf, indent1, "Tree Histogram by Number of Entries", "Entry Range", pst->entry_qbin); + fmt_large_item_vec(&buf, indent1, pst->vec_largest_by_nr_entries); + + strbuf_addch(&buf, '\n'); + fwrite(buf.buf, 1, buf.len, stdout); + strbuf_release(&buf); +} + +static void pretty_print_blobs(int indent) +{ + struct survey_stats_blobs *psb = &survey_stats.blobs; + struct survey_stats_base_object *base = &psb->base; + struct strbuf buf = STRBUF_INIT; + int indent1 = indent + 4; + int k; + + const char *intro[] = { + "", + "BLOBS", + "-------------------------------------------------------------------------------", + "", + NULL + }; + + k = 0; + while (intro[k]) + fmt_txt_line(&buf, indent, intro[k++]); + + fmt_base_object(&buf, indent1, base); + + fmt_large_item_vec(&buf, indent1, psb->vec_largest_by_size_bytes); + + strbuf_addch(&buf, '\n'); + fwrite(buf.buf, 1, buf.len, stdout); + strbuf_release(&buf); +} + /* * Print all of the stats that we have collected in a more pretty format. 
*/ static void survey_print_results_pretty(void) { - printf("TODO....\n"); + pretty_print_survey_hdr(); + pretty_print_overview(0); + pretty_print_refs(0); + pretty_print_commits(0); + pretty_print_trees(0); + pretty_print_blobs(0); } int cmd_survey(int argc, const char **argv, const char *prefix) @@ -1517,38 +2240,11 @@ int cmd_survey(int argc, const char **argv, const char *prefix) survey_opts.show_progress = isatty(2); fixup_refs_wanted(); - if (survey_opts.show_largest_commits_by_nr_parents) - survey_stats.commits.vec_largest_by_nr_parents = - alloc_large_item_vec( - "largest_commits_by_nr_parents", - "nr_parents", - survey_opts.show_largest_commits_by_nr_parents); - if (survey_opts.show_largest_commits_by_size_bytes) - survey_stats.commits.vec_largest_by_size_bytes = - alloc_large_item_vec( - "largest_commits_by_size_bytes", - "size", - survey_opts.show_largest_commits_by_size_bytes); - - if (survey_opts.show_largest_trees_by_nr_entries) - survey_stats.trees.vec_largest_by_nr_entries = - alloc_large_item_vec( - "largest_trees_by_nr_entries", - "nr_entries", - survey_opts.show_largest_trees_by_nr_entries); - if (survey_opts.show_largest_trees_by_size_bytes) - survey_stats.trees.vec_largest_by_size_bytes = - alloc_large_item_vec( - "largest_trees_by_size_bytes", - "size", - survey_opts.show_largest_trees_by_size_bytes); - - if (survey_opts.show_largest_blobs_by_size_bytes) - survey_stats.blobs.vec_largest_by_size_bytes = - alloc_large_item_vec( - "largest_blobs_by_size_bytes", - "size", - survey_opts.show_largest_blobs_by_size_bytes); + alloc_commit_by_parents(); + alloc_commit_by_size(); + alloc_tree_by_entries(); + alloc_tree_by_size(); + alloc_blob_by_size(); survey_phase_refs(the_repository); From acf0691e80510dbd1e85f72952971b23703b8b24 Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Wed, 29 May 2024 11:12:24 -0400 Subject: [PATCH 13/18] t8100: create test for git-survey Signed-off-by: Jeff Hostetler --- t/t8100-git-survey.sh | 98 +++++++ t/t8100/survey_parse_json.perl | 520 +++++++++++++++++++++++++++++++++ 2 files changed, 618 insertions(+) create mode 100755 t/t8100-git-survey.sh create mode 100644 t/t8100/survey_parse_json.perl diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh new file mode 100755 index 00000000000000..f74e8743b6b746 --- /dev/null +++ b/t/t8100-git-survey.sh @@ -0,0 +1,98 @@ +#!/bin/sh + +test_description='measure repository and report on scaling dimensions' + +. ./test-lib.sh + +perl -MJSON::PP -e 0 >/dev/null 2>&1 && test_set_prereq JSON_PP + +test_expect_success 'verify zero counts before initial commit' ' + test_when_finished "rm -rf data.json actual* expect*" && + + git survey --json >data.json && + + # Verify that there are no refs and no objects of any kind. + # + perl "$TEST_DIRECTORY/t8100/survey_parse_json.perl" count actual.count && + cat >expect.count <<-\EOF && + refs.count:0 + commits.count:0 + trees.count:0 + blobs.count:0 + EOF + test_cmp expect.count actual.count && + + # Verify that each of the histograms and large-item arrays are empty. + # This is mainly to test the perl script, since `git survey` will generate + # JSON with empty objects and arrays and will get parsed into empty hashes + # and arrays which behave differently in perl. 
+ # + perl "$TEST_DIRECTORY/t8100/survey_parse_json.perl" \ + commits.mostparents \ + commits.histparents \ + trees.histentries \ + trees.mostentries \ + blobs.histsize \ + blobs.largest \ + actual.empty && + cat >expect.empty <<-\EOF && + EOF + test_cmp expect.empty actual.empty +' + +test_expect_success 'initial commit' ' + test_when_finished "rm -rf data.json actual* expect*" && + + touch file0 && + git add file* && + git commit -m "initial" && + + git survey --json >data.json && + + perl "$TEST_DIRECTORY/t8100/survey_parse_json.perl" count actual.count && + cat >expect.count <<-\EOF && + refs.count:1 + commits.count:1 + trees.count:1 + blobs.count:1 + EOF + test_cmp expect.count actual.count && + + perl "$TEST_DIRECTORY/t8100/survey_parse_json.perl" commits.mostparents actual-mp && + cat >expect-mp <<-\EOF && + commits.mostparents[0].nr_parents:0 + EOF + test_cmp expect-mp actual-mp && + + perl "$TEST_DIRECTORY/t8100/survey_parse_json.perl" commits.histparents actual-hp && + cat >expect-hp <<-\EOF && + commits.histparents[P00].count:1 + EOF + test_cmp expect-hp actual-hp && + + perl "$TEST_DIRECTORY/t8100/survey_parse_json.perl" trees.histentries actual-he && + cat >expect-he <<-\EOF && + trees.histentries.Q00.count:1 + EOF + test_cmp expect-he actual-he && + + perl "$TEST_DIRECTORY/t8100/survey_parse_json.perl" trees.mostentries actual-me && + cat >expect-me <<-\EOF && + trees.mostentries[0].nr_entries:1 + EOF + test_cmp expect-me actual-me && + + perl "$TEST_DIRECTORY/t8100/survey_parse_json.perl" blobs.histsize actual-hs && + cat >expect-hs <<-\EOF && + blobs.histsize.H0.count:1 + EOF + test_cmp expect-hs actual-hs && + + perl "$TEST_DIRECTORY/t8100/survey_parse_json.perl" blobs.largest actual-lb && + cat >expect-lb <<-\EOF && + blobs.largest[0].size:0 + EOF + test_cmp expect-lb actual-lb +' + +test_done diff --git a/t/t8100/survey_parse_json.perl b/t/t8100/survey_parse_json.perl new file mode 100644 index 00000000000000..5ac1d4dc732536 --- /dev/null +++ b/t/t8100/survey_parse_json.perl @@ -0,0 +1,520 @@ +#!/usr/bin/perl +# +# Parse the JSON output generated by `git survey --json` to +# support the actual unit tests in the shell script. + +use strict; +use warnings; +use JSON::PP; +use Data::Dumper; + +$Data::Dumper::Sortkeys = 1; +$Data::Dumper::Indent = 1; +$Data::Dumper::Purity = 1; +$Data::Dumper::Pair = ':'; + +my $stdin = join("", ); +my $data = decode_json $stdin; + +#my $dump = Dumper($data); +#print $dump; + +# Create a series of functions / command line args to extract certain +# key values so that the shell script can verify them. +# +# (1) The full JSON data set contains too much data to sanely test in +# a shell script +# +# (2) Some JSON fields are fundamental/foundational, like the number +# of objects, the size of the largest item, or the pathname of the +# largest item. But others are transient, like whether an object or +# ref is packed or loose. And then there are some really transient +# values, like the SHAs of commits when we don't control for the +# data/time. So for simplicity our shell script test will verify the +# basics and not try to do an exact match on the entire data set. +# +# (3) Most of the functionality in `git survey` comes from the various +# existing iterators, for example to enumerate the desired set of refs +# and to treewalk the set of reachable commits, trees, and blobs and +# we are just using iterator callbacks to collect data on the repo. 
+# We do not need to verify the correctness of the iterator code; we +# just need to verify that we've used it properly when we collected +# our stats. + +# Print various '....count' values from the JSON data. +# +# We assume that the JSON looks like: +# +# { +# ... +# "refs": { +# "count": 3545, +# ... +# }, +# "commits": { +# "count": 197615, +# ... +# }, +# "trees": { +# "count": 331409, +# ... +# }, +# "blobs": { +# "count": 191847, +# ... +# }, +# ... +# } +# +# And we want to emit: +# +# refs.count:3545 +# commits.count:197615 +# trees.count:331409 +# blobs.count:191847 +# +sub Count { + print "refs.count:$data->{'refs'}->{'count'}\n"; + print "commits.count:$data->{'commits'}->{'count'}\n"; + print "trees.count:$data->{'trees'}->{'count'}\n"; + print "blobs.count:$data->{'blobs'}->{'count'}\n"; +} + +# We currently do not expose the "commits.dist_by_size.*" histogram +# for testing. The data is valid, but sensitive to the length of the +# SHAs of the parent commits and root tree and the length of the text +# of the commit message. This is not very interesting and we'll test +# the histogram construction for the other types of objects. +# +# { +# ... +# "commits": { +# "count": 197615, +# ... +# "dist_by_size": { +# "H1": { +# "count": 2268, +# "sum_size": 549925, +# "sum_disk_size": 388778, +# "hbin_lower": 16, +# "hbin_upper": 255 +# }, +# "H2": { +# "count": 194926, +# "sum_size": 138557614, +# "sum_disk_size": 76535965, +# "hbin_lower": 256, +# "hbin_upper": 4095 +# }, +# ... +# }, +# ... +# }, + +# We also do not expose the "commits.largest_commits_by_size_bytes" +# array for testing. This is also sensitive to the length of the SHAs +# and the commit message. We'll explore the histogram construction in +# other types of objects below. +# +# { +# ... +# "commits": { +# "count": 197615, +# ... +# "largest_commits_by_size_bytes": [ +# { +# "size": 78970, +# "oid": "0ab955aac3217bdc64a5df6dd747e8a2238f0473", +# ... +# }, +# { +# "size": 25831, +# "oid": "e74f1e05be5adb88b1d3b282fa500e15b3b04aa7", +# ... +# }, +# ... +# }, +# ... +# }, + + +# Print details for "Largest Commits by Number of Parents". This is +# an array sorted in descending order. For multiple commits with the +# same number of parents, the relative order is undefined. +# +# We assume that the JSON looks like: +# +# { +# ... +# "commits": { +# "count": 197615, +# ... +# "largest_commits_by_nr_parents": [ +# { +# "nr_parents": 10, +# "oid": "16d7601e176cd53f3c2f02367698d06b85e08879", +# ... +# }, +# { +# "nr_parents": 6, +# "oid": "d425142e2a045a9dd7879d028ec68bd748df48a3", +# ... +# }, +# ... +# ], +# ... +# }, +# +# And we want to emit: +# +# commits.mostparents[0].nr_parents:10 +# commits.mostparents[1].nr_parents:6 +# ... +# +sub CommitsMostParents { + my $nr_items = scalar @{ $data->{'commits'}->{'largest_commits_by_nr_parents'} }; + if ($nr_items == 0) { + return 0; + } + my @arr = @{ $data->{'commits'}->{'largest_commits_by_nr_parents'} }; + my $k; + for ($k=0; $k < $nr_items; $k++) { + print "commits.mostparents[$k].nr_parents:$arr[$k]->{'nr_parents'}\n"; + } +} + +# Print details of the "Histogram by Number of Parents" data. +# +# We assume that the JSON looks like: +# +# { +# ... +# "commits": { +# "count": 197615, +# ... +# "count_by_nr_parents": { +# "P00": 13, +# "P01": 148603, +# "P02": 48950, +# "P03": 37, +# "P04": 7, +# "P05": 3, +# "P06": 1, +# "P10": 1 +# } +# }, +# ... +# } +# +# And we want to emit: +# +# commits.histparents[P00].count:13 +# commits.histparents[P01].count:148603 +# ... 
+# +sub CommitsHistParents { + my $nr_buckets = keys %{ $data->{'commits'}->{'count_by_nr_parents'} }; + if ($nr_buckets == 0) { + return 0; + } + my %dist = %{ $data->{'commits'}->{'count_by_nr_parents'} }; + foreach my $key ( sort keys %dist ) { + my $value = $dist{$key}; + print "commits.histparents[$key].count:$value\n"; + } +} + +# We currently do not expose the "trees.dist_by_size" histogram for +# testing. The data is valid, but sensitive to the length of a SHA +# and the filenames in the tree. That makes it a little trickier to +# test and probably not worth the bother (since we'll test the +# histogram setup code with the "trees.dist_by_nr_entries" and the +# histogram size code in the "blobs.dist_by_size" cases. +# +# { +# ... +# "trees": { +# "count": 331409, +# ... +# "dist_by_size": { +# "H1": { +# "count": 13349, +# "sum_size": 1953155, +# "sum_disk_size": 912044, +# "hbin_lower": 16, +# "hbin_upper": 255 +# }, +# "H2": { +# "count": 52677, +# "sum_size": 101507410, +# "sum_disk_size": 6549425, +# "hbin_lower": 256, +# "hbin_upper": 4095 +# }, +# ... +# }, +# ... +# }, +# ... +# } + +# We also do not expose the "trees.largest_trees_by_size" array for +# testing (for the same SHA and filename reasons). We'll assume that +# the same code is used to build the array of largest blobs. +# +# { +# ... +# "trees": { +# "count": 331409, +# ... +# "largest_trees_by_size_bytes": [ +# { +# "size": 58487, +# "oid": "140160ee18ed56aeaf5e028c60e01874faa9c12d", +# "name": "t", +# ... +# }, +# { +# "size": 58487, +# "oid": "2d5af5733ab1061aae9a7babaabf9064783e3891", +# "name": "t", +# ... +# }, +# ... +# }, +# ... +# }, +# ... +# } + +# Print details for "Histogram by Number of Entries" for trees. For +# example, the bucket `Q00` contains the count of the trees that have +# between 0 and 3 files/subdirectories. +# +# We assume that the JSON looks like: +# +# { +# ... +# "trees": { +# "count": 331409, +# "sum_size": 5376309652, +# ... +# "dist_by_nr_entries": { +# "Q00": { +# "count": 5798, +# "sum_size": 480428, +# "sum_disk_size": 390478, +# "qbin_lower": 0, +# "qbin_upper": 3 +# }, +# "Q01": { +# "count": 15217, +# "sum_size": 4587357, +# "sum_disk_size": 1177431, +# "qbin_lower": 4, +# "qbin_upper": 15 +# }, +# ... +# "Q05": { +# "count": 12965, +# "sum_size": 714372748, +# "sum_disk_size": 11298665, +# "qbin_lower": 1024, +# "qbin_upper": 4095 +# }, +# ... +# } +# }, +# ... +# } +# +# And we want to emit: +# +# trees.histentries.Q00.count:5798 +# trees.histentries.Q01.count:15217 +# ... +# trees.histentries.Q05.count:12965 +# ... +# +sub TreesHistEntries { + my $nr_buckets = keys %{ $data->{'trees'}->{'dist_by_nr_entries'} }; + if ($nr_buckets == 0) { + return 0; + } + my %dist = %{ $data->{'trees'}->{'dist_by_nr_entries'} }; + foreach my $key ( sort keys %dist ) { + my $value = $dist{$key}; + print "trees.histentries.$key.count:$value->{'count'}\n"; + } +} + +# Print details for "Largest Trees by Number of Entries". This is an +# array sorted in descending order. For multiple trees with the same +# number of entries, the relative order is undefined. +# +# We assume that the JSON looks like: +# +# { +# ... +# "trees": { +# "count": 331409, +# ... +# "largest_trees_by_nr_entries": [ +# { +# "nr_entries": 1148, +# "oid": "140160ee18ed56aeaf5e028c60e01874faa9c12d", +# "name": "t", +# ... +# }, +# { +# "nr_entries": 942, +# "oid": "2d5af5733ab1061aae9a7babaabf9064783e3891", +# "name": "t", +# ... +# }, +# ... +# ], +# ... +# }, +# ... 
+# } +# +# And we want to emit: +# +# trees.mostentries[0].nr_entries:1148 +# trees.mostentries[1].nr_entries:942 +# ... +# +sub TreesMostEntries { + my $nr_items = scalar @{ $data->{'trees'}->{'largest_trees_by_nr_entries'} }; + if ($nr_items == 0) { + return 0; + } + my @arr = @{ $data->{'trees'}->{'largest_trees_by_nr_entries'} }; + my $k; + for ($k=0; $k < $nr_items; $k++) { + print "trees.mostentries[$k].nr_entries:$arr[$k]->{'nr_entries'}\n"; + } +} + +# Print details for the "Histogram by Size in Bytes" for blobs. +# +# We assume that the JSON looks like: +# +# { +# ... +# "blobs": { +# "count": 191847, +# ... +# "dist_by_size": { +# "H0": { +# "count": 47, +# "sum_size": 433, +# "sum_disk_size": 856, +# "hbin_lower": 0, +# "hbin_upper": 15 +# }, +# "H1": { +# "count": 2045, +# "sum_size": 224602, +# "sum_disk_size": 145658, +# "hbin_lower": 16, +# "hbin_upper": 255 +# }, +# ... +# } +# }, +# ... +# } +# +# And we want to emit: +# +# blobs.histsize.H0.count:47 +# blobs.histsize.H1.count:2045 +# ... +# +sub BlobsHistSize { + my $nr_buckets = keys %{ $data->{'blobs'}->{'dist_by_size'} }; + if ($nr_buckets == 0) { + return 0; + } + my %dist = %{ $data->{'blobs'}->{'dist_by_size'} }; + foreach my $key ( sort keys %dist ) { + my $value = $dist{$key}; + print "blobs.histsize.$key.count:$value->{'count'}\n"; + } +} + +# Print details for the "Largest Blobs by Size in Bytes" table. +# This is an array sorted in descending order. If there are multiple +# blobs with the same size, the relative order is undefined. +# +# We assume that the JSON looks like: +# +# { +# ... +# "blobs": { +# "count": 191847, +# ... +# "largest_blobs_by_size_bytes": [ +# { +# "size": 10577552, +# "oid": "667824451d9202e721b6d9413ce4c6b7ce58c36e", +# ... +# }, +# { +# "size": 6655520, +# "oid": "78bcd7f596df79b580e793957928be457a61c3f5", +# ... +# }, +# ... +# ], +# }, +# ... +# } +# +# And we want to emit: +# +# blobs.largest[0].size:10577552 +# blobs.largest[1].size:6655520 +# ... +# +sub BlobsLargest { + my $nr_items = scalar @{ $data->{'blobs'}->{'largest_blobs_by_size_bytes'} }; + if ($nr_items == 0) { + return 0; + } + my @arr = @{ $data->{'blobs'}->{'largest_blobs_by_size_bytes'} }; + my $k; + for ($k=0; $k < $nr_items; $k++) { + print "blobs.largest[$k].size:$arr[$k]->{'size'}\n"; + } +} + +foreach my $arg_k(@ARGV) { + if ($arg_k eq 'count') { + Count; + } + elsif ($arg_k eq 'commits.mostparents') { + CommitsMostParents; + } + elsif ($arg_k eq 'commits.histparents') { + CommitsHistParents; + } + elsif ($arg_k eq 'trees.histentries') { + TreesHistEntries; + } + elsif ($arg_k eq 'trees.mostentries') { + TreesMostEntries; + } + elsif ($arg_k eq 'blobs.histsize') { + BlobsHistSize; + } + elsif ($arg_k eq 'blobs.largest') { + BlobsLargest; + } + else { + print "ERROR: unknown command '$arg_k'\n"; + exit 1; + } +} From 440477b72462bfb1c79c3b413bb082edd53679cf Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Tue, 4 Jun 2024 10:37:23 -0400 Subject: [PATCH 14/18] survey: add --no-name-rev option Computing `git name-rev` on each commit, tree, and blob in each of the various large_item_vec can be very expensive if there are too many refs, especially if the user doesn't need the result. Lets make it optional. The `--no-name-rev` option can save 50 calls to `git name-rev` since we have 5 large_item_vec's and each defaults to 10 items. 
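For example (illustrative usage only; the option and the `survey.namerev`
config key are the ones introduced by this patch), a user who does not
need the name-rev annotations can skip the lookups for a single run or
persistently:

    $ git survey --no-name-rev
    $ git config survey.namerev false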
Signed-off-by: Jeff Hostetler --- Documentation/config/survey.txt | 5 +++ Documentation/git-survey.txt | 4 +++ builtin/survey.c | 59 +++++++++++++++++++++++++-------- 3 files changed, 54 insertions(+), 14 deletions(-) diff --git a/Documentation/config/survey.txt b/Documentation/config/survey.txt index 672e7890ed2f79..857c1c3fff2d6a 100644 --- a/Documentation/config/survey.txt +++ b/Documentation/config/survey.txt @@ -1,3 +1,8 @@ +survey.namerev:: + Boolean to show/hide `git name-rev` information for + each reported commit and the containing commit of each + reported tree and blob. + survey.progress:: Boolean to show/hide progress information. Defaults to true when interactive (stderr is bound to a TTY). diff --git a/Documentation/git-survey.txt b/Documentation/git-survey.txt index 0e42fb32ac2964..7be11e4683822d 100644 --- a/Documentation/git-survey.txt +++ b/Documentation/git-survey.txt @@ -35,6 +35,10 @@ OPTIONS --json:: Print results in JSON rather than in a human-friendly format. +--[no-]name-rev:: + Print `git name-rev` output for each commit, tree, and blob. + Defaults to true. + Ref Selection ~~~~~~~~~~~~~ diff --git a/builtin/survey.c b/builtin/survey.c index d2e53885fd5820..81019a34d8616b 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -57,6 +57,7 @@ struct survey_opts { int verbose; int show_progress; int show_json; + int show_name_rev; int show_largest_commits_by_nr_parents; int show_largest_commits_by_size_bytes; @@ -75,6 +76,7 @@ static struct survey_opts survey_opts = { .verbose = 0, .show_progress = -1, /* defaults to isatty(2) */ .show_json = 0, /* defaults to pretty */ + .show_name_rev = 1, /* * Show the largest `n` objects for some scaling dimension. @@ -157,6 +159,7 @@ static struct option survey_options[] = { OPT__VERBOSE(&survey_opts.verbose, N_("verbose output")), OPT_BOOL(0, "progress", &survey_opts.show_progress, N_("show progress")), OPT_BOOL(0, "json", &survey_opts.show_json, N_("report stats in JSON")), + OPT_BOOL(0, "name-rev", &survey_opts.show_name_rev, N_("run name-rev on each reported commit")), OPT_BOOL_F(0, "all-refs", &survey_opts.refs.want_all_refs, N_("include all refs"), PARSE_OPT_NONEG), @@ -192,6 +195,10 @@ static int survey_load_config_cb(const char *var, const char *value, survey_opts.show_json = git_config_bool(var, value); return 0; } + if (!strcmp(var, "survey.namerev")) { + survey_opts.show_name_rev = git_config_bool(var, value); + return 0; + } if (!strcmp(var, "survey.showcommitparents")) { survey_opts.show_largest_commits_by_nr_parents = git_config_ulong(var, value, ctx->kvi); @@ -1188,6 +1195,13 @@ static void do_calc_stats_refs(struct repository *r, struct ref_array *ref_array static void do_lookup_name_rev(void) { + /* + * `git name-rev` can be very expensive when there are lots of + * refs, so make it optional. 
+	 */
+	if (!survey_opts.show_name_rev)
+		return;
+
 	if (survey_opts.show_progress) {
 		survey_progress_total = 0;
 		survey_progress = start_progress(_("Resolving name-revs..."), 0);
@@ -1235,9 +1249,11 @@ static void survey_phase_refs(struct repository *r)
 	do_calc_stats_refs(r, &ref_array);
 	trace2_region_leave("survey", "phase/calcstats", the_repository);
 
-	trace2_region_enter("survey", "phase/namerev", the_repository);
-	do_lookup_name_rev();
-	trace2_region_enter("survey", "phase/namerev", the_repository);
+	if (survey_opts.show_name_rev) {
+		trace2_region_enter("survey", "phase/namerev", the_repository);
+		do_lookup_name_rev();
+		trace2_region_leave("survey", "phase/namerev", the_repository);
+	}
 
 	ref_array_clear(&ref_array);
 }
@@ -1475,7 +1491,9 @@ static void write_large_item_vec_json(struct json_writer *jw,
 		if (!is_null_oid(&pk->containing_commit_oid))
 			jw_object_string(jw, "commit_oid",
 					 oid_to_hex(&pk->containing_commit_oid));
-		if (pk->name_rev->len)
+		if (survey_opts.show_name_rev &&
+		    pk->name_rev &&
+		    pk->name_rev->len)
 			jw_object_string(jw, "name_rev", pk->name_rev->buf);
 }
@@ -1862,7 +1880,8 @@ static void fmt_large_item_hdr(struct strbuf *buf,
 	strbuf_addf(buf, "%-*s | %14s", column0, "OID", item_hdr_label);
 	if (name_length)
 		strbuf_addf(buf, " | %-*s", name_length, "Name");
-	strbuf_addf(buf, " | %-*s", name_rev_length, "Name Rev");
+	if (name_rev_length)
+		strbuf_addf(buf, " | %-*s", name_rev_length, "Commit / Name Rev");
 	strbuf_addch(buf, '\n');
 }
@@ -1884,8 +1903,10 @@ static void fmt_large_item_hr(struct strbuf *buf,
 		strbuf_addstr(buf, "-+-");
 		strbuf_addchars(buf, '-', name_length);
 	}
-	strbuf_addstr(buf, "-+-");
-	strbuf_addchars(buf, '-', name_rev_length);
+	if (name_rev_length) {
+		strbuf_addstr(buf, "-+-");
+		strbuf_addchars(buf, '-', name_rev_length);
+	}
 	strbuf_addch(buf, '\n');
 }
@@ -1907,7 +1928,11 @@ static void fmt_large_item_row(struct strbuf *buf,
 	if (name_length)
 		strbuf_addf(buf, " | %-*s", name_length,
 			    (pitem->name ? pitem->name->buf: ""));
-	strbuf_addf(buf, " | %-*s", name_rev_length, pitem->name_rev->buf);
+	if (name_rev_length)
+		strbuf_addf(buf, " | %-*s", name_rev_length,
+			    ((pitem->name_rev)
+			     ? pitem->name_rev->buf
+			     : oid_to_hex(&pitem->containing_commit_oid)));
 	strbuf_addch(buf, '\n');
 }
@@ -1917,11 +1942,11 @@ static void fmt_large_item_vec(struct strbuf *buf,
 			       struct large_item_vec *pvec)
 {
 	int name_length = 0;
-	int name_rev_length = 10;
+	int name_rev_length = 0;
 	int k;
 
 	if (pvec->type != OBJ_COMMIT) {
-		/* Add "Name" column for trees and blobs. */
+		/* Add "Name" column for trees and blobs. This is the relative pathname. */
+		for (k = 0; k < pvec->nr_items; k++)
+			if (pvec->items[k].name && pvec->items[k].name->len > name_length)
+				name_length = pvec->items[k].name->len;
@@ -1930,10 +1955,16 @@ static void fmt_large_item_vec(struct strbuf *buf,
 			name_length = 4;
 	}
 
-	for (k = 0; k < pvec->nr_items; k++) {
-		struct large_item *pk = &pvec->items[k];
-		if (pk->name_rev->len > name_rev_length)
-			name_rev_length = pk->name_rev->len;
+	if (survey_opts.show_name_rev) {
+		name_rev_length = 17; /* strlen("Commit / Name Rev") */
+		for (k = 0; k < pvec->nr_items; k++) {
+			struct large_item *pk = &pvec->items[k];
+			if (pk->name_rev && pk->name_rev->len > name_rev_length)
+				name_rev_length = pk->name_rev->len;
+		}
+	} else if (pvec->type != OBJ_COMMIT) {
+		/* for trees and blobs, just show containing commit OID */
+		name_rev_length = the_hash_algo->hexsz;
 	}
 
 	strbuf_addch(buf, '\n');

From 31ecb35d3f1c7f81717a266312e064f48fa354a4 Mon Sep 17 00:00:00 2001
From: Jeff Hostetler
Date: Mon, 17 Jun 2024 15:20:05 -0400
Subject: [PATCH 15/18] survey: started TODO list at bottom of source file

---
 builtin/survey.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/builtin/survey.c b/builtin/survey.c
index 81019a34d8616b..0aca935a8e95de 100644
--- a/builtin/survey.c
+++ b/builtin/survey.c
@@ -2294,3 +2294,49 @@ int cmd_survey(int argc, const char **argv, const char *prefix)
 
 	return 0;
 }
+
+/*
+ * NEEDSWORK: The following is a bit of a laundry list of things
+ * that I'd like to add.
+ *
+ * [] Dump stats on all of the packfiles. The number and size of each.
+ *    Whether each is in the .git directory or in an alternate. The state
+ *    of the IDX or MIDX files and etc. Delta chain stats. All of this
+ *    data is relative to the "lived-in" state of the repository. Stuff
+ *    that may change after a GC or repack.
+ *
+ * [] Dump stats on each remote. When we fetch from a remote the size
+ *    of the response is related to the set of haves on the server. You
+ *    can see this in `GIT_TRACE_CURL=1 git fetch`. We get a `ls-refs`
+ *    payload that lists all of the branches and tags on the server, so
+ *    at a minimum the RefName and SHA for each. But for annotated tags
+ *    we also get the peeled SHA. The size of this overhead on every
+ *    fetch is proportional to the size of the `git ls-remote` response
+ *    (roughly, although the latter repeats the RefName of the peeled
+ *    tag). If, for example, you have 500K refs on a remote, you're
+ *    going to have a long "haves" message, so every fetch will be slow
+ *    just because of that overhead (not counting new objects to be
+ *    downloaded).
+ *
+ *    Note that the local set of tags in "refs/tags/" is a union over all
+ *    remotes. However, since most people only have one remote, we can
+ *    probably estimate the overhead value directly from the size of the
+ *    set of "refs/tags/" that we visited while building the `ref_info`
+ *    and `ref_array` and not need to ask the remote.
+ *
+ * [] Dump info on the complexity of the DAG. Criss-cross merges.
+ *    The number of edges that must be touched to compute merge bases.
+ *    Edge length. The number of parallel lanes in the history that must
+ *    be navigated to get to the merge base. What affects the cost of
+ *    the Ahead/Behind computation? How often do criss-crosses occur and
+ *    do they cause various operations to slow down?
+ *
+ * [] If there are primary branches (like "main" or "master") are they
+ *    always on the left side of merges? Does the graph have a clean
+ *    left edge? Or are there normal and "backwards" merges? Do these
+ *    cause problems at scale?
+ *
+ * [] If we have a hierarchy of FI/RI branches like "L1", "L2", ...,
+ *    can we learn anything about the shape of the repo around these FI
+ *    and RI integrations?
+ */

From 630e4a624609b68a3ab612642237e335c3213030 Mon Sep 17 00:00:00 2001
From: Jeff Hostetler
Date: Fri, 28 Jun 2024 15:22:46 -0400
Subject: [PATCH 16/18] survey: expanded TODO list at the bottom of the source
 file

Signed-off-by: Jeff Hostetler
---
 builtin/survey.c | 148 +++++++++++++++++++++++++++++++++++++----------
 1 file changed, 116 insertions(+), 32 deletions(-)

diff --git a/builtin/survey.c b/builtin/survey.c
index 0aca935a8e95de..91ac0bd45dadf6 100644
--- a/builtin/survey.c
+++ b/builtin/survey.c
@@ -2296,47 +2296,131 @@ int cmd_survey(int argc, const char **argv, const char *prefix)
 }
 
 /*
- * NEEDSWORK: The following is a bit of a laundry list of things
- * that I'd like to add.
+ * NEEDSWORK: So far, I only have iteration on the requested set of
+ * refs and treewalk/reachable objects on that set of refs. The
+ * following is a bit of a laundry list of things that I'd like to
+ * add.
 *
 * [] Dump stats on all of the packfiles. The number and size of each.
- *    Whether each is in the .git directory or in an alternate. The state
- *    of the IDX or MIDX files and etc. Delta chain stats. All of this
- *    data is relative to the "lived-in" state of the repository. Stuff
- *    that may change after a GC or repack.
+ *    Whether each is in the .git directory or in an alternate. The
+ *    state of the IDX or MIDX files and etc. Delta chain stats. All
+ *    of this data is relative to the "lived-in" state of the
+ *    repository. Stuff that may change after a GC or repack.
+ *
+ * [] Clone and Index stats. partial, shallow, sparse-checkout,
+ *    sparse-index, etc. Hydration stats.
 *
 * [] Dump stats on each remote. When we fetch from a remote the size
- *    of the response is related to the set of haves on the server. You
- *    can see this in `GIT_TRACE_CURL=1 git fetch`. We get a `ls-refs`
- *    payload that lists all of the branches and tags on the server, so
- *    at a minimum the RefName and SHA for each. But for annotated tags
- *    we also get the peeled SHA. The size of this overhead on every
- *    fetch is proportional to the size of the `git ls-remote` response
- *    (roughly, although the latter repeats the RefName of the peeled
- *    tag). If, for example, you have 500K refs on a remote, you're
- *    going to have a long "haves" message, so every fetch will be slow
- *    just because of that overhead (not counting new objects to be
- *    downloaded).
+ *    of the response is related to the set of haves on the server.
+ *    You can see this in `GIT_TRACE_CURL=1 git fetch`. We get a
+ *    `ls-refs` payload that lists all of the branches and tags on the
+ *    server, so at a minimum the RefName and SHA for each. But for
+ *    annotated tags we also get the peeled SHA. The size of this
+ *    overhead on every fetch is proportional to the size of the `git
+ *    ls-remote` response (roughly, although the latter repeats the
+ *    RefName of the peeled tag). If, for example, you have 500K refs
+ *    on a remote, you're going to have a long "haves" message, so
+ *    every fetch will be slow just because of that overhead (not
+ *    counting new objects to be downloaded).
 *
- *    Note that the local set of tags in "refs/tags/" is a union over all
- *    remotes. However, since most people only have one remote, we can
- *    probably estimate the overhead value directly from the size of the
- *    set of "refs/tags/" that we visited while building the `ref_info`
- *    and `ref_array` and not need to ask the remote.
+ *    Note that the local set of tags in "refs/tags/" is a union over
+ *    all remotes. However, since most people only have one remote,
+ *    we can probably estimate the overhead value directly from the
+ *    size of the set of "refs/tags/" that we visited while building
+ *    the `ref_info` and `ref_array` and not need to ask the remote.
 *
 * [] Dump info on the complexity of the DAG. Criss-cross merges.
- *    The number of edges that must be touched to compute merge bases.
- *    Edge length. The number of parallel lanes in the history that must
- *    be navigated to get to the merge base. What affects the cost of
- *    the Ahead/Behind computation? How often do criss-crosses occur and
- *    do they cause various operations to slow down?
+ *    The number of edges that must be touched to compute merge bases.
+ *    Edge length. The number of parallel lanes in the history that
+ *    must be navigated to get to the merge base. What affects the
+ *    cost of the Ahead/Behind computation? How often do
+ *    criss-crosses occur and do they cause various operations to slow
+ *    down?
 *
 * [] If there are primary branches (like "main" or "master") are they
- *    always on the left side of merges? Does the graph have a clean
- *    left edge? Or are there normal and "backwards" merges? Do these
- *    cause problems at scale?
+ *    always on the left side of merges? Does the graph have a clean
+ *    left edge? Or are there normal and "backwards" merges? Do
+ *    these cause problems at scale?
 *
 * [] If we have a hierarchy of FI/RI branches like "L1", "L2", ...,
- *    can we learn anything about the shape of the repo around these FI
- *    and RI integrations?
+ *    can we learn anything about the shape of the repo around these
+ *    FI and RI integrations?
+ *
+ * [] Do we need a no-PII flag to omit pathnames or branch/tag names
+ *    in the various histograms? (This would turn off --name-rev
+ *    too.)
+ *
+ * [] I have so far avoided adding opinions about individual fields
+ *    (such as the way `git-sizer` prints a row of stars or bangs in
+ *    the last column).
+ *
+ *    I'm wondering if that is a job of this executable or if it
+ *    should be done in a post-processing step using the JSON output.
+ *
+ *    My problem with the `git-sizer` approach is that it doesn't give
+ *    the (casual) user any information on why it has stars or bangs.
+ *    And there isn't a good way to print detailed information in the
+ *    ASCII-art tables that would be easy to understand.
+ *
+ * [] For example, a large number of refs does not define a cliff.
+ *    Performance will drop off (linearly, quadratically, ... ??).
+ *    The tool should refer them to article(s) talking about the
+ *    different problems that it could cause. So should `git
+ *    survey` just print the number and (implicitly) refer them to
+ *    the man page (chapter/verse) or to a tool that will interpret
+ *    the number and explain it?
+ *
+ * [] Alternatively, should `git survey` do that analysis too and
+ *    just print footnotes for each large number?
+ *
+ * [] The computation of the raw survey JSON data can take HOURS on
+ *    a very large repo (like Windows), so I'm wondering if we
+ *    want to keep the opinion portion separate.
+ *
+ * [] In addition to opinions based on the static data, I would like
+ *    to dump the JSON results (or the Trace2 telemetry) into a DB and
+ *    aggregate it with other users.
+ *
+ *    Granted, they should all see the same DAG and the same set of
+ *    reachable objects, but we could average across all datasets
+ *    generated on a particular date and detect outlier users.
+ *
+ * [] Maybe someone cloned from the `_full` endpoint rather than
+ *    the limited refs endpoint.
+ *
+ * [] Maybe that user is having problems with repacking / GC /
+ *    maintenance without knowing it.
+ *
+ * [] I'd also like to use the DB to compare survey datasets over
+ *    time. How fast is their repository growing and in what ways?
+ *
+ * [] I'd rather have the delta analysis NOT be inside `git
+ *    survey`, so it makes sense to consider having all of it in a
+ *    post-process step.
+ *
+ * [] Another reason to put the opinion analysis in a post-process
+ *    is that it would be easier to generate plots on the data tables.
+ *    Granted, we can get plots from telemetry, but a stand-alone user
+ *    could run the JSON thru python or jq or something and generate
+ *    something nicer than ASCII-art and it could handle cross-referencing
+ *    and hyperlinking to helpful information on each issue.
+ *
+ * [] I think there are several classes of data that we can report on:
+ *
+ * [] The "inherent repo properties", such as the shape and size of
+ *    the DAG -- these should be universal in each enlistment.
+ *
+ * [] The "ODB lived-in properties", such as the efficiency
+ *    of the repack and things like partial and shallow clone.
+ *    These will vary, but indicate health of the ODB.
+ *
+ * [] The "index related properties", such as sparse-checkout,
+ *    sparse-index, cache-tree, untracked-cache, fsmonitor, and
+ *    etc. These will also vary, but are more like knobs for
+ *    the user to adjust.
+ *
+ * [] I want to compare these with Matt's "dimensions of scale"
+ *    notes and see if there are other pieces of data that we
+ *    could compute/consider.
+ *
 */

From 458e9bc4f2b309638679a66e7964d5818fb1ade6 Mon Sep 17 00:00:00 2001
From: Jeff Hostetler
Date: Mon, 1 Jul 2024 12:07:01 -0400
Subject: [PATCH 17/18] survey: expanded TODO with more notes

Signed-off-by: Jeff Hostetler
---
 builtin/survey.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/builtin/survey.c b/builtin/survey.c
index 91ac0bd45dadf6..719e2e21e915d3 100644
--- a/builtin/survey.c
+++ b/builtin/survey.c
@@ -2329,6 +2329,16 @@ int cmd_survey(int argc, const char **argv, const char *prefix)
 *    size of the set of "refs/tags/" that we visited while building
 *    the `ref_info` and `ref_array` and not need to ask the remote.
 *
+ * [] Should the "string length of refnames / remote refs", for
+ *    example, be sub-divided by remote so we can project the
+ *    cost of the haves/wants overhead of a fetch?
+ *
+ * [] Can we examine the merge commits and classify them as clean or
+ *    dirty? (i.e., ones with merge conflicts that needed to be
+ *    addressed during the merge itself.)
+ *
+ * [] Do dirty merges affect performance of later operations?
+ *
 * [] Dump info on the complexity of the DAG. Criss-cross merges.
 *    The number of edges that must be touched to compute merge bases.
 *    Edge length. The number of parallel lanes in the history that
From cb271c6ebe8aca731d3f4c433f8b391a3d6a8078 Mon Sep 17 00:00:00 2001
From: Johannes Schindelin
Date: Mon, 1 Jul 2024 23:28:45 +0200
Subject: [PATCH 18/18] survey: clearly note the experimental nature in the
 output

While this command is definitely something we _want_, chances are that
upstreaming this will require substantial changes. We still want to be
able to experiment with this before that, to focus on what we need out
of this command: To assist with diagnosing issues with large
repositories, as well as to help monitor the growth and the associated
pain points of such repositories.

To that end, we are about to integrate this command into
`microsoft/git`, to get the tool into the hands of users who need it
most, with the idea to iterate in close collaboration between these
users and the developers familiar with Git's internals.

However, we will definitely want to avoid letting anybody have the
impression that this command, its exact inner workings, as well as its
output format, are anywhere close to stable. To make that fact utterly
clear (and thereby protect the freedom to iterate and innovate freely
before upstreaming the command), let's mark its output as experimental
in all-caps, as the first thing we do.

Signed-off-by: Johannes Schindelin
---
 builtin/survey.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/builtin/survey.c b/builtin/survey.c
index 719e2e21e915d3..4d0f0f86c46649 100644
--- a/builtin/survey.c
+++ b/builtin/survey.c
@@ -18,6 +18,7 @@
 #include "trace2.h"
 #include "tree.h"
 #include "tree-walk.h"
+#include "color.h"
 
 static const char * const survey_usage[] = {
 	N_("(EXPERIMENTAL!) git survey "),
@@ -2265,6 +2266,10 @@ int cmd_survey(int argc, const char **argv, const char *prefix)
 
 	argc = parse_options(argc, argv, prefix, survey_options, survey_usage, 0);
 
+	color_fprintf_ln(stderr,
+			 want_color_fd(2, GIT_COLOR_AUTO) ? GIT_COLOR_YELLOW : "",
+			 "(THIS IS EXPERIMENTAL, EXPECT THE OUTPUT FORMAT TO CHANGE!)");
+
 	prepare_repo_settings(the_repository);
 
 	if (survey_opts.show_progress < 0)
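Illustrative example of the resulting behavior (assuming the series above
is applied as-is): every `git survey` run now begins by printing the
warning to stderr, in yellow when stderr is a terminal, before any of the
survey output:

    $ git survey
    (THIS IS EXPERIMENTAL, EXPECT THE OUTPUT FORMAT TO CHANGE!)
    ...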