diff --git a/.gitignore b/.gitignore
index bf97276163b19b..8f82ba30d5a17a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -164,6 +164,7 @@
 /git-submodule
 /git-submodule--helper
 /git-subtree
+/git-survey
 /git-svn
 /git-switch
 /git-symbolic-ref
diff --git a/Documentation/config.txt b/Documentation/config.txt
index fedfaf30cd0d8b..939cc1387992f8 100644
--- a/Documentation/config.txt
+++ b/Documentation/config.txt
@@ -536,6 +536,8 @@ include::config/status.txt[]
 
 include::config/submodule.txt[]
 
+include::config/survey.txt[]
+
 include::config/tag.txt[]
 
 include::config/tar.txt[]
diff --git a/Documentation/config/survey.txt b/Documentation/config/survey.txt
new file mode 100644
index 00000000000000..857c1c3fff2d6a
--- /dev/null
+++ b/Documentation/config/survey.txt
@@ -0,0 +1,41 @@
+survey.namerev::
+	Boolean to show/hide `git name-rev` information for
+	each reported commit and the containing commit of each
+	reported tree and blob.
+
+survey.progress::
+	Boolean to show/hide progress information. Defaults to
+	true when interactive (stderr is bound to a TTY).
+
+survey.showBlobSizes::
+	A non-negative integer value. Requests details on the
+	largest file blobs by size in bytes. Provides a default
+	value for `--blob-sizes=<n>` in linkgit:git-survey[1].
+
+survey.showCommitParents::
+	A non-negative integer value. Requests details on the
+	commits with the most parents. Provides a default
+	value for `--commit-parents=<n>` in linkgit:git-survey[1].
+
+survey.showCommitSizes::
+	A non-negative integer value. Requests details on the
+	largest commits by size in bytes. Generally, these are the
+	commits with the largest commit messages. Provides a default
+	value for `--commit-sizes=<n>` in linkgit:git-survey[1].
+
+survey.showTreeEntries::
+	A non-negative integer value. Requests details on the
+	trees (directories) with the most entries (files
+	and subdirectories). Provides a default value for
+	`--tree-entries=<n>` in linkgit:git-survey[1].
+
+survey.showTreeSizes::
+	A non-negative integer value. Requests details on the
+	largest trees (directories) by size in bytes. This set
+	will usually be equal to the `survey.showTreeEntries`
+	set, but may be skewed by very long file or subdirectory
+	entry names. Provides a default value for
+	`--tree-sizes=<n>` in linkgit:git-survey[1].
+
+survey.verbose::
+	Boolean to show/hide verbose output. Defaults to false.
diff --git a/Documentation/git-survey.txt b/Documentation/git-survey.txt
new file mode 100644
index 00000000000000..7be11e4683822d
--- /dev/null
+++ b/Documentation/git-survey.txt
@@ -0,0 +1,108 @@
+git-survey(1)
+=============
+
+NAME
+----
+git-survey - EXPERIMENTAL: Measure various repository dimensions of scale
+
+SYNOPSIS
+--------
+[verse]
+(EXPERIMENTAL!) `git survey` <options>
+
+DESCRIPTION
+-----------
+
+Survey the repository and measure various dimensions of scale.
+
+As repositories grow to "monorepo" size, certain data shapes can cause
+performance problems. `git-survey` attempts to measure and report on
+known problem areas.
+
+Ref Selection and Reachable Objects
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In this first analysis phase, `git survey` will iterate over the set of
+requested branches, tags, and other refs and then treewalk all of the
+reachable commits, trees, and blobs, generating various statistics.
+
+OPTIONS
+-------
+
+--progress::
+	Show progress. This is automatically enabled when interactive.
+
+--json::
+	Print results in JSON rather than in a human-friendly format.
+
+--[no-]name-rev::
+	Print `git name-rev` output for each commit, tree, and blob.
+	Defaults to true.
+
+Ref Selection
+~~~~~~~~~~~~~
+
+The following options control the set of refs that `git survey` will examine.
+By default, `git survey` will look at tags, local branches, and remote refs.
+If any of the following options are given, the default set is cleared and
+only refs for the given options are added.
+
+--all-refs::
+	Use all refs. This includes local branches, tags, remote refs,
+	notes, and stashes. This option overrides all of the following.
+
+--branches::
+	Add local branches (`refs/heads/`) to the set.
+
+--tags::
+	Add tags (`refs/tags/`) to the set.
+
+--remotes::
+	Add remote branches (`refs/remotes/`) to the set.
+
+--detached::
+	Add HEAD to the set.
+
+--other::
+	Add notes (`refs/notes/`) and stashes (`refs/stash/`) to the set.
+
+Large Item Selection
+~~~~~~~~~~~~~~~~~~~~
+
+The following options control the optional display of large items under
+various dimensions of scale. The OIDs of the largest `n` objects will be
+displayed in reverse sorted order. For each, `n` defaults to 10.
+
+--commit-parents::
+	Shows the OIDs of the commits with the most parent commits.
+
+--commit-sizes::
+	Shows the OIDs of the largest commits by size in bytes. These are
+	usually the ones with the largest commit messages.
+
+--tree-entries::
+	Shows the OIDs of the trees with the most entries. These
+	are the directories with the most files or subdirectories.
+
+--tree-sizes::
+	Shows the OIDs of the largest trees by size in bytes. This set
+	will usually be the same as the `--tree-entries` set, unless
+	skewed by very long entry names.
+
+--blob-sizes::
+	Shows the OIDs of the largest blobs by size in bytes.
+
+OUTPUT
+------
+
+By default, `git survey` will print information about the repository in a
+human-readable format that includes overviews and tables.
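+
+For example, to survey only the branches and tags and report the
+twenty largest blobs, one might run:
+
+------------
+$ git survey --branches --tags --blob-sizes=20
+------------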
+ +CONFIGURATION +------------- + +include::config/survey.txt[] + +GIT +--- +Part of the linkgit:git[1] suite diff --git a/Makefile b/Makefile index 522c8a612c9882..13a73d99585ce4 100644 --- a/Makefile +++ b/Makefile @@ -1309,6 +1309,7 @@ BUILTIN_OBJS += builtin/sparse-checkout.o BUILTIN_OBJS += builtin/stash.o BUILTIN_OBJS += builtin/stripspace.o BUILTIN_OBJS += builtin/submodule--helper.o +BUILTIN_OBJS += builtin/survey.o BUILTIN_OBJS += builtin/symbolic-ref.o BUILTIN_OBJS += builtin/tag.o BUILTIN_OBJS += builtin/unpack-file.o diff --git a/builtin.h b/builtin.h index 14fa0171607b17..8ab1623b32679c 100644 --- a/builtin.h +++ b/builtin.h @@ -238,6 +238,7 @@ int cmd_status(int argc, const char **argv, const char *prefix); int cmd_stash(int argc, const char **argv, const char *prefix); int cmd_stripspace(int argc, const char **argv, const char *prefix); int cmd_submodule__helper(int argc, const char **argv, const char *prefix); +int cmd_survey(int argc, const char **argv, const char *prefix); int cmd_switch(int argc, const char **argv, const char *prefix); int cmd_symbolic_ref(int argc, const char **argv, const char *prefix); int cmd_tag(int argc, const char **argv, const char *prefix); diff --git a/builtin/survey.c b/builtin/survey.c new file mode 100644 index 00000000000000..4d0f0f86c46649 --- /dev/null +++ b/builtin/survey.c @@ -0,0 +1,2441 @@ +#include "builtin.h" +#include "config.h" +#include "environment.h" +#include "hex.h" +#include "json-writer.h" +#include "list-objects.h" +#include "object-name.h" +#include "object-store.h" +#include "parse-options.h" +#include "progress.h" +#include "ref-filter.h" +#include "refs.h" +#include "revision.h" +#include "run-command.h" +#include "strbuf.h" +#include "strmap.h" +#include "strvec.h" +#include "trace2.h" +#include "tree.h" +#include "tree-walk.h" +#include "color.h" + +static const char * const survey_usage[] = { + N_("(EXPERIMENTAL!) git survey "), + NULL, +}; + +static struct progress *survey_progress = NULL; +static uint64_t survey_progress_total = 0; + +struct survey_refs_wanted { + int want_all_refs; /* special override */ + + int want_branches; + int want_tags; + int want_remotes; + int want_detached; + int want_other; /* see FILTER_REFS_OTHERS -- refs/notes/, refs/stash/ */ +}; + +static struct strvec survey_vec_refs_wanted = STRVEC_INIT; + +/* + * The set of refs that we will search if the user doesn't select + * any on the command line. + */ +static struct survey_refs_wanted refs_if_unspecified = { + .want_all_refs = 0, + + .want_branches = 1, + .want_tags = 1, + .want_remotes = 1, + .want_detached = 0, + .want_other = 0, +}; + +struct survey_opts { + int verbose; + int show_progress; + int show_json; + int show_name_rev; + + int show_largest_commits_by_nr_parents; + int show_largest_commits_by_size_bytes; + + int show_largest_trees_by_nr_entries; + int show_largest_trees_by_size_bytes; + + int show_largest_blobs_by_size_bytes; + + struct survey_refs_wanted refs; +}; + +#define DEFAULT_SHOW_LARGEST_VALUE (10) + +static struct survey_opts survey_opts = { + .verbose = 0, + .show_progress = -1, /* defaults to isatty(2) */ + .show_json = 0, /* defaults to pretty */ + .show_name_rev = 1, + + /* + * Show the largest `n` objects for some scaling dimension. + * We allow each to be requested independently. 
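+	 * (A value of zero disables a dimension: alloc_large_item_vec()
+	 * returns NULL for a zero count, and NULL vectors are skipped
+	 * when inserting items and when writing JSON.)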
+ */ + .show_largest_commits_by_nr_parents = DEFAULT_SHOW_LARGEST_VALUE, + .show_largest_commits_by_size_bytes = DEFAULT_SHOW_LARGEST_VALUE, + + .show_largest_trees_by_nr_entries = DEFAULT_SHOW_LARGEST_VALUE, + .show_largest_trees_by_size_bytes = DEFAULT_SHOW_LARGEST_VALUE, + + .show_largest_blobs_by_size_bytes = DEFAULT_SHOW_LARGEST_VALUE, + + .refs.want_all_refs = 0, + + .refs.want_branches = -1, /* default these to undefined */ + .refs.want_tags = -1, + .refs.want_remotes = -1, + .refs.want_detached = -1, + .refs.want_other = -1, +}; + +/* + * After parsing the command line arguments, figure out which refs we + * should scan. + * + * If ANY were given in positive sense, then we ONLY include them and + * do not use the builtin values. + */ +static void fixup_refs_wanted(void) +{ + struct survey_refs_wanted *rw = &survey_opts.refs; + + /* + * `--all-refs` overrides and enables everything. + */ + if (rw->want_all_refs == 1) { + rw->want_branches = 1; + rw->want_tags = 1; + rw->want_remotes = 1; + rw->want_detached = 1; + rw->want_other = 1; + return; + } + + /* + * If none of the `--` were given, we assume all + * of the builtin unspecified values. + */ + if (rw->want_branches == -1 && + rw->want_tags == -1 && + rw->want_remotes == -1 && + rw->want_detached == -1 && + rw->want_other == -1) { + *rw = refs_if_unspecified; + return; + } + + /* + * Since we only allow positive boolean values on the command + * line, we will only have true values where they specified + * a `--`. + * + * So anything that still has an unspecified value should be + * set to false. + */ + if (rw->want_branches == -1) + rw->want_branches = 0; + if (rw->want_tags == -1) + rw->want_tags = 0; + if (rw->want_remotes == -1) + rw->want_remotes = 0; + if (rw->want_detached == -1) + rw->want_detached = 0; + if (rw->want_other == -1) + rw->want_other = 0; +} + +static struct option survey_options[] = { + OPT__VERBOSE(&survey_opts.verbose, N_("verbose output")), + OPT_BOOL(0, "progress", &survey_opts.show_progress, N_("show progress")), + OPT_BOOL(0, "json", &survey_opts.show_json, N_("report stats in JSON")), + OPT_BOOL(0, "name-rev", &survey_opts.show_name_rev, N_("run name-rev on each reported commit")), + + OPT_BOOL_F(0, "all-refs", &survey_opts.refs.want_all_refs, N_("include all refs"), PARSE_OPT_NONEG), + + OPT_BOOL_F(0, "branches", &survey_opts.refs.want_branches, N_("include branches"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "tags", &survey_opts.refs.want_tags, N_("include tags"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "remotes", &survey_opts.refs.want_remotes, N_("include all remotes refs"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "detached", &survey_opts.refs.want_detached, N_("include detached HEAD"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "other", &survey_opts.refs.want_other, N_("include notes and stashes"), PARSE_OPT_NONEG), + + OPT_INTEGER_F(0, "commit-parents", &survey_opts.show_largest_commits_by_nr_parents, N_("show N largest commits by parent count"), PARSE_OPT_NONEG), + OPT_INTEGER_F(0, "commit-sizes", &survey_opts.show_largest_commits_by_size_bytes, N_("show N largest commits by size in bytes"), PARSE_OPT_NONEG), + + OPT_INTEGER_F(0, "tree-entries", &survey_opts.show_largest_trees_by_nr_entries, N_("show N largest trees by entry count"), PARSE_OPT_NONEG), + OPT_INTEGER_F(0, "tree-sizes", &survey_opts.show_largest_trees_by_size_bytes, N_("show N largest trees by size in bytes"), PARSE_OPT_NONEG), + + OPT_INTEGER_F(0, "blob-sizes", &survey_opts.show_largest_blobs_by_size_bytes, N_("show N largest blobs by size in bytes"), 
PARSE_OPT_NONEG), + + OPT_END(), +}; + +static int survey_load_config_cb(const char *var, const char *value, + const struct config_context *ctx, void *pvoid) +{ + if (!strcmp(var, "survey.verbose")) { + survey_opts.verbose = git_config_bool(var, value); + return 0; + } + if (!strcmp(var, "survey.progress")) { + survey_opts.show_progress = git_config_bool(var, value); + return 0; + } + if (!strcmp(var, "survey.json")) { + survey_opts.show_json = git_config_bool(var, value); + return 0; + } + if (!strcmp(var, "survey.namerev")) { + survey_opts.show_name_rev = git_config_bool(var, value); + return 0; + } + + if (!strcmp(var, "survey.showcommitparents")) { + survey_opts.show_largest_commits_by_nr_parents = git_config_ulong(var, value, ctx->kvi); + return 0; + } + if (!strcmp(var, "survey.showcommitsizes")) { + survey_opts.show_largest_commits_by_size_bytes = git_config_ulong(var, value, ctx->kvi); + return 0; + } + + if (!strcmp(var, "survey.showtreeentries")) { + survey_opts.show_largest_trees_by_nr_entries = git_config_ulong(var, value, ctx->kvi); + return 0; + } + if (!strcmp(var, "survey.showtreesizes")) { + survey_opts.show_largest_trees_by_size_bytes = git_config_ulong(var, value, ctx->kvi); + return 0; + } + + if (!strcmp(var, "survey.showblobsizes")) { + survey_opts.show_largest_blobs_by_size_bytes = git_config_ulong(var, value, ctx->kvi); + return 0; + } + + return git_default_config(var, value, ctx, pvoid); +} + +static void survey_load_config(void) +{ + git_config(survey_load_config_cb, NULL); +} + +/* + * Stats on the set of refs that we found. + */ +struct survey_stats_refs { + uint32_t cnt_total; + uint32_t cnt_lightweight_tags; + uint32_t cnt_annotated_tags; + uint32_t cnt_branches; + uint32_t cnt_remotes; + uint32_t cnt_detached; + uint32_t cnt_other; + + uint32_t cnt_symref; + + uint32_t cnt_packed; + uint32_t cnt_loose; + + /* + * Measure the length of the refnames. We can look for + * potential platform limits. The partial sums may help us + * estimate the size of a haves/wants conversation, since each + * refname and a SHA must be transmitted. + */ + size_t len_max_local_refname; + size_t len_sum_local_refnames; + size_t len_max_remote_refname; + size_t len_sum_remote_refnames; + + struct strintmap refsmap; +}; + +/* + * HBIN -- hex binning (histogram bucketing). + * + * We create histograms for various counts and sums. Since we have a + * wide range of values (objects range in size from 1 to 4G bytes), a + * linear bucketing is not interesting. Instead, lets use a + * log16()-based bucketing. This gives us a better spread on the low + * and middle range and a coarse bucketing on the high end. + * + * The idea here is that it doesn't matter if you have n 1GB blobs or + * n/2 1GB blobs and n/2 1.5GB blobs -- either way you have a scaling + * problem that we want to report on. + */ +#define HBIN_LEN (sizeof(unsigned long) * 2) +#define HBIN_MASK (0xF) +#define HBIN_SHIFT (4) + +static int hbin(unsigned long value) +{ + int k; + + for (k = 0; k < HBIN_LEN; k++) { + if ((value & ~(HBIN_MASK)) == 0) + return k; + value >>= HBIN_SHIFT; + } + + return 0; /* should not happen */ +} + +/* + * QBIN -- base4 binning (histogram bucketing). + * + * This is the same idea as the above, but we want better granularity + * in the low end and don't expect as many large values. 
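+ *
+ * With a 2-bit shift, the first few buckets cover the ranges
+ * 0..3, 4..15, 16..63, and 64..255.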
+ */
+#define QBIN_LEN (sizeof(unsigned long) * 4)
+#define QBIN_MASK (0x3)
+#define QBIN_SHIFT (2)
+
+static int qbin(unsigned long value)
+{
+	int k;
+
+	for (k = 0; k < QBIN_LEN; k++) {
+		if ((value & ~(QBIN_MASK)) == 0)
+			return k;
+		value >>= (QBIN_SHIFT);
+	}
+
+	return 0; /* should not happen */
+}
+
+/*
+ * histogram bin for objects.
+ */
+struct obj_hist_bin {
+	uint64_t sum_size;      /* sum(object_size) for all objects in this bin */
+	uint64_t sum_disk_size; /* sum(on_disk_size) for all objects in this bin */
+	uint32_t cnt_seen;      /* number seen in this bin */
+};
+
+static void incr_obj_hist_bin(struct obj_hist_bin *pbin,
+			      unsigned long object_length,
+			      off_t disk_sizep)
+{
+	pbin->sum_size += object_length;
+	pbin->sum_disk_size += disk_sizep;
+	pbin->cnt_seen++;
+}
+
+/*
+ * Remember the largest n objects for some scaling dimension. This
+ * could be the observed object size or number of entries in a tree.
+ * We'll use this to generate a sorted vector in the output for that
+ * dimension.
+ */
+struct large_item {
+	uint64_t size;
+	struct object_id oid;
+
+	/*
+	 * For blobs and trees the name field is the pathname of the
+	 * file or directory (as reported by the treewalk). Root trees
+	 * are reported with a zero-length name, but we'll fix them up.
+	 * The name field is not currently used for commits.
+	 */
+	struct strbuf *name;
+
+	/*
+	 * For blobs and trees remember the transient commit from
+	 * the treewalk so that we can say that this large item
+	 * first appeared in this commit (relative to the treewalk
+	 * order).
+	 */
+	struct object_id containing_commit_oid;
+
+	/*
+	 * Lookup `containing_commit_oid` using `git name-rev`.
+	 * Lazy allocate this post-treewalk.
+	 */
+	struct strbuf *name_rev;
+};
+
+struct large_item_vec_labels {
+	const char *dimension;
+	const char *item;
+};
+
+struct large_item_vec {
+	const struct large_item_vec_labels *labels_json;
+	const struct large_item_vec_labels *labels_pretty;
+	uint64_t nr_items;
+	enum object_type type;
+	struct large_item items[FLEX_ARRAY]; /* nr_items */
+};
+
+static struct large_item_vec *alloc_large_item_vec(
+	const struct large_item_vec_labels *labels_json,
+	const struct large_item_vec_labels *labels_pretty,
+	uint64_t nr_items,
+	enum object_type type)
+{
+	struct large_item_vec *vec;
+	size_t flex_len = nr_items * sizeof(struct large_item);
+	size_t k;
+
+	if (!nr_items)
+		return NULL;
+
+	vec = xcalloc(1, (sizeof(struct large_item_vec) + flex_len));
+	vec->labels_json = labels_json;
+	vec->labels_pretty = labels_pretty;
+	vec->nr_items = nr_items;
+	vec->type = type;
+
+	for (k = 0; k < nr_items; k++) {
+		struct strbuf *p = xcalloc(1, sizeof(struct strbuf));
+		strbuf_init(p, 0);
+		vec->items[k].name = p;
+	}
+
+	return vec;
+}
+
+static void free_large_item_vec(struct large_item_vec *vec)
+{
+	size_t k;
+
+	for (k = 0; k < vec->nr_items; k++) {
+		strbuf_release(vec->items[k].name);
+		free(vec->items[k].name);
+
+		if (vec->items[k].name_rev) {
+			strbuf_release(vec->items[k].name_rev);
+			free(vec->items[k].name_rev);
+		}
+	}
+
+	free(vec);
+}
+
+static void maybe_insert_large_item(struct large_item_vec *vec,
+				    uint64_t size,
+				    struct object_id *oid,
+				    const char *name,
+				    const struct object_id *containing_commit_oid)
+{
+	struct strbuf *pbuf_temp;
+	size_t rest_len;
+	size_t k;
+
+	if (!vec || !vec->nr_items)
+		return;
+
+	/*
+	 * Since the odds of an object being among the largest n
+	 * are small, shortcut and see if it is smaller than
+	 * the smallest one in our set and quickly reject it.
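+	 * (The vector is calloc'd, so until all n slots have been
+	 * filled the smallest size is zero and nothing is rejected
+	 * by this test.)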
+ */ + if (size < vec->items[vec->nr_items - 1].size) + return; + + for (k = 0; k < vec->nr_items; k++) { + if (size < vec->items[k].size) + continue; + + /* + * The last large_item in the vector is about to be + * overwritten by the previous one during the shift. + * Steal its allocated strbuf and reuse it. + * + * We can ignore .name_rev because it will not be + * allocated until after the treewalk. + */ + pbuf_temp = vec->items[vec->nr_items - 1].name; + strbuf_reset(pbuf_temp); + if (name && *name) + strbuf_addstr(pbuf_temp, name); + else if (vec->type == OBJ_TREE) { + /* + * NEEDSWORK: Would it be better to wait and create + * a name of the form "^{tree}" after the + * treewalk is finished? + */ + strbuf_addf(pbuf_temp, "%s^{tree}", + oid_to_hex(containing_commit_oid)); + } + + /* push items[k..] down one and insert data for this item here */ + + rest_len = (vec->nr_items - k - 1) * sizeof(struct large_item); + if (rest_len) + memmove(&vec->items[k + 1], &vec->items[k], rest_len); + + memset(&vec->items[k], 0, sizeof(struct large_item)); + vec->items[k].size = size; + oidcpy(&vec->items[k].oid, oid); + oidcpy(&vec->items[k].containing_commit_oid, containing_commit_oid); + + vec->items[k].name = pbuf_temp; + + return; + } +} + +/* + * Try to run `git name-rev` on each of the containing-commit-oid's + * in this large-item-vec to get a pretty name for each OID. Silently + * ignore errors if it fails because this info is nice to have but not + * essential. + */ +static void large_item_vec_lookup_name_rev(struct large_item_vec *vec) +{ + struct child_process cp = CHILD_PROCESS_INIT; + struct strbuf in = STRBUF_INIT; + struct strbuf out = STRBUF_INIT; + const char *line; + size_t k; + + if (!vec || !vec->nr_items) + return; + + survey_progress_total += vec->nr_items; + display_progress(survey_progress, survey_progress_total); + + for (k = 0; k < vec->nr_items; k++) + strbuf_addf(&in, "%s\n", oid_to_hex(&vec->items[k].containing_commit_oid)); + + cp.git_cmd = 1; + strvec_pushl(&cp.args, "name-rev", "--name-only", "--annotate-stdin", NULL); + if (pipe_command(&cp, in.buf, in.len, &out, 0, NULL, 0)) { + strbuf_release(&in); + strbuf_release(&out); + return; + } + + line = out.buf; + k = 0; + while (*line) { + const char *eol = strchrnul(line, '\n'); + + vec->items[k].name_rev = xcalloc(1, sizeof(struct strbuf)); + strbuf_init(vec->items[k].name_rev, 0); + strbuf_add(vec->items[k].name_rev, line, (eol - line)); + + line = eol + 1; + k++; + } + + strbuf_release(&in); + strbuf_release(&out); +} + +/* + * Common fields for any type of object. + */ +struct survey_stats_base_object { + uint32_t cnt_seen; + + uint32_t cnt_missing; /* we may have a partial clone. */ + + /* + * Number of objects grouped by where they are stored on disk. + * This is a function of how the ODB is packed. + */ + uint32_t cnt_cached; /* see oi.whence */ + uint32_t cnt_loose; /* see oi.whence */ + uint32_t cnt_packed; /* see oi.whence */ + uint32_t cnt_dbcached; /* see oi.whence */ + + uint64_t sum_size; /* sum(object_size) */ + uint64_t sum_disk_size; /* sum(disk_size) */ + + /* + * A histogram of the count of objects, the observed size, and + * the on-disk size grouped by the observed size. + */ + struct obj_hist_bin size_hbin[HBIN_LEN]; +}; + +/* + * PBIN -- parent vector binning (histogram bucketing). + * + * We create a histogram based upon the number of parents + * in a commit. This is a simple linear vector. It starts + * at zero for "initial" commits. + * + * If a commit has more parents, just put it in the last bin. 
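+ *
+ * For example, with a length of 17, bins 0..15 count commits with
+ * exactly that many parents and the last bin counts commits with
+ * 16 or more parents.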
+ */ +#define PBIN_VEC_LEN (17) + +struct survey_stats_commits { + struct survey_stats_base_object base; + + /* + * Count of commits with k parents. + */ + uint32_t parent_cnt_pbin[PBIN_VEC_LEN]; + + struct large_item_vec *vec_largest_by_nr_parents; + struct large_item_vec *vec_largest_by_size_bytes; +}; + +/* + * Stats for reachable trees. + */ +struct survey_stats_trees { + struct survey_stats_base_object base; + + /* + * Keep a vector of the trees with the most number of entries. + * This gives us a feel for the width of a tree when there are + * gigantic directories. + */ + struct large_item_vec *vec_largest_by_nr_entries; + + /* + * Keep a vector of the trees with the largest size in bytes. + * The contents of this may or may not match items in the other + * vector, since entryname length can alter the results. + */ + struct large_item_vec *vec_largest_by_size_bytes; + + /* + * Computing the sum of the number of entries across all trees + * is probably not that interesting. + */ + uint64_t sum_entries; /* sum(nr_entries) -- sum across all trees */ + + /* + * A histogram of the count of trees, the observed size, and + * the on-disk size grouped by the number of entries in the tree. + */ + struct obj_hist_bin entry_qbin[QBIN_LEN]; +}; + +/* + * Stats for reachable blobs. + */ +struct survey_stats_blobs { + struct survey_stats_base_object base; + + /* + * Remember the OIDs of the largest n blobs. + */ + struct large_item_vec *vec_largest_by_size_bytes; +}; + +struct survey_stats { + struct survey_stats_refs refs; + struct survey_stats_commits commits; + struct survey_stats_trees trees; + struct survey_stats_blobs blobs; +}; + +static struct survey_stats survey_stats = { 0 }; + +static void alloc_commit_by_parents(void) +{ + static struct large_item_vec_labels json = { + .dimension = "largest_commits_by_nr_parents", + .item = "nr_parents", + }; + static struct large_item_vec_labels pretty = { + .dimension = "Largest Commits by Number of Parents", + .item = "Parents", + }; + + if (survey_opts.show_largest_commits_by_nr_parents) + survey_stats.commits.vec_largest_by_nr_parents = + alloc_large_item_vec(&json, &pretty, + survey_opts.show_largest_commits_by_nr_parents, + OBJ_COMMIT); +} + +static void alloc_commit_by_size(void) { + static struct large_item_vec_labels json = { + .dimension = "largest_commits_by_size_bytes", + .item = "size", + }; + static struct large_item_vec_labels pretty = { + .dimension = "Largest Commits by Size in Bytes", + .item = "Size", + }; + + if (survey_opts.show_largest_commits_by_size_bytes) + survey_stats.commits.vec_largest_by_size_bytes = + alloc_large_item_vec(&json, &pretty, + survey_opts.show_largest_commits_by_size_bytes, + OBJ_COMMIT); +} + +static void alloc_tree_by_entries(void) +{ + static struct large_item_vec_labels json = { + .dimension = "largest_trees_by_nr_entries", + .item = "nr_entries", + }; + static struct large_item_vec_labels pretty = { + .dimension = "Largest Trees by Number of Entries", + .item = "Entries", + }; + + if (survey_opts.show_largest_trees_by_nr_entries) + survey_stats.trees.vec_largest_by_nr_entries = + alloc_large_item_vec(&json, &pretty, + survey_opts.show_largest_trees_by_nr_entries, + OBJ_TREE); +} + +static void alloc_tree_by_size(void) +{ + static struct large_item_vec_labels json = { + .dimension = "largest_trees_by_size_bytes", + .item = "size", + }; + static struct large_item_vec_labels pretty = { + .dimension = "Largest Trees by Size in Bytes", + .item = "Size", + }; + + if 
(survey_opts.show_largest_trees_by_size_bytes) + survey_stats.trees.vec_largest_by_size_bytes = + alloc_large_item_vec(&json, &pretty, + survey_opts.show_largest_trees_by_size_bytes, + OBJ_TREE); +} + +static void alloc_blob_by_size(void) +{ + static struct large_item_vec_labels json = { + .dimension = "largest_blobs_by_size_bytes", + .item = "size", + }; + static struct large_item_vec_labels pretty = { + .dimension = "Largest Blobs by Size in Bytes", + .item = "Size", + }; + + if (survey_opts.show_largest_blobs_by_size_bytes) + survey_stats.blobs.vec_largest_by_size_bytes = + alloc_large_item_vec(&json, &pretty, + survey_opts.show_largest_blobs_by_size_bytes, + OBJ_BLOB); +} + +static void do_load_refs(struct ref_array *ref_array) +{ + struct ref_filter filter = REF_FILTER_INIT; + struct ref_sorting *sorting; + struct string_list sorting_options = STRING_LIST_INIT_DUP; + + string_list_append(&sorting_options, "objectname"); + sorting = ref_sorting_options(&sorting_options); + + if (survey_opts.refs.want_detached) + strvec_push(&survey_vec_refs_wanted, "HEAD"); + + if (survey_opts.refs.want_all_refs) { + strvec_push(&survey_vec_refs_wanted, "refs/"); + } else { + if (survey_opts.refs.want_branches) + strvec_push(&survey_vec_refs_wanted, "refs/heads/"); + if (survey_opts.refs.want_tags) + strvec_push(&survey_vec_refs_wanted, "refs/tags/"); + if (survey_opts.refs.want_remotes) + strvec_push(&survey_vec_refs_wanted, "refs/remotes/"); + if (survey_opts.refs.want_other) { + strvec_push(&survey_vec_refs_wanted, "refs/notes/"); + strvec_push(&survey_vec_refs_wanted, "refs/stash/"); + } + } + + filter.name_patterns = survey_vec_refs_wanted.v; + filter.ignore_case = 0; + filter.match_as_path = 1; + + if (survey_opts.show_progress) { + survey_progress_total = 0; + survey_progress = start_progress(_("Scanning refs..."), 0); + } + + filter_refs(ref_array, &filter, FILTER_REFS_KIND_MASK); + + if (survey_opts.show_progress) { + survey_progress_total = ref_array->nr; + display_progress(survey_progress, survey_progress_total); + } + + ref_array_sort(sorting, ref_array); + + if (survey_opts.show_progress) + stop_progress(&survey_progress); + + ref_filter_clear(&filter); + ref_sorting_release(sorting); +} + +/* + * Populate a "rev_info" with the OIDs of the REFS of interest. + * The treewalk will start from all of those starting points + * and walk backwards in the DAG to get the set of all reachable + * objects from those starting points. + */ +static void load_rev_info(struct rev_info *rev_info, + struct ref_array *ref_array) +{ + unsigned int add_flags = 0; + int k; + + for (k = 0; k < ref_array->nr; k++) { + struct ref_array_item *p = ref_array->items[k]; + struct object_id peeled; + + switch (p->kind) { + case FILTER_REFS_TAGS: + if (!peel_iterated_oid(rev_info->repo, &p->objectname, &peeled)) + add_pending_oid(rev_info, NULL, &peeled, add_flags); + else + add_pending_oid(rev_info, NULL, &p->objectname, add_flags); + break; + case FILTER_REFS_BRANCHES: + add_pending_oid(rev_info, NULL, &p->objectname, add_flags); + break; + case FILTER_REFS_REMOTES: + add_pending_oid(rev_info, NULL, &p->objectname, add_flags); + break; + case FILTER_REFS_OTHERS: + /* + * This may be a note, stash, or custom namespace branch. 
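+			 * (e.g. "refs/notes/commits" or "refs/stash").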
+ */ + add_pending_oid(rev_info, NULL, &p->objectname, add_flags); + break; + case FILTER_REFS_DETACHED_HEAD: + add_pending_oid(rev_info, NULL, &p->objectname, add_flags); + break; + default: + break; + } + } +} + +static int fill_in_base_object(struct survey_stats_base_object *base, + struct object *object, + enum object_type type_expected, + unsigned long *p_object_length, + off_t *p_disk_sizep) +{ + struct object_info oi = OBJECT_INFO_INIT; + unsigned oi_flags = OBJECT_INFO_FOR_PREFETCH; + unsigned long object_length = 0; + off_t disk_sizep = 0; + enum object_type type; + int hb; + + base->cnt_seen++; + + oi.typep = &type; + oi.sizep = &object_length; + oi.disk_sizep = &disk_sizep; + + if (oid_object_info_extended(the_repository, &object->oid, &oi, oi_flags) < 0 || + type != type_expected) { + base->cnt_missing++; + return 1; + } + + switch (oi.whence) { + case OI_CACHED: + base->cnt_cached++; + break; + case OI_LOOSE: + base->cnt_loose++; + break; + case OI_PACKED: + base->cnt_packed++; + break; + case OI_DBCACHED: + base->cnt_dbcached++; + break; + default: + break; + } + + base->sum_size += object_length; + base->sum_disk_size += disk_sizep; + + hb = hbin(object_length); + incr_obj_hist_bin(&base->size_hbin[hb], object_length, disk_sizep); + + if (p_object_length) + *p_object_length = object_length; + if (p_disk_sizep) + *p_disk_sizep = disk_sizep; + + return 0; +} + +/* + * Transient OID of the commit currently being visited + * during the treewalk. We can use this to create the + * : pair when a notable large file was + * created, for example. + */ +static struct object_id treewalk_transient_commit_oid; + +static void traverse_commit_cb(struct commit *commit, void *data) +{ + struct survey_stats_commits *psc = &survey_stats.commits; + unsigned long object_length; + unsigned k; + + if ((++survey_progress_total % 1000) == 0) + display_progress(survey_progress, survey_progress_total); + + oidcpy(&treewalk_transient_commit_oid, &commit->object.oid); + + fill_in_base_object(&psc->base, &commit->object, OBJ_COMMIT, &object_length, NULL); + + k = commit_list_count(commit->parents); + + /* + * Send the commit-oid as both the OID and the CONTAINING-COMMIT-OID. + * This is somewhat redundant, but lets us later do `git name-rev` + * using the containing-oid in a consistent fashion. 
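+	 * (The pretty-printer then shows the commit's own name-rev
+	 * in its "Commit / Name Rev" column.)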
+ */ + maybe_insert_large_item(psc->vec_largest_by_nr_parents, k, + &commit->object.oid, NULL, + &commit->object.oid); + maybe_insert_large_item(psc->vec_largest_by_size_bytes, object_length, + &commit->object.oid, NULL, + &commit->object.oid); + + if (k >= PBIN_VEC_LEN) + k = PBIN_VEC_LEN - 1; + psc->parent_cnt_pbin[k]++; +} + +static void traverse_object_cb_tree(struct object *obj, const char *name) +{ + struct survey_stats_trees *pst = &survey_stats.trees; + unsigned long object_length; + off_t disk_sizep; + struct tree_desc desc; + struct name_entry entry; + struct tree *tree; + int nr_entries; + int qb; + + if (fill_in_base_object(&pst->base, obj, OBJ_TREE, &object_length, &disk_sizep)) + return; + + tree = lookup_tree(the_repository, &obj->oid); + if (!tree) + return; + init_tree_desc(&desc, &obj->oid, tree->buffer, tree->size); + nr_entries = 0; + while (tree_entry(&desc, &entry)) + nr_entries++; + + pst->sum_entries += nr_entries; + + maybe_insert_large_item(pst->vec_largest_by_nr_entries, nr_entries, + &obj->oid, name, + &treewalk_transient_commit_oid); + maybe_insert_large_item(pst->vec_largest_by_size_bytes, object_length, + &obj->oid, name, + &treewalk_transient_commit_oid); + + qb = qbin(nr_entries); + incr_obj_hist_bin(&pst->entry_qbin[qb], object_length, disk_sizep); +} + +static void traverse_object_cb_blob(struct object *obj, const char *name) +{ + struct survey_stats_blobs *psb = &survey_stats.blobs; + unsigned long object_length; + + fill_in_base_object(&psb->base, obj, OBJ_BLOB, &object_length, NULL); + + maybe_insert_large_item(psb->vec_largest_by_size_bytes, object_length, + &obj->oid, name, + &treewalk_transient_commit_oid); +} + +static void traverse_object_cb(struct object *obj, const char *name, void *data) +{ + if ((++survey_progress_total % 1000) == 0) + display_progress(survey_progress, survey_progress_total); + + switch (obj->type) { + case OBJ_TREE: + traverse_object_cb_tree(obj, name); + return; + case OBJ_BLOB: + traverse_object_cb_blob(obj, name); + return; + case OBJ_TAG: /* ignore -- counted when loading REFS */ + case OBJ_COMMIT: /* ignore/bug -- seen in the other callback */ + default: /* ignore/bug -- unknown type */ + return; + } +} + +/* + * Treewalk all of the commits and objects reachable from the + * set of refs. + */ +static void do_treewalk_reachable(struct ref_array *ref_array) +{ + struct rev_info rev_info = REV_INFO_INIT; + + repo_init_revisions(the_repository, &rev_info, NULL); + rev_info.tree_objects = 1; + rev_info.blob_objects = 1; + rev_info.tree_blobs_in_commit_order = 1; + load_rev_info(&rev_info, ref_array); + if (prepare_revision_walk(&rev_info)) + die(_("revision walk setup failed")); + + if (survey_opts.show_progress) { + survey_progress_total = 0; + survey_progress = start_progress(_("Walking reachable objects..."), 0); + } + + oidcpy(&treewalk_transient_commit_oid, null_oid()); + traverse_commit_list(&rev_info, + traverse_commit_cb, + traverse_object_cb, + NULL); + oidcpy(&treewalk_transient_commit_oid, null_oid()); + + if (survey_opts.show_progress) + stop_progress(&survey_progress); + + release_revisions(&rev_info); +} + +/* + * If we want this type of ref, increment counters and return 1. + */ +static int maybe_count_ref(struct repository *r, struct ref_array_item *p) +{ + struct survey_refs_wanted *rw = &survey_opts.refs; + struct survey_stats_refs *prs = &survey_stats.refs; + struct object_id peeled; + + /* + * Classify the ref using the `kind` value. 
Note that + * p->kind was populated by `ref_kind_from_refname()` + * based strictly on the refname. This only knows about + * the basic stock categories and returns FILTER_REFS_OTHERS + * for notes, stashes, and any custom namespaces (like + * "refs/prefetch/"). + */ + switch (p->kind) { + case FILTER_REFS_TAGS: + if (rw->want_all_refs || rw->want_tags) { + /* + * NEEDSWORK: Both types of tags have the same + * "refs/tags/" prefix. Do we want to count them + * in separate buckets in the refsmap? + */ + strintmap_incr(&prs->refsmap, "refs/tags/", 1); + + if (!peel_iterated_oid(r, &p->objectname, &peeled)) + prs->cnt_annotated_tags++; + else + prs->cnt_lightweight_tags++; + + return 1; + } + return 0; + + case FILTER_REFS_BRANCHES: + if (rw->want_all_refs || rw->want_branches) { + strintmap_incr(&prs->refsmap, "refs/heads/", 1); + + prs->cnt_branches++; + return 1; + } + return 0; + + case FILTER_REFS_REMOTES: + if (rw->want_all_refs || rw->want_remotes) { + /* + * For the refsmap, group them by the "refs/remotes//". + * For example: + * "refs/remotes/origin/..." + */ + if (starts_with(p->refname, "refs/remotes/")) { + struct strbuf buf = STRBUF_INIT; + int begin = strlen("refs/remotes/"); + size_t j; + + strbuf_addstr(&buf, p->refname); + for (j = begin; j < buf.len; j++) { + if (buf.buf[j] == '/') { + strbuf_setlen(&buf, j+1); + break; + } + } + strintmap_incr(&prs->refsmap, buf.buf, 1); + strbuf_release(&buf); + } + + prs->cnt_remotes++; + return 1; + } + return 0; + + case FILTER_REFS_OTHERS: + if (rw->want_all_refs || rw->want_other) { + /* + * For the refsmap, group them by their "refs//". + * For example: + * "refs/notes/..." + * "refs/stash/..." + * "refs//..." + */ + if (starts_with(p->refname, "refs/")) { + struct strbuf buf = STRBUF_INIT; + int begin = strlen("refs/"); + size_t j; + + strbuf_addstr(&buf, p->refname); + for (j = begin; j < buf.len; j++) { + if (buf.buf[j] == '/') { + strbuf_setlen(&buf, j+1); + break; + } + } + strintmap_incr(&prs->refsmap, buf.buf, 1); + strbuf_release(&buf); + } + + prs->cnt_other++; + return 1; + } + return 0; + + case FILTER_REFS_DETACHED_HEAD: + if (rw->want_all_refs || rw->want_detached) { + strintmap_incr(&prs->refsmap, p->refname, 1); + + prs->cnt_detached++; + return 1; + } + return 0; + + default: + if (rw->want_all_refs) { + strintmap_incr(&prs->refsmap, p->refname, 1); /* probably "HEAD" */ + + return 1; + } + return 0; + } +} + +/* + * Calculate stats on the set of refs that we found. + */ +static void do_calc_stats_refs(struct repository *r, struct ref_array *ref_array) +{ + struct survey_stats_refs *prs = &survey_stats.refs; + int k; + + strintmap_init(&prs->refsmap, 0); + + for (k = 0; k < ref_array->nr; k++) { + struct ref_array_item *p = ref_array->items[k]; + size_t len; + + if (!maybe_count_ref(r, p)) + continue; + + prs->cnt_total++; + + /* + * SymRefs are somewhat orthogonal to the above + * classification (e.g. "HEAD" --> detached + * and "refs/remotes/origin/HEAD" --> remote) so + * our totals will already include them. + */ + if (p->flag & REF_ISSYMREF) + prs->cnt_symref++; + + /* + * Where/how is the ref stored in GITDIR. 
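+		 * (REF_ISPACKED is set for refs read from the
+		 * "packed-refs" file; everything else counts as loose.)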
+		 */
+		if (p->flag & REF_ISPACKED)
+			prs->cnt_packed++;
+		else
+			prs->cnt_loose++;
+
+		len = strlen(p->refname);
+
+		if (p->kind == FILTER_REFS_REMOTES) {
+			prs->len_sum_remote_refnames += len;
+			if (len > prs->len_max_remote_refname)
+				prs->len_max_remote_refname = len;
+		} else {
+			prs->len_sum_local_refnames += len;
+			if (len > prs->len_max_local_refname)
+				prs->len_max_local_refname = len;
+		}
+	}
+}
+
+static void do_lookup_name_rev(void)
+{
+	/*
+	 * `git name-rev` can be very expensive when there are lots of
+	 * refs, so make it optional.
+	 */
+	if (!survey_opts.show_name_rev)
+		return;
+
+	if (survey_opts.show_progress) {
+		survey_progress_total = 0;
+		survey_progress = start_progress(_("Resolving name-revs..."), 0);
+	}
+
+	large_item_vec_lookup_name_rev(survey_stats.commits.vec_largest_by_nr_parents);
+	large_item_vec_lookup_name_rev(survey_stats.commits.vec_largest_by_size_bytes);
+
+	large_item_vec_lookup_name_rev(survey_stats.trees.vec_largest_by_nr_entries);
+	large_item_vec_lookup_name_rev(survey_stats.trees.vec_largest_by_size_bytes);
+
+	large_item_vec_lookup_name_rev(survey_stats.blobs.vec_largest_by_size_bytes);
+
+	if (survey_opts.show_progress)
+		stop_progress(&survey_progress);
+}
+
+/*
+ * The REFS phase:
+ *
+ * Load the set of requested refs and assess them for scalability problems.
+ * Use that set to start a treewalk to all reachable objects and assess
+ * them.
+ *
+ * This data will give us insights into the repository itself (the number
+ * of refs, the size and shape of the DAG, the number and size of the
+ * objects).
+ *
+ * Theoretically, this data is independent of the on-disk representation
+ * (e.g. independent of packing concerns).
+ */
+static void survey_phase_refs(struct repository *r)
+{
+	struct ref_array ref_array = { 0 };
+
+	trace2_region_enter("survey", "phase/refs", the_repository);
+	do_load_refs(&ref_array);
+	trace2_region_leave("survey", "phase/refs", the_repository);
+
+	trace2_region_enter("survey", "phase/treewalk", the_repository);
+	do_treewalk_reachable(&ref_array);
+	trace2_region_leave("survey", "phase/treewalk", the_repository);
+
+	trace2_region_enter("survey", "phase/calcstats", the_repository);
+	do_calc_stats_refs(r, &ref_array);
+	trace2_region_leave("survey", "phase/calcstats", the_repository);
+
+	if (survey_opts.show_name_rev) {
+		trace2_region_enter("survey", "phase/namerev", the_repository);
+		do_lookup_name_rev();
+		trace2_region_leave("survey", "phase/namerev", the_repository);
+	}
+
+	ref_array_clear(&ref_array);
+}
+
+static void json_refs_section(struct json_writer *jw_top, int pretty, int want_trace2)
+{
+	struct survey_stats_refs *prs = &survey_stats.refs;
+	struct json_writer jw_refs = JSON_WRITER_INIT;
+	int k;
+
+	jw_object_begin(&jw_refs, pretty);
+	{
+		jw_object_intmax(&jw_refs, "count", prs->cnt_total);
+
+		jw_object_inline_begin_object(&jw_refs, "count_by_type");
+		{
+			if (survey_opts.refs.want_branches)
+				jw_object_intmax(&jw_refs, "branches", prs->cnt_branches);
+			if (survey_opts.refs.want_tags) {
+				jw_object_intmax(&jw_refs, "lightweight_tags", prs->cnt_lightweight_tags);
+				jw_object_intmax(&jw_refs, "annotated_tags", prs->cnt_annotated_tags);
+			}
+			if (survey_opts.refs.want_remotes)
+				jw_object_intmax(&jw_refs, "remotes", prs->cnt_remotes);
+			if (survey_opts.refs.want_detached)
+				jw_object_intmax(&jw_refs, "detached", prs->cnt_detached);
+			if (survey_opts.refs.want_other)
+				jw_object_intmax(&jw_refs, "other", prs->cnt_other);
+
+			/*
+			 * SymRefs are somewhat orthogonal to
+			 * the above classification
+			 *
(e.g. "HEAD" --> detached and + * "refs/remotes/origin/HEAD" --> + * remote) so the above classified + * counts will already include them, + * but it is less confusing to display + * them here than to create a whole + * new section. + */ + if (prs->cnt_symref) + jw_object_intmax(&jw_refs, "symrefs", prs->cnt_symref); + } + jw_end(&jw_refs); + + jw_object_inline_begin_object(&jw_refs, "count_by_storage"); + { + jw_object_intmax(&jw_refs, "loose_refs", prs->cnt_loose); + jw_object_intmax(&jw_refs, "packed_refs", prs->cnt_packed); + } + jw_end(&jw_refs); + + jw_object_inline_begin_object(&jw_refs, "refname_length"); + { + if (prs->len_sum_local_refnames) { + jw_object_intmax(&jw_refs, "max_local", prs->len_max_local_refname); + jw_object_intmax(&jw_refs, "sum_local", prs->len_sum_local_refnames); + } + if (prs->len_sum_remote_refnames) { + jw_object_intmax(&jw_refs, "max_remote", prs->len_max_remote_refname); + jw_object_intmax(&jw_refs, "sum_remote", prs->len_sum_remote_refnames); + } + } + jw_end(&jw_refs); + + jw_object_inline_begin_array(&jw_refs, "requested"); + { + for (k = 0; k < survey_vec_refs_wanted.nr; k++) + jw_array_string(&jw_refs, survey_vec_refs_wanted.v[k]); + } + jw_end(&jw_refs); + + jw_object_inline_begin_array(&jw_refs, "count_by_class"); + { + struct hashmap_iter iter; + struct strmap_entry *entry; + + strintmap_for_each_entry(&prs->refsmap, &iter, entry) { + const char *key = entry->key; + intptr_t count = (intptr_t)entry->value; + int value = count; + jw_array_inline_begin_object(&jw_refs); + { + jw_object_string(&jw_refs, "class", key); + jw_object_intmax(&jw_refs, "count", value); + } + jw_end(&jw_refs); + } + } + jw_end(&jw_refs); + } + jw_end(&jw_refs); + + if (jw_top) + jw_object_sub_jw(jw_top, "refs", &jw_refs); + + if (want_trace2) + trace2_data_json("survey", the_repository, "refs", &jw_refs); + + jw_release(&jw_refs); +} + +#define JW_OBJ_INT_NZ(jw, key, value) do { if (value) jw_object_intmax((jw), (key), (value)); } while (0) + +static void write_qbin_json(struct json_writer *jw, const char *label, + struct obj_hist_bin qbin[QBIN_LEN]) +{ + struct strbuf buf = STRBUF_INIT; + uint32_t lower = 0; + uint32_t upper = QBIN_MASK; + int k; + + jw_object_inline_begin_object(jw, label); + { + for (k = 0; k < QBIN_LEN; k++) { + struct obj_hist_bin *p = &qbin[k]; + uint32_t lower_k = lower; + uint32_t upper_k = upper; + + lower = upper+1; + upper = (upper << QBIN_SHIFT) + QBIN_MASK; + + if (!p->cnt_seen) + continue; + + strbuf_reset(&buf); + strbuf_addf(&buf, "Q%02d", k); + jw_object_inline_begin_object(jw, buf.buf); + { + jw_object_intmax(jw, "count", p->cnt_seen); + jw_object_intmax(jw, "sum_size", p->sum_size); + jw_object_intmax(jw, "sum_disk_size", p->sum_disk_size); + + /* maybe only include these in verbose mode */ + jw_object_intmax(jw, "qbin_lower", lower_k); + jw_object_intmax(jw, "qbin_upper", upper_k); + } + jw_end(jw); + } + } + jw_end(jw); + + strbuf_release(&buf); +} + +static void write_hbin_json(struct json_writer *jw, const char *label, + struct obj_hist_bin hbin[HBIN_LEN]) +{ + struct strbuf buf = STRBUF_INIT; + uint32_t lower = 0; + uint32_t upper = HBIN_MASK; + int k; + + jw_object_inline_begin_object(jw, label); + { + for (k = 0; k < HBIN_LEN; k++) { + struct obj_hist_bin *p = &hbin[k]; + uint32_t lower_k = lower; + uint32_t upper_k = upper; + + lower = upper+1; + upper = (upper << HBIN_SHIFT) + HBIN_MASK; + + if (!p->cnt_seen) + continue; + + strbuf_reset(&buf); + strbuf_addf(&buf, "H%d", k); + jw_object_inline_begin_object(jw, buf.buf); + { + 
jw_object_intmax(jw, "count", p->cnt_seen); + jw_object_intmax(jw, "sum_size", p->sum_size); + jw_object_intmax(jw, "sum_disk_size", p->sum_disk_size); + + /* maybe only include these in verbose mode */ + jw_object_intmax(jw, "hbin_lower", lower_k); + jw_object_intmax(jw, "hbin_upper", upper_k); + } + jw_end(jw); + } + } + jw_end(jw); + + strbuf_release(&buf); +} + +static void write_base_object_json(struct json_writer *jw, + struct survey_stats_base_object *base) +{ + jw_object_intmax(jw, "count", base->cnt_seen); + + jw_object_intmax(jw, "sum_size", base->sum_size); + jw_object_intmax(jw, "sum_disk_size", base->sum_disk_size); + + jw_object_inline_begin_object(jw, "count_by_whence"); + { + /* + * Missing is not technically a "whence" value, but + * we don't need to clutter up the results with that + * distinction. + */ + JW_OBJ_INT_NZ(jw, "missing", base->cnt_missing); + + JW_OBJ_INT_NZ(jw, "cached", base->cnt_cached); + JW_OBJ_INT_NZ(jw, "loose", base->cnt_loose); + JW_OBJ_INT_NZ(jw, "packed", base->cnt_packed); + JW_OBJ_INT_NZ(jw, "dbcached", base->cnt_dbcached); + } + jw_end(jw); + + write_hbin_json(jw, "dist_by_size", base->size_hbin); +} + +static void write_large_item_vec_json(struct json_writer *jw, + struct large_item_vec *vec) +{ + if (!vec || !vec->nr_items) + return; + + jw_object_inline_begin_array(jw, vec->labels_json->dimension); + { + int k; + + for (k = 0; k < vec->nr_items; k++) { + struct large_item *pk = &vec->items[k]; + if (is_null_oid(&pk->oid)) + break; + + jw_array_inline_begin_object(jw); + { + jw_object_intmax(jw, vec->labels_json->item, pk->size); + jw_object_string(jw, "oid", oid_to_hex(&pk->oid)); + if (pk->name->len) + jw_object_string(jw, "name", pk->name->buf); + if (!is_null_oid(&pk->containing_commit_oid)) + jw_object_string(jw, "commit_oid", + oid_to_hex(&pk->containing_commit_oid)); + if (survey_opts.show_name_rev && + pk->name_rev && + pk->name_rev->len) + jw_object_string(jw, "name_rev", + pk->name_rev->buf); + } + jw_end(jw); + } + } + jw_end(jw); +} + +static void json_commits_section(struct json_writer *jw_top, int pretty, int want_trace2) +{ + struct survey_stats_commits *psc = &survey_stats.commits; + struct json_writer jw_commits = JSON_WRITER_INIT; + + jw_object_begin(&jw_commits, pretty); + { + write_base_object_json(&jw_commits, &psc->base); + + write_large_item_vec_json(&jw_commits, psc->vec_largest_by_nr_parents); + write_large_item_vec_json(&jw_commits, psc->vec_largest_by_size_bytes); + + jw_object_inline_begin_object(&jw_commits, "count_by_nr_parents"); + { + struct strbuf parent_key = STRBUF_INIT; + int k; + + for (k = 0; k < PBIN_VEC_LEN; k++) + if (psc->parent_cnt_pbin[k]) { + strbuf_reset(&parent_key); + strbuf_addf(&parent_key, "P%02d", k); + jw_object_intmax(&jw_commits, parent_key.buf, psc->parent_cnt_pbin[k]); + } + + strbuf_release(&parent_key); + } + jw_end(&jw_commits); + } + jw_end(&jw_commits); + + if (jw_top) + jw_object_sub_jw(jw_top, "commits", &jw_commits); + + if (want_trace2) + trace2_data_json("survey", the_repository, "commits", &jw_commits); + + jw_release(&jw_commits); +} + +static void json_trees_section(struct json_writer *jw_top, int pretty, int want_trace2) +{ + struct survey_stats_trees *pst = &survey_stats.trees; + struct json_writer jw_trees = JSON_WRITER_INIT; + + jw_object_begin(&jw_trees, pretty); + { + write_base_object_json(&jw_trees, &pst->base); + + jw_object_intmax(&jw_trees, "sum_entries", pst->sum_entries); + + write_large_item_vec_json(&jw_trees, pst->vec_largest_by_nr_entries); + 
write_large_item_vec_json(&jw_trees, pst->vec_largest_by_size_bytes); + + write_qbin_json(&jw_trees, "dist_by_nr_entries", pst->entry_qbin); + } + jw_end(&jw_trees); + + if (jw_top) + jw_object_sub_jw(jw_top, "trees", &jw_trees); + + if (want_trace2) + trace2_data_json("survey", the_repository, "trees", &jw_trees); + + jw_release(&jw_trees); +} + +static void json_blobs_section(struct json_writer *jw_top, int pretty, int want_trace2) +{ + struct survey_stats_blobs *psb = &survey_stats.blobs; + struct json_writer jw_blobs = JSON_WRITER_INIT; + + jw_object_begin(&jw_blobs, pretty); + { + write_base_object_json(&jw_blobs, &psb->base); + + write_large_item_vec_json(&jw_blobs, psb->vec_largest_by_size_bytes); + } + jw_end(&jw_blobs); + + if (jw_top) + jw_object_sub_jw(jw_top, "blobs", &jw_blobs); + + if (want_trace2) + trace2_data_json("survey", the_repository, "blobs", &jw_blobs); + + jw_release(&jw_blobs); +} + +static void survey_print_json(void) +{ + struct json_writer jw_top = JSON_WRITER_INIT; + int pretty = 1; + + jw_object_begin(&jw_top, pretty); + { + json_refs_section(&jw_top, pretty, 0); + json_commits_section(&jw_top, pretty, 0); + json_trees_section(&jw_top, pretty, 0); + json_blobs_section(&jw_top, pretty, 0); + } + jw_end(&jw_top); + + printf("%s\n", jw_top.json.buf); + + jw_release(&jw_top); +} + +static void survey_emit_trace2(void) +{ + if (!trace2_is_enabled()) + return; + + json_refs_section(NULL, 0, 1); + json_commits_section(NULL, 0, 1); + json_trees_section(NULL, 0, 1); + json_blobs_section(NULL, 0, 1); +} + +static void fmt_txt_line(struct strbuf *buf, int indent, const char *txt) +{ + if (indent) + strbuf_addchars(buf, ' ', indent); + + strbuf_addstr(buf, txt); + + strbuf_addch(buf, '\n'); +} + +static void fmt_txt_pair_ui64(struct strbuf *buf, + int indent, + const char *label, + uint64_t value) +{ + int column0 = 62; + + if (indent) + strbuf_addchars(buf, ' ', indent); + + strbuf_addf(buf, "%-*s : %14"PRIu64, + column0 - indent, label, + value); + + strbuf_addch(buf, '\n'); +} + +static void fmt_size_tbl_caption(struct strbuf *buf, + int indent, + const char *caption) +{ + strbuf_addch(buf, '\n'); + fmt_txt_line(buf, indent, caption); +} + +static void fmt_size_tbl_hdr(struct strbuf *buf, + int indent, + const char *bucket_hdr, + const char *count_hdr, + const char *size_hdr, + const char *disk_size_hdr) +{ + int column0 = 28; + + if (indent) + strbuf_addchars(buf, ' ', indent); + + strbuf_addf(buf, "%-*s | %14s | %14s | %14s", + column0 - indent, bucket_hdr, + count_hdr, size_hdr, disk_size_hdr); + + strbuf_addch(buf, '\n'); +} + +static void fmt_size_tbl_hr(struct strbuf *buf, + int indent) +{ + int column0 = 28; + + if (indent) + strbuf_addchars(buf, ' ', indent); + + strbuf_addchars(buf, '-', column0 - indent); + strbuf_addstr(buf, "-+-"); + strbuf_addchars(buf, '-', 14); + strbuf_addstr(buf, "-+-"); + strbuf_addchars(buf, '-', 14); + strbuf_addstr(buf, "-+-"); + strbuf_addchars(buf, '-', 14); + + strbuf_addch(buf, '\n'); +} + +static void fmt_size_tbl_row(struct strbuf *buf, + int indent, + const char *bucket, + uint64_t count, + uint64_t size, + uint64_t disk_size) +{ + int column0 = 28; + + if (indent) + strbuf_addchars(buf, ' ', indent); + + strbuf_addf(buf, "%-*s | %14"PRIu64" | %14"PRIu64" | %14"PRIu64, + column0 - indent, bucket, count, size, disk_size); + + strbuf_addch(buf, '\n'); +} + +static void fmt_qbin(struct strbuf *buf, + int indent, const char *title_caption, + const char *bucket_hdr, + struct obj_hist_bin qbin[QBIN_LEN]) +{ + struct strbuf bucket 
= STRBUF_INIT; + uint64_t lower = 0; + uint64_t upper = QBIN_MASK; + int k; + + fmt_size_tbl_caption(buf, indent, title_caption); + fmt_size_tbl_hr(buf, indent); + fmt_size_tbl_hdr(buf, indent, bucket_hdr, "Count", "Size", "Disk Size"); + fmt_size_tbl_hr(buf, indent); + + for (k = 0; k < QBIN_LEN; k++) { + struct obj_hist_bin *p = &qbin[k]; + uint64_t lower_k = lower; + uint64_t upper_k = upper; + + lower = upper+1; + upper = (upper << QBIN_SHIFT) + QBIN_MASK; + + if (!p->cnt_seen) + continue; + + strbuf_reset(&bucket); + strbuf_addf(&bucket, "%"PRIu64"..%"PRIu64, lower_k, upper_k); + + fmt_size_tbl_row(buf, indent, bucket.buf, + p->cnt_seen, p->sum_size, p->sum_disk_size); + } + fmt_size_tbl_hr(buf, indent); + + strbuf_release(&bucket); +} + +static void fmt_hbin(struct strbuf *buf, + int indent, const char *title_caption, + const char *bucket_hdr, + struct obj_hist_bin hbin[HBIN_LEN]) +{ + struct strbuf bucket = STRBUF_INIT; + uint64_t lower = 0; + uint64_t upper = HBIN_MASK; + int k; + + fmt_size_tbl_caption(buf, indent, title_caption); + fmt_size_tbl_hr(buf, indent); + fmt_size_tbl_hdr(buf, indent, bucket_hdr, "Count", "Size", "Disk Size"); + fmt_size_tbl_hr(buf, indent); + + for (k = 0; k < HBIN_LEN; k++) { + struct obj_hist_bin *p = &hbin[k]; + uint64_t lower_k = lower; + uint64_t upper_k = upper; + + lower = upper+1; + upper = (upper << HBIN_SHIFT) + HBIN_MASK; + + if (!p->cnt_seen) + continue; + + strbuf_reset(&bucket); + strbuf_addf(&bucket, "%"PRIu64"..%"PRIu64, lower_k, upper_k); + + fmt_size_tbl_row(buf, indent, bucket.buf, + p->cnt_seen, p->sum_size, p->sum_disk_size); + } + fmt_size_tbl_hr(buf, indent); + + strbuf_release(&bucket); +} + +static void fmt_pbin_hdr(struct strbuf *buf, + int indent, + const char *bucket, + const char *count) +{ + int column0 = 28; + + if (indent) + strbuf_addchars(buf, ' ', indent); + + strbuf_addf(buf, "%-*s | %14s", + column0 - indent, bucket, + count); + + strbuf_addch(buf, '\n'); +} + +static void fmt_pbin_hr(struct strbuf *buf, + int indent) +{ + int column0 = 28; + + if (indent) + strbuf_addchars(buf, ' ', indent); + + strbuf_addchars(buf, '-', column0 - indent); + strbuf_addstr(buf, "-+-"); + strbuf_addchars(buf, '-', 14); + + strbuf_addch(buf, '\n'); +} + +static void fmt_pbin_row(struct strbuf *buf, + int indent, + int nr, + int count) +{ + struct strbuf bucket = STRBUF_INIT; + int column0 = 28; + + if (indent) + strbuf_addchars(buf, ' ', indent); + + strbuf_addf(&bucket, "%2d", nr); + strbuf_addf(buf, "%-*s | %14d", + column0 - indent, bucket.buf, + count); + + strbuf_addch(buf, '\n'); + strbuf_release(&bucket); +} + +static void fmt_base_object(struct strbuf *buf, + int indent, + struct survey_stats_base_object *base) +{ + int indent1 = indent + 4; + + fmt_txt_pair_ui64(buf, indent, "Total Count", base->cnt_seen); + + strbuf_addch(buf, '\n'); + fmt_txt_line(buf, indent, "Count by Storage Location"); + if (base->cnt_missing) + fmt_txt_pair_ui64(buf, indent1, "Missing", base->cnt_missing); + if (base->cnt_cached) + fmt_txt_pair_ui64(buf, indent1, "Cached", base->cnt_cached); + if (base->cnt_loose) + fmt_txt_pair_ui64(buf, indent1, "Loose", base->cnt_loose); + if (base->cnt_packed) + fmt_txt_pair_ui64(buf, indent1, "Packed", base->cnt_packed); + if (base->cnt_dbcached) + fmt_txt_pair_ui64(buf, indent1, "DBCached", base->cnt_dbcached); + + strbuf_addch(buf, '\n'); + fmt_txt_pair_ui64(buf, indent, "Total Size in Bytes", base->sum_size); + fmt_txt_pair_ui64(buf, indent, "Total Disk Size in Bytes", base->sum_disk_size); + + fmt_hbin(buf, 
indent, "Histogram by Size in Bytes", "Byte Range", base->size_hbin); +} + +static void fmt_large_item_hdr(struct strbuf *buf, + int indent, + int name_length, + int name_rev_length, + const char *item_hdr_label) +{ + int column0 = the_hash_algo->hexsz; + + if (indent) + strbuf_addchars(buf, ' ', indent); + + strbuf_addf(buf, "%-*s | %14s", column0, "OID", item_hdr_label); + if (name_length) + strbuf_addf(buf, " | %-*s", name_length, "Name"); + if (name_rev_length) + strbuf_addf(buf, " | %-*s", name_rev_length, "Commit / Name Rev"); + + strbuf_addch(buf, '\n'); +} + +static void fmt_large_item_hr(struct strbuf *buf, + int indent, + int name_length, + int name_rev_length) +{ + int column0 = the_hash_algo->hexsz; + + if (indent) + strbuf_addchars(buf, ' ', indent); + + strbuf_addchars(buf, '-', column0); + strbuf_addstr(buf, "-+-"); + strbuf_addchars(buf, '-', 14); + if (name_length) { + strbuf_addstr(buf, "-+-"); + strbuf_addchars(buf, '-', name_length); + } + if (name_rev_length) { + strbuf_addstr(buf, "-+-"); + strbuf_addchars(buf, '-', name_rev_length); + } + + strbuf_addch(buf, '\n'); +} + +static void fmt_large_item_row(struct strbuf *buf, + int indent, + int name_length, + int name_rev_length, + struct large_item *pitem) +{ + int column0 = the_hash_algo->hexsz; + + if (indent) + strbuf_addchars(buf, ' ', indent); + + strbuf_addf(buf, "%-*s | %14"PRIu64, + column0, oid_to_hex(&pitem->oid), + pitem->size); + if (name_length) + strbuf_addf(buf, " | %-*s", name_length, + (pitem->name ? pitem->name->buf: "")); + if (name_rev_length) + strbuf_addf(buf, " | %-*s", name_rev_length, + ((pitem->name_rev) + ? pitem->name_rev->buf + : oid_to_hex(&pitem->containing_commit_oid))); + + strbuf_addch(buf, '\n'); +} + +static void fmt_large_item_vec(struct strbuf *buf, + int indent, + struct large_item_vec *pvec) +{ + int name_length = 0; + int name_rev_length = 0; + int k; + + if (pvec->type != OBJ_COMMIT) { + /* Add "Name" column for trees and blobs. This is relative pathname. 
*/ + for (k = 0; k < pvec->nr_items; k++) + if (pvec->items[k].name && pvec->items[k].name->len > name_length) + name_length = pvec->items[k].name->len; + if (name_length) + if (name_length < 4) /* strlen("Name") */ + name_length = 4; + } + + if (survey_opts.show_name_rev) { + name_rev_length = 17; /* strlen("Commit / Name Rev") */ + for (k = 0; k < pvec->nr_items; k++) { + struct large_item *pk = &pvec->items[k]; + if (pk->name_rev && pk->name_rev->len > name_rev_length) + name_rev_length = pk->name_rev->len; + } + } else if (pvec->type != OBJ_COMMIT) { + /* for trees and blobs, just show containing commit OID */ + name_rev_length = the_hash_algo->hexsz; + } + + strbuf_addch(buf, '\n'); + fmt_txt_line(buf, indent, pvec->labels_pretty->dimension); + fmt_large_item_hr(buf, indent, name_length, name_rev_length); + fmt_large_item_hdr(buf, indent, name_length, name_rev_length, pvec->labels_pretty->item); + fmt_large_item_hr(buf, indent, name_length, name_rev_length); + + for (k = 0; k < pvec->nr_items; k++) { + struct large_item *pk = &pvec->items[k]; + if (is_null_oid(&pk->oid)) + break; + + fmt_large_item_row(buf, indent, name_length, name_rev_length, pk); + } + + fmt_large_item_hr(buf, indent, name_length, name_rev_length); +} + +static void pretty_print_survey_hdr(void) +{ + struct strbuf buf = STRBUF_INIT; + int indent = 0; + int k; + + const char *intro[] = { + "", + "===============================================================================", + "Git Survey Results", + "===============================================================================", + "", + NULL + }; + + k = 0; + while (intro[k]) + fmt_txt_line(&buf, indent, intro[k++]); + + /* + * NEEDSWORK: Consider adding information about the repo pathname, + * the date, command line args, git version, etc. 
+ */ + + fwrite(buf.buf, 1, buf.len, stdout); + strbuf_release(&buf); +} + +static void pretty_print_overview(int indent) +{ + struct survey_stats_refs *prs = &survey_stats.refs; + struct survey_stats_commits *psc = &survey_stats.commits; + struct survey_stats_trees *pst = &survey_stats.trees; + struct survey_stats_blobs *psb = &survey_stats.blobs; + struct strbuf buf = STRBUF_INIT; + int indent1 = indent + 4; + int indent2 = indent + 8; + int k; + + const char *intro[] = { + "", + "OVERVIEW", + "-------------------------------------------------------------------------------", + "", + NULL + }; + + k = 0; + while (intro[k]) + fmt_txt_line(&buf, indent, intro[k++]); + + fmt_txt_pair_ui64(&buf, indent1, "Total Number of Refs", prs->cnt_total); + + fmt_size_tbl_caption(&buf, indent1, "Overview by Object Type"); + + fmt_size_tbl_hr(&buf, indent1); + fmt_size_tbl_hdr(&buf, indent1, "Type", "Count", "Size", "Disk Size"); + fmt_size_tbl_hr(&buf, indent1); + + fmt_size_tbl_row(&buf, indent2, "Commits", psc->base.cnt_seen, psc->base.sum_size, psc->base.sum_disk_size); + fmt_size_tbl_row(&buf, indent2, "Trees", pst->base.cnt_seen, pst->base.sum_size, pst->base.sum_disk_size); + fmt_size_tbl_row(&buf, indent2, "Blobs", psb->base.cnt_seen, psb->base.sum_size, psb->base.sum_disk_size); + + fmt_size_tbl_hr(&buf, indent1); + fmt_size_tbl_row(&buf, indent1, "Total", + psc->base.cnt_seen + pst->base.cnt_seen + psb->base.cnt_seen, + psc->base.sum_size + pst->base.sum_size + psb->base.sum_size, + psc->base.sum_disk_size + pst->base.sum_disk_size + psb->base.sum_disk_size); + fmt_size_tbl_hr(&buf, indent1); + + strbuf_addch(&buf, '\n'); + fwrite(buf.buf, 1, buf.len, stdout); + strbuf_release(&buf); +} + +/* + * Pretty print information on the set of REFS that we examined. 
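+ *
+ * This covers the total ref count, the count by type (branches,
+ * tags, remotes, etc.), the count by class, the count by storage
+ * location (loose vs packed), and refname-length statistics.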
+ */ +static void pretty_print_refs(int indent) +{ + struct survey_refs_wanted *prw = &survey_opts.refs; + struct survey_stats_refs *prs = &survey_stats.refs; + struct strbuf buf = STRBUF_INIT; + int indent1 = indent + 4; + int indent2 = indent + 8; + int indent3 = indent + 12; + int k; + + const char *intro[] = { + "", + "REFS", + "-------------------------------------------------------------------------------", + "", + NULL + }; + + k = 0; + while (intro[k]) + fmt_txt_line(&buf, indent, intro[k++]); + + fmt_txt_pair_ui64(&buf, indent1, "Total Number of Refs", prs->cnt_total); + + strbuf_addch(&buf, '\n'); + fmt_txt_line(&buf, indent1, "Reference Count by Type"); + + if (prw->want_remotes && prs->cnt_remotes) + fmt_txt_pair_ui64(&buf, indent2, "Remote Tracking Branches", prs->cnt_remotes); + + if (prw->want_branches && prs->cnt_branches) + fmt_txt_pair_ui64(&buf, indent2, "Branches", prs->cnt_branches); + if (prw->want_tags && prs->cnt_lightweight_tags) + fmt_txt_pair_ui64(&buf, indent2, "Tags (Lightweight)", prs->cnt_lightweight_tags); + if (prw->want_tags && prs->cnt_annotated_tags) + fmt_txt_pair_ui64(&buf, indent2, "Tags (Annotated)", prs->cnt_annotated_tags); + if (prw->want_detached && prs->cnt_detached) + fmt_txt_pair_ui64(&buf, indent2, "Detached", prs->cnt_detached); + if (prw->want_other && prs->cnt_other) + fmt_txt_pair_ui64(&buf, indent2, "Other (Notes and Stashes)", prs->cnt_other); + + if (prs->cnt_symref) + fmt_txt_pair_ui64(&buf, indent2, "Symbolic Refs (like 'HEAD')", prs->cnt_symref); + + strbuf_addch(&buf, '\n'); + fmt_txt_pair_ui64(&buf, indent1, "Reference Count by Class", strintmap_get_size(&prs->refsmap)); + { + struct hashmap_iter iter; + struct strmap_entry *entry; + + strintmap_for_each_entry(&prs->refsmap, &iter, entry) { + const char *key = entry->key; + intptr_t count = (intptr_t)entry->value; + int value = count; + + fmt_txt_pair_ui64(&buf, indent2, key, value); + } + } + + strbuf_addch(&buf, '\n'); + fmt_txt_line(&buf, indent1, "Reference Count by Storage Location"); + fmt_txt_pair_ui64(&buf, indent2, "Loose", prs->cnt_loose); + fmt_txt_pair_ui64(&buf, indent2, "Packed", prs->cnt_packed); + + strbuf_addch(&buf, '\n'); + fmt_txt_line(&buf, indent1, "String Length of Refnames"); + if (prs->len_sum_remote_refnames) { + fmt_txt_line(&buf, indent2, "Remote Refs"); + fmt_txt_pair_ui64(&buf, indent3, "Max", prs->len_max_remote_refname); + fmt_txt_pair_ui64(&buf, indent3, "Sum", prs->len_sum_remote_refnames); + } + if (prs->len_sum_local_refnames) { + fmt_txt_line(&buf, indent2, "Local Refs"); + fmt_txt_pair_ui64(&buf, indent3, "Max", prs->len_max_local_refname); + fmt_txt_pair_ui64(&buf, indent3, "Sum", prs->len_sum_local_refnames); + } + + strbuf_addch(&buf, '\n'); + fwrite(buf.buf, 1, buf.len, stdout); + strbuf_release(&buf); +} + +static void pretty_print_commits(int indent) +{ + struct survey_stats_commits *psc = &survey_stats.commits; + struct survey_stats_base_object *base = &psc->base; + struct strbuf buf = STRBUF_INIT; + int indent1 = indent + 4; + int k; + + const char *intro[] = { + "", + "COMMITS", + "-------------------------------------------------------------------------------", + "", + NULL + }; + + k = 0; + while (intro[k]) + fmt_txt_line(&buf, indent, intro[k++]); + + fmt_base_object(&buf, indent1, base); + + fmt_large_item_vec(&buf, indent1, psc->vec_largest_by_size_bytes); + + strbuf_addch(&buf, '\n'); + fmt_txt_line(&buf, indent1, "Histogram by Number of Parents"); + fmt_pbin_hr(&buf, indent1); + fmt_pbin_hdr(&buf, indent1, "Parents", "Count"); 
+ fmt_pbin_hr(&buf, indent1); + for (k = 0; k < PBIN_VEC_LEN; k++) + if (psc->parent_cnt_pbin[k]) + fmt_pbin_row(&buf, indent1, k, psc->parent_cnt_pbin[k]); + fmt_pbin_hr(&buf, indent1); + + fmt_large_item_vec(&buf, indent1, psc->vec_largest_by_nr_parents); + + strbuf_addch(&buf, '\n'); + fwrite(buf.buf, 1, buf.len, stdout); + strbuf_release(&buf); +} + +static void pretty_print_trees(int indent) +{ + struct survey_stats_trees *pst = &survey_stats.trees; + struct survey_stats_base_object *base = &pst->base; + struct strbuf buf = STRBUF_INIT; + int indent1 = indent + 4; + int k; + + const char *intro[] = { + "", + "TREES", + "-------------------------------------------------------------------------------", + "", + NULL + }; + + k = 0; + while (intro[k]) + fmt_txt_line(&buf, indent, intro[k++]); + + fmt_base_object(&buf, indent1, base); + + fmt_large_item_vec(&buf, indent1, pst->vec_largest_by_size_bytes); + + fmt_qbin(&buf, indent1, "Tree Histogram by Number of Entries", "Entry Range", pst->entry_qbin); + fmt_large_item_vec(&buf, indent1, pst->vec_largest_by_nr_entries); + + strbuf_addch(&buf, '\n'); + fwrite(buf.buf, 1, buf.len, stdout); + strbuf_release(&buf); +} + +static void pretty_print_blobs(int indent) +{ + struct survey_stats_blobs *psb = &survey_stats.blobs; + struct survey_stats_base_object *base = &psb->base; + struct strbuf buf = STRBUF_INIT; + int indent1 = indent + 4; + int k; + + const char *intro[] = { + "", + "BLOBS", + "-------------------------------------------------------------------------------", + "", + NULL + }; + + k = 0; + while (intro[k]) + fmt_txt_line(&buf, indent, intro[k++]); + + fmt_base_object(&buf, indent1, base); + + fmt_large_item_vec(&buf, indent1, psb->vec_largest_by_size_bytes); + + strbuf_addch(&buf, '\n'); + fwrite(buf.buf, 1, buf.len, stdout); + strbuf_release(&buf); +} + +/* + * Print all of the stats that we have collected in a more pretty format. + */ +static void survey_print_results_pretty(void) +{ + pretty_print_survey_hdr(); + pretty_print_overview(0); + pretty_print_refs(0); + pretty_print_commits(0); + pretty_print_trees(0); + pretty_print_blobs(0); +} + +int cmd_survey(int argc, const char **argv, const char *prefix) +{ + survey_load_config(); + + argc = parse_options(argc, argv, prefix, survey_options, survey_usage, 0); + + color_fprintf_ln(stderr, + want_color_fd(2, GIT_COLOR_AUTO) ? GIT_COLOR_YELLOW : "", + "(THIS IS EXPERIMENTAL, EXPECT THE OUTPUT FORMAT TO CHANGE!)"); + + prepare_repo_settings(the_repository); + + if (survey_opts.show_progress < 0) + survey_opts.show_progress = isatty(2); + fixup_refs_wanted(); + + alloc_commit_by_parents(); + alloc_commit_by_size(); + alloc_tree_by_entries(); + alloc_tree_by_size(); + alloc_blob_by_size(); + + survey_phase_refs(the_repository); + + survey_emit_trace2(); + if (survey_opts.show_json) + survey_print_json(); + else + survey_print_results_pretty(); + + strvec_clear(&survey_vec_refs_wanted); + free_large_item_vec(survey_stats.commits.vec_largest_by_nr_parents); + free_large_item_vec(survey_stats.commits.vec_largest_by_size_bytes); + free_large_item_vec(survey_stats.trees.vec_largest_by_nr_entries); + free_large_item_vec(survey_stats.trees.vec_largest_by_size_bytes); + free_large_item_vec(survey_stats.blobs.vec_largest_by_size_bytes); + + return 0; +} + +/* + * NEEDSWORK: So far, I only have iteration on the requested set of + * refs and treewalk/reachable objects on that set of refs. The + * following is a bit of a laundry list of things that I'd like to + * add. 
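+ *
+ * (For context: a sketch of the current command surface that the
+ * items below would extend. These options are the ones parsed
+ * today, but the command is EXPERIMENTAL and they may change:
+ *
+ *     git survey --branches --tags --commit-parents=5 --blob-sizes=5
+ *     git survey --all-refs --json >survey.json
+ *
+ * Everything in this list builds on those runs.)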
+ *
+ * [] Dump stats on all of the packfiles. The number and size of each.
+ *    Whether each is in the .git directory or in an alternate. The
+ *    state of the IDX or MIDX files, etc. Delta chain stats. All
+ *    of this data is relative to the "lived-in" state of the
+ *    repository. Stuff that may change after a GC or repack.
+ *
+ * [] Clone and Index stats. Partial, shallow, sparse-checkout,
+ *    sparse-index, etc. Hydration stats.
+ *
+ * [] Dump stats on each remote. When we fetch from a remote the size
+ *    of the response is related to the set of haves on the server.
+ *    You can see this in `GIT_TRACE_CURL=1 git fetch`. We get a
+ *    `ls-refs` payload that lists all of the branches and tags on the
+ *    server, so at a minimum the RefName and SHA for each. But for
+ *    annotated tags we also get the peeled SHA. The size of this
+ *    overhead on every fetch is proportional to the size of the `git
+ *    ls-remote` response (roughly, although the latter repeats the
+ *    RefName of the peeled tag). If, for example, you have 500K refs
+ *    on a remote, you're going to have a long "haves" message, so
+ *    every fetch will be slow just because of that overhead (not
+ *    counting new objects to be downloaded). (A quick way to estimate
+ *    this payload is sketched at the end of this list.)
+ *
+ *    Note that the local set of tags in "refs/tags/" is a union over
+ *    all remotes. However, since most people only have one remote,
+ *    we can probably estimate the overhead value directly from the
+ *    size of the set of "refs/tags/" that we visited while building
+ *    the `ref_info` and `ref_array` and not need to ask the remote.
+ *
+ * [] Should the "string length of refnames / remote refs", for
+ *    example, be sub-divided by remote so we can project the
+ *    cost of the haves/wants overhead of a fetch?
+ *
+ * [] Can we examine the merge commits and classify them as clean or
+ *    dirty? (i.e. ones with merge conflicts that needed to be
+ *    addressed during the merge itself.)
+ *
+ * [] Do dirty merges affect performance of later operations?
+ *
+ * [] Dump info on the complexity of the DAG. Criss-cross merges.
+ *    The number of edges that must be touched to compute merge bases.
+ *    Edge length. The number of parallel lanes in the history that
+ *    must be navigated to get to the merge base. What affects the
+ *    cost of the Ahead/Behind computation? How often do
+ *    criss-crosses occur and do they cause various operations to slow
+ *    down?
+ *
+ * [] If there are primary branches (like "main" or "master") are they
+ *    always on the left side of merges? Does the graph have a clean
+ *    left edge? Or are there normal and "backwards" merges? Do
+ *    these cause problems at scale?
+ *
+ * [] If we have a hierarchy of FI/RI branches like "L1", "L2", ...,
+ *    can we learn anything about the shape of the repo around these
+ *    FI and RI integrations?
+ *
+ * [] Do we need a no-PII flag to omit pathnames or branch/tag names
+ *    in the various histograms? (This would turn off --name-rev
+ *    too.)
+ *
+ * [] I have so far avoided adding opinions about individual fields
+ *    (such as the way `git-sizer` prints a row of stars or bangs in
+ *    the last column).
+ *
+ *    I'm wondering if that is a job for this executable or if it
+ *    should be done in a post-processing step using the JSON output.
+ *
+ *    My problem with the `git-sizer` approach is that it doesn't give
+ *    the (casual) user any information on why it has stars or bangs.
+ *    And there isn't a good way to print detailed information in the
+ *    ASCII-art tables that would be easy to understand.
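+ *
+ * (A sketch of such a post-processing step, using only top-level
+ * JSON fields that the t8100 tests already depend on, and assuming
+ * `jq` is available:
+ *
+ *     git survey --json |
+ *         jq '{commits: .commits.count, trees: .trees.count,
+ *              blobs: .blobs.count}'
+ *
+ * i.e. the opinion/annotation layer could be a small consumer of the
+ * JSON output rather than part of this command.)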
+ *
+ * [] For example, a large number of refs does not define a cliff.
+ *    Performance will drop off (linearly, quadratically, ... ??).
+ *    The tool should refer the user to article(s) talking about the
+ *    different problems that it could cause. So should `git
+ *    survey` just print the number and (implicitly) refer them to
+ *    the man page (chapter/verse) or to a tool that will interpret
+ *    the number and explain it?
+ *
+ * [] Alternatively, should `git survey` do that analysis too and
+ *    just print footnotes for each large number?
+ *
+ * [] The computation of the raw survey JSON data can take HOURS on
+ *    a very large repo (like Windows), so I'm wondering if we
+ *    want to keep the opinion portion separate.
+ *
+ * [] In addition to opinions based on the static data, I would like
+ *    to dump the JSON results (or the Trace2 telemetry) into a DB and
+ *    aggregate it with other users.
+ *
+ *    Granted, they should all see the same DAG and the same set of
+ *    reachable objects, but we could average across all datasets
+ *    generated on a particular date and detect outlier users.
+ *
+ *    [] Maybe someone cloned from the `_full` endpoint rather than
+ *       the limited refs endpoint.
+ *
+ *    [] Maybe that user is having problems with repacking / GC /
+ *       maintenance without knowing it.
+ *
+ * [] I'd also like to use the DB to compare survey datasets over
+ *    time. How fast is their repository growing and in what ways?
+ *
+ *    [] I'd rather have the delta analysis NOT be inside `git
+ *       survey`, so it makes sense to consider having all of it in a
+ *       post-process step.
+ *
+ * [] Another reason to put the opinion analysis in a post-process
+ *    is that it would be easier to generate plots on the data tables.
+ *    Granted, we can get plots from telemetry, but a stand-alone user
+ *    could run the JSON through python or jq or something and generate
+ *    something nicer than ASCII-art, and it could handle cross-referencing
+ *    and hyperlinking to helpful information on each issue.
+ *
+ * [] I think there are several classes of data that we can report on:
+ *
+ *    [] The "inherent repo properties", such as the shape and size of
+ *       the DAG -- these should be universal in each enlistment.
+ *
+ *    [] The "ODB lived-in properties", such as the efficiency
+ *       of the repack and things like partial and shallow clone.
+ *       These will vary, but indicate health of the ODB.
+ *
+ *    [] The "index related properties", such as sparse-checkout,
+ *       sparse-index, cache-tree, untracked-cache, fsmonitor, etc.
+ *       These will also vary, but are more like knobs for
+ *       the user to adjust.
+ *
+ * [] I want to compare these with Matt's "dimensions of scale"
+ *    notes and see if there are other pieces of data that we
+ *    could compute/consider.
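+ *
+ * (Sketch for the fetch-overhead item above: since that overhead is
+ * roughly proportional to the `git ls-remote` response, a crude
+ * estimate of the payload for one remote is:
+ *
+ *     git ls-remote origin | wc -c
+ *
+ * where "origin" is a placeholder for whichever remote is being
+ * measured.)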
+ *
+ */
diff --git a/command-list.txt b/command-list.txt
index e0bb87b3b5c278..d389561a5f1161 100644
--- a/command-list.txt
+++ b/command-list.txt
@@ -186,6 +186,7 @@ git-stash                               mainporcelain
 git-status                              mainporcelain           info
 git-stripspace                          purehelpers
 git-submodule                           mainporcelain
+git-survey                              mainporcelain
 git-svn                                 foreignscminterface
 git-switch                              mainporcelain           history
 git-symbolic-ref                        plumbingmanipulators
diff --git a/git.c b/git.c
index e35af9b0e5e976..7c70fff6218d90 100644
--- a/git.c
+++ b/git.c
@@ -622,6 +622,7 @@ static struct cmd_struct commands[] = {
 	{ "status", cmd_status, RUN_SETUP | NEED_WORK_TREE },
 	{ "stripspace", cmd_stripspace },
 	{ "submodule--helper", cmd_submodule__helper, RUN_SETUP },
+	{ "survey", cmd_survey, RUN_SETUP },
 	{ "switch", cmd_switch, RUN_SETUP | NEED_WORK_TREE },
 	{ "symbolic-ref", cmd_symbolic_ref, RUN_SETUP },
 	{ "tag", cmd_tag, RUN_SETUP | DELAY_PAGER_CONFIG },
diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh
new file mode 100755
index 00000000000000..f74e8743b6b746
--- /dev/null
+++ b/t/t8100-git-survey.sh
@@ -0,0 +1,98 @@
+#!/bin/sh
+
+test_description='measure repository and report on scaling dimensions'
+
+. ./test-lib.sh
+
+perl -MJSON::PP -e 0 >/dev/null 2>&1 && test_set_prereq JSON_PP
+
+test_expect_success JSON_PP 'verify zero counts before initial commit' '
+	test_when_finished "rm -rf data.json actual* expect*" &&
+
+	git survey --json >data.json &&
+
+	# Verify that there are no refs and no objects of any kind.
+	#
+	perl "$TEST_DIRECTORY/t8100/survey_parse_json.perl" count <data.json >actual.count &&
+	cat >expect.count <<-\EOF &&
+	refs.count:0
+	commits.count:0
+	trees.count:0
+	blobs.count:0
+	EOF
+	test_cmp expect.count actual.count &&
+
+	# Verify that each of the histograms and large-item arrays are empty.
+	# This is mainly to test the perl script, since `git survey` will generate
+	# JSON with empty objects and arrays and will get parsed into empty hashes
+	# and arrays which behave differently in perl.
+	#
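+	# (All helper invocations in this file follow the same pattern:
+	# the saved JSON is fed to the helper on stdin and its flattened
+	# "key:value" output is compared against an expect file.)
+	#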
+ # + perl "$TEST_DIRECTORY/t8100/survey_parse_json.perl" \ + commits.mostparents \ + commits.histparents \ + trees.histentries \ + trees.mostentries \ + blobs.histsize \ + blobs.largest \ + actual.empty && + cat >expect.empty <<-\EOF && + EOF + test_cmp expect.empty actual.empty +' + +test_expect_success 'initial commit' ' + test_when_finished "rm -rf data.json actual* expect*" && + + touch file0 && + git add file* && + git commit -m "initial" && + + git survey --json >data.json && + + perl "$TEST_DIRECTORY/t8100/survey_parse_json.perl" count actual.count && + cat >expect.count <<-\EOF && + refs.count:1 + commits.count:1 + trees.count:1 + blobs.count:1 + EOF + test_cmp expect.count actual.count && + + perl "$TEST_DIRECTORY/t8100/survey_parse_json.perl" commits.mostparents actual-mp && + cat >expect-mp <<-\EOF && + commits.mostparents[0].nr_parents:0 + EOF + test_cmp expect-mp actual-mp && + + perl "$TEST_DIRECTORY/t8100/survey_parse_json.perl" commits.histparents actual-hp && + cat >expect-hp <<-\EOF && + commits.histparents[P00].count:1 + EOF + test_cmp expect-hp actual-hp && + + perl "$TEST_DIRECTORY/t8100/survey_parse_json.perl" trees.histentries actual-he && + cat >expect-he <<-\EOF && + trees.histentries.Q00.count:1 + EOF + test_cmp expect-he actual-he && + + perl "$TEST_DIRECTORY/t8100/survey_parse_json.perl" trees.mostentries actual-me && + cat >expect-me <<-\EOF && + trees.mostentries[0].nr_entries:1 + EOF + test_cmp expect-me actual-me && + + perl "$TEST_DIRECTORY/t8100/survey_parse_json.perl" blobs.histsize actual-hs && + cat >expect-hs <<-\EOF && + blobs.histsize.H0.count:1 + EOF + test_cmp expect-hs actual-hs && + + perl "$TEST_DIRECTORY/t8100/survey_parse_json.perl" blobs.largest actual-lb && + cat >expect-lb <<-\EOF && + blobs.largest[0].size:0 + EOF + test_cmp expect-lb actual-lb +' + +test_done diff --git a/t/t8100/survey_parse_json.perl b/t/t8100/survey_parse_json.perl new file mode 100644 index 00000000000000..5ac1d4dc732536 --- /dev/null +++ b/t/t8100/survey_parse_json.perl @@ -0,0 +1,520 @@ +#!/usr/bin/perl +# +# Parse the JSON output generated by `git survey --json` to +# support the actual unit tests in the shell script. + +use strict; +use warnings; +use JSON::PP; +use Data::Dumper; + +$Data::Dumper::Sortkeys = 1; +$Data::Dumper::Indent = 1; +$Data::Dumper::Purity = 1; +$Data::Dumper::Pair = ':'; + +my $stdin = join("", ); +my $data = decode_json $stdin; + +#my $dump = Dumper($data); +#print $dump; + +# Create a series of functions / command line args to extract certain +# key values so that the shell script can verify them. +# +# (1) The full JSON data set contains too much data to sanely test in +# a shell script +# +# (2) Some JSON fields are fundamental/foundational, like the number +# of objects, the size of the largest item, or the pathname of the +# largest item. But others are transient, like whether an object or +# ref is packed or loose. And then there are some really transient +# values, like the SHAs of commits when we don't control for the +# data/time. So for simplicity our shell script test will verify the +# basics and not try to do an exact match on the entire data set. +# +# (3) Most of the functionality in `git survey` comes from the various +# existing iterators, for example to enumerate the desired set of refs +# and to treewalk the set of reachable commits, trees, and blobs and +# we are just using iterator callbacks to collect data on the repo. 
+# We do not need to verify the correctness of the iterator code; we +# just need to verify that we've used it properly when we collected +# our stats. + +# Print various '....count' values from the JSON data. +# +# We assume that the JSON looks like: +# +# { +# ... +# "refs": { +# "count": 3545, +# ... +# }, +# "commits": { +# "count": 197615, +# ... +# }, +# "trees": { +# "count": 331409, +# ... +# }, +# "blobs": { +# "count": 191847, +# ... +# }, +# ... +# } +# +# And we want to emit: +# +# refs.count:3545 +# commits.count:197615 +# trees.count:331409 +# blobs.count:191847 +# +sub Count { + print "refs.count:$data->{'refs'}->{'count'}\n"; + print "commits.count:$data->{'commits'}->{'count'}\n"; + print "trees.count:$data->{'trees'}->{'count'}\n"; + print "blobs.count:$data->{'blobs'}->{'count'}\n"; +} + +# We currently do not expose the "commits.dist_by_size.*" histogram +# for testing. The data is valid, but sensitive to the length of the +# SHAs of the parent commits and root tree and the length of the text +# of the commit message. This is not very interesting and we'll test +# the histogram construction for the other types of objects. +# +# { +# ... +# "commits": { +# "count": 197615, +# ... +# "dist_by_size": { +# "H1": { +# "count": 2268, +# "sum_size": 549925, +# "sum_disk_size": 388778, +# "hbin_lower": 16, +# "hbin_upper": 255 +# }, +# "H2": { +# "count": 194926, +# "sum_size": 138557614, +# "sum_disk_size": 76535965, +# "hbin_lower": 256, +# "hbin_upper": 4095 +# }, +# ... +# }, +# ... +# }, + +# We also do not expose the "commits.largest_commits_by_size_bytes" +# array for testing. This is also sensitive to the length of the SHAs +# and the commit message. We'll explore the histogram construction in +# other types of objects below. +# +# { +# ... +# "commits": { +# "count": 197615, +# ... +# "largest_commits_by_size_bytes": [ +# { +# "size": 78970, +# "oid": "0ab955aac3217bdc64a5df6dd747e8a2238f0473", +# ... +# }, +# { +# "size": 25831, +# "oid": "e74f1e05be5adb88b1d3b282fa500e15b3b04aa7", +# ... +# }, +# ... +# }, +# ... +# }, + + +# Print details for "Largest Commits by Number of Parents". This is +# an array sorted in descending order. For multiple commits with the +# same number of parents, the relative order is undefined. +# +# We assume that the JSON looks like: +# +# { +# ... +# "commits": { +# "count": 197615, +# ... +# "largest_commits_by_nr_parents": [ +# { +# "nr_parents": 10, +# "oid": "16d7601e176cd53f3c2f02367698d06b85e08879", +# ... +# }, +# { +# "nr_parents": 6, +# "oid": "d425142e2a045a9dd7879d028ec68bd748df48a3", +# ... +# }, +# ... +# ], +# ... +# }, +# +# And we want to emit: +# +# commits.mostparents[0].nr_parents:10 +# commits.mostparents[1].nr_parents:6 +# ... +# +sub CommitsMostParents { + my $nr_items = scalar @{ $data->{'commits'}->{'largest_commits_by_nr_parents'} }; + if ($nr_items == 0) { + return 0; + } + my @arr = @{ $data->{'commits'}->{'largest_commits_by_nr_parents'} }; + my $k; + for ($k=0; $k < $nr_items; $k++) { + print "commits.mostparents[$k].nr_parents:$arr[$k]->{'nr_parents'}\n"; + } +} + +# Print details of the "Histogram by Number of Parents" data. +# +# We assume that the JSON looks like: +# +# { +# ... +# "commits": { +# "count": 197615, +# ... +# "count_by_nr_parents": { +# "P00": 13, +# "P01": 148603, +# "P02": 48950, +# "P03": 37, +# "P04": 7, +# "P05": 3, +# "P06": 1, +# "P10": 1 +# } +# }, +# ... +# } +# +# And we want to emit: +# +# commits.histparents[P00].count:13 +# commits.histparents[P01].count:148603 +# ... 
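+#
+# (Buckets are emitted in sorted key order below so that the output
+# is stable enough to compare with test_cmp.)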
+#
+sub CommitsHistParents {
+	my $nr_buckets = keys %{ $data->{'commits'}->{'count_by_nr_parents'} };
+	if ($nr_buckets == 0) {
+		return 0;
+	}
+	my %dist = %{ $data->{'commits'}->{'count_by_nr_parents'} };
+	foreach my $key ( sort keys %dist ) {
+		my $value = $dist{$key};
+		print "commits.histparents[$key].count:$value\n";
+	}
+}
+
+# We currently do not expose the "trees.dist_by_size" histogram for
+# testing. The data is valid, but sensitive to the length of a SHA
+# and the filenames in the tree. That makes it a little trickier to
+# test and probably not worth the bother (since we'll test the
+# histogram setup code with the "trees.dist_by_nr_entries" and the
+# histogram size code in the "blobs.dist_by_size" cases).
+#
+# {
+#   ...
+#   "trees": {
+#     "count": 331409,
+#     ...
+#     "dist_by_size": {
+#       "H1": {
+#         "count": 13349,
+#         "sum_size": 1953155,
+#         "sum_disk_size": 912044,
+#         "hbin_lower": 16,
+#         "hbin_upper": 255
+#       },
+#       "H2": {
+#         "count": 52677,
+#         "sum_size": 101507410,
+#         "sum_disk_size": 6549425,
+#         "hbin_lower": 256,
+#         "hbin_upper": 4095
+#       },
+#       ...
+#     },
+#     ...
+#   },
+#   ...
+# }
+
+# We also do not expose the "trees.largest_trees_by_size_bytes" array
+# for testing (for the same SHA and filename reasons). We'll assume
+# that the same code is used to build the array of largest blobs.
+#
+# {
+#   ...
+#   "trees": {
+#     "count": 331409,
+#     ...
+#     "largest_trees_by_size_bytes": [
+#       {
+#         "size": 58487,
+#         "oid": "140160ee18ed56aeaf5e028c60e01874faa9c12d",
+#         "name": "t",
+#         ...
+#       },
+#       {
+#         "size": 58487,
+#         "oid": "2d5af5733ab1061aae9a7babaabf9064783e3891",
+#         "name": "t",
+#         ...
+#       },
+#       ...
+#     ],
+#     ...
+#   },
+#   ...
+# }
+
+# Print details for "Histogram by Number of Entries" for trees. For
+# example, the bucket `Q00` contains the count of the trees that have
+# between 0 and 3 files/subdirectories.
+#
+# We assume that the JSON looks like:
+#
+# {
+#   ...
+#   "trees": {
+#     "count": 331409,
+#     "sum_size": 5376309652,
+#     ...
+#     "dist_by_nr_entries": {
+#       "Q00": {
+#         "count": 5798,
+#         "sum_size": 480428,
+#         "sum_disk_size": 390478,
+#         "qbin_lower": 0,
+#         "qbin_upper": 3
+#       },
+#       "Q01": {
+#         "count": 15217,
+#         "sum_size": 4587357,
+#         "sum_disk_size": 1177431,
+#         "qbin_lower": 4,
+#         "qbin_upper": 15
+#       },
+#       ...
+#       "Q05": {
+#         "count": 12965,
+#         "sum_size": 714372748,
+#         "sum_disk_size": 11298665,
+#         "qbin_lower": 1024,
+#         "qbin_upper": 4095
+#       },
+#       ...
+#     }
+#   },
+#   ...
+# }
+#
+# And we want to emit:
+#
+# trees.histentries.Q00.count:5798
+# trees.histentries.Q01.count:15217
+# ...
+# trees.histentries.Q05.count:12965
+# ...
+#
+sub TreesHistEntries {
+	my $nr_buckets = keys %{ $data->{'trees'}->{'dist_by_nr_entries'} };
+	if ($nr_buckets == 0) {
+		return 0;
+	}
+	my %dist = %{ $data->{'trees'}->{'dist_by_nr_entries'} };
+	foreach my $key ( sort keys %dist ) {
+		my $value = $dist{$key};
+		print "trees.histentries.$key.count:$value->{'count'}\n";
+	}
+}
+
+# Print details for "Largest Trees by Number of Entries". This is an
+# array sorted in descending order. For multiple trees with the same
+# number of entries, the relative order is undefined.
+#
+# We assume that the JSON looks like:
+#
+# {
+#   ...
+#   "trees": {
+#     "count": 331409,
+#     ...
+#     "largest_trees_by_nr_entries": [
+#       {
+#         "nr_entries": 1148,
+#         "oid": "140160ee18ed56aeaf5e028c60e01874faa9c12d",
+#         "name": "t",
+#         ...
+#       },
+#       {
+#         "nr_entries": 942,
+#         "oid": "2d5af5733ab1061aae9a7babaabf9064783e3891",
+#         "name": "t",
+#         ...
+#       },
+#       ...
+#     ],
+#     ...
+#   },
+#   ...
+# } +# +# And we want to emit: +# +# trees.mostentries[0].nr_entries:1148 +# trees.mostentries[1].nr_entries:942 +# ... +# +sub TreesMostEntries { + my $nr_items = scalar @{ $data->{'trees'}->{'largest_trees_by_nr_entries'} }; + if ($nr_items == 0) { + return 0; + } + my @arr = @{ $data->{'trees'}->{'largest_trees_by_nr_entries'} }; + my $k; + for ($k=0; $k < $nr_items; $k++) { + print "trees.mostentries[$k].nr_entries:$arr[$k]->{'nr_entries'}\n"; + } +} + +# Print details for the "Histogram by Size in Bytes" for blobs. +# +# We assume that the JSON looks like: +# +# { +# ... +# "blobs": { +# "count": 191847, +# ... +# "dist_by_size": { +# "H0": { +# "count": 47, +# "sum_size": 433, +# "sum_disk_size": 856, +# "hbin_lower": 0, +# "hbin_upper": 15 +# }, +# "H1": { +# "count": 2045, +# "sum_size": 224602, +# "sum_disk_size": 145658, +# "hbin_lower": 16, +# "hbin_upper": 255 +# }, +# ... +# } +# }, +# ... +# } +# +# And we want to emit: +# +# blobs.histsize.H0.count:47 +# blobs.histsize.H1.count:2045 +# ... +# +sub BlobsHistSize { + my $nr_buckets = keys %{ $data->{'blobs'}->{'dist_by_size'} }; + if ($nr_buckets == 0) { + return 0; + } + my %dist = %{ $data->{'blobs'}->{'dist_by_size'} }; + foreach my $key ( sort keys %dist ) { + my $value = $dist{$key}; + print "blobs.histsize.$key.count:$value->{'count'}\n"; + } +} + +# Print details for the "Largest Blobs by Size in Bytes" table. +# This is an array sorted in descending order. If there are multiple +# blobs with the same size, the relative order is undefined. +# +# We assume that the JSON looks like: +# +# { +# ... +# "blobs": { +# "count": 191847, +# ... +# "largest_blobs_by_size_bytes": [ +# { +# "size": 10577552, +# "oid": "667824451d9202e721b6d9413ce4c6b7ce58c36e", +# ... +# }, +# { +# "size": 6655520, +# "oid": "78bcd7f596df79b580e793957928be457a61c3f5", +# ... +# }, +# ... +# ], +# }, +# ... +# } +# +# And we want to emit: +# +# blobs.largest[0].size:10577552 +# blobs.largest[1].size:6655520 +# ... +# +sub BlobsLargest { + my $nr_items = scalar @{ $data->{'blobs'}->{'largest_blobs_by_size_bytes'} }; + if ($nr_items == 0) { + return 0; + } + my @arr = @{ $data->{'blobs'}->{'largest_blobs_by_size_bytes'} }; + my $k; + for ($k=0; $k < $nr_items; $k++) { + print "blobs.largest[$k].size:$arr[$k]->{'size'}\n"; + } +} + +foreach my $arg_k(@ARGV) { + if ($arg_k eq 'count') { + Count; + } + elsif ($arg_k eq 'commits.mostparents') { + CommitsMostParents; + } + elsif ($arg_k eq 'commits.histparents') { + CommitsHistParents; + } + elsif ($arg_k eq 'trees.histentries') { + TreesHistEntries; + } + elsif ($arg_k eq 'trees.mostentries') { + TreesMostEntries; + } + elsif ($arg_k eq 'blobs.histsize') { + BlobsHistSize; + } + elsif ($arg_k eq 'blobs.largest') { + BlobsLargest; + } + else { + print "ERROR: unknown command '$arg_k'\n"; + exit 1; + } +}
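+
+# Example usage (a sketch; "data.json" stands for whatever a prior
+# `git survey --json >data.json` produced):
+#
+#     perl survey_parse_json.perl count blobs.largest <data.json
+#
+# Multiple commands may be given; each prints its flattened
+# "key:value" lines in the order requested.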