From 03bbdc818abec24752bb7e4a3b00a06b69b18e0a Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Mon, 29 Apr 2024 08:55:03 -0400 Subject: [PATCH 01/10] survey: stub in new experimental 'git-survey' command Start work on a new 'git survey' command to scan the repository for monorepo performance and scaling problems. The goal is to measure the various known "dimensions of scale" and serve as a foundation for adding additional measurements as we learn more about Git monorepo scaling problems. The initial goal is to complement the scanning and analysis performed by the GO-based 'git-sizer' (https://github.com/github/git-sizer) tool. It is hoped that by creating a builtin command, we may be able to take advantage of internal Git data structures and code that is not accessible from GO to gain further insight into potential scaling problems. Co-authored-by: Derrick Stolee Signed-off-by: Jeff Hostetler Signed-off-by: Derrick Stolee --- .gitignore | 1 + Documentation/config.txt | 2 + Documentation/config/survey.txt | 11 +++++ Documentation/git-survey.txt | 36 ++++++++++++++++ Makefile | 1 + builtin.h | 1 + builtin/survey.c | 75 +++++++++++++++++++++++++++++++++ command-list.txt | 1 + git.c | 1 + t/t8100-git-survey.sh | 18 ++++++++ 10 files changed, 147 insertions(+) create mode 100644 Documentation/config/survey.txt create mode 100644 Documentation/git-survey.txt create mode 100644 builtin/survey.c create mode 100755 t/t8100-git-survey.sh diff --git a/.gitignore b/.gitignore index 0f9e7de2ec306d..adfede56a56aaf 100644 --- a/.gitignore +++ b/.gitignore @@ -166,6 +166,7 @@ /git-submodule /git-submodule--helper /git-subtree +/git-survey /git-svn /git-switch /git-symbolic-ref diff --git a/Documentation/config.txt b/Documentation/config.txt index 8c0b3ed8075214..4ee8b693022ef7 100644 --- a/Documentation/config.txt +++ b/Documentation/config.txt @@ -534,6 +534,8 @@ include::config/status.txt[] include::config/submodule.txt[] +include::config/survey.txt[] + 
include::config/tag.txt[] include::config/tar.txt[] diff --git a/Documentation/config/survey.txt b/Documentation/config/survey.txt new file mode 100644 index 00000000000000..c1b0f852a1250e --- /dev/null +++ b/Documentation/config/survey.txt @@ -0,0 +1,11 @@ +survey.*:: + These variables adjust the default behavior of the `git survey` + command. The intention is that this command could be run in the + background with these options. ++ +-- + verbose:: + This boolean value implies the `--[no-]verbose` option. + progress:: + This boolean value implies the `--[no-]progress` option. +-- diff --git a/Documentation/git-survey.txt b/Documentation/git-survey.txt new file mode 100644 index 00000000000000..5f8ec9bfea673b --- /dev/null +++ b/Documentation/git-survey.txt @@ -0,0 +1,36 @@ +git-survey(1) +============= + +NAME +---- +git-survey - EXPERIMENTAL: Measure various repository dimensions of scale + +SYNOPSIS +-------- +[verse] +(EXPERIMENTAL!) 'git survey' + +DESCRIPTION +----------- + +Survey the repository and measure various dimensions of scale. + +As repositories grow to "monorepo" size, certain data shapes can cause +performance problems. `git-survey` attempts to measure and report on +known problem areas. + +OPTIONS +------- + +--progress:: + Show progress. This is automatically enabled when interactive. + +OUTPUT +------ + +By default, `git survey` will print information about the repository in a +human-readable format that includes overviews and tables. 
+ +GIT +--- +Part of the linkgit:git[1] suite diff --git a/Makefile b/Makefile index 117d319003adc1..503a33b5724723 100644 --- a/Makefile +++ b/Makefile @@ -1313,6 +1313,7 @@ BUILTIN_OBJS += builtin/sparse-checkout.o BUILTIN_OBJS += builtin/stash.o BUILTIN_OBJS += builtin/stripspace.o BUILTIN_OBJS += builtin/submodule--helper.o +BUILTIN_OBJS += builtin/survey.o BUILTIN_OBJS += builtin/symbolic-ref.o BUILTIN_OBJS += builtin/tag.o BUILTIN_OBJS += builtin/unpack-file.o diff --git a/builtin.h b/builtin.h index 89928ccf92f532..5f64730cf0273d 100644 --- a/builtin.h +++ b/builtin.h @@ -231,6 +231,7 @@ int cmd_sparse_checkout(int argc, const char **argv, const char *prefix, struct int cmd_status(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_stash(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_stripspace(int argc, const char **argv, const char *prefix, struct repository *repo); +int cmd_survey(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_submodule__helper(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_switch(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_symbolic_ref(int argc, const char **argv, const char *prefix, struct repository *repo); diff --git a/builtin/survey.c b/builtin/survey.c new file mode 100644 index 00000000000000..4c7751f8708640 --- /dev/null +++ b/builtin/survey.c @@ -0,0 +1,75 @@ +#define USE_THE_REPOSITORY_VARIABLE + +#include "builtin.h" +#include "config.h" +#include "parse-options.h" + +static const char * const survey_usage[] = { + N_("(EXPERIMENTAL!) git survey "), + NULL, +}; + +struct survey_opts { + int verbose; + int show_progress; +}; + +struct survey_context { + struct repository *repo; + + /* Options that control what is done. 
*/ + struct survey_opts opts; +}; + +static int survey_load_config_cb(const char *var, const char *value, + const struct config_context *cctx, void *pvoid) +{ + struct survey_context *ctx = pvoid; + + if (!strcmp(var, "survey.verbose")) { + ctx->opts.verbose = git_config_bool(var, value); + return 0; + } + if (!strcmp(var, "survey.progress")) { + ctx->opts.show_progress = git_config_bool(var, value); + return 0; + } + + return git_default_config(var, value, cctx, pvoid); +} + +static void survey_load_config(struct survey_context *ctx) +{ + git_config(survey_load_config_cb, ctx); +} + +int cmd_survey(int argc, const char **argv, const char *prefix, struct repository *repo) +{ + static struct survey_context ctx = { + .opts = { + .verbose = 0, + .show_progress = -1, /* defaults to isatty(2) */ + }, + }; + + static struct option survey_options[] = { + OPT__VERBOSE(&ctx.opts.verbose, N_("verbose output")), + OPT_BOOL(0, "progress", &ctx.opts.show_progress, N_("show progress")), + OPT_END(), + }; + + if (argc == 2 && !strcmp(argv[1], "-h")) + usage_with_options(survey_usage, survey_options); + + ctx.repo = repo; + + prepare_repo_settings(ctx.repo); + survey_load_config(&ctx); + + argc = parse_options(argc, argv, prefix, survey_options, survey_usage, 0); + + if (ctx.opts.show_progress < 0) + ctx.opts.show_progress = isatty(2); + + return 0; +} diff --git a/command-list.txt b/command-list.txt index c537114b4687b8..ecc9d2281a0909 100644 --- a/command-list.txt +++ b/command-list.txt @@ -187,6 +187,7 @@ git-stash mainporcelain git-status mainporcelain info git-stripspace purehelpers git-submodule mainporcelain +git-survey mainporcelain git-svn foreignscminterface git-switch mainporcelain history git-symbolic-ref plumbingmanipulators diff --git a/git.c b/git.c index c5cfb43fcd2ef7..1825350267470d 100644 --- a/git.c +++ b/git.c @@ -630,6 +630,7 @@ static struct cmd_struct commands[] = { { "status", cmd_status, RUN_SETUP | NEED_WORK_TREE }, { "stripspace", cmd_stripspace }, { 
"submodule--helper", cmd_submodule__helper, RUN_SETUP }, + { "survey", cmd_survey, RUN_SETUP }, { "switch", cmd_switch, RUN_SETUP | NEED_WORK_TREE }, { "symbolic-ref", cmd_symbolic_ref, RUN_SETUP }, { "tag", cmd_tag, RUN_SETUP | DELAY_PAGER_CONFIG }, diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh new file mode 100755 index 00000000000000..2df7fa83629301 --- /dev/null +++ b/t/t8100-git-survey.sh @@ -0,0 +1,18 @@ +#!/bin/sh + +test_description='git survey' + +GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME=main +export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME + +TEST_PASSES_SANITIZE_LEAK=0 +export TEST_PASSES_SANITIZE_LEAK + +. ./test-lib.sh + +test_expect_success 'git survey -h shows experimental warning' ' + test_expect_code 129 git survey -h 2>usage && + grep "EXPERIMENTAL!" usage +' + +test_done From b09a67f36904cf73268b28cb053c7c7d98a2f31e Mon Sep 17 00:00:00 2001 From: Jeff Hostetler Date: Mon, 29 Apr 2024 09:51:34 -0400 Subject: [PATCH 02/10] survey: add command line opts to select references By default we will scan all references in "refs/heads/", "refs/tags/" and "refs/remotes/". Add command line opts let the use ask for all refs or a subset of them and to include a detached HEAD. Signed-off-by: Jeff Hostetler Signed-off-by: Derrick Stolee --- Documentation/git-survey.txt | 34 +++++ builtin/survey.c | 247 +++++++++++++++++++++++++++++++++++ t/t8100-git-survey.sh | 9 ++ 3 files changed, 290 insertions(+) diff --git a/Documentation/git-survey.txt b/Documentation/git-survey.txt index 5f8ec9bfea673b..56060d14b5cfef 100644 --- a/Documentation/git-survey.txt +++ b/Documentation/git-survey.txt @@ -19,12 +19,46 @@ As repositories grow to "monorepo" size, certain data shapes can cause performance problems. `git-survey` attempts to measure and report on known problem areas. 
+Ref Selection and Reachable Objects +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In this first analysis phase, `git survey` will iterate over the set of +requested branches, tags, and other refs and treewalk over all of the +reachable commits, trees, and blobs and generate various statistics. + OPTIONS ------- --progress:: Show progress. This is automatically enabled when interactive. +Ref Selection +~~~~~~~~~~~~~ + +The following options control the set of refs that `git survey` will examine. +By default, `git survey` will look at tags, local branches, and remote refs. +If any of the following options are given, the default set is cleared and +only refs for the given options are added. + +--all-refs:: + Use all refs. This includes local branches, tags, remote refs, + notes, and stashes. This option overrides all of the following. + +--branches:: + Add local branches (`refs/heads/`) to the set. + +--tags:: + Add tags (`refs/tags/`) to the set. + +--remotes:: + Add remote branches (`refs/remotes/`) to the set. + +--detached:: + Add HEAD to the set. + +--other:: + Add notes (`refs/notes/`) and stashes (`refs/stash/`) to the set. + OUTPUT ------ diff --git a/builtin/survey.c b/builtin/survey.c index 4c7751f8708640..6fb26e6b6eaaf0 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -2,16 +2,55 @@ #include "builtin.h" #include "config.h" +#include "object.h" +#include "object-store-ll.h" #include "parse-options.h" +#include "progress.h" +#include "ref-filter.h" +#include "strvec.h" +#include "trace2.h" static const char * const survey_usage[] = { N_("(EXPERIMENTAL!) 
git survey "), NULL, }; +struct survey_refs_wanted { + int want_all_refs; /* special override */ + + int want_branches; + int want_tags; + int want_remotes; + int want_detached; + int want_other; /* see FILTER_REFS_OTHERS -- refs/notes/, refs/stash/ */ +}; + +static struct survey_refs_wanted default_ref_options = { + .want_all_refs = 1, +}; + struct survey_opts { int verbose; int show_progress; + struct survey_refs_wanted refs; +}; + +struct survey_report_ref_summary { + size_t refs_nr; + size_t branches_nr; + size_t remote_refs_nr; + size_t tags_nr; + size_t tags_annotated_nr; + size_t others_nr; + size_t unknown_nr; +}; + +/** + * This struct contains all of the information that needs to be printed + * at the end of the exploration of the repository and its references. + */ +struct survey_report { + struct survey_report_ref_summary refs; }; struct survey_context { @@ -19,8 +58,84 @@ struct survey_context { /* Options that control what is done. */ struct survey_opts opts; + + /* Info for output only. */ + struct survey_report report; + + /* + * The rest of the members are about enabling the activity + * of the 'git survey' command, including ref listings, object + * pointers, and progress. + */ + + struct progress *progress; + size_t progress_nr; + size_t progress_total; + + struct strvec refs; }; +static void clear_survey_context(struct survey_context *ctx) +{ + strvec_clear(&ctx->refs); +} + +/* + * After parsing the command line arguments, figure out which refs we + * should scan. + * + * If ANY were given in positive sense, then we ONLY include them and + * do not use the builtin values. + */ +static void fixup_refs_wanted(struct survey_context *ctx) +{ + struct survey_refs_wanted *rw = &ctx->opts.refs; + + /* + * `--all-refs` overrides and enables everything. 
+ */ + if (rw->want_all_refs == 1) { + rw->want_branches = 1; + rw->want_tags = 1; + rw->want_remotes = 1; + rw->want_detached = 1; + rw->want_other = 1; + return; + } + + /* + * If none of the `--` were given, we assume all + * of the builtin unspecified values. + */ + if (rw->want_branches == -1 && + rw->want_tags == -1 && + rw->want_remotes == -1 && + rw->want_detached == -1 && + rw->want_other == -1) { + *rw = default_ref_options; + return; + } + + /* + * Since we only allow positive boolean values on the command + * line, we will only have true values where they specified + * a `--`. + * + * So anything that still has an unspecified value should be + * set to false. + */ + if (rw->want_branches == -1) + rw->want_branches = 0; + if (rw->want_tags == -1) + rw->want_tags = 0; + if (rw->want_remotes == -1) + rw->want_remotes = 0; + if (rw->want_detached == -1) + rw->want_detached = 0; + if (rw->want_other == -1) + rw->want_other = 0; +} + static int survey_load_config_cb(const char *var, const char *value, const struct config_context *cctx, void *pvoid) { @@ -43,18 +158,145 @@ static void survey_load_config(struct survey_context *ctx) git_config(survey_load_config_cb, ctx); } +static void do_load_refs(struct survey_context *ctx, + struct ref_array *ref_array) +{ + struct ref_filter filter = REF_FILTER_INIT; + struct ref_sorting *sorting; + struct string_list sorting_options = STRING_LIST_INIT_DUP; + + string_list_append(&sorting_options, "objectname"); + sorting = ref_sorting_options(&sorting_options); + + if (ctx->opts.refs.want_detached) + strvec_push(&ctx->refs, "HEAD"); + + if (ctx->opts.refs.want_all_refs) { + strvec_push(&ctx->refs, "refs/"); + } else { + if (ctx->opts.refs.want_branches) + strvec_push(&ctx->refs, "refs/heads/"); + if (ctx->opts.refs.want_tags) + strvec_push(&ctx->refs, "refs/tags/"); + if (ctx->opts.refs.want_remotes) + strvec_push(&ctx->refs, "refs/remotes/"); + if (ctx->opts.refs.want_other) { + strvec_push(&ctx->refs, "refs/notes/"); + 
strvec_push(&ctx->refs, "refs/stash/"); + } + } + + filter.name_patterns = ctx->refs.v; + filter.ignore_case = 0; + filter.match_as_path = 1; + + if (ctx->opts.show_progress) { + ctx->progress_total = 0; + ctx->progress = start_progress(_("Scanning refs..."), 0); + } + + filter_refs(ref_array, &filter, FILTER_REFS_KIND_MASK); + + if (ctx->opts.show_progress) { + ctx->progress_total = ref_array->nr; + display_progress(ctx->progress, ctx->progress_total); + } + + ref_array_sort(sorting, ref_array); + + stop_progress(&ctx->progress); + ref_filter_clear(&filter); + ref_sorting_release(sorting); +} + +/* + * The REFS phase: + * + * Load the set of requested refs and assess them for scalability problems. + * Use that set to start a treewalk to all reachable objects and assess + * them. + * + * This data will give us insights into the repository itself (the number + * of refs, the size and shape of the DAG, the number and size of the + * objects). + * + * Theoretically, this data is independent of the on-disk representation + * (e.g. independent of packing concerns). 
+ */ +static void survey_phase_refs(struct survey_context *ctx) +{ + struct ref_array ref_array = { 0 }; + + trace2_region_enter("survey", "phase/refs", ctx->repo); + do_load_refs(ctx, &ref_array); + + ctx->report.refs.refs_nr = ref_array.nr; + for (size_t i = 0; i < ref_array.nr; i++) { + unsigned long size; + struct ref_array_item *item = ref_array.items[i]; + + switch (item->kind) { + case FILTER_REFS_TAGS: + ctx->report.refs.tags_nr++; + if (oid_object_info(ctx->repo, + &item->objectname, + &size) == OBJ_TAG) + ctx->report.refs.tags_annotated_nr++; + break; + + case FILTER_REFS_BRANCHES: + ctx->report.refs.branches_nr++; + break; + + case FILTER_REFS_REMOTES: + ctx->report.refs.remote_refs_nr++; + break; + + case FILTER_REFS_OTHERS: + ctx->report.refs.others_nr++; + break; + + default: + ctx->report.refs.unknown_nr++; + break; + } + } + + trace2_region_leave("survey", "phase/refs", ctx->repo); + + ref_array_clear(&ref_array); +} + int cmd_survey(int argc, const char **argv, const char *prefix, struct repository *repo) { static struct survey_context ctx = { .opts = { .verbose = 0, .show_progress = -1, /* defaults to isatty(2) */ + + .refs.want_all_refs = -1, + + .refs.want_branches = -1, /* default these to undefined */ + .refs.want_tags = -1, + .refs.want_remotes = -1, + .refs.want_detached = -1, + .refs.want_other = -1, }, + .refs = STRVEC_INIT, }; static struct option survey_options[] = { OPT__VERBOSE(&ctx.opts.verbose, N_("verbose output")), OPT_BOOL(0, "progress", &ctx.opts.show_progress, N_("show progress")), + + OPT_BOOL_F(0, "all-refs", &ctx.opts.refs.want_all_refs, N_("include all refs"), PARSE_OPT_NONEG), + + OPT_BOOL_F(0, "branches", &ctx.opts.refs.want_branches, N_("include branches"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "tags", &ctx.opts.refs.want_tags, N_("include tags"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "remotes", &ctx.opts.refs.want_remotes, N_("include all remotes refs"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "detached", &ctx.opts.refs.want_detached, 
N_("include detached HEAD"), PARSE_OPT_NONEG), + OPT_BOOL_F(0, "other", &ctx.opts.refs.want_other, N_("include notes and stashes"), PARSE_OPT_NONEG), + OPT_END(), }; @@ -71,5 +313,10 @@ int cmd_survey(int argc, const char **argv, const char *prefix, struct repositor if (ctx.opts.show_progress < 0) ctx.opts.show_progress = isatty(2); + fixup_refs_wanted(&ctx); + + survey_phase_refs(&ctx); + + clear_survey_context(&ctx); return 0; } diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh index 2df7fa83629301..6656cf20bf7a17 100755 --- a/t/t8100-git-survey.sh +++ b/t/t8100-git-survey.sh @@ -15,4 +15,13 @@ test_expect_success 'git survey -h shows experimental warning' ' grep "EXPERIMENTAL!" usage ' +test_expect_success 'create a semi-interesting repo' ' + test_commit_bulk 10 +' + +test_expect_success 'git survey (default)' ' + git survey >out 2>err && + test_line_count = 0 err +' + test_done From 0089ec675de757cc726fb7ca0d83e942b6f94aa3 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Sun, 1 Sep 2024 15:58:32 -0400 Subject: [PATCH 03/10] survey: start pretty printing data in table form When 'git survey' provides information to the user, this will be presented in one of two formats: plaintext and JSON. The JSON implementation will be delayed until the functionality is complete for the plaintext format. The most important parts of the plaintext format are headers specifying the different sections of the report and tables providing concrete data. Create a custom table data structure that allows specifying a list of strings for the row values. When printing the table, check each column for the maximum width so we can create a table of the correct size from the start. The table structure is designed to be flexible to the different kinds of output that will be implemented in future changes. 
Signed-off-by: Derrick Stolee --- Documentation/git-survey.txt | 7 ++ builtin/survey.c | 157 +++++++++++++++++++++++++++++++++++ t/t8100-git-survey.sh | 18 +++- 3 files changed, 181 insertions(+), 1 deletion(-) diff --git a/Documentation/git-survey.txt b/Documentation/git-survey.txt index 56060d14b5cfef..120ecb9a4d49f2 100644 --- a/Documentation/git-survey.txt +++ b/Documentation/git-survey.txt @@ -65,6 +65,13 @@ OUTPUT By default, `git survey` will print information about the repository in a human-readable format that includes overviews and tables. +References Summary +~~~~~~~~~~~~~~~~~~ + +The references summary includes a count of each kind of reference, +including branches, remote refs, and tags (split by "all" and +"annotated"). + GIT --- Part of the linkgit:git[1] suite diff --git a/builtin/survey.c b/builtin/survey.c index 6fb26e6b6eaaf0..8345eb6412649c 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -7,6 +7,7 @@ #include "parse-options.h" #include "progress.h" #include "ref-filter.h" +#include "strbuf.h" #include "strvec.h" #include "trace2.h" @@ -80,6 +81,160 @@ static void clear_survey_context(struct survey_context *ctx) strvec_clear(&ctx->refs); } +struct survey_table { + const char *table_name; + struct strvec header; + struct strvec *rows; + size_t rows_nr; + size_t rows_alloc; +}; + +#define SURVEY_TABLE_INIT { \ + .header = STRVEC_INIT, \ +} + +static void clear_table(struct survey_table *table) +{ + strvec_clear(&table->header); + for (size_t i = 0; i < table->rows_nr; i++) + strvec_clear(&table->rows[i]); + free(table->rows); +} + +static void insert_table_rowv(struct survey_table *table, ...) 
+{ + va_list ap; + char *arg; + ALLOC_GROW(table->rows, table->rows_nr + 1, table->rows_alloc); + + memset(&table->rows[table->rows_nr], 0, sizeof(struct strvec)); + + va_start(ap, table); + while ((arg = va_arg(ap, char *))) + strvec_push(&table->rows[table->rows_nr], arg); + va_end(ap); + + table->rows_nr++; +} + +#define SECTION_SEGMENT "========================================" +#define SECTION_SEGMENT_LEN 40 +static const char *section_line = SECTION_SEGMENT + SECTION_SEGMENT + SECTION_SEGMENT + SECTION_SEGMENT; +static const size_t section_len = 4 * SECTION_SEGMENT_LEN; + +static void print_table_title(const char *name, size_t *widths, size_t nr) +{ + size_t width = 3 * (nr - 1); + + for (size_t i = 0; i < nr; i++) + width += widths[i]; + + if (width > section_len) + width = section_len; + + printf("\n%s\n%.*s\n", name, (int)width, section_line); +} + +static void print_row_plaintext(struct strvec *row, size_t *widths) +{ + static struct strbuf line = STRBUF_INIT; + strbuf_setlen(&line, 0); + + for (size_t i = 0; i < row->nr; i++) { + const char *str = row->v[i]; + size_t len = strlen(str); + if (i) + strbuf_add(&line, " | ", 3); + strbuf_addchars(&line, ' ', widths[i] - len); + strbuf_add(&line, str, len); + } + printf("%s\n", line.buf); +} + +static void print_divider_plaintext(size_t *widths, size_t nr) +{ + static struct strbuf line = STRBUF_INIT; + strbuf_setlen(&line, 0); + + for (size_t i = 0; i < nr; i++) { + if (i) + strbuf_add(&line, "-+-", 3); + strbuf_addchars(&line, '-', widths[i]); + } + printf("%s\n", line.buf); +} + +static void print_table_plaintext(struct survey_table *table) +{ + size_t *column_widths; + size_t columns_nr = table->header.nr; + CALLOC_ARRAY(column_widths, columns_nr); + + for (size_t i = 0; i < columns_nr; i++) { + column_widths[i] = strlen(table->header.v[i]); + + for (size_t j = 0; j < table->rows_nr; j++) { + size_t rowlen = strlen(table->rows[j].v[i]); + if (column_widths[i] < rowlen) + column_widths[i] = rowlen; + } + } 
+ + print_table_title(table->table_name, column_widths, columns_nr); + print_row_plaintext(&table->header, column_widths); + print_divider_plaintext(column_widths, columns_nr); + + for (size_t j = 0; j < table->rows_nr; j++) + print_row_plaintext(&table->rows[j], column_widths); + + free(column_widths); +} + +static void survey_report_plaintext_refs(struct survey_context *ctx) +{ + struct survey_report_ref_summary *refs = &ctx->report.refs; + struct survey_table table = SURVEY_TABLE_INIT; + + table.table_name = _("REFERENCES SUMMARY"); + + strvec_push(&table.header, _("Ref Type")); + strvec_push(&table.header, _("Count")); + + if (ctx->opts.refs.want_all_refs || ctx->opts.refs.want_branches) { + char *fmt = xstrfmt("%"PRIuMAX"", (uintmax_t)refs->branches_nr); + insert_table_rowv(&table, _("Branches"), fmt, NULL); + free(fmt); + } + + if (ctx->opts.refs.want_all_refs || ctx->opts.refs.want_remotes) { + char *fmt = xstrfmt("%"PRIuMAX"", (uintmax_t)refs->remote_refs_nr); + insert_table_rowv(&table, _("Remote refs"), fmt, NULL); + free(fmt); + } + + if (ctx->opts.refs.want_all_refs || ctx->opts.refs.want_tags) { + char *fmt = xstrfmt("%"PRIuMAX"", (uintmax_t)refs->tags_nr); + insert_table_rowv(&table, _("Tags (all)"), fmt, NULL); + free(fmt); + fmt = xstrfmt("%"PRIuMAX"", (uintmax_t)refs->tags_annotated_nr); + insert_table_rowv(&table, _("Tags (annotated)"), fmt, NULL); + free(fmt); + } + + print_table_plaintext(&table); + clear_table(&table); +} + +static void survey_report_plaintext(struct survey_context *ctx) +{ + printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree); + printf("-----------------------------------------------------\n"); + survey_report_plaintext_refs(ctx); +} + /* * After parsing the command line arguments, figure out which refs we * should scan. 
@@ -317,6 +472,8 @@ int cmd_survey(int argc, const char **argv, const char *prefix, struct repositor survey_phase_refs(&ctx); + survey_report_plaintext(&ctx); + clear_survey_context(&ctx); return 0; } diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh index 6656cf20bf7a17..b76064b2a867ac 100755 --- a/t/t8100-git-survey.sh +++ b/t/t8100-git-survey.sh @@ -21,7 +21,23 @@ test_expect_success 'create a semi-interesting repo' ' test_expect_success 'git survey (default)' ' git survey >out 2>err && - test_line_count = 0 err + test_line_count = 0 err && + + tr , " " >expect <<-EOF && + GIT SURVEY for "$(pwd)" + ----------------------------------------------------- + + REFERENCES SUMMARY + ======================== + , Ref Type | Count + -----------------+------ + , Branches | 1 + Remote refs | 0 + Tags (all) | 0 + Tags (annotated) | 0 + EOF + + test_cmp expect out ' test_done From b718340a762c21495a88774293499b7e0567c938 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Sun, 1 Sep 2024 20:33:47 -0400 Subject: [PATCH 04/10] survey: add object count summary At the moment, nothing is obvious about the reason for the use of the path-walk API, but this will become more prevalent in future iterations. For now, use the path-walk API to sum up the counts of each kind of object. 
For example, this is the reachable object summary output for my local repo: REACHABLE OBJECT SUMMARY ======================== Object Type | Count ------------+------- Tags | 1343 Commits | 179344 Trees | 314350 Blobs | 184030 Signed-off-by: Derrick Stolee --- Documentation/git-survey.txt | 6 ++ builtin/survey.c | 131 +++++++++++++++++++++++++++++++++-- t/t8100-git-survey.sh | 23 ++++-- 3 files changed, 149 insertions(+), 11 deletions(-) diff --git a/Documentation/git-survey.txt b/Documentation/git-survey.txt index 120ecb9a4d49f2..44f3a0568b7697 100644 --- a/Documentation/git-survey.txt +++ b/Documentation/git-survey.txt @@ -72,6 +72,12 @@ The references summary includes a count of each kind of reference, including branches, remote refs, and tags (split by "all" and "annotated"). +Reachable Object Summary +~~~~~~~~~~~~~~~~~~~~~~~~ + +The reachable object summary shows the total number of each kind of Git +object, including tags, commits, trees, and blobs. + GIT --- Part of the linkgit:git[1] suite diff --git a/builtin/survey.c b/builtin/survey.c index 8345eb6412649c..6643b08892ea8e 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -2,13 +2,20 @@ #include "builtin.h" #include "config.h" +#include "environment.h" +#include "hex.h" #include "object.h" +#include "object-name.h" #include "object-store-ll.h" #include "parse-options.h" +#include "path-walk.h" #include "progress.h" #include "ref-filter.h" +#include "refs.h" +#include "revision.h" #include "strbuf.h" #include "strvec.h" +#include "tag.h" #include "trace2.h" static const char * const survey_usage[] = { @@ -46,12 +53,20 @@ struct survey_report_ref_summary { size_t unknown_nr; }; +struct survey_report_object_summary { + size_t commits_nr; + size_t tags_nr; + size_t trees_nr; + size_t blobs_nr; +}; + /** * This struct contains all of the information that needs to be printed * at the end of the exploration of the repository and its references. 
*/ struct survey_report { struct survey_report_ref_summary refs; + struct survey_report_object_summary reachable_objects; }; struct survey_context { @@ -74,10 +89,12 @@ struct survey_context { size_t progress_total; struct strvec refs; + struct ref_array ref_array; }; static void clear_survey_context(struct survey_context *ctx) { + ref_array_clear(&ctx->ref_array); strvec_clear(&ctx->refs); } @@ -128,10 +145,14 @@ static const size_t section_len = 4 * SECTION_SEGMENT_LEN; static void print_table_title(const char *name, size_t *widths, size_t nr) { size_t width = 3 * (nr - 1); + size_t min_width = strlen(name); for (size_t i = 0; i < nr; i++) width += widths[i]; + if (width < min_width) + width = min_width; + if (width > section_len) width = section_len; @@ -228,11 +249,43 @@ static void survey_report_plaintext_refs(struct survey_context *ctx) clear_table(&table); } +static void survey_report_plaintext_reachable_object_summary(struct survey_context *ctx) +{ + struct survey_report_object_summary *objs = &ctx->report.reachable_objects; + struct survey_table table = SURVEY_TABLE_INIT; + char *fmt; + + table.table_name = _("REACHABLE OBJECT SUMMARY"); + + strvec_push(&table.header, _("Object Type")); + strvec_push(&table.header, _("Count")); + + fmt = xstrfmt("%"PRIuMAX"", (uintmax_t)objs->tags_nr); + insert_table_rowv(&table, _("Tags"), fmt, NULL); + free(fmt); + + fmt = xstrfmt("%"PRIuMAX"", (uintmax_t)objs->commits_nr); + insert_table_rowv(&table, _("Commits"), fmt, NULL); + free(fmt); + + fmt = xstrfmt("%"PRIuMAX"", (uintmax_t)objs->trees_nr); + insert_table_rowv(&table, _("Trees"), fmt, NULL); + free(fmt); + + fmt = xstrfmt("%"PRIuMAX"", (uintmax_t)objs->blobs_nr); + insert_table_rowv(&table, _("Blobs"), fmt, NULL); + free(fmt); + + print_table_plaintext(&table); + clear_table(&table); +} + static void survey_report_plaintext(struct survey_context *ctx) { printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree); 
printf("-----------------------------------------------------\n"); survey_report_plaintext_refs(ctx); + survey_report_plaintext_reachable_object_summary(ctx); } /* @@ -380,15 +433,13 @@ static void do_load_refs(struct survey_context *ctx, */ static void survey_phase_refs(struct survey_context *ctx) { - struct ref_array ref_array = { 0 }; - trace2_region_enter("survey", "phase/refs", ctx->repo); - do_load_refs(ctx, &ref_array); + do_load_refs(ctx, &ctx->ref_array); - ctx->report.refs.refs_nr = ref_array.nr; - for (size_t i = 0; i < ref_array.nr; i++) { + ctx->report.refs.refs_nr = ctx->ref_array.nr; + for (size_t i = 0; i < ctx->ref_array.nr; i++) { unsigned long size; - struct ref_array_item *item = ref_array.items[i]; + struct ref_array_item *item = ctx->ref_array.items[i]; switch (item->kind) { case FILTER_REFS_TAGS: @@ -418,8 +469,72 @@ static void survey_phase_refs(struct survey_context *ctx) } trace2_region_leave("survey", "phase/refs", ctx->repo); +} + +static void increment_object_counts( + struct survey_report_object_summary *summary, + enum object_type type, + size_t nr) +{ + switch (type) { + case OBJ_COMMIT: + summary->commits_nr += nr; + break; - ref_array_clear(&ref_array); + case OBJ_TREE: + summary->trees_nr += nr; + break; + + case OBJ_BLOB: + summary->blobs_nr += nr; + break; + + case OBJ_TAG: + summary->tags_nr += nr; + break; + + default: + break; + } +} + +static int survey_objects_path_walk_fn(const char *path, + struct oid_array *oids, + enum object_type type, + void *data) +{ + struct survey_context *ctx = data; + + increment_object_counts(&ctx->report.reachable_objects, + type, oids->nr); + + return 0; +} + +static void survey_phase_objects(struct survey_context *ctx) +{ + struct rev_info revs = REV_INFO_INIT; + struct path_walk_info info = PATH_WALK_INFO_INIT; + unsigned int add_flags = 0; + + trace2_region_enter("survey", "phase/objects", ctx->repo); + + info.revs = &revs; + info.path_fn = survey_objects_path_walk_fn; + info.path_fn_data = 
ctx; + + repo_init_revisions(ctx->repo, &revs, ""); + revs.tag_objects = 1; + + for (size_t i = 0; i < ctx->ref_array.nr; i++) { + struct ref_array_item *item = ctx->ref_array.items[i]; + add_pending_oid(&revs, NULL, &item->objectname, add_flags); + } + + walk_objects_by_path(&info); + + release_revisions(&revs); + trace2_region_leave("survey", "phase/objects", ctx->repo); } int cmd_survey(int argc, const char **argv, const char *prefix, struct repository *repo) @@ -472,6 +587,8 @@ int cmd_survey(int argc, const char **argv, const char *prefix, struct repositor survey_phase_refs(&ctx); + survey_phase_objects(&ctx); + survey_report_plaintext(&ctx); clear_survey_context(&ctx); diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh index b76064b2a867ac..7a37da1bb2dadc 100755 --- a/t/t8100-git-survey.sh +++ b/t/t8100-git-survey.sh @@ -16,11 +16,17 @@ test_expect_success 'git survey -h shows experimental warning' ' ' test_expect_success 'create a semi-interesting repo' ' - test_commit_bulk 10 + test_commit_bulk 10 && + git tag -a -m one one HEAD~5 && + git tag -a -m two two HEAD~3 && + git tag -a -m three three two && + git tag -a -m four four three && + git update-ref -d refs/tags/three && + git update-ref -d refs/tags/two ' test_expect_success 'git survey (default)' ' - git survey >out 2>err && + git survey --all-refs >out 2>err && test_line_count = 0 err && tr , " " >expect <<-EOF && @@ -33,8 +39,17 @@ test_expect_success 'git survey (default)' ' -----------------+------ , Branches | 1 Remote refs | 0 - Tags (all) | 0 - Tags (annotated) | 0 + Tags (all) | 2 + Tags (annotated) | 2 + + REACHABLE OBJECT SUMMARY + ======================== + Object Type | Count + ------------+------ + Tags | 4 + Commits | 10 + Trees | 10 + Blobs | 10 EOF test_cmp expect out From 60cfc961b435eb366ea8a17a864622a3e494238c Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Sun, 1 Sep 2024 20:58:35 -0400 Subject: [PATCH 05/10] survey: summarize total sizes by object type Now that we have 
explored objects by count, we can expand that a bit more to summarize the data for the on-disk and inflated size of those objects. This information is helpful for diagnosing both why disk space (and perhaps clone or fetch times) is growing but also why certain operations are slow because the inflated size of the abstract objects that must be processed is so large. Signed-off-by: Derrick Stolee --- builtin/survey.c | 132 ++++++++++++++++++++++++++++++++++++++++++ t/t8100-git-survey.sh | 29 ++++++++++ 2 files changed, 161 insertions(+) diff --git a/builtin/survey.c b/builtin/survey.c index 6643b08892ea8e..c527643be53334 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -60,6 +60,19 @@ struct survey_report_object_summary { size_t blobs_nr; }; +/** + * For some category given by 'label', count the number of objects + * that match that label along with the on-disk size and the size + * after decompressing (both with delta bases and zlib). + */ +struct survey_report_object_size_summary { + char *label; + size_t nr; + size_t disk_size; + size_t inflated_size; + size_t num_missing; +}; + /** * This struct contains all of the information that needs to be printed * at the end of the exploration of the repository and its references. 
@@ -67,8 +80,16 @@ struct survey_report_object_summary { struct survey_report { struct survey_report_ref_summary refs; struct survey_report_object_summary reachable_objects; + + struct survey_report_object_size_summary *by_type; }; +#define REPORT_TYPE_COMMIT 0 +#define REPORT_TYPE_TREE 1 +#define REPORT_TYPE_BLOB 2 +#define REPORT_TYPE_TAG 3 +#define REPORT_TYPE_COUNT 4 + struct survey_context { struct repository *repo; @@ -280,12 +301,48 @@ static void survey_report_plaintext_reachable_object_summary(struct survey_conte clear_table(&table); } +static void survey_report_object_sizes(const char *title, + const char *categories, + struct survey_report_object_size_summary *summary, + size_t summary_nr) +{ + struct survey_table table = SURVEY_TABLE_INIT; + table.table_name = title; + + strvec_push(&table.header, categories); + strvec_push(&table.header, _("Count")); + strvec_push(&table.header, _("Disk Size")); + strvec_push(&table.header, _("Inflated Size")); + + for (size_t i = 0; i < summary_nr; i++) { + char *label_str = xstrdup(summary[i].label); + char *nr_str = xstrfmt("%"PRIuMAX, (uintmax_t)summary[i].nr); + char *disk_str = xstrfmt("%"PRIuMAX, (uintmax_t)summary[i].disk_size); + char *inflate_str = xstrfmt("%"PRIuMAX, (uintmax_t)summary[i].inflated_size); + + insert_table_rowv(&table, label_str, nr_str, + disk_str, inflate_str, NULL); + + free(label_str); + free(nr_str); + free(disk_str); + free(inflate_str); + } + + print_table_plaintext(&table); + clear_table(&table); +} + static void survey_report_plaintext(struct survey_context *ctx) { printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree); printf("-----------------------------------------------------\n"); survey_report_plaintext_refs(ctx); survey_report_plaintext_reachable_object_summary(ctx); + survey_report_object_sizes(_("TOTAL OBJECT SIZES BY TYPE"), + _("Object Type"), + ctx->report.by_type, + REPORT_TYPE_COUNT); } /* @@ -498,6 +555,68 @@ static void increment_object_counts( } } +static void 
increment_totals(struct survey_context *ctx, + struct oid_array *oids, + struct survey_report_object_size_summary *summary) +{ + for (size_t i = 0; i < oids->nr; i++) { + struct object_info oi = OBJECT_INFO_INIT; + unsigned oi_flags = OBJECT_INFO_FOR_PREFETCH; + unsigned long object_length = 0; + off_t disk_sizep = 0; + enum object_type type; + + oi.typep = &type; + oi.sizep = &object_length; + oi.disk_sizep = &disk_sizep; + + if (oid_object_info_extended(ctx->repo, &oids->oid[i], + &oi, oi_flags) < 0) { + summary->num_missing++; + } else { + summary->nr++; + summary->disk_size += disk_sizep; + summary->inflated_size += object_length; + } + } +} + +static void increment_object_totals(struct survey_context *ctx, + struct oid_array *oids, + enum object_type type) +{ + struct survey_report_object_size_summary *total; + struct survey_report_object_size_summary summary = { 0 }; + + increment_totals(ctx, oids, &summary); + + switch (type) { + case OBJ_COMMIT: + total = &ctx->report.by_type[REPORT_TYPE_COMMIT]; + break; + + case OBJ_TREE: + total = &ctx->report.by_type[REPORT_TYPE_TREE]; + break; + + case OBJ_BLOB: + total = &ctx->report.by_type[REPORT_TYPE_BLOB]; + break; + + case OBJ_TAG: + total = &ctx->report.by_type[REPORT_TYPE_TAG]; + break; + + default: + BUG("No other type allowed"); + } + + total->nr += summary.nr; + total->disk_size += summary.disk_size; + total->inflated_size += summary.inflated_size; + total->num_missing += summary.num_missing; +} + static int survey_objects_path_walk_fn(const char *path, struct oid_array *oids, enum object_type type, @@ -507,10 +626,20 @@ static int survey_objects_path_walk_fn(const char *path, increment_object_counts(&ctx->report.reachable_objects, type, oids->nr); + increment_object_totals(ctx, oids, type); return 0; } +static void initialize_report(struct survey_context *ctx) +{ + CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT); + ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits")); + 
ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees")); + ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs")); + ctx->report.by_type[REPORT_TYPE_TAG].label = xstrdup(_("Tags")); +} + static void survey_phase_objects(struct survey_context *ctx) { struct rev_info revs = REV_INFO_INIT; @@ -523,12 +652,15 @@ static void survey_phase_objects(struct survey_context *ctx) info.path_fn = survey_objects_path_walk_fn; info.path_fn_data = ctx; + initialize_report(ctx); + repo_init_revisions(ctx->repo, &revs, ""); revs.tag_objects = 1; for (size_t i = 0; i < ctx->ref_array.nr; i++) { struct ref_array_item *item = ctx->ref_array.items[i]; add_pending_oid(&revs, NULL, &item->objectname, add_flags); + display_progress(ctx->progress, ++(ctx->progress_nr)); } walk_objects_by_path(&info); diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh index 7a37da1bb2dadc..e738d6421a3224 100755 --- a/t/t8100-git-survey.sh +++ b/t/t8100-git-survey.sh @@ -29,6 +29,26 @@ test_expect_success 'git survey (default)' ' git survey --all-refs >out 2>err && test_line_count = 0 err && + test_oid_cache <<-EOF && + commits_size_on_disk sha1: 1523 + commits_size_on_disk sha256: 1811 + + commits_size sha1: 2153 + commits_size sha256: 2609 + + trees_size_on_disk sha1: 495 + trees_size_on_disk sha256: 635 + + trees_size sha1: 1706 + trees_size sha256: 2366 + + tags_size sha1: 528 + tags_size sha256: 624 + + tags_size_on_disk sha1: 510 + tags_size_on_disk sha256: 569 + EOF + tr , " " >expect <<-EOF && GIT SURVEY for "$(pwd)" ----------------------------------------------------- @@ -50,6 +70,15 @@ test_expect_success 'git survey (default)' ' Commits | 10 Trees | 10 Blobs | 10 + + TOTAL OBJECT SIZES BY TYPE + =============================================== + Object Type | Count | Disk Size | Inflated Size + ------------+-------+-----------+-------------- + Commits | 10 | $(test_oid commits_size_on_disk) | $(test_oid commits_size) + Trees | 10 | $(test_oid trees_size_on_disk) | 
$(test_oid trees_size) + Blobs | 10 | 191 | 101 + Tags | 4 | $(test_oid tags_size_on_disk) | $(test_oid tags_size) EOF test_cmp expect out From 5132089c639bbf7e3c50c696f5ff4599659ed64b Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Sun, 1 Sep 2024 21:21:54 -0400 Subject: [PATCH 06/10] survey: show progress during object walk Signed-off-by: Derrick Stolee --- builtin/survey.c | 14 ++++++++++++++ t/t8100-git-survey.sh | 5 +++++ 2 files changed, 19 insertions(+) diff --git a/builtin/survey.c b/builtin/survey.c index c527643be53334..f0c6812092f9c6 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -628,6 +628,9 @@ static int survey_objects_path_walk_fn(const char *path, type, oids->nr); increment_object_totals(ctx, oids, type); + ctx->progress_nr += oids->nr; + display_progress(ctx->progress, ctx->progress_nr); + return 0; } @@ -657,13 +660,24 @@ static void survey_phase_objects(struct survey_context *ctx) repo_init_revisions(ctx->repo, &revs, ""); revs.tag_objects = 1; + ctx->progress_nr = 0; + ctx->progress_total = ctx->ref_array.nr; + if (ctx->opts.show_progress) + ctx->progress = start_progress(_("Preparing object walk"), + ctx->progress_total); for (size_t i = 0; i < ctx->ref_array.nr; i++) { struct ref_array_item *item = ctx->ref_array.items[i]; add_pending_oid(&revs, NULL, &item->objectname, add_flags); display_progress(ctx->progress, ++(ctx->progress_nr)); } + stop_progress(&ctx->progress); + ctx->progress_nr = 0; + ctx->progress_total = 0; + if (ctx->opts.show_progress) + ctx->progress = start_progress(_("Walking objects"), 0); walk_objects_by_path(&info); + stop_progress(&ctx->progress); release_revisions(&revs); trace2_region_leave("survey", "phase/objects", ctx->repo); diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh index e738d6421a3224..6c2867c11c323c 100755 --- a/t/t8100-git-survey.sh +++ b/t/t8100-git-survey.sh @@ -25,6 +25,11 @@ test_expect_success 'create a semi-interesting repo' ' git update-ref -d refs/tags/two ' 
+test_expect_success 'git survey --progress' ' + GIT_PROGRESS_DELAY=0 git survey --all-refs --progress >out 2>err && + grep "Preparing object walk" err +' + test_expect_success 'git survey (default)' ' git survey --all-refs >out 2>err && test_line_count = 0 err && From 6e8f4b53e4071085b979580edeb189613b0ea3e3 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Sun, 1 Sep 2024 22:35:06 -0400 Subject: [PATCH 07/10] survey: add ability to track prioritized lists In future changes, we will make use of these methods. The intention is to keep track of the top contributors according to some metric. We don't want to store all of the entries and do a sort at the end, so track a constant-size table and remove rows that get pushed out depending on the chosen sorting algorithm. Co-authored-by: Jeff Hostetler Signed-off-by: Jeff Hostetler Signed-off-by: Derrick Stolee --- builtin/survey.c | 113 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/builtin/survey.c b/builtin/survey.c index f0c6812092f9c6..3c23fdb9de1890 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -73,6 +73,119 @@ struct survey_report_object_size_summary { size_t num_missing; }; +typedef int (*survey_top_cmp)(void *v1, void *v2); + +MAYBE_UNUSED +static int cmp_by_nr(void *v1, void *v2) +{ + struct survey_report_object_size_summary *s1 = v1; + struct survey_report_object_size_summary *s2 = v2; + + if (s1->nr < s2->nr) + return -1; + if (s1->nr > s2->nr) + return 1; + return 0; +} + +MAYBE_UNUSED +static int cmp_by_disk_size(void *v1, void *v2) +{ + struct survey_report_object_size_summary *s1 = v1; + struct survey_report_object_size_summary *s2 = v2; + + if (s1->disk_size < s2->disk_size) + return -1; + if (s1->disk_size > s2->disk_size) + return 1; + return 0; +} + +MAYBE_UNUSED +static int cmp_by_inflated_size(void *v1, void *v2) +{ + struct survey_report_object_size_summary *s1 = v1; + struct survey_report_object_size_summary *s2 = v2; + + if
(s1->inflated_size < s2->inflated_size) + return -1; + if (s1->inflated_size > s2->inflated_size) + return 1; + return 0; +} + +/** + * Store a list of "top" categories by some sorting function. When + * inserting a new category, reorder the list and free the one that + * got ejected (if any). + */ +struct survey_report_top_table { + const char *name; + survey_top_cmp cmp_fn; + size_t nr; + size_t alloc; + + /** + * 'data' stores an array of structs and must be cast into + * the proper array type before evaluating an index. + */ + void *data; +}; + +MAYBE_UNUSED +static void init_top_sizes(struct survey_report_top_table *top, + size_t limit, const char *name, + survey_top_cmp cmp) +{ + struct survey_report_object_size_summary *sz_array; + + top->name = name; + top->cmp_fn = cmp; + top->alloc = limit; + top->nr = 0; + + CALLOC_ARRAY(sz_array, limit); + top->data = sz_array; +} + +MAYBE_UNUSED +static void clear_top_sizes(struct survey_report_top_table *top) +{ + struct survey_report_object_size_summary *sz_array = top->data; + + for (size_t i = 0; i < top->nr; i++) + free(sz_array[i].label); + free(sz_array); +} + +MAYBE_UNUSED +static void maybe_insert_into_top_size(struct survey_report_top_table *top, + struct survey_report_object_size_summary *summary) +{ + struct survey_report_object_size_summary *sz_array = top->data; + size_t pos = top->nr; + + /* Compare against list from the bottom. */ + while (pos > 0 && top->cmp_fn(&sz_array[pos - 1], summary) < 0) + pos--; + + /* Not big enough! */ + if (pos >= top->alloc) + return; + + /* We need to shift the data. 
*/ + if (top->nr == top->alloc) + free(sz_array[top->nr - 1].label); + else + top->nr++; + + for (size_t i = top->nr - 1; i > pos; i--) + memcpy(&sz_array[i], &sz_array[i - 1], sizeof(*sz_array)); + + memcpy(&sz_array[pos], summary, sizeof(*summary)); + sz_array[pos].label = xstrdup(summary->label); +} + /** * This struct contains all of the information that needs to be printed * at the end of the exploration of the repository and its references. From 948cda6c427db68e135abb409f385a20c1ecece6 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Sun, 1 Sep 2024 22:35:40 -0400 Subject: [PATCH 08/10] survey: add report of "largest" paths Since we are already walking our reachable objects using the path-walk API, let's now collect lists of the paths that contribute most to different metrics. Specifically, we care about * Number of versions. * Total size on disk. * Total inflated size (no delta or zlib compression). This information can be critical to discovering which parts of the repository are causing the most growth, especially on-disk size. Different packing strategies might help compress data more efficiently, but the total inflated size is a representation of the raw size of all snapshots of those paths. Even when stored efficiently on disk, that size represents how much information must be processed to complete a command such as 'git blame'. Since the on-disk size is likely to be fragile, stop testing the exact output of 'git survey' and check that the correct set of headers is output.
Signed-off-by: Derrick Stolee --- builtin/survey.c | 77 +++++++++++++++++++++++++++++++++++++++---- t/t8100-git-survey.sh | 12 ++++++- 2 files changed, 81 insertions(+), 8 deletions(-) diff --git a/builtin/survey.c b/builtin/survey.c index 3c23fdb9de1890..e11ef15da8fa47 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -75,7 +75,6 @@ struct survey_report_object_size_summary { typedef int (*survey_top_cmp)(void *v1, void *v2); -MAYBE_UNUSED static int cmp_by_nr(void *v1, void *v2) { struct survey_report_object_size_summary *s1 = v1; @@ -88,7 +87,6 @@ static int cmp_by_nr(void *v1, void *v2) return 0; } -MAYBE_UNUSED static int cmp_by_disk_size(void *v1, void *v2) { struct survey_report_object_size_summary *s1 = v1; @@ -101,7 +99,6 @@ static int cmp_by_disk_size(void *v1, void *v2) return 0; } -MAYBE_UNUSED static int cmp_by_inflated_size(void *v1, void *v2) { struct survey_report_object_size_summary *s1 = v1; @@ -132,7 +129,6 @@ struct survey_report_top_table { void *data; }; -MAYBE_UNUSED static void init_top_sizes(struct survey_report_top_table *top, size_t limit, const char *name, survey_top_cmp cmp) @@ -158,7 +154,6 @@ static void clear_top_sizes(struct survey_report_top_table *top) free(sz_array); } -MAYBE_UNUSED static void maybe_insert_into_top_size(struct survey_report_top_table *top, struct survey_report_object_size_summary *summary) { @@ -195,6 +190,10 @@ struct survey_report { struct survey_report_object_summary reachable_objects; struct survey_report_object_size_summary *by_type; + + struct survey_report_top_table *top_paths_by_count; + struct survey_report_top_table *top_paths_by_disk; + struct survey_report_top_table *top_paths_by_inflate; }; #define REPORT_TYPE_COMMIT 0 @@ -446,6 +445,13 @@ static void survey_report_object_sizes(const char *title, clear_table(&table); } +static void survey_report_plaintext_sorted_size( + struct survey_report_top_table *top) +{ + survey_report_object_sizes(top->name, _("Path"), + top->data, top->nr); +} + static 
void survey_report_plaintext(struct survey_context *ctx) { printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree); @@ -456,6 +462,21 @@ static void survey_report_plaintext(struct survey_context *ctx) _("Object Type"), ctx->report.by_type, REPORT_TYPE_COUNT); + + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_count[REPORT_TYPE_TREE]); + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_count[REPORT_TYPE_BLOB]); + + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_disk[REPORT_TYPE_TREE]); + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB]); + + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE]); + survey_report_plaintext_sorted_size( + &ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB]); } /* @@ -696,7 +717,8 @@ static void increment_totals(struct survey_context *ctx, static void increment_object_totals(struct survey_context *ctx, struct oid_array *oids, - enum object_type type) + enum object_type type, + const char *path) { struct survey_report_object_size_summary *total; struct survey_report_object_size_summary summary = { 0 }; @@ -728,6 +750,27 @@ static void increment_object_totals(struct survey_context *ctx, total->disk_size += summary.disk_size; total->inflated_size += summary.inflated_size; total->num_missing += summary.num_missing; + + if (type == OBJ_TREE || type == OBJ_BLOB) { + int index = type == OBJ_TREE ? + REPORT_TYPE_TREE : REPORT_TYPE_BLOB; + struct survey_report_top_table *top; + + /* + * Temporarily store (const char *) here, but it will + * be duped if inserted and will not be freed. 
+ */ + summary.label = (char *)path; + + top = ctx->report.top_paths_by_count; + maybe_insert_into_top_size(&top[index], &summary); + + top = ctx->report.top_paths_by_disk; + maybe_insert_into_top_size(&top[index], &summary); + + top = ctx->report.top_paths_by_inflate; + maybe_insert_into_top_size(&top[index], &summary); + } } static int survey_objects_path_walk_fn(const char *path, @@ -739,7 +782,7 @@ static int survey_objects_path_walk_fn(const char *path, increment_object_counts(&ctx->report.reachable_objects, type, oids->nr); - increment_object_totals(ctx, oids, type); + increment_object_totals(ctx, oids, type, path); ctx->progress_nr += oids->nr; display_progress(ctx->progress, ctx->progress_nr); @@ -749,11 +792,31 @@ static int survey_objects_path_walk_fn(const char *path, static void initialize_report(struct survey_context *ctx) { + const int top_limit = 100; + CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT); ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits")); ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees")); ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs")); ctx->report.by_type[REPORT_TYPE_TAG].label = xstrdup(_("Tags")); + + CALLOC_ARRAY(ctx->report.top_paths_by_count, REPORT_TYPE_COUNT); + init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_TREE], + top_limit, _("TOP DIRECTORIES BY COUNT"), cmp_by_nr); + init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB], + top_limit, _("TOP FILES BY COUNT"), cmp_by_nr); + + CALLOC_ARRAY(ctx->report.top_paths_by_disk, REPORT_TYPE_COUNT); + init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE], + top_limit, _("TOP DIRECTORIES BY DISK SIZE"), cmp_by_disk_size); + init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB], + top_limit, _("TOP FILES BY DISK SIZE"), cmp_by_disk_size); + + CALLOC_ARRAY(ctx->report.top_paths_by_inflate, REPORT_TYPE_COUNT); + init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE], + top_limit, 
_("TOP DIRECTORIES BY INFLATED SIZE"), cmp_by_inflated_size); + init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB], + top_limit, _("TOP FILES BY INFLATED SIZE"), cmp_by_inflated_size); } static void survey_phase_objects(struct survey_context *ctx) diff --git a/t/t8100-git-survey.sh b/t/t8100-git-survey.sh index 6c2867c11c323c..8c6edfcae0c6c2 100755 --- a/t/t8100-git-survey.sh +++ b/t/t8100-git-survey.sh @@ -86,7 +86,17 @@ test_expect_success 'git survey (default)' ' Tags | 4 | $(test_oid tags_size_on_disk) | $(test_oid tags_size) EOF - test_cmp expect out + lines=$(wc -l <expect) && + head -n $lines out >out-trimmed && + test_cmp expect out-trimmed && + + for type in "DIRECTORIES" "FILES" + do + for metric in "COUNT" "DISK SIZE" "INFLATED SIZE" + do + grep "TOP $type BY $metric" out || return 1 + done || return 1 + done ' test_done From b5a4b48066af389ef7f8dfa3e85dc8dbd8ede528 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Mon, 23 Sep 2024 15:38:25 -0400 Subject: [PATCH 09/10] survey: add --top=<N> option and config The 'git survey' builtin provides several detail tables, such as "top files by on-disk size". The size of these tables defaults to 10, currently. Allow the user to specify this number via a new --top=<N> option or the new survey.top config key. Signed-off-by: Derrick Stolee Signed-off-by: Johannes Schindelin --- Documentation/config/survey.txt | 3 +++ builtin/survey.c | 22 ++++++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/Documentation/config/survey.txt b/Documentation/config/survey.txt index c1b0f852a1250e..9e594a2092f225 100644 --- a/Documentation/config/survey.txt +++ b/Documentation/config/survey.txt @@ -8,4 +8,7 @@ survey.*:: This boolean value implies the `--[no-]verbose` option. progress:: This boolean value implies the `--[no-]progress` option. + top:: + This integer value implies `--top=<N>`, specifying the + number of entries in the detail tables.
-- diff --git a/builtin/survey.c b/builtin/survey.c index e11ef15da8fa47..9404ea355ce0f0 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -40,6 +40,7 @@ static struct survey_refs_wanted default_ref_options = { struct survey_opts { int verbose; int show_progress; + int top_nr; struct survey_refs_wanted refs; }; @@ -548,6 +549,10 @@ static int survey_load_config_cb(const char *var, const char *value, ctx->opts.show_progress = git_config_bool(var, value); return 0; } + if (!strcmp(var, "survey.top")) { + ctx->opts.top_nr = git_config_int(var, value, cctx->kvi); + return 0; + } return git_default_config(var, value, cctx, pvoid); } @@ -792,8 +797,6 @@ static int survey_objects_path_walk_fn(const char *path, static void initialize_report(struct survey_context *ctx) { - const int top_limit = 100; - CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT); ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits")); ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees")); @@ -802,21 +805,21 @@ static void initialize_report(struct survey_context *ctx) CALLOC_ARRAY(ctx->report.top_paths_by_count, REPORT_TYPE_COUNT); init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_TREE], - top_limit, _("TOP DIRECTORIES BY COUNT"), cmp_by_nr); + ctx->opts.top_nr, _("TOP DIRECTORIES BY COUNT"), cmp_by_nr); init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB], - top_limit, _("TOP FILES BY COUNT"), cmp_by_nr); + ctx->opts.top_nr, _("TOP FILES BY COUNT"), cmp_by_nr); CALLOC_ARRAY(ctx->report.top_paths_by_disk, REPORT_TYPE_COUNT); init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE], - top_limit, _("TOP DIRECTORIES BY DISK SIZE"), cmp_by_disk_size); + ctx->opts.top_nr, _("TOP DIRECTORIES BY DISK SIZE"), cmp_by_disk_size); init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB], - top_limit, _("TOP FILES BY DISK SIZE"), cmp_by_disk_size); + ctx->opts.top_nr, _("TOP FILES BY DISK SIZE"), cmp_by_disk_size);
CALLOC_ARRAY(ctx->report.top_paths_by_inflate, REPORT_TYPE_COUNT); init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE], - top_limit, _("TOP DIRECTORIES BY INFLATED SIZE"), cmp_by_inflated_size); + ctx->opts.top_nr, _("TOP DIRECTORIES BY INFLATED SIZE"), cmp_by_inflated_size); init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB], - top_limit, _("TOP FILES BY INFLATED SIZE"), cmp_by_inflated_size); + ctx->opts.top_nr, _("TOP FILES BY INFLATED SIZE"), cmp_by_inflated_size); } static void survey_phase_objects(struct survey_context *ctx) @@ -865,6 +868,7 @@ int cmd_survey(int argc, const char **argv, const char *prefix, struct repositor .opts = { .verbose = 0, .show_progress = -1, /* defaults to isatty(2) */ + .top_nr = 10, .refs.want_all_refs = -1, @@ -880,6 +884,8 @@ int cmd_survey(int argc, const char **argv, const char *prefix, struct repositor static struct option survey_options[] = { OPT__VERBOSE(&ctx.opts.verbose, N_("verbose output")), OPT_BOOL(0, "progress", &ctx.opts.show_progress, N_("show progress")), + OPT_INTEGER('n', "top", &ctx.opts.top_nr, + N_("number of entries to include in detail tables")), OPT_BOOL_F(0, "all-refs", &ctx.opts.refs.want_all_refs, N_("include all refs"), PARSE_OPT_NONEG), From d2b3e507948bd894d402cbc25f2278beb70fe033 Mon Sep 17 00:00:00 2001 From: Johannes Schindelin Date: Mon, 1 Jul 2024 23:28:45 +0200 Subject: [PATCH 10/10] survey: clearly note the experimental nature in the output While this command is definitely something we _want_, chances are that upstreaming this will require substantial changes. We still want to be able to experiment with this before that, to focus on what we need out of this command: To assist with diagnosing issues with large repositories, as well as to help monitoring the growth and the associated painpoints of such repositories. 
To that end, we are about to integrate this command into `microsoft/git`, to get the tool into the hands of users who need it most, with the idea to iterate in close collaboration between these users and the developers familiar with Git's internals. However, we will definitely want to avoid letting anybody have the impression that this command, its exact inner workings, as well as its output format, are anywhere close to stable. To make that fact utterly clear (and thereby protect the freedom to iterate and innovate freely before upstreaming the command), let's mark its output as experimental in all-caps, as the first thing we do. Signed-off-by: Johannes Schindelin --- builtin/survey.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/builtin/survey.c b/builtin/survey.c index 9404ea355ce0f0..1ae5dd5756b97d 100644 --- a/builtin/survey.c +++ b/builtin/survey.c @@ -17,6 +17,7 @@ #include "strvec.h" #include "tag.h" #include "trace2.h" +#include "color.h" static const char * const survey_usage[] = { N_("(EXPERIMENTAL!) git survey "), @@ -901,6 +902,11 @@ int cmd_survey(int argc, const char **argv, const char *prefix, struct repositor if (argc == 2 && !strcmp(argv[1], "-h")) usage_with_options(survey_usage, survey_options); + if (isatty(2)) + color_fprintf_ln(stderr, + want_color_fd(2, GIT_COLOR_AUTO) ? GIT_COLOR_YELLOW : "", + "(THIS IS EXPERIMENTAL, EXPECT THE OUTPUT FORMAT TO CHANGE!)"); + ctx.repo = repo; prepare_repo_settings(ctx.repo);