From 57531c900aea58eca6671ffc6437964269425d7a Mon Sep 17 00:00:00 2001 From: Eric Mecklenburg Date: Tue, 2 May 2017 15:53:03 -0400 Subject: [PATCH] gvfs: use a hashmap to back gitignore/sparse checkout The always_exclude mechanism is used both to implement the excludes defined in, say, .gitignore, and the sparse checkout feature. Due to its use of wildcards, any sizable list of entries will slow down every path-related operation dramatically. This is particularly noticeable with GVFS, where the sparse-checkout file is populated and updated frequently by the backend. In GVFS' case, though, the use of wildcards is limited to same-directory wildcards (i.e. something of the form `/this/directory/*`); therefore, we can speed up everything dramatically by using exact matches backed by a hashmap. Note: the use of a hashmap disallows more free-form sparse-checkout entries such as `*.txt`. Using a hashmap restricts us to only allow exact file names, same-directory wildcards, and negative entries in the sparse-checkout file, but with GVFS that is all we need. This patch introduces the option to make it so. Signed-off-by: Eric Mecklenburg --- Documentation/config.txt | 6 + dir.c | 103 +++++++++++++-- dir.h | 1 + gvfs.h | 1 + t/t3009-ls-files-always-exclude-hashmap.sh | 138 +++++++++++++++++++++ 5 files changed, 240 insertions(+), 9 deletions(-) create mode 100644 t/t3009-ls-files-always-exclude-hashmap.sh diff --git a/Documentation/config.txt b/Documentation/config.txt index 730b678c021c1f..032a7206f1eb83 100644 --- a/Documentation/config.txt +++ b/Documentation/config.txt @@ -948,6 +948,12 @@ core.gvfs:: happened. This drops the merge time from ~1 hour to ~5 minutes and the unmerged entries goes down from ~40,000 to 1. + GVFS_ALWAYS_EXCLUDE_HASHMAP:: + Bit value 512 + Changes always_exclude to use a hashmap to speed up lookups. + Only works with exact matches and same-folder wildcards (pa/th/*) + and supports negatives. 
Behavior depends on GVFS's usage pattern + and is unlikely to work correctly otherwise. -- core.sparseCheckout:: diff --git a/dir.c b/dir.c index b25c45c9f2c006..2c170348a4a8f7 100644 --- a/dir.c +++ b/dir.c @@ -17,6 +17,7 @@ #include "utf8.h" #include "varint.h" #include "ewah/ewok.h" +#include "gvfs.h" /* * Tells read_directory_recursive how a file or directory should be treated. @@ -50,6 +51,22 @@ static enum path_treatment read_directory_recursive(struct dir_struct *dir, int check_only, const struct pathspec *pathspec); static int get_dtype(struct dirent *de, const char *path, int len); +static int path_hashmap_cmp(const void *a, const void *b, const void *key) +{ + const struct exclude *e1 = a; + const struct exclude *e2 = b; + /* 0 (equal) only when length and bytes both match, not a mere prefix */ + return e1->patternlen != e2->patternlen || strncmp(e1->pattern, e2->pattern, e1->patternlen); +} + +static int path_hashmap_icmp(const void *a, const void *b, const void *key) +{ + const struct exclude *e1 = a; + const struct exclude *e2 = b; + /* case-insensitive variant; strncasecmp pairs with strcasecmp in fspathcmp() */ + return e1->patternlen != e2->patternlen || strncasecmp(e1->pattern, e2->pattern, e1->patternlen); +} + int fspathcmp(const char *a, const char *b) { return ignore_case ? strcasecmp(a, b) : strcmp(a, b); @@ -845,9 +862,11 @@ struct exclude_list *add_exclude_list(struct dir_struct *dir, * Used to set up core.excludesfile and .git/info/exclude lists. */ static void add_excludes_from_file_1(struct dir_struct *dir, const char *fname, + int setup_hashmap, struct sha1_stat *sha1_stat) { struct exclude_list *el; + int i; /* * catch setup_standard_excludes() that's called before * dir->untracked is assigned. That function behaves @@ -858,12 +877,27 @@ static void add_excludes_from_file_1(struct dir_struct *dir, const char *fname, el = add_exclude_list(dir, EXC_FILE, fname); if (add_excludes(fname, "", 0, el, 0, sha1_stat) < 0) die("cannot use %s as an exclude file", fname); + + if (setup_hashmap && el->nr) { + hashmap_init(&el->pattern_hash, + ignore_case ? 
path_hashmap_icmp : path_hashmap_cmp, + el->nr); + + for (i = el->nr - 1; 0 <= i; i--) { + struct exclude *x = el->excludes[i]; + hashmap_entry_init(&x->ent, + ignore_case ? + strihash(x->pattern) : + strhash(x->pattern)); + hashmap_add(&el->pattern_hash, &x->ent); + } + } } void add_excludes_from_file(struct dir_struct *dir, const char *fname) { dir->unmanaged_exclude_files++; /* see validate_untracked_cache() */ - add_excludes_from_file_1(dir, fname, NULL); + add_excludes_from_file_1(dir, fname, 0, NULL); } int match_basename(const char *basename, int basenamelen, @@ -949,6 +983,19 @@ int match_pathname(const char *pathname, int pathlen, WM_PATHNAME) == 0; } +static struct exclude *find_exclude_matching_hash(const char *pattern, + int pattern_len, + struct exclude_list *el) +{ + struct exclude search; + + hashmap_entry_init(&search, + ignore_case ? strihash(pattern) : strhash(pattern)); + search.pattern = pattern; + search.patternlen = pattern_len; + return hashmap_get(&el->pattern_hash, &search, NULL); +} + /* * Scan the given exclude list in reverse to see whether pathname * should be ignored. The first match (i.e. the last on the list), if @@ -964,6 +1011,42 @@ static struct exclude *last_exclude_matching_from_list(const char *pathname, struct exclude *exc = NULL; /* undecided */ int i; + if (el->pattern_hash.size) { + /* + * We cannot search for every possible rule that matches the + * current path because there are countless odd permutations + * with wildcards. Instead we search for most common cases + * and fall through to the old logic if we fail. 
+ */ + static struct strbuf sb = STRBUF_INIT; + const char *slash; + struct exclude *match; + + /* Check exact match with leading slash "/a/b/c" */ + strbuf_reset(&sb); + strbuf_addch(&sb, '/'); + strbuf_add(&sb, pathname, pathlen); + match = find_exclude_matching_hash(sb.buf, sb.len, el); + if (match) + return match; + + /* Check wildcard match with leading slash "/a/b/ *" */ + slash = strrchr(pathname, '/'); + strbuf_reset(&sb); + strbuf_addch(&sb, '/'); + if (slash) + strbuf_add(&sb, pathname, slash - pathname + 1); + strbuf_addch(&sb, '*'); + match = find_exclude_matching_hash(sb.buf, sb.len, el); + if (match) + return match; + + /* Check general wildcard "*" */ + match = find_exclude_matching_hash("*", 1, el); + if (match) + return match; + } + if (!el->nr) return NULL; /* undefined */ @@ -981,9 +1064,9 @@ static struct exclude *last_exclude_matching_from_list(const char *pathname, if (x->flags & EXC_FLAG_NODIR) { if (match_basename(basename, - pathlen - (basename - pathname), - exclude, prefix, x->patternlen, - x->flags)) { + pathlen - (basename - pathname), + exclude, prefix, x->patternlen, + x->flags)) { exc = x; break; } @@ -992,8 +1075,8 @@ static struct exclude *last_exclude_matching_from_list(const char *pathname, assert(x->baselen == 0 || x->base[x->baselen - 1] == '/'); if (match_pathname(pathname, pathlen, - x->base, x->baselen ? x->baselen - 1 : 0, - exclude, prefix, x->patternlen, x->flags)) { + x->base, x->baselen ? 
x->baselen - 1 : 0, + exclude, prefix, x->patternlen, x->flags)) { exc = x; break; } @@ -2320,27 +2403,29 @@ static GIT_PATH_FUNC(git_path_info_exclude, "info/exclude") void setup_standard_excludes(struct dir_struct *dir) { + int always_exclude_hashmap = gvfs_config_is_set(GVFS_ALWAYS_EXCLUDE_HASHMAP); + dir->exclude_per_dir = ".gitignore"; /* always_exclude */ if (startup_info->have_repository) { const char *path = git_path_info_always_exclude(); if (!access_or_warn(path, R_OK, 0)) - add_excludes_from_file_1(dir, path, NULL); + add_excludes_from_file_1(dir, path, always_exclude_hashmap, NULL ); } /* core.excludesfile defaulting to $XDG_HOME/git/ignore */ if (!excludes_file) excludes_file = xdg_config_home("ignore"); if (excludes_file && !access_or_warn(excludes_file, R_OK, 0)) - add_excludes_from_file_1(dir, excludes_file, + add_excludes_from_file_1(dir, excludes_file, 0, dir->untracked ? &dir->ss_excludes_file : NULL); /* per repository user preference */ if (startup_info->have_repository) { const char *path = git_path_info_exclude(); if (!access_or_warn(path, R_OK, 0)) - add_excludes_from_file_1(dir, path, + add_excludes_from_file_1(dir, path, 0, dir->untracked ? &dir->ss_info_exclude : NULL); } } diff --git a/dir.h b/dir.h index 2293638e58b7bb..8a61c920adfff6 100644 --- a/dir.h +++ b/dir.h @@ -17,6 +17,7 @@ struct dir_entry { #define EXC_FLAG_NEGATIVE 16 struct exclude { + /* This must be the first element for hashmaps to work */ struct hashmap_entry ent; /* * This allows callers of last_exclude_matching() etc. 
diff --git a/gvfs.h b/gvfs.h index 3816d4086ce1b0..5ac88867a2133c 100644 --- a/gvfs.h +++ b/gvfs.h @@ -20,6 +20,7 @@ #define GVFS_BLOCK_FILTERS_AND_EOL_CONVERSIONS (1 << 6) #define GVFS_SKIP_MERGE_IN_CHECKOUT (1 << 7) #define GVFS_DEFAULT_MERGE_OPTIONS (1 << 8) +#define GVFS_ALWAYS_EXCLUDE_HASHMAP (1 << 9) void gvfs_load_config_value(const char *value); int gvfs_config_is_set(int mask); diff --git a/t/t3009-ls-files-always-exclude-hashmap.sh b/t/t3009-ls-files-always-exclude-hashmap.sh new file mode 100644 index 00000000000000..66e1820572928c --- /dev/null +++ b/t/t3009-ls-files-always-exclude-hashmap.sh @@ -0,0 +1,138 @@ +#!/bin/sh + +test_description='always_exclude hashmap tests' + +. ./test-lib.sh + +test_expect_success 'setup' ' + mkdir -p dir1/dir2 && + >a && + >dir1/a && + >dir1/b && + >dir1/dir2/a && + cat >.gitignore <<\EOF +standard +with_hashmap +EOF +' + +test_expect_success 'status with positive simple exact match' ' + cat >.git/info/always_exclude <<\EOF && +a +EOF + git status -u >standard && + git config --local core.gvfs 512 && + git status -u >with_hashmap && + git config --local core.gvfs 0 && + test_cmp standard with_hashmap +' + +test_expect_success 'status with negative simple exact match' ' + cat >.git/info/always_exclude <<\EOF && +!a +EOF + git status -u >standard && + git config --local core.gvfs 512 && + git status -u >with_hashmap && + git config --local core.gvfs 0 && + test_cmp standard with_hashmap +' + +test_expect_success 'status with matching exact match' ' + cat >.git/info/always_exclude <<\EOF && +/dir1/a +EOF + git status -u >standard && + git config --local core.gvfs 512 && + git status -u >with_hashmap && + git config --local core.gvfs 0 && + test_cmp standard with_hashmap +' + +test_expect_success 'status with non-matching exact match' ' + cat >.git/info/always_exclude <<\EOF && +/dir1/dir +EOF + git status -u >standard && + git config --local core.gvfs 512 && + git status -u >with_hashmap && + git config --local core.gvfs 0 
&& + test_cmp standard with_hashmap +' + +test_expect_success 'status with matching wildcard match' ' + cat >.git/info/always_exclude <<\EOF && +/dir1/* +EOF + git status -u >standard && + git config --local core.gvfs 512 && + git status -u >with_hashmap && + git config --local core.gvfs 0 && + test_cmp standard with_hashmap +' + +test_expect_success 'status with non-matching wildcard match' ' + cat >.git/info/always_exclude <<\EOF && +/dir1/dir3/* +EOF + git status -u >standard && + git config --local core.gvfs 512 && + git status -u >with_hashmap && + git config --local core.gvfs 0 && + test_cmp standard with_hashmap +' + +test_expect_success 'status with everything excluded' ' + cat >.git/info/always_exclude <<\EOF && +* +EOF + git status -u >standard && + git config --local core.gvfs 512 && + git status -u >with_hashmap && + git config --local core.gvfs 0 && + test_cmp standard with_hashmap +' + +test_expect_success 'status with some excluded' ' + cat >.git/info/always_exclude <<\EOF && +* +!/* +EOF + git status -u >standard && + git config --local core.gvfs 512 && + git status -u >with_hashmap && + git config --local core.gvfs 0 && + test_cmp standard with_hashmap +' + +test_expect_success 'status with less excluded' ' + cat >.git/info/always_exclude <<\EOF && +* +!/* +!/dir1 +!/dir1/* +EOF + git status -u >standard && + git config --local core.gvfs 512 && + git status -u >with_hashmap && + git config --local core.gvfs 0 && + test_cmp standard with_hashmap +' + +test_expect_success 'status with nothing excluded' ' + cat >.git/info/always_exclude <<\EOF && +* +!/* +!/dir1 +!/dir1/* +!/dir1/dir2 +!/dir1/dir2/* +EOF + git status -u >standard && + git config --local core.gvfs 512 && + git status -u >with_hashmap && + git config --local core.gvfs 0 && + test_cmp standard with_hashmap +' + +test_done