From 621565397e4dd566a498364df01acc793a28ccf9 Mon Sep 17 00:00:00 2001 From: Neeraj Singh Date: Fri, 5 Feb 2021 18:28:33 -0800 Subject: [PATCH] unpack-trees:virtualfilesystem: Improve efficiency of clear_ce_flags When the virtualfilesystem is enabled, the previous implementation of clear_ce_flags would iterate over all of the cache entries and query whether each one is in the virtual filesystem to determine whether to clear one of the SKIP_WORKTREE bits. For each cache entry, we would do a hash lookup for each parent directory in the is_included_in_virtualfilesystem function. That approach is slow for a typical Windows OS enlistment with 3 million files where only a small percentage is in the virtual filesystem. The cost is O(n_index_entries * n_chars_per_path * n_parent_directories_per_path). In this change, we use the same approach as apply_virtualfilesystem, which iterates over the set of entries in the virtual filesystem and searches the cache for the corresponding entries in order to clear their flags. This approach has a cost of O(n_virtual_filesystem_entries * n_chars_per_path * log(n_index_entries)). The apply_virtualfilesystem code was refactored a bit and modified to clear flags for all names that 'alias' a given virtual filesystem name when ignore_case is set. n_virtual_filesystem_entries is typically much less than n_index_entries, in which case the new approach is much faster. We wind up building the name hash for the index, but this occurs quickly thanks to multi-threading. 
Signed-off-by: Neeraj Singh --- cache.h | 1 + name-hash.c | 20 +++++++ unpack-trees.c | 27 ++++----- virtualfilesystem.c | 142 ++++++++++++++++++++++++++++---------------- virtualfilesystem.h | 7 +++ 5 files changed, 132 insertions(+), 65 deletions(-) diff --git a/cache.h b/cache.h index 35239144d228c6..508ec6cc98e8a1 100644 --- a/cache.h +++ b/cache.h @@ -810,6 +810,7 @@ int strcmp_offset(const char *s1, const char *s2, size_t *first_change); int index_dir_exists(struct index_state *istate, const char *name, int namelen); void adjust_dirname_case(struct index_state *istate, char *name); struct cache_entry *index_file_exists(struct index_state *istate, const char *name, int namelen, int igncase); +struct cache_entry *index_file_next_match(struct index_state *istate, struct cache_entry *ce, int igncase); /* * Searches for an entry defined by name and namelen in the given index. diff --git a/name-hash.c b/name-hash.c index 7487d331240e2a..f1bf449e9639ea 100644 --- a/name-hash.c +++ b/name-hash.c @@ -730,6 +730,26 @@ struct cache_entry *index_file_exists(struct index_state *istate, const char *na return NULL; } +struct cache_entry *index_file_next_match(struct index_state *istate, struct cache_entry *ce, int igncase) +{ + struct cache_entry *next; + + if (!igncase || !ce) { + return NULL; + } + + next = hashmap_get_next_entry(&istate->name_hash, ce, ent); + if (!next) + return NULL; + + hashmap_for_each_entry_from(&istate->name_hash, next, ent) { + if (same_name(next, ce->name, ce_namelen(ce), igncase)) + return next; + } + + return NULL; +} + void free_name_hash(struct index_state *istate) { if (!istate->name_hash_initialized) diff --git a/unpack-trees.c b/unpack-trees.c index d089a3a0cc8b0f..792528367b478a 100644 --- a/unpack-trees.c +++ b/unpack-trees.c @@ -1612,14 +1612,6 @@ static int clear_ce_flags_1(struct index_state *istate, continue; } - /* if it's not in the virtual file system, exit early */ - if (core_virtualfilesystem) { - if 
(is_included_in_virtualfilesystem(ce->name, ce->ce_namelen) > 0) - ce->ce_flags &= ~clear_mask; - cache++; - continue; - } - if (prefix->len && strncmp(ce->name, prefix->buf, prefix->len)) break; @@ -1696,12 +1688,19 @@ static int clear_ce_flags(struct index_state *istate, xsnprintf(label, sizeof(label), "clear_ce_flags/0x%08lx_0x%08lx", (unsigned long)select_mask, (unsigned long)clear_mask); trace2_region_enter("unpack_trees", label, the_repository); - rval = clear_ce_flags_1(istate, - istate->cache, - istate->cache_nr, - &prefix, - select_mask, clear_mask, - pl, 0, 0); + if (core_virtualfilesystem) { + rval = clear_ce_flags_virtualfilesystem(istate, + select_mask, + clear_mask); + } else { + rval = clear_ce_flags_1(istate, + istate->cache, + istate->cache_nr, + &prefix, + select_mask, clear_mask, + pl, 0, 0); + } + trace2_region_leave("unpack_trees", label, the_repository); stop_progress(&istate->progress); diff --git a/virtualfilesystem.c b/virtualfilesystem.c index 1de397cefb6878..1dd50b08edcbc6 100644 --- a/virtualfilesystem.c +++ b/virtualfilesystem.c @@ -247,93 +247,133 @@ int is_excluded_from_virtualfilesystem(const char *pathname, int pathlen, int dt return -1; } -/* - * Update the CE_SKIP_WORKTREE bits based on the virtual file system. 
- */ -void apply_virtualfilesystem(struct index_state *istate) +struct apply_virtual_filesystem_stats { + int nr_unknown; + int nr_vfs_dirs; + int nr_vfs_rows; + int nr_bulk_skip; + int nr_explicit_skip; +}; + +static void clear_ce_flags_virtualfilesystem_1(struct index_state *istate, int select_mask, int clear_mask, + struct apply_virtual_filesystem_stats *stats) { char *buf, *entry; int i; - int nr_unknown = 0; - int nr_vfs_dirs = 0; - int nr_vfs_rows = 0; - int nr_bulk_skip = 0; - int nr_explicit_skip = 0; - - if (!git_config_get_virtualfilesystem()) - return; - - trace2_region_enter("vfs", "apply", the_repository); if (!virtual_filesystem_data.len) get_virtual_filesystem_data(&virtual_filesystem_data); - /* set CE_SKIP_WORKTREE bit on all entries */ - for (i = 0; i < istate->cache_nr; i++) - istate->cache[i]->ce_flags |= CE_SKIP_WORKTREE; - - /* clear CE_SKIP_WORKTREE bit for everything in the virtual file system */ + /* clear specified flag bits for everything in the virtual file system */ entry = buf = virtual_filesystem_data.buf; for (i = 0; i < virtual_filesystem_data.len; i++) { if (buf[i] == '\0') { + struct cache_entry *ce; int pos, len; - nr_vfs_rows++; + stats->nr_vfs_rows++; len = buf + i - entry; /* look for a directory wild card (ie "dir1/") */ if (buf[i - 1] == '/') { - nr_vfs_dirs++; + stats->nr_vfs_dirs++; if (ignore_case) adjust_dirname_case(istate, entry); pos = index_name_pos(istate, entry, len); if (pos < 0) { - pos = -pos - 1; - while (pos < istate->cache_nr && !fspathncmp(istate->cache[pos]->name, entry, len)) { - if (istate->cache[pos]->ce_flags & CE_SKIP_WORKTREE) - nr_bulk_skip++; - istate->cache[pos]->ce_flags &= ~CE_SKIP_WORKTREE; - pos++; + for (pos = -pos - 1; pos < istate->cache_nr; pos++) { + ce = istate->cache[pos]; + if (fspathncmp(ce->name, entry, len)) + break; + + if (select_mask && !(ce->ce_flags & select_mask)) + continue; + + if (ce->ce_flags & clear_mask) + stats->nr_bulk_skip++; + ce->ce_flags &= ~clear_mask; } } } else { 
if (ignore_case) { - struct cache_entry *ce = index_file_exists(istate, entry, len, ignore_case); - if (ce) { - if (ce->ce_flags & CE_SKIP_WORKTREE) - nr_explicit_skip++; - ce->ce_flags &= ~CE_SKIP_WORKTREE; - } - else { - nr_unknown++; - } + ce = index_file_exists(istate, entry, len, ignore_case); } else { int pos = index_name_pos(istate, entry, len); - if (pos >= 0) { - if (istate->cache[pos]->ce_flags & CE_SKIP_WORKTREE) - nr_explicit_skip++; - istate->cache[pos]->ce_flags &= ~CE_SKIP_WORKTREE; - } - else { - nr_unknown++; - } + + ce = NULL; + if (pos >= 0) + ce = istate->cache[pos]; + } + + if (ce) { + do { + if (!select_mask || (ce->ce_flags & select_mask)) { + if (ce->ce_flags & clear_mask) + stats->nr_explicit_skip++; + ce->ce_flags &= ~clear_mask; + } + + /* + * There may be aliases with different cases of the same + * name that also need to be modified. + */ + if (ignore_case) + ce = index_file_next_match(istate, ce, ignore_case); + else + break; + + } while (ce); + } else { + stats->nr_unknown++; } } entry += len + 1; } } +} + +/* + * Clear the specified flags for all entries in the virtual file system + * that match the specified select mask. Returns the number of entries + * processed. + */ +int clear_ce_flags_virtualfilesystem(struct index_state *istate, int select_mask, int clear_mask) +{ + struct apply_virtual_filesystem_stats stats = {0}; + + clear_ce_flags_virtualfilesystem_1(istate, select_mask, clear_mask, &stats); + return istate->cache_nr; +} + +/* + * Update the CE_SKIP_WORKTREE bits based on the virtual file system. 
+ */ +void apply_virtualfilesystem(struct index_state *istate) +{ + int i; + struct apply_virtual_filesystem_stats stats = {0}; + + if (!git_config_get_virtualfilesystem()) + return; + + trace2_region_enter("vfs", "apply", the_repository); + + /* set CE_SKIP_WORKTREE bit on all entries */ + for (i = 0; i < istate->cache_nr; i++) + istate->cache[i]->ce_flags |= CE_SKIP_WORKTREE; - if (nr_vfs_rows > 0) { - trace2_data_intmax("vfs", the_repository, "apply/tracked", nr_bulk_skip + nr_explicit_skip); + clear_ce_flags_virtualfilesystem_1(istate, 0, CE_SKIP_WORKTREE, &stats); + if (stats.nr_vfs_rows > 0) { + trace2_data_intmax("vfs", the_repository, "apply/tracked", stats.nr_bulk_skip + stats.nr_explicit_skip); - trace2_data_intmax("vfs", the_repository, "apply/vfs_rows", nr_vfs_rows); - trace2_data_intmax("vfs", the_repository, "apply/vfs_dirs", nr_vfs_dirs); + trace2_data_intmax("vfs", the_repository, "apply/vfs_rows", stats.nr_vfs_rows); + trace2_data_intmax("vfs", the_repository, "apply/vfs_dirs", stats.nr_vfs_dirs); - trace2_data_intmax("vfs", the_repository, "apply/nr_unknown", nr_unknown); - trace2_data_intmax("vfs", the_repository, "apply/nr_bulk_skip", nr_bulk_skip); - trace2_data_intmax("vfs", the_repository, "apply/nr_explicit_skip", nr_explicit_skip); + trace2_data_intmax("vfs", the_repository, "apply/nr_unknown", stats.nr_unknown); + trace2_data_intmax("vfs", the_repository, "apply/nr_bulk_skip", stats.nr_bulk_skip); + trace2_data_intmax("vfs", the_repository, "apply/nr_explicit_skip", stats.nr_explicit_skip); } trace2_region_leave("vfs", "apply", the_repository); diff --git a/virtualfilesystem.h b/virtualfilesystem.h index 5e8c5b096df09a..7a31126ab9ea8c 100644 --- a/virtualfilesystem.h +++ b/virtualfilesystem.h @@ -6,6 +6,13 @@ */ void apply_virtualfilesystem(struct index_state *istate); +/* + * Clear the specified flags for all entries in the virtual file system + * that match the specified select mask. Returns the number of entries + * processed. 
+ */ +int clear_ce_flags_virtualfilesystem(struct index_state *istate, int select_mask, int clear_mask); + /* * Return 1 if the requested item is found in the virtual file system, * 0 for not found and -1 for undecided.