Skip to content

Commit

Permalink
Merge pull request #421: Sparse index: integrate with the `sparse-che…
Browse files Browse the repository at this point in the history
…ckout` builtin

This integrates the `sparse-checkout` builtin with the sparse index. The tricky part here is that we need to partially expand the index when we are modifying the sparse-checkout definition.

Note that we modify the pattern list in a careful way: we create a `struct pattern_list` in-memory in `builtin/sparse-checkout.c` then apply those patterns to the index before writing the patterns to the sparse-checkout file. The `update_sparsity()` method does the work to assign the `SKIP_WORKTREE` bit appropriately, but this doesn't work if the files that are within the new sparse-checkout cone are still hidden behind a sparse directory.

The new `expand_to_pattern_list()` method does the hard work of expanding the sparse directories that are now within the new patterns. This expands only as far as needed, possibly creating new sparse directory entries.

This method does not contract existing file entries into sparse directories, largely because of the check for ignored files that happens as we delete those directories. The `clean_tracked_sparse_directories()` method is called after `update_sparsity()`, but we need to read the `A/B/.gitignore` file (or lack thereof) before we can delete `A/B/`. If we convert to sparse too quickly, then we lose this information and cause a full expansion.

Most of the correctness is handled by existing tests in `t1092`, but I add checks for `ensure_not_expanded` in some hopefully interesting cases.

As for performance, `git sparse-checkout set` can be slow if it needs to move a lot of files. However, a no-op `git sparse-checkout set` (i.e., setting the sparse-checkout cone to include only files at root, and doing so repeatedly) has these performance results on Linux in a monorepo with 2+ million files at `HEAD`:

```
Benchmark #1: baseline
  Time (mean ± σ):     10.465 s ±  0.018 s    [User: 9.885 s, System: 0.573 s]
  Range (min … max):   10.450 s … 10.497 s    5 runs
 
Benchmark #2: new code
  Time (mean ± σ):      68.9 ms ±   2.9 ms    [User: 45.8 ms, System: 17.1 ms]
  Range (min … max):    63.4 ms …  74.0 ms    41 runs
 
Summary
  'new code' ran
  151.89 ± 6.30 times faster than 'baseline'
```
  • Loading branch information
derrickstolee authored and vdye committed May 11, 2022
2 parents 88f5a70 + 89cda2e commit 46d7605
Show file tree
Hide file tree
Showing 11 changed files with 259 additions and 64 deletions.
8 changes: 7 additions & 1 deletion builtin/sparse-checkout.c
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ static void clean_tracked_sparse_directories(struct repository *r)
* sparse index will not delete directories that contain
* conflicted entries or submodules.
*/
if (!r->index->sparse_index) {
if (r->index->sparse_index == COMPLETELY_FULL) {
/*
* If something, such as a merge conflict or other concern,
* prevents us from converting to a sparse index, then do
Expand Down Expand Up @@ -420,6 +420,9 @@ static int update_modes(int *cone_mode, int *sparse_index)
/* force an index rewrite */
repo_read_index(the_repository);
the_repository->index->updated_workdir = 1;

if (!*sparse_index)
ensure_full_index(the_repository->index);
}

return 0;
Expand Down Expand Up @@ -941,6 +944,9 @@ int cmd_sparse_checkout(int argc, const char **argv, const char *prefix)

git_config(git_default_config, NULL);

prepare_repo_settings(the_repository);
the_repository->settings.command_requires_full_index = 0;

if (argc > 0) {
if (!strcmp(argv[0], "list"))
return sparse_checkout_list(argc, argv);
Expand Down
25 changes: 25 additions & 0 deletions cache-tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,31 @@ struct cache_tree_sub *cache_tree_sub(struct cache_tree *it, const char *path)
return find_subtree(it, path, pathlen, 1);
}

/*
 * Descend from 'it' along the '/'-separated 'path' and return the
 * cache_tree node of the last path component.
 *
 * Returns NULL when 'it' is NULL, when an intermediate component has no
 * existing subtree (subtrees are never created here), or when the last
 * component is not a subtree of its parent.
 *
 * Note the side effect: every tree visited on the way down, i.e. each
 * ancestor of the returned node, gets its entry_count set to -1. The
 * returned node itself is not modified.
 */
struct cache_tree *cache_tree_find_path(struct cache_tree *it, const char *path)
{
	while (it) {
		const char *end = strchrnul(path, '/');
		int len = end - path;
		struct cache_tree_sub *sub;

		it->entry_count = -1;

		if (!*end) {
			/* Last component: look it up directly in this tree. */
			int idx = cache_tree_subtree_pos(it, path, len);

			return idx < 0 ? NULL : it->down[idx]->cache_tree;
		}

		/* Intermediate component: step into it if it exists. */
		sub = find_subtree(it, path, len, 0);
		if (!sub)
			return NULL;

		it = sub->cache_tree;
		path = end + 1;
	}

	return NULL;
}

static int do_invalidate_path(struct cache_tree *it, const char *path)
{
/* a/b/c
Expand Down
2 changes: 2 additions & 0 deletions cache-tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ struct cache_tree_sub *cache_tree_sub(struct cache_tree *, const char *);

int cache_tree_subtree_pos(struct cache_tree *it, const char *path, int pathlen);

/*
 * Walk the cache tree along the '/'-separated 'path' and return the
 * cache_tree node for its final component, or NULL if 'it' is NULL or
 * any component along the way has no subtree.
 *
 * Side effect: entry_count is set to -1 on every tree visited on the
 * way down; the returned node itself is left untouched.
 * NOTE(review): presumably -1 marks those trees as invalid -- confirm
 * against the rest of cache-tree.c.
 */
struct cache_tree *cache_tree_find_path(struct cache_tree *it, const char *path);

void cache_tree_write(struct strbuf *, struct cache_tree *root);
struct cache_tree *cache_tree_read(const char *buffer, unsigned long size);

Expand Down
32 changes: 24 additions & 8 deletions cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,28 @@ struct untracked_cache;
struct progress;
struct pattern_list;

/*
* Describes to what extent an index is represented using sparse
* directory entries.
*/
enum sparse_index_mode {
/*
* COMPLETELY_FULL: there are no sparse directories
* in the index at all.
*/
COMPLETELY_FULL = 0,

/*
* COLLAPSED: the index has already been collapsed to sparse
* directories wherever possible.
*/
COLLAPSED = 1,

/*
* PARTIALLY_SPARSE: the sparse directories that exist are
* outside the sparse-checkout boundary, but it is possible
* that some file entries could collapse to sparse directory
* entries.
*/
PARTIALLY_SPARSE = 2,
};

struct index_state {
struct cache_entry **cache;
unsigned int version;
Expand All @@ -323,14 +345,8 @@ struct index_state {
drop_cache_tree : 1,
updated_workdir : 1,
updated_skipworktree : 1,
fsmonitor_has_run_once : 1,

/*
* sparse_index == 1 when sparse-directory
* entries exist. Requires sparse-checkout
* in cone mode.
*/
sparse_index : 1;
fsmonitor_has_run_once : 1;
enum sparse_index_mode sparse_index;
struct hashmap name_hash;
struct hashmap dir_hash;
struct object_id oid;
Expand Down
78 changes: 44 additions & 34 deletions dir.c
Original file line number Diff line number Diff line change
Expand Up @@ -1400,46 +1400,16 @@ static struct path_pattern *last_matching_pattern_from_list(const char *pathname
return res;
}

/*
* Scan the list of patterns to determine if the ordered list
* of patterns matches on 'pathname'.
*
* Return 1 for a match, 0 for not matched and -1 for undecided.
*/
enum pattern_match_result path_matches_pattern_list(
enum pattern_match_result path_matches_cone_mode_pattern_list(
const char *pathname, int pathlen,
const char *basename, int *dtype,
struct pattern_list *pl,
struct index_state *istate)
struct pattern_list *pl)
{
struct path_pattern *pattern;
struct strbuf parent_pathname = STRBUF_INIT;
int result = NOT_MATCHED;
size_t slash_pos;

/*
* The virtual file system data is used to prevent git from traversing
* any part of the tree that is not in the virtual file system. Return
* 1 to exclude the entry if it is not found in the virtual file system,
* else fall through to the regular excludes logic as it may further exclude.
*/
if (*dtype == DT_UNKNOWN)
*dtype = resolve_dtype(DT_UNKNOWN, istate, pathname, pathlen);
if (is_excluded_from_virtualfilesystem(pathname, pathlen, *dtype) > 0)
return 1;

if (!pl->use_cone_patterns) {
pattern = last_matching_pattern_from_list(pathname, pathlen, basename,
dtype, pl, istate);
if (pattern) {
if (pattern->flags & PATTERN_FLAG_NEGATIVE)
return NOT_MATCHED;
else
return MATCHED;
}

return UNDECIDED;
}
if (!pl->use_cone_patterns)
BUG("path_matches_cone_mode_pattern_list requires cone mode patterns");

if (pl->full_cone)
return MATCHED;
Expand Down Expand Up @@ -1492,6 +1462,46 @@ enum pattern_match_result path_matches_pattern_list(
return result;
}

/*
 * Decide whether 'pathname' is matched by the ordered pattern list 'pl'.
 *
 * If '*dtype' is DT_UNKNOWN it is resolved first and written back
 * through 'dtype'. The virtual file system check then runs before any
 * pattern is consulted and short-circuits with 1 when it reports the
 * path as excluded.
 *
 * Cone-mode lists are delegated to
 * path_matches_cone_mode_pattern_list(). Otherwise the last matching
 * pattern decides: NOT_MATCHED for a negative pattern, MATCHED for a
 * positive one, and UNDECIDED when no pattern matches at all.
 */
enum pattern_match_result path_matches_pattern_list(
		const char *pathname, int pathlen,
		const char *basename, int *dtype,
		struct pattern_list *pl,
		struct index_state *istate)
{
	struct path_pattern *hit;

	/*
	 * The virtual file system data keeps git from traversing any part
	 * of the tree that is not in the virtual file system, so consult
	 * it before the regular pattern logic, which may exclude further.
	 */
	if (*dtype == DT_UNKNOWN)
		*dtype = resolve_dtype(DT_UNKNOWN, istate, pathname, pathlen);
	if (is_excluded_from_virtualfilesystem(pathname, pathlen, *dtype) > 0)
		return 1;

	if (pl->use_cone_patterns)
		return path_matches_cone_mode_pattern_list(pathname, pathlen, pl);

	hit = last_matching_pattern_from_list(pathname, pathlen, basename,
					      dtype, pl, istate);
	if (!hit)
		return UNDECIDED;

	return (hit->flags & PATTERN_FLAG_NEGATIVE) ? NOT_MATCHED : MATCHED;
}

int init_sparse_checkout_patterns(struct index_state *istate)
{
if (!core_apply_sparse_checkout)
Expand Down
9 changes: 9 additions & 0 deletions dir.h
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,15 @@ enum pattern_match_result {
MATCHED_RECURSIVE = 2,
};

/*
* Test if a given path is contained in the given pattern list.
*
* The given pattern list _must_ use cone mode patterns.
*/
enum pattern_match_result path_matches_cone_mode_pattern_list(
const char *pathname, int pathlen,
struct pattern_list *pl);

/*
* Scan the list of patterns to determine if the ordered list
* of patterns matches on 'pathname'.
Expand Down
6 changes: 3 additions & 3 deletions read-cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ static const char *alternate_index_output;
static void set_index_entry(struct index_state *istate, int nr, struct cache_entry *ce)
{
if (S_ISSPARSEDIR(ce->ce_mode))
istate->sparse_index = 1;
istate->sparse_index = COLLAPSED;

istate->cache[nr] = ce;
add_name_hash(istate, ce);
Expand Down Expand Up @@ -1874,7 +1874,7 @@ static int read_index_extension(struct index_state *istate,
break;
case CACHE_EXT_SPARSE_DIRECTORIES:
/* no content, only an indicator */
istate->sparse_index = 1;
istate->sparse_index = COLLAPSED;
break;
default:
if (*ext < 'A' || 'Z' < *ext)
Expand Down Expand Up @@ -3191,7 +3191,7 @@ static int do_write_locked_index(struct index_state *istate, struct lock_file *l
unsigned flags)
{
int ret;
int was_full = !istate->sparse_index;
int was_full = istate->sparse_index == COMPLETELY_FULL;

ret = convert_to_sparse(istate, 0);

Expand Down
Loading

0 comments on commit 46d7605

Please sign in to comment.