diff --git a/.gitignore b/.gitignore index 833ef3b0b783b8..545e195f2a2138 100644 --- a/.gitignore +++ b/.gitignore @@ -95,6 +95,7 @@ /git-merge-subtree /git-mergetool /git-mergetool--lib +/git-midx /git-mktag /git-mktree /git-name-rev diff --git a/Documentation/config.txt b/Documentation/config.txt index 64c1dbba940f1b..dc7cb4b9000dfd 100644 --- a/Documentation/config.txt +++ b/Documentation/config.txt @@ -896,6 +896,9 @@ core.notesRef:: This setting defaults to "refs/notes/commits", and it can be overridden by the `GIT_NOTES_REF` environment variable. See linkgit:git-notes[1]. +core.midx:: + Enable "multi-pack-index" feature. Set to true to read and write MIDX files. + core.sparseCheckout:: Enable "sparse checkout" feature. See section "Sparse checkout" in linkgit:git-read-tree[1] for more information. diff --git a/Documentation/git-midx.txt b/Documentation/git-midx.txt new file mode 100644 index 00000000000000..4635247d0df6d9 --- /dev/null +++ b/Documentation/git-midx.txt @@ -0,0 +1,107 @@ +git-midx(1) +============ + +NAME +---- +git-midx - Write and verify multi-pack-indexes (MIDX files). + + +SYNOPSIS +-------- +[verse] +'git midx' [--write|--read|--clear] [--pack-dir <dir>] + +DESCRIPTION +----------- +Write a MIDX file. + +OPTIONS +------- + +--pack-dir <dir>:: + Use given directory for the location of packfiles, pack-indexes, + and MIDX files. + +--clear:: + If specified, delete the midx file specified by midx-head, and + midx-head. (Cannot be combined with --write or --read.) + +--read:: + If specified, read a midx file specified by the midx-head file + and output basic details about the midx file. (Cannot be combined + with --write.) + +--midx-id <oid>:: + If specified with --read, use the given oid to read midx-[oid].midx + instead of using midx-head. + +--write:: + If specified, write a new midx file to the pack directory using + the packfiles present. Outputs the hash of the result midx file. + (Cannot be combined with --read.) 
+ +--update-head:: + If specified with --write, update the midx-head file to point to + the written midx file. + +--delete-expired:: + If specified with --write and --update-head, delete the midx file + previously pointed to by midx-head (if changed). + +EXAMPLES +-------- + +* Read the midx-head file and output the OID of the head MIDX file. ++ +------------------------------------------------ +$ git midx +------------------------------------------------ + +* Write a MIDX file for the packfiles in your local .git folder. ++ +------------------------------------------------ +$ git midx --write +------------------------------------------------ + +* Write a MIDX file for the packfiles in your local .git folder and + update the midx-head file. ++ +------------------------------------------------ +$ git midx --write --update-head +------------------------------------------------ + +* Write a MIDX file for the packfiles in a different folder. ++ +--------------------------------------------------------- +$ git midx --write --pack-dir ../../alt/pack/ +--------------------------------------------------------- + +* Read the current midx-head. ++ +----------------------------------------------- +$ git midx --read +----------------------------------------------- + +* Read a specific MIDX file in the local .git folder. ++ +-------------------------------------------------------------------- +$ git midx --read --midx-id 3e50d982a2257168c7fd0ff12ffe5cf6af38c74e +-------------------------------------------------------------------- + +* Delete the current midx-head and the file it references. ++ +----------------------------------------------- +$ git midx --clear +----------------------------------------------- + +CONFIGURATION +------------- + +core.midx:: + The midx command will fail if core.midx is false. + Also, the written MIDX files will be ignored by other commands + unless core.midx is true. 
+ +GIT +--- +Part of the linkgit:git[1] suite diff --git a/Documentation/technical/multi-pack-index.txt b/Documentation/technical/multi-pack-index.txt new file mode 100644 index 00000000000000..d31b03dec546cb --- /dev/null +++ b/Documentation/technical/multi-pack-index.txt @@ -0,0 +1,149 @@ +Multi-Pack-Index (MIDX) Design Notes +==================================== + +The Git object directory contains a 'pack' directory containing +packfiles (with suffix ".pack") and pack-indexes (with suffix +".idx"). The pack-indexes provide a way to lookup objects and +navigate to their offset within the pack, but these must come +in pairs with the packfiles. This pairing depends on the file +names, as the pack-index differs only in suffix with its pack- +file. While the pack-indexes provide fast lookup per packfile, +this performance degrades as the number of packfiles increases, +because abbreviations need to inspect every packfile and we are +more likely to have a miss on our most-recently-used packfile. +For some large repositories, repacking into a single packfile +is not feasible due to storage space or excessive repack times. + +The multi-pack-index (MIDX for short, with suffix ".midx") +stores a list of objects and their offsets into multiple pack- +files. It contains: + +- A list of packfile names. +- A sorted list of object IDs. +- A list of metadata for the ith object ID including: + - A value j referring to the jth packfile. + - An offset within the jth packfile for the object. +- If large offsets are required, we use another list of large + offsets similar to version 2 pack-indexes. + +Thus, we can provide O(log N) lookup time for any number +of packfiles. + +A new config setting 'core.midx' must be enabled before writing +or reading MIDX files. + +The MIDX files are updated by the 'midx' builtin with the +following common parameter combinations: + +- 'git midx' gives the hash of the current MIDX head. 
+- 'git midx --write --update-head --delete-expired' writes a new + MIDX file, points the MIDX head to that file, and deletes the + existing MIDX file if out-of-date. +- 'git midx --read' lists some basic information about the current + MIDX head. Used for basic tests. +- 'git midx --clear' deletes the current MIDX head. + +Design Details +-------------- + +- The MIDX file refers only to packfiles in the same directory + as the MIDX file. + +- A special file, 'midx-head', stores the hash of the latest + MIDX file so we can load the file without performing a dirstat. + This file is especially important with incremental MIDX files, + pointing to the newest file. + +- If a packfile exists in the pack directory but is not referenced + by the MIDX file, then the packfile is loaded into the packed_git + list and Git can access the objects as usual. This behavior is + necessary since other tools could add packfiles to the pack + directory without notifying Git. + +- The MIDX file should be only a supplemental structure. If a + user downgrades or disables the `core.midx` config setting, + then the existing .idx and .pack files should be sufficient + to operate correctly. + +- The file format includes parameters for the object id length + and hash algorithm, so a future change of hash algorithm does + not require a change in format. + +- If an object appears in multiple packfiles, then only one copy + is stored in the MIDX. This has a possible performance issue: + If an object appears as the delta-base of multiple objects from + multiple packs, then cross-pack delta calculations may slow down. + This is currently only theoretical and has not been demonstrated + to be a measurable issue. + +Current Limitations +------------------- + +- MIDX files are managed only by the midx builtin and are not + automatically updated on clone or fetch. + +- There is no '--verify' option for the midx builtin to verify + the contents of the MIDX file against the pack contents. 
+ +- Constructing a MIDX file currently requires the single-pack + index for every pack being added to the MIDX. + +- The fsck builtin does not check MIDX files, but should. + +- The repack builtin is not aware of the MIDX files, and may + invalidate the MIDX files by deleting existing packfiles. The + MIDX may also be extended in the future to store metadata about + a packfile that can be used for faster repack commands. + +- The naive Git HTTP server advertises lists of packfiles using + the file system directly. + +Future Work +----------- + +- The current file-format requires between 28 and 36 bytes per + object. As the repository grows, the MIDX file can become + very large and become a bottleneck when updating the file. To + fix this "big write" problem, we can make the MIDX file + incremental. Instead of just one MIDX file, we will have a + sequence of MIDX files that can be unioned together. Then + on write we take the new objects to add and consider how many + existing files should be merged into a new file containing + the latest objects. + + This list of "base indexes" will be presented as an optional + chunk in the MIDX format and contains the OIDs for the base + files. Thus, the `midx-head` file only stores the OID for the + "tip" MIDX file and then the rest are loaded based on those + pointers, such as the following figure: + + [ BIG ] <- [ MEDIUM ] <- [tiny] <- midx-head + ^___________________________| + + The plan being that every write replaces the "tiny" index, + and when that index becomes large enough it merges with the + "medium" index and a new tiny index is created in the next + write. Very rarely, the "big" index would be updated, causing + a slow write. + +- After the MIDX feature is sufficiently hardened and widely used, + consider making Git more fully depend on the MIDX file. If MIDX + is the default, then we can delete the single-pack-indexes from + the pack directory. We could also allow thin packs in the pack + directory. 
+ +- The MIDX could be extended to store a "stable object order" such + that adding objects to the order does not change the existing + objects. This would enable re-using the reachability bitmaps after + repacking and updating the MIDX file. + +Related Links +------------- + +[0] https://bugs.chromium.org/p/git/issues/detail?id=6 + Chromium work item for: Multi-Pack Index (MIDX) + +[1] https://public-inbox.org/git/CB5074CF.3AD7A%25joshua.redstone@fb.com/T/#u + Subject: Git performance results on a large repository + Date: 3 Feb 2012 + diff --git a/Documentation/technical/pack-format.txt b/Documentation/technical/pack-format.txt index 8e5bf60be3f068..ab459ef14258a6 100644 --- a/Documentation/technical/pack-format.txt +++ b/Documentation/technical/pack-format.txt @@ -160,3 +160,88 @@ Pack file entry: <+ corresponding packfile. 20-byte SHA-1-checksum of all of the above. + +== midx-*.midx files have the following format: + +The multi-pack-index (MIDX) files refer to multiple pack-files. + +In order to allow extensions that add extra data to the MIDX format, we +organize the body into "chunks" and provide a lookup table at the beginning +of the body. The header includes certain length values, such as the number +of packs, the number of base MIDX files, hash lengths and types. + +All 4-byte numbers are in network order. + +HEADER: + + 4-byte signature: + The signature is: {'M', 'I', 'D', 'X'} + + 4-byte version number: + Git currently only supports version 1. + + 1-byte Object Id Version (1 = SHA-1) + + 1-byte Object Id Length (H) + + 1-byte number (I) of base multi-pack-index files: + This value is currently always zero. + + 1-byte number (C) of "chunks" + + 4-byte number (P) of pack files + +CHUNK LOOKUP: + + (C + 1) * 12 bytes providing the chunk offsets: + First 4 bytes describe chunk id. Value 0 is a terminating label. + Other 8 bytes provide offset in current file for chunk to start. 
+ (Chunks are provided in file-order, so you can infer the length + using the next chunk position if necessary.) + + The remaining data in the body is described one chunk at a time, and + these chunks may be given in any order. Chunks are required unless + otherwise specified. + +CHUNK DATA: + + OID Fanout (ID: {'O', 'I', 'D', 'F'}) (256 * 4 bytes) + The ith entry, F[i], stores the number of OIDs with first + byte at most i. Thus F[255] stores the total + number of objects (N). The number of objects with first byte + value i is (F[i] - F[i-1]) for i > 0. + + OID Lookup (ID: {'O', 'I', 'D', 'L'}) (N * H bytes) + The OIDs for all objects in the MIDX are stored in lexicographic + order in this chunk. + + Object Offsets (ID: {'O', 'O', 'F', 'F'}) (N * 8 bytes) + Stores two 4-byte values for every object. + 1: The pack-int-id for the pack storing this object. + 2: The offset within the pack. + If all offsets are less than 2^31, then the large offset chunk + will not exist and offsets are stored as in IDX v1. + If there is at least one offset value larger than 2^31-1, then + the large offset chunk must exist. If the large offset chunk + exists and the 31st bit is on, then removing that bit reveals + the row in the large offsets containing the 8-byte offset of + this object. + + [Optional] Object Large Offsets (ID: {'L', 'O', 'F', 'F'}) + 8-byte offsets into large packfiles. + + Packfile Name Lookup (ID: {'P', 'L', 'O', 'O'}) (P * 4 bytes) + P * 4 bytes storing the offset in the packfile name chunk for + the null-terminated string containing the filename for the + ith packfile. The filename is relative to the MIDX file's parent + directory. + + Packfile Names (ID: {'P', 'N', 'A', 'M'}) + Stores the packfile names as concatenated, null-terminated strings. + Packfiles must be listed in lexicographic order for fast lookups by + name. This is the only chunk not guaranteed to be a multiple of four + bytes in length, so it should be the last chunk for alignment reasons. 
+
+TRAILER:
+
+	H-byte HASH-checksum of all of the above.
diff --git a/Makefile b/Makefile
index 2a81ae22e92cc5..5c458705c18831 100644
--- a/Makefile
+++ b/Makefile
@@ -827,6 +827,7 @@ LIB_OBJS += merge.o
 LIB_OBJS += merge-blobs.o
 LIB_OBJS += merge-recursive.o
 LIB_OBJS += mergesort.o
+LIB_OBJS += midx.o
 LIB_OBJS += mru.o
 LIB_OBJS += name-hash.o
 LIB_OBJS += notes.o
@@ -979,6 +980,7 @@ BUILTIN_OBJS += builtin/merge-index.o
 BUILTIN_OBJS += builtin/merge-ours.o
 BUILTIN_OBJS += builtin/merge-recursive.o
 BUILTIN_OBJS += builtin/merge-tree.o
+BUILTIN_OBJS += builtin/midx.o
 BUILTIN_OBJS += builtin/mktag.o
 BUILTIN_OBJS += builtin/mktree.o
 BUILTIN_OBJS += builtin/mv.o
diff --git a/builtin.h b/builtin.h
index 42378f3aa471eb..880383e341472d 100644
--- a/builtin.h
+++ b/builtin.h
@@ -188,6 +188,7 @@ extern int cmd_merge_ours(int argc, const char **argv, const char *prefix);
 extern int cmd_merge_file(int argc, const char **argv, const char *prefix);
 extern int cmd_merge_recursive(int argc, const char **argv, const char *prefix);
 extern int cmd_merge_tree(int argc, const char **argv, const char *prefix);
+extern int cmd_midx(int argc, const char **argv, const char *prefix);
 extern int cmd_mktag(int argc, const char **argv, const char *prefix);
 extern int cmd_mktree(int argc, const char **argv, const char *prefix);
 extern int cmd_mv(int argc, const char **argv, const char *prefix);
diff --git a/builtin/midx.c b/builtin/midx.c
new file mode 100644
index 00000000000000..6f56f39390a3bd
--- /dev/null
+++ b/builtin/midx.c
@@ -0,0 +1,352 @@
+#include "builtin.h"
+#include "cache.h"
+#include "config.h"
+#include "dir.h"
+#include "git-compat-util.h"
+#include "lockfile.h"
+#include "packfile.h"
+#include "parse-options.h"
+#include "midx.h"
+
+/*
+ * Usage lines, one per mode. The placeholders mirror the argument
+ * names ("dir", "oid") declared for the options in cmd_midx().
+ */
+static char const * const builtin_midx_usage[] = {
+	N_("git midx [--pack-dir <dir>]"),
+	N_("git midx --write [--update-head [--delete-expired]] [--pack-dir <dir>]"),
+	N_("git midx --read [--midx-id <oid>] [--pack-dir <dir>]"),
+	N_("git midx --clear [--pack-dir <dir>]"),
+	NULL
+};
+
+/*
+ * Parsed command-line options, plus two fields filled in by cmd_midx()
+ * before dispatching: has_existing (a midx-head could be read) and
+ * old_midx_oid (the OID read from midx-head).
+ */
+static struct opts_midx {
+	const char *pack_dir;
+	int clear;
+	int read;
+	const char *midx_id;
+	int write;
+	int update_head;
+	int delete_expired;
+	int has_existing;
+	struct object_id old_midx_oid;
+} opts;
+
+/*
+ * Remove midx-head and the MIDX file it names from opts.pack_dir.
+ * Does nothing when no midx-head was found. Dies if either file
+ * cannot be removed. Always returns 0.
+ */
+static int midx_clear(void)
+{
+	struct strbuf head_path = STRBUF_INIT;
+	char *old_path;
+
+	if (!opts.has_existing)
+		return 0;
+
+	strbuf_addstr(&head_path, opts.pack_dir);
+	strbuf_addstr(&head_path, "/");
+	strbuf_addstr(&head_path, "midx-head");
+	if (remove_path(head_path.buf))
+		die("failed to remove path %s", head_path.buf);
+	strbuf_release(&head_path);
+
+	old_path = get_midx_filename_oid(opts.pack_dir, &opts.old_midx_oid);
+	if (remove_path(old_path))
+		die("failed to remove path %s", old_path);
+	free(old_path);
+
+	return 0;
+}
+
+/*
+ * Load a MIDX file and print its header fields, object count, the
+ * chunks that are present, the pack names and the pack directory.
+ *
+ * The file is selected by --midx-id when that value has the length of
+ * a full hex OID; otherwise the OID stored in midx-head is used.
+ * Dies when the id is not valid hex, when no midx-head exists, or when
+ * the MIDX file cannot be loaded. Returns 0 on success.
+ */
+static int midx_read(void)
+{
+	struct object_id midx_oid;
+	struct midxed_git *midx;
+	uint32_t i;
+
+	if (opts.midx_id && strlen(opts.midx_id) == GIT_MAX_HEXSZ) {
+		/* get_oid_hex() returns non-zero on malformed input */
+		if (get_oid_hex(opts.midx_id, &midx_oid))
+			die("invalid midx id: %s", opts.midx_id);
+	} else if (!get_midx_head_oid(opts.pack_dir, &midx_oid))
+		die("No midx-head exists.");
+
+	midx = get_midxed_git(opts.pack_dir, &midx_oid);
+	/* the loader returns NULL when the file cannot be opened */
+	if (!midx)
+		die("failed to load midx file midx-%s.midx",
+		    oid_to_hex(&midx_oid));
+
+	printf("header: %08x %x %d %d %d %d %"PRIu32"\n",
+	       ntohl(midx->hdr->midx_signature),
+	       ntohl(midx->hdr->midx_version),
+	       midx->hdr->hash_version,
+	       midx->hdr->hash_len,
+	       midx->hdr->num_base_midx,
+	       midx->hdr->num_chunks,
+	       ntohl(midx->hdr->num_packs));
+	printf("num_objects: %"PRIu32"\n", midx->num_objects);
+	printf("chunks:");
+
+	if (midx->chunk_pack_lookup)
+		printf(" pack_lookup");
+	if (midx->chunk_pack_names)
+		printf(" pack_names");
+	if (midx->chunk_oid_fanout)
+		printf(" oid_fanout");
+	if (midx->chunk_oid_lookup)
+		printf(" oid_lookup");
+	if (midx->chunk_object_offsets)
+		printf(" object_offsets");
+	if (midx->chunk_large_offsets)
+		printf(" large_offsets");
+	printf("\n");
+
+	printf("pack_names:\n");
+	for (i = 0; i < midx->num_packs; i++)
+		printf("%s\n", midx->pack_names[i]);
+
+	printf("pack_dir: %s\n", midx->pack_dir);
+	return 0;
+}
+
+static int build_midx_from_packs(
+	const char *pack_dir,
+
const char **pack_names, uint32_t nr_packs,
+	const char **midx_id, struct midxed_git *midx)
+{
+	/*
+	 * Collect the objects of every pack in pack_names that is not
+	 * already covered by the existing midx (which may be NULL), merge
+	 * them with the entries of that midx, and write a new MIDX file.
+	 *
+	 * On return *midx_id is the id reported by write_midx_file(), the
+	 * hex of the existing head when nothing new was found, or NULL
+	 * when there is nothing to index at all. Always returns 0.
+	 */
+	struct packed_git **packs;
+	const char **installed_pack_names;
+	uint32_t i, j, nr_installed_packs = 0;
+	uint32_t nr_objects = 0;
+	struct pack_midx_entry *objects;
+	struct pack_midx_entry **obj_ptrs;
+	uint32_t nr_total_packs = nr_packs;
+	uint32_t pack_offset = 0;
+	struct strbuf pack_path = STRBUF_INIT;
+	int baselen;
+
+	if (midx)
+		nr_total_packs += midx->num_packs;
+
+	if (!nr_total_packs) {
+		*midx_id = NULL;
+		return 0;
+	}
+
+	ALLOC_ARRAY(packs, nr_total_packs);
+	ALLOC_ARRAY(installed_pack_names, nr_total_packs);
+
+	/* packs already in the midx keep their positions at the front */
+	if (midx) {
+		for (i = 0; i < midx->num_packs; i++)
+			installed_pack_names[nr_installed_packs++] = midx->pack_names[i];
+		pack_offset = midx->num_packs;
+	}
+
+	strbuf_addstr(&pack_path, pack_dir);
+	strbuf_addch(&pack_path, '/');
+	baselen = pack_path.len;
+	for (i = 0; i < nr_packs; i++) {
+		strbuf_setlen(&pack_path, baselen);
+		strbuf_addstr(&pack_path, pack_names[i]);
+
+		if (midx && contains_pack(midx, pack_names[i]))
+			continue;
+
+		/* add_packed_git() is handed the ".idx" path of the pack */
+		strbuf_strip_suffix(&pack_path, ".pack");
+		strbuf_addstr(&pack_path, ".idx");
+
+		packs[nr_installed_packs] = add_packed_git(pack_path.buf, pack_path.len, 0);
+
+		if (packs[nr_installed_packs] != NULL) {
+			/*
+			 * NOTE(review): a pack whose index fails to open is
+			 * skipped but its packed_git is not released here,
+			 * and the successfully opened ones are never closed
+			 * either; confirm the intended ownership.
+			 */
+			if (open_pack_index(packs[nr_installed_packs]))
+				continue;
+
+			nr_objects += packs[nr_installed_packs]->num_objects;
+			installed_pack_names[nr_installed_packs] = pack_names[i];
+			nr_installed_packs++;
+		}
+	}
+	strbuf_release(&pack_path);
+
+	/* nothing new to add: report the existing head, if any */
+	if (!nr_objects || !nr_installed_packs) {
+		FREE_AND_NULL(packs);
+		FREE_AND_NULL(installed_pack_names);
+
+		if (opts.has_existing)
+			*midx_id = oid_to_hex(&opts.old_midx_oid);
+		else
+			*midx_id = NULL;
+
+		return 0;
+	}
+
+	if (midx)
+		nr_objects += midx->num_objects;
+
+	ALLOC_ARRAY(objects, nr_objects);
+	nr_objects = 0;
+
+	for (i = 0; midx && i < midx->num_objects; i++)
+		nth_midxed_object_entry(midx, i, &objects[nr_objects++]);
+
+	/* only slots at and after pack_offset hold opened packed_git's */
+	for (i = pack_offset; i < nr_installed_packs; i++) {
+		struct packed_git *p = packs[i];
+
+		for (j = 0; j < p->num_objects; j++) {
+			struct pack_midx_entry entry;
+
+			/* report the object position (j), not the pack index */
+			if (!nth_packed_object_oid(&entry.oid, p, j))
+				die("unable to get sha1 of object %u in %s",
+				    j, p->pack_name);
+
+			entry.pack_int_id = i;
+			entry.offset = nth_packed_object_offset(p, j);
+
+			objects[nr_objects] = entry;
+			nr_objects++;
+		}
+	}
+
+	ALLOC_ARRAY(obj_ptrs, nr_objects);
+	for (i = 0; i < nr_objects; i++)
+		obj_ptrs[i] = &objects[i];
+
+	*midx_id = write_midx_file(pack_dir, NULL,
+		installed_pack_names, nr_installed_packs,
+		obj_ptrs, nr_objects);
+
+	FREE_AND_NULL(packs);
+	FREE_AND_NULL(installed_pack_names);
+	FREE_AND_NULL(obj_ptrs);
+	FREE_AND_NULL(objects);
+
+	return 0;
+}
+
+/*
+ * Atomically replace <pack_dir>/midx-head with the hex id in midx_id,
+ * going through a lock file. Dies if the lock cannot be taken, the id
+ * cannot be written, or the lock file cannot be committed.
+ */
+static void update_head_file(const char *pack_dir, const char *midx_id)
+{
+	int fd;
+	struct lock_file lk = LOCK_INIT;
+	char *head_path = get_midx_head_filename(pack_dir);
+
+	fd = hold_lock_file_for_update(&lk, head_path, LOCK_DIE_ON_ERROR);
+	FREE_AND_NULL(head_path);
+
+	if (fd < 0)
+		die_errno("unable to open midx-head");
+
+	/* a short or failed write must not be committed as the new head */
+	if (write_in_full(fd, midx_id, GIT_MAX_HEXSZ) < 0)
+		die_errno("unable to write midx-head");
+	if (commit_lock_file(&lk))
+		die_errno("unable to update midx-head");
+}
+
+/*
+ * Implement --write: list the "*.pack" files in opts.pack_dir, build
+ * a MIDX from them (extending the current head's MIDX if one exists)
+ * and print the resulting id. Returns 1 if the pack directory cannot
+ * be opened, 0 otherwise.
+ */
+static int midx_write(void)
+{
+	const char **pack_names = NULL;
+	uint32_t i, nr_packs = 0;
+	const char *midx_id = 0;
+	DIR *dir;
+	struct dirent *de;
+	struct midxed_git *midx = NULL;
+
+	if (opts.has_existing)
+		midx = get_midxed_git(opts.pack_dir, &opts.old_midx_oid);
+
+	dir = opendir(opts.pack_dir);
+	if (!dir) {
+		error_errno("unable to open object pack directory: %s",
+			    opts.pack_dir);
+		return 1;
+	}
+
+	/* initial capacity; ALLOC_GROW below extends it as needed */
+	nr_packs = 256;
+	ALLOC_ARRAY(pack_names, nr_packs);
+
+	i = 0;
+	while ((de = readdir(dir)) != NULL) {
+		if (is_dot_or_dotdot(de->d_name))
+			continue;
+
+		if (ends_with(de->d_name, ".pack")) {
+			ALLOC_GROW(pack_names, i + 1, nr_packs);
+			pack_names[i++] = xstrdup(de->d_name);
+		}
+	}
+
+	/* from here on nr_packs is the number of names collected */
+	nr_packs = i;
+	closedir(dir);
+
+	if (!nr_packs)
+		goto cleanup;
+
+	if (build_midx_from_packs(opts.pack_dir, pack_names,
+				  nr_packs,
&midx_id, midx))
+		die("failed to build MIDX");
+
+	if (midx_id == NULL)
+		goto cleanup;
+
+	printf("%s\n", midx_id);
+
+	if (opts.update_head)
+		update_head_file(opts.pack_dir, midx_id);
+
+	/* drop the previous head's file only if the head actually moved */
+	if (opts.delete_expired && opts.update_head && opts.has_existing &&
+	    strcmp(midx_id, oid_to_hex(&opts.old_midx_oid))) {
+		char *old_path = get_midx_filename_oid(opts.pack_dir, &opts.old_midx_oid);
+		/* midx is NULL when the old head's file could not be loaded */
+		if (midx)
+			close_midx(midx);
+		if (remove_path(old_path))
+			die("failed to remove path %s", old_path);
+
+		free(old_path);
+	}
+
+cleanup:
+	/*
+	 * Each name was xstrdup()ed while scanning the directory, so the
+	 * strings must be released along with the array. nr_packs holds
+	 * the number of collected names on every path that reaches here.
+	 */
+	for (i = 0; i < nr_packs; i++)
+		free((char *)pack_names[i]);
+	free(pack_names);
+	return 0;
+}
+
+/*
+ * Entry point for "git midx". Requires core.midx to be enabled.
+ * At most one of --write, --read and --clear may be given; with none
+ * of them the OID stored in midx-head (if any) is printed.
+ * The pack directory defaults to "<object directory>/pack".
+ */
+int cmd_midx(int argc, const char **argv, const char *prefix)
+{
+	static struct option builtin_midx_options[] = {
+		{ OPTION_STRING, 'p', "pack-dir", &opts.pack_dir,
+			N_("dir"),
+			N_("The pack directory containing set of packfile and pack-index pairs.") },
+		OPT_BOOL('c', "clear", &opts.clear,
+			N_("clear midx file and midx-head")),
+		OPT_BOOL('r', "read", &opts.read,
+			N_("read midx file")),
+		{ OPTION_STRING, 'M', "midx-id", &opts.midx_id,
+			N_("oid"),
+			N_("An OID for a specific midx file in the pack-dir."),
+			PARSE_OPT_OPTARG, NULL, (intptr_t) "" },
+		OPT_BOOL('w', "write", &opts.write,
+			N_("write midx file")),
+		OPT_BOOL('u', "update-head", &opts.update_head,
+			N_("update midx-head to written midx file")),
+		OPT_BOOL('d', "delete-expired", &opts.delete_expired,
+			N_("delete expired head midx file")),
+		OPT_END(),
+	};
+
+	if (argc == 2 && !strcmp(argv[1], "-h"))
+		usage_with_options(builtin_midx_usage, builtin_midx_options);
+
+	git_config(git_default_config, NULL);
+	if (!core_midx)
+		die(_("git-midx requires core.midx=true"));
+
+	argc = parse_options(argc, argv, prefix,
+			     builtin_midx_options,
+			     builtin_midx_usage, 0);
+
+	/* the three modes are mutually exclusive */
+	if (opts.write + opts.read + opts.clear > 1)
+		usage_with_options(builtin_midx_usage, builtin_midx_options);
+
+	if (!opts.pack_dir) {
+		struct strbuf path = STRBUF_INIT;
+		strbuf_addstr(&path, get_object_directory());
+		strbuf_addstr(&path, "/pack");
+		opts.pack_dir = strbuf_detach(&path, NULL);
+	}
+
+	/* remember the current head, if any, for the mode handlers */
+	opts.has_existing = !!get_midx_head_oid(opts.pack_dir, &opts.old_midx_oid);
+
+	if (opts.clear)
+		return midx_clear();
+	if (opts.read)
+		return midx_read();
+	if (opts.write)
+		return midx_write();
+
+	if (opts.has_existing)
+		printf("%s\n", oid_to_hex(&opts.old_midx_oid));
+	return 0;
+}
diff --git a/cache.h b/cache.h
index a2ec8c0b55422f..f4943d3136bcd4 100644
--- a/cache.h
+++ b/cache.h
@@ -820,6 +820,7 @@ extern int precomposed_unicode;
 extern int protect_hfs;
 extern int protect_ntfs;
 extern const char *core_fsmonitor;
+extern int core_midx;
 
 /*
  * Include broken refs in all ref iterations, which will
diff --git a/command-list.txt b/command-list.txt
index a1fad28fd82da1..a7b9412182de46 100644
--- a/command-list.txt
+++ b/command-list.txt
@@ -87,6 +87,7 @@ git-merge-index plumbingmanipulators
 git-merge-one-file purehelpers
 git-mergetool ancillarymanipulators
 git-merge-tree ancillaryinterrogators
+git-midx plumbingmanipulators
 git-mktag plumbingmanipulators
 git-mktree plumbingmanipulators
 git-mv mainporcelain worktree
diff --git a/config.c b/config.c
index e617c2018d22b6..17f560ddc471f1 100644
--- a/config.c
+++ b/config.c
@@ -1223,6 +1223,11 @@ static int git_default_core_config(const char *var, const char *value)
 		return 0;
 	}
 
+	if (!strcmp(var, "core.midx")) {
+		core_midx = git_config_bool(var, value);
+		return 0;
+	}
+
 	if (!strcmp(var, "core.sparsecheckout")) {
 		core_apply_sparse_checkout = git_config_bool(var, value);
 		return 0;
diff --git a/environment.c b/environment.c
index 63ac38a46f8f01..57a39438494803 100644
--- a/environment.c
+++ b/environment.c
@@ -78,6 +78,8 @@ int protect_hfs = PROTECT_HFS_DEFAULT;
 int protect_ntfs = PROTECT_NTFS_DEFAULT;
 const char *core_fsmonitor;
 
+int core_midx;
+
 /*
  * The character that begins a commented line in user-editable file
  * that is subject to stripspace.
diff --git a/git.c b/git.c
index c870b9719c21b2..87fbda846350b1 100644
--- a/git.c
+++ b/git.c
@@ -431,6 +431,7 @@ static struct cmd_struct commands[] = {
 	{ "merge-recursive-theirs", cmd_merge_recursive, RUN_SETUP | NEED_WORK_TREE },
 	{ "merge-subtree", cmd_merge_recursive, RUN_SETUP | NEED_WORK_TREE },
 	{ "merge-tree", cmd_merge_tree, RUN_SETUP },
+	{ "midx", cmd_midx, RUN_SETUP },
 	{ "mktag", cmd_mktag, RUN_SETUP },
 	{ "mktree", cmd_mktree, RUN_SETUP },
 	{ "mv", cmd_mv, RUN_SETUP | NEED_WORK_TREE },
diff --git a/midx.c b/midx.c
new file mode 100644
index 00000000000000..4b2398b3eeee44
--- /dev/null
+++ b/midx.c
@@ -0,0 +1,850 @@
+#include "cache.h"
+#include "git-compat-util.h"
+#include "pack.h"
+#include "packfile.h"
+#include "midx.h"
+
+/* File signature and chunk ids: four ASCII bytes read as a big-endian word. */
+#define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */
+#define MIDX_CHUNKID_PACKLOOKUP 0x504c4f4f /* "PLOO" */
+#define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */
+#define MIDX_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */
+#define MIDX_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */
+#define MIDX_CHUNKID_OBJECTOFFSETS 0x4f4f4646 /* "OOFF" */
+#define MIDX_CHUNKID_LARGEOFFSETS 0x4c4f4646 /* "LOFF" */
+
+/* The only format version this code reads and writes. */
+#define MIDX_VERSION_1 1
+#define MIDX_VERSION MIDX_VERSION_1
+
+/* Object id flavour recorded in the header (SHA-1, 20 raw bytes). */
+#define MIDX_OID_VERSION_SHA1 1
+#define MIDX_OID_LEN_SHA1 20
+#define MIDX_OID_VERSION MIDX_OID_VERSION_SHA1
+#define MIDX_OID_LEN MIDX_OID_LEN_SHA1
+
+/* Flag bit in a 4-byte offset: the rest indexes the large-offset chunk. */
+#define MIDX_LARGE_OFFSET_NEEDED 0x80000000
+
+/* MIDX-git global storage */
+struct midxed_git *midxed_git = 0;
+
+/*
+ * Build the path "<pack_dir>/midx-<hex oid>.midx".
+ * The returned string is heap-allocated; the caller must free() it.
+ */
+char* get_midx_filename_oid(const char *pack_dir,
+			    struct object_id *oid)
+{
+	struct strbuf head_path = STRBUF_INIT;
+	strbuf_addstr(&head_path, pack_dir);
+	strbuf_addstr(&head_path, "/midx-");
+	strbuf_addstr(&head_path, oid_to_hex(oid));
+	strbuf_addstr(&head_path, ".midx");
+
+	return strbuf_detach(&head_path, NULL);
+}
+
+/*
+ * Build the path "<pack_dir>/midx-head".
+ * The returned string is heap-allocated; the caller must free() it.
+ */
+char *get_midx_head_filename(const char *pack_dir)
+{
+	struct strbuf head_filename = STRBUF_INIT;
+	strbuf_addstr(&head_filename, pack_dir);
+	strbuf_addstr(&head_filename, "/midx-head");
+
return strbuf_detach(&head_filename, NULL);
+}
+
+/*
+ * Read the hex OID stored in <pack_dir>/midx-head into *oid.
+ * Returns oid on success, or NULL when the file cannot be opened or
+ * does not hold a valid hex OID. Dies if the file opens but nothing
+ * can be read from it.
+ */
+struct object_id *get_midx_head_oid(const char *pack_dir,
+				    struct object_id *oid)
+{
+	char oid_hex[GIT_MAX_HEXSZ + 1];
+	FILE *f;
+	char *head_filename = get_midx_head_filename(pack_dir);
+
+	f = fopen(head_filename, "r");
+	FREE_AND_NULL(head_filename);
+
+	if (!f)
+		return NULL;
+
+	if (!fgets(oid_hex, sizeof(oid_hex), f))
+		die("failed to read midx-head");
+
+	fclose(f);
+
+	if (get_oid_hex(oid_hex, oid))
+		return NULL;
+	return oid;
+}
+
+/*
+ * Allocate a zeroed midxed_git with `extra` trailing bytes, which the
+ * loader uses to store the pack directory string.
+ */
+static struct midxed_git *alloc_midxed_git(int extra)
+{
+	struct midxed_git *m = xmalloc(st_add(sizeof(*m), extra));
+	memset(m, 0, sizeof(*m));
+	m->midx_fd = -1;
+
+	return m;
+}
+
+/*
+ * Map midx_file into memory and parse its header and chunk table.
+ *
+ * Returns NULL when the file cannot be opened or stat'ed. Dies when
+ * the file is too small, has the wrong signature or version, has a
+ * chunk table or chunk offset that lies outside the file, names an
+ * unknown chunk id, or lacks one of the required chunks (every chunk
+ * except the large-offset chunk is required by the format).
+ *
+ * The returned structure keeps the mapping and the descriptor open.
+ */
+static struct midxed_git *load_midxed_git_one(const char *midx_file, const char *pack_dir)
+{
+	void *midx_map;
+	const unsigned char *data;
+	struct pack_midx_header *hdr;
+	size_t midx_size, packs_len;
+	struct stat st;
+	uint32_t i;
+	struct midxed_git *midx;
+	int fd = git_open(midx_file);
+
+	if (fd < 0)
+		return NULL;
+	if (fstat(fd, &st)) {
+		close(fd);
+		return NULL;
+	}
+	midx_size = xsize_t(st.st_size);
+
+	if (midx_size < 16 + 8 * 5 + 4 * 256 + GIT_MAX_RAWSZ) {
+		close(fd);
+		die("midx file %s is too small", midx_file);
+	}
+	midx_map = xmmap(NULL, midx_size, PROT_READ, MAP_PRIVATE, fd, 0);
+	data = (const unsigned char *)midx_map;
+
+	hdr = midx_map;
+	if (ntohl(hdr->midx_signature) != MIDX_SIGNATURE) {
+		munmap(midx_map, midx_size);
+		close(fd);
+		die("MIDX signature %X does not match signature %X",
+		    ntohl(hdr->midx_signature), MIDX_SIGNATURE);
+	}
+
+	if (ntohl(hdr->midx_version) != MIDX_VERSION) {
+		munmap(midx_map, midx_size);
+		close(fd);
+		die("MIDX version %X does not match version %X",
+		    ntohl(hdr->midx_version), MIDX_VERSION);
+	}
+
+	/* the lookup table holds num_chunks entries plus a terminator */
+	if (midx_size < sizeof(*hdr) + 12 * ((size_t)hdr->num_chunks + 1)) {
+		munmap(midx_map, midx_size);
+		close(fd);
+		die("midx file %s is too small for its chunk table", midx_file);
+	}
+
+	midx = alloc_midxed_git(strlen(pack_dir) + 1);
+
+	midx->hdr = hdr;
+	midx->midx_fd = fd;
+	midx->data = midx_map;
+	midx->data_len = midx_size;
+
+	/* each table entry: 4-byte chunk id followed by an 8-byte offset */
+	for (i = 0; i <= hdr->num_chunks; i++) {
+		uint32_t chunk_id = ntohl(*(uint32_t*)(data + sizeof(*hdr) + 12 * i));
+		uint64_t chunk_offset1 = ntohl(*(uint32_t*)(data + sizeof(*hdr) + 12 * i + 4));
+		uint32_t chunk_offset2 = ntohl(*(uint32_t*)(data + sizeof(*hdr) + 12 * i + 8));
+		uint64_t chunk_offset = (chunk_offset1 << 32) | chunk_offset2;
+
+		if (sizeof(data) == 4 && chunk_offset >> 32) {
+			munmap(midx_map, midx_size);
+			close(fd);
+			die(_("unable to memory-map in 32-bit address space"));
+		}
+
+		/* a real chunk must start inside the mapped file */
+		if (chunk_id && chunk_offset >= midx_size) {
+			munmap(midx_map, midx_size);
+			close(fd);
+			die("MIDX chunk %08x starts beyond the end of %s",
+			    chunk_id, midx_file);
+		}
+
+		switch (chunk_id) {
+		case MIDX_CHUNKID_PACKLOOKUP:
+			midx->chunk_pack_lookup = data + chunk_offset;
+			break;
+
+		case MIDX_CHUNKID_PACKNAMES:
+			midx->chunk_pack_names = data + chunk_offset;
+			break;
+
+		case MIDX_CHUNKID_OIDFANOUT:
+			midx->chunk_oid_fanout = data + chunk_offset;
+			break;
+
+		case MIDX_CHUNKID_OIDLOOKUP:
+			midx->chunk_oid_lookup = data + chunk_offset;
+			break;
+
+		case MIDX_CHUNKID_OBJECTOFFSETS:
+			midx->chunk_object_offsets = data + chunk_offset;
+			break;
+
+		case MIDX_CHUNKID_LARGEOFFSETS:
+			midx->chunk_large_offsets = data + chunk_offset;
+			break;
+
+		case 0:
+			/* terminating label */
+			break;
+
+		default:
+			munmap(midx_map, midx_size);
+			close(fd);
+			die("unrecognized MIDX chunk id: %08x", chunk_id);
+		}
+	}
+
+	/* the code below dereferences these chunks unconditionally */
+	if (!midx->chunk_pack_lookup || !midx->chunk_pack_names ||
+	    !midx->chunk_oid_fanout || !midx->chunk_oid_lookup ||
+	    !midx->chunk_object_offsets) {
+		munmap(midx_map, midx_size);
+		close(fd);
+		die("midx file %s is missing a required chunk", midx_file);
+	}
+
+	/* the last fanout entry is the total number of objects */
+	midx->num_objects = ntohl(*((uint32_t*)(midx->chunk_oid_fanout + 255 * 4)));
+	midx->num_packs = ntohl(midx->hdr->num_packs);
+
+	packs_len = st_mult(sizeof(struct packed_git*), midx->num_packs);
+
+	if (packs_len) {
+		ALLOC_ARRAY(midx->packs, midx->num_packs);
+		ALLOC_ARRAY(midx->pack_names, midx->num_packs);
+		memset(midx->packs, 0, packs_len);
+
+		/* pack names point straight into the mapped name chunk */
+		for (i = 0; i < midx->num_packs; i++) {
+			uint32_t name_offset = ntohl(*(uint32_t*)(midx->chunk_pack_lookup + 4 * i));
+			midx->pack_names[i] = (const char*)(midx->chunk_pack_names + name_offset);
+		}
+	}
+
+	/* room for this string was reserved by alloc_midxed_git() above */
+	strcpy(midx->pack_dir, pack_dir);
+	return midx;
+}
+
+/*
+ * Load "<pack_dir>/midx-<hex oid>.midx".
+ * Returns NULL when the file cannot be opened.
+ */
+struct midxed_git *get_midxed_git(const char *pack_dir, struct object_id *oid)
+{
+	struct midxed_git *m;
+	char *fname = get_midx_filename_oid(pack_dir, oid);
+	m = load_midxed_git_one(fname, pack_dir);
+	free(fname);
+	return m;
+}
+
+static char* get_midx_filename_dir(const
char *pack_dir) +{ + struct object_id oid; + if (!get_midx_head_oid(pack_dir, &oid)) + return 0; + + return get_midx_filename_oid(pack_dir, &oid); +} + +static int prepare_midxed_git_head(char *pack_dir, int local) +{ + struct midxed_git *m = midxed_git; + char *midx_head_path = get_midx_filename_dir(pack_dir); + + if (!core_midx) + return 1; + + if (midx_head_path) { + midxed_git = load_midxed_git_one(midx_head_path, pack_dir); + midxed_git->next = m; + FREE_AND_NULL(midx_head_path); + return 1; + } + + return 0; +} + +int prepare_midxed_git_objdir(char *obj_dir, int local) +{ + int ret; + struct strbuf pack_dir = STRBUF_INIT; + strbuf_addstr(&pack_dir, obj_dir); + strbuf_add(&pack_dir, "/pack", 5); + + ret = prepare_midxed_git_head(pack_dir.buf, local); + strbuf_release(&pack_dir); + return ret; +} + +struct pack_midx_details_internal { + uint32_t pack_int_id; + uint32_t internal_offset; +}; + +struct pack_midx_details *nth_midxed_object_details(struct midxed_git *m, + uint32_t n, + struct pack_midx_details *d) +{ + struct pack_midx_details_internal *d_internal; + const unsigned char *details = m->chunk_object_offsets; + + if (n >= m->num_objects) + return NULL; + + d_internal = (struct pack_midx_details_internal*)(details + 8 * n); + d->pack_int_id = ntohl(d_internal->pack_int_id); + d->offset = ntohl(d_internal->internal_offset); + + if (m->chunk_large_offsets && d->offset & MIDX_LARGE_OFFSET_NEEDED) { + uint32_t large_offset = d->offset ^ MIDX_LARGE_OFFSET_NEEDED; + const unsigned char *large_offsets = m->chunk_large_offsets + 8 * large_offset; + + d->offset = (((uint64_t)ntohl(*((uint32_t *)(large_offsets + 0)))) << 32) | + ntohl(*((uint32_t *)(large_offsets + 4))); + } + + return d; +} + +struct pack_midx_entry *nth_midxed_object_entry(struct midxed_git *m, + uint32_t n, + struct pack_midx_entry *e) +{ + struct pack_midx_details details; + const unsigned char *index = m->chunk_oid_lookup; + + if (!nth_midxed_object_details(m, n, &details)) + return NULL; + + 
memcpy(e->oid.hash, index + m->hdr->hash_len * n, m->hdr->hash_len); + e->pack_int_id = details.pack_int_id; + e->offset = details.offset; + + return e; +} + +const struct object_id *nth_midxed_object_oid(struct object_id *oid, + struct midxed_git *m, + uint32_t n) +{ + struct pack_midx_entry e; + + if (!nth_midxed_object_entry(m, n, &e)) + return 0; + + hashcpy(oid->hash, e.oid.hash); + return oid; +} + +int bsearch_midx(struct midxed_git *m, const unsigned char *sha1, uint32_t *pos) +{ + uint32_t last, first = 0; + + if (sha1[0]) + first = ntohl(*(uint32_t*)(m->chunk_oid_fanout + 4 * (sha1[0] - 1))); + last = ntohl(*(uint32_t*)(m->chunk_oid_fanout + 4 * sha1[0])); + + while (first < last) { + uint32_t mid = first + (last - first) / 2; + const unsigned char *current; + int cmp; + + current = m->chunk_oid_lookup + m->hdr->hash_len * mid; + cmp = hashcmp(sha1, current); + if (!cmp) { + *pos = mid; + return 1; + } + if (cmp > 0) { + first = mid + 1; + continue; + } + last = mid; + } + + *pos = first; + return 0; +} + +static int prepare_midx_pack(struct midxed_git *m, uint32_t pack_int_id) +{ + struct strbuf pack_name = STRBUF_INIT; + + if (pack_int_id >= m->hdr->num_packs) + return 1; + + if (m->packs[pack_int_id]) + return 0; + + strbuf_addstr(&pack_name, m->pack_dir); + strbuf_addstr(&pack_name, "/"); + strbuf_addstr(&pack_name, m->pack_names[pack_int_id]); + strbuf_strip_suffix(&pack_name, ".pack"); + strbuf_addstr(&pack_name, ".idx"); + + m->packs[pack_int_id] = add_packed_git(pack_name.buf, pack_name.len, 1); + strbuf_release(&pack_name); + return !m->packs[pack_int_id]; +} + +static int find_pack_entry_midx(const unsigned char *sha1, + struct midxed_git *m, + struct packed_git **p, + off_t *offset) +{ + uint32_t pos; + struct pack_midx_details d; + + if (!bsearch_midx(m, sha1, &pos) || + !nth_midxed_object_details(m, pos, &d)) + return 0; + + if (d.pack_int_id >= m->num_packs) + die(_("bad pack-int-id %d"), d.pack_int_id); + + /* load packfile, if necessary */ 
+ if (prepare_midx_pack(m, d.pack_int_id)) + return 0; + + *p = m->packs[d.pack_int_id]; + *offset = d.offset; + + return 1; +} + +int fill_pack_entry_midx(const unsigned char *sha1, + struct pack_entry *e) +{ + struct packed_git *p; + struct midxed_git *m; + + if (!core_midx) + return 0; + + m = midxed_git; + while (m) + { + off_t offset; + if (!find_pack_entry_midx(sha1, m, &p, &offset)) { + m = m->next; + continue; + } + + /* + * We are about to tell the caller where they can locate the + * requested object. We better make sure the packfile is + * still here and can be accessed before supplying that + * answer, as it may have been deleted since the MIDX was + * loaded! + */ + if (!is_pack_valid(p)) + return 0; + + e->offset = offset; + e->p = p; + hashcpy(e->sha1, sha1); + + return 1; + } + + return 0; +} + +int contains_pack(struct midxed_git *m, const char *pack_name) +{ + uint32_t first = 0, last = m->num_packs; + + while (first < last) { + uint32_t mid = first + (last - first) / 2; + const char *current; + int cmp; + + current = m->pack_names[mid]; + cmp = strcmp(pack_name, current); + if (!cmp) + return 1; + if (cmp > 0) { + first = mid + 1; + continue; + } + last = mid; + } + + return 0; +} + +static int midx_sha1_compare(const void *_a, const void *_b) +{ + struct pack_midx_entry *a = *(struct pack_midx_entry **)_a; + struct pack_midx_entry *b = *(struct pack_midx_entry **)_b; + return oidcmp(&a->oid, &b->oid); +} + +static void write_midx_chunk_packlookup( + struct sha1file *f, + const char **pack_names, uint32_t nr_packs) +{ + uint32_t i, cur_len = 0; + + for (i = 0; i < nr_packs; i++) { + uint32_t swap_len = htonl(cur_len); + sha1write(f, &swap_len, 4); + cur_len += strlen(pack_names[i]) + 1; + } +} + +static void write_midx_chunk_packnames( + struct sha1file *f, + const char **pack_names, uint32_t nr_packs) +{ + uint32_t i; + for (i = 0; i < nr_packs; i++) + sha1write(f, pack_names[i], strlen(pack_names[i]) + 1); +} + +static void 
write_midx_chunk_oidfanout( + struct sha1file *f, + struct pack_midx_entry **objects, uint32_t nr_objects) +{ + struct pack_midx_entry **list = objects; + struct pack_midx_entry **last = objects + nr_objects; + uint32_t count_distinct = 0; + uint32_t i; + + /* + * Write the first-level table (the list is sorted, + * but we use a 256-entry lookup to be able to avoid + * having to do eight extra binary search iterations). + */ + for (i = 0; i < 256; i++) { + struct pack_midx_entry **next = list; + struct pack_midx_entry *prev = 0; + uint32_t swap_distinct; + + while (next < last) { + struct pack_midx_entry *obj = *next; + if (obj->oid.hash[0] != i) + break; + + if (!prev || oidcmp(&(prev->oid), &(obj->oid))) + count_distinct++; + + prev = obj; + next++; + } + + swap_distinct = htonl(count_distinct); + sha1write(f, &swap_distinct, 4); + list = next; + } +} + +static void write_midx_chunk_oidlookup( + struct sha1file *f, unsigned char hash_len, + struct pack_midx_entry **objects, uint32_t nr_objects) +{ + struct pack_midx_entry **list = objects; + struct object_id *last_oid = 0; + uint32_t i; + + for (i = 0; i < nr_objects; i++) { + struct pack_midx_entry *obj = *list++; + + if (last_oid && !oidcmp(last_oid, &obj->oid)) + continue; + + last_oid = &obj->oid; + sha1write(f, obj->oid.hash, (int)hash_len); + } +} + +static void write_midx_chunk_objectoffsets( + struct sha1file *f, int large_offset_needed, + struct pack_midx_entry **objects, uint32_t nr_objects, uint32_t *pack_perm) +{ + struct pack_midx_entry **list = objects; + struct object_id *last_oid = 0; + uint32_t i, nr_large_offset = 0; + + for (i = 0; i < nr_objects; i++) { + struct pack_midx_details_internal details; + struct pack_midx_entry *obj = *list++; + + if (last_oid && !oidcmp(last_oid, &obj->oid)) + continue; + + last_oid = &obj->oid; + + details.pack_int_id = htonl(pack_perm[obj->pack_int_id]); + + if (large_offset_needed && obj->offset >> 31) + details.internal_offset = (MIDX_LARGE_OFFSET_NEEDED | 
nr_large_offset++); + else + details.internal_offset = (uint32_t)obj->offset; + + details.internal_offset = htonl(details.internal_offset); + sha1write(f, &details, 8); + } +} + +static void write_midx_chunk_largeoffsets( + struct sha1file *f, uint32_t nr_large_offset, + struct pack_midx_entry **objects, uint32_t nr_objects) +{ + struct pack_midx_entry **list = objects; + struct object_id *last_oid = 0; + + while (nr_large_offset) { + struct pack_midx_entry *obj = *list++; + uint64_t offset = obj->offset; + uint32_t split[2]; + + if (last_oid && !oidcmp(last_oid, &obj->oid)) + continue; + + last_oid = &obj->oid; + + if (!(offset >> 31)) + continue; + + split[0] = htonl(offset >> 32); + split[1] = htonl(offset & 0xffffffff); + + sha1write(f, split, 8); + nr_large_offset--; + } +} + +struct pack_pair { + uint32_t pack_int_id; + const char *pack_name; +}; + +static int pack_pair_compare(const void *_a, const void *_b) +{ + struct pack_pair *a = (struct pack_pair *)_a; + struct pack_pair *b = (struct pack_pair *)_b; + return strcmp(a->pack_name, b->pack_name); +} + +static void sort_packs_by_name(const char **pack_names, uint32_t nr_packs, uint32_t *perm) +{ + uint32_t i; + struct pack_pair *pairs; + + ALLOC_ARRAY(pairs, nr_packs); + + for (i = 0; i < nr_packs; i++) { + pairs[i].pack_int_id = i; + pairs[i].pack_name = pack_names[i]; + } + + QSORT(pairs, nr_packs, pack_pair_compare); + + for (i = 0; i < nr_packs; i++) { + pack_names[i] = pairs[i].pack_name; + perm[pairs[i].pack_int_id] = i; + } +} + +const char *write_midx_file(const char *pack_dir, + const char *midx_name, + const char **pack_names, + uint32_t nr_packs, + struct pack_midx_entry **objects, + uint32_t nr_objects) +{ + struct sha1file *f; + struct pack_midx_entry **sorted_by_sha; + int i, chunk, fd; + struct pack_midx_header hdr; + uint32_t chunk_ids[7]; + uint64_t chunk_offsets[7]; + unsigned char large_offset_needed = 0; + unsigned int nr_large_offset = 0; + unsigned char final_hash[GIT_MAX_RAWSZ]; + 
const char *final_hex; + int rename_needed = 0; + uint32_t count_distinct = 0; + int total_name_len = 0; + uint32_t *pack_perm; + + if (!core_midx) + return 0; + + /* Sort packs */ + if (nr_packs) { + ALLOC_ARRAY(pack_perm, nr_packs); + sort_packs_by_name(pack_names, nr_packs, pack_perm); + } else { + pack_perm = 0; + } + + /* Sort objects */ + if (nr_objects) { + sorted_by_sha = objects; + + QSORT(sorted_by_sha, nr_objects, midx_sha1_compare); + + for (i = 0; i < nr_objects; i++) { + if (i && + !oidcmp(&sorted_by_sha[i-1]->oid, &sorted_by_sha[i]->oid)) + continue; + + count_distinct++; + + if (sorted_by_sha[i]->offset > 0x7fffffff) + nr_large_offset++; + if (sorted_by_sha[i]->offset > 0xffffffff) + large_offset_needed = 1; + } + } else { + sorted_by_sha = NULL; + } + + for (i = 0; i < nr_packs; i++) + total_name_len += strlen(pack_names[i]) + 1; + + /* open temp file, or direct file if given */ + if (!midx_name) { + struct strbuf tmp_file = STRBUF_INIT; + strbuf_addstr(&tmp_file, pack_dir); + strbuf_addstr(&tmp_file, "/tmp_midx_XXXXXX"); + + fd = git_mkstemp_mode(tmp_file.buf, 0444); + if (fd < 0) + die_errno("unable to create '%s'", tmp_file.buf); + + midx_name = strbuf_detach(&tmp_file, NULL); + rename_needed = 1; + } else { + unlink(midx_name); + fd = open(midx_name, O_CREAT|O_EXCL|O_WRONLY, 0600); + if (fd < 0) + die_errno("unable to create '%s'", midx_name); + } + f = sha1fd(fd, midx_name); + + /* fill header info */ + hdr.midx_signature = htonl(MIDX_SIGNATURE); + hdr.midx_version = htonl(MIDX_VERSION); + + hdr.hash_version = MIDX_OID_VERSION; + hdr.hash_len = MIDX_OID_LEN; + hdr.num_base_midx = 0; + hdr.num_packs = htonl(nr_packs); + + /* + * We expect the following chunks, which are required: + * + * Packfile Name Lookup + * Packfile Names + * OID Fanout + * OID Lookup + * Object Offsets + */ + hdr.num_chunks = large_offset_needed ? 
6 : 5; + + /* write header to file */ + assert(sizeof(hdr) == 16); + sha1write(f, &hdr, sizeof(hdr)); + + /* + * Fill initial chunk values using offsets + * relative to first chunk. + */ + chunk_offsets[0] = sizeof(hdr) + 12 * (hdr.num_chunks + 1); + chunk_ids[0] = MIDX_CHUNKID_PACKLOOKUP; + chunk_offsets[1] = chunk_offsets[0] + nr_packs * 4; + chunk_ids[1] = MIDX_CHUNKID_OIDFANOUT; + chunk_offsets[2] = chunk_offsets[1] + 256 * 4; + chunk_ids[2] = MIDX_CHUNKID_OIDLOOKUP; + chunk_offsets[3] = chunk_offsets[2] + (uint64_t)count_distinct + * (uint64_t)hdr.hash_len; + chunk_ids[3] = MIDX_CHUNKID_OBJECTOFFSETS; + chunk_offsets[4] = chunk_offsets[3] + 8 * (uint64_t)count_distinct; + + if (large_offset_needed) { + chunk_ids[4] = MIDX_CHUNKID_LARGEOFFSETS; + chunk_offsets[5] = chunk_offsets[4] + 8 * (uint64_t)nr_large_offset; + chunk_ids[5] = MIDX_CHUNKID_PACKNAMES; + chunk_offsets[6] = chunk_offsets[5] + total_name_len; + chunk_ids[6] = 0; + } else { + chunk_ids[4] = MIDX_CHUNKID_PACKNAMES; + chunk_offsets[5] = chunk_offsets[4] + total_name_len; + chunk_ids[5] = 0; + } + + for (i = 0; i <= hdr.num_chunks; i++) { + uint32_t chunk_write[3]; + + chunk_write[0] = htonl(chunk_ids[i]); + chunk_write[1] = htonl(chunk_offsets[i] >> 32); + chunk_write[2] = htonl(chunk_offsets[i] & 0xffffffff); + sha1write(f, chunk_write, 12); + } + + for (chunk = 0; chunk < hdr.num_chunks; chunk++) { + switch (chunk_ids[chunk]) { + case MIDX_CHUNKID_PACKLOOKUP: + write_midx_chunk_packlookup(f, pack_names, nr_packs); + break; + + case MIDX_CHUNKID_PACKNAMES: + write_midx_chunk_packnames(f, pack_names, nr_packs); + break; + + case MIDX_CHUNKID_OIDFANOUT: + write_midx_chunk_oidfanout(f, sorted_by_sha, nr_objects); + break; + + case MIDX_CHUNKID_OIDLOOKUP: + write_midx_chunk_oidlookup(f, hdr.hash_len, sorted_by_sha, + nr_objects); + break; + + case MIDX_CHUNKID_OBJECTOFFSETS: + write_midx_chunk_objectoffsets(f, large_offset_needed, + sorted_by_sha, nr_objects, + pack_perm); + break; + + case 
MIDX_CHUNKID_LARGEOFFSETS: + write_midx_chunk_largeoffsets(f, nr_large_offset, + sorted_by_sha, nr_objects); + break; + + case 0: + break; + + default: + die("unrecognized MIDX chunk id: %08x", chunk_ids[chunk]); + } + } + + sha1close(f, final_hash, CSUM_CLOSE | CSUM_FSYNC); + + if (rename_needed) + { + struct object_id oid; + char *fname; + + memcpy(oid.hash, final_hash, GIT_MAX_RAWSZ); + fname = get_midx_filename_oid(pack_dir, &oid); + final_hex = sha1_to_hex(final_hash); + + if (rename(midx_name, fname)) + die("failed to rename %s to %s", midx_name, fname); + + free(fname); + } else { + final_hex = midx_name; + } + + return final_hex; +} + +int close_midx(struct midxed_git *m) +{ + int i; + if (m->midx_fd < 0) + return 0; + + for (i = 0; i < m->num_packs; i++) { + if (m->packs[i]) { + close_pack(m->packs[i]); + free(m->packs[i]); + m->packs[i] = NULL; + } + } + + munmap((void *)m->data, m->data_len); + m->data = 0; + + close(m->midx_fd); + m->midx_fd = -1; + + free(m->packs); + free(m->pack_names); + + return 1; +} + +void close_all_midx(void) +{ + struct midxed_git *m = midxed_git; + struct midxed_git *next; + + while (m) { + next = m->next; + close_midx(m); + free(m); + m = next; + } + + midxed_git = 0; +} diff --git a/midx.h b/midx.h new file mode 100644 index 00000000000000..b7e8b15fe4134f --- /dev/null +++ b/midx.h @@ -0,0 +1,136 @@ +#ifndef MIDX_H +#define MIDX_H + +#include "git-compat-util.h" +#include "object.h" +#include "csum-file.h" + +extern char *get_midx_filename_oid(const char *pack_dir, + struct object_id *oid); +extern char *get_midx_head_filename(const char *pack_dir); + +extern struct object_id *get_midx_head_oid(const char *pack_dir, struct object_id *oid); + +extern int fill_pack_entry_midx(const unsigned char *sha1, + struct pack_entry *e); + +struct pack_midx_entry { + struct object_id oid; + uint32_t pack_int_id; + off_t offset; +}; + +struct pack_midx_header { + uint32_t midx_signature; + uint32_t midx_version; + unsigned char 
hash_version; + unsigned char hash_len; + unsigned char num_base_midx; + unsigned char num_chunks; + uint32_t num_packs; +}; + +extern struct midxed_git { + struct midxed_git *next; + + int midx_fd; + + /* the mmap'd data for the midx file */ + const unsigned char *data; + size_t data_len; + + /* points into the mmap'd data */ + struct pack_midx_header *hdr; + + /* can construct filename from obj_dir + "/packs/midx-" + oid + ".midx" */ + struct object_id oid; + + /* derived from the fanout chunk */ + uint32_t num_objects; + + /* converted number of packs */ + uint32_t num_packs; + + /* hdr->num_packs * 4 bytes */ + const unsigned char *chunk_pack_lookup; + const unsigned char *chunk_pack_names; + + /* 256 * 4 bytes */ + const unsigned char *chunk_oid_fanout; + + /* num_objects * hdr->hash_len bytes */ + const unsigned char *chunk_oid_lookup; + + /* num_objects * 8 bytes */ + const unsigned char *chunk_object_offsets; + + /* + * 8 bytes per large offset. + * (Optional: may be null.) + */ + const unsigned char *chunk_large_offsets; + + /* + * Points into mmap'd data storing the pack filenames. + */ + const char **pack_names; + + /* + * Store an array of pack-pointers. If NULL, then the + * pack has not been loaded yet. The array indices + * correspond to the pack_int_ids from the midx storage. 
+ */ + struct packed_git **packs; + + /* something like ".git/objects/pack" */ + char pack_dir[FLEX_ARRAY]; /* more */ +} *midxed_git; + +extern struct midxed_git *get_midxed_git(const char *pack_dir, struct object_id *oid); +extern int prepare_midxed_git_objdir(char *obj_dir, int local); + +struct pack_midx_details { + uint32_t pack_int_id; + off_t offset; +}; + +extern struct pack_midx_details *nth_midxed_object_details(struct midxed_git *m, + uint32_t n, + struct pack_midx_details *d); +extern struct pack_midx_entry *nth_midxed_object_entry(struct midxed_git *m, + uint32_t n, + struct pack_midx_entry *e); +extern const struct object_id *nth_midxed_object_oid(struct object_id *oid, + struct midxed_git *m, + uint32_t n); + +/* + * Perform a binary search on the object list in a MIDX file for the given sha1. + * + * If the object exists, then return 1 and set *pos to the position of the sha1. + * Otherwise, return 0 and set *pos to the position of the lex-first object greater + * than the given sha1. + */ +extern int bsearch_midx(struct midxed_git *m, const unsigned char *sha1, uint32_t *pos); + +extern int contains_pack(struct midxed_git *m, const char *pack_name); + +/* + * Write a single MIDX file storing the given entries for the + * given list of packfiles. If midx_name is null, then a temp + * file will be created and swapped using the result hash value. + * Otherwise, write directly to midx_name. + * + * Returns the final name of the MIDX file within pack_dir. 
+ */ +extern const char *write_midx_file(const char *pack_dir, + const char *midx_name, + const char **pack_names, + uint32_t nr_packs, + struct pack_midx_entry **objects, + uint32_t nr_objects); + +extern int close_midx(struct midxed_git *m); +extern void close_all_midx(void); + +#endif diff --git a/packfile.c b/packfile.c index 4a5fe7ab188384..9ec39a83e93776 100644 --- a/packfile.c +++ b/packfile.c @@ -8,6 +8,7 @@ #include "list.h" #include "streaming.h" #include "sha1-lookup.h" +#include "midx.h" char *odb_pack_name(struct strbuf *buf, const unsigned char *sha1, @@ -299,7 +300,7 @@ void close_pack_index(struct packed_git *p) } } -static void close_pack(struct packed_git *p) +void close_pack(struct packed_git *p) { close_pack_windows(p); close_pack_fd(p); @@ -309,10 +310,22 @@ static void close_pack(struct packed_git *p) void close_all_packs(void) { struct packed_git *p; + struct midxed_git *m; + + for (m = midxed_git; m; m = m->next) { + int i; + for (i = 0; i < m->num_packs; i++) { + p = m->packs[i]; + if (p && p->do_not_close) + BUG("want to close pack marked 'do-not-close'"); + else if (p) + close_pack(p); + } + } for (p = packed_git; p; p = p->next) if (p->do_not_close) - die("BUG: want to close pack marked 'do-not-close'"); + BUG("want to close pack marked 'do-not-close'"); else close_pack(p); } @@ -748,6 +761,7 @@ static void prepare_packed_git_one(char *objdir, int local) dirnamelen = path.len; while ((de = readdir(dir)) != NULL) { struct packed_git *p; + struct midxed_git *m; size_t base_len; if (is_dot_or_dotdot(de->d_name)) @@ -758,15 +772,23 @@ static void prepare_packed_git_one(char *objdir, int local) base_len = path.len; if (strip_suffix_mem(path.buf, &base_len, ".idx")) { + strbuf_setlen(&path, base_len + 1); + strbuf_add(&path, "pack", 4); + /* Don't reopen a pack we already have. 
*/ + for (m = midxed_git; m; m = m->next) + if (!memcmp(m->pack_dir, path.buf, dirnamelen - 1) && + contains_pack(m, path.buf + dirnamelen)) + break; for (p = packed_git; p; p = p->next) { - size_t len; - if (strip_suffix(p->pack_name, ".pack", &len) && - len == base_len && - !memcmp(p->pack_name, path.buf, len)) + if (!strcmp(p->pack_name, path.buf)) break; } - if (p == NULL && + + strbuf_setlen(&path, base_len + 1); + strbuf_add(&path, "idx", 3); + + if (m == NULL && p == NULL && /* * See if it really is a valid .idx file with * corresponding .pack file that we can map. @@ -781,7 +803,8 @@ static void prepare_packed_git_one(char *objdir, int local) if (ends_with(de->d_name, ".idx") || ends_with(de->d_name, ".pack") || ends_with(de->d_name, ".bitmap") || - ends_with(de->d_name, ".keep")) + ends_with(de->d_name, ".keep") || + ends_with(de->d_name, ".midx")) string_list_append(&garbage, path.buf); else report_garbage(PACKDIR_FILE_GARBAGE, path.buf); @@ -806,9 +829,12 @@ unsigned long approximate_object_count(void) static unsigned long count; if (!approximate_object_count_valid) { struct packed_git *p; + struct midxed_git *m; - prepare_packed_git(); + prepare_packed_git_internal(1); count = 0; + for (m = midxed_git; m; m = m->next) + count += m->num_objects; for (p = packed_git; p; p = p->next) { if (open_pack_index(p)) continue; @@ -872,21 +898,45 @@ static void prepare_packed_git_mru(void) } static int prepare_packed_git_run_once = 0; -void prepare_packed_git(void) +static int prepare_midxed_git_run_once = 0; +void prepare_packed_git_internal(int use_midx) { struct alternate_object_database *alt; + char *obj_dir; + + if (prepare_midxed_git_run_once) { + if (!use_midx) { + prepare_midxed_git_run_once = 0; + close_all_midx(); + reprepare_packed_git(); + } + return; + } if (prepare_packed_git_run_once) return; - prepare_packed_git_one(get_object_directory(), 1); + + obj_dir = get_object_directory(); + + if (use_midx) + prepare_midxed_git_objdir(obj_dir, 1); + 
prepare_packed_git_one(obj_dir, 1); prepare_alt_odb(); - for (alt = alt_odb_list; alt; alt = alt->next) + for (alt = alt_odb_list; alt; alt = alt->next) { + if (use_midx) + prepare_midxed_git_objdir(alt->path, 0); prepare_packed_git_one(alt->path, 0); + } rearrange_packed_git(); prepare_packed_git_mru(); prepare_packed_git_run_once = 1; + prepare_midxed_git_run_once = use_midx; } +void prepare_packed_git(void) +{ + prepare_packed_git_internal(0); +} void reprepare_packed_git(void) { approximate_object_count_valid = 0; @@ -1833,7 +1883,10 @@ int find_pack_entry(const unsigned char *sha1, struct pack_entry *e) { struct mru_entry *p; - prepare_packed_git(); + prepare_packed_git_internal(1); + if (fill_pack_entry_midx(sha1, e)) + return 1; + if (!packed_git) return 0; diff --git a/packfile.h b/packfile.h index 0cdeb54dcd97a6..25bac91efbf46d 100644 --- a/packfile.h +++ b/packfile.h @@ -32,6 +32,7 @@ extern struct packed_git *parse_pack_index(unsigned char *sha1, const char *idx_ #define PACKDIR_FILE_GARBAGE 4 extern void (*report_garbage)(unsigned seen_bits, const char *path); +extern void prepare_packed_git_internal(int use_midx); extern void prepare_packed_git(void); extern void reprepare_packed_git(void); extern void install_packed_git(struct packed_git *pack); @@ -61,6 +62,7 @@ extern void close_pack_index(struct packed_git *); extern unsigned char *use_pack(struct packed_git *, struct pack_window **, off_t, unsigned long *); extern void close_pack_windows(struct packed_git *); +extern void close_pack(struct packed_git *p); extern void close_all_packs(void); extern void unuse_pack(struct pack_window **); extern void clear_delta_base_cache(void); diff --git a/sha1_name.c b/sha1_name.c index 611c7d24ddee67..2f426e136eff06 100644 --- a/sha1_name.c +++ b/sha1_name.c @@ -10,6 +10,7 @@ #include "dir.h" #include "sha1-array.h" #include "packfile.h" +#include "midx.h" static int get_oid_oneline(const char *, struct object_id *, struct commit_list *); @@ -190,11 +191,40 @@ 
static void unique_in_pack(struct packed_git *p, } } +static void unique_in_midx(struct midxed_git *m, + struct disambiguate_state *ds) +{ + uint32_t num, i, first = 0; + const struct object_id *current = NULL; + + if (!m->num_objects) + return; + + num = m->num_objects; + bsearch_midx(m, ds->bin_pfx.hash, &first); + + /* + * At this point, "first" is the location of the lowest object + * with an object name that could match "bin_pfx". See if we have + * 0, 1 or more objects that actually match(es). + */ + for (i = first; i < num && !ds->ambiguous; i++) { + struct object_id oid; + current = nth_midxed_object_oid(&oid, m, i); + if (!match_sha(ds->len, ds->bin_pfx.hash, current->hash)) + break; + update_candidates(ds, current); + } +} + static void find_short_packed_object(struct disambiguate_state *ds) { struct packed_git *p; + struct midxed_git *m; - prepare_packed_git(); + prepare_packed_git_internal(1); + for (m = midxed_git; m && !ds->ambiguous; m = m->next) + unique_in_midx(m, ds); for (p = packed_git; p && !ds->ambiguous; p = p->next) unique_in_pack(p, ds); } @@ -508,6 +538,39 @@ static int extend_abbrev_len(const struct object_id *oid, void *cb_data) return 0; } +static void find_abbrev_len_for_midx(struct midxed_git *m, + struct min_abbrev_data *mad) +{ + int match = 0; + uint32_t first = 0; + struct object_id oid; + + if (!m->num_objects) + return; + + match = bsearch_midx(m, mad->hash, &first); + + /* + * first is now the position in the packfile where we would insert + * mad->hash if it does not exist (or the position of mad->hash if + * it does exist). Hence, we consider a maximum of three objects + * nearby for the abbreviation length. 
+ */ + mad->init_len = 0; + if (!match) { + nth_midxed_object_oid(&oid, m, first); + extend_abbrev_len(&oid, mad); + } else if (first < m->num_objects - 1) { + nth_midxed_object_oid(&oid, m, first + 1); + extend_abbrev_len(&oid, mad); + } + if (first > 0) { + nth_midxed_object_oid(&oid, m, first - 1); + extend_abbrev_len(&oid, mad); + } + mad->init_len = mad->cur_len; +} + static void find_abbrev_len_for_pack(struct packed_git *p, struct min_abbrev_data *mad) { @@ -563,8 +626,11 @@ static void find_abbrev_len_for_pack(struct packed_git *p, static void find_abbrev_len_packed(struct min_abbrev_data *mad) { struct packed_git *p; + struct midxed_git *m; - prepare_packed_git(); + prepare_packed_git_internal(1); + for (m = midxed_git; m; m = m->next) + find_abbrev_len_for_midx(m, mad); for (p = packed_git; p; p = p->next) find_abbrev_len_for_pack(p, mad); } diff --git a/t/t5318-midx.sh b/t/t5318-midx.sh new file mode 100755 index 00000000000000..00be852ed33cbc --- /dev/null +++ b/t/t5318-midx.sh @@ -0,0 +1,189 @@ +#!/bin/sh + +test_description='multi-pack indexes' +. 
# Run the same git command line twice and capture both results:
#   core.midx=true  -> stdout written to ./output
#   core.midx=false -> stdout written to ./expect
#
# Arguments: $1 - the git subcommand and its arguments as ONE string
# Returns:   non-zero as soon as either invocation fails, so callers
#            that chain this helper with && notice a broken MIDX run
#            instead of silently comparing stale files.
_midx_git_two_modes() {
	# $1 is deliberately left unquoted: it carries a whole command
	# line ("log --patch master") that must be split into words.
	git -c core.midx=true $1 >output &&
	git -c core.midx=false $1 >expect
}
output expect' + +_midx_git_behavior + +test_expect_success 'Create more objects' \ + 'for i in $(test_seq 100) + do + echo extra-$i >file-2-$i + done && + git add file-* && + test_tick && + git commit -m "test data 2" && + git branch commit2 HEAD' + +_midx_git_behavior + +test_expect_success 'write-midx with two packs' \ + 'pack3=$(git rev-list --objects commit2 ^commit1 | git pack-objects --index-version=2 ${packdir}/test-3) && + midx3=$(git midx --write --update-head) && + test_path_is_file ${packdir}/midx-${midx3}.midx && + test_path_is_file ${packdir}/midx-${midx2}.midx && + test_path_is_file ${packdir}/midx-head && + test $(cat ${packdir}/midx-head) = "$midx3" && + _midx_read_expect \ + "2" "204" \ + "${packdir}" && + git midx --read >output && + cmp output expect' + +_midx_git_behavior + +test_expect_success 'Add more packs' \ + 'for i in $(test_seq 10) + do + iii=$(printf '%03i' $i) + test-genrandom "bar" 200 > wide_delta_$iii && + test-genrandom "baz $iii" 50 >> wide_delta_$iii && + test-genrandom "foo"$i 100 > deep_delta_$iii && + test-genrandom "foo"$(expr $i + 1) 100 >> deep_delta_$iii && + test-genrandom "foo"$(expr $i + 2) 100 >> deep_delta_$iii && + echo $iii >file_$iii && + test-genrandom "$iii" 8192 >>file_$iii && + git update-index --add file_$iii deep_delta_$iii wide_delta_$iii && + { echo 101 && test-genrandom 100 8192; } >file_101 && + git update-index --add file_101 && + tree=$(git write-tree) && + commit=$(git commit-tree $tree -p HEADobj-list && + git update-ref HEAD $commit && + git pack-objects --index-version=2 ${packdir}/test-pack output && + cmp output expect' + +_midx_git_behavior + +test_expect_success 'write-midx with no new packs' \ + 'midx5=$(git midx --write --update-head --delete-expired) && + test_path_is_file ${packdir}/midx-${midx5}.midx && + test "a$midx4" = "a$midx5" && + test_path_is_file ${packdir}/midx-head && + test $(cat ${packdir}/midx-head) = "$midx4"' + +_midx_git_behavior + +test_expect_success 'create bare repo' \ 
# Clearing must remove both the head pointer and the MIDX file it named,
# first in the bare clone (current directory), then in the original
# repository. Each repository is checked against the MIDX id that its own
# midx-head pointed to: ${midxbare} for the bare clone, ${midx4} for
# "full".
test_expect_success 'midx --clear' \
    'git midx --clear &&
     test_path_is_missing "${baredir}/midx-${midxbare}.midx" &&
     test_path_is_missing "${baredir}/midx-head" &&
     cd ../full &&
     git midx --clear &&
     test_path_is_missing "${packdir}/midx-${midx4}.midx" &&
     test_path_is_missing "${packdir}/midx-head"'