Skip to content

Commit

Permalink
chunk-format: allow trailing table of contents
Browse files Browse the repository at this point in the history
The existing chunk formats use the table of contents at the beginning of
the file. This is intended as a way to speed up the initial loading of
the file, but comes at a cost during writes. Each existing writer needs
to fully compute how big each chunk will be in advance, which usually
requires storing the full file contents in memory.

Future file formats may want to use the chunk format API in cases where
the writing stage is critical to performance, so we may want to stream
updates from an existing file and then only write the table of contents
at the end.

Add a new 'flags' parameter to write_chunkfile() that allows this
behavior. When the CHUNKFILE_TRAILING_TOC flag is set, the defensive
check that each chunk is written with its precomputed size is disabled.
Then, the table of contents is written in reverse order at the end of
the hashfile, so a parser can read the chunk list starting from the end
of the file (minus the hash).

The parsing of these trailing tables of contents will come in a later change.

Signed-off-by: Derrick Stolee <derrickstolee@github.com>
  • Loading branch information
derrickstolee committed Nov 1, 2022
1 parent 239eab4 commit 7fca0e5
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 19 deletions.
53 changes: 37 additions & 16 deletions chunk-format.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,26 +57,31 @@ void add_chunk(struct chunkfile *cf,
cf->chunks_nr++;
}

int write_chunkfile(struct chunkfile *cf, void *data)
int write_chunkfile(struct chunkfile *cf,
enum chunkfile_flags flags,
void *data)
{
int i, result = 0;
uint64_t cur_offset = hashfile_total(cf->f);

trace2_region_enter("chunkfile", "write", the_repository);

/* Add the table of contents to the current offset */
cur_offset += (cf->chunks_nr + 1) * CHUNK_TOC_ENTRY_SIZE;
if (!(flags & CHUNKFILE_TRAILING_TOC)) {
uint64_t cur_offset = hashfile_total(cf->f);

for (i = 0; i < cf->chunks_nr; i++) {
hashwrite_be32(cf->f, cf->chunks[i].id);
hashwrite_be64(cf->f, cur_offset);
/* Add the table of contents to the current offset */
cur_offset += (cf->chunks_nr + 1) * CHUNK_TOC_ENTRY_SIZE;

cur_offset += cf->chunks[i].size;
}
for (i = 0; i < cf->chunks_nr; i++) {
hashwrite_be32(cf->f, cf->chunks[i].id);
hashwrite_be64(cf->f, cur_offset);

/* Trailing entry marks the end of the chunks */
hashwrite_be32(cf->f, 0);
hashwrite_be64(cf->f, cur_offset);
cur_offset += cf->chunks[i].size;
}

/* Trailing entry marks the end of the chunks */
hashwrite_be32(cf->f, 0);
hashwrite_be64(cf->f, cur_offset);
}

for (i = 0; i < cf->chunks_nr; i++) {
cf->chunks[i].offset = hashfile_total(cf->f);
Expand All @@ -85,10 +90,26 @@ int write_chunkfile(struct chunkfile *cf, void *data)
if (result)
goto cleanup;

if (hashfile_total(cf->f) - cf->chunks[i].offset != cf->chunks[i].size)
BUG("expected to write %"PRId64" bytes to chunk %"PRIx32", but wrote %"PRId64" instead",
cf->chunks[i].size, cf->chunks[i].id,
hashfile_total(cf->f) - cf->chunks[i].offset);
if (!(flags & CHUNKFILE_TRAILING_TOC)) {
if (hashfile_total(cf->f) - cf->chunks[i].offset != cf->chunks[i].size)
BUG("expected to write %"PRId64" bytes to chunk %"PRIx32", but wrote %"PRId64" instead",
cf->chunks[i].size, cf->chunks[i].id,
hashfile_total(cf->f) - cf->chunks[i].offset);
}

cf->chunks[i].size = hashfile_total(cf->f) - cf->chunks[i].offset;
}

if (flags & CHUNKFILE_TRAILING_TOC) {
size_t last_chunk_tail = hashfile_total(cf->f);
/* First entry marks the end of the chunks */
hashwrite_be32(cf->f, 0);
hashwrite_be64(cf->f, last_chunk_tail);

for (i = cf->chunks_nr - 1; i >= 0; i--) {
hashwrite_be32(cf->f, cf->chunks[i].id);
hashwrite_be64(cf->f, cf->chunks[i].offset);
}
}

cleanup:
Expand Down
9 changes: 8 additions & 1 deletion chunk-format.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,14 @@ void add_chunk(struct chunkfile *cf,
uint32_t id,
size_t size,
chunk_write_fn fn);
int write_chunkfile(struct chunkfile *cf, void *data);

/*
 * Bit flags accepted by write_chunkfile().
 *
 * CHUNKFILE_TRAILING_TOC: write the table of contents after the chunk
 * data, in reverse chunk order, instead of before it. With this flag
 * set, write_chunkfile() does not check that each chunk wrote exactly
 * the size given to add_chunk().
 */
enum chunkfile_flags {
CHUNKFILE_TRAILING_TOC = (1 << 0),
};

int write_chunkfile(struct chunkfile *cf,
enum chunkfile_flags flags,
void *data);

int read_table_of_contents(struct chunkfile *cf,
const unsigned char *mfile,
Expand Down
2 changes: 1 addition & 1 deletion commit-graph.c
Original file line number Diff line number Diff line change
Expand Up @@ -1932,7 +1932,7 @@ static int write_commit_graph_file(struct write_commit_graph_context *ctx)
get_num_chunks(cf) * ctx->commits.nr);
}

write_chunkfile(cf, ctx);
write_chunkfile(cf, 0, ctx);

stop_progress(&ctx->progress);
strbuf_release(&progress_title);
Expand Down
2 changes: 1 addition & 1 deletion midx.c
Original file line number Diff line number Diff line change
Expand Up @@ -1480,7 +1480,7 @@ static int write_midx_internal(const char *object_dir,
}

write_midx_header(f, get_num_chunks(cf), ctx.nr - dropped_packs);
write_chunkfile(cf, &ctx);
write_chunkfile(cf, 0, &ctx);

finalize_hashfile(f, midx_hash, FSYNC_COMPONENT_PACK_METADATA,
CSUM_FSYNC | CSUM_HASH_IN_STREAM);
Expand Down

0 comments on commit 7fca0e5

Please sign in to comment.