From 02a54e072a40e1ddd046c768dc186bdbd0bfe9b3 Mon Sep 17 00:00:00 2001 From: John Ericson Date: Sat, 4 Nov 2023 15:35:38 -0400 Subject: [PATCH] Git object hashing in libutil This is the core functionality but just unit-tested and not yet made part of the store layer. This is because there is some tech debt around (a) repeated boilerplate hashing objects (b) better integration of the new `SourceAccessor` type that needs to be cleaned up first. Part of RFC 133 Co-Authored-By: Matthew Bauer Co-Authored-By: Carlo Nucera Co-authored-by: Robert Hensing Co-authored-by: Florian Klink --- src/libutil/experimental-features.cc | 8 + src/libutil/experimental-features.hh | 1 + src/libutil/git.cc | 261 ++++++++++++++++++++++++++- src/libutil/git.hh | 141 ++++++++++++++- src/libutil/serialise.cc | 4 + src/libutil/serialise.hh | 1 + src/libutil/tests/git.cc | 241 ++++++++++++++++++++++--- 7 files changed, 620 insertions(+), 37 deletions(-) diff --git a/src/libutil/experimental-features.cc b/src/libutil/experimental-features.cc index 6b9427423ace..f37ee4899d59 100644 --- a/src/libutil/experimental-features.cc +++ b/src/libutil/experimental-features.cc @@ -82,6 +82,14 @@ constexpr std::array xpFeatureDetails [`nix`](@docroot@/command-ref/new-cli/nix.md) for details. )", }, + { + .tag = Xp::GitHashing, + .name = "git-hashing", + .description = R"( + Allow creating (content-addressed) store objects which are hashed via Git's hashing algorithm. + These store objects will not be understandable by older versions of Nix. + )", + }, { .tag = Xp::RecursiveNix, .name = "recursive-nix", diff --git a/src/libutil/experimental-features.hh b/src/libutil/experimental-features.hh index f005cc9eeee9..47f45cdc3db8 100644 --- a/src/libutil/experimental-features.hh +++ b/src/libutil/experimental-features.hh @@ -21,6 +21,7 @@ enum struct ExperimentalFeature ImpureDerivations, Flakes, NixCommand, + GitHashing, RecursiveNix, NoUrlLiterals, FetchClosure, diff --git a/src/libutil/git.cc b/src/libutil/git.cc index f35c2fdb75cf..82f4f1a35b34 100644 --- a/src/libutil/git.cc +++ b/src/libutil/git.cc @@ -1,9 +1,263 @@ +#include +#include +#include +#include +#include +#include // for strcasecmp + +#include "util.hh" +#include "config.hh" +#include "hash.hh" +#include "posix-source-accessor.hh" + #include "git.hh" +#include "serialise.hh" -#include +namespace nix::git { + +using namespace nix; +using namespace std::string_literals; + +std::optional decodeMode(RawMode m) { + switch (m) { + case (RawMode) Mode::Directory: + case (RawMode) Mode::Executable: + case (RawMode) Mode::Regular: + case (RawMode) Mode::Symlink: + return (Mode) m; + default: + return std::nullopt; + } +} + + +static std::string getStringUntil(Source & source, char byte) +{ + std::string s; + char n[1]; + source(std::string_view { n, 1 }); + while (*n != byte) { + s += *n; + source(std::string_view { n, 1 }); + } + return s; +} + + +static std::string getString(Source & source, int n) +{ + std::string v; + v.resize(n); + source(v); + return v; +} + + +void parse( + ParseSink & sink, + const Path & sinkPath, + Source & source, + std::function hook, + const ExperimentalFeatureSettings & xpSettings) +{ + xpSettings.require(Xp::GitHashing); + + auto type = getString(source, 5); + + if (type == "blob ") { + sink.createRegularFile(sinkPath); + + unsigned long long size = std::stoi(getStringUntil(source, 0)); + + sink.preallocateContents(size); + + unsigned long long left = size; + std::string buf; + buf.reserve(65536); + + while (left) { + checkInterrupt(); + buf.resize(std::min((unsigned long long)buf.capacity(), left)); + source(buf); + sink.receiveContents(buf); + left -= buf.size(); + } + } else if (type == "tree ") { + unsigned long long size = std::stoi(getStringUntil(source, 0)); + unsigned long long left = size; + + sink.createDirectory(sinkPath); + + while (left) { + std::string perms = getStringUntil(source, ' '); + left -= perms.size(); + left -= 1; + + RawMode rawMode = std::stoi(perms, 0, 8); + auto modeOpt = decodeMode(rawMode); + if (!modeOpt) + throw Error("Unknown Git permission: %o", perms); + auto mode = std::move(*modeOpt); + + std::string name = getStringUntil(source, '\0'); + left -= name.size(); + left -= 1; + + std::string hashs = getString(source, 20); + left -= 20; + + Hash hash(htSHA1); + std::copy(hashs.begin(), hashs.end(), hash.hash); + + hook(name, TreeEntry { + .mode = mode, + .hash = hash, + }); + + if (mode == Mode::Executable) + sink.isExecutable(); + } + } else throw Error("input doesn't look like a Git object"); +} + + +std::optional convertMode(SourceAccessor::Type type) +{ + switch (type) { + case SourceAccessor::tSymlink: return Mode::Symlink; + case SourceAccessor::tRegular: return Mode::Regular; + case SourceAccessor::tDirectory: return Mode::Directory; + case SourceAccessor::tMisc: return std::nullopt; + default: abort(); + } +} + + +void restore(ParseSink & sink, Source & source, std::function hook) +{ + parse(sink, "", source, [&](Path name, TreeEntry entry) { + auto [accessor, from] = hook(entry.hash); + auto stat = accessor->lstat(from); + auto gotOpt = convertMode(stat.type); + if (!gotOpt) + throw Error("file '%s' (git hash %s) has an unsupported type", + from, + entry.hash.to_string(HashFormat::Base16, false)); + auto & got = *gotOpt; + if (got != entry.mode) + throw Error("git mode of file '%s' (git hash %s) is %o but expected %o", + from, + entry.hash.to_string(HashFormat::Base16, false), + (RawMode) got, + (RawMode) entry.mode); + copyRecursive( + *accessor, from, + sink, name); + }); +} + + +void dumpBlobPrefix( + uint64_t size, Sink & sink, + const ExperimentalFeatureSettings & xpSettings) +{ + xpSettings.require(Xp::GitHashing); + auto s = fmt("blob %d\0"s, std::to_string(size)); + sink(s); +} + + +void dumpTree(const Tree & entries, Sink & sink, + const ExperimentalFeatureSettings & xpSettings) +{ + xpSettings.require(Xp::GitHashing); + + std::string v1; + + for (auto & [name, entry] : entries) { + auto name2 = name; + if (entry.mode == Mode::Directory) { + assert(name2.back() == '/'); + name2.pop_back(); + } + v1 += fmt("%o %s\0"s, static_cast(entry.mode), name2); + std::copy(entry.hash.hash, entry.hash.hash + entry.hash.hashSize, std::back_inserter(v1)); + } + + { + auto s = fmt("tree %d\0"s, v1.size()); + sink(s); + } + + sink(v1); +} + + +Mode dump( + SourceAccessor & accessor, const CanonPath & path, + Sink & sink, + std::function hook, + PathFilter & filter, + const ExperimentalFeatureSettings & xpSettings) +{ + auto st = accessor.lstat(path); + + switch (st.type) { + case SourceAccessor::tRegular: + { + accessor.readFile(path, sink, [&](uint64_t size) { + dumpBlobPrefix(size, sink, xpSettings); + }); + return st.isExecutable + ? Mode::Executable + : Mode::Regular; + } + + case SourceAccessor::tDirectory: + { + Tree entries; + for (auto & [name, _] : accessor.readDirectory(path)) { + auto child = path + name; + if (!filter(child.abs())) continue; + + auto entry = hook(child); + + auto name2 = name; + if (entry.mode == Mode::Directory) + name2 += "/"; + + entries.insert_or_assign(std::move(name2), std::move(entry)); + } + dumpTree(entries, sink, xpSettings); + return Mode::Directory; + } + + case SourceAccessor::tSymlink: + case SourceAccessor::tMisc: + default: + throw Error("file '%1%' has an unsupported type", path); + } +} + + +TreeEntry dumpHash( + HashType ht, + SourceAccessor & accessor, const CanonPath & path, PathFilter & filter) +{ + std::function hook; + hook = [&](const CanonPath & path) -> TreeEntry { + auto hashSink = HashSink(ht); + auto mode = dump(accessor, path, hashSink, hook, filter); + auto hash = hashSink.finish().first; + return { + .mode = mode, + .hash = hash, + }; + }; + + return hook(path); +} -namespace nix { -namespace git { std::optional parseLsRemoteLine(std::string_view line) { @@ -22,4 +276,3 @@ std::optional parseLsRemoteLine(std::string_view line) } } -} diff --git a/src/libutil/git.hh b/src/libutil/git.hh index bf2b9a2869ab..30346007280d 100644 --- a/src/libutil/git.hh +++ b/src/libutil/git.hh @@ -5,9 +5,127 @@ #include #include -namespace nix { +#include "types.hh" +#include "serialise.hh" +#include "hash.hh" +#include "source-accessor.hh" +#include "fs-sink.hh" -namespace git { +namespace nix::git { + +using RawMode = uint32_t; + +enum struct Mode : RawMode { + Directory = 0040000, + Executable = 0100755, + Regular = 0100644, + Symlink = 0120000, +}; + +std::optional decodeMode(RawMode m); + +/** + * An anonymous Git tree object entry (no name part). + */ +struct TreeEntry +{ + Mode mode; + Hash hash; + + GENERATE_CMP(TreeEntry, me->mode, me->hash); +}; + +/** + * A Git tree object, fully decoded and stored in memory. + * + * Directory names must end in a `/` for sake of sorting. See + * https://github.com/mirage/irmin/issues/352 + */ +using Tree = std::map; + +/** + * Callback for processing a child hash with `parse` + * + * The function should + * + * 1. Obtain the file system objects denoted by `gitHash` + * + * 2. Ensure they match `mode` + * + * 3. Feed them into the same sink `parse` was called with + * + * Implementations may seek to memoize resources (bandwidth, storage, + * etc.) for the same Git hash. + */ +using SinkHook = void(const Path & name, TreeEntry entry); + +void parse( + ParseSink & sink, const Path & sinkPath, + Source & source, + std::function hook, + const ExperimentalFeatureSettings & xpSettings = experimentalFeatureSettings); + +/** + * Assists with writing a `SinkHook` step (2). + */ +std::optional convertMode(SourceAccessor::Type type); + +/** + * Simplified version of `SinkHook` for `restore`. + * + * Given a `Hash`, return a `SourceAccessor` and `CanonPath` pointing to + * the file system object with that path. + */ +using RestoreHook = std::pair(Hash); + +/** + * Wrapper around `parse` and `RestoreSink` + */ +void restore(ParseSink & sink, Source & source, std::function hook); + +/** + * Dumps a single file to a sink + * + * @param xpSettings for testing purposes + */ +void dumpBlobPrefix( + uint64_t size, Sink & sink, + const ExperimentalFeatureSettings & xpSettings = experimentalFeatureSettings); + +/** + * Dumps a representation of a git tree to a sink + */ +void dumpTree( + const Tree & entries, Sink & sink, + const ExperimentalFeatureSettings & xpSettings = experimentalFeatureSettings); + +/** + * Callback for processing a child with `dump` + * + * The function should return the Git hash and mode of the file at the + * given path in the accessor passed to `dump`. + * + * Note that if the child is a directory, its child in must also be so + * processed in order to compute this information. + */ +using DumpHook = TreeEntry(const CanonPath & path); + +Mode dump( + SourceAccessor & accessor, const CanonPath & path, + Sink & sink, + std::function hook, + PathFilter & filter = defaultPathFilter, + const ExperimentalFeatureSettings & xpSettings = experimentalFeatureSettings); + +/** + * Recursively dumps path, hashing as we go. + * + * A smaller wrapper around `dump`. + */ +TreeEntry dumpHash( + HashType ht, + SourceAccessor & accessor, const CanonPath & path, + PathFilter & filter = defaultPathFilter); /** * A line from the output of `git ls-remote --symref`. @@ -16,15 +134,17 @@ namespace git { * * - Symbolic references of the form * - * ref: {target} {reference} - * - * where {target} is itself a reference and {reference} is optional + * ``` + * ref: {target} {reference} + * ``` + * where {target} is itself a reference and {reference} is optional * * - Object references of the form * - * {target} {reference} - * - * where {target} is a commit id and {reference} is mandatory + * ``` + * {target} {reference} + * ``` + * where {target} is a commit id and {reference} is mandatory */ struct LsRemoteRefLine { enum struct Kind { @@ -36,8 +156,9 @@ struct LsRemoteRefLine { std::optional reference; }; +/** + * Parse an `LsRemoteRefLine` + */ std::optional parseLsRemoteLine(std::string_view line); } - -} diff --git a/src/libutil/serialise.cc b/src/libutil/serialise.cc index 3d5121a19fa6..e5205ce79f00 100644 --- a/src/libutil/serialise.cc +++ b/src/libutil/serialise.cc @@ -74,6 +74,10 @@ void Source::operator () (char * data, size_t len) } } +void Source::operator () (std::string_view data) +{ + (*this)((char *)data.data(), data.size()); +} void Source::drainInto(Sink & sink) { diff --git a/src/libutil/serialise.hh b/src/libutil/serialise.hh index 333c254ea8e3..71c40c83a7d3 100644 --- a/src/libutil/serialise.hh +++ b/src/libutil/serialise.hh @@ -72,6 +72,7 @@ struct Source * an error if it is not going to be available. */ void operator () (char * data, size_t len); + void operator () (std::string_view data); /** * Store up to ‘len’ in the buffer pointed to by ‘data’, and diff --git a/src/libutil/tests/git.cc b/src/libutil/tests/git.cc index 5b5715fc26bb..e5d187230069 100644 --- a/src/libutil/tests/git.cc +++ b/src/libutil/tests/git.cc @@ -1,33 +1,228 @@ -#include "git.hh" #include +#include "git.hh" +#include "memory-source-accessor.hh" + +#include "tests/characterization.hh" + namespace nix { - TEST(GitLsRemote, parseSymrefLineWithReference) { - auto line = "ref: refs/head/main HEAD"; - auto res = git::parseLsRemoteLine(line); - ASSERT_TRUE(res.has_value()); - ASSERT_EQ(res->kind, git::LsRemoteRefLine::Kind::Symbolic); - ASSERT_EQ(res->target, "refs/head/main"); - ASSERT_EQ(res->reference, "HEAD"); - } +using namespace git; + +class GitTest : public CharacterizationTest +{ + Path unitTestData = getUnitTestData() + "/libutil/git"; - TEST(GitLsRemote, parseSymrefLineWithNoReference) { - auto line = "ref: refs/head/main"; - auto res = git::parseLsRemoteLine(line); - ASSERT_TRUE(res.has_value()); - ASSERT_EQ(res->kind, git::LsRemoteRefLine::Kind::Symbolic); - ASSERT_EQ(res->target, "refs/head/main"); - ASSERT_EQ(res->reference, std::nullopt); +public: + + Path goldenMaster(std::string_view testStem) const override { + return unitTestData + "/" + testStem; } - TEST(GitLsRemote, parseObjectRefLine) { - auto line = "abc123 refs/head/main"; - auto res = git::parseLsRemoteLine(line); - ASSERT_TRUE(res.has_value()); - ASSERT_EQ(res->kind, git::LsRemoteRefLine::Kind::Object); - ASSERT_EQ(res->target, "abc123"); - ASSERT_EQ(res->reference, "refs/head/main"); + /** + * We set these in tests rather than the regular globals so we don't have + * to worry about race conditions if the tests run concurrently. + */ + ExperimentalFeatureSettings mockXpSettings; + +private: + + void SetUp() override + { + mockXpSettings.set("experimental-features", "git-hashing"); } +}; + +TEST(GitMode, gitMode_directory) { + Mode m = Mode::Directory; + RawMode r = 0040000; + ASSERT_EQ(static_cast(m), r); + ASSERT_EQ(decodeMode(r), std::optional { m }); +}; + +TEST(GitMode, gitMode_executable) { + Mode m = Mode::Executable; + RawMode r = 0100755; + ASSERT_EQ(static_cast(m), r); + ASSERT_EQ(decodeMode(r), std::optional { m }); +}; + +TEST(GitMode, gitMode_regular) { + Mode m = Mode::Regular; + RawMode r = 0100644; + ASSERT_EQ(static_cast(m), r); + ASSERT_EQ(decodeMode(r), std::optional { m }); +}; + +TEST(GitMode, gitMode_symlink) { + Mode m = Mode::Symlink; + RawMode r = 0120000; + ASSERT_EQ(static_cast(m), r); + ASSERT_EQ(decodeMode(r), std::optional { m }); +}; + +TEST_F(GitTest, blob_read) { + readTest("hello-world-blob.bin", [&](const auto & encoded) { + StringSource in { encoded }; + StringSink out; + RegularFileSink out2 { out }; + parse(out2, "", in, [](auto &, auto) {}, mockXpSettings); + + auto expected = readFile(goldenMaster("hello-world.bin")); + + ASSERT_EQ(out.s, expected); + }); +} + +TEST_F(GitTest, blob_write) { + writeTest("hello-world-blob.bin", [&]() { + auto decoded = readFile(goldenMaster("hello-world.bin")); + StringSink s; + dumpBlobPrefix(decoded.size(), s, mockXpSettings); + s(decoded); + return s.s; + }); +} + +const static Tree tree = { + { + "Foo", + { + .mode = Mode::Regular, + .hash = Hash::parseAny("ffcc0268f56466471d2287140b4a2c054b9c5cb8", htSHA1), + }, + }, + { + "bAr", + { + .mode = Mode::Executable, + .hash = Hash::parseAny("38a5b9833333934dc0aae2d728a64612ac8a1a4d", htSHA1), + }, + }, + { + "baZ/", + { + .mode = Mode::Directory, + .hash = Hash::parseAny("43d728d3d8f2c3ca534b51569eb5aa521f45eb73", htSHA1), + }, + }, +}; + +TEST_F(GitTest, tree_read) { + readTest("tree.bin", [&](const auto & encoded) { + StringSource in { encoded }; + NullParseSink out; + Tree got; + parse(out, "", in, [&](auto & name, auto entry) { + auto name2 = name; + if (entry.mode == Mode::Directory) + name2 += '/'; + got.insert_or_assign(name2, std::move(entry)); + }, mockXpSettings); + + ASSERT_EQ(got, tree); + }); +} + +TEST_F(GitTest, tree_write) { + writeTest("tree.bin", [&]() { + StringSink s; + dumpTree(tree, s, mockXpSettings); + return s.s; + }); } +TEST_F(GitTest, both_roundrip) { + using File = MemorySourceAccessor::File; + + MemorySourceAccessor files; + files.root = File::Directory { + .contents { + { + "foo", + File::Regular { + .contents = "hello\n\0\n\tworld!", + }, + }, + { + "bar", + File::Directory { + .contents = { + { + "baz", + File::Regular { + .executable = true, + .contents = "good day,\n\0\n\tworld!", + }, + }, + }, + }, + }, + }, + }; + + std::map cas; + + std::function dumpHook; + dumpHook = [&](const CanonPath & path) { + StringSink s; + HashSink hashSink { htSHA1 }; + TeeSink s2 { s, hashSink }; + auto mode = dump( + files, path, s2, dumpHook, + defaultPathFilter, mockXpSettings); + auto hash = hashSink.finish().first; + cas.insert_or_assign(hash, std::move(s.s)); + return TreeEntry { + .mode = mode, + .hash = hash, + }; + }; + + auto root = dumpHook(CanonPath::root); + + MemorySourceAccessor files2; + + MemorySink sinkFiles2 { files2 }; + + std::function mkSinkHook; + mkSinkHook = [&](const Path prefix, const Hash & hash) { + StringSource in { cas[hash] }; + parse(sinkFiles2, prefix, in, [&](const Path & name, const auto & entry) { + mkSinkHook(prefix + "/" + name, entry.hash); + }, mockXpSettings); + }; + + mkSinkHook("", root.hash); + + ASSERT_EQ(files, files2); +} + +TEST(GitLsRemote, parseSymrefLineWithReference) { + auto line = "ref: refs/head/main HEAD"; + auto res = parseLsRemoteLine(line); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(res->kind, LsRemoteRefLine::Kind::Symbolic); + ASSERT_EQ(res->target, "refs/head/main"); + ASSERT_EQ(res->reference, "HEAD"); +} + +TEST(GitLsRemote, parseSymrefLineWithNoReference) { + auto line = "ref: refs/head/main"; + auto res = parseLsRemoteLine(line); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(res->kind, LsRemoteRefLine::Kind::Symbolic); + ASSERT_EQ(res->target, "refs/head/main"); + ASSERT_EQ(res->reference, std::nullopt); +} + +TEST(GitLsRemote, parseObjectRefLine) { + auto line = "abc123 refs/head/main"; + auto res = parseLsRemoteLine(line); + ASSERT_TRUE(res.has_value()); + ASSERT_EQ(res->kind, LsRemoteRefLine::Kind::Object); + ASSERT_EQ(res->target, "abc123"); + ASSERT_EQ(res->reference, "refs/head/main"); +} + +}