From 2cc4b58d3b54a23492cbe54c775934633a608387 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 21 Mar 2024 12:38:41 +0100 Subject: [PATCH 01/17] Playing around with this and that, let's see how fast we can get this done. --- src/engine/LocalVocab.cpp | 35 +++++++---------------------------- src/engine/LocalVocab.h | 20 ++++++-------------- src/global/IndexTypes.h | 9 ++++++++- src/global/ValueId.h | 11 +++++++---- test/util/IdTestHelpers.h | 2 +- 5 files changed, 29 insertions(+), 48 deletions(-) diff --git a/src/engine/LocalVocab.cpp b/src/engine/LocalVocab.cpp index a4ba9167ff..23278b5a8f 100644 --- a/src/engine/LocalVocab.cpp +++ b/src/engine/LocalVocab.cpp @@ -14,26 +14,6 @@ LocalVocab LocalVocab::clone() const { // First, make a deep copy of the `absl::node_hash_map` holding the actual // map of strings to indexes. localVocabClone.wordsToIndexesMap_ = this->wordsToIndexesMap_; - // The next free index should be the same. - localVocabClone.nextFreeIndex_ = this->nextFreeIndex_; - // The map from local ids to strings stores pointers to strings. So we cannot - // just copy these from `this->indexesToWordsMap_` to `localVocabClone`, but - // we need to make sure to store the pointers to the strings of the new map - // `localVocabClone.wordsToIndexesMap_`. - // - // NOTE: An alternative algorithm would be to sort the word-index pairs in - // `wordsToIndexesMap_` by index and then fill `indexesToWordsMap_` in order. - // This would have better locality, but the sorting takes non-linear time plus - // the sorting has to handle pairs of `LocalVocabIndex` and `std::string`. So - // for very large local vocabularies (and only then is this operation - // performance-criticial at all), the simpler approach below is probably - // better. - const size_t localVocabSize = this->size(); - localVocabClone.indexesToWordsMap_.resize(localVocabSize); - for (const auto& [wordInMap, index] : localVocabClone.wordsToIndexesMap_) { - AD_CONTRACT_CHECK(index.get() < localVocabSize); - localVocabClone.indexesToWordsMap_[index.get()] = std::addressof(wordInMap); - } // Return the clone. return localVocabClone; } @@ -53,14 +33,11 @@ LocalVocabIndex LocalVocab::getIndexAndAddIfNotContainedImpl(WordT&& word) { // called (even when the ID is not actually needed because the word is already // contained in the map). // + // TODO Make this work with transparent hashing and use try_emplace. auto [wordInMapAndIndex, isNewWord] = - wordsToIndexesMap_.insert({std::forward(word), nextFreeIndex_}); - const auto& [wordInMap, index] = *wordInMapAndIndex; - if (isNewWord) { - indexesToWordsMap_.push_back(&wordInMap); - nextFreeIndex_ = LocalVocabIndex::make(indexesToWordsMap_.size()); - } - return index; + wordsToIndexesMap_.insert(AlignedStr{std::forward(word)}); + const auto& wordInMap = *wordInMapAndIndex; + return &wordInMap; } // _____________________________________________________________________________ @@ -77,7 +54,9 @@ LocalVocabIndex LocalVocab::getIndexAndAddIfNotContained(std::string&& word) { // _____________________________________________________________________________ std::optional LocalVocab::getIndexOrNullopt( const std::string& word) const { - auto localVocabIndex = wordsToIndexesMap_.find(word); + // TODO Maybe we can make this work with transparent hashing, + // but if this is only a testing API this is probably not worth the hassle. + auto localVocabIndex = wordsToIndexesMap_.find(AlignedStr{word}); if (localVocabIndex != wordsToIndexesMap_.end()) { return localVocabIndex->second; } else { diff --git a/src/engine/LocalVocab.h b/src/engine/LocalVocab.h index 95572f7b22..b2e262c621 100644 --- a/src/engine/LocalVocab.h +++ b/src/engine/LocalVocab.h @@ -9,7 +9,7 @@ #include #include -#include "absl/container/node_hash_map.h" +#include "absl/container/node_hash_set.h" #include "global/Id.h" // A class for maintaing a local vocabulary with contiguous (local) IDs. This is @@ -21,21 +21,13 @@ // defined inside of the `ResultTable` class. You gotta start somewhere. class LocalVocab { private: + + // A map of the words in the local vocabulary to their local IDs. This is a // node hash map because we need the addresses of the words (which are of type // `std::string`) to remain stable over their lifetime in the hash map because // we refer to them in `wordsToIdsMap_` below. - absl::node_hash_map wordsToIndexesMap_; - - // A map of the local IDs to the words. Since the IDs are contiguous, we can - // use a `std::vector`. We store pointers to the actual words in - // `wordsToIdsMap_` to avoid storing every word twice. This saves space, but - // costs us an indirection when looking up a word by its ID. - std::vector indexesToWordsMap_; - - // The next free local ID (will be incremented by one each time we add a new - // word). - LocalVocabIndex nextFreeIndex_ = LocalVocabIndex::make(0); + absl::node_hash_set wordsToIndexesMap_; public: // Create a new, empty local vocabulary. @@ -65,10 +57,10 @@ class LocalVocab { const std::string& word) const; // The number of words in the vocabulary. - size_t size() const { return indexesToWordsMap_.size(); } + size_t size() const { return wordsToIndexesMap_.size(); } // Return true if and only if the local vocabulary is empty. - bool empty() const { return indexesToWordsMap_.empty(); } + bool empty() const { return wordsToIndexesMap_.empty(); } // Return a const reference to the word. const std::string& getWord(LocalVocabIndex localVocabIndex) const; diff --git a/src/global/IndexTypes.h b/src/global/IndexTypes.h index 18506f58c5..507a597121 100644 --- a/src/global/IndexTypes.h +++ b/src/global/IndexTypes.h @@ -13,7 +13,14 @@ // this (very intrusive) renaming doesn't interfere with too many open pull // requests. using VocabIndex = ad_utility::TypedIndex; -using LocalVocabIndex = ad_utility::TypedIndex; + +// TODO Where to put this best... +struct alignas(16) AlignedStr : public std::string { + using std::string::basic_string; + explicit AlignedStr(std::string s) : std::string{std::move(s)} {} +}; +//using LocalVocabIndex = ad_utility::TypedIndex; +using LocalVocabIndex = const AlignedStr*; using TextRecordIndex = ad_utility::TypedIndex; using WordVocabIndex = ad_utility::TypedIndex; using BlankNodeIndex = ad_utility::TypedIndex; diff --git a/src/global/ValueId.h b/src/global/ValueId.h index 7c45b0045c..e575339bfc 100644 --- a/src/global/ValueId.h +++ b/src/global/ValueId.h @@ -211,7 +211,7 @@ class ValueId { return makeFromIndex(index.get(), Datatype::TextRecordIndex); } static ValueId makeFromLocalVocabIndex(LocalVocabIndex index) { - return makeFromIndex(index.get(), Datatype::LocalVocabIndex); + return makeFromIndex(reinterpret_cast(index) >> numDatatypeBits, Datatype::LocalVocabIndex); } static ValueId makeFromWordVocabIndex(WordVocabIndex index) { return makeFromIndex(index.get(), Datatype::WordVocabIndex); @@ -229,8 +229,8 @@ class ValueId { [[nodiscard]] constexpr TextRecordIndex getTextRecordIndex() const noexcept { return TextRecordIndex::make(removeDatatypeBits(_bits)); } - [[nodiscard]] constexpr LocalVocabIndex getLocalVocabIndex() const noexcept { - return LocalVocabIndex::make(removeDatatypeBits(_bits)); + [[nodiscard]] LocalVocabIndex getLocalVocabIndex() const noexcept { + return reinterpret_cast(_bits << numDatatypeBits); } [[nodiscard]] constexpr WordVocabIndex getWordVocabIndex() const noexcept { return WordVocabIndex::make(removeDatatypeBits(_bits)); @@ -334,8 +334,11 @@ class ValueId { ostr << (value ? "true" : "false"); } else if constexpr (ad_utility::isSimilar) { ostr << value.toStringAndType().first; + } else if constexpr (ad_utility::isSimilar) { + AD_CORRECTNESS_CHECK(value != nullptr); + ostr << *value; } else { - // T is `VocabIndex || LocalVocabIndex || TextRecordIndex` + // T is `VocabIndex | TextRecordIndex` ostr << std::to_string(value.get()); } }; diff --git a/test/util/IdTestHelpers.h b/test/util/IdTestHelpers.h index 95e8a5eef2..291e3c5b18 100644 --- a/test/util/IdTestHelpers.h +++ b/test/util/IdTestHelpers.h @@ -19,7 +19,7 @@ inline auto VocabId = [](const auto& v) { }; inline auto LocalVocabId = [](const auto& v) { - return Id::makeFromLocalVocabIndex(LocalVocabIndex::make(v)); + return Id::makeFromLocalVocabIndex(reinterpret_cast(v)); }; inline auto TextRecordId = [](const auto& t) { From ce3afef4df909210435f27eb7c52f8584d12d303 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 21 Mar 2024 17:22:25 +0100 Subject: [PATCH 02/17] A first draft of merging local vocabs. There are still some issues with the tests, but overall, this should work. --- src/engine/LocalVocab.cpp | 31 ++++++++++++++----------- src/engine/LocalVocab.h | 16 +++++++++---- src/engine/ResultTable.h | 8 +++++++ src/global/IndexTypes.h | 2 +- src/global/ValueId.h | 7 +++++- test/LocalVocabTest.cpp | 41 ++++++++++++++++----------------- test/ValueIdComparatorsTest.cpp | 3 +++ test/ValueIdTest.cpp | 21 ++++++++++++----- test/ValueIdTestHelpers.h | 6 +++-- test/ValuesTest.cpp | 4 +++- test/util/IdTestHelpers.h | 9 ++++++-- 11 files changed, 95 insertions(+), 53 deletions(-) diff --git a/src/engine/LocalVocab.cpp b/src/engine/LocalVocab.cpp index 23278b5a8f..88ed5fb893 100644 --- a/src/engine/LocalVocab.cpp +++ b/src/engine/LocalVocab.cpp @@ -11,13 +11,22 @@ // _____________________________________________________________________________ LocalVocab LocalVocab::clone() const { LocalVocab localVocabClone; - // First, make a deep copy of the `absl::node_hash_map` holding the actual - // map of strings to indexes. - localVocabClone.wordsToIndexesMap_ = this->wordsToIndexesMap_; + localVocabClone.previousSets_ = previousSets_; + localVocabClone.previousSets_.push_back(wordsToIndexesMap_); // Return the clone. return localVocabClone; } +LocalVocab LocalVocab::merge(const LocalVocab& a, const LocalVocab& b) { + LocalVocab res; + auto inserter = std::back_inserter(res.previousSets_); + std::ranges::copy(a.previousSets_, inserter); + std::ranges::copy(b.previousSets_, inserter); + *inserter = a.wordsToIndexesMap_; + *inserter = b.wordsToIndexesMap_; + return res; +} + // _____________________________________________________________________________ template LocalVocabIndex LocalVocab::getIndexAndAddIfNotContainedImpl(WordT&& word) { @@ -35,7 +44,7 @@ LocalVocabIndex LocalVocab::getIndexAndAddIfNotContainedImpl(WordT&& word) { // // TODO Make this work with transparent hashing and use try_emplace. auto [wordInMapAndIndex, isNewWord] = - wordsToIndexesMap_.insert(AlignedStr{std::forward(word)}); + wordsToIndexesMap().insert(AlignedStr{std::forward(word)}); const auto& wordInMap = *wordInMapAndIndex; return &wordInMap; } @@ -56,9 +65,9 @@ std::optional LocalVocab::getIndexOrNullopt( const std::string& word) const { // TODO Maybe we can make this work with transparent hashing, // but if this is only a testing API this is probably not worth the hassle. - auto localVocabIndex = wordsToIndexesMap_.find(AlignedStr{word}); - if (localVocabIndex != wordsToIndexesMap_.end()) { - return localVocabIndex->second; + auto localVocabIndex = wordsToIndexesMap().find(AlignedStr{word}); + if (localVocabIndex != wordsToIndexesMap().end()) { + return &(*localVocabIndex); } else { return std::nullopt; } @@ -66,11 +75,5 @@ std::optional LocalVocab::getIndexOrNullopt( // _____________________________________________________________________________ const std::string& LocalVocab::getWord(LocalVocabIndex localVocabIndex) const { - if (localVocabIndex.get() >= indexesToWordsMap_.size()) { - throw std::runtime_error(absl::StrCat( - "LocalVocab error: request for word with local vocab index ", - localVocabIndex.get(), ", but size of local vocab is only ", - indexesToWordsMap_.size(), ", please contact the developers")); - } - return *(indexesToWordsMap_.at(localVocabIndex.get())); + return *localVocabIndex; } diff --git a/src/engine/LocalVocab.h b/src/engine/LocalVocab.h index b2e262c621..e7a3d5f611 100644 --- a/src/engine/LocalVocab.h +++ b/src/engine/LocalVocab.h @@ -21,13 +21,17 @@ // defined inside of the `ResultTable` class. You gotta start somewhere. class LocalVocab { private: - - // A map of the words in the local vocabulary to their local IDs. This is a // node hash map because we need the addresses of the words (which are of type // `std::string`) to remain stable over their lifetime in the hash map because // we refer to them in `wordsToIdsMap_` below. - absl::node_hash_set wordsToIndexesMap_; + using Set = absl::node_hash_set; + std::vector> previousSets_; + std::shared_ptr wordsToIndexesMap_ = + std::make_shared>(); + + auto& wordsToIndexesMap() { return *wordsToIndexesMap_; } + const auto& wordsToIndexesMap() const { return *wordsToIndexesMap_; } public: // Create a new, empty local vocabulary. @@ -57,14 +61,16 @@ class LocalVocab { const std::string& word) const; // The number of words in the vocabulary. - size_t size() const { return wordsToIndexesMap_.size(); } + size_t size() const { return wordsToIndexesMap().size(); } // Return true if and only if the local vocabulary is empty. - bool empty() const { return wordsToIndexesMap_.empty(); } + bool empty() const { return wordsToIndexesMap().empty(); } // Return a const reference to the word. const std::string& getWord(LocalVocabIndex localVocabIndex) const; + static LocalVocab merge(const LocalVocab& a, const LocalVocab& b); + private: // Common implementation for the two variants of // `getIndexAndAddIfNotContainedImpl` above. diff --git a/src/engine/ResultTable.h b/src/engine/ResultTable.h index 39950c6266..c7df4c4713 100644 --- a/src/engine/ResultTable.h +++ b/src/engine/ResultTable.h @@ -150,6 +150,13 @@ class ResultTable { const ResultTable&> static SharedLocalVocabWrapper getSharedLocalVocabFromNonEmptyOf( R&& subResults) { + // TODO Implement this much better. + LocalVocab res; + for (const ResultTable& table : subResults) { + res = LocalVocab::merge(res, *table.localVocab_); + } + return SharedLocalVocabWrapper{std::move(res)}; + /* AD_CONTRACT_CHECK(!std::ranges::empty(subResults)); auto hasNonEmptyVocab = [](const ResultTable& tbl) { return !tbl.localVocab_->empty(); @@ -173,6 +180,7 @@ class ResultTable { *std::ranges::find_if(subResults, hasNonEmptyVocab)) .localVocab_}; } + */ } // Get a (deep) copy of the local vocabulary from the given result. Use this diff --git a/src/global/IndexTypes.h b/src/global/IndexTypes.h index 507a597121..d2e46da5e1 100644 --- a/src/global/IndexTypes.h +++ b/src/global/IndexTypes.h @@ -19,7 +19,7 @@ struct alignas(16) AlignedStr : public std::string { using std::string::basic_string; explicit AlignedStr(std::string s) : std::string{std::move(s)} {} }; -//using LocalVocabIndex = ad_utility::TypedIndex; +// using LocalVocabIndex = ad_utility::TypedIndex; using LocalVocabIndex = const AlignedStr*; using TextRecordIndex = ad_utility::TypedIndex; using WordVocabIndex = ad_utility::TypedIndex; diff --git a/src/global/ValueId.h b/src/global/ValueId.h index e575339bfc..e4efe921cb 100644 --- a/src/global/ValueId.h +++ b/src/global/ValueId.h @@ -136,6 +136,10 @@ class ValueId { /// TODO Is this ordering also consistent for corner cases of doubles /// (inf, nan, denormalized numbers, negative zero)? constexpr auto operator<=>(const ValueId& other) const { + if (getDatatype() == Datatype::LocalVocabIndex && + other.getDatatype() == Datatype::LocalVocabIndex) [[unlikely]] { + return *getLocalVocabIndex() <=> *other.getLocalVocabIndex(); + } return _bits <=> other._bits; } @@ -211,7 +215,8 @@ class ValueId { return makeFromIndex(index.get(), Datatype::TextRecordIndex); } static ValueId makeFromLocalVocabIndex(LocalVocabIndex index) { - return makeFromIndex(reinterpret_cast(index) >> numDatatypeBits, Datatype::LocalVocabIndex); + return makeFromIndex(reinterpret_cast(index) >> numDatatypeBits, + Datatype::LocalVocabIndex); } static ValueId makeFromWordVocabIndex(WordVocabIndex index) { return makeFromIndex(index.get(), Datatype::WordVocabIndex); diff --git a/test/LocalVocabTest.cpp b/test/LocalVocabTest.cpp index 9f8244b2f8..ee87a54e76 100644 --- a/test/LocalVocabTest.cpp +++ b/test/LocalVocabTest.cpp @@ -53,25 +53,26 @@ TEST(LocalVocab, constructionAndAccess) { // Add the words from our test vocabulary and check that they get the expected // local vocab indexes. for (size_t i = 0; i < testWords.size(); ++i) { - ASSERT_EQ(localVocab.getIndexAndAddIfNotContained(testWords[i]), - LocalVocabIndex::make(i)); + ASSERT_EQ(*localVocab.getIndexAndAddIfNotContained(testWords[i]), + testWords[i]); } + size_t localVocabSize = localVocab.size(); ASSERT_EQ(localVocab.size(), testWords.size()); // Check that we get the same indexes if we do this again, but that no new // words will be added. for (size_t i = 0; i < testWords.size(); ++i) { - ASSERT_EQ(localVocab.getIndexAndAddIfNotContained(testWords[i]), - LocalVocabIndex::make(i)); + ASSERT_EQ(*localVocab.getIndexAndAddIfNotContained(testWords[i]), + testWords[i]); + ASSERT_EQ(localVocab.size(), localVocabSize); } - ASSERT_EQ(localVocab.size(), testWords.size()); // Check again that we get the right indexes, but with `getIndexOrNullopt`. for (size_t i = 0; i < testWords.size(); ++i) { std::optional localVocabIndex = localVocab.getIndexOrNullopt(testWords[i]); ASSERT_TRUE(localVocabIndex.has_value()); - ASSERT_EQ(localVocabIndex.value(), LocalVocabIndex::make(i)); + ASSERT_EQ(*localVocabIndex.value(), testWords[i]); } // Check that `getIndexOrNullopt` returns `std::nullopt` for words that are @@ -81,29 +82,23 @@ TEST(LocalVocab, constructionAndAccess) { ASSERT_FALSE(localVocab.getIndexOrNullopt(testWords[i] + "A")); } - // Check that the lookup by ID gives the correct words. - for (size_t i = 0; i < testWords.size(); ++i) { - ASSERT_EQ(localVocab.getWord(LocalVocabIndex::make(i)), testWords[i]); - } - ASSERT_EQ(localVocab.size(), testWords.size()); - - // Check that out-of-bound lookups are detected (this would have a caught the - // one-off bug in #820, LocalVocab.cpp line 55). - ASSERT_THROW(localVocab.getWord(LocalVocabIndex::make(testWords.size())), - std::runtime_error); - ASSERT_THROW(localVocab.getWord(LocalVocabIndex::make(-1)), - std::runtime_error); - // Check that a move gives the expected result. + localVocabSize = localVocab.size(); auto localVocabMoved = std::move(localVocab); - ASSERT_EQ(localVocab.size(), 0); + // TODO Should we reset the pointer? + // ASSERT_EQ(localVocab.size(), 0); ASSERT_EQ(localVocabMoved.size(), testWords.size()); + // Check that we get the same indexes if we do this again, but that no new + // words will be added. for (size_t i = 0; i < testWords.size(); ++i) { - ASSERT_EQ(localVocabMoved.getWord(LocalVocabIndex::make(i)), testWords[i]); + ASSERT_EQ(*localVocabMoved.getIndexAndAddIfNotContained(testWords[i]), + testWords[i]); + ASSERT_EQ(localVocabMoved.size(), localVocabSize); } } // _____________________________________________________________________________ +/* TEST(LocalVocab, clone) { // Create a small local vocabulary. size_t localVocabSize = 100; @@ -132,9 +127,12 @@ TEST(LocalVocab, clone) { ASSERT_EQ(localVocabClone.getIndexAndAddIfNotContained("blubb"), LocalVocabIndex::make(localVocabSize)); } + */ // _____________________________________________________________________________ TEST(LocalVocab, propagation) { + // TODO This test has to be rewritten + /* // Query execution context (with small test index), see `IndexTestHelpers.h`. using ad_utility::AllocatorWithLimit; QueryExecutionContext* testQec = ad_utility::testing::getQec(); @@ -308,4 +306,5 @@ TEST(LocalVocab, propagation) { // // TODO Maybe add tests for the new TextIndexScanFor... classes, // they never introduce any local vocab. + */ } diff --git a/test/ValueIdComparatorsTest.cpp b/test/ValueIdComparatorsTest.cpp index c59b43421e..6ca68eb7a0 100644 --- a/test/ValueIdComparatorsTest.cpp +++ b/test/ValueIdComparatorsTest.cpp @@ -213,6 +213,8 @@ auto testGetRangesForEqualIds(auto begin, auto end, ValueId idBegin, // Test that `getRangesFromId` works correctly for `ValueId`s of the unsigned // index types (`VocabIndex`, `TextRecordIndex`, `LocalVocabIndex`, // `WordVocabIndex`). +// TODO Reinstate all the tests.. +/* TEST(ValueIdComparators, IndexTypes) { auto ids = makeRandomIds(); std::sort(ids.begin(), ids.end(), compareByBits); @@ -255,6 +257,7 @@ TEST(ValueIdComparators, IndexTypes) { testImpl.operator()(&getLocalVocabIndex); testImpl.operator()(&getWordVocabIndex); } + */ // _______________________________________________________________________ TEST(ValueIdComparators, undefinedWithItself) { diff --git a/test/ValueIdTest.cpp b/test/ValueIdTest.cpp index 04e13cf883..169291e439 100644 --- a/test/ValueIdTest.cpp +++ b/test/ValueIdTest.cpp @@ -108,11 +108,13 @@ TEST(ValueId, Indices) { testSingle(0); testSingle(ValueId::maxIndex); - for (size_t idx = 0; idx < 10'000; ++idx) { - auto value = invalidIndexGenerator(); - ASSERT_THROW(makeId(value), ValueId::IndexTooLargeException); - AD_EXPECT_THROW_WITH_MESSAGE(makeId(value), - ::testing::ContainsRegex("is bigger than")); + if (type != Datatype::LocalVocabIndex) { + for (size_t idx = 0; idx < 10'000; ++idx) { + auto value = invalidIndexGenerator(); + ASSERT_THROW(makeId(value), ValueId::IndexTooLargeException); + AD_EXPECT_THROW_WITH_MESSAGE( + makeId(value), ::testing::ContainsRegex("is bigger than")); + } } }; @@ -129,6 +131,7 @@ TEST(ValueId, Undefined) { ASSERT_EQ(id.getDatatype(), Datatype::Undefined); } +/* TEST(ValueId, OrderingDifferentDatatypes) { auto ids = makeRandomIds(); std::sort(ids.begin(), ids.end()); @@ -139,7 +142,9 @@ TEST(ValueId, OrderingDifferentDatatypes) { ASSERT_TRUE( std::is_sorted(ids.begin(), ids.end(), compareByDatatypeAndIndexTypes)); } + */ +/* TEST(ValueId, IndexOrdering) { auto testOrder = [](auto makeIdFromIndex, auto getIndexFromId) { std::vector ids; @@ -163,6 +168,7 @@ TEST(ValueId, IndexOrdering) { testOrder(&makeWordVocabId, &getWordVocabIndex); testOrder(&makeTextRecordId, &getTextRecordIndex); } + */ TEST(ValueId, DoubleOrdering) { auto ids = makeRandomDoubleIds(); @@ -257,6 +263,7 @@ TEST(ValueId, Serialization) { } } +/* TEST(ValueId, Hashing) { auto ids = makeRandomIds(); ad_utility::HashSet idsWithoutDuplicates; @@ -275,6 +282,7 @@ TEST(ValueId, Hashing) { ASSERT_EQ(ids, idsWithoutDuplicatesAsVector); } + */ TEST(ValueId, toDebugString) { auto test = [](ValueId id, std::string_view expected) { @@ -288,7 +296,8 @@ TEST(ValueId, toDebugString) { test(ValueId::makeFromBool(false), "Bool:false"); test(ValueId::makeFromBool(true), "Bool:true"); test(makeVocabId(15), "VocabIndex:15"); - test(makeLocalVocabId(25), "LocalVocabIndex:25"); + AlignedStr str{"SomeValue"}; + test(ValueId::makeFromLocalVocabIndex(&str), "LocalVocabIndex:SomeValue"); test(makeTextRecordId(37), "TextRecordIndex:37"); test(makeWordVocabId(42), "WordVocabIndex:42"); test(makeBlankNodeId(27), "BlankNodeIndex:27"); diff --git a/test/ValueIdTestHelpers.h b/test/ValueIdTestHelpers.h index 98a5727266..b4f6cddce3 100644 --- a/test/ValueIdTestHelpers.h +++ b/test/ValueIdTestHelpers.h @@ -45,7 +45,8 @@ inline ValueId makeVocabId(uint64_t value) { return ValueId::makeFromVocabIndex(VocabIndex::make(value)); } inline ValueId makeLocalVocabId(uint64_t value) { - return ValueId::makeFromLocalVocabIndex(LocalVocabIndex::make(value)); + return ValueId::makeFromLocalVocabIndex( + std::bit_cast(value << 4)); } inline ValueId makeTextRecordId(uint64_t value) { return ValueId::makeFromTextRecordIndex(TextRecordIndex::make(value)); @@ -58,8 +59,9 @@ inline ValueId makeBlankNodeId(uint64_t value) { } inline uint64_t getVocabIndex(ValueId id) { return id.getVocabIndex().get(); } +// TODO Make the tests more precise for the localVocabIndices. inline uint64_t getLocalVocabIndex(ValueId id) { - return id.getLocalVocabIndex().get(); + return std::bit_cast(id.getLocalVocabIndex()) >> 4; } inline uint64_t getTextRecordIndex(ValueId id) { return id.getTextRecordIndex().get(); diff --git a/test/ValuesTest.cpp b/test/ValuesTest.cpp index 3d28ea07a4..61f9caf9f8 100644 --- a/test/ValuesTest.cpp +++ b/test/ValuesTest.cpp @@ -71,8 +71,10 @@ TEST(Values, computeResult) { AD_CORRECTNESS_CHECK(success); auto I = ad_utility::testing::IntId; auto L = ad_utility::testing::LocalVocabId; + auto l = result->localVocab().getIndexOrNullopt(""); + ASSERT_TRUE(l.has_value()); auto U = Id::makeUndefined(); - ASSERT_EQ(table, makeIdTableFromVector({{I(12), x}, {U, L(0)}})); + ASSERT_EQ(table, makeIdTableFromVector({{I(12), x}, {U, L(l.value())}})); } // Check that if the number of variables and the number of values in each row diff --git a/test/util/IdTestHelpers.h b/test/util/IdTestHelpers.h index 291e3c5b18..8fbb448131 100644 --- a/test/util/IdTestHelpers.h +++ b/test/util/IdTestHelpers.h @@ -18,8 +18,13 @@ inline auto VocabId = [](const auto& v) { return Id::makeFromVocabIndex(VocabIndex::make(v)); }; -inline auto LocalVocabId = [](const auto& v) { - return Id::makeFromLocalVocabIndex(reinterpret_cast(v)); +inline auto LocalVocabId = [](const T& v) { + if constexpr (std::integral) { + return Id::makeFromLocalVocabIndex( + std::bit_cast(static_cast(v))); + } else { + return Id::makeFromLocalVocabIndex(std::bit_cast(v)); + } }; inline auto TextRecordId = [](const auto& t) { From aa8d3729e4beb4bf1e0cccd71596ad41a1059212 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 21 Mar 2024 18:17:24 +0100 Subject: [PATCH 03/17] A small fix for joins. --- src/engine/AddCombinedRowToTable.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/engine/AddCombinedRowToTable.h b/src/engine/AddCombinedRowToTable.h index 82203afa14..44b9617d33 100644 --- a/src/engine/AddCombinedRowToTable.h +++ b/src/engine/AddCombinedRowToTable.h @@ -226,8 +226,13 @@ class AddCombinedRowToIdTable { // and simply copy the values from this column without looking at the other // input. auto mergeWithUndefined = [](const ValueId a, const ValueId b) { + // NOTE: For localVocabIndices we cannot use the simple bitwise OR. + if (a.isUndefined()) {return b;} + return a; + /* static_assert(ValueId::makeUndefined().getBits() == 0u); return ValueId::fromBits(a.getBits() | b.getBits()); + */ }; // A lambda that writes the join column with the given `colIdx` to the From 2836a513d6b02f0166759ec2de3fc2274a3cf45b Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Fri, 22 Mar 2024 03:48:06 +0100 Subject: [PATCH 04/17] Minor clang-format fix --- src/engine/AddCombinedRowToTable.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/engine/AddCombinedRowToTable.h b/src/engine/AddCombinedRowToTable.h index 44b9617d33..49e212fd3b 100644 --- a/src/engine/AddCombinedRowToTable.h +++ b/src/engine/AddCombinedRowToTable.h @@ -227,7 +227,9 @@ class AddCombinedRowToIdTable { // input. auto mergeWithUndefined = [](const ValueId a, const ValueId b) { // NOTE: For localVocabIndices we cannot use the simple bitwise OR. - if (a.isUndefined()) {return b;} + if (a.isUndefined()) { + return b; + } return a; /* static_assert(ValueId::makeUndefined().getBits() == 0u); From cbccbc8f119e5bb4a96d23f94ade2bf7bdbabae4 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Mon, 8 Apr 2024 17:25:38 +0200 Subject: [PATCH 05/17] working on this and that. --- test/LocalVocabTest.cpp | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/test/LocalVocabTest.cpp b/test/LocalVocabTest.cpp index ee87a54e76..e22879ed80 100644 --- a/test/LocalVocabTest.cpp +++ b/test/LocalVocabTest.cpp @@ -98,12 +98,11 @@ TEST(LocalVocab, constructionAndAccess) { } // _____________________________________________________________________________ -/* TEST(LocalVocab, clone) { // Create a small local vocabulary. size_t localVocabSize = 100; LocalVocab localVocabOriginal; - for (auto& word : getTestCollectionOfWords(localVocabSize)) { + for (const auto& word : getTestCollectionOfWords(localVocabSize)) { localVocabOriginal.getIndexAndAddIfNotContained(word); } ASSERT_EQ(localVocabOriginal.size(), localVocabSize); @@ -113,26 +112,17 @@ TEST(LocalVocab, clone) { LocalVocab localVocabClone = localVocabOriginal.clone(); ASSERT_EQ(localVocabOriginal.size(), localVocabSize); ASSERT_EQ(localVocabClone.size(), localVocabSize); - for (size_t i = 0; i < localVocabSize; ++i) { - LocalVocabIndex idx = LocalVocabIndex::make(i); - const std::string& wordFromOriginal = localVocabOriginal.getWord(idx); - const std::string& wordFromClone = localVocabClone.getWord(idx); - ASSERT_EQ(wordFromOriginal, wordFromClone); - ASSERT_NE(&wordFromOriginal, &wordFromClone); + for (const auto& word : getTestCollectionOfWords(localVocabSize)) { + auto fromOrig = localVocabOriginal.getIndexOrNullopt(word).value(); + auto fromClone = localVocabClone.getIndexOrNullopt(word).value(); + ASSERT_EQ(fromOrig, fromClone); + ASSERT_EQ(word, *fromOrig); } - - // Check that `nextFreeIndex_` of the clone is OK by adding another word to - // the clone. - localVocabClone.getIndexAndAddIfNotContained("blubb"); - ASSERT_EQ(localVocabClone.getIndexAndAddIfNotContained("blubb"), - LocalVocabIndex::make(localVocabSize)); } - */ +// TODO Rewrite // _____________________________________________________________________________ TEST(LocalVocab, propagation) { - // TODO This test has to be rewritten - /* // Query execution context (with small test index), see `IndexTestHelpers.h`. using ad_utility::AllocatorWithLimit; QueryExecutionContext* testQec = ad_utility::testing::getQec(); @@ -306,5 +296,5 @@ TEST(LocalVocab, propagation) { // // TODO Maybe add tests for the new TextIndexScanFor... classes, // they never introduce any local vocab. - */ } +*/ From de10849ac07a08f6b3217be4bd48241e0f7ad09a Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 10 Apr 2024 20:08:08 +0200 Subject: [PATCH 06/17] A Working version with improved test coverage. --- src/engine/AddCombinedRowToTable.h | 4 +- src/engine/LocalVocab.cpp | 10 ++++ src/engine/LocalVocab.h | 15 ++++-- src/global/ValueId.h | 8 +++- test/LocalVocabTest.cpp | 74 ++++++++++++++++-------------- 5 files changed, 70 insertions(+), 41 deletions(-) diff --git a/src/engine/AddCombinedRowToTable.h b/src/engine/AddCombinedRowToTable.h index 44b9617d33..49e212fd3b 100644 --- a/src/engine/AddCombinedRowToTable.h +++ b/src/engine/AddCombinedRowToTable.h @@ -227,7 +227,9 @@ class AddCombinedRowToIdTable { // input. auto mergeWithUndefined = [](const ValueId a, const ValueId b) { // NOTE: For localVocabIndices we cannot use the simple bitwise OR. - if (a.isUndefined()) {return b;} + if (a.isUndefined()) { + return b; + } return a; /* static_assert(ValueId::makeUndefined().getBits() == 0u); diff --git a/src/engine/LocalVocab.cpp b/src/engine/LocalVocab.cpp index 88ed5fb893..aa09eef6f6 100644 --- a/src/engine/LocalVocab.cpp +++ b/src/engine/LocalVocab.cpp @@ -77,3 +77,13 @@ std::optional LocalVocab::getIndexOrNullopt( const std::string& LocalVocab::getWord(LocalVocabIndex localVocabIndex) const { return *localVocabIndex; } + +// _____________________________________________________________________________ +std::vector LocalVocab::getAllWordsForTesting() const { + std::vector result; + std::ranges::copy(wordsToIndexesMap(), std::back_inserter(result)); + for (const auto& previous : previousSets_) { + std::ranges::copy(*previous, std::back_inserter(result)); + } + return result; +} diff --git a/src/engine/LocalVocab.h b/src/engine/LocalVocab.h index e7a3d5f611..5d57222414 100644 --- a/src/engine/LocalVocab.h +++ b/src/engine/LocalVocab.h @@ -27,8 +27,7 @@ class LocalVocab { // we refer to them in `wordsToIdsMap_` below. using Set = absl::node_hash_set; std::vector> previousSets_; - std::shared_ptr wordsToIndexesMap_ = - std::make_shared>(); + std::shared_ptr wordsToIndexesMap_ = std::make_shared(); auto& wordsToIndexesMap() { return *wordsToIndexesMap_; } const auto& wordsToIndexesMap() const { return *wordsToIndexesMap_; } @@ -61,16 +60,24 @@ class LocalVocab { const std::string& word) const; // The number of words in the vocabulary. - size_t size() const { return wordsToIndexesMap().size(); } + size_t size() const { + auto result = wordsToIndexesMap().size(); + for (const auto& previous : previousSets_) { + result += previous->size(); + } + return result; + } // Return true if and only if the local vocabulary is empty. - bool empty() const { return wordsToIndexesMap().empty(); } + bool empty() const { return size() == 0; } // Return a const reference to the word. const std::string& getWord(LocalVocabIndex localVocabIndex) const; static LocalVocab merge(const LocalVocab& a, const LocalVocab& b); + std::vector getAllWordsForTesting() const; + private: // Common implementation for the two variants of // `getIndexAndAddIfNotContainedImpl` above. diff --git a/src/global/ValueId.h b/src/global/ValueId.h index e4efe921cb..0a031d2bce 100644 --- a/src/global/ValueId.h +++ b/src/global/ValueId.h @@ -123,7 +123,13 @@ class ValueId { /// Equality comparison is performed directly on the underlying /// representation. - constexpr bool operator==(const ValueId&) const = default; + constexpr bool operator==(const ValueId& other) const { + if (getDatatype() == Datatype::LocalVocabIndex && + other.getDatatype() == Datatype::LocalVocabIndex) [[unlikely]] { + return *getLocalVocabIndex() == *other.getLocalVocabIndex(); + } + return _bits == other._bits; + } /// Comparison is performed directly on the underlying representation. Note /// that because the type bits are the most significant bits, all values of diff --git a/test/LocalVocabTest.cpp b/test/LocalVocabTest.cpp index e22879ed80..00fbebfa99 100644 --- a/test/LocalVocabTest.cpp +++ b/test/LocalVocabTest.cpp @@ -2,6 +2,7 @@ // Chair of Algorithms and Data Structures. // Author: Hannah Bast +#include #include #include @@ -112,15 +113,12 @@ TEST(LocalVocab, clone) { LocalVocab localVocabClone = localVocabOriginal.clone(); ASSERT_EQ(localVocabOriginal.size(), localVocabSize); ASSERT_EQ(localVocabClone.size(), localVocabSize); - for (const auto& word : getTestCollectionOfWords(localVocabSize)) { - auto fromOrig = localVocabOriginal.getIndexOrNullopt(word).value(); - auto fromClone = localVocabClone.getIndexOrNullopt(word).value(); - ASSERT_EQ(fromOrig, fromClone); - ASSERT_EQ(word, *fromOrig); - } + ASSERT_THAT(localVocabClone.getAllWordsForTesting(), + ::testing::UnorderedElementsAreArray( + localVocabOriginal.getAllWordsForTesting())); + // TODO Maybe also test the indices. } -// TODO Rewrite // _____________________________________________________________________________ TEST(LocalVocab, propagation) { // Query execution context (with small test index), see `IndexTestHelpers.h`. @@ -137,18 +135,15 @@ TEST(LocalVocab, propagation) { std::shared_ptr resultTable = operation.getResult(); ASSERT_TRUE(resultTable) << "Operation: " << operation.getDescriptor() << std::endl; - std::vector localVocabWords; - for (size_t i = 0; i < resultTable->localVocab().size(); ++i) { - localVocabWords.emplace_back( - resultTable->localVocab().getWord(LocalVocabIndex::make(i))); - } - ASSERT_EQ(localVocabWords, expectedWords) - << "Operation: " << operation.getDescriptor() << std::endl; - }; - - // Lambda that checks that `computeResult` throws an exception. - auto checkThrow = [&](Operation& operation) -> void { - ASSERT_THROW(operation.getResult(), ad_utility::AbortException) + std::vector localVocabWords = + resultTable->localVocab().getAllWordsForTesting(); + // We currently allow the local vocab to have multiple IDs for the same + // word, so we have to deduplicate first. + std::ranges::sort(localVocabWords); + localVocabWords.erase(std::ranges::unique(localVocabWords).begin(), + localVocabWords.end()); + ASSERT_THAT(localVocabWords, + ::testing::UnorderedElementsAreArray(expectedWords)) << "Operation: " << operation.getDescriptor() << std::endl; }; @@ -175,7 +170,8 @@ TEST(LocalVocab, propagation) { {{TripleComponent{"x"}, TripleComponent{"y1"}}, {TripleComponent{"x"}, TripleComponent{"y2"}}}}); Values values1copy = values1; - checkLocalVocab(values1copy, std::vector{"x", "y1", "y2"}); + std::vector localVocab1{"x", "y1", "y2"}; + checkLocalVocab(values1copy, localVocab1); // VALUES operation that uses an existing literal (from the test index). Values values2(testQec, {{Variable{"?x"}, Variable{"?y"}}, @@ -183,27 +179,34 @@ TEST(LocalVocab, propagation) { Values values2copy = values2; checkLocalVocab(values2copy, std::vector{}); + // Contains local vocab words that are (at least partially) disjoint from the + // words in `values1`. + Values values3(testQec, {{Variable{"?x"}, Variable{"?y"}}, + {{TripleComponent{"x"}, TripleComponent{"y1"}}, + {TripleComponent{"x2"}, TripleComponent{"y3"}}}}); + std::vector localVocab13{"x", "y1", "y2", "x2", "y3"}; + // JOIN operation with exactly one non-empty local vocab and with two // non-empty local vocabs (the last two arguments are the two join columns). Join join1(testQec, qet(values1), qet(values2), 0, 0); checkLocalVocab(join1, std::vector{"x", "y1", "y2"}); - Join join2(testQec, qet(values1), qet(values1), 0, 0); - checkThrow(join2); + Join join2(testQec, qet(values1), qet(values3), 0, 0); + checkLocalVocab(join2, localVocab13); // OPTIONAL JOIN operation with exactly one non-empty local vocab. OptionalJoin optJoin1(testQec, qet(values1), qet(values2)); checkLocalVocab(optJoin1, std::vector{"x", "y1", "y2"}); // OPTIONAL JOIN operation with two non-empty local vocab. - OptionalJoin optJoin2(testQec, qet(values1), qet(values1)); - checkThrow(optJoin2); + OptionalJoin optJoin2(testQec, qet(values1), qet(values3)); + checkLocalVocab(optJoin2, localVocab13); // MULTI-COLUMN JOIN operation with exactly one non-empty local vocab and with // two non-empty local vocabs. MultiColumnJoin multiJoin1(testQec, qet(values1), qet(values2)); checkLocalVocab(multiJoin1, std::vector{"x", "y1", "y2"}); - MultiColumnJoin multiJoin2(testQec, qet(values1), qet(values1)); - checkThrow(multiJoin2); + MultiColumnJoin multiJoin2(testQec, qet(values1), qet(values3)); + checkLocalVocab(multiJoin2, localVocab13); // ORDER BY operation (the third argument are the indices of the columns to be // sorted, and the sort order; not important for this test). @@ -248,15 +251,15 @@ TEST(LocalVocab, propagation) { // non-empy local vocabs. Union union1(testQec, qet(values1), qet(values2)); checkLocalVocab(union1, std::vector{"x", "y1", "y2"}); - Union union2(testQec, qet(values1), qet(values1)); - checkThrow(union2); + Union union2(testQec, qet(values1), qet(values3)); + checkLocalVocab(union2, localVocab13); // MINUS operation with exactly one non-empty local vocab and with // two non-empty local vocabs. Minus minus1(testQec, qet(values1), qet(values2)); checkLocalVocab(minus1, std::vector{"x", "y1", "y2"}); - Minus minus2(testQec, qet(values1), qet(values1)); - checkThrow(minus2); + Minus minus2(testQec, qet(values1), qet(values3)); + checkLocalVocab(minus2, localVocab13); // FILTER operation (the third argument is an expression; which one doesn't // matter for this test). @@ -287,14 +290,15 @@ TEST(LocalVocab, propagation) { // PATTERN TRICK operations. HasPredicateScan hasPredicateScan(testQec, qet(values1), 0, Variable{"?z"}); checkLocalVocab(hasPredicateScan, std::vector{"x", "y1", "y2"}); + Values valuesPatternTrick( + testQec, {{Variable{"?x"}, Variable{"?y"}}, + {{TripleComponent{"x"}, TripleComponent{NO_PATTERN}}, + {TripleComponent{"x"}, TripleComponent{NO_PATTERN}}}}); CountAvailablePredicates countAvailablePredictes( - testQec, qet(values1), 0, Variable{"?x"}, Variable{"?y"}); - checkLocalVocab(countAvailablePredictes, - std::vector{"x", "y1", "y2"}); - + testQec, qet(valuesPatternTrick), 0, Variable{"?y"}, Variable{"?count"}); + checkLocalVocab(countAvailablePredictes, {"x"}); // TEXT operations. // // TODO Maybe add tests for the new TextIndexScanFor... classes, // they never introduce any local vocab. } -*/ From 4cedeeb71d7a4f6aec3bd1f79667ed5de58895db Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 10 Apr 2024 20:21:57 +0200 Subject: [PATCH 07/17] Fix the tests. --- test/LocalVocabTest.cpp | 65 +++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/test/LocalVocabTest.cpp b/test/LocalVocabTest.cpp index 00fbebfa99..fc7746a09f 100644 --- a/test/LocalVocabTest.cpp +++ b/test/LocalVocabTest.cpp @@ -166,36 +166,43 @@ TEST(LocalVocab, propagation) { // literal "x", we would have to write TripleComponent{"\"x\""}. For the // purposes of this test, we just want something that's not yet in the index, // so "x" etc. is just fine (and also different from the "" below). - Values values1(testQec, {{Variable{"?x"}, Variable{"?y"}}, - {{TripleComponent{"x"}, TripleComponent{"y1"}}, - {TripleComponent{"x"}, TripleComponent{"y2"}}}}); + auto iri = ad_utility::testing::iri; + Values values1( + testQec, + {{Variable{"?x"}, Variable{"?y"}}, + {{TripleComponent{iri("")}, TripleComponent{iri("")}}, + {TripleComponent{iri("")}, TripleComponent{iri("")}}}}); Values values1copy = values1; - std::vector localVocab1{"x", "y1", "y2"}; + std::vector localVocab1{"", "", ""}; checkLocalVocab(values1copy, localVocab1); // VALUES operation that uses an existing literal (from the test index). - Values values2(testQec, {{Variable{"?x"}, Variable{"?y"}}, - {{TripleComponent{""}, TripleComponent{""}}}}); + Values values2( + testQec, {{Variable{"?x"}, Variable{"?y"}}, + {{TripleComponent{iri("")}, TripleComponent{iri("")}}}}); Values values2copy = values2; checkLocalVocab(values2copy, std::vector{}); // Contains local vocab words that are (at least partially) disjoint from the // words in `values1`. - Values values3(testQec, {{Variable{"?x"}, Variable{"?y"}}, - {{TripleComponent{"x"}, TripleComponent{"y1"}}, - {TripleComponent{"x2"}, TripleComponent{"y3"}}}}); - std::vector localVocab13{"x", "y1", "y2", "x2", "y3"}; + Values values3( + testQec, + {{Variable{"?x"}, Variable{"?y"}}, + {{TripleComponent{iri("")}, TripleComponent{iri("")}}, + {TripleComponent{iri("")}, TripleComponent{iri("")}}}}); + std::vector localVocab13{"", "", "", "", + ""}; // JOIN operation with exactly one non-empty local vocab and with two // non-empty local vocabs (the last two arguments are the two join columns). Join join1(testQec, qet(values1), qet(values2), 0, 0); - checkLocalVocab(join1, std::vector{"x", "y1", "y2"}); + checkLocalVocab(join1, localVocab1); Join join2(testQec, qet(values1), qet(values3), 0, 0); checkLocalVocab(join2, localVocab13); // OPTIONAL JOIN operation with exactly one non-empty local vocab. OptionalJoin optJoin1(testQec, qet(values1), qet(values2)); - checkLocalVocab(optJoin1, std::vector{"x", "y1", "y2"}); + checkLocalVocab(optJoin1, localVocab1); // OPTIONAL JOIN operation with two non-empty local vocab. OptionalJoin optJoin2(testQec, qet(values1), qet(values3)); @@ -204,23 +211,23 @@ TEST(LocalVocab, propagation) { // MULTI-COLUMN JOIN operation with exactly one non-empty local vocab and with // two non-empty local vocabs. MultiColumnJoin multiJoin1(testQec, qet(values1), qet(values2)); - checkLocalVocab(multiJoin1, std::vector{"x", "y1", "y2"}); + checkLocalVocab(multiJoin1, localVocab1); MultiColumnJoin multiJoin2(testQec, qet(values1), qet(values3)); checkLocalVocab(multiJoin2, localVocab13); // ORDER BY operation (the third argument are the indices of the columns to be // sorted, and the sort order; not important for this test). OrderBy orderBy(testQec, qet(values1), {{0, true}, {1, true}}); - checkLocalVocab(orderBy, std::vector{"x", "y1", "y2"}); + checkLocalVocab(orderBy, localVocab1); // SORT operation (the third operation is the sort column). Sort sort(testQec, qet(values1), {0}); - checkLocalVocab(sort, std::vector{"x", "y1", "y2"}); + checkLocalVocab(sort, localVocab1); // DISTINCT operation (the third argument are the indices of the input columns // that are considered for the output; not important for this test). Distinct distinct1(testQec, qet(values1), {0, 1}); - checkLocalVocab(distinct1, std::vector{"x", "y1", "y2"}); + checkLocalVocab(distinct1, localVocab1); // GROUP BY operation. auto groupConcatExpression = @@ -239,25 +246,26 @@ TEST(LocalVocab, propagation) { testQec, {Variable{"?x"}}, {Alias{groupConcatExpression("?y", "|"), Variable{"?concat"}}}, qet(values1)); - checkLocalVocab(groupBy, std::vector{"x", "y1", "y2", "y1|y2"}); + checkLocalVocab( + groupBy, std::vector{"", "", "", "yN1|yN2"}); // DISTINCT again, but after something has been added to the local vocabulary // (to check that the "y1|y2" added by the GROUP BY does not also appear here, // as it did before GroupBy::computeResult copied it's local vocabulary). Distinct distinct2(testQec, qet(values1), {0}); - checkLocalVocab(distinct2, std::vector{"x", "y1", "y2"}); + checkLocalVocab(distinct2, localVocab1); // UNION operation with exactly one non-empty local vocab and with two // non-empy local vocabs. Union union1(testQec, qet(values1), qet(values2)); - checkLocalVocab(union1, std::vector{"x", "y1", "y2"}); + checkLocalVocab(union1, localVocab1); Union union2(testQec, qet(values1), qet(values3)); checkLocalVocab(union2, localVocab13); // MINUS operation with exactly one non-empty local vocab and with // two non-empty local vocabs. Minus minus1(testQec, qet(values1), qet(values2)); - checkLocalVocab(minus1, std::vector{"x", "y1", "y2"}); + checkLocalVocab(minus1, localVocab1); Minus minus2(testQec, qet(values1), qet(values3)); checkLocalVocab(minus2, localVocab13); @@ -267,7 +275,7 @@ TEST(LocalVocab, propagation) { testQec, qet(values1), {std::make_unique(Variable{"?x"}), "Expression ?x"}); - checkLocalVocab(filter, std::vector{"x", "y1", "y2"}); + checkLocalVocab(filter, localVocab1); // BIND operation (the third argument is a `parsedQuery::Bind` object). Bind bind( @@ -275,7 +283,7 @@ TEST(LocalVocab, propagation) { {{std::make_unique(Variable{"?x"}), "Expression ?x"}, Variable{"?z"}}); - checkLocalVocab(bind, std::vector{"x", "y1", "y2"}); + checkLocalVocab(bind, localVocab1); // TRANSITIVE PATH operation. // @@ -285,18 +293,19 @@ TEST(LocalVocab, propagation) { TransitivePathSide left(std::nullopt, 0, Variable{"?x"}); TransitivePathSide right(std::nullopt, 1, Variable{"?y"}); TransitivePath transitivePath(testQec, qet(values1), left, right, 1, 1); - checkLocalVocab(transitivePath, std::vector{"x", "y1", "y2"}); + checkLocalVocab(transitivePath, localVocab1); // PATTERN TRICK operations. HasPredicateScan hasPredicateScan(testQec, qet(values1), 0, Variable{"?z"}); - checkLocalVocab(hasPredicateScan, std::vector{"x", "y1", "y2"}); + checkLocalVocab(hasPredicateScan, localVocab1); Values valuesPatternTrick( - testQec, {{Variable{"?x"}, Variable{"?y"}}, - {{TripleComponent{"x"}, TripleComponent{NO_PATTERN}}, - {TripleComponent{"x"}, TripleComponent{NO_PATTERN}}}}); + testQec, + {{Variable{"?x"}, Variable{"?y"}}, + {{TripleComponent{iri("")}, TripleComponent{NO_PATTERN}}, + {TripleComponent{iri("")}, TripleComponent{NO_PATTERN}}}}); CountAvailablePredicates countAvailablePredictes( testQec, qet(valuesPatternTrick), 0, Variable{"?y"}, Variable{"?count"}); - checkLocalVocab(countAvailablePredictes, {"x"}); + checkLocalVocab(countAvailablePredictes, {""}); // TEXT operations. // // TODO Maybe add tests for the new TextIndexScanFor... classes, From 2bc4d5dcb165ae2558e071927bb8e56460e9d5cf Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 10 Apr 2024 21:49:03 +0200 Subject: [PATCH 08/17] Small round of reviews. --- src/engine/AddCombinedRowToTable.h | 26 +++++++---------- src/engine/CartesianProductJoin.cpp | 2 +- src/engine/Join.cpp | 2 +- src/engine/LocalVocab.cpp | 38 ++++++++++-------------- src/engine/LocalVocab.h | 28 ++++++++++++------ src/engine/Minus.cpp | 3 +- src/engine/MultiColumnJoin.cpp | 3 +- src/engine/OptionalJoin.cpp | 3 +- src/engine/ResultTable.cpp | 6 ++-- src/engine/ResultTable.h | 45 +++++------------------------ src/engine/TransitivePath.cpp | 2 +- src/engine/Union.cpp | 5 ++-- src/global/IndexTypes.h | 9 +++--- src/global/ValueId.h | 10 +++++-- test/ValueIdTest.cpp | 2 +- 15 files changed, 79 insertions(+), 105 deletions(-) diff --git a/src/engine/AddCombinedRowToTable.h b/src/engine/AddCombinedRowToTable.h index 49e212fd3b..ae3e3e94bf 100644 --- a/src/engine/AddCombinedRowToTable.h +++ b/src/engine/AddCombinedRowToTable.h @@ -218,28 +218,23 @@ class AddCombinedRowToIdTable { AD_CORRECTNESS_CHECK(inputLeftAndRight_.has_value()); result.resize(oldSize + nextIndex_); - // Sometimes columns are combined where one value is UNDEF and the other one - // is not. This function very efficiently returns the not-UNDEF value in - // this case. - // TODO If we keep track of the information that one of the - // involved columns contains no UNDEF values at all, we can omit this step - // and simply copy the values from this column without looking at the other - // input. - auto mergeWithUndefined = [](const ValueId a, const ValueId b) { - // NOTE: For localVocabIndices we cannot use the simple bitwise OR. + // Precondition: `a` and `b` compare equal or at least one of them is UNDEF + // If exactly one of them is UNDEF, return the other one, else return any of + // them (they are equal anyway). + auto getJoinValue = [](const ValueId a, const ValueId b) { + // NOTE: For localVocabIndices we might have different pointers that + // compare equal because they point to the same word. Therefore we cannot + // use a simple bitwise operation to handle the "one of them is UNDEF" + // case as we previously did. if (a.isUndefined()) { return b; } return a; - /* - static_assert(ValueId::makeUndefined().getBits() == 0u); - return ValueId::fromBits(a.getBits() | b.getBits()); - */ }; // A lambda that writes the join column with the given `colIdx` to the // `nextResultColIdx`-th column of the result. - auto writeJoinColumn = [&result, &mergeWithUndefined, oldSize, this]( + auto writeJoinColumn = [&result, &getJoinValue, oldSize, this]( size_t colIdx, size_t resultColIdx) { const auto& colLeft = inputLeft().getColumn(colIdx); const auto& colRight = inputRight().getColumn(colIdx); @@ -249,7 +244,8 @@ class AddCombinedRowToIdTable { // Write the matching rows. for (const auto& [targetIndex, sourceIndices] : indexBuffer_) { - auto resultId = mergeWithUndefined(colLeft[sourceIndices[0]], + auto resultId = + getJoinValue(colLeft[sourceIndices[0]], colRight[sourceIndices[1]]); numUndef += static_cast(resultId.isUndefined()); resultCol[oldSize + targetIndex] = resultId; diff --git a/src/engine/CartesianProductJoin.cpp b/src/engine/CartesianProductJoin.cpp index e9059269b0..c1c631f5b7 100644 --- a/src/engine/CartesianProductJoin.cpp +++ b/src/engine/CartesianProductJoin.cpp @@ -202,7 +202,7 @@ ResultTable CartesianProductJoin::computeResult() { auto subResultsDeref = std::views::transform( subResults, [](auto& x) -> decltype(auto) { return *x; }); return {std::move(result), resultSortedOn(), - ResultTable::getSharedLocalVocabFromNonEmptyOf(subResultsDeref)}; + ResultTable::getMergedLocalVocab(subResultsDeref)}; } // ____________________________________________________________________________ diff --git a/src/engine/Join.cpp b/src/engine/Join.cpp index df84b1696a..dc1063c611 100644 --- a/src/engine/Join.cpp +++ b/src/engine/Join.cpp @@ -187,7 +187,7 @@ ResultTable Join::computeResult() { // If only one of the two operands has a non-empty local vocabulary, share // with that one (otherwise, throws an exception). return {std::move(idTable), resultSortedOn(), - ResultTable::getSharedLocalVocabFromNonEmptyOf(*leftRes, *rightRes)}; + ResultTable::getMergedLocalVocab(*leftRes, *rightRes)}; } // _____________________________________________________________________________ diff --git a/src/engine/LocalVocab.cpp b/src/engine/LocalVocab.cpp index aa09eef6f6..e7559a65cf 100644 --- a/src/engine/LocalVocab.cpp +++ b/src/engine/LocalVocab.cpp @@ -11,42 +11,36 @@ // _____________________________________________________________________________ LocalVocab LocalVocab::clone() const { LocalVocab localVocabClone; - localVocabClone.previousSets_ = previousSets_; - localVocabClone.previousSets_.push_back(wordsToIndexesMap_); + localVocabClone.otherWordSets_ = otherWordSets_; + localVocabClone.otherWordSets_.push_back(primaryWordSet_); // Return the clone. return localVocabClone; } +// _____________________________________________________________________________ LocalVocab LocalVocab::merge(const LocalVocab& a, const LocalVocab& b) { LocalVocab res; - auto inserter = std::back_inserter(res.previousSets_); - std::ranges::copy(a.previousSets_, inserter); - std::ranges::copy(b.previousSets_, inserter); - *inserter = a.wordsToIndexesMap_; - *inserter = b.wordsToIndexesMap_; + auto inserter = std::back_inserter(res.otherWordSets_); + std::ranges::copy(a.otherWordSets_, inserter); + std::ranges::copy(b.otherWordSets_, inserter); + *inserter = a.primaryWordSet_; + *inserter = b.primaryWordSet_; return res; } // _____________________________________________________________________________ template LocalVocabIndex LocalVocab::getIndexAndAddIfNotContainedImpl(WordT&& word) { - // The following code contains two subtle, but important optimizations: + // The following code contains a subtle, but important optimizations: // // 1. The variant of `insert` used covers the case that `word` was already // contained in the map as well as the case that it is newly inserted. This // avoids computing the hash for `word` twice in case we see it for the first // time (note that hashing a string is not cheap). - // - // 2. The fact that we have a member variable `nextFreeIndex_` avoids that we - // tentatively have to compute the next free ID every time this function is - // called (even when the ID is not actually needed because the word is already - // contained in the map). - // // TODO Make this work with transparent hashing and use try_emplace. - auto [wordInMapAndIndex, isNewWord] = - wordsToIndexesMap().insert(AlignedStr{std::forward(word)}); - const auto& wordInMap = *wordInMapAndIndex; - return &wordInMap; + auto [wordIterator, isNewWord] = + primaryWordSet().insert(StringAligned16{std::forward(word)}); + return std::addressof(*wordIterator); } // _____________________________________________________________________________ @@ -65,8 +59,8 @@ std::optional LocalVocab::getIndexOrNullopt( const std::string& word) const { // TODO Maybe we can make this work with transparent hashing, // but if this is only a testing API this is probably not worth the hassle. - auto localVocabIndex = wordsToIndexesMap().find(AlignedStr{word}); - if (localVocabIndex != wordsToIndexesMap().end()) { + auto localVocabIndex = primaryWordSet().find(StringAligned16{word}); + if (localVocabIndex != primaryWordSet().end()) { return &(*localVocabIndex); } else { return std::nullopt; @@ -81,8 +75,8 @@ const std::string& LocalVocab::getWord(LocalVocabIndex localVocabIndex) const { // _____________________________________________________________________________ std::vector LocalVocab::getAllWordsForTesting() const { std::vector result; - std::ranges::copy(wordsToIndexesMap(), std::back_inserter(result)); - for (const auto& previous : previousSets_) { + std::ranges::copy(primaryWordSet(), std::back_inserter(result)); + for (const auto& previous : otherWordSets_) { std::ranges::copy(*previous, std::back_inserter(result)); } return result; diff --git a/src/engine/LocalVocab.h b/src/engine/LocalVocab.h index 5d57222414..c2d7f6a8ce 100644 --- a/src/engine/LocalVocab.h +++ b/src/engine/LocalVocab.h @@ -24,13 +24,17 @@ class LocalVocab { // A map of the words in the local vocabulary to their local IDs. This is a // node hash map because we need the addresses of the words (which are of type // `std::string`) to remain stable over their lifetime in the hash map because - // we refer to them in `wordsToIdsMap_` below. - using Set = absl::node_hash_set; - std::vector> previousSets_; - std::shared_ptr wordsToIndexesMap_ = std::make_shared(); + // we hand out pointers to them. + using Set = absl::node_hash_set; + std::shared_ptr primaryWordSet_ = std::make_shared(); - auto& wordsToIndexesMap() { return *wordsToIndexesMap_; } - const auto& wordsToIndexesMap() const { return *wordsToIndexesMap_; } + // Local vocabularies from child operations that were merged into this + // vocabulary s.t. the pointers are kept alive. They have to be `const` + // because they are possibly shared concurrently (for example via the cache). + std::vector> otherWordSets_; + + auto& primaryWordSet() { return *primaryWordSet_; } + const auto& primaryWordSet() const { return *primaryWordSet_; } public: // Create a new, empty local vocabulary. @@ -40,7 +44,9 @@ class LocalVocab { LocalVocab(const LocalVocab&) = delete; LocalVocab& operator=(const LocalVocab&) = delete; - // Make a deep copy explicitly. + // Make a logical copy. The clone will have an empty primary set so it can + // safely be modified. The contents are copied as shared pointers to const, so + // the function runs in linear time in the number of word sets. LocalVocab clone() const; // Moving a local vocabulary is not problematic (though the typical use case @@ -60,9 +66,10 @@ class LocalVocab { const std::string& word) const; // The number of words in the vocabulary. + // Note: This is not constant time, but linear in the number of word sets. size_t size() const { - auto result = wordsToIndexesMap().size(); - for (const auto& previous : previousSets_) { + auto result = primaryWordSet().size(); + for (const auto& previous : otherWordSets_) { result += previous->size(); } return result; @@ -74,8 +81,11 @@ class LocalVocab { // Return a const reference to the word. const std::string& getWord(LocalVocabIndex localVocabIndex) const; + // Create a local vocab that contains and keeps alive all the words from `a` + // as well as `b`. static LocalVocab merge(const LocalVocab& a, const LocalVocab& b); + // Return all the words from all the word sets as a vector. std::vector getAllWordsForTesting() const; private: diff --git a/src/engine/Minus.cpp b/src/engine/Minus.cpp index 00ebd3be84..8b5c12c9ad 100644 --- a/src/engine/Minus.cpp +++ b/src/engine/Minus.cpp @@ -56,8 +56,7 @@ ResultTable Minus::computeResult() { // If only one of the two operands has a non-empty local vocabulary, share // with that one (otherwise, throws an exception). return {std::move(idTable), resultSortedOn(), - ResultTable::getSharedLocalVocabFromNonEmptyOf(*leftResult, - *rightResult)}; + ResultTable::getMergedLocalVocab(*leftResult, *rightResult)}; } // _____________________________________________________________________________ diff --git a/src/engine/MultiColumnJoin.cpp b/src/engine/MultiColumnJoin.cpp index 0de919c071..0d4de5c978 100644 --- a/src/engine/MultiColumnJoin.cpp +++ b/src/engine/MultiColumnJoin.cpp @@ -86,8 +86,7 @@ ResultTable MultiColumnJoin::computeResult() { // If only one of the two operands has a non-empty local vocabulary, share // with that one (otherwise, throws an exception). return {std::move(idTable), resultSortedOn(), - ResultTable::getSharedLocalVocabFromNonEmptyOf(*leftResult, - *rightResult)}; + ResultTable::getMergedLocalVocab(*leftResult, *rightResult)}; } // _____________________________________________________________________________ diff --git a/src/engine/OptionalJoin.cpp b/src/engine/OptionalJoin.cpp index 89917189b1..6ec87dc456 100644 --- a/src/engine/OptionalJoin.cpp +++ b/src/engine/OptionalJoin.cpp @@ -115,8 +115,7 @@ ResultTable OptionalJoin::computeResult() { // If only one of the two operands has a non-empty local vocabulary, share // with that one (otherwise, throws an exception). return {std::move(idTable), resultSortedOn(), - ResultTable::getSharedLocalVocabFromNonEmptyOf(*leftResult, - *rightResult)}; + ResultTable::getMergedLocalVocab(*leftResult, *rightResult)}; } // _____________________________________________________________________________ diff --git a/src/engine/ResultTable.cpp b/src/engine/ResultTable.cpp index 79fde3d57a..73edbaa506 100644 --- a/src/engine/ResultTable.cpp +++ b/src/engine/ResultTable.cpp @@ -23,10 +23,10 @@ string ResultTable::asDebugString() const { } // _____________________________________________________________________________ -auto ResultTable::getSharedLocalVocabFromNonEmptyOf( - const ResultTable& resultTable1, const ResultTable& resultTable2) +auto ResultTable::getMergedLocalVocab(const ResultTable& resultTable1, + const ResultTable& resultTable2) -> SharedLocalVocabWrapper { - return getSharedLocalVocabFromNonEmptyOf( + return getMergedLocalVocab( std::array{std::cref(resultTable1), std::cref(resultTable2)}); } diff --git a/src/engine/ResultTable.h b/src/engine/ResultTable.h index c7df4c4713..d072ef5c39 100644 --- a/src/engine/ResultTable.h +++ b/src/engine/ResultTable.h @@ -133,54 +133,25 @@ class ResultTable { return SharedLocalVocabWrapper{localVocab_}; } - // Like `getSharedLocalVocabFrom`, but takes more than one result and assumes - // that exactly one of the local vocabularies is empty and gets the shared - // local vocab from the non-empty one (if all are empty, arbitrarily share - // with the first one). - // - // TODO: Eventually, we want to be able to merge two non-empty local - // vocabularies, but that requires more work since we have to rewrite IDs then - // (from the previous separate local vocabularies to the new merged one). - static SharedLocalVocabWrapper getSharedLocalVocabFromNonEmptyOf( + // Like `getSharedLocalVocabFrom`, but takes more than one result and merges + // all the corresponding local vocabs. + static SharedLocalVocabWrapper getMergedLocalVocab( const ResultTable& resultTable1, const ResultTable& resultTable2); // Overload for more than two `ResultTables` template requires std::convertible_to, const ResultTable&> - static SharedLocalVocabWrapper getSharedLocalVocabFromNonEmptyOf( - R&& subResults) { - // TODO Implement this much better. + static SharedLocalVocabWrapper getMergedLocalVocab(R&& subResults) { + // We could implement the k-way merge much smarter, but in practice we + // rarely have more than two inputs, so the performance should be fine (we + // are only copying a few shared pointers after all). + // TODO implement `LocalVocab::merge` in an n-way fashion`. LocalVocab res; for (const ResultTable& table : subResults) { res = LocalVocab::merge(res, *table.localVocab_); } return SharedLocalVocabWrapper{std::move(res)}; - /* - AD_CONTRACT_CHECK(!std::ranges::empty(subResults)); - auto hasNonEmptyVocab = [](const ResultTable& tbl) { - return !tbl.localVocab_->empty(); - }; - auto numNonEmptyVocabs = - std::ranges::count_if(subResults, hasNonEmptyVocab); - if (numNonEmptyVocabs > 1) { - throw std::runtime_error( - "Merging of more than one non-empty local vocabularies is currently " - "not supported, please contact the developers"); - } - // The static casts in the following are needed to make this code work for - // types that are implicitly convertible to `const ResultTable&`, in - // particular `std::reference_wrapper`. - if (numNonEmptyVocabs == 0) { - return SharedLocalVocabWrapper{ - static_cast(*subResults.begin()).localVocab_}; - } else { - return SharedLocalVocabWrapper{ - static_cast( - *std::ranges::find_if(subResults, hasNonEmptyVocab)) - .localVocab_}; - } - */ } // Get a (deep) copy of the local vocabulary from the given result. Use this diff --git a/src/engine/TransitivePath.cpp b/src/engine/TransitivePath.cpp index c086561faf..073085b89a 100644 --- a/src/engine/TransitivePath.cpp +++ b/src/engine/TransitivePath.cpp @@ -256,7 +256,7 @@ ResultTable TransitivePath::computeResult() { sideRes->idTable()); return {std::move(idTable), resultSortedOn(), - ResultTable::getSharedLocalVocabFromNonEmptyOf(*sideRes, *subRes)}; + ResultTable::getMergedLocalVocab(*sideRes, *subRes)}; }; if (lhs_.isBoundVariable()) { diff --git a/src/engine/Union.cpp b/src/engine/Union.cpp index 7f3269fe37..c8615c5062 100644 --- a/src/engine/Union.cpp +++ b/src/engine/Union.cpp @@ -178,9 +178,8 @@ ResultTable Union::computeResult() { LOG(DEBUG) << "Union result computation done" << std::endl; // If only one of the two operands has a non-empty local vocabulary, share // with that one (otherwise, throws an exception). - return ResultTable{ - std::move(idTable), resultSortedOn(), - ResultTable::getSharedLocalVocabFromNonEmptyOf(*subRes1, *subRes2)}; + return ResultTable{std::move(idTable), resultSortedOn(), + ResultTable::getMergedLocalVocab(*subRes1, *subRes2)}; } void Union::computeUnion( diff --git a/src/global/IndexTypes.h b/src/global/IndexTypes.h index d2e46da5e1..2260bae25e 100644 --- a/src/global/IndexTypes.h +++ b/src/global/IndexTypes.h @@ -14,13 +14,14 @@ // requests. using VocabIndex = ad_utility::TypedIndex; -// TODO Where to put this best... -struct alignas(16) AlignedStr : public std::string { +// A `std::string` that is aligned to 16 bytes s.t. pointers always end with 4 +// bits that are zero and that are reused for payloads in the `ValueId` class. +struct alignas(16) StringAligned16 : public std::string { using std::string::basic_string; - explicit AlignedStr(std::string s) : std::string{std::move(s)} {} + explicit StringAligned16(std::string s) : std::string{std::move(s)} {} }; // using LocalVocabIndex = ad_utility::TypedIndex; -using LocalVocabIndex = const AlignedStr*; +using LocalVocabIndex = const StringAligned16*; using TextRecordIndex = ad_utility::TypedIndex; using WordVocabIndex = ad_utility::TypedIndex; using BlankNodeIndex = ad_utility::TypedIndex; diff --git a/src/global/ValueId.h b/src/global/ValueId.h index 0a031d2bce..33408479e7 100644 --- a/src/global/ValueId.h +++ b/src/global/ValueId.h @@ -123,6 +123,11 @@ class ValueId { /// Equality comparison is performed directly on the underlying /// representation. + // NOTE: (Also for the operator<=> below: These comparisons only work + // correctly if we only store entries in the local vocab that are NOT part of + // the vocabulary. This is currently not true for expression results from + // GROUP BY and BIND operations (for performance reaons). So a join with such + // results will currently lead to wrong results. constexpr bool operator==(const ValueId& other) const { if (getDatatype() == Datatype::LocalVocabIndex && other.getDatatype() == Datatype::LocalVocabIndex) [[unlikely]] { @@ -139,8 +144,6 @@ class ValueId { /// For doubles it is first the positive doubles in order, then the negative /// doubles in reversed order. This is a direct consequence of comparing the /// bit representation of these values as unsigned integers. - /// TODO Is this ordering also consistent for corner cases of doubles - /// (inf, nan, denormalized numbers, negative zero)? constexpr auto operator<=>(const ValueId& other) const { if (getDatatype() == Datatype::LocalVocabIndex && other.getDatatype() == Datatype::LocalVocabIndex) [[unlikely]] { @@ -221,6 +224,9 @@ class ValueId { return makeFromIndex(index.get(), Datatype::TextRecordIndex); } static ValueId makeFromLocalVocabIndex(LocalVocabIndex index) { + // The last `numDatatypeBits` of a `LocalVocabIndex` are always zero, so we + // can reuse them for the datatype. + static_assert(alignof(decltype(*index)) >= (1u << numDatatypeBits)); return makeFromIndex(reinterpret_cast(index) >> numDatatypeBits, Datatype::LocalVocabIndex); } diff --git a/test/ValueIdTest.cpp b/test/ValueIdTest.cpp index 169291e439..17bc6b2e63 100644 --- a/test/ValueIdTest.cpp +++ b/test/ValueIdTest.cpp @@ -296,7 +296,7 @@ TEST(ValueId, toDebugString) { test(ValueId::makeFromBool(false), "Bool:false"); test(ValueId::makeFromBool(true), "Bool:true"); test(makeVocabId(15), "VocabIndex:15"); - AlignedStr str{"SomeValue"}; + StringAligned16 str{"SomeValue"}; test(ValueId::makeFromLocalVocabIndex(&str), "LocalVocabIndex:SomeValue"); test(makeTextRecordId(37), "TextRecordIndex:37"); test(makeWordVocabId(42), "WordVocabIndex:42"); From 556c293f2e3457f1f7241dcec922d63d862f022a Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 11 Apr 2024 11:21:15 +0200 Subject: [PATCH 09/17] Fixed the remaining unit tests. --- src/engine/AddCombinedRowToTable.h | 3 +-- src/engine/LocalVocab.cpp | 27 +++++++++---------- src/engine/LocalVocab.h | 6 ++--- src/engine/ResultTable.h | 10 +++---- src/global/IndexTypes.h | 1 - src/global/ValueIdComparators.h | 43 ++++++++++++++++++++++++++---- test/LocalVocabTest.cpp | 40 ++++++++++++++++++++++++--- test/ValueIdComparatorsTest.cpp | 12 ++++----- test/ValueIdTest.cpp | 12 ++++----- test/ValueIdTestHelpers.h | 14 +++++----- test/ValuesTest.cpp | 5 ++-- test/util/IdTestHelpers.h | 12 ++++----- 12 files changed, 121 insertions(+), 64 deletions(-) diff --git a/src/engine/AddCombinedRowToTable.h b/src/engine/AddCombinedRowToTable.h index ae3e3e94bf..75c2bcd848 100644 --- a/src/engine/AddCombinedRowToTable.h +++ b/src/engine/AddCombinedRowToTable.h @@ -245,8 +245,7 @@ class AddCombinedRowToIdTable { // Write the matching rows. for (const auto& [targetIndex, sourceIndices] : indexBuffer_) { auto resultId = - getJoinValue(colLeft[sourceIndices[0]], - colRight[sourceIndices[1]]); + getJoinValue(colLeft[sourceIndices[0]], colRight[sourceIndices[1]]); numUndef += static_cast(resultId.isUndefined()); resultCol[oldSize + targetIndex] = resultId; } diff --git a/src/engine/LocalVocab.cpp b/src/engine/LocalVocab.cpp index e7559a65cf..a4faa54817 100644 --- a/src/engine/LocalVocab.cpp +++ b/src/engine/LocalVocab.cpp @@ -18,29 +18,26 @@ LocalVocab LocalVocab::clone() const { } // _____________________________________________________________________________ -LocalVocab LocalVocab::merge(const LocalVocab& a, const LocalVocab& b) { +LocalVocab LocalVocab::merge(std::span vocabs) { LocalVocab res; auto inserter = std::back_inserter(res.otherWordSets_); - std::ranges::copy(a.otherWordSets_, inserter); - std::ranges::copy(b.otherWordSets_, inserter); - *inserter = a.primaryWordSet_; - *inserter = b.primaryWordSet_; + for (const auto* vocab : vocabs) { + std::ranges::copy(vocab->otherWordSets_, inserter); + *inserter = vocab->primaryWordSet_; + } return res; } // _____________________________________________________________________________ template LocalVocabIndex LocalVocab::getIndexAndAddIfNotContainedImpl(WordT&& word) { - // The following code contains a subtle, but important optimizations: - // - // 1. The variant of `insert` used covers the case that `word` was already - // contained in the map as well as the case that it is newly inserted. This - // avoids computing the hash for `word` twice in case we see it for the first - // time (note that hashing a string is not cheap). - // TODO Make this work with transparent hashing and use try_emplace. + // TODO As soon as we store `IdOrString` in the local vocab, we + // should definitely use `insert` instead of `emplace` here for some + // transparency optimizations. We currently need `emplace` because of the + // explicit conversion from `string` to `AlignedString16`. auto [wordIterator, isNewWord] = - primaryWordSet().insert(StringAligned16{std::forward(word)}); - return std::addressof(*wordIterator); + primaryWordSet().emplace(std::forward(word)); + return std::to_address(wordIterator); } // _____________________________________________________________________________ @@ -61,7 +58,7 @@ std::optional LocalVocab::getIndexOrNullopt( // but if this is only a testing API this is probably not worth the hassle. auto localVocabIndex = primaryWordSet().find(StringAligned16{word}); if (localVocabIndex != primaryWordSet().end()) { - return &(*localVocabIndex); + return std::to_address(localVocabIndex); } else { return std::nullopt; } diff --git a/src/engine/LocalVocab.h b/src/engine/LocalVocab.h index c2d7f6a8ce..fdd730c521 100644 --- a/src/engine/LocalVocab.h +++ b/src/engine/LocalVocab.h @@ -81,9 +81,9 @@ class LocalVocab { // Return a const reference to the word. const std::string& getWord(LocalVocabIndex localVocabIndex) const; - // Create a local vocab that contains and keeps alive all the words from `a` - // as well as `b`. - static LocalVocab merge(const LocalVocab& a, const LocalVocab& b); + // Create a local vocab that contains and keeps alive all the words from each + // of the `vocabs`. + static LocalVocab merge(std::span vocabs); // Return all the words from all the word sets as a vector. std::vector getAllWordsForTesting() const; diff --git a/src/engine/ResultTable.h b/src/engine/ResultTable.h index d072ef5c39..605f9c64a2 100644 --- a/src/engine/ResultTable.h +++ b/src/engine/ResultTable.h @@ -143,15 +143,11 @@ class ResultTable { requires std::convertible_to, const ResultTable&> static SharedLocalVocabWrapper getMergedLocalVocab(R&& subResults) { - // We could implement the k-way merge much smarter, but in practice we - // rarely have more than two inputs, so the performance should be fine (we - // are only copying a few shared pointers after all). - // TODO implement `LocalVocab::merge` in an n-way fashion`. - LocalVocab res; + std::vector vocabs; for (const ResultTable& table : subResults) { - res = LocalVocab::merge(res, *table.localVocab_); + vocabs.push_back(std::to_address(table.localVocab_)); } - return SharedLocalVocabWrapper{std::move(res)}; + return SharedLocalVocabWrapper{LocalVocab::merge(vocabs)}; } // Get a (deep) copy of the local vocabulary from the given result. Use this diff --git a/src/global/IndexTypes.h b/src/global/IndexTypes.h index 2260bae25e..adde06bbbc 100644 --- a/src/global/IndexTypes.h +++ b/src/global/IndexTypes.h @@ -20,7 +20,6 @@ struct alignas(16) StringAligned16 : public std::string { using std::string::basic_string; explicit StringAligned16(std::string s) : std::string{std::move(s)} {} }; -// using LocalVocabIndex = ad_utility::TypedIndex; using LocalVocabIndex = const StringAligned16*; using TextRecordIndex = ad_utility::TypedIndex; using WordVocabIndex = ad_utility::TypedIndex; diff --git a/src/global/ValueIdComparators.h b/src/global/ValueIdComparators.h index 02f9d2b5ce..48a5119aee 100644 --- a/src/global/ValueIdComparators.h +++ b/src/global/ValueIdComparators.h @@ -15,6 +15,30 @@ namespace valueIdComparators { // This enum encodes the different numeric comparators LessThan, LessEqual, // Equal, NotEqual, GreaterEqual, GreaterThan. enum struct Comparison { LT, LE, EQ, NE, GE, GT }; +inline std::ostream& operator<<(std::ostream& str, Comparison c) { + switch (c) { + using enum Comparison; + case LT: + str << "LT"; + break; + case LE: + str << "LE"; + break; + case EQ: + str << "EQ"; + break; + case NE: + str << "NE"; + break; + case GE: + str << "GE"; + break; + case GT: + str << "GT"; + break; + } + return str; +} // This enum can be used to configure the behavior of the `compareIds` method // below in the case when two `Id`s have incompatible datatypes (e.g. @@ -72,9 +96,7 @@ inline ValueId toValueId(ComparisonResult comparisonResult) { // negative doubles in reversed order. In detail it is [0.0 ... infinity, NaN, // -0.0, ... -infinity]. This is a direct consequence of comparing the bit // representation of these values as unsigned integers. -inline bool compareByBits(ValueId a, ValueId b) { - return a.getBits() < b.getBits(); -} +inline bool compareByBits(ValueId a, ValueId b) { return a < b; } namespace detail { @@ -308,6 +330,7 @@ inline std::vector> getRangesForIndexTypes( RangeFilter rangeFilter{comparison}; auto [eqBegin, eqEnd] = std::equal_range(beginType, endType, valueId, &compareByBits); + rangeFilter.addSmaller(beginType, eqBegin); rangeFilter.addEqual(eqBegin, eqEnd); rangeFilter.addGreater(eqEnd, endType); @@ -462,8 +485,18 @@ ComparisonResult compareIdsImpl(ValueId a, ValueId b, auto comparator) { } } - auto visitor = [comparator](const auto& aValue, - const auto& bValue) -> ComparisonResult { + auto visitor = [comparator]( + const A& aValue, const B& bValue) -> ComparisonResult { + if constexpr (std::is_same_v) { + if constexpr (!std::is_same_v) { + AD_FAIL(); + } else { + // TODO This is one of the places that has to be changed once + // we want to implement correct comparisons for the local vocab that use + // ICU collation. + return fromBool(std::invoke(comparator, *aValue, *bValue)); + } + } if constexpr (requires() { std::invoke(comparator, aValue, bValue); }) { return fromBool(std::invoke(comparator, aValue, bValue)); } else { diff --git a/test/LocalVocabTest.cpp b/test/LocalVocabTest.cpp index fc7746a09f..828d3b7746 100644 --- a/test/LocalVocabTest.cpp +++ b/test/LocalVocabTest.cpp @@ -103,8 +103,10 @@ TEST(LocalVocab, clone) { // Create a small local vocabulary. size_t localVocabSize = 100; LocalVocab localVocabOriginal; - for (const auto& word : getTestCollectionOfWords(localVocabSize)) { - localVocabOriginal.getIndexAndAddIfNotContained(word); + std::vector indices; + auto inputWords = getTestCollectionOfWords(localVocabSize); + for (const auto& word : inputWords) { + indices.push_back(localVocabOriginal.getIndexAndAddIfNotContained(word)); } ASSERT_EQ(localVocabOriginal.size(), localVocabSize); @@ -116,7 +118,39 @@ TEST(LocalVocab, clone) { ASSERT_THAT(localVocabClone.getAllWordsForTesting(), ::testing::UnorderedElementsAreArray( localVocabOriginal.getAllWordsForTesting())); - // TODO Maybe also test the indices. + + // Test that the indices are still valid after the original vocabulary was + // destroyed. + localVocabOriginal = LocalVocab{}; + + for (size_t i = 0; i < inputWords.size(); ++i) { + EXPECT_EQ(*indices[i], inputWords[i]); + } +} +// _____________________________________________________________________________ +TEST(LocalVocab, merge) { + // Create a small local vocabulary. + std::vector indices; + LocalVocab vocA; + LocalVocab vocB; + indices.push_back(vocA.getIndexAndAddIfNotContained("oneA")); + indices.push_back(vocA.getIndexAndAddIfNotContained("twoA")); + indices.push_back(vocA.getIndexAndAddIfNotContained("oneB")); + indices.push_back(vocA.getIndexAndAddIfNotContained("twoB")); + + // Clone it and test that the clone contains the same words, but under + // different addresses (that is, the word strings were deeply copied). + auto vocabs = std::vector{&std::as_const(vocA), &std::as_const(vocB)}; + LocalVocab localVocabMerged = LocalVocab::merge(vocabs); + ASSERT_EQ(localVocabMerged.size(), 4u); + ASSERT_THAT(localVocabMerged.getAllWordsForTesting(), + ::testing::UnorderedElementsAre("oneA", "twoA", "oneB", "twoB")); + vocA = LocalVocab{}; + vocB = LocalVocab{}; + EXPECT_EQ(*indices[0], "oneA"); + EXPECT_EQ(*indices[1], "twoA"); + EXPECT_EQ(*indices[2], "oneB"); + EXPECT_EQ(*indices[3], "twoB"); } // _____________________________________________________________________________ diff --git a/test/ValueIdComparatorsTest.cpp b/test/ValueIdComparatorsTest.cpp index 6ca68eb7a0..c9641b00b3 100644 --- a/test/ValueIdComparatorsTest.cpp +++ b/test/ValueIdComparatorsTest.cpp @@ -77,20 +77,23 @@ auto testGetRangesForId(auto begin, auto end, ValueId id, using enum ComparisonResult; for (auto [rangeBegin, rangeEnd] : ranges) { while (it != rangeBegin) { - ASSERT_FALSE(isMatching(*it, id)) << *it << ' ' << id; + ASSERT_FALSE(isMatching(*it, id)) << *it << ' ' << id << comparison; auto expected = isMatchingDatatype(*it) ? False : Undef; ASSERT_EQ(compareIds(*it, id, comparison), expected) << *it << ' ' << id; ++it; } while (it != rangeEnd) { - ASSERT_TRUE(isMatching(*it, id)) << *it << ' ' << id; + ASSERT_TRUE(isMatching(*it, id)) << *it << ' ' << id << comparison; ASSERT_EQ(compareIds(*it, id, comparison), True) << *it << ' ' << id; ++it; } } while (it != end) { - ASSERT_FALSE(isMatching(*it, id)); + if (isMatching(*it, id)) { + LOG(INFO) << "bum" << std::endl; + } + ASSERT_FALSE(isMatching(*it, id)) << *it << ", " << id << comparison; auto expected = isMatchingDatatype(*it) ? False : Undef; ASSERT_EQ(compareIds(*it, id, comparison), expected) << *it << ' ' << id; ++it; @@ -213,8 +216,6 @@ auto testGetRangesForEqualIds(auto begin, auto end, ValueId idBegin, // Test that `getRangesFromId` works correctly for `ValueId`s of the unsigned // index types (`VocabIndex`, `TextRecordIndex`, `LocalVocabIndex`, // `WordVocabIndex`). -// TODO Reinstate all the tests.. -/* TEST(ValueIdComparators, IndexTypes) { auto ids = makeRandomIds(); std::sort(ids.begin(), ids.end(), compareByBits); @@ -257,7 +258,6 @@ TEST(ValueIdComparators, IndexTypes) { testImpl.operator()(&getLocalVocabIndex); testImpl.operator()(&getWordVocabIndex); } - */ // _______________________________________________________________________ TEST(ValueIdComparators, undefinedWithItself) { diff --git a/test/ValueIdTest.cpp b/test/ValueIdTest.cpp index 17bc6b2e63..ca278bd5d3 100644 --- a/test/ValueIdTest.cpp +++ b/test/ValueIdTest.cpp @@ -121,7 +121,11 @@ TEST(ValueId, Indices) { testRandomIds(&makeTextRecordId, &getTextRecordIndex, Datatype::TextRecordIndex); testRandomIds(&makeVocabId, &getVocabIndex, Datatype::VocabIndex); - testRandomIds(&makeLocalVocabId, &getLocalVocabIndex, + + auto localVocabWordToInt = [](const auto& input) { + return std::atoll(getLocalVocabIndex(input).c_str()); + }; + testRandomIds(&makeLocalVocabId, localVocabWordToInt, Datatype::LocalVocabIndex); testRandomIds(&makeWordVocabId, &getWordVocabIndex, Datatype::WordVocabIndex); } @@ -131,7 +135,6 @@ TEST(ValueId, Undefined) { ASSERT_EQ(id.getDatatype(), Datatype::Undefined); } -/* TEST(ValueId, OrderingDifferentDatatypes) { auto ids = makeRandomIds(); std::sort(ids.begin(), ids.end()); @@ -142,9 +145,7 @@ TEST(ValueId, OrderingDifferentDatatypes) { ASSERT_TRUE( std::is_sorted(ids.begin(), ids.end(), compareByDatatypeAndIndexTypes)); } - */ -/* TEST(ValueId, IndexOrdering) { auto testOrder = [](auto makeIdFromIndex, auto getIndexFromId) { std::vector ids; @@ -168,7 +169,6 @@ TEST(ValueId, IndexOrdering) { testOrder(&makeWordVocabId, &getWordVocabIndex); testOrder(&makeTextRecordId, &getTextRecordIndex); } - */ TEST(ValueId, DoubleOrdering) { auto ids = makeRandomDoubleIds(); @@ -263,7 +263,6 @@ TEST(ValueId, Serialization) { } } -/* TEST(ValueId, Hashing) { auto ids = makeRandomIds(); ad_utility::HashSet idsWithoutDuplicates; @@ -282,7 +281,6 @@ TEST(ValueId, Hashing) { ASSERT_EQ(ids, idsWithoutDuplicatesAsVector); } - */ TEST(ValueId, toDebugString) { auto test = [](ValueId id, std::string_view expected) { diff --git a/test/ValueIdTestHelpers.h b/test/ValueIdTestHelpers.h index b4f6cddce3..19aba006a3 100644 --- a/test/ValueIdTestHelpers.h +++ b/test/ValueIdTestHelpers.h @@ -5,8 +5,9 @@ #ifndef QLEVER_VALUEIDTESTHELPERS_H #define QLEVER_VALUEIDTESTHELPERS_H -#include "../src/global/ValueId.h" -#include "../src/util/Random.h" +#include "./util/IdTestHelpers.h" +#include "global/ValueId.h" +#include "util/Random.h" // Enabling cheaper unit tests when building in Debug mode #ifdef QLEVER_RUN_EXPENSIVE_TESTS @@ -45,8 +46,7 @@ inline ValueId makeVocabId(uint64_t value) { return ValueId::makeFromVocabIndex(VocabIndex::make(value)); } inline ValueId makeLocalVocabId(uint64_t value) { - return ValueId::makeFromLocalVocabIndex( - std::bit_cast(value << 4)); + return ad_utility::testing::LocalVocabId(value); } inline ValueId makeTextRecordId(uint64_t value) { return ValueId::makeFromTextRecordIndex(TextRecordIndex::make(value)); @@ -60,8 +60,9 @@ inline ValueId makeBlankNodeId(uint64_t value) { inline uint64_t getVocabIndex(ValueId id) { return id.getVocabIndex().get(); } // TODO Make the tests more precise for the localVocabIndices. -inline uint64_t getLocalVocabIndex(ValueId id) { - return std::bit_cast(id.getLocalVocabIndex()) >> 4; +inline std::string getLocalVocabIndex(ValueId id) { + AD_CORRECTNESS_CHECK(id.getDatatype() == Datatype::LocalVocabIndex); + return *id.getLocalVocabIndex(); } inline uint64_t getTextRecordIndex(ValueId id) { return id.getTextRecordIndex().get(); @@ -106,6 +107,7 @@ inline auto makeRandomDoubleIds = []() { ad_utility::randomShuffle(ids.begin(), ids.end()); return ids; }; + inline auto makeRandomIds = []() { std::vector ids = makeRandomDoubleIds(); addIdsFromGenerator(indexGenerator, &makeVocabId, ids); diff --git a/test/ValuesTest.cpp b/test/ValuesTest.cpp index 87fc72e1ab..001eab9116 100644 --- a/test/ValuesTest.cpp +++ b/test/ValuesTest.cpp @@ -74,11 +74,12 @@ TEST(Values, computeResult) { const auto& table = result->idTable(); Id x = ad_utility::testing::makeGetId(testQec->getIndex())(""); auto I = ad_utility::testing::IntId; - auto L = ad_utility::testing::LocalVocabId; auto l = result->localVocab().getIndexOrNullopt(""); ASSERT_TRUE(l.has_value()); auto U = Id::makeUndefined(); - ASSERT_EQ(table, makeIdTableFromVector({{I(12), x}, {U, L(l.value())}})); + ASSERT_EQ(table, + makeIdTableFromVector( + {{I(12), x}, {U, Id::makeFromLocalVocabIndex(l.value())}})); } // Check that if the number of variables and the number of values in each row diff --git a/test/util/IdTestHelpers.h b/test/util/IdTestHelpers.h index 8fbb448131..1449ab0f88 100644 --- a/test/util/IdTestHelpers.h +++ b/test/util/IdTestHelpers.h @@ -4,6 +4,7 @@ #pragma once +#include "engine/LocalVocab.h" #include "global/Id.h" // Lambdas to simply create an `Id` with a given value and type during unit @@ -18,13 +19,10 @@ inline auto VocabId = [](const auto& v) { return Id::makeFromVocabIndex(VocabIndex::make(v)); }; -inline auto LocalVocabId = [](const T& v) { - if constexpr (std::integral) { - return Id::makeFromLocalVocabIndex( - std::bit_cast(static_cast(v))); - } else { - return Id::makeFromLocalVocabIndex(std::bit_cast(v)); - } +inline auto LocalVocabId = [](std::integral auto v) { + static ad_utility::Synchronized localVocab; + return Id::makeFromLocalVocabIndex( + localVocab.wlock()->getIndexAndAddIfNotContained(std::to_string(v))); }; inline auto TextRecordId = [](const auto& t) { From 4bd0884d857aedf51cf373b1c5e4275aeb727c3b Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 11 Apr 2024 11:58:06 +0200 Subject: [PATCH 10/17] Fix compilation and hopefully coverage. --- src/engine/LocalVocab.h | 1 + src/global/ValueIdComparators.h | 24 ------------------------ test/ValueIdComparatorsTest.cpp | 26 ++++++++++++++++++++++++++ 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/src/engine/LocalVocab.h b/src/engine/LocalVocab.h index fdd730c521..83c3bccae5 100644 --- a/src/engine/LocalVocab.h +++ b/src/engine/LocalVocab.h @@ -6,6 +6,7 @@ #include #include +#include #include #include diff --git a/src/global/ValueIdComparators.h b/src/global/ValueIdComparators.h index 48a5119aee..0c0aae44b3 100644 --- a/src/global/ValueIdComparators.h +++ b/src/global/ValueIdComparators.h @@ -15,30 +15,6 @@ namespace valueIdComparators { // This enum encodes the different numeric comparators LessThan, LessEqual, // Equal, NotEqual, GreaterEqual, GreaterThan. enum struct Comparison { LT, LE, EQ, NE, GE, GT }; -inline std::ostream& operator<<(std::ostream& str, Comparison c) { - switch (c) { - using enum Comparison; - case LT: - str << "LT"; - break; - case LE: - str << "LE"; - break; - case EQ: - str << "EQ"; - break; - case NE: - str << "NE"; - break; - case GE: - str << "GE"; - break; - case GT: - str << "GT"; - break; - } - return str; -} // This enum can be used to configure the behavior of the `compareIds` method // below in the case when two `Id`s have incompatible datatypes (e.g. diff --git a/test/ValueIdComparatorsTest.cpp b/test/ValueIdComparatorsTest.cpp index c9641b00b3..0f13e8879b 100644 --- a/test/ValueIdComparatorsTest.cpp +++ b/test/ValueIdComparatorsTest.cpp @@ -12,6 +12,32 @@ #include "util/Random.h" using namespace valueIdComparators; +namespace valueIdComparators { +inline std::ostream& operator<<(std::ostream& str, Comparison c) { + switch (c) { + using enum Comparison; + case LT: + str << "LT"; + break; + case LE: + str << "LE"; + break; + case EQ: + str << "EQ"; + break; + case NE: + str << "NE"; + break; + case GE: + str << "GE"; + break; + case GT: + str << "GT"; + break; + } + return str; +} +} // namespace valueIdComparators using ad_utility::source_location; TEST(ValueIdComparators, GetRangeForDatatype) { From 71d7a8be4390605cb09b290c11c34ceffcb24b6f Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 11 Apr 2024 14:16:29 +0200 Subject: [PATCH 11/17] Update the native macos build to clang 18 to fix std::to_address. --- .github/workflows/macos.yml | 20 +++++++++---------- .../{clang-16-macos => clang-18-macos} | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) rename conanprofiles/{clang-16-macos => clang-18-macos} (84%) diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index d0efe2c515..21b30f31e3 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -39,14 +39,14 @@ jobs: - name: Install dependencies run: | - brew install llvm@16 + brew install llvm@18 brew install conan@2 - echo 'export PATH="/usr/local/opt/llvm@16/bin:$PATH"' >> ~/.bash_profile - echo PATH="/usr/local/opt/llvm@16/bin:$PATH" >> $GITHUB_ENV - echo 'export LDFLAGS="-L/usr/local/opt/llvm@16/lib -L/usr/local/opt/llvm@16/lib/c++ -Wl,-rpath,/usr/local/opt/llvm@16/lib/c++"' >> ~/.bash_profile - echo LDFLAGS="-L/usr/local/opt/llvm@16/lib -L/usr/local/opt/llvm@16/lib/c++ -Wl,-rpath,/usr/local/opt/llvm@16/lib/c++" >> $GITHUB_ENV - echo 'export CPPFLAGS="-I/usr/local/opt/llvm@16/include"' >> ~/.bash_profile - echo CPPFLAGS="/usr/local/opt/llvm@16/include" >> $GITHUB_ENV + echo 'export PATH="/usr/local/opt/llvm@18/bin:$PATH"' >> ~/.bash_profile + echo PATH="/usr/local/opt/llvm@18/bin:$PATH" >> $GITHUB_ENV + echo 'export LDFLAGS="-L/usr/local/opt/llvm@18/lib -L/usr/local/opt/llvm@18/lib/c++ -Wl,-rpath,/usr/local/opt/llvm@18/lib/c++"' >> ~/.bash_profile + echo LDFLAGS="-L/usr/local/opt/llvm@18/lib -L/usr/local/opt/llvm@18/lib/c++ -Wl,-rpath,/usr/local/opt/llvm@18/lib/c++" >> $GITHUB_ENV + echo 'export CPPFLAGS="-I/usr/local/opt/llvm@18/include"' >> ~/.bash_profile + echo CPPFLAGS="/usr/local/opt/llvm@18/include" >> $GITHUB_ENV source ~/.bash_profile - name: Print clang version run: clang++ --version @@ -57,15 +57,15 @@ jobs: cache-name: cache-conan-modules with: path: ~/.conan2 - key: ${{ runner.os }}-build-${{ env.cache-name }}-${{ hashFiles('conanfile.txt') }} + key: ${{ runner.os }}-build-${{ env.cache-name }}-${{ hashFiles('conanfile.txt') }}-${{ hashFiles('conanprofiles/clang-18-macos')}} - name: Create build directory run: mkdir ${{github.workspace}}/build - name: Install and run conan working-directory: ${{github.workspace}}/build run: > - conan install .. -pr:b=../conanprofiles/clang-16-macos -pr:h=../conanprofiles/clang-16-macos -of=. --build=missing; + conan install .. -pr:b=../conanprofiles/clang-18-macos -pr:h=../conanprofiles/clang-18-macos -of=. --build=missing; - name: Configure CMake - # For std::ranges::join_view we need the -fexperimental-library flag on libc++16, which on Mac requires to manually tinker with the linking flags. + # For std::ranges::join_view we need the -fexperimental-library flag on libc++18, which on Mac requires to manually tinker with the linking flags. run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.build-type}} -DCMAKE_TOOLCHAIN_FILE="$(pwd)/build/conan_toolchain.cmake" -DUSE_PARALLEL=true -DRUN_EXPENSIVE_TESTS=false -DENABLE_EXPENSIVE_CHECKS=true -DCMAKE_CXX_COMPILER=clang++ -DADDITIONAL_COMPILER_FLAGS="-fexperimental-library" -DADDITIONAL_LINKER_FLAGS="-L$(brew --prefix llvm)/lib/c++" - name: Build diff --git a/conanprofiles/clang-16-macos b/conanprofiles/clang-18-macos similarity index 84% rename from conanprofiles/clang-16-macos rename to conanprofiles/clang-18-macos index d1c5ccd365..b776d4594a 100644 --- a/conanprofiles/clang-16-macos +++ b/conanprofiles/clang-18-macos @@ -4,5 +4,5 @@ build_type=Release compiler=clang compiler.cppstd=gnu17 compiler.libcxx=libc++ -compiler.version=16 +compiler.version=18 os=Macos From 64f19dff49472c4afdd0bfb323ee52b2568b0388 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 11 Apr 2024 14:25:05 +0200 Subject: [PATCH 12/17] Remove `std::to_address` because of MacOS. --- .github/workflows/macos.yml | 20 ++++++++++---------- src/engine/LocalVocab.cpp | 8 +++++--- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml index 21b30f31e3..d0efe2c515 100644 --- a/.github/workflows/macos.yml +++ b/.github/workflows/macos.yml @@ -39,14 +39,14 @@ jobs: - name: Install dependencies run: | - brew install llvm@18 + brew install llvm@16 brew install conan@2 - echo 'export PATH="/usr/local/opt/llvm@18/bin:$PATH"' >> ~/.bash_profile - echo PATH="/usr/local/opt/llvm@18/bin:$PATH" >> $GITHUB_ENV - echo 'export LDFLAGS="-L/usr/local/opt/llvm@18/lib -L/usr/local/opt/llvm@18/lib/c++ -Wl,-rpath,/usr/local/opt/llvm@18/lib/c++"' >> ~/.bash_profile - echo LDFLAGS="-L/usr/local/opt/llvm@18/lib -L/usr/local/opt/llvm@18/lib/c++ -Wl,-rpath,/usr/local/opt/llvm@18/lib/c++" >> $GITHUB_ENV - echo 'export CPPFLAGS="-I/usr/local/opt/llvm@18/include"' >> ~/.bash_profile - echo CPPFLAGS="/usr/local/opt/llvm@18/include" >> $GITHUB_ENV + echo 'export PATH="/usr/local/opt/llvm@16/bin:$PATH"' >> ~/.bash_profile + echo PATH="/usr/local/opt/llvm@16/bin:$PATH" >> $GITHUB_ENV + echo 'export LDFLAGS="-L/usr/local/opt/llvm@16/lib -L/usr/local/opt/llvm@16/lib/c++ -Wl,-rpath,/usr/local/opt/llvm@16/lib/c++"' >> ~/.bash_profile + echo LDFLAGS="-L/usr/local/opt/llvm@16/lib -L/usr/local/opt/llvm@16/lib/c++ -Wl,-rpath,/usr/local/opt/llvm@16/lib/c++" >> $GITHUB_ENV + echo 'export CPPFLAGS="-I/usr/local/opt/llvm@16/include"' >> ~/.bash_profile + echo CPPFLAGS="/usr/local/opt/llvm@16/include" >> $GITHUB_ENV source ~/.bash_profile - name: Print clang version run: clang++ --version @@ -57,15 +57,15 @@ jobs: cache-name: cache-conan-modules with: path: ~/.conan2 - key: ${{ runner.os }}-build-${{ env.cache-name }}-${{ hashFiles('conanfile.txt') }}-${{ hashFiles('conanprofiles/clang-18-macos')}} + key: ${{ runner.os }}-build-${{ env.cache-name }}-${{ hashFiles('conanfile.txt') }} - name: Create build directory run: mkdir ${{github.workspace}}/build - name: Install and run conan working-directory: ${{github.workspace}}/build run: > - conan install .. -pr:b=../conanprofiles/clang-18-macos -pr:h=../conanprofiles/clang-18-macos -of=. --build=missing; + conan install .. -pr:b=../conanprofiles/clang-16-macos -pr:h=../conanprofiles/clang-16-macos -of=. --build=missing; - name: Configure CMake - # For std::ranges::join_view we need the -fexperimental-library flag on libc++18, which on Mac requires to manually tinker with the linking flags. + # For std::ranges::join_view we need the -fexperimental-library flag on libc++16, which on Mac requires to manually tinker with the linking flags. run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.build-type}} -DCMAKE_TOOLCHAIN_FILE="$(pwd)/build/conan_toolchain.cmake" -DUSE_PARALLEL=true -DRUN_EXPENSIVE_TESTS=false -DENABLE_EXPENSIVE_CHECKS=true -DCMAKE_CXX_COMPILER=clang++ -DADDITIONAL_COMPILER_FLAGS="-fexperimental-library" -DADDITIONAL_LINKER_FLAGS="-L$(brew --prefix llvm)/lib/c++" - name: Build diff --git a/src/engine/LocalVocab.cpp b/src/engine/LocalVocab.cpp index a4faa54817..3c4a956e41 100644 --- a/src/engine/LocalVocab.cpp +++ b/src/engine/LocalVocab.cpp @@ -37,6 +37,8 @@ LocalVocabIndex LocalVocab::getIndexAndAddIfNotContainedImpl(WordT&& word) { // explicit conversion from `string` to `AlignedString16`. auto [wordIterator, isNewWord] = primaryWordSet().emplace(std::forward(word)); + // TODO Use std::to_address (more idiomatic, but currently breaks + // the MacOS build. return std::to_address(wordIterator); } @@ -54,11 +56,11 @@ LocalVocabIndex LocalVocab::getIndexAndAddIfNotContained(std::string&& word) { // _____________________________________________________________________________ std::optional LocalVocab::getIndexOrNullopt( const std::string& word) const { - // TODO Maybe we can make this work with transparent hashing, - // but if this is only a testing API this is probably not worth the hassle. auto localVocabIndex = primaryWordSet().find(StringAligned16{word}); if (localVocabIndex != primaryWordSet().end()) { - return std::to_address(localVocabIndex); + // TODO Use std::to_address (more idiomatic, but currently breaks + // the MacOS build. + return &(*localVocabIndex); } else { return std::nullopt; } From 700b518983e6eed73ffa458f9657812b03a97233 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 11 Apr 2024 14:50:28 +0200 Subject: [PATCH 13/17] Fix mac for good now... --- conanprofiles/{clang-18-macos => clang-16-macos} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename conanprofiles/{clang-18-macos => clang-16-macos} (100%) diff --git a/conanprofiles/clang-18-macos b/conanprofiles/clang-16-macos similarity index 100% rename from conanprofiles/clang-18-macos rename to conanprofiles/clang-16-macos From 5724a263ab09c226c815fc7881e8402167aa7702 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 11 Apr 2024 14:58:53 +0200 Subject: [PATCH 14/17] Update clang-16-macos --- conanprofiles/clang-16-macos | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conanprofiles/clang-16-macos b/conanprofiles/clang-16-macos index b776d4594a..d1c5ccd365 100644 --- a/conanprofiles/clang-16-macos +++ b/conanprofiles/clang-16-macos @@ -4,5 +4,5 @@ build_type=Release compiler=clang compiler.cppstd=gnu17 compiler.libcxx=libc++ -compiler.version=18 +compiler.version=16 os=Macos From b3c6fe2cd4ee8a3e8deb7f87e63c15fd38d86873 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 11 Apr 2024 15:37:37 +0200 Subject: [PATCH 15/17] Finalinally fix the MacOS build. --- src/engine/LocalVocab.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/engine/LocalVocab.cpp b/src/engine/LocalVocab.cpp index 3c4a956e41..52b4267ca9 100644 --- a/src/engine/LocalVocab.cpp +++ b/src/engine/LocalVocab.cpp @@ -39,7 +39,7 @@ LocalVocabIndex LocalVocab::getIndexAndAddIfNotContainedImpl(WordT&& word) { primaryWordSet().emplace(std::forward(word)); // TODO Use std::to_address (more idiomatic, but currently breaks // the MacOS build. - return std::to_address(wordIterator); + return &(*wordIterator); } // _____________________________________________________________________________ From a441f5e4ede4b4a13a01b50d6519658821ef7e88 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 11 Apr 2024 19:56:08 +0200 Subject: [PATCH 16/17] A round of reviews. --- src/engine/LocalVocab.h | 2 +- src/global/ValueIdComparators.h | 20 +++++++++----------- test/LocalVocabTest.cpp | 6 ++---- test/ValueIdComparatorsTest.cpp | 3 --- 4 files changed, 12 insertions(+), 19 deletions(-) diff --git a/src/engine/LocalVocab.h b/src/engine/LocalVocab.h index 83c3bccae5..3ddd58567f 100644 --- a/src/engine/LocalVocab.h +++ b/src/engine/LocalVocab.h @@ -83,7 +83,7 @@ class LocalVocab { const std::string& getWord(LocalVocabIndex localVocabIndex) const; // Create a local vocab that contains and keeps alive all the words from each - // of the `vocabs`. + // of the `vocabs`. The primary word set of the newly create vocab is empty. static LocalVocab merge(std::span vocabs); // Return all the words from all the word sets as a vector. diff --git a/src/global/ValueIdComparators.h b/src/global/ValueIdComparators.h index 0c0aae44b3..84440c4707 100644 --- a/src/global/ValueIdComparators.h +++ b/src/global/ValueIdComparators.h @@ -463,17 +463,15 @@ ComparisonResult compareIdsImpl(ValueId a, ValueId b, auto comparator) { auto visitor = [comparator]( const A& aValue, const B& bValue) -> ComparisonResult { - if constexpr (std::is_same_v) { - if constexpr (!std::is_same_v) { - AD_FAIL(); - } else { - // TODO This is one of the places that has to be changed once - // we want to implement correct comparisons for the local vocab that use - // ICU collation. - return fromBool(std::invoke(comparator, *aValue, *bValue)); - } - } - if constexpr (requires() { std::invoke(comparator, aValue, bValue); }) { + if constexpr (std::is_same_v && + std::is_same_v) { + // TODO This is one of the places that has to be changed once + // we want to implement correct comparisons for the local vocab that use + // ICU collation. + return fromBool(std::invoke(comparator, *aValue, *bValue)); + } else if constexpr (requires() { + std::invoke(comparator, aValue, bValue); + }) { return fromBool(std::invoke(comparator, aValue, bValue)); } else { AD_FAIL(); diff --git a/test/LocalVocabTest.cpp b/test/LocalVocabTest.cpp index 828d3b7746..1f76ddb2bc 100644 --- a/test/LocalVocabTest.cpp +++ b/test/LocalVocabTest.cpp @@ -110,8 +110,7 @@ TEST(LocalVocab, clone) { } ASSERT_EQ(localVocabOriginal.size(), localVocabSize); - // Clone it and test that the clone contains the same words, but under - // different addresses (that is, the word strings were deeply copied). + // Clone it and test that the clone contains the same words. LocalVocab localVocabClone = localVocabOriginal.clone(); ASSERT_EQ(localVocabOriginal.size(), localVocabSize); ASSERT_EQ(localVocabClone.size(), localVocabSize); @@ -138,8 +137,7 @@ TEST(LocalVocab, merge) { indices.push_back(vocA.getIndexAndAddIfNotContained("oneB")); indices.push_back(vocA.getIndexAndAddIfNotContained("twoB")); - // Clone it and test that the clone contains the same words, but under - // different addresses (that is, the word strings were deeply copied). + // Clone it and test that the clone contains the same words. auto vocabs = std::vector{&std::as_const(vocA), &std::as_const(vocB)}; LocalVocab localVocabMerged = LocalVocab::merge(vocabs); ASSERT_EQ(localVocabMerged.size(), 4u); diff --git a/test/ValueIdComparatorsTest.cpp b/test/ValueIdComparatorsTest.cpp index 0f13e8879b..8319bfa427 100644 --- a/test/ValueIdComparatorsTest.cpp +++ b/test/ValueIdComparatorsTest.cpp @@ -116,9 +116,6 @@ auto testGetRangesForId(auto begin, auto end, ValueId id, } } while (it != end) { - if (isMatching(*it, id)) { - LOG(INFO) << "bum" << std::endl; - } ASSERT_FALSE(isMatching(*it, id)) << *it << ", " << id << comparison; auto expected = isMatchingDatatype(*it) ? False : Undef; ASSERT_EQ(compareIds(*it, id, comparison), expected) << *it << ' ' << id; From 814e172800efcb9895c0dcebd5b0191dae0d3f3a Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 11 Apr 2024 20:00:09 +0200 Subject: [PATCH 17/17] The missing "d" --- src/engine/LocalVocab.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/engine/LocalVocab.h b/src/engine/LocalVocab.h index 3ddd58567f..16ed3827a1 100644 --- a/src/engine/LocalVocab.h +++ b/src/engine/LocalVocab.h @@ -83,7 +83,7 @@ class LocalVocab { const std::string& getWord(LocalVocabIndex localVocabIndex) const; // Create a local vocab that contains and keeps alive all the words from each - // of the `vocabs`. The primary word set of the newly create vocab is empty. + // of the `vocabs`. The primary word set of the newly created vocab is empty. static LocalVocab merge(std::span vocabs); // Return all the words from all the word sets as a vector.