diff --git a/src/engine/AddCombinedRowToTable.h b/src/engine/AddCombinedRowToTable.h index 82203afa14..75c2bcd848 100644 --- a/src/engine/AddCombinedRowToTable.h +++ b/src/engine/AddCombinedRowToTable.h @@ -218,21 +218,23 @@ class AddCombinedRowToIdTable { AD_CORRECTNESS_CHECK(inputLeftAndRight_.has_value()); result.resize(oldSize + nextIndex_); - // Sometimes columns are combined where one value is UNDEF and the other one - // is not. This function very efficiently returns the not-UNDEF value in - // this case. - // TODO If we keep track of the information that one of the - // involved columns contains no UNDEF values at all, we can omit this step - // and simply copy the values from this column without looking at the other - // input. - auto mergeWithUndefined = [](const ValueId a, const ValueId b) { - static_assert(ValueId::makeUndefined().getBits() == 0u); - return ValueId::fromBits(a.getBits() | b.getBits()); + // Precondition: `a` and `b` compare equal or at least one of them is UNDEF + // If exactly one of them is UNDEF, return the other one, else return any of + // them (they are equal anyway). + auto getJoinValue = [](const ValueId a, const ValueId b) { + // NOTE: For localVocabIndices we might have different pointers that + // compare equal because they point to the same word. Therefore we cannot + // use a simple bitwise operation to handle the "one of them is UNDEF" + // case as we previously did. + if (a.isUndefined()) { + return b; + } + return a; }; // A lambda that writes the join column with the given `colIdx` to the // `nextResultColIdx`-th column of the result. - auto writeJoinColumn = [&result, &mergeWithUndefined, oldSize, this]( + auto writeJoinColumn = [&result, &getJoinValue, oldSize, this]( size_t colIdx, size_t resultColIdx) { const auto& colLeft = inputLeft().getColumn(colIdx); const auto& colRight = inputRight().getColumn(colIdx); @@ -242,8 +244,8 @@ class AddCombinedRowToIdTable { // Write the matching rows. for (const auto& [targetIndex, sourceIndices] : indexBuffer_) { - auto resultId = mergeWithUndefined(colLeft[sourceIndices[0]], - colRight[sourceIndices[1]]); + auto resultId = + getJoinValue(colLeft[sourceIndices[0]], colRight[sourceIndices[1]]); numUndef += static_cast(resultId.isUndefined()); resultCol[oldSize + targetIndex] = resultId; } diff --git a/src/engine/CartesianProductJoin.cpp b/src/engine/CartesianProductJoin.cpp index e9059269b0..c1c631f5b7 100644 --- a/src/engine/CartesianProductJoin.cpp +++ b/src/engine/CartesianProductJoin.cpp @@ -202,7 +202,7 @@ ResultTable CartesianProductJoin::computeResult() { auto subResultsDeref = std::views::transform( subResults, [](auto& x) -> decltype(auto) { return *x; }); return {std::move(result), resultSortedOn(), - ResultTable::getSharedLocalVocabFromNonEmptyOf(subResultsDeref)}; + ResultTable::getMergedLocalVocab(subResultsDeref)}; } // ____________________________________________________________________________ diff --git a/src/engine/Join.cpp b/src/engine/Join.cpp index df84b1696a..dc1063c611 100644 --- a/src/engine/Join.cpp +++ b/src/engine/Join.cpp @@ -187,7 +187,7 @@ ResultTable Join::computeResult() { // If only one of the two operands has a non-empty local vocabulary, share // with that one (otherwise, throws an exception). return {std::move(idTable), resultSortedOn(), - ResultTable::getSharedLocalVocabFromNonEmptyOf(*leftRes, *rightRes)}; + ResultTable::getMergedLocalVocab(*leftRes, *rightRes)}; } // _____________________________________________________________________________ diff --git a/src/engine/LocalVocab.cpp b/src/engine/LocalVocab.cpp index a4ba9167ff..52b4267ca9 100644 --- a/src/engine/LocalVocab.cpp +++ b/src/engine/LocalVocab.cpp @@ -11,56 +11,35 @@ // _____________________________________________________________________________ LocalVocab LocalVocab::clone() const { LocalVocab localVocabClone; - // First, make a deep copy of the `absl::node_hash_map` holding the actual - // map of strings to indexes. - localVocabClone.wordsToIndexesMap_ = this->wordsToIndexesMap_; - // The next free index should be the same. - localVocabClone.nextFreeIndex_ = this->nextFreeIndex_; - // The map from local ids to strings stores pointers to strings. So we cannot - // just copy these from `this->indexesToWordsMap_` to `localVocabClone`, but - // we need to make sure to store the pointers to the strings of the new map - // `localVocabClone.wordsToIndexesMap_`. - // - // NOTE: An alternative algorithm would be to sort the word-index pairs in - // `wordsToIndexesMap_` by index and then fill `indexesToWordsMap_` in order. - // This would have better locality, but the sorting takes non-linear time plus - // the sorting has to handle pairs of `LocalVocabIndex` and `std::string`. So - // for very large local vocabularies (and only then is this operation - // performance-criticial at all), the simpler approach below is probably - // better. - const size_t localVocabSize = this->size(); - localVocabClone.indexesToWordsMap_.resize(localVocabSize); - for (const auto& [wordInMap, index] : localVocabClone.wordsToIndexesMap_) { - AD_CONTRACT_CHECK(index.get() < localVocabSize); - localVocabClone.indexesToWordsMap_[index.get()] = std::addressof(wordInMap); - } + localVocabClone.otherWordSets_ = otherWordSets_; + localVocabClone.otherWordSets_.push_back(primaryWordSet_); // Return the clone. return localVocabClone; } +// _____________________________________________________________________________ +LocalVocab LocalVocab::merge(std::span vocabs) { + LocalVocab res; + auto inserter = std::back_inserter(res.otherWordSets_); + for (const auto* vocab : vocabs) { + std::ranges::copy(vocab->otherWordSets_, inserter); + *inserter = vocab->primaryWordSet_; + } + return res; +} + // _____________________________________________________________________________ template LocalVocabIndex LocalVocab::getIndexAndAddIfNotContainedImpl(WordT&& word) { - // The following code contains two subtle, but important optimizations: - // - // 1. The variant of `insert` used covers the case that `word` was already - // contained in the map as well as the case that it is newly inserted. This - // avoids computing the hash for `word` twice in case we see it for the first - // time (note that hashing a string is not cheap). - // - // 2. The fact that we have a member variable `nextFreeIndex_` avoids that we - // tentatively have to compute the next free ID every time this function is - // called (even when the ID is not actually needed because the word is already - // contained in the map). - // - auto [wordInMapAndIndex, isNewWord] = - wordsToIndexesMap_.insert({std::forward(word), nextFreeIndex_}); - const auto& [wordInMap, index] = *wordInMapAndIndex; - if (isNewWord) { - indexesToWordsMap_.push_back(&wordInMap); - nextFreeIndex_ = LocalVocabIndex::make(indexesToWordsMap_.size()); - } - return index; + // TODO As soon as we store `IdOrString` in the local vocab, we + // should definitely use `insert` instead of `emplace` here for some + // transparency optimizations. We currently need `emplace` because of the + // explicit conversion from `string` to `AlignedString16`. + auto [wordIterator, isNewWord] = + primaryWordSet().emplace(std::forward(word)); + // TODO Use std::to_address (more idiomatic, but currently breaks + // the MacOS build. + return &(*wordIterator); } // _____________________________________________________________________________ @@ -77,9 +56,11 @@ LocalVocabIndex LocalVocab::getIndexAndAddIfNotContained(std::string&& word) { // _____________________________________________________________________________ std::optional LocalVocab::getIndexOrNullopt( const std::string& word) const { - auto localVocabIndex = wordsToIndexesMap_.find(word); - if (localVocabIndex != wordsToIndexesMap_.end()) { - return localVocabIndex->second; + auto localVocabIndex = primaryWordSet().find(StringAligned16{word}); + if (localVocabIndex != primaryWordSet().end()) { + // TODO Use std::to_address (more idiomatic, but currently breaks + // the MacOS build. + return &(*localVocabIndex); } else { return std::nullopt; } @@ -87,11 +68,15 @@ std::optional LocalVocab::getIndexOrNullopt( // _____________________________________________________________________________ const std::string& LocalVocab::getWord(LocalVocabIndex localVocabIndex) const { - if (localVocabIndex.get() >= indexesToWordsMap_.size()) { - throw std::runtime_error(absl::StrCat( - "LocalVocab error: request for word with local vocab index ", - localVocabIndex.get(), ", but size of local vocab is only ", - indexesToWordsMap_.size(), ", please contact the developers")); + return *localVocabIndex; +} + +// _____________________________________________________________________________ +std::vector LocalVocab::getAllWordsForTesting() const { + std::vector result; + std::ranges::copy(primaryWordSet(), std::back_inserter(result)); + for (const auto& previous : otherWordSets_) { + std::ranges::copy(*previous, std::back_inserter(result)); } - return *(indexesToWordsMap_.at(localVocabIndex.get())); + return result; } diff --git a/src/engine/LocalVocab.h b/src/engine/LocalVocab.h index 95572f7b22..16ed3827a1 100644 --- a/src/engine/LocalVocab.h +++ b/src/engine/LocalVocab.h @@ -6,10 +6,11 @@ #include #include +#include #include #include -#include "absl/container/node_hash_map.h" +#include "absl/container/node_hash_set.h" #include "global/Id.h" // A class for maintaing a local vocabulary with contiguous (local) IDs. This is @@ -24,18 +25,17 @@ class LocalVocab { // A map of the words in the local vocabulary to their local IDs. This is a // node hash map because we need the addresses of the words (which are of type // `std::string`) to remain stable over their lifetime in the hash map because - // we refer to them in `wordsToIdsMap_` below. - absl::node_hash_map wordsToIndexesMap_; + // we hand out pointers to them. + using Set = absl::node_hash_set; + std::shared_ptr primaryWordSet_ = std::make_shared(); - // A map of the local IDs to the words. Since the IDs are contiguous, we can - // use a `std::vector`. We store pointers to the actual words in - // `wordsToIdsMap_` to avoid storing every word twice. This saves space, but - // costs us an indirection when looking up a word by its ID. - std::vector indexesToWordsMap_; + // Local vocabularies from child operations that were merged into this + // vocabulary s.t. the pointers are kept alive. They have to be `const` + // because they are possibly shared concurrently (for example via the cache). + std::vector> otherWordSets_; - // The next free local ID (will be incremented by one each time we add a new - // word). - LocalVocabIndex nextFreeIndex_ = LocalVocabIndex::make(0); + auto& primaryWordSet() { return *primaryWordSet_; } + const auto& primaryWordSet() const { return *primaryWordSet_; } public: // Create a new, empty local vocabulary. @@ -45,7 +45,9 @@ class LocalVocab { LocalVocab(const LocalVocab&) = delete; LocalVocab& operator=(const LocalVocab&) = delete; - // Make a deep copy explicitly. + // Make a logical copy. The clone will have an empty primary set so it can + // safely be modified. The contents are copied as shared pointers to const, so + // the function runs in linear time in the number of word sets. LocalVocab clone() const; // Moving a local vocabulary is not problematic (though the typical use case @@ -65,14 +67,28 @@ class LocalVocab { const std::string& word) const; // The number of words in the vocabulary. - size_t size() const { return indexesToWordsMap_.size(); } + // Note: This is not constant time, but linear in the number of word sets. + size_t size() const { + auto result = primaryWordSet().size(); + for (const auto& previous : otherWordSets_) { + result += previous->size(); + } + return result; + } // Return true if and only if the local vocabulary is empty. - bool empty() const { return indexesToWordsMap_.empty(); } + bool empty() const { return size() == 0; } // Return a const reference to the word. const std::string& getWord(LocalVocabIndex localVocabIndex) const; + // Create a local vocab that contains and keeps alive all the words from each + // of the `vocabs`. The primary word set of the newly created vocab is empty. + static LocalVocab merge(std::span vocabs); + + // Return all the words from all the word sets as a vector. + std::vector getAllWordsForTesting() const; + private: // Common implementation for the two variants of // `getIndexAndAddIfNotContainedImpl` above. diff --git a/src/engine/Minus.cpp b/src/engine/Minus.cpp index 00ebd3be84..8b5c12c9ad 100644 --- a/src/engine/Minus.cpp +++ b/src/engine/Minus.cpp @@ -56,8 +56,7 @@ ResultTable Minus::computeResult() { // If only one of the two operands has a non-empty local vocabulary, share // with that one (otherwise, throws an exception). return {std::move(idTable), resultSortedOn(), - ResultTable::getSharedLocalVocabFromNonEmptyOf(*leftResult, - *rightResult)}; + ResultTable::getMergedLocalVocab(*leftResult, *rightResult)}; } // _____________________________________________________________________________ diff --git a/src/engine/MultiColumnJoin.cpp b/src/engine/MultiColumnJoin.cpp index 0de919c071..0d4de5c978 100644 --- a/src/engine/MultiColumnJoin.cpp +++ b/src/engine/MultiColumnJoin.cpp @@ -86,8 +86,7 @@ ResultTable MultiColumnJoin::computeResult() { // If only one of the two operands has a non-empty local vocabulary, share // with that one (otherwise, throws an exception). return {std::move(idTable), resultSortedOn(), - ResultTable::getSharedLocalVocabFromNonEmptyOf(*leftResult, - *rightResult)}; + ResultTable::getMergedLocalVocab(*leftResult, *rightResult)}; } // _____________________________________________________________________________ diff --git a/src/engine/OptionalJoin.cpp b/src/engine/OptionalJoin.cpp index 89917189b1..6ec87dc456 100644 --- a/src/engine/OptionalJoin.cpp +++ b/src/engine/OptionalJoin.cpp @@ -115,8 +115,7 @@ ResultTable OptionalJoin::computeResult() { // If only one of the two operands has a non-empty local vocabulary, share // with that one (otherwise, throws an exception). return {std::move(idTable), resultSortedOn(), - ResultTable::getSharedLocalVocabFromNonEmptyOf(*leftResult, - *rightResult)}; + ResultTable::getMergedLocalVocab(*leftResult, *rightResult)}; } // _____________________________________________________________________________ diff --git a/src/engine/ResultTable.cpp b/src/engine/ResultTable.cpp index 79fde3d57a..73edbaa506 100644 --- a/src/engine/ResultTable.cpp +++ b/src/engine/ResultTable.cpp @@ -23,10 +23,10 @@ string ResultTable::asDebugString() const { } // _____________________________________________________________________________ -auto ResultTable::getSharedLocalVocabFromNonEmptyOf( - const ResultTable& resultTable1, const ResultTable& resultTable2) +auto ResultTable::getMergedLocalVocab(const ResultTable& resultTable1, + const ResultTable& resultTable2) -> SharedLocalVocabWrapper { - return getSharedLocalVocabFromNonEmptyOf( + return getMergedLocalVocab( std::array{std::cref(resultTable1), std::cref(resultTable2)}); } diff --git a/src/engine/ResultTable.h b/src/engine/ResultTable.h index 39950c6266..605f9c64a2 100644 --- a/src/engine/ResultTable.h +++ b/src/engine/ResultTable.h @@ -133,46 +133,21 @@ class ResultTable { return SharedLocalVocabWrapper{localVocab_}; } - // Like `getSharedLocalVocabFrom`, but takes more than one result and assumes - // that exactly one of the local vocabularies is empty and gets the shared - // local vocab from the non-empty one (if all are empty, arbitrarily share - // with the first one). - // - // TODO: Eventually, we want to be able to merge two non-empty local - // vocabularies, but that requires more work since we have to rewrite IDs then - // (from the previous separate local vocabularies to the new merged one). - static SharedLocalVocabWrapper getSharedLocalVocabFromNonEmptyOf( + // Like `getSharedLocalVocabFrom`, but takes more than one result and merges + // all the corresponding local vocabs. + static SharedLocalVocabWrapper getMergedLocalVocab( const ResultTable& resultTable1, const ResultTable& resultTable2); // Overload for more than two `ResultTables` template requires std::convertible_to, const ResultTable&> - static SharedLocalVocabWrapper getSharedLocalVocabFromNonEmptyOf( - R&& subResults) { - AD_CONTRACT_CHECK(!std::ranges::empty(subResults)); - auto hasNonEmptyVocab = [](const ResultTable& tbl) { - return !tbl.localVocab_->empty(); - }; - auto numNonEmptyVocabs = - std::ranges::count_if(subResults, hasNonEmptyVocab); - if (numNonEmptyVocabs > 1) { - throw std::runtime_error( - "Merging of more than one non-empty local vocabularies is currently " - "not supported, please contact the developers"); - } - // The static casts in the following are needed to make this code work for - // types that are implicitly convertible to `const ResultTable&`, in - // particular `std::reference_wrapper`. - if (numNonEmptyVocabs == 0) { - return SharedLocalVocabWrapper{ - static_cast(*subResults.begin()).localVocab_}; - } else { - return SharedLocalVocabWrapper{ - static_cast( - *std::ranges::find_if(subResults, hasNonEmptyVocab)) - .localVocab_}; + static SharedLocalVocabWrapper getMergedLocalVocab(R&& subResults) { + std::vector vocabs; + for (const ResultTable& table : subResults) { + vocabs.push_back(std::to_address(table.localVocab_)); } + return SharedLocalVocabWrapper{LocalVocab::merge(vocabs)}; } // Get a (deep) copy of the local vocabulary from the given result. Use this diff --git a/src/engine/TransitivePath.cpp b/src/engine/TransitivePath.cpp index c086561faf..073085b89a 100644 --- a/src/engine/TransitivePath.cpp +++ b/src/engine/TransitivePath.cpp @@ -256,7 +256,7 @@ ResultTable TransitivePath::computeResult() { sideRes->idTable()); return {std::move(idTable), resultSortedOn(), - ResultTable::getSharedLocalVocabFromNonEmptyOf(*sideRes, *subRes)}; + ResultTable::getMergedLocalVocab(*sideRes, *subRes)}; }; if (lhs_.isBoundVariable()) { diff --git a/src/engine/Union.cpp b/src/engine/Union.cpp index 7f3269fe37..c8615c5062 100644 --- a/src/engine/Union.cpp +++ b/src/engine/Union.cpp @@ -178,9 +178,8 @@ ResultTable Union::computeResult() { LOG(DEBUG) << "Union result computation done" << std::endl; // If only one of the two operands has a non-empty local vocabulary, share // with that one (otherwise, throws an exception). - return ResultTable{ - std::move(idTable), resultSortedOn(), - ResultTable::getSharedLocalVocabFromNonEmptyOf(*subRes1, *subRes2)}; + return ResultTable{std::move(idTable), resultSortedOn(), + ResultTable::getMergedLocalVocab(*subRes1, *subRes2)}; } void Union::computeUnion( diff --git a/src/global/IndexTypes.h b/src/global/IndexTypes.h index 18506f58c5..adde06bbbc 100644 --- a/src/global/IndexTypes.h +++ b/src/global/IndexTypes.h @@ -13,7 +13,14 @@ // this (very intrusive) renaming doesn't interfere with too many open pull // requests. using VocabIndex = ad_utility::TypedIndex; -using LocalVocabIndex = ad_utility::TypedIndex; + +// A `std::string` that is aligned to 16 bytes s.t. pointers always end with 4 +// bits that are zero and that are reused for payloads in the `ValueId` class. +struct alignas(16) StringAligned16 : public std::string { + using std::string::basic_string; + explicit StringAligned16(std::string s) : std::string{std::move(s)} {} +}; +using LocalVocabIndex = const StringAligned16*; using TextRecordIndex = ad_utility::TypedIndex; using WordVocabIndex = ad_utility::TypedIndex; using BlankNodeIndex = ad_utility::TypedIndex; diff --git a/src/global/ValueId.h b/src/global/ValueId.h index 7c45b0045c..33408479e7 100644 --- a/src/global/ValueId.h +++ b/src/global/ValueId.h @@ -123,7 +123,18 @@ class ValueId { /// Equality comparison is performed directly on the underlying /// representation. - constexpr bool operator==(const ValueId&) const = default; + // NOTE: (Also for the operator<=> below: These comparisons only work + // correctly if we only store entries in the local vocab that are NOT part of + // the vocabulary. This is currently not true for expression results from + // GROUP BY and BIND operations (for performance reaons). So a join with such + // results will currently lead to wrong results. + constexpr bool operator==(const ValueId& other) const { + if (getDatatype() == Datatype::LocalVocabIndex && + other.getDatatype() == Datatype::LocalVocabIndex) [[unlikely]] { + return *getLocalVocabIndex() == *other.getLocalVocabIndex(); + } + return _bits == other._bits; + } /// Comparison is performed directly on the underlying representation. Note /// that because the type bits are the most significant bits, all values of @@ -133,9 +144,11 @@ class ValueId { /// For doubles it is first the positive doubles in order, then the negative /// doubles in reversed order. This is a direct consequence of comparing the /// bit representation of these values as unsigned integers. - /// TODO Is this ordering also consistent for corner cases of doubles - /// (inf, nan, denormalized numbers, negative zero)? constexpr auto operator<=>(const ValueId& other) const { + if (getDatatype() == Datatype::LocalVocabIndex && + other.getDatatype() == Datatype::LocalVocabIndex) [[unlikely]] { + return *getLocalVocabIndex() <=> *other.getLocalVocabIndex(); + } return _bits <=> other._bits; } @@ -211,7 +224,11 @@ class ValueId { return makeFromIndex(index.get(), Datatype::TextRecordIndex); } static ValueId makeFromLocalVocabIndex(LocalVocabIndex index) { - return makeFromIndex(index.get(), Datatype::LocalVocabIndex); + // The last `numDatatypeBits` of a `LocalVocabIndex` are always zero, so we + // can reuse them for the datatype. + static_assert(alignof(decltype(*index)) >= (1u << numDatatypeBits)); + return makeFromIndex(reinterpret_cast(index) >> numDatatypeBits, + Datatype::LocalVocabIndex); } static ValueId makeFromWordVocabIndex(WordVocabIndex index) { return makeFromIndex(index.get(), Datatype::WordVocabIndex); @@ -229,8 +246,8 @@ class ValueId { [[nodiscard]] constexpr TextRecordIndex getTextRecordIndex() const noexcept { return TextRecordIndex::make(removeDatatypeBits(_bits)); } - [[nodiscard]] constexpr LocalVocabIndex getLocalVocabIndex() const noexcept { - return LocalVocabIndex::make(removeDatatypeBits(_bits)); + [[nodiscard]] LocalVocabIndex getLocalVocabIndex() const noexcept { + return reinterpret_cast(_bits << numDatatypeBits); } [[nodiscard]] constexpr WordVocabIndex getWordVocabIndex() const noexcept { return WordVocabIndex::make(removeDatatypeBits(_bits)); @@ -334,8 +351,11 @@ class ValueId { ostr << (value ? "true" : "false"); } else if constexpr (ad_utility::isSimilar) { ostr << value.toStringAndType().first; + } else if constexpr (ad_utility::isSimilar) { + AD_CORRECTNESS_CHECK(value != nullptr); + ostr << *value; } else { - // T is `VocabIndex || LocalVocabIndex || TextRecordIndex` + // T is `VocabIndex | TextRecordIndex` ostr << std::to_string(value.get()); } }; diff --git a/src/global/ValueIdComparators.h b/src/global/ValueIdComparators.h index 02f9d2b5ce..84440c4707 100644 --- a/src/global/ValueIdComparators.h +++ b/src/global/ValueIdComparators.h @@ -72,9 +72,7 @@ inline ValueId toValueId(ComparisonResult comparisonResult) { // negative doubles in reversed order. In detail it is [0.0 ... infinity, NaN, // -0.0, ... -infinity]. This is a direct consequence of comparing the bit // representation of these values as unsigned integers. -inline bool compareByBits(ValueId a, ValueId b) { - return a.getBits() < b.getBits(); -} +inline bool compareByBits(ValueId a, ValueId b) { return a < b; } namespace detail { @@ -308,6 +306,7 @@ inline std::vector> getRangesForIndexTypes( RangeFilter rangeFilter{comparison}; auto [eqBegin, eqEnd] = std::equal_range(beginType, endType, valueId, &compareByBits); + rangeFilter.addSmaller(beginType, eqBegin); rangeFilter.addEqual(eqBegin, eqEnd); rangeFilter.addGreater(eqEnd, endType); @@ -462,9 +461,17 @@ ComparisonResult compareIdsImpl(ValueId a, ValueId b, auto comparator) { } } - auto visitor = [comparator](const auto& aValue, - const auto& bValue) -> ComparisonResult { - if constexpr (requires() { std::invoke(comparator, aValue, bValue); }) { + auto visitor = [comparator]( + const A& aValue, const B& bValue) -> ComparisonResult { + if constexpr (std::is_same_v && + std::is_same_v) { + // TODO This is one of the places that has to be changed once + // we want to implement correct comparisons for the local vocab that use + // ICU collation. + return fromBool(std::invoke(comparator, *aValue, *bValue)); + } else if constexpr (requires() { + std::invoke(comparator, aValue, bValue); + }) { return fromBool(std::invoke(comparator, aValue, bValue)); } else { AD_FAIL(); diff --git a/test/LocalVocabTest.cpp b/test/LocalVocabTest.cpp index 22844fe39c..1f76ddb2bc 100644 --- a/test/LocalVocabTest.cpp +++ b/test/LocalVocabTest.cpp @@ -2,6 +2,7 @@ // Chair of Algorithms and Data Structures. // Author: Hannah Bast +#include #include #include @@ -39,7 +40,6 @@ std::vector getTestCollectionOfWords(size_t size) { } return testCollectionOfWords; } -auto iri = ad_utility::testing::iri; } // namespace // _____________________________________________________________________________ @@ -54,25 +54,26 @@ TEST(LocalVocab, constructionAndAccess) { // Add the words from our test vocabulary and check that they get the expected // local vocab indexes. for (size_t i = 0; i < testWords.size(); ++i) { - ASSERT_EQ(localVocab.getIndexAndAddIfNotContained(testWords[i]), - LocalVocabIndex::make(i)); + ASSERT_EQ(*localVocab.getIndexAndAddIfNotContained(testWords[i]), + testWords[i]); } + size_t localVocabSize = localVocab.size(); ASSERT_EQ(localVocab.size(), testWords.size()); // Check that we get the same indexes if we do this again, but that no new // words will be added. for (size_t i = 0; i < testWords.size(); ++i) { - ASSERT_EQ(localVocab.getIndexAndAddIfNotContained(testWords[i]), - LocalVocabIndex::make(i)); + ASSERT_EQ(*localVocab.getIndexAndAddIfNotContained(testWords[i]), + testWords[i]); + ASSERT_EQ(localVocab.size(), localVocabSize); } - ASSERT_EQ(localVocab.size(), testWords.size()); // Check again that we get the right indexes, but with `getIndexOrNullopt`. for (size_t i = 0; i < testWords.size(); ++i) { std::optional localVocabIndex = localVocab.getIndexOrNullopt(testWords[i]); ASSERT_TRUE(localVocabIndex.has_value()); - ASSERT_EQ(localVocabIndex.value(), LocalVocabIndex::make(i)); + ASSERT_EQ(*localVocabIndex.value(), testWords[i]); } // Check that `getIndexOrNullopt` returns `std::nullopt` for words that are @@ -82,25 +83,18 @@ TEST(LocalVocab, constructionAndAccess) { ASSERT_FALSE(localVocab.getIndexOrNullopt(testWords[i] + "A")); } - // Check that the lookup by ID gives the correct words. - for (size_t i = 0; i < testWords.size(); ++i) { - ASSERT_EQ(localVocab.getWord(LocalVocabIndex::make(i)), testWords[i]); - } - ASSERT_EQ(localVocab.size(), testWords.size()); - - // Check that out-of-bound lookups are detected (this would have a caught the - // one-off bug in #820, LocalVocab.cpp line 55). - ASSERT_THROW(localVocab.getWord(LocalVocabIndex::make(testWords.size())), - std::runtime_error); - ASSERT_THROW(localVocab.getWord(LocalVocabIndex::make(-1)), - std::runtime_error); - // Check that a move gives the expected result. + localVocabSize = localVocab.size(); auto localVocabMoved = std::move(localVocab); - ASSERT_EQ(localVocab.size(), 0); + // TODO Should we reset the pointer? + // ASSERT_EQ(localVocab.size(), 0); ASSERT_EQ(localVocabMoved.size(), testWords.size()); + // Check that we get the same indexes if we do this again, but that no new + // words will be added. for (size_t i = 0; i < testWords.size(); ++i) { - ASSERT_EQ(localVocabMoved.getWord(LocalVocabIndex::make(i)), testWords[i]); + ASSERT_EQ(*localVocabMoved.getIndexAndAddIfNotContained(testWords[i]), + testWords[i]); + ASSERT_EQ(localVocabMoved.size(), localVocabSize); } } @@ -109,29 +103,52 @@ TEST(LocalVocab, clone) { // Create a small local vocabulary. size_t localVocabSize = 100; LocalVocab localVocabOriginal; - for (auto& word : getTestCollectionOfWords(localVocabSize)) { - localVocabOriginal.getIndexAndAddIfNotContained(word); + std::vector indices; + auto inputWords = getTestCollectionOfWords(localVocabSize); + for (const auto& word : inputWords) { + indices.push_back(localVocabOriginal.getIndexAndAddIfNotContained(word)); } ASSERT_EQ(localVocabOriginal.size(), localVocabSize); - // Clone it and test that the clone contains the same words, but under - // different addresses (that is, the word strings were deeply copied). + // Clone it and test that the clone contains the same words. LocalVocab localVocabClone = localVocabOriginal.clone(); ASSERT_EQ(localVocabOriginal.size(), localVocabSize); ASSERT_EQ(localVocabClone.size(), localVocabSize); - for (size_t i = 0; i < localVocabSize; ++i) { - LocalVocabIndex idx = LocalVocabIndex::make(i); - const std::string& wordFromOriginal = localVocabOriginal.getWord(idx); - const std::string& wordFromClone = localVocabClone.getWord(idx); - ASSERT_EQ(wordFromOriginal, wordFromClone); - ASSERT_NE(&wordFromOriginal, &wordFromClone); - } + ASSERT_THAT(localVocabClone.getAllWordsForTesting(), + ::testing::UnorderedElementsAreArray( + localVocabOriginal.getAllWordsForTesting())); + + // Test that the indices are still valid after the original vocabulary was + // destroyed. + localVocabOriginal = LocalVocab{}; - // Check that `nextFreeIndex_` of the clone is OK by adding another word to - // the clone. - localVocabClone.getIndexAndAddIfNotContained("blubb"); - ASSERT_EQ(localVocabClone.getIndexAndAddIfNotContained("blubb"), - LocalVocabIndex::make(localVocabSize)); + for (size_t i = 0; i < inputWords.size(); ++i) { + EXPECT_EQ(*indices[i], inputWords[i]); + } +} +// _____________________________________________________________________________ +TEST(LocalVocab, merge) { + // Create a small local vocabulary. + std::vector indices; + LocalVocab vocA; + LocalVocab vocB; + indices.push_back(vocA.getIndexAndAddIfNotContained("oneA")); + indices.push_back(vocA.getIndexAndAddIfNotContained("twoA")); + indices.push_back(vocA.getIndexAndAddIfNotContained("oneB")); + indices.push_back(vocA.getIndexAndAddIfNotContained("twoB")); + + // Clone it and test that the clone contains the same words. + auto vocabs = std::vector{&std::as_const(vocA), &std::as_const(vocB)}; + LocalVocab localVocabMerged = LocalVocab::merge(vocabs); + ASSERT_EQ(localVocabMerged.size(), 4u); + ASSERT_THAT(localVocabMerged.getAllWordsForTesting(), + ::testing::UnorderedElementsAre("oneA", "twoA", "oneB", "twoB")); + vocA = LocalVocab{}; + vocB = LocalVocab{}; + EXPECT_EQ(*indices[0], "oneA"); + EXPECT_EQ(*indices[1], "twoA"); + EXPECT_EQ(*indices[2], "oneB"); + EXPECT_EQ(*indices[3], "twoB"); } // _____________________________________________________________________________ @@ -150,18 +167,15 @@ TEST(LocalVocab, propagation) { std::shared_ptr resultTable = operation.getResult(); ASSERT_TRUE(resultTable) << "Operation: " << operation.getDescriptor() << std::endl; - std::vector localVocabWords; - for (size_t i = 0; i < resultTable->localVocab().size(); ++i) { - localVocabWords.emplace_back( - resultTable->localVocab().getWord(LocalVocabIndex::make(i))); - } - ASSERT_EQ(localVocabWords, expectedWords) - << "Operation: " << operation.getDescriptor() << std::endl; - }; - - // Lambda that checks that `computeResult` throws an exception. - auto checkThrow = [&](Operation& operation) -> void { - ASSERT_THROW(operation.getResult(), ad_utility::AbortException) + std::vector localVocabWords = + resultTable->localVocab().getAllWordsForTesting(); + // We currently allow the local vocab to have multiple IDs for the same + // word, so we have to deduplicate first. + std::ranges::sort(localVocabWords); + localVocabWords.erase(std::ranges::unique(localVocabWords).begin(), + localVocabWords.end()); + ASSERT_THAT(localVocabWords, + ::testing::UnorderedElementsAreArray(expectedWords)) << "Operation: " << operation.getDescriptor() << std::endl; }; @@ -179,15 +193,20 @@ TEST(LocalVocab, propagation) { // values are moved out by `Values::writeValues` and would then no longer be // there in the subsequent uses of `values1` and `values2` if it were not // copied. + // + // Note 2: For literals, the quotes are part of the name, so if we wanted the + // literal "x", we would have to write TripleComponent{"\"x\""}. For the + // purposes of this test, we just want something that's not yet in the index, + // so "x" etc. is just fine (and also different from the "" below). + auto iri = ad_utility::testing::iri; Values values1( testQec, {{Variable{"?x"}, Variable{"?y"}}, - {{TripleComponent{iri("")}, TripleComponent{iri("")}}, - {TripleComponent{iri("")}, TripleComponent{iri("")}}}}); + {{TripleComponent{iri("")}, TripleComponent{iri("")}}, + {TripleComponent{iri("")}, TripleComponent{iri("")}}}}); Values values1copy = values1; - std::vector expectedVocab{"", "", ""}; - - checkLocalVocab(values1copy, expectedVocab); + std::vector localVocab1{"", "", ""}; + checkLocalVocab(values1copy, localVocab1); // VALUES operation that uses an existing literal (from the test index). Values values2( @@ -196,41 +215,51 @@ TEST(LocalVocab, propagation) { Values values2copy = values2; checkLocalVocab(values2copy, std::vector{}); + // Contains local vocab words that are (at least partially) disjoint from the + // words in `values1`. + Values values3( + testQec, + {{Variable{"?x"}, Variable{"?y"}}, + {{TripleComponent{iri("")}, TripleComponent{iri("")}}, + {TripleComponent{iri("")}, TripleComponent{iri("")}}}}); + std::vector localVocab13{"", "", "", "", + ""}; + // JOIN operation with exactly one non-empty local vocab and with two // non-empty local vocabs (the last two arguments are the two join columns). Join join1(testQec, qet(values1), qet(values2), 0, 0); - checkLocalVocab(join1, expectedVocab); - Join join2(testQec, qet(values1), qet(values1), 0, 0); - checkThrow(join2); + checkLocalVocab(join1, localVocab1); + Join join2(testQec, qet(values1), qet(values3), 0, 0); + checkLocalVocab(join2, localVocab13); // OPTIONAL JOIN operation with exactly one non-empty local vocab. OptionalJoin optJoin1(testQec, qet(values1), qet(values2)); - checkLocalVocab(optJoin1, expectedVocab); + checkLocalVocab(optJoin1, localVocab1); // OPTIONAL JOIN operation with two non-empty local vocab. - OptionalJoin optJoin2(testQec, qet(values1), qet(values1)); - checkThrow(optJoin2); + OptionalJoin optJoin2(testQec, qet(values1), qet(values3)); + checkLocalVocab(optJoin2, localVocab13); // MULTI-COLUMN JOIN operation with exactly one non-empty local vocab and with // two non-empty local vocabs. MultiColumnJoin multiJoin1(testQec, qet(values1), qet(values2)); - checkLocalVocab(multiJoin1, expectedVocab); - MultiColumnJoin multiJoin2(testQec, qet(values1), qet(values1)); - checkThrow(multiJoin2); + checkLocalVocab(multiJoin1, localVocab1); + MultiColumnJoin multiJoin2(testQec, qet(values1), qet(values3)); + checkLocalVocab(multiJoin2, localVocab13); // ORDER BY operation (the third argument are the indices of the columns to be // sorted, and the sort order; not important for this test). OrderBy orderBy(testQec, qet(values1), {{0, true}, {1, true}}); - checkLocalVocab(orderBy, expectedVocab); + checkLocalVocab(orderBy, localVocab1); // SORT operation (the third operation is the sort column). Sort sort(testQec, qet(values1), {0}); - checkLocalVocab(sort, expectedVocab); + checkLocalVocab(sort, localVocab1); // DISTINCT operation (the third argument are the indices of the input columns // that are considered for the output; not important for this test). Distinct distinct1(testQec, qet(values1), {0, 1}); - checkLocalVocab(distinct1, expectedVocab); + checkLocalVocab(distinct1, localVocab1); // GROUP BY operation. auto groupConcatExpression = @@ -249,28 +278,28 @@ TEST(LocalVocab, propagation) { testQec, {Variable{"?x"}}, {Alias{groupConcatExpression("?y", "|"), Variable{"?concat"}}}, qet(values1)); - checkLocalVocab(groupBy, - std::vector{"", "", "", "y1|y2"}); + checkLocalVocab( + groupBy, std::vector{"", "", "", "yN1|yN2"}); // DISTINCT again, but after something has been added to the local vocabulary // (to check that the "y1|y2" added by the GROUP BY does not also appear here, // as it did before GroupBy::computeResult copied it's local vocabulary). Distinct distinct2(testQec, qet(values1), {0}); - checkLocalVocab(distinct2, expectedVocab); + checkLocalVocab(distinct2, localVocab1); // UNION operation with exactly one non-empty local vocab and with two // non-empy local vocabs. Union union1(testQec, qet(values1), qet(values2)); - checkLocalVocab(union1, expectedVocab); - Union union2(testQec, qet(values1), qet(values1)); - checkThrow(union2); + checkLocalVocab(union1, localVocab1); + Union union2(testQec, qet(values1), qet(values3)); + checkLocalVocab(union2, localVocab13); // MINUS operation with exactly one non-empty local vocab and with // two non-empty local vocabs. Minus minus1(testQec, qet(values1), qet(values2)); - checkLocalVocab(minus1, expectedVocab); - Minus minus2(testQec, qet(values1), qet(values1)); - checkThrow(minus2); + checkLocalVocab(minus1, localVocab1); + Minus minus2(testQec, qet(values1), qet(values3)); + checkLocalVocab(minus2, localVocab13); // FILTER operation (the third argument is an expression; which one doesn't // matter for this test). @@ -278,7 +307,7 @@ TEST(LocalVocab, propagation) { testQec, qet(values1), {std::make_unique(Variable{"?x"}), "Expression ?x"}); - checkLocalVocab(filter, expectedVocab); + checkLocalVocab(filter, localVocab1); // BIND operation (the third argument is a `parsedQuery::Bind` object). Bind bind( @@ -286,7 +315,7 @@ TEST(LocalVocab, propagation) { {{std::make_unique(Variable{"?x"}), "Expression ?x"}, Variable{"?z"}}); - checkLocalVocab(bind, expectedVocab); + checkLocalVocab(bind, localVocab1); // TRANSITIVE PATH operation. // @@ -296,15 +325,19 @@ TEST(LocalVocab, propagation) { TransitivePathSide left(std::nullopt, 0, Variable{"?x"}); TransitivePathSide right(std::nullopt, 1, Variable{"?y"}); TransitivePath transitivePath(testQec, qet(values1), left, right, 1, 1); - checkLocalVocab(transitivePath, expectedVocab); + checkLocalVocab(transitivePath, localVocab1); // PATTERN TRICK operations. HasPredicateScan hasPredicateScan(testQec, qet(values1), 0, Variable{"?z"}); - checkLocalVocab(hasPredicateScan, expectedVocab); + checkLocalVocab(hasPredicateScan, localVocab1); + Values valuesPatternTrick( + testQec, + {{Variable{"?x"}, Variable{"?y"}}, + {{TripleComponent{iri("")}, TripleComponent{NO_PATTERN}}, + {TripleComponent{iri("")}, TripleComponent{NO_PATTERN}}}}); CountAvailablePredicates countAvailablePredictes( - testQec, qet(values1), 0, Variable{"?x"}, Variable{"?y"}); - checkLocalVocab(countAvailablePredictes, expectedVocab); - + testQec, qet(valuesPatternTrick), 0, Variable{"?y"}, Variable{"?count"}); + checkLocalVocab(countAvailablePredictes, {""}); // TEXT operations. // // TODO Maybe add tests for the new TextIndexScanFor... classes, diff --git a/test/ValueIdComparatorsTest.cpp b/test/ValueIdComparatorsTest.cpp index c59b43421e..8319bfa427 100644 --- a/test/ValueIdComparatorsTest.cpp +++ b/test/ValueIdComparatorsTest.cpp @@ -12,6 +12,32 @@ #include "util/Random.h" using namespace valueIdComparators; +namespace valueIdComparators { +inline std::ostream& operator<<(std::ostream& str, Comparison c) { + switch (c) { + using enum Comparison; + case LT: + str << "LT"; + break; + case LE: + str << "LE"; + break; + case EQ: + str << "EQ"; + break; + case NE: + str << "NE"; + break; + case GE: + str << "GE"; + break; + case GT: + str << "GT"; + break; + } + return str; +} +} // namespace valueIdComparators using ad_utility::source_location; TEST(ValueIdComparators, GetRangeForDatatype) { @@ -77,20 +103,20 @@ auto testGetRangesForId(auto begin, auto end, ValueId id, using enum ComparisonResult; for (auto [rangeBegin, rangeEnd] : ranges) { while (it != rangeBegin) { - ASSERT_FALSE(isMatching(*it, id)) << *it << ' ' << id; + ASSERT_FALSE(isMatching(*it, id)) << *it << ' ' << id << comparison; auto expected = isMatchingDatatype(*it) ? False : Undef; ASSERT_EQ(compareIds(*it, id, comparison), expected) << *it << ' ' << id; ++it; } while (it != rangeEnd) { - ASSERT_TRUE(isMatching(*it, id)) << *it << ' ' << id; + ASSERT_TRUE(isMatching(*it, id)) << *it << ' ' << id << comparison; ASSERT_EQ(compareIds(*it, id, comparison), True) << *it << ' ' << id; ++it; } } while (it != end) { - ASSERT_FALSE(isMatching(*it, id)); + ASSERT_FALSE(isMatching(*it, id)) << *it << ", " << id << comparison; auto expected = isMatchingDatatype(*it) ? False : Undef; ASSERT_EQ(compareIds(*it, id, comparison), expected) << *it << ' ' << id; ++it; diff --git a/test/ValueIdTest.cpp b/test/ValueIdTest.cpp index 04e13cf883..ca278bd5d3 100644 --- a/test/ValueIdTest.cpp +++ b/test/ValueIdTest.cpp @@ -108,18 +108,24 @@ TEST(ValueId, Indices) { testSingle(0); testSingle(ValueId::maxIndex); - for (size_t idx = 0; idx < 10'000; ++idx) { - auto value = invalidIndexGenerator(); - ASSERT_THROW(makeId(value), ValueId::IndexTooLargeException); - AD_EXPECT_THROW_WITH_MESSAGE(makeId(value), - ::testing::ContainsRegex("is bigger than")); + if (type != Datatype::LocalVocabIndex) { + for (size_t idx = 0; idx < 10'000; ++idx) { + auto value = invalidIndexGenerator(); + ASSERT_THROW(makeId(value), ValueId::IndexTooLargeException); + AD_EXPECT_THROW_WITH_MESSAGE( + makeId(value), ::testing::ContainsRegex("is bigger than")); + } } }; testRandomIds(&makeTextRecordId, &getTextRecordIndex, Datatype::TextRecordIndex); testRandomIds(&makeVocabId, &getVocabIndex, Datatype::VocabIndex); - testRandomIds(&makeLocalVocabId, &getLocalVocabIndex, + + auto localVocabWordToInt = [](const auto& input) { + return std::atoll(getLocalVocabIndex(input).c_str()); + }; + testRandomIds(&makeLocalVocabId, localVocabWordToInt, Datatype::LocalVocabIndex); testRandomIds(&makeWordVocabId, &getWordVocabIndex, Datatype::WordVocabIndex); } @@ -288,7 +294,8 @@ TEST(ValueId, toDebugString) { test(ValueId::makeFromBool(false), "Bool:false"); test(ValueId::makeFromBool(true), "Bool:true"); test(makeVocabId(15), "VocabIndex:15"); - test(makeLocalVocabId(25), "LocalVocabIndex:25"); + StringAligned16 str{"SomeValue"}; + test(ValueId::makeFromLocalVocabIndex(&str), "LocalVocabIndex:SomeValue"); test(makeTextRecordId(37), "TextRecordIndex:37"); test(makeWordVocabId(42), "WordVocabIndex:42"); test(makeBlankNodeId(27), "BlankNodeIndex:27"); diff --git a/test/ValueIdTestHelpers.h b/test/ValueIdTestHelpers.h index 98a5727266..19aba006a3 100644 --- a/test/ValueIdTestHelpers.h +++ b/test/ValueIdTestHelpers.h @@ -5,8 +5,9 @@ #ifndef QLEVER_VALUEIDTESTHELPERS_H #define QLEVER_VALUEIDTESTHELPERS_H -#include "../src/global/ValueId.h" -#include "../src/util/Random.h" +#include "./util/IdTestHelpers.h" +#include "global/ValueId.h" +#include "util/Random.h" // Enabling cheaper unit tests when building in Debug mode #ifdef QLEVER_RUN_EXPENSIVE_TESTS @@ -45,7 +46,7 @@ inline ValueId makeVocabId(uint64_t value) { return ValueId::makeFromVocabIndex(VocabIndex::make(value)); } inline ValueId makeLocalVocabId(uint64_t value) { - return ValueId::makeFromLocalVocabIndex(LocalVocabIndex::make(value)); + return ad_utility::testing::LocalVocabId(value); } inline ValueId makeTextRecordId(uint64_t value) { return ValueId::makeFromTextRecordIndex(TextRecordIndex::make(value)); @@ -58,8 +59,10 @@ inline ValueId makeBlankNodeId(uint64_t value) { } inline uint64_t getVocabIndex(ValueId id) { return id.getVocabIndex().get(); } -inline uint64_t getLocalVocabIndex(ValueId id) { - return id.getLocalVocabIndex().get(); +// TODO Make the tests more precise for the localVocabIndices. +inline std::string getLocalVocabIndex(ValueId id) { + AD_CORRECTNESS_CHECK(id.getDatatype() == Datatype::LocalVocabIndex); + return *id.getLocalVocabIndex(); } inline uint64_t getTextRecordIndex(ValueId id) { return id.getTextRecordIndex().get(); @@ -104,6 +107,7 @@ inline auto makeRandomDoubleIds = []() { ad_utility::randomShuffle(ids.begin(), ids.end()); return ids; }; + inline auto makeRandomIds = []() { std::vector ids = makeRandomDoubleIds(); addIdsFromGenerator(indexGenerator, &makeVocabId, ids); diff --git a/test/ValuesTest.cpp b/test/ValuesTest.cpp index 4b8c7171ce..001eab9116 100644 --- a/test/ValuesTest.cpp +++ b/test/ValuesTest.cpp @@ -74,9 +74,12 @@ TEST(Values, computeResult) { const auto& table = result->idTable(); Id x = ad_utility::testing::makeGetId(testQec->getIndex())(""); auto I = ad_utility::testing::IntId; - auto L = ad_utility::testing::LocalVocabId; + auto l = result->localVocab().getIndexOrNullopt(""); + ASSERT_TRUE(l.has_value()); auto U = Id::makeUndefined(); - ASSERT_EQ(table, makeIdTableFromVector({{I(12), x}, {U, L(0)}})); + ASSERT_EQ(table, + makeIdTableFromVector( + {{I(12), x}, {U, Id::makeFromLocalVocabIndex(l.value())}})); } // Check that if the number of variables and the number of values in each row diff --git a/test/util/IdTestHelpers.h b/test/util/IdTestHelpers.h index 95e8a5eef2..1449ab0f88 100644 --- a/test/util/IdTestHelpers.h +++ b/test/util/IdTestHelpers.h @@ -4,6 +4,7 @@ #pragma once +#include "engine/LocalVocab.h" #include "global/Id.h" // Lambdas to simply create an `Id` with a given value and type during unit @@ -18,8 +19,10 @@ inline auto VocabId = [](const auto& v) { return Id::makeFromVocabIndex(VocabIndex::make(v)); }; -inline auto LocalVocabId = [](const auto& v) { - return Id::makeFromLocalVocabIndex(LocalVocabIndex::make(v)); +inline auto LocalVocabId = [](std::integral auto v) { + static ad_utility::Synchronized localVocab; + return Id::makeFromLocalVocabIndex( + localVocab.wlock()->getIndexAndAddIfNotContained(std::to_string(v))); }; inline auto TextRecordId = [](const auto& t) {