From 2cc4b58d3b54a23492cbe54c775934633a608387 Mon Sep 17 00:00:00 2001
From: Johannes Kalmbach <johannes.kalmbach@gmail.com>
Date: Thu, 21 Mar 2024 12:38:41 +0100
Subject: [PATCH 01/17] Playing around with this and that, let's see how fast
 we can get this done.

---
 src/engine/LocalVocab.cpp | 35 +++++++----------------------------
 src/engine/LocalVocab.h   | 20 ++++++--------------
 src/global/IndexTypes.h   |  9 ++++++++-
 src/global/ValueId.h      | 11 +++++++----
 test/util/IdTestHelpers.h |  2 +-
 5 files changed, 29 insertions(+), 48 deletions(-)
diff --git a/src/engine/LocalVocab.cpp b/src/engine/LocalVocab.cpp
index a4ba9167ff..23278b5a8f 100644
--- a/src/engine/LocalVocab.cpp
+++ b/src/engine/LocalVocab.cpp
@@ -14,26 +14,6 @@ LocalVocab LocalVocab::clone() const {
   // First, make a deep copy of the `absl::node_hash_map` holding the actual
   // map of strings to indexes.
   localVocabClone.wordsToIndexesMap_ = this->wordsToIndexesMap_;
-  // The next free index should be the same.
-  localVocabClone.nextFreeIndex_ = this->nextFreeIndex_;
-  // The map from local ids to strings stores pointers to strings. So we cannot
-  // just copy these from `this->indexesToWordsMap_` to `localVocabClone`, but
-  // we need to make sure to store the pointers to the strings of the new map
-  // `localVocabClone.wordsToIndexesMap_`.
-  //
-  // NOTE: An alternative algorithm would be to sort the word-index pairs in
-  // `wordsToIndexesMap_` by index and then fill `indexesToWordsMap_` in order.
-  // This would have better locality, but the sorting takes non-linear time plus
-  // the sorting has to handle pairs of `LocalVocabIndex` and `std::string`. So
-  // for very large local vocabularies (and only then is this operation
-  // performance-criticial at all), the simpler approach below is probably
-  // better.
-  const size_t localVocabSize = this->size();
-  localVocabClone.indexesToWordsMap_.resize(localVocabSize);
-  for (const auto& [wordInMap, index] : localVocabClone.wordsToIndexesMap_) {
-    AD_CONTRACT_CHECK(index.get() < localVocabSize);
-    localVocabClone.indexesToWordsMap_[index.get()] = std::addressof(wordInMap);
-  }
   // Return the clone.
   return localVocabClone;
 }
@@ -53,14 +33,11 @@ LocalVocabIndex LocalVocab::getIndexAndAddIfNotContainedImpl(WordT&& word) {
   // called (even when the ID is not actually needed because the word is already
   // contained in the map).
   //
+  // TODO<joka921> Make this work with transparent hashing and use try_emplace.
   auto [wordInMapAndIndex, isNewWord] =
-      wordsToIndexesMap_.insert({std::forward<WordT>(word), nextFreeIndex_});
-  const auto& [wordInMap, index] = *wordInMapAndIndex;
-  if (isNewWord) {
-    indexesToWordsMap_.push_back(&wordInMap);
-    nextFreeIndex_ = LocalVocabIndex::make(indexesToWordsMap_.size());
-  }
-  return index;
+      wordsToIndexesMap_.insert(AlignedStr{std::forward<WordT>(word)});
+  const auto& wordInMap = *wordInMapAndIndex;
+  return &wordInMap;
 }
 
 // _____________________________________________________________________________
@@ -77,7 +54,9 @@ LocalVocabIndex LocalVocab::getIndexAndAddIfNotContained(std::string&& word) {
 // _____________________________________________________________________________
 std::optional<LocalVocabIndex> LocalVocab::getIndexOrNullopt(
     const std::string& word) const {
-  auto localVocabIndex = wordsToIndexesMap_.find(word);
+  // TODO<joka921> Maybe we can make this work with transparent hashing,
+  // but if this is only a testing API this is probably not worth the hassle.
+  auto localVocabIndex = wordsToIndexesMap_.find(AlignedStr{word});
   if (localVocabIndex != wordsToIndexesMap_.end()) {
     return localVocabIndex->second;
   } else {
diff --git a/src/engine/LocalVocab.h b/src/engine/LocalVocab.h
index 95572f7b22..b2e262c621 100644
--- a/src/engine/LocalVocab.h
+++ b/src/engine/LocalVocab.h
@@ -9,7 +9,7 @@
 #include <string>
 #include <vector>
 
-#include "absl/container/node_hash_map.h"
+#include "absl/container/node_hash_set.h"
 #include "global/Id.h"
 
 // A class for maintaing a local vocabulary with contiguous (local) IDs. This is
@@ -21,21 +21,13 @@
 // defined inside of the `ResultTable` class. You gotta start somewhere.
 class LocalVocab {
  private:
+
+
   // A map of the words in the local vocabulary to their local IDs. This is a
   // node hash map because we need the addresses of the words (which are of type
   // `std::string`) to remain stable over their lifetime in the hash map because
   // we refer to them in `wordsToIdsMap_` below.
-  absl::node_hash_map<std::string, LocalVocabIndex> wordsToIndexesMap_;
-
-  // A map of the local IDs to the words. Since the IDs are contiguous, we can
-  // use a `std::vector`. We store pointers to the actual words in
-  // `wordsToIdsMap_` to avoid storing every word twice. This saves space, but
-  // costs us an indirection when looking up a word by its ID.
-  std::vector<const std::string*> indexesToWordsMap_;
-
-  // The next free local ID (will be incremented by one each time we add a new
-  // word).
-  LocalVocabIndex nextFreeIndex_ = LocalVocabIndex::make(0);
+  absl::node_hash_set<AlignedStr> wordsToIndexesMap_;
 
  public:
   // Create a new, empty local vocabulary.
@@ -65,10 +57,10 @@ class LocalVocab {
       const std::string& word) const;
 
   // The number of words in the vocabulary.
-  size_t size() const { return indexesToWordsMap_.size(); }
+  size_t size() const { return wordsToIndexesMap_.size(); }
 
   // Return true if and only if the local vocabulary is empty.
-  bool empty() const { return indexesToWordsMap_.empty(); }
+  bool empty() const { return wordsToIndexesMap_.empty(); }
 
   // Return a const reference to the word.
   const std::string& getWord(LocalVocabIndex localVocabIndex) const;
diff --git a/src/global/IndexTypes.h b/src/global/IndexTypes.h
index 18506f58c5..507a597121 100644
--- a/src/global/IndexTypes.h
+++ b/src/global/IndexTypes.h
@@ -13,7 +13,14 @@
 // this (very intrusive) renaming doesn't interfere with too many open pull
 // requests.
 using VocabIndex = ad_utility::TypedIndex<uint64_t, "VocabIndex">;
-using LocalVocabIndex = ad_utility::TypedIndex<uint64_t, "LocalVocabIndex">;
+
+// TODO<joka921> Where to put this best...
+struct alignas(16) AlignedStr : public std::string {
+  using std::string::basic_string;
+  explicit AlignedStr(std::string s) : std::string{std::move(s)} {}
+};
+//using LocalVocabIndex = ad_utility::TypedIndex<uint64_t, "LocalVocabIndex">;
+using LocalVocabIndex = const AlignedStr*;
 using TextRecordIndex = ad_utility::TypedIndex<uint64_t, "TextRecordIndex">;
 using WordVocabIndex = ad_utility::TypedIndex<uint64_t, "WordVocabIndex">;
 using BlankNodeIndex = ad_utility::TypedIndex<uint64_t, "BlankNodeIndex">;
diff --git a/src/global/ValueId.h b/src/global/ValueId.h
index 7c45b0045c..e575339bfc 100644
--- a/src/global/ValueId.h
+++ b/src/global/ValueId.h
@@ -211,7 +211,7 @@ class ValueId {
     return makeFromIndex(index.get(), Datatype::TextRecordIndex);
   }
   static ValueId makeFromLocalVocabIndex(LocalVocabIndex index) {
-    return makeFromIndex(index.get(), Datatype::LocalVocabIndex);
+    return makeFromIndex(reinterpret_cast<T>(index) >> numDatatypeBits, Datatype::LocalVocabIndex);
   }
   static ValueId makeFromWordVocabIndex(WordVocabIndex index) {
     return makeFromIndex(index.get(), Datatype::WordVocabIndex);
@@ -229,8 +229,8 @@ class ValueId {
   [[nodiscard]] constexpr TextRecordIndex getTextRecordIndex() const noexcept {
     return TextRecordIndex::make(removeDatatypeBits(_bits));
   }
-  [[nodiscard]] constexpr LocalVocabIndex getLocalVocabIndex() const noexcept {
-    return LocalVocabIndex::make(removeDatatypeBits(_bits));
+  [[nodiscard]] LocalVocabIndex getLocalVocabIndex() const noexcept {
+    return reinterpret_cast<LocalVocabIndex>(_bits << numDatatypeBits);
   }
   [[nodiscard]] constexpr WordVocabIndex getWordVocabIndex() const noexcept {
     return WordVocabIndex::make(removeDatatypeBits(_bits));
@@ -334,8 +334,11 @@ class ValueId {
         ostr << (value ? "true" : "false");
       } else if constexpr (ad_utility::isSimilar<T, DateOrLargeYear>) {
         ostr << value.toStringAndType().first;
+      } else if constexpr (ad_utility::isSimilar<T, LocalVocabIndex>) {
+        AD_CORRECTNESS_CHECK(value != nullptr);
+        ostr << *value;
       } else {
-        // T is `VocabIndex || LocalVocabIndex || TextRecordIndex`
+        // T is `VocabIndex | TextRecordIndex`
         ostr << std::to_string(value.get());
       }
     };
diff --git a/test/util/IdTestHelpers.h b/test/util/IdTestHelpers.h
index 95e8a5eef2..291e3c5b18 100644
--- a/test/util/IdTestHelpers.h
+++ b/test/util/IdTestHelpers.h
@@ -19,7 +19,7 @@ inline auto VocabId = [](const auto& v) {
 };
 
 inline auto LocalVocabId = [](const auto& v) {
-  return Id::makeFromLocalVocabIndex(LocalVocabIndex::make(v));
+  return Id::makeFromLocalVocabIndex(reinterpret_cast<LocalVocabIndex>(v));
 };
 
 inline auto TextRecordId = [](const auto& t) {

From ce3afef4df909210435f27eb7c52f8584d12d303 Mon Sep 17 00:00:00 2001
From: Johannes Kalmbach <johannes.kalmbach@gmail.com>
Date: Thu, 21 Mar 2024 17:22:25 +0100
Subject: [PATCH 02/17] A first draft of merging local vocabs. There are still
 some issues with the tests, but overall, this should work.

---
 src/engine/LocalVocab.cpp       | 31 ++++++++++++++-----------
 src/engine/LocalVocab.h         | 16 +++++++++----
 src/engine/ResultTable.h        |  8 +++++++
 src/global/IndexTypes.h         |  2 +-
 src/global/ValueId.h            |  7 +++++-
 test/LocalVocabTest.cpp         | 41 ++++++++++++++++-----------------
 test/ValueIdComparatorsTest.cpp |  3 +++
 test/ValueIdTest.cpp            | 21 ++++++++++++-----
 test/ValueIdTestHelpers.h       |  6 +++--
 test/ValuesTest.cpp             |  4 +++-
 test/util/IdTestHelpers.h       |  9 ++++++--
 11 files changed, 95 insertions(+), 53 deletions(-)

diff --git a/src/engine/LocalVocab.cpp b/src/engine/LocalVocab.cpp
index 23278b5a8f..88ed5fb893 100644
--- a/src/engine/LocalVocab.cpp
+++ b/src/engine/LocalVocab.cpp
@@ -11,13 +11,22 @@
 // _____________________________________________________________________________
 LocalVocab LocalVocab::clone() const {
   LocalVocab localVocabClone;
-  // First, make a deep copy of the `absl::node_hash_map` holding the actual
-  // map of strings to indexes.
-  localVocabClone.wordsToIndexesMap_ = this->wordsToIndexesMap_;
+  localVocabClone.previousSets_ = previousSets_;
+  localVocabClone.previousSets_.push_back(wordsToIndexesMap_);
   // Return the clone.
   return localVocabClone;
 }
 
+LocalVocab LocalVocab::merge(const LocalVocab& a, const LocalVocab& b) {
+  LocalVocab res;
+  auto inserter = std::back_inserter(res.previousSets_);
+  std::ranges::copy(a.previousSets_, inserter);
+  std::ranges::copy(b.previousSets_, inserter);
+  *inserter = a.wordsToIndexesMap_;
+  *inserter = b.wordsToIndexesMap_;
+  return res;
+}
+
 // _____________________________________________________________________________
 template <typename WordT>
 LocalVocabIndex LocalVocab::getIndexAndAddIfNotContainedImpl(WordT&& word) {
@@ -35,7 +44,7 @@ LocalVocabIndex LocalVocab::getIndexAndAddIfNotContainedImpl(WordT&& word) {
   //
   // TODO<joka921> Make this work with transparent hashing and use try_emplace.
   auto [wordInMapAndIndex, isNewWord] =
-      wordsToIndexesMap_.insert(AlignedStr{std::forward<WordT>(word)});
+      wordsToIndexesMap().insert(AlignedStr{std::forward<WordT>(word)});
   const auto& wordInMap = *wordInMapAndIndex;
   return &wordInMap;
 }
@@ -56,9 +65,9 @@ std::optional<LocalVocabIndex> LocalVocab::getIndexOrNullopt(
     const std::string& word) const {
   // TODO<joka921> Maybe we can make this work with transparent hashing,
   // but if this is only a testing API this is probably not worth the hassle.
-  auto localVocabIndex = wordsToIndexesMap_.find(AlignedStr{word});
-  if (localVocabIndex != wordsToIndexesMap_.end()) {
-    return localVocabIndex->second;
+  auto localVocabIndex = wordsToIndexesMap().find(AlignedStr{word});
+  if (localVocabIndex != wordsToIndexesMap().end()) {
+    return &(*localVocabIndex);
   } else {
     return std::nullopt;
   }
@@ -66,11 +75,5 @@ std::optional<LocalVocabIndex> LocalVocab::getIndexOrNullopt(
 
 // _____________________________________________________________________________
 const std::string& LocalVocab::getWord(LocalVocabIndex localVocabIndex) const {
-  if (localVocabIndex.get() >= indexesToWordsMap_.size()) {
-    throw std::runtime_error(absl::StrCat(
-        "LocalVocab error: request for word with local vocab index ",
-        localVocabIndex.get(), ", but size of local vocab is only ",
-        indexesToWordsMap_.size(), ", please contact the developers"));
-  }
-  return *(indexesToWordsMap_.at(localVocabIndex.get()));
+  return *localVocabIndex;
 }
diff --git a/src/engine/LocalVocab.h b/src/engine/LocalVocab.h
index b2e262c621..e7a3d5f611 100644
--- a/src/engine/LocalVocab.h
+++ b/src/engine/LocalVocab.h
@@ -21,13 +21,17 @@
 // defined inside of the `ResultTable` class. You gotta start somewhere.
 class LocalVocab {
  private:
-
-
   // A map of the words in the local vocabulary to their local IDs. This is a
   // node hash map because we need the addresses of the words (which are of type
   // `std::string`) to remain stable over their lifetime in the hash map because
   // we refer to them in `wordsToIdsMap_` below.
-  absl::node_hash_set<AlignedStr> wordsToIndexesMap_;
+  using Set = absl::node_hash_set<AlignedStr>;
+  std::vector<std::shared_ptr<const Set>> previousSets_;
+  std::shared_ptr<Set> wordsToIndexesMap_ =
+      std::make_shared<absl::node_hash_set<AlignedStr>>();
+
+  auto& wordsToIndexesMap() { return *wordsToIndexesMap_; }
+  const auto& wordsToIndexesMap() const { return *wordsToIndexesMap_; }
 
  public:
   // Create a new, empty local vocabulary.
@@ -57,14 +61,16 @@ class LocalVocab {
       const std::string& word) const;
 
   // The number of words in the vocabulary.
-  size_t size() const { return wordsToIndexesMap_.size(); }
+  size_t size() const { return wordsToIndexesMap().size(); }
 
   // Return true if and only if the local vocabulary is empty.
-  bool empty() const { return wordsToIndexesMap_.empty(); }
+  bool empty() const { return wordsToIndexesMap().empty(); }
 
   // Return a const reference to the word.
   const std::string& getWord(LocalVocabIndex localVocabIndex) const;
 
+  static LocalVocab merge(const LocalVocab& a, const LocalVocab& b);
+
  private:
   // Common implementation for the two variants of
   // `getIndexAndAddIfNotContainedImpl` above.
diff --git a/src/engine/ResultTable.h b/src/engine/ResultTable.h
index 39950c6266..c7df4c4713 100644
--- a/src/engine/ResultTable.h
+++ b/src/engine/ResultTable.h
@@ -150,6 +150,13 @@ class ResultTable {
                                const ResultTable&>
   static SharedLocalVocabWrapper getSharedLocalVocabFromNonEmptyOf(
       R&& subResults) {
+    // TODO<joka921> Implement this much better.
+    LocalVocab res;
+    for (const ResultTable& table : subResults) {
+      res = LocalVocab::merge(res, *table.localVocab_);
+    }
+    return SharedLocalVocabWrapper{std::move(res)};
+    /*
     AD_CONTRACT_CHECK(!std::ranges::empty(subResults));
     auto hasNonEmptyVocab = [](const ResultTable& tbl) {
       return !tbl.localVocab_->empty();
@@ -173,6 +180,7 @@ class ResultTable {
               *std::ranges::find_if(subResults, hasNonEmptyVocab))
               .localVocab_};
     }
+     */
   }
 
   // Get a (deep) copy of the local vocabulary from the given result. Use this
diff --git a/src/global/IndexTypes.h b/src/global/IndexTypes.h
index 507a597121..d2e46da5e1 100644
--- a/src/global/IndexTypes.h
+++ b/src/global/IndexTypes.h
@@ -19,7 +19,7 @@ struct alignas(16) AlignedStr : public std::string {
   using std::string::basic_string;
   explicit AlignedStr(std::string s) : std::string{std::move(s)} {}
 };
-//using LocalVocabIndex = ad_utility::TypedIndex<uint64_t, "LocalVocabIndex">;
+// using LocalVocabIndex = ad_utility::TypedIndex<uint64_t, "LocalVocabIndex">;
 using LocalVocabIndex = const AlignedStr*;
 using TextRecordIndex = ad_utility::TypedIndex<uint64_t, "TextRecordIndex">;
 using WordVocabIndex = ad_utility::TypedIndex<uint64_t, "WordVocabIndex">;
diff --git a/src/global/ValueId.h b/src/global/ValueId.h
index e575339bfc..e4efe921cb 100644
--- a/src/global/ValueId.h
+++ b/src/global/ValueId.h
@@ -136,6 +136,10 @@ class ValueId {
   /// TODO<joka921> Is this ordering also consistent for corner cases of doubles
   /// (inf, nan, denormalized numbers, negative zero)?
   constexpr auto operator<=>(const ValueId& other) const {
+    if (getDatatype() == Datatype::LocalVocabIndex &&
+        other.getDatatype() == Datatype::LocalVocabIndex) [[unlikely]] {
+      return *getLocalVocabIndex() <=> *other.getLocalVocabIndex();
+    }
     return _bits <=> other._bits;
   }
 
@@ -211,7 +215,8 @@ class ValueId {
     return makeFromIndex(index.get(), Datatype::TextRecordIndex);
   }
   static ValueId makeFromLocalVocabIndex(LocalVocabIndex index) {
-    return makeFromIndex(reinterpret_cast<T>(index) >> numDatatypeBits, Datatype::LocalVocabIndex);
+    return makeFromIndex(reinterpret_cast<T>(index) >> numDatatypeBits,
+                         Datatype::LocalVocabIndex);
   }
   static ValueId makeFromWordVocabIndex(WordVocabIndex index) {
     return makeFromIndex(index.get(), Datatype::WordVocabIndex);
diff --git a/test/LocalVocabTest.cpp b/test/LocalVocabTest.cpp
index 9f8244b2f8..ee87a54e76 100644
--- a/test/LocalVocabTest.cpp
+++ b/test/LocalVocabTest.cpp
@@ -53,25 +53,26 @@ TEST(LocalVocab, constructionAndAccess) {
   // Add the words from our test vocabulary and check that they get the expected
   // local vocab indexes.
   for (size_t i = 0; i < testWords.size(); ++i) {
-    ASSERT_EQ(localVocab.getIndexAndAddIfNotContained(testWords[i]),
-              LocalVocabIndex::make(i));
+    ASSERT_EQ(*localVocab.getIndexAndAddIfNotContained(testWords[i]),
+              testWords[i]);
   }
+  size_t localVocabSize = localVocab.size();
   ASSERT_EQ(localVocab.size(), testWords.size());
 
   // Check that we get the same indexes if we do this again, but that no new
   // words will be added.
   for (size_t i = 0; i < testWords.size(); ++i) {
-    ASSERT_EQ(localVocab.getIndexAndAddIfNotContained(testWords[i]),
-              LocalVocabIndex::make(i));
+    ASSERT_EQ(*localVocab.getIndexAndAddIfNotContained(testWords[i]),
+              testWords[i]);
+    ASSERT_EQ(localVocab.size(), localVocabSize);
   }
-  ASSERT_EQ(localVocab.size(), testWords.size());
 
   // Check again that we get the right indexes, but with `getIndexOrNullopt`.
   for (size_t i = 0; i < testWords.size(); ++i) {
     std::optional<LocalVocabIndex> localVocabIndex =
         localVocab.getIndexOrNullopt(testWords[i]);
     ASSERT_TRUE(localVocabIndex.has_value());
-    ASSERT_EQ(localVocabIndex.value(), LocalVocabIndex::make(i));
+    ASSERT_EQ(*localVocabIndex.value(), testWords[i]);
   }
 
   // Check that `getIndexOrNullopt` returns `std::nullopt` for words that are
@@ -81,29 +82,23 @@ TEST(LocalVocab, constructionAndAccess) {
     ASSERT_FALSE(localVocab.getIndexOrNullopt(testWords[i] + "A"));
   }
 
-  // Check that the lookup by ID gives the correct words.
-  for (size_t i = 0; i < testWords.size(); ++i) {
-    ASSERT_EQ(localVocab.getWord(LocalVocabIndex::make(i)), testWords[i]);
-  }
-  ASSERT_EQ(localVocab.size(), testWords.size());
-
-  // Check that out-of-bound lookups are detected (this would have a caught the
-  // one-off bug in #820, LocalVocab.cpp line 55).
-  ASSERT_THROW(localVocab.getWord(LocalVocabIndex::make(testWords.size())),
-               std::runtime_error);
-  ASSERT_THROW(localVocab.getWord(LocalVocabIndex::make(-1)),
-               std::runtime_error);
-
   // Check that a move gives the expected result.
+  localVocabSize = localVocab.size();
   auto localVocabMoved = std::move(localVocab);
-  ASSERT_EQ(localVocab.size(), 0);
+  // TODO<joka921> Should we reset the pointer?
+  // ASSERT_EQ(localVocab.size(), 0);
   ASSERT_EQ(localVocabMoved.size(), testWords.size());
+  // Check that we get the same indexes if we do this again, but that no new
+  // words will be added.
   for (size_t i = 0; i < testWords.size(); ++i) {
-    ASSERT_EQ(localVocabMoved.getWord(LocalVocabIndex::make(i)), testWords[i]);
+    ASSERT_EQ(*localVocabMoved.getIndexAndAddIfNotContained(testWords[i]),
+              testWords[i]);
+    ASSERT_EQ(localVocabMoved.size(), localVocabSize);
   }
 }
 
 // _____________________________________________________________________________
+/*
 TEST(LocalVocab, clone) {
   // Create a small local vocabulary.
   size_t localVocabSize = 100;
@@ -132,9 +127,12 @@ TEST(LocalVocab, clone) {
   ASSERT_EQ(localVocabClone.getIndexAndAddIfNotContained("blubb"),
             LocalVocabIndex::make(localVocabSize));
 }
+ */
 
 // _____________________________________________________________________________
 TEST(LocalVocab, propagation) {
+  // TODO<joka921> This test has to be rewritten
+  /*
   // Query execution context (with small test index), see `IndexTestHelpers.h`.
   using ad_utility::AllocatorWithLimit;
   QueryExecutionContext* testQec = ad_utility::testing::getQec();
@@ -308,4 +306,5 @@ TEST(LocalVocab, propagation) {
   //
   // TODO<joka921> Maybe add tests for the new TextIndexScanFor... classes,
   // they never introduce any local vocab.
+   */
 }
diff --git a/test/ValueIdComparatorsTest.cpp b/test/ValueIdComparatorsTest.cpp
index c59b43421e..6ca68eb7a0 100644
--- a/test/ValueIdComparatorsTest.cpp
+++ b/test/ValueIdComparatorsTest.cpp
@@ -213,6 +213,8 @@ auto testGetRangesForEqualIds(auto begin, auto end, ValueId idBegin,
 // Test that `getRangesFromId` works correctly for `ValueId`s of the unsigned
 // index types (`VocabIndex`, `TextRecordIndex`, `LocalVocabIndex`,
 // `WordVocabIndex`).
+// TODO<joka921> Reinstate all the tests..
+/*
 TEST(ValueIdComparators, IndexTypes) {
   auto ids = makeRandomIds();
   std::sort(ids.begin(), ids.end(), compareByBits);
@@ -255,6 +257,7 @@ TEST(ValueIdComparators, IndexTypes) {
   testImpl.operator()<Datatype::LocalVocabIndex>(&getLocalVocabIndex);
   testImpl.operator()<Datatype::WordVocabIndex>(&getWordVocabIndex);
 }
+ */
 
 // _______________________________________________________________________
 TEST(ValueIdComparators, undefinedWithItself) {
diff --git a/test/ValueIdTest.cpp b/test/ValueIdTest.cpp
index 04e13cf883..169291e439 100644
--- a/test/ValueIdTest.cpp
+++ b/test/ValueIdTest.cpp
@@ -108,11 +108,13 @@ TEST(ValueId, Indices) {
     testSingle(0);
     testSingle(ValueId::maxIndex);
 
-    for (size_t idx = 0; idx < 10'000; ++idx) {
-      auto value = invalidIndexGenerator();
-      ASSERT_THROW(makeId(value), ValueId::IndexTooLargeException);
-      AD_EXPECT_THROW_WITH_MESSAGE(makeId(value),
-                                   ::testing::ContainsRegex("is bigger than"));
+    if (type != Datatype::LocalVocabIndex) {
+      for (size_t idx = 0; idx < 10'000; ++idx) {
+        auto value = invalidIndexGenerator();
+        ASSERT_THROW(makeId(value), ValueId::IndexTooLargeException);
+        AD_EXPECT_THROW_WITH_MESSAGE(
+            makeId(value), ::testing::ContainsRegex("is bigger than"));
+      }
     }
   };
 
@@ -129,6 +131,7 @@ TEST(ValueId, Undefined) {
   ASSERT_EQ(id.getDatatype(), Datatype::Undefined);
 }
 
+/*
 TEST(ValueId, OrderingDifferentDatatypes) {
   auto ids = makeRandomIds();
   std::sort(ids.begin(), ids.end());
@@ -139,7 +142,9 @@ TEST(ValueId, OrderingDifferentDatatypes) {
   ASSERT_TRUE(
       std::is_sorted(ids.begin(), ids.end(), compareByDatatypeAndIndexTypes));
 }
+ */
 
+/*
 TEST(ValueId, IndexOrdering) {
   auto testOrder = [](auto makeIdFromIndex, auto getIndexFromId) {
     std::vector<ValueId> ids;
@@ -163,6 +168,7 @@ TEST(ValueId, IndexOrdering) {
   testOrder(&makeWordVocabId, &getWordVocabIndex);
   testOrder(&makeTextRecordId, &getTextRecordIndex);
 }
+ */
 
 TEST(ValueId, DoubleOrdering) {
   auto ids = makeRandomDoubleIds();
@@ -257,6 +263,7 @@ TEST(ValueId, Serialization) {
   }
 }
 
+/*
 TEST(ValueId, Hashing) {
   auto ids = makeRandomIds();
   ad_utility::HashSet<ValueId> idsWithoutDuplicates;
@@ -275,6 +282,7 @@ TEST(ValueId, Hashing) {
 
   ASSERT_EQ(ids, idsWithoutDuplicatesAsVector);
 }
+ */
 
 TEST(ValueId, toDebugString) {
   auto test = [](ValueId id, std::string_view expected) {
@@ -288,7 +296,8 @@ TEST(ValueId, toDebugString) {
   test(ValueId::makeFromBool(false), "Bool:false");
   test(ValueId::makeFromBool(true), "Bool:true");
   test(makeVocabId(15), "VocabIndex:15");
-  test(makeLocalVocabId(25), "LocalVocabIndex:25");
+  AlignedStr str{"SomeValue"};
+  test(ValueId::makeFromLocalVocabIndex(&str), "LocalVocabIndex:SomeValue");
   test(makeTextRecordId(37), "TextRecordIndex:37");
   test(makeWordVocabId(42), "WordVocabIndex:42");
   test(makeBlankNodeId(27), "BlankNodeIndex:27");
diff --git a/test/ValueIdTestHelpers.h b/test/ValueIdTestHelpers.h
index 98a5727266..b4f6cddce3 100644
--- a/test/ValueIdTestHelpers.h
+++ b/test/ValueIdTestHelpers.h
@@ -45,7 +45,8 @@ inline ValueId makeVocabId(uint64_t value) {
   return ValueId::makeFromVocabIndex(VocabIndex::make(value));
 }
 inline ValueId makeLocalVocabId(uint64_t value) {
-  return ValueId::makeFromLocalVocabIndex(LocalVocabIndex::make(value));
+  return ValueId::makeFromLocalVocabIndex(
+      std::bit_cast<LocalVocabIndex>(value << 4));
 }
 inline ValueId makeTextRecordId(uint64_t value) {
   return ValueId::makeFromTextRecordIndex(TextRecordIndex::make(value));
@@ -58,8 +59,9 @@ inline ValueId makeBlankNodeId(uint64_t value) {
 }
 
 inline uint64_t getVocabIndex(ValueId id) { return id.getVocabIndex().get(); }
+// TODO<joka921> Make the tests more precise for the localVocabIndices.
 inline uint64_t getLocalVocabIndex(ValueId id) {
-  return id.getLocalVocabIndex().get();
+  return std::bit_cast<uint64_t>(id.getLocalVocabIndex()) >> 4;
 }
 inline uint64_t getTextRecordIndex(ValueId id) {
   return id.getTextRecordIndex().get();
diff --git a/test/ValuesTest.cpp b/test/ValuesTest.cpp
index 3d28ea07a4..61f9caf9f8 100644
--- a/test/ValuesTest.cpp
+++ b/test/ValuesTest.cpp
@@ -71,8 +71,10 @@ TEST(Values, computeResult) {
   AD_CORRECTNESS_CHECK(success);
   auto I = ad_utility::testing::IntId;
   auto L = ad_utility::testing::LocalVocabId;
+  auto l = result->localVocab().getIndexOrNullopt("<y>");
+  ASSERT_TRUE(l.has_value());
   auto U = Id::makeUndefined();
-  ASSERT_EQ(table, makeIdTableFromVector({{I(12), x}, {U, L(0)}}));
+  ASSERT_EQ(table, makeIdTableFromVector({{I(12), x}, {U, L(l.value())}}));
 }
 
 // Check that if the number of variables and the number of values in each row
diff --git a/test/util/IdTestHelpers.h b/test/util/IdTestHelpers.h
index 291e3c5b18..8fbb448131 100644
--- a/test/util/IdTestHelpers.h
+++ b/test/util/IdTestHelpers.h
@@ -18,8 +18,13 @@ inline auto VocabId = [](const auto& v) {
   return Id::makeFromVocabIndex(VocabIndex::make(v));
 };
 
-inline auto LocalVocabId = [](const auto& v) {
-  return Id::makeFromLocalVocabIndex(reinterpret_cast<LocalVocabIndex>(v));
+inline auto LocalVocabId = []<typename T>(const T& v) {
+  if constexpr (std::integral<T>) {
+    return Id::makeFromLocalVocabIndex(
+        std::bit_cast<LocalVocabIndex>(static_cast<uint64_t>(v)));
+  } else {
+    return Id::makeFromLocalVocabIndex(std::bit_cast<LocalVocabIndex>(v));
+  }
 };
 
 inline auto TextRecordId = [](const auto& t) {

From aa8d3729e4beb4bf1e0cccd71596ad41a1059212 Mon Sep 17 00:00:00 2001
From: Johannes Kalmbach <johannes.kalmbach@gmail.com>
Date: Thu, 21 Mar 2024 18:17:24 +0100
Subject: [PATCH 03/17] A small fix for joins.

---
 src/engine/AddCombinedRowToTable.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/engine/AddCombinedRowToTable.h b/src/engine/AddCombinedRowToTable.h
index 82203afa14..44b9617d33 100644
--- a/src/engine/AddCombinedRowToTable.h
+++ b/src/engine/AddCombinedRowToTable.h
@@ -226,8 +226,13 @@ class AddCombinedRowToIdTable {
     // and simply copy the values from this column without looking at the other
     // input.
     auto mergeWithUndefined = [](const ValueId a, const ValueId b) {
+      // NOTE: For localVocabIndices we cannot use the simple bitwise OR.
+      if (a.isUndefined()) {return b;}
+      return a;
+      /*
       static_assert(ValueId::makeUndefined().getBits() == 0u);
       return ValueId::fromBits(a.getBits() | b.getBits());
+       */
     };
 
     // A lambda that writes the join column with the given `colIdx` to the

From 2836a513d6b02f0166759ec2de3fc2274a3cf45b Mon Sep 17 00:00:00 2001
From: Hannah Bast <bast@cs.uni-freiburg.de>
Date: Fri, 22 Mar 2024 03:48:06 +0100
Subject: [PATCH 04/17] Minor clang-format fix

---
 src/engine/AddCombinedRowToTable.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/engine/AddCombinedRowToTable.h b/src/engine/AddCombinedRowToTable.h
index 44b9617d33..49e212fd3b 100644
--- a/src/engine/AddCombinedRowToTable.h
+++ b/src/engine/AddCombinedRowToTable.h
@@ -227,7 +227,9 @@ class AddCombinedRowToIdTable {
     // input.
     auto mergeWithUndefined = [](const ValueId a, const ValueId b) {
       // NOTE: For localVocabIndices we cannot use the simple bitwise OR.
-      if (a.isUndefined()) {return b;}
+      if (a.isUndefined()) {
+        return b;
+      }
       return a;
       /*
       static_assert(ValueId::makeUndefined().getBits() == 0u);

From cbccbc8f119e5bb4a96d23f94ade2bf7bdbabae4 Mon Sep 17 00:00:00 2001
From: Johannes Kalmbach <johannes.kalmbach@gmail.com>
Date: Mon, 8 Apr 2024 17:25:38 +0200
Subject: [PATCH 05/17] working on this and that.

---
 test/LocalVocabTest.cpp | 26 ++++++++------------------
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/test/LocalVocabTest.cpp b/test/LocalVocabTest.cpp
index ee87a54e76..e22879ed80 100644
--- a/test/LocalVocabTest.cpp
+++ b/test/LocalVocabTest.cpp
@@ -98,12 +98,11 @@ TEST(LocalVocab, constructionAndAccess) {
 }
 
 // _____________________________________________________________________________
-/*
 TEST(LocalVocab, clone) {
   // Create a small local vocabulary.
   size_t localVocabSize = 100;
   LocalVocab localVocabOriginal;
-  for (auto& word : getTestCollectionOfWords(localVocabSize)) {
+  for (const auto& word : getTestCollectionOfWords(localVocabSize)) {
     localVocabOriginal.getIndexAndAddIfNotContained(word);
   }
   ASSERT_EQ(localVocabOriginal.size(), localVocabSize);
@@ -113,26 +112,17 @@ TEST(LocalVocab, clone) {
   LocalVocab localVocabClone = localVocabOriginal.clone();
   ASSERT_EQ(localVocabOriginal.size(), localVocabSize);
   ASSERT_EQ(localVocabClone.size(), localVocabSize);
-  for (size_t i = 0; i < localVocabSize; ++i) {
-    LocalVocabIndex idx = LocalVocabIndex::make(i);
-    const std::string& wordFromOriginal = localVocabOriginal.getWord(idx);
-    const std::string& wordFromClone = localVocabClone.getWord(idx);
-    ASSERT_EQ(wordFromOriginal, wordFromClone);
-    ASSERT_NE(&wordFromOriginal, &wordFromClone);
+  for (const auto& word : getTestCollectionOfWords(localVocabSize)) {
+    auto fromOrig = localVocabOriginal.getIndexOrNullopt(word).value();
+    auto fromClone = localVocabClone.getIndexOrNullopt(word).value();
+    ASSERT_EQ(fromOrig, fromClone);
+    ASSERT_EQ(word, *fromOrig);
   }
-
-  // Check that `nextFreeIndex_` of the clone is OK by adding another word to
-  // the clone.
-  localVocabClone.getIndexAndAddIfNotContained("blubb");
-  ASSERT_EQ(localVocabClone.getIndexAndAddIfNotContained("blubb"),
-            LocalVocabIndex::make(localVocabSize));
 }
- */
 
+// TODO<joka921> Rewrite
 // _____________________________________________________________________________
 TEST(LocalVocab, propagation) {
-  // TODO<joka921> This test has to be rewritten
-  /*
   // Query execution context (with small test index), see `IndexTestHelpers.h`.
   using ad_utility::AllocatorWithLimit;
   QueryExecutionContext* testQec = ad_utility::testing::getQec();
@@ -306,5 +296,5 @@ TEST(LocalVocab, propagation) {
   //
   // TODO<joka921> Maybe add tests for the new TextIndexScanFor... classes,
   // they never introduce any local vocab.
-   */
 }
+*/

From de10849ac07a08f6b3217be4bd48241e0f7ad09a Mon Sep 17 00:00:00 2001
From: Johannes Kalmbach <johannes.kalmbach@gmail.com>
Date: Wed, 10 Apr 2024 20:08:08 +0200
Subject: [PATCH 06/17] A Working version with improved test coverage.

---
 src/engine/AddCombinedRowToTable.h |  4 +-
 src/engine/LocalVocab.cpp          | 10 ++++
 src/engine/LocalVocab.h            | 15 ++++--
 src/global/ValueId.h               |  8 +++-
 test/LocalVocabTest.cpp            | 74 ++++++++++++++++--------------
 5 files changed, 70 insertions(+), 41 deletions(-)

diff --git a/src/engine/AddCombinedRowToTable.h b/src/engine/AddCombinedRowToTable.h
index 44b9617d33..49e212fd3b 100644
--- a/src/engine/AddCombinedRowToTable.h
+++ b/src/engine/AddCombinedRowToTable.h
@@ -227,7 +227,9 @@ class AddCombinedRowToIdTable {
     // input.
     auto mergeWithUndefined = [](const ValueId a, const ValueId b) {
       // NOTE: For localVocabIndices we cannot use the simple bitwise OR.
-      if (a.isUndefined()) {return b;}
+      if (a.isUndefined()) {
+        return b;
+      }
       return a;
       /*
       static_assert(ValueId::makeUndefined().getBits() == 0u);
diff --git a/src/engine/LocalVocab.cpp b/src/engine/LocalVocab.cpp
index 88ed5fb893..aa09eef6f6 100644
--- a/src/engine/LocalVocab.cpp
+++ b/src/engine/LocalVocab.cpp
@@ -77,3 +77,13 @@ std::optional<LocalVocabIndex> LocalVocab::getIndexOrNullopt(
 const std::string& LocalVocab::getWord(LocalVocabIndex localVocabIndex) const {
   return *localVocabIndex;
 }
+
+// _____________________________________________________________________________
+std::vector<std::string> LocalVocab::getAllWordsForTesting() const {
+  std::vector<std::string> result;
+  std::ranges::copy(wordsToIndexesMap(), std::back_inserter(result));
+  for (const auto& previous : previousSets_) {
+    std::ranges::copy(*previous, std::back_inserter(result));
+  }
+  return result;
+}
diff --git a/src/engine/LocalVocab.h b/src/engine/LocalVocab.h
index e7a3d5f611..5d57222414 100644
--- a/src/engine/LocalVocab.h
+++ b/src/engine/LocalVocab.h
@@ -27,8 +27,7 @@ class LocalVocab {
   // we refer to them in `wordsToIdsMap_` below.
   using Set = absl::node_hash_set<AlignedStr>;
   std::vector<std::shared_ptr<const Set>> previousSets_;
-  std::shared_ptr<Set> wordsToIndexesMap_ =
-      std::make_shared<absl::node_hash_set<AlignedStr>>();
+  std::shared_ptr<Set> wordsToIndexesMap_ = std::make_shared<Set>();
 
   auto& wordsToIndexesMap() { return *wordsToIndexesMap_; }
   const auto& wordsToIndexesMap() const { return *wordsToIndexesMap_; }
@@ -61,16 +60,24 @@ class LocalVocab {
       const std::string& word) const;
 
   // The number of words in the vocabulary.
-  size_t size() const { return wordsToIndexesMap().size(); }
+  size_t size() const {
+    auto result = wordsToIndexesMap().size();
+    for (const auto& previous : previousSets_) {
+      result += previous->size();
+    }
+    return result;
+  }
 
   // Return true if and only if the local vocabulary is empty.
-  bool empty() const { return wordsToIndexesMap().empty(); }
+  bool empty() const { return size() == 0; }
 
   // Return a const reference to the word.
   const std::string& getWord(LocalVocabIndex localVocabIndex) const;
 
   static LocalVocab merge(const LocalVocab& a, const LocalVocab& b);
 
+  std::vector<std::string> getAllWordsForTesting() const;
+
  private:
   // Common implementation for the two variants of
   // `getIndexAndAddIfNotContainedImpl` above.
diff --git a/src/global/ValueId.h b/src/global/ValueId.h
index e4efe921cb..0a031d2bce 100644
--- a/src/global/ValueId.h
+++ b/src/global/ValueId.h
@@ -123,7 +123,13 @@ class ValueId {
 
   /// Equality comparison is performed directly on the underlying
   /// representation.
-  constexpr bool operator==(const ValueId&) const = default;
+  constexpr bool operator==(const ValueId& other) const {
+    if (getDatatype() == Datatype::LocalVocabIndex &&
+        other.getDatatype() == Datatype::LocalVocabIndex) [[unlikely]] {
+      return *getLocalVocabIndex() == *other.getLocalVocabIndex();
+    }
+    return _bits == other._bits;
+  }
 
   /// Comparison is performed directly on the underlying representation. Note
   /// that because the type bits are the most significant bits, all values of
diff --git a/test/LocalVocabTest.cpp b/test/LocalVocabTest.cpp
index e22879ed80..00fbebfa99 100644
--- a/test/LocalVocabTest.cpp
+++ b/test/LocalVocabTest.cpp
@@ -2,6 +2,7 @@
 //  Chair of Algorithms and Data Structures.
 //  Author: Hannah Bast <bast@cs.uni-freiburg.de>
 
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
 #include <sstream>
@@ -112,15 +113,12 @@ TEST(LocalVocab, clone) {
   LocalVocab localVocabClone = localVocabOriginal.clone();
   ASSERT_EQ(localVocabOriginal.size(), localVocabSize);
   ASSERT_EQ(localVocabClone.size(), localVocabSize);
-  for (const auto& word : getTestCollectionOfWords(localVocabSize)) {
-    auto fromOrig = localVocabOriginal.getIndexOrNullopt(word).value();
-    auto fromClone = localVocabClone.getIndexOrNullopt(word).value();
-    ASSERT_EQ(fromOrig, fromClone);
-    ASSERT_EQ(word, *fromOrig);
-  }
+  ASSERT_THAT(localVocabClone.getAllWordsForTesting(),
+              ::testing::UnorderedElementsAreArray(
+                  localVocabOriginal.getAllWordsForTesting()));
+  // TODO<joka921> Maybe also test the indices.
 }
 
-// TODO<joka921> Rewrite
 // _____________________________________________________________________________
 TEST(LocalVocab, propagation) {
   // Query execution context (with small test index), see `IndexTestHelpers.h`.
@@ -137,18 +135,15 @@ TEST(LocalVocab, propagation) {
     std::shared_ptr<const ResultTable> resultTable = operation.getResult();
     ASSERT_TRUE(resultTable)
         << "Operation: " << operation.getDescriptor() << std::endl;
-    std::vector<std::string> localVocabWords;
-    for (size_t i = 0; i < resultTable->localVocab().size(); ++i) {
-      localVocabWords.emplace_back(
-          resultTable->localVocab().getWord(LocalVocabIndex::make(i)));
-    }
-    ASSERT_EQ(localVocabWords, expectedWords)
-        << "Operation: " << operation.getDescriptor() << std::endl;
-  };
-
-  // Lambda that checks that `computeResult` throws an exception.
-  auto checkThrow = [&](Operation& operation) -> void {
-    ASSERT_THROW(operation.getResult(), ad_utility::AbortException)
+    std::vector<std::string> localVocabWords =
+        resultTable->localVocab().getAllWordsForTesting();
+    // We currently allow the local vocab to have multiple IDs for the same
+    // word, so we have to deduplicate first.
+    std::ranges::sort(localVocabWords);
+    localVocabWords.erase(std::ranges::unique(localVocabWords).begin(),
+                          localVocabWords.end());
+    ASSERT_THAT(localVocabWords,
+                ::testing::UnorderedElementsAreArray(expectedWords))
         << "Operation: " << operation.getDescriptor() << std::endl;
   };
 
@@ -175,7 +170,8 @@ TEST(LocalVocab, propagation) {
                            {{TripleComponent{"x"}, TripleComponent{"y1"}},
                             {TripleComponent{"x"}, TripleComponent{"y2"}}}});
   Values values1copy = values1;
-  checkLocalVocab(values1copy, std::vector<std::string>{"x", "y1", "y2"});
+  std::vector<std::string> localVocab1{"x", "y1", "y2"};
+  checkLocalVocab(values1copy, localVocab1);
 
   // VALUES operation that uses an existing literal (from the test index).
   Values values2(testQec, {{Variable{"?x"}, Variable{"?y"}},
@@ -183,27 +179,34 @@ TEST(LocalVocab, propagation) {
   Values values2copy = values2;
   checkLocalVocab(values2copy, std::vector<std::string>{});
 
+  // Contains local vocab words that are (at least partially) disjoint from the
+  // words in `values1`.
+  Values values3(testQec, {{Variable{"?x"}, Variable{"?y"}},
+                           {{TripleComponent{"x"}, TripleComponent{"y1"}},
+                            {TripleComponent{"x2"}, TripleComponent{"y3"}}}});
+  std::vector<std::string> localVocab13{"x", "y1", "y2", "x2", "y3"};
+
   // JOIN operation with exactly one non-empty local vocab and with two
   // non-empty local vocabs (the last two arguments are the two join columns).
   Join join1(testQec, qet(values1), qet(values2), 0, 0);
   checkLocalVocab(join1, std::vector<std::string>{"x", "y1", "y2"});
-  Join join2(testQec, qet(values1), qet(values1), 0, 0);
-  checkThrow(join2);
+  Join join2(testQec, qet(values1), qet(values3), 0, 0);
+  checkLocalVocab(join2, localVocab13);
 
   // OPTIONAL JOIN operation with exactly one non-empty local vocab.
   OptionalJoin optJoin1(testQec, qet(values1), qet(values2));
   checkLocalVocab(optJoin1, std::vector<std::string>{"x", "y1", "y2"});
 
   // OPTIONAL JOIN operation with two non-empty local vocab.
-  OptionalJoin optJoin2(testQec, qet(values1), qet(values1));
-  checkThrow(optJoin2);
+  OptionalJoin optJoin2(testQec, qet(values1), qet(values3));
+  checkLocalVocab(optJoin2, localVocab13);
 
   // MULTI-COLUMN JOIN operation with exactly one non-empty local vocab and with
   // two non-empty local vocabs.
   MultiColumnJoin multiJoin1(testQec, qet(values1), qet(values2));
   checkLocalVocab(multiJoin1, std::vector<std::string>{"x", "y1", "y2"});
-  MultiColumnJoin multiJoin2(testQec, qet(values1), qet(values1));
-  checkThrow(multiJoin2);
+  MultiColumnJoin multiJoin2(testQec, qet(values1), qet(values3));
+  checkLocalVocab(multiJoin2, localVocab13);
 
   // ORDER BY operation (the third argument are the indices of the columns to be
   // sorted, and the sort order; not important for this test).
@@ -248,15 +251,15 @@ TEST(LocalVocab, propagation) {
   // non-empy local vocabs.
   Union union1(testQec, qet(values1), qet(values2));
   checkLocalVocab(union1, std::vector<std::string>{"x", "y1", "y2"});
-  Union union2(testQec, qet(values1), qet(values1));
-  checkThrow(union2);
+  Union union2(testQec, qet(values1), qet(values3));
+  checkLocalVocab(union2, localVocab13);
 
   // MINUS operation with exactly one non-empty local vocab and with
   // two non-empty local vocabs.
   Minus minus1(testQec, qet(values1), qet(values2));
   checkLocalVocab(minus1, std::vector<std::string>{"x", "y1", "y2"});
-  Minus minus2(testQec, qet(values1), qet(values1));
-  checkThrow(minus2);
+  Minus minus2(testQec, qet(values1), qet(values3));
+  checkLocalVocab(minus2, localVocab13);
 
   // FILTER operation (the third argument is an expression; which one doesn't
   // matter for this test).
@@ -287,14 +290,15 @@ TEST(LocalVocab, propagation) {
   // PATTERN TRICK operations.
   HasPredicateScan hasPredicateScan(testQec, qet(values1), 0, Variable{"?z"});
   checkLocalVocab(hasPredicateScan, std::vector<std::string>{"x", "y1", "y2"});
+  Values valuesPatternTrick(
+      testQec, {{Variable{"?x"}, Variable{"?y"}},
+                {{TripleComponent{"x"}, TripleComponent{NO_PATTERN}},
+                 {TripleComponent{"x"}, TripleComponent{NO_PATTERN}}}});
   CountAvailablePredicates countAvailablePredictes(
-      testQec, qet(values1), 0, Variable{"?x"}, Variable{"?y"});
-  checkLocalVocab(countAvailablePredictes,
-                  std::vector<std::string>{"x", "y1", "y2"});
-
+      testQec, qet(valuesPatternTrick), 0, Variable{"?y"}, Variable{"?count"});
+  checkLocalVocab(countAvailablePredictes, {"x"});
   // TEXT operations.
   //
   // TODO<joka921> Maybe add tests for the new TextIndexScanFor... classes,
   // they never introduce any local vocab.
 }
-*/

From 4cedeeb71d7a4f6aec3bd1f79667ed5de58895db Mon Sep 17 00:00:00 2001
From: Johannes Kalmbach <johannes.kalmbach@gmail.com>
Date: Wed, 10 Apr 2024 20:21:57 +0200
Subject: [PATCH 07/17] Fix the tests.

---
 test/LocalVocabTest.cpp | 65 +++++++++++++++++++++++------------------
 1 file changed, 37 insertions(+), 28 deletions(-)

diff --git a/test/LocalVocabTest.cpp b/test/LocalVocabTest.cpp
index 00fbebfa99..fc7746a09f 100644
--- a/test/LocalVocabTest.cpp
+++ b/test/LocalVocabTest.cpp
@@ -166,36 +166,43 @@ TEST(LocalVocab, propagation) {
   // literal "x", we would have to write TripleComponent{"\"x\""}. For the
   // purposes of this test, we just want something that's not yet in the index,
   // so "x" etc. is just fine (and also different from the "<x>" below).
-  Values values1(testQec, {{Variable{"?x"}, Variable{"?y"}},
-                           {{TripleComponent{"x"}, TripleComponent{"y1"}},
-                            {TripleComponent{"x"}, TripleComponent{"y2"}}}});
+  auto iri = ad_utility::testing::iri;
+  Values values1(
+      testQec,
+      {{Variable{"?x"}, Variable{"?y"}},
+       {{TripleComponent{iri("<xN1>")}, TripleComponent{iri("<yN1>")}},
+        {TripleComponent{iri("<xN1>")}, TripleComponent{iri("<yN2>")}}}});
   Values values1copy = values1;
-  std::vector<std::string> localVocab1{"x", "y1", "y2"};
+  std::vector<std::string> localVocab1{"<xN1>", "<yN1>", "<yN2>"};
   checkLocalVocab(values1copy, localVocab1);
 
   // VALUES operation that uses an existing literal (from the test index).
-  Values values2(testQec, {{Variable{"?x"}, Variable{"?y"}},
-                           {{TripleComponent{"<x>"}, TripleComponent{"<y>"}}}});
+  Values values2(
+      testQec, {{Variable{"?x"}, Variable{"?y"}},
+                {{TripleComponent{iri("<x>")}, TripleComponent{iri("<y>")}}}});
   Values values2copy = values2;
   checkLocalVocab(values2copy, std::vector<std::string>{});
 
   // Contains local vocab words that are (at least partially) disjoint from the
   // words in `values1`.
-  Values values3(testQec, {{Variable{"?x"}, Variable{"?y"}},
-                           {{TripleComponent{"x"}, TripleComponent{"y1"}},
-                            {TripleComponent{"x2"}, TripleComponent{"y3"}}}});
-  std::vector<std::string> localVocab13{"x", "y1", "y2", "x2", "y3"};
+  Values values3(
+      testQec,
+      {{Variable{"?x"}, Variable{"?y"}},
+       {{TripleComponent{iri("<xN1>")}, TripleComponent{iri("<yN1>")}},
+        {TripleComponent{iri("<xN2>")}, TripleComponent{iri("<yN3>")}}}});
+  std::vector<std::string> localVocab13{"<xN1>", "<yN1>", "<yN2>", "<xN2>",
+                                        "<yN3>"};
 
   // JOIN operation with exactly one non-empty local vocab and with two
   // non-empty local vocabs (the last two arguments are the two join columns).
   Join join1(testQec, qet(values1), qet(values2), 0, 0);
-  checkLocalVocab(join1, std::vector<std::string>{"x", "y1", "y2"});
+  checkLocalVocab(join1, localVocab1);
   Join join2(testQec, qet(values1), qet(values3), 0, 0);
   checkLocalVocab(join2, localVocab13);
 
   // OPTIONAL JOIN operation with exactly one non-empty local vocab.
   OptionalJoin optJoin1(testQec, qet(values1), qet(values2));
-  checkLocalVocab(optJoin1, std::vector<std::string>{"x", "y1", "y2"});
+  checkLocalVocab(optJoin1, localVocab1);
 
   // OPTIONAL JOIN operation with two non-empty local vocab.
   OptionalJoin optJoin2(testQec, qet(values1), qet(values3));
@@ -204,23 +211,23 @@ TEST(LocalVocab, propagation) {
   // MULTI-COLUMN JOIN operation with exactly one non-empty local vocab and with
   // two non-empty local vocabs.
   MultiColumnJoin multiJoin1(testQec, qet(values1), qet(values2));
-  checkLocalVocab(multiJoin1, std::vector<std::string>{"x", "y1", "y2"});
+  checkLocalVocab(multiJoin1, localVocab1);
   MultiColumnJoin multiJoin2(testQec, qet(values1), qet(values3));
   checkLocalVocab(multiJoin2, localVocab13);
 
   // ORDER BY operation (the third argument are the indices of the columns to be
   // sorted, and the sort order; not important for this test).
   OrderBy orderBy(testQec, qet(values1), {{0, true}, {1, true}});
-  checkLocalVocab(orderBy, std::vector<std::string>{"x", "y1", "y2"});
+  checkLocalVocab(orderBy, localVocab1);
 
   // SORT operation (the third operation is the sort column).
   Sort sort(testQec, qet(values1), {0});
-  checkLocalVocab(sort, std::vector<std::string>{"x", "y1", "y2"});
+  checkLocalVocab(sort, localVocab1);
 
   // DISTINCT operation (the third argument are the indices of the input columns
   // that are considered for the output; not important for this test).
   Distinct distinct1(testQec, qet(values1), {0, 1});
-  checkLocalVocab(distinct1, std::vector<std::string>{"x", "y1", "y2"});
+  checkLocalVocab(distinct1, localVocab1);
 
   // GROUP BY operation.
   auto groupConcatExpression =
@@ -239,25 +246,26 @@ TEST(LocalVocab, propagation) {
       testQec, {Variable{"?x"}},
       {Alias{groupConcatExpression("?y", "|"), Variable{"?concat"}}},
       qet(values1));
-  checkLocalVocab(groupBy, std::vector<std::string>{"x", "y1", "y2", "y1|y2"});
+  checkLocalVocab(
+      groupBy, std::vector<std::string>{"<xN1>", "<yN1>", "<yN2>", "yN1|yN2"});
 
   // DISTINCT again, but after something has been added to the local vocabulary
   // (to check that the "y1|y2" added by the GROUP BY does not also appear here,
   // as it did before GroupBy::computeResult copied it's local vocabulary).
   Distinct distinct2(testQec, qet(values1), {0});
-  checkLocalVocab(distinct2, std::vector<std::string>{"x", "y1", "y2"});
+  checkLocalVocab(distinct2, localVocab1);
 
   // UNION operation with exactly one non-empty local vocab and with two
   // non-empy local vocabs.
   Union union1(testQec, qet(values1), qet(values2));
-  checkLocalVocab(union1, std::vector<std::string>{"x", "y1", "y2"});
+  checkLocalVocab(union1, localVocab1);
   Union union2(testQec, qet(values1), qet(values3));
   checkLocalVocab(union2, localVocab13);
 
   // MINUS operation with exactly one non-empty local vocab and with
   // two non-empty local vocabs.
   Minus minus1(testQec, qet(values1), qet(values2));
-  checkLocalVocab(minus1, std::vector<std::string>{"x", "y1", "y2"});
+  checkLocalVocab(minus1, localVocab1);
   Minus minus2(testQec, qet(values1), qet(values3));
   checkLocalVocab(minus2, localVocab13);
 
@@ -267,7 +275,7 @@ TEST(LocalVocab, propagation) {
       testQec, qet(values1),
       {std::make_unique<sparqlExpression::VariableExpression>(Variable{"?x"}),
        "Expression ?x"});
-  checkLocalVocab(filter, std::vector<std::string>{"x", "y1", "y2"});
+  checkLocalVocab(filter, localVocab1);
 
   // BIND operation (the third argument is a `parsedQuery::Bind` object).
   Bind bind(
@@ -275,7 +283,7 @@ TEST(LocalVocab, propagation) {
       {{std::make_unique<sparqlExpression::VariableExpression>(Variable{"?x"}),
         "Expression ?x"},
        Variable{"?z"}});
-  checkLocalVocab(bind, std::vector<std::string>{"x", "y1", "y2"});
+  checkLocalVocab(bind, localVocab1);
 
   // TRANSITIVE PATH operation.
   //
@@ -285,18 +293,19 @@ TEST(LocalVocab, propagation) {
   TransitivePathSide left(std::nullopt, 0, Variable{"?x"});
   TransitivePathSide right(std::nullopt, 1, Variable{"?y"});
   TransitivePath transitivePath(testQec, qet(values1), left, right, 1, 1);
-  checkLocalVocab(transitivePath, std::vector<std::string>{"x", "y1", "y2"});
+  checkLocalVocab(transitivePath, localVocab1);
 
   // PATTERN TRICK operations.
   HasPredicateScan hasPredicateScan(testQec, qet(values1), 0, Variable{"?z"});
-  checkLocalVocab(hasPredicateScan, std::vector<std::string>{"x", "y1", "y2"});
+  checkLocalVocab(hasPredicateScan, localVocab1);
   Values valuesPatternTrick(
-      testQec, {{Variable{"?x"}, Variable{"?y"}},
-                {{TripleComponent{"x"}, TripleComponent{NO_PATTERN}},
-                 {TripleComponent{"x"}, TripleComponent{NO_PATTERN}}}});
+      testQec,
+      {{Variable{"?x"}, Variable{"?y"}},
+       {{TripleComponent{iri("<xN1>")}, TripleComponent{NO_PATTERN}},
+        {TripleComponent{iri("<xN1>")}, TripleComponent{NO_PATTERN}}}});
   CountAvailablePredicates countAvailablePredictes(
       testQec, qet(valuesPatternTrick), 0, Variable{"?y"}, Variable{"?count"});
-  checkLocalVocab(countAvailablePredictes, {"x"});
+  checkLocalVocab(countAvailablePredictes, {"<xN1>"});
   // TEXT operations.
   //
   // TODO<joka921> Maybe add tests for the new TextIndexScanFor... classes,

From 2bc4d5dcb165ae2558e071927bb8e56460e9d5cf Mon Sep 17 00:00:00 2001
From: Johannes Kalmbach <johannes.kalmbach@gmail.com>
Date: Wed, 10 Apr 2024 21:49:03 +0200
Subject: [PATCH 08/17] Small round of reviews.

---
 src/engine/AddCombinedRowToTable.h  | 26 +++++++----------
 src/engine/CartesianProductJoin.cpp |  2 +-
 src/engine/Join.cpp                 |  2 +-
 src/engine/LocalVocab.cpp           | 38 ++++++++++--------------
 src/engine/LocalVocab.h             | 28 ++++++++++++------
 src/engine/Minus.cpp                |  3 +-
 src/engine/MultiColumnJoin.cpp      |  3 +-
 src/engine/OptionalJoin.cpp         |  3 +-
 src/engine/ResultTable.cpp          |  6 ++--
 src/engine/ResultTable.h            | 45 +++++------------------------
 src/engine/TransitivePath.cpp       |  2 +-
 src/engine/Union.cpp                |  5 ++--
 src/global/IndexTypes.h             |  9 +++---
 src/global/ValueId.h                | 10 +++++--
 test/ValueIdTest.cpp                |  2 +-
 15 files changed, 79 insertions(+), 105 deletions(-)

diff --git a/src/engine/AddCombinedRowToTable.h b/src/engine/AddCombinedRowToTable.h
index 49e212fd3b..ae3e3e94bf 100644
--- a/src/engine/AddCombinedRowToTable.h
+++ b/src/engine/AddCombinedRowToTable.h
@@ -218,28 +218,23 @@ class AddCombinedRowToIdTable {
     AD_CORRECTNESS_CHECK(inputLeftAndRight_.has_value());
     result.resize(oldSize + nextIndex_);
 
-    // Sometimes columns are combined where one value is UNDEF and the other one
-    // is not. This function very efficiently returns the not-UNDEF value in
-    // this case.
-    // TODO<joka921> If we keep track of the information that one of the
-    // involved columns contains no UNDEF values at all, we can omit this step
-    // and simply copy the values from this column without looking at the other
-    // input.
-    auto mergeWithUndefined = [](const ValueId a, const ValueId b) {
-      // NOTE: For localVocabIndices we cannot use the simple bitwise OR.
+    // Precondition: `a` and `b` compare equal or at least one of them is UNDEF
+    // If exactly one of them is UNDEF, return the other one, else return any of
+    // them (they are equal anyway).
+    auto getJoinValue = [](const ValueId a, const ValueId b) {
+      // NOTE: For localVocabIndices we might have different pointers that
+      // compare equal because they point to the same word. Therefore we cannot
+      // use a simple bitwise operation to handle the "one of them is UNDEF"
+      // case as we previously did.
       if (a.isUndefined()) {
         return b;
       }
       return a;
-      /*
-      static_assert(ValueId::makeUndefined().getBits() == 0u);
-      return ValueId::fromBits(a.getBits() | b.getBits());
-       */
     };
 
     // A lambda that writes the join column with the given `colIdx` to the
     // `nextResultColIdx`-th column of the result.
-    auto writeJoinColumn = [&result, &mergeWithUndefined, oldSize, this](
+    auto writeJoinColumn = [&result, &getJoinValue, oldSize, this](
                                size_t colIdx, size_t resultColIdx) {
       const auto& colLeft = inputLeft().getColumn(colIdx);
       const auto& colRight = inputRight().getColumn(colIdx);
@@ -249,7 +244,8 @@ class AddCombinedRowToIdTable {
 
       // Write the matching rows.
       for (const auto& [targetIndex, sourceIndices] : indexBuffer_) {
-        auto resultId = mergeWithUndefined(colLeft[sourceIndices[0]],
+        auto resultId =
+            getJoinValue(colLeft[sourceIndices[0]],
                                            colRight[sourceIndices[1]]);
         numUndef += static_cast<size_t>(resultId.isUndefined());
         resultCol[oldSize + targetIndex] = resultId;
diff --git a/src/engine/CartesianProductJoin.cpp b/src/engine/CartesianProductJoin.cpp
index e9059269b0..c1c631f5b7 100644
--- a/src/engine/CartesianProductJoin.cpp
+++ b/src/engine/CartesianProductJoin.cpp
@@ -202,7 +202,7 @@ ResultTable CartesianProductJoin::computeResult() {
   auto subResultsDeref = std::views::transform(
       subResults, [](auto& x) -> decltype(auto) { return *x; });
   return {std::move(result), resultSortedOn(),
-          ResultTable::getSharedLocalVocabFromNonEmptyOf(subResultsDeref)};
+          ResultTable::getMergedLocalVocab(subResultsDeref)};
 }
 
 // ____________________________________________________________________________
diff --git a/src/engine/Join.cpp b/src/engine/Join.cpp
index df84b1696a..dc1063c611 100644
--- a/src/engine/Join.cpp
+++ b/src/engine/Join.cpp
@@ -187,7 +187,7 @@ ResultTable Join::computeResult() {
   // If only one of the two operands has a non-empty local vocabulary, share
   // with that one (otherwise, throws an exception).
   return {std::move(idTable), resultSortedOn(),
-          ResultTable::getSharedLocalVocabFromNonEmptyOf(*leftRes, *rightRes)};
+          ResultTable::getMergedLocalVocab(*leftRes, *rightRes)};
 }
 
 // _____________________________________________________________________________
diff --git a/src/engine/LocalVocab.cpp b/src/engine/LocalVocab.cpp
index aa09eef6f6..e7559a65cf 100644
--- a/src/engine/LocalVocab.cpp
+++ b/src/engine/LocalVocab.cpp
@@ -11,42 +11,36 @@
 // _____________________________________________________________________________
 LocalVocab LocalVocab::clone() const {
   LocalVocab localVocabClone;
-  localVocabClone.previousSets_ = previousSets_;
-  localVocabClone.previousSets_.push_back(wordsToIndexesMap_);
+  localVocabClone.otherWordSets_ = otherWordSets_;
+  localVocabClone.otherWordSets_.push_back(primaryWordSet_);
   // Return the clone.
   return localVocabClone;
 }
 
+// _____________________________________________________________________________
 LocalVocab LocalVocab::merge(const LocalVocab& a, const LocalVocab& b) {
   LocalVocab res;
-  auto inserter = std::back_inserter(res.previousSets_);
-  std::ranges::copy(a.previousSets_, inserter);
-  std::ranges::copy(b.previousSets_, inserter);
-  *inserter = a.wordsToIndexesMap_;
-  *inserter = b.wordsToIndexesMap_;
+  auto inserter = std::back_inserter(res.otherWordSets_);
+  std::ranges::copy(a.otherWordSets_, inserter);
+  std::ranges::copy(b.otherWordSets_, inserter);
+  *inserter = a.primaryWordSet_;
+  *inserter = b.primaryWordSet_;
   return res;
 }
 
 // _____________________________________________________________________________
 template <typename WordT>
 LocalVocabIndex LocalVocab::getIndexAndAddIfNotContainedImpl(WordT&& word) {
-  // The following code contains two subtle, but important optimizations:
+  // The following code contains a subtle, but important optimizations:
   //
   // 1. The variant of `insert` used covers the case that `word` was already
   // contained in the map as well as the case that it is newly inserted. This
   // avoids computing the hash for `word` twice in case we see it for the first
   // time (note that hashing a string is not cheap).
-  //
-  // 2. The fact that we have a member variable `nextFreeIndex_` avoids that we
-  // tentatively have to compute the next free ID every time this function is
-  // called (even when the ID is not actually needed because the word is already
-  // contained in the map).
-  //
   // TODO<joka921> Make this work with transparent hashing and use try_emplace.
-  auto [wordInMapAndIndex, isNewWord] =
-      wordsToIndexesMap().insert(AlignedStr{std::forward<WordT>(word)});
-  const auto& wordInMap = *wordInMapAndIndex;
-  return &wordInMap;
+  auto [wordIterator, isNewWord] =
+      primaryWordSet().insert(StringAligned16{std::forward<WordT>(word)});
+  return std::addressof(*wordIterator);
 }
 
 // _____________________________________________________________________________
@@ -65,8 +59,8 @@ std::optional<LocalVocabIndex> LocalVocab::getIndexOrNullopt(
     const std::string& word) const {
   // TODO<joka921> Maybe we can make this work with transparent hashing,
   // but if this is only a testing API this is probably not worth the hassle.
-  auto localVocabIndex = wordsToIndexesMap().find(AlignedStr{word});
-  if (localVocabIndex != wordsToIndexesMap().end()) {
+  auto localVocabIndex = primaryWordSet().find(StringAligned16{word});
+  if (localVocabIndex != primaryWordSet().end()) {
     return &(*localVocabIndex);
   } else {
     return std::nullopt;
@@ -81,8 +75,8 @@ const std::string& LocalVocab::getWord(LocalVocabIndex localVocabIndex) const {
 // _____________________________________________________________________________
 std::vector<std::string> LocalVocab::getAllWordsForTesting() const {
   std::vector<std::string> result;
-  std::ranges::copy(wordsToIndexesMap(), std::back_inserter(result));
-  for (const auto& previous : previousSets_) {
+  std::ranges::copy(primaryWordSet(), std::back_inserter(result));
+  for (const auto& previous : otherWordSets_) {
     std::ranges::copy(*previous, std::back_inserter(result));
   }
   return result;
diff --git a/src/engine/LocalVocab.h b/src/engine/LocalVocab.h
index 5d57222414..c2d7f6a8ce 100644
--- a/src/engine/LocalVocab.h
+++ b/src/engine/LocalVocab.h
@@ -24,13 +24,17 @@ class LocalVocab {
   // A map of the words in the local vocabulary to their local IDs. This is a
   // node hash map because we need the addresses of the words (which are of type
   // `std::string`) to remain stable over their lifetime in the hash map because
-  // we refer to them in `wordsToIdsMap_` below.
-  using Set = absl::node_hash_set<AlignedStr>;
-  std::vector<std::shared_ptr<const Set>> previousSets_;
-  std::shared_ptr<Set> wordsToIndexesMap_ = std::make_shared<Set>();
+  // we hand out pointers to them.
+  using Set = absl::node_hash_set<StringAligned16>;
+  std::shared_ptr<Set> primaryWordSet_ = std::make_shared<Set>();
 
-  auto& wordsToIndexesMap() { return *wordsToIndexesMap_; }
-  const auto& wordsToIndexesMap() const { return *wordsToIndexesMap_; }
+  // Local vocabularies from child operations that were merged into this
+  // vocabulary s.t. the pointers are kept alive. They have to be `const`
+  // because they are possibly shared concurrently (for example via the cache).
+  std::vector<std::shared_ptr<const Set>> otherWordSets_;
+
+  auto& primaryWordSet() { return *primaryWordSet_; }
+  const auto& primaryWordSet() const { return *primaryWordSet_; }
 
  public:
   // Create a new, empty local vocabulary.
@@ -40,7 +44,9 @@ class LocalVocab {
   LocalVocab(const LocalVocab&) = delete;
   LocalVocab& operator=(const LocalVocab&) = delete;
 
-  // Make a deep copy explicitly.
+  // Make a logical copy. The clone will have an empty primary set so it can
+  // safely be modified. The contents are copied as shared pointers to const, so
+  // the function runs in linear time in the number of word sets.
   LocalVocab clone() const;
 
   // Moving a local vocabulary is not problematic (though the typical use case
@@ -60,9 +66,10 @@ class LocalVocab {
       const std::string& word) const;
 
   // The number of words in the vocabulary.
+  // Note: This is not constant time, but linear in the number of word sets.
   size_t size() const {
-    auto result = wordsToIndexesMap().size();
-    for (const auto& previous : previousSets_) {
+    auto result = primaryWordSet().size();
+    for (const auto& previous : otherWordSets_) {
       result += previous->size();
     }
     return result;
@@ -74,8 +81,11 @@ class LocalVocab {
   // Return a const reference to the word.
   const std::string& getWord(LocalVocabIndex localVocabIndex) const;
 
+  // Create a local vocab that contains and keeps alive all the words from `a`
+  // as well as `b`.
   static LocalVocab merge(const LocalVocab& a, const LocalVocab& b);
 
+  // Return all the words from all the word sets as a vector.
   std::vector<std::string> getAllWordsForTesting() const;
 
  private:
diff --git a/src/engine/Minus.cpp b/src/engine/Minus.cpp
index 00ebd3be84..8b5c12c9ad 100644
--- a/src/engine/Minus.cpp
+++ b/src/engine/Minus.cpp
@@ -56,8 +56,7 @@ ResultTable Minus::computeResult() {
   // If only one of the two operands has a non-empty local vocabulary, share
   // with that one (otherwise, throws an exception).
   return {std::move(idTable), resultSortedOn(),
-          ResultTable::getSharedLocalVocabFromNonEmptyOf(*leftResult,
-                                                         *rightResult)};
+          ResultTable::getMergedLocalVocab(*leftResult, *rightResult)};
 }
 
 // _____________________________________________________________________________
diff --git a/src/engine/MultiColumnJoin.cpp b/src/engine/MultiColumnJoin.cpp
index 0de919c071..0d4de5c978 100644
--- a/src/engine/MultiColumnJoin.cpp
+++ b/src/engine/MultiColumnJoin.cpp
@@ -86,8 +86,7 @@ ResultTable MultiColumnJoin::computeResult() {
   // If only one of the two operands has a non-empty local vocabulary, share
   // with that one (otherwise, throws an exception).
   return {std::move(idTable), resultSortedOn(),
-          ResultTable::getSharedLocalVocabFromNonEmptyOf(*leftResult,
-                                                         *rightResult)};
+          ResultTable::getMergedLocalVocab(*leftResult, *rightResult)};
 }
 
 // _____________________________________________________________________________
diff --git a/src/engine/OptionalJoin.cpp b/src/engine/OptionalJoin.cpp
index 89917189b1..6ec87dc456 100644
--- a/src/engine/OptionalJoin.cpp
+++ b/src/engine/OptionalJoin.cpp
@@ -115,8 +115,7 @@ ResultTable OptionalJoin::computeResult() {
   // If only one of the two operands has a non-empty local vocabulary, share
   // with that one (otherwise, throws an exception).
   return {std::move(idTable), resultSortedOn(),
-          ResultTable::getSharedLocalVocabFromNonEmptyOf(*leftResult,
-                                                         *rightResult)};
+          ResultTable::getMergedLocalVocab(*leftResult, *rightResult)};
 }
 
 // _____________________________________________________________________________
diff --git a/src/engine/ResultTable.cpp b/src/engine/ResultTable.cpp
index 79fde3d57a..73edbaa506 100644
--- a/src/engine/ResultTable.cpp
+++ b/src/engine/ResultTable.cpp
@@ -23,10 +23,10 @@ string ResultTable::asDebugString() const {
 }
 
 // _____________________________________________________________________________
-auto ResultTable::getSharedLocalVocabFromNonEmptyOf(
-    const ResultTable& resultTable1, const ResultTable& resultTable2)
+auto ResultTable::getMergedLocalVocab(const ResultTable& resultTable1,
+                                      const ResultTable& resultTable2)
     -> SharedLocalVocabWrapper {
-  return getSharedLocalVocabFromNonEmptyOf(
+  return getMergedLocalVocab(
       std::array{std::cref(resultTable1), std::cref(resultTable2)});
 }
 
diff --git a/src/engine/ResultTable.h b/src/engine/ResultTable.h
index c7df4c4713..d072ef5c39 100644
--- a/src/engine/ResultTable.h
+++ b/src/engine/ResultTable.h
@@ -133,54 +133,25 @@ class ResultTable {
     return SharedLocalVocabWrapper{localVocab_};
   }
 
-  // Like `getSharedLocalVocabFrom`, but takes more than one result and assumes
-  // that exactly one of the local vocabularies is empty and gets the shared
-  // local vocab from the non-empty one (if all are empty, arbitrarily share
-  // with the first one).
-  //
-  // TODO: Eventually, we want to be able to merge two non-empty local
-  // vocabularies, but that requires more work since we have to rewrite IDs then
-  // (from the previous separate local vocabularies to the new merged one).
-  static SharedLocalVocabWrapper getSharedLocalVocabFromNonEmptyOf(
+  // Like `getSharedLocalVocabFrom`, but takes more than one result and merges
+  // all the corresponding local vocabs.
+  static SharedLocalVocabWrapper getMergedLocalVocab(
       const ResultTable& resultTable1, const ResultTable& resultTable2);
 
   // Overload for more than two `ResultTables`
   template <std::ranges::forward_range R>
   requires std::convertible_to<std::ranges::range_value_t<R>,
                                const ResultTable&>
-  static SharedLocalVocabWrapper getSharedLocalVocabFromNonEmptyOf(
-      R&& subResults) {
-    // TODO<joka921> Implement this much better.
+  static SharedLocalVocabWrapper getMergedLocalVocab(R&& subResults) {
+    // We could implement the k-way merge much smarter, but in practice we
+    // rarely have more than two inputs, so the performance should be fine (we
+    // are only copying a few shared pointers after all).
+    // TODO<joka921> implement `LocalVocab::merge` in an n-way fashion`.
     LocalVocab res;
     for (const ResultTable& table : subResults) {
       res = LocalVocab::merge(res, *table.localVocab_);
     }
     return SharedLocalVocabWrapper{std::move(res)};
-    /*
-    AD_CONTRACT_CHECK(!std::ranges::empty(subResults));
-    auto hasNonEmptyVocab = [](const ResultTable& tbl) {
-      return !tbl.localVocab_->empty();
-    };
-    auto numNonEmptyVocabs =
-        std::ranges::count_if(subResults, hasNonEmptyVocab);
-    if (numNonEmptyVocabs > 1) {
-      throw std::runtime_error(
-          "Merging of more than one non-empty local vocabularies is currently "
-          "not supported, please contact the developers");
-    }
-    // The static casts in the following are needed to make this code work for
-    // types that are implicitly convertible to `const ResultTable&`, in
-    // particular `std::reference_wrapper<const ResultTable>`.
-    if (numNonEmptyVocabs == 0) {
-      return SharedLocalVocabWrapper{
-          static_cast<const ResultTable&>(*subResults.begin()).localVocab_};
-    } else {
-      return SharedLocalVocabWrapper{
-          static_cast<const ResultTable&>(
-              *std::ranges::find_if(subResults, hasNonEmptyVocab))
-              .localVocab_};
-    }
-     */
   }
 
   // Get a (deep) copy of the local vocabulary from the given result. Use this
diff --git a/src/engine/TransitivePath.cpp b/src/engine/TransitivePath.cpp
index c086561faf..073085b89a 100644
--- a/src/engine/TransitivePath.cpp
+++ b/src/engine/TransitivePath.cpp
@@ -256,7 +256,7 @@ ResultTable TransitivePath::computeResult() {
                     sideRes->idTable());
 
     return {std::move(idTable), resultSortedOn(),
-            ResultTable::getSharedLocalVocabFromNonEmptyOf(*sideRes, *subRes)};
+            ResultTable::getMergedLocalVocab(*sideRes, *subRes)};
   };
 
   if (lhs_.isBoundVariable()) {
diff --git a/src/engine/Union.cpp b/src/engine/Union.cpp
index 7f3269fe37..c8615c5062 100644
--- a/src/engine/Union.cpp
+++ b/src/engine/Union.cpp
@@ -178,9 +178,8 @@ ResultTable Union::computeResult() {
   LOG(DEBUG) << "Union result computation done" << std::endl;
   // If only one of the two operands has a non-empty local vocabulary, share
   // with that one (otherwise, throws an exception).
-  return ResultTable{
-      std::move(idTable), resultSortedOn(),
-      ResultTable::getSharedLocalVocabFromNonEmptyOf(*subRes1, *subRes2)};
+  return ResultTable{std::move(idTable), resultSortedOn(),
+                     ResultTable::getMergedLocalVocab(*subRes1, *subRes2)};
 }
 
 void Union::computeUnion(
diff --git a/src/global/IndexTypes.h b/src/global/IndexTypes.h
index d2e46da5e1..2260bae25e 100644
--- a/src/global/IndexTypes.h
+++ b/src/global/IndexTypes.h
@@ -14,13 +14,14 @@
 // requests.
 using VocabIndex = ad_utility::TypedIndex<uint64_t, "VocabIndex">;
 
-// TODO<joka921> Where to put this best...
-struct alignas(16) AlignedStr : public std::string {
+// A `std::string` that is aligned to 16 bytes s.t. pointers always end with 4
+// bits that are zero and that are reused for payloads in the `ValueId` class.
+struct alignas(16) StringAligned16 : public std::string {
   using std::string::basic_string;
-  explicit AlignedStr(std::string s) : std::string{std::move(s)} {}
+  explicit StringAligned16(std::string s) : std::string{std::move(s)} {}
 };
 // using LocalVocabIndex = ad_utility::TypedIndex<uint64_t, "LocalVocabIndex">;
-using LocalVocabIndex = const AlignedStr*;
+using LocalVocabIndex = const StringAligned16*;
 using TextRecordIndex = ad_utility::TypedIndex<uint64_t, "TextRecordIndex">;
 using WordVocabIndex = ad_utility::TypedIndex<uint64_t, "WordVocabIndex">;
 using BlankNodeIndex = ad_utility::TypedIndex<uint64_t, "BlankNodeIndex">;
diff --git a/src/global/ValueId.h b/src/global/ValueId.h
index 0a031d2bce..33408479e7 100644
--- a/src/global/ValueId.h
+++ b/src/global/ValueId.h
@@ -123,6 +123,11 @@ class ValueId {
 
   /// Equality comparison is performed directly on the underlying
   /// representation.
+  // NOTE: (Also for the operator<=> below: These comparisons only work
+  // correctly if we only store entries in the local vocab that are NOT part of
+  // the vocabulary. This is currently not true for expression results from
+  // GROUP BY and BIND operations (for performance reaons). So a join with such
+  // results will currently lead to wrong results.
   constexpr bool operator==(const ValueId& other) const {
     if (getDatatype() == Datatype::LocalVocabIndex &&
         other.getDatatype() == Datatype::LocalVocabIndex) [[unlikely]] {
@@ -139,8 +144,6 @@ class ValueId {
   /// For doubles it is first the positive doubles in order, then the negative
   /// doubles in reversed order. This is a direct consequence of comparing the
   /// bit representation of these values as unsigned integers.
-  /// TODO<joka921> Is this ordering also consistent for corner cases of doubles
-  /// (inf, nan, denormalized numbers, negative zero)?
   constexpr auto operator<=>(const ValueId& other) const {
     if (getDatatype() == Datatype::LocalVocabIndex &&
         other.getDatatype() == Datatype::LocalVocabIndex) [[unlikely]] {
@@ -221,6 +224,9 @@ class ValueId {
     return makeFromIndex(index.get(), Datatype::TextRecordIndex);
   }
   static ValueId makeFromLocalVocabIndex(LocalVocabIndex index) {
+    // The last `numDatatypeBits` of a `LocalVocabIndex` are always zero, so we
+    // can reuse them for the datatype.
+    static_assert(alignof(decltype(*index)) >= (1u << numDatatypeBits));
     return makeFromIndex(reinterpret_cast<T>(index) >> numDatatypeBits,
                          Datatype::LocalVocabIndex);
   }
diff --git a/test/ValueIdTest.cpp b/test/ValueIdTest.cpp
index 169291e439..17bc6b2e63 100644
--- a/test/ValueIdTest.cpp
+++ b/test/ValueIdTest.cpp
@@ -296,7 +296,7 @@ TEST(ValueId, toDebugString) {
   test(ValueId::makeFromBool(false), "Bool:false");
   test(ValueId::makeFromBool(true), "Bool:true");
   test(makeVocabId(15), "VocabIndex:15");
-  AlignedStr str{"SomeValue"};
+  StringAligned16 str{"SomeValue"};
   test(ValueId::makeFromLocalVocabIndex(&str), "LocalVocabIndex:SomeValue");
   test(makeTextRecordId(37), "TextRecordIndex:37");
   test(makeWordVocabId(42), "WordVocabIndex:42");

From 556c293f2e3457f1f7241dcec922d63d862f022a Mon Sep 17 00:00:00 2001
From: Johannes Kalmbach <johannes.kalmbach@gmail.com>
Date: Thu, 11 Apr 2024 11:21:15 +0200
Subject: [PATCH 09/17] Fixed the remaining unit tests.

---
 src/engine/AddCombinedRowToTable.h |  3 +--
 src/engine/LocalVocab.cpp          | 27 +++++++++----------
 src/engine/LocalVocab.h            |  6 ++---
 src/engine/ResultTable.h           | 10 +++----
 src/global/IndexTypes.h            |  1 -
 src/global/ValueIdComparators.h    | 43 ++++++++++++++++++++++++++----
 test/LocalVocabTest.cpp            | 40 ++++++++++++++++++++++++---
 test/ValueIdComparatorsTest.cpp    | 12 ++++-----
 test/ValueIdTest.cpp               | 12 ++++-----
 test/ValueIdTestHelpers.h          | 14 +++++-----
 test/ValuesTest.cpp                |  5 ++--
 test/util/IdTestHelpers.h          | 12 ++++-----
 12 files changed, 121 insertions(+), 64 deletions(-)

diff --git a/src/engine/AddCombinedRowToTable.h b/src/engine/AddCombinedRowToTable.h
index ae3e3e94bf..75c2bcd848 100644
--- a/src/engine/AddCombinedRowToTable.h
+++ b/src/engine/AddCombinedRowToTable.h
@@ -245,8 +245,7 @@ class AddCombinedRowToIdTable {
       // Write the matching rows.
       for (const auto& [targetIndex, sourceIndices] : indexBuffer_) {
         auto resultId =
-            getJoinValue(colLeft[sourceIndices[0]],
-                                           colRight[sourceIndices[1]]);
+            getJoinValue(colLeft[sourceIndices[0]], colRight[sourceIndices[1]]);
         numUndef += static_cast<size_t>(resultId.isUndefined());
         resultCol[oldSize + targetIndex] = resultId;
       }
diff --git a/src/engine/LocalVocab.cpp b/src/engine/LocalVocab.cpp
index e7559a65cf..a4faa54817 100644
--- a/src/engine/LocalVocab.cpp
+++ b/src/engine/LocalVocab.cpp
@@ -18,29 +18,26 @@ LocalVocab LocalVocab::clone() const {
 }
 
 // _____________________________________________________________________________
-LocalVocab LocalVocab::merge(const LocalVocab& a, const LocalVocab& b) {
+LocalVocab LocalVocab::merge(std::span<const LocalVocab*> vocabs) {
   LocalVocab res;
   auto inserter = std::back_inserter(res.otherWordSets_);
-  std::ranges::copy(a.otherWordSets_, inserter);
-  std::ranges::copy(b.otherWordSets_, inserter);
-  *inserter = a.primaryWordSet_;
-  *inserter = b.primaryWordSet_;
+  for (const auto* vocab : vocabs) {
+    std::ranges::copy(vocab->otherWordSets_, inserter);
+    *inserter = vocab->primaryWordSet_;
+  }
   return res;
 }
 
 // _____________________________________________________________________________
 template <typename WordT>
 LocalVocabIndex LocalVocab::getIndexAndAddIfNotContainedImpl(WordT&& word) {
-  // The following code contains a subtle, but important optimizations:
-  //
-  // 1. The variant of `insert` used covers the case that `word` was already
-  // contained in the map as well as the case that it is newly inserted. This
-  // avoids computing the hash for `word` twice in case we see it for the first
-  // time (note that hashing a string is not cheap).
-  // TODO<joka921> Make this work with transparent hashing and use try_emplace.
+  // TODO<joka921> As soon as we store `IdOrString` in the local vocab, we
+  // should definitely use `insert` instead of `emplace` here for some
+  // transparency optimizations. We currently need `emplace` because of the
+  // explicit conversion from `string` to `AlignedString16`.
   auto [wordIterator, isNewWord] =
-      primaryWordSet().insert(StringAligned16{std::forward<WordT>(word)});
-  return std::addressof(*wordIterator);
+      primaryWordSet().emplace(std::forward<WordT>(word));
+  return std::to_address(wordIterator);
 }
 
 // _____________________________________________________________________________
@@ -61,7 +58,7 @@ std::optional<LocalVocabIndex> LocalVocab::getIndexOrNullopt(
   // but if this is only a testing API this is probably not worth the hassle.
   auto localVocabIndex = primaryWordSet().find(StringAligned16{word});
   if (localVocabIndex != primaryWordSet().end()) {
-    return &(*localVocabIndex);
+    return std::to_address(localVocabIndex);
   } else {
     return std::nullopt;
   }
diff --git a/src/engine/LocalVocab.h b/src/engine/LocalVocab.h
index c2d7f6a8ce..fdd730c521 100644
--- a/src/engine/LocalVocab.h
+++ b/src/engine/LocalVocab.h
@@ -81,9 +81,9 @@ class LocalVocab {
   // Return a const reference to the word.
   const std::string& getWord(LocalVocabIndex localVocabIndex) const;
 
-  // Create a local vocab that contains and keeps alive all the words from `a`
-  // as well as `b`.
-  static LocalVocab merge(const LocalVocab& a, const LocalVocab& b);
+  // Create a local vocab that contains and keeps alive all the words from each
+  // of the `vocabs`.
+  static LocalVocab merge(std::span<const LocalVocab*> vocabs);
 
   // Return all the words from all the word sets as a vector.
   std::vector<std::string> getAllWordsForTesting() const;
diff --git a/src/engine/ResultTable.h b/src/engine/ResultTable.h
index d072ef5c39..605f9c64a2 100644
--- a/src/engine/ResultTable.h
+++ b/src/engine/ResultTable.h
@@ -143,15 +143,11 @@ class ResultTable {
   requires std::convertible_to<std::ranges::range_value_t<R>,
                                const ResultTable&>
   static SharedLocalVocabWrapper getMergedLocalVocab(R&& subResults) {
-    // We could implement the k-way merge much smarter, but in practice we
-    // rarely have more than two inputs, so the performance should be fine (we
-    // are only copying a few shared pointers after all).
-    // TODO<joka921> implement `LocalVocab::merge` in an n-way fashion`.
-    LocalVocab res;
+    std::vector<const LocalVocab*> vocabs;
     for (const ResultTable& table : subResults) {
-      res = LocalVocab::merge(res, *table.localVocab_);
+      vocabs.push_back(std::to_address(table.localVocab_));
     }
-    return SharedLocalVocabWrapper{std::move(res)};
+    return SharedLocalVocabWrapper{LocalVocab::merge(vocabs)};
   }
 
   // Get a (deep) copy of the local vocabulary from the given result. Use this
diff --git a/src/global/IndexTypes.h b/src/global/IndexTypes.h
index 2260bae25e..adde06bbbc 100644
--- a/src/global/IndexTypes.h
+++ b/src/global/IndexTypes.h
@@ -20,7 +20,6 @@ struct alignas(16) StringAligned16 : public std::string {
   using std::string::basic_string;
   explicit StringAligned16(std::string s) : std::string{std::move(s)} {}
 };
-// using LocalVocabIndex = ad_utility::TypedIndex<uint64_t, "LocalVocabIndex">;
 using LocalVocabIndex = const StringAligned16*;
 using TextRecordIndex = ad_utility::TypedIndex<uint64_t, "TextRecordIndex">;
 using WordVocabIndex = ad_utility::TypedIndex<uint64_t, "WordVocabIndex">;
diff --git a/src/global/ValueIdComparators.h b/src/global/ValueIdComparators.h
index 02f9d2b5ce..48a5119aee 100644
--- a/src/global/ValueIdComparators.h
+++ b/src/global/ValueIdComparators.h
@@ -15,6 +15,30 @@ namespace valueIdComparators {
 // This enum encodes the different numeric comparators LessThan, LessEqual,
 // Equal, NotEqual, GreaterEqual, GreaterThan.
 enum struct Comparison { LT, LE, EQ, NE, GE, GT };
+inline std::ostream& operator<<(std::ostream& str, Comparison c) {
+  switch (c) {
+    using enum Comparison;
+    case LT:
+      str << "LT";
+      break;
+    case LE:
+      str << "LE";
+      break;
+    case EQ:
+      str << "EQ";
+      break;
+    case NE:
+      str << "NE";
+      break;
+    case GE:
+      str << "GE";
+      break;
+    case GT:
+      str << "GT";
+      break;
+  }
+  return str;
+}
 
 // This enum can be used to configure the behavior of the `compareIds` method
 // below in the case when two `Id`s have incompatible datatypes (e.g.
@@ -72,9 +96,7 @@ inline ValueId toValueId(ComparisonResult comparisonResult) {
 // negative doubles in reversed order. In detail it is [0.0 ... infinity, NaN,
 // -0.0, ... -infinity]. This is a direct consequence of comparing the bit
 // representation of these values as unsigned integers.
-inline bool compareByBits(ValueId a, ValueId b) {
-  return a.getBits() < b.getBits();
-}
+inline bool compareByBits(ValueId a, ValueId b) { return a < b; }
 
 namespace detail {
 
@@ -308,6 +330,7 @@ inline std::vector<std::pair<RandomIt, RandomIt>> getRangesForIndexTypes(
   RangeFilter<RandomIt> rangeFilter{comparison};
   auto [eqBegin, eqEnd] =
       std::equal_range(beginType, endType, valueId, &compareByBits);
+
   rangeFilter.addSmaller(beginType, eqBegin);
   rangeFilter.addEqual(eqBegin, eqEnd);
   rangeFilter.addGreater(eqEnd, endType);
@@ -462,8 +485,18 @@ ComparisonResult compareIdsImpl(ValueId a, ValueId b, auto comparator) {
     }
   }
 
-  auto visitor = [comparator](const auto& aValue,
-                              const auto& bValue) -> ComparisonResult {
+  auto visitor = [comparator]<typename A, typename B>(
+                     const A& aValue, const B& bValue) -> ComparisonResult {
+    if constexpr (std::is_same_v<A, LocalVocabIndex>) {
+      if constexpr (!std::is_same_v<B, LocalVocabIndex>) {
+        AD_FAIL();
+      } else {
+        // TODO<joka921> This is one of the places that has to be changed once
+        // we want to implement correct comparisons for the local vocab that use
+        // ICU collation.
+        return fromBool(std::invoke(comparator, *aValue, *bValue));
+      }
+    }
     if constexpr (requires() { std::invoke(comparator, aValue, bValue); }) {
       return fromBool(std::invoke(comparator, aValue, bValue));
     } else {
diff --git a/test/LocalVocabTest.cpp b/test/LocalVocabTest.cpp
index fc7746a09f..828d3b7746 100644
--- a/test/LocalVocabTest.cpp
+++ b/test/LocalVocabTest.cpp
@@ -103,8 +103,10 @@ TEST(LocalVocab, clone) {
   // Create a small local vocabulary.
   size_t localVocabSize = 100;
   LocalVocab localVocabOriginal;
-  for (const auto& word : getTestCollectionOfWords(localVocabSize)) {
-    localVocabOriginal.getIndexAndAddIfNotContained(word);
+  std::vector<LocalVocabIndex> indices;
+  auto inputWords = getTestCollectionOfWords(localVocabSize);
+  for (const auto& word : inputWords) {
+    indices.push_back(localVocabOriginal.getIndexAndAddIfNotContained(word));
   }
   ASSERT_EQ(localVocabOriginal.size(), localVocabSize);
 
@@ -116,7 +118,39 @@ TEST(LocalVocab, clone) {
   ASSERT_THAT(localVocabClone.getAllWordsForTesting(),
               ::testing::UnorderedElementsAreArray(
                   localVocabOriginal.getAllWordsForTesting()));
-  // TODO<joka921> Maybe also test the indices.
+
+  // Test that the indices are still valid after the original vocabulary was
+  // destroyed.
+  localVocabOriginal = LocalVocab{};
+
+  for (size_t i = 0; i < inputWords.size(); ++i) {
+    EXPECT_EQ(*indices[i], inputWords[i]);
+  }
+}
+// _____________________________________________________________________________
+TEST(LocalVocab, merge) {
+  // Create a small local vocabulary.
+  std::vector<LocalVocabIndex> indices;
+  LocalVocab vocA;
+  LocalVocab vocB;
+  indices.push_back(vocA.getIndexAndAddIfNotContained("oneA"));
+  indices.push_back(vocA.getIndexAndAddIfNotContained("twoA"));
+  indices.push_back(vocA.getIndexAndAddIfNotContained("oneB"));
+  indices.push_back(vocA.getIndexAndAddIfNotContained("twoB"));
+
+  // Clone it and test that the clone contains the same words, but under
+  // different addresses (that is, the word strings were deeply copied).
+  auto vocabs = std::vector{&std::as_const(vocA), &std::as_const(vocB)};
+  LocalVocab localVocabMerged = LocalVocab::merge(vocabs);
+  ASSERT_EQ(localVocabMerged.size(), 4u);
+  ASSERT_THAT(localVocabMerged.getAllWordsForTesting(),
+              ::testing::UnorderedElementsAre("oneA", "twoA", "oneB", "twoB"));
+  vocA = LocalVocab{};
+  vocB = LocalVocab{};
+  EXPECT_EQ(*indices[0], "oneA");
+  EXPECT_EQ(*indices[1], "twoA");
+  EXPECT_EQ(*indices[2], "oneB");
+  EXPECT_EQ(*indices[3], "twoB");
 }
 
 // _____________________________________________________________________________
diff --git a/test/ValueIdComparatorsTest.cpp b/test/ValueIdComparatorsTest.cpp
index 6ca68eb7a0..c9641b00b3 100644
--- a/test/ValueIdComparatorsTest.cpp
+++ b/test/ValueIdComparatorsTest.cpp
@@ -77,20 +77,23 @@ auto testGetRangesForId(auto begin, auto end, ValueId id,
     using enum ComparisonResult;
     for (auto [rangeBegin, rangeEnd] : ranges) {
       while (it != rangeBegin) {
-        ASSERT_FALSE(isMatching(*it, id)) << *it << ' ' << id;
+        ASSERT_FALSE(isMatching(*it, id)) << *it << ' ' << id << comparison;
         auto expected = isMatchingDatatype(*it) ? False : Undef;
         ASSERT_EQ(compareIds(*it, id, comparison), expected)
             << *it << ' ' << id;
         ++it;
       }
       while (it != rangeEnd) {
-        ASSERT_TRUE(isMatching(*it, id)) << *it << ' ' << id;
+        ASSERT_TRUE(isMatching(*it, id)) << *it << ' ' << id << comparison;
         ASSERT_EQ(compareIds(*it, id, comparison), True) << *it << ' ' << id;
         ++it;
       }
     }
     while (it != end) {
-      ASSERT_FALSE(isMatching(*it, id));
+      if (isMatching(*it, id)) {
+        LOG(INFO) << "bum" << std::endl;
+      }
+      ASSERT_FALSE(isMatching(*it, id)) << *it << ", " << id << comparison;
       auto expected = isMatchingDatatype(*it) ? False : Undef;
       ASSERT_EQ(compareIds(*it, id, comparison), expected) << *it << ' ' << id;
       ++it;
@@ -213,8 +216,6 @@ auto testGetRangesForEqualIds(auto begin, auto end, ValueId idBegin,
 // Test that `getRangesFromId` works correctly for `ValueId`s of the unsigned
 // index types (`VocabIndex`, `TextRecordIndex`, `LocalVocabIndex`,
 // `WordVocabIndex`).
-// TODO<joka921> Reinstate all the tests..
-/*
 TEST(ValueIdComparators, IndexTypes) {
   auto ids = makeRandomIds();
   std::sort(ids.begin(), ids.end(), compareByBits);
@@ -257,7 +258,6 @@ TEST(ValueIdComparators, IndexTypes) {
   testImpl.operator()<Datatype::LocalVocabIndex>(&getLocalVocabIndex);
   testImpl.operator()<Datatype::WordVocabIndex>(&getWordVocabIndex);
 }
- */
 
 // _______________________________________________________________________
 TEST(ValueIdComparators, undefinedWithItself) {
diff --git a/test/ValueIdTest.cpp b/test/ValueIdTest.cpp
index 17bc6b2e63..ca278bd5d3 100644
--- a/test/ValueIdTest.cpp
+++ b/test/ValueIdTest.cpp
@@ -121,7 +121,11 @@ TEST(ValueId, Indices) {
   testRandomIds(&makeTextRecordId, &getTextRecordIndex,
                 Datatype::TextRecordIndex);
   testRandomIds(&makeVocabId, &getVocabIndex, Datatype::VocabIndex);
-  testRandomIds(&makeLocalVocabId, &getLocalVocabIndex,
+
+  auto localVocabWordToInt = [](const auto& input) {
+    return std::atoll(getLocalVocabIndex(input).c_str());
+  };
+  testRandomIds(&makeLocalVocabId, localVocabWordToInt,
                 Datatype::LocalVocabIndex);
   testRandomIds(&makeWordVocabId, &getWordVocabIndex, Datatype::WordVocabIndex);
 }
@@ -131,7 +135,6 @@ TEST(ValueId, Undefined) {
   ASSERT_EQ(id.getDatatype(), Datatype::Undefined);
 }
 
-/*
 TEST(ValueId, OrderingDifferentDatatypes) {
   auto ids = makeRandomIds();
   std::sort(ids.begin(), ids.end());
@@ -142,9 +145,7 @@ TEST(ValueId, OrderingDifferentDatatypes) {
   ASSERT_TRUE(
       std::is_sorted(ids.begin(), ids.end(), compareByDatatypeAndIndexTypes));
 }
- */
 
-/*
 TEST(ValueId, IndexOrdering) {
   auto testOrder = [](auto makeIdFromIndex, auto getIndexFromId) {
     std::vector<ValueId> ids;
@@ -168,7 +169,6 @@ TEST(ValueId, IndexOrdering) {
   testOrder(&makeWordVocabId, &getWordVocabIndex);
   testOrder(&makeTextRecordId, &getTextRecordIndex);
 }
- */
 
 TEST(ValueId, DoubleOrdering) {
   auto ids = makeRandomDoubleIds();
@@ -263,7 +263,6 @@ TEST(ValueId, Serialization) {
   }
 }
 
-/*
 TEST(ValueId, Hashing) {
   auto ids = makeRandomIds();
   ad_utility::HashSet<ValueId> idsWithoutDuplicates;
@@ -282,7 +281,6 @@ TEST(ValueId, Hashing) {
 
   ASSERT_EQ(ids, idsWithoutDuplicatesAsVector);
 }
- */
 
 TEST(ValueId, toDebugString) {
   auto test = [](ValueId id, std::string_view expected) {
diff --git a/test/ValueIdTestHelpers.h b/test/ValueIdTestHelpers.h
index b4f6cddce3..19aba006a3 100644
--- a/test/ValueIdTestHelpers.h
+++ b/test/ValueIdTestHelpers.h
@@ -5,8 +5,9 @@
 #ifndef QLEVER_VALUEIDTESTHELPERS_H
 #define QLEVER_VALUEIDTESTHELPERS_H
 
-#include "../src/global/ValueId.h"
-#include "../src/util/Random.h"
+#include "./util/IdTestHelpers.h"
+#include "global/ValueId.h"
+#include "util/Random.h"
 
 // Enabling cheaper unit tests when building in Debug mode
 #ifdef QLEVER_RUN_EXPENSIVE_TESTS
@@ -45,8 +46,7 @@ inline ValueId makeVocabId(uint64_t value) {
   return ValueId::makeFromVocabIndex(VocabIndex::make(value));
 }
 inline ValueId makeLocalVocabId(uint64_t value) {
-  return ValueId::makeFromLocalVocabIndex(
-      std::bit_cast<LocalVocabIndex>(value << 4));
+  return ad_utility::testing::LocalVocabId(value);
 }
 inline ValueId makeTextRecordId(uint64_t value) {
   return ValueId::makeFromTextRecordIndex(TextRecordIndex::make(value));
@@ -60,8 +60,9 @@ inline ValueId makeBlankNodeId(uint64_t value) {
 
 inline uint64_t getVocabIndex(ValueId id) { return id.getVocabIndex().get(); }
 // TODO<joka921> Make the tests more precise for the localVocabIndices.
-inline uint64_t getLocalVocabIndex(ValueId id) {
-  return std::bit_cast<uint64_t>(id.getLocalVocabIndex()) >> 4;
+inline std::string getLocalVocabIndex(ValueId id) {
+  AD_CORRECTNESS_CHECK(id.getDatatype() == Datatype::LocalVocabIndex);
+  return *id.getLocalVocabIndex();
 }
 inline uint64_t getTextRecordIndex(ValueId id) {
   return id.getTextRecordIndex().get();
@@ -106,6 +107,7 @@ inline auto makeRandomDoubleIds = []() {
   ad_utility::randomShuffle(ids.begin(), ids.end());
   return ids;
 };
+
 inline auto makeRandomIds = []() {
   std::vector<ValueId> ids = makeRandomDoubleIds();
   addIdsFromGenerator(indexGenerator, &makeVocabId, ids);
diff --git a/test/ValuesTest.cpp b/test/ValuesTest.cpp
index 87fc72e1ab..001eab9116 100644
--- a/test/ValuesTest.cpp
+++ b/test/ValuesTest.cpp
@@ -74,11 +74,12 @@ TEST(Values, computeResult) {
   const auto& table = result->idTable();
   Id x = ad_utility::testing::makeGetId(testQec->getIndex())("<x>");
   auto I = ad_utility::testing::IntId;
-  auto L = ad_utility::testing::LocalVocabId;
   auto l = result->localVocab().getIndexOrNullopt("<y>");
   ASSERT_TRUE(l.has_value());
   auto U = Id::makeUndefined();
-  ASSERT_EQ(table, makeIdTableFromVector({{I(12), x}, {U, L(l.value())}}));
+  ASSERT_EQ(table,
+            makeIdTableFromVector(
+                {{I(12), x}, {U, Id::makeFromLocalVocabIndex(l.value())}}));
 }
 
 // Check that if the number of variables and the number of values in each row
diff --git a/test/util/IdTestHelpers.h b/test/util/IdTestHelpers.h
index 8fbb448131..1449ab0f88 100644
--- a/test/util/IdTestHelpers.h
+++ b/test/util/IdTestHelpers.h
@@ -4,6 +4,7 @@
 
 #pragma once
 
+#include "engine/LocalVocab.h"
 #include "global/Id.h"
 
 // Lambdas to simply create an `Id` with a given value and type during unit
@@ -18,13 +19,10 @@ inline auto VocabId = [](const auto& v) {
   return Id::makeFromVocabIndex(VocabIndex::make(v));
 };
 
-inline auto LocalVocabId = []<typename T>(const T& v) {
-  if constexpr (std::integral<T>) {
-    return Id::makeFromLocalVocabIndex(
-        std::bit_cast<LocalVocabIndex>(static_cast<uint64_t>(v)));
-  } else {
-    return Id::makeFromLocalVocabIndex(std::bit_cast<LocalVocabIndex>(v));
-  }
+inline auto LocalVocabId = [](std::integral auto v) {
+  static ad_utility::Synchronized<LocalVocab> localVocab;
+  return Id::makeFromLocalVocabIndex(
+      localVocab.wlock()->getIndexAndAddIfNotContained(std::to_string(v)));
 };
 
 inline auto TextRecordId = [](const auto& t) {

From 4bd0884d857aedf51cf373b1c5e4275aeb727c3b Mon Sep 17 00:00:00 2001
From: Johannes Kalmbach <johannes.kalmbach@gmail.com>
Date: Thu, 11 Apr 2024 11:58:06 +0200
Subject: [PATCH 10/17] Fix compilation and hopefully coverage.

---
 src/engine/LocalVocab.h         |  1 +
 src/global/ValueIdComparators.h | 24 ------------------------
 test/ValueIdComparatorsTest.cpp | 26 ++++++++++++++++++++++++++
 3 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/src/engine/LocalVocab.h b/src/engine/LocalVocab.h
index fdd730c521..83c3bccae5 100644
--- a/src/engine/LocalVocab.h
+++ b/src/engine/LocalVocab.h
@@ -6,6 +6,7 @@
 
 #include <cstdlib>
 #include <memory>
+#include <span>
 #include <string>
 #include <vector>
 
diff --git a/src/global/ValueIdComparators.h b/src/global/ValueIdComparators.h
index 48a5119aee..0c0aae44b3 100644
--- a/src/global/ValueIdComparators.h
+++ b/src/global/ValueIdComparators.h
@@ -15,30 +15,6 @@ namespace valueIdComparators {
 // This enum encodes the different numeric comparators LessThan, LessEqual,
 // Equal, NotEqual, GreaterEqual, GreaterThan.
 enum struct Comparison { LT, LE, EQ, NE, GE, GT };
-inline std::ostream& operator<<(std::ostream& str, Comparison c) {
-  switch (c) {
-    using enum Comparison;
-    case LT:
-      str << "LT";
-      break;
-    case LE:
-      str << "LE";
-      break;
-    case EQ:
-      str << "EQ";
-      break;
-    case NE:
-      str << "NE";
-      break;
-    case GE:
-      str << "GE";
-      break;
-    case GT:
-      str << "GT";
-      break;
-  }
-  return str;
-}
 
 // This enum can be used to configure the behavior of the `compareIds` method
 // below in the case when two `Id`s have incompatible datatypes (e.g.
diff --git a/test/ValueIdComparatorsTest.cpp b/test/ValueIdComparatorsTest.cpp
index c9641b00b3..0f13e8879b 100644
--- a/test/ValueIdComparatorsTest.cpp
+++ b/test/ValueIdComparatorsTest.cpp
@@ -12,6 +12,32 @@
 #include "util/Random.h"
 
 using namespace valueIdComparators;
+namespace valueIdComparators {
+inline std::ostream& operator<<(std::ostream& str, Comparison c) {
+  switch (c) {
+    using enum Comparison;
+    case LT:
+      str << "LT";
+      break;
+    case LE:
+      str << "LE";
+      break;
+    case EQ:
+      str << "EQ";
+      break;
+    case NE:
+      str << "NE";
+      break;
+    case GE:
+      str << "GE";
+      break;
+    case GT:
+      str << "GT";
+      break;
+  }
+  return str;
+}
+}  // namespace valueIdComparators
 using ad_utility::source_location;
 
 TEST(ValueIdComparators, GetRangeForDatatype) {

From 71d7a8be4390605cb09b290c11c34ceffcb24b6f Mon Sep 17 00:00:00 2001
From: Johannes Kalmbach <johannes.kalmbach@gmail.com>
Date: Thu, 11 Apr 2024 14:16:29 +0200
Subject: [PATCH 11/17] Update the native macos build to clang 18 to fix
 std::to_address.

---
 .github/workflows/macos.yml                   | 20 +++++++++----------
 .../{clang-16-macos => clang-18-macos}        |  2 +-
 2 files changed, 11 insertions(+), 11 deletions(-)
 rename conanprofiles/{clang-16-macos => clang-18-macos} (84%)

diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml
index d0efe2c515..21b30f31e3 100644
--- a/.github/workflows/macos.yml
+++ b/.github/workflows/macos.yml
@@ -39,14 +39,14 @@ jobs:
 
       - name: Install dependencies
         run:  |
-          brew install llvm@16
+          brew install llvm@18
           brew install conan@2
-          echo 'export PATH="/usr/local/opt/llvm@16/bin:$PATH"' >> ~/.bash_profile
-          echo PATH="/usr/local/opt/llvm@16/bin:$PATH" >> $GITHUB_ENV 
-          echo 'export LDFLAGS="-L/usr/local/opt/llvm@16/lib -L/usr/local/opt/llvm@16/lib/c++ -Wl,-rpath,/usr/local/opt/llvm@16/lib/c++"' >> ~/.bash_profile
-          echo LDFLAGS="-L/usr/local/opt/llvm@16/lib -L/usr/local/opt/llvm@16/lib/c++ -Wl,-rpath,/usr/local/opt/llvm@16/lib/c++" >> $GITHUB_ENV 
-          echo 'export CPPFLAGS="-I/usr/local/opt/llvm@16/include"' >> ~/.bash_profile
-          echo  CPPFLAGS="/usr/local/opt/llvm@16/include" >> $GITHUB_ENV 
+          echo 'export PATH="/usr/local/opt/llvm@18/bin:$PATH"' >> ~/.bash_profile
+          echo PATH="/usr/local/opt/llvm@18/bin:$PATH" >> $GITHUB_ENV 
+          echo 'export LDFLAGS="-L/usr/local/opt/llvm@18/lib -L/usr/local/opt/llvm@18/lib/c++ -Wl,-rpath,/usr/local/opt/llvm@18/lib/c++"' >> ~/.bash_profile
+          echo LDFLAGS="-L/usr/local/opt/llvm@18/lib -L/usr/local/opt/llvm@18/lib/c++ -Wl,-rpath,/usr/local/opt/llvm@18/lib/c++" >> $GITHUB_ENV 
+          echo 'export CPPFLAGS="-I/usr/local/opt/llvm@18/include"' >> ~/.bash_profile
+          echo  CPPFLAGS="/usr/local/opt/llvm@18/include" >> $GITHUB_ENV 
           source ~/.bash_profile
       - name: Print clang version
         run: clang++ --version
@@ -57,15 +57,15 @@ jobs:
           cache-name: cache-conan-modules
         with:
           path: ~/.conan2
-          key: ${{ runner.os }}-build-${{ env.cache-name }}-${{ hashFiles('conanfile.txt') }}
+          key: ${{ runner.os }}-build-${{ env.cache-name }}-${{ hashFiles('conanfile.txt') }}-${{ hashFiles('conanprofiles/clang-18-macos')}}
       - name: Create build directory
         run: mkdir ${{github.workspace}}/build
       - name: Install and run conan
         working-directory: ${{github.workspace}}/build
         run: >
-          conan install .. -pr:b=../conanprofiles/clang-16-macos -pr:h=../conanprofiles/clang-16-macos -of=. --build=missing;
+          conan install .. -pr:b=../conanprofiles/clang-18-macos -pr:h=../conanprofiles/clang-18-macos -of=. --build=missing;
       - name: Configure CMake
-        # For std::ranges::join_view we need the -fexperimental-library flag on libc++16, which on Mac requires to manually tinker with the linking flags.
+        # For std::ranges::join_view we need the -fexperimental-library flag on libc++18, which on Mac requires to manually tinker with the linking flags.
         run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.build-type}} -DCMAKE_TOOLCHAIN_FILE="$(pwd)/build/conan_toolchain.cmake" -DUSE_PARALLEL=true -DRUN_EXPENSIVE_TESTS=false -DENABLE_EXPENSIVE_CHECKS=true -DCMAKE_CXX_COMPILER=clang++ -DADDITIONAL_COMPILER_FLAGS="-fexperimental-library" -DADDITIONAL_LINKER_FLAGS="-L$(brew --prefix llvm)/lib/c++"
 
       - name: Build
diff --git a/conanprofiles/clang-16-macos b/conanprofiles/clang-18-macos
similarity index 84%
rename from conanprofiles/clang-16-macos
rename to conanprofiles/clang-18-macos
index d1c5ccd365..b776d4594a 100644
--- a/conanprofiles/clang-16-macos
+++ b/conanprofiles/clang-18-macos
@@ -4,5 +4,5 @@ build_type=Release
 compiler=clang
 compiler.cppstd=gnu17
 compiler.libcxx=libc++
-compiler.version=16
+compiler.version=18
 os=Macos

From 64f19dff49472c4afdd0bfb323ee52b2568b0388 Mon Sep 17 00:00:00 2001
From: Johannes Kalmbach <johannes.kalmbach@gmail.com>
Date: Thu, 11 Apr 2024 14:25:05 +0200
Subject: [PATCH 12/17] Remove `std::to_address` because of MacOS.

---
 .github/workflows/macos.yml | 20 ++++++++++----------
 src/engine/LocalVocab.cpp   |  8 +++++---
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml
index 21b30f31e3..d0efe2c515 100644
--- a/.github/workflows/macos.yml
+++ b/.github/workflows/macos.yml
@@ -39,14 +39,14 @@ jobs:
 
       - name: Install dependencies
         run:  |
-          brew install llvm@18
+          brew install llvm@16
           brew install conan@2
-          echo 'export PATH="/usr/local/opt/llvm@18/bin:$PATH"' >> ~/.bash_profile
-          echo PATH="/usr/local/opt/llvm@18/bin:$PATH" >> $GITHUB_ENV 
-          echo 'export LDFLAGS="-L/usr/local/opt/llvm@18/lib -L/usr/local/opt/llvm@18/lib/c++ -Wl,-rpath,/usr/local/opt/llvm@18/lib/c++"' >> ~/.bash_profile
-          echo LDFLAGS="-L/usr/local/opt/llvm@18/lib -L/usr/local/opt/llvm@18/lib/c++ -Wl,-rpath,/usr/local/opt/llvm@18/lib/c++" >> $GITHUB_ENV 
-          echo 'export CPPFLAGS="-I/usr/local/opt/llvm@18/include"' >> ~/.bash_profile
-          echo  CPPFLAGS="/usr/local/opt/llvm@18/include" >> $GITHUB_ENV 
+          echo 'export PATH="/usr/local/opt/llvm@16/bin:$PATH"' >> ~/.bash_profile
+          echo PATH="/usr/local/opt/llvm@16/bin:$PATH" >> $GITHUB_ENV 
+          echo 'export LDFLAGS="-L/usr/local/opt/llvm@16/lib -L/usr/local/opt/llvm@16/lib/c++ -Wl,-rpath,/usr/local/opt/llvm@16/lib/c++"' >> ~/.bash_profile
+          echo LDFLAGS="-L/usr/local/opt/llvm@16/lib -L/usr/local/opt/llvm@16/lib/c++ -Wl,-rpath,/usr/local/opt/llvm@16/lib/c++" >> $GITHUB_ENV 
+          echo 'export CPPFLAGS="-I/usr/local/opt/llvm@16/include"' >> ~/.bash_profile
+          echo  CPPFLAGS="/usr/local/opt/llvm@16/include" >> $GITHUB_ENV 
           source ~/.bash_profile
       - name: Print clang version
         run: clang++ --version
@@ -57,15 +57,15 @@ jobs:
           cache-name: cache-conan-modules
         with:
           path: ~/.conan2
-          key: ${{ runner.os }}-build-${{ env.cache-name }}-${{ hashFiles('conanfile.txt') }}-${{ hashFiles('conanprofiles/clang-18-macos')}}
+          key: ${{ runner.os }}-build-${{ env.cache-name }}-${{ hashFiles('conanfile.txt') }}
       - name: Create build directory
         run: mkdir ${{github.workspace}}/build
       - name: Install and run conan
         working-directory: ${{github.workspace}}/build
         run: >
-          conan install .. -pr:b=../conanprofiles/clang-18-macos -pr:h=../conanprofiles/clang-18-macos -of=. --build=missing;
+          conan install .. -pr:b=../conanprofiles/clang-16-macos -pr:h=../conanprofiles/clang-16-macos -of=. --build=missing;
       - name: Configure CMake
-        # For std::ranges::join_view we need the -fexperimental-library flag on libc++18, which on Mac requires to manually tinker with the linking flags.
+        # For std::ranges::join_view we need the -fexperimental-library flag on libc++16, which on Mac requires to manually tinker with the linking flags.
         run: cmake -B ${{github.workspace}}/build -DCMAKE_BUILD_TYPE=${{env.build-type}} -DCMAKE_TOOLCHAIN_FILE="$(pwd)/build/conan_toolchain.cmake" -DUSE_PARALLEL=true -DRUN_EXPENSIVE_TESTS=false -DENABLE_EXPENSIVE_CHECKS=true -DCMAKE_CXX_COMPILER=clang++ -DADDITIONAL_COMPILER_FLAGS="-fexperimental-library" -DADDITIONAL_LINKER_FLAGS="-L$(brew --prefix llvm)/lib/c++"
 
       - name: Build
diff --git a/src/engine/LocalVocab.cpp b/src/engine/LocalVocab.cpp
index a4faa54817..3c4a956e41 100644
--- a/src/engine/LocalVocab.cpp
+++ b/src/engine/LocalVocab.cpp
@@ -37,6 +37,8 @@ LocalVocabIndex LocalVocab::getIndexAndAddIfNotContainedImpl(WordT&& word) {
   // explicit conversion from `string` to `AlignedString16`.
   auto [wordIterator, isNewWord] =
       primaryWordSet().emplace(std::forward<WordT>(word));
+  // TODO<Libc++18> Use std::to_address (more idiomatic, but currently breaks
+  // the MacOS build.
   return std::to_address(wordIterator);
 }
 
@@ -54,11 +56,11 @@ LocalVocabIndex LocalVocab::getIndexAndAddIfNotContained(std::string&& word) {
 // _____________________________________________________________________________
 std::optional<LocalVocabIndex> LocalVocab::getIndexOrNullopt(
     const std::string& word) const {
-  // TODO<joka921> Maybe we can make this work with transparent hashing,
-  // but if this is only a testing API this is probably not worth the hassle.
   auto localVocabIndex = primaryWordSet().find(StringAligned16{word});
   if (localVocabIndex != primaryWordSet().end()) {
-    return std::to_address(localVocabIndex);
+    // TODO<Libc++18> Use std::to_address (more idiomatic, but currently breaks
+    // the MacOS build.
+    return &(*localVocabIndex);
   } else {
     return std::nullopt;
   }

From 700b518983e6eed73ffa458f9657812b03a97233 Mon Sep 17 00:00:00 2001
From: Johannes Kalmbach <johannes.kalmbach@gmail.com>
Date: Thu, 11 Apr 2024 14:50:28 +0200
Subject: [PATCH 13/17] Fix mac for good now...

---
 conanprofiles/{clang-18-macos => clang-16-macos} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename conanprofiles/{clang-18-macos => clang-16-macos} (100%)

diff --git a/conanprofiles/clang-18-macos b/conanprofiles/clang-16-macos
similarity index 100%
rename from conanprofiles/clang-18-macos
rename to conanprofiles/clang-16-macos

From 5724a263ab09c226c815fc7881e8402167aa7702 Mon Sep 17 00:00:00 2001
From: Johannes Kalmbach <joka921@users.noreply.github.com>
Date: Thu, 11 Apr 2024 14:58:53 +0200
Subject: [PATCH 14/17] Update clang-16-macos

---
 conanprofiles/clang-16-macos | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conanprofiles/clang-16-macos b/conanprofiles/clang-16-macos
index b776d4594a..d1c5ccd365 100644
--- a/conanprofiles/clang-16-macos
+++ b/conanprofiles/clang-16-macos
@@ -4,5 +4,5 @@ build_type=Release
 compiler=clang
 compiler.cppstd=gnu17
 compiler.libcxx=libc++
-compiler.version=18
+compiler.version=16
 os=Macos

From b3c6fe2cd4ee8a3e8deb7f87e63c15fd38d86873 Mon Sep 17 00:00:00 2001
From: Johannes Kalmbach <johannes.kalmbach@gmail.com>
Date: Thu, 11 Apr 2024 15:37:37 +0200
Subject: [PATCH 15/17] Finalinally fix the MacOS build.

---
 src/engine/LocalVocab.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/engine/LocalVocab.cpp b/src/engine/LocalVocab.cpp
index 3c4a956e41..52b4267ca9 100644
--- a/src/engine/LocalVocab.cpp
+++ b/src/engine/LocalVocab.cpp
@@ -39,7 +39,7 @@ LocalVocabIndex LocalVocab::getIndexAndAddIfNotContainedImpl(WordT&& word) {
       primaryWordSet().emplace(std::forward<WordT>(word));
   // TODO<Libc++18> Use std::to_address (more idiomatic, but currently breaks
   // the MacOS build.
-  return std::to_address(wordIterator);
+  return &(*wordIterator);
 }
 
 // _____________________________________________________________________________

From a441f5e4ede4b4a13a01b50d6519658821ef7e88 Mon Sep 17 00:00:00 2001
From: Johannes Kalmbach <johannes.kalmbach@gmail.com>
Date: Thu, 11 Apr 2024 19:56:08 +0200
Subject: [PATCH 16/17] A round of reviews.

---
 src/engine/LocalVocab.h         |  2 +-
 src/global/ValueIdComparators.h | 20 +++++++++-----------
 test/LocalVocabTest.cpp         |  6 ++----
 test/ValueIdComparatorsTest.cpp |  3 ---
 4 files changed, 12 insertions(+), 19 deletions(-)

diff --git a/src/engine/LocalVocab.h b/src/engine/LocalVocab.h
index 83c3bccae5..3ddd58567f 100644
--- a/src/engine/LocalVocab.h
+++ b/src/engine/LocalVocab.h
@@ -83,7 +83,7 @@ class LocalVocab {
   const std::string& getWord(LocalVocabIndex localVocabIndex) const;
 
   // Create a local vocab that contains and keeps alive all the words from each
-  // of the `vocabs`.
+  // of the `vocabs`. The primary word set of the newly create vocab is empty.
   static LocalVocab merge(std::span<const LocalVocab*> vocabs);
 
   // Return all the words from all the word sets as a vector.
diff --git a/src/global/ValueIdComparators.h b/src/global/ValueIdComparators.h
index 0c0aae44b3..84440c4707 100644
--- a/src/global/ValueIdComparators.h
+++ b/src/global/ValueIdComparators.h
@@ -463,17 +463,15 @@ ComparisonResult compareIdsImpl(ValueId a, ValueId b, auto comparator) {
 
   auto visitor = [comparator]<typename A, typename B>(
                      const A& aValue, const B& bValue) -> ComparisonResult {
-    if constexpr (std::is_same_v<A, LocalVocabIndex>) {
-      if constexpr (!std::is_same_v<B, LocalVocabIndex>) {
-        AD_FAIL();
-      } else {
-        // TODO<joka921> This is one of the places that has to be changed once
-        // we want to implement correct comparisons for the local vocab that use
-        // ICU collation.
-        return fromBool(std::invoke(comparator, *aValue, *bValue));
-      }
-    }
-    if constexpr (requires() { std::invoke(comparator, aValue, bValue); }) {
+    if constexpr (std::is_same_v<A, LocalVocabIndex> &&
+                  std::is_same_v<B, LocalVocabIndex>) {
+      // TODO<joka921> This is one of the places that has to be changed once
+      // we want to implement correct comparisons for the local vocab that use
+      // ICU collation.
+      return fromBool(std::invoke(comparator, *aValue, *bValue));
+    } else if constexpr (requires() {
+                           std::invoke(comparator, aValue, bValue);
+                         }) {
       return fromBool(std::invoke(comparator, aValue, bValue));
     } else {
       AD_FAIL();
diff --git a/test/LocalVocabTest.cpp b/test/LocalVocabTest.cpp
index 828d3b7746..1f76ddb2bc 100644
--- a/test/LocalVocabTest.cpp
+++ b/test/LocalVocabTest.cpp
@@ -110,8 +110,7 @@ TEST(LocalVocab, clone) {
   }
   ASSERT_EQ(localVocabOriginal.size(), localVocabSize);
 
-  // Clone it and test that the clone contains the same words, but under
-  // different addresses (that is, the word strings were deeply copied).
+  // Clone it and test that the clone contains the same words.
   LocalVocab localVocabClone = localVocabOriginal.clone();
   ASSERT_EQ(localVocabOriginal.size(), localVocabSize);
   ASSERT_EQ(localVocabClone.size(), localVocabSize);
@@ -138,8 +137,7 @@ TEST(LocalVocab, merge) {
   indices.push_back(vocA.getIndexAndAddIfNotContained("oneB"));
   indices.push_back(vocA.getIndexAndAddIfNotContained("twoB"));
 
-  // Clone it and test that the clone contains the same words, but under
-  // different addresses (that is, the word strings were deeply copied).
+  // Clone it and test that the clone contains the same words.
   auto vocabs = std::vector{&std::as_const(vocA), &std::as_const(vocB)};
   LocalVocab localVocabMerged = LocalVocab::merge(vocabs);
   ASSERT_EQ(localVocabMerged.size(), 4u);
diff --git a/test/ValueIdComparatorsTest.cpp b/test/ValueIdComparatorsTest.cpp
index 0f13e8879b..8319bfa427 100644
--- a/test/ValueIdComparatorsTest.cpp
+++ b/test/ValueIdComparatorsTest.cpp
@@ -116,9 +116,6 @@ auto testGetRangesForId(auto begin, auto end, ValueId id,
       }
     }
     while (it != end) {
-      if (isMatching(*it, id)) {
-        LOG(INFO) << "bum" << std::endl;
-      }
       ASSERT_FALSE(isMatching(*it, id)) << *it << ", " << id << comparison;
       auto expected = isMatchingDatatype(*it) ? False : Undef;
       ASSERT_EQ(compareIds(*it, id, comparison), expected) << *it << ' ' << id;

From 814e172800efcb9895c0dcebd5b0191dae0d3f3a Mon Sep 17 00:00:00 2001
From: Johannes Kalmbach <johannes.kalmbach@gmail.com>
Date: Thu, 11 Apr 2024 20:00:09 +0200
Subject: [PATCH 17/17] The missing "d"

---
 src/engine/LocalVocab.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/engine/LocalVocab.h b/src/engine/LocalVocab.h
index 3ddd58567f..16ed3827a1 100644
--- a/src/engine/LocalVocab.h
+++ b/src/engine/LocalVocab.h
@@ -83,7 +83,7 @@ class LocalVocab {
   const std::string& getWord(LocalVocabIndex localVocabIndex) const;
 
   // Create a local vocab that contains and keeps alive all the words from each
-  // of the `vocabs`. The primary word set of the newly create vocab is empty.
+  // of the `vocabs`. The primary word set of the newly created vocab is empty.
   static LocalVocab merge(std::span<const LocalVocab*> vocabs);
 
   // Return all the words from all the word sets as a vector.