Add XTR Support (#27)
This PR adds preliminary support for running XTR models in LintDB.

- The tokenizer runs SentencePiece and appends an EOS token for XTR.
- Inverted lists now index codes per token for XTR, so XTR makes one fewer database call when scoring.
- The ProductEncoder handles quantization, with the help of new inverted-list scanners and distance tables.

This touches a lot of code in order to find better abstractions.
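The codes-per-token change can be illustrated with a small sketch (the class and field names below are hypothetical, not LintDB's actual types): each posting stores the token's quantization code next to the document id, so a single inverted-list scan returns everything needed to score, instead of a second lookup to fetch codes.

```python
from collections import defaultdict

# Hypothetical sketch of a codes-per-token inverted list. Each posting
# carries the token's quantization code alongside the doc id, so scoring
# needs one database call rather than two.
class InvertedList:
    def __init__(self):
        # centroid id -> list of (doc_id, token_id, code) postings
        self.postings = defaultdict(list)

    def add(self, centroid_id, doc_id, token_id, code):
        self.postings[centroid_id].append((doc_id, token_id, code))

    def scan(self, centroid_id):
        # One scan returns both membership and the codes needed to score.
        return self.postings[centroid_id]

ivl = InvertedList()
ivl.add(centroid_id=7, doc_id=0, token_id=3, code=b"\x12\x34")
ivl.add(centroid_id=7, doc_id=1, token_id=0, code=b"\x56\x78")
postings = ivl.scan(7)
```

The trade-off is larger posting entries in exchange for fewer round trips at query time.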
mtbarta committed Jun 11, 2024
1 parent 6bc71d3 commit f9b3364
Showing 92 changed files with 3,283 additions and 1,133 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -3,6 +3,7 @@
 debug/
 target/
 assets/
+cmake-build-debug/
 
 .DS_Store
 # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
8 changes: 8 additions & 0 deletions .idea/.gitignore
1 change: 1 addition & 0 deletions .idea/.name
2 changes: 2 additions & 0 deletions .idea/LintDB.iml
7 changes: 7 additions & 0 deletions .idea/codeStyles/Project.xml
5 changes: 5 additions & 0 deletions .idea/codeStyles/codeStyleConfig.xml
6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/Project_Default.xml
7 changes: 7 additions & 0 deletions .idea/misc.xml
8 changes: 8 additions & 0 deletions .idea/modules.xml
10 changes: 10 additions & 0 deletions .idea/vcs.xml

15 changes: 9 additions & 6 deletions Makefile
@@ -21,7 +21,7 @@ build-python-mac:
 	cd builds/python/lintdb/python && python setup.py build
 
 test:
-	cd builds/debug && cmake -E env GLOG_logtostderr=1 MKL_THREADING_LAYER=GNU ctest --output-on-failure
+	cd builds/debug && cmake -E env GLOG_v=5 GLOG_logtostderr=1 MKL_THREADING_LAYER=GNU ctest --output-on-failure
 
 test-python: build-python
 	# had to fix up conda to make this work--
@@ -44,10 +44,10 @@ format:
 
 valgrind:
 	# we need valgrind-3.20 to process dwarf5
-	valgrind -s --trace-children=yes --track-origins=yes --keep-stacktraces=alloc-and-free --suppressions=debug/valgrind-python.supp env PYTHONPATH="_build_python_/lintdb/python/build/lib/lintdb" python benchmarks/bench_lintdb.py --index-path=experiments/py_index_bench_colbert-lifestyle-2024-04-03
+	valgrind -s --trace-children=yes --track-origins=yes --keep-stacktraces=alloc-and-free --suppressions=debug/valgrind-python.supp env PYTHONPATH="_build_python_/lintdb/python/build/lib/lintdb" python benchmarks/bench_lintdb.py --index-path=experiments/py_index_bench_test-collection-xtr
 
-callgrind: build-conda
-	OMP_MAX_ACTIVE_LEVELS=2 OMP_THREAD_LIMIT=6 OMP_NUM_THREADS=6 PYTHONPATH="_build_python_/lintdb/python/build/lib/lintdb" valgrind --tool=callgrind --suppressions=debug/valgrind-python.supp --instr-atstart=yes --dump-instr=yes --collect-jumps=yes python ./benchmarks/bench_lintdb.py
+callgrind:
+	OMP_MAX_ACTIVE_LEVELS=2 OMP_THREAD_LIMIT=6 OMP_NUM_THREADS=6 PYTHONPATH="_build_python_/lintdb/python/build/lib/lintdb" valgrind --tool=callgrind --suppressions=debug/valgrind-python.supp --instr-atstart=yes --dump-instr=yes --collect-jumps=yes python ./benchmarks/bench_lintdb.py single-search
 
 callgrind-colbert: build-conda
 	PYTHONPATH="_build_python_/lintdb/python/build/lib/lintdb" valgrind --tool=callgrind --suppressions=debug/valgrind-python.supp --instr-atstart=no --dump-instr=yes --collect-jumps=yes python ./benchmarks/run_colbert.py
@@ -73,6 +73,9 @@ build-conda:
 	-DBUILD_TESTING=OFF \
 	-DCMAKE_BUILD_TYPE=Release \
 	-DBLA_VENDOR=Intel10_64lp \
+	-DOpenMP_CXX_FLAGS=-fopenmp=libiomp5 \
+	-DOpenMP_CXX_LIB_NAMES=libiomp5 \
+	-DOpenMP_libiomp5_LIBRARY=${ROOT_DIR}/_build_python_/vcpkg_installed/x64-linux/lib/intel64/libiomp5.so \
 	.
 
 	cmake --build _build_python_${PY_VER} --target pylintdb -j12
@@ -92,7 +95,7 @@ build-benchmarks:
 	.
 	CC=gcc CXX=g++ CMAKE_C_COMPILER=gcc CMAKE_CXX_COMPILER=g++ cmake --build build_benchmarks --target=bench_lintdb -j12
 
-run-perf: build-conda
+run-perf:
 	# make sure your system allows perf to run. ex: sudo sysctl -w kernel.perf_event_paranoid=1
-	OMP_MAX_ACTIVE_LEVELS=2 OMP_THREAD_LIMIT=12 OMP_NUM_THREADS=6 PYTHONPATH="_build_python_/lintdb/python/build/lib/lintdb" perf record -g -- /home/matt/miniconda3/envs/lintdb-benchmark/bin/python -X perf benchmarks/bench_lintdb.py
+	OMP_MAX_ACTIVE_LEVELS=2 OMP_THREAD_LIMIT=12 OMP_NUM_THREADS=6 PYTHONPATH="builds/python/lintdb/python/build/lib/lintdb" perf record -g -- /home/matt/miniconda3/envs/lintdb-benchmark/bin/python -X perf benchmarks/run_lintdb.py
 	perf script | ./debug/stackcollapse-perf.pl | ./debug/flamegraph.pl > perf.data.svg
4 changes: 2 additions & 2 deletions benchmarks/bench_lintdb.cpp
@@ -7,8 +7,8 @@
 #include "lintdb/index_builder/Tokenizer.h"
 
 static void BM_lintdb_search(benchmark::State& state) {
-    // std::string path = "/mnt/data/py_index_bench_colbert-lifestyle-2024-04-16-pq";
-    std::string path = "experiments/py_index_bench_colbert-lifestyle-2024-04-03";
+    std::string path = "experiments/py_index_bench_test-collection-xtr";
+    // std::string path = "experiments/py_index_bench_colbert-lifestyle-2024-04-03";
     lintdb::IndexIVF index(path);
     for (auto _ : state) {
         state.PauseTiming();
12 changes: 6 additions & 6 deletions benchmarks/bench_lintdb.py
@@ -25,7 +25,7 @@ def callgrind_dump_stats(path:str):
 app = typer.Typer()
 
 @app.command()
-def single_search(dataset:str='lifestyle', split:str='dev',profile=False, checkpoint:str='colbert-ir/colbertv2.0', index_path:str='experiments/py_index_bench_colbert-lifestyle-2024-04-03'):
+def single_search(dataset:str='lifestyle', split:str='dev',profile=False, checkpoint:str='colbert-ir/colbertv2.0', index_path:str='experiments/py_index_bench_test-collection-xtr'):
     latencies = []
     memory = []
 
@@ -38,8 +38,8 @@ def single_search(dataset:str='lifestyle', split:str='dev',profile=False, checkp
         converted = embeddings
 
         start = time.perf_counter()
-        # if profile:
-        #     callgrind_start_instrumentation()
+        if profile:
+            callgrind_start_instrumentation()
         opts = ldb.SearchOptions()
         results = index.search(
             0,
@@ -49,9 +49,9 @@
             opts
         )
         latencies.append((time.perf_counter() - start)*1000)
-        # if profile:
-        #     callgrind_stop_instrumentation()
-        #     callgrind_dump_stats("callgrind.out.single_search")
+        if profile:
+            callgrind_stop_instrumentation()
+            callgrind_dump_stats("callgrind.out.single_search")
         memory.append(get_memory_usage())
         rankings[id] = [x.id for x in results]
         count+=1
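The measure-then-profile loop this diff enables in bench_lintdb.py follows a common pattern: time each query, and optionally toggle callgrind instrumentation around the timed region. Here is a self-contained sketch of that pattern with the callgrind hooks stubbed out (the real hooks only take effect when running under valgrind):

```python
import time

# Stubs standing in for the callgrind hooks used in bench_lintdb.py;
# under valgrind these would start/stop instruction-level instrumentation.
def callgrind_start_instrumentation(): pass
def callgrind_stop_instrumentation(): pass
def callgrind_dump_stats(path): pass

def timed_search(search_fn, queries, profile=False):
    latencies = []
    for q in queries:
        start = time.perf_counter()
        if profile:
            callgrind_start_instrumentation()
        search_fn(q)
        # Record latency in milliseconds, as the benchmark does.
        latencies.append((time.perf_counter() - start) * 1000)
        if profile:
            callgrind_stop_instrumentation()
            callgrind_dump_stats("callgrind.out.single_search")
    return latencies

# Toy stand-in for index.search: any callable works here.
lat = timed_search(lambda q: sum(q), [[1, 2], [3, 4]], profile=True)
```

Keeping the instrumentation toggles inside the timed loop means the profile covers only search work, not setup or result handling.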
22 changes: 15 additions & 7 deletions benchmarks/lotte/common.py
@@ -61,23 +61,24 @@ def colbert_indexing(experiment: str, exp_path: str, dataset: LoTTeDataset, nbit
def lintdb_search(
experiment: str,
exp_path: str,
dataset:LoTTeDataset,
k,
nbits=2,
dataset:LoTTeDataset,
checkpoint: str = "colbert-ir/colbertv2.0",
reuse_centroids=True,
use_compression=False,
failures={}):
failures={},
use_xtr: bool = False,
):
# let's get the same model.
config = ColBERTConfig.load_from_checkpoint(checkpoint)
config.kmeans_niters=4
config.ncells = 2
config.ndocs=1024
config.centroid_score_threshold=.45

from colbert.modeling.checkpoint import Checkpoint
from colbert import Searcher
checkpoint = Checkpoint(checkpoint, config)
if not use_xtr:
from colbert.modeling.checkpoint import Checkpoint
from colbert import Searcher
checkpoint = Checkpoint(checkpoint, config)

index_path = f"{exp_path}/py_index_bench_{experiment}"
if not os.path.exists(index_path):
@@ -96,6 +97,7 @@ def lintdb_search(
failure_ids=set()
if failures:
failure_ids = set(failures.keys())
count=0
for id, query in zip(dataset.qids, dataset.queries):
if failures and id not in failure_ids:
continue
@@ -126,12 +128,18 @@
opts
)
else:
opts = ldb.SearchOptions()
opts.k_top_centroids = 1000
results = index.search(
0,
converted,
64, # nprobe
100, # k to return
opts
)
count+=1
# if count == 2:
# return
for rank, result in enumerate(results):
# qid, pid, rank
f.write(f"{id}\t{result.id}\t{rank+1}\t{result.score}\n")
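The distance tables mentioned in the PR description are the standard product-quantization scoring trick. The sketch below is illustrative only (function names are hypothetical, not LintDB's ProductEncoder API): split the query into sub-vectors, precompute each sub-vector's distance to every codebook centroid, then score any stored code with table lookups and additions.

```python
# Illustrative asymmetric-distance computation (ADC) with a distance table.
# codebooks[m][c] is centroid c of sub-space m; a stored code picks one
# centroid per sub-space.
def build_distance_table(query, codebooks):
    table = []
    sub = len(query) // len(codebooks)
    for m, centroids in enumerate(codebooks):
        q_sub = query[m * sub:(m + 1) * sub]
        # Squared L2 distance from the query sub-vector to each centroid.
        table.append([
            sum((a - b) ** 2 for a, b in zip(q_sub, centroid))
            for centroid in centroids
        ])
    return table

def adc_distance(table, code):
    # Scoring a code is just one lookup-and-add per sub-space.
    return sum(table[m][c] for m, c in enumerate(code))

codebooks = [
    [[0.0, 0.0], [1.0, 1.0]],  # sub-space 0: two centroids
    [[0.0, 1.0], [1.0, 0.0]],  # sub-space 1: two centroids
]
table = build_distance_table([1.0, 1.0, 0.0, 1.0], codebooks)
d = adc_distance(table, [1, 0])  # code = (centroid 1, centroid 0) -> 0.0
```

Because the table is built once per query, scanning many codes per inverted list stays cheap, which is what makes the codes-per-token layout practical.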