Skip to content

Commit

Permalink
Enable passing Python dictionaries in index.add (#28)
Browse files Browse the repository at this point in the history
This PR changes the external interface for indexing. Collection already
translated Python dictionaries into input passages, but the index didn't.

We update that so we can pass dictionaries with either `text` or
`embeddings` keys.

There are a few internal interface changes as well. Notably, `add_batch`
on the collection was passing its argument by value; that has been fixed.
  • Loading branch information
mtbarta committed Jun 15, 2024
1 parent f9b3364 commit 47effb2
Show file tree
Hide file tree
Showing 34 changed files with 637 additions and 472 deletions.
16 changes: 16 additions & 0 deletions .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[bumpversion]
current_version = 0.3.1
commit = True
tag = True

[bumpversion:file:version.txt]
search = {current_version}
replace = {new_version}

[bumpversion:file:vcpkg.json]
search = "version-string": "{current_version}"
replace = "version-string": "{new_version}"

[bumpversion:file:lintdb/version.h]
search = #define LINTDB_VERSION_STRING "{current_version}"
replace = #define LINTDB_VERSION_STRING "{new_version}"
1 change: 1 addition & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
[submodule "tools/vcpkg"]
path = tools/vcpkg
url = https://github.com/Microsoft/vcpkg.git
ignore = dirty
[submodule "lib/tokenizers-cpp"]
path = lib/tokenizers-cpp
url = https://github.com/DeployQL/tokenizers-cpp.git
1 change: 0 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ project(lintdb
LANGUAGES CXX
)
set(LINTDB_VERSION ${version})
configure_file(lintdb/version.h.in ${CMAKE_CURRENT_SOURCE_DIR}/lintdb/version.h @ONLY)

include(GNUInstallDirs)

Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ test-python: build-python
# had to fix up conda to make this work--
# conda install -c conda-forge gcc=12.1.0
# https://stackoverflow.com/questions/72540359/glibcxx-3-4-30-not-found-for-librosa-in-conda-virtual-environment-after-tryin
GLOG_v=100 PYTHONPATH="build/lintdb/python/build/lib" pytest tests/test_*.py
GLOG_v=100 PYTHONPATH="builds/python/lintdb/python/build/lib" pytest -v tests/test_*.py

run-python: build-python
PYTHONPATH="build/lintdb/python/build/lib" python tests/runner.py
Expand Down
25 changes: 19 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,31 +23,44 @@ conda install lintdb -c deployql -c conda-forge
LintDB makes it easy to upload data, even if you have multiple tenants.

```python
tenant_id = 1
index = ldb.IndexIVF(index_path)

collection_options = ldb.CollectionOptions()
collection_options.model_file = "model.onnx"
collection_options.tokenizer_file = "colbert_tokenizer.json"
collection = ldb.Collection(index, collection_options)
...
# we use an IVF index, so we need to train the centroids.
index.train(training_data)
...
# add documents to the index.
doc = ldb.RawPassage(embeddings, id)
index.add(tenant_id, [doc])
# add documents to the collection.
collection.add(tenant_id, [{'id': 1, 'text': 'hello world', 'metadata': {'doc_id': 'abc123'}}])

opts = ldb.SearchOptions()
opts.k_top_centroids = 2 # number of centroids to search per query token.

results = index.search(
results = collection.search(
tenant_id,
embeddings,
32, # number of centroids to search
100, # k to return
opts
)
```

## Late Interaction Model Support
LintDB aims to support late interaction and more advanced retrieval models.

- [x] ColBERTv2 with PLAID
- [x] XTR (experimental)

# Roadmap

LintDB aims to be a full retrieval platform.

We want to extend LintDB's features to include:
- Snippet highlighting and explainability features.
- Support for more algorithms for retrieval and ranking.
- [XTR](https://arxiv.org/pdf/2304.01982.pdf)
- Fine tuning and pretraining, like [PreFLMR](https://arxiv.org/pdf/2402.08327.pdf)
- Increased support for document filtering.

Expand Down
5 changes: 3 additions & 2 deletions benchmarks/lotte/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def open_collection(index_path, index_type):

return index, collection

def create_collection(index_path, index_type, dims, nbits, num_subquantizers=16, num_centroids=32768):
def create_collection(index_path, index_type, dims, nbits, num_subquantizers=64, num_centroids=32768):
index = ldb.IndexIVF(index_path, num_centroids, dims, nbits, 6, num_subquantizers, index_type)
opts = ldb.CollectionOptions()
opts.model_file = model_files[index_type]['model_file']
Expand Down Expand Up @@ -129,11 +129,12 @@ def eval(dataset, experiment, index_type='binarizer', split='dev'):
for id, query in zip(data.qids, data.queries):
opts = ldb.SearchOptions()
opts.k_top_centroids = 100
opts.nearest_tokens_to_fetch = 100
results = collection.search(
0, # tenant
query, # converted,
100, # k to return
opts,
opts
)
for rank, result in enumerate(results):
# qid, pid, rank
Expand Down
203 changes: 135 additions & 68 deletions benchmarks/miracl/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,60 @@
import math
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

import numpy as np

app = typer.Typer()

# Model and tokenizer files for each supported index encoding. open_collection
# and create_collection copy these values into ldb.CollectionOptions.
# Only the XTR and BINARIZER encodings have an entry, so looking up any other
# encoding raises KeyError.
# NOTE(review): the paths are relative -- presumably the script is expected to
# run from the repository root; confirm.
model_files = {
ldb.IndexEncoding_XTR: {
'model_file': "assets/xtr/encoder.onnx",
'tokenizer_file': "assets/xtr/spiece.model",
},
ldb.IndexEncoding_BINARIZER: {
'model_file': "assets/model.onnx",
'tokenizer_file': "assets/colbert_tokenizer.json",
},
}

def batch(iterable, n=1):
    """Yield consecutive slices of ``iterable`` holding at most ``n`` items.

    Args:
        iterable: A sized, sliceable sequence (list, tuple, str, ...). It is
            measured with ``len`` and cut with slice indexing, so one-shot
            iterators and generators are not supported.
        n: Maximum number of items per slice. Must be at least 1.

    Yields:
        Slices of ``iterable`` in their original order. The final slice may
        hold fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1. Because this is a generator,
            the error surfaces when iteration starts.
    """
    # A negative step would make ``range`` produce nothing (silently dropping
    # every item) and a zero step fails with an unrelated-looking message, so
    # reject both up front with a clear error.
    if n < 1:
        raise ValueError(f"batch size must be at least 1, got {n}")

    total = len(iterable)
    for start in range(0, total, n):
        # Slicing already clamps to the end of the sequence, so no explicit
        # upper bound is needed for the last, possibly shorter, slice.
        yield iterable[start:start + n]


def open_collection(index_path, index_type):
    """Open the IVF index stored at ``index_path`` and wrap it in a Collection.

    Args:
        index_path: Path handed to ``ldb.IndexIVF``.
        index_type: Index encoding used as the key into ``model_files`` to
            pick the model and tokenizer files.

    Returns:
        A ``(index, collection)`` tuple.

    Raises:
        KeyError: If ``model_files`` has no entry for ``index_type``.
    """
    ivf_index = ldb.IndexIVF(index_path)

    options = ldb.CollectionOptions()
    files = model_files[index_type]
    options.model_file = files['model_file']
    options.tokenizer_file = files['tokenizer_file']

    return ivf_index, ldb.Collection(ivf_index, options)

def create_collection(index_path, index_type, dims, nbits, num_subquantizers=64, num_centroids=32768):
    """Create an IVF index at ``index_path`` and wrap it in a Collection.

    Args:
        index_path: Path handed to ``ldb.IndexIVF``.
        index_type: Index encoding; passed to ``ldb.IndexIVF`` and used as the
            key into ``model_files`` to pick the model and tokenizer files.
        dims: Forwarded to ``ldb.IndexIVF`` as its third positional argument.
        nbits: Forwarded to ``ldb.IndexIVF`` as its fourth positional argument.
        num_subquantizers: Forwarded to ``ldb.IndexIVF``; defaults to 64.
        num_centroids: Forwarded to ``ldb.IndexIVF``; defaults to 32768.

    Returns:
        A ``(index, collection)`` tuple.

    Raises:
        KeyError: If ``model_files`` has no entry for ``index_type``.
    """
    # NOTE(review): the literal 10 is the fifth positional IndexIVF argument;
    # presumably the number of training iterations -- confirm against the
    # LintDB bindings.
    ivf_index = ldb.IndexIVF(
        index_path,
        num_centroids,
        dims,
        nbits,
        10,
        num_subquantizers,
        index_type,
    )

    options = ldb.CollectionOptions()
    files = model_files[index_type]
    options.model_file = files['model_file']
    options.tokenizer_file = files['tokenizer_file']

    return ivf_index, ldb.Collection(ivf_index, options)

def get_index_type(index_type):
    """Translate an index type name into the matching ``ldb`` encoding constant.

    Args:
        index_type: One of ``"binarizer"``, ``'pq'``, ``'none'`` or ``'xtr'``.

    Returns:
        The corresponding ``ldb.IndexEncoding_*`` value. Any other name falls
        back to ``ldb.IndexEncoding_BINARIZER``.
    """
    if index_type == 'pq':
        return ldb.IndexEncoding_PRODUCT_QUANTIZER
    if index_type == 'none':
        return ldb.IndexEncoding_NONE
    if index_type == 'xtr':
        return ldb.IndexEncoding_XTR
    # "binarizer" and every unrecognised name resolve to the binarizer encoding.
    return ldb.IndexEncoding_BINARIZER

# https://github.com/PongoAI/pongo-miracl-benchmark/blob/main/scripts/run-pongo.py
def batch(iterable, n=1):
l = len(iterable)
Expand All @@ -26,22 +76,24 @@ def batch(iterable, n=1):
def eval(
experiment: str,
split: str = 'en',
use_rerank: bool = True,
index_type="binarizer",
):
dataset = load_dataset('miracl/miracl', split, use_auth_token=True)

index = ldb.IndexIVF(f"experiments/miracl/{experiment}")
opts = ldb.CollectionOptions()
opts.model_file = "/home/matt/deployql/LintDB/assets/model.onnx"
opts.tokenizer_file = "/home/matt/deployql/LintDB/assets/colbert_tokenizer.json"
index_path = f"experiments/miracl/{experiment}"

collection = ldb.Collection(index, opts)
index_type_enum = get_index_type(index_type)

# lifestyle full centroids == 65536
#lifestyle-40k-benchmark centroids == 32768
if index_type != 'bge':
index, collection = open_collection(index_path, index_type_enum)
else:
from FlagEmbedding import BGEM3FlagModel

if use_rerank:
print("loading reranker model...")
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-v2-m3')
model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-v2-m3')
model.eval()
model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)
index_type_enum = get_index_type('binarizer') # use colbert
index, collection = open_collection(index_path, index_type_enum)

file_exists = False
try:
Expand All @@ -60,21 +112,30 @@ def eval(

for data in tqdm(dataset['dev']):
question = data['query']

results = collection.search(0, question, 100)
if use_rerank:
print("reranking...")
texts = [doc.metadata['text'] for doc in results]
pairs = [(question, text) for text in texts]

with torch.no_grad():
inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
scores = model(**inputs, return_dict=True).logits.view(-1, ).float()

tups = list(zip(results, scores))
results = sorted(tups, key=lambda x: x[1], reverse=True)
results = [x[0] for x in results]
print("done reranking...")
opts = ldb.SearchOptions()
opts.n_probe = 32
opts.num_second_pass = 2500
opts.k_top_centroids=2
if index_type != 'bge':
results = collection.search(0, question, 100, opts)
else:
import string
query = question.translate(str.maketrans('', '', string.punctuation))
embeds = model.encode(query, max_length=1028, return_colbert_vecs=True)['colbert_vecs']
results = index.search(0, embeds, 100, opts)
# if use_rerank:
# print("reranking...")
# texts = [doc.metadata['text'] for doc in results]
# pairs = [(question, text) for text in texts]
#
# with torch.no_grad():
# inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
# scores = model(**inputs, return_dict=True).logits.view(-1, ).float()
#
# tups = list(zip(results, scores))
# results = sorted(tups, key=lambda x: x[1], reverse=True)
# results = [x[0] for x in results]
# print("done reranking...")

mrr = -1
i = 1
Expand Down Expand Up @@ -124,33 +185,30 @@ def eval(
iDCG10_sum = 0
count = 0
for row in reader:
mrr3_sum += float(row[3]) if float(row[3]) <=3 else 0
mrr5_sum += float(row[3]) if float(row[3]) <=5 else 0
mrr3_sum += 1/float(row[3]) if float(row[3]) <=3 else 0
mrr5_sum += 1/float(row[3]) if float(row[3]) <=5 else 0
DCG10_sum += float(row[4])
iDCG10_sum += float(row[5])
count += 1

MRR = mrr_sum / count
MRR3 = mrr3_sum / count
MRR5 = mrr5_sum / count
NDCG10 = DCG10_sum / iDCG10_sum

print(f"MRR: {MRR}")
print(f"MRR@3: {MRR3}")
print(f"MRR@5: {MRR5}")
print(f"NDCG@10: {NDCG10}")



@app.command()
def index(
def run(
experiment: str,
split: str = 'en',
k: int = 5,
start:int=0,
stop:int=40000,
num_procs:int=10,
k: int = 5,
nbits: int=1,
index_type="binarizer",
use_batch:bool=False,
batch_size:int=5,
checkpoint: str = "colbert-ir/colbertv2.0"):
):
print("Loading dataset...")
dataset = load_dataset('miracl/miracl', split, use_auth_token=True)
print("Dataset loaded.")
Expand All @@ -161,33 +219,19 @@ def index(
# delete directory if exists
shutil.rmtree(index_path)

index_type_enum = ldb.IndexEncoding_BINARIZER
if index_type == "binarizer":
index_type_enum = ldb.IndexEncoding_BINARIZER
elif index_type == 'pq':
index_type_enum = ldb.IndexEncoding_PRODUCT_QUANTIZER
elif index_type == 'none':
index_type_enum = ldb.IndexEncoding_NONE
elif index_type == 'xtr':
index_type_enum = ldb.IndexEncoding_XTR

print(f"using index type: {index_type_enum}")
if index_type != 'bge':
index_type_enum = get_index_type(index_type)

# lifestyle full centroids == 65536
#lifestyle-40k-benchmark centroids == 32768
dims = 128
index, collection = create_collection(index_path, index_type_enum, 128, 2)
else:
from FlagEmbedding import BGEM3FlagModel

config = ldb.Configuration()
config.nbits = nbits
config.dim = dims
config.quantizer_type = index_type_enum
model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)
index_type_enum = get_index_type('binarizer') # use colbert
index, collection = create_collection(index_path, index_type_enum, 128, 2, num_centroids=942)

index = ldb.IndexIVF(index_path, config)
opts = ldb.CollectionOptions()
opts.model_file = "/home/matt/deployql/LintDB/assets/model.onnx"
opts.tokenizer_file = "/home/matt/deployql/LintDB/assets/colbert_tokenizer.json"

collection = ldb.Collection(index, opts)

id=0
passages = []
Expand All @@ -197,18 +241,41 @@ def index(
passages.extend(data['positive_passages'])
passages.extend(data['negative_passages'])

training_data = random.sample(passages, 1000)
training_data = random.sample(passages, 5000)
training_data = [x['text'] for x in training_data]

collection.train(training_data)
if index_type != 'bge':
collection.train(training_data)
else:

if os.path.exists("miracl-bge-embeddings.npz"):
print("Loading embeddings...")
training_embeds = np.load("miracl-bge-embeddings.npz")['arr_0']
else:
training_embeds = None

for sent in training_data:
embeds = model.encode(sent, max_length=1028, return_colbert_vecs=True)['colbert_vecs']
if training_embeds is None:
training_embeds = embeds
else:
training_embeds = np.append(training_embeds, embeds, axis=0)

print(np.sqrt(len(training_embeds)))
np.savez("miracl-bge-embeddings", training_embeds)
index.train(training_embeds)

start = time.perf_counter()
for passage in passages:
collection.add(0, id, passage['text'], {
'text': passage['text'],
'docid': passage['docid'],
'title': passage['title'],
})
if index_type != 'bge':
collection.add(0, id, passage['text'], {
'text': passage['text'],
'docid': passage['docid'],
'title': passage['title'],
})
else:
embeds = model.encode(passage['text'], max_length=1028, return_colbert_vecs=True)['colbert_vecs']
index.add(0, [{'embeddings': embeds, 'id': id, 'metadata': {'text': passage['text'], 'docid': passage['docid'], 'title': passage['title']}}])
id += 1

duration = time.perf_counter() - start
Expand Down
Loading

0 comments on commit 47effb2

Please sign in to comment.