Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable passing Python dictionaries in index.add #28

Merged
merged 9 commits into from
Jun 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[bumpversion]
current_version = 0.3.1
commit = True
tag = True

[bumpversion:file:version.txt]
search = {current_version}
replace = {new_version}

[bumpversion:file:vcpkg.json]
search = "version-string": "{current_version}"
replace = "version-string": "{new_version}"

[bumpversion:file:lintdb/version.h]
search = #define LINTDB_VERSION_STRING "{current_version}"
replace = #define LINTDB_VERSION_STRING "{new_version}"
1 change: 1 addition & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
[submodule "tools/vcpkg"]
path = tools/vcpkg
url = https://github.com/Microsoft/vcpkg.git
ignore = dirty
[submodule "lib/tokenizers-cpp"]
path = lib/tokenizers-cpp
url = https://github.com/DeployQL/tokenizers-cpp.git
1 change: 0 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ project(lintdb
LANGUAGES CXX
)
set(LINTDB_VERSION ${version})
configure_file(lintdb/version.h.in ${CMAKE_CURRENT_SOURCE_DIR}/lintdb/version.h @ONLY)

include(GNUInstallDirs)

Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ test-python: build-python
# had to fix up conda to make this work--
# conda install -c conda-forge gcc=12.1.0
# https://stackoverflow.com/questions/72540359/glibcxx-3-4-30-not-found-for-librosa-in-conda-virtual-environment-after-tryin
GLOG_v=100 PYTHONPATH="build/lintdb/python/build/lib" pytest tests/test_*.py
GLOG_v=100 PYTHONPATH="builds/python/lintdb/python/build/lib" pytest -v tests/test_*.py

run-python: build-python
PYTHONPATH="build/lintdb/python/build/lib" python tests/runner.py
Expand Down
25 changes: 19 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,31 +23,44 @@ conda install lintdb -c deployql -c conda-forge
LintDB makes it easy to upload data, even if you have multiple tenants.

```python
tenant_id = 1
index = ldb.IndexIVF(index_path)

collection_options = ldb.CollectionOptions()
collection_options.model_file = "model.onnx"
collection_options.tokenizer_file = "colbert_tokenizer.json"
collection = ldb.Collection(index, collection_options)
...
# we use an IVF index, so we need to train the centroids.
index.train(training_data)
...
# add documents to the index.
doc = ldb.RawPassage(embeddings, id)
index.add(tenant_id, [doc])
# add documents to the collection.
collection.add(tenant_id, [{'id': 1, 'text': 'hello world', 'metadata': {'doc_id': 'abc123'}}])

opts = ldb.SearchOptions()
opts.k_top_centroids = 2 # number of centroids to search per query token.

results = index.search(
results = collection.search(
tenant_id,
embeddings,
32, # number of centroids to search
100, # k to return
opts
)
```

## Late Interaction Model Support
LintDB aims to support late interaction and more advanced retrieval models.

- [x] ColBERTv2 with PLAID
- [x] XTR (experimental)

# Roadmap

LintDB aims to be a full retrieval platform.

We want to extend LintDB's features to include:
- Snippet highlighting and explainability features.
- Support for more algorithms for retrieval and ranking.
- [XTR](https://arxiv.org/pdf/2304.01982.pdf)
- Fine-tuning and pretraining, like [PreFLMR](https://arxiv.org/pdf/2402.08327.pdf)
- Increased support for document filtering.

Expand Down
5 changes: 3 additions & 2 deletions benchmarks/lotte/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def open_collection(index_path, index_type):

return index, collection

def create_collection(index_path, index_type, dims, nbits, num_subquantizers=16, num_centroids=32768):
def create_collection(index_path, index_type, dims, nbits, num_subquantizers=64, num_centroids=32768):
index = ldb.IndexIVF(index_path, num_centroids, dims, nbits, 6, num_subquantizers, index_type)
opts = ldb.CollectionOptions()
opts.model_file = model_files[index_type]['model_file']
Expand Down Expand Up @@ -129,11 +129,12 @@ def eval(dataset, experiment, index_type='binarizer', split='dev'):
for id, query in zip(data.qids, data.queries):
opts = ldb.SearchOptions()
opts.k_top_centroids = 100
opts.nearest_tokens_to_fetch = 100
results = collection.search(
0, # tenant
query, # converted,
100, # k to return
opts,
opts
)
for rank, result in enumerate(results):
# qid, pid, rank
Expand Down
203 changes: 135 additions & 68 deletions benchmarks/miracl/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,60 @@
import math
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

import numpy as np

app = typer.Typer()

# Relative paths to the model and tokenizer assets, keyed by index encoding.
# Only the XTR and BINARIZER encodings have entries; open_collection() and
# create_collection() index this mapping with the encoding they are given.
model_files = {
ldb.IndexEncoding_XTR: {
'model_file': "assets/xtr/encoder.onnx",
'tokenizer_file': "assets/xtr/spiece.model",
},
ldb.IndexEncoding_BINARIZER: {
'model_file': "assets/model.onnx",
'tokenizer_file': "assets/colbert_tokenizer.json",
},
}

def batch(iterable, n=1):
    """Yield consecutive slices of ``iterable`` holding at most ``n`` items each.

    ``iterable`` must support ``len()`` and slicing. The last slice is shorter
    when the length is not a multiple of ``n``.
    """
    total = len(iterable)
    for offset in range(0, total, n):
        # Clamp the upper bound so the final slice never runs past the end.
        upper = min(offset + n, total)
        yield iterable[offset:upper]


def open_collection(index_path, index_type):
    """Open the index stored at ``index_path`` and wrap it in a collection.

    The model and tokenizer files are taken from ``model_files[index_type]``,
    so a ``KeyError`` is raised when ``index_type`` has no entry there.

    Returns the ``(index, collection)`` pair.
    """
    ivf_index = ldb.IndexIVF(index_path)

    options = ldb.CollectionOptions()
    assets = model_files[index_type]
    options.model_file = assets['model_file']
    options.tokenizer_file = assets['tokenizer_file']

    return ivf_index, ldb.Collection(ivf_index, options)

def create_collection(index_path, index_type, dims, nbits, num_subquantizers=64, num_centroids=32768):
    """Build a new ``IndexIVF`` at ``index_path`` and wrap it in a collection.

    The model and tokenizer files are taken from ``model_files[index_type]``,
    so a ``KeyError`` is raised when ``index_type`` has no entry there. The
    index is constructed before that lookup happens.

    Returns the ``(index, collection)`` pair.
    """
    ivf_index = ldb.IndexIVF(
        index_path,
        num_centroids,
        dims,
        nbits,
        10,  # NOTE(review): meaning of this positional argument is not visible here -- confirm against the IndexIVF signature
        num_subquantizers,
        index_type,
    )

    options = ldb.CollectionOptions()
    assets = model_files[index_type]
    options.model_file = assets['model_file']
    options.tokenizer_file = assets['tokenizer_file']

    return ivf_index, ldb.Collection(ivf_index, options)

def get_index_type(index_type):
    """Translate an index-type name into the matching ``ldb.IndexEncoding_*`` value.

    Recognised names are ``"binarizer"``, ``"pq"``, ``"none"`` and ``"xtr"``.
    Any other name falls back to the binarizer encoding.
    """
    if index_type == 'pq':
        return ldb.IndexEncoding_PRODUCT_QUANTIZER
    if index_type == 'none':
        return ldb.IndexEncoding_NONE
    if index_type == 'xtr':
        return ldb.IndexEncoding_XTR
    # "binarizer" and every unrecognised name share the same default.
    return ldb.IndexEncoding_BINARIZER

# https://github.com/PongoAI/pongo-miracl-benchmark/blob/main/scripts/run-pongo.py
def batch(iterable, n=1):
l = len(iterable)
Expand All @@ -26,22 +76,24 @@ def batch(iterable, n=1):
def eval(
experiment: str,
split: str = 'en',
use_rerank: bool = True,
index_type="binarizer",
):
dataset = load_dataset('miracl/miracl', split, use_auth_token=True)

index = ldb.IndexIVF(f"experiments/miracl/{experiment}")
opts = ldb.CollectionOptions()
opts.model_file = "/home/matt/deployql/LintDB/assets/model.onnx"
opts.tokenizer_file = "/home/matt/deployql/LintDB/assets/colbert_tokenizer.json"
index_path = f"experiments/miracl/{experiment}"

collection = ldb.Collection(index, opts)
index_type_enum = get_index_type(index_type)

# lifestyle full centroids == 65536
#lifestyle-40k-benchmark centroids == 32768
if index_type != 'bge':
index, collection = open_collection(index_path, index_type_enum)
else:
from FlagEmbedding import BGEM3FlagModel

if use_rerank:
print("loading reranker model...")
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-v2-m3')
model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-v2-m3')
model.eval()
model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)
index_type_enum = get_index_type('binarizer') # use colbert
index, collection = open_collection(index_path, index_type_enum)

file_exists = False
try:
Expand All @@ -60,21 +112,30 @@ def eval(

for data in tqdm(dataset['dev']):
question = data['query']

results = collection.search(0, question, 100)
if use_rerank:
print("reranking...")
texts = [doc.metadata['text'] for doc in results]
pairs = [(question, text) for text in texts]

with torch.no_grad():
inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
scores = model(**inputs, return_dict=True).logits.view(-1, ).float()

tups = list(zip(results, scores))
results = sorted(tups, key=lambda x: x[1], reverse=True)
results = [x[0] for x in results]
print("done reranking...")
opts = ldb.SearchOptions()
opts.n_probe = 32
opts.num_second_pass = 2500
opts.k_top_centroids=2
if index_type != 'bge':
results = collection.search(0, question, 100, opts)
else:
import string
query = question.translate(str.maketrans('', '', string.punctuation))
embeds = model.encode(query, max_length=1028, return_colbert_vecs=True)['colbert_vecs']
results = index.search(0, embeds, 100, opts)
# if use_rerank:
# print("reranking...")
# texts = [doc.metadata['text'] for doc in results]
# pairs = [(question, text) for text in texts]
#
# with torch.no_grad():
# inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
# scores = model(**inputs, return_dict=True).logits.view(-1, ).float()
#
# tups = list(zip(results, scores))
# results = sorted(tups, key=lambda x: x[1], reverse=True)
# results = [x[0] for x in results]
# print("done reranking...")

mrr = -1
i = 1
Expand Down Expand Up @@ -124,33 +185,30 @@ def eval(
iDCG10_sum = 0
count = 0
for row in reader:
mrr3_sum += float(row[3]) if float(row[3]) <=3 else 0
mrr5_sum += float(row[3]) if float(row[3]) <=5 else 0
mrr3_sum += 1/float(row[3]) if float(row[3]) <=3 else 0
mrr5_sum += 1/float(row[3]) if float(row[3]) <=5 else 0
DCG10_sum += float(row[4])
iDCG10_sum += float(row[5])
count += 1

MRR = mrr_sum / count
MRR3 = mrr3_sum / count
MRR5 = mrr5_sum / count
NDCG10 = DCG10_sum / iDCG10_sum

print(f"MRR: {MRR}")
print(f"MRR@3: {MRR3}")
print(f"MRR@5: {MRR5}")
print(f"NDCG@10: {NDCG10}")



@app.command()
def index(
def run(
experiment: str,
split: str = 'en',
k: int = 5,
start:int=0,
stop:int=40000,
num_procs:int=10,
k: int = 5,
nbits: int=1,
index_type="binarizer",
use_batch:bool=False,
batch_size:int=5,
checkpoint: str = "colbert-ir/colbertv2.0"):
):
print("Loading dataset...")
dataset = load_dataset('miracl/miracl', split, use_auth_token=True)
print("Dataset loaded.")
Expand All @@ -161,33 +219,19 @@ def index(
# delete directory if exists
shutil.rmtree(index_path)

index_type_enum = ldb.IndexEncoding_BINARIZER
if index_type == "binarizer":
index_type_enum = ldb.IndexEncoding_BINARIZER
elif index_type == 'pq':
index_type_enum = ldb.IndexEncoding_PRODUCT_QUANTIZER
elif index_type == 'none':
index_type_enum = ldb.IndexEncoding_NONE
elif index_type == 'xtr':
index_type_enum = ldb.IndexEncoding_XTR

print(f"using index type: {index_type_enum}")
if index_type != 'bge':
index_type_enum = get_index_type(index_type)

# lifestyle full centroids == 65536
#lifestyle-40k-benchmark centroids == 32768
dims = 128
index, collection = create_collection(index_path, index_type_enum, 128, 2)
else:
from FlagEmbedding import BGEM3FlagModel

config = ldb.Configuration()
config.nbits = nbits
config.dim = dims
config.quantizer_type = index_type_enum
model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)
index_type_enum = get_index_type('binarizer') # use colbert
index, collection = create_collection(index_path, index_type_enum, 128, 2, num_centroids=942)

index = ldb.IndexIVF(index_path, config)
opts = ldb.CollectionOptions()
opts.model_file = "/home/matt/deployql/LintDB/assets/model.onnx"
opts.tokenizer_file = "/home/matt/deployql/LintDB/assets/colbert_tokenizer.json"

collection = ldb.Collection(index, opts)

id=0
passages = []
Expand All @@ -197,18 +241,41 @@ def index(
passages.extend(data['positive_passages'])
passages.extend(data['negative_passages'])

training_data = random.sample(passages, 1000)
training_data = random.sample(passages, 5000)
training_data = [x['text'] for x in training_data]

collection.train(training_data)
if index_type != 'bge':
collection.train(training_data)
else:

if os.path.exists("miracl-bge-embeddings.npz"):
print("Loading embeddings...")
training_embeds = np.load("miracl-bge-embeddings.npz")['arr_0']
else:
training_embeds = None

for sent in training_data:
embeds = model.encode(sent, max_length=1028, return_colbert_vecs=True)['colbert_vecs']
if training_embeds is None:
training_embeds = embeds
else:
training_embeds = np.append(training_embeds, embeds, axis=0)

print(np.sqrt(len(training_embeds)))
np.savez("miracl-bge-embeddings", training_embeds)
index.train(training_embeds)

start = time.perf_counter()
for passage in passages:
collection.add(0, id, passage['text'], {
'text': passage['text'],
'docid': passage['docid'],
'title': passage['title'],
})
if index_type != 'bge':
collection.add(0, id, passage['text'], {
'text': passage['text'],
'docid': passage['docid'],
'title': passage['title'],
})
else:
embeds = model.encode(passage['text'], max_length=1028, return_colbert_vecs=True)['colbert_vecs']
index.add(0, [{'embeddings': embeds, 'id': id, 'metadata': {'text': passage['text'], 'docid': passage['docid'], 'title': passage['title']}}])
id += 1

duration = time.perf_counter() - start
Expand Down
Loading
Loading