-
Notifications
You must be signed in to change notification settings - Fork 5.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Init ColBERTv2 managed index #9656
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -47,6 +47,7 @@ def __init__( | |
service_context: Optional[ServiceContext] = None, | ||
storage_context: Optional[StorageContext] = None, | ||
model_name: str = "colbert-ir/colbertv2.0", | ||
index_name: str = "", | ||
show_progress: bool = False, | ||
nbits: int = 2, | ||
gpus: int = 0, | ||
|
@@ -58,6 +59,7 @@ def __init__( | |
) -> None: | ||
self.model_name = model_name | ||
self.index_path = "storage/colbert_index" | ||
self.index_name = index_name | ||
self.nbits = nbits | ||
self.gpus = gpus | ||
self.ranks = ranks | ||
|
@@ -75,6 +77,7 @@ def __init__( | |
super().__init__( | ||
nodes=nodes, | ||
index_struct=index_struct, | ||
index_name=index_name, | ||
service_context=service_context, | ||
storage_context=storage_context, | ||
show_progress=show_progress, | ||
|
@@ -100,7 +103,7 @@ def _build_index_from_nodes(self, nodes: Sequence[BaseNode]) -> IndexDict: | |
"""Generate a PLAID index from the ColBERT checkpoint via its hugging face | ||
model_name. | ||
""" | ||
from colbert import Indexer, IndexUpdater, Searcher | ||
from colbert import Indexer, Searcher | ||
from colbert.infra import ColBERTConfig, Run, RunConfig | ||
|
||
index_struct = IndexDict() | ||
|
@@ -121,12 +124,9 @@ def _build_index_from_nodes(self, nodes: Sequence[BaseNode]) -> IndexDict: | |
kmeans_niters=self.kmeans_niters, | ||
) | ||
indexer = Indexer(checkpoint=self.model_name, config=config) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @logan-markewich their indexer There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, I suppose. It just feels very similar to BM25, since you can't add or delete data from it? It doesn't fit our definition of an index very well. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Oh, they do have CRUD for index data: https://github.com/stanford-futuredata/ColBERT/blob/117098348e5196ede1c8e396c1c14f24d9a8754e/colbert/index_updater.py The previous PR does not support it. I will add it in another PR; this PR is mostly for testing retrieval performance. |
||
indexer.index("", collection=docs_list, overwrite=True) | ||
indexer.index(name=self.index_name, collection=docs_list, overwrite=True) | ||
self.store = Searcher( | ||
index="", collection=docs_list, checkpoint=self.model_name | ||
) | ||
self.updater = IndexUpdater( | ||
config=config, searcher=self.store, checkpoint=self.model_name | ||
index=self.index_name, collection=docs_list, checkpoint=self.model_name | ||
) | ||
return index_struct | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if this would make more sense as just a retriever, similar to `BM25Retriever`? Thoughts?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ColBERTv2 does indeed build an index; it has its own indexer and retriever.