Skip to content

Commit

Permalink
Voyager Backend (#41)
Browse files Browse the repository at this point in the history
* Add voyager backend
  • Loading branch information
sky-2002 authored Dec 6, 2024
1 parent fb8b4e9 commit 64a6409
Show file tree
Hide file tree
Showing 7 changed files with 171 additions and 9 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ install: venv
uv run pre-commit install

install-no-pre-commit:
uv pip install ".[dev,hnsw,pynndescent,annoy,faiss,usearch]"
uv pip install ".[dev,hnsw,pynndescent,annoy,faiss,usearch,voyager]"

install-base:
uv sync --extra dev
Expand Down
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ The following backends are supported:
- `ivf_scalar`: Inverted file search with scalar quantizer.
- `ivfpq`: Inverted file search with product quantizer.
- `ivfpqr`: Inverted file search with product quantizer and refinement.

- [VOYAGER](https://github.com/spotify/voyager): Voyager is a library for performing fast approximate nearest-neighbor searches on an in-memory collection of vectors.



Expand Down Expand Up @@ -149,7 +149,9 @@ NOTE: the ANN backends do not support dynamic deletion. To delete items, you nee
| | `connectivity` | Number of connections per node in the graph. | `16` |
| | `expansion_add` | Number of candidates considered during graph construction. | `128` |
| | `expansion_search` | Number of candidates considered during search. | `64` |

| **VOYAGER** | `metric` | Similarity space to use (`cosine`, `l2`). | `"cosine"` |
| | `ef_construction` | The number of vectors that this index searches through when inserting a new vector into the index. | `200` |
| | `m` | The number of connections between nodes in the tree’s internal data structure. | `16` |

## Installation
The following installation options are available:
Expand All @@ -166,6 +168,7 @@ pip install vicinity[faiss]
pip install vicinity[hnsw]
pip install vicinity[pynndescent]
pip install vicinity[usearch]
pip install vicinity[voyager]
```

## License
Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ pynndescent = [
annoy = ["annoy"]
faiss = ["faiss-cpu"]
usearch = ["usearch"]
voyager = ["voyager"]
all = [
"hnswlib",
"pynndescent>=0.5.10",
Expand All @@ -60,7 +61,8 @@ all = [
"numpy>=1.24.0",
"annoy",
"faiss-cpu",
"usearch"
"usearch",
"voyager"
]

[project.urls]
Expand Down
36 changes: 31 additions & 5 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,17 @@

random_gen = np.random.default_rng(42)

_faiss_index_types = ["flat", "ivf", "hnsw", "lsh", "scalar", "pq", "ivf_scalar", "ivfpq", "ivfpqr"]
_faiss_index_types = [
"flat",
"ivf",
"hnsw",
"lsh",
"scalar",
"pq",
"ivf_scalar",
"ivfpq",
"ivfpqr",
]


@pytest.fixture(scope="session")
Expand All @@ -35,11 +45,15 @@ def query_vector() -> np.ndarray:
(Backend.ANNOY, None),
(Backend.PYNNDESCENT, None),
(Backend.USEARCH, None),
(Backend.VOYAGER, None),
]


# Create human-readable ids for each backend type
BACKEND_IDS = [f"{backend.name}-{index_type}" if index_type else backend.name for backend, index_type in BACKEND_PARAMS]
BACKEND_IDS = [
f"{backend.name}-{index_type}" if index_type else backend.name
for backend, index_type in BACKEND_PARAMS
]


@pytest.fixture(params=BACKEND_PARAMS)
Expand All @@ -49,19 +63,31 @@ def backend_type(request: pytest.FixtureRequest) -> Backend:


@pytest.fixture(params=BACKEND_PARAMS, ids=BACKEND_IDS)
def vicinity_instance(request: pytest.FixtureRequest, items: list[str], vectors: np.ndarray) -> Vicinity:
def vicinity_instance(
request: pytest.FixtureRequest, items: list[str], vectors: np.ndarray
) -> Vicinity:
"""Fixture providing a Vicinity instance for each backend type."""
backend_type, index_type = request.param
# Handle FAISS backend with specific FAISS index types
if backend_type == Backend.FAISS:
if index_type in ("pq", "ivfpq", "ivfpqr"):
# Use smaller values for pq indexes since the dataset is small
return Vicinity.from_vectors_and_items(
vectors, items, backend_type=backend_type, index_type=index_type, m=2, nbits=4
vectors,
items,
backend_type=backend_type,
index_type=index_type,
m=2,
nbits=4,
)
else:
return Vicinity.from_vectors_and_items(
vectors, items, backend_type=backend_type, index_type=index_type, nlist=2, nbits=32
vectors,
items,
backend_type=backend_type,
index_type=index_type,
nlist=2,
nbits=32,
)

return Vicinity.from_vectors_and_items(vectors, items, backend_type=backend_type)
5 changes: 5 additions & 0 deletions vicinity/backends/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,10 @@ def get_backend_class(backend: Union[Backend, str]) -> type[AbstractBackend]:

return UsearchBackend

elif backend == Backend.VOYAGER:
from vicinity.backends.voyager import VoyagerBackend

return VoyagerBackend


__all__ = ["get_backend_class", "AbstractBackend"]
125 changes: 125 additions & 0 deletions vicinity/backends/voyager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Any, Union

import numpy as np
from numpy import typing as npt
from voyager import Index, Space

from vicinity.backends.base import AbstractBackend, BaseArgs
from vicinity.datatypes import Backend, QueryResult
from vicinity.utils import Metric, normalize


@dataclass
class VoyagerArgs(BaseArgs):
    """Construction arguments persisted alongside a Voyager index."""

    # Dimensionality of the indexed vectors; ``from_vectors`` sets it
    # from ``vectors.shape[1]``.
    dim: int = 0
    # Name of the similarity space, either "cosine" or "l2".
    metric: str = "cosine"
    # Number of vectors the index searches through when inserting a new vector.
    ef_construction: int = 200
    # Number of connections between nodes in the index's internal structure.
    m: int = 16


class VoyagerBackend(AbstractBackend[VoyagerArgs]):
    """Approximate nearest-neighbour backend built on a Voyager ``Index``.

    Supports the cosine and euclidean metrics. Deletion is not supported;
    ``delete`` always raises ``NotImplementedError``.
    """

    argument_class = VoyagerArgs
    supported_metrics = {Metric.COSINE, Metric.EUCLIDEAN}
    # Metric enum -> metric name stored in ``VoyagerArgs.metric``.
    inverse_metric_mapping = {
        Metric.COSINE: "cosine",
        Metric.EUCLIDEAN: "l2",
    }

    # Metric name -> integer value handed to ``voyager.Space``.
    # NOTE(review): 0 and 2 are expected to be Space.Euclidean and
    # Space.Cosine respectively; confirm against the installed voyager version.
    metric_int_mapping = {
        "l2": 0,
        "cosine": 2,
    }

    def __init__(
        self,
        index: Index,
        arguments: VoyagerArgs,
    ) -> None:
        """Wrap an already-built Voyager index.

        :param index: The Voyager index holding the vectors.
        :param arguments: The arguments the index was constructed with.
        """
        super().__init__(arguments)
        self.index = index

    @classmethod
    def from_vectors(
        cls: type[VoyagerBackend],
        vectors: npt.NDArray,
        metric: Union[str, Metric],
        ef_construction: int,
        m: int,
        **kwargs: Any,
    ) -> VoyagerBackend:
        """Build a new index from vectors.

        :param vectors: 2-D array of vectors; the second axis gives the
            index dimensionality.
        :param metric: The metric to use, as a string or ``Metric``.
        :param ef_construction: Forwarded to ``Index(ef_construction=...)``.
        :param m: Forwarded to ``Index(M=...)``.
        :param **kwargs: Accepted and ignored.
        :return: A backend wrapping the populated index.
        :raises ValueError: If the metric is not in ``supported_metrics``.
        """
        metric_enum = Metric.from_string(metric)

        if metric_enum not in cls.supported_metrics:
            raise ValueError(
                f"Metric '{metric_enum.value}' is not supported by VoyagerBackend."
            )

        # Keep the string form in its own local rather than rebinding the
        # ``metric`` parameter, whose declared type is ``Union[str, Metric]``.
        metric_name = cls._map_metric_to_string(metric_enum)
        dim = vectors.shape[1]
        space = Space(value=cls.metric_int_mapping[metric_name])
        index = Index(
            space=space,
            num_dimensions=dim,
            M=m,
            ef_construction=ef_construction,
        )
        index.add_items(vectors)
        return cls(
            index,
            VoyagerArgs(
                dim=dim,
                metric=metric_name,
                ef_construction=ef_construction,
                m=m,
            ),
        )

    def query(self, query: npt.NDArray, k: int) -> QueryResult:
        """Query the backend for the nearest neighbors.

        :param query: The query vectors.
            NOTE(review): assumed to be a 2-D batch so that each zipped pair
            is one row of ids and one row of distances; confirm with callers.
        :param k: The number of neighbours requested. Clamped to the number
            of stored vectors, since Voyager is documented to raise when
            asked for more results than it can return.
        :return: A list of ``(indices, distances)`` pairs.
        """
        k = min(k, len(self))
        indices, distances = self.index.query(query, k)
        return list(zip(indices, distances))

    @classmethod
    def load(cls: type[VoyagerBackend], base_path: Path) -> VoyagerBackend:
        """Load the index and its arguments from a directory.

        :param base_path: Directory containing ``index.bin`` and
            ``arguments.json``.
        :return: The restored backend.
        """
        # Normalise once so both files resolve even if a ``str`` was passed.
        base_path = Path(base_path)
        arguments = VoyagerArgs.load(base_path / "arguments.json")
        index = Index.load(str(base_path / "index.bin"))
        return cls(index, arguments=arguments)

    def save(self, base_path: Path) -> None:
        """Save the index and its arguments to a directory.

        :param base_path: Directory that receives ``index.bin`` and
            ``arguments.json``.
        """
        # Normalise once so both files resolve even if a ``str`` was passed.
        base_path = Path(base_path)
        self.index.save(str(base_path / "index.bin"))
        self.arguments.dump(base_path / "arguments.json")

    def insert(self, vectors: npt.NDArray) -> None:
        """Insert vectors into the backend."""
        self.index.add_items(vectors)

    def delete(self, indices: list[int]) -> None:
        """Delete vectors from the backend.

        :raises NotImplementedError: Always; this backend cannot delete.
        """
        raise NotImplementedError("Deletion is not supported in Voyager backend.")

    def threshold(self, vectors: npt.NDArray, threshold: float) -> list[npt.NDArray]:
        """Return, per query vector, the ids closer than a threshold.

        :param vectors: The query vectors.
        :param threshold: Ids are kept when their distance is strictly
            below this value.
        :return: One array of ids per query vector.
        """
        # Query for every stored vector, then filter each row by distance.
        return [
            ids[dists < threshold]
            for ids, dists in self.query(vectors, len(self))
        ]

    @property
    def backend_type(self) -> Backend:
        """The type of the backend."""
        return Backend.VOYAGER

    @property
    def dim(self) -> int:
        """Get the dimension of the space."""
        return self.index.num_dimensions

    def __len__(self) -> int:
        """Get the number of vectors."""
        return self.index.num_elements
1 change: 1 addition & 0 deletions vicinity/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@ class Backend(str, Enum):
PYNNDESCENT = "pynndescent"
FAISS = "faiss"
USEARCH = "usearch"
VOYAGER = "voyager"

0 comments on commit 64a6409

Please sign in to comment.