Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactoring of project structure #432

Merged
merged 5 commits into from
Jan 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:

- name: Test
timeout-minutes: 15
run: python -m unittest discover -v --start-directory python_bindings/tests --pattern "*_test*.py"
run: python -m unittest discover -v --start-directory tests/python --pattern "bindings_test*.py"

test_cpp:
runs-on: ${{matrix.os}}
Expand Down Expand Up @@ -48,7 +48,7 @@ jobs:
- name: Prepare test data
run: |
pip install numpy
cd examples
cd tests/cpp/
python update_gen_data.py
shell: bash

Expand Down
12 changes: 6 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,21 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" )
endif()

add_executable(test_updates examples/updates_test.cpp)
add_executable(test_updates tests/cpp/updates_test.cpp)
target_link_libraries(test_updates hnswlib)

add_executable(searchKnnCloserFirst_test examples/searchKnnCloserFirst_test.cpp)
add_executable(searchKnnCloserFirst_test tests/cpp/searchKnnCloserFirst_test.cpp)
target_link_libraries(searchKnnCloserFirst_test hnswlib)

add_executable(searchKnnWithFilter_test examples/searchKnnWithFilter_test.cpp)
add_executable(searchKnnWithFilter_test tests/cpp/searchKnnWithFilter_test.cpp)
target_link_libraries(searchKnnWithFilter_test hnswlib)

add_executable(multiThreadLoad_test examples/multiThreadLoad_test.cpp)
add_executable(multiThreadLoad_test tests/cpp/multiThreadLoad_test.cpp)
target_link_libraries(multiThreadLoad_test hnswlib)

add_executable(multiThread_replace_test examples/multiThread_replace_test.cpp)
add_executable(multiThread_replace_test tests/cpp/multiThread_replace_test.cpp)
target_link_libraries(multiThread_replace_test hnswlib)

add_executable(main main.cpp sift_1b.cpp)
add_executable(main tests/cpp/main.cpp tests/cpp/sift_1b.cpp)
target_link_libraries(main hnswlib)
endif()
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ dist:
python3 -m build --sdist

test:
python3 -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py"
python3 -m unittest discover --start-directory tests/python --pattern "bindings_test*.py"

clean:
rm -rf *.egg-info build dist tmp var tests/__pycache__ hnswlib.cpython*.so
Expand Down
107 changes: 5 additions & 102 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ Properties of `hnswlib.Index` that support reading and writing:


#### Python bindings examples
[See more examples here](examples/EXAMPLES.md)
```python
import hnswlib
import numpy as np
Expand Down Expand Up @@ -229,104 +230,6 @@ labels, distances = p.knn_query(data, k=1)
print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(data))), "\n")
```

An example with a filter:
```python
import hnswlib
import numpy as np

dim = 16
num_elements = 10000

# Generating sample data
data = np.float32(np.random.random((num_elements, dim)))

# Declaring index
hnsw_index = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip

# Initiating index
# max_elements - the maximum number of elements, should be known beforehand
# (probably will be made optional in the future)
#
# ef_construction - controls index search speed/build speed tradeoff
# M - is tightly connected with internal dimensionality of the data
# strongly affects the memory consumption

hnsw_index.init_index(max_elements=num_elements, ef_construction=100, M=16)

# Controlling the recall by setting ef:
# higher ef leads to better accuracy, but slower search
hnsw_index.set_ef(10)

# Set number of threads used during batch search/construction
# By default using all available cores
hnsw_index.set_num_threads(4)

print("Adding %d elements" % (len(data)))
# Added elements will have consecutive ids
hnsw_index.add_items(data, ids=np.arange(num_elements))

print("Querying only even elements")
# Define filter function that allows only even ids
filter_function = lambda idx: idx%2 == 0
# Query the index with each element, restricting the search to elements with even ids:
labels, distances = hnsw_index.knn_query(data, k=1, filter=filter_function)
# labels contain only elements with even id
```

An example of replacing deleted elements:
```python
import hnswlib
import numpy as np

dim = 16
num_elements = 1_000
max_num_elements = 2 * num_elements

# Generating sample data
labels1 = np.arange(0, num_elements)
data1 = np.float32(np.random.random((num_elements, dim))) # batch 1
labels2 = np.arange(num_elements, 2 * num_elements)
data2 = np.float32(np.random.random((num_elements, dim))) # batch 2
labels3 = np.arange(2 * num_elements, 3 * num_elements)
data3 = np.float32(np.random.random((num_elements, dim))) # batch 3

# Declaring index
hnsw_index = hnswlib.Index(space='l2', dim=dim)

# Initiating index
# max_elements - the maximum number of elements, should be known beforehand
# (probably will be made optional in the future)
#
# ef_construction - controls index search speed/build speed tradeoff
# M - is tightly connected with internal dimensionality of the data
# strongly affects the memory consumption

# Enable replacing of deleted elements
hnsw_index.init_index(max_elements=max_num_elements, ef_construction=200, M=16, allow_replace_deleted=True)

# Controlling the recall by setting ef:
# higher ef leads to better accuracy, but slower search
hnsw_index.set_ef(10)

# Set number of threads used during batch search/construction
# By default using all available cores
hnsw_index.set_num_threads(4)

# Add batch 1 and 2 data
hnsw_index.add_items(data1, labels1)
hnsw_index.add_items(data2, labels2) # Note: maximum number of elements is reached

# Delete data of batch 2
for label in labels2:
hnsw_index.mark_deleted(label)

# Replace deleted elements
# The maximum number of elements has been reached, so we cannot add new items,
# but we can replace the deleted ones by using replace_deleted=True
hnsw_index.add_items(data3, labels3, replace_deleted=True)
# hnsw_index contains the data of batch 1 and batch 3 only
```

### Bindings installation

You can install from sources:
Expand All @@ -346,9 +249,9 @@ Contributions are highly welcome!

Please make pull requests against the `develop` branch.

When making changes please run tests (and please add a test to `python_bindings/tests` in case there is new functionality):
When making changes, please run the tests (and please add a test to `tests/python` if you introduce new functionality):
```bash
python -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py"
python -m unittest discover --start-directory tests/python --pattern "bindings_test*.py"
```


Expand All @@ -373,7 +276,7 @@ https://github.com/dbaranchuk/ivf-hnsw
### 200M SIFT test reproduction
To download and extract the bigann dataset (from root directory):
```bash
python3 download_bigann.py
python tests/cpp/download_bigann.py
```
To compile:
```bash
Expand All @@ -393,7 +296,7 @@ The size of the BigANN subset (in millions) is controlled by the variable **subs
### Updates test
To generate testing data (from root directory):
```bash
cd examples
cd tests/cpp
python update_gen_data.py
```
To compile (from root directory):
Expand Down
Loading