Skip to content

Commit

Permalink
Refactoring of project structure (#432)
Browse files Browse the repository at this point in the history
* Refactor file structure, update readme and examples

* Update Makefile

* Update git tester

* Remove redundant updates_test.cpp, apply suggested changes to example file

* Return back python3 in Makefile
  • Loading branch information
dyashuni authored Jan 15, 2023
1 parent 978f713 commit d86f8f9
Show file tree
Hide file tree
Showing 35 changed files with 412 additions and 136 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:

- name: Test
timeout-minutes: 15
run: python -m unittest discover -v --start-directory python_bindings/tests --pattern "*_test*.py"
run: python -m unittest discover -v --start-directory tests/python --pattern "bindings_test*.py"

test_cpp:
runs-on: ${{matrix.os}}
Expand Down Expand Up @@ -48,7 +48,7 @@ jobs:
- name: Prepare test data
run: |
pip install numpy
cd examples
cd tests/cpp/
python update_gen_data.py
shell: bash

Expand Down
12 changes: 6 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,21 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" )
endif()

add_executable(test_updates examples/updates_test.cpp)
add_executable(test_updates tests/cpp/updates_test.cpp)
target_link_libraries(test_updates hnswlib)

add_executable(searchKnnCloserFirst_test examples/searchKnnCloserFirst_test.cpp)
add_executable(searchKnnCloserFirst_test tests/cpp/searchKnnCloserFirst_test.cpp)
target_link_libraries(searchKnnCloserFirst_test hnswlib)

add_executable(searchKnnWithFilter_test examples/searchKnnWithFilter_test.cpp)
add_executable(searchKnnWithFilter_test tests/cpp/searchKnnWithFilter_test.cpp)
target_link_libraries(searchKnnWithFilter_test hnswlib)

add_executable(multiThreadLoad_test examples/multiThreadLoad_test.cpp)
add_executable(multiThreadLoad_test tests/cpp/multiThreadLoad_test.cpp)
target_link_libraries(multiThreadLoad_test hnswlib)

add_executable(multiThread_replace_test examples/multiThread_replace_test.cpp)
add_executable(multiThread_replace_test tests/cpp/multiThread_replace_test.cpp)
target_link_libraries(multiThread_replace_test hnswlib)

add_executable(main main.cpp sift_1b.cpp)
add_executable(main tests/cpp/main.cpp tests/cpp/sift_1b.cpp)
target_link_libraries(main hnswlib)
endif()
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ dist:
python3 -m build --sdist

test:
python3 -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py"
python3 -m unittest discover --start-directory tests/python --pattern "bindings_test*.py"

clean:
rm -rf *.egg-info build dist tmp var tests/__pycache__ hnswlib.cpython*.so
Expand Down
107 changes: 5 additions & 102 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ Properties of `hnswlib.Index` that support reading and writing:

#### Python bindings examples
[See more examples here](examples/EXAMPLES.md)
```python
import hnswlib
import numpy as np
Expand Down Expand Up @@ -229,104 +230,6 @@ labels, distances = p.knn_query(data, k=1)
print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(data))), "\n")
```

An example with a filter:
```python
import hnswlib
import numpy as np

# Problem size: vector dimensionality and number of vectors to index.
dim = 16
num_elements = 10000

# Random float32 sample vectors, one row per element.
data = np.float32(np.random.random((num_elements, dim)))

# Create the index object; `space` may be 'l2', 'cosine' or 'ip'.
hnsw_index = hnswlib.Index(space='l2', dim=dim)

# Initialise the index.
#   max_elements    - the maximum number of elements; must be known beforehand
#                     (probably will be made optional in the future)
#   ef_construction - controls the index search speed / build speed tradeoff
#   M               - tightly connected with the internal dimensionality of
#                     the data; strongly affects the memory consumption
hnsw_index.init_index(max_elements=num_elements, ef_construction=100, M=16)

# ef controls recall: a higher ef gives better accuracy but a slower search.
hnsw_index.set_ef(10)

# Number of threads used during batch search/construction
# (all available cores are used by default).
hnsw_index.set_num_threads(4)

print("Adding %d elements" % len(data))
# Insert the vectors with consecutive ids 0 .. num_elements - 1.
element_ids = np.arange(num_elements)
hnsw_index.add_items(data, ids=element_ids)

print("Querying only even elements")


def filter_function(idx):
    """Return True only for even ids, so odd ids are excluded from results."""
    return idx % 2 == 0


# Query every element for itself, restricting the search to even ids;
# the returned labels therefore contain only elements with an even id.
labels, distances = hnsw_index.knn_query(data, k=1, filter=filter_function)
```

An example with replacing of deleted elements:
```python
import hnswlib
import numpy as np

# Example: reuse the slots of deleted elements once the index is full.

dim = 16
num_elements = 1_000
# The index is sized for exactly two batches of `num_elements` vectors.
max_num_elements = 2 * num_elements

# Generating sample data: three batches with disjoint, consecutive label ranges
labels1 = np.arange(0, num_elements)
data1 = np.float32(np.random.random((num_elements, dim)))  # batch 1
labels2 = np.arange(num_elements, 2 * num_elements)
data2 = np.float32(np.random.random((num_elements, dim)))  # batch 2
labels3 = np.arange(2 * num_elements, 3 * num_elements)
data3 = np.float32(np.random.random((num_elements, dim)))  # batch 3

# Declaring index
hnsw_index = hnswlib.Index(space='l2', dim=dim)

# Initiating index
#   max_elements    - the maximum number of elements, should be known beforehand
#                     (probably will be made optional in the future)
#   ef_construction - controls index search speed/build speed tradeoff
#   M               - is tightly connected with internal dimensionality of the data;
#                     strongly affects the memory consumption

# Enable replacing of deleted elements
hnsw_index.init_index(max_elements=max_num_elements, ef_construction=200, M=16, allow_replace_deleted=True)

# Controlling the recall by setting ef:
# higher ef leads to better accuracy, but slower search
hnsw_index.set_ef(10)

# Set number of threads used during batch search/construction
# By default using all available cores
hnsw_index.set_num_threads(4)

# Add batch 1 and 2 data
hnsw_index.add_items(data1, labels1)
hnsw_index.add_items(data2, labels2)  # Note: maximum number of elements is reached

# Delete data of batch 2 (the loop body must be indented to be valid Python)
for label in labels2:
    hnsw_index.mark_deleted(label)

# Replace deleted elements
# Maximum number of elements is reached therefore we cannot add new items,
# but we can replace the deleted ones by using replace_deleted=True
hnsw_index.add_items(data3, labels3, replace_deleted=True)
# hnsw_index contains the data of batch 1 and batch 3 only
```

### Bindings installation

You can install from sources:
Expand All @@ -346,9 +249,9 @@ Contributions are highly welcome!

Please make pull requests against the `develop` branch.

When making changes please run tests (and please add a test to `python_bindings/tests` in case there is new functionality):
When making changes, please run the tests (and add a test to `tests/python` if you introduce new functionality):
```bash
python -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py"
python -m unittest discover --start-directory tests/python --pattern "bindings_test*.py"
```


Expand All @@ -373,7 +276,7 @@ https://github.com/dbaranchuk/ivf-hnsw
### 200M SIFT test reproduction
To download and extract the bigann dataset (from root directory):
```bash
python3 download_bigann.py
python tests/cpp/download_bigann.py
```
To compile:
```bash
Expand All @@ -393,7 +296,7 @@ The size of the BigANN subset (in millions) is controlled by the variable **subs
### Updates test
To generate testing data (from root directory):
```bash
cd examples
cd tests/cpp
python update_gen_data.py
```
To compile (from root directory):
Expand Down
Loading

0 comments on commit d86f8f9

Please sign in to comment.