From 454c4c9104a1878fd9f71b75a84b2a3a3b5efb9e Mon Sep 17 00:00:00 2001 From: Dmitry Yashunin Date: Sat, 14 Jan 2023 16:34:55 +0400 Subject: [PATCH 1/5] Refactor file structure, update readme and examples --- .github/workflows/build.yml | 4 +- CMakeLists.txt | 12 +- README.md | 107 +------ examples/EXAMPLES.md | 205 +++++++++++++ examples/example.py | 9 +- examples/example_filter.py | 45 +++ examples/example_replace_deleted.py | 55 ++++ examples/example_search.py | 41 +++ ...xample_old.py => example_serialization.py} | 43 ++- examples/pyw_hnswlib.py | 4 + python_bindings/tests/__init__.py | 0 .../cpp/download_bigann.py | 0 main.cpp => tests/cpp/main.cpp | 0 .../cpp}/multiThreadLoad_test.cpp | 2 +- .../cpp}/multiThread_replace_test.cpp | 2 +- .../cpp}/searchKnnCloserFirst_test.cpp | 2 +- .../cpp}/searchKnnWithFilter_test.cpp | 2 +- sift_1b.cpp => tests/cpp/sift_1b.cpp | 2 +- sift_test.cpp => tests/cpp/sift_test.cpp | 2 +- {examples => tests/cpp}/update_gen_data.py | 0 tests/cpp/updates_test.cpp | 278 ++++++++++++++++++ .../tests => tests/python}/bindings_test.py | 0 .../python}/bindings_test_filter.py | 0 .../python}/bindings_test_getdata.py | 0 .../python}/bindings_test_labels.py | 0 .../python}/bindings_test_metadata.py | 0 .../python}/bindings_test_pickle.py | 0 .../python}/bindings_test_recall.py | 0 .../python}/bindings_test_replace.py | 0 .../python}/bindings_test_resize.py | 0 .../python}/bindings_test_spaces.py | 0 .../bindings_test_stress_mt_replace.py | 0 {examples => tests/python}/git_tester.py | 0 {examples => tests/python}/speedtest.py | 0 {examples => tests}/updates_test.cpp | 0 35 files changed, 684 insertions(+), 131 deletions(-) create mode 100644 examples/EXAMPLES.md create mode 100644 examples/example_filter.py create mode 100644 examples/example_replace_deleted.py create mode 100644 examples/example_search.py rename examples/{example_old.py => example_serialization.py} (59%) delete mode 100644 python_bindings/tests/__init__.py rename 
download_bigann.py => tests/cpp/download_bigann.py (100%) rename main.cpp => tests/cpp/main.cpp (100%) rename {examples => tests/cpp}/multiThreadLoad_test.cpp (99%) rename {examples => tests/cpp}/multiThread_replace_test.cpp (99%) rename {examples => tests/cpp}/searchKnnCloserFirst_test.cpp (98%) rename {examples => tests/cpp}/searchKnnWithFilter_test.cpp (99%) rename sift_1b.cpp => tests/cpp/sift_1b.cpp (99%) rename sift_test.cpp => tests/cpp/sift_test.cpp (99%) rename {examples => tests/cpp}/update_gen_data.py (100%) create mode 100644 tests/cpp/updates_test.cpp rename {python_bindings/tests => tests/python}/bindings_test.py (100%) rename {python_bindings/tests => tests/python}/bindings_test_filter.py (100%) rename {python_bindings/tests => tests/python}/bindings_test_getdata.py (100%) rename {python_bindings/tests => tests/python}/bindings_test_labels.py (100%) rename {python_bindings/tests => tests/python}/bindings_test_metadata.py (100%) rename {python_bindings/tests => tests/python}/bindings_test_pickle.py (100%) rename {python_bindings/tests => tests/python}/bindings_test_recall.py (100%) rename {python_bindings/tests => tests/python}/bindings_test_replace.py (100%) rename {python_bindings/tests => tests/python}/bindings_test_resize.py (100%) rename {python_bindings/tests => tests/python}/bindings_test_spaces.py (100%) rename {python_bindings/tests => tests/python}/bindings_test_stress_mt_replace.py (100%) rename {examples => tests/python}/git_tester.py (100%) rename {examples => tests/python}/speedtest.py (100%) rename {examples => tests}/updates_test.cpp (100%) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e86d2545..d45b8b33 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -20,7 +20,7 @@ jobs: - name: Test timeout-minutes: 15 - run: python -m unittest discover -v --start-directory python_bindings/tests --pattern "*_test*.py" + run: python -m unittest discover -v --start-directory tests/python 
--pattern "bindings_test*.py" test_cpp: runs-on: ${{matrix.os}} @@ -48,7 +48,7 @@ jobs: - name: Prepare test data run: | pip install numpy - cd examples + cd tests/cpp/ python update_gen_data.py shell: bash diff --git a/CMakeLists.txt b/CMakeLists.txt index de951171..9fcdcb73 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,21 +16,21 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" ) endif() - add_executable(test_updates examples/updates_test.cpp) + add_executable(test_updates tests/cpp/updates_test.cpp) target_link_libraries(test_updates hnswlib) - add_executable(searchKnnCloserFirst_test examples/searchKnnCloserFirst_test.cpp) + add_executable(searchKnnCloserFirst_test tests/cpp/searchKnnCloserFirst_test.cpp) target_link_libraries(searchKnnCloserFirst_test hnswlib) - add_executable(searchKnnWithFilter_test examples/searchKnnWithFilter_test.cpp) + add_executable(searchKnnWithFilter_test tests/cpp/searchKnnWithFilter_test.cpp) target_link_libraries(searchKnnWithFilter_test hnswlib) - add_executable(multiThreadLoad_test examples/multiThreadLoad_test.cpp) + add_executable(multiThreadLoad_test tests/cpp/multiThreadLoad_test.cpp) target_link_libraries(multiThreadLoad_test hnswlib) - add_executable(multiThread_replace_test examples/multiThread_replace_test.cpp) + add_executable(multiThread_replace_test tests/cpp/multiThread_replace_test.cpp) target_link_libraries(multiThread_replace_test hnswlib) - add_executable(main main.cpp sift_1b.cpp) + add_executable(main tests/cpp/main.cpp tests/cpp/sift_1b.cpp) target_link_libraries(main hnswlib) endif() diff --git a/README.md b/README.md index c0b0dbcc..80128105 100644 --- a/README.md +++ b/README.md @@ -123,6 +123,7 @@ Properties of `hnswlib.Index` that support reading and writing: #### Python bindings examples +[See more examples here](examples/EXAMPLES.md) ```python import hnswlib import numpy as 
np @@ -229,104 +230,6 @@ labels, distances = p.knn_query(data, k=1) print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(data))), "\n") ``` -An example with a filter: -```python -import hnswlib -import numpy as np - -dim = 16 -num_elements = 10000 - -# Generating sample data -data = np.float32(np.random.random((num_elements, dim))) - -# Declaring index -hnsw_index = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip - -# Initiating index -# max_elements - the maximum number of elements, should be known beforehand -# (probably will be made optional in the future) -# -# ef_construction - controls index search speed/build speed tradeoff -# M - is tightly connected with internal dimensionality of the data -# strongly affects the memory consumption - -hnsw_index.init_index(max_elements=num_elements, ef_construction=100, M=16) - -# Controlling the recall by setting ef: -# higher ef leads to better accuracy, but slower search -hnsw_index.set_ef(10) - -# Set number of threads used during batch search/construction -# By default using all available cores -hnsw_index.set_num_threads(4) - -print("Adding %d elements" % (len(data))) -# Added elements will have consecutive ids -hnsw_index.add_items(data, ids=np.arange(num_elements)) - -print("Querying only even elements") -# Define filter function that allows only even ids -filter_function = lambda idx: idx%2 == 0 -# Query the elements for themselves and search only for even elements: -labels, distances = hnsw_index.knn_query(data, k=1, filter=filter_function) -# labels contain only elements with even id -``` - -An example with replacing of deleted elements: -```python -import hnswlib -import numpy as np - -dim = 16 -num_elements = 1_000 -max_num_elements = 2 * num_elements - -# Generating sample data -labels1 = np.arange(0, num_elements) -data1 = np.float32(np.random.random((num_elements, dim))) # batch 1 -labels2 = np.arange(num_elements, 2 * num_elements) -data2 = 
np.float32(np.random.random((num_elements, dim))) # batch 2 -labels3 = np.arange(2 * num_elements, 3 * num_elements) -data3 = np.float32(np.random.random((num_elements, dim))) # batch 3 - -# Declaring index -hnsw_index = hnswlib.Index(space='l2', dim=dim) - -# Initiating index -# max_elements - the maximum number of elements, should be known beforehand -# (probably will be made optional in the future) -# -# ef_construction - controls index search speed/build speed tradeoff -# M - is tightly connected with internal dimensionality of the data -# strongly affects the memory consumption - -# Enable replacing of deleted elements -hnsw_index.init_index(max_elements=max_num_elements, ef_construction=200, M=16, allow_replace_deleted=True) - -# Controlling the recall by setting ef: -# higher ef leads to better accuracy, but slower search -hnsw_index.set_ef(10) - -# Set number of threads used during batch search/construction -# By default using all available cores -hnsw_index.set_num_threads(4) - -# Add batch 1 and 2 data -hnsw_index.add_items(data1, labels1) -hnsw_index.add_items(data2, labels2) # Note: maximum number of elements is reached - -# Delete data of batch 2 -for label in labels2: - hnsw_index.mark_deleted(label) - -# Replace deleted elements -# Maximum number of elements is reached therefore we cannot add new items, -# but we can replace the deleted ones by using replace_deleted=True -hnsw_index.add_items(data3, labels3, replace_deleted=True) -# hnsw_index contains the data of batch 1 and batch 3 only -``` - ### Bindings installation You can install from sources: @@ -346,9 +249,9 @@ Contributions are highly welcome! Please make pull requests against the `develop` branch. 
-When making changes please run tests (and please add a test to `python_bindings/tests` in case there is new functionality): +When making changes please run tests (and please add a test to `tests/python` in case there is new functionality): ```bash -python -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py" +python -m unittest discover --start-directory tests/python --pattern "bindings_test*.py" ``` @@ -373,7 +276,7 @@ https://github.com/dbaranchuk/ivf-hnsw ### 200M SIFT test reproduction To download and extract the bigann dataset (from root directory): ```bash -python3 download_bigann.py +python tests/cpp/download_bigann.py ``` To compile: ```bash @@ -393,7 +296,7 @@ The size of the BigANN subset (in millions) is controlled by the variable **subs ### Updates test To generate testing data (from root directory): ```bash -cd examples +cd tests/cpp python update_gen_data.py ``` To compile (from root directory): diff --git a/examples/EXAMPLES.md b/examples/EXAMPLES.md new file mode 100644 index 00000000..7dd90dc0 --- /dev/null +++ b/examples/EXAMPLES.md @@ -0,0 +1,205 @@ +# Python bindings examples + +```python +import hnswlib +import numpy as np +import pickle + +dim = 128 +num_elements = 10000 + +# Generating sample data +data = np.float32(np.random.random((num_elements, dim))) +ids = np.arange(num_elements) + +# Declaring index +p = hnswlib.Index(space = 'l2', dim = dim) # possible options are l2, cosine or ip + +# Initializing index - the maximum number of elements should be known beforehand +p.init_index(max_elements = num_elements, ef_construction = 200, M = 16) + +# Element insertion (can be called several times): +p.add_items(data, ids) + +# Controlling the recall by setting ef: +p.set_ef(50) # ef should always be > k + +# Query dataset, k - number of the closest elements (returns 2 numpy arrays) +labels, distances = p.knn_query(data, k = 1) + +# Index objects support pickling +# WARNING: serialization via pickle.dumps(p) or 
p.__getstate__() is NOT thread-safe with p.add_items method! +# Note: ef parameter is included in serialization; random number generator is initialized with random_seed on Index load +p_copy = pickle.loads(pickle.dumps(p)) # creates a copy of index p using pickle round-trip + +### Index parameters are exposed as class properties: +print(f"Parameters passed to constructor: space={p_copy.space}, dim={p_copy.dim}") +print(f"Index construction: M={p_copy.M}, ef_construction={p_copy.ef_construction}") +print(f"Index size is {p_copy.element_count} and index capacity is {p_copy.max_elements}") +print(f"Search speed/quality trade-off parameter: ef={p_copy.ef}") +``` + +An example with updates after serialization/deserialization: +```python +import hnswlib +import numpy as np + +dim = 16 +num_elements = 10000 + +# Generating sample data +data = np.float32(np.random.random((num_elements, dim))) + +# We split the data in two batches: +data1 = data[:num_elements // 2] +data2 = data[num_elements // 2:] + +# Declaring index +p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip + +# Initializing index +# max_elements - the maximum number of elements (capacity). Will throw an exception if exceeded +# during insertion of an element. +# The capacity can be increased by saving/loading the index, see below. +# +# ef_construction - controls index search speed/build speed tradeoff +# +# M - is tightly connected with internal dimensionality of the data. 
Strongly affects memory consumption (~M) +# Higher M leads to higher accuracy/run_time at fixed ef/efConstruction + +p.init_index(max_elements=num_elements//2, ef_construction=100, M=16) + +# Controlling the recall by setting ef: +# higher ef leads to better accuracy, but slower search +p.set_ef(10) + +# Set number of threads used during batch search/construction +# By default using all available cores +p.set_num_threads(4) + +print("Adding first batch of %d elements" % (len(data1))) +p.add_items(data1) + +# Query the elements for themselves and measure recall: +labels, distances = p.knn_query(data1, k=1) +print("Recall for the first batch:", np.mean(labels.reshape(-1) == np.arange(len(data1))), "\n") + +# Serializing and deleting the index: +index_path='first_half.bin' +print("Saving index to '%s'" % index_path) +p.save_index("first_half.bin") +del p + +# Re-initializing, loading the index +p = hnswlib.Index(space='l2', dim=dim) # the space can be changed - keeps the data, alters the distance function. 
+ +print("\nLoading index from 'first_half.bin'\n") + +# Increase the total capacity (max_elements), so that it will handle the new data +p.load_index("first_half.bin", max_elements = num_elements) + +print("Adding the second batch of %d elements" % (len(data2))) +p.add_items(data2) + +# Query the elements for themselves and measure recall: +labels, distances = p.knn_query(data, k=1) +print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(data))), "\n") +``` + +An example with a filter: +```python +import hnswlib +import numpy as np + +dim = 16 +num_elements = 10000 + +# Generating sample data +data = np.float32(np.random.random((num_elements, dim))) + +# Declaring index +hnsw_index = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip + +# Initiating index +# max_elements - the maximum number of elements, should be known beforehand +# (probably will be made optional in the future) +# +# ef_construction - controls index search speed/build speed tradeoff +# M - is tightly connected with internal dimensionality of the data +# strongly affects the memory consumption + +hnsw_index.init_index(max_elements=num_elements, ef_construction=100, M=16) + +# Controlling the recall by setting ef: +# higher ef leads to better accuracy, but slower search +hnsw_index.set_ef(10) + +# Set number of threads used during batch search/construction +# By default using all available cores +hnsw_index.set_num_threads(4) + +print("Adding %d elements" % (len(data))) +# Added elements will have consecutive ids +hnsw_index.add_items(data, ids=np.arange(num_elements)) + +print("Querying only even elements") +# Define filter function that allows only even ids +filter_function = lambda idx: idx%2 == 0 +# Query the elements for themselves and search only for even elements: +labels, distances = hnsw_index.knn_query(data, k=1, filter=filter_function) +# labels contain only elements with even id +``` + +An example with replacing of deleted elements: 
+```python +import hnswlib +import numpy as np + +dim = 16 +num_elements = 1_000 +max_num_elements = 2 * num_elements + +# Generating sample data +labels1 = np.arange(0, num_elements) +data1 = np.float32(np.random.random((num_elements, dim))) # batch 1 +labels2 = np.arange(num_elements, 2 * num_elements) +data2 = np.float32(np.random.random((num_elements, dim))) # batch 2 +labels3 = np.arange(2 * num_elements, 3 * num_elements) +data3 = np.float32(np.random.random((num_elements, dim))) # batch 3 + +# Declaring index +hnsw_index = hnswlib.Index(space='l2', dim=dim) + +# Initiating index +# max_elements - the maximum number of elements, should be known beforehand +# (probably will be made optional in the future) +# +# ef_construction - controls index search speed/build speed tradeoff +# M - is tightly connected with internal dimensionality of the data +# strongly affects the memory consumption + +# Enable replacing of deleted elements +hnsw_index.init_index(max_elements=max_num_elements, ef_construction=200, M=16, allow_replace_deleted=True) + +# Controlling the recall by setting ef: +# higher ef leads to better accuracy, but slower search +hnsw_index.set_ef(10) + +# Set number of threads used during batch search/construction +# By default using all available cores +hnsw_index.set_num_threads(4) + +# Add batch 1 and 2 data +hnsw_index.add_items(data1, labels1) +hnsw_index.add_items(data2, labels2) # Note: maximum number of elements is reached + +# Delete data of batch 2 +for label in labels2: + hnsw_index.mark_deleted(label) + +# Replace deleted elements +# Maximum number of elements is reached therefore we cannot add new items, +# but we can replace the deleted ones by using replace_deleted=True +hnsw_index.add_items(data3, labels3, replace_deleted=True) +# hnsw_index contains the data of batch 1 and batch 3 only +``` \ No newline at end of file diff --git a/examples/example.py b/examples/example.py index a08955a1..3d6d7477 100644 --- a/examples/example.py +++ 
b/examples/example.py @@ -1,6 +1,12 @@ +import os import hnswlib import numpy as np + +""" +Example of index building, search and serialization/deserialization +""" + dim = 16 num_elements = 10000 @@ -34,7 +40,6 @@ # By default using all available cores p.set_num_threads(4) - print("Adding first batch of %d elements" % (len(data1))) p.add_items(data1) @@ -62,3 +67,5 @@ # Query the elements for themselves and measure recall: labels, distances = p.knn_query(data, k=1) print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(data))), "\n") + +os.remove("first_half.bin") diff --git a/examples/example_filter.py b/examples/example_filter.py new file mode 100644 index 00000000..10a059a8 --- /dev/null +++ b/examples/example_filter.py @@ -0,0 +1,45 @@ +import hnswlib +import numpy as np + + +""" +Example of filtering elements when searching +""" + +dim = 16 +num_elements = 10000 + +# Generating sample data +data = np.float32(np.random.random((num_elements, dim))) + +# Declaring index +hnsw_index = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip + +# Initiating index +# max_elements - the maximum number of elements, should be known beforehand +# (probably will be made optional in the future) +# +# ef_construction - controls index search speed/build speed tradeoff +# M - is tightly connected with internal dimensionality of the data +# strongly affects the memory consumption + +hnsw_index.init_index(max_elements=num_elements, ef_construction=100, M=16) + +# Controlling the recall by setting ef: +# higher ef leads to better accuracy, but slower search +hnsw_index.set_ef(10) + +# Set number of threads used during batch search/construction +# By default using all available cores +hnsw_index.set_num_threads(4) + +print("Adding %d elements" % (len(data))) +# Added elements will have consecutive ids +hnsw_index.add_items(data, ids=np.arange(num_elements)) + +print("Querying only even elements") +# Define filter function that allows only 
even ids +filter_function = lambda idx: idx%2 == 0 +# Query the elements for themselves and search only for even elements: +labels, distances = hnsw_index.knn_query(data, k=1, filter=filter_function) +# labels contain only elements with even id diff --git a/examples/example_replace_deleted.py b/examples/example_replace_deleted.py new file mode 100644 index 00000000..3c0b62e7 --- /dev/null +++ b/examples/example_replace_deleted.py @@ -0,0 +1,55 @@ +import hnswlib +import numpy as np + + +""" +Example of replacing deleted elements with new ones +""" + +dim = 16 +num_elements = 1_000 +max_num_elements = 2 * num_elements + +# Generating sample data +labels1 = np.arange(0, num_elements) +data1 = np.float32(np.random.random((num_elements, dim))) # batch 1 +labels2 = np.arange(num_elements, 2 * num_elements) +data2 = np.float32(np.random.random((num_elements, dim))) # batch 2 +labels3 = np.arange(2 * num_elements, 3 * num_elements) +data3 = np.float32(np.random.random((num_elements, dim))) # batch 3 + +# Declaring index +hnsw_index = hnswlib.Index(space='l2', dim=dim) + +# Initiating index +# max_elements - the maximum number of elements, should be known beforehand +# (probably will be made optional in the future) +# +# ef_construction - controls index search speed/build speed tradeoff +# M - is tightly connected with internal dimensionality of the data +# strongly affects the memory consumption + +# Enable replacing of deleted elements +hnsw_index.init_index(max_elements=max_num_elements, ef_construction=200, M=16, allow_replace_deleted=True) + +# Controlling the recall by setting ef: +# higher ef leads to better accuracy, but slower search +hnsw_index.set_ef(10) + +# Set number of threads used during batch search/construction +# By default using all available cores +hnsw_index.set_num_threads(4) + +# Add batch 1 and 2 data +hnsw_index.add_items(data1, labels1) +hnsw_index.add_items(data2, labels2) # Note: maximum number of elements is reached + +# Delete data of batch 2 
+for label in labels2: + hnsw_index.mark_deleted(label) + +# Replace deleted elements +# Maximum number of elements is reached therefore we cannot add new items, +# but we can replace the deleted ones by using replace_deleted=True +hnsw_index.add_items(data3, labels3, replace_deleted=True) +# hnsw_index contains the data of batch 1 and batch 3 only diff --git a/examples/example_search.py b/examples/example_search.py new file mode 100644 index 00000000..4581843b --- /dev/null +++ b/examples/example_search.py @@ -0,0 +1,41 @@ +import hnswlib +import numpy as np +import pickle + + +""" +Example of search +""" + +dim = 128 +num_elements = 10000 + +# Generating sample data +data = np.float32(np.random.random((num_elements, dim))) +ids = np.arange(num_elements) + +# Declaring index +p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip + +# Initializing index - the maximum number of elements should be known beforehand +p.init_index(max_elements=num_elements, ef_construction=200, M=16) + +# Element insertion (can be called several times): +p.add_items(data, ids) + +# Controlling the recall by setting ef: +p.set_ef(50) # ef should always be > k + +# Query dataset, k - number of the closest elements (returns 2 numpy arrays) +labels, distances = p.knn_query(data, k=1) + +# Index objects support pickling +# WARNING: serialization via pickle.dumps(p) or p.__getstate__() is NOT thread-safe with p.add_items method! 
+# Note: ef parameter is included in serialization; random number generator is initialized with random_seed on Index load +p_copy = pickle.loads(pickle.dumps(p)) # creates a copy of index p using pickle round-trip + +### Index parameters are exposed as class properties: +print(f"Parameters passed to constructor: space={p_copy.space}, dim={p_copy.dim}") +print(f"Index construction: M={p_copy.M}, ef_construction={p_copy.ef_construction}") +print(f"Index size is {p_copy.element_count} and index capacity is {p_copy.max_elements}") +print(f"Search speed/quality trade-off parameter: ef={p_copy.ef}") diff --git a/examples/example_old.py b/examples/example_serialization.py similarity index 59% rename from examples/example_old.py rename to examples/example_serialization.py index 6654a027..76ca1436 100644 --- a/examples/example_old.py +++ b/examples/example_serialization.py @@ -1,34 +1,45 @@ +import os + import hnswlib import numpy as np + +""" +Example of serialization/deserialization +""" + dim = 16 num_elements = 10000 # Generating sample data data = np.float32(np.random.random((num_elements, dim))) +# We split the data in two batches: +data1 = data[:num_elements // 2] +data2 = data[num_elements // 2:] + # Declaring index p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip -# Initing index -# max_elements - the maximum number of elements, should be known beforehand -# (probably will be made optional in the future) +# Initializing index +# max_elements - the maximum number of elements (capacity). Will throw an exception if exceeded +# during insertion of an element. +# The capacity can be increased by saving/loading the index, see below. # # ef_construction - controls index search speed/build speed tradeoff -# M - is tightly connected with internal dimensionality of the data -# stronlgy affects the memory consumption +# +# M - is tightly connected with internal dimensionality of the data. 
Strongly affects memory consumption (~M) +# Higher M leads to higher accuracy/run_time at fixed ef/efConstruction -p.init_index(max_elements=num_elements, ef_construction=100, M=16) +p.init_index(max_elements=num_elements//2, ef_construction=100, M=16) # Controlling the recall by setting ef: # higher ef leads to better accuracy, but slower search p.set_ef(10) -p.set_num_threads(4) # by default using all available cores - -# We split the data in two batches: -data1 = data[:num_elements // 2] -data2 = data[num_elements // 2:] +# Set number of threads used during batch search/construction +# By default using all available cores +p.set_num_threads(4) print("Adding first batch of %d elements" % (len(data1))) p.add_items(data1) @@ -43,11 +54,13 @@ p.save_index("first_half.bin") del p -# Reiniting, loading the index -p = hnswlib.Index(space='l2', dim=dim) # you can change the sa +# Re-initializing, loading the index +p = hnswlib.Index(space='l2', dim=dim) # the space can be changed - keeps the data, alters the distance function. 
print("\nLoading index from 'first_half.bin'\n") -p.load_index("first_half.bin") + +# Increase the total capacity (max_elements), so that it will handle the new data +p.load_index("first_half.bin", max_elements = num_elements) print("Adding the second batch of %d elements" % (len(data2))) p.add_items(data2) @@ -55,3 +68,5 @@ # Query the elements for themselves and measure recall: labels, distances = p.knn_query(data, k=1) print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(data))), "\n") + +os.remove("first_half.bin") diff --git a/examples/pyw_hnswlib.py b/examples/pyw_hnswlib.py index aeb93f10..0ccfbc5e 100644 --- a/examples/pyw_hnswlib.py +++ b/examples/pyw_hnswlib.py @@ -4,6 +4,10 @@ import pickle +""" +Example of python wrapper for hnswlib that supports python objects as ids +""" + class Index(): def __init__(self, space, dim): self.index = hnswlib.Index(space, dim) diff --git a/python_bindings/tests/__init__.py b/python_bindings/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/download_bigann.py b/tests/cpp/download_bigann.py similarity index 100% rename from download_bigann.py rename to tests/cpp/download_bigann.py diff --git a/main.cpp b/tests/cpp/main.cpp similarity index 100% rename from main.cpp rename to tests/cpp/main.cpp diff --git a/examples/multiThreadLoad_test.cpp b/tests/cpp/multiThreadLoad_test.cpp similarity index 99% rename from examples/multiThreadLoad_test.cpp rename to tests/cpp/multiThreadLoad_test.cpp index a713b2ba..4d2b4aa2 100644 --- a/examples/multiThreadLoad_test.cpp +++ b/tests/cpp/multiThreadLoad_test.cpp @@ -1,4 +1,4 @@ -#include "../hnswlib/hnswlib.h" +#include "../../hnswlib/hnswlib.h" #include #include diff --git a/examples/multiThread_replace_test.cpp b/tests/cpp/multiThread_replace_test.cpp similarity index 99% rename from examples/multiThread_replace_test.cpp rename to tests/cpp/multiThread_replace_test.cpp index 83ed2826..203cdb0d 100644 --- 
a/examples/multiThread_replace_test.cpp +++ b/tests/cpp/multiThread_replace_test.cpp @@ -1,4 +1,4 @@ -#include "../hnswlib/hnswlib.h" +#include "../../hnswlib/hnswlib.h" #include #include diff --git a/examples/searchKnnCloserFirst_test.cpp b/tests/cpp/searchKnnCloserFirst_test.cpp similarity index 98% rename from examples/searchKnnCloserFirst_test.cpp rename to tests/cpp/searchKnnCloserFirst_test.cpp index d87102cd..9583fe22 100644 --- a/examples/searchKnnCloserFirst_test.cpp +++ b/tests/cpp/searchKnnCloserFirst_test.cpp @@ -3,7 +3,7 @@ // >>> searchKnnCloserFirst(const void* query_data, size_t k) const; // of class AlgorithmInterface -#include "../hnswlib/hnswlib.h" +#include "../../hnswlib/hnswlib.h" #include diff --git a/examples/searchKnnWithFilter_test.cpp b/tests/cpp/searchKnnWithFilter_test.cpp similarity index 99% rename from examples/searchKnnWithFilter_test.cpp rename to tests/cpp/searchKnnWithFilter_test.cpp index 6102323c..0557b7e4 100644 --- a/examples/searchKnnWithFilter_test.cpp +++ b/tests/cpp/searchKnnWithFilter_test.cpp @@ -1,6 +1,6 @@ // This is a test file for testing the filtering feature -#include "../hnswlib/hnswlib.h" +#include "../../hnswlib/hnswlib.h" #include diff --git a/sift_1b.cpp b/tests/cpp/sift_1b.cpp similarity index 99% rename from sift_1b.cpp rename to tests/cpp/sift_1b.cpp index 96d83267..43777ff6 100644 --- a/sift_1b.cpp +++ b/tests/cpp/sift_1b.cpp @@ -2,7 +2,7 @@ #include #include #include -#include "hnswlib/hnswlib.h" +#include "../../hnswlib/hnswlib.h" #include diff --git a/sift_test.cpp b/tests/cpp/sift_test.cpp similarity index 99% rename from sift_test.cpp rename to tests/cpp/sift_test.cpp index 751580cb..decdf605 100644 --- a/sift_test.cpp +++ b/tests/cpp/sift_test.cpp @@ -2,7 +2,7 @@ #include #include #include -#include "hnswlib/hnswlib.h" +#include "../../hnswlib/hnswlib.h" #include diff --git a/examples/update_gen_data.py b/tests/cpp/update_gen_data.py similarity index 100% rename from examples/update_gen_data.py 
rename to tests/cpp/update_gen_data.py diff --git a/tests/cpp/updates_test.cpp b/tests/cpp/updates_test.cpp new file mode 100644 index 00000000..52e1fa14 --- /dev/null +++ b/tests/cpp/updates_test.cpp @@ -0,0 +1,278 @@ +#include "../../hnswlib/hnswlib.h" +#include + + +class StopW { + std::chrono::steady_clock::time_point time_begin; + + public: + StopW() { + time_begin = std::chrono::steady_clock::now(); + } + + float getElapsedTimeMicro() { + std::chrono::steady_clock::time_point time_end = std::chrono::steady_clock::now(); + return (std::chrono::duration_cast(time_end - time_begin).count()); + } + + void reset() { + time_begin = std::chrono::steady_clock::now(); + } +}; + + +/* + * replacement for the openmp '#pragma omp parallel for' directive + * only handles a subset of functionality (no reductions etc) + * Process ids from start (inclusive) to end (EXCLUSIVE) + * + * The method is borrowed from nmslib + */ +template +inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn) { + if (numThreads <= 0) { + numThreads = std::thread::hardware_concurrency(); + } + + if (numThreads == 1) { + for (size_t id = start; id < end; id++) { + fn(id, 0); + } + } else { + std::vector threads; + std::atomic current(start); + + // keep track of exceptions in threads + // https://stackoverflow.com/a/32428427/1713196 + std::exception_ptr lastException = nullptr; + std::mutex lastExceptMutex; + + for (size_t threadId = 0; threadId < numThreads; ++threadId) { + threads.push_back(std::thread([&, threadId] { + while (true) { + size_t id = current.fetch_add(1); + + if ((id >= end)) { + break; + } + + try { + fn(id, threadId); + } catch (...) 
{ + std::unique_lock lastExcepLock(lastExceptMutex); + lastException = std::current_exception(); + /* + * This will work even when current is the largest value that + * size_t can fit, because fetch_add returns the previous value + * before the increment (what will result in overflow + * and produce 0 instead of current + 1). + */ + current = end; + break; + } + } + })); + } + for (auto &thread : threads) { + thread.join(); + } + if (lastException) { + std::rethrow_exception(lastException); + } + } +} + + +template +std::vector load_batch(std::string path, int size) { + std::cout << "Loading " << path << "..."; + // float or int32 (python) + assert(sizeof(datatype) == 4); + + std::ifstream file; + file.open(path, std::ios::binary); + if (!file.is_open()) { + std::cout << "Cannot open " << path << "\n"; + exit(1); + } + std::vector batch(size); + + file.read((char *)batch.data(), size * sizeof(float)); + std::cout << " DONE\n"; + return batch; +} + + +template +static float +test_approx(std::vector &queries, size_t qsize, hnswlib::HierarchicalNSW &appr_alg, size_t vecdim, + std::vector> &answers, size_t K) { + size_t correct = 0; + size_t total = 0; + + for (int i = 0; i < qsize; i++) { + std::priority_queue> result = appr_alg.searchKnn((char *)(queries.data() + vecdim * i), K); + total += K; + while (result.size()) { + if (answers[i].find(result.top().second) != answers[i].end()) { + correct++; + } else { + } + result.pop(); + } + } + return 1.0f * correct / total; +} + + +static void +test_vs_recall( + std::vector &queries, + size_t qsize, + hnswlib::HierarchicalNSW &appr_alg, + size_t vecdim, + std::vector> &answers, + size_t k) { + + std::vector efs = {1}; + for (int i = k; i < 30; i++) { + efs.push_back(i); + } + for (int i = 30; i < 400; i+=10) { + efs.push_back(i); + } + for (int i = 1000; i < 100000; i += 5000) { + efs.push_back(i); + } + std::cout << "ef\trecall\ttime\thops\tdistcomp\n"; + + bool test_passed = false; + for (size_t ef : efs) { + 
appr_alg.setEf(ef); + + appr_alg.metric_hops = 0; + appr_alg.metric_distance_computations = 0; + StopW stopw = StopW(); + + float recall = test_approx(queries, qsize, appr_alg, vecdim, answers, k); + float time_us_per_query = stopw.getElapsedTimeMicro() / qsize; + float distance_comp_per_query = appr_alg.metric_distance_computations / (1.0f * qsize); + float hops_per_query = appr_alg.metric_hops / (1.0f * qsize); + + std::cout << ef << "\t" << recall << "\t" << time_us_per_query << "us \t" << hops_per_query << "\t" << distance_comp_per_query << "\n"; + if (recall > 0.99) { + test_passed = true; + std::cout << "Recall is over 0.99! " << recall << "\t" << time_us_per_query << "us \t" << hops_per_query << "\t" << distance_comp_per_query << "\n"; + break; + } + } + if (!test_passed) { + std::cerr << "Test failed\n"; + exit(1); + } +} + + +int main(int argc, char **argv) { + int M = 16; + int efConstruction = 200; + int num_threads = std::thread::hardware_concurrency(); + + bool update = false; + + if (argc == 2) { + if (std::string(argv[1]) == "update") { + update = true; + std::cout << "Updates are on\n"; + } else { + std::cout << "Usage ./test_updates [update]\n"; + exit(1); + } + } else if (argc > 2) { + std::cout << "Usage ./test_updates [update]\n"; + exit(1); + } + + std::string path = "../tests/cpp/data/"; + + int N; + int dummy_data_multiplier; + int N_queries; + int d; + int K; + { + std::ifstream configfile; + configfile.open(path + "/config.txt"); + if (!configfile.is_open()) { + std::cout << "Cannot open config.txt\n"; + return 1; + } + configfile >> N >> dummy_data_multiplier >> N_queries >> d >> K; + + printf("Loaded config: N=%d, d_mult=%d, Nq=%d, dim=%d, K=%d\n", N, dummy_data_multiplier, N_queries, d, K); + } + + hnswlib::L2Space l2space(d); + hnswlib::HierarchicalNSW appr_alg(&l2space, N + 1, M, efConstruction); + + std::vector dummy_batch = load_batch(path + "batch_dummy_00.bin", N * d); + + // Adding enterpoint: + + appr_alg.addPoint((void 
*)dummy_batch.data(), (size_t)0); + + StopW stopw = StopW(); + + if (update) { + std::cout << "Update iteration 0\n"; + + ParallelFor(1, N, num_threads, [&](size_t i, size_t threadId) { + appr_alg.addPoint((void *)(dummy_batch.data() + i * d), i); + }); + appr_alg.checkIntegrity(); + + ParallelFor(1, N, num_threads, [&](size_t i, size_t threadId) { + appr_alg.addPoint((void *)(dummy_batch.data() + i * d), i); + }); + appr_alg.checkIntegrity(); + + for (int b = 1; b < dummy_data_multiplier; b++) { + std::cout << "Update iteration " << b << "\n"; + char cpath[1024]; + sprintf(cpath, "batch_dummy_%02d.bin", b); + std::vector dummy_batchb = load_batch(path + cpath, N * d); + + ParallelFor(0, N, num_threads, [&](size_t i, size_t threadId) { + appr_alg.addPoint((void *)(dummy_batchb.data() + i * d), i); + }); + appr_alg.checkIntegrity(); + } + } + + std::cout << "Inserting final elements\n"; + std::vector final_batch = load_batch(path + "batch_final.bin", N * d); + + stopw.reset(); + ParallelFor(0, N, num_threads, [&](size_t i, size_t threadId) { + appr_alg.addPoint((void *)(final_batch.data() + i * d), i); + }); + std::cout << "Finished. 
Time taken:" << stopw.getElapsedTimeMicro()*1e-6 << " s\n"; + std::cout << "Running tests\n"; + std::vector queries_batch = load_batch(path + "queries.bin", N_queries * d); + + std::vector gt = load_batch(path + "gt.bin", N_queries * K); + + std::vector> answers(N_queries); + for (int i = 0; i < N_queries; i++) { + for (int j = 0; j < K; j++) { + answers[i].insert(gt[i * K + j]); + } + } + + for (int i = 0; i < 3; i++) { + std::cout << "Test iteration " << i << "\n"; + test_vs_recall(queries_batch, N_queries, appr_alg, d, answers, K); + } + + return 0; +} diff --git a/python_bindings/tests/bindings_test.py b/tests/python/bindings_test.py similarity index 100% rename from python_bindings/tests/bindings_test.py rename to tests/python/bindings_test.py diff --git a/python_bindings/tests/bindings_test_filter.py b/tests/python/bindings_test_filter.py similarity index 100% rename from python_bindings/tests/bindings_test_filter.py rename to tests/python/bindings_test_filter.py diff --git a/python_bindings/tests/bindings_test_getdata.py b/tests/python/bindings_test_getdata.py similarity index 100% rename from python_bindings/tests/bindings_test_getdata.py rename to tests/python/bindings_test_getdata.py diff --git a/python_bindings/tests/bindings_test_labels.py b/tests/python/bindings_test_labels.py similarity index 100% rename from python_bindings/tests/bindings_test_labels.py rename to tests/python/bindings_test_labels.py diff --git a/python_bindings/tests/bindings_test_metadata.py b/tests/python/bindings_test_metadata.py similarity index 100% rename from python_bindings/tests/bindings_test_metadata.py rename to tests/python/bindings_test_metadata.py diff --git a/python_bindings/tests/bindings_test_pickle.py b/tests/python/bindings_test_pickle.py similarity index 100% rename from python_bindings/tests/bindings_test_pickle.py rename to tests/python/bindings_test_pickle.py diff --git a/python_bindings/tests/bindings_test_recall.py b/tests/python/bindings_test_recall.py 
similarity index 100% rename from python_bindings/tests/bindings_test_recall.py rename to tests/python/bindings_test_recall.py diff --git a/python_bindings/tests/bindings_test_replace.py b/tests/python/bindings_test_replace.py similarity index 100% rename from python_bindings/tests/bindings_test_replace.py rename to tests/python/bindings_test_replace.py diff --git a/python_bindings/tests/bindings_test_resize.py b/tests/python/bindings_test_resize.py similarity index 100% rename from python_bindings/tests/bindings_test_resize.py rename to tests/python/bindings_test_resize.py diff --git a/python_bindings/tests/bindings_test_spaces.py b/tests/python/bindings_test_spaces.py similarity index 100% rename from python_bindings/tests/bindings_test_spaces.py rename to tests/python/bindings_test_spaces.py diff --git a/python_bindings/tests/bindings_test_stress_mt_replace.py b/tests/python/bindings_test_stress_mt_replace.py similarity index 100% rename from python_bindings/tests/bindings_test_stress_mt_replace.py rename to tests/python/bindings_test_stress_mt_replace.py diff --git a/examples/git_tester.py b/tests/python/git_tester.py similarity index 100% rename from examples/git_tester.py rename to tests/python/git_tester.py diff --git a/examples/speedtest.py b/tests/python/speedtest.py similarity index 100% rename from examples/speedtest.py rename to tests/python/speedtest.py diff --git a/examples/updates_test.cpp b/tests/updates_test.cpp similarity index 100% rename from examples/updates_test.cpp rename to tests/updates_test.cpp From 2fdf1c158939ab4e087d68dfc64ff3ea13c08e3c Mon Sep 17 00:00:00 2001 From: Dmitry Yashunin Date: Sat, 14 Jan 2023 16:43:20 +0400 Subject: [PATCH 2/5] Update Makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b5e8fda9..20acac6e 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ dist: python3 -m build --sdist test: - python3 -m unittest discover --start-directory python_bindings/tests 
--pattern "*_test*.py" + python -m unittest discover --start-directory tests/python --pattern "bindings_test*.py" clean: rm -rf *.egg-info build dist tmp var tests/__pycache__ hnswlib.cpython*.so From 864f39ffaf73164c81faa451c5055525ddec8635 Mon Sep 17 00:00:00 2001 From: Dmitry Yashunin Date: Sat, 14 Jan 2023 17:01:29 +0400 Subject: [PATCH 3/5] Update git tester --- tests/python/git_tester.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/python/git_tester.py b/tests/python/git_tester.py index be3b8a25..5a97f3dd 100644 --- a/tests/python/git_tester.py +++ b/tests/python/git_tester.py @@ -5,8 +5,8 @@ from pydriller import Repository -speedtest_src_path = os.path.join("examples", "speedtest.py") -speedtest_copy_path = os.path.join("examples", "speedtest2.py") +speedtest_src_path = os.path.join("tests", "python", "speedtest.py") +speedtest_copy_path = os.path.join("tests", "python", "speedtest2.py") shutil.copyfile(speedtest_src_path, speedtest_copy_path) # the file has to be outside of git commits = list(Repository('.', from_tag="v0.6.0").traverse_commits()) From 556628d289ae2b80c269f85271981cec9cda8fca Mon Sep 17 00:00:00 2001 From: Dmitry Yashunin Date: Sun, 15 Jan 2023 09:37:36 +0400 Subject: [PATCH 4/5] Remove redundant updates_test.cpp, apply suggested changes to example file --- README.md | 2 +- examples/EXAMPLES.md | 5 +- tests/updates_test.cpp | 278 ----------------------------------------- 3 files changed, 4 insertions(+), 281 deletions(-) delete mode 100644 tests/updates_test.cpp diff --git a/README.md b/README.md index 80128105..04d84d66 100644 --- a/README.md +++ b/README.md @@ -251,7 +251,7 @@ Please make pull requests against the `develop` branch. 
When making changes please run tests (and please add a test to `tests/python` in case there is new functionality): ```bash -python -m unittest discover --start-directory tests/python --pattern "bindings_test*.py +python -m unittest discover --start-directory tests/python --pattern "bindings_test*.py" ``` diff --git a/examples/EXAMPLES.md b/examples/EXAMPLES.md index 7dd90dc0..71f69ff4 100644 --- a/examples/EXAMPLES.md +++ b/examples/EXAMPLES.md @@ -1,5 +1,6 @@ # Python bindings examples +Creating index, inserting elements, searching and pickle serialization ```python import hnswlib import numpy as np @@ -106,7 +107,7 @@ labels, distances = p.knn_query(data, k=1) print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(data))), "\n") ``` -An example with a filter: +An example with a symbolic filter `filter_function` during the search: ```python import hnswlib import numpy as np @@ -150,7 +151,7 @@ labels, distances = hnsw_index.knn_query(data, k=1, filter=filter_function) # labels contain only elements with even id ``` -An example with replacing of deleted elements: +An example with reusing the memory of the deleted elements when new elements are being added (via `allow_replace_deleted` flag): ```python import hnswlib import numpy as np diff --git a/tests/updates_test.cpp b/tests/updates_test.cpp deleted file mode 100644 index 8e4ac644..00000000 --- a/tests/updates_test.cpp +++ /dev/null @@ -1,278 +0,0 @@ -#include "../hnswlib/hnswlib.h" -#include - - -class StopW { - std::chrono::steady_clock::time_point time_begin; - - public: - StopW() { - time_begin = std::chrono::steady_clock::now(); - } - - float getElapsedTimeMicro() { - std::chrono::steady_clock::time_point time_end = std::chrono::steady_clock::now(); - return (std::chrono::duration_cast(time_end - time_begin).count()); - } - - void reset() { - time_begin = std::chrono::steady_clock::now(); - } -}; - - -/* - * replacement for the openmp '#pragma omp parallel for' directive - * only 
handles a subset of functionality (no reductions etc) - * Process ids from start (inclusive) to end (EXCLUSIVE) - * - * The method is borrowed from nmslib - */ -template -inline void ParallelFor(size_t start, size_t end, size_t numThreads, Function fn) { - if (numThreads <= 0) { - numThreads = std::thread::hardware_concurrency(); - } - - if (numThreads == 1) { - for (size_t id = start; id < end; id++) { - fn(id, 0); - } - } else { - std::vector threads; - std::atomic current(start); - - // keep track of exceptions in threads - // https://stackoverflow.com/a/32428427/1713196 - std::exception_ptr lastException = nullptr; - std::mutex lastExceptMutex; - - for (size_t threadId = 0; threadId < numThreads; ++threadId) { - threads.push_back(std::thread([&, threadId] { - while (true) { - size_t id = current.fetch_add(1); - - if ((id >= end)) { - break; - } - - try { - fn(id, threadId); - } catch (...) { - std::unique_lock lastExcepLock(lastExceptMutex); - lastException = std::current_exception(); - /* - * This will work even when current is the largest value that - * size_t can fit, because fetch_add returns the previous value - * before the increment (what will result in overflow - * and produce 0 instead of current + 1). 
- */ - current = end; - break; - } - } - })); - } - for (auto &thread : threads) { - thread.join(); - } - if (lastException) { - std::rethrow_exception(lastException); - } - } -} - - -template -std::vector load_batch(std::string path, int size) { - std::cout << "Loading " << path << "..."; - // float or int32 (python) - assert(sizeof(datatype) == 4); - - std::ifstream file; - file.open(path, std::ios::binary); - if (!file.is_open()) { - std::cout << "Cannot open " << path << "\n"; - exit(1); - } - std::vector batch(size); - - file.read((char *)batch.data(), size * sizeof(float)); - std::cout << " DONE\n"; - return batch; -} - - -template -static float -test_approx(std::vector &queries, size_t qsize, hnswlib::HierarchicalNSW &appr_alg, size_t vecdim, - std::vector> &answers, size_t K) { - size_t correct = 0; - size_t total = 0; - - for (int i = 0; i < qsize; i++) { - std::priority_queue> result = appr_alg.searchKnn((char *)(queries.data() + vecdim * i), K); - total += K; - while (result.size()) { - if (answers[i].find(result.top().second) != answers[i].end()) { - correct++; - } else { - } - result.pop(); - } - } - return 1.0f * correct / total; -} - - -static void -test_vs_recall( - std::vector &queries, - size_t qsize, - hnswlib::HierarchicalNSW &appr_alg, - size_t vecdim, - std::vector> &answers, - size_t k) { - - std::vector efs = {1}; - for (int i = k; i < 30; i++) { - efs.push_back(i); - } - for (int i = 30; i < 400; i+=10) { - efs.push_back(i); - } - for (int i = 1000; i < 100000; i += 5000) { - efs.push_back(i); - } - std::cout << "ef\trecall\ttime\thops\tdistcomp\n"; - - bool test_passed = false; - for (size_t ef : efs) { - appr_alg.setEf(ef); - - appr_alg.metric_hops = 0; - appr_alg.metric_distance_computations = 0; - StopW stopw = StopW(); - - float recall = test_approx(queries, qsize, appr_alg, vecdim, answers, k); - float time_us_per_query = stopw.getElapsedTimeMicro() / qsize; - float distance_comp_per_query = appr_alg.metric_distance_computations / 
(1.0f * qsize); - float hops_per_query = appr_alg.metric_hops / (1.0f * qsize); - - std::cout << ef << "\t" << recall << "\t" << time_us_per_query << "us \t" << hops_per_query << "\t" << distance_comp_per_query << "\n"; - if (recall > 0.99) { - test_passed = true; - std::cout << "Recall is over 0.99! " << recall << "\t" << time_us_per_query << "us \t" << hops_per_query << "\t" << distance_comp_per_query << "\n"; - break; - } - } - if (!test_passed) { - std::cerr << "Test failed\n"; - exit(1); - } -} - - -int main(int argc, char **argv) { - int M = 16; - int efConstruction = 200; - int num_threads = std::thread::hardware_concurrency(); - - bool update = false; - - if (argc == 2) { - if (std::string(argv[1]) == "update") { - update = true; - std::cout << "Updates are on\n"; - } else { - std::cout << "Usage ./test_updates [update]\n"; - exit(1); - } - } else if (argc > 2) { - std::cout << "Usage ./test_updates [update]\n"; - exit(1); - } - - std::string path = "../examples/data/"; - - int N; - int dummy_data_multiplier; - int N_queries; - int d; - int K; - { - std::ifstream configfile; - configfile.open(path + "/config.txt"); - if (!configfile.is_open()) { - std::cout << "Cannot open config.txt\n"; - return 1; - } - configfile >> N >> dummy_data_multiplier >> N_queries >> d >> K; - - printf("Loaded config: N=%d, d_mult=%d, Nq=%d, dim=%d, K=%d\n", N, dummy_data_multiplier, N_queries, d, K); - } - - hnswlib::L2Space l2space(d); - hnswlib::HierarchicalNSW appr_alg(&l2space, N + 1, M, efConstruction); - - std::vector dummy_batch = load_batch(path + "batch_dummy_00.bin", N * d); - - // Adding enterpoint: - - appr_alg.addPoint((void *)dummy_batch.data(), (size_t)0); - - StopW stopw = StopW(); - - if (update) { - std::cout << "Update iteration 0\n"; - - ParallelFor(1, N, num_threads, [&](size_t i, size_t threadId) { - appr_alg.addPoint((void *)(dummy_batch.data() + i * d), i); - }); - appr_alg.checkIntegrity(); - - ParallelFor(1, N, num_threads, [&](size_t i, size_t 
threadId) { - appr_alg.addPoint((void *)(dummy_batch.data() + i * d), i); - }); - appr_alg.checkIntegrity(); - - for (int b = 1; b < dummy_data_multiplier; b++) { - std::cout << "Update iteration " << b << "\n"; - char cpath[1024]; - sprintf(cpath, "batch_dummy_%02d.bin", b); - std::vector dummy_batchb = load_batch(path + cpath, N * d); - - ParallelFor(0, N, num_threads, [&](size_t i, size_t threadId) { - appr_alg.addPoint((void *)(dummy_batch.data() + i * d), i); - }); - appr_alg.checkIntegrity(); - } - } - - std::cout << "Inserting final elements\n"; - std::vector final_batch = load_batch(path + "batch_final.bin", N * d); - - stopw.reset(); - ParallelFor(0, N, num_threads, [&](size_t i, size_t threadId) { - appr_alg.addPoint((void *)(final_batch.data() + i * d), i); - }); - std::cout << "Finished. Time taken:" << stopw.getElapsedTimeMicro()*1e-6 << " s\n"; - std::cout << "Running tests\n"; - std::vector queries_batch = load_batch(path + "queries.bin", N_queries * d); - - std::vector gt = load_batch(path + "gt.bin", N_queries * K); - - std::vector> answers(N_queries); - for (int i = 0; i < N_queries; i++) { - for (int j = 0; j < K; j++) { - answers[i].insert(gt[i * K + j]); - } - } - - for (int i = 0; i < 3; i++) { - std::cout << "Test iteration " << i << "\n"; - test_vs_recall(queries_batch, N_queries, appr_alg, d, answers, K); - } - - return 0; -} From 0dc8f80dbfcb0e1c7afb5a7f3c52a7a0efb1eb3b Mon Sep 17 00:00:00 2001 From: Dmitry Yashunin Date: Sun, 15 Jan 2023 10:08:51 +0400 Subject: [PATCH 5/5] Return back python3 in Makefile --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 20acac6e..0de9c765 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ dist: python3 -m build --sdist test: - python -m unittest discover --start-directory tests/python --pattern "bindings_test*.py" + python3 -m unittest discover --start-directory tests/python --pattern "bindings_test*.py" clean: rm -rf *.egg-info build dist tmp var 
tests/__pycache__ hnswlib.cpython*.so