Refactoring of project structure (#432)

* Refactor file structure, update readme and examples
* Update Makefile
* Update git tester
* Remove redundant updates_test.cpp, apply suggested changes to example file
* Return back python3 in Makefile
nmslib · Jan 15, 2023 · d86f8f9
1 parent 978f713
commit d86f8f9
Show file tree

Hide file tree

Showing 35 changed files with 412 additions and 136 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -20,7 +20,7 @@ jobs:
 
       - name: Test
         timeout-minutes: 15
-        run: python -m unittest discover -v --start-directory python_bindings/tests --pattern "*_test*.py"
+        run: python -m unittest discover -v --start-directory tests/python --pattern "bindings_test*.py"
 
   test_cpp:
     runs-on: ${{matrix.os}}
@@ -48,7 +48,7 @@ jobs:
       - name: Prepare test data
         run: |
           pip install numpy
-          cd examples
+          cd tests/cpp/
           python update_gen_data.py
         shell: bash
 

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -16,21 +16,21 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
       SET( CMAKE_CXX_FLAGS  "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" )
     endif()
 
-    add_executable(test_updates examples/updates_test.cpp)
+    add_executable(test_updates tests/cpp/updates_test.cpp)
     target_link_libraries(test_updates hnswlib)
 
-    add_executable(searchKnnCloserFirst_test examples/searchKnnCloserFirst_test.cpp)
+    add_executable(searchKnnCloserFirst_test tests/cpp/searchKnnCloserFirst_test.cpp)
     target_link_libraries(searchKnnCloserFirst_test hnswlib)
 
-    add_executable(searchKnnWithFilter_test examples/searchKnnWithFilter_test.cpp)
+    add_executable(searchKnnWithFilter_test tests/cpp/searchKnnWithFilter_test.cpp)
     target_link_libraries(searchKnnWithFilter_test hnswlib)
 
-    add_executable(multiThreadLoad_test examples/multiThreadLoad_test.cpp)
+    add_executable(multiThreadLoad_test tests/cpp/multiThreadLoad_test.cpp)
     target_link_libraries(multiThreadLoad_test hnswlib)
 
-    add_executable(multiThread_replace_test examples/multiThread_replace_test.cpp)
+    add_executable(multiThread_replace_test tests/cpp/multiThread_replace_test.cpp)
     target_link_libraries(multiThread_replace_test hnswlib)
 
-    add_executable(main main.cpp sift_1b.cpp)
+    add_executable(main tests/cpp/main.cpp tests/cpp/sift_1b.cpp)
     target_link_libraries(main hnswlib)
 endif()
diff --git a/Makefile b/Makefile
@@ -7,7 +7,7 @@ dist:
 	python3 -m build --sdist
 
 test:
-	python3 -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py"
+	python3 -m unittest discover --start-directory tests/python --pattern "bindings_test*.py"
 
 clean:
 	rm -rf *.egg-info build dist tmp var tests/__pycache__ hnswlib.cpython*.so

diff --git a/README.md b/README.md
@@ -123,6 +123,7 @@ Properties of `hnswlib.Index` that support reading and writing:
         
 
 #### Python bindings examples
+[See more examples here](examples/EXAMPLES.md)
 ```python
 import hnswlib
 import numpy as np
@@ -229,104 +230,6 @@ labels, distances = p.knn_query(data, k=1)
 print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(data))), "\n")
 ```
 
-An example with a filter:
-```python
-import hnswlib
-import numpy as np
-
-dim = 16
-num_elements = 10000
-
-# Generating sample data
-data = np.float32(np.random.random((num_elements, dim)))
-
-# Declaring index
-hnsw_index = hnswlib.Index(space='l2', dim=dim)  # possible options are l2, cosine or ip
-
-# Initiating index
-# max_elements - the maximum number of elements, should be known beforehand
-#     (probably will be made optional in the future)
-#
-# ef_construction - controls index search speed/build speed tradeoff
-# M - is tightly connected with internal dimensionality of the data
-#     strongly affects the memory consumption
-
-hnsw_index.init_index(max_elements=num_elements, ef_construction=100, M=16)
-
-# Controlling the recall by setting ef:
-# higher ef leads to better accuracy, but slower search
-hnsw_index.set_ef(10)
-
-# Set number of threads used during batch search/construction
-# By default using all available cores
-hnsw_index.set_num_threads(4)
-
-print("Adding %d elements" % (len(data)))
-# Added elements will have consecutive ids
-hnsw_index.add_items(data, ids=np.arange(num_elements))
-
-print("Querying only even elements")
-# Define filter function that allows only even ids
-filter_function = lambda idx: idx%2 == 0
-# Query the elements for themselves and search only for even elements:
-labels, distances = hnsw_index.knn_query(data, k=1, filter=filter_function)
-# labels contain only elements with even id
-```
-
-An example with replacing of deleted elements:
-```python
-import hnswlib
-import numpy as np
-
-dim = 16
-num_elements = 1_000
-max_num_elements = 2 * num_elements
-
-# Generating sample data
-labels1 = np.arange(0, num_elements)
-data1 = np.float32(np.random.random((num_elements, dim)))  # batch 1
-labels2 = np.arange(num_elements, 2 * num_elements)
-data2 = np.float32(np.random.random((num_elements, dim)))  # batch 2
-labels3 = np.arange(2 * num_elements, 3 * num_elements)
-data3 = np.float32(np.random.random((num_elements, dim)))  # batch 3
-
-# Declaring index
-hnsw_index = hnswlib.Index(space='l2', dim=dim)
-
-# Initiating index
-# max_elements - the maximum number of elements, should be known beforehand
-#     (probably will be made optional in the future)
-#
-# ef_construction - controls index search speed/build speed tradeoff
-# M - is tightly connected with internal dimensionality of the data
-#     strongly affects the memory consumption
-
-# Enable replacing of deleted elements
-hnsw_index.init_index(max_elements=max_num_elements, ef_construction=200, M=16, allow_replace_deleted=True)
-
-# Controlling the recall by setting ef:
-# higher ef leads to better accuracy, but slower search
-hnsw_index.set_ef(10)
-
-# Set number of threads used during batch search/construction
-# By default using all available cores
-hnsw_index.set_num_threads(4)
-
-# Add batch 1 and 2 data
-hnsw_index.add_items(data1, labels1)
-hnsw_index.add_items(data2, labels2)  # Note: maximum number of elements is reached
-
-# Delete data of batch 2
-for label in labels2:
-    hnsw_index.mark_deleted(label)
-
-# Replace deleted elements
-# Maximum number of elements is reached therefore we cannot add new items,
-# but we can replace the deleted ones by using replace_deleted=True
-hnsw_index.add_items(data3, labels3, replace_deleted=True)
-# hnsw_index contains the data of batch 1 and batch 3 only
-```
-
 ### Bindings installation
 
 You can install from sources:
@@ -346,9 +249,9 @@ Contributions are highly welcome!
 
 Please make pull requests against the `develop` branch.
 
-When making changes please run tests (and please add a test to `python_bindings/tests` in case there is new functionality):
+When making changes please run tests (and please add a test to `tests/python` in case there is new functionality):
 ```bash
-python -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py"
+python -m unittest discover --start-directory tests/python --pattern "bindings_test*.py"
 ```
 
 
@@ -373,7 +276,7 @@ https://github.com/dbaranchuk/ivf-hnsw
 ### 200M SIFT test reproduction 
 To download and extract the bigann dataset (from root directory):
 ```bash
-python3 download_bigann.py
+python tests/cpp/download_bigann.py
 ```
 To compile:
 ```bash
@@ -393,7 +296,7 @@ The size of the BigANN subset (in millions) is controlled by the variable **subs
 ### Updates test
 To generate testing data (from root directory):
 ```bash
-cd examples
+cd tests/cpp
 python update_gen_data.py
 ```
 To compile (from root directory):