Skip to content

Commit

Permalink
Merge pull request #523 from nmslib/develop
Browse files Browse the repository at this point in the history
Release v0.8.0
  • Loading branch information
yurymalkov committed Dec 3, 2023
2 parents 359b2ba + 5a8fd34 commit 3f34296
Show file tree
Hide file tree
Showing 23 changed files with 1,190 additions and 90 deletions.
14 changes: 9 additions & 5 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ jobs:
runs-on: ${{matrix.os}}
strategy:
matrix:
os: [ubuntu-latest, windows-latest]
os: [ubuntu-latest, windows-latest, macos-latest]
python-version: ["3.7", "3.8", "3.9", "3.10"]
steps:
- uses: actions/checkout@v3
Expand All @@ -28,7 +28,7 @@ jobs:
runs-on: ${{matrix.os}}
strategy:
matrix:
os: [ubuntu-latest, windows-latest]
os: [ubuntu-latest, windows-latest, macos-latest]
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
Expand All @@ -40,10 +40,10 @@ jobs:
mkdir build
cd build
cmake ..
if [ "$RUNNER_OS" == "Linux" ]; then
make
elif [ "$RUNNER_OS" == "Windows" ]; then
if [ "$RUNNER_OS" == "Windows" ]; then
cmake --build ./ --config Release
else
make
fi
shell: bash

Expand All @@ -67,10 +67,14 @@ jobs:
./example_mt_search
./example_mt_filter
./example_mt_replace_deleted
./example_multivector_search
./example_epsilon_search
./searchKnnCloserFirst_test
./searchKnnWithFilter_test
./multiThreadLoad_test
./multiThread_replace_test
./test_updates
./test_updates update
./multivector_search_test
./epsilon_search_test
shell: bash
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ var/
.vscode/
.vs/
**.DS_Store
*.pyc
63 changes: 56 additions & 7 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,25 +1,68 @@
cmake_minimum_required (VERSION 2.6)
project(hnsw_lib
cmake_minimum_required(VERSION 3.0...3.26)

project(hnswlib
LANGUAGES CXX)

include(GNUInstallDirs)
include(CheckCXXCompilerFlag)

add_library(hnswlib INTERFACE)
target_include_directories(hnswlib INTERFACE .)
add_library(hnswlib::hnswlib ALIAS hnswlib)

target_include_directories(hnswlib INTERFACE
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)

# Install
install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/hnswlib
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

install(TARGETS hnswlib
EXPORT hnswlibTargets)

install(EXPORT hnswlibTargets
FILE hnswlibConfig.cmake
NAMESPACE hnswlib::
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/hnswlib)

# Examples and tests
if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
option(HNSWLIB_EXAMPLES "Build examples and tests." ON)
else()
option(HNSWLIB_EXAMPLES "Build examples and tests." OFF)
endif()
if(HNSWLIB_EXAMPLES)
set(CMAKE_CXX_STANDARD 11)

if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
SET( CMAKE_CXX_FLAGS "-Ofast -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -ftree-vectorize")
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
SET( CMAKE_CXX_FLAGS "-Ofast -std=c++11 -DHAVE_CXX0X -openmp -fpic -ftree-vectorize" )
check_cxx_compiler_flag("-march=native" COMPILER_SUPPORT_NATIVE_FLAG)
if(COMPILER_SUPPORT_NATIVE_FLAG)
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native" )
message("set -march=native flag")
else()
check_cxx_compiler_flag("-mcpu=apple-m1" COMPILER_SUPPORT_M1_FLAG)
if(COMPILER_SUPPORT_M1_FLAG)
SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=apple-m1" )
message("set -mcpu=apple-m1 flag")
endif()
endif()
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" )
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -std=c++11 -DHAVE_CXX0X -march=native -fpic -w -fopenmp -ftree-vectorize -ftree-vectorizer-verbose=0" )
elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
SET( CMAKE_CXX_FLAGS "-Ofast -lrt -DNDEBUG -std=c++11 -DHAVE_CXX0X -openmp -march=native -fpic -w -fopenmp -ftree-vectorize" )
SET( CMAKE_CXX_FLAGS "/O2 -DHAVE_CXX0X /W1 /openmp /EHsc" )
endif()

# examples
add_executable(example_search examples/cpp/example_search.cpp)
target_link_libraries(example_search hnswlib)

add_executable(example_epsilon_search examples/cpp/example_epsilon_search.cpp)
target_link_libraries(example_epsilon_search hnswlib)

add_executable(example_multivector_search examples/cpp/example_multivector_search.cpp)
target_link_libraries(example_multivector_search hnswlib)

add_executable(example_filter examples/cpp/example_filter.cpp)
target_link_libraries(example_filter hnswlib)

Expand All @@ -36,6 +79,12 @@ if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME)
target_link_libraries(example_mt_replace_deleted hnswlib)

# tests
add_executable(multivector_search_test tests/cpp/multivector_search_test.cpp)
target_link_libraries(multivector_search_test hnswlib)

add_executable(epsilon_search_test tests/cpp/epsilon_search_test.cpp)
target_link_libraries(epsilon_search_test hnswlib)

add_executable(test_updates tests/cpp/updates_test.cpp)
target_link_libraries(test_updates hnswlib)

Expand Down
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,15 @@ Header-only C++ HNSW implementation with python bindings, insertions and updates

**NEWS:**

**version 0.8.0**

* Multi-vector document search and epsilon search (for now, only in C++)
* By default, there is no statistic aggregation, which speeds up the multi-threaded search (it does not seem like people are using it anyway: [Issue #495](https://github.com/nmslib/hnswlib/issues/495)).
* Various bugfixes and improvements
* `get_items` now has a `return_type` parameter, which can be either 'numpy' or 'list'

Full list of changes: https://github.com/nmslib/hnswlib/pull/523

**version 0.7.0**

* Added support to filtering (#402, #430) by [@kishorenc](https://github.com/kishorenc)
Expand Down Expand Up @@ -79,7 +88,7 @@ For other spaces use the nmslib library https://github.com/nmslib/nmslib.

* `set_num_threads(num_threads)` set the default number of cpu threads used during data insertion/querying.

* `get_items(ids)` - returns a numpy array (shape:`N*dim`) of vectors that have integer identifiers specified in `ids` numpy vector (shape:`N`). Note that for cosine similarity it currently returns **normalized** vectors.
* `get_items(ids, return_type = 'numpy')` - returns a numpy array (shape:`N*dim`) of vectors that have integer identifiers specified in `ids` numpy vector (shape:`N`). If `return_type` is `list`, returns a list of lists instead. Note that for cosine similarity it currently returns **normalized** vectors.

* `get_ids_list()` - returns a list of all elements' ids.

Expand Down Expand Up @@ -229,6 +238,8 @@ print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(dat
* filtering during the search with a boolean function
* deleting the elements and reusing the memory of the deleted elements for newly added elements
* multithreaded usage
* multivector search
* epsilon search


### Bindings installation
Expand Down
6 changes: 5 additions & 1 deletion examples/cpp/EXAMPLES.md
Original file line number Diff line number Diff line change
Expand Up @@ -182,4 +182,8 @@ int main() {
Multithreaded examples:
* Creating index, inserting elements, searching [example_mt_search.cpp](example_mt_search.cpp)
* Filtering during the search with a boolean function [example_mt_filter.cpp](example_mt_filter.cpp)
* Reusing the memory of the deleted elements when new elements are being added [example_mt_replace_deleted.cpp](example_mt_replace_deleted.cpp)
* Reusing the memory of the deleted elements when new elements are being added [example_mt_replace_deleted.cpp](example_mt_replace_deleted.cpp)

More examples:
* Multivector search [example_multivector_search.cpp](example_multivector_search.cpp)
* Epsilon search [example_epsilon_search.cpp](example_epsilon_search.cpp)
66 changes: 66 additions & 0 deletions examples/cpp/example_epsilon_search.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#include "../../hnswlib/hnswlib.h"

typedef unsigned int docidtype;
typedef float dist_t;

int main() {
int dim = 16; // Dimension of the elements
int max_elements = 10000; // Maximum number of elements, should be known beforehand
int M = 16; // Tightly connected with internal dimensionality of the data
// strongly affects the memory consumption
int ef_construction = 200; // Controls index search speed/build speed tradeoff
int min_num_candidates = 100; // Minimum number of candidates to search in the epsilon region
// this parameter is similar to ef

int num_queries = 5;
float epsilon2 = 2.0; // Squared distance to query

// Initing index
hnswlib::L2Space space(dim);
hnswlib::HierarchicalNSW<dist_t>* alg_hnsw = new hnswlib::HierarchicalNSW<dist_t>(&space, max_elements, M, ef_construction);

// Generate random data
std::mt19937 rng;
rng.seed(47);
std::uniform_real_distribution<> distrib_real;

size_t data_point_size = space.get_data_size();
char* data = new char[data_point_size * max_elements];
for (int i = 0; i < max_elements; i++) {
char* point_data = data + i * data_point_size;
for (int j = 0; j < dim; j++) {
char* vec_data = point_data + j * sizeof(float);
float value = distrib_real(rng);
*(float*)vec_data = value;
}
}

// Add data to index
for (int i = 0; i < max_elements; i++) {
hnswlib::labeltype label = i;
char* point_data = data + i * data_point_size;
alg_hnsw->addPoint(point_data, label);
}

// Query random vectors
for (int i = 0; i < num_queries; i++) {
char* query_data = new char[data_point_size];
for (int j = 0; j < dim; j++) {
size_t offset = j * sizeof(float);
char* vec_data = query_data + offset;
float value = distrib_real(rng);
*(float*)vec_data = value;
}
std::cout << "Query #" << i << "\n";
hnswlib::EpsilonSearchStopCondition<dist_t> stop_condition(epsilon2, min_num_candidates, max_elements);
std::vector<std::pair<float, hnswlib::labeltype>> result =
alg_hnsw->searchStopConditionClosest(query_data, stop_condition);
size_t num_vectors = result.size();
std::cout << "Found " << num_vectors << " vectors\n";
delete[] query_data;
}

delete[] data;
delete alg_hnsw;
return 0;
}
83 changes: 83 additions & 0 deletions examples/cpp/example_multivector_search.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
#include "../../hnswlib/hnswlib.h"

typedef unsigned int docidtype;
typedef float dist_t;

int main() {
int dim = 16; // Dimension of the elements
int max_elements = 10000; // Maximum number of elements, should be known beforehand
int M = 16; // Tightly connected with internal dimensionality of the data
// strongly affects the memory consumption
int ef_construction = 200; // Controls index search speed/build speed tradeoff

int num_queries = 5;
int num_docs = 5; // Number of documents to search
int ef_collection = 6; // Number of candidate documents during the search
// Controlls the recall: higher ef leads to better accuracy, but slower search
docidtype min_doc_id = 0;
docidtype max_doc_id = 9;

// Initing index
hnswlib::MultiVectorL2Space<docidtype> space(dim);
hnswlib::HierarchicalNSW<dist_t>* alg_hnsw = new hnswlib::HierarchicalNSW<dist_t>(&space, max_elements, M, ef_construction);

// Generate random data
std::mt19937 rng;
rng.seed(47);
std::uniform_real_distribution<> distrib_real;
std::uniform_int_distribution<docidtype> distrib_docid(min_doc_id, max_doc_id);

size_t data_point_size = space.get_data_size();
char* data = new char[data_point_size * max_elements];
for (int i = 0; i < max_elements; i++) {
// set vector value
char* point_data = data + i * data_point_size;
for (int j = 0; j < dim; j++) {
char* vec_data = point_data + j * sizeof(float);
float value = distrib_real(rng);
*(float*)vec_data = value;
}
// set document id
docidtype doc_id = distrib_docid(rng);
space.set_doc_id(point_data, doc_id);
}

// Add data to index
std::unordered_map<hnswlib::labeltype, docidtype> label_docid_lookup;
for (int i = 0; i < max_elements; i++) {
hnswlib::labeltype label = i;
char* point_data = data + i * data_point_size;
alg_hnsw->addPoint(point_data, label);
label_docid_lookup[label] = space.get_doc_id(point_data);
}

// Query random vectors
size_t query_size = dim * sizeof(float);
for (int i = 0; i < num_queries; i++) {
char* query_data = new char[query_size];
for (int j = 0; j < dim; j++) {
size_t offset = j * sizeof(float);
char* vec_data = query_data + offset;
float value = distrib_real(rng);
*(float*)vec_data = value;
}
std::cout << "Query #" << i << "\n";
hnswlib::MultiVectorSearchStopCondition<docidtype, dist_t> stop_condition(space, num_docs, ef_collection);
std::vector<std::pair<float, hnswlib::labeltype>> result =
alg_hnsw->searchStopConditionClosest(query_data, stop_condition);
size_t num_vectors = result.size();

std::unordered_map<docidtype, size_t> doc_counter;
for (auto pair: result) {
hnswlib::labeltype label = pair.second;
docidtype doc_id = label_docid_lookup[label];
doc_counter[doc_id] += 1;
}
std::cout << "Found " << doc_counter.size() << " documents, " << num_vectors << " vectors\n";
delete[] query_data;
}

delete[] data;
delete alg_hnsw;
return 0;
}
14 changes: 10 additions & 4 deletions hnswlib/bruteforce.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,16 @@ class BruteforceSearch : public AlgorithmInterface<dist_t> {


void removePoint(labeltype cur_external) {
size_t cur_c = dict_external_to_internal[cur_external];
std::unique_lock<std::mutex> lock(index_lock);

dict_external_to_internal.erase(cur_external);
auto found = dict_external_to_internal.find(cur_external);
if (found == dict_external_to_internal.end()) {
return;
}

dict_external_to_internal.erase(found);

size_t cur_c = found->second;
labeltype label = *((labeltype*)(data_ + size_per_element_ * (cur_element_count-1) + data_size_));
dict_external_to_internal[label] = cur_c;
memcpy(data_ + size_per_element_ * cur_c,
Expand All @@ -106,7 +112,7 @@ class BruteforceSearch : public AlgorithmInterface<dist_t> {
dist_t dist = fstdistfunc_(query_data, data_ + size_per_element_ * i, dist_func_param_);
labeltype label = *((labeltype*) (data_ + size_per_element_ * i + data_size_));
if ((!isIdAllowed) || (*isIdAllowed)(label)) {
topResults.push(std::pair<dist_t, labeltype>(dist, label));
topResults.emplace(dist, label);
}
}
dist_t lastdist = topResults.empty() ? std::numeric_limits<dist_t>::max() : topResults.top().first;
Expand All @@ -115,7 +121,7 @@ class BruteforceSearch : public AlgorithmInterface<dist_t> {
if (dist <= lastdist) {
labeltype label = *((labeltype *) (data_ + size_per_element_ * i + data_size_));
if ((!isIdAllowed) || (*isIdAllowed)(label)) {
topResults.push(std::pair<dist_t, labeltype>(dist, label));
topResults.emplace(dist, label);
}
if (topResults.size() > k)
topResults.pop();
Expand Down
Loading

0 comments on commit 3f34296

Please sign in to comment.