From 23c3d79726ce04d71e0b94e57974f01b38435754 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Wed, 12 May 2021 13:38:05 -0700 Subject: [PATCH 1/8] wip linear --- src/core/src/ffi/index/linear.rs | 248 +++++++++++++++++++++++++++++++ src/core/src/ffi/index/mod.rs | 37 +++++ src/core/src/ffi/mod.rs | 1 + src/core/src/ffi/utils.rs | 4 + 4 files changed, 290 insertions(+) create mode 100644 src/core/src/ffi/index/linear.rs create mode 100644 src/core/src/ffi/index/mod.rs diff --git a/src/core/src/ffi/index/linear.rs b/src/core/src/ffi/index/linear.rs new file mode 100644 index 0000000000..95304f4247 --- /dev/null +++ b/src/core/src/ffi/index/linear.rs @@ -0,0 +1,248 @@ +use std::path::PathBuf; +use std::slice; + +use crate::index::linear::LinearIndex; +use crate::index::Index; +use crate::signature::{Signature, SigsTrait}; +use crate::sketch::minhash::KmerMinHash; +use crate::sketch::Sketch; + +use crate::ffi::index::SourmashSearchResult; +use crate::ffi::minhash::SourmashKmerMinHash; +use crate::ffi::signature::SourmashSignature; +use crate::ffi::utils::{ForeignObject, SourmashStr}; + +pub struct SourmashLinearIndex; + +impl ForeignObject for SourmashLinearIndex { + type RustObject = LinearIndex; +} + +ffi_fn! { + unsafe fn linearindex_new_with_paths( + search_sigs_ptr: *const *const SourmashStr, + insigs: usize, + template_ptr: *const SourmashKmerMinHash, + threshold: usize, + queries_ptr: *const *const SourmashKmerMinHash, + inqueries: usize, + keep_sigs: bool, + ) -> Result<*mut SourmashLinearIndex> { + let search_sigs: Vec = { + assert!(!search_sigs_ptr.is_null()); + slice::from_raw_parts(search_sigs_ptr, insigs).iter().map(|path| { + let mut new_path = PathBuf::new(); + new_path.push(SourmashStr::as_rust(*path).as_str()); + new_path} + ).collect() + }; + + let linear_index = LinearIndex::builder().build(); + + unimplemented!(); + /* + + let template = { + assert!(!template_ptr.is_null()); + //TODO: avoid clone here + Sketch::MinHash(SourmashKmerMinHash::as_rust(template_ptr).clone()) + }; + + let queries_vec: Vec; + let queries: Option<&[KmerMinHash]> = if queries_ptr.is_null() { + None + } else { + queries_vec = + slice::from_raw_parts(queries_ptr, inqueries).iter().map(|mh_ptr| + // TODO: avoid this clone + SourmashKmerMinHash::as_rust(*mh_ptr).clone()).collect(); + Some(queries_vec.as_ref()) + }; + let revindex = RevIndex::new( + search_sigs.as_ref(), + &template, + threshold, + queries, + keep_sigs + ); + Ok(SourmashRevIndex::from_rust(revindex)) + */ + + Ok(SourmashLinearIndex::from_rust(linear_index)) + } +} + +/* +ffi_fn! { + unsafe fn revindex_new_with_sigs( + search_sigs_ptr: *const *const SourmashSignature, + insigs: usize, + template_ptr: *const SourmashKmerMinHash, + threshold: usize, + queries_ptr: *const *const SourmashKmerMinHash, + inqueries: usize, + ) -> Result<*mut SourmashRevIndex> { + let search_sigs: Vec = { + assert!(!search_sigs_ptr.is_null()); + slice::from_raw_parts(search_sigs_ptr, insigs).iter().map(|sig| + SourmashSignature::as_rust(*sig) + ).cloned().collect() + }; + + let template = { + assert!(!template_ptr.is_null()); + //TODO: avoid clone here + Sketch::MinHash(SourmashKmerMinHash::as_rust(template_ptr).clone()) + }; + + let queries_vec: Vec; + let queries: Option<&[KmerMinHash]> = if queries_ptr.is_null() { + None + } else { + queries_vec = + slice::from_raw_parts(queries_ptr, inqueries).iter().map(|mh_ptr| + // TODO: avoid this clone + SourmashKmerMinHash::as_rust(*mh_ptr).clone()).collect(); + Some(queries_vec.as_ref()) + }; + let revindex = RevIndex::new_with_sigs( + search_sigs, + &template, + threshold, + queries, + ); + Ok(SourmashRevIndex::from_rust(revindex)) + } +} +*/ + +#[no_mangle] +pub unsafe extern "C" fn linear_index_free(ptr: *mut SourmashLinearIndex) { + SourmashLinearIndex::drop(ptr); +} + +/* +ffi_fn! { +unsafe fn revindex_search( + ptr: *const SourmashRevIndex, + sig_ptr: *const SourmashSignature, + threshold: f64, + do_containment: bool, + _ignore_abundance: bool, + size: *mut usize +) -> Result<*const *const SourmashSearchResult> { + let revindex = SourmashRevIndex::as_rust(ptr); + let sig = SourmashSignature::as_rust(sig_ptr); + + if sig.signatures.is_empty() { + *size = 0; + return Ok(std::ptr::null::<*const SourmashSearchResult>()); + } + + let mh = if let Sketch::MinHash(mh) = &sig.signatures[0] { + mh + } else { + // TODO: what if it is not a mh? + unimplemented!() + }; + + let results: Vec<(f64, Signature, String)> = revindex + .find_signatures(mh, threshold, do_containment, true)? + .into_iter() + .collect(); + + // FIXME: use the ForeignObject trait, maybe define new method there... + let ptr_sigs: Vec<*const SourmashSearchResult> = results.into_iter().map(|x| { + Box::into_raw(Box::new(x)) as *const SourmashSearchResult + }).collect(); + + let b = ptr_sigs.into_boxed_slice(); + *size = b.len(); + + Ok(Box::into_raw(b) as *const *const SourmashSearchResult) +} +} + +ffi_fn! { +unsafe fn revindex_gather( + ptr: *const SourmashRevIndex, + sig_ptr: *const SourmashSignature, + threshold: f64, + _do_containment: bool, + _ignore_abundance: bool, + size: *mut usize +) -> Result<*const *const SourmashSearchResult> { + let revindex = SourmashRevIndex::as_rust(ptr); + let sig = SourmashSignature::as_rust(sig_ptr); + + if sig.signatures.is_empty() { + *size = 0; + return Ok(std::ptr::null::<*const SourmashSearchResult>()); + } + + let mh = if let Sketch::MinHash(mh) = &sig.signatures[0] { + mh + } else { + // TODO: what if it is not a mh? + unimplemented!() + }; + + // TODO: proper threshold calculation + let threshold: usize = (threshold * (mh.size() as f64)) as _; + + let counter = revindex.counter_for_query(&mh); + dbg!(&counter); + + let results: Vec<(f64, Signature, String)> = revindex + .gather(counter, threshold, mh) + .unwrap() // TODO: proper error handling + .into_iter() + .map(|r| { + let filename = r.filename().to_owned(); + let sig = r.get_match(); + (r.f_match(), sig, filename) + }) + .collect(); + + + // FIXME: use the ForeignObject trait, maybe define new method there... + let ptr_sigs: Vec<*const SourmashSearchResult> = results.into_iter().map(|x| { + Box::into_raw(Box::new(x)) as *const SourmashSearchResult + }).collect(); + + let b = ptr_sigs.into_boxed_slice(); + *size = b.len(); + + Ok(Box::into_raw(b) as *const *const SourmashSearchResult) +} +} + +#[no_mangle] +pub unsafe extern "C" fn revindex_scaled(ptr: *const SourmashRevIndex) -> u64 { + let revindex = SourmashRevIndex::as_rust(ptr); + if let Sketch::MinHash(mh) = revindex.template() { + mh.scaled() + } else { + unimplemented!() + } +} +*/ + +ffi_fn! { +unsafe fn linear_index_signatures(ptr: *const SourmashLinearIndex, + size: *mut usize) -> Result<*mut *mut SourmashSignature> { + let index = SourmashLinearIndex::as_rust(ptr); + + let sigs = index.signatures(); + + // FIXME: use the ForeignObject trait, maybe define new method there... + let ptr_sigs: Vec<*mut SourmashSignature> = sigs.into_iter().map(|x| { + Box::into_raw(Box::new(x)) as *mut SourmashSignature + }).collect(); + + let b = ptr_sigs.into_boxed_slice(); + *size = b.len(); + + Ok(Box::into_raw(b) as *mut *mut SourmashSignature) +} +} diff --git a/src/core/src/ffi/index/mod.rs b/src/core/src/ffi/index/mod.rs new file mode 100644 index 0000000000..bbe038e37d --- /dev/null +++ b/src/core/src/ffi/index/mod.rs @@ -0,0 +1,37 @@ +pub mod linear; + +use crate::signature::Signature; + +use crate::ffi::signature::SourmashSignature; +use crate::ffi::utils::{ForeignObject, SourmashStr}; + +pub struct SourmashSearchResult; + +impl ForeignObject for SourmashSearchResult { + type RustObject = (f64, Signature, String); +} + +#[no_mangle] +pub unsafe extern "C" fn searchresult_free(ptr: *mut SourmashSearchResult) { + SourmashSearchResult::drop(ptr); +} + +#[no_mangle] +pub unsafe extern "C" fn searchresult_score(ptr: *const SourmashSearchResult) -> f64 { + let result = SourmashSearchResult::as_rust(ptr); + result.0 +} + +#[no_mangle] +pub unsafe extern "C" fn searchresult_filename(ptr: *const SourmashSearchResult) -> SourmashStr { + let result = SourmashSearchResult::as_rust(ptr); + (result.2).clone().into() +} + +#[no_mangle] +pub unsafe extern "C" fn searchresult_signature( + ptr: *const SourmashSearchResult, +) -> *mut SourmashSignature { + let result = SourmashSearchResult::as_rust(ptr); + SourmashSignature::from_rust((result.1).clone()) +} diff --git a/src/core/src/ffi/mod.rs b/src/core/src/ffi/mod.rs index bfd9b46bd7..e9f276d5e2 100644 --- a/src/core/src/ffi/mod.rs +++ b/src/core/src/ffi/mod.rs @@ -8,6 +8,7 @@ pub mod utils; pub mod cmd; pub mod hyperloglog; +pub mod index; pub mod minhash; pub mod nodegraph; pub mod signature; diff --git a/src/core/src/ffi/utils.rs b/src/core/src/ffi/utils.rs index 69baac7b88..b4c1947e22 100644 --- a/src/core/src/ffi/utils.rs +++ b/src/core/src/ffi/utils.rs @@ -314,3 +314,7 @@ pub unsafe extern "C" fn sourmash_str_free(s: *mut SourmashStr) { (*s).free() } } + +impl ForeignObject for SourmashStr { + type RustObject = SourmashStr; +} From 7e71dd313c845a62a893c9b4b2e5e7ca91de709c Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sat, 15 May 2021 13:55:16 -0700 Subject: [PATCH 2/8] 15 tests failing --- Makefile | 2 + include/sourmash.h | 21 +++ src/core/src/ffi/index/linear.rs | 224 ++++--------------------------- src/core/src/index/linear.rs | 14 +- src/sourmash/index.py | 76 +++++++++-- 5 files changed, 125 insertions(+), 212 deletions(-) diff --git a/Makefile b/Makefile index b73271860a..ab2aeb0312 100644 --- a/Makefile +++ b/Makefile @@ -30,6 +30,8 @@ include/sourmash.h: src/core/src/lib.rs \ src/core/src/ffi/minhash.rs \ src/core/src/ffi/signature.rs \ src/core/src/ffi/nodegraph.rs \ + src/core/src/index/mod.rs \ + src/core/src/index/linear.rs \ src/core/src/errors.rs cd src/core && \ RUSTUP_TOOLCHAIN=nightly cbindgen -c cbindgen.toml . -o ../../$@ diff --git a/include/sourmash.h b/include/sourmash.h index 7f88fcc203..16e44d5205 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -50,8 +50,12 @@ typedef struct SourmashHyperLogLog SourmashHyperLogLog; typedef struct SourmashKmerMinHash SourmashKmerMinHash; +typedef struct SourmashLinearIndex SourmashLinearIndex; + typedef struct SourmashNodegraph SourmashNodegraph; +typedef struct SourmashSearchResult SourmashSearchResult; + typedef struct SourmashSignature SourmashSignature; /** @@ -248,6 +252,15 @@ void kmerminhash_slice_free(uint64_t *ptr, uintptr_t insize); bool kmerminhash_track_abundance(const SourmashKmerMinHash *ptr); +void linearindex_free(SourmashLinearIndex *ptr); + +uintptr_t linearindex_len(const SourmashLinearIndex *ptr); + +SourmashLinearIndex *linearindex_new_with_sigs(const SourmashSignature *const *search_sigs_ptr, + uintptr_t insigs); + +SourmashSignature **linearindex_signatures(const SourmashLinearIndex *ptr, uintptr_t *size); + void nodegraph_buffer_free(uint8_t *ptr, uintptr_t insize); bool nodegraph_count(SourmashNodegraph *ptr, uint64_t h); @@ -292,6 +305,14 @@ SourmashNodegraph *nodegraph_with_tables(uintptr_t ksize, uintptr_t starting_size, uintptr_t n_tables); +SourmashStr searchresult_filename(const SourmashSearchResult *ptr); + +void searchresult_free(SourmashSearchResult *ptr); + +double searchresult_score(const SourmashSearchResult *ptr); + +SourmashSignature *searchresult_signature(const SourmashSearchResult *ptr); + void signature_add_protein(SourmashSignature *ptr, const char *sequence); void signature_add_sequence(SourmashSignature *ptr, const char *sequence, bool force); diff --git a/src/core/src/ffi/index/linear.rs b/src/core/src/ffi/index/linear.rs index 95304f4247..445c8aebb1 100644 --- a/src/core/src/ffi/index/linear.rs +++ b/src/core/src/ffi/index/linear.rs @@ -2,7 +2,7 @@ use std::path::PathBuf; use std::slice; use crate::index::linear::LinearIndex; -use crate::index::Index; +use crate::index::{Index, SigStore}; use crate::signature::{Signature, SigsTrait}; use crate::sketch::minhash::KmerMinHash; use crate::sketch::Sketch; @@ -19,217 +19,47 @@ impl ForeignObject for SourmashLinearIndex { } ffi_fn! { - unsafe fn linearindex_new_with_paths( - search_sigs_ptr: *const *const SourmashStr, - insigs: usize, - template_ptr: *const SourmashKmerMinHash, - threshold: usize, - queries_ptr: *const *const SourmashKmerMinHash, - inqueries: usize, - keep_sigs: bool, - ) -> Result<*mut SourmashLinearIndex> { - let search_sigs: Vec = { - assert!(!search_sigs_ptr.is_null()); - slice::from_raw_parts(search_sigs_ptr, insigs).iter().map(|path| { - let mut new_path = PathBuf::new(); - new_path.push(SourmashStr::as_rust(*path).as_str()); - new_path} - ).collect() +unsafe fn linearindex_new_with_sigs( + search_sigs_ptr: *const *const SourmashSignature, + insigs: usize, +) -> Result<*mut SourmashLinearIndex> { + let search_sigs: Vec> = { + assert!(!search_sigs_ptr.is_null()); + slice::from_raw_parts(search_sigs_ptr, insigs) + .iter() + .map(|sig| SourmashSignature::as_rust(*sig)) + .cloned() + .map(|sig| { + SigStore::builder() + .data(sig) + .filename("") + .name("") + .metadata("") + .storage(None) + .build() + }) + .collect() }; - let linear_index = LinearIndex::builder().build(); - - unimplemented!(); - /* - - let template = { - assert!(!template_ptr.is_null()); - //TODO: avoid clone here - Sketch::MinHash(SourmashKmerMinHash::as_rust(template_ptr).clone()) - }; - - let queries_vec: Vec; - let queries: Option<&[KmerMinHash]> = if queries_ptr.is_null() { - None - } else { - queries_vec = - slice::from_raw_parts(queries_ptr, inqueries).iter().map(|mh_ptr| - // TODO: avoid this clone - SourmashKmerMinHash::as_rust(*mh_ptr).clone()).collect(); - Some(queries_vec.as_ref()) - }; - let revindex = RevIndex::new( - search_sigs.as_ref(), - &template, - threshold, - queries, - keep_sigs - ); - Ok(SourmashRevIndex::from_rust(revindex)) - */ + let linear_index = LinearIndex::builder().datasets(search_sigs).build(); Ok(SourmashLinearIndex::from_rust(linear_index)) - } } - -/* -ffi_fn! { - unsafe fn revindex_new_with_sigs( - search_sigs_ptr: *const *const SourmashSignature, - insigs: usize, - template_ptr: *const SourmashKmerMinHash, - threshold: usize, - queries_ptr: *const *const SourmashKmerMinHash, - inqueries: usize, - ) -> Result<*mut SourmashRevIndex> { - let search_sigs: Vec = { - assert!(!search_sigs_ptr.is_null()); - slice::from_raw_parts(search_sigs_ptr, insigs).iter().map(|sig| - SourmashSignature::as_rust(*sig) - ).cloned().collect() - }; - - let template = { - assert!(!template_ptr.is_null()); - //TODO: avoid clone here - Sketch::MinHash(SourmashKmerMinHash::as_rust(template_ptr).clone()) - }; - - let queries_vec: Vec; - let queries: Option<&[KmerMinHash]> = if queries_ptr.is_null() { - None - } else { - queries_vec = - slice::from_raw_parts(queries_ptr, inqueries).iter().map(|mh_ptr| - // TODO: avoid this clone - SourmashKmerMinHash::as_rust(*mh_ptr).clone()).collect(); - Some(queries_vec.as_ref()) - }; - let revindex = RevIndex::new_with_sigs( - search_sigs, - &template, - threshold, - queries, - ); - Ok(SourmashRevIndex::from_rust(revindex)) - } } -*/ #[no_mangle] -pub unsafe extern "C" fn linear_index_free(ptr: *mut SourmashLinearIndex) { +pub unsafe extern "C" fn linearindex_free(ptr: *mut SourmashLinearIndex) { SourmashLinearIndex::drop(ptr); } -/* -ffi_fn! { -unsafe fn revindex_search( - ptr: *const SourmashRevIndex, - sig_ptr: *const SourmashSignature, - threshold: f64, - do_containment: bool, - _ignore_abundance: bool, - size: *mut usize -) -> Result<*const *const SourmashSearchResult> { - let revindex = SourmashRevIndex::as_rust(ptr); - let sig = SourmashSignature::as_rust(sig_ptr); - - if sig.signatures.is_empty() { - *size = 0; - return Ok(std::ptr::null::<*const SourmashSearchResult>()); - } - - let mh = if let Sketch::MinHash(mh) = &sig.signatures[0] { - mh - } else { - // TODO: what if it is not a mh? - unimplemented!() - }; - - let results: Vec<(f64, Signature, String)> = revindex - .find_signatures(mh, threshold, do_containment, true)? - .into_iter() - .collect(); - - // FIXME: use the ForeignObject trait, maybe define new method there... - let ptr_sigs: Vec<*const SourmashSearchResult> = results.into_iter().map(|x| { - Box::into_raw(Box::new(x)) as *const SourmashSearchResult - }).collect(); - - let b = ptr_sigs.into_boxed_slice(); - *size = b.len(); - - Ok(Box::into_raw(b) as *const *const SourmashSearchResult) -} -} - -ffi_fn! { -unsafe fn revindex_gather( - ptr: *const SourmashRevIndex, - sig_ptr: *const SourmashSignature, - threshold: f64, - _do_containment: bool, - _ignore_abundance: bool, - size: *mut usize -) -> Result<*const *const SourmashSearchResult> { - let revindex = SourmashRevIndex::as_rust(ptr); - let sig = SourmashSignature::as_rust(sig_ptr); - - if sig.signatures.is_empty() { - *size = 0; - return Ok(std::ptr::null::<*const SourmashSearchResult>()); - } - - let mh = if let Sketch::MinHash(mh) = &sig.signatures[0] { - mh - } else { - // TODO: what if it is not a mh? - unimplemented!() - }; - - // TODO: proper threshold calculation - let threshold: usize = (threshold * (mh.size() as f64)) as _; - - let counter = revindex.counter_for_query(&mh); - dbg!(&counter); - - let results: Vec<(f64, Signature, String)> = revindex - .gather(counter, threshold, mh) - .unwrap() // TODO: proper error handling - .into_iter() - .map(|r| { - let filename = r.filename().to_owned(); - let sig = r.get_match(); - (r.f_match(), sig, filename) - }) - .collect(); - - - // FIXME: use the ForeignObject trait, maybe define new method there... - let ptr_sigs: Vec<*const SourmashSearchResult> = results.into_iter().map(|x| { - Box::into_raw(Box::new(x)) as *const SourmashSearchResult - }).collect(); - - let b = ptr_sigs.into_boxed_slice(); - *size = b.len(); - - Ok(Box::into_raw(b) as *const *const SourmashSearchResult) -} -} - #[no_mangle] -pub unsafe extern "C" fn revindex_scaled(ptr: *const SourmashRevIndex) -> u64 { - let revindex = SourmashRevIndex::as_rust(ptr); - if let Sketch::MinHash(mh) = revindex.template() { - mh.scaled() - } else { - unimplemented!() - } +pub unsafe extern "C" fn linearindex_len(ptr: *const SourmashLinearIndex) -> usize { + let index = SourmashLinearIndex::as_rust(ptr); + index.len() } -*/ ffi_fn! { -unsafe fn linear_index_signatures(ptr: *const SourmashLinearIndex, +unsafe fn linearindex_signatures(ptr: *const SourmashLinearIndex, size: *mut usize) -> Result<*mut *mut SourmashSignature> { let index = SourmashLinearIndex::as_rust(ptr); diff --git a/src/core/src/index/linear.rs b/src/core/src/index/linear.rs index 009ebbaadc..0f31ea4edd 100644 --- a/src/core/src/index/linear.rs +++ b/src/core/src/index/linear.rs @@ -30,7 +30,7 @@ struct LinearInfo { impl<'a, L> Index<'a> for LinearIndex where L: Clone + Comparable + 'a, - SigStore: From, + SigStore: From + ReadData, { type Item = L; //type SignatureIterator = std::slice::Iter<'a, Self::Item>; @@ -58,15 +58,13 @@ where fn signatures(&self) -> Vec { self.datasets .iter() - .map(|x| x.data.get().unwrap().clone()) + .map(|x| (*x).data().unwrap()) + .cloned() .collect() } fn signature_refs(&self) -> Vec<&Self::Item> { - self.datasets - .iter() - .map(|x| x.data.get().unwrap()) - .collect() + self.datasets.iter().map(|x| (*x).data().unwrap()).collect() } /* @@ -182,4 +180,8 @@ where pub fn storage(&self) -> Option> { self.storage.clone() } + + pub fn len(&self) -> usize { + self.datasets.len() + } } diff --git a/src/sourmash/index.py b/src/sourmash/index.py index 55c3a2f8c4..2e9dc80e6b 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -1,13 +1,16 @@ "An Abstract Base Class for collections of signatures." import os +import weakref import sourmash from abc import abstractmethod, ABC from collections import namedtuple, Counter import zipfile import copy +from .utils import RustObject, rustcall, decode_str, encode_str from .search import make_jaccard_search_query, make_gather_query +from ._lowlevel import ffi, lib # generic return tuple for Index.search and Index.gather IndexSearchResult = namedtuple('Result', 'score, signature, location') @@ -322,29 +325,84 @@ def select_signature(ss, ksize=None, moltype=None, scaled=0, num=0, return True -class LinearIndex(Index): +class LinearIndex(Index, RustObject): "An Index for a collection of signatures. Can load from a .sig file." + + __dealloc_func__ = lib.linearindex_free + def __init__(self, _signatures=None, filename=None): - self._signatures = [] - if _signatures: - self._signatures = list(_signatures) self.filename = filename + self._objptr = ffi.NULL + + self.__signatures = [] + if not _signatures: + # delay initialization for when we have signatures + return + + self.__signatures = _signatures + self._init_inner() + + def _init_inner(self): + if self._objptr != ffi.NULL: + # Already initialized + return + + if ( + not self.__signatures + and self._objptr == ffi.NULL + ): + raise ValueError("No signatures provided") + elif self.__signatures and self._objptr != ffi.NULL: + raise NotImplementedError("Need to update LinearIndex") + + attached_refs = weakref.WeakKeyDictionary() + + collected = [] + if self.__signatures: + # pass SourmashSignature pointers to LinearIndex. + for sig in self.__signatures: + rv = sig._get_objptr() + attached_refs[rv] = (rv, sig) + collected.append(rv) + search_sigs_ptr = ffi.new("SourmashSignature*[]", collected) + + self._objptr = rustcall( + lib.linearindex_new_with_sigs, + search_sigs_ptr, + len(search_sigs_ptr), + ) + self.__signatures = [] @property def location(self): return self.filename def signatures(self): - return iter(self._signatures) + from sourmash import SourmashSignature + + self._init_inner() + + size = ffi.new("uintptr_t *") + sigs_ptr = self._methodcall(lib.linearindex_signatures, size) + size = size[0] + + sigs = [] + for i in range(size): + sig = SourmashSignature._from_objptr(sigs_ptr[i]) + sigs.append(sig) + + for sig in sigs: + yield sig def __bool__(self): - return bool(self._signatures) + return bool(len(self)) def __len__(self): - return len(self._signatures) + self._init_inner() + return self._methodcall(lib.linearindex_len) def insert(self, node): - self._signatures.append(node) + self.__signatures.append(node) def save(self, path): from .signature import save_signatures @@ -368,7 +426,7 @@ def select(self, **kwargs): kw = { k : v for (k, v) in kwargs.items() if v } siglist = [] - for ss in self._signatures: + for ss in self.signatures(): if select_signature(ss, **kwargs): siglist.append(ss) From 18825de0cf0376dcfdb872fd1ad43d458f371e7b Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sat, 15 May 2021 14:59:07 -0700 Subject: [PATCH 3/8] 3 failing --- Makefile | 2 ++ include/sourmash.h | 2 ++ src/core/src/ffi/index/linear.rs | 14 +++++++------- src/sourmash/index.py | 3 ++- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index ab2aeb0312..70c8b1986a 100644 --- a/Makefile +++ b/Makefile @@ -30,6 +30,8 @@ include/sourmash.h: src/core/src/lib.rs \ src/core/src/ffi/minhash.rs \ src/core/src/ffi/signature.rs \ src/core/src/ffi/nodegraph.rs \ + src/core/src/ffi/index/mod.rs \ + src/core/src/ffi/index/linear.rs \ src/core/src/index/mod.rs \ src/core/src/index/linear.rs \ src/core/src/errors.rs diff --git a/include/sourmash.h b/include/sourmash.h index 16e44d5205..db3c192b52 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -256,6 +256,8 @@ void linearindex_free(SourmashLinearIndex *ptr); uintptr_t linearindex_len(const SourmashLinearIndex *ptr); +SourmashLinearIndex *linearindex_new(void); + SourmashLinearIndex *linearindex_new_with_sigs(const SourmashSignature *const *search_sigs_ptr, uintptr_t insigs); diff --git a/src/core/src/ffi/index/linear.rs b/src/core/src/ffi/index/linear.rs index 445c8aebb1..118789822b 100644 --- a/src/core/src/ffi/index/linear.rs +++ b/src/core/src/ffi/index/linear.rs @@ -1,16 +1,11 @@ -use std::path::PathBuf; use std::slice; use crate::index::linear::LinearIndex; use crate::index::{Index, SigStore}; -use crate::signature::{Signature, SigsTrait}; -use crate::sketch::minhash::KmerMinHash; -use crate::sketch::Sketch; +use crate::signature::Signature; -use crate::ffi::index::SourmashSearchResult; -use crate::ffi::minhash::SourmashKmerMinHash; use crate::ffi::signature::SourmashSignature; -use crate::ffi::utils::{ForeignObject, SourmashStr}; +use crate::ffi::utils::ForeignObject; pub struct SourmashLinearIndex; @@ -18,6 +13,11 @@ impl ForeignObject for SourmashLinearIndex { type RustObject = LinearIndex; } +#[no_mangle] +pub unsafe extern "C" fn linearindex_new() -> *mut SourmashLinearIndex { + SourmashLinearIndex::from_rust(LinearIndex::builder().build()) +} + ffi_fn! { unsafe fn linearindex_new_with_sigs( search_sigs_ptr: *const *const SourmashSignature, diff --git a/src/sourmash/index.py b/src/sourmash/index.py index 2e9dc80e6b..7de0599915 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -351,7 +351,8 @@ def _init_inner(self): not self.__signatures and self._objptr == ffi.NULL ): - raise ValueError("No signatures provided") + # no signatures provided, initializing empty LinearIndex + self._objptr = lib.linearindex_new() elif self.__signatures and self._objptr != ffi.NULL: raise NotImplementedError("Need to update LinearIndex") From fbdaea61a405ace77021c91fa69f81e543ad16e7 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sat, 15 May 2021 16:10:31 -0700 Subject: [PATCH 4/8] 2 failing. properly handle new insertions and updates --- include/sourmash.h | 4 ++++ src/core/src/ffi/index/linear.rs | 29 +++++++++++++++---------- src/sourmash/index.py | 37 ++++++++++++++++++-------------- 3 files changed, 43 insertions(+), 27 deletions(-) diff --git a/include/sourmash.h b/include/sourmash.h index db3c192b52..ac726f5026 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -254,6 +254,10 @@ bool kmerminhash_track_abundance(const SourmashKmerMinHash *ptr); void linearindex_free(SourmashLinearIndex *ptr); +void linearindex_insert_many(SourmashLinearIndex *ptr, + const SourmashSignature *const *search_sigs_ptr, + uintptr_t insigs); + uintptr_t linearindex_len(const SourmashLinearIndex *ptr); SourmashLinearIndex *linearindex_new(void); diff --git a/src/core/src/ffi/index/linear.rs b/src/core/src/ffi/index/linear.rs index 118789822b..28ba7581f7 100644 --- a/src/core/src/ffi/index/linear.rs +++ b/src/core/src/ffi/index/linear.rs @@ -27,17 +27,7 @@ unsafe fn linearindex_new_with_sigs( assert!(!search_sigs_ptr.is_null()); slice::from_raw_parts(search_sigs_ptr, insigs) .iter() - .map(|sig| SourmashSignature::as_rust(*sig)) - .cloned() - .map(|sig| { - SigStore::builder() - .data(sig) - .filename("") - .name("") - .metadata("") - .storage(None) - .build() - }) + .map(|sig| SourmashSignature::as_rust(*sig).clone().into()) .collect() }; @@ -47,6 +37,23 @@ unsafe fn linearindex_new_with_sigs( } } +ffi_fn! { +unsafe fn linearindex_insert_many( + ptr: *mut SourmashLinearIndex, + search_sigs_ptr: *const *const SourmashSignature, + insigs: usize, +) -> Result<()> { + let index = SourmashLinearIndex::as_rust_mut(ptr); + + slice::from_raw_parts(search_sigs_ptr, insigs) + .iter() + .try_for_each(|sig| { + let s = SourmashSignature::as_rust(*sig).clone(); + index.insert(s) + }) +} +} + #[no_mangle] pub unsafe extern "C" fn linearindex_free(ptr: *mut SourmashLinearIndex) { SourmashLinearIndex::drop(ptr); diff --git a/src/sourmash/index.py b/src/sourmash/index.py index 7de0599915..347a91b7ef 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -343,36 +343,41 @@ def __init__(self, _signatures=None, filename=None): self._init_inner() def _init_inner(self): - if self._objptr != ffi.NULL: - # Already initialized + if self._objptr != ffi.NULL and not self.__signatures: + # Already initialized, nothing new to add return - if ( - not self.__signatures - and self._objptr == ffi.NULL - ): + if (not self.__signatures and self._objptr == ffi.NULL): # no signatures provided, initializing empty LinearIndex self._objptr = lib.linearindex_new() - elif self.__signatures and self._objptr != ffi.NULL: - raise NotImplementedError("Need to update LinearIndex") + return attached_refs = weakref.WeakKeyDictionary() collected = [] - if self.__signatures: - # pass SourmashSignature pointers to LinearIndex. - for sig in self.__signatures: - rv = sig._get_objptr() - attached_refs[rv] = (rv, sig) - collected.append(rv) - search_sigs_ptr = ffi.new("SourmashSignature*[]", collected) + # pass SourmashSignature pointers to LinearIndex. + for sig in self.__signatures: + rv = sig._get_objptr() + attached_refs[rv] = (rv, sig) + collected.append(rv) + search_sigs_ptr = ffi.new("SourmashSignature*[]", collected) + self.__signatures = [] + if self._objptr != ffi.NULL: + # new signatures to add, insert to already initialized LinearIndex + self._methodcall( + lib.linearindex_insert_many, + search_sigs_ptr, + len(search_sigs_ptr) + ) + else: + # Rust object was not initialized yet, so let's create it with the + # new sigs self._objptr = rustcall( lib.linearindex_new_with_sigs, search_sigs_ptr, len(search_sigs_ptr), ) - self.__signatures = [] @property def location(self): From 97707776594c4831711f2539c36ced42adccee65 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Sat, 15 May 2021 17:41:14 -0700 Subject: [PATCH 5/8] wip find --- src/core/src/ffi/index/linear.rs | 38 +++++++++++++++++++++++++- src/core/src/index/linear.rs | 13 ++++++++- src/core/src/index/mod.rs | 2 ++ src/sourmash/index.py | 46 ++++++++++++++++++++++++++++++++ 4 files changed, 97 insertions(+), 2 deletions(-) diff --git a/src/core/src/ffi/index/linear.rs b/src/core/src/ffi/index/linear.rs index 28ba7581f7..ad6a1f2b1b 100644 --- a/src/core/src/ffi/index/linear.rs +++ b/src/core/src/ffi/index/linear.rs @@ -1,18 +1,25 @@ use std::slice; use crate::index::linear::LinearIndex; -use crate::index::{Index, SigStore}; +use crate::index::{Index, SearchFn, SigStore}; use crate::signature::Signature; +use crate::ffi::index::SourmashSearchResult; use crate::ffi::signature::SourmashSignature; use crate::ffi::utils::ForeignObject; pub struct SourmashLinearIndex; +pub struct SourmashSearchFn; + impl ForeignObject for SourmashLinearIndex { type RustObject = LinearIndex; } +impl ForeignObject for SourmashSearchFn { + type RustObject = SearchFn; +} + #[no_mangle] pub unsafe extern "C" fn linearindex_new() -> *mut SourmashLinearIndex { SourmashLinearIndex::from_rust(LinearIndex::builder().build()) @@ -83,3 +90,32 @@ unsafe fn linearindex_signatures(ptr: *const SourmashLinearIndex, Ok(Box::into_raw(b) as *mut *mut SourmashSignature) } } + +ffi_fn! { +unsafe fn linearindex_find( + ptr: *const SourmashLinearIndex, + search_fn_ptr: *const SourmashSearchFn, + sig_ptr: *const SourmashSignature, + size: *mut usize, +) -> Result<*const *const SourmashSearchResult> { + let linearindex = SourmashLinearIndex::as_rust(ptr); + let search_fn = SourmashSearchFn::as_rust(search_fn_ptr); + let query = SourmashSignature::as_rust(sig_ptr); + + let results: Vec<(f64, Signature, String)> = linearindex + .find_new(search_fn, query)? + .into_iter() + .collect(); + + // FIXME: use the ForeignObject trait, maybe define new method there... + let ptr_sigs: Vec<*const SourmashSearchResult> = results + .into_iter() + .map(|x| Box::into_raw(Box::new(x)) as *const SourmashSearchResult) + .collect(); + + let b = ptr_sigs.into_boxed_slice(); + *size = b.len(); + + Ok(Box::into_raw(b) as *const *const SourmashSearchResult) +} +} diff --git a/src/core/src/index/linear.rs b/src/core/src/index/linear.rs index 0f31ea4edd..8be1511a34 100644 --- a/src/core/src/index/linear.rs +++ b/src/core/src/index/linear.rs @@ -8,7 +8,8 @@ use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; use crate::index::storage::{FSStorage, ReadData, Storage, StorageInfo, ToWriter}; -use crate::index::{Comparable, DatasetInfo, Index, SigStore}; +use crate::index::{Comparable, DatasetInfo, Index, SearchFn, SigStore}; +use crate::signature::Signature; use crate::Error; #[derive(TypedBuilder)] @@ -185,3 +186,13 @@ where self.datasets.len() } } + +impl LinearIndex { + pub fn find_new( + &self, + search_fn: &SearchFn, + query: &Signature, + ) -> Result, Error> { + unimplemented!() + } +} diff --git a/src/core/src/index/mod.rs b/src/core/src/index/mod.rs index 507020fe3c..dc7b5d3fff 100644 --- a/src/core/src/index/mod.rs +++ b/src/core/src/index/mod.rs @@ -330,3 +330,5 @@ impl From for SigStore { } } } + +pub struct SearchFn {} diff --git a/src/sourmash/index.py b/src/sourmash/index.py index 347a91b7ef..666415f316 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -423,6 +423,32 @@ def load(cls, location): lidx = LinearIndex(si, filename=location) return lidx + def find(self, search_fn, query, **kwargs): + """Use search_fn to find matching signatures in the index. + + search_fn follows the protocol in JaccardSearch objects. + + Returns a list. + """ + size = ffi.new("uintptr_t *") + results_ptr = self._methodcall( + lib.linearindex_find, + search_fn._get_objptr(), + query._get_objptr(), + ) + + size = size[0] + if size == 0: + return [] + + results = [] + for i in range(size): + match = SearchResult._from_objptr(results_ptr[i]) + results.append(IndexSearchResult(match.score, match.signature, self.filename)) + + for sr in results: + yield sr + def select(self, **kwargs): """Return new LinearIndex containing only signatures that match req's. @@ -439,6 +465,26 @@ def select(self, **kwargs): return LinearIndex(siglist, self.location) +class SearchResult(RustObject): + __dealloc_func__ = lib.searchresult_free + + @property + def score(self): + return self._methodcall(lib.searchresult_score) + + @property + def signature(self): + sig_ptr = self._methodcall(lib.searchresult_signature) + return sourmash.SourmashSignature._from_objptr(sig_ptr) + + @property + def filename(self): + result = decode_str(self._methodcall(lib.searchresult_filename)) + if result == "": + return None + return result + + class LazyLinearIndex(Index): """An Index for lazy linear search of another database. From aa82e0a06974f1aac93cb3839209e80280638ecd Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Tue, 18 May 2021 15:17:43 -0700 Subject: [PATCH 6/8] wip searchFn --- include/sourmash.h | 18 +++++++++++++ nix/rust.nix | 2 +- shell.nix | 2 ++ src/core/src/ffi/index/linear.rs | 9 ++----- src/core/src/ffi/mod.rs | 1 + src/core/src/ffi/search.rs | 46 ++++++++++++++++++++++++++++++++ src/core/src/index/linear.rs | 4 +-- src/core/src/index/mod.rs | 38 +++++++++++++++++++++++++- src/sourmash/index.py | 5 +++- src/sourmash/search.py | 18 +++++++++++++ 10 files changed, 131 insertions(+), 12 deletions(-) create mode 100644 src/core/src/ffi/search.rs diff --git a/include/sourmash.h b/include/sourmash.h index ac726f5026..13d3ca6db2 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -16,6 +16,13 @@ enum HashFunctions { }; typedef uint32_t HashFunctions; +enum SearchType { + SEARCH_TYPE_JACCARD = 1, + SEARCH_TYPE_CONTAINMENT = 2, + SEARCH_TYPE_MAX_CONTAINMENT = 3, +}; +typedef uint32_t SearchType; + enum SourmashErrorCode { SOURMASH_ERROR_CODE_NO_ERROR = 0, SOURMASH_ERROR_CODE_PANIC = 1, @@ -54,6 +61,8 @@ typedef struct SourmashLinearIndex SourmashLinearIndex; typedef struct SourmashNodegraph SourmashNodegraph; +typedef struct SourmashSearchFn SourmashSearchFn; + typedef struct SourmashSearchResult SourmashSearchResult; typedef struct SourmashSignature SourmashSignature; @@ -252,6 +261,11 @@ void kmerminhash_slice_free(uint64_t *ptr, uintptr_t insize); bool kmerminhash_track_abundance(const SourmashKmerMinHash *ptr); +const SourmashSearchResult *const *linearindex_find(const SourmashLinearIndex *ptr, + const SourmashSearchFn *search_fn_ptr, + const SourmashSignature *sig_ptr, + uintptr_t *size); + void linearindex_free(SourmashLinearIndex *ptr); void linearindex_insert_many(SourmashLinearIndex *ptr, @@ -311,6 +325,10 @@ SourmashNodegraph *nodegraph_with_tables(uintptr_t ksize, uintptr_t starting_size, uintptr_t n_tables); +void searchfn_free(SourmashSearchFn *ptr); + +SourmashSearchFn *searchfn_new(SearchType search_type, double threshold); + SourmashStr searchresult_filename(const SourmashSearchResult *ptr); void searchresult_free(SourmashSearchResult *ptr); diff --git a/nix/rust.nix b/nix/rust.nix index 5883fd3d52..8aaae712e3 100644 --- a/nix/rust.nix +++ b/nix/rust.nix @@ -3,7 +3,7 @@ let pkgs = import sources.nixpkgs { overlays = [ (import sources.rust-overlay) ]; }; - rustVersion = pkgs.rust-bin.stable.latest.rust.override { + rustVersion = pkgs.rust-bin.nightly.latest.rust.override { #extensions = [ "rust-src" ]; #targets = [ "x86_64-unknown-linux-musl" ]; targets = [ "wasm32-wasi" "wasm32-unknown-unknown" ]; diff --git a/shell.nix b/shell.nix index d63f2ea99b..d8dae33d4a 100644 --- a/shell.nix +++ b/shell.nix @@ -13,12 +13,14 @@ in (python38.withPackages(ps: with ps; [ virtualenv tox setuptools ])) (python39.withPackages(ps: with ps; [ virtualenv setuptools ])) (python37.withPackages(ps: with ps; [ virtualenv setuptools ])) + rust-cbindgen py-spy heaptrack cargo-watch cargo-limit wasmtime wasm-pack + gdb ]; shellHook = '' diff --git a/src/core/src/ffi/index/linear.rs b/src/core/src/ffi/index/linear.rs index ad6a1f2b1b..9568240e6e 100644 --- a/src/core/src/ffi/index/linear.rs +++ b/src/core/src/ffi/index/linear.rs @@ -1,25 +1,20 @@ use std::slice; use crate::index::linear::LinearIndex; -use crate::index::{Index, SearchFn, SigStore}; +use crate::index::{Index, SigStore}; use crate::signature::Signature; use crate::ffi::index::SourmashSearchResult; +use crate::ffi::search::SourmashSearchFn; use crate::ffi::signature::SourmashSignature; use crate::ffi::utils::ForeignObject; pub struct SourmashLinearIndex; -pub struct SourmashSearchFn; - impl ForeignObject for SourmashLinearIndex { type RustObject = LinearIndex; } -impl ForeignObject for SourmashSearchFn { - type RustObject = SearchFn; -} - #[no_mangle] pub unsafe extern "C" fn linearindex_new() -> *mut SourmashLinearIndex { SourmashLinearIndex::from_rust(LinearIndex::builder().build()) diff --git a/src/core/src/ffi/mod.rs b/src/core/src/ffi/mod.rs index e9f276d5e2..326591d499 100644 --- a/src/core/src/ffi/mod.rs +++ b/src/core/src/ffi/mod.rs @@ -11,6 +11,7 @@ pub mod hyperloglog; pub mod index; pub mod minhash; pub mod nodegraph; +pub mod search; pub mod signature; use std::ffi::CStr; diff --git a/src/core/src/ffi/search.rs b/src/core/src/ffi/search.rs new file mode 100644 index 0000000000..09087c35f2 --- /dev/null +++ b/src/core/src/ffi/search.rs @@ -0,0 +1,46 @@ +use crate::index::{JaccardSearch, SearchType}; +use crate::signature::Signature; + +use crate::ffi::signature::SourmashSignature; +use crate::ffi::utils::{ForeignObject, SourmashStr}; + +pub struct SourmashSearchFn; + +impl ForeignObject for SourmashSearchFn { + type RustObject = JaccardSearch; +} + +#[no_mangle] +pub unsafe extern "C" fn searchfn_free(ptr: *mut SourmashSearchFn) { + SourmashSearchFn::drop(ptr); +} + +#[no_mangle] +pub unsafe extern "C" fn searchfn_new( + search_type: SearchType, + threshold: f64, +) -> *mut SourmashSearchFn { + SourmashSearchFn::from_rust(JaccardSearch::with_threshold(search_type, threshold)) +} + +/* +#[no_mangle] +pub unsafe extern "C" fn searchresult_score(ptr: *const SourmashSearchResult) -> f64 { + let result = SourmashSearchResult::as_rust(ptr); + result.0 +} + +#[no_mangle] +pub unsafe extern "C" fn searchresult_filename(ptr: *const SourmashSearchResult) -> SourmashStr { + let result = SourmashSearchResult::as_rust(ptr); + (result.2).clone().into() +} + +#[no_mangle] +pub unsafe extern "C" fn searchresult_signature( + ptr: *const SourmashSearchResult, +) -> *mut SourmashSignature { + let result = SourmashSearchResult::as_rust(ptr); + SourmashSignature::from_rust((result.1).clone()) +} +*/ diff --git a/src/core/src/index/linear.rs b/src/core/src/index/linear.rs index 8be1511a34..29adad6b22 100644 --- a/src/core/src/index/linear.rs +++ b/src/core/src/index/linear.rs @@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; use crate::index::storage::{FSStorage, ReadData, Storage, StorageInfo, ToWriter}; -use crate::index::{Comparable, DatasetInfo, Index, SearchFn, SigStore}; +use crate::index::{Comparable, DatasetInfo, Index, JaccardSearch, SigStore}; use crate::signature::Signature; use crate::Error; @@ -190,7 +190,7 @@ where impl LinearIndex { pub fn find_new( &self, - search_fn: &SearchFn, + search_fn: &JaccardSearch, query: &Signature, ) -> Result, Error> { unimplemented!() diff --git a/src/core/src/index/mod.rs b/src/core/src/index/mod.rs index dc7b5d3fff..591a4baae7 100644 --- a/src/core/src/index/mod.rs +++ b/src/core/src/index/mod.rs @@ -331,4 +331,40 @@ impl From for SigStore { } } -pub struct SearchFn {} +#[repr(u32)] +pub enum SearchType { + Jaccard = 1, + Containment = 2, + MaxContainment = 3, +} + +pub struct JaccardSearch { + search_type: SearchType, + threshold: f64, + require_scaled: bool, +} + +impl JaccardSearch { + pub fn new(search_type: SearchType) -> Self { + let require_scaled = match search_type { + SearchType::Containment | SearchType::MaxContainment => true, + SearchType::Jaccard => false, + }; + + JaccardSearch { + search_type, + require_scaled, + threshold: 0., + } + } + + pub fn with_threshold(search_type: SearchType, threshold: f64) -> Self { + let mut s = Self::new(search_type); + s.set_threshold(threshold); + s + } + + pub fn set_threshold(&mut self, threshold: f64) { + self.threshold = threshold; + } +} diff --git a/src/sourmash/index.py b/src/sourmash/index.py index 666415f316..583d5bd4e6 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -430,11 +430,14 @@ def find(self, search_fn, query, **kwargs): Returns a list. """ + self._init_inner() + size = ffi.new("uintptr_t *") results_ptr = self._methodcall( lib.linearindex_find, - search_fn._get_objptr(), + search_fn._as_rust(), query._get_objptr(), + size ) size = size[0] diff --git a/src/sourmash/search.py b/src/sourmash/search.py index 93d77920ce..9c907f481e 100644 --- a/src/sourmash/search.py +++ b/src/sourmash/search.py @@ -9,6 +9,8 @@ from .logging import notify, error from .signature import SourmashSignature from .minhash import _get_max_hash_for_scaled +from .utils import rustcall +from ._lowlevel import ffi, lib class SearchType(Enum): @@ -98,6 +100,7 @@ def __init__(self, search_type, threshold=None): require_scaled = True self.score_fn = score_fn self.require_scaled = require_scaled + self.search_type = search_type if threshold is None: threshold = 0 @@ -150,6 +153,21 @@ def score_max_containment(self, query_size, shared_size, subject_size, return 0 return shared_size / min_denom + def _as_rust(self): + """ + Return a compatible Rust search function. + + The Rust function duplicates the implementation of this class, since + there is no good way to call back into Python code without involving a + lot of machinery. + """ + + return rustcall( + lib.searchfn_new, + self.search_type.value(), + self.threshold, + ) + class JaccardSearchBestOnly(JaccardSearch): "A subclass of JaccardSearch that implements best-only." From 70382937e993aafdb1f6a2249cd3c736c222cab8 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Tue, 25 May 2021 14:54:58 -0700 Subject: [PATCH 7/8] 17 tests failing --- src/core/Cargo.toml | 1 + src/core/src/ffi/search.rs | 26 +-------------- src/core/src/index/linear.rs | 49 ++++++++++++++++++++++++++-- src/core/src/index/mod.rs | 62 +++++++++++++++++++++++++++++++++--- src/sourmash/search.py | 2 +- 5 files changed, 107 insertions(+), 33 deletions(-) diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 3de8c37734..2c9997604d 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -43,6 +43,7 @@ serde_json = "1.0.53" primal-check = "0.3.1" thiserror = "1.0" typed-builder = "0.9.0" +atomic_float = "0.1.0" [target.'cfg(all(target_arch = "wasm32", target_vendor="unknown"))'.dependencies.wasm-bindgen] version = "0.2.62" diff --git a/src/core/src/ffi/search.rs b/src/core/src/ffi/search.rs index 09087c35f2..8a6a94a075 100644 --- a/src/core/src/ffi/search.rs +++ b/src/core/src/ffi/search.rs @@ -1,8 +1,6 @@ use crate::index::{JaccardSearch, SearchType}; -use crate::signature::Signature; -use crate::ffi::signature::SourmashSignature; -use crate::ffi::utils::{ForeignObject, SourmashStr}; +use crate::ffi::utils::ForeignObject; pub struct SourmashSearchFn; @@ -22,25 +20,3 @@ pub unsafe extern "C" fn searchfn_new( ) -> *mut SourmashSearchFn { SourmashSearchFn::from_rust(JaccardSearch::with_threshold(search_type, threshold)) } - -/* -#[no_mangle] -pub unsafe extern "C" fn searchresult_score(ptr: *const SourmashSearchResult) -> f64 { - let result = SourmashSearchResult::as_rust(ptr); - result.0 -} - -#[no_mangle] -pub unsafe extern "C" fn searchresult_filename(ptr: *const SourmashSearchResult) -> SourmashStr { - let result = SourmashSearchResult::as_rust(ptr); - (result.2).clone().into() -} - -#[no_mangle] -pub unsafe extern "C" fn searchresult_signature( - ptr: *const SourmashSearchResult, -) -> *mut SourmashSignature { - let result = SourmashSearchResult::as_rust(ptr); - SourmashSignature::from_rust((result.1).clone()) -} -*/ diff --git a/src/core/src/index/linear.rs b/src/core/src/index/linear.rs index 29adad6b22..26a3f3ec6f 100644 --- a/src/core/src/index/linear.rs +++ b/src/core/src/index/linear.rs @@ -1,3 +1,4 @@ +use std::convert::TryInto; use std::fs::File; use std::io::{BufReader, Read}; use std::path::Path; @@ -7,10 +8,13 @@ use std::rc::Rc; use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; -use crate::index::storage::{FSStorage, ReadData, Storage, StorageInfo, ToWriter}; use crate::index::{Comparable, DatasetInfo, Index, JaccardSearch, SigStore}; use crate::signature::Signature; use crate::Error; +use crate::{ + index::storage::{FSStorage, ReadData, Storage, StorageInfo, ToWriter}, + sketch::Sketch, +}; #[derive(TypedBuilder)] pub struct LinearIndex { @@ -193,6 +197,47 @@ impl LinearIndex { search_fn: &JaccardSearch, query: &Signature, ) -> Result, Error> { - unimplemented!() + search_fn.check_is_compatible(&query)?; + + let query_mh; + if let Sketch::MinHash(mh) = &query.signatures[0] { + query_mh = mh; + } else { + unimplemented!() + } + + // TODO: prepare_subject and prepare_query + let location: String = "TODO".into(); + + Ok(self + .datasets + .iter() + .filter_map(|subj| { + let subj_sig = subj.data().unwrap(); + let subj_mh; + if let Sketch::MinHash(mh) = &subj_sig.signatures[0] { + subj_mh = mh; + } else { + unimplemented!() + } + + let (shared_size, total_size) = query_mh.intersection_size(&subj_mh).unwrap(); + let query_size = query.size(); + let subj_size = subj.size(); + + let score: f64 = search_fn.score( + query_size.try_into().unwrap(), + shared_size, + subj_size.try_into().unwrap(), + total_size, + ); + + if search_fn.passes(score) && search_fn.collect(score, subj) { + Some((score, subj_sig.clone(), location.clone())) + } else { + None + } + }) + .collect()) } } diff --git a/src/core/src/index/mod.rs b/src/core/src/index/mod.rs index 591a4baae7..66d89dc70f 100644 --- a/src/core/src/index/mod.rs +++ b/src/core/src/index/mod.rs @@ -14,7 +14,9 @@ pub mod search; use std::ops::Deref; use std::path::Path; use std::rc::Rc; +use std::sync::atomic::Ordering; +use atomic_float::AtomicF64; use once_cell::sync::OnceCell; use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; @@ -340,7 +342,7 @@ pub enum SearchType { pub struct JaccardSearch { search_type: SearchType, - threshold: f64, + threshold: AtomicF64, require_scaled: bool, } @@ -354,17 +356,67 @@ impl JaccardSearch { JaccardSearch { search_type, require_scaled, - threshold: 0., + threshold: AtomicF64::new(0.0), } } pub fn with_threshold(search_type: SearchType, threshold: f64) -> Self { - let mut s = Self::new(search_type); + let s = Self::new(search_type); s.set_threshold(threshold); s } - pub fn set_threshold(&mut self, threshold: f64) { - self.threshold = threshold; + pub fn set_threshold(&self, threshold: f64) { + self.threshold + .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |_| Some(threshold)) + .unwrap(); + } + + pub fn check_is_compatible(&self, sig: &Signature) -> Result<(), Error> { + // TODO: implement properly + Ok(()) + } + + pub fn score( + &self, + query_size: u64, + shared_size: u64, + subject_size: u64, + total_size: u64, + ) -> f64 { + let shared_size = shared_size as f64; + match self.search_type { + SearchType::Jaccard => shared_size / total_size as f64, + SearchType::Containment => { + if query_size == 0 { + 0.0 + } else { + shared_size / query_size as f64 + } + } + SearchType::MaxContainment => { + let min_denom = query_size.min(subject_size); + if min_denom == 0 { + 0.0 + } else { + shared_size / min_denom as f64 + } + } + } + } + + /// Return True if this match should be collected. + pub fn collect(&self, score: f64, subj: &Signature) -> bool { + true + } + + /// Return true if this score meets or exceeds the threshold. + /// + /// Note: this can be used whenever a score or estimate is available + /// (e.g. internal nodes on an SBT). `collect(...)`, below, decides + /// whether a particular signature should be collected, and/or can + /// update the threshold (used for BestOnly behavior). + pub fn passes(&self, score: f64) -> bool { + score > 0. && score >= self.threshold.load(Ordering::SeqCst) } } diff --git a/src/sourmash/search.py b/src/sourmash/search.py index 9c907f481e..f03e5c3798 100644 --- a/src/sourmash/search.py +++ b/src/sourmash/search.py @@ -164,7 +164,7 @@ def _as_rust(self): return rustcall( lib.searchfn_new, - self.search_type.value(), + self.search_type.value, self.threshold, ) From 4e71e8843bb7ef69a846f31907686fd25bd27312 Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Tue, 25 May 2021 19:12:53 -0700 Subject: [PATCH 8/8] 7 failing --- include/sourmash.h | 2 +- src/core/src/ffi/search.rs | 5 ++++- src/core/src/index/linear.rs | 8 ++++---- src/core/src/index/mod.rs | 11 ++++++++++- src/sourmash/search.py | 19 +++++++++++++++++++ 5 files changed, 38 insertions(+), 7 deletions(-) diff --git a/include/sourmash.h b/include/sourmash.h index 13d3ca6db2..ccd88f3c1f 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -327,7 +327,7 @@ SourmashNodegraph *nodegraph_with_tables(uintptr_t ksize, void searchfn_free(SourmashSearchFn *ptr); -SourmashSearchFn *searchfn_new(SearchType search_type, double threshold); +SourmashSearchFn *searchfn_new(SearchType search_type, double threshold, bool best_only); SourmashStr searchresult_filename(const SourmashSearchResult *ptr); diff --git a/src/core/src/ffi/search.rs b/src/core/src/ffi/search.rs index 8a6a94a075..0547bca9f9 100644 --- a/src/core/src/ffi/search.rs +++ b/src/core/src/ffi/search.rs @@ -17,6 +17,9 @@ pub unsafe extern "C" fn searchfn_free(ptr: *mut SourmashSearchFn) { pub unsafe extern "C" fn searchfn_new( search_type: SearchType, threshold: f64, + best_only: bool, ) -> *mut SourmashSearchFn { - SourmashSearchFn::from_rust(JaccardSearch::with_threshold(search_type, threshold)) + let mut func = JaccardSearch::with_threshold(search_type, threshold); + func.set_best_only(best_only); + SourmashSearchFn::from_rust(func) } diff --git a/src/core/src/index/linear.rs b/src/core/src/index/linear.rs index 26a3f3ec6f..0e9a18f64b 100644 --- a/src/core/src/index/linear.rs +++ b/src/core/src/index/linear.rs @@ -9,7 +9,7 @@ use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; use crate::index::{Comparable, DatasetInfo, Index, JaccardSearch, SigStore}; -use crate::signature::Signature; +use crate::signature::{Signature, SigsTrait}; use crate::Error; use crate::{ index::storage::{FSStorage, ReadData, Storage, StorageInfo, ToWriter}, @@ -221,9 +221,9 @@ impl LinearIndex { unimplemented!() } - let (shared_size, total_size) = query_mh.intersection_size(&subj_mh).unwrap(); - let query_size = query.size(); - let subj_size = subj.size(); + let (shared_size, total_size) = dbg!(query_mh.intersection_size(&subj_mh).unwrap()); + let query_size = query_mh.size(); + let subj_size = subj_mh.size(); let score: f64 = search_fn.score( query_size.try_into().unwrap(), diff --git a/src/core/src/index/mod.rs b/src/core/src/index/mod.rs index 66d89dc70f..a26392c683 100644 --- a/src/core/src/index/mod.rs +++ b/src/core/src/index/mod.rs @@ -344,6 +344,7 @@ pub struct JaccardSearch { search_type: SearchType, threshold: AtomicF64, require_scaled: bool, + best_only: bool, } impl JaccardSearch { @@ -355,8 +356,9 @@ impl JaccardSearch { JaccardSearch { search_type, - require_scaled, threshold: AtomicF64::new(0.0), + require_scaled, + best_only: false, } } @@ -366,6 +368,10 @@ impl JaccardSearch { s } + pub fn set_best_only(&mut self, best_only: bool) { + self.best_only = best_only; + } + pub fn set_threshold(&self, threshold: f64) { self.threshold .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |_| Some(threshold)) @@ -407,6 +413,9 @@ impl JaccardSearch { /// Return True if this match should be collected. pub fn collect(&self, score: f64, subj: &Signature) -> bool { + if self.best_only { + self.threshold.fetch_max(score, Ordering::Relaxed); + } true } diff --git a/src/sourmash/search.py b/src/sourmash/search.py index f03e5c3798..fa763024c8 100644 --- a/src/sourmash/search.py +++ b/src/sourmash/search.py @@ -166,16 +166,35 @@ def _as_rust(self): lib.searchfn_new, self.search_type.value, self.threshold, + False ) class JaccardSearchBestOnly(JaccardSearch): "A subclass of JaccardSearch that implements best-only." + def collect(self, score, match): "Raise the threshold to the best match found so far." self.threshold = max(self.threshold, score) return True + def _as_rust(self): + """ + Return a compatible Rust search function. + + The Rust function duplicates the implementation of this class, since + there is no good way to call back into Python code without involving a + lot of machinery. + """ + + return rustcall( + lib.searchfn_new, + self.search_type.value, + self.threshold, + True + ) + + # generic SearchResult tuple. SearchResult = namedtuple('SearchResult',