Skip to content

Commit

Permalink
init zipfilelinearindex in rust
Browse files Browse the repository at this point in the history
  • Loading branch information
luizirber committed Apr 16, 2022
1 parent 1a1026f commit 3428e54
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 14 deletions.
38 changes: 38 additions & 0 deletions src/core/tests/storage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,41 @@ fn zipstorage_list_sbts() -> Result<(), Box<dyn std::error::Error>> {

Ok(())
}

/// Checks that one `ZipStorage` can be read from several rayon worker threads
/// at the same time.
///
/// Seven entries of `tests/test-data/v6.sbt.zip` are loaded and deserialized
/// as signatures inside a parallel iterator; the sizes of all their sketches
/// are summed and the total is expected to be 3500.
///
/// Only compiled when the `parallel` feature is enabled.
#[cfg(feature = "parallel")]
#[test]
fn zipstorage_parallel_access() -> Result<(), Box<dyn std::error::Error>> {
    use rayon::prelude::*;
    // NOTE(review): `SigsTrait` is presumably what provides `size()` on each
    // sketch, so it stays imported even though it is never named below.
    use sourmash::signature::{Signature, SigsTrait};

    // The test data lives outside the crate, relative to the crate manifest.
    let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    filename.push("../../tests/test-data/v6.sbt.zip");

    let zs = ZipStorage::new(filename.to_str().unwrap())?;

    // Entries inside the zip file that are read concurrently.
    let entries = [
        ".sbt.v3/f71e78178af9e45e6f1d87a0c53c465c",
        ".sbt.v3/f0c834bc306651d2b9321fb21d3e8d8f",
        ".sbt.v3/4e94e60265e04f0763142e20b52c0da1",
        ".sbt.v3/6d6e87e1154e95b279e5e7db414bc37b",
        ".sbt.v3/0107d767a345eff67ecdaed2ee5cd7ba",
        ".sbt.v3/b59473c94ff2889eca5d7165936e64b3",
        ".sbt.v3/60f7e23c24a8d94791cc7a8680c493f9",
    ];

    let total_hashes: usize = entries
        .par_iter()
        .map(|path| {
            // Every worker goes through the same shared `zs` reference.
            let data = zs.load(path).unwrap();
            let sigs: Vec<Signature> = serde_json::from_reader(&data[..]).expect("Loading error");
            sigs.iter()
                .map(|v| v.sketches().iter().map(|mh| mh.size()).sum::<usize>())
                .sum::<usize>()
        })
        .sum();

    assert_eq!(total_hashes, 3500);

    Ok(())
}
57 changes: 44 additions & 13 deletions src/sourmash/index/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,19 +37,50 @@
CounterGather - an ancillary class returned by the 'counter_gather()' method.
"""

from __future__ import annotations

import os
import sourmash
from abc import abstractmethod, ABC
from collections import namedtuple, Counter
from collections import Counter
from collections import defaultdict
from typing import NamedTuple, Optional, TypedDict, TYPE_CHECKING

from ..search import make_jaccard_search_query, make_gather_query
from ..manifest import CollectionManifest
from ..logging import debug_literal
from ..signature import load_signatures, save_signatures
from .._lowlevel import ffi, lib
from ..utils import RustObject, rustcall, decode_str, encode_str
from .. import SourmashSignature
from ..picklist import SignaturePicklist

if TYPE_CHECKING:
from typing_extensions import Unpack


# Typed record with three fields; per its docstring it is the common return
# value of Index.search and Index.gather.
class IndexSearchResult(NamedTuple):
"""generic return tuple for Index.search and Index.gather"""
# NOTE(review): presumably the similarity/containment value of the match - confirm against callers
score: float
# the signature object this result refers to
signature: SourmashSignature
# NOTE(review): presumably where the signature came from (file/db location) - confirm
location: str


class Selection(TypedDict, total=False):
    """Keyword arguments accepted by the ``select`` methods in this module.

    The ``select`` methods annotate their keyword arguments as
    ``**kwargs: Unpack[Selection]``. The class is declared with
    ``total=False`` so that every key may be left out: with a total
    TypedDict, a type checker treats each key as a *required* keyword
    argument, whereas ``select`` has always allowed any subset of them.

    Each value may also be ``None``.
    """
    ksize: Optional[int]
    moltype: Optional[str]
    num: Optional[int]
    scaled: Optional[int]
    containment: Optional[bool]
    abund: Optional[bool]
    picklist: Optional[SignaturePicklist]


# TypedDict can't have methods (it is a plain dict at runtime), so helpers that
# operate on a Selection have to be module-level functions like this one.
# NOTE(review): presumably meant to convert a Selection into whatever the Rust
# side expects - not implemented yet, confirm the intended return value.
def _selection_as_rust(selection: Selection):
# Placeholder body: only `...`, so calling this currently returns None.
...


# generic return tuple for Index.search and Index.gather
IndexSearchResult = namedtuple('Result', 'score, signature, location')

class Index(ABC):
# this will be removed soon; see sourmash#1894.
Expand Down Expand Up @@ -282,7 +313,6 @@ def gather(self, query, threshold_bp=None, **kwargs):

def peek(self, query_mh, threshold_bp=0):
"Mimic CounterGather.peek() on top of Index. Yes, this is backwards."
from sourmash import SourmashSignature

# build a signature to use with self.gather...
query_ss = SourmashSignature(query_mh)
Expand Down Expand Up @@ -332,8 +362,7 @@ def counter_gather(self, query, threshold_bp, **kwargs):
return counter

@abstractmethod
def select(self, ksize=None, moltype=None, scaled=None, num=None,
abund=None, containment=None):
def select(self, **kwargs: Unpack[Selection]):
"""Return Index containing only signatures that match requirements.
Current arguments can be any or all of:
Expand Down Expand Up @@ -433,7 +462,7 @@ def load(cls, location, filename=None):
lidx = LinearIndex(si, filename=filename)
return lidx

def select(self, **kwargs):
def select(self, **kwargs: Unpack[Selection]):
"""Return new LinearIndex containing only signatures that match req's.
Does not raise ValueError, but may return an empty Index.
Expand Down Expand Up @@ -504,7 +533,7 @@ def save(self, path):
def load(cls, path):
raise NotImplementedError

def select(self, **kwargs):
def select(self, **kwargs: Unpack[Selection]):
"""Return new object yielding only signatures that match req's.
Does not raise ValueError, but may return an empty Index.
Expand All @@ -519,7 +548,7 @@ def select(self, **kwargs):
return LazyLinearIndex(self.db, selection_dict)


class ZipFileLinearIndex(Index):
class ZipFileLinearIndex(Index, RustObject):
"""\
A read-only collection of signatures in a zip file.
Expand All @@ -529,6 +558,8 @@ class ZipFileLinearIndex(Index):
"""
is_database = True

#__dealloc_func__ = lib.zflinearindex_free

def __init__(self, storage, *, selection_dict=None,
traverse_yield_all=False, manifest=None, use_manifest=True):
self.storage = storage
Expand Down Expand Up @@ -667,7 +698,7 @@ def signatures(self):
if select(ss):
yield ss

def select(self, **kwargs):
def select(self, **kwargs: Unpack[Selection]):
"Select signatures in zip file based on ksize/moltype/etc."

# if we have a manifest, run 'select' on the manifest.
Expand Down Expand Up @@ -1042,7 +1073,7 @@ def load_from_pathlist(cls, filename):
def save(self, *args):
raise NotImplementedError

def select(self, **kwargs):
def select(self, **kwargs: Unpack[Selection]):
"Run 'select' on the manifest."
new_manifest = self.manifest.select_to_manifest(**kwargs)
return MultiIndex(new_manifest, self.parent,
Expand Down Expand Up @@ -1154,7 +1185,7 @@ def insert(self, *args):
def save(self, *args):
raise NotImplementedError

def select(self, **kwargs):
def select(self, **kwargs: Unpack[Selection]):
"Run 'select' on manifest, return new object with new manifest."
manifest = self.manifest
new_manifest = manifest.select_to_manifest(**kwargs)
Expand Down Expand Up @@ -1278,7 +1309,7 @@ def save(self, *args):
def insert(self, *args):
raise NotImplementedError

def select(self, **kwargs):
def select(self, **kwargs: Unpack[Selection]):
"Run 'select' on the manifest."
new_manifest = self.manifest.select_to_manifest(**kwargs)
return StandaloneManifestIndex(new_manifest, self._location,
Expand Down
2 changes: 1 addition & 1 deletion src/sourmash/sbt_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,11 @@ class ZipStorage(RustObject, Storage):
__dealloc_func__ = lib.zipstorage_free

def __init__(self, path, *, mode="r"):
path = os.path.abspath(path)
if mode == "w":
self.__inner = _RwZipStorage(path)
else:
self.__inner = None
path = os.path.abspath(path)
self._objptr = rustcall(lib.zipstorage_new, to_bytes(path), len(path))

@staticmethod
Expand Down
9 changes: 9 additions & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,15 @@ extras =
commands = pip wheel -w {envtmpdir}/build --no-deps .
twine check {envtmpdir}/build/*

[testenv:mypy]
description = run mypy checker
basepython = python3.8
passenv = {[testenv]passenv}
# without PROGRAMDATA, cloning with Git for Windows fails with an `error setting certificate verify locations` error
PROGRAMDATA
deps = mypy
commands = mypy src/sourmash

[testenv:fix_lint]
description = format the code base to adhere to our styles, and complain about what we cannot do automatically
basepython = python3.8
Expand Down

0 comments on commit 3428e54

Please sign in to comment.