From a998a6a8184d06dbf54fd4e887df8279a42989fd Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Fri, 8 Apr 2022 07:23:14 -0700 Subject: [PATCH] Oxidize ZipStorage (#1909) Expose the (read-only for now) Rust ZipStorage and use it instead of the regular (read-write) ZipStorage. if writing is needed, fall back to current ZipStorage (as a stopgap while the Rust one doesn't support writing) This PR also removes Python 3.7 support (which we actually dropped in https://github.com/sourmash-bio/sourmash/pull/1839), and adds Python 3.9 and Python 3.10 to the setup.cfg classifiers. * init rust zipstorage * unify into ZipStorage * benchmark that works with previous ZipStorage * benchmark for loading small data from ZipStorage * ouroboros seems to work * peakmem benchmark for zipstorage * add docker target in nix --- Cargo.lock | 43 ++++++++ Makefile | 1 + benchmarks/benchmarks.py | 61 +++++++++++ doc/environment.yml | 2 +- flake.lock | 24 ++--- flake.nix | 14 +++ include/sourmash.h | 21 ++++ setup.cfg | 5 +- src/core/Cargo.toml | 1 + src/core/src/ffi/mod.rs | 1 + src/core/src/ffi/storage.rs | 140 ++++++++++++++++++++++++ src/core/src/storage.rs | 190 ++++++++++++++++++++++----------- src/core/tests/storage.rs | 27 +++-- src/sourmash/index/__init__.py | 24 ++--- src/sourmash/sbt.py | 7 +- src/sourmash/sbt_storage.py | 120 +++++++++++++++++++-- src/sourmash/sourmash_args.py | 2 +- tests/test_index.py | 3 +- tests/test_sbt.py | 2 +- tox.ini | 2 +- 20 files changed, 576 insertions(+), 114 deletions(-) create mode 100644 src/core/src/ffi/storage.rs diff --git a/Cargo.lock b/Cargo.lock index ef043fcddb..d634a4a0ef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,12 +2,24 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "Inflector" +version = "0.11.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe438c63458706e03479442743baae6c88256498e6431708f6dfc520a26515d3" + [[package]] name = "adler" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "aliasable" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "250f629c0161ad8107cf89319e990051fae62832fd343083bea452d93e2205fd" + [[package]] name = "assert_matches" version = "1.5.0" @@ -635,6 +647,30 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" +[[package]] +name = "ouroboros" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f31a3b678685b150cba82b702dcdc5e155893f63610cf388d30cd988d4ca2bf" +dependencies = [ + "aliasable", + "ouroboros_macro", + "stable_deref_trait", +] + +[[package]] +name = "ouroboros_macro" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "084fd65d5dd8b3772edccb5ffd1e4b7eba43897ecd0f9401e330e8c542959408" +dependencies = [ + "Inflector", + "proc-macro-error", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "piz" version = "0.4.0" @@ -998,6 +1034,7 @@ dependencies = [ "nohash-hasher", "num-iter", "once_cell", + "ouroboros", "piz", "primal-check", "proptest", @@ -1015,6 +1052,12 @@ dependencies = [ "web-sys", ] +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + [[package]] name = "static_assertions" version = "1.1.0" diff --git a/Makefile b/Makefile index 6e168e261d..10d8c1ec04 100644 --- a/Makefile +++ b/Makefile @@ -32,6 +32,7 @@ include/sourmash.h: src/core/src/lib.rs \ src/core/src/ffi/nodegraph.rs \ src/core/src/ffi/index/mod.rs \ src/core/src/ffi/index/revindex.rs \ + src/core/src/ffi/storage.rs \ src/core/src/errors.rs cd src/core && \ RUSTC_BOOTSTRAP=1 cbindgen -c cbindgen.toml . -o ../../$@ diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py index 481cb76b43..375981299e 100644 --- a/benchmarks/benchmarks.py +++ b/benchmarks/benchmarks.py @@ -1,6 +1,10 @@ +import os import random +from pathlib import Path +from tempfile import NamedTemporaryFile +from sourmash.sbt_storage import ZipStorage from sourmash.minhash import MinHash @@ -139,3 +143,60 @@ class PeakmemMinAbundanceSuite(PeakmemMinHashSuite): def setup(self): PeakmemMinHashSuite.setup(self) self.mh = MinHash(500, 21, track_abundance=True) + +#################### + +class TimeZipStorageSuite: + + def setup(self): + import zipfile + self.zipfile = NamedTemporaryFile() + + with zipfile.ZipFile(self.zipfile, mode='w', + compression=zipfile.ZIP_STORED) as storage: + for i in range(100_000): + # just so we have lots of entries + storage.writestr(str(i), b"0") + # one big-ish entry + storage.writestr("sig1", b"9" * 1_000_000) + + def time_load_from_zipstorage(self): + with ZipStorage(self.zipfile.name) as storage: + for i in range(20): + storage.load("sig1") + + def time_load_small_from_zipstorage(self): + with ZipStorage(self.zipfile.name) as storage: + for i in range(20): + storage.load("99999") + + def teardown(self): + self.zipfile.close() + + +class PeakmemZipStorageSuite: + def setup(self): + import zipfile + self.zipfile = NamedTemporaryFile() + + with zipfile.ZipFile(self.zipfile, mode='w', + compression=zipfile.ZIP_STORED) as storage: + for i in range(100_000): + # just so we have lots of entries + storage.writestr(str(i), b"0") + # one big-ish entry + storage.writestr("sig1", b"9" * 1_000_000) + + + def peakmem_load_from_zipstorage(self): + with ZipStorage(self.zipfile.name) as storage: + for i in range(20): + storage.load("sig1") + + def peakmem_load_small_from_zipstorage(self): + with ZipStorage(self.zipfile.name) as storage: + for i in range(20): + storage.load("99999") + + def teardown(self): + self.zipfile.close() diff --git a/doc/environment.yml b/doc/environment.yml index 3864f514bd..f3840b2dce 100644 --- a/doc/environment.yml +++ b/doc/environment.yml @@ -3,4 +3,4 @@ channels: - defaults dependencies: - rust - - python =3.7 + - python =3.8 diff --git a/flake.lock b/flake.lock index f86273ccb8..8e41c96a5e 100644 --- a/flake.lock +++ b/flake.lock @@ -49,11 +49,11 @@ ] }, "locked": { - "lastModified": 1639947939, - "narHash": "sha256-pGsM8haJadVP80GFq4xhnSpNitYNQpaXk4cnA796Cso=", + "lastModified": 1648544490, + "narHash": "sha256-EoBDcccV70tfz2LAs5lK0BjC7en5mzUVlgLsd5E6DW4=", "owner": "nix-community", "repo": "naersk", - "rev": "2fc8ce9d3c025d59fee349c1f80be9785049d653", + "rev": "e30ef9a5ce9b3de8bb438f15829c50f9525ca730", "type": "github" }, "original": { @@ -64,11 +64,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1645937171, - "narHash": "sha256-n9f9GZBNMe8UMhcgmmaXNObkH01jjgp7INMrUgBgcy4=", + "lastModified": 1648219316, + "narHash": "sha256-Ctij+dOi0ZZIfX5eMhgwugfvB+WZSrvVNAyAuANOsnQ=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "22dc22f8cedc58fcb11afe1acb08e9999e78be9c", + "rev": "30d3d79b7d3607d56546dd2a6b49e156ba0ec634", "type": "github" }, "original": { @@ -150,11 +150,11 @@ ] }, "locked": { - "lastModified": 1645928338, - "narHash": "sha256-pNbkG19Nb4QTNRCIWwxv06JKKJNCUrDzgRrriEd7W1A=", + "lastModified": 1648866882, + "narHash": "sha256-yMs/RKA9pX47a03zupmQp8/RLRmKzdSDk+h5Yt7K9xU=", "owner": "oxalica", "repo": "rust-overlay", - "rev": "4f6e6588b07427cd8ddc99b664bf0fab02799804", + "rev": "7c90e17cd7c0b9e81d5b23f78b482088ac9961d1", "type": "github" }, "original": { @@ -165,11 +165,11 @@ }, "utils": { "locked": { - "lastModified": 1644229661, - "narHash": "sha256-1YdnJAsNy69bpcjuoKdOYQX0YxZBiCYZo4Twxerqv7k=", + "lastModified": 1648297722, + "narHash": "sha256-W+qlPsiZd8F3XkzXOzAoR+mpFqzm3ekQkJNa+PIh1BQ=", "owner": "numtide", "repo": "flake-utils", - "rev": "3cecb5b042f7f209c56ffd8371b2711a290ec797", + "rev": "0f8662f1319ad6abf89b3380dd2722369fc51ade", "type": "github" }, "original": { diff --git a/flake.nix b/flake.nix index 9ec1cd8300..ba080af204 100644 --- a/flake.nix +++ b/flake.nix @@ -77,6 +77,20 @@ DYLD_LIBRARY_PATH = "${self.packages.${system}.lib}/lib"; NO_BUILD = "1"; }; + docker = + let + bin = self.defaultPackage.${system}; + in + pkgs.dockerTools.buildLayeredImage { + name = bin.pname; + tag = bin.version; + contents = [ bin ]; + + config = { + Cmd = [ "/bin/sourmash" ]; + WorkingDir = "/"; + }; + }; }; defaultPackage = self.packages.${system}.sourmash; diff --git a/include/sourmash.h b/include/sourmash.h index de422efcf5..1f116e681b 100644 --- a/include/sourmash.h +++ b/include/sourmash.h @@ -58,6 +58,8 @@ typedef struct SourmashSearchResult SourmashSearchResult; typedef struct SourmashSignature SourmashSignature; +typedef struct SourmashZipStorage SourmashZipStorage; + /** * Represents a string. */ @@ -456,4 +458,23 @@ SourmashStr sourmash_str_from_cstr(const char *s); char sourmash_translate_codon(const char *codon); +SourmashStr **zipstorage_filenames(const SourmashZipStorage *ptr, uintptr_t *size); + +void zipstorage_free(SourmashZipStorage *ptr); + +SourmashStr **zipstorage_list_sbts(const SourmashZipStorage *ptr, uintptr_t *size); + +const uint8_t *zipstorage_load(const SourmashZipStorage *ptr, + const char *path_ptr, + uintptr_t insize, + uintptr_t *size); + +SourmashZipStorage *zipstorage_new(const char *ptr, uintptr_t insize); + +SourmashStr zipstorage_path(const SourmashZipStorage *ptr); + +void zipstorage_set_subdir(SourmashZipStorage *ptr, const char *path_ptr, uintptr_t insize); + +SourmashStr zipstorage_subdir(const SourmashZipStorage *ptr); + #endif /* SOURMASH_H_INCLUDED */ diff --git a/setup.cfg b/setup.cfg index bd3aeea86d..c959dc4949 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,8 +20,9 @@ classifiers = Operating System :: POSIX :: Linux Operating System :: MacOS :: MacOS X Programming Language :: Rust - Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 + Programming Language :: Python :: 3.10 Topic :: Scientific/Engineering :: Bio-Informatics project_urls = Documentation = https://sourmash.readthedocs.io @@ -42,7 +43,7 @@ install_requires = scipy deprecation>=2.0.6 cachetools>=4,<5 -python_requires = >=3.7 +python_requires = >=3.8 [bdist_wheel] universal = 1 diff --git a/src/core/Cargo.toml b/src/core/Cargo.toml index 736e506c20..43ce78c0c8 100644 --- a/src/core/Cargo.toml +++ b/src/core/Cargo.toml @@ -47,6 +47,7 @@ twox-hash = "1.6.0" vec-collections = "0.3.4" piz = "0.4.0" memmap2 = "0.5.0" +ouroboros = "0.15.0" [dev-dependencies] assert_matches = "1.3.0" diff --git a/src/core/src/ffi/mod.rs b/src/core/src/ffi/mod.rs index e9f276d5e2..a67de37176 100644 --- a/src/core/src/ffi/mod.rs +++ b/src/core/src/ffi/mod.rs @@ -12,6 +12,7 @@ pub mod index; pub mod minhash; pub mod nodegraph; pub mod signature; +pub mod storage; use std::ffi::CStr; use std::os::raw::c_char; diff --git a/src/core/src/ffi/storage.rs b/src/core/src/ffi/storage.rs new file mode 100644 index 0000000000..98eca095b2 --- /dev/null +++ b/src/core/src/ffi/storage.rs @@ -0,0 +1,140 @@ +use std::os::raw::c_char; +use std::slice; + +use crate::ffi::utils::{ForeignObject, SourmashStr}; +use crate::prelude::*; +use crate::storage::ZipStorage; + +pub struct SourmashZipStorage; + +impl ForeignObject for SourmashZipStorage { + type RustObject = ZipStorage; +} + +ffi_fn! { +unsafe fn zipstorage_new(ptr: *const c_char, insize: usize) -> Result<*mut SourmashZipStorage> { + let path = { + assert!(!ptr.is_null()); + let path = slice::from_raw_parts(ptr as *mut u8, insize); + std::str::from_utf8(path)? + }; + let zipstorage = ZipStorage::from_file(path)?; + + Ok(SourmashZipStorage::from_rust(zipstorage)) +} +} + +#[no_mangle] +pub unsafe extern "C" fn zipstorage_free(ptr: *mut SourmashZipStorage) { + SourmashZipStorage::drop(ptr); +} + +ffi_fn! { +unsafe fn zipstorage_load(ptr: *const SourmashZipStorage, + path_ptr: *const c_char, + insize: usize, + size: *mut usize) -> Result<*const u8> { + + let storage = SourmashZipStorage::as_rust(ptr); + + let path = { + assert!(!path_ptr.is_null()); + let path = slice::from_raw_parts(path_ptr as *mut u8, insize); + std::str::from_utf8(path)? + }; + + let buffer = storage.load(path)?; + + let b = buffer.into_boxed_slice(); + *size = b.len(); + + Ok(Box::into_raw(b) as *const u8) +} +} + +ffi_fn! { +unsafe fn zipstorage_list_sbts( + ptr: *const SourmashZipStorage, + size: *mut usize, +) -> Result<*mut *mut SourmashStr> { + let storage = SourmashZipStorage::as_rust(ptr); + + let sbts = storage.list_sbts()?; + + // FIXME: use the ForeignObject trait, maybe define new method there... + let ptr_sigs: Vec<*mut SourmashStr> = sbts + .into_iter() + .map(|x| Box::into_raw(Box::new(SourmashStr::from_string(x))) as *mut SourmashStr) + .collect(); + + let b = ptr_sigs.into_boxed_slice(); + *size = b.len(); + + Ok(Box::into_raw(b) as *mut *mut SourmashStr) +} +} + +ffi_fn! { +unsafe fn zipstorage_filenames( + ptr: *const SourmashZipStorage, + size: *mut usize, +) -> Result<*mut *mut SourmashStr> { + let storage = SourmashZipStorage::as_rust(ptr); + + let files = storage.filenames()?; + + // FIXME: use the ForeignObject trait, maybe define new method there... + let ptr_sigs: Vec<*mut SourmashStr> = files + .into_iter() + .map(|x| Box::into_raw(Box::new(SourmashStr::from_string(x))) as *mut SourmashStr) + .collect(); + + let b = ptr_sigs.into_boxed_slice(); + *size = b.len(); + + Ok(Box::into_raw(b) as *mut *mut SourmashStr) +} +} + +ffi_fn! { +unsafe fn zipstorage_set_subdir( + ptr: *mut SourmashZipStorage, + path_ptr: *const c_char, + insize: usize, +) -> Result<()> { + let storage = SourmashZipStorage::as_rust_mut(ptr); + + let path = { + assert!(!path_ptr.is_null()); + let path = slice::from_raw_parts(path_ptr as *mut u8, insize); + std::str::from_utf8(path)? + }; + + storage.set_subdir(path.to_string()); + Ok(()) +} +} + +ffi_fn! { +unsafe fn zipstorage_path(ptr: *const SourmashZipStorage) -> Result { + let storage = SourmashZipStorage::as_rust(ptr); + + if let Some(ref path) = storage.path() { + Ok(path.clone().into()) + } else { + Ok("".into()) + } +} +} + +ffi_fn! { +unsafe fn zipstorage_subdir(ptr: *const SourmashZipStorage) -> Result { + let storage = SourmashZipStorage::as_rust(ptr); + + if let Some(ref path) = storage.subdir() { + Ok(path.clone().into()) + } else { + Ok("".into()) + } +} +} diff --git a/src/core/src/storage.rs b/src/core/src/storage.rs index 598f47c7b6..a2269bd6e7 100644 --- a/src/core/src/storage.rs +++ b/src/core/src/storage.rs @@ -1,6 +1,8 @@ +use std::collections::BTreeMap; +use std::ffi::OsStr; use std::fs::{DirBuilder, File}; use std::io::{BufReader, BufWriter, Read, Write}; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::sync::{Arc, Mutex}; use serde::{Deserialize, Serialize}; @@ -46,6 +48,12 @@ impl Storage for InnerStorage { pub enum StorageError { #[error("Path can't be empty")] EmptyPathError, + + #[error("Path not found: {0}")] + PathNotFoundError(String), + + #[error("Error reading data from {0}")] + DataReadError(String), } #[derive(Serialize, Deserialize)] @@ -155,48 +163,80 @@ impl Storage for FSStorage { } } -pub struct ZipStorage<'a> { +#[ouroboros::self_referencing] +pub struct ZipStorage { mapping: Option, - archive: Option>, - //metadata: piz::read::DirectoryContents<'a>, -} -fn load_from_archive<'a>(archive: &'a piz::ZipArchive<'a>, path: &str) -> Result, Error> { - use piz::read::FileTree; + #[borrows(mapping)] + #[covariant] + archive: piz::ZipArchive<'this>, - // FIXME error - let tree = piz::read::as_tree(archive.entries()).map_err(|_| StorageError::EmptyPathError)?; - // FIXME error - let entry = tree - .lookup(path) - .map_err(|_| StorageError::EmptyPathError)?; + subdir: Option, + path: Option, - // FIXME error - let mut reader = BufReader::new( - archive - .read(entry) - .map_err(|_| StorageError::EmptyPathError)?, - ); - let mut contents = Vec::new(); - reader.read_to_end(&mut contents)?; + #[borrows(archive)] + #[covariant] + metadata: Metadata<'this>, +} - Ok(contents) +pub type Metadata<'a> = BTreeMap<&'a OsStr, &'a piz::read::FileMetadata<'a>>; + +fn lookup<'a, P: AsRef>( + metadata: &'a Metadata, + path: P, +) -> Result<&'a piz::read::FileMetadata<'a>, Error> { + let path = path.as_ref(); + metadata + .get(&path.as_os_str()) + .ok_or_else(|| StorageError::PathNotFoundError(path.to_str().unwrap().into()).into()) + .map(|entry| *entry) +} + +fn find_subdirs<'a>(archive: &'a piz::ZipArchive<'a>) -> Result, Error> { + let subdirs: Vec<_> = archive + .entries() + .iter() + .filter(|entry| entry.is_dir()) + .collect(); + if subdirs.len() == 1 { + Ok(Some( + subdirs[0] + .path + .to_str() + .expect("Error converting path") + .into(), + )) + } else { + Ok(None) + } } -impl<'a> Storage for ZipStorage<'a> { +impl Storage for ZipStorage { fn save(&self, _path: &str, _content: &[u8]) -> Result { unimplemented!(); } fn load(&self, path: &str) -> Result, Error> { - if let Some(archive) = &self.archive { - load_from_archive(archive, path) - } else { - //FIXME - let archive = piz::ZipArchive::new((&self.mapping.as_ref()).unwrap()) - .map_err(|_| StorageError::EmptyPathError)?; - load_from_archive(&archive, path) - } + let metadata = self.borrow_metadata(); + + let entry = lookup(metadata, path).or_else(|_| { + if let Some(subdir) = self.borrow_subdir() { + lookup(metadata, subdir.to_owned() + path) + .map_err(|_| StorageError::PathNotFoundError(path.into())) + } else { + Err(StorageError::PathNotFoundError(path.into())) + } + })?; + + let mut reader = BufReader::new( + self.borrow_archive() + .read(entry) + .map_err(|_| StorageError::DataReadError(path.into()))?, + ); + let mut contents = Vec::new(); + reader.read_to_end(&mut contents)?; + + Ok(contents) } fn args(&self) -> StorageArgs { @@ -204,40 +244,68 @@ impl<'a> Storage for ZipStorage<'a> { } } -impl<'a> ZipStorage<'a> { - pub fn new(location: &str) -> Result { +impl ZipStorage { + pub fn from_file(location: &str) -> Result { let zip_file = File::open(location)?; let mapping = unsafe { memmap2::Mmap::map(&zip_file)? }; - //FIXME - //let archive = piz::ZipArchive::new(&mapping).map_err(|_| StorageError::EmptyPathError)?; + let mut storage = ZipStorageBuilder { + mapping: Some(mapping), + archive_builder: |mapping: &Option| { + piz::ZipArchive::new(mapping.as_ref().unwrap()).unwrap() + }, + metadata_builder: |archive: &piz::ZipArchive| { + archive + .entries() + .iter() + .map(|entry| (entry.path.as_os_str(), entry)) + .collect() + }, + subdir: None, + path: Some(location.to_owned()), + } + .build(); - //FIXME - // let tree = - // piz::read::as_tree(archive.entries()).map_err(|_| StorageError::EmptyPathError)?; + let subdir = find_subdirs(storage.borrow_archive())?; + storage.with_mut(|fields| *fields.subdir = subdir); - Ok(Self { - mapping: Some(mapping), - archive: None, - //metadata: tree, - }) - } - - pub fn from_slice(mapping: &'a [u8]) -> Result { - //FIXME - let archive = piz::ZipArchive::new(mapping).map_err(|_| StorageError::EmptyPathError)?; - - //FIXME - //let entries: Vec<_> = archive.entries().iter().map(|x| x.to_owned()).collect(); - //let tree = - // piz::read::as_tree(entries.as_slice()).map_err(|_| StorageError::EmptyPathError)?; - - Ok(Self { - archive: Some(archive), - mapping: None, - /* metadata: archive - .as_tree() - .map_err(|_| StorageError::EmptyPathError)?, */ - }) + Ok(storage) + } + + pub fn path(&self) -> Option { + self.borrow_path().clone() + } + + pub fn subdir(&self) -> Option { + self.borrow_subdir().clone() + } + + pub fn set_subdir(&mut self, path: String) { + self.with_mut(|fields| *fields.subdir = Some(path)) + } + + pub fn list_sbts(&self) -> Result, Error> { + Ok(self + .borrow_archive() + .entries() + .iter() + .filter_map(|entry| { + let path = entry.path.to_str().expect("Error converting path"); + if path.ends_with(".sbt.json") { + Some(path.into()) + } else { + None + } + }) + .collect()) + } + + pub fn filenames(&self) -> Result, Error> { + Ok(self + .borrow_archive() + .entries() + .iter() + .map(|entry| entry.path.to_str().expect("Error converting path").into()) + .collect()) } } diff --git a/src/core/tests/storage.rs b/src/core/tests/storage.rs index 202ddd3fe3..5a60e02fcc 100644 --- a/src/core/tests/storage.rs +++ b/src/core/tests/storage.rs @@ -1,4 +1,3 @@ -use std::fs::File; use std::path::PathBuf; use sourmash::storage::{Storage, ZipStorage}; @@ -8,7 +7,7 @@ fn zipstorage_load_file() -> Result<(), Box> { let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); filename.push("../../tests/test-data/v6.sbt.zip"); - let zs = ZipStorage::new(filename.to_str().unwrap())?; + let zs = ZipStorage::from_file(filename.to_str().unwrap())?; let data = zs.load("v6.sbt.json")?; @@ -19,19 +18,27 @@ fn zipstorage_load_file() -> Result<(), Box> { } #[test] -fn zipstorage_load_slice() -> Result<(), Box> { +fn zipstorage_load_manifest() -> Result<(), Box> { let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - filename.push("../../tests/test-data/v6.sbt.zip"); + filename.push("../../tests/test-data/prot/protein.sbt.zip"); - let zip_file = File::open(filename)?; - let mapping = unsafe { memmap2::Mmap::map(&zip_file)? }; + let zs = ZipStorage::from_file(filename.to_str().unwrap())?; - let zs = ZipStorage::from_slice(&mapping)?; + let _data = zs.load("protein.manifest.csv").expect("error loading file"); - let data = zs.load("v6.sbt.json")?; + Ok(()) +} - let description: serde_json::Value = serde_json::from_slice(&data[..])?; - assert_eq!(description["version"], 6); +#[test] +fn zipstorage_list_sbts() -> Result<(), Box> { + let mut filename = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + filename.push("../../tests/test-data/v6.sbt.zip"); + + let zs = ZipStorage::from_file(filename.to_str().unwrap())?; + + let sbts = zs.list_sbts()?; + + assert_eq!(sbts.len(), 1); Ok(()) } diff --git a/src/sourmash/index/__init__.py b/src/sourmash/index/__init__.py index 60f1940c36..5fa521db66 100644 --- a/src/sourmash/index/__init__.py +++ b/src/sourmash/index/__init__.py @@ -554,7 +554,7 @@ def _load_manifest(self): "Load a manifest if one exists" try: manifest_data = self.storage.load('SOURMASH-MANIFEST.csv') - except KeyError: + except (KeyError, FileNotFoundError): self.manifest = None else: debug_literal(f'found manifest on load for {self.storage.path}') @@ -616,18 +616,16 @@ def _signatures_with_internal(self): Note: does not limit signatures to subsets. """ - zf = self.storage.zipfile - # list all the files, without using the Storage interface; currently, # 'Storage' does not provide a way to list all the files, so :shrug:. - for zipinfo in zf.infolist(): + for filename in self.storage._filenames(): # should we load this file? if it ends in .sig OR we are forcing: - if zipinfo.filename.endswith('.sig') or \ - zipinfo.filename.endswith('.sig.gz') or \ + if filename.endswith('.sig') or \ + filename.endswith('.sig.gz') or \ self.traverse_yield_all: - fp = zf.open(zipinfo) - for ss in load_signatures(fp): - yield ss, zipinfo.filename + sig_data = self.storage.load(filename) + for ss in load_signatures(sig_data): + yield ss, filename def signatures(self): "Load all signatures in the zip file." @@ -652,10 +650,10 @@ def signatures(self): # if no manifest here, break Storage class encapsulation # and go for all the files. (This is necessary to support # ad-hoc zipfiles that have no manifests.) - for zipinfo in storage.zipfile.infolist(): + for filename in storage._filenames(): # should we load this file? if it ends in .sig OR force: - if zipinfo.filename.endswith('.sig') or \ - zipinfo.filename.endswith('.sig.gz') or \ + if filename.endswith('.sig') or \ + filename.endswith('.sig.gz') or \ self.traverse_yield_all: if selection_dict: select = lambda x: select_signature(x, @@ -663,7 +661,7 @@ def signatures(self): else: select = lambda x: True - data = self.storage.load(zipinfo.filename) + data = self.storage.load(filename) for ss in load_signatures(data): if select(ss): yield ss diff --git a/src/sourmash/sbt.py b/src/sourmash/sbt.py index d974c23c91..bb001fd940 100644 --- a/src/sourmash/sbt.py +++ b/src/sourmash/sbt.py @@ -625,7 +625,7 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): kind = "Zip" if not path.endswith('.sbt.zip'): path += '.sbt.zip' - storage = ZipStorage(path) + storage = ZipStorage(path, mode="w") backend = "FSStorage" assert path[-8:] == '.sbt.zip' @@ -1440,6 +1440,8 @@ def convert_cmd(name, backend): backend = options.pop(0) backend = backend.lower().strip("'") + kwargs = {} + if options: print(options) options = options[0].split(')') @@ -1454,6 +1456,7 @@ def convert_cmd(name, backend): backend = RedisStorage elif backend.lower() in ('zip', 'zipstorage'): backend = ZipStorage + kwargs['mode'] = 'w' elif backend.lower() in ('fs', 'fsstorage'): backend = FSStorage if options: @@ -1469,6 +1472,6 @@ def convert_cmd(name, backend): else: error('backend not recognized: {}'.format(backend)) - with backend(*options) as storage: + with backend(*options, **kwargs) as storage: sbt = SBT.load(name, leaf_loader=SigLeaf.load) sbt.save(name, storage=storage) diff --git a/src/sourmash/sbt_storage.py b/src/sourmash/sbt_storage.py index 695832fa18..398e11c877 100644 --- a/src/sourmash/sbt_storage.py +++ b/src/sourmash/sbt_storage.py @@ -9,6 +9,11 @@ from abc import ABC from pathlib import Path +from ._lowlevel import ffi, lib +from .utils import RustObject, rustcall, decode_str +from .minhash import to_bytes + + class Storage(ABC): @abc.abstractmethod @@ -89,7 +94,110 @@ def load(self, path): return path.read_bytes() -class ZipStorage(Storage): +class ZipStorage(RustObject, Storage): + + __dealloc_func__ = lib.zipstorage_free + + def __init__(self, path, *, mode="r"): + if mode == "w": + self.__inner = _RwZipStorage(path) + else: + self.__inner = None + path = os.path.abspath(path) + self._objptr = rustcall(lib.zipstorage_new, to_bytes(path), len(path)) + + @staticmethod + def can_open(location): + return zipfile.is_zipfile(location) + + @property + def path(self): + if self.__inner: + return self.__inner.path + return decode_str(self._methodcall(lib.zipstorage_path)) + + @property + def subdir(self): + if self.__inner: + return self.__inner.subdir + return decode_str(self._methodcall(lib.zipstorage_subdir)) + + @subdir.setter + def subdir(self, value): + if self.__inner: + self.__inner.subdir = value + else: + self._methodcall(lib.zipstorage_set_subdir, to_bytes(value), len(value)) + + def _filenames(self): + if self.__inner: + return self.__inner._filenames() + + size = ffi.new("uintptr_t *") + paths_ptr = self._methodcall(lib.zipstorage_filenames, size) + size = size[0] + + paths = [] + for i in range(size): + path = decode_str(paths_ptr[i][0]) + paths.append(path) + + return paths + + def save(self, path, content, *, overwrite=False, compress=False): + if self.__inner: + return self.__inner.save(path, content, overwrite=overwrite, compress=compress) + raise NotImplementedError() + + def load(self, path): + if self.__inner: + return self.__inner.load(path) + + try: + size = ffi.new("uintptr_t *") + rawbuf = self._methodcall(lib.zipstorage_load, to_bytes(path), len(path), size) + size = size[0] + + rawbuf = ffi.gc(rawbuf, lambda o: lib.nodegraph_buffer_free(o, size), size) + buf = ffi.buffer(rawbuf, size) + + # TODO: maybe avoid the [:] here, it triggers a copy... + return buf[:] + except ValueError: + raise FileNotFoundError(path) + + def list_sbts(self): + if self.__inner: + return self.__inner.list_sbts() + + size = ffi.new("uintptr_t *") + paths_ptr = self._methodcall(lib.zipstorage_list_sbts, size) + size = size[0] + + paths = [] + for i in range(size): + path = decode_str(paths_ptr[i][0]) + paths.append(path) + + return paths + + def init_args(self): + return {'path': self.path} + + def flush(self): + if self.__inner: + self.__inner.flush() + + def close(self): + if self.__inner: + self.__inner.close() + + @staticmethod + def can_open(location): + return zipfile.is_zipfile(location) + + +class _RwZipStorage(Storage): def __init__(self, path): self.path = os.path.abspath(path) @@ -119,6 +227,9 @@ def __init__(self, path): if len(subdirs) == 1: self.subdir = subdirs[0] + def _filenames(self): + return [info.filename for info in self.zipfile.infolist()] + def _content_matches(self, zf, path, content): info = zf.getinfo(path) entry_content = zf.read(info) @@ -209,9 +320,6 @@ def load(self, path): else: raise FileNotFoundError(path) - def init_args(self): - return {'path': self.path} - def close(self): # TODO: this is not ideal; checking for zipfile.fp is looking at # internal implementation details from CPython... @@ -285,10 +393,6 @@ def flush(self, *, keep_closed=False): self.bufferzip.close() self.bufferzip = None - @staticmethod - def can_open(location): - return zipfile.is_zipfile(location) - def list_sbts(self): return [f for f in self.zipfile.namelist() if f.endswith(".sbt.json")] diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 764710a494..60cba7f43f 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -966,7 +966,7 @@ def open(self): if os.path.exists(self.location): do_create = False - storage = ZipStorage(self.location) + storage = ZipStorage(self.location, mode="w") if not storage.subdir: storage.subdir = 'signatures' diff --git a/tests/test_index.py b/tests/test_index.py index 35e31e0714..c0ee430ea2 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -941,8 +941,7 @@ def test_zipfile_API_signatures_traverse_yield_all(use_manifest): assert len(zipidx) == 8 # confirm that there are 12 files in there total, incl build.sh and dirs - zf = zipidx.storage.zipfile - allfiles = [ zi.filename for zi in zf.infolist() ] + allfiles = zipidx.storage._filenames() print(allfiles) assert len(allfiles) == 13 diff --git a/tests/test_sbt.py b/tests/test_sbt.py index cb5b043c91..d188cfa291 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -367,7 +367,7 @@ def test_sbt_zipstorage(tmpdir): old_result = {str(s.signature) for s in tree.find(search_obj, to_search.data)} print(*old_result, sep='\n') - with ZipStorage(str(tmpdir.join("tree.sbt.zip"))) as storage: + with ZipStorage(str(tmpdir.join("tree.sbt.zip")), mode="w") as storage: tree.save(str(tmpdir.join("tree.sbt.json")), storage=storage) with ZipStorage(str(tmpdir.join("tree.sbt.zip"))) as storage: diff --git a/tox.ini b/tox.ini index ec4576ea10..af5250fbee 100644 --- a/tox.ini +++ b/tox.ini @@ -95,7 +95,7 @@ deps = changedir = {toxinidir} commands = asv machine --yes - asv continuous latest HEAD + asv continuous latest HEAD {posargs} [testenv:docs] description = invoke sphinx-build to build the HTML docs