Skip to content

Commit

Permalink
add ZipStorage, support loading tree from storage
Browse files Browse the repository at this point in the history
  • Loading branch information
luizirber committed Apr 24, 2020
1 parent bada50e commit f53ab67
Show file tree
Hide file tree
Showing 11 changed files with 258 additions and 31 deletions.
76 changes: 53 additions & 23 deletions sourmash/sbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,11 @@ def search_transcript(node, seq, threshold):
import os
from random import randint, random
import sys
from tempfile import NamedTemporaryFile

from deprecation import deprecated

from .sbt_storage import FSStorage, TarStorage, IPFSStorage, RedisStorage
from .sbt_storage import FSStorage, TarStorage, IPFSStorage, RedisStorage, ZipStorage
from .logging import error, notify, debug
from .index import Index
from .nodegraph import Nodegraph, extract_nodegraph_info, calc_expected_collisions
Expand All @@ -68,6 +69,7 @@ def search_transcript(node, seq, threshold):
'FSStorage': FSStorage,
'IPFSStorage': IPFSStorage,
'RedisStorage': RedisStorage,
'ZipStorage': ZipStorage,
}
NodePos = namedtuple("NodePos", ["pos", "node"])

Expand Down Expand Up @@ -649,10 +651,40 @@ def load(cls, location, leaf_loader=None, storage=None, print_version_warning=Tr
SBT
the SBT tree built from the description.
"""
dirname = os.path.dirname(os.path.abspath(location))
sbt_name = os.path.basename(location)
if sbt_name.endswith('.sbt.json'):
sbt_name = sbt_name[:-9]
tempfile = None
sbt_name = None
tree_data = None

## TODO: keep it more compatible with FSStorage,
## don't force tree.sbt.json

if storage is None:
if ZipStorage.can_open(location):
storage = ZipStorage(location)
elif TarStorage.can_open(location):
storage = TarStorage(location)

if storage is not None:
sbts = storage.list_sbts()
if len(sbts) != 1:
print("no SBT, or too many SBTs!")
else:
tree_data = storage.load(sbts[0])

if tree_data is not None:
tempfile = NamedTemporaryFile()

tempfile.write(tree_data)
tempfile.flush()

dirname = os.path.dirname(tempfile.name)
sbt_name = os.path.basename(tempfile.name)

if sbt_name is None:
dirname = os.path.dirname(os.path.abspath(location))
sbt_name = os.path.basename(location)
if sbt_name.endswith('.sbt.json'):
sbt_name = sbt_name[:-9]

loaders = {
1: cls._load_v1,
Expand All @@ -666,17 +698,26 @@ def load(cls, location, leaf_loader=None, storage=None, print_version_warning=Tr
leaf_loader = Leaf.load

sbt_fn = os.path.join(dirname, sbt_name)
if not sbt_fn.endswith('.sbt.json'):
if not sbt_fn.endswith('.sbt.json') and tempfile is None:
sbt_fn += '.sbt.json'
with open(sbt_fn) as fp:
jnodes = json.load(fp)

if tempfile is not None:
tempfile.close()

version = 1
if isinstance(jnodes, Mapping):
version = jnodes['version']

if version < 3 and storage is None:
storage = FSStorage(dirname, '.sbt.{}'.format(sbt_name))
elif storage is None:
klass = STORAGES[jnodes['storage']['backend']]
if jnodes['storage']['backend'] == "FSStorage":
storage = FSStorage(dirname, jnodes['storage']['args']['path'])
elif storage is None:
storage = klass(**jnodes['storage']['args'])

return loaders[version](jnodes, leaf_loader, dirname, storage,
print_version_warning)
Expand Down Expand Up @@ -756,12 +797,6 @@ def _load_v3(cls, info, leaf_loader, dirname, storage, print_version_warning=Tru
sbt_nodes = {}
sbt_leaves = {}

klass = STORAGES[info['storage']['backend']]
if info['storage']['backend'] == "FSStorage":
storage = FSStorage(dirname, info['storage']['args']['path'])
elif storage is None:
storage = klass(**info['storage']['args'])

factory = GraphFactory(*info['factory']['args'])

max_node = 0
Expand Down Expand Up @@ -803,12 +838,6 @@ def _load_v4(cls, info, leaf_loader, dirname, storage, print_version_warning=Tru
sbt_nodes = {}
sbt_leaves = {}

klass = STORAGES[info['storage']['backend']]
if info['storage']['backend'] == "FSStorage":
storage = FSStorage(dirname, info['storage']['args']['path'])
elif storage is None:
storage = klass(**info['storage']['args'])

factory = GraphFactory(*info['factory']['args'])

max_node = 0
Expand Down Expand Up @@ -844,11 +873,12 @@ def _load_v5(cls, info, leaf_loader, dirname, storage, print_version_warning=Tru
sbt_nodes = {}
sbt_leaves = {}

klass = STORAGES[info['storage']['backend']]
if info['storage']['backend'] == "FSStorage":
storage = FSStorage(dirname, info['storage']['args']['path'])
elif storage is None:
storage = klass(**info['storage']['args'])
if storage is None:
klass = STORAGES[info['storage']['backend']]
if info['storage']['backend'] == "FSStorage":
storage = FSStorage(dirname, info['storage']['args']['path'])
elif storage is None:
storage = klass(**info['storage']['args'])

factory = GraphFactory(*info['factory']['args'])

Expand Down
79 changes: 78 additions & 1 deletion sourmash/sbt_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from io import BytesIO
import os
import tarfile
import zipfile


class Storage(abc.ABCMeta(str('ABC'), (object,), {'__slots__': ()})):
Expand All @@ -27,6 +28,9 @@ def __enter__(self):
def __exit__(self, type, value, traceback):
pass

def can_open(self, location):
    """Report whether this storage backend can open ``location``.

    The base implementation always answers ``False``, whatever
    ``location`` is.
    """
    return False


class FSStorage(Storage):

Expand Down Expand Up @@ -76,6 +80,15 @@ def __init__(self, path=None):
else:
self.tarfile = tarfile.open(path, 'w:gz')

self.subdir = None
try:
# TODO: check more than one dir?
subdirs = next(f for f in self.tarfile.getmembers() if f.isdir())
except StopIteration:
pass
else:
self.subdir = subdirs.name

def save(self, path, content):
info = tarfile.TarInfo(path)
info.size = len(content)
Expand All @@ -86,7 +99,10 @@ def save(self, path, content):
return path

def load(self, path):
    """Return the raw bytes stored in the archive under ``path``.

    ``path`` is looked up as given first; if the archive has no such
    member and a sub-directory was recorded in ``self.subdir``, the lookup
    is retried relative to that sub-directory.

    Raises
    ------
    KeyError
        If no matching member exists in the archive.
    """
    try:
        content = self.tarfile.getmember(path)
    except KeyError:
        if self.subdir is None:
            # Nothing to fall back to: surface the original KeyError
            # rather than failing while joining None with the path.
            raise
        # Member names inside a tar archive are '/'-separated regardless
        # of the host OS, so build the fallback name explicitly.
        nested = "{}/{}".format(self.subdir.rstrip("/"), path)
        content = self.tarfile.getmember(nested)
    f = self.tarfile.extractfile(content)
    return f.read()

Expand All @@ -96,6 +112,67 @@ def init_args(self):
def __exit__(self, type, value, traceback):
self.tarfile.close()

@staticmethod
def can_open(location):
try:
return tarfile.is_tarfile(location)
except IOError:
return False

def list_sbts(self):
    """Collect the archive member names that end in ``.sbt.json``.

    Names are returned in the order ``self.tarfile.getnames()`` yields
    them.
    """
    sbt_names = []
    for member_name in self.tarfile.getnames():
        if member_name.endswith(".sbt.json"):
            sbt_names.append(member_name)
    return sbt_names


class ZipStorage(Storage):
    """Storage backend that keeps its content inside a single zip archive.

    An archive that already exists at ``path`` is opened read-only; when
    nothing exists there yet, a new archive is created for writing with
    bzip2 compression. If the archive lists exactly one directory entry,
    it is remembered in ``self.subdir`` and used as a fallback prefix by
    ``load``.
    """

    def __init__(self, path=None):
        """Open (or create) the zip archive located at ``path``.

        Raises
        ------
        TypeError
            If ``path`` is None; an archive location is required.
        """
        # TODO: leave it open, or close/open every time?

        if path is None:
            # TODO: Open a temporary file?
            # Fail with an explicit message instead of the opaque
            # TypeError that os.path.abspath(None) would raise.
            raise TypeError("ZipStorage requires a path to a zip archive")

        self.path = os.path.abspath(path)

        dirname = os.path.dirname(self.path)
        if not os.path.exists(dirname):
            os.makedirs(dirname)

        if os.path.exists(self.path):
            self.zipfile = zipfile.ZipFile(path, 'r')
        else:
            self.zipfile = zipfile.ZipFile(path, mode='w',
                                           compression=zipfile.ZIP_BZIP2)

        # Directory entries in a zip listing are the names ending in "/".
        self.subdir = None
        subdirs = [f for f in self.zipfile.namelist() if f.endswith("/")]
        if len(subdirs) == 1:
            self.subdir = subdirs[0]

    def save(self, path, content):
        """Write ``content`` into the archive as member ``path``; return ``path``."""
        self.zipfile.writestr(path, content)
        return path

    def load(self, path):
        """Return the bytes stored under ``path``.

        The name is tried as given first, then relative to ``self.subdir``
        when one was detected.

        Raises
        ------
        KeyError
            If no matching member exists in the archive.
        """
        try:
            return self.zipfile.read(path)
        except KeyError:
            if self.subdir is None:
                # No sub-directory to retry under: re-raise the KeyError
                # rather than failing while joining None with the path.
                raise
            # self.subdir always ends with "/" (see __init__), and zip
            # member names are "/"-separated on every platform, so plain
            # concatenation builds the nested member name.
            return self.zipfile.read(self.subdir + path)

    def init_args(self):
        """Return the keyword arguments needed to re-create this storage."""
        return {'path': self.path}

    def __exit__(self, type, value, traceback):
        self.zipfile.close()

    @staticmethod
    def can_open(location):
        """Tell whether ``location`` is a zip archive."""
        return zipfile.is_zipfile(location)

    def list_sbts(self):
        """Return the archive member names that end in ``.sbt.json``."""
        return [f for f in self.zipfile.namelist() if f.endswith(".sbt.json")]


class IPFSStorage(Storage):

Expand Down
2 changes: 1 addition & 1 deletion sourmash/signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ def load_one_signature(data, ksize=None, select_moltype=None, ignore_md5sum=Fals
raise ValueError("expected to load exactly one signature")


def save_signatures(siglist, fp=None):
def save_signatures(siglist, fp=None, compressed=False):
"Save multiple signatures into a JSON string (or into file handle 'fp')"
attached_refs = weakref.WeakKeyDictionary()
collected = []
Expand Down
2 changes: 0 additions & 2 deletions src/core/src/ffi/signature.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@ use std::io;
use std::os::raw::c_char;
use std::slice;

use serde_json;

use crate::cmd::ComputeParameters;
use crate::ffi::utils::SourmashStr;
use crate::signature::Signature;
Expand Down
2 changes: 2 additions & 0 deletions src/core/src/signature.rs
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,8 @@ impl Signature {
where
R: io::Read,
{
let (rdr, _format) = niffler::get_reader(Box::new(rdr))?;

let sigs: Vec<Signature> = serde_json::from_reader(rdr)?;
Ok(sigs)
}
Expand Down
1 change: 0 additions & 1 deletion src/core/src/sketch/nodegraph.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ use std::slice;
use byteorder::{BigEndian, ByteOrder, LittleEndian, ReadBytesExt, WriteBytesExt};
use failure::Error;
use fixedbitset::FixedBitSet;
use primal_check;

use crate::index::sbt::Update;
use crate::sketch::minhash::KmerMinHash;
Expand Down
Binary file added tests/test-data/genome-s10+s11.sig.gz
Binary file not shown.
Binary file added tests/test-data/v5.tar.gz
Binary file not shown.
Binary file added tests/test-data/v5.zip
Binary file not shown.
Loading

0 comments on commit f53ab67

Please sign in to comment.