Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] Remove TarStorage and replace usage with ZipStorage #1170

Merged
merged 2 commits into from
Aug 18, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions doc/storage.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,15 @@ Saves internal SBT data in a hidden directory near the SBT JSON description.
We used to create a tar file of JSON + hidden directory,
which requires extracting and using more disk space.

### TarStorage
### ZipStorage

Similar to FSStorage,
but saves the internal SBT data in a `tar` file.
but saves the internal SBT data in a `zip` file.
- Pros
* easy to distribute (two files)
* easy to distribute (one file)
- Cons
* still need to distribute and download everything
(you need the full tar file available locally to be able to use the SBT).
(you need the full zip file available locally to be able to use the SBT).

### IPFSStorage

Expand Down
7 changes: 3 additions & 4 deletions sourmash/sbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,13 +55,12 @@ def search_transcript(node, seq, threshold):
from cachetools import Cache

from .exceptions import IndexNotSupported
from .sbt_storage import FSStorage, TarStorage, IPFSStorage, RedisStorage, ZipStorage
from .sbt_storage import FSStorage, IPFSStorage, RedisStorage, ZipStorage
from .logging import error, notify, debug
from .index import Index
from .nodegraph import Nodegraph, extract_nodegraph_info, calc_expected_collisions

STORAGES = {
'TarStorage': TarStorage,
'FSStorage': FSStorage,
'IPFSStorage': IPFSStorage,
'RedisStorage': RedisStorage,
Expand Down Expand Up @@ -1310,8 +1309,8 @@ def convert_cmd(name, backend):
backend = IPFSStorage
elif backend.lower() in ('redis', 'redisstorage'):
backend = RedisStorage
elif backend.lower() in ('tar', 'tarstorage'):
backend = TarStorage
elif backend.lower() in ('zip', 'zipstorage'):
backend = ZipStorage
elif backend.lower() in ('fs', 'fsstorage'):
backend = FSStorage
if options:
Expand Down
41 changes: 0 additions & 41 deletions sourmash/sbt_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,47 +82,6 @@ def load(self, path):
return path.read_bytes()


class TarStorage(Storage):

def __init__(self, path=None):
# TODO: leave it open, or close/open every time?

if path is None:
# TODO: Open a temporary file?
pass # CTB: should raise an exception, no?

self.path = os.path.abspath(path)

dirname = os.path.dirname(self.path)
if not os.path.exists(dirname):
os.makedirs(dirname)

if os.path.exists(self.path):
self.tarfile = tarfile.open(path, 'r')
else:
self.tarfile = tarfile.open(path, 'w:gz')

def save(self, path, content):
info = tarfile.TarInfo(path)
info.size = len(content)

# TODO: check tarfile mode, if read-only reopen as writable
self.tarfile.addfile(info, BytesIO(content))

return path

def load(self, path):
content = self.tarfile.getmember(path)
f = self.tarfile.extractfile(content)
return f.read()

def init_args(self):
return {'path': self.path}

def __exit__(self, type, value, traceback):
self.tarfile.close()


class ZipStorage(Storage):

def __init__(self, path):
Expand Down
37 changes: 1 addition & 36 deletions tests/test_sbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from sourmash.sbt import SBT, GraphFactory, Leaf, Node
from sourmash.sbtmh import (SigLeaf, search_minhashes,
search_minhashes_containment)
from sourmash.sbt_storage import (FSStorage, TarStorage, RedisStorage,
from sourmash.sbt_storage import (FSStorage, RedisStorage,
IPFSStorage, ZipStorage)

from . import sourmash_tst_utils as utils
Expand Down Expand Up @@ -333,41 +333,6 @@ def test_sbt_fsstorage():
assert os.path.exists(os.path.join(location, '.fstree'))


def test_sbt_tarstorage():
factory = GraphFactory(31, 1e5, 4)
with utils.TempDirectory() as location:
tree = SBT(factory)

for f in utils.SIG_FILES:
sig = load_one_signature(utils.get_test_data(f))

leaf = SigLeaf(os.path.basename(f), sig)
tree.add_node(leaf)
to_search = leaf

print('*' * 60)
print("{}:".format(to_search.metadata))
old_result = {str(s) for s in tree.find(search_minhashes,
to_search.data, 0.1)}
print(*old_result, sep='\n')

with TarStorage(os.path.join(location, 'tree.tar.gz')) as storage:
tree.save(os.path.join(location, 'tree'), storage=storage)

with TarStorage(os.path.join(location, 'tree.tar.gz')) as storage:
tree = SBT.load(os.path.join(location, 'tree'),
leaf_loader=SigLeaf.load,
storage=storage)

print('*' * 60)
print("{}:".format(to_search.metadata))
new_result = {str(s) for s in tree.find(search_minhashes,
to_search.data, 0.1)}
print(*new_result, sep='\n')

assert old_result == new_result


def test_sbt_zipstorage(tmpdir):
# create tree, save to a zip, then load and search.
factory = GraphFactory(31, 1e5, 4)
Expand Down
4 changes: 2 additions & 2 deletions tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -3893,8 +3893,8 @@ def test_storage_convert():
for (n1, n2) in zip(sorted(original), sorted(ipfs)))

args = ['storage', 'convert',
'-b', """'TarStorage("{}")'""".format(
os.path.join(location, 'v2.sbt.tar.gz')),
'-b', """'ZipStorage("{}")'""".format(
os.path.join(location, 'v2.sbt.zip')),
testsbt]
status, out, err = utils.runscript('sourmash', args,
in_directory=location)
Expand Down