Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support index creation for unseekable file objects #103

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion indexed_gzip/indexed_gzip.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,11 @@ class IndexedGzipFile(io.BufferedReader):
``io.BufferedReader.__init__``. If not provided,
a default value of 4 * spacing is used if spacing
is given else 4 MiB is used.

:arg compressed_size: Size of the compressed data. If not provided,
will be determined by calling ``seek`` and
``tell``. Must be provided for unseekable
file-likes.
"""

# Use 4x spacing because each raw read seeks from the last index point
Expand Down Expand Up @@ -297,7 +302,8 @@ cdef class _IndexedGzipFile:
readall_buf_size=16777216,
drop_handles=True,
index_file=None,
skip_crc_check=False):
skip_crc_check=False,
compressed_size=0):
"""Create an ``_IndexedGzipFile``. The file may be specified either
with an open file handle (``fileobj``), or with a ``filename``. If the
former, the file is assumed to have been opened for reading in binary
Expand Down Expand Up @@ -342,6 +348,11 @@ cdef class _IndexedGzipFile:
:arg index_file: Pre-generated index for this ``gz`` file -
if provided, passed through to
:meth:`import_index`.

:arg compressed_size: Size of the compressed data. If not provided,
will be determined by calling ``seek`` and
``tell``. Must be provided for unseekable
file-likes.
"""

cdef FILE *fd = NULL
Expand Down Expand Up @@ -419,6 +430,7 @@ cdef class _IndexedGzipFile:
spacing=spacing,
window_size=window_size,
readbuf_size=readbuf_size,
compressed_size=compressed_size,
flags=flags):
raise ZranError('zran_init returned error (file: '
'{})'.format(self.errname))
Expand Down
5 changes: 5 additions & 0 deletions indexed_gzip/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@ def __exit__(self, *a, **kwa):
return ctx()


def touch(path):
    """Create an empty file at ``path``.

    The file is opened in ``'wt'`` mode and immediately closed, so an
    existing file at ``path`` is truncated to zero length.

    :arg path: Path of the file to create.
    """
    # Opening for writing is enough to create/truncate the file; nothing
    # needs to be written, so the handle is not bound to a name.
    with open(path, 'wt'):
        pass


def poll(until):
"""Waits until ``until`` returns ``True``, printing out a message every
Expand Down
79 changes: 72 additions & 7 deletions indexed_gzip/tests/ctest_indexed_gzip.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,15 @@ import tempfile
import contextlib

import numpy as np
from io import BytesIO
from io import BytesIO, UnsupportedOperation
import pytest

import indexed_gzip as igzip

from . import gen_test_data
from . import check_data_valid
from . import tempdir
from . import touch
from . import compress

from libc.stdio cimport (SEEK_SET,
Expand Down Expand Up @@ -804,7 +805,7 @@ def test_get_index_seek_points():
# make a test file
data = np.arange(spacing, dtype=np.uint64)
with gzip.open(fname, 'wb') as f:
f.write(data.tostring())
f.write(data.tobytes())

# check points before and after index creation
with igzip._IndexedGzipFile(fname, spacing=spacing) as f:
Expand All @@ -828,34 +829,51 @@ def test_import_export_index():
idxfname = op.join(td, 'test.gzidx')

# make a test file
data = np.arange(65536, dtype=np.uint64)
data = np.arange(524288, dtype=np.uint64)
with gzip.open(fname, 'wb') as f:
f.write(data.tostring())
f.write(data.tobytes())

# generate an index file
with igzip._IndexedGzipFile(fname) as f:
with igzip._IndexedGzipFile(fname, spacing=131072) as f:
f.build_full_index()
points = list(f.seek_points())
f.export_index(idxfname)

# Check that index file works via __init__
with igzip._IndexedGzipFile(fname, index_file=idxfname) as f:
f.seek(65535 * 8)
val = np.frombuffer(f.read(8), dtype=np.uint64)
assert val[0] == 65535
assert points == list(f.seek_points())

# Check that index file works via import_index
with igzip._IndexedGzipFile(fname) as f:
f.import_index(idxfname)
f.seek(65535 * 8)
val = np.frombuffer(f.read(8), dtype=np.uint64)
assert val[0] == 65535
assert points == list(f.seek_points())


def test_import_export_index_open_file():

with tempdir() as td:
fname = op.join(td, 'test.gz')
idxfname = op.join(td, 'test.gzidx')

# make a test file
data = np.arange(524288, dtype=np.uint64)
with gzip.open(fname, 'wb') as f:
f.write(data.tobytes())

# generate an index file from open file handle
with igzip._IndexedGzipFile(fname) as f:
with igzip._IndexedGzipFile(fname, spacing=131072) as f:
f.build_full_index()
points = list(f.seek_points())

# Should raise if wrong permissions
with pytest.raises(ValueError):
touch(idxfname)
with open(idxfname, 'rb') as idxf:
f.export_index(fileobj=idxf)

Expand All @@ -877,17 +895,64 @@ def test_import_export_index():
f.seek(65535 * 8)
val = np.frombuffer(f.read(8), dtype=np.uint64)
assert val[0] == 65535
assert points == list(f.seek_points())

# Test exporting to / importing from a file-like object
idxf = BytesIO()
with igzip._IndexedGzipFile(fname) as f:
with igzip._IndexedGzipFile(fname, spacing=131072) as f:
f.build_full_index()
f.export_index(fileobj=idxf)
points = list(f.seek_points())
idxf.seek(0)
with igzip._IndexedGzipFile(fname) as f:
f.import_index(fileobj=idxf)
f.seek(65535 * 8)
val = np.frombuffer(f.read(8), dtype=np.uint64)
assert val[0] == 65535
assert points == list(f.seek_points())


def test_build_index_from_unseekable():
    """Build an index from an unseekable file object, then reuse it.

    A gzip file is loaded into a ``BytesIO`` whose ``seek``/``tell`` are
    replaced with functions that raise ``OSError`` and whose ``seekable``
    reports ``False``. A full index is built and exported from that object.
    The original methods are then restored, and the exported index is
    loaded via ``index_file`` to check that seeking/reading works and that
    the seek points match those produced while building the index.
    """
    with tempdir() as td:
        fname = op.join(td, 'test.gz')
        idxfname = op.join(td, 'test.gzidx')

        # make a test file
        data = np.arange(524288, dtype=np.uint64)
        with gzip.open(fname, 'wb') as f:
            # tobytes() rather than the deprecated tostring() alias,
            # consistent with the other tests in this module.
            f.write(data.tobytes())

        # Test creating the index when file is unseekable,
        # then using the index when file is seekable.
        with open(fname, 'rb') as f:
            b = f.read()

        # Deliberately a dummy value rather than len(b).
        nbytes = 0
        fileobj = BytesIO(b)

        def new_seek(*args, **kwargs):
            raise OSError()

        def new_tell(*args, **kwargs):
            raise OSError()

        # Keep the real methods so the object can be made seekable again.
        old_seek = fileobj.seek
        old_tell = fileobj.tell
        fileobj.seekable = lambda: False
        fileobj.seek = new_seek
        fileobj.tell = new_tell

        # generate an index file
        with igzip._IndexedGzipFile(fileobj,
                                    spacing=131072,
                                    compressed_size=nbytes) as f:
            f.build_full_index()
            f.export_index(idxfname)
            points = list(f.seek_points())

        # Restore seekability and rewind before re-using the file object.
        fileobj.seek = old_seek
        fileobj.tell = old_tell
        fileobj.seekable = lambda: True
        fileobj.seek(0)

        # Check that index file works via __init__
        with igzip._IndexedGzipFile(fileobj, index_file=idxfname) as f:
            f.seek(65535 * 8)
            val = np.frombuffer(f.read(8), dtype=np.uint64)
            assert val[0] == 65535
            assert points == list(f.seek_points())


def test_wrapper_class():
Expand Down
Loading