From f9868197a3acf47fc775872ec06b31e8bed667f1 Mon Sep 17 00:00:00 2001 From: John Jolly Date: Thu, 21 Dec 2017 11:08:36 -0700 Subject: [PATCH 01/10] bpo-22908: Add seek and tell functionality to ZipExtFile - Added seek, tell, and seekable functions to ZipExtFile - Added internal variables to ZipExtFile that preserves original values in order to reset the zipfile to it's initial state - Raises io.UnsupportedOperation when accessed without a seekable file object - Could be optimized to seek within the _readbuffer --- Lib/zipfile.py | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 37ce3281e0928c..fb8a01ba782031 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -772,6 +772,17 @@ def __init__(self, fileobj, mode, zipinfo, decrypter=None, else: self._expected_crc = None + self._seekable = False + try: + if fileobj.seekable(): + self._orig_compress_start = fileobj.tell() + self._orig_compress_size = zipinfo.compress_size + self._orig_file_size = zipinfo.file_size + self._orig_start_crc = self._running_crc + self._seekable = True + except AttributeError: + pass + def __repr__(self): result = ['<%s.%s' % (self.__class__.__module__, self.__class__.__qualname__)] @@ -957,6 +968,47 @@ def close(self): finally: super().close() + def seekable(self): + return self._seekable + + def seek(self, offset, from_what = 0): + if not self._seekable: + raise io.UnsupportedOperation("underlying stream is not seekable") + curr_pos = self.tell() + new_pos = offset # Default seek from start of file + if from_what == 1: # Seek from current position + new_pos = curr_pos + offset + elif from_what == 2: # Seek from EOF + new_pos = self._orig_file_size + offset + + if new_pos > self._orig_file_size: + new_pos = self._orig_file_size + + if new_pos < 0: + new_pos = 0 + + if new_pos > curr_pos: + self.read(new_pos - curr_pos) + elif new_pos < curr_pos: + self._fileobj.seek(self._orig_compress_start) + self._running_crc = self._orig_start_crc + self._compress_left = self._orig_compress_size + self._left = self._orig_file_size + self._readbuffer = b'' + self._offset = 0 + self._decompressor = zipfile._get_decompressor(self._compress_type) + self._eof = False + if new_pos > 0: + self.read(new_pos) + + return self.tell() + + def tell(self): + if not self._seekable: + raise io.UnsupportedOperation("underlying stream is not seekable") + filepos = self._orig_file_size - self._left - len(self._readbuffer) + self._offset + return filepos + class _ZipWriteFile(io.BufferedIOBase): def __init__(self, zf, zinfo, zip64): From 4f60ae5be43cbf1636dc5247ce103aef5af290e3 Mon Sep 17 00:00:00 2001 From: John Jolly Date: Thu, 21 Dec 2017 22:01:06 +0000 Subject: [PATCH 02/10] Added blurb --- .../next/Library/2017-12-21-22-00-11.bpo-22908.cVm89I.rst | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2017-12-21-22-00-11.bpo-22908.cVm89I.rst diff --git a/Misc/NEWS.d/next/Library/2017-12-21-22-00-11.bpo-22908.cVm89I.rst b/Misc/NEWS.d/next/Library/2017-12-21-22-00-11.bpo-22908.cVm89I.rst new file mode 100644 index 00000000000000..d37957b7c5e4f0 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2017-12-21-22-00-11.bpo-22908.cVm89I.rst @@ -0,0 +1,4 @@ +Added seek and rest to the ZipExtFile class. This only works if the file +object used to open the zipfile is seekable. This change is the first +iteration of this feature and can easily be optimized by considering the +_readbuffer. From 8cb03b02c13b559b6a8df5e35d1ad539c4c6e3f3 Mon Sep 17 00:00:00 2001 From: John Jolly Date: Fri, 22 Dec 2017 06:20:45 -0700 Subject: [PATCH 03/10] bpo-22908: Optimized backward seek still in _readbuffer --- Lib/zipfile.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index fb8a01ba782031..54dc84dbf7dc34 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -990,16 +990,20 @@ def seek(self, offset, from_what = 0): if new_pos > curr_pos: self.read(new_pos - curr_pos) elif new_pos < curr_pos: - self._fileobj.seek(self._orig_compress_start) - self._running_crc = self._orig_start_crc - self._compress_left = self._orig_compress_size - self._left = self._orig_file_size - self._readbuffer = b'' - self._offset = 0 - self._decompressor = zipfile._get_decompressor(self._compress_type) - self._eof = False - if new_pos > 0: - self.read(new_pos) + if self._offset >= curr_pos - new_pos: + # No need to reset if the new position is within the read buffer + self._offset -= curr_pos - new_pos + else: + self._fileobj.seek(self._orig_compress_start) + self._running_crc = self._orig_start_crc + self._compress_left = self._orig_compress_size + self._left = self._orig_file_size + self._readbuffer = b'' + self._offset = 0 + self._decompressor = zipfile._get_decompressor(self._compress_type) + self._eof = False + if new_pos > 0: + self.read(new_pos) return self.tell() From 384a9e389adffe680b1c2438ba4c2bc557eaa41f Mon Sep 17 00:00:00 2001 From: John Jolly Date: Sun, 24 Dec 2017 11:28:30 -0700 Subject: [PATCH 04/10] bpo-22908: Reads chunk of file during advancing seek pointer --- Lib/zipfile.py | 44 +++++++++++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 54dc84dbf7dc34..e8321403d9346f 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -740,6 +740,9 @@ class ZipExtFile(io.BufferedIOBase): # Read from compressed files in 4k blocks. MIN_READ_SIZE = 4096 + # Chunk size to read during seek + MAX_SEEK_READ = 1 << 24 + def __init__(self, fileobj, mode, zipinfo, decrypter=None, close_fileobj=False): self._fileobj = fileobj @@ -987,23 +990,30 @@ def seek(self, offset, from_what = 0): if new_pos < 0: new_pos = 0 - if new_pos > curr_pos: - self.read(new_pos - curr_pos) - elif new_pos < curr_pos: - if self._offset >= curr_pos - new_pos: - # No need to reset if the new position is within the read buffer - self._offset -= curr_pos - new_pos - else: - self._fileobj.seek(self._orig_compress_start) - self._running_crc = self._orig_start_crc - self._compress_left = self._orig_compress_size - self._left = self._orig_file_size - self._readbuffer = b'' - self._offset = 0 - self._decompressor = zipfile._get_decompressor(self._compress_type) - self._eof = False - if new_pos > 0: - self.read(new_pos) + read_offset = new_pos - curr_pos + buff_offset = read_offset + self._offset + + if buff_offset >= 0 and buff_offset < len(self._readbuffer): + # Just move the _offset index if the new position is in the _readbuffer + self._offset = buff_offset + read_offset = 0 + elif read_offset < 0: + # Position is before the current position. Reset the ZipExtFile + + self._fileobj.seek(self._orig_compress_start) + self._running_crc = self._orig_start_crc + self._compress_left = self._orig_compress_size + self._left = self._orig_file_size + self._readbuffer = b'' + self._offset = 0 + self._decompressor = zipfile._get_decompressor(self._compress_type) + self._eof = False + read_offset = new_pos + + while read_offset > 0: + read_len = min(MAX_SEEK_READ, read_offset) + self.read(read_len) + read_offset -= read_len return self.tell() From 418d4533bc8e0cd34be8e4f0454b018412cc6785 Mon Sep 17 00:00:00 2001 From: John Jolly Date: Sun, 24 Dec 2017 11:29:21 -0700 Subject: [PATCH 05/10] bpo-22908: Change seek from_what argument to whence --- Lib/zipfile.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index e8321403d9346f..79c208c644d0ea 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -974,14 +974,14 @@ def close(self): def seekable(self): return self._seekable - def seek(self, offset, from_what = 0): + def seek(self, offset, whence = 0): if not self._seekable: raise io.UnsupportedOperation("underlying stream is not seekable") curr_pos = self.tell() new_pos = offset # Default seek from start of file - if from_what == 1: # Seek from current position + if whence == 1: # Seek from current position new_pos = curr_pos + offset - elif from_what == 2: # Seek from EOF + elif whence == 2: # Seek from EOF new_pos = self._orig_file_size + offset if new_pos > self._orig_file_size: From 41536e4c684662919ed13260465de03485399dfa Mon Sep 17 00:00:00 2001 From: John Jolly Date: Sun, 21 Jan 2018 03:15:23 +0000 Subject: [PATCH 06/10] bpo-22908: Added test_seek_tell to Lib/test/test_zipfile.py --- Lib/test/test_zipfile.py | 34 ++++++++++++++++++++++++++++++++++ Lib/zipfile.py | 16 ++++++++++++++-- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_zipfile.py b/Lib/test/test_zipfile.py index 3bc867ea51c946..6e30b55e55560c 100644 --- a/Lib/test/test_zipfile.py +++ b/Lib/test/test_zipfile.py @@ -1596,6 +1596,40 @@ def test_open_conflicting_handles(self): self.assertEqual(zipf.read('baz'), msg3) self.assertEqual(zipf.namelist(), ['foo', 'bar', 'baz']) + def test_seek_tell(self): + # Test seek functionality + txt = b"Where's Bruce?" + bloc = txt.find(b"Bruce") + # Check seek on a file + with zipfile.ZipFile(TESTFN, "w") as zipf: + zipf.writestr("foo.txt", txt) + with zipfile.ZipFile(TESTFN, "r") as zipf: + with zipf.open("foo.txt", "r") as fp: + fp.seek(bloc, os.SEEK_SET) + self.assertEqual(fp.tell(), bloc) + fp.seek(-bloc, os.SEEK_CUR) + self.assertEqual(fp.tell(), 0) + fp.seek(bloc, os.SEEK_CUR) + self.assertEqual(fp.tell(), bloc) + self.assertEqual(fp.read(5), txt[bloc:bloc+5]) + fp.seek(0, os.SEEK_END) + self.assertEqual(fp.tell(), len(txt)) + # Check seek on memory file + data = io.BytesIO() + with zipfile.ZipFile(data, mode="w") as zipf: + zipf.writestr("foo.txt", txt) + with zipfile.ZipFile(data, mode="r") as zipf: + with zipf.open("foo.txt", "r") as fp: + fp.seek(bloc, os.SEEK_SET) + self.assertEqual(fp.tell(), bloc) + fp.seek(-bloc, os.SEEK_CUR) + self.assertEqual(fp.tell(), 0) + fp.seek(bloc, os.SEEK_CUR) + self.assertEqual(fp.tell(), bloc) + self.assertEqual(fp.read(5), txt[bloc:bloc+5]) + fp.seek(0, os.SEEK_END) + self.assertEqual(fp.tell(), len(txt)) + def tearDown(self): unlink(TESTFN) unlink(TESTFN2) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 79c208c644d0ea..82f45328132444 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -690,6 +690,18 @@ def __init__(self, file, pos, close, lock, writing): self._close = close self._lock = lock self._writing = writing + self.seekable = file.seekable + self.tell = file.tell + + def seek(self, offset, whence=0): + with self._lock: + if self.writing(): + raise ValueError("Can't reposition in the ZIP file while " + "there is an open writing handle on it. " + "Close the writing handle before trying to read.") + self._file.seek(self._pos) + self._pos = self._file.tell() + return self._pos def read(self, n=-1): with self._lock: @@ -974,7 +986,7 @@ def close(self): def seekable(self): return self._seekable - def seek(self, offset, whence = 0): + def seek(self, offset, whence=0): if not self._seekable: raise io.UnsupportedOperation("underlying stream is not seekable") curr_pos = self.tell() @@ -1011,7 +1023,7 @@ def seek(self, offset, whence = 0): read_offset = new_pos while read_offset > 0: - read_len = min(MAX_SEEK_READ, read_offset) + read_len = min(self.MAX_SEEK_READ, read_offset) self.read(read_len) read_offset -= read_len From 92847a31971dfa13356e870296bee3d8dc1d7e70 Mon Sep 17 00:00:00 2001 From: John Jolly Date: Sun, 21 Jan 2018 05:58:04 +0000 Subject: [PATCH 07/10] bpo-22908: Added seek and tell to list of functions made available to object returned by ZipFile.open --- Doc/library/zipfile.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Doc/library/zipfile.rst b/Doc/library/zipfile.rst index 5b8c776ed648cd..dc828bce11d1cc 100644 --- a/Doc/library/zipfile.rst +++ b/Doc/library/zipfile.rst @@ -230,9 +230,9 @@ ZipFile Objects With *mode* ``'r'`` the file-like object (``ZipExtFile``) is read-only and provides the following methods: :meth:`~io.BufferedIOBase.read`, :meth:`~io.IOBase.readline`, - :meth:`~io.IOBase.readlines`, :meth:`__iter__`, - :meth:`~iterator.__next__`. These objects can operate independently of - the ZipFile. + :meth:`~io.IOBase.readlines`, :meth:`~io.IOBase.seek`, + :meth:`~io.IOBase.tell`, :meth:`__iter__`, :meth:`~iterator.__next__`. + These objects can operate independently of the ZipFile. With ``mode='w'``, a writable file handle is returned, which supports the :meth:`~io.BufferedIOBase.write` method. While a writable file handle is open, From eca9b32c264a091cb5ae59de6e4fb991bbb42922 Mon Sep 17 00:00:00 2001 From: John Jolly Date: Sun, 21 Jan 2018 07:10:37 +0000 Subject: [PATCH 08/10] bpo-22908: Removed trailing whitespace in doc --- Doc/library/zipfile.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/zipfile.rst b/Doc/library/zipfile.rst index dc828bce11d1cc..23ab504d74bad9 100644 --- a/Doc/library/zipfile.rst +++ b/Doc/library/zipfile.rst @@ -230,7 +230,7 @@ ZipFile Objects With *mode* ``'r'`` the file-like object (``ZipExtFile``) is read-only and provides the following methods: :meth:`~io.BufferedIOBase.read`, :meth:`~io.IOBase.readline`, - :meth:`~io.IOBase.readlines`, :meth:`~io.IOBase.seek`, + :meth:`~io.IOBase.readlines`, :meth:`~io.IOBase.seek`, :meth:`~io.IOBase.tell`, :meth:`__iter__`, :meth:`~iterator.__next__`. These objects can operate independently of the ZipFile. From 1ba5aa936843dfd81e27acc81f1b5ff553d08106 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Mon, 29 Jan 2018 23:26:10 -0800 Subject: [PATCH 09/10] disallow whence values other than 0, 1, or 2 raises ValueError in that case. --- Lib/zipfile.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Lib/zipfile.py b/Lib/zipfile.py index 82f45328132444..02a3ea794e6ff9 100644 --- a/Lib/zipfile.py +++ b/Lib/zipfile.py @@ -990,11 +990,15 @@ def seek(self, offset, whence=0): if not self._seekable: raise io.UnsupportedOperation("underlying stream is not seekable") curr_pos = self.tell() - new_pos = offset # Default seek from start of file - if whence == 1: # Seek from current position + if whence == 0: # Seek from start of file + new_pos = offset + elif whence == 1: # Seek from current position new_pos = curr_pos + offset elif whence == 2: # Seek from EOF new_pos = self._orig_file_size + offset + else: + raise ValueError("whence must be os.SEEK_SET (0), " + "os.SEEK_CUR (1), or os.SEEK_END (2)") if new_pos > self._orig_file_size: new_pos = self._orig_file_size From e8af6fb02777a3da977b5f69056072f67d0f28c2 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Mon, 29 Jan 2018 23:30:38 -0800 Subject: [PATCH 10/10] simplify and correct NEWS wording. --- .../next/Library/2017-12-21-22-00-11.bpo-22908.cVm89I.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2017-12-21-22-00-11.bpo-22908.cVm89I.rst b/Misc/NEWS.d/next/Library/2017-12-21-22-00-11.bpo-22908.cVm89I.rst index d37957b7c5e4f0..4f3cc0166019f1 100644 --- a/Misc/NEWS.d/next/Library/2017-12-21-22-00-11.bpo-22908.cVm89I.rst +++ b/Misc/NEWS.d/next/Library/2017-12-21-22-00-11.bpo-22908.cVm89I.rst @@ -1,4 +1,2 @@ -Added seek and rest to the ZipExtFile class. This only works if the file -object used to open the zipfile is seekable. This change is the first -iteration of this feature and can easily be optimized by considering the -_readbuffer. +Added seek and tell to the ZipExtFile class. This only works if the file +object used to open the zipfile is seekable.