diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5425c5d3..1dbc7fd1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,6 +22,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 - `RecursionError` when corrupt PDF specifies a recursive /Pages object ([#998](https://github.com/pdfminer/pdfminer.six/pull/998))
 - `TypeError` when corrupt PDF specifies text-positioning operators with invalid values ([#1000](https://github.com/pdfminer/pdfminer.six/pull/1000))
 - inline image parsing fails when stream data contains "EI\n" ([#1008](https://github.com/pdfminer/pdfminer.six/issues/1008))
+- `TypeError` raised by extract_text method with compressed PDF file ([#886](https://github.com/pdfminer/pdfminer.six/issues/886))
 
 ### Removed
 
diff --git a/pdfminer/utils.py b/pdfminer/utils.py
index a5b53852..35c6d427 100644
--- a/pdfminer/utils.py
+++ b/pdfminer/utils.py
@@ -3,7 +3,6 @@
 import io
 import pathlib
 import string
-import struct
 from html import escape
 from typing import (
     TYPE_CHECKING,
@@ -359,22 +358,12 @@ def choplist(n: int, seq: Iterable[_T]) -> Iterator[Tuple[_T, ...]]:
 
 
 def nunpack(s: bytes, default: int = 0) -> int:
-    """Unpacks 1 to 4 or 8 byte integers (big endian)."""
+    """Unpacks variable-length unsigned integers (big endian)."""
     length = len(s)
     if not length:
         return default
-    elif length == 1:
-        return ord(s)
-    elif length == 2:
-        return cast(int, struct.unpack(">H", s)[0])
-    elif length == 3:
-        return cast(int, struct.unpack(">L", b"\x00" + s)[0])
-    elif length == 4:
-        return cast(int, struct.unpack(">L", s)[0])
-    elif length == 8:
-        return cast(int, struct.unpack(">Q", s)[0])
     else:
-        raise PDFTypeError("invalid length: %d" % length)
+        return int.from_bytes(s, byteorder="big", signed=False)
 
 
 PDFDocEncoding = "".join(
diff --git a/samples/contrib/issue-886-xref-stream-widths.pdf b/samples/contrib/issue-886-xref-stream-widths.pdf
new file mode 100644
index 00000000..eb8e3fd0
Binary files /dev/null and b/samples/contrib/issue-886-xref-stream-widths.pdf differ
diff --git a/tests/test_highlevel_extracttext.py b/tests/test_highlevel_extracttext.py
index 454b5fca..6bd3a9c4 100644
--- a/tests/test_highlevel_extracttext.py
+++ b/tests/test_highlevel_extracttext.py
@@ -55,6 +55,7 @@ def run_with_file(sample_path):
     "contrib/issue_566_test_2.pdf": "甲方:中国饮料有限公司(盖章)",
     "contrib/issue-625-identity-cmap.pdf": "Termin płatności: 2021-05-03",
     "contrib/issue-791-non-unicode-cmap.pdf": "Peněžní prostředky na účtech",
+    "contrib/issue-886-xref-stream-widths.pdf": "Hello",
 }
 
 
@@ -146,6 +147,12 @@ def test_issue_791_non_unicode_cmap(self):
         s = run_with_file(test_file)
         self.assertEqual(s.strip(), test_strings[test_file])
 
+    def test_issue_886_xref_stream_widths(self):
+        """Ensure that we can support arbitrary width integers in xref streams"""
+        test_file = "contrib/issue-886-xref-stream-widths.pdf"
+        s = run_with_file(test_file)
+        self.assertEqual(s.strip(), test_strings[test_file])
+
 
 class TestExtractPages(unittest.TestCase):
     def _get_test_file_path(self):