pdfminer · dhdaines · Aug 1, 2024 · Aug 1, 2024 · Aug 1, 2024 · Aug 1, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -22,6 +22,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 - `RecursionError` when corrupt PDF specifies a recursive /Pages object ([#998](https://github.com/pdfminer/pdfminer.six/pull/998))
 - `TypeError` when corrupt PDF specifies text-positioning operators with invalid values ([#1000](https://github.com/pdfminer/pdfminer.six/pull/1000))
 - inline image parsing fails when stream data contains "EI\n" ([#1008](https://github.com/pdfminer/pdfminer.six/issues/1008))
+- `TypeError` raised by `extract_text` when a PDF's xref stream uses integer field widths other than 1-4 or 8 bytes ([#886](https://github.com/pdfminer/pdfminer.six/issues/886))
 
 ### Removed
 

diff --git a/pdfminer/utils.py b/pdfminer/utils.py
@@ -3,7 +3,6 @@
 import io
 import pathlib
 import string
-import struct
 from html import escape
 from typing import (
     TYPE_CHECKING,
@@ -359,22 +358,12 @@ def choplist(n: int, seq: Iterable[_T]) -> Iterator[Tuple[_T, ...]]:
 
 
 def nunpack(s: bytes, default: int = 0) -> int:
-    """Unpacks 1 to 4 or 8 byte integers (big endian)."""
+    """Unpacks variable-length unsigned integers (big endian)."""
     length = len(s)
     if not length:
         return default
-    elif length == 1:
-        return ord(s)
-    elif length == 2:
-        return cast(int, struct.unpack(">H", s)[0])
-    elif length == 3:
-        return cast(int, struct.unpack(">L", b"\x00" + s)[0])
-    elif length == 4:
-        return cast(int, struct.unpack(">L", s)[0])
-    elif length == 8:
-        return cast(int, struct.unpack(">Q", s)[0])
     else:
-        raise PDFTypeError("invalid length: %d" % length)
+        return int.from_bytes(s, byteorder="big", signed=False)
 
 
 PDFDocEncoding = "".join(

diff --git a/samples/contrib/issue-886-xref-stream-widths.pdf b/samples/contrib/issue-886-xref-stream-widths.pdf
diff --git a/tests/test_highlevel_extracttext.py b/tests/test_highlevel_extracttext.py
@@ -55,6 +55,7 @@ def run_with_file(sample_path):
     "contrib/issue_566_test_2.pdf": "甲方：中国饮料有限公司（盖章）",
     "contrib/issue-625-identity-cmap.pdf": "Termin płatności: 2021-05-03",
     "contrib/issue-791-non-unicode-cmap.pdf": "Peněžní prostředky na účtech",
+    "contrib/issue-886-xref-stream-widths.pdf": "Hello",
 }
 
 
@@ -146,6 +147,12 @@ def test_issue_791_non_unicode_cmap(self):
         s = run_with_file(test_file)
         self.assertEqual(s.strip(), test_strings[test_file])
 
+    def test_issue_886_xref_stream_widths(self):
+        """Ensure that we can support arbitrary width integers in xref streams"""
+        test_file = "contrib/issue-886-xref-stream-widths.pdf"
+        s = run_with_file(test_file)
+        self.assertEqual(s.strip(), test_strings[test_file])
+
 
 class TestExtractPages(unittest.TestCase):
     def _get_test_file_path(self):