Do not crash on ASCII85 in inline images and properly support their colorspaces (#1010)
pdfminer · Jul 15, 2024 · 1a8bd2f · 1a8bd2f
1 parent 88139ad
commit 1a8bd2f
Show file tree

Hide file tree

Showing 7 changed files with 39 additions and 6 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 - `ValueError` when corrupt PDF specifies an invalid mediabox ([#987](https://github.com/pdfminer/pdfminer.six/pull/987))
 - `RecursionError` when corrupt PDF specifies a recursive /Pages object ([#998](https://github.com/pdfminer/pdfminer.six/pull/998))
 - `TypeError` when corrupt PDF specifies text-positioning operators with invalid values ([#1000](https://github.com/pdfminer/pdfminer.six/pull/1000))
+- inline image parsing fails when stream data contains "EI\n" ([#1008](https://github.com/pdfminer/pdfminer.six/issues/1008))
 
 ### Removed
 

diff --git a/pdfminer/image.py b/pdfminer/image.py
@@ -16,6 +16,8 @@
     LITERAL_DEVICE_CMYK,
     LITERAL_DEVICE_GRAY,
     LITERAL_DEVICE_RGB,
+    LITERAL_INLINE_DEVICE_GRAY,
+    LITERAL_INLINE_DEVICE_RGB,
 )
 from pdfminer.pdfexceptions import PDFValueError
 from pdfminer.pdftypes import (
@@ -125,10 +127,16 @@ def export_image(self, image: LTImage) -> str:
         elif image.bits == 1:
             name = self._save_bmp(image, width, height, (width + 7) // 8, image.bits)
 
-        elif image.bits == 8 and LITERAL_DEVICE_RGB in image.colorspace:
+        elif image.bits == 8 and (
+            LITERAL_DEVICE_RGB in image.colorspace
+            or LITERAL_INLINE_DEVICE_RGB in image.colorspace
+        ):
             name = self._save_bmp(image, width, height, width * 3, image.bits * 3)
 
-        elif image.bits == 8 and LITERAL_DEVICE_GRAY in image.colorspace:
+        elif image.bits == 8 and (
+            LITERAL_DEVICE_GRAY in image.colorspace
+            or LITERAL_INLINE_DEVICE_GRAY in image.colorspace
+        ):
             name = self._save_bmp(image, width, height, width, image.bits)
 
         elif len(filters) == 1 and filters[0][0] in LITERALS_FLATE_DECODE:

diff --git a/pdfminer/pdfcolor.py b/pdfminer/pdfcolor.py
@@ -6,6 +6,10 @@
 LITERAL_DEVICE_GRAY = LIT("DeviceGray")
 LITERAL_DEVICE_RGB = LIT("DeviceRGB")
 LITERAL_DEVICE_CMYK = LIT("DeviceCMYK")
+# Abbreviations for inline images
+LITERAL_INLINE_DEVICE_GRAY = LIT("G")
+LITERAL_INLINE_DEVICE_RGB = LIT("RGB")
+LITERAL_INLINE_DEVICE_CMYK = LIT("CMYK")
 
 
 class PDFColorSpace:

diff --git a/pdfminer/pdfinterp.py b/pdfminer/pdfinterp.py
@@ -19,6 +19,7 @@
 )
 from pdfminer.pdfpage import PDFPage
 from pdfminer.pdftypes import (
+    LITERALS_ASCII85_DECODE,
     PDFObjRef,
     PDFStream,
     dict_value,
@@ -331,11 +332,21 @@ def do_keyword(self, pos: int, token: PSKeyword) -> None:
                 if len(objs) % 2 != 0:
                     error_msg = f"Invalid dictionary construct: {objs!r}"
                     raise PSTypeError(error_msg)
-                d = {literal_name(k): v for (k, v) in choplist(2, objs)}
-                (pos, data) = self.get_inline_data(pos + len(b"ID "))
+                d = {literal_name(k): resolve1(v) for (k, v) in choplist(2, objs)}
+                eos = b"EI"
+                filter = d.get("F", None)
+                if filter is not None:
+                    if isinstance(filter, PSLiteral):
+                        filter = [filter]
+                    if filter[0] in LITERALS_ASCII85_DECODE:
+                        eos = b"~>"
+                (pos, data) = self.get_inline_data(pos + len(b"ID "), target=eos)
+                if eos != b"EI":  # it may be necessary for decoding
+                    data += eos
                 obj = PDFStream(d, data)
                 self.push((pos, obj))
-                self.push((pos, self.KEYWORD_EI))
+                if eos == b"EI":  # otherwise it is still in the stream
+                    self.push((pos, self.KEYWORD_EI))
             except PSTypeError:
                 if settings.STRICT:
                     raise

diff --git a/pdfminer/psparser.py b/pdfminer/psparser.py
@@ -450,7 +450,9 @@ def _parse_string_1(self, s: bytes, i: int) -> int:
             return i + 1
 
         elif self.oct:
-            self._curtoken += bytes((int(self.oct, 8),))
+            chrcode = int(self.oct, 8)
+            assert chrcode < 256, "Invalid octal %s (%d)" % (repr(self.oct), chrcode)
+            self._curtoken += bytes((chrcode,))
             self._parse1 = self._parse_string
             return i
 

diff --git a/samples/contrib/issue-1008-inline-ascii85.pdf b/samples/contrib/issue-1008-inline-ascii85.pdf
diff --git a/tests/test_tools_pdf2txt.py b/tests/test_tools_pdf2txt.py
@@ -184,3 +184,10 @@ def test_contrib_issue_495_pdfobjref(self):
         filepath = absolute_sample_path("contrib/issue_495_pdfobjref.pdf")
         image_files = self.extract_images(filepath)
         assert image_files[0].endswith("jpg")
+
+    def test_contrib_issue_1008_inline(self):
+        """Test for parsing and extracting inline images"""
+        filepath = absolute_sample_path("contrib/issue-1008-inline-ascii85.pdf")
+        image_files = self.extract_images(filepath)
+        assert len(image_files) == 23
+        assert all(x.endswith(".bmp") for x in image_files)