py-pdf · MartinThoma · Jul 30, 2023 · Jul 29, 2023 · Jul 29, 2023
diff --git a/pypdf/filters.py b/pypdf/filters.py
@@ -57,6 +57,7 @@
 from .errors import PdfReadError, PdfStreamError
 from .generic import (
     ArrayObject,
+    DecodedStreamObject,
     DictionaryObject,
     IndirectObject,
     NullObject,
@@ -837,23 +838,43 @@ def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes:
         if color_space == "/Indexed":
             from .generic import TextStringObject
 
+            if isinstance(lookup, DecodedStreamObject):
+                lookup = lookup.get_data()
             if isinstance(lookup, TextStringObject):
                 lookup = lookup.original_bytes
-            if isinstance(lookup, bytes):
-                try:
-                    nb, conv, mode = {  # type: ignore
-                        "1": (0, "", ""),
-                        "L": (1, "P", "L"),
-                        "P": (0, "", ""),
-                        "RGB": (3, "P", "RGB"),
-                        "CMYK": (4, "P", "CMYK"),
-                    }[_get_imagemode(base, 0, "")[0]]
-                except KeyError:  # pragma: no cover
-                    logger_warning(
-                        f"Base {base} not coded please share the pdf file with pypdf dev team",
-                        __name__,
+            if isinstance(lookup, str):
+                lookup = lookup.encode()
+            try:
+                nb, conv, mode = {  # type: ignore
+                    "1": (0, "", ""),
+                    "L": (1, "P", "L"),
+                    "P": (0, "", ""),
+                    "RGB": (3, "P", "RGB"),
+                    "CMYK": (4, "P", "CMYK"),
+                }[_get_imagemode(base, 0, "")[0]]
+            except KeyError:  # pragma: no cover
+                logger_warning(
+                    f"Base {base} not coded please share the pdf file with pypdf dev team",
+                    __name__,
+                )
+                lookup = None
+            else:
+                if img.mode == "1":
+                    colors_arr = [
+                        lookup[x - nb : x] for x in range(nb, len(lookup), nb)
+                    ]
+                    arr = b"".join(
+                        [
+                            b"".join(
+                                [
+                                    colors_arr[1 if img.getpixel((x, y)) > 127 else 0]
+                                    for x in range(img.size[0])
+                                ]
+                            )
+                            for y in range(img.size[1])
+                        ]
                     )
-                    lookup = None
+                    img = Image.frombytes(mode, img.size, arr)
                 else:
                     img = img.convert(conv)
                     if len(lookup) != (hival + 1) * nb:
@@ -865,11 +886,23 @@ def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes:
                         # gray lookup does not work : it is converted to a similar RGB lookup
                         lookup = b"".join([bytes([b, b, b]) for b in lookup])
                         mode = "RGB"
+                    # TODO: cf. https://github.com/py-pdf/pypdf/pull/2039
+                    # this is a workaround until PIL is able to process CMYK images
+                    elif mode == "CMYK":
+                        _rgb = []
+                        for _c, _m, _y, _k in (
+                            lookup[n : n + 4]
+                            for n in range(0, 4 * (len(lookup) // 4), 4)
+                        ):
+                            _r = int(255 * (1 - _c / 255) * (1 - _k / 255))
+                            _g = int(255 * (1 - _m / 255) * (1 - _k / 255))
+                            _b = int(255 * (1 - _y / 255) * (1 - _k / 255))
+                            _rgb.append(bytes((_r, _g, _b)))
+                        lookup = b"".join(_rgb)
+                        mode = "RGB"
                     if lookup is not None:
                         img.putpalette(lookup, rawmode=mode)
-            else:
-                img.putpalette(lookup.get_data())
-            img = img.convert("L" if base == ColorSpaces.DEVICE_GRAY else "RGB")
+                img = img.convert("L" if base == ColorSpaces.DEVICE_GRAY else "RGB")
         elif not isinstance(color_space, NullObject) and color_space[0] == "/ICCBased":
             # see Table 66 - Additional Entries Specific to an ICC Profile
             # Stream Dictionary

diff --git a/tests/test_filters.py b/tests/test_filters.py
@@ -513,6 +513,21 @@ def test_index_lookup():
         diff.size[0] * diff.size[1]
     )
     assert d < 0.001
+    # indexed CMYK images
+    # currently with a TODO, as we convert the palette to RGB
+    url = "https://corpora.tika.apache.org/base/docs/govdocs1/972/972174.pdf"
+    name = "tika-972174.pdf"
+    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+    url_png = "https://github.com/py-pdf/pypdf/assets/4083478/56c93021-33cd-4387-ae13-5cbe7e673f42"
+    name_png = "usa.png"
+    refimg = Image.open(BytesIO(get_pdf_from_url(url_png, name=name_png)))
+    data = reader.pages[0].images["/Im3"]
+    # assert data.image.mode == "PA" but currently "RGBA"
+    diff = ImageChops.difference(data.image, refimg)
+    d = sqrt(
+        sum([(a * a + b * b + c * c + d * d) for a, b, c, d in diff.getdata()])
+    ) / (diff.size[0] * diff.size[1])
+    assert d < 0.001
 
 
 @pytest.mark.enable_socket()