From ebe94c5934bf55264b1ffa1e7c37a5f91af21542 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 29 Jul 2023 19:15:17 +0200 Subject: [PATCH 1/2] BUG: Fix indexed/CMYK images closes #2030 --- pypdf/filters.py | 66 ++++++++++++++++++++++++++++++++----------- tests/test_filters.py | 15 ++++++++++ 2 files changed, 64 insertions(+), 17 deletions(-) diff --git a/pypdf/filters.py b/pypdf/filters.py index 39e66b466..e86ce5461 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -57,6 +57,7 @@ from .errors import PdfReadError, PdfStreamError from .generic import ( ArrayObject, + DecodedStreamObject, DictionaryObject, IndirectObject, NullObject, @@ -837,23 +838,43 @@ def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes: if color_space == "/Indexed": from .generic import TextStringObject + if isinstance(lookup, DecodedStreamObject): + lookup = lookup.get_data() if isinstance(lookup, TextStringObject): lookup = lookup.original_bytes - if isinstance(lookup, bytes): - try: - nb, conv, mode = { # type: ignore - "1": (0, "", ""), - "L": (1, "P", "L"), - "P": (0, "", ""), - "RGB": (3, "P", "RGB"), - "CMYK": (4, "P", "CMYK"), - }[_get_imagemode(base, 0, "")[0]] - except KeyError: # pragma: no cover - logger_warning( - f"Base {base} not coded please share the pdf file with pypdf dev team", - __name__, + if isinstance(lookup, str): + lookup = lookup.encode() + try: + nb, conv, mode = { # type: ignore + "1": (0, "", ""), + "L": (1, "P", "L"), + "P": (0, "", ""), + "RGB": (3, "P", "RGB"), + "CMYK": (4, "P", "CMYK"), + }[_get_imagemode(base, 0, "")[0]] + except KeyError: # pragma: no cover + logger_warning( + f"Base {base} not coded please share the pdf file with pypdf dev team", + __name__, + ) + lookup = None + else: + if img.mode == "1": + colors_arr = [ + lookup[x - nb : x] for x in range(nb, len(lookup), nb) + ] + arr = b"".join( + [ + b"".join( + [ + colors_arr[1 if img.getpixel((x, y)) > 127 else 0] + for x in 
range(img.size[0]) + ] + ) + for y in range(img.size[1]) + ] ) - lookup = None + img = Image.frombytes(mode, img.size, arr) else: img = img.convert(conv) if len(lookup) != (hival + 1) * nb: @@ -865,11 +886,22 @@ def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes: # gray lookup does not work : it is converted to a similar RGB lookup lookup = b"".join([bytes([b, b, b]) for b in lookup]) mode = "RGB" + # TODO : this is a work around until PIL is able to process CMYK images + elif mode == "CMYK": + _rgb = [] + for _c, _m, _y, _k in ( + lookup[n : n + 4] + for n in range(0, 4 * (len(lookup) // 4), 4) + ): + _r = int(255 * (1 - _c / 255) * (1 - _k / 255)) + _g = int(255 * (1 - _m / 255) * (1 - _k / 255)) + _b = int(255 * (1 - _y / 255) * (1 - _k / 255)) + _rgb.append(bytes((_r, _g, _b))) + lookup = b"".join(_rgb) + mode = "RGB" if lookup is not None: img.putpalette(lookup, rawmode=mode) - else: - img.putpalette(lookup.get_data()) - img = img.convert("L" if base == ColorSpaces.DEVICE_GRAY else "RGB") + img = img.convert("L" if base == ColorSpaces.DEVICE_GRAY else "RGB") elif not isinstance(color_space, NullObject) and color_space[0] == "/ICCBased": # see Table 66 - Additional Entries Specific to an ICC Profile # Stream Dictionary diff --git a/tests/test_filters.py b/tests/test_filters.py index f1dc38baf..54afcfc0e 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -513,6 +513,21 @@ def test_index_lookup(): diff.size[0] * diff.size[1] ) assert d < 0.001 + # indexed CMYK images + # currently with a TODO as we convert to RGB the palette + url = "https://corpora.tika.apache.org/base/docs/govdocs1/972/972174.pdf" + name = "tika-972174.pdf" + reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name))) + url_png = "https://github.com/py-pdf/pypdf/assets/4083478/56c93021-33cd-4387-ae13-5cbe7e673f42" + name_png = "usa.png" + refimg = Image.open(BytesIO(get_pdf_from_url(url_png, name=name_png))) + data = reader.pages[0].images["/Im3"] + # assert 
data.image.mode == "PA" but currently "RGBA" + diff = ImageChops.difference(data.image, refimg) + d = sqrt( + sum([(a * a + b * b + c * c + d * d) for a, b, c, d in diff.getdata()]) + ) / (diff.size[0] * diff.size[1]) + assert d < 0.001 @pytest.mark.enable_socket() From bc73a42c16d29aead62ba31d5026902a7274b147 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Sat, 29 Jul 2023 23:23:46 +0200 Subject: [PATCH 2/2] add ref on TODO --- pypdf/filters.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pypdf/filters.py b/pypdf/filters.py index e86ce5461..f34e98200 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -886,7 +886,8 @@ def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes: # gray lookup does not work : it is converted to a similar RGB lookup lookup = b"".join([bytes([b, b, b]) for b in lookup]) mode = "RGB" - # TODO : this is a work around until PIL is able to process CMYK images + # TODO : cf https://github.com/py-pdf/pypdf/pull/2039 + # this is a work around until PIL is able to process CMYK images elif mode == "CMYK": _rgb = [] for _c, _m, _y, _k in (