Skip to content

Commit

Permalink
BUG: Fix indexed/CMYK images
Browse files Browse the repository at this point in the history
  • Loading branch information
pubpub-zz committed Jul 29, 2023
1 parent 29a29fe commit ebe94c5
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 17 deletions.
66 changes: 49 additions & 17 deletions pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
from .errors import PdfReadError, PdfStreamError
from .generic import (
ArrayObject,
DecodedStreamObject,
DictionaryObject,
IndirectObject,
NullObject,
Expand Down Expand Up @@ -837,23 +838,43 @@ def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes:
if color_space == "/Indexed":
from .generic import TextStringObject

if isinstance(lookup, DecodedStreamObject):
lookup = lookup.get_data()
if isinstance(lookup, TextStringObject):
lookup = lookup.original_bytes
if isinstance(lookup, bytes):
try:
nb, conv, mode = { # type: ignore
"1": (0, "", ""),
"L": (1, "P", "L"),
"P": (0, "", ""),
"RGB": (3, "P", "RGB"),
"CMYK": (4, "P", "CMYK"),
}[_get_imagemode(base, 0, "")[0]]
except KeyError: # pragma: no cover
logger_warning(
f"Base {base} not coded please share the pdf file with pypdf dev team",
__name__,
if isinstance(lookup, str):
lookup = lookup.encode()
try:
nb, conv, mode = { # type: ignore
"1": (0, "", ""),
"L": (1, "P", "L"),
"P": (0, "", ""),
"RGB": (3, "P", "RGB"),
"CMYK": (4, "P", "CMYK"),
}[_get_imagemode(base, 0, "")[0]]
except KeyError: # pragma: no cover
logger_warning(
f"Base {base} not coded please share the pdf file with pypdf dev team",
__name__,
)
lookup = None
else:
if img.mode == "1":
colors_arr = [
lookup[x - nb : x] for x in range(nb, len(lookup), nb)
]
arr = b"".join(
[
b"".join(
[
colors_arr[1 if img.getpixel((x, y)) > 127 else 0]
for x in range(img.size[0])
]
)
for y in range(img.size[1])
]
)
lookup = None
img = Image.frombytes(mode, img.size, arr)
else:
img = img.convert(conv)
if len(lookup) != (hival + 1) * nb:
Expand All @@ -865,11 +886,22 @@ def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes:
# gray lookup does not work : it is converted to a similar RGB lookup
lookup = b"".join([bytes([b, b, b]) for b in lookup])
mode = "RGB"
# TODO: this is a workaround until PIL is able to process CMYK images
elif mode == "CMYK":
_rgb = []
for _c, _m, _y, _k in (
lookup[n : n + 4]
for n in range(0, 4 * (len(lookup) // 4), 4)
):
_r = int(255 * (1 - _c / 255) * (1 - _k / 255))
_g = int(255 * (1 - _m / 255) * (1 - _k / 255))
_b = int(255 * (1 - _y / 255) * (1 - _k / 255))
_rgb.append(bytes((_r, _g, _b)))
lookup = b"".join(_rgb)
mode = "RGB"
if lookup is not None:
img.putpalette(lookup, rawmode=mode)
else:
img.putpalette(lookup.get_data())
img = img.convert("L" if base == ColorSpaces.DEVICE_GRAY else "RGB")
img = img.convert("L" if base == ColorSpaces.DEVICE_GRAY else "RGB")
elif not isinstance(color_space, NullObject) and color_space[0] == "/ICCBased":
# see Table 66 - Additional Entries Specific to an ICC Profile
# Stream Dictionary
Expand Down
15 changes: 15 additions & 0 deletions tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,6 +513,21 @@ def test_index_lookup():
diff.size[0] * diff.size[1]
)
assert d < 0.001
# indexed CMYK images
# currently with a TODO as we convert the palette to RGB
url = "https://corpora.tika.apache.org/base/docs/govdocs1/972/972174.pdf"
name = "tika-972174.pdf"
reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
url_png = "https://github.com/py-pdf/pypdf/assets/4083478/56c93021-33cd-4387-ae13-5cbe7e673f42"
name_png = "usa.png"
refimg = Image.open(BytesIO(get_pdf_from_url(url_png, name=name_png)))
data = reader.pages[0].images["/Im3"]
# assert data.image.mode == "PA" but currently "RGBA"
diff = ImageChops.difference(data.image, refimg)
d = sqrt(
sum([(a * a + b * b + c * c + d * d) for a, b, c, d in diff.getdata()])
) / (diff.size[0] * diff.size[1])
assert d < 0.001


@pytest.mark.enable_socket()
Expand Down

0 comments on commit ebe94c5

Please sign in to comment.