From b2baf380fa04adbe784b62391d07bd6ed75dc0f1 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 23 May 2024 23:06:20 +0200 Subject: [PATCH 1/6] BUG: Fix images issue 4 bits encoding and LUT starting with UTF16_BOM closes #2660 --- pypdf/_writer.py | 9 +------- pypdf/_xobj_image_helpers.py | 2 +- pypdf/generic/_base.py | 42 +++++++++++++++++++++++++++++++----- pypdf/generic/_utils.py | 1 + tests/test_generic.py | 33 +++++++++++++++++++++------- tests/test_images.py | 22 +++++++++++++++++-- 6 files changed, 85 insertions(+), 24 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 9587b0493..dac7f2193 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -27,7 +27,6 @@ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. -import codecs import collections import decimal import enum @@ -182,13 +181,7 @@ def __init__( # info object info = DictionaryObject() - info.update( - { - NameObject("/Producer"): create_string_object( - codecs.BOM_UTF16_BE + "pypdf".encode("utf-16be") - ) - } - ) + info.update({NameObject("/Producer"): create_string_object("pypdf")}) self._info_obj: PdfObject = self._add_object(info) # root object diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index cc0123ff2..63698e3de 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -152,7 +152,7 @@ def _handle_flate( """ def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes: - mask = (2 << bits) - 1 + mask = (1 << bits) - 1 nbuff = bytearray(size[0] * size[1]) by = 0 bit = 8 - bits diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index bf6c75a15..8ea898ea4 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -508,6 +508,28 @@ class TextStringObject(str, PdfObject): # noqa: SLOT000 to occur. """ + autodetect_pdfdocencoding: bool + autodetect_utf16: bool + utf16_bom: bytes + + def __new__(cls, value: Any) -> "TextStringObject": + if isinstance(value, bytes): + value = value.decode("charmap") + o = str.__new__(cls, value) + o.autodetect_utf16 = False + o.autodetect_pdfdocencoding = False + o.utf16_bom = b"" + if value.startswith(("\xfe\xff", "\xff\xfe")): + o.autodetect_utf16 = True + o.utf16_bom = value[:2].encode("charmap") + else: + try: + encode_pdfdocencoding(o) + o.autodetect_pdfdocencoding = True + except UnicodeEncodeError: + o.autodetect_utf16 = True + return o + def clone( self, pdf_dest: Any, @@ -518,13 +540,11 @@ def clone( obj = TextStringObject(self) obj.autodetect_pdfdocencoding = self.autodetect_pdfdocencoding obj.autodetect_utf16 = self.autodetect_utf16 + obj.utf16_bom = self.utf16_bom return cast( "TextStringObject", self._reference_clone(obj, pdf_dest, force_duplicate) ) - autodetect_pdfdocencoding = False - autodetect_utf16 = False - @property def original_bytes(self) -> bytes: """ @@ -542,7 +562,12 @@ def get_original_bytes(self) -> bytes: # would have been used to create this object, based upon the autodetect # method. if self.autodetect_utf16: - return codecs.BOM_UTF16_BE + self.encode("utf-16be") + if self.utf16_bom == codecs.BOM_UTF16_LE: + return codecs.BOM_UTF16_LE + self.encode("utf-16le") + elif self.utf16_bom == codecs.BOM_UTF16_BE: + return codecs.BOM_UTF16_BE + self.encode("utf-16be") + else: + return self.encode("utf-16be") elif self.autodetect_pdfdocencoding: return encode_pdfdocencoding(self) else: @@ -553,9 +578,16 @@ def get_encoded_bytes(self) -> bytes: # nicer to look at in the PDF file. Sadly, we take a performance hit # here for trying... try: + if self.autodetect_utf16: + raise UnicodeEncodeError("", "forced", -1, -1, "") bytearr = encode_pdfdocencoding(self) except UnicodeEncodeError: - bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be") + if self.utf16_bom == codecs.BOM_UTF16_LE: + bytearr = codecs.BOM_UTF16_LE + self.encode("utf-16le") + elif self.utf16_bom == codecs.BOM_UTF16_BE: + bytearr = codecs.BOM_UTF16_BE + self.encode("utf-16be") + else: + bytearr = self.encode("utf-16be") return bytearr def write_to_stream( diff --git a/pypdf/generic/_utils.py b/pypdf/generic/_utils.py index e6da5cf09..049166f71 100644 --- a/pypdf/generic/_utils.py +++ b/pypdf/generic/_utils.py @@ -147,6 +147,7 @@ def create_string_object( if string.startswith((codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE)): retval = TextStringObject(string.decode("utf-16")) retval.autodetect_utf16 = True + retval.utf16_bom = string[:2] return retval else: # This is probably a big performance hit here, but we need diff --git a/tests/test_generic.py b/tests/test_generic.py index 24da063a2..2075f491b 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -1,5 +1,6 @@ """Test the pypdf.generic module.""" +import codecs from copy import deepcopy from io import BytesIO from pathlib import Path @@ -476,14 +477,13 @@ def test_rectangleobject(): def test_textstringobject_exc(): tso = TextStringObject("foo") - with pytest.raises(Exception) as exc: - tso.get_original_bytes() - assert exc.value.args[0] == "no information about original bytes" + assert tso.get_original_bytes() == b"foo" def test_textstringobject_autodetect_utf16(): tso = TextStringObject("foo") tso.autodetect_utf16 = True + tso.utf16_bom = codecs.BOM_UTF16_BE assert tso.get_original_bytes() == b"\xfe\xff\x00f\x00o\x00o" @@ -883,7 +883,7 @@ def test_annotation_builder_highlight(pdf_file_path): FloatObject(705.4493), ] ), - printing=False + printing=False, ) writer.add_annotation(0, highlight_annotation) for annot in writer.pages[0]["/Annots"]: @@ -910,7 +910,7 @@ def test_annotation_builder_highlight(pdf_file_path): FloatObject(705.4493), ] ), - printing=True + printing=True, ) writer.add_annotation(1, highlight_annotation) for annot in writer.pages[1]["/Annots"]: @@ -1098,20 +1098,37 @@ def test_indirect_object_invalid_read(): assert exc.value.args[0] == "Error reading indirect object reference at byte 0x5" -def test_create_string_object_utf16be_bom(): +def test_create_string_object_utf16_bom(): + # utf16-be result = create_string_object( b"\xfe\xff\x00P\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00" ) assert result == "PaperPort 14\x00" assert result.autodetect_utf16 is True + assert result.utf16_bom == b"\xfe\xff" + assert ( + result.get_encoded_bytes() + == b"\xfe\xff\x00P\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00" + ) - -def test_create_string_object_utf16le_bom(): + # utf16-le result = create_string_object( b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00" ) assert result == "PaperPort 14\x00" assert result.autodetect_utf16 is True + assert result.utf16_bom == b"\xff\xfe" + assert ( + result.get_encoded_bytes() + == b"\xff\xfeP\x00a\x00p\x00e\x00r\x00P\x00o\x00r\x00t\x00 \x001\x004\x00\x00\x00" + ) + + # utf16-be without bom + result = TextStringObject("ΓΏ") + result.autodetect_utf16 = True + result.utf16_bom = b"" + assert result.get_encoded_bytes() == b"\x00\xFF" + assert result.original_bytes == b"\x00\xFF" def test_create_string_object_force(): diff --git a/tests/test_images.py b/tests/test_images.py index e77090171..f979c8ff4 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -13,7 +13,7 @@ import pytest from PIL import Image, ImageChops, ImageDraw -from pypdf import PageObject, PdfReader +from pypdf import PageObject, PdfReader, PdfWriter from pypdf.generic import NameObject, NullObject from . import get_data_from_url @@ -352,5 +352,23 @@ def test_corrupted_jpeg_iss2266(pdf, pdf_name, images, images_name, filtr): @pytest.mark.timeout(30) def test_large_compressed_image(): url = "https://github.com/py-pdf/pypdf/files/15306199/file_with_large_compressed_image.pdf" - reader = PdfReader(BytesIO(get_data_from_url(url, name="file_with_large_compressed_image.pdf"))) + reader = PdfReader( + BytesIO(get_data_from_url(url, name="file_with_large_compressed_image.pdf")) + ) list(reader.pages[0].images) + + +@pytest.mark.enable_socket() +def test_ff_fe_starting_lut(): + """Cf issue #2660""" + url = "https://github.com/py-pdf/pypdf/files/15385628/original_before_merge.pdf" + name = "iss2660.pdf" + writer = PdfWriter(BytesIO(get_data_from_url(url, name=name))) + b = BytesIO() + writer.write(b) + reader = PdfReader(b) + url = "https://github.com/py-pdf/pypdf/assets/4083478/6150700d-87fd-43a2-8695-c2c05a44838c" + name = "iss2660.png" + img = Image.open(BytesIO(get_data_from_url(url, name=name))) + assert image_similarity(writer.pages[1].images[0].image, img) == 1.0 + assert image_similarity(reader.pages[1].images[0].image, img) == 1.0 From ec68cd46b4397038eb5e0da5487d460950818058 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 23 May 2024 23:32:34 +0200 Subject: [PATCH 2/6] coverage --- pypdf/generic/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/generic/_base.py b/pypdf/generic/_base.py index 8ea898ea4..cf50d820f 100644 --- a/pypdf/generic/_base.py +++ b/pypdf/generic/_base.py @@ -571,7 +571,7 @@ def get_original_bytes(self) -> bytes: elif self.autodetect_pdfdocencoding: return encode_pdfdocencoding(self) else: - raise Exception("no information about original bytes") + raise Exception("no information about original bytes") # pragma: no cover def get_encoded_bytes(self) -> bytes: # Try to write the string out as a PDFDocEncoding encoded string. It's From 5a283dea492fcf4531ffc151e61a13e1a39df29b Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Fri, 24 May 2024 12:58:14 +0200 Subject: [PATCH 3/6] Update tests/test_images.py Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com> --- tests/test_images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_images.py b/tests/test_images.py index f979c8ff4..31df5c140 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -360,7 +360,7 @@ def test_large_compressed_image(): @pytest.mark.enable_socket() def test_ff_fe_starting_lut(): - """Cf issue #2660""" + """Cf issue #2660""" url = "https://github.com/py-pdf/pypdf/files/15385628/original_before_merge.pdf" name = "iss2660.pdf" writer = PdfWriter(BytesIO(get_data_from_url(url, name=name))) From b13ce6318903e43355bc863743ff866feebbdfe2 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 27 May 2024 10:05:57 +0200 Subject: [PATCH 4/6] fix merge --- tests/test_generic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_generic.py b/tests/test_generic.py index 6438546a1..88206f723 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -1,5 +1,6 @@ """Test the pypdf.generic module.""" +import codecs from base64 import a85encode from copy import deepcopy from io import BytesIO From c3ce706b96c2d23ad0ddae3e8b0f4e50010b91fe Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 27 May 2024 10:16:47 +0200 Subject: [PATCH 5/6] ruff --- tests/test_images.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_images.py b/tests/test_images.py index 7fcf35f86..5982ecf20 100644 --- a/tests/test_images.py +++ b/tests/test_images.py @@ -441,4 +441,3 @@ def test_inline_image_extraction(): name = "iss2598d.png" img = Image.open(BytesIO(get_data_from_url(url, name=name))) assert image_similarity(reader.pages[0].images[0].image, img) == 1 - \ No newline at end of file From 1512aa945910fe97ddff9a41b7cf5cec841dbae2 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 27 May 2024 13:11:05 +0200 Subject: [PATCH 6/6] error in merging --- pypdf/_xobj_image_helpers.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/pypdf/_xobj_image_helpers.py b/pypdf/_xobj_image_helpers.py index 7ee032332..45b0c145b 100644 --- a/pypdf/_xobj_image_helpers.py +++ b/pypdf/_xobj_image_helpers.py @@ -168,24 +168,6 @@ def _handle_flate( Process image encoded in flateEncode Returns img, image_format, extension, color inversion """ - - def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes: - mask = (1 << bits) - 1 - nbuff = bytearray(size[0] * size[1]) - by = 0 - bit = 8 - bits - for y in range(size[1]): - if (bit != 0) and (bit != 8 - bits): - by += 1 - bit = 8 - bits - for x in range(size[0]): - nbuff[y * size[0] + x] = (data[by] >> bit) & mask - bit -= bits - if bit < 0: - by += 1 - bit = 8 - bits - return bytes(nbuff) - extension = ".png" # mime_type = "image/png" image_format = "PNG" lookup: Any