Merge branch 'main' into qualified-form-update

py-pdf · Mar 14, 2023 · 0dc5aa1 · 0dc5aa1
2 parents bc0458d + 0afac1d
commit 0dc5aa1
Show file tree

Hide file tree

Showing 16 changed files with 436 additions and 289 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,21 @@
 # CHANGELOG
 
+## Version 3.5.2, 2023-03-12
+
+⚠️ We discovered that compress_content_streams has to be applied to a page of
+  the PdfWriter. It must not be applied to a page of the PdfReader!
+
+### Bug Fixes (BUG)
+-  compress_content_streams output not readable in Adobe Acrobat (#1698)
+-  Pass logging parameters correctly in set_need_appearances_writer (#1697)
+-  Write /Root/AcroForm in set_need_appearances_writer (#1639)
+
+### Robustness (ROB)
+-  Allow more whitespace within linearized files (#1701)
+
+[Full Changelog](https://github.com/py-pdf/pypdf/compare/3.5.1...3.5.2)
+
+
 ## Version 3.5.1, 2023-03-05
 
 ### Robustness (ROB)

diff --git a/docs/user/file-size.md b/docs/user/file-size.md
@@ -63,9 +63,12 @@ reader = PdfReader("example.pdf")
 writer = PdfWriter()
 
 for page in reader.pages:
-    page.compress_content_streams()  # This is CPU intensive!
     writer.add_page(page)
 
+for page in writer.pages:
+    # ⚠️ This has to be done on the writer, not the reader!
+    page.compress_content_streams()  # This is CPU intensive!
+
 with open("out.pdf", "wb") as f:
     writer.write(f)
 ```

diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -46,6 +46,12 @@
 
 from ._cmap import build_char_map, unknown_char_map
 from ._protocols import PdfReaderProtocol, PdfWriterProtocol
+from ._text_extraction import (
+    OrientationNotFoundError,
+    crlf_space_check,
+    handle_tj,
+    mult,
+)
 from ._utils import (
     CompressedTransformationMatrix,
     File,
@@ -72,60 +78,11 @@
     NullObject,
     NumberObject,
     RectangleObject,
-    encode_pdfdocencoding,
 )
 
-CUSTOM_RTL_MIN: int = -1
-CUSTOM_RTL_MAX: int = -1
-CUSTOM_RTL_SPECIAL_CHARS: List[int] = []
 MERGE_CROP_BOX = "cropbox"  # pypdf<=3.4.0 used 'trimbox'
 
 
-def set_custom_rtl(
-    _min: Union[str, int, None] = None,
-    _max: Union[str, int, None] = None,
-    specials: Union[str, List[int], None] = None,
-) -> Tuple[int, int, List[int]]:
-    """
-    Change the Right-To-Left and special characters custom parameters.
-
-    Args:
-        _min: The new minimum value for the range of custom characters that
-            will be written right to left.
-            If set to ``None``, the value will not be changed.
-            If set to an integer or string, it will be converted to its ASCII code.
-            The default value is -1, which sets no additional range to be converted.
-        _max: The new maximum value for the range of custom characters that will
-            be written right to left.
-            If set to ``None``, the value will not be changed.
-            If set to an integer or string, it will be converted to its ASCII code.
-            The default value is -1, which sets no additional range to be converted.
-        specials: The new list of special characters to be inserted in the
-            current insertion order.
-            If set to ``None``, the current value will not be changed.
-            If set to a string, it will be converted to a list of ASCII codes.
-            The default value is an empty list.
-
-    Returns:
-        A tuple containing the new values for ``CUSTOM_RTL_MIN``,
-        ``CUSTOM_RTL_MAX``, and ``CUSTOM_RTL_SPECIAL_CHARS``.
-    """
-    global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
-    if isinstance(_min, int):
-        CUSTOM_RTL_MIN = _min
-    elif isinstance(_min, str):
-        CUSTOM_RTL_MIN = ord(_min)
-    if isinstance(_max, int):
-        CUSTOM_RTL_MAX = _max
-    elif isinstance(_max, str):
-        CUSTOM_RTL_MAX = ord(_max)
-    if isinstance(specials, str):
-        CUSTOM_RTL_SPECIAL_CHARS = [ord(x) for x in specials]
-    elif isinstance(specials, list):
-        CUSTOM_RTL_SPECIAL_CHARS = specials
-    return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
-
-
 def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
     retval: Union[None, RectangleObject, IndirectObject] = self.get(name)
     if isinstance(retval, RectangleObject):
@@ -1515,9 +1472,25 @@ def compress_content_streams(self) -> None:
         """
         content = self.get_contents()
         if content is not None:
+            content_obj: Any
             if not isinstance(content, ContentStream):
-                content = ContentStream(content, self.pdf)
-            self[NameObject(PG.CONTENTS)] = content.flate_encode()
+                content_obj = ContentStream(content, self.pdf)
+            else:
+                content_obj = content
+            content_obj = content_obj.flate_encode()
+            try:
+                content.indirect_reference.pdf._objects[  # type: ignore
+                    content.indirect_reference.idnum - 1  # type: ignore
+                ] = content_obj
+            except AttributeError:
+                if self.indirect_reference is not None and hasattr(
+                    self.indirect_reference.pdf, "_add_object"
+                ):
+                    self[
+                        NameObject(PG.CONTENTS)
+                    ] = self.indirect_reference.pdf._add_object(content_obj)
+                else:
+                    raise ValueError("Page must be part of a PdfWriter")
 
     def compressContentStreams(self) -> None:  # deprecated
         """
@@ -1648,26 +1621,6 @@ def _extract_text(
         TL = 0.0
         font_size = 12.0  # init just in case of
 
-        def mult(m: List[float], n: List[float]) -> List[float]:
-            return [
-                m[0] * n[0] + m[1] * n[2],
-                m[0] * n[1] + m[1] * n[3],
-                m[2] * n[0] + m[3] * n[2],
-                m[2] * n[1] + m[3] * n[3],
-                m[4] * n[0] + m[5] * n[2] + n[4],
-                m[4] * n[1] + m[5] * n[3] + n[5],
-            ]
-
-        def orient(m: List[float]) -> int:
-            if m[3] > 1e-6:
-                return 0
-            elif m[3] < -1e-6:
-                return 180
-            elif m[1] > 0:
-                return 90
-            else:
-                return 270
-
         def current_spacewidth() -> float:
             return _space_width / 1000.0
 
@@ -1799,169 +1752,36 @@ def process_operation(operator: bytes, operands: List) -> None:
 
             elif operator == b"Tj":
                 check_crlf_space = True
-                m = mult(tm_matrix, cm_matrix)
-                orientation = orient(m)
-                if orientation in orientations:
-                    if isinstance(operands[0], str):
-                        text += operands[0]
-                    else:
-                        t: str = ""
-                        tt: bytes = (
-                            encode_pdfdocencoding(operands[0])
-                            if isinstance(operands[0], str)
-                            else operands[0]
-                        )
-                        if isinstance(cmap[0], str):
-                            try:
-                                t = tt.decode(
-                                    cmap[0], "surrogatepass"
-                                )  # apply str encoding
-                            except Exception:
-                                # the data does not match the expectation,
-                                # we use the alternative ;
-                                # text extraction may not be good
-                                t = tt.decode(
-                                    "utf-16-be" if cmap[0] == "charmap" else "charmap",
-                                    "surrogatepass",
-                                )  # apply str encoding
-                        else:  # apply dict encoding
-                            t = "".join(
-                                [
-                                    cmap[0][x] if x in cmap[0] else bytes((x,)).decode()
-                                    for x in tt
-                                ]
-                            )
-                        # "\u0590 - \u08FF \uFB50 - \uFDFF"
-                        for x in [cmap[1][x] if x in cmap[1] else x for x in t]:
-                            # x can be a sequence of bytes ; ex: habibi.pdf
-                            if len(x) == 1:
-                                xx = ord(x)
-                            else:
-                                xx = 1
-                            # fmt: off
-                            if (
-                                # cases where the current inserting order is kept
-                                (xx <= 0x2F)                        # punctuations but...
-                                or (0x3A <= xx and xx <= 0x40)      # numbers (x30-39)
-                                or (0x2000 <= xx and xx <= 0x206F)  # upper punctuations..
-                                or (0x20A0 <= xx and xx <= 0x21FF)  # but (numbers) indices/exponents
-                                or xx in CUSTOM_RTL_SPECIAL_CHARS   # customized....
-                            ):
-                                text = x + text if rtl_dir else text + x
-                            elif (  # right-to-left characters set
-                                (0x0590 <= xx and xx <= 0x08FF)
-                                or (0xFB1D <= xx and xx <= 0xFDFF)
-                                or (0xFE70 <= xx and xx <= 0xFEFF)
-                                or (CUSTOM_RTL_MIN <= xx and xx <= CUSTOM_RTL_MAX)
-                            ):
-                                if not rtl_dir:
-                                    rtl_dir = True
-                                    output += text
-                                    if visitor_text is not None:
-                                        visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
-                                    text = ""
-                                text = x + text
-                            else:  # left-to-right
-                                # print(">",xx,x,end="")
-                                if rtl_dir:
-                                    rtl_dir = False
-                                    output += text
-                                    if visitor_text is not None:
-                                        visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
-                                    text = ""
-                                text = text + x
-                            # fmt: on
+                text, rtl_dir = handle_tj(
+                    text,
+                    operands,
+                    cm_matrix,
+                    tm_matrix,  # text matrix
+                    cmap,
+                    orientations,
+                    output,
+                    font_size,
+                    rtl_dir,
+                    visitor_text,
+                )
             else:
                 return None
             if check_crlf_space:
-                m = mult(tm_matrix, cm_matrix)
-                orientation = orient(m)
-                delta_x = m[4] - tm_prev[4]
-                delta_y = m[5] - tm_prev[5]
-                k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
-                f = font_size * k
-                tm_prev = m
-                if orientation not in orientations:
-                    return None
                 try:
-                    if orientation == 0:
-                        if delta_y < -0.8 * f:
-                            if (output + text)[-1] != "\n":
-                                output += text + "\n"
-                                if visitor_text is not None:
-                                    visitor_text(
-                                        text + "\n",
-                                        cm_matrix,
-                                        tm_matrix,
-                                        cmap[3],
-                                        font_size,
-                                    )
-                                text = ""
-                        elif (
-                            abs(delta_y) < f * 0.3
-                            and abs(delta_x) > current_spacewidth() * f * 15
-                            and (output + text)[-1] != " "
-                        ):
-                            text += " "
-                    elif orientation == 180:
-                        if delta_y > 0.8 * f:
-                            if (output + text)[-1] != "\n":
-                                output += text + "\n"
-                                if visitor_text is not None:
-                                    visitor_text(
-                                        text + "\n",
-                                        cm_matrix,
-                                        tm_matrix,
-                                        cmap[3],
-                                        font_size,
-                                    )
-                                text = ""
-                        elif (
-                            abs(delta_y) < f * 0.3
-                            and abs(delta_x) > current_spacewidth() * f * 15
-                            and (output + text)[-1] != " "
-                        ):
-                            text += " "
-                    elif orientation == 90:
-                        if delta_x > 0.8 * f:
-                            if (output + text)[-1] != "\n":
-                                output += text + "\n"
-                                if visitor_text is not None:
-                                    visitor_text(
-                                        text + "\n",
-                                        cm_matrix,
-                                        tm_matrix,
-                                        cmap[3],
-                                        font_size,
-                                    )
-                                text = ""
-                        elif (
-                            abs(delta_x) < f * 0.3
-                            and abs(delta_y) > current_spacewidth() * f * 15
-                            and (output + text)[-1] != " "
-                        ):
-                            text += " "
-                    elif orientation == 270:
-                        if delta_x < -0.8 * f:
-                            if (output + text)[-1] != "\n":
-                                output += text + "\n"
-                                if visitor_text is not None:
-                                    visitor_text(
-                                        text + "\n",
-                                        cm_matrix,
-                                        tm_matrix,
-                                        cmap[3],
-                                        font_size,
-                                    )
-                                text = ""
-                        elif (
-                            abs(delta_x) < f * 0.3
-                            and abs(delta_y) > current_spacewidth() * f * 15
-                            and (output + text)[-1] != " "
-                        ):
-                            text += " "
-                except Exception:
-                    pass
+                    text, output, tm_prev = crlf_space_check(
+                        text,
+                        tm_prev,
+                        cm_matrix,
+                        tm_matrix,
+                        cmap,
+                        orientations,
+                        output,
+                        font_size,
+                        visitor_text,
+                        current_spacewidth(),
+                    )
+                except OrientationNotFoundError:
+                    return None
 
         for operands, operator in content.operations:
             if visitor_operand_before is not None:

diff --git a/pypdf/_reader.py b/pypdf/_reader.py
@@ -1801,7 +1801,7 @@ def _read_xref_other_error(
             return startxref
         # No explicit xref table, try finding a cross-reference stream.
         stream.seek(startxref, 0)
-        for look in range(5):
+        for look in range(25):  # value extended to cope with more linearized files
             if stream.read(1).isdigit():
                 # This is not a standard PDF, consider adding a warning
                 startxref += look