PI: Making pypdf as fast as pdfrw #2086

Merged: 17 commits, Aug 26, 2023
Changes from 14 commits
2 changes: 1 addition & 1 deletion pypdf/_encryption.py
@@ -75,7 +75,7 @@ def encrypt_object(self, obj: PdfObject) -> PdfObject:
elif isinstance(obj, StreamObject):
obj2 = StreamObject()
obj2.update(obj)
obj2._data = self.stmCrypt.encrypt(obj._data)
obj2.set_data(self.stmCrypt.encrypt(obj._data))
obj = obj2
elif isinstance(obj, DictionaryObject):
obj2 = DictionaryObject() # type: ignore
9 changes: 2 additions & 7 deletions pypdf/_page.py
@@ -1121,12 +1121,9 @@ def _merge_page(
)

new_content_array = ArrayObject()

original_content = self.get_contents()
if original_content is not None:
new_content_array.append(
PageObject._push_pop_gs(original_content, self.pdf, use_original=True)
)
new_content_array.append(original_content)
Collaborator

_push_pop_gs makes the conversion to a ContentStream, where the operators have been analysed. This is most probably the most consuming part of the code; however, we do not need to do that: we can just append the new code at the beginning/end of the array (creating this array if needed). Don't forget to add the extracontent to the object (required to be indirect ref)

Member Author @Lucas-C, Aug 20, 2023

_push_pop_gs makes the conversion to a ContentStream, where the operators have been analysed. This is most probably the most consuming part of the code

Exactly! 😊

Don't forget to add the extracontent to the object (required to be indirect ref)

What do you mean by extracontent?
Is something missing with the current code in this PR?

Collaborator

Typo: I mean the extra content (the text added to the first page's content). What you have to remember is that the stream objects composing the content must be added to the list of objects with the _add_object() function, and then only the indirect objects are stored in the ArrayObject that is stored in the /Contents of the page object.
Personally, what I would do (see the sketch below):
1. create an ArrayObject
2. copy in the existing streams; if an object is a ContentStream, replace it with an EncodedStreamObject (using the _replace_object() function)
3. insert "q\n" at the beginning of the first encoded stream, using the set_data() function
4. insert "Q\n" at the end of the last stream
5. then do what is required for the page2 content, appending the encoded stream
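
A rough sketch of this approach (hypothetical code, not the PR's implementation: the helper name append_page_content and the writer, page and page2 arguments are assumptions; _add_object is the writer-internal helper mentioned above):

```python
from pypdf.generic import ArrayObject, NameObject


def append_page_content(writer, page, page2):
    """Append page2's content after page's content without parsing any operators."""
    # Collect the existing content stream(s) of `page` as indirect references.
    new_content_array = ArrayObject()
    original = page["/Contents"].get_object()
    if isinstance(original, ArrayObject):
        new_content_array.extend(original)  # the array already holds indirect references
    else:
        new_content_array.append(writer._add_object(original))

    # Isolate the original graphics state by editing raw bytes: prepend "q\n" to the
    # first stream and append "Q\n" to the last one; no ContentStream parsing needed.
    first = new_content_array[0].get_object()
    first.set_data(b"q\n" + first.get_data())
    last = new_content_array[-1].get_object()
    last.set_data(last.get_data() + b"\nQ\n")

    # Append page2's content stream(s), also as indirect references.
    extra = page2["/Contents"].get_object()
    extra = extra if isinstance(extra, ArrayObject) else [extra]
    for stream in extra:
        new_content_array.append(writer._add_object(stream.get_object()))

    # The /Contents array itself must be stored as an indirect object as well.
    page[NameObject("/Contents")] = writer._add_object(new_content_array)
```

For brevity this skips step 2 above (re-wrapping a ContentStream as an EncodedStreamObject via _replace_object()), so it assumes the existing streams accept set_data() directly, i.e. they are plain or FlateDecode-encoded.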


page2content = page2.get_contents()
if page2content is not None:
@@ -1259,9 +1256,7 @@ def _merge_page_writer(

original_content = self.get_contents()
if original_content is not None:
new_content_array.append(
PageObject._push_pop_gs(original_content, self.pdf)
)
new_content_array.append(original_content)

page2content = page2.get_contents()
if page2content is not None:
1 change: 1 addition & 0 deletions pypdf/_writer.py
@@ -2194,6 +2194,7 @@ def clean(content: ContentStream) -> None:
i += 1
else:
i += 1
content.get_data() # this ensures ._data is rebuilt from the .operations

try:
d = cast(dict, cast(DictionaryObject, page["/Resources"])["/XObject"])
157 changes: 98 additions & 59 deletions pypdf/generic/_data_structures.py
@@ -818,6 +818,20 @@ def _clone(
pass
super()._clone(src, pdf_dest, force_duplicate, ignore_fields)

def get_data(self) -> bytes:
Collaborator

As far as I remember, _data does not have the same meaning in an EncodedStreamObject (where _data contains the compressed data) and a DecodedStreamObject (where the data is clear). Raising set_data up into the base StreamObject class will lead people to think that set_data on an encoded stream is valid, where the results are not good (some side fields need to be set as well)

Member Author

As far as I remember, _data does not have the same meaning in an EncodedStreamObject (where _data contains the compressed data) and a DecodedStreamObject (where the data is clear)

Interesting. Is this documented somewhere?

Accessing a private property (._data) from outside the ContentStream class seemed like a code smell to me, but if there is a semantic meaning to using ._data directly instead of get/set_data(), I should take care of this.

Raising set_data up into the base StreamObject class will lead people to think that set_data on an encoded stream is valid, where the results are not good (some side fields need to be set as well)

I do not really understand: EncodedStreamObject.set_data() existed prior to this PR,
and has been used in several places, right?
But you mention that it should not be used / is not valid?
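
For readers following along, a small illustrative sketch of the distinction being discussed (it relies only on the classes shown in the diff above; the content bytes are made up):

```python
from pypdf.generic import DecodedStreamObject

decoded = DecodedStreamObject()
decoded.set_data(b"BT /F1 12 Tf (Hello) Tj ET")  # ._data holds the clear content bytes

# flate_encode() returns an EncodedStreamObject whose ._data holds the
# FlateDecode-compressed bytes, not the clear text.
encoded = decoded.flate_encode()
assert encoded.get_data() == decoded.get_data()  # get_data() transparently decompresses

# set_data() on the encoded object re-compresses the new bytes and also refreshes the
# cached .decoded_self (one of the "side fields" mentioned above). Per the assert in
# the set_data override further down, .decoded_self must already be populated, which
# the get_data() call above has done.
encoded.set_data(b"BT /F1 12 Tf (Bye) Tj ET")
```

Note that, per the override shown further down, set_data() on a stream encoded with any filter other than FlateDecode raises PdfReadError.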

MartinThoma marked this conversation as resolved.
return self._data

def set_data(self, data: bytes) -> None:
self._data = data

def getData(self) -> Any: # deprecated
deprecation_with_replacement("getData", "get_data", "3.0.0")
return self._data

def setData(self, data: Any) -> None: # deprecated
deprecation_with_replacement("setData", "set_data", "3.0.0")
self.set_data(data)

def hash_value_data(self) -> bytes:
data = super().hash_value_data()
data += self._data
@@ -906,19 +920,7 @@ def flate_encode(self, level: int = -1) -> "EncodedStreamObject":


class DecodedStreamObject(StreamObject):
def get_data(self) -> bytes:
return self._data

def set_data(self, data: bytes) -> None:
self._data = data

def getData(self) -> Any: # deprecated
deprecation_with_replacement("getData", "get_data", "3.0.0")
return self._data

def setData(self, data: Any) -> None: # deprecated
deprecation_with_replacement("setData", "set_data", "3.0.0")
self.set_data(data)
pass


class EncodedStreamObject(StreamObject):
@@ -935,6 +937,7 @@ def decodedSelf(self, value: DecodedStreamObject) -> None: # deprecated
deprecation_with_replacement("decodedSelf", "decoded_self", "3.0.0")
self.decoded_self = value

# This overrides the parent method:
def get_data(self) -> bytes:
from ..filters import decode_stream_data

@@ -945,37 +948,47 @@ def get_data(self) -> bytes:
# create decoded object
decoded = DecodedStreamObject()

decoded._data = decode_stream_data(self)
decoded.set_data(decode_stream_data(self))
for key, value in list(self.items()):
if key not in (SA.LENGTH, SA.FILTER, SA.DECODE_PARMS):
decoded[key] = value
self.decoded_self = decoded
return decoded._data

def getData(self) -> Union[None, str, bytes]: # deprecated
deprecation_with_replacement("getData", "get_data", "3.0.0")
return self.get_data()
return decoded.get_data()

# This overrides the parent method:
def set_data(self, data: bytes) -> None: # deprecated
from ..filters import FlateDecode

if self.get(SA.FILTER, "") == FT.FLATE_DECODE:
if not isinstance(data, bytes):
raise TypeError("data must be bytes")
assert self.decoded_self is not None
self.decoded_self._data = data
self._data = FlateDecode.encode(data)
self.decoded_self.set_data(data)
super().set_data(FlateDecode.encode(data))
else:
raise PdfReadError(
"Streams encoded with different filter from only FlateDecode is not supported"
)

def setData(self, data: Any) -> None: # deprecated
deprecation_with_replacement("setData", "set_data", "3.0.0")
return self.set_data(data)


class ContentStream(DecodedStreamObject):
"""
In order to be fast, this data structure can contain either:
* raw data in ._data
* parsed stream operations in ._operations

At any time, a ContentStream object has at least one of those two fields populated;
the other one may be empty and is rebuilt lazily, on access:
* when .get_data() is called, if ._data is empty, it is rebuilt from ._operations
* when .operations is accessed, if ._operations is empty, it is rebuilt from ._data

Conversely, each field is invalidated when the other one is assigned:
* when .set_data() is called, ._operations is cleared
* when .operations is set, ._data is cleared
"""

def __init__(
self,
stream: Any,
@@ -987,26 +1000,26 @@ def __init__(
# The inner list has two elements:
# Element 0: List
# Element 1: str
self.operations: List[Tuple[Any, Any]] = []
self._operations: List[Tuple[Any, Any]] = []

# stream may be a StreamObject or an ArrayObject containing
# multiple StreamObjects to be cat'd together.
if stream is not None:
if stream is None:
super().set_data(b"")
else:
stream = stream.get_object()
if isinstance(stream, ArrayObject):
data = b""
for s in stream:
data += s.get_object().get_data()
if len(data) == 0 or data[-1] != b"\n":
data += b"\n"
stream_bytes = BytesIO(data)
super().set_data(bytes(data))
else:
stream_data = stream.get_data()
assert stream_data is not None
stream_data_bytes = b_(stream_data) # this is necessary
stream_bytes = BytesIO(stream_data_bytes)
super().set_data(b_(stream_data))
self.forced_encoding = forced_encoding
self.__parse_content_stream(stream_bytes)

def clone(
self,
@@ -1058,13 +1071,15 @@ def _clone(
force_duplicate:
ignore_fields:
"""
src_cs = cast("ContentStream", src)
super().set_data(src_cs._data)
self.pdf = pdf_dest
self.operations = list(cast("ContentStream", src).operations)
self.forced_encoding = cast("ContentStream", src).forced_encoding
self._operations = list(src_cs._operations)
self.forced_encoding = src_cs.forced_encoding
# no need to call DictionaryObjection or anything
# like super(DictionaryObject,self)._clone(src, pdf_dest, force_duplicate, ignore_fields)

def __parse_content_stream(self, stream: StreamType) -> None:
def _parse_content_stream(self, stream: StreamType) -> None:
# 7.8.2 Content Streams
stream.seek(0, 0)
operands: List[Union[int, str, PdfObject]] = []
@@ -1080,9 +1095,9 @@ def __parse_content_stream(self, stream: StreamType) -> None:
# mechanism is required, of course... thanks buddy...
assert operands == []
ii = self._read_inline_image(stream)
self.operations.append((ii, b"INLINE IMAGE"))
self._operations.append((ii, b"INLINE IMAGE"))
else:
self.operations.append((operands, operator))
self._operations.append((operands, operator))
operands = []
elif peek == b"%":
# If we encounter a comment in the content stream, we have to
@@ -1173,29 +1188,53 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]:
data.write(info)
return {"settings": settings, "data": data.getvalue()}

@property # type: ignore
def _data(self) -> bytes: # type: ignore
new_data = BytesIO()
for operands, operator in self.operations:
if operator == b"INLINE IMAGE":
new_data.write(b"BI")
dict_text = BytesIO()
operands["settings"].write_to_stream(dict_text)
new_data.write(dict_text.getvalue()[2:-2])
new_data.write(b"ID ")
new_data.write(operands["data"])
new_data.write(b"EI")
else:
for op in operands:
op.write_to_stream(new_data)
new_data.write(b" ")
new_data.write(b_(operator))
new_data.write(b"\n")
return new_data.getvalue()

@_data.setter
def _data(self, value: bytes) -> None:
self.__parse_content_stream(BytesIO(value))

# This overrides the parent method:
def get_data(self) -> bytes:
if not self._data:
new_data = BytesIO()
for operands, operator in self._operations:
if operator == b"INLINE IMAGE":
new_data.write(b"BI")
dict_text = BytesIO()
operands["settings"].write_to_stream(dict_text)
new_data.write(dict_text.getvalue()[2:-2])
new_data.write(b"ID ")
new_data.write(operands["data"])
new_data.write(b"EI")
else:
for op in operands:
op.write_to_stream(new_data)
new_data.write(b" ")
new_data.write(b_(operator))
new_data.write(b"\n")
self._data = new_data.getvalue()
return self._data

# This overrides the parent method:
def set_data(self, data: bytes) -> None:
super().set_data(data)
self._operations = []

@property
def operations(self) -> List[Tuple[Any, Any]]:
if not self._operations and self._data:
self._parse_content_stream(BytesIO(self._data))
self._data = b""
return self._operations

@operations.setter
def operations(self, operations: List[Tuple[Any, Any]]) -> None:
self._operations = operations
self._data = b""

# This overrides the parent method:
def write_to_stream(
self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
) -> None:
if not self._data and self._operations:
self.get_data() # this ensures ._data is rebuilt
super().write_to_stream(stream, encryption_key)


def read_object(
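
To make the lazy-rebuild contract described in the new ContentStream docstring concrete, here is a short usage sketch (illustrative only, not part of the PR; it assumes an input file named example.pdf whose first page has a /Contents entry):

```python
from pypdf import PdfReader
from pypdf.generic import ContentStream

reader = PdfReader("example.pdf")  # hypothetical input file
page = reader.pages[0]

# Constructing the ContentStream only concatenates the raw stream bytes;
# no operator parsing happens yet (that is the speed-up this PR is after).
cs = ContentStream(page["/Contents"].get_object(), reader)

# First access to .operations parses ._data into an operator list and clears ._data.
ops = cs.operations

# Assigning .operations (or calling set_data()) invalidates the other representation;
# get_data() then rebuilds the raw bytes from the operator list on demand.
cs.operations = [(operands, op) for operands, op in ops if op != b"Tj"]
raw_bytes = cs.get_data()
```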
17 changes: 17 additions & 0 deletions tests/__init__.py
@@ -87,3 +87,20 @@ def get_object(self) -> "DummyObj":

def get_reference(self, obj):
return IndirectObject(idnum=1, generation=1, pdf=self)


def is_sublist(child_list, parent_list):
"""
Check if child_list is a sublist of parent_list, with respect to
* elements order
* elements repetition

Elements are compared using `==`
"""
if len(child_list) == 0:
return True
if len(parent_list) == 0:
return False
if parent_list[0] == child_list[0]:
return is_sublist(child_list[1:], parent_list[1:])
return is_sublist(child_list, parent_list[1:])
2 changes: 1 addition & 1 deletion tests/test_generic.py
@@ -426,7 +426,7 @@ class Tst: # to replace pdf
# TODO: What should happen with the stream?
assert do == {"/S": "/GoTo"}
if length in (6, 10):
assert b"BT /F1" in do._data
assert b"BT /F1" in do.get_data()
raise PdfReadError("__ALLGOOD__")
assert should_fail ^ (exc.value.args[0] == "__ALLGOOD__")

20 changes: 20 additions & 0 deletions tests/test_utils.py
@@ -25,6 +25,8 @@
)
from pypdf.errors import DeprecationError, PdfReadError, PdfStreamError

from . import is_sublist

TESTS_ROOT = Path(__file__).parent.resolve()
PROJECT_ROOT = TESTS_ROOT.parent
RESOURCE_ROOT = PROJECT_ROOT / "resources"
@@ -351,3 +353,21 @@ def test_parse_datetime_err():
parse_iso8824_date("D:20210408T054711Z")
assert ex.value.args[0] == "Can not convert date: D:20210408T054711Z"
assert parse_iso8824_date("D:20210408054711").tzinfo is None


def test_is_sublist():
# Basic checks:
assert is_sublist([0, 1], [0, 1, 2]) is True
assert is_sublist([0, 2], [0, 1, 2]) is True
assert is_sublist([1, 2], [0, 1, 2]) is True
assert is_sublist([0, 3], [0, 1, 2]) is False
# Ensure order is checked:
assert is_sublist([1, 0], [0, 1, 2]) is False
# Ensure duplicates are handled:
assert is_sublist([0, 1, 1], [0, 1, 1, 2]) is True
assert is_sublist([0, 1, 1], [0, 1, 2]) is False
# Edge cases with empty lists:
assert is_sublist([], [0, 1, 2]) is True
assert is_sublist([0, 1], []) is False
# Self-sublist edge case:
assert is_sublist([0, 1, 2], [0, 1, 2]) is True