py-pdf · MartinThoma · Aug 26, 2023 · Aug 13, 2023 · Aug 13, 2023 · Aug 13, 2023
diff --git a/pypdf/_encryption.py b/pypdf/_encryption.py
@@ -75,7 +75,7 @@ def encrypt_object(self, obj: PdfObject) -> PdfObject:
         elif isinstance(obj, StreamObject):
             obj2 = StreamObject()
             obj2.update(obj)
-            obj2._data = self.stmCrypt.encrypt(obj._data)
+            obj2.set_data(self.stmCrypt.encrypt(obj._data))
             obj = obj2
         elif isinstance(obj, DictionaryObject):
             obj2 = DictionaryObject()  # type: ignore

diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -1116,12 +1116,10 @@ def _merge_page(
         )
 
         new_content_array = ArrayObject()
-
         original_content = self.get_contents()
         if original_content is not None:
-            new_content_array.append(
-                PageObject._push_pop_gs(original_content, self.pdf)
-            )
+            # new_content_stream = PageObject._push_pop_gs(original_content, self.pdf)
+            new_content_array.append(original_content)
 
         page2content = page2.get_contents()
         if page2content is not None:

diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
@@ -818,6 +818,20 @@ def _clone(
             pass
         super()._clone(src, pdf_dest, force_duplicate, ignore_fields)
 
+    def get_data(self) -> bytes:
+        return self._data
+
+    def set_data(self, data: bytes) -> None:
+        self._data = data
+
+    def getData(self) -> Any:  # deprecated
+        deprecation_with_replacement("getData", "get_data", "3.0.0")
+        return self._data
+
+    def setData(self, data: Any) -> None:  # deprecated
+        deprecation_with_replacement("setData", "set_data", "3.0.0")
+        self.set_data(data)
+
     def hash_value_data(self) -> bytes:
         data = super().hash_value_data()
         data += self._data
@@ -840,7 +854,7 @@ def write_to_stream(
             deprecate_no_replacement(
                 "the encryption_key parameter of write_to_stream", "5.0.0"
             )
-        self[NameObject(SA.LENGTH)] = NumberObject(len(self._data))
+        self[NameObject(SA.LENGTH)] = NumberObject(len(self.get_data()))
         DictionaryObject.write_to_stream(self, stream)
         del self[SA.LENGTH]
         stream.write(b"\nstream\n")
@@ -906,19 +920,7 @@ def flate_encode(self, level: int = -1) -> "EncodedStreamObject":
 
 
 class DecodedStreamObject(StreamObject):
-    def get_data(self) -> bytes:
-        return self._data
-
-    def set_data(self, data: bytes) -> None:
-        self._data = data
-
-    def getData(self) -> Any:  # deprecated
-        deprecation_with_replacement("getData", "get_data", "3.0.0")
-        return self._data
-
-    def setData(self, data: Any) -> None:  # deprecated
-        deprecation_with_replacement("setData", "set_data", "3.0.0")
-        self.set_data(data)
+    pass
 
 
 class EncodedStreamObject(StreamObject):
@@ -935,6 +937,7 @@ def decodedSelf(self, value: DecodedStreamObject) -> None:  # deprecated
         deprecation_with_replacement("decodedSelf", "decoded_self", "3.0.0")
         self.decoded_self = value
 
+    # This overrides the parent method:
     def get_data(self) -> bytes:
         from ..filters import decode_stream_data
 
@@ -945,35 +948,28 @@ def get_data(self) -> bytes:
             # create decoded object
             decoded = DecodedStreamObject()
 
-            decoded._data = decode_stream_data(self)
+            decoded.set_data(decode_stream_data(self))
             for key, value in list(self.items()):
                 if key not in (SA.LENGTH, SA.FILTER, SA.DECODE_PARMS):
                     decoded[key] = value
             self.decoded_self = decoded
-            return decoded._data
-
-    def getData(self) -> Union[None, str, bytes]:  # deprecated
-        deprecation_with_replacement("getData", "get_data", "3.0.0")
-        return self.get_data()
+            return decoded.get_data()
 
+    # This overrides the parent method:
     def set_data(self, data: bytes) -> None:  # deprecated
         from ..filters import FlateDecode
 
         if self.get(SA.FILTER, "") == FT.FLATE_DECODE:
             if not isinstance(data, bytes):
                 raise TypeError("data must be bytes")
             assert self.decoded_self is not None
-            self.decoded_self._data = data
-            self._data = FlateDecode.encode(data)
+            self.decoded_self.set_data(data)
+            super().set_data(FlateDecode.encode(data))
         else:
             raise PdfReadError(
                 "Streams encoded with different filter from only FlateDecode is not supported"
             )
 
-    def setData(self, data: Any) -> None:  # deprecated
-        deprecation_with_replacement("setData", "set_data", "3.0.0")
-        return self.set_data(data)
-
 
 class ContentStream(DecodedStreamObject):
     def __init__(
@@ -987,26 +983,26 @@ def __init__(
         # The inner list has two elements:
         #  Element 0: List
         #  Element 1: str
-        self.operations: List[Tuple[Any, Any]] = []
+        self._operations: List[Tuple[Any, Any]] = []
 
         # stream may be a StreamObject or an ArrayObject containing
         # multiple StreamObjects to be cat'd together.
-        if stream is not None:
+        if stream is None:
+            super().set_data(b"")
+        else:
             stream = stream.get_object()
             if isinstance(stream, ArrayObject):
                 data = b""
                 for s in stream:
                     data += s.get_object().get_data()
                     if len(data) == 0 or data[-1] != b"\n":
                         data += b"\n"
-                stream_bytes = BytesIO(data)
+                super().set_data(bytes(data))
             else:
                 stream_data = stream.get_data()
                 assert stream_data is not None
-                stream_data_bytes = b_(stream_data)  # this is necessary
-                stream_bytes = BytesIO(stream_data_bytes)
+                super().set_data(b_(stream_data))
             self.forced_encoding = forced_encoding
-            self.__parse_content_stream(stream_bytes)
 
     def clone(
         self,
@@ -1058,13 +1054,15 @@ def _clone(
             force_duplicate:
             ignore_fields:
         """
+        src_cs = cast("ContentStream", src)
+        super().set_data(src_cs._data)
         self.pdf = pdf_dest
-        self.operations = list(cast("ContentStream", src).operations)
-        self.forced_encoding = cast("ContentStream", src).forced_encoding
+        self._operations = list(src_cs._operations)
+        self.forced_encoding = src_cs.forced_encoding
         # no need to call DictionaryObjection or anything
         # like super(DictionaryObject,self)._clone(src, pdf_dest, force_duplicate, ignore_fields)
 
-    def __parse_content_stream(self, stream: StreamType) -> None:
+    def _parse_content_stream(self, stream: StreamType) -> None:
         # 7.8.2 Content Streams
         stream.seek(0, 0)
         operands: List[Union[int, str, PdfObject]] = []
@@ -1080,9 +1078,9 @@ def __parse_content_stream(self, stream: StreamType) -> None:
                     # mechanism is required, of course... thanks buddy...
                     assert operands == []
                     ii = self._read_inline_image(stream)
-                    self.operations.append((ii, b"INLINE IMAGE"))
+                    self._operations.append((ii, b"INLINE IMAGE"))
                 else:
-                    self.operations.append((operands, operator))
+                    self._operations.append((operands, operator))
                     operands = []
             elif peek == b"%":
                 # If we encounter a comment in the content stream, we have to
@@ -1173,29 +1171,45 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]:
                         data.write(info)
         return {"settings": settings, "data": data.getvalue()}
 
-    @property  # type: ignore
-    def _data(self) -> bytes:  # type: ignore
-        new_data = BytesIO()
-        for operands, operator in self.operations:
-            if operator == b"INLINE IMAGE":
-                new_data.write(b"BI")
-                dict_text = BytesIO()
-                operands["settings"].write_to_stream(dict_text)
-                new_data.write(dict_text.getvalue()[2:-2])
-                new_data.write(b"ID ")
-                new_data.write(operands["data"])
-                new_data.write(b"EI")
-            else:
-                for op in operands:
-                    op.write_to_stream(new_data)
-                    new_data.write(b" ")
-                new_data.write(b_(operator))
-            new_data.write(b"\n")
-        return new_data.getvalue()
-
-    @_data.setter
-    def _data(self, value: bytes) -> None:
-        self.__parse_content_stream(BytesIO(value))
+
+    # This overrides the parent method:
+    def get_data(self) -> bytes:
+        if not self._data:
+            new_data = BytesIO()
+            for operands, operator in self._operations:
+                if operator == b"INLINE IMAGE":
+                    new_data.write(b"BI")
+                    dict_text = BytesIO()
+                    operands["settings"].write_to_stream(dict_text)
+                    new_data.write(dict_text.getvalue()[2:-2])
+                    new_data.write(b"ID ")
+                    new_data.write(operands["data"])
+                    new_data.write(b"EI")
+                else:
+                    for op in operands:
+                        op.write_to_stream(new_data)
+                        new_data.write(b" ")
+                    new_data.write(b_(operator))
+                new_data.write(b"\n")
+            self._data = new_data.getvalue()
+        return self._data
+
+    # This overrides the parent method:
+    def set_data(self, data: bytes) -> None:
+        super().set_data(data)
+        self._operations = []
+
+    @property
+    def operations(self) -> List[Tuple[Any, Any]]:
+        if not self._operations and self._data:
+            self._parse_content_stream(BytesIO(self._data))
+            self._data = b""
+        return self._operations
+
+    @operations.setter
+    def operations(self, operations: List[Tuple[Any, Any]]) -> None:
+        self._operations = operations
+        self._data = b""
 
 
 def read_object(

diff --git a/tests/test_generic.py b/tests/test_generic.py
@@ -426,7 +426,7 @@ class Tst:  # to replace pdf
         # TODO: What should happen with the stream?
         assert do == {"/S": "/GoTo"}
         if length in (6, 10):
-            assert b"BT /F1" in do._data
+            assert b"BT /F1" in do.get_data()
         raise PdfReadError("__ALLGOOD__")
     assert should_fail ^ (exc.value.args[0] == "__ALLGOOD__")
 

diff --git a/tests/test_writer.py b/tests/test_writer.py
@@ -321,6 +321,7 @@ def test_remove_images(pdf_file_path, input_path):
         reader = PdfReader(input_stream)
         if input_path == "side-by-side-subfig.pdf":
             extracted_text = reader.pages[0].extract_text()
+            assert extracted_text
             assert "Lorem ipsum dolor sit amet" in extracted_text