BUG: TypeError: can't concat str to bytes (#2114)

This was introduced when I removed seemingly unnecessary calls to b_, a helper function that converts Union[bytes, str] to bytes.

Root-cause: Too little test coverage + wrong type annotations
Caused-by: 3033122
Closes #2111
py-pdf · Aug 24, 2023 · f16f434 · f16f434
1 parent cbeed04
commit f16f434
Show file tree

Hide file tree

Showing 5 changed files with 21 additions and 14 deletions.
diff --git a/pypdf/_encryption.py b/pypdf/_encryption.py
@@ -43,7 +43,7 @@
 from pypdf._crypt_providers import rc4_decrypt as RC4_decrypt  # noqa: N812
 from pypdf._crypt_providers import rc4_encrypt as RC4_encrypt  # noqa: N812
 
-from ._utils import logger_warning
+from ._utils import b_, logger_warning
 from .generic import (
     ArrayObject,
     ByteStringObject,
@@ -75,7 +75,7 @@ def encrypt_object(self, obj: PdfObject) -> PdfObject:
         elif isinstance(obj, StreamObject):
             obj2 = StreamObject()
             obj2.update(obj)
-            obj2._data = self.stmCrypt.encrypt(obj._data)
+            obj2._data = self.stmCrypt.encrypt(b_(obj._data))
             obj = obj2
         elif isinstance(obj, DictionaryObject):
             obj2 = DictionaryObject()  # type: ignore
@@ -91,7 +91,7 @@ def decrypt_object(self, obj: PdfObject) -> PdfObject:
             data = self.strCrypt.decrypt(obj.original_bytes)
             obj = create_string_object(data)
         elif isinstance(obj, StreamObject):
-            obj._data = self.stmCrypt.decrypt(obj._data)
+            obj._data = self.stmCrypt.decrypt(b_(obj._data))
         elif isinstance(obj, DictionaryObject):
             for key, value in obj.items():
                 obj[key] = self.decrypt_object(value)

diff --git a/pypdf/_reader.py b/pypdf/_reader.py
@@ -54,6 +54,7 @@
 from ._utils import (
     StrByteType,
     StreamType,
+    b_,
     deprecate_no_replacement,
     deprecation_no_replacement,
     deprecation_with_replacement,
@@ -1256,7 +1257,7 @@ def _get_object_from_stream(
         assert cast(str, obj_stm["/Type"]) == "/ObjStm"
         # /N is the number of indirect objects in the stream
         assert idx < obj_stm["/N"]
-        stream_data = BytesIO(obj_stm.get_data())
+        stream_data = BytesIO(b_(obj_stm.get_data()))
         for i in range(obj_stm["/N"]):  # type: ignore
             read_non_whitespace(stream_data)
             stream_data.seek(-1, 1)
@@ -1867,7 +1868,7 @@ def _read_pdf15_xref_stream(
         xrefstream = cast(ContentStream, read_object(stream, self))
         assert cast(str, xrefstream["/Type"]) == "/XRef"
         self.cache_indirect_object(generation, idnum, xrefstream)
-        stream_data = BytesIO(xrefstream.get_data())
+        stream_data = BytesIO(b_(xrefstream.get_data()))
         # Index pairs specify the subsections in the dictionary. If
         # none create one subsection that spans everything.
         idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
@@ -2118,7 +2119,7 @@ def xfa(self) -> Optional[Dict[str, Any]]:
                 if isinstance(f, IndirectObject):
                     field = cast(Optional[EncodedStreamObject], f.get_object())
                     if field:
-                        es = zlib.decompress(field._data)
+                        es = zlib.decompress(b_(field._data))
                         retval[tag] = es
         return retval
 

diff --git a/pypdf/filters.py b/pypdf/filters.py
@@ -41,6 +41,7 @@
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
 from ._utils import (
+    b_,
     deprecate_with_replacement,
     logger_warning,
     ord_,
@@ -655,7 +656,7 @@ def decode(
         return tiff_header + data
 
 
-def decode_stream_data(stream: Any) -> bytes:  # utils.StreamObject
+def decode_stream_data(stream: Any) -> Union[bytes, str]:  # utils.StreamObject
     """
     Decode the stream data based on the specified filters.
 
@@ -682,7 +683,7 @@ def decode_stream_data(stream: Any) -> bytes:  # utils.StreamObject
     decodparms = stream.get(SA.DECODE_PARMS, ({},) * len(filters))
     if not isinstance(decodparms, (list, tuple)):
         decodparms = (decodparms,)
-    data: bytes = stream._data
+    data: bytes = b_(stream._data)
     # If there is not data to decode we should not try to decode the data.
     if data:
         for filter_type, params in zip(filters, decodparms):

diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
@@ -785,7 +785,7 @@ def _reset_node_tree_relationship(child_obj: Any) -> None:
 
 class StreamObject(DictionaryObject):
     def __init__(self) -> None:
-        self._data: bytes = b""
+        self._data: Union[bytes, str] = b""
         self.decoded_self: Optional[DecodedStreamObject] = None
 
     def _clone(
@@ -820,7 +820,7 @@ def _clone(
 
     def hash_value_data(self) -> bytes:
         data = super().hash_value_data()
-        data += self._data
+        data += b_(self._data)
         return data
 
     @property
@@ -901,13 +901,13 @@ def flate_encode(self, level: int = -1) -> "EncodedStreamObject":
         retval[NameObject(SA.FILTER)] = f
         if parms is not None:
             retval[NameObject(SA.DECODE_PARMS)] = parms
-        retval._data = FlateDecode.encode(self._data, level)
+        retval._data = FlateDecode.encode(b_(self._data), level)
         return retval
 
 
 class DecodedStreamObject(StreamObject):
     def get_data(self) -> bytes:
-        return self._data
+        return b_(self._data)
 
     def set_data(self, data: bytes) -> None:
         self._data = data
@@ -935,7 +935,7 @@ def decodedSelf(self, value: DecodedStreamObject) -> None:  # deprecated
         deprecation_with_replacement("decodedSelf", "decoded_self", "3.0.0")
         self.decoded_self = value
 
-    def get_data(self) -> bytes:
+    def get_data(self) -> Union[bytes, str]:
         from ..filters import decode_stream_data
 
         if self.decoded_self is not None:
@@ -996,7 +996,7 @@ def __init__(
             if isinstance(stream, ArrayObject):
                 data = b""
                 for s in stream:
-                    data += s.get_object().get_data()
+                    data += b_(s.get_object().get_data())
                     if len(data) == 0 or data[-1] != b"\n":
                         data += b"\n"
                 stream_bytes = BytesIO(data)

diff --git a/tests/test_page.py b/tests/test_page.py
@@ -395,6 +395,11 @@ def test_iss_1142():
             "https://github.com/py-pdf/pypdf/files/9428434/TelemetryTX_EM.pdf",
             "tika-964029.pdf",
         ),  # no_ressources
+        (
+            # https://www.itu.int/rec/T-REC-X.25-199610-I/en
+            "https://github.com/py-pdf/pypdf/files/12423313/T-REC-X.25-199610-I.PDF-E.pdf",
+            "T-REC-X.25-199610-I!!PDF-E.pdf",
+        ),
     ],
 )
 def test_extract_text(url, name):