Skip to content

Commit

Permalink
BUG: TypeError: can't concat str to bytes (#2114)
Browse files Browse the repository at this point in the history
This bug was introduced when I removed seemingly unnecessary calls to b_,
a helper function that converts Union[bytes, str] to bytes.

Root-cause: Too little test coverage + wrong type annotations
Caused-by: 3033122

Closes #2111
  • Loading branch information
MartinThoma authored Aug 24, 2023
1 parent cbeed04 commit f16f434
Show file tree
Hide file tree
Showing 5 changed files with 21 additions and 14 deletions.
6 changes: 3 additions & 3 deletions pypdf/_encryption.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
from pypdf._crypt_providers import rc4_decrypt as RC4_decrypt # noqa: N812
from pypdf._crypt_providers import rc4_encrypt as RC4_encrypt # noqa: N812

from ._utils import logger_warning
from ._utils import b_, logger_warning
from .generic import (
ArrayObject,
ByteStringObject,
Expand Down Expand Up @@ -75,7 +75,7 @@ def encrypt_object(self, obj: PdfObject) -> PdfObject:
elif isinstance(obj, StreamObject):
obj2 = StreamObject()
obj2.update(obj)
obj2._data = self.stmCrypt.encrypt(obj._data)
obj2._data = self.stmCrypt.encrypt(b_(obj._data))
obj = obj2
elif isinstance(obj, DictionaryObject):
obj2 = DictionaryObject() # type: ignore
Expand All @@ -91,7 +91,7 @@ def decrypt_object(self, obj: PdfObject) -> PdfObject:
data = self.strCrypt.decrypt(obj.original_bytes)
obj = create_string_object(data)
elif isinstance(obj, StreamObject):
obj._data = self.stmCrypt.decrypt(obj._data)
obj._data = self.stmCrypt.decrypt(b_(obj._data))
elif isinstance(obj, DictionaryObject):
for key, value in obj.items():
obj[key] = self.decrypt_object(value)
Expand Down
7 changes: 4 additions & 3 deletions pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
from ._utils import (
StrByteType,
StreamType,
b_,
deprecate_no_replacement,
deprecation_no_replacement,
deprecation_with_replacement,
Expand Down Expand Up @@ -1256,7 +1257,7 @@ def _get_object_from_stream(
assert cast(str, obj_stm["/Type"]) == "/ObjStm"
# /N is the number of indirect objects in the stream
assert idx < obj_stm["/N"]
stream_data = BytesIO(obj_stm.get_data())
stream_data = BytesIO(b_(obj_stm.get_data()))
for i in range(obj_stm["/N"]): # type: ignore
read_non_whitespace(stream_data)
stream_data.seek(-1, 1)
Expand Down Expand Up @@ -1867,7 +1868,7 @@ def _read_pdf15_xref_stream(
xrefstream = cast(ContentStream, read_object(stream, self))
assert cast(str, xrefstream["/Type"]) == "/XRef"
self.cache_indirect_object(generation, idnum, xrefstream)
stream_data = BytesIO(xrefstream.get_data())
stream_data = BytesIO(b_(xrefstream.get_data()))
# Index pairs specify the subsections in the dictionary. If there are
# none, create one subsection that spans everything.
idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])
Expand Down Expand Up @@ -2118,7 +2119,7 @@ def xfa(self) -> Optional[Dict[str, Any]]:
if isinstance(f, IndirectObject):
field = cast(Optional[EncodedStreamObject], f.get_object())
if field:
es = zlib.decompress(field._data)
es = zlib.decompress(b_(field._data))
retval[tag] = es
return retval

Expand Down
5 changes: 3 additions & 2 deletions pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
from typing import Any, Dict, List, Optional, Tuple, Union, cast

from ._utils import (
b_,
deprecate_with_replacement,
logger_warning,
ord_,
Expand Down Expand Up @@ -655,7 +656,7 @@ def decode(
return tiff_header + data


def decode_stream_data(stream: Any) -> bytes: # utils.StreamObject
def decode_stream_data(stream: Any) -> Union[bytes, str]: # utils.StreamObject
"""
Decode the stream data based on the specified filters.
Expand All @@ -682,7 +683,7 @@ def decode_stream_data(stream: Any) -> bytes: # utils.StreamObject
decodparms = stream.get(SA.DECODE_PARMS, ({},) * len(filters))
if not isinstance(decodparms, (list, tuple)):
decodparms = (decodparms,)
data: bytes = stream._data
data: bytes = b_(stream._data)
# If there is no data to decode, we should not try to decode it.
if data:
for filter_type, params in zip(filters, decodparms):
Expand Down
12 changes: 6 additions & 6 deletions pypdf/generic/_data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -785,7 +785,7 @@ def _reset_node_tree_relationship(child_obj: Any) -> None:

class StreamObject(DictionaryObject):
def __init__(self) -> None:
self._data: bytes = b""
self._data: Union[bytes, str] = b""
self.decoded_self: Optional[DecodedStreamObject] = None

def _clone(
Expand Down Expand Up @@ -820,7 +820,7 @@ def _clone(

def hash_value_data(self) -> bytes:
data = super().hash_value_data()
data += self._data
data += b_(self._data)
return data

@property
Expand Down Expand Up @@ -901,13 +901,13 @@ def flate_encode(self, level: int = -1) -> "EncodedStreamObject":
retval[NameObject(SA.FILTER)] = f
if parms is not None:
retval[NameObject(SA.DECODE_PARMS)] = parms
retval._data = FlateDecode.encode(self._data, level)
retval._data = FlateDecode.encode(b_(self._data), level)
return retval


class DecodedStreamObject(StreamObject):
def get_data(self) -> bytes:
return self._data
return b_(self._data)

def set_data(self, data: bytes) -> None:
self._data = data
Expand Down Expand Up @@ -935,7 +935,7 @@ def decodedSelf(self, value: DecodedStreamObject) -> None: # deprecated
deprecation_with_replacement("decodedSelf", "decoded_self", "3.0.0")
self.decoded_self = value

def get_data(self) -> bytes:
def get_data(self) -> Union[bytes, str]:
from ..filters import decode_stream_data

if self.decoded_self is not None:
Expand Down Expand Up @@ -996,7 +996,7 @@ def __init__(
if isinstance(stream, ArrayObject):
data = b""
for s in stream:
data += s.get_object().get_data()
data += b_(s.get_object().get_data())
if len(data) == 0 or data[-1] != b"\n":
data += b"\n"
stream_bytes = BytesIO(data)
Expand Down
5 changes: 5 additions & 0 deletions tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,11 @@ def test_iss_1142():
"https://github.com/py-pdf/pypdf/files/9428434/TelemetryTX_EM.pdf",
"tika-964029.pdf",
), # no_resources
(
# https://www.itu.int/rec/T-REC-X.25-199610-I/en
"https://github.com/py-pdf/pypdf/files/12423313/T-REC-X.25-199610-I.PDF-E.pdf",
"T-REC-X.25-199610-I!!PDF-E.pdf",
),
],
)
def test_extract_text(url, name):
Expand Down

0 comments on commit f16f434

Please sign in to comment.