Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PI: Making pypdf as fast as pdfrw #2086

Merged
merged 17 commits into from
Aug 26, 2023
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pypdf/_encryption.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def encrypt_object(self, obj: PdfObject) -> PdfObject:
elif isinstance(obj, StreamObject):
obj2 = StreamObject()
obj2.update(obj)
obj2._data = self.stmCrypt.encrypt(obj._data)
obj2.set_data(self.stmCrypt.encrypt(obj._data))
obj = obj2
elif isinstance(obj, DictionaryObject):
obj2 = DictionaryObject() # type: ignore
Expand Down
6 changes: 2 additions & 4 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1116,12 +1116,10 @@ def _merge_page(
)

new_content_array = ArrayObject()

original_content = self.get_contents()
if original_content is not None:
new_content_array.append(
PageObject._push_pop_gs(original_content, self.pdf)
)
# new_content_stream = PageObject._push_pop_gs(original_content, self.pdf)
Lucas-C marked this conversation as resolved.
Show resolved Hide resolved
new_content_array.append(original_content)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the _push_pop_gs make the conversion to a ContentObject where the operators have been analysed. this is most propably the more consuming part of the code,however we d not need to do that : we can just append the new code at the beginning/end of the array (eventually create this array). Don't forget to add the extracontent to the object (required to be indirect ref)

Copy link
Member Author

@Lucas-C Lucas-C Aug 20, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the _push_pop_gs make the conversion to a ContentObject where the operators have been analysed. this is most propably the more consuming part of the code

Exactly! 😊

Don't forget to add the extracontent to the object (required to be indirect ref)

What do you mean by extracontent?
Is something missing with the current code in this PR?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

typo : I mean the extra content(text added to the first page content). What you have to remember is that stream objects that are composing the content must be added list of objects with the _add_object() function and then just store the indirect objects in the ArrayObject that is stored in the /Contents of the page Object.
Personnally, what I would do:
create an array object
copy in the existing streams. if the object is a content object, replace it with a Encoded Stream object(use of _replace_object() function)
insert at the beginning of the first encoded stream "q\n" using the set_data() function
insert at the end of the last stream the "Q\n"
then do what is requrired the Page2 content appending the encodedstream


page2content = page2.get_contents()
if page2content is not None:
Expand Down
134 changes: 74 additions & 60 deletions pypdf/generic/_data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -818,6 +818,20 @@ def _clone(
pass
super()._clone(src, pdf_dest, force_duplicate, ignore_fields)

def get_data(self) -> bytes:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def get_data(self) -> bytes:
def get_data(self) -> bytes:

So far from I remember _data does not have the same meaning in an encodedStream (where _data contains the compressed data) and DecodedStream(where the data are clear). raising up the set_data into content stream will leave people think set_data on an encoded stream is valid where the results are not good (some side fields needs to be set also)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So far from I remember _data does not have the same meaning in an encodedStream (where _data contains the compressed data) and DecodedStream(where the data are clear)

Interesting. Is this documented somewhere?

Accessing a private property (._data) from outside the ContentStream class seemed like a code smell to me,
but if there is a semantic meaning to using .data instead of get/set_data(),
I should take care about this.

raising up the set_data into content stream will leave people think set_data on an encoded stream is valid where the results are not good (some side fields needs to be set also)

I do not really understand: EncodedStreamObject.set_data() existed prior to this PR,
and has been used in several places, right?
But you mention that it should not be used / is not valid?

MartinThoma marked this conversation as resolved.
Show resolved Hide resolved
return self._data

def set_data(self, data: bytes) -> None:
self._data = data

def getData(self) -> Any: # deprecated
deprecation_with_replacement("getData", "get_data", "3.0.0")
return self._data

def setData(self, data: Any) -> None: # deprecated
deprecation_with_replacement("setData", "set_data", "3.0.0")
self.set_data(data)

def hash_value_data(self) -> bytes:
data = super().hash_value_data()
data += self._data
Expand All @@ -840,7 +854,7 @@ def write_to_stream(
deprecate_no_replacement(
"the encryption_key parameter of write_to_stream", "5.0.0"
)
self[NameObject(SA.LENGTH)] = NumberObject(len(self._data))
self[NameObject(SA.LENGTH)] = NumberObject(len(self.get_data()))
DictionaryObject.write_to_stream(self, stream)
del self[SA.LENGTH]
stream.write(b"\nstream\n")
Expand Down Expand Up @@ -906,19 +920,7 @@ def flate_encode(self, level: int = -1) -> "EncodedStreamObject":


class DecodedStreamObject(StreamObject):
def get_data(self) -> bytes:
return self._data

def set_data(self, data: bytes) -> None:
self._data = data

def getData(self) -> Any: # deprecated
deprecation_with_replacement("getData", "get_data", "3.0.0")
return self._data

def setData(self, data: Any) -> None: # deprecated
deprecation_with_replacement("setData", "set_data", "3.0.0")
self.set_data(data)
pass


class EncodedStreamObject(StreamObject):
Expand All @@ -935,6 +937,7 @@ def decodedSelf(self, value: DecodedStreamObject) -> None: # deprecated
deprecation_with_replacement("decodedSelf", "decoded_self", "3.0.0")
self.decoded_self = value

# This overrides the parent method:
def get_data(self) -> bytes:
from ..filters import decode_stream_data

Expand All @@ -945,35 +948,28 @@ def get_data(self) -> bytes:
# create decoded object
decoded = DecodedStreamObject()

decoded._data = decode_stream_data(self)
decoded.set_data(decode_stream_data(self))
for key, value in list(self.items()):
if key not in (SA.LENGTH, SA.FILTER, SA.DECODE_PARMS):
decoded[key] = value
self.decoded_self = decoded
return decoded._data

def getData(self) -> Union[None, str, bytes]: # deprecated
deprecation_with_replacement("getData", "get_data", "3.0.0")
return self.get_data()
return decoded.get_data()

# This overrides the parent method:
def set_data(self, data: bytes) -> None: # deprecated
from ..filters import FlateDecode

if self.get(SA.FILTER, "") == FT.FLATE_DECODE:
if not isinstance(data, bytes):
raise TypeError("data must be bytes")
assert self.decoded_self is not None
self.decoded_self._data = data
self._data = FlateDecode.encode(data)
self.decoded_self.set_data(data)
super().set_data(FlateDecode.encode(data))
else:
raise PdfReadError(
"Streams encoded with different filter from only FlateDecode is not supported"
)

def setData(self, data: Any) -> None: # deprecated
deprecation_with_replacement("setData", "set_data", "3.0.0")
return self.set_data(data)


class ContentStream(DecodedStreamObject):
def __init__(
Expand All @@ -987,26 +983,26 @@ def __init__(
# The inner list has two elements:
# Element 0: List
# Element 1: str
self.operations: List[Tuple[Any, Any]] = []
self._operations: List[Tuple[Any, Any]] = []

# stream may be a StreamObject or an ArrayObject containing
# multiple StreamObjects to be cat'd together.
if stream is not None:
if stream is None:
super().set_data(b"")
else:
stream = stream.get_object()
if isinstance(stream, ArrayObject):
data = b""
for s in stream:
data += s.get_object().get_data()
if len(data) == 0 or data[-1] != b"\n":
data += b"\n"
stream_bytes = BytesIO(data)
super().set_data(bytes(data))
else:
stream_data = stream.get_data()
assert stream_data is not None
stream_data_bytes = b_(stream_data) # this is necessary
stream_bytes = BytesIO(stream_data_bytes)
super().set_data(b_(stream_data))
self.forced_encoding = forced_encoding
self.__parse_content_stream(stream_bytes)

def clone(
self,
Expand Down Expand Up @@ -1058,13 +1054,15 @@ def _clone(
force_duplicate:
ignore_fields:
"""
src_cs = cast("ContentStream", src)
super().set_data(src_cs._data)
self.pdf = pdf_dest
self.operations = list(cast("ContentStream", src).operations)
self.forced_encoding = cast("ContentStream", src).forced_encoding
self._operations = list(src_cs._operations)
self.forced_encoding = src_cs.forced_encoding
# no need to call DictionaryObjection or anything
# like super(DictionaryObject,self)._clone(src, pdf_dest, force_duplicate, ignore_fields)

def __parse_content_stream(self, stream: StreamType) -> None:
def _parse_content_stream(self, stream: StreamType) -> None:
# 7.8.2 Content Streams
stream.seek(0, 0)
operands: List[Union[int, str, PdfObject]] = []
Expand All @@ -1080,9 +1078,9 @@ def __parse_content_stream(self, stream: StreamType) -> None:
# mechanism is required, of course... thanks buddy...
assert operands == []
ii = self._read_inline_image(stream)
self.operations.append((ii, b"INLINE IMAGE"))
self._operations.append((ii, b"INLINE IMAGE"))
else:
self.operations.append((operands, operator))
self._operations.append((operands, operator))
operands = []
elif peek == b"%":
# If we encounter a comment in the content stream, we have to
Expand Down Expand Up @@ -1173,29 +1171,45 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]:
data.write(info)
return {"settings": settings, "data": data.getvalue()}

@property # type: ignore
def _data(self) -> bytes: # type: ignore
new_data = BytesIO()
for operands, operator in self.operations:
if operator == b"INLINE IMAGE":
new_data.write(b"BI")
dict_text = BytesIO()
operands["settings"].write_to_stream(dict_text)
new_data.write(dict_text.getvalue()[2:-2])
new_data.write(b"ID ")
new_data.write(operands["data"])
new_data.write(b"EI")
else:
for op in operands:
op.write_to_stream(new_data)
new_data.write(b" ")
new_data.write(b_(operator))
new_data.write(b"\n")
return new_data.getvalue()

@_data.setter
def _data(self, value: bytes) -> None:
self.__parse_content_stream(BytesIO(value))

# This overrides the parent method:
def get_data(self) -> bytes:
if not self._data:
new_data = BytesIO()
for operands, operator in self._operations:
if operator == b"INLINE IMAGE":
new_data.write(b"BI")
dict_text = BytesIO()
operands["settings"].write_to_stream(dict_text)
new_data.write(dict_text.getvalue()[2:-2])
new_data.write(b"ID ")
new_data.write(operands["data"])
new_data.write(b"EI")
else:
for op in operands:
op.write_to_stream(new_data)
new_data.write(b" ")
new_data.write(b_(operator))
new_data.write(b"\n")
self._data = new_data.getvalue()
return self._data

# This overrides the parent method:
def set_data(self, data: bytes) -> None:
super().set_data(data)
self._operations = []

@property
def operations(self) -> List[Tuple[Any, Any]]:
if not self._operations and self._data:
self._parse_content_stream(BytesIO(self._data))
self._data = b""
return self._operations

@operations.setter
def operations(self, operations: List[Tuple[Any, Any]]) -> None:
self._operations = operations
self._data = b""


def read_object(
Expand Down
2 changes: 1 addition & 1 deletion tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,7 @@ class Tst: # to replace pdf
# TODO: What should happen with the stream?
assert do == {"/S": "/GoTo"}
if length in (6, 10):
assert b"BT /F1" in do._data
assert b"BT /F1" in do.get_data()
raise PdfReadError("__ALLGOOD__")
assert should_fail ^ (exc.value.args[0] == "__ALLGOOD__")

Expand Down
1 change: 1 addition & 0 deletions tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,7 @@ def test_remove_images(pdf_file_path, input_path):
reader = PdfReader(input_stream)
if input_path == "side-by-side-subfig.pdf":
extracted_text = reader.pages[0].extract_text()
assert extracted_text
assert "Lorem ipsum dolor sit amet" in extracted_text


Expand Down