Skip to content

Commit

Permalink
PI: Don't load the entire file into memory when passed a file name
Browse files Browse the repository at this point in the history
This halves the allocated memory when doing a simple
`PdfWriter(clone_from=«str»)`.

I can't just close `self.stream` in `__del__` because, for some strange
reason, the unit tests mark it as unflagged even after the test block
ends. This seems to be related to `__del__` finalizers being run on a
second pass, while `weakref.finalize()` callbacks are run on the first pass.
  • Loading branch information
mjsir911 committed May 17, 2024
1 parent 18519a0 commit 956639e
Showing 1 changed file with 10 additions and 3 deletions.
13 changes: 10 additions & 3 deletions pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,10 @@
import os
import re
import struct
import weakref
import zlib
from datetime import datetime
from io import BytesIO, UnsupportedOperation
from io import BytesIO, FileIO, UnsupportedOperation
from pathlib import Path
from typing import (
Any,
Expand Down Expand Up @@ -310,9 +311,11 @@ def __init__(
"It may not be read correctly.",
__name__,
)

if isinstance(stream, (str, Path)):
with open(stream, "rb") as fh:
stream = BytesIO(fh.read())
stream = FileIO(stream, "rb")
weakref.finalize(self, stream.close)

self.read(stream)
self.stream = stream

Expand Down Expand Up @@ -342,6 +345,10 @@ def __init__(
elif password is not None:
raise PdfReadError("Not encrypted file")

def close(self) -> None:
    """
    Close the underlying file handle.

    Calls ``close()`` on ``self.stream``. Does nothing when the reader has
    no ``stream`` attribute or when it is ``None``, so the method is safe to
    call on a reader whose constructor did not get as far as storing the
    stream.
    """
    # The constructor stores ``self.stream`` only after ``self.read(stream)``
    # has returned, so the attribute can be missing if reading raised.
    stream = getattr(self, "stream", None)
    if stream is not None:
        stream.close()

@property
def root_object(self) -> DictionaryObject:
"""Provide access to "/Root". standardized with PdfWriter."""
Expand Down

0 comments on commit 956639e

Please sign in to comment.