Skip to content

Commit

Permalink
Merge branch 'main' into qualified-form-update
Browse files Browse the repository at this point in the history
  • Loading branch information
MartinThoma authored Mar 14, 2023
2 parents bc0458d + 0afac1d commit 0dc5aa1
Show file tree
Hide file tree
Showing 16 changed files with 436 additions and 289 deletions.
16 changes: 16 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,21 @@
# CHANGELOG

## Version 3.5.2, 2023-03-12

⚠️ We discovered that `compress_content_streams` has to be applied to a page of
the PdfWriter. It must not be applied to a page of the PdfReader!

### Bug Fixes (BUG)
- compress_content_stream not readable in Adobe Acrobat (#1698)
- Pass logging parameters correctly in set_need_appearances_writer (#1697)
- Write /Root/AcroForm in set_need_appearances_writer (#1639)

### Robustness (ROB)
- Allow more whitespace within linearized file (#1701)

[Full Changelog](https://github.com/py-pdf/pypdf/compare/3.5.1...3.5.2)


## Version 3.5.1, 2023-03-05

### Robustness (ROB)
Expand Down
5 changes: 4 additions & 1 deletion docs/user/file-size.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,12 @@ reader = PdfReader("example.pdf")
writer = PdfWriter()

for page in reader.pages:
page.compress_content_streams() # This is CPU intensive!
writer.add_page(page)

for page in writer.pages:
# ⚠️ This has to be done on the writer, not the reader!
page.compress_content_streams() # This is CPU intensive!

with open("out.pdf", "wb") as f:
writer.write(f)
```
Expand Down
280 changes: 50 additions & 230 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@

from ._cmap import build_char_map, unknown_char_map
from ._protocols import PdfReaderProtocol, PdfWriterProtocol
from ._text_extraction import (
OrientationNotFoundError,
crlf_space_check,
handle_tj,
mult,
)
from ._utils import (
CompressedTransformationMatrix,
File,
Expand All @@ -72,60 +78,11 @@
NullObject,
NumberObject,
RectangleObject,
encode_pdfdocencoding,
)

CUSTOM_RTL_MIN: int = -1
CUSTOM_RTL_MAX: int = -1
CUSTOM_RTL_SPECIAL_CHARS: List[int] = []
MERGE_CROP_BOX = "cropbox" # pypdf<=3.4.0 used 'trimbox'


def set_custom_rtl(
    _min: Union[str, int, None] = None,
    _max: Union[str, int, None] = None,
    specials: Union[str, List[int], None] = None,
) -> Tuple[int, int, List[int]]:
    """
    Change the Right-To-Left and special characters custom parameters.

    Args:
        _min: The new minimum value for the range of custom characters that
            will be written right to left.
            If set to ``None``, the value will not be changed.
            If set to an integer, it is used as the character code as-is.
            If set to a string, it must be a single character and is
            converted to its code point with ``ord``.
            The default value is -1, which sets no additional range to be
            converted.
        _max: The new maximum value for the range of custom characters that
            will be written right to left.
            If set to ``None``, the value will not be changed.
            If set to an integer, it is used as the character code as-is.
            If set to a string, it must be a single character and is
            converted to its code point with ``ord``.
            The default value is -1, which sets no additional range to be
            converted.
        specials: The new list of special characters to be inserted in the
            current insertion order.
            If set to ``None``, the current value will not be changed.
            If set to a string, it will be converted to a list of the code
            points of its characters.
            If set to a list, a copy of it is stored.
            The default value is an empty list.

    Returns:
        A tuple containing the new values for ``CUSTOM_RTL_MIN``,
        ``CUSTOM_RTL_MAX``, and ``CUSTOM_RTL_SPECIAL_CHARS``.

    Raises:
        TypeError: If ``_min`` or ``_max`` is a string whose length is not
            exactly one (raised by ``ord``).
    """
    global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
    if isinstance(_min, int):
        CUSTOM_RTL_MIN = _min
    elif isinstance(_min, str):
        CUSTOM_RTL_MIN = ord(_min)
    if isinstance(_max, int):
        CUSTOM_RTL_MAX = _max
    elif isinstance(_max, str):
        CUSTOM_RTL_MAX = ord(_max)
    if isinstance(specials, str):
        CUSTOM_RTL_SPECIAL_CHARS = [ord(x) for x in specials]
    elif isinstance(specials, list):
        # Store a copy so that later mutation of the caller's list does not
        # silently alter the module-level configuration.
        CUSTOM_RTL_SPECIAL_CHARS = list(specials)
    return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS


def _get_rectangle(self: Any, name: str, defaults: Iterable[str]) -> RectangleObject:
retval: Union[None, RectangleObject, IndirectObject] = self.get(name)
if isinstance(retval, RectangleObject):
Expand Down Expand Up @@ -1515,9 +1472,25 @@ def compress_content_streams(self) -> None:
"""
content = self.get_contents()
if content is not None:
content_obj: Any
if not isinstance(content, ContentStream):
content = ContentStream(content, self.pdf)
self[NameObject(PG.CONTENTS)] = content.flate_encode()
content_obj = ContentStream(content, self.pdf)
else:
content_obj = content
content_obj = content_obj.flate_encode()
try:
content.indirect_reference.pdf._objects[ # type: ignore
content.indirect_reference.idnum - 1 # type: ignore
] = content_obj
except AttributeError:
if self.indirect_reference is not None and hasattr(
self.indirect_reference.pdf, "_add_object"
):
self[
NameObject(PG.CONTENTS)
] = self.indirect_reference.pdf._add_object(content_obj)
else:
raise ValueError("Page must be part of a PdfWriter")

def compressContentStreams(self) -> None: # deprecated
"""
Expand Down Expand Up @@ -1648,26 +1621,6 @@ def _extract_text(
TL = 0.0
font_size = 12.0 # init just in case of

def mult(m: List[float], n: List[float]) -> List[float]:
    """
    Multiply two six-element matrices given as flat lists.

    Each argument is indexed at positions 0 to 5; the result is a new
    six-element list combining ``m`` with ``n``.
    """
    m0, m1, m2, m3, m4, m5 = m[0], m[1], m[2], m[3], m[4], m[5]
    n0, n1, n2, n3, n4, n5 = n[0], n[1], n[2], n[3], n[4], n[5]

    # First two rows: plain 2x2 product of the leading four entries.
    top_left = m0 * n0 + m1 * n2
    top_right = m0 * n1 + m1 * n3
    bottom_left = m2 * n0 + m3 * n2
    bottom_right = m2 * n1 + m3 * n3

    # Last row: m's offset pair pushed through n, then n's own offset added.
    offset_x = m4 * n0 + m5 * n2 + n4
    offset_y = m4 * n1 + m5 * n3 + n5

    return [top_left, top_right, bottom_left, bottom_right, offset_x, offset_y]

def orient(m: List[float]) -> int:
    """
    Classify a matrix into one of four orientations, in degrees.

    Returns 0 when ``m[3]`` is above ``1e-6``, 180 when it is below
    ``-1e-6``; otherwise 90 when ``m[1]`` is positive and 270 in every
    remaining case.
    """
    threshold = 1e-6
    d_term = m[3]
    if d_term > threshold:
        return 0
    if d_term < -threshold:
        return 180
    # m[3] is (almost) zero: decide between the two sideways orientations.
    return 90 if m[1] > 0 else 270

def current_spacewidth() -> float:
    """Return the enclosing scope's ``_space_width`` divided by 1000."""
    scaled_width = _space_width / 1000.0
    return scaled_width

Expand Down Expand Up @@ -1799,169 +1752,36 @@ def process_operation(operator: bytes, operands: List) -> None:

elif operator == b"Tj":
check_crlf_space = True
m = mult(tm_matrix, cm_matrix)
orientation = orient(m)
if orientation in orientations:
if isinstance(operands[0], str):
text += operands[0]
else:
t: str = ""
tt: bytes = (
encode_pdfdocencoding(operands[0])
if isinstance(operands[0], str)
else operands[0]
)
if isinstance(cmap[0], str):
try:
t = tt.decode(
cmap[0], "surrogatepass"
) # apply str encoding
except Exception:
# the data does not match the expectation,
# we use the alternative ;
# text extraction may not be good
t = tt.decode(
"utf-16-be" if cmap[0] == "charmap" else "charmap",
"surrogatepass",
) # apply str encoding
else: # apply dict encoding
t = "".join(
[
cmap[0][x] if x in cmap[0] else bytes((x,)).decode()
for x in tt
]
)
# "\u0590 - \u08FF \uFB50 - \uFDFF"
for x in [cmap[1][x] if x in cmap[1] else x for x in t]:
# x can be a sequence of bytes ; ex: habibi.pdf
if len(x) == 1:
xx = ord(x)
else:
xx = 1
# fmt: off
if (
# cases where the current inserting order is kept
(xx <= 0x2F) # punctuations but...
or (0x3A <= xx and xx <= 0x40) # numbers (x30-39)
or (0x2000 <= xx and xx <= 0x206F) # upper punctuations..
or (0x20A0 <= xx and xx <= 0x21FF) # but (numbers) indices/exponents
or xx in CUSTOM_RTL_SPECIAL_CHARS # customized....
):
text = x + text if rtl_dir else text + x
elif ( # right-to-left characters set
(0x0590 <= xx and xx <= 0x08FF)
or (0xFB1D <= xx and xx <= 0xFDFF)
or (0xFE70 <= xx and xx <= 0xFEFF)
or (CUSTOM_RTL_MIN <= xx and xx <= CUSTOM_RTL_MAX)
):
if not rtl_dir:
rtl_dir = True
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
text = ""
text = x + text
else: # left-to-right
# print(">",xx,x,end="")
if rtl_dir:
rtl_dir = False
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
text = ""
text = text + x
# fmt: on
text, rtl_dir = handle_tj(
text,
operands,
cm_matrix,
tm_matrix, # text matrix
cmap,
orientations,
output,
font_size,
rtl_dir,
visitor_text,
)
else:
return None
if check_crlf_space:
m = mult(tm_matrix, cm_matrix)
orientation = orient(m)
delta_x = m[4] - tm_prev[4]
delta_y = m[5] - tm_prev[5]
k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
f = font_size * k
tm_prev = m
if orientation not in orientations:
return None
try:
if orientation == 0:
if delta_y < -0.8 * f:
if (output + text)[-1] != "\n":
output += text + "\n"
if visitor_text is not None:
visitor_text(
text + "\n",
cm_matrix,
tm_matrix,
cmap[3],
font_size,
)
text = ""
elif (
abs(delta_y) < f * 0.3
and abs(delta_x) > current_spacewidth() * f * 15
and (output + text)[-1] != " "
):
text += " "
elif orientation == 180:
if delta_y > 0.8 * f:
if (output + text)[-1] != "\n":
output += text + "\n"
if visitor_text is not None:
visitor_text(
text + "\n",
cm_matrix,
tm_matrix,
cmap[3],
font_size,
)
text = ""
elif (
abs(delta_y) < f * 0.3
and abs(delta_x) > current_spacewidth() * f * 15
and (output + text)[-1] != " "
):
text += " "
elif orientation == 90:
if delta_x > 0.8 * f:
if (output + text)[-1] != "\n":
output += text + "\n"
if visitor_text is not None:
visitor_text(
text + "\n",
cm_matrix,
tm_matrix,
cmap[3],
font_size,
)
text = ""
elif (
abs(delta_x) < f * 0.3
and abs(delta_y) > current_spacewidth() * f * 15
and (output + text)[-1] != " "
):
text += " "
elif orientation == 270:
if delta_x < -0.8 * f:
if (output + text)[-1] != "\n":
output += text + "\n"
if visitor_text is not None:
visitor_text(
text + "\n",
cm_matrix,
tm_matrix,
cmap[3],
font_size,
)
text = ""
elif (
abs(delta_x) < f * 0.3
and abs(delta_y) > current_spacewidth() * f * 15
and (output + text)[-1] != " "
):
text += " "
except Exception:
pass
text, output, tm_prev = crlf_space_check(
text,
tm_prev,
cm_matrix,
tm_matrix,
cmap,
orientations,
output,
font_size,
visitor_text,
current_spacewidth(),
)
except OrientationNotFoundError:
return None

for operands, operator in content.operations:
if visitor_operand_before is not None:
Expand Down
2 changes: 1 addition & 1 deletion pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -1801,7 +1801,7 @@ def _read_xref_other_error(
return startxref
# No explicit xref table, try finding a cross-reference stream.
stream.seek(startxref, 0)
for look in range(5):
for look in range(25): # value extended to cope with more linearized files
if stream.read(1).isdigit():
# This is not a standard PDF, consider adding a warning
startxref += look
Expand Down
Loading

0 comments on commit 0dc5aa1

Please sign in to comment.