Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: invalid cm/tm in visitor functions #2206

Merged
merged 15 commits into from
Oct 8, 2023
Merged
24 changes: 18 additions & 6 deletions docs/user/extract-text.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,26 @@ Refer to [extract\_text](../modules/PageObject.html#pypdf._page.PageObject.extra
You can use visitor-functions to control which part of a page you want to process and extract. The visitor-functions you provide will get called for each operator or for each text fragment.

The function provided in argument visitor_text of function extract_text has five arguments:
text, current transformation matrix, text matrix, font-dictionary and font-size.
In most cases the x and y coordinates of the current position
are in index 4 and 5 of the current transformation matrix.
* text : the current text (as long as possible, can be up to a full line)
pubpub-zz marked this conversation as resolved.
Show resolved Hide resolved
* user_matrix: current matrix to move from user coordinate space (a.k.a. CTM)
pubpub-zz marked this conversation as resolved.
Show resolved Hide resolved
* tm_matrix: current matrix from text coordinate space
* font-dictionary: full font dictionary
* font-size: the size (in text coordinate space)

Each matrix stores 6 parameters. The first 4 provide the rotation/scaling matrix and the last two provide the translation (horizontal/vertical).
MartinThoma marked this conversation as resolved.
Show resolved Hide resolved
It is recommended to use the user_matrix as it takes into account all transformations.
MartinThoma marked this conversation as resolved.
Show resolved Hide resolved

notes :
- As indicated in the PDF 1.7 reference, page 204, the user matrix applies to text space/image space/form space/pattern space.
- If you want to get the full transformation from text to user space, you can use the mult function (available in global import) as follows:
`txt2user = mult(tm, cm)`
The font-size is the raw text size, which is affected by the user_matrix.
MartinThoma marked this conversation as resolved.
Show resolved Hide resolved


The font-dictionary may be None in case of unknown fonts.
If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".

**Caveat**: In complicated documents the calculated positions might be wrong.
**Caveat**: In complicated documents the calculated positions may be difficult to interpret (if you move from multiple forms to page user space for example).

The function provided in argument visitor_operand_before has four arguments:
operator, operand-arguments, current transformation matrix and text matrix.
Expand All @@ -53,7 +65,7 @@ parts = []


def visitor_body(text, cm, tm, font_dict, font_size):
y = tm[5]
y = cm[5]
if y > 50 and y < 720:
parts.append(text)

Expand Down Expand Up @@ -88,7 +100,7 @@ def visitor_svg_rect(op, args, cm, tm):


def visitor_svg_text(text, cm, tm, fontDict, fontSize):
(x, y) = (tm[4], tm[5])
(x, y) = (cm[4], cm[5])
dwg.add(dwg.text(text, insert=(x, y), fill="blue"))


Expand Down
3 changes: 2 additions & 1 deletion pypdf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from ._crypt_providers import crypt_provider
from ._encryption import PasswordType
from ._merger import PdfFileMerger, PdfMerger
from ._page import PageObject, Transformation
from ._page import PageObject, Transformation, mult
from ._reader import DocumentInformation, PdfFileReader, PdfReader
from ._version import __version__
from ._writer import ObjectDeletionFlag, PdfFileWriter, PdfWriter
Expand All @@ -31,6 +31,7 @@
__all__ = [
"__version__",
"_debug_versions",
"mult",
"PageRange",
"PaperSize",
"DocumentInformation",
Expand Down
70 changes: 47 additions & 23 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1922,18 +1922,17 @@ def _extract_text(
# are strings where the byte->string encoding was unknown, so adding
# them to the text here would be gibberish.

cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
cm_stack = []
tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
tm_prev: List[float] = [
1.0,
0.0,
0.0,
1.0,
0.0,
0.0,
] # will store previous tm_matrix

# cm_prev/tm_prev store the last modified matrices, which can be an intermediate position
cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]

# memo_cm/tm will be used to store the position at the beginning of building the text
memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
char_scale = 1.0
space_scale = 1.0
_space_width: float = 500.0 # will be set correctly at first Tf
Expand All @@ -1944,9 +1943,9 @@ def current_spacewidth() -> float:
return _space_width / 1000.0

def process_operation(operator: bytes, operands: List) -> None:
nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, output, text
nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm
nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap
nonlocal orientations, rtl_dir, visitor_text
nonlocal orientations, rtl_dir, visitor_text, output, text
global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS

check_crlf_space: bool = False
Expand All @@ -1955,14 +1954,18 @@ def process_operation(operator: bytes, operands: List) -> None:
tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
return None
elif operator == b"ET":
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
# table 4.7 "Graphics state operators", page 219
# cm_matrix calculation is reserved for the moment
elif operator == b"q":
Expand Down Expand Up @@ -1993,7 +1996,7 @@ def process_operation(operator: bytes, operands: List) -> None:
elif operator == b"cm":
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
cm_matrix = mult(
[
Expand All @@ -2006,6 +2009,8 @@ def process_operation(operator: bytes, operands: List) -> None:
],
cm_matrix,
)
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
# Table 5.2 page 398
elif operator == b"Tz":
char_scale = float(operands[0]) / 100.0
Expand All @@ -2017,8 +2022,10 @@ def process_operation(operator: bytes, operands: List) -> None:
if text != "":
output += text # .translate(cmap)
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
try:
# charMapTuple: font_type, float(sp_width / 2), encoding,
# map_dict, font-dictionary
Expand Down Expand Up @@ -2089,17 +2096,19 @@ def process_operation(operator: bytes, operands: List) -> None:
try:
text, output, cm_prev, tm_prev = crlf_space_check(
text,
cm_prev,
tm_prev,
cm_matrix,
tm_matrix,
(cm_prev, tm_prev),
(cm_matrix, tm_matrix),
(memo_cm, memo_tm),
cmap,
orientations,
output,
font_size,
visitor_text,
current_spacewidth(),
)
if text == "":
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()
except OrientationNotFoundError:
return None

Expand Down Expand Up @@ -2131,12 +2140,18 @@ def process_operation(operator: bytes, operands: List) -> None:
elif operator == b"Do":
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
try:
if output[-1] != "\n":
output += "\n"
if visitor_text is not None:
visitor_text("\n", cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(
"\n",
memo_cm,
memo_tm,
cmap[3],
font_size,
)
except IndexError:
pass
try:
Expand All @@ -2152,21 +2167,30 @@ def process_operation(operator: bytes, operands: List) -> None:
)
output += text
if visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(
text,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
except Exception:
logger_warning(
f" impossible to decode XFormObject {operands[0]}",
__name__,
)
finally:
text = ""
memo_cm = cm_matrix.copy()
memo_tm = tm_matrix.copy()

else:
process_operation(operator, operands)
if visitor_operand_after is not None:
visitor_operand_after(operator, operands, cm_matrix, tm_matrix)
output += text # just in case of
if text != "" and visitor_text is not None:
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
return output

def extract_text(
Expand Down
31 changes: 19 additions & 12 deletions pypdf/_text_extraction/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,10 +87,9 @@ def orient(m: List[float]) -> int:

def crlf_space_check(
text: str,
cm_prev: List[float],
tm_prev: List[float],
cm_matrix: List[float],
tm_matrix: List[float],
cmtm_prev: Tuple[List[float], List[float]],
cmtm_matrix: Tuple[List[float], List[float]],
memo_cmtm: Tuple[List[float], List[float]],
cmap: Tuple[
Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
],
Expand All @@ -100,13 +99,21 @@ def crlf_space_check(
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
spacewidth: float,
) -> Tuple[str, str, List[float], List[float]]:
cm_prev = cmtm_prev[0]
tm_prev = cmtm_prev[1]
cm_matrix = cmtm_matrix[0]
tm_matrix = cmtm_matrix[1]
memo_cm = memo_cmtm[0]
memo_tm = memo_cmtm[1]

m_prev = mult(tm_prev, cm_prev)
m = mult(tm_matrix, cm_matrix)
orientation = orient(m)
delta_x = m[4] - m_prev[4]
delta_y = m[5] - m_prev[5]
k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
f = font_size * k
cm_prev = m
Copy link
Contributor

@troethe troethe Sep 21, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't look quite right. I think this line can however be safely removed, because cm_prev doesn't get accessed until it is being assigned to again at the end of this function (i.e. this is currently a noop).

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agree.
Good point to have noticed it . Actually I've forgot to pass the multiplied cm_matrix to the visitor

if orientation not in orientations:
raise OrientationNotFoundError
try:
Expand All @@ -117,8 +124,8 @@ def crlf_space_check(
if visitor_text is not None:
visitor_text(
text + "\n",
cm_prev,
tm_prev,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
Expand All @@ -136,8 +143,8 @@ def crlf_space_check(
if visitor_text is not None:
visitor_text(
text + "\n",
cm_prev,
tm_prev,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
Expand All @@ -155,8 +162,8 @@ def crlf_space_check(
if visitor_text is not None:
visitor_text(
text + "\n",
cm_prev,
tm_prev,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
Expand All @@ -174,8 +181,8 @@ def crlf_space_check(
if visitor_text is not None:
visitor_text(
text + "\n",
cm_prev,
tm_prev,
memo_cm,
memo_tm,
cmap[3],
font_size,
)
Expand Down
78 changes: 78 additions & 0 deletions tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1288,3 +1288,81 @@ def test_get_contents_from_nullobject():
p = writer.add_blank_page(100, 100)
p[NameObject("/Contents")] = writer._add_object(NullObject())
p.get_contents()


@pytest.mark.enable_socket()
def test_pos_text_in_textvisitor():
    """Check the text-matrix position handed to a text visitor (see #2200).

    The visitor records ``(tm[4], tm[5])`` for the text fragment starting
    with "5425."; the recorded pair is then compared with the expected
    coordinates using a tolerance of 0.1.
    """
    pdf_bytes = get_data_from_url(
        "https://github.com/py-pdf/pypdf/files/12675974/page_178.pdf",
        name="test_text_pos.pdf",
    )
    reader = PdfReader(BytesIO(pdf_bytes))
    p = ()

    def visitor_body2(text, cm, tm, fontdict, fontsize) -> None:
        nonlocal p
        # Ignore every fragment except the one under test.
        if not text.startswith("5425."):
            return
        p = (tm[4], tm[5])

    first_page = reader.pages[0]
    first_page.extract_text(visitor_text=visitor_body2)

    # Indexing (rather than unpacking) keeps the failure mode of an unset
    # ``p`` identical: an IndexError on the first access.
    assert abs(p[0] - 323.5) < 0.1
    assert abs(p[1] - 457.4) < 0.1


@pytest.mark.enable_socket()
def test_pos_text_in_textvisitor2():
    """Filter visited text by its text-matrix position (see #2075).

    The visitor keeps the stripped text of every fragment whose ``tm[4]``
    lies within 2 units of the currently selected x level and whose
    ``tm[5]`` lies strictly between 210 and 740. The page is extracted
    twice, once per x level, and the collected fragments are compared with
    the expected entries.
    """
    pdf_bytes = get_data_from_url(
        "https://github.com/py-pdf/pypdf/files/12318042/LegIndex-page6.pdf",
        name="LegIndex-page6.pdf",
    )
    reader = PdfReader(BytesIO(pdf_bytes))

    expected_level_26 = [
        "ACUPUNCTURE BOARD",
        "ACUPUNCTURISTS AND ACUPUNCTURE",
        "ADMINISTRATIVE LAW AND PROCEDURE",
        "ADMINISTRATIVE LAW, OFFICE OF",
        "ADOPTION",
        "ADULT EDUCATION",
        "ADVERTISING. See also MARKETING; and particular subject matter (e.g.,",
    ]
    expected_level_35 = [
        "members, AB 1264",
        "assistants, acupuncture, AB 1264",
        "complaints, investigations, etc., AB 1264",
        "day, california acupuncture, HR 48",
        "massage services, asian, AB 1264",
        "supervising acupuncturists, AB 1264",
        "supportive acupuncture services, basic, AB 1264",
        "rules and regulations—",
        "professional assistants and employees: employment and compensation, AB 916",
        "adults, adoption of, AB 1756",
        "agencies, organizations, etc.: requirements, prohibitions, etc., SB 807",
        "assistance programs, adoption: nonminor dependents, SB 9",
        "birth certificates, AB 1302",
        "contact agreements, postadoption—",
        "facilitators, adoption, AB 120",
        "failed adoptions: reproductive loss leave, SB 848",
        "hearings, adoption finalization: remote proceedings, technology, etc., SB 21",
        "native american tribes, AB 120",
        "parental rights, reinstatement of, AB 20",
        "parents, prospective adoptive: criminal background checks, SB 824",
        "services, adult educational, SB 877",
        "week, adult education, ACR 31",
        "alcoholic beverages: tied-house restrictions, AB 546",
        "campaign re social equity, civil rights, etc., SB 447",
        "cannabis, AB 794",
        "elections. See ELECTIONS.",
        "false, misleading, etc., advertising—",
        "hotels, short-term rentals, etc., advertised rates: mandatory fee disclosures, SB 683",
        "housing rental properties advertised rates: disclosures, SB 611",
    ]

    target_x = 26
    collected = []

    def visitor_lvl(text, cm, tm, fontdict, fontsize) -> None:
        # The closure reads the enclosing ``target_x`` / ``collected`` at call
        # time, so rebinding them between extractions is picked up here.
        if not abs(tm[4] - target_x) < 2:
            return
        y_pos = tm[5]
        if 210 < y_pos < 740:
            collected.append(text.strip(" \n"))

    # First pass: entries at x level 26.
    reader.pages[0].extract_text(visitor_text=visitor_lvl)
    assert collected == expected_level_26

    # Second pass: entries at x level 35, collected into a fresh list.
    target_x = 35
    collected = []
    reader.pages[0].extract_text(visitor_text=visitor_lvl)
    assert collected == expected_level_35
Loading