Skip to content

Commit

Permalink
pdf2txt: clean up construction of LAParams from arguments (#682)
Browse files Browse the repository at this point in the history
* Fix pdf2txt --boxes-flow=disabled

Fixes:
```
$ pdf2txt.py --boxes-flow=disabled test.pdf
Traceback (most recent call last):
  File "tools/pdf2txt.py", line 204, in <module>
    sys.exit(main())
  File "tools/pdf2txt.py", line 198, in main
    outfp = extract_text(**vars(A))
  File "tools/pdf2txt.py", line 66, in extract_text
    pdfminer.high_level.extract_text_to_fp(fp, **locals())
  File "pdfminer/high_level.py", line 85, in extract_text_to_fp
    interpreter.process_page(page)
  File "pdfminer/pdfinterp.py", line 896, in process_page
    self.device.end_page(page)
  File "pdfminer/converter.py", line 51, in end_page
    self.cur_item.analyze(self.laparams)
  File "pdfminer/layout.py", line 822, in analyze
    group.analyze(laparams)
  File "pdfminer/layout.py", line 575, in analyze
    LTTextGroup.analyze(self, laparams)
  File "pdfminer/layout.py", line 362, in analyze
    obj.analyze(laparams)
  File "pdfminer/layout.py", line 575, in analyze
    LTTextGroup.analyze(self, laparams)
  File "pdfminer/layout.py", line 362, in analyze
    obj.analyze(laparams)
  File "pdfminer/layout.py", line 575, in analyze
    LTTextGroup.analyze(self, laparams)
  File "pdfminer/layout.py", line 362, in analyze
    obj.analyze(laparams)
  File "pdfminer/layout.py", line 577, in analyze
    self._objs.sort(
  File "pdfminer/layout.py", line 578, in <lambda>
    key=lambda obj: (1 - laparams.boxes_flow) * obj.x0
TypeError: unsupported operand type(s) for -: 'int' and 'str'
```

Related: Issue #477, PR #479

* update CHANGELOG

* merge CHANGELOG

* pdf2txt: clean up handling of layout parameter arguments
 * avoid specifying default values twice
 * construct LAParams earlier, rather than passing its components around
 * fix crash with --boxes-flow=disabled

* update CHANGELOG

* construct new LAParams, so _validate runs

* Improve readability of setting LAParams by explicitly copying the parsed values from parsed_args into the LAParams constructor, and move all parsed_args post-processing into the parse_args() function.

* Add cli argument for line_overlap

* Also use default values from LAParams for --detect-vertical and --all-texts

Co-authored-by: Pieter Marsman <pietermarsman@gmail.com>
  • Loading branch information
0xabu and pietermarsman committed Jan 25, 2022
1 parent aa5dec2 commit d87bd02
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 58 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
- Handle decompression error due to CRC checksum error ([#637](https://github.com/pdfminer/pdfminer.six/pull/637))
- Add handling of JPXDecode filter to enable extraction of images for some pdfs ([#645](https://github.com/pdfminer/pdfminer.six/pull/645))
- Fix extraction of jbig2 files, which was producing invalid files ([#652](https://github.com/pdfminer/pdfminer.six/pull/653))
- Crash in `pdf2txt.py --boxes-flow=disabled` ([#682](https://github.com/pdfminer/pdfminer.six/pull/682))

## [20211012]

Expand Down
115 changes: 57 additions & 58 deletions tools/pdf2txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import argparse
import logging
import sys
from typing import Any, Container, Iterable, List, Optional, Union
from typing import Any, Container, Iterable, List, Optional

import pdfminer.high_level
from pdfminer.layout import LAParams
Expand All @@ -17,12 +17,10 @@
(".xml", "xml"),
(".tag", "tag"))

FloatOrDisabled = Union[float, str] # Union[float, Literal["disabled"]]


def float_or_disabled(x: str) -> FloatOrDisabled:
def float_or_disabled(x: str) -> Optional[float]:
if x.lower().strip() == "disabled":
return "disabled"
return None
try:
return float(x)
except ValueError:
Expand All @@ -32,13 +30,7 @@ def float_or_disabled(x: str) -> FloatOrDisabled:
def extract_text(
files: Iterable[str] = [],
outfile: str = '-',
no_laparams: bool = False,
all_texts: Optional[bool] = None,
detect_vertical: Optional[bool] = None,
word_margin: Optional[float] = None,
char_margin: Optional[float] = None,
line_margin: Optional[float] = None,
boxes_flow: Optional[FloatOrDisabled] = None,
laparams: Optional[LAParams] = None,
output_type: str = 'text',
codec: str = 'utf-8',
strip_control: bool = False,
Expand All @@ -56,19 +48,6 @@ def extract_text(
if not files:
raise ValueError("Must provide files to work upon!")

# If any LAParams group arguments were passed,
# create an LAParams object and
# populate with given args. Otherwise, set it to None.
if not no_laparams:
laparams: Optional[LAParams] = LAParams()
for param in ("all_texts", "detect_vertical", "word_margin",
"char_margin", "line_margin", "boxes_flow"):
paramv = locals().get(param, None)
if paramv is not None:
setattr(laparams, param, paramv)
else:
laparams = None

if output_type == "text" and outfile != "-":
for override, alttype in OUTPUT_TYPES:
if outfile.endswith(override):
Expand All @@ -87,7 +66,7 @@ def extract_text(
return outfp


def maketheparser() -> argparse.ArgumentParser:
def parse_args(args: Optional[List[str]]) -> argparse.Namespace:
parser = argparse.ArgumentParser(description=__doc__, add_help=True)
parser.add_argument(
"files", type=str, default=None, nargs="+",
Expand Down Expand Up @@ -124,41 +103,49 @@ def maketheparser() -> argparse.ArgumentParser:
help="The number of degrees to rotate the PDF "
"before other types of processing.")

la_params = parser.add_argument_group(
la_params = LAParams() # will be used for defaults
la_param_group = parser.add_argument_group(
'Layout analysis', description='Used during layout analysis.')
la_params.add_argument(
la_param_group.add_argument(
"--no-laparams", "-n", default=False, action="store_true",
help="If layout analysis parameters should be ignored.")
la_params.add_argument(
"--detect-vertical", "-V", default=False, action="store_true",
la_param_group.add_argument(
"--detect-vertical", "-V", default=la_params.detect_vertical,
action="store_true",
help="If vertical text should be considered during layout analysis")
la_params.add_argument(
"--char-margin", "-M", type=float, default=2.0,
la_param_group.add_argument(
"--line-overlap", type=float, default=la_params.line_overlap,
help='If two characters have more overlap than this they '
'are considered to be on the same line. The overlap is specified '
'relative to the minimum height of both characters.')
la_param_group.add_argument(
"--char-margin", "-M", type=float, default=la_params.char_margin,
help="If two characters are closer together than this margin they "
"are considered to be part of the same line. The margin is "
"specified relative to the width of the character.")
la_params.add_argument(
"--word-margin", "-W", type=float, default=0.1,
la_param_group.add_argument(
"--word-margin", "-W", type=float, default=la_params.word_margin,
help="If two characters on the same line are further apart than this "
"margin then they are considered to be two separate words, and "
"an intermediate space will be added for readability. The margin "
"is specified relative to the width of the character.")
la_params.add_argument(
"--line-margin", "-L", type=float, default=0.5,
help="If two lines are are close together they are considered to "
la_param_group.add_argument(
"--line-margin", "-L", type=float, default=la_params.line_margin,
help="If two lines are close together they are considered to "
"be part of the same paragraph. The margin is specified "
"relative to the height of a line.")
la_params.add_argument(
"--boxes-flow", "-F", type=float_or_disabled, default=0.5,
la_param_group.add_argument(
"--boxes-flow", "-F", type=float_or_disabled,
default=la_params.boxes_flow,
help="Specifies how much a horizontal and vertical position of a "
"text matters when determining the order of lines. The value "
"should be within the range of -1.0 (only horizontal position "
"matters) to +1.0 (only vertical position matters). You can also "
"pass `disabled` to disable advanced layout analysis, and "
"instead return text based on the position of the bottom left "
"corner of the text box.")
la_params.add_argument(
"--all-texts", "-A", default=False, action="store_true",
la_param_group.add_argument(
"--all-texts", "-A", default=la_params.all_texts, action="store_true",
help="If layout analysis should be performed on text in figures.")

output_params = parser.add_argument_group(
Expand Down Expand Up @@ -194,28 +181,40 @@ def maketheparser() -> argparse.ArgumentParser:
"--strip-control", "-S", default=False, action="store_true",
help="Remove control statement from text. "
"Only used when output_type is xml.")
return parser

parsed_args = parser.parse_args(args=args)

# main


def main(args: Optional[List[str]] = None) -> int:
# Propagate parsed layout parameters to LAParams object
if parsed_args.no_laparams:
parsed_args.laparams = None
else:
parsed_args.laparams = LAParams(
line_overlap=parsed_args.line_overlap,
char_margin=parsed_args.char_margin,
line_margin=parsed_args.line_margin,
word_margin=parsed_args.word_margin,
boxes_flow=parsed_args.boxes_flow,
detect_vertical=parsed_args.detect_vertical,
all_texts=parsed_args.all_texts,
)

if parsed_args.page_numbers:
parsed_args.page_numbers = {x-1 for x in parsed_args.page_numbers}

if parsed_args.pagenos:
parsed_args.page_numbers = {int(x)-1 for x in parsed_args.pagenos.split(",")}

if parsed_args.output_type == "text" and parsed_args.outfile != "-":
for override, alttype in OUTPUT_TYPES:
if parsed_args.outfile.endswith(override):
parsed_args.output_type = alttype

P = maketheparser()
A = P.parse_args(args=args)
return parsed_args

if A.page_numbers:
A.page_numbers = {x-1 for x in A.page_numbers}
if A.pagenos:
A.page_numbers = {int(x)-1 for x in A.pagenos.split(",")}

if A.output_type == "text" and A.outfile != "-":
for override, alttype in OUTPUT_TYPES:
if A.outfile.endswith(override):
A.output_type = alttype

outfp = extract_text(**vars(A))
def main(args: Optional[List[str]] = None) -> int:
parsed_args = parse_args(args)
outfp = extract_text(**vars(parsed_args))
outfp.close()
return 0

Expand Down

0 comments on commit d87bd02

Please sign in to comment.