From 5d81e8e407763ca6a010f4e7d5884688e1665fbd Mon Sep 17 00:00:00 2001 From: Jake Stockwin Date: Thu, 26 Mar 2020 10:27:20 +0000 Subject: [PATCH 1/3] Updated misleading documentation about word_margin --- CHANGELOG.md | 2 +- docs/source/topics/converting_pdf_to_text.rst | 16 ++++++++-------- pdfminer/layout.py | 14 ++++++-------- tools/pdf2txt.py | 10 +++++----- 4 files changed, 20 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fc8f8b0c..ecf2c3bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## [Unreleased] ### Fixed - +- Updated misleading documentation for `word_margin` and `char_margin` ([#407](https://github.com/pdfminer/pdfminer.six/pull/407)) - Ignore ValueError when converting font encoding differences ([#389](https://github.com/pdfminer/pdfminer.six/pull/389)) - Grouping of text lines outside of parent container bounding box ([#386](https://github.com/pdfminer/pdfminer.six/pull/386)) diff --git a/docs/source/topics/converting_pdf_to_text.rst b/docs/source/topics/converting_pdf_to_text.rst index b192d49d..98555526 100644 --- a/docs/source/topics/converting_pdf_to_text.rst +++ b/docs/source/topics/converting_pdf_to_text.rst @@ -50,12 +50,12 @@ meaningful way. Each character has an x-coordinate and a y-coordinate for its bottom-left corner and upper-right corner, i.e. its bounding box. Pdfminer .six uses these bounding boxes to decide which characters belong together. -Characters that are both horizontally and vertically close are grouped. How -close they should be is determined by the `char_margin` (M in figure) and the -`line_overlap` (not in figure) parameter. The horizontal *distance* between the -bounding boxes of two characters should be smaller that the `char_margin` and -the vertical *overlap* between the bounding boxes should be smaller the the -`line_overlap`. 
+Characters that are both horizontally and vertically close are grouped onto +one line. How close they should be is determined by the `char_margin` +(M in figure) and the `line_overlap` (not in figure) parameter. The horizontal +*distance* between the bounding boxes of two characters should be smaller than +the `char_margin` and the vertical *overlap* between the bounding boxes should +be smaller than the `line_overlap`. .. raw:: html @@ -69,10 +69,10 @@ relative to the minimum height of either one of the bounding boxes. Spaces need to be inserted between characters because the PDF format has no notion of the space character. A space is inserted if the characters are further apart that the `word_margin` (W in the figure). The `word_margin` is -relative to the maximum width or height of the new character. Having a larger +relative to the maximum width or height of the new character. Having a smaller `word_margin` creates smaller words and inserts spaces between characters more often. Note that the `word_margin` should be smaller than the -`char_margin` otherwise all the characters are seperated by a space. +`char_margin` otherwise none of the characters will be separated by a space. The result of this stage is a list of lines. Each line consists a list of characters. These characters either original `LTChar` characters that diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 0a22c5a5..ca17f101 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -36,14 +36,12 @@ class LAParams: are considered to be on the same line. The overlap is specified relative to the minimum height of both characters. :param char_margin: If two characters are closer together than this - margin they are considered to be part of the same word. If - characters are on the same line but not part of the same word, an - intermediate space is inserted. The margin is specified relative to - the width of the character. 
- :param word_margin: If two words are are closer together than this - margin they are considered to be part of the same line. A space is - added in between for readability. The margin is specified relative - to the width of the word. + margin they are considered part of the same line. The margin is + specified relative to the width of the character. + :param word_margin: If two characters on the same line are further apart + than this margin then they are considered to be two separate words, and + an intermediate space will be added for readability. The margin is + specified relative to the width of the character. :param line_margin: If two lines are are close together they are considered to be part of the same paragraph. The margin is specified relative to the height of a line. diff --git a/tools/pdf2txt.py b/tools/pdf2txt.py index 23321357..4313f152 100755 --- a/tools/pdf2txt.py +++ b/tools/pdf2txt.py @@ -102,14 +102,14 @@ def maketheparser(): la_params.add_argument( "--char-margin", "-M", type=float, default=2.0, help="If two characters are closer together than this margin they " - "are considered to be part of the same word. The margin is " + "are considered to be part of the same line. The margin is " "specified relative to the width of the character.") la_params.add_argument( "--word-margin", "-W", type=float, default=0.1, - help="If two words are are closer together than this margin they " - "are considered to be part of the same line. A space is added " - "in between for readability. The margin is specified relative " - "to the width of the word.") + help="If two characters on the same line are further apart than this " + "margin then they are considered to be two separate words, and " + "an intermediate space will be added for readability. 
The margin " + "is specified relative to the width of the character.") la_params.add_argument( "--line-margin", "-L", type=float, default=0.5, help="If two lines are are close together they are considered to " From ad2a2dc50c6239141fe2bdbc36096e40dd8f1963 Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Thu, 26 Mar 2020 22:59:53 +0100 Subject: [PATCH 2/3] Small change in sentence about word_margin --- docs/source/topics/converting_pdf_to_text.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/topics/converting_pdf_to_text.rst b/docs/source/topics/converting_pdf_to_text.rst index 98555526..c1dd2f96 100644 --- a/docs/source/topics/converting_pdf_to_text.rst +++ b/docs/source/topics/converting_pdf_to_text.rst @@ -70,8 +70,8 @@ Spaces need to be inserted between characters because the PDF format has no notion of the space character. A space is inserted if the characters are further apart that the `word_margin` (W in the figure). The `word_margin` is relative to the maximum width or height of the new character. Having a smaller -`word_margin` creates smaller words and inserts spaces between characters -more often. Note that the `word_margin` should be smaller than the +`word_margin` creates smaller words and inserts more spaces between +characters. Note that the `word_margin` should be smaller than the `char_margin` otherwise none of the characters will be separated by a space. The result of this stage is a list of lines. 
Each line consists a list of From c026ac0509712ce0b1e3b0363f5e334b3e6f875e Mon Sep 17 00:00:00 2001 From: Pieter Marsman Date: Thu, 26 Mar 2020 23:01:38 +0100 Subject: [PATCH 3/3] Remove confusing sentence about adding spaces --- docs/source/topics/converting_pdf_to_text.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/topics/converting_pdf_to_text.rst b/docs/source/topics/converting_pdf_to_text.rst index c1dd2f96..1571b860 100644 --- a/docs/source/topics/converting_pdf_to_text.rst +++ b/docs/source/topics/converting_pdf_to_text.rst @@ -70,9 +70,9 @@ Spaces need to be inserted between characters because the PDF format has no notion of the space character. A space is inserted if the characters are further apart that the `word_margin` (W in the figure). The `word_margin` is relative to the maximum width or height of the new character. Having a smaller -`word_margin` creates smaller words and inserts more spaces between -characters. Note that the `word_margin` should be smaller than the -`char_margin` otherwise none of the characters will be separated by a space. +`word_margin` creates smaller words. Note that the `word_margin` should at +least be smaller than the `char_margin` otherwise none of the characters will +be separated by a space. The result of this stage is a list of lines. Each line consists a list of characters. These characters either original `LTChar` characters that