Skip to content

Commit

Permalink
Standardize handling of cropbox
Browse files Browse the repository at this point in the history
... fixing various issues with PageImage. Also adds
force_mediabox parameter to Page.to_image(...).

Thanks to @stefanw for flagging:
    #1054
  • Loading branch information
jsvine committed Jan 5, 2024
1 parent f4f4473 commit 07d9997
Show file tree
Hide file tree
Showing 7 changed files with 97 additions and 50 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@ All notable changes to this project will be documented in this file. The format
- Add support for PDF 1.3 logical structure via `Page.structure_tree` (h/t @dhdaines). ([#963](https://github.com/jsvine/pdfplumber/pulls/963))
- Add "gswin64c" as another possible Ghostscript executable in `repair.py` (h/t @echedey-ls). ([#1032](https://github.com/jsvine/pdfplumber/issues/1032))
- Re-add `Page.close()` method, have `PDF.close()` close all pages as well, and improve relevant documentation (h/t @luketudge). ([#1042](https://github.com/jsvine/pdfplumber/issues/1042))
- Add `force_mediabox` parameter to `Page.to_image(...)`. ([#1054](https://github.com/jsvine/pdfplumber/issues/1054))

### Fixed

- Standardize handling of cropbox, fixing various issues with PageImage. ([#1054](https://github.com/jsvine/pdfplumber/issues/1054))
- Fix `Page.get_textmap` caching to allow for `extra_attrs=[...]`, by preconverting list kwargs to tuples. ([#1030](https://github.com/jsvine/pdfplumber/issues/1030))


Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,7 @@ To turn any page (including cropped pages) into an `PageImage` object, call `my_
- `width`: The desired image width in pixels. Default: unset, determined by `resolution`. Type: `int`.
- `height`: The desired image height in pixels. Default: unset, determined by `resolution`. Type: `int`.
- `antialias`: Whether to use antialiasing when creating the image. Setting to `True` creates images with less-jagged text and graphics, but with larger file sizes. Default: `False`. Type: `bool`.
- `force_mediabox`: Use the page's `.mediabox` dimensions, rather than the `.cropbox` dimensions. Default: `False`. Type: `bool`.

For instance:

Expand Down
60 changes: 35 additions & 25 deletions pdfplumber/display.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,12 @@ def __init__(
original: Optional[PIL.Image.Image] = None,
resolution: Union[int, float] = DEFAULT_RESOLUTION,
antialias: bool = False,
force_mediabox: bool = False,
):
self.page = page
self.root = page if page.is_original else page.root_page
self.resolution = resolution

if original is None:
self.original = get_page_image(
stream=page.pdf.stream,
Expand All @@ -92,43 +96,49 @@ def __init__(
else:
self.original = original

if page.is_original:
self.root = page
cropped = False
else:
self.root = page.root_page
cropped = page.root_page.bbox != page.bbox
self.scale = self.original.size[0] / (page.cropbox[2] - page.cropbox[0])

self.resolution = resolution
self.scale = self.original.size[0] / self.root.width

if cropped:
cropbox = (
int((page.bbox[0] - page.root_page.bbox[0]) * self.scale),
int((page.bbox[1] - page.root_page.bbox[1]) * self.scale),
int((page.bbox[2] - page.root_page.bbox[0]) * self.scale),
int((page.bbox[3] - page.root_page.bbox[1]) * self.scale),
# This value represents the coordinates of the page,
# in page-unit values, that will be displayed.
self.bbox = (
page.bbox
if page.bbox != page.mediabox
else (page.mediabox if force_mediabox else page.cropbox)
)

# If this value is different than the *Page*'s .cropbox
# (e.g., because the mediabox differs from the cropbox or
# because we've used Page.crop(...)), then we'll need to
# crop the initially-converted image.
if page.bbox != page.cropbox:
crop_dims = self._reproject_bbox(page.cropbox)
bbox_dims = self._reproject_bbox(self.bbox)
self.original = self.original.crop(
(
bbox_dims[0] - crop_dims[0],
bbox_dims[1] - crop_dims[1],
bbox_dims[2] - crop_dims[0],
bbox_dims[3] - crop_dims[1],
)
)
self.original = self.original.crop(cropbox)

self.reset()

def _reproject_bbox(self, bbox: T_bbox) -> T_bbox:
def _reproject_bbox(self, bbox: T_bbox) -> Tuple[int, int, int, int]:
x0, top, x1, bottom = bbox
_x0, _top = self._reproject((x0, top))
_x1, _bottom = self._reproject((x1, bottom))
return (_x0, _top, _x1, _bottom)

def _reproject(self, coord: T_point) -> T_point:
def _reproject(self, coord: T_point) -> Tuple[int, int]:
"""
Given an (x0, top) tuple from the *root* coordinate system,
return an (x0, top) tuple in the *image* coordinate system.
"""
x0, top = coord
px0, ptop = self.page.bbox[:2]
rx0, rtop = self.root.bbox[:2]
_x0 = (x0 + rx0 - px0) * self.scale
_top = (top + rtop - ptop) * self.scale
return (_x0, _top)
_x0 = (x0 - self.bbox[0]) * self.scale
_top = (top - self.bbox[1]) * self.scale
return (int(_x0), int(_top))

def reset(self) -> "PageImage":
self.annotated = PIL.Image.new("RGB", self.original.size)
Expand Down Expand Up @@ -202,7 +212,7 @@ def draw_vline(
stroke: T_color = DEFAULT_STROKE,
stroke_width: int = DEFAULT_STROKE_WIDTH,
) -> "PageImage":
points = (location, self.page.bbox[1], location, self.page.bbox[3])
points = (location, self.bbox[1], location, self.bbox[3])
self.draw.line(self._reproject_bbox(points), fill=stroke, width=stroke_width)
return self

Expand All @@ -222,7 +232,7 @@ def draw_hline(
stroke: T_color = DEFAULT_STROKE,
stroke_width: int = DEFAULT_STROKE_WIDTH,
) -> "PageImage":
points = (self.page.bbox[0], location, self.page.bbox[2], location)
points = (self.bbox[0], location, self.bbox[2], location)
self.draw.line(self._reproject_bbox(points), fill=stroke, width=stroke_width)
return self

Expand Down
74 changes: 50 additions & 24 deletions pdfplumber/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,27 @@ def new_func(**kwargs: Any) -> TextMap:
return new_func


def _normalize_box(box_raw: T_bbox, rotation: T_num = 0) -> T_bbox:
# Per PDF Reference 3.8.4: "Note: Although rectangles are
# conventionally specified by their lower-left and upperright
# corners, it is acceptable to specify any two diagonally opposite
# corners."
x0, x1 = sorted((box_raw[0], box_raw[2]))
y0, y1 = sorted((box_raw[1], box_raw[3]))
if rotation in [90, 270]:
return (y0, x0, y1, x1)
else:
return (x0, y0, x1, y1)


# PDFs coordinate spaces refer to an origin in the bottom-left of the
# page; pdfplumber flips this vertically, so that the origin is in the
# top-left.
def _invert_box(box_raw: T_bbox, mb_height: T_num) -> T_bbox:
x0, y0, x1, y1 = box_raw
return (x0, mb_height - y1, x1, mb_height - y0)


class Page(Container):
cached_properties: List[str] = Container.cached_properties + ["_layout"]
is_original: bool = True
Expand All @@ -201,35 +222,34 @@ def __init__(
self.root_page = self
self.page_obj = page_obj
self.page_number = page_number
_rotation = resolve_all(self.page_obj.attrs.get("Rotate", 0)) or 0
self.rotation = _rotation % 360
self.page_obj.rotate = self.rotation
self.initial_doctop = initial_doctop

cropbox = page_obj.attrs.get("CropBox")
mediabox = page_obj.attrs.get("MediaBox")
def get_attr(key: str, default: Any = None) -> Any:
ref = page_obj.attrs.get(key)
return default if ref is None else resolve_all(ref)

self.cropbox = resolve_all(cropbox) if cropbox is not None else None
self.mediabox = resolve_all(mediabox) or self.cropbox
m = self.mediabox
# Per PDF Reference Table 3.27: "The number of degrees by which the
# page should be rotated clockwise when displayed or printed. The value
# must be a multiple of 90. Default value: 0"
_rotation = get_attr("Rotate", 0)
self.rotation = _rotation % 360

self.bbox: T_bbox = (
(
min(m[1], m[3]),
min(m[0], m[2]),
max(m[1], m[3]),
max(m[0], m[2]),
)
if self.rotation in [90, 270]
else (
min(m[0], m[2]),
min(m[1], m[3]),
max(m[0], m[2]),
max(m[1], m[3]),
mb_raw = _normalize_box(get_attr("MediaBox"), self.rotation)
mb_height = mb_raw[3] - mb_raw[1]

self.mediabox = _invert_box(mb_raw, mb_height)

if "CropBox" in page_obj.attrs:
self.cropbox = _invert_box(
_normalize_box(get_attr("CropBox"), self.rotation), mb_height
)
)
else:
self.cropbox = self.mediabox

# Page.bbox defaults to self.mediabox, but can be altered by Page.crop(...)
self.bbox = self.mediabox

# https://rednafi.com/python/lru_cache_on_methods/
# See https://rednafi.com/python/lru_cache_on_methods/
self.get_textmap = textmap_cacher(self._get_textmap)

def close(self) -> None:
Expand Down Expand Up @@ -542,6 +562,7 @@ def to_image(
width: Optional[Union[int, float]] = None,
height: Optional[Union[int, float]] = None,
antialias: bool = False,
force_mediabox: bool = False,
) -> "PageImage":
"""
You can pass a maximum of 1 of the following:
Expand All @@ -562,7 +583,10 @@ def to_image(
resolution = 72 * height / self.height

return PageImage(
self, resolution=resolution or DEFAULT_RESOLUTION, antialias=antialias
self,
resolution=resolution or DEFAULT_RESOLUTION,
antialias=antialias,
force_mediabox=force_mediabox,
)

def to_dict(self, object_types: Optional[List[str]] = None) -> Dict[str, Any]:
Expand Down Expand Up @@ -597,6 +621,8 @@ def __init__(self, parent_page: Page):
self.pdf = parent_page.pdf
self.page_obj = parent_page.page_obj
self.page_number = parent_page.page_number
self.mediabox = parent_page.mediabox
self.cropbox = parent_page.cropbox
self.flush_cache(Container.cached_properties)
self.get_textmap = textmap_cacher(self._get_textmap)

Expand Down
Binary file added tests/pdfs/issue-1054-example.pdf
Binary file not shown.
2 changes: 1 addition & 1 deletion tests/test_basics.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def test_rotation(self):
assert rotated.pages[0].width == 612
assert rotated.pages[0].height == 1008

assert rotated.pages[0].cropbox == self.pdf.pages[0].cropbox
assert rotated.pages[0].cropbox != self.pdf.pages[0].cropbox
assert rotated.pages[0].bbox != self.pdf.pages[0].bbox

def test_password(self):
Expand Down
8 changes: 8 additions & 0 deletions tests/test_display.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,14 @@ def test_cropped(self):
im = self.pdf.pages[0].crop((10, 20, 30, 50)).to_image()
assert im.original.size == (20, 30)

def test_cropbox(self):
    """
    Check the rendered image size for the issue-1054 example PDF, both
    by default and with `force_mediabox=True`.
    """
    pdf_path = os.path.join(HERE, "pdfs/issue-1054-example.pdf")
    with pdfplumber.open(pdf_path) as pdf:
        # Default rendering.
        default_image = pdf.pages[0].to_image()
        assert default_image.original.size == (596, 842)
        # Rendering with the mediabox forced yields a larger image.
        mediabox_image = pdf.pages[0].to_image(force_mediabox=True)
        assert mediabox_image.original.size == (2227, 2923)

def test_copy(self):
assert self.im.copy().original == self.im.original

Expand Down

0 comments on commit 07d9997

Please sign in to comment.