Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

move to AlternativeImage feature selectors in OCR-D/core#294: #75

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ Versioned according to [Semantic Versioning](http://semver.org/).

## Unreleased

* Adapt to feature selection/filtering mechanism for derived images in core
* Fixes for image-feature-related corner cases in crop and deskew
* Use explicit (second) output fileGrp when producing derived images

## [0.4.0] - 2019-08-21

Changed:
Expand Down
41 changes: 24 additions & 17 deletions ocrd_tesserocr/binarize.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,13 @@ def __init__(self, *args, **kwargs):
kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
kwargs['version'] = OCRD_TOOL['version']
super(TesserocrBinarize, self).__init__(*args, **kwargs)
if hasattr(self, 'output_file_grp'):
try:
self.page_grp, self.image_grp = self.output_file_grp.split(',')
kba marked this conversation as resolved.
Show resolved Hide resolved
except ValueError:
self.page_grp = self.output_file_grp
self.image_grp = FALLBACK_IMAGE_GRP
LOG.info("No output file group for images specified, falling back to '%s'", FALLBACK_IMAGE_GRP)

def process(self):
"""Performs binarization of the region / line with Tesseract on the workspace.
Expand All @@ -41,37 +48,36 @@ def process(self):

Set up Tesseract to recognize the segment image's layout, and get
the binarized image. Create an image file, and reference it as
AlternativeImage in the element and as file with a fileGrp USE
equal `OCR-D-IMG-BIN` in the workspace.
AlternativeImage in the segment element. Add the new image file
to the workspace with the fileGrp USE given in the second position
of the output fileGrp, or ``OCR-D-IMG-BIN``, and an ID based on input
file and input element.

Produce a new output file by serialising the resulting hierarchy.
"""
# pylint: disable=attribute-defined-outside-init
try:
self.page_grp, self.image_grp = self.output_file_grp.split(',')
except ValueError:
self.page_grp = self.output_file_grp
self.image_grp = FALLBACK_IMAGE_GRP
LOG.info("No output file group for images specified, falling back to '%s'", FALLBACK_IMAGE_GRP)
oplevel = self.parameter['operation_level']

with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
for n, input_file in enumerate(self.input_files):
file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
page_id = input_file.pageId or input_file.ID
LOG.info("INPUT FILE %i / %s", n, page_id)
pcgts = page_from_file(self.workspace.download_file(input_file))
page = pcgts.get_Page()

# add metadata about this operation and its runtime parameters:
metadata = pcgts.get_Metadata() # ensured by from_file()
metadata.add_MetadataItem(
MetadataItemType(type_="processingStep",
name=self.ocrd_tool['steps'][0],
value=TOOL,
# FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this
# what we want here is `externalModel="ocrd-tool" externalId="parameters"`
Labels=[LabelsType(#externalRef="parameters",
Label=[LabelType(type_=name,
value=self.parameter[name])
for name in self.parameter.keys()])]))
page = pcgts.get_Page()
Labels=[LabelsType(
externalModel="ocrd-tool",
externalId="parameters",
Label=[LabelType(type_=name,
value=self.parameter[name])
for name in self.parameter.keys()])]))

page_image, page_xywh, _ = self.workspace.image_from_page(
page, page_id)
LOG.info("Binarizing on '%s' level in page '%s'", oplevel, page_id)
Expand Down Expand Up @@ -129,5 +135,6 @@ def _process_segment(self, tessapi, ril, segment, image, xywh, where, page_id, f
page_id=page_id,
file_grp=self.image_grp)
# update PAGE (reference the image file):
features = xywh['features'] + ",binarized"
segment.add_AlternativeImage(AlternativeImageType(
filename=file_path, comments="binarized"))
filename=file_path, comments=features))
87 changes: 56 additions & 31 deletions ocrd_tesserocr/crop.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import tesserocr
from ocrd_utils import (
getLogger, concat_padded,
crop_image,
bbox_from_points, points_from_bbox, bbox_from_xywh,
MIMETYPE_PAGE
)
Expand All @@ -15,21 +16,28 @@
to_xml
)
from ocrd_models.ocrd_page_generateds import BorderType
from ocrd_models import OcrdExif
from ocrd import Processor

from .config import TESSDATA_PREFIX, OCRD_TOOL

TOOL = 'ocrd-tesserocr-crop'
LOG = getLogger('processor.TesserocrCrop')
FILEGRP_IMG = 'OCR-D-IMG-CROP'
FALLBACK_FILEGRP_IMG = 'OCR-D-IMG-CROP'

class TesserocrCrop(Processor):

def __init__(self, *args, **kwargs):
    """Initialise the cropping processor and resolve its output fileGrps.

    Injects this tool's ``ocrd_tool`` description and ``version`` (both
    taken from ``OCRD_TOOL``) into ``kwargs`` and delegates to the base
    class constructor.

    Afterwards, if ``output_file_grp`` is set on the instance, it is
    interpreted as ``<page_grp>,<image_grp>``:

    - exactly two comma-separated entries: the first becomes
      ``self.page_grp``, the second ``self.image_grp``;
    - anything else (unpacking raises ``ValueError``): the unsplit
      ``output_file_grp`` becomes ``self.page_grp`` and
      ``FALLBACK_FILEGRP_IMG`` becomes ``self.image_grp``; this is
      logged at INFO level.
    """
    kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
    kwargs['version'] = OCRD_TOOL['version']
    super(TesserocrCrop, self).__init__(*args, **kwargs)
    # NOTE(review): presumably the base constructor can return without
    # setting output_file_grp (e.g. for help/version style invocations),
    # hence the guard -- confirm against ocrd.Processor.
    if hasattr(self, 'output_file_grp'):
        try:
            # Tuple unpacking only succeeds for exactly two entries.
            self.page_grp, self.image_grp = self.output_file_grp.split(',')
        except ValueError:
            self.page_grp = self.output_file_grp
            self.image_grp = FALLBACK_FILEGRP_IMG
            LOG.info("No output file group for images specified, falling back to '%s'",
                     FALLBACK_FILEGRP_IMG)

def process(self):
"""Performs page cropping with Tesseract on the workspace.
Expand All @@ -39,6 +47,12 @@ def process(self):
the largest coordinate extent spanning all of them. Use this
extent in defining a Border, and add that to the page.

Moreover, crop the original image accordingly, and reference the
resulting image file as AlternativeImage in the Page element.
Add the new image file to the workspace with the fileGrp USE given
in the second position of the output fileGrp, or ``OCR-D-IMG-CROP``,
and an ID based on input file and input element.

Produce new output files by serialising the resulting hierarchy.
"""
padding = self.parameter['padding']
Expand All @@ -53,27 +67,47 @@ def process(self):
page_id = input_file.pageId or input_file.ID
LOG.info("INPUT FILE %i / %s", n, page_id)
pcgts = page_from_file(self.workspace.download_file(input_file))
page = pcgts.get_Page()

# add metadata about this operation and its runtime parameters:
metadata = pcgts.get_Metadata() # ensured by from_file()
metadata.add_MetadataItem(
MetadataItemType(type_="processingStep",
name=self.ocrd_tool['steps'][0],
value=TOOL,
# FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this
# what we want here is `externalModel="ocrd-tool" externalId="parameters"`
Labels=[LabelsType(#externalRef="parameters",
Label=[LabelType(type_=name,
value=self.parameter[name])
for name in self.parameter.keys()])]))
page = pcgts.get_Page()
Labels=[LabelsType(
externalModel="ocrd-tool",
externalId="parameters",
Label=[LabelType(type_=name,
value=self.parameter[name])
for name in self.parameter.keys()])]))

# warn of existing Border:
bertsky marked this conversation as resolved.
Show resolved Hide resolved
border = page.get_Border()
if border:
left, top, right, bottom = bbox_from_points(border.get_Coords().points)
LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
left, top, right, bottom)

page_image, page_xywh, page_image_info = self.workspace.image_from_page(
page, page_id,
# image must not have been rotated or cropped already,
# abort if no such image can be produced:
feature_filter='deskewed,cropped')
if page_image_info.resolution != 1:
dpi = page_image_info.resolution
if page_image_info.resolutionUnit == 'cm':
dpi = round(dpi * 2.54)
tessapi.SetVariable('user_defined_dpi', str(dpi))
zoom = 300 / dpi
else:
zoom = 1

# warn of existing segmentation:
regions = page.get_TextRegion()
if regions:
min_x = image.width
min_y = image.height
min_x = page_image.width
min_y = page_image.height
max_x = 0
max_y = 0
for region in regions:
Expand All @@ -84,17 +118,7 @@ def process(self):
max_y = max(max_y, bottom)
LOG.warning('Ignoring extent from existing TextRegions: %i:%i,%i:%i',
min_x, max_x, min_y, max_y)

page_image = self.workspace.resolve_image_as_pil(page.imageFilename)
page_image_info = OcrdExif(page_image)
if page_image_info.xResolution != 1:
dpi = page_image_info.xResolution
if page_image_info.resolutionUnit == 'cm':
dpi = round(dpi * 2.54)
tessapi.SetVariable('user_defined_dpi', str(dpi))
zoom = 300 / dpi
else:
zoom = 1

LOG.debug("Cropping with tesseract")
tessapi.SetImage(page_image)
# PSM.SPARSE_TEXT: get as much text as possible in no particular order
Expand Down Expand Up @@ -155,31 +179,32 @@ def process(self):
# update PAGE (annotate border):
page.set_Border(border)
# update METS (add the image file):
page_image = page_image.crop(
page_image = crop_image(page_image,
box=(min_x, min_y, max_x, max_y))
file_id = input_file.ID.replace(self.input_file_grp, FILEGRP_IMG)
page_xywh['features'] += ',cropped'
file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
if file_id == input_file.ID:
file_id = concat_padded(FILEGRP_IMG, n)
file_id = concat_padded(self.image_grp, n)
file_path = self.workspace.save_image_file(page_image,
file_id,
page_id=page_id,
file_grp=FILEGRP_IMG)
file_grp=self.image_grp)
# update PAGE (reference the image file):
page.add_AlternativeImage(AlternativeImageType(
filename=file_path, comments="cropped"))
filename=file_path, comments=page_xywh['features']))
else:
LOG.error("Cannot find valid extent for page '%s'", page_id)

# Use input_file's basename for the new file -
# this way the files retain the same basenames:
file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
file_id = input_file.ID.replace(self.input_file_grp, self.page_grp)
if file_id == input_file.ID:
file_id = concat_padded(self.output_file_grp, n)
file_id = concat_padded(self.page_grp, n)
self.workspace.add_file(
ID=file_id,
file_grp=self.output_file_grp,
file_grp=self.page_grp,
pageId=input_file.pageId,
mimetype=MIMETYPE_PAGE,
local_filename=os.path.join(self.output_file_grp,
local_filename=os.path.join(self.page_grp,
file_id + '.xml'),
content=to_xml(pcgts))
Loading