OCR-D · bertsky · Sep 26, 2019 · Aug 28, 2019 · Sep 5, 2019 · Sep 6, 2019
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,10 @@ Versioned according to [Semantic Versioning](http://semver.org/).
 
 ## Unreleased
 
+  * Adapt to the feature selection/filtering mechanism for derived images in core
+  * Fixes for image-feature-related corner cases in crop and deskew
+  * Use explicit (second) output fileGrp when producing derived images
+
 ## [0.4.0] - 2019-08-21
 
 Changed:

diff --git a/ocrd_tesserocr/binarize.py b/ocrd_tesserocr/binarize.py
@@ -32,6 +32,13 @@ def __init__(self, *args, **kwargs):
         kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
         kwargs['version'] = OCRD_TOOL['version']
         super(TesserocrBinarize, self).__init__(*args, **kwargs)
+        if hasattr(self, 'output_file_grp'):
+            try:
+                self.page_grp, self.image_grp = self.output_file_grp.split(',')
+            except ValueError:
+                self.page_grp = self.output_file_grp
+                self.image_grp = FALLBACK_IMAGE_GRP
+                LOG.info("No output file group for images specified, falling back to '%s'", FALLBACK_IMAGE_GRP)
 
     def process(self):
         """Performs binarization of the region / line with Tesseract on the workspace.
@@ -41,37 +48,36 @@ def process(self):
 
         Set up Tesseract to recognize the segment image's layout, and get
         the binarized image. Create an image file, and reference it as
-        AlternativeImage in the element and as file with a fileGrp USE
-        equal `OCR-D-IMG-BIN` in the workspace.
+        AlternativeImage in the segment element. Add the new image file
+        to the workspace with the fileGrp USE given in the second position
+        of the output fileGrp (or ``OCR-D-IMG-BIN`` as fallback), and an ID
+        based on input file and input element.
 
         Produce a new output file by serialising the resulting hierarchy.
         """
-        # pylint: disable=attribute-defined-outside-init
-        try:
-            self.page_grp, self.image_grp = self.output_file_grp.split(',')
-        except ValueError:
-            self.page_grp = self.output_file_grp
-            self.image_grp = FALLBACK_IMAGE_GRP
-            LOG.info("No output file group for images specified, falling back to '%s'", FALLBACK_IMAGE_GRP)
         oplevel = self.parameter['operation_level']
+
         with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi:
             for n, input_file in enumerate(self.input_files):
                 file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
                 page_id = input_file.pageId or input_file.ID
                 LOG.info("INPUT FILE %i / %s", n, page_id)
                 pcgts = page_from_file(self.workspace.download_file(input_file))
+                page = pcgts.get_Page()
+
+                # add metadata about this operation and its runtime parameters:
                 metadata = pcgts.get_Metadata() # ensured by from_file()
                 metadata.add_MetadataItem(
                     MetadataItemType(type_="processingStep",
                                      name=self.ocrd_tool['steps'][0],
                                      value=TOOL,
-                                     # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this
-                                     # what we want here is `externalModel="ocrd-tool" externalId="parameters"`
-                                     Labels=[LabelsType(#externalRef="parameters",
-                                                        Label=[LabelType(type_=name,
-                                                                         value=self.parameter[name])
-                                                               for name in self.parameter.keys()])]))
-                page = pcgts.get_Page()
+                                     Labels=[LabelsType(
+                                         externalModel="ocrd-tool",
+                                         externalId="parameters",
+                                         Label=[LabelType(type_=name,
+                                                          value=self.parameter[name])
+                                                for name in self.parameter.keys()])]))
+
                 page_image, page_xywh, _ = self.workspace.image_from_page(
                     page, page_id)
                 LOG.info("Binarizing on '%s' level in page '%s'", oplevel, page_id)
@@ -129,5 +135,6 @@ def _process_segment(self, tessapi, ril, segment, image, xywh, where, page_id, f
                                     page_id=page_id,
                                     file_grp=self.image_grp)
         # update PAGE (reference the image file):
+        features = xywh['features'] + ",binarized"
         segment.add_AlternativeImage(AlternativeImageType(
-            filename=file_path, comments="binarized"))
+            filename=file_path, comments=features))
diff --git a/ocrd_tesserocr/crop.py b/ocrd_tesserocr/crop.py
@@ -4,6 +4,7 @@
 import tesserocr
 from ocrd_utils import (
     getLogger, concat_padded,
+    crop_image,
     bbox_from_points, points_from_bbox, bbox_from_xywh,
     MIMETYPE_PAGE
 )
@@ -15,21 +16,28 @@
     to_xml
 )
 from ocrd_models.ocrd_page_generateds import BorderType
-from ocrd_models import OcrdExif
 from ocrd import Processor
 
 from .config import TESSDATA_PREFIX, OCRD_TOOL
 
 TOOL = 'ocrd-tesserocr-crop'
 LOG = getLogger('processor.TesserocrCrop')
-FILEGRP_IMG = 'OCR-D-IMG-CROP'
+FALLBACK_FILEGRP_IMG = 'OCR-D-IMG-CROP'
 
 class TesserocrCrop(Processor):
 
     def __init__(self, *args, **kwargs):
         kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL]
         kwargs['version'] = OCRD_TOOL['version']
         super(TesserocrCrop, self).__init__(*args, **kwargs)
+        if hasattr(self, 'output_file_grp'):
+            try:
+                self.page_grp, self.image_grp = self.output_file_grp.split(',')
+            except ValueError:
+                self.page_grp = self.output_file_grp
+                self.image_grp = FALLBACK_FILEGRP_IMG
+                LOG.info("No output file group for images specified, falling back to '%s'",
+                         FALLBACK_FILEGRP_IMG)
 
     def process(self):
         """Performs page cropping with Tesseract on the workspace.
@@ -39,6 +47,12 @@ def process(self):
         the largest coordinate extent spanning all of them. Use this
         extent in defining a Border, and add that to the page.
 
+        Moreover, crop the original image accordingly, and reference the
+        resulting image file as AlternativeImage in the Page element.
+        Add the new image file to the workspace with the fileGrp USE given
+        in the second position of the output fileGrp (or ``OCR-D-IMG-CROP``
+        as fallback), and an ID based on input file and input element.
+
         Produce new output files by serialising the resulting hierarchy.
         """
         padding = self.parameter['padding']
@@ -53,27 +67,47 @@ def process(self):
                 page_id = input_file.pageId or input_file.ID
                 LOG.info("INPUT FILE %i / %s", n, page_id)
                 pcgts = page_from_file(self.workspace.download_file(input_file))
+                page = pcgts.get_Page()
+
+                # add metadata about this operation and its runtime parameters:
                 metadata = pcgts.get_Metadata() # ensured by from_file()
                 metadata.add_MetadataItem(
                     MetadataItemType(type_="processingStep",
                                      name=self.ocrd_tool['steps'][0],
                                      value=TOOL,
-                                     # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this
-                                     # what we want here is `externalModel="ocrd-tool" externalId="parameters"`
-                                     Labels=[LabelsType(#externalRef="parameters",
-                                                        Label=[LabelType(type_=name,
-                                                                         value=self.parameter[name])
-                                                               for name in self.parameter.keys()])]))
-                page = pcgts.get_Page()
+                                     Labels=[LabelsType(
+                                         externalModel="ocrd-tool",
+                                         externalId="parameters",
+                                         Label=[LabelType(type_=name,
+                                                          value=self.parameter[name])
+                                                for name in self.parameter.keys()])]))
+
+                # warn of existing Border:
                 border = page.get_Border()
                 if border:
                     left, top, right, bottom = bbox_from_points(border.get_Coords().points)
                     LOG.warning('Overwriting existing Border: %i:%i,%i:%i',
                                 left, top, right, bottom)
+
+                page_image, page_xywh, page_image_info = self.workspace.image_from_page(
+                    page, page_id,
+                    # image must not have been rotated or cropped already,
+                    # abort if no such image can be produced:
+                    feature_filter='deskewed,cropped')
+                if page_image_info.resolution != 1:
+                    dpi = page_image_info.resolution
+                    if page_image_info.resolutionUnit == 'cm':
+                        dpi = round(dpi * 2.54)
+                    tessapi.SetVariable('user_defined_dpi', str(dpi))
+                    zoom = 300 / dpi
+                else:
+                    zoom = 1
+
+                # warn of existing segmentation:
                 regions = page.get_TextRegion()
                 if regions:
-                    min_x = image.width
-                    min_y = image.height
+                    min_x = page_image.width
+                    min_y = page_image.height
                     max_x = 0
                     max_y = 0
                     for region in regions:
@@ -84,17 +118,7 @@ def process(self):
                         max_y = max(max_y, bottom)
                     LOG.warning('Ignoring extent from existing TextRegions: %i:%i,%i:%i',
                                 min_x, max_x, min_y, max_y)
-
-                page_image = self.workspace.resolve_image_as_pil(page.imageFilename)
-                page_image_info = OcrdExif(page_image)
-                if page_image_info.xResolution != 1:
-                    dpi = page_image_info.xResolution
-                    if page_image_info.resolutionUnit == 'cm':
-                        dpi = round(dpi * 2.54)
-                    tessapi.SetVariable('user_defined_dpi', str(dpi))
-                    zoom = 300 / dpi
-                else:
-                    zoom = 1
+
                 LOG.debug("Cropping with tesseract")
                 tessapi.SetImage(page_image)
                 # PSM.SPARSE_TEXT: get as much text as possible in no particular order
@@ -155,31 +179,32 @@ def process(self):
                     # update PAGE (annotate border):
                     page.set_Border(border)
                     # update METS (add the image file):
-                    page_image = page_image.crop(
+                    page_image = crop_image(page_image,
                         box=(min_x, min_y, max_x, max_y))
-                    file_id = input_file.ID.replace(self.input_file_grp, FILEGRP_IMG)
+                    page_xywh['features'] += ',cropped'
+                    file_id = input_file.ID.replace(self.input_file_grp, self.image_grp)
                     if file_id == input_file.ID:
-                        file_id = concat_padded(FILEGRP_IMG, n)
+                        file_id = concat_padded(self.image_grp, n)
                     file_path = self.workspace.save_image_file(page_image,
                                                 file_id,
                                                 page_id=page_id,
-                                                file_grp=FILEGRP_IMG)
+                                                file_grp=self.image_grp)
                     # update PAGE (reference the image file):
                     page.add_AlternativeImage(AlternativeImageType(
-                        filename=file_path, comments="cropped"))
+                        filename=file_path, comments=page_xywh['features']))
                 else:
                     LOG.error("Cannot find valid extent for page '%s'", page_id)
 
                 # Use input_file's basename for the new file -
                 # this way the files retain the same basenames:
-                file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp)
+                file_id = input_file.ID.replace(self.input_file_grp, self.page_grp)
                 if file_id == input_file.ID:
-                    file_id = concat_padded(self.output_file_grp, n)
+                    file_id = concat_padded(self.page_grp, n)
                 self.workspace.add_file(
                     ID=file_id,
-                    file_grp=self.output_file_grp,
+                    file_grp=self.page_grp,
                     pageId=input_file.pageId,
                     mimetype=MIMETYPE_PAGE,
-                    local_filename=os.path.join(self.output_file_grp,
+                    local_filename=os.path.join(self.page_grp,
                                                 file_id + '.xml'),
                     content=to_xml(pcgts))