diff --git a/CHANGELOG.md b/CHANGELOG.md index b3678d2..3e301f1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,10 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased + * Adapt to feature selection/filtering mechanism for derived images in core + * Fixes for image-feature-related corner cases in crop and deskew + * Use explicit (second) output fileGrp when producing derived images + ## [0.4.0] - 2019-08-21 Changed: diff --git a/ocrd_tesserocr/binarize.py b/ocrd_tesserocr/binarize.py index 685554d..acf6e1a 100644 --- a/ocrd_tesserocr/binarize.py +++ b/ocrd_tesserocr/binarize.py @@ -32,6 +32,13 @@ def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] kwargs['version'] = OCRD_TOOL['version'] super(TesserocrBinarize, self).__init__(*args, **kwargs) + if hasattr(self, 'output_file_grp'): + try: + self.page_grp, self.image_grp = self.output_file_grp.split(',') + except ValueError: + self.page_grp = self.output_file_grp + self.image_grp = FALLBACK_IMAGE_GRP + LOG.info("No output file group for images specified, falling back to '%s'", FALLBACK_IMAGE_GRP) def process(self): """Performs binarization of the region / line with Tesseract on the workspace. @@ -41,37 +48,36 @@ def process(self): Set up Tesseract to recognize the segment image's layout, and get the binarized image. Create an image file, and reference it as - AlternativeImage in the element and as file with a fileGrp USE - equal `OCR-D-IMG-BIN` in the workspace. + AlternativeImage in the segment element. Add the new image file + to the workspace with the fileGrp USE given in the second position + of the output fileGrp, or ``OCR-D-IMG-BIN``, and an ID based on input + file and input element. Produce a new output file by serialising the resulting hierarchy. 
""" - # pylint: disable=attribute-defined-outside-init - try: - self.page_grp, self.image_grp = self.output_file_grp.split(',') - except ValueError: - self.page_grp = self.output_file_grp - self.image_grp = FALLBACK_IMAGE_GRP - LOG.info("No output file group for images specified, falling back to '%s'", FALLBACK_IMAGE_GRP) oplevel = self.parameter['operation_level'] + with PyTessBaseAPI(path=TESSDATA_PREFIX) as tessapi: for n, input_file in enumerate(self.input_files): file_id = input_file.ID.replace(self.input_file_grp, self.image_grp) page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) + page = pcgts.get_Page() + + # add metadata about this operation and its runtime parameters: metadata = pcgts.get_Metadata() # ensured by from_file() metadata.add_MetadataItem( MetadataItemType(type_="processingStep", name=self.ocrd_tool['steps'][0], value=TOOL, - # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this - # what we want here is `externalModel="ocrd-tool" externalId="parameters"` - Labels=[LabelsType(#externalRef="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) - page = pcgts.get_Page() + Labels=[LabelsType( + externalModel="ocrd-tool", + externalId="parameters", + Label=[LabelType(type_=name, + value=self.parameter[name]) + for name in self.parameter.keys()])])) + page_image, page_xywh, _ = self.workspace.image_from_page( page, page_id) LOG.info("Binarizing on '%s' level in page '%s'", oplevel, page_id) @@ -129,5 +135,6 @@ def _process_segment(self, tessapi, ril, segment, image, xywh, where, page_id, f page_id=page_id, file_grp=self.image_grp) # update PAGE (reference the image file): + features = xywh['features'] + ",binarized" segment.add_AlternativeImage(AlternativeImageType( - filename=file_path, comments="binarized")) + filename=file_path, comments=features)) diff --git 
a/ocrd_tesserocr/crop.py b/ocrd_tesserocr/crop.py index d17e275..6112002 100644 --- a/ocrd_tesserocr/crop.py +++ b/ocrd_tesserocr/crop.py @@ -4,6 +4,7 @@ import tesserocr from ocrd_utils import ( getLogger, concat_padded, + crop_image, bbox_from_points, points_from_bbox, bbox_from_xywh, MIMETYPE_PAGE ) @@ -15,14 +16,13 @@ to_xml ) from ocrd_models.ocrd_page_generateds import BorderType -from ocrd_models import OcrdExif from ocrd import Processor from .config import TESSDATA_PREFIX, OCRD_TOOL TOOL = 'ocrd-tesserocr-crop' LOG = getLogger('processor.TesserocrCrop') -FILEGRP_IMG = 'OCR-D-IMG-CROP' +FALLBACK_FILEGRP_IMG = 'OCR-D-IMG-CROP' class TesserocrCrop(Processor): @@ -30,6 +30,14 @@ def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] kwargs['version'] = OCRD_TOOL['version'] super(TesserocrCrop, self).__init__(*args, **kwargs) + if hasattr(self, 'output_file_grp'): + try: + self.page_grp, self.image_grp = self.output_file_grp.split(',') + except ValueError: + self.page_grp = self.output_file_grp + self.image_grp = FALLBACK_FILEGRP_IMG + LOG.info("No output file group for images specified, falling back to '%s'", + FALLBACK_FILEGRP_IMG) def process(self): """Performs page cropping with Tesseract on the workspace. @@ -39,6 +47,12 @@ def process(self): the largest coordinate extent spanning all of them. Use this extent in defining a Border, and add that to the page. + Moreover, crop the original image accordingly, and reference the + resulting image file as AlternativeImage in the Page element. + Add the new image file to the workspace with the fileGrp USE given + in the second position of the output fileGrp, or ``OCR-D-IMG-CROP``, + and an ID based on input file and input element. + Produce new output files by serialising the resulting hierarchy. 
""" padding = self.parameter['padding'] @@ -53,27 +67,47 @@ def process(self): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) + page = pcgts.get_Page() + + # add metadata about this operation and its runtime parameters: metadata = pcgts.get_Metadata() # ensured by from_file() metadata.add_MetadataItem( MetadataItemType(type_="processingStep", name=self.ocrd_tool['steps'][0], value=TOOL, - # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this - # what we want here is `externalModel="ocrd-tool" externalId="parameters"` - Labels=[LabelsType(#externalRef="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) - page = pcgts.get_Page() + Labels=[LabelsType( + externalModel="ocrd-tool", + externalId="parameters", + Label=[LabelType(type_=name, + value=self.parameter[name]) + for name in self.parameter.keys()])])) + + # warn of existing Border: border = page.get_Border() if border: left, top, right, bottom = bbox_from_points(border.get_Coords().points) LOG.warning('Overwriting existing Border: %i:%i,%i:%i', left, top, right, bottom) + + page_image, page_xywh, page_image_info = self.workspace.image_from_page( + page, page_id, + # image must not have been rotated or cropped already, + # abort if no such image can be produced: + feature_filter='deskewed,cropped') + if page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi = round(dpi * 2.54) + tessapi.SetVariable('user_defined_dpi', str(dpi)) + zoom = 300 / dpi + else: + zoom = 1 + + # warn of existing segmentation: regions = page.get_TextRegion() if regions: - min_x = image.width - min_y = image.height + min_x = page_image.width + min_y = page_image.height max_x = 0 max_y = 0 for region in regions: @@ -84,17 +118,7 @@ def process(self): max_y = max(max_y, bottom) 
LOG.warning('Ignoring extent from existing TextRegions: %i:%i,%i:%i', min_x, max_x, min_y, max_y) - - page_image = self.workspace.resolve_image_as_pil(page.imageFilename) - page_image_info = OcrdExif(page_image) - if page_image_info.xResolution != 1: - dpi = page_image_info.xResolution - if page_image_info.resolutionUnit == 'cm': - dpi = round(dpi * 2.54) - tessapi.SetVariable('user_defined_dpi', str(dpi)) - zoom = 300 / dpi - else: - zoom = 1 + LOG.debug("Cropping with tesseract") tessapi.SetImage(page_image) # PSM.SPARSE_TEXT: get as much text as possible in no particular order @@ -155,31 +179,32 @@ def process(self): # update PAGE (annotate border): page.set_Border(border) # update METS (add the image file): - page_image = page_image.crop( + page_image = crop_image(page_image, box=(min_x, min_y, max_x, max_y)) - file_id = input_file.ID.replace(self.input_file_grp, FILEGRP_IMG) + page_xywh['features'] += ',cropped' + file_id = input_file.ID.replace(self.input_file_grp, self.image_grp) if file_id == input_file.ID: - file_id = concat_padded(FILEGRP_IMG, n) + file_id = concat_padded(self.image_grp, n) file_path = self.workspace.save_image_file(page_image, file_id, page_id=page_id, - file_grp=FILEGRP_IMG) + file_grp=self.image_grp) # update PAGE (reference the image file): page.add_AlternativeImage(AlternativeImageType( - filename=file_path, comments="cropped")) + filename=file_path, comments=page_xywh['features'])) else: LOG.error("Cannot find valid extent for page '%s'", page_id) # Use input_file's basename for the new file - # this way the files retain the same basenames: - file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) + file_id = input_file.ID.replace(self.input_file_grp, self.page_grp) if file_id == input_file.ID: - file_id = concat_padded(self.output_file_grp, n) + file_id = concat_padded(self.page_grp, n) self.workspace.add_file( ID=file_id, - file_grp=self.output_file_grp, + file_grp=self.page_grp, pageId=input_file.pageId, 
mimetype=MIMETYPE_PAGE, - local_filename=os.path.join(self.output_file_grp, + local_filename=os.path.join(self.page_grp, file_id + '.xml'), content=to_xml(pcgts)) diff --git a/ocrd_tesserocr/deskew.py b/ocrd_tesserocr/deskew.py index e454c1a..91d51ad 100644 --- a/ocrd_tesserocr/deskew.py +++ b/ocrd_tesserocr/deskew.py @@ -29,7 +29,7 @@ TOOL = 'ocrd-tesserocr-deskew' LOG = getLogger('processor.TesserocrDeskew') -FILEGRP_IMG = 'OCR-D-IMG-DESKEW' +FALLBACK_FILEGRP_IMG = 'OCR-D-IMG-DESKEW' class TesserocrDeskew(Processor): @@ -37,6 +37,14 @@ def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] kwargs['version'] = OCRD_TOOL['version'] super(TesserocrDeskew, self).__init__(*args, **kwargs) + if hasattr(self, 'output_file_grp'): + try: + self.page_grp, self.image_grp = self.output_file_grp.split(',') + except ValueError: + self.page_grp = self.output_file_grp + self.image_grp = FALLBACK_FILEGRP_IMG + LOG.info("No output file group for images specified, falling back to '%s'", + FALLBACK_FILEGRP_IMG) def process(self): """Performs deskewing of the page / region with Tesseract on the workspace. @@ -48,14 +56,16 @@ def process(self): Set up Tesseract to recognise the region image's orientation, skew and script (with both OSD and AnalyseLayout). Rotate the image accordingly, and annotate the angle, readingDirection and textlineOrder. - + Create a corresponding image file, and reference it as AlternativeImage - in the region element and as file with a fileGrp USE `OCR-D-IMG-DESKEW` - in the workspace. - + in the element. Add the new image file to the workspace with the fileGrp USE + given in the second position of the output fileGrp, or ``OCR-D-IMG-DESKEW``, + and an ID based on input file and input element. + Produce a new output file by serialising the resulting hierarchy. """ oplevel = self.parameter['operation_level'] + with PyTessBaseAPI( path=TESSDATA_PREFIX, lang="osd", # osd required for legacy init! 
@@ -63,66 +73,96 @@ def process(self): psm=PSM.AUTO_OSD ) as tessapi: for n, input_file in enumerate(self.input_files): - file_id = input_file.ID.replace(self.input_file_grp, FILEGRP_IMG) + file_id = input_file.ID.replace(self.input_file_grp, self.image_grp) page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) + page = pcgts.get_Page() + + # add metadata about this operation and its runtime parameters: metadata = pcgts.get_Metadata() # ensured by from_file() metadata.add_MetadataItem( MetadataItemType(type_="processingStep", name=self.ocrd_tool['steps'][0], value=TOOL, - # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this - # what we want here is `externalModel="ocrd-tool" externalId="parameters"` - Labels=[LabelsType(#externalRef="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) - page = pcgts.get_Page() + Labels=[LabelsType( + externalModel="ocrd-tool", + externalId="parameters", + Label=[LabelType(type_=name, + value=self.parameter[name]) + for name in self.parameter.keys()])])) + page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id) - if page_image_info.xResolution != 1: - dpi = page_image_info.xResolution + page, page_id, + # image must not have been rotated already, + # (we will overwrite @orientation anyway,) + # (This is true even if oplevel is region + # and page-level deskewing has been applied, + # because we still need to rule out rotated + # images on the region level, so better + # rotate the page level ourselves!) 
+ # abort if no such image can be produced: + feature_filter='deskewed') + if page_image_info.resolution != 1: + dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi = round(dpi * 2.54) tessapi.SetVariable('user_defined_dpi', str(dpi)) LOG.info("Deskewing on '%s' level in page '%s'", oplevel, page_id) - + if oplevel == 'page': self._process_segment(tessapi, page, page_image, page_xywh, "page '%s'" % page_id, input_file.pageId, file_id) else: + if page_xywh['angle']: + LOG.info("About to rotate page '%s' by %.2f°", + page_id, page_xywh['angle']) + page_image = page_image.rotate(page_xywh['angle'], + expand=True, + #resample=Image.BILINEAR, + fillcolor='white') + # pretend to image_from_segment that this has *not* + # been rotated yet (so we can rule out images rotated + # on the region level): + #page_xywh['features'] += ',deskewed' + page_xywh['x'] -= round(0.5 * max(0, page_image.width - page_xywh['w'])) + page_xywh['y'] -= round(0.5 * max(0, page_image.height - page_xywh['h'])) + regions = page.get_TextRegion() + page.get_TableRegion() if not regions: LOG.warning("Page '%s' contains no text regions", page_id) for region in regions: region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh) + region, page_image, page_xywh, + # image must not have been rotated already, + # (we will overwrite @orientation anyway,) + # abort if no such image can be produced: + feature_filter='deskewed') self._process_segment(tessapi, region, region_image, region_xywh, "region '%s'" % region.id, input_file.pageId, file_id + '_' + region.id) - + + if page_xywh['angle']: + # no pretense! 
(regardless of region results) + page_xywh['features'] += ',deskewed' + # Use input_file's basename for the new file - # this way the files retain the same basenames: - file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) + file_id = input_file.ID.replace(self.input_file_grp, self.page_grp) if file_id == input_file.ID: - file_id = concat_padded(self.output_file_grp, n) + file_id = concat_padded(self.page_grp, n) self.workspace.add_file( ID=file_id, - file_grp=self.output_file_grp, + file_grp=self.page_grp, pageId=input_file.pageId, mimetype=MIMETYPE_PAGE, - local_filename=os.path.join(self.output_file_grp, + local_filename=os.path.join(self.page_grp, file_id + '.xml'), content=to_xml(pcgts)) - + def _process_segment(self, tessapi, segment, image, xywh, where, page_id, file_id): - if (isinstance(segment, PageType) and - not xywh['x'] and not xywh['y']): - comments = '' - else: - comments = 'cropped' + features = xywh['features'] angle = 0. tessapi.SetImage(image) #tessapi.SetPageSegMode(PSM.AUTO_OSD) @@ -141,7 +181,7 @@ def _process_segment(self, tessapi, segment, image, xywh, where, page_id, file_i osr['orient_deg'], osr['orient_conf'], where) angle = osr['orient_deg'] if angle: - comments += ',rotated-%d' % angle + features += ',rotated-%d' % angle assert not math.isnan(osr['script_conf']), \ "script detection failed (Tesseract probably compiled without legacy OEM, or osd model not installed)" if osr['script_conf'] < 10: @@ -203,8 +243,8 @@ def _process_segment(self, tessapi, segment, image, xywh, where, page_id, file_i if layout: orientation, writing_direction, textline_order, deskew_angle = layout.Orientation() deskew_angle *= - 180 / math.pi - if int(deskew_angle): - comments += ',deskewed' + if deskew_angle: + features += ',deskewed' LOG.info('orientation/deskewing for %s: %s / %s / %s / %.3f°', where, membername(Orientation, orientation), membername(WritingDirection, writing_direction), @@ -237,17 +277,16 @@ def _process_segment(self, 
tessapi, segment, image, xywh, where, page_id, file_i # 270: Image.ROTATE_270 # }.get(angle)) # no default angle += deskew_angle - if angle: - # Tesseract layout analysis already rotates the image, even for each - # sub-segment (depending on RIL), but the accuracy is not as good - # as setting the image to the sub-segments and running without iterator. - # (These images can be queried via GetBinaryImage/GetImage, cf. segment_region) - # Unfortunately, it does _not_ use expand=True, but chops off corners. - # So we must do it here from the original image ourself: - LOG.debug('About to rotate %s by %.2f° clockwise', where, angle) - image = image.rotate(-angle, expand=True, fillcolor='white') - angle = 180 - (180 - angle) % 360 # map to [-179.999,180] - segment.set_orientation(angle) + # Tesseract layout analysis already rotates the image, even for each + # sub-segment (depending on RIL), but the accuracy is not as good + # as setting the image to the sub-segments and running without iterator. + # (These images can be queried via GetBinaryImage/GetImage, cf. segment_region) + # Unfortunately, it does _not_ use expand=True, but chops off corners. 
+ # So we must do it here from the original image ourself: + LOG.debug('About to rotate %s by %.2f° clockwise', where, angle) + image = image.rotate(-angle, expand=True, fillcolor='white') + angle = 180 - (180 - angle) % 360 # map to [-179.999,180] + segment.set_orientation(angle) if isinstance(segment, (TextRegionType, PageType)): segment.set_readingDirection({ WritingDirection.LEFT_TO_RIGHT: 'left-to-right', @@ -267,7 +306,7 @@ def _process_segment(self, tessapi, segment, image, xywh, where, page_id, file_i file_path = self.workspace.save_image_file(image, file_id, page_id=page_id, - file_grp=FILEGRP_IMG) + file_grp=self.image_grp) # update PAGE (reference the image file): segment.add_AlternativeImage(AlternativeImageType( - filename=file_path, comments=comments)) + filename=file_path, comments=features)) diff --git a/ocrd_tesserocr/recognize.py b/ocrd_tesserocr/recognize.py index 309ca2e..28e3a4f 100644 --- a/ocrd_tesserocr/recognize.py +++ b/ocrd_tesserocr/recognize.py @@ -43,13 +43,13 @@ def process(self): Open and deserialise PAGE input files and their respective images, then iterate over the element hierarchy down to the requested - `textequiv_level`. If `overwrite_words` is enabled and any layout + ``textequiv_level``. If ``overwrite_words`` is enabled and any layout annotation below the line level already exists, then remove it - (regardless of `textequiv_level`). + (regardless of ``textequiv_level``). Set up Tesseract to recognise each segment's image rectangle with - the appropriate mode and `model`. Create new elements below the line + the appropriate mode and ``model``. Create new elements below the line level if necessary. Put text results and confidence values into new - TextEquiv at `textequiv_level`, and make the higher levels consistent + TextEquiv at ``textequiv_level``, and make the higher levels consistent with that (by concatenation joined by whitespace). Produce new output files by serialising the resulting hierarchy. 
@@ -62,6 +62,7 @@ def process(self): for sub_model in model.split('+'): if sub_model not in get_languages()[1]: raise Exception("configured model " + sub_model + " is not installed") + with PyTessBaseAPI(path=TESSDATA_PREFIX, lang=model) as tessapi: LOG.info("Using model '%s' in %s for recognition at the %s level", model, get_languages()[0], maxlevel) @@ -116,26 +117,29 @@ def process(self): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) + page = pcgts.get_Page() + + # add metadata about this operation and its runtime parameters: metadata = pcgts.get_Metadata() # ensured by from_file() metadata.add_MetadataItem( MetadataItemType(type_="processingStep", name=self.ocrd_tool['steps'][0], value=TOOL, - # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this - # what we want here is `externalModel="ocrd-tool" externalId="parameters"` - Labels=[LabelsType(#externalRef="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) - page = pcgts.get_Page() + Labels=[LabelsType( + externalModel="ocrd-tool", + externalId="parameters", + Label=[LabelType(type_=name, + value=self.parameter[name]) + for name in self.parameter.keys()])])) page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id) - if page_image_info.xResolution != 1: - dpi = page_image_info.xResolution + if page_image_info.resolution != 1: + dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi = round(dpi * 2.54) tessapi.SetVariable('user_defined_dpi', str(dpi)) #tessapi.SetImage(page_image) + LOG.info("Processing page '%s'", page_id) regions = page.get_TextRegion() if not regions: @@ -354,7 +358,7 @@ def _process_glyphs_in_word(self, result_it, word, word_xywh): result_it.Next(RIL.SYMBOL) def page_update_higher_textequiv_levels(level, pcgts): - '''Update the TextEquivs of 
all PAGE-XML hierarchy levels above `level` for consistency. + '''Update the TextEquivs of all PAGE-XML hierarchy levels above ``level`` for consistency. Starting with the hierarchy level chosen for processing, join all first TextEquiv (by the rules governing the respective level) diff --git a/ocrd_tesserocr/segment_line.py b/ocrd_tesserocr/segment_line.py index d692c30..840864a 100644 --- a/ocrd_tesserocr/segment_line.py +++ b/ocrd_tesserocr/segment_line.py @@ -36,7 +36,7 @@ def process(self): Open and deserialize PAGE input files and their respective images, then iterate over the element hierarchy down to the region level, - and remove any existing TextLine elements (unless `overwrite_lines` + and remove any existing TextLine elements (unless ``overwrite_lines`` is False). Set up Tesseract to detect lines, and add each one to the region @@ -54,22 +54,25 @@ def process(self): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) + page = pcgts.get_Page() + + # add metadata about this operation and its runtime parameters: metadata = pcgts.get_Metadata() # ensured by from_file() metadata.add_MetadataItem( MetadataItemType(type_="processingStep", name=self.ocrd_tool['steps'][0], value=TOOL, - # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this - # what we want here is `externalModel="ocrd-tool" externalId="parameters"` - Labels=[LabelsType(#externalRef="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) - page = pcgts.get_Page() + Labels=[LabelsType( + externalModel="ocrd-tool", + externalId="parameters", + Label=[LabelType(type_=name, + value=self.parameter[name]) + for name in self.parameter.keys()])])) + page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id) - if page_image_info.xResolution != 1: - dpi = page_image_info.xResolution + if 
page_image_info.resolution != 1: + dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi = round(dpi * 2.54) tessapi.SetVariable('user_defined_dpi', str(dpi)) diff --git a/ocrd_tesserocr/segment_region.py b/ocrd_tesserocr/segment_region.py index c9996f5..5a82f8b 100644 --- a/ocrd_tesserocr/segment_region.py +++ b/ocrd_tesserocr/segment_region.py @@ -37,11 +37,7 @@ TOOL = 'ocrd-tesserocr-segment-region' LOG = getLogger('processor.TesserocrSegmentRegion') -FILEGRP_IMG = 'OCR-D-IMG-CROP' - -# (will be passed as padding to both BoundingBox and GetImage) -# (actually, Tesseract honours padding only on the left and bottom, -# whereas right and top are increased less) +FALLBACK_FILEGRP_IMG = 'OCR-D-IMG-CROP' class TesserocrSegmentRegion(Processor): @@ -49,23 +45,34 @@ def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = OCRD_TOOL['tools'][TOOL] kwargs['version'] = OCRD_TOOL['version'] super(TesserocrSegmentRegion, self).__init__(*args, **kwargs) + if hasattr(self, 'output_file_grp'): + try: + self.page_grp, self.image_grp = self.output_file_grp.split(',') + except ValueError: + self.page_grp = self.output_file_grp + self.image_grp = FALLBACK_FILEGRP_IMG + LOG.info("No output file group for images specified, falling back to '%s'", + FALLBACK_FILEGRP_IMG) def process(self): """Performs (text) region segmentation with Tesseract on the workspace. Open and deserialize PAGE input files and their respective images, and remove any existing Region and ReadingOrder elements - (unless `overwrite_regions` is False). + (unless ``overwrite_regions`` is False). Set up Tesseract to detect blocks, and add each one to the page as a region according to BlockType at the detected coordinates. - If `find_tables` is True, try to detect table blocks and add them + If ``find_tables`` is True, try to detect table blocks and add them as (atomic) TableRegion. 
- If `crop_polygons` is True, create a cropped (and possibly deskewed) - raw image file for each region (masked along its polygon outline), - and reference it as AlternativeImage in the region element and - as file with a fileGrp USE equal `OCR-D-IMG-CROP` in the workspace. + If ``crop_polygons`` is True, create a cropped (and possibly deskewed) + image (without extra binarization) for each region (which gets + clipped to white outside its polygon outline), and reference the + resulting image file as AlternativeImage in the region element. + Add the new image to the workspace with the fileGrp USE given + in the second position of the output fileGrp, or ``OCR-D-IMG-CROP``, + and an ID based on input file and input element. Produce a new output file by serialising the resulting hierarchy. """ @@ -83,22 +90,26 @@ def process(self): # analysed as independent text/line blocks: tessapi.SetVariable("textord_tabfind_find_tables", "0") for (n, input_file) in enumerate(self.input_files): - file_id = input_file.ID.replace(self.input_file_grp, FILEGRP_IMG) + file_id = input_file.ID.replace(self.input_file_grp, self.image_grp) page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) + page = pcgts.get_Page() + + # add metadata about this operation and its runtime parameters: metadata = pcgts.get_Metadata() # ensured by from_file() metadata.add_MetadataItem( MetadataItemType(type_="processingStep", name=self.ocrd_tool['steps'][0], value=TOOL, - # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this - # what we want here is `externalModel="ocrd-tool" externalId="parameters"` - Labels=[LabelsType(#externalRef="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) - page = pcgts.get_Page() + Labels=[LabelsType( + externalModel="ocrd-tool", + externalId="parameters", + Label=[LabelType(type_=name, +
value=self.parameter[name]) + for name in self.parameter.keys()])])) + + # delete or warn of existing regions: if page.get_TextRegion(): if overwrite_regions: LOG.info('removing existing TextRegions') @@ -122,16 +133,18 @@ def process(self): if overwrite_regions: LOG.info('overwriting existing ReadingOrder') # (cannot sustain old regionrefs) - page.set_ReadingOrder([]) + page.set_ReadingOrder(None) else: LOG.warning('keeping existing ReadingOrder') + page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id) - if page_image_info.xResolution != 1: - dpi = page_image_info.xResolution + if page_image_info.resolution != 1: + dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi = round(dpi * 2.54) tessapi.SetVariable('user_defined_dpi', str(dpi)) + LOG.info("Detecting regions in page '%s'", page_id) tessapi.SetImage(page_image) # is already cropped to Border tessapi.SetPageSegMode(PSM.AUTO) # (default) @@ -142,15 +155,15 @@ def process(self): # Use input_file's basename for the new file - # this way the files retain the same basenames: - file_id = input_file.ID.replace(self.input_file_grp, self.output_file_grp) + file_id = input_file.ID.replace(self.input_file_grp, self.page_grp) if file_id == input_file.ID: - file_id = concat_padded(self.output_file_grp, n) + file_id = concat_padded(self.page_grp, n) self.workspace.add_file( ID=file_id, - file_grp=self.output_file_grp, + file_grp=self.page_grp, pageId=input_file.pageId, mimetype=MIMETYPE_PAGE, - local_filename=os.path.join(self.output_file_grp, + local_filename=os.path.join(self.page_grp, file_id + '.xml'), content=to_xml(pcgts)) @@ -161,17 +174,20 @@ def _process_page(self, it, page, page_image, page_xywh, page_id, file_id): # and its BlockPolygon() index = 0 while it and not it.Empty(RIL.BLOCK): + # (padding will be passed to both BoundingBox and GetImage) + # (actually, Tesseract honours padding only on the left and bottom, + # whereas right and top are 
increased less!) bbox = it.BoundingBox(RIL.BLOCK, padding=self.parameter['padding']) points = points_from_x0y0x1y1(bbox) - # add offset from any Border: + # add offset from Border, if any: xywh = xywh_from_points(points) xywh['x'] += page_xywh['x'] xywh['y'] += page_xywh['y'] points = points_from_xywh(xywh) # sometimes these polygons are not planar, which causes # PIL.ImageDraw.Draw.polygon (and likely others as well) - # to misbehave; unfortunately, we do not have coordinate - # semantics in PAGE (left/right inner/outer, multi-path etc) + # to misbehave; however, PAGE coordinate semantics prohibit + # multi-path polygons! # (probably a bug in Tesseract itself): polygon = it.BlockPolygon() if self.parameter['crop_polygons'] and polygon and list(polygon): @@ -216,7 +232,8 @@ def _process_page(self, it, page, page_image, page_xywh, page_id, file_id): # it is a bad idea to create a TextRegion # for it (better set `find_tables` False): # PT.TABLE, - # will always yield a 90° deskew angle below: + # should actually get a 90° @orientation + # (but that's ultimately for deskewing to decide): PT.VERTICAL_TEXT]: region = TextRegionType(id=ID, Coords=coords) page.add_TextRegion(region) @@ -259,14 +276,15 @@ def _process_page(self, it, page, page_image, page_xywh, page_id, file_id): # You have been warned! 
# get the raw image (masked by white space along the block polygon): region_image, _, _ = it.GetImage(RIL.BLOCK, self.parameter['padding'], page_image) + page_xywh['features'] += ',cropped' # update METS (add the image file): file_path = self.workspace.save_image_file(region_image, file_id + '_' + ID, page_id=page_id, - file_grp=FILEGRP_IMG) + file_grp=self.image_grp) # update PAGE (reference the image file): region.add_AlternativeImage(AlternativeImageType( - filename=file_path, comments="cropped")) + filename=file_path, comments=page_xywh['features'])) # # iterator increment # diff --git a/ocrd_tesserocr/segment_word.py b/ocrd_tesserocr/segment_word.py index a4db5fe..909b647 100644 --- a/ocrd_tesserocr/segment_word.py +++ b/ocrd_tesserocr/segment_word.py @@ -35,7 +35,7 @@ def process(self): Open and deserialize PAGE input files and their respective images, then iterate over the element hierarchy down to the textline level, - and remove any existing Word elements (unless `overwrite_words` + and remove any existing Word elements (unless ``overwrite_words`` is False). 
Set up Tesseract to detect words, and add each one to the line @@ -53,22 +53,24 @@ def process(self): page_id = input_file.pageId or input_file.ID LOG.info("INPUT FILE %i / %s", n, page_id) pcgts = page_from_file(self.workspace.download_file(input_file)) + page = pcgts.get_Page() + + # add metadata about this operation and its runtime parameters: metadata = pcgts.get_Metadata() # ensured by from_file() metadata.add_MetadataItem( MetadataItemType(type_="processingStep", name=self.ocrd_tool['steps'][0], value=TOOL, - # FIXME: externalRef is invalid by pagecontent.xsd, but ocrd does not reflect this - # what we want here is `externalModel="ocrd-tool" externalId="parameters"` - Labels=[LabelsType(#externalRef="parameters", - Label=[LabelType(type_=name, - value=self.parameter[name]) - for name in self.parameter.keys()])])) - page = pcgts.get_Page() + Labels=[LabelsType( + externalModel="ocrd-tool", + externalId="parameters", + Label=[LabelType(type_=name, + value=self.parameter[name]) + for name in self.parameter.keys()])])) page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id) - if page_image_info.xResolution != 1: - dpi = page_image_info.xResolution + if page_image_info.resolution != 1: + dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi = round(dpi * 2.54) tessapi.SetVariable('user_defined_dpi', str(dpi)) diff --git a/requirements.txt b/requirements.txt index df539ca..cd4e740 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ ocrd >= 1.0.0b17 click -tesserocr==2.4.1 +tesserocr>=2.4.1 diff --git a/setup.py b/setup.py index eb0cb5f..f3b20de 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ setup( name='ocrd_tesserocr', - version='0.4.0', + version='0.4.1', description='Tesserocr bindings', long_description=codecs.open('README.rst', encoding='utf-8').read(), author='Konstantin Baierer, Kay-Michael Würzner, Robert Sachunsky',