Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

QA Spec - Schema #236

Closed
wants to merge 15 commits into from
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
/.project
.idea/
venv
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ validate: json
jsonschema --output pretty --validator Draft201909Validator --instance ocrd_eval.sample.json ocrd_eval.schema.json

deps:
pip install yaml click jsonschema
pip install pyyaml click jsonschema
102 changes: 102 additions & 0 deletions ocrd_eval.sample.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
[
  {
    "@id": "wf-data16_ant_complex_minimal_ocr-eval",
    "label": "Workflow on data 16_ant_complex_minimal_ocr",
    "metadata": {
      "ocr_workflow": {
        "@id": "https://github.com/OCR-D/quiver-back-end/blob/main/workflows/ocrd_workflows/minimal_ocr.txt",
        "label": "OCR Workflow minimal_ocr"
      },
      "eval_workflow": {
        "@id": "https://github.com/OCR-D/quiver-back-end/blob/main/workflows/ocrd_workflows/dinglehopper_eval.txt",
        "label": "Evaluation Workflow dinglehopper_eval"
      },
      "gt_workspace": {
        "@id": "https://github.com/OCR-D/quiver-data/blob/main/16_ant_complex.ocrd.zip",
        "label": "GT workspace 16th century Antiqua complex layout"
      },
      "ocr_workspace": {
        "@id": "https://github.com/OCR-D/quiver-back-end/blob/main/workflows/results/16_ant_complex_minimal_ocr_ocr.zip",
        "label": "OCR workspace for 16_ant_complex_minimal_ocr"
      },
      "eval_workspace": {
        "@id": "https://github.com/OCR-D/quiver-back-end/blob/main/workflows/results/16_ant_complex_minimal_ocr_evaluation.zip",
        "label": "Evaluation workspace for 16_ant_complex_minimal_ocr"
      },
      "workflow_steps": [
        {
          "id": "ocrd-tesserocr-recognize",
          "params": {
            "segmentation_level": "region",
            "textequiv_level": "word",
            "find_tables": true,
            "model": "Fraktur_GT4HistOCR",
            "dpi": 0,
            "padding": 0,
            "overwrite_segments": false,
            "overwrite_text": true,
            "shrink_polygons": false,
            "block_polygons": false,
            "find_staves": false,
            "sparse_text": false,
            "raw_lines": false,
            "char_whitelist": "",
            "char_blacklist": "",
            "char_unblacklist": "",
            "tesseract_parameters": {},
            "xpath_parameters": {},
            "xpath_model": {},
            "auto_model": false,
            "oem": "DEFAULT"
          }
        }
      ],
      "workflow_model": "Fraktur_GT4HistOCR",
      "eval_tool": "ocrd-dinglehopper vNone",
      "document_metadata": {
        "fonts": [
          "antiqua"
        ],
        "publication_century": "1500-1600",
        "number_of_pages": 3,
        "layout": "complex"
      }
    },
    "evaluation_results": {
      "document_wide": {
        "wall_time": 7.72297,
        "cpu_time": 10.385645,
        "cer_mean": 0.10240852523716282,
        "cer_median": 0.10536980749746708,
        "cer_range": [
          0.07124352331606218,
          0.1306122448979592
        ],
        "cer_standard_deviation": 0.02979493530847308,
        "wer": 0.23466068901129858,
        "pages_per_minute": 23.307095586283516
      },
      "by_page": [
        {
          "page_id": "phys_0007",
          "cer_mean": 0.07124352331606218,
          "wer": 0.2231404958677686
        },
        {
          "page_id": "phys_0008",
          "cer_mean": 0.10536980749746708,
          "wer": 0.2484472049689441
        },
        {
          "page_id": "phys_0009",
          "cer_mean": 0.1306122448979592,
          "wer": 0.2323943661971831
        }
      ]
    }
  }
]
1 change: 1 addition & 0 deletions ocrd_eval.schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"$schema": "https://json-schema.org/draft/2019-09/schema", "$id": "https://ocr-d.de/en/spec/ocrd_eval.schema.json", "title": "A List of Evaluations for OCR-D", "description": "- All references to URL are JSON-LD-like objects with at least an `@id`\n property referencing the URL and `label` for a human-readable label to be\n used in the UI.\n", "type": "array", "items": {"required": ["@id", "label", "metadata", "evaluation_results"], "unevaluatedProperties": false, "allOf": [{"$ref": "#/$defs/LabeledUrl"}, {"properties": {"metadata": {"$ref": "#/$defs/EvaluationMetadata"}, "evaluation_results": {"$ref": "#/$defs/EvaluationReport"}}}]}, "$defs": {"LabeledUrl": {"type": "object", "required": ["@id"], "properties": {"@id": {"type": "string", "format": "uri", "description": "URL of the thing"}, "label": {"type": "string", "description": "Description of the thing for UI purposes"}}}, "EvaluationMetadata": {"type": "object", "title": "Metadata about one evaluation", "additionalProperties": false, "description": "EvaluationMetadata contains all the info on how an EvaluationReport came to be.\nThere are two OCR-D *workflows* involved:\n - ocr_workflow: The workflow which produced the OCR results to evaluate\n - eval_workflow: The workflow run to evaluate OCR and GT\n\nThere are three OCR-D *workspaces* involved:\n - gt_workspace: The workspace containing the GT\n - ocr_workspace: The workspace containing the OCR results from ocr_workflow\n - eval_workspace: The workspace on which the eval_workflow was run\n", "required": ["ocr_workflow", "ocr_workspace", "eval_workflow", "eval_workspace", "gt_workspace", "document_metadata"], "properties": {"ocr_workflow": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The OCR-D workflow that produced the ocr_workspace"}, "ocr_workspace": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The workspace containing the OCR"}, "eval_workflow": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The OCR-D workflow that produced the eval_workspace"}, "eval_workspace": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The workspace containing the evaluation results"}, "gt_workspace": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The workspace containing the GT"}, "workflow_steps": {"type": "array", "description": "Human readable description of the individual steps and their parameters in the workflow (for UI)", "minItems": 1, "items": {"type": "object", "properties": {"id": {"type": "string", "description": "The name of the processor used for this workflow step", "pattern": "^ocrd-[a-z\\-]+"}, "params": {"type": "object", "description": "A map of parameters and their values applied to the processor used for this workflow step"}}, "required": ["id", "params"]}}, "workflow_model": {"type": "string", "description": "Human readable name of the main model used for recognition in the OCR workflow (for UI)"}, "eval_tool": {"type": "string", "description": "Human readable name and version of evaluation tool used (for UI)"}, "document_metadata": {"type": "object", "title": "Bibliographical and typographical metadata about the work to be evaluated", "properties": {"publication_year": {"type": "integer", "description": "Year the document was originally published"}, "publication_century": {"type": "string", "description": "Century the document was originally published", "pattern": "^[12][0-9]{3}-[12][0-9]{3}$"}, "publication_decade": {"type": "string", "description": "Decade the document was originally published", "pattern": "^[12][0-9]{2}0-[12][0-9]{2}0$"}, "number_of_pages": {"type": "integer", "description": "Number of pages in this work (i.e. the number of images in the gt_workspace)"}, "layout": {"type": "string", "enum": ["simple", "complex"]}, "fonts": {"type": "array", "items": {"type": "string", "enum": ["antiqua", "fraktur"]}}}}, "provenance": {"type": "object", "description": "Information on which tools in which version were used in determining metrics", "properties": {"parameters": {"type": "object", "description": "Parameters passed to the evaluation processor"}}}}}, "EvaluationReport": {"type": "object", "additionalProperties": false, "description": "The metrics measured for this document", "properties": {"document_wide": {"type": "object", "description": "Document-wide metrics", "allOf": [{"$ref": "#/$defs/DocumentEvaluationMetrics"}, {"$ref": "#/$defs/CommonEvaluationMetrics"}], "unevaluatedProperties": false}, "by_page": {"type": "array", "description": "Metrics page-by-page", "items": {"type": "object", "allOf": [{"$ref": "#/$defs/CommonEvaluationMetrics"}, {"$ref": "#/$defs/PageId"}], "unevaluatedProperties": false}}}}, "PageId": {"type": "object", "properties": {"page_id": {"type": "string", "description": "PAGE ID"}}}, "CommonEvaluationMetrics": {"type": "object", "properties": {"cer_mean": {"type": "number", "description": "Arithmetic mean of the page-wise CER (in document_wide) or regions on a page (in by_page)"}, "wer": {"type": "number", "description": "Word error rate (WER) calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)"}}}, "DocumentEvaluationMetrics": {"type": "object", "properties": {"cer_median": {"type": "number", "description": "Median of the page-wise CER (in document_wide) or regions on a page (in by_page)"}, "cer_range": {"type": "array", "minItems": 2, "maxItems": 2, "items": {"type": "number", "description": "Minimum and maximum of CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)"}}, "cer_standard_deviation": {"type": "number", "description": "Standard deviation of the page-wise CER (in document_wide) or regions on a page (in by_page)"}, "wall_time": {"type": "number", "description": "Actual time needed for processing workflow"}, "cpu_time": {"type": "number", "description": "Cumulative CPU time used for processing workflow"}, "pages_per_minute": {"type": "number", "description": "Number of pages processed per minute"}}}}}
217 changes: 217 additions & 0 deletions ocrd_eval.schema.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
# Root of the OCR-D evaluation schema: the document is an array of evaluations.
"$schema": "https://json-schema.org/draft/2019-09/schema"
"$id": "https://ocr-d.de/en/spec/ocrd_eval.schema.json"

title: "A List of Evaluations for OCR-D"
# Literal block scalar; the continuation lines carry one extra leading space
# so the value equals the description string in the generated JSON schema.
description: |
  - All references to URL are JSON-LD-like objects with at least an `@id`
   property referencing the URL and `label` for a human-readable label to be
   used in the UI.
type: array
items:
  # Every evaluation is itself a LabeledUrl (`@id` + `label`) that additionally
  # carries `metadata` and `evaluation_results`.
  required:
    - "@id"
    - "label"
    - "metadata"
    - "evaluation_results"
  # `unevaluatedProperties` (not `additionalProperties`) so that properties
  # declared inside the `allOf` branches count as known.
  unevaluatedProperties: false
  allOf:
    - "$ref": "#/$defs/LabeledUrl"
    - properties:
        metadata:
          "$ref": "#/$defs/EvaluationMetadata"
        evaluation_results:
          "$ref": "#/$defs/EvaluationReport"

# Reusable definitions
$defs:

LabeledUrl:
  # JSON-LD-like reference: a mandatory `@id` plus an optional `label`
  # intended for display in the UI.
  type: object
  required:
    - "@id"
  properties:
    "@id": { type: string, format: uri, description: URL of the thing }
    label: { type: string, description: Description of the thing for UI purposes }

EvaluationMetadata:
  # Closed object (`additionalProperties: false`): only the properties listed
  # below may appear in an evaluation's `metadata`.
  type: object
  title: Metadata about one evaluation
  additionalProperties: false
  description: >
    EvaluationMetadata contains all the info on how an EvaluationReport came to be.

    There are two OCR-D *workflows* involved:
      - ocr_workflow: The workflow which produced the OCR results to evaluate
      - eval_workflow: The workflow run to evaluate OCR and GT

    There are three OCR-D *workspaces* involved:
      - gt_workspace: The workspace containing the GT
      - ocr_workspace: The workspace containing the OCR results from ocr_workflow
      - eval_workspace: The workspace on which the eval_workflow was run

  required:
    - ocr_workflow
    - ocr_workspace
    - eval_workflow
    - eval_workspace
    - gt_workspace
    - document_metadata

  properties:

    # The five references below are all LabeledUrl objects; each `$ref` is
    # wrapped in `allOf` with a property-specific description beside it.
    ocr_workflow:
      allOf: [{ '$ref': '#/$defs/LabeledUrl' }]
      description: The OCR-D workflow that produced the ocr_workspace

    ocr_workspace:
      allOf: [{ '$ref': '#/$defs/LabeledUrl' }]
      description: The workspace containing the OCR

    eval_workflow:
      allOf: [{ '$ref': '#/$defs/LabeledUrl' }]
      description: The OCR-D workflow that produced the eval_workspace

    eval_workspace:
      allOf: [{ '$ref': '#/$defs/LabeledUrl' }]
      description: The workspace containing the evaluation results

    gt_workspace:
      allOf: [{ '$ref': '#/$defs/LabeledUrl' }]
      description: The workspace containing the GT

    workflow_steps:
      type: array
      description: Human readable description of the individual steps and their parameters in the workflow (for UI)
      minItems: 1
      items:
        type: object
        properties:
          id:
            type: string
            description: The name of the processor used for this workflow step
            pattern: '^ocrd-[a-z\-]+'
          params:
            type: object
            description: A map of parameters and their values applied to the processor used for this workflow step
        required: ['id', 'params']

    workflow_model:
      type: string
      description: Human readable name of the main model used for recognition in the OCR workflow (for UI)

    eval_tool:
      type: string
      description: Human readable name and version of evaluation tool used (for UI)

    document_metadata:
      type: object
      title: Bibliographical and typographical metadata about the work to be evaluated
      properties:

        publication_year:
          # A year is a whole number, so `integer` rather than `number`.
          type: integer
          description: Year the document was originally published

        publication_century:
          type: string
          description: Century the document was originally published
          # JSON Schema patterns are unanchored by default; `^...$` makes the
          # whole value match "YYYY-YYYY" instead of merely containing it.
          pattern: '^[12][0-9]{3}-[12][0-9]{3}$'

        publication_decade:
          type: string
          description: Decade the document was originally published
          # Anchored for the same reason; both years must end in 0.
          pattern: '^[12][0-9]{2}0-[12][0-9]{2}0$'

        number_of_pages:
          # A page count is a whole number, so `integer` rather than `number`.
          type: integer
          description: Number of pages in this work (i.e. the number of images in the gt_workspace)

        layout:
          type: string
          enum: ['simple', 'complex']

        fonts:
          type: array
          items:
            type: string
            # Enum values are lowercase; instances must use the same casing.
            enum: ['antiqua', 'fraktur']

    provenance:
      type: object
      description: Information on which tools in which version were used in determining metrics
      properties:
        parameters:
          type: object
          description: Parameters passed to the evaluation processor

EvaluationReport:
  # Closed object holding the measured metrics: one document-wide summary and
  # one entry per page.
  type: object
  additionalProperties: false
  description: The metrics measured for this document
  properties:
    document_wide:
      type: object
      description: Document-wide metrics
      # References must be JSON Pointer fragments, i.e. start with `#/`.
      allOf: [
        { $ref: '#/$defs/DocumentEvaluationMetrics' },
        { $ref: '#/$defs/CommonEvaluationMetrics' }
      ]
      # Only properties declared by the referenced definitions are allowed.
      unevaluatedProperties: false
    by_page:
      type: array
      description: Metrics page-by-page
      items:
        type: object
        allOf: [
          { $ref: '#/$defs/CommonEvaluationMetrics' },
          { $ref: '#/$defs/PageId' }
        ]
        unevaluatedProperties: false
mweidling marked this conversation as resolved.
Show resolved Hide resolved

PageId:
  # Contributes the `page_id` property; combined via `allOf` with the common
  # metrics for each `by_page` entry.
  type: object
  properties:
    page_id: { type: string, description: PAGE ID }

CommonEvaluationMetrics:
  # Metrics referenced by both `document_wide` and each `by_page` entry.
  type: object
  properties:
    cer_mean:
      type: number
      description: Arithmetic mean of the page-wise CER (in document_wide) or regions on a page (in by_page)

    wer:
      type: number
      description: Word error rate (WER) calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)


DocumentEvaluationMetrics:
  # Metrics referenced only by `document_wide`.
  type: object
  properties:
    cer_median:
      type: number
      description: Median of the page-wise CER (in document_wide) or regions on a page (in by_page)

    cer_range:
      # Exactly two numbers (minItems = maxItems = 2).
      type: array
      minItems: 2
      maxItems: 2
      items:
        type: number
        description: Minimum and maximum of CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)

    cer_standard_deviation:
      type: number
      description: Standard deviation of the page-wise CER (in document_wide) or regions on a page (in by_page)

    wall_time:
      type: number
      description: Actual time needed for processing workflow

    cpu_time:
      type: number
      description: Cumulative CPU time used for processing workflow

    pages_per_minute:
      type: number
      description: Number of pages processed per minute