QA Spec - Schema #236

Closed
wants to merge 15 commits
1 change: 1 addition & 0 deletions .gitignore
@@ -1,2 +1,3 @@
/.project
.idea/
venv
2 changes: 1 addition & 1 deletion Makefile
@@ -7,4 +7,4 @@ validate: json
jsonschema --output pretty --validator Draft201909Validator --instance ocrd_eval.sample.json ocrd_eval.schema.json

deps:
pip install yaml click jsonschema
pip install pyyaml click jsonschema
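
A note on the `deps` fix: the YAML library is published on PyPI as pyyaml, even though it is imported as yaml. The `validate` target also depends on a `json` target that lies outside this hunk; presumably it regenerates ocrd_eval.schema.json from ocrd_eval.schema.yml. A minimal sketch of such a conversion step, under that assumption (the repository's actual target may differ):

import json

import yaml  # provided by the pyyaml dependency

# Hypothetical helper: load the hand-maintained YAML schema and write the
# single-line JSON form that is tracked next to it.
with open("ocrd_eval.schema.yml") as yml_file:
    schema = yaml.safe_load(yml_file)

with open("ocrd_eval.schema.json", "w") as json_file:
    json.dump(schema, json_file)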
102 changes: 102 additions & 0 deletions ocrd_eval.sample.json
@@ -0,0 +1,102 @@
[
{
"@id": "wf-data16_ant_complex_minimal_ocr-eval",
"label": "Workflow on data 16_ant_complex_minimal_ocr",
"metadata": {
"ocr_workflow": {
"@id": "https://github.com/OCR-D/quiver-back-end/blob/main/workflows/ocrd_workflows/minimal_ocr.txt",
"label": "OCR Workflow minimal_ocr"
},
"eval_workflow": {
"@id": "https://github.com/OCR-D/quiver-back-end/blob/main/workflows/ocrd_workflows/dinglehopper_eval.txt",
"label": "Evaluation Workflow dinglehopper_eval"
},
"gt_workspace": {
"@id": "https://github.com/OCR-D/quiver-data/blob/main/16_ant_complex.ocrd.zip",
"label": "GT workspace 16th century Antiqua complex layout"
},
"ocr_workspace": {
"@id": "https://github.com/OCR-D/quiver-back-end/blob/main/workflows/results/16_ant_complex_minimal_ocr_ocr.zip",
"label": "OCR workspace for 16_ant_complex_minimal_ocr"
},
"eval_workspace": {
"@id": "https://github.com/OCR-D/quiver-back-end/blob/main/workflows/results/16_ant_complex_minimal_ocr_evaluation.zip",
"label": "Evaluation workspace for 16_ant_complex_minimal_ocr"
},
"workflow_steps": [
{
"id": "ocrd-tesserocr-recognize",
"params": {
"segmentation_level": "region",
"textequiv_level": "word",
"find_tables": true,
"model": "Fraktur_GT4HistOCR",
"dpi": 0,
"padding": 0,
"overwrite_segments": false,
"overwrite_text": true,
"shrink_polygons": false,
"block_polygons": false,
"find_staves": false,
"sparse_text": false,
"raw_lines": false,
"char_whitelist": "",
"char_blacklist": "",
"char_unblacklist": "",
"tesseract_parameters": {},
"xpath_parameters": {},
"xpath_model": {},
"auto_model": false,
"oem": "DEFAULT"
}
}
],
"workflow_model": "Fraktur_GT4HistOCR",
"eval_tool": "ocrd-dinglehopper vNone",
"document_metadata": {
"data_properties": {
"fonts": [
"Antiqua"
],
"publication_century": "1500-1600",
"publication_decade": "",
"publication_year": "16th century",
"number_of_pages": 3,
"layout": "complex"
}
}
},
"evaluation_results": {
"document_wide": {
"wall_time": 7.72297,
"cpu_time": 10.385645,
"cer_mean": 0.10240852523716282,
"cer_median": 0.10536980749746708,
"cer_range": [
0.07124352331606218,
0.1306122448979592
],
"cer_standard_deviation": 0.02979493530847308,
"wer": 0.23466068901129858,
"pages_per_minute": 23.307095586283516
},
"by_page": [
{
"page_id": "phys_0007",
"cer_mean": 0.07124352331606218,
"wer": 0.2231404958677686
},
{
"page_id": "phys_0008",
"cer_mean": 0.10536980749746708,
"wer": 0.2484472049689441
},
{
"page_id": "phys_0009",
"cer_mean": 0.1306122448979592,
"wer": 0.2323943661971831
}
]
}
}
]
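
To show how this structure is meant to be consumed (the schema repeatedly notes which fields exist "for UI"), here is a small illustrative Python sketch that loads the sample above and prints its document-wide and per-page metrics; only the file name is taken from this PR, the rest is an assumption about typical usage:

import json

# One entry per evaluated (workflow, GT workspace) combination.
with open("ocrd_eval.sample.json") as f:
    evaluations = json.load(f)

for evaluation in evaluations:
    results = evaluation["evaluation_results"]
    document_wide = results["document_wide"]
    print(evaluation["label"])
    print("  CER (mean):", document_wide["cer_mean"])
    print("  WER:", document_wide["wer"])
    for page in results["by_page"]:
        print(f"  {page['page_id']}: CER {page['cer_mean']}, WER {page['wer']}")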
1 change: 1 addition & 0 deletions ocrd_eval.schema.json
@@ -0,0 +1 @@
{"$schema": "https://json-schema.org/draft/2019-09/schema", "$id": "https://ocr-d.de/en/spec/ocrd_eval.schema.json", "title": "A List of Evaluations for OCR-D", "description": "- All references to URL are JSON-LD-like objects with at least an `@id`\n property referencing the URL and `label` for a human-readable label to be\n used in the UI.\n", "type": "array", "items": {"required": ["@id", "label", "metadata", "evaluation_results"], "unevaluatedProperties": false, "allOf": [{"$ref": "#/$defs/LabeledUrl"}, {"properties": {"metadata": {"$ref": "#/$defs/EvaluationMetadata"}, "evaluation_results": {"$ref": "#/$defs/EvaluationReport"}}}]}, "$defs": {"LabeledUrl": {"type": "object", "required": ["@id"], "properties": {"@id": {"type": "string", "format": "uri", "description": "URL of the thing"}, "label": {"type": "string", "description": "Description of the thing for UI purposes"}}}, "EvaluationMetadata": {"type": "object", "title": "Metadata about one evaluation", "additionalProperties": false, "description": "EvaluationMetadata contains all the info on how an EvaluationReport came to be.\nThere are two OCR-D *workflows* involved:\n - ocr_workflow: The workflow which produced the OCR results to evaluate\n - eval_workflow: The workflow run to evaluate OCR and GT\n\nThere are three OCR-D *workspaces* involved:\n - gt_workspace: The workspace containing the GT\n - ocr_workspace: The workspace containing the OCR results from ocr_workflow\n - eval_workspace: The workspace on which the eval_workflow was run\n", "required": ["ocr_workflow", "ocr_workspace", "eval_workflow", "eval_workspace", "gt_workspace", "document_metadata"], "properties": {"ocr_workflow": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The OCR-D workflow that produced the ocr_workspace"}, "ocr_workspace": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The workspace containing the OCR"}, "eval_workflow": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The OCR-D workflow that produced the eval_workspace"}, "eval_workspace": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The workspace containing the evaluation results"}, "gt_workspace": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The workspace containing the GT"}, "workflow_steps": {"type": "array", "description": "Human readable description of the individual steps and their parameters in the workflow (for UI)", "minItems": 1, "items": {"type": "object", "properties": {"id": {"type": "string", "description": "The name of the processor used for this workflow step", "pattern": "^ocrd-[a-z\\-]+"}, "params": {"type": "object", "description": "A map of parameters and their values applied to the processor used for this workflow step"}}, "required": ["id", "params"]}}, "workflow_model": {"type": "string", "description": "Human readable name of the main model used for recognition in the OCR workflow (for UI)"}, "eval_tool": {"type": "string", "description": "Human readable name and version of evaluation tool used (for UI)"}, "document_metadata": {"type": "object", "title": "Bibliographical and typographical metadata about the work to be evaluated", "properties": {"publication_year": {"type": "number", "description": "Year the document was originally published"}, "publication_century": {"type": "string", "description": "Century the document was originally published", "pattern": "[12][0-9]{3}-[12][0-9]{3}"}, "publication_decade": {"type": "string", "description": "Decade the document was originally published", "pattern": 
"[12][0-9]{2}0-[12][0-9]{2}0"}, "number_of_pages": {"type": "number", "description": "Number of pages in this work (i.e. the number of images in the gt_workspace)"}, "layout": {"type": "string", "enum": ["simple", "complex"]}, "fonts": {"type": "array", "items": {"type": "string", "enum": ["antiqua", "textura", "gotico-antiqua", "rotunda", "italic", "bastarda", "greek", "schwabacher", "hebrew", "fraktur"]}}}}, "provenance": {"type": "object", "description": "Information on which tools in which version were used in determining metrics", "properties": {"parameters": {"type": "object", "description": "Parameters passed to the evaluation processor"}}}}}, "EvaluationReport": {"type": "object", "additionalProperties": false, "description": "The metrics measured for this document", "properties": {"document_wide": {"type": "object", "description": "Document-wide metrics", "allOf": [{"$ref": "#$defs/DocumentEvaluationMetrics"}, {"$ref": "#$defs/CommonEvaluationMetrics"}], "unevaluatedProperties": false}, "by_page": {"type": "array", "description": "Metrics page-by-page", "items": {"type": "object", "allOf": [{"$ref": "#$defs/CommonEvaluationMetrics"}, {"$ref": "#$defs/PageId"}], "unevaluatedProperties": false}}}}, "PageId": {"type": "object", "properties": {"page_id": {"type": "string", "description": "PAGE ID"}}}, "CommonEvaluationMetrics": {"type": "object", "properties": {"cer_mean": {"type": "number", "description": "Arithmetic mean of the page-wise CER (in document_wide) or regions on a page (in by_page)"}, "wer": {"type": "number", "description": "CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)"}}}, "DocumentEvaluationMetrics": {"type": "object", "properties": {"cer_median": {"type": "number", "description": "Median of the page-wise CER (in document_wide) or regions on a page (in by_page)"}, "cer_range": {"type": "array", "minItems": 2, "maxItems": 2, "items": {"type": "number", "description": "Minimum and maximum of CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)"}}, "cer_standard_deviation": {"type": "number", "description": "Standard deviation the page-wise CER (in document_wide) or regions on a page (in by_page)"}, "wall_time": {"type": "number", "description": "Actual time needed for processing workflow"}, "cpu_time": {"type": "number", "description": "Cumulative CPU time used for processing workflow"}, "pages_per_minute": {"type": "number", "description": "Number of pages processed per minute"}}}}}
217 changes: 217 additions & 0 deletions ocrd_eval.schema.yml
@@ -0,0 +1,217 @@
$schema: https://json-schema.org/draft/2019-09/schema
$id: https://ocr-d.de/en/spec/ocrd_eval.schema.json

title: A List of Evaluations for OCR-D
description: >
  - All references to URLs are JSON-LD-like objects with at least an `@id`
property referencing the URL and `label` for a human-readable label to be
used in the UI.
type: array
items:
required: ['@id', 'label', 'metadata', 'evaluation_results']
unevaluatedProperties: false
allOf:
- { '$ref': '#/$defs/LabeledUrl' }
- properties:
metadata: { '$ref': '#/$defs/EvaluationMetadata' }
evaluation_results: { '$ref': '#/$defs/EvaluationReport' }

# Reusable definitions
$defs:

LabeledUrl:
type: object
required: ['@id']
properties:
'@id':
type: string
format: uri
description: URL of the thing
label:
type: string
description: Description of the thing for UI purposes

EvaluationMetadata:
type: object
title: Metadata about one evaluation
additionalProperties: false
description: >
EvaluationMetadata contains all the info on how an EvaluationReport came to be.

There are two OCR-D *workflows* involved:
- ocr_workflow: The workflow which produced the OCR results to evaluate
- eval_workflow: The workflow run to evaluate OCR and GT

There are three OCR-D *workspaces* involved:
- gt_workspace: The workspace containing the GT
- ocr_workspace: The workspace containing the OCR results from ocr_workflow
- eval_workspace: The workspace on which the eval_workflow was run

required:
- ocr_workflow
- ocr_workspace
- eval_workflow
- eval_workspace
- gt_workspace
- document_metadata

properties:

ocr_workflow:
allOf: [{ '$ref': '#/$defs/LabeledUrl' }]
description: The OCR-D workflow that produced the ocr_workspace

ocr_workspace:
allOf: [{ '$ref': '#/$defs/LabeledUrl' }]
description: The workspace containing the OCR

eval_workflow:
allOf: [{ '$ref': '#/$defs/LabeledUrl' }]
description: The OCR-D workflow that produced the eval_workspace

eval_workspace:
allOf: [{ '$ref': '#/$defs/LabeledUrl' }]
description: The workspace containing the evaluation results

gt_workspace:
allOf: [{ '$ref': '#/$defs/LabeledUrl' }]
description: The workspace containing the GT

workflow_steps:
type: array
description: Human readable description of the individual steps and their parameters in the workflow (for UI)
minItems: 1
items:
type: object
properties:
id:
type: string
description: The name of the processor used for this workflow step
pattern: '^ocrd-[a-z\-]+'
params:
type: object
description: A map of parameters and their values applied to the processor used for this workflow step
required: ['id', 'params']

workflow_model:
type: string
description: Human readable name of the main model used for recognition in the OCR workflow (for UI)

eval_tool:
type: string
description: Human readable name and version of evaluation tool used (for UI)

document_metadata:
type: object
title: Bibliographical and typographical metadata about the work to be evaluated
properties:

publication_year:
type: number
description: Year the document was originally published

publication_century:
type: string
description: Century the document was originally published
pattern: '[12][0-9]{3}-[12][0-9]{3}'

publication_decade:
type: string
description: Decade the document was originally published
pattern: '[12][0-9]{2}0-[12][0-9]{2}0'

number_of_pages:
type: number
description: Number of pages in this work (i.e. the number of images in the gt_workspace)

layout:
type: string
enum: ['simple', 'complex']

fonts:
type: array
items:
type: string
enum: ['antiqua', 'textura', 'gotico-antiqua', 'rotunda', 'italic', 'bastarda', 'greek', 'schwabacher', 'hebrew', 'fraktur']

provenance:
type: object
description: Information on which tools in which version were used in determining metrics
properties:
parameters:
type: object
description: Parameters passed to the evaluation processor

EvaluationReport:
type: object
additionalProperties: false
description: The metrics measured for this document
properties:
document_wide:
type: object
description: Document-wide metrics
allOf: [
          { $ref: '#/$defs/DocumentEvaluationMetrics' },
          { $ref: '#/$defs/CommonEvaluationMetrics' }
]
unevaluatedProperties: false
by_page:
type: array
description: Metrics page-by-page
items:
type: object
allOf: [
            { $ref: '#/$defs/CommonEvaluationMetrics' },
            { $ref: '#/$defs/PageId' }
]
unevaluatedProperties: false

PageId:
type: object
properties:
page_id:
type: string
description: PAGE ID

CommonEvaluationMetrics:
type: object
properties:
cer_mean:
type: number
description: Arithmetic mean of the page-wise CER (in document_wide) or regions on a page (in by_page)

wer:
type: number
        description: WER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)


DocumentEvaluationMetrics:
type: object
properties:
cer_median:
type: number
description: Median of the page-wise CER (in document_wide) or regions on a page (in by_page)

cer_range:
type: array
minItems: 2
maxItems: 2
items:
type: number
description: Minimum and maximum of CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)

cer_standard_deviation:
type: number
        description: Standard deviation of the page-wise CER (in document_wide) or regions on a page (in by_page)

wall_time:
type: number
description: Actual time needed for processing workflow

cpu_time:
type: number
description: Cumulative CPU time used for processing workflow

pages_per_minute:
type: number
description: Number of pages processed per minute
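
As a cross-check of the metric definitions above, the document-wide CER figures in ocrd_eval.sample.json can be reproduced as plain aggregates of the per-page cer_mean values; cer_standard_deviation matches the sample standard deviation (with Bessel's correction), and pages_per_minute matches number_of_pages divided by wall_time in minutes. This is an observation about the sample data, not a normative definition; a short sketch:

import statistics

# Per-page CER values and timing taken from ocrd_eval.sample.json
page_cer = [0.07124352331606218, 0.10536980749746708, 0.1306122448979592]
wall_time = 7.72297  # seconds
number_of_pages = 3

print("cer_mean:", statistics.mean(page_cer))                   # ~0.102408...
print("cer_median:", statistics.median(page_cer))               # ~0.105369...
print("cer_range:", [min(page_cer), max(page_cer)])
print("cer_standard_deviation:", statistics.stdev(page_cer))    # ~0.029794... (sample std dev)
print("pages_per_minute:", number_of_pages / (wall_time / 60))  # ~23.307...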