diff --git a/ocrd_eval.sample.json b/ocrd_eval.sample.json index 80de251..df71d17 100644 --- a/ocrd_eval.sample.json +++ b/ocrd_eval.sample.json @@ -1 +1 @@ -[{"@id": "https://github.com/OCR-D/quiver/tree/data/evaluations/wf1-data345-eval1.json", "label": "OCR workflow 1 on workspace 345", "metadata": {"ocr_workflow": {"@id": "https://github.com/OCR-D/quiver/tree/data/workflows/1.nf", "label": "OCR Workflow 1"}, "eval_workflow": {"@id": "https://github.com/OCR-D/quiver/tree/data/workflows/eval1.nf", "label": "Evaluation Workflow 1"}, "gt_workspace": {"@id": "https://gt.ocr-d.de/workspace/789", "label": "GT workspace 789 (19th century fraktur)"}, "ocr_workspace": {"@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/3000.ocrd.zip", "label": "OCR result workspace 3000"}, "eval_workspace": {"@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/345.ocrd.zip", "label": "Evaluation Workspace 345"}, "workflow_steps": {"0": "Processor A", "1": "Processor B"}, "workflow_model": "Fraktur_GT4HistOCR", "document_metadata": {"fonts": ["antiqua", "fraktur"], "publication_century": "1800-1900", "publication_decade": "1850-1860", "publication_year": 1855, "number_of_pages": 100, "layout": "simple"}}, "evaluation": {"document_wide": {"wall_time": 1234, "cer": 0.57, "cer_min_max": [0.2, 0.57]}, "by_page": [{"page_id": "PHYS_0001", "cer": 0.8, "processing_time": 2.1}]}}, {"@id": "https://github.com/OCR-D/quiver/tree/data/evaluations/wf2-data345-eval1.json", "label": "OCR Workflow 2 on Data 345", "metadata": {"ocr_workflow": {"@id": "https://github.com/OCR-D/quiver/tree/data/workflows/2.nf", "label": "OCR Workflow 2"}, "eval_workflow": {"@id": "https://github.com/OCR-D/quiver/tree/data/workflows/eval1.nf", "label": "Evaluation Workflow 1"}, "gt_workspace": {"@id": "https://gt.ocr-d.de/workspace/789", "label": "GT workspace 789 (19th century fraktur)"}, "ocr_workspace": {"@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/3000.ocrd.zip", "label": "OCR result 
workspace 3000"}, "eval_workspace": {"@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/345.ocrd.zip", "label": "Evaluation Workspace 345"}, "workflow_steps": {"0": "Processor A", "1": "Processor B"}, "workflow_model": "Fraktur_GT4HistOCR", "document_metadata": {"fonts": ["antiqua", "fraktur"], "publication_century": "1800-1900", "publication_decade": "1850-1860", "publication_year": 1855, "number_of_pages": 100, "layout": "simple"}}, "evaluation": {"document_wide": {"wall_time": 4567, "cer": 0.9, "cer_min_max": [0.2, 0.99]}, "by_page": [{"page_id": "PHYS_0001", "cer": 0.9, "processing_time": 2.1}]}}] \ No newline at end of file +[{"@id": "https://github.com/OCR-D/quiver/tree/data/evaluations/wf1-data345-eval1.json", "label": "OCR workflow 1 on workspace 345", "metadata": {"ocr_workflow": {"@id": "https://github.com/OCR-D/quiver/tree/data/workflows/1.nf", "label": "OCR Workflow 1", "steps": {"0": "Processor A", "1": "Processor B"}}, "eval_workflow": {"@id": "https://github.com/OCR-D/quiver/tree/data/workflows/eval1.nf", "label": "Evaluation Workflow 1"}, "gt_workspace": {"@id": "https://gt.ocr-d.de/workspace/789", "label": "GT workspace 789 (19th century fraktur)"}, "ocr_workspace": {"@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/3000.ocrd.zip", "label": "OCR result workspace 3000"}, "eval_workspace": {"@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/345.ocrd.zip", "label": "Evaluation Workspace 345"}, "workflow_model": "Fraktur_GT4HistOCR", "document_metadata": {"fonts": ["antiqua", "fraktur"], "publication_century": "1800-1900", "publication_decade": "1850-1860", "publication_year": 1855, "number_of_pages": 100, "layout": "simple"}}, "evaluation": {"document_wide": {"wall_time": 1234, "cer": 0.57, "cer_min_max": [0.2, 0.57]}, "by_page": [{"page_id": "PHYS_0001", "cer": 0.8, "processing_time": 2.1}]}}, {"@id": "https://github.com/OCR-D/quiver/tree/data/evaluations/wf2-data345-eval1.json", "label": "OCR Workflow 2 on Data 345", 
"metadata": {"ocr_workflow": {"@id": "https://github.com/OCR-D/quiver/tree/data/workflows/2.nf", "label": "OCR Workflow 2", "steps": {"0": "Processor A", "1": "Processor B"}}, "eval_workflow": {"@id": "https://github.com/OCR-D/quiver/tree/data/workflows/eval1.nf", "label": "Evaluation Workflow 1"}, "gt_workspace": {"@id": "https://gt.ocr-d.de/workspace/789", "label": "GT workspace 789 (19th century fraktur)"}, "ocr_workspace": {"@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/3000.ocrd.zip", "label": "OCR result workspace 3000"}, "eval_workspace": {"@id": "https://github.com/OCR-D/quiver/tree/data/workspaces/345.ocrd.zip", "label": "Evaluation Workspace 345"}, "workflow_model": "Fraktur_GT4HistOCR", "document_metadata": {"fonts": ["antiqua", "fraktur"], "publication_century": "1800-1900", "publication_decade": "1850-1860", "publication_year": 1855, "number_of_pages": 100, "layout": "simple"}}, "evaluation": {"document_wide": {"wall_time": 4567, "cer": 0.9, "cer_min_max": [0.2, 0.99]}, "by_page": [{"page_id": "PHYS_0001", "cer": 0.9, "processing_time": 2.1}]}}] \ No newline at end of file diff --git a/ocrd_eval.sample.yml b/ocrd_eval.sample.yml index 74383f8..9924204 100644 --- a/ocrd_eval.sample.yml +++ b/ocrd_eval.sample.yml @@ -4,6 +4,9 @@ ocr_workflow: '@id': https://github.com/OCR-D/quiver/tree/data/workflows/1.nf label: OCR Workflow 1 + steps: + '0': Processor A + '1': Processor B eval_workflow: '@id': https://github.com/OCR-D/quiver/tree/data/workflows/eval1.nf label: Evaluation Workflow 1 @@ -16,9 +19,6 @@ eval_workspace: '@id': https://github.com/OCR-D/quiver/tree/data/workspaces/345.ocrd.zip label: Evaluation Workspace 345 - workflow_steps: - '0': Processor A - '1': Processor B workflow_model: Fraktur_GT4HistOCR document_metadata: fonts: @@ -47,6 +47,9 @@ ocr_workflow: '@id': https://github.com/OCR-D/quiver/tree/data/workflows/2.nf label: OCR Workflow 2 + steps: + '0': Processor A + '1': Processor B eval_workflow: '@id': 
https://github.com/OCR-D/quiver/tree/data/workflows/eval1.nf label: Evaluation Workflow 1 @@ -59,9 +62,6 @@ eval_workspace: '@id': https://github.com/OCR-D/quiver/tree/data/workspaces/345.ocrd.zip label: Evaluation Workspace 345 - workflow_steps: - '0': Processor A - '1': Processor B workflow_model: Fraktur_GT4HistOCR document_metadata: fonts: diff --git a/ocrd_eval.schema.json b/ocrd_eval.schema.json index c903137..b11d918 100644 --- a/ocrd_eval.schema.json +++ b/ocrd_eval.schema.json @@ -1 +1 @@ -{"$schema": "https://json-schema.org/draft/2019-09/schema", "$id": "https://ocr-d.de/en/spec/ocrd_eval.schema.json", "title": "A list of evaluations for OCR-D", "description": "- All references to URL are JSON-LD-like objects with at least an `@id`\n property referencing the URL and `label` for a human-readable label to be\n used in the UI\n", "type": "array", "items": {"required": ["@id", "label", "metadata", "evaluation"], "unevaluatedProperties": false, "allOf": [{"$ref": "#/$defs/LabeledUrl"}, {"properties": {"metadata": {"$ref": "#/$defs/EvaluationMetadata"}, "evaluation": {"$ref": "#/$defs/EvaluationReport"}}}]}, "$defs": {"LabeledUrl": {"type": "object", "required": ["@id"], "properties": {"@id": {"type": "string", "format": "uri", "description": "URL of the thing"}, "label": {"type": "string", "description": "Description of the thing for UI purposes"}}}, "EvaluationMetadata": {"type": "object", "title": "Metadata about one evaluation", "additionalProperties": false, "description": "EvaluationMetadata contains all the info on how an EvaluationReport came to be.\nThere are two OCR-D *workflows* involved:\n - ocr_workflow: The workflow which produced the OCR results to evaluate\n - eval_workflow: The workflow run to evaluate OCR and GT\n\nThere are three OCR-D *workspaces* involved:\n - gt_workspace: The workspace containing the GT\n - ocr_workspace: The workspace containing the OCR results from ocr_workflow\n - eval_workspace: The workspace on which the 
eval_workflow was run\n", "required": ["ocr_workflow", "ocr_workspace", "eval_workflow", "eval_workspace", "gt_workspace", "document_metadata"], "properties": {"ocr_workflow": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The OCR-D workflow that produced the ocr_workspace"}, "ocr_workspace": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The workspace containing the OCR"}, "eval_workflow": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The OCR-D workflow that produced the eval_workspace"}, "eval_workspace": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The workspace containing the evaluation results"}, "gt_workspace": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The workspace containing the GT"}, "workflow_steps": {"type": "object", "description": "Human readable description of the individual steps in the workflow (for UI)", "patternProperties": {"^[0-9]+$": {"type": "string", "description": "Description of this workflow step"}}}, "workflow_model": {"type": "string", "description": "Human readable name of the main model used for recognition in the OCR workflow (for UI)"}, "eval_tool": {"type": "string", "description": "Human readable name and version of evaluation tool used (for UI)"}, "document_metadata": {"type": "object", "title": "Bibliographical and typographical metadata about the work to be evaluated", "properties": {"publication_year": {"type": "number", "description": "Year the document was originally published"}, "publication_century": {"type": "string", "description": "Century the document was originally published", "pattern": "[12][0-9]{3}-[12][0-9]{3}"}, "publication_decade": {"type": "string", "description": "Decade the document was originally published", "pattern": "[12][0-9]{3}-[12][0-9]{3}"}, "number_of_pages": {"type": "number", "description": "Number of pages in this work (i.e. 
the number of images in the gt_workspace)"}, "layout": {"type": "string", "enum": ["simple", "complex"]}, "fonts": {"type": "array", "items": {"type": "string", "enum": ["antiqua", "fraktur", "ancient_greek", "hebrew"]}}}}, "provenance": {"type": "object", "description": "Information on which tools in which version were used in determining metrics", "properties": {"parameters": {"type": "object", "description": "Parameters passed to the evaluation processor"}}}}}, "EvaluationReport": {"type": "object", "additionalProperties": false, "description": "The metrics measured for this document", "properties": {"document_wide": {"type": "object", "description": "Document-wide metrics", "properties": {"$ref": "#$defs/EvaluationMetrics"}}, "by_page": {"type": "array", "description": "Metrics page-by-page", "items": {"type": "object", "allOf": [{"properties": {"page_id": {"type": "string", "description": "PAGE ID"}}}, {"properties": {"$ref": "#$defs/EvaluationMetrics"}}]}}}}, "EvaluationMetrics": {"cer": {"description": "CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)"}, "cer_mean": {"description": "Arithmetic mean of the page-wise CER (in document_wide) or regions on a page (in by_page)"}, "cer_median": {"description": "Median of the page-wise CER (in document_wide) or regions on a page (in by_page)"}, "cer_range": {"type": "array", "minItems": 2, "maxItems": 2, "items": {"type": "number", "description": "Minimum and maximum of CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)"}}, "cer_standard_deviation": {"description": "Standard deviation the page-wise CER (in document_wide) or regions on a page (in by_page)"}, "wer": {"description": "CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)"}, "wall_time": {"description": "Actual time needed for processing workflow"}, "cpu_time": {"description": "Cumulative 
CPU time used for processing workflow"}, "pages_per_minute": {"description": "Number of pages processed per minute"}}}} \ No newline at end of file +{"$schema": "https://json-schema.org/draft/2019-09/schema", "$id": "https://ocr-d.de/en/spec/ocrd_eval.schema.json", "title": "A list of evaluations for OCR-D", "description": "- All references to URL are JSON-LD-like objects with at least an `@id`\n property referencing the URL and `label` for a human-readable label to be\n used in the UI\n", "type": "array", "items": {"required": ["@id", "label", "metadata", "evaluation"], "unevaluatedProperties": false, "allOf": [{"$ref": "#/$defs/LabeledUrl"}, {"properties": {"metadata": {"$ref": "#/$defs/EvaluationMetadata"}, "evaluation": {"$ref": "#/$defs/EvaluationReport"}}}]}, "$defs": {"LabeledUrl": {"type": "object", "required": ["@id"], "properties": {"@id": {"type": "string", "format": "uri", "description": "URL of the thing"}, "label": {"type": "string", "description": "Description of the thing for UI purposes"}}}, "EvaluationMetadata": {"type": "object", "title": "Metadata about one evaluation", "additionalProperties": false, "description": "EvaluationMetadata contains all the info on how an EvaluationReport came to be.\nThere are two OCR-D *workflows* involved:\n - ocr_workflow: The workflow which produced the OCR results to evaluate\n - eval_workflow: The workflow run to evaluate OCR and GT\n\nThere are three OCR-D *workspaces* involved:\n - gt_workspace: The workspace containing the GT\n - ocr_workspace: The workspace containing the OCR results from ocr_workflow\n - eval_workspace: The workspace on which the eval_workflow was run\n", "required": ["ocr_workflow", "ocr_workspace", "eval_workflow", "eval_workspace", "gt_workspace", "document_metadata"], "properties": {"ocr_workflow": {"type": "object", "required": ["@id", "steps"], "properties": {"@id": {"type": "string", "format": "uri", "description": "URL of the thing"}, "label": {"type": "string", "description": 
"Description of the thing for UI purposes"}, "steps": {"type": "object", "description": "Human readable description of the individual steps in the workflow (for UI)", "patternProperties": {"^[0-9]+$": {"type": "string", "description": "Description of this workflow step"}}}}, "description": "The OCR-D workflow that produced the ocr_workspace"}, "ocr_workspace": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The workspace containing the OCR"}, "eval_workflow": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The OCR-D workflow that produced the eval_workspace"}, "eval_workspace": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The workspace containing the evaluation results"}, "gt_workspace": {"allOf": [{"$ref": "#/$defs/LabeledUrl"}], "description": "The workspace containing the GT"}, "workflow_steps": {"type": "object", "description": "Human readable description of the individual steps in the workflow (for UI)", "patternProperties": {"^[0-9]+$": {"type": "string", "description": "Description of this workflow step"}}}, "workflow_model": {"type": "string", "description": "Human readable name of the main model used for recognition in the OCR workflow (for UI)"}, "eval_tool": {"type": "string", "description": "Human readable name and version of evaluation tool used (for UI)"}, "document_metadata": {"type": "object", "title": "Bibliographical and typographical metadata about the work to be evaluated", "properties": {"publication_year": {"type": "number", "description": "Year the document was originally published"}, "publication_century": {"type": "string", "description": "Century the document was originally published", "pattern": "[12][0-9]{3}-[12][0-9]{3}"}, "publication_decade": {"type": "string", "description": "Decade the document was originally published", "pattern": "[12][0-9]{3}-[12][0-9]{3}"}, "number_of_pages": {"type": "number", "description": "Number of pages in this work (i.e. 
the number of images in the gt_workspace)"}, "layout": {"type": "string", "enum": ["simple", "complex"]}, "fonts": {"type": "array", "items": {"type": "string", "enum": ["antiqua", "fraktur", "ancient_greek", "hebrew"]}}}}, "provenance": {"type": "object", "description": "Information on which tools in which version were used in determining metrics", "properties": {"parameters": {"type": "object", "description": "Parameters passed to the evaluation processor"}}}}}, "EvaluationReport": {"type": "object", "additionalProperties": false, "description": "The metrics measured for this document", "properties": {"document_wide": {"type": "object", "description": "Document-wide metrics", "properties": {"$ref": "#$defs/EvaluationMetrics"}}, "by_page": {"type": "array", "description": "Metrics page-by-page", "items": {"type": "object", "allOf": [{"properties": {"page_id": {"type": "string", "description": "PAGE ID"}}}, {"properties": {"$ref": "#$defs/EvaluationMetrics"}}]}}}}, "EvaluationMetrics": {"cer": {"description": "CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)"}, "cer_range": {"type": "array", "minItems": 2, "maxItems": 2, "items": {"type": "number", "description": "Minimum and maximum of CER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)"}}, "wer": {"description": "WER calculated over the text of a whole page (in by_page) or combined text of all pages (in document_wide)"}, "wall_time": {"description": "Actual time needed for processing workflow"}, "cpu_time": {"description": "Cumulative CPU time used for processing workflow"}, "pages_per_minute": {"description": "Number of pages processed per minute"}}}} \ No newline at end of file diff --git a/ocrd_eval.schema.yml b/ocrd_eval.schema.yml index 4a9c1df..a3ef08a 100644 --- a/ocrd_eval.schema.yml +++ b/ocrd_eval.schema.yml @@ -58,7 +58,23 @@ $defs: properties: ocr_workflow: - allOf: [{ '$ref': 
'#/$defs/LabeledUrl' }] + type: object + required: ['@id', 'steps'] + properties: + '@id': + type: string + format: uri + description: URL of the thing + label: + type: string + description: Description of the thing for UI purposes + steps: + type: object + description: Human readable description of the individual steps in the workflow (for UI) + patternProperties: + '^[0-9]+$': + type: string + description: Description of this workflow step description: The OCR-D workflow that produced the ocr_workspace ocr_workspace: @@ -77,14 +93,6 @@ $defs: allOf: [{ '$ref': '#/$defs/LabeledUrl' }] description: The workspace containing the GT - workflow_steps: - type: object - description: Human readable description of the individual steps in the workflow (for UI) - patternProperties: - '^[0-9]+$': - type: string - description: Description of this workflow step - workflow_model: type: string description: Human readable name of the main model used for recognition in the OCR workflow (for UI)