Skip to content

Commit

Permalink
Add transkribus models to OCR tool
Browse files Browse the repository at this point in the history
Add transkribus model IDs to langs.json
to make them available for use by the OCR
engine.

Bug: T331960
  • Loading branch information
Parthiv-M authored Apr 3, 2023
1 parent 812c2b9 commit f2834ec
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 0 deletions.
54 changes: 54 additions & 0 deletions public/langs.json
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
"tesseract": "bel",
"google": "be"
},
"ben-print": {
"transkribus": 46239
},
"bg": {
"tesseract": "bul",
"google": "bg"
Expand Down Expand Up @@ -78,6 +81,15 @@
"tesseract": "deu",
"google": "de"
},
"de-17": {
"transkribus": 38871
},
"de-hd-m1": {
"transkribus": 38291
},
"dev": {
"transkribus": 45909
},
"de-frk": {
"tesseract": "frk"
},
Expand All @@ -102,6 +114,12 @@
"en-handwritten-m3": {
"transkribus": 37646
},
"en-print-m1": {
"transkribus": 39995
},
"en-typewriter": {
"transkribus": 37545
},
"enm": {
"tesseract": "enm",
"google": "enm"
Expand All @@ -114,6 +132,9 @@
"tesseract": "spa",
"google": "es"
},
"es-md": {
"transkribus": 48440
},
"es-old": {
"tesseract": "spa_old"
},
Expand All @@ -133,6 +154,9 @@
"tesseract": "fin",
"google": "fi"
},
"fin": {
"transkribus": 37748
},
"fo": {
"tesseract": "fao",
"google": "fo"
Expand All @@ -141,6 +165,9 @@
"tesseract": "fra",
"google": "fr"
},
"fr-m1": {
"transkribus": 37758
},
"fro": {
"google": "fro"
},
Expand All @@ -155,6 +182,12 @@
"tesseract": "gle",
"google": "ga"
},
"ger-hd-m1": {
"transkribus": 35909
},
"ger-15": {
"transkribus": 45902
},
"gd": {
"tesseract": "gla",
"google": "gd"
Expand Down Expand Up @@ -207,6 +240,9 @@
"tesseract": "ita",
"google": "it"
},
"it-hd-m1": {
"transkribus": 38440
},
"it-old": {
"tesseract": "ita_old"
},
Expand Down Expand Up @@ -339,6 +375,9 @@
"tesseract": "pol",
"google": "pl"
},
"pl-m2": {
"transkribus": 44976
},
"ps": {
"tesseract": "pus",
"google": "ps"
Expand Down Expand Up @@ -366,10 +405,19 @@
"ru-petr1708": {
"google": "ru-PETR1708"
},
"rus-hd-2": {
"transkribus": 45595
},
"rus-print": {
"transkribus": 44358
},
"sa": {
"tesseract": "san",
"google": "sa"
},
"san": {
"transkribus": 45909
},
"sd": {
"tesseract": "snd"
},
Expand Down Expand Up @@ -409,6 +457,9 @@
"tesseract": "swa",
"google": "sw"
},
"swe-2.1": {
"transkribus": 45736
},
"syr": {
"tesseract": "syr",
"google": "syr"
Expand Down Expand Up @@ -472,6 +523,9 @@
"tesseract": "yid",
"google": "yi"
},
"yi-hd": {
"transkribus": 46159
},
"yo": {
"tesseract": "yor",
"google": "yo"
Expand Down
18 changes: 18 additions & 0 deletions src/Engine/EngineBase.php
Original file line number Diff line number Diff line change
Expand Up @@ -37,22 +37,40 @@ abstract class EngineBase {
/**
 * Additional localized names for non-standard language codes.
 *
 * Keys are language/model codes and values are the human-readable names for them.
 * Many of these keys also appear in public/langs.json, where each one is mapped to
 * engine-specific identifiers (for example a numeric Transkribus model ID).
 *
 * NOTE(review): how these names are merged with names for standard language codes is
 * not visible here; entry order is kept as-is in case callers iterate over it.
 *
 * @var string[]
 */
public const LANG_NAMES = [
'az-cyrl' => 'Azərbaycan (qədim yazı)',
'ben-print' => 'Bengali Printed Books +150 New',
'de-frk' => 'Deutsch (Fraktur)',
// NOTE(review): the next two names say "Dutch" although the keys use the 'de' prefix,
// which elsewhere in this table denotes German ('de-frk') — confirm the intended codes.
'de-17' => 'Dutch_XVII_Century',
'de-hd-m1' => 'Transkribus Dutch Handwriting M1',
// NOTE(review): 'dev' and 'san' (below) share this name, and public/langs.json maps
// both to Transkribus model 45909 — presumably intentional aliases; confirm.
'dev' => 'Devanagari Mixed M1A',
'en-b2022' => 'Transkribus B2022 English Model M4',
'en-handwritten-m3' => 'Transkribus English Handwriting M3',
'en-print-m1' => 'Transkribus Print M1',
'en-typewriter' => 'Transkribus Typewriter',
'enm' => 'Middle English (1100-1500)',
'es-md' => 'Diario de Madrid 1788-1825',
'es-old' => 'español (viejo)',
'fin' => 'NLF_Newseye_GT_FI_M2+',
'fr-m1' => 'Transkribus French Model 1',
'frm' => 'moyen français (1400-1600)',
// NOTE(review): same date range as 'frm' above — looks like a copy/paste; verify.
'fro' => 'Franceis, François, Romanz (1400-1600)',
'ger-hd-m1' => 'Transkribus German handwriting M1',
'ger-15' => '15th-16th century German',
'it-old' => 'italiano antico',
'it-hd-m1' => 'Transkribus Italian Handwriting M1',
'ka-old' => 'ქართული (ძველი)',
'ko-vert' => '한국어 (세로)',
'kur' => 'کوردی',
'osd' => 'Orientation and script detection module',
'pl-m2' => 'Transkribus Polish M2',
'rus-hd-2' => 'Russian generic handwriting 2',
'rus-print' => 'Russian print of the 18th century',
'ru-petr1708' => 'Русский (старая орфография)',
// Same name as 'dev' above; see the review note there.
'san' => 'Devanagari Mixed M1A',
'sr-latn' => 'Српски (латиница)',
'swe-2.1' => 'Stockholm Notaries 1700 2.1',
'syr' => 'leššānā Suryāyā',
'uz-cyrl' => 'oʻzbekcha',
'yi-hd' => 'The Dybbuk for Yiddish Handwriting'
];

/**
Expand Down

0 comments on commit f2834ec

Please sign in to comment.