Skip to content

Commit

Permalink
Remove EngineBase::LANG_NAMES as all data is now in models.json
Browse files Browse the repository at this point in the history
Remove `EngineBase::LANG_NAMES` as all data is now in models.json, and add some documentation about the structure of that file.

Bug: T330061
  • Loading branch information
samwilson authored Dec 6, 2024
1 parent ac19192 commit 7ade214
Show file tree
Hide file tree
Showing 7 changed files with 33 additions and 89 deletions.
20 changes: 20 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,3 +90,23 @@ Run container
```
./docker/run.sh
```

## Structure of models.json

The engines' model and language information is stored in `/public/models.json`,
from where it's read and returned by the `/api/available_langs` API endpoint.

OCR engines take zero to many model names (often called 'languages' because
there's a direct mapping to those, but we're moving away from this nomenclature
now because it doesn't always hold true).

`models.json` is first grouped by engine, and then each engine has a list of models.
These are identified by a 'model code', which is what the user provides in the `langs[]` parameter.
For some engines these are passed through to the actual engine process or API,
but others don't have convenient model names and so we invent them
and add whatever extra info is needed as additional properties within `models.json`.

In addition to the model code, every model needs to have at least a `title` and a `languages` property.

* `title`: This is what's shown (unlocalized) to the user.
* `languages`: An array of ISO 639 language codes. This is (or will be) what's used to group models when the user is browsing them.
6 changes: 3 additions & 3 deletions public/models.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
"title": "azərbaycanca"
},
"az-cyrl": {
"languages": ["az","Cyrl"],
"languages": ["az"],
"title": "Azərbaycan (qədim yazı)"
},
"be": {
Expand Down Expand Up @@ -459,8 +459,8 @@
"title": "azərbaycanca"
},
"aze_cyrl": {
"languages": ["aze","cyrl"],
"title": "Azerbaijani (Cyrillic)"
"languages": ["aze"],
"title": "Azərbaycan (qədim yazı)"
},
"bel": {
"languages": ["be"],
Expand Down
8 changes: 4 additions & 4 deletions src/Controller/OcrController.php
Original file line number Diff line number Diff line change
Expand Up @@ -175,8 +175,8 @@ public function homeAction(): Response {
$this->setup();

// Pre-supply available langs for autocompletion in the form.
static::$params['available_langs'] = $this->engine->getValidModels();
sort( static::$params['available_langs'] );
static::$params['available_langs'] = $this->engine->getValidModels( true );
ksort( static::$params['available_langs'] );

// set empty array to avoid errors while rendering template on non-transkribus engines
static::$params['available_line_ids'] = [];
Expand Down Expand Up @@ -295,7 +295,7 @@ public function apiAction(): JsonResponse {
}

/**
* Get a list of languages available for use with a specific OCR engine.
* Get a list of models available for use with a specific OCR engine.
*
* @Route("/api/available_langs", name="apiLangs", methods={"GET"})
* @OA\Parameter(
Expand All @@ -305,7 +305,7 @@ public function apiAction(): JsonResponse {
* example="tesseract",
* @OA\Schema(type="string")
* )
* @OA\Response(response=200, description="List of available language codes and names, in JSON format.")
* @OA\Response(response=200, description="List of available model codes and names, in JSON format.")
* @return JsonResponse
*/
public function apiAvailableLangsAction(): JsonResponse {
Expand Down
76 changes: 0 additions & 76 deletions src/Engine/EngineBase.php
Original file line number Diff line number Diff line change
Expand Up @@ -34,79 +34,6 @@ abstract class EngineBase {
/** @var string[][] Local PHP array copy of models.json */
protected $modelList;

/** @var string[] Additional localized names for non-standard language codes. */
public const LANG_NAMES = [
'Fraktur' => 'Fraktur script',
'Latin' => 'Latin script',
'az-cyrl' => 'Azərbaycan (qədim yazı)',
'bali' => 'Balinese palm-leaf manuscripts 16th century',
'ben-print' => 'Bengali Printed Books +150 New',
'cs-space' => 'Old Czech Handwriting (with spaces)',
'cs-no-space' => 'Old Czech Handwriting (without spaces)',
'da-goth' => '19th century Danish Gothic handwriting v.1.1',
'da-goth-print' => 'Danish gothic print 1859-1888 v4',
'da-gjen' => 'Gjentofte 1881-1913 Denmark',
'de-frk' => 'Deutsch (Fraktur)',
'de-17' => 'Dutch_XVII_Century',
'de-hd-m1' => 'Transkribus Dutch Handwriting M1',
'dev' => 'Devanagari Mixed M1A',
'el-ligo' => 'Ligorio 0.3 PyL',
'el-print' => 'Noscemus GM 6',
'en-b2022' => 'Transkribus B2022 English Model M4',
'en-handwritten-m3' => 'Transkribus English Handwriting M3',
'en-print-m1' => 'Transkribus Print M1',
'en-typewriter' => 'Transkribus Typewriter',
'enm' => 'Middle English (1100-1500)',
'es-md' => 'Diario de Madrid 1788-1825',
'es-old' => 'español (viejo)',
'es-redonda-extended-v1_2' => 'SpanishRedonda_sXVI-XVII_extended_v1.2',
'et-court' => 'Estonian Court Records 19thC',
'fin' => 'NLF_Newseye_GT_FI_M2+',
'fr-m1' => 'Transkribus French Model 1',
'frm' => 'moyen français (1400-1600)',
'fro' => 'Franceis, François, Romanz (1400-1600)',
'ger-hd-m1' => 'Transkribus German handwriting M1',
'ger-15' => '15th-16th century German',
'he-dijest' => 'Hebrew DiJeSt 2.0',
'hu-hand-19' => 'Hungarian handwriting 19th–20th cent.',
'it-old' => 'italiano antico',
'it-hd-m1' => 'Transkribus Italian Handwriting M1',
'jv-01' => 'Javanese model v0.1 b06/24',
'ka-old' => 'ქართული (ძველი)',
'ko-vert' => '한국어 (세로)',
'kur' => 'کوردی',
'la-caro' => 'Carolingian Minuscule Model CMM 9th-11th c.',
'la-in' => 'Latin Incunabula (Reichenau)',
'la-med' => 'UCL–University of Toronto #7',
'la-neo' => 'Pylaia_NeoLatin_Ravenstein',
'nl-1605' => 'Admiraliteit Zeeland 1605-1609 compleet',
'nl-mount' => 'Dutch Mountains (18th Century)',
'nl-news' => 'Dutch newspapers 17th century',
'no-1820' => 'NorHand 1820-1940',
'no-1874' => 'Sunnhordland Partition Protocols ',
'osd' => 'Orientation and script detection module',
'pl-m2' => 'Transkribus Polish M2',
'pt-m1' => 'General Portuguese M1',
'pt-17' => 'SPJCL17C V4.2',
'pt-hd' => 'Portuguese Handwriting 16th-19th century',
'ro-print' => 'RTA2 (Romanian Transition Alphabet)',
'rus-hd-2' => 'Russian generic handwriting 2',
'rus-print' => 'Russian print of the 18th century',
'ru-petr1708' => 'Русский (старая орфография)',
'san' => 'Devanagari Mixed M1A',
'sl-hand-18' => 'Slovenian 18th century manuscript',
'sk-hand' => 'Handwritten Glagolitic',
'sr-latn' => 'Српски (латиница)',
'swe-3' => 'Stockholm Notaries 1700 3.0',
'swe-lion-i' => 'The Swedish Lion I',
'syr' => 'leššānā Suryāyā',
'uz-cyrl' => 'oʻzbekcha',
'uk-20th-print' => 'Printed Ukrainian 20th century',
'uk-generic-handwriting-1' => 'Ukrainian generic handwriting 1',
'uk-wikisource-print' => 'Ukrainian Wikisource Print',
'yi-hd' => 'The Dybbuk for Yiddish Handwriting'
];

/**
* EngineBase constructor.
* @param Intuition $intuition
Expand Down Expand Up @@ -185,9 +112,6 @@ public function getModelTitle( ?string $model = null ): string {
if ( isset( $this->getModelList()[ $model ]['title'] ) ) {
return $this->getModelList()[ $model ]['title'];
}
if ( isset( static::LANG_NAMES[$model] ) ) {
return static::LANG_NAMES[$model];
}
return $this->intuition->getLangName( $model ) ?: '';
}

Expand Down
6 changes: 3 additions & 3 deletions templates/output.html.twig
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@
data-placeholder="{{ msg('langs-placeholder') }}"
{% endif %}
>
{% for lang in available_langs %}
<option value="{{ lang }}" {% if lang in langs %}selected{% endif %}>
{{- lang }}{% if ocr_lang_name(lang) is not empty %} &ndash; {{ ocr_lang_name(lang) }}{% endif -%}
{% for lang_code,lang_name in available_langs %}
<option value="{{ lang_code }}" {% if lang_code in langs %}selected{% endif %}>
{{- lang_code }} &ndash; {{ lang_name -}}
</option>
{% endfor %}
</select>
Expand Down
2 changes: 1 addition & 1 deletion tests/Engine/EngineBaseTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ public function testLangNames(): void {
// From Intuition.
static::assertSame( 'français', $this->tesseractEngine->getModelTitle( 'fr' ) );

// From EngineBase::LANG_NAMES
// From models.json
static::assertSame( 'moyen français (1400-1600)', $this->tesseractEngine->getModelTitle( 'frm' ) );

// Make sure every language has a name.
Expand Down
4 changes: 2 additions & 2 deletions tests/Twig/AppExtensionTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ public function setUp(): void {
* @covers AppExtension::getOcrLangName
*/
public function testOcrLangName(): void {
// Non-standard language code with name defined in EngineBase::LANG_NAMES
static::assertSame( 'Azərbaycan (qədim yazı)', $this->ext->getOcrLangName( 'az-cyrl' ) );
// Non-standard language code with name defined in models.json
static::assertSame( 'Azərbaycan (qədim yazı)', $this->ext->getOcrLangName( 'aze_cyrl' ) );

// Standard language code (name provided by Intuition)
static::assertSame( 'English', $this->ext->getOcrLangName( 'en' ) );
Expand Down

0 comments on commit 7ade214

Please sign in to comment.