Skip to content

Commit

Permalink
Merge pull request #659 from NatLibFi/issue631-rest-api-language-dete…
Browse files Browse the repository at this point in the history
…ction

Add language detection to REST API
  • Loading branch information
juhoinkinen authored Sep 17, 2024
2 parents e1edc53 + 36b479a commit c42a93f
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 0 deletions.
59 changes: 59 additions & 0 deletions annif/openapi/annif.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,49 @@ paths:
"503":
$ref: '#/components/responses/ServiceUnavailable'
x-codegen-request-body-name: documents
/detect-language:
  # Language detection endpoint: scores a text against a caller-supplied list
  # of candidate languages. Handled by annif.rest.detect_language.
  post:
    tags:
    - Language detection
    summary: detect the language of a text given a list of candidate languages
    operationId: annif.rest.detect_language
    requestBody:
      content:
        application/json:
          schema:
            type: object
            required:
            - text
            - languages
            properties:
              text:
                type: string
                description: input text
                example: A quick brown fox jumped over the lazy dog.
              languages:
                type: array
                description: candidate languages as IETF BCP 47 codes
                items:
                  type: string
                  maxLength: 3
                  minLength: 2
                  example: en
                # Requests with no candidates or more than five are rejected.
                minItems: 1
                maxItems: 5
      required: true
    responses:
      # Status codes are quoted so they stay string keys in both YAML and
      # JSON, matching the other responses in this file (e.g. "503").
      "200":
        description: successful operation
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/DetectedLanguages'
      "400":
        description: Bad Request
        content:
          application/problem+json:
            schema:
              $ref: '#/components/schemas/Problem'
components:
schemas:
ApiInfo:
Expand Down Expand Up @@ -316,6 +359,22 @@ components:
type: string
example: Vulpes vulpes
description: A document with attached, known good subjects
# Response body of POST /detect-language (built by annif.rest.detect_language):
# a "results" array holding one {language, score} object per detector entry.
DetectedLanguages:
type: object
properties:
results:
type: array
items:
type: object
properties:
language:
type: string
example: en
# nullable because rest.py emits null in place of the detector's "unk" key
nullable: true
score:
type: number
example: 0.85
description: Candidate languages with their associated scores
Problem:
type: object
properties:
Expand Down
26 changes: 26 additions & 0 deletions annif/rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from annif.corpus import Document, DocumentList, SubjectSet
from annif.exception import AnnifException
from annif.project import Access
from annif.simplemma_util import get_language_detector

if TYPE_CHECKING:
from connexion.lifecycle import ConnexionResponse
Expand Down Expand Up @@ -82,6 +83,31 @@ def show_project(
return project.dump(), 200, {"Content-Type": "application/json"}


def detect_language(body: dict[str, Any]):
    """Return per-language scores for a text, formatted per the Swagger spec.

    Reads ``text`` and ``languages`` from the request body, obtains a
    detector for the candidate languages and asks it for the proportion of
    the text in each language.

    Returns a ``(payload, 200, headers)`` tuple whose payload is
    ``{"results": [{"language": ..., "score": ...}, ...]}``. If the detector
    raises ``ValueError``, a 400 problem response is returned instead.
    """

    input_text = body.get("text")
    candidates = tuple(body.get("languages"))
    detector = get_language_detector(candidates)

    try:
        scores_by_language = detector.proportion_in_each_language(input_text)
    except ValueError:
        return connexion.problem(
            status=400,
            title="Bad Request",
            detail="unsupported candidate languages",
        )

    entries = []
    for code, score in scores_by_language.items():
        # The "unk" key is exposed to clients as a null language.
        language = None if code == "unk" else code
        entries.append({"language": language, "score": score})

    return {"results": entries}, 200, {"Content-Type": "application/json"}


def _suggestion_to_dict(
suggestion: SubjectSuggestion, subject_index: SubjectIndex, language: str
) -> dict[str, str | float | None]:
Expand Down
12 changes: 12 additions & 0 deletions tests/test_openapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,15 @@ def test_openapi_learn_novocab(app_client):
data = []
req = app_client.post("http://localhost:8000/v1/projects/novocab/learn", json=data)
assert req.status_code == 503


def test_rest_detect_language_no_candidates(app_client):
    """Posting an empty candidate list must be answered with HTTP 400."""
    payload = {"text": "example text", "languages": []}
    response = app_client.post(
        "http://localhost:8000/v1/detect-language",
        json=payload,
    )
    assert response.status_code == 400


def test_rest_detect_language_too_many_candidates(app_client):
    """Posting six candidate languages must be answered with HTTP 400."""
    payload = {
        "text": "example text",
        "languages": ["en", "fr", "de", "it", "es", "nl"],
    }
    response = app_client.post(
        "http://localhost:8000/v1/detect-language",
        json=payload,
    )
    assert response.status_code == 400
32 changes: 32 additions & 0 deletions tests/test_rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,38 @@ def test_rest_show_project_nonexistent(app):
assert result.status_code == 404


def test_rest_detect_language_english(app):
    """English text should be reported as "en" with a score of 1."""
    request_body = {"text": "example text", "languages": ["en", "fi", "sv"]}
    expected_entry = {"language": "en", "score": 1}
    with app.app_context():
        payload = annif.rest.detect_language(request_body)[0]
        assert expected_entry in payload["results"]


def test_rest_detect_language_unknown(app):
    """Text in an unknown language should be reported with language None."""
    request_body = {"text": "exampley texty", "languages": ["fi", "sv"]}
    expected_entry = {"language": None, "score": 1}
    with app.app_context():
        payload = annif.rest.detect_language(request_body)[0]
        assert expected_entry in payload["results"]


def test_rest_detect_language_no_text(app):
    """Empty text should be reported with language None and a score of 1."""
    request_body = {"text": "", "languages": ["en"]}
    expected_entry = {"language": None, "score": 1}
    with app.app_context():
        payload = annif.rest.detect_language(request_body)[0]
        assert expected_entry in payload["results"]


def test_rest_detect_language_unsupported_candidates(app):
    """An unsupported candidate code ("unk") should produce a 400 response."""
    request_body = {"text": "example text", "languages": ["unk"]}
    with app.app_context():
        response = annif.rest.detect_language(request_body)
        assert response.status_code == 400


def test_rest_suggest_public(app):
# public projects should be accessible via REST
with app.app_context():
Expand Down

0 comments on commit c42a93f

Please sign in to comment.