Skip to content

Commit

Permalink
Merge branch 'main' into update-dependencies-v1.2
Browse files Browse the repository at this point in the history
  • Loading branch information
juhoinkinen authored Sep 17, 2024
2 parents 4d82fb7 + c42a93f commit 53f16b1
Show file tree
Hide file tree
Showing 12 changed files with 190 additions and 23 deletions.
20 changes: 10 additions & 10 deletions .github/workflows/cicd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
runs-on: ubuntu-22.04
name: lint with isort, Black & flake8
steps:
- uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
- name: "Prepare: restore caches, install Poetry, set up Python"
uses: ./.github/actions/prepare
with:
Expand All @@ -42,7 +42,7 @@ jobs:
runs-on: ubuntu-22.04
name: check CLI startup time
steps:
- uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
- name: "Prepare: restore caches, install Poetry, set up Python"
id: prepare
uses: ./.github/actions/prepare
Expand All @@ -64,7 +64,7 @@ jobs:
python-version: ["3.9", "3.10", "3.11", "3.12"]
name: test on Python ${{ matrix.python-version }}
steps:
- uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
- name: Install system packages
run: |
sudo apt-get install \
Expand Down Expand Up @@ -109,7 +109,7 @@ jobs:
poetry run pytest --cov=./ --cov-report xml --cov-append -m slow
fi
- name: Upload coverage to Codecov
uses: codecov/codecov-action@c16abc29c95fcf9174b58eb7e1abf4c866893bc8 # v4.1.1
uses: codecov/codecov-action@e28ff129e5465c2c0dcc6f003fc735cb6ae0c673 # v4.5.0
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
- name: Save cache
Expand All @@ -129,7 +129,7 @@ jobs:
timeout-minutes: 15
steps:
- name: "Build image for testing"
uses: docker/build-push-action@2cdde995de11925a030ce8070c3d77a52ffcf1c0 # v5.3.0
uses: docker/build-push-action@5cd11c3a4ced054e52742c5fd54dca954e0edd85 # v6.7.0
with:
push: false
tags: test-image
Expand All @@ -145,7 +145,7 @@ jobs:
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
steps:
- name: Login to Quay.io
uses: docker/login-action@e92390c5fb421da1463c202d546fed0ec5c39f20 # v3.1.0
uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0
with:
registry: quay.io
username: ${{ secrets.YHTEENTOIMIVUUSPALVELUT_QUAY_IO_USERNAME }}
Expand All @@ -158,7 +158,7 @@ jobs:
tags: |
latest
- name: Build and push to Quay.io
uses: docker/build-push-action@2cdde995de11925a030ce8070c3d77a52ffcf1c0 # v5.3.0
uses: docker/build-push-action@5cd11c3a4ced054e52742c5fd54dca954e0edd85 # v6.7.0
with:
push: true
tags: ${{ steps.meta.outputs.tags }}
Expand All @@ -170,7 +170,7 @@ jobs:
runs-on: ubuntu-22.04
if: github.event_name == 'push' && contains(github.ref, 'refs/tags/')
steps:
- uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
- name: "Prepare: restore caches, install Poetry, set up Python"
uses: ./.github/actions/prepare
with:
Expand All @@ -184,7 +184,7 @@ jobs:
poetry publish --build
- name: Login to Quay.io
uses: docker/login-action@e92390c5fb421da1463c202d546fed0ec5c39f20 # v3.1.0
uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0
with:
registry: quay.io
username: ${{ secrets.YHTEENTOIMIVUUSPALVELUT_QUAY_IO_USERNAME }}
Expand All @@ -199,7 +199,7 @@ jobs:
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
- name: Build and push to Quay.io
uses: docker/build-push-action@2cdde995de11925a030ce8070c3d77a52ffcf1c0 # v5.3.0
uses: docker/build-push-action@5cd11c3a4ced054e52742c5fd54dca954e0edd85 # v6.7.0
with:
push: true
tags: ${{ steps.meta.outputs.tags }}
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/codeql.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,18 @@ jobs:

steps:
- name: Checkout
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7

- name: Initialize CodeQL
uses: github/codeql-action/init@1b1aada464948af03b950897e5eb522f92603cc2 # v3.24.9
uses: github/codeql-action/init@4dd16135b69a43b6c8efb853346f8437d92d3c93 # v3.26.6
with:
languages: ${{ matrix.language }}
queries: +security-and-quality

- name: Autobuild
uses: github/codeql-action/autobuild@1b1aada464948af03b950897e5eb522f92603cc2 # v3.24.9
uses: github/codeql-action/autobuild@4dd16135b69a43b6c8efb853346f8437d92d3c93 # v3.26.6

- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@1b1aada464948af03b950897e5eb522f92603cc2 # v3.24.9
uses: github/codeql-action/analyze@4dd16135b69a43b6c8efb853346f8437d92d3c93 # v3.26.6
with:
category: "/language:${{ matrix.language }}"
6 changes: 3 additions & 3 deletions .github/workflows/docker-rebuild.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@ jobs:
timeout-minutes: 15
steps:
- name: "Build for testing"
uses: docker/build-push-action@2cdde995de11925a030ce8070c3d77a52ffcf1c0 # v5.3.0
uses: docker/build-push-action@5cd11c3a4ced054e52742c5fd54dca954e0edd85 # v6.7.0
with:
push: false
tags: test-image
- name: "Test with pytest"
run: |
docker run --rm --workdir /Annif test-image pytest -p no:cacheprovider
- name: Login to Quay.io
uses: docker/login-action@e92390c5fb421da1463c202d546fed0ec5c39f20 # v3.1.0
uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0
with:
registry: quay.io
username: ${{ secrets.YHTEENTOIMIVUUSPALVELUT_QUAY_IO_USERNAME }}
Expand All @@ -35,7 +35,7 @@ jobs:
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
- name: Build and push to Quay.io
uses: docker/build-push-action@2cdde995de11925a030ce8070c3d77a52ffcf1c0 # v5.3.0
uses: docker/build-push-action@5cd11c3a4ced054e52742c5fd54dca954e0edd85 # v6.7.0
with:
push: true
tags: ${{ steps.meta.outputs.tags }}
Expand Down
4 changes: 2 additions & 2 deletions annif/analyzer/simplemma.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from __future__ import annotations

import simplemma
import annif.simplemma_util

from . import analyzer

Expand All @@ -15,4 +15,4 @@ def __init__(self, param: str, **kwargs) -> None:
super().__init__(**kwargs)

def _normalize_word(self, word: str) -> str:
return simplemma.lemmatize(word, lang=self.lang)
return annif.simplemma_util.lemmatizer.lemmatize(word, lang=self.lang)
59 changes: 59 additions & 0 deletions annif/openapi/annif.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,49 @@ paths:
"503":
$ref: '#/components/responses/ServiceUnavailable'
x-codegen-request-body-name: documents
/detect-language:
post:
tags:
- Language detection
summary: detect the language of a text given a list of candidate languages
operationId: annif.rest.detect_language
requestBody:
content:
application/json:
schema:
type: object
required:
- text
- languages
properties:
text:
type: string
description: input text
example: A quick brown fox jumped over the lazy dog.
languages:
type: array
description: candidate languages as IETF BCP 47 codes
items:
type: string
maxLength: 3
minLength: 2
example: en
minItems: 1
maxItems: 5
required: true
responses:
200:
description: successful operation
content:
application/json:
schema:
$ref: '#/components/schemas/DetectedLanguages'
400:
description: Bad Request
content:
application/problem+json:
schema:
$ref: '#/components/schemas/Problem'
components:
schemas:
ApiInfo:
Expand Down Expand Up @@ -316,6 +359,22 @@ components:
type: string
example: Vulpes vulpes
description: A document with attached, known good subjects
DetectedLanguages:
type: object
properties:
results:
type: array
items:
type: object
properties:
language:
type: string
example: en
nullable: true
score:
type: number
example: 0.85
description: Candidate languages with their associated scores
Problem:
type: object
properties:
Expand Down
26 changes: 26 additions & 0 deletions annif/rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from annif.corpus import Document, DocumentList, SubjectSet
from annif.exception import AnnifException
from annif.project import Access
from annif.simplemma_util import get_language_detector

if TYPE_CHECKING:
from connexion.lifecycle import ConnexionResponse
Expand Down Expand Up @@ -82,6 +83,31 @@ def show_project(
return project.dump(), 200, {"Content-Type": "application/json"}


def detect_language(body: dict[str, Any]):
    """return scores for detected languages formatted according to Swagger spec

    ``body`` carries "text" (the input string) and "languages" (candidate
    language codes); both are validated by the OpenAPI layer before this
    handler runs. Returns a (payload, status, headers) tuple on success, or a
    Problem response with status 400 for unsupported candidate languages.
    """

    text = body.get("text")
    languages = body.get("languages")

    try:
        # Construct the detector inside the try block: unsupported language
        # codes may be rejected already at construction time, and that case
        # should also map to a 400 response rather than an unhandled 500.
        detector = get_language_detector(tuple(languages))
        proportions = detector.proportion_in_each_language(text)
    except ValueError:
        return connexion.problem(
            status=400,
            title="Bad Request",
            detail="unsupported candidate languages",
        )

    result = {
        "results": [
            # Simplemma reports unrecognized text under the pseudo-language
            # "unk"; expose that to API clients as null per the schema.
            {"language": lang if lang != "unk" else None, "score": score}
            for lang, score in proportions.items()
        ]
    }
    return result, 200, {"Content-Type": "application/json"}


def _suggestion_to_dict(
suggestion: SubjectSuggestion, subject_index: SubjectIndex, language: str
) -> dict[str, str | float | None]:
Expand Down
17 changes: 17 additions & 0 deletions annif/simplemma_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""Wrapper code for using Simplemma functionality in Annif"""

from typing import Tuple, Union

from simplemma import LanguageDetector, Lemmatizer
from simplemma.strategies import DefaultStrategy
from simplemma.strategies.dictionaries import DefaultDictionaryFactory

LANG_CACHE_SIZE = 5 # How many language dictionaries to keep in memory at once (max)

_dictionary_factory = DefaultDictionaryFactory(cache_max_size=LANG_CACHE_SIZE)
_lemmatization_strategy = DefaultStrategy(dictionary_factory=_dictionary_factory)
lemmatizer = Lemmatizer(lemmatization_strategy=_lemmatization_strategy)


def get_language_detector(lang: Union[str, Tuple[str, ...]]) -> LanguageDetector:
return LanguageDetector(lang, lemmatization_strategy=_lemmatization_strategy)
8 changes: 5 additions & 3 deletions annif/transform/langfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@

from typing import TYPE_CHECKING

from simplemma.language_detector import in_target_language

import annif
import annif.simplemma_util

from . import transform

Expand All @@ -31,6 +30,9 @@ def __init__(
self.text_min_length = int(text_min_length)
self.sentence_min_length = int(sentence_min_length)
self.min_ratio = float(min_ratio)
self.language_detector = annif.simplemma_util.get_language_detector(
self.project.language
)

def transform_fn(self, text: str) -> str:
if len(text) < self.text_min_length:
Expand All @@ -41,7 +43,7 @@ def transform_fn(self, text: str) -> str:
if len(sent) < self.sentence_min_length:
retained_sentences.append(sent)
continue
proportion = in_target_language(sent, lang=(self.project.language,))
proportion = self.language_detector.proportion_in_target_languages(sent)
if proportion >= self.min_ratio:
retained_sentences.append(sent)
return " ".join(retained_sentences)
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ numpy = "1.26.*"
optuna = "~4.0.0"
python-dateutil = "2.9.*"
tomli = { version = "2.0.*", python = "<3.11" }
simplemma = "~1.1.0"
simplemma = "~1.1.1"
jsonschema = "~4.23.0"
huggingface-hub = "~0.24.5"

Expand Down
12 changes: 12 additions & 0 deletions tests/test_openapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,15 @@ def test_openapi_learn_novocab(app_client):
data = []
req = app_client.post("http://localhost:8000/v1/projects/novocab/learn", json=data)
assert req.status_code == 503


def test_rest_detect_language_no_candidates(app_client):
    # an empty candidate list violates the schema's minItems: 1 constraint,
    # so request validation should reject it with 400
    data = {"text": "example text", "languages": []}
    req = app_client.post("http://localhost:8000/v1/detect-language", json=data)
    assert req.status_code == 400


def test_rest_detect_language_too_many_candidates(app_client):
    # six candidates exceed the schema's maxItems: 5 constraint,
    # so request validation should reject it with 400
    data = {"text": "example text", "languages": ["en", "fr", "de", "it", "es", "nl"]}
    req = app_client.post("http://localhost:8000/v1/detect-language", json=data)
    assert req.status_code == 400
32 changes: 32 additions & 0 deletions tests/test_rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,38 @@ def test_rest_show_project_nonexistent(app):
assert result.status_code == 404


def test_rest_detect_language_english(app):
    # english text should be detected
    with app.app_context():
        result = annif.rest.detect_language(
            {"text": "example text", "languages": ["en", "fi", "sv"]}
        )[0]
        assert {"language": "en", "score": 1} in result["results"]


def test_rest_detect_language_unknown(app):
    # an unknown language should return None
    with app.app_context():
        result = annif.rest.detect_language(
            {"text": "exampley texty", "languages": ["fi", "sv"]}
        )[0]
        assert {"language": None, "score": 1} in result["results"]


def test_rest_detect_language_no_text(app):
    # empty input text cannot be assigned to any candidate language,
    # so the whole score mass is reported under the null language
    with app.app_context():
        result = annif.rest.detect_language({"text": "", "languages": ["en"]})[0]
        assert {"language": None, "score": 1} in result["results"]


def test_rest_detect_language_unsupported_candidates(app):
    # a candidate code with no Simplemma support ("unk") should be
    # reported as a Bad Request problem, not an internal error
    with app.app_context():
        result = annif.rest.detect_language(
            {"text": "example text", "languages": ["unk"]}
        )
        assert result.status_code == 400


def test_rest_suggest_public(app):
# public projects should be accessible via REST
with app.app_context():
Expand Down
19 changes: 19 additions & 0 deletions tests/test_simplemma_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""Unit tests for Simplemma utility functions"""

import pytest

from annif.simplemma_util import get_language_detector


def test_get_language_detector():
    # a single language code (plain string) is accepted; only part of the
    # mixed English/French sample should count as English
    detector = get_language_detector("en")
    text = "She said 'au revoir' and left"
    proportion = detector.proportion_in_target_languages(text)
    assert proportion == pytest.approx(0.75)


def test_get_language_detector_many():
    # with both English and French as targets the whole mixed-language
    # sample should be covered
    detector = get_language_detector(("en", "fr"))
    text = "She said 'au revoir' and left"
    proportion = detector.proportion_in_target_languages(text)
    assert proportion == pytest.approx(1.0)

0 comments on commit 53f16b1

Please sign in to comment.