Skip to content

Commit

Permalink
Merge branch 'main' into update-dependencies-v1.2
Browse files Browse the repository at this point in the history
  • Loading branch information
juhoinkinen authored Sep 17, 2024
2 parents 4d82fb7 + c42a93f commit 53f16b1
Show file tree
Hide file tree
Showing 12 changed files with 190 additions and 23 deletions.
20 changes: 10 additions & 10 deletions .github/workflows/cicd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ jobs:
runs-on: ubuntu-22.04
name: lint with isort, Black & flake8
steps:
- uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
- name: "Prepare: restore caches, install Poetry, set up Python"
uses: ./.github/actions/prepare
with:
Expand All @@ -42,7 +42,7 @@ jobs:
runs-on: ubuntu-22.04
name: check CLI startup time
steps:
- uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
- name: "Prepare: restore caches, install Poetry, set up Python"
id: prepare
uses: ./.github/actions/prepare
Expand All @@ -64,7 +64,7 @@ jobs:
python-version: ["3.9", "3.10", "3.11", "3.12"]
name: test on Python ${{ matrix.python-version }}
steps:
- uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
- name: Install system packages
run: |
sudo apt-get install \
Expand Down Expand Up @@ -109,7 +109,7 @@ jobs:
poetry run pytest --cov=./ --cov-report xml --cov-append -m slow
fi
- name: Upload coverage to Codecov
uses: codecov/codecov-action@c16abc29c95fcf9174b58eb7e1abf4c866893bc8 # v4.1.1
uses: codecov/codecov-action@e28ff129e5465c2c0dcc6f003fc735cb6ae0c673 # v4.5.0
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
- name: Save cache
Expand All @@ -129,7 +129,7 @@ jobs:
timeout-minutes: 15
steps:
- name: "Build image for testing"
uses: docker/build-push-action@2cdde995de11925a030ce8070c3d77a52ffcf1c0 # v5.3.0
uses: docker/build-push-action@5cd11c3a4ced054e52742c5fd54dca954e0edd85 # v6.7.0
with:
push: false
tags: test-image
Expand All @@ -145,7 +145,7 @@ jobs:
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
steps:
- name: Login to Quay.io
uses: docker/login-action@e92390c5fb421da1463c202d546fed0ec5c39f20 # v3.1.0
uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0
with:
registry: quay.io
username: ${{ secrets.YHTEENTOIMIVUUSPALVELUT_QUAY_IO_USERNAME }}
Expand All @@ -158,7 +158,7 @@ jobs:
tags: |
latest
- name: Build and push to Quay.io
uses: docker/build-push-action@2cdde995de11925a030ce8070c3d77a52ffcf1c0 # v5.3.0
uses: docker/build-push-action@5cd11c3a4ced054e52742c5fd54dca954e0edd85 # v6.7.0
with:
push: true
tags: ${{ steps.meta.outputs.tags }}
Expand All @@ -170,7 +170,7 @@ jobs:
runs-on: ubuntu-22.04
if: github.event_name == 'push' && contains(github.ref, 'refs/tags/')
steps:
- uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
- name: "Prepare: restore caches, install Poetry, set up Python"
uses: ./.github/actions/prepare
with:
Expand All @@ -184,7 +184,7 @@ jobs:
poetry publish --build
- name: Login to Quay.io
uses: docker/login-action@e92390c5fb421da1463c202d546fed0ec5c39f20 # v3.1.0
uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0
with:
registry: quay.io
username: ${{ secrets.YHTEENTOIMIVUUSPALVELUT_QUAY_IO_USERNAME }}
Expand All @@ -199,7 +199,7 @@ jobs:
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
- name: Build and push to Quay.io
uses: docker/build-push-action@2cdde995de11925a030ce8070c3d77a52ffcf1c0 # v5.3.0
uses: docker/build-push-action@5cd11c3a4ced054e52742c5fd54dca954e0edd85 # v6.7.0
with:
push: true
tags: ${{ steps.meta.outputs.tags }}
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/codeql.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,18 @@ jobs:

steps:
- name: Checkout
uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2
uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7

- name: Initialize CodeQL
uses: github/codeql-action/init@1b1aada464948af03b950897e5eb522f92603cc2 # v3.24.9
uses: github/codeql-action/init@4dd16135b69a43b6c8efb853346f8437d92d3c93 # v3.26.6
with:
languages: ${{ matrix.language }}
queries: +security-and-quality

- name: Autobuild
uses: github/codeql-action/autobuild@1b1aada464948af03b950897e5eb522f92603cc2 # v3.24.9
uses: github/codeql-action/autobuild@4dd16135b69a43b6c8efb853346f8437d92d3c93 # v3.26.6

- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@1b1aada464948af03b950897e5eb522f92603cc2 # v3.24.9
uses: github/codeql-action/analyze@4dd16135b69a43b6c8efb853346f8437d92d3c93 # v3.26.6
with:
category: "/language:${{ matrix.language }}"
6 changes: 3 additions & 3 deletions .github/workflows/docker-rebuild.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,15 @@ jobs:
timeout-minutes: 15
steps:
- name: "Build for testing"
uses: docker/build-push-action@2cdde995de11925a030ce8070c3d77a52ffcf1c0 # v5.3.0
uses: docker/build-push-action@5cd11c3a4ced054e52742c5fd54dca954e0edd85 # v6.7.0
with:
push: false
tags: test-image
- name: "Test with pytest"
run: |
docker run --rm --workdir /Annif test-image pytest -p no:cacheprovider
- name: Login to Quay.io
uses: docker/login-action@e92390c5fb421da1463c202d546fed0ec5c39f20 # v3.1.0
uses: docker/login-action@9780b0c442fbb1117ed29e0efdff1e18412f7567 # v3.3.0
with:
registry: quay.io
username: ${{ secrets.YHTEENTOIMIVUUSPALVELUT_QUAY_IO_USERNAME }}
Expand All @@ -35,7 +35,7 @@ jobs:
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
- name: Build and push to Quay.io
uses: docker/build-push-action@2cdde995de11925a030ce8070c3d77a52ffcf1c0 # v5.3.0
uses: docker/build-push-action@5cd11c3a4ced054e52742c5fd54dca954e0edd85 # v6.7.0
with:
push: true
tags: ${{ steps.meta.outputs.tags }}
Expand Down
4 changes: 2 additions & 2 deletions annif/analyzer/simplemma.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from __future__ import annotations

import simplemma
import annif.simplemma_util

from . import analyzer

Expand All @@ -15,4 +15,4 @@ def __init__(self, param: str, **kwargs) -> None:
super().__init__(**kwargs)

def _normalize_word(self, word: str) -> str:
return simplemma.lemmatize(word, lang=self.lang)
return annif.simplemma_util.lemmatizer.lemmatize(word, lang=self.lang)
59 changes: 59 additions & 0 deletions annif/openapi/annif.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,49 @@ paths:
"503":
$ref: '#/components/responses/ServiceUnavailable'
x-codegen-request-body-name: documents
/detect-language:
post:
tags:
- Language detection
summary: detect the language of a text given a list of candidate languages
operationId: annif.rest.detect_language
requestBody:
content:
application/json:
schema:
type: object
required:
- text
- languages
properties:
text:
type: string
description: input text
example: A quick brown fox jumped over the lazy dog.
languages:
type: array
description: candidate languages as IETF BCP 47 codes
items:
type: string
maxLength: 3
minLength: 2
example: en
minItems: 1
maxItems: 5
required: true
responses:
200:
description: successful operation
content:
application/json:
schema:
$ref: '#/components/schemas/DetectedLanguages'
400:
description: Bad Request
content:
application/problem+json:
schema:
$ref: '#/components/schemas/Problem'
components:
schemas:
ApiInfo:
Expand Down Expand Up @@ -316,6 +359,22 @@ components:
type: string
example: Vulpes vulpes
description: A document with attached, known good subjects
DetectedLanguages:
type: object
properties:
results:
type: array
items:
type: object
properties:
language:
type: string
example: en
nullable: true
score:
type: number
example: 0.85
description: Candidate languages with their associated scores
Problem:
type: object
properties:
Expand Down
26 changes: 26 additions & 0 deletions annif/rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from annif.corpus import Document, DocumentList, SubjectSet
from annif.exception import AnnifException
from annif.project import Access
from annif.simplemma_util import get_language_detector

if TYPE_CHECKING:
from connexion.lifecycle import ConnexionResponse
Expand Down Expand Up @@ -82,6 +83,31 @@ def show_project(
return project.dump(), 200, {"Content-Type": "application/json"}


def detect_language(body: dict[str, Any]):
    """return scores for detected languages formatted according to Swagger spec

    ``body`` carries "text" (the input string) and "languages" (candidate
    language codes); both are validated by the OpenAPI layer before this
    handler runs. Returns a (payload, status, headers) tuple on success, or a
    Problem response with status 400 for unsupported candidate languages.
    """

    text = body.get("text")
    languages = body.get("languages")

    try:
        # Construct the detector inside the try block: unsupported language
        # codes may be rejected already at construction time, and that case
        # should also map to a 400 response rather than an unhandled 500.
        detector = get_language_detector(tuple(languages))
        proportions = detector.proportion_in_each_language(text)
    except ValueError:
        return connexion.problem(
            status=400,
            title="Bad Request",
            detail="unsupported candidate languages",
        )

    result = {
        "results": [
            # Simplemma reports unrecognized text under the pseudo-language
            # "unk"; expose that to API clients as null per the schema.
            {"language": lang if lang != "unk" else None, "score": score}
            for lang, score in proportions.items()
        ]
    }
    return result, 200, {"Content-Type": "application/json"}


def _suggestion_to_dict(
suggestion: SubjectSuggestion, subject_index: SubjectIndex, language: str
) -> dict[str, str | float | None]:
Expand Down
17 changes: 17 additions & 0 deletions annif/simplemma_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""Wrapper code for using Simplemma functionality in Annif"""

from typing import Tuple, Union

from simplemma import LanguageDetector, Lemmatizer
from simplemma.strategies import DefaultStrategy
from simplemma.strategies.dictionaries import DefaultDictionaryFactory

LANG_CACHE_SIZE = 5 # How many language dictionaries to keep in memory at once (max)

_dictionary_factory = DefaultDictionaryFactory(cache_max_size=LANG_CACHE_SIZE)
_lemmatization_strategy = DefaultStrategy(dictionary_factory=_dictionary_factory)
lemmatizer = Lemmatizer(lemmatization_strategy=_lemmatization_strategy)


def get_language_detector(lang: Union[str, Tuple[str, ...]]) -> LanguageDetector:
return LanguageDetector(lang, lemmatization_strategy=_lemmatization_strategy)
8 changes: 5 additions & 3 deletions annif/transform/langfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@

from typing import TYPE_CHECKING

from simplemma.language_detector import in_target_language

import annif
import annif.simplemma_util

from . import transform

Expand All @@ -31,6 +30,9 @@ def __init__(
self.text_min_length = int(text_min_length)
self.sentence_min_length = int(sentence_min_length)
self.min_ratio = float(min_ratio)
self.language_detector = annif.simplemma_util.get_language_detector(
self.project.language
)

def transform_fn(self, text: str) -> str:
if len(text) < self.text_min_length:
Expand All @@ -41,7 +43,7 @@ def transform_fn(self, text: str) -> str:
if len(sent) < self.sentence_min_length:
retained_sentences.append(sent)
continue
proportion = in_target_language(sent, lang=(self.project.language,))
proportion = self.language_detector.proportion_in_target_languages(sent)
if proportion >= self.min_ratio:
retained_sentences.append(sent)
return " ".join(retained_sentences)
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ numpy = "1.26.*"
optuna = "~4.0.0"
python-dateutil = "2.9.*"
tomli = { version = "2.0.*", python = "<3.11" }
simplemma = "~1.1.0"
simplemma = "~1.1.1"
jsonschema = "~4.23.0"
huggingface-hub = "~0.24.5"

Expand Down
12 changes: 12 additions & 0 deletions tests/test_openapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,15 @@ def test_openapi_learn_novocab(app_client):
data = []
req = app_client.post("http://localhost:8000/v1/projects/novocab/learn", json=data)
assert req.status_code == 503


def test_rest_detect_language_no_candidates(app_client):
    # an empty candidate list violates the schema's minItems: 1 constraint,
    # so request validation should reject it with 400
    data = {"text": "example text", "languages": []}
    req = app_client.post("http://localhost:8000/v1/detect-language", json=data)
    assert req.status_code == 400


def test_rest_detect_language_too_many_candidates(app_client):
    # six candidates exceed the schema's maxItems: 5 constraint,
    # so request validation should reject it with 400
    data = {"text": "example text", "languages": ["en", "fr", "de", "it", "es", "nl"]}
    req = app_client.post("http://localhost:8000/v1/detect-language", json=data)
    assert req.status_code == 400
32 changes: 32 additions & 0 deletions tests/test_rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,38 @@ def test_rest_show_project_nonexistent(app):
assert result.status_code == 404


def test_rest_detect_language_english(app):
    # english text should be detected
    with app.app_context():
        result = annif.rest.detect_language(
            {"text": "example text", "languages": ["en", "fi", "sv"]}
        )[0]
        assert {"language": "en", "score": 1} in result["results"]


def test_rest_detect_language_unknown(app):
    # an unknown language should return None
    with app.app_context():
        result = annif.rest.detect_language(
            {"text": "exampley texty", "languages": ["fi", "sv"]}
        )[0]
        assert {"language": None, "score": 1} in result["results"]


def test_rest_detect_language_no_text(app):
    # empty input text cannot be assigned to any candidate language,
    # so the whole score mass is reported under the null language
    with app.app_context():
        result = annif.rest.detect_language({"text": "", "languages": ["en"]})[0]
        assert {"language": None, "score": 1} in result["results"]


def test_rest_detect_language_unsupported_candidates(app):
    # a candidate code with no Simplemma support ("unk") should be
    # reported as a Bad Request problem, not an internal error
    with app.app_context():
        result = annif.rest.detect_language(
            {"text": "example text", "languages": ["unk"]}
        )
        assert result.status_code == 400


def test_rest_suggest_public(app):
# public projects should be accessible via REST
with app.app_context():
Expand Down
19 changes: 19 additions & 0 deletions tests/test_simplemma_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""Unit tests for Simplemma utility functions"""

import pytest

from annif.simplemma_util import get_language_detector


def test_get_language_detector():
    # a single language code (plain string) is accepted; only part of the
    # mixed English/French sample should count as English
    detector = get_language_detector("en")
    text = "She said 'au revoir' and left"
    proportion = detector.proportion_in_target_languages(text)
    assert proportion == pytest.approx(0.75)


def test_get_language_detector_many():
    # with both English and French as targets the whole mixed-language
    # sample should be covered
    detector = get_language_detector(("en", "fr"))
    text = "She said 'au revoir' and left"
    proportion = detector.proportion_in_target_languages(text)
    assert proportion == pytest.approx(1.0)

0 comments on commit 53f16b1

Please sign in to comment.