Skip to content

Commit

Permalink
Catch OperationFailedException raised by _normalize_word() and include the text being tokenized in the error message
Browse files Browse the repository at this point in the history
  • Loading branch information
juhoinkinen committed Nov 2, 2023
1 parent 5ca8144 commit 7560fff
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 5 deletions.
17 changes: 12 additions & 5 deletions annif/analyzer/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import functools
import unicodedata

from annif.exception import OperationFailedException

_KEY_TOKEN_MIN_LENGTH = "token_min_length"


Expand Down Expand Up @@ -44,11 +46,16 @@ def tokenize_words(self, text: str, filter: bool = True) -> list[str]:

import nltk.tokenize

return [
self._normalize_word(word)
for word in nltk.tokenize.word_tokenize(text)
if (not filter or self.is_valid_token(word))
]
try:
return [
self._normalize_word(word)
for word in nltk.tokenize.word_tokenize(text)
if (not filter or self.is_valid_token(word))
]
except OperationFailedException as err:
raise OperationFailedException(
f"Error in tokenization of text '{text}'"
) from err

def _normalize_word(self, word):
"""Normalize (stem or lemmatize) a word form into a normal form."""
Expand Down
21 changes: 21 additions & 0 deletions tests/test_analyzer.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
"""Unit tests for analyzers in Annif"""
import importlib.util
from unittest import mock

import pytest

import annif.analyzer
from annif.exception import OperationFailedException


def test_get_analyzer_nonexistent():
Expand Down Expand Up @@ -60,6 +63,24 @@ def test_english_tokenize_words_no_filter():
assert len(words) == 23


@pytest.mark.skipif(
    importlib.util.find_spec("voikko") is None,
    reason="test requires that Voikko is installed",
)
def test_tokenize_words_operationfailed():
    """A failure inside word normalization surfaces as OperationFailedException
    whose message includes the text that was being tokenized."""
    text = "An error producing sentence."
    analyzer = annif.analyzer.get_analyzer("voikko(fi)")
    # Force the underlying Voikko analyzer to blow up mid-tokenization.
    broken_analyze = mock.patch(
        "voikko.libvoikko.Voikko.analyze",
        side_effect=ValueError,
    )
    expected = "Error in tokenization of text 'An error producing sentence.'"
    with broken_analyze, pytest.raises(OperationFailedException, match=expected):
        analyzer.tokenize_words(text)


def test_english_filter_words_min_token():
analyzer = annif.analyzer.get_analyzer("snowball(english,token_min_length=2)")
text = """Since 2000, a 3D printer can be used to print
Expand Down

0 comments on commit 7560fff

Please sign in to comment.