Skip to content

Commit

Permalink
Catch OperationFailedException raised by _normalize_word() and include the text being tokenized in the error message
Browse files Browse the repository at this point in the history
  • Loading branch information
juhoinkinen committed Nov 2, 2023
1 parent 5ca8144 commit 7560fff
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 5 deletions.
17 changes: 12 additions & 5 deletions annif/analyzer/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import functools
import unicodedata

from annif.exception import OperationFailedException

_KEY_TOKEN_MIN_LENGTH = "token_min_length"


Expand Down Expand Up @@ -44,11 +46,16 @@ def tokenize_words(self, text: str, filter: bool = True) -> list[str]:

import nltk.tokenize

return [
self._normalize_word(word)
for word in nltk.tokenize.word_tokenize(text)
if (not filter or self.is_valid_token(word))
]
try:
return [
self._normalize_word(word)
for word in nltk.tokenize.word_tokenize(text)
if (not filter or self.is_valid_token(word))
]
except OperationFailedException as err:
raise OperationFailedException(
f"Error in tokenization of text '{text}'"
) from err

def _normalize_word(self, word):
"""Normalize (stem or lemmatize) a word form into a normal form."""
Expand Down
21 changes: 21 additions & 0 deletions tests/test_analyzer.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
"""Unit tests for analyzers in Annif"""
import importlib.util
from unittest import mock

import pytest

import annif.analyzer
from annif.exception import OperationFailedException


def test_get_analyzer_nonexistent():
Expand Down Expand Up @@ -60,6 +63,24 @@ def test_english_tokenize_words_no_filter():
assert len(words) == 23


@pytest.mark.skipif(
    importlib.util.find_spec("voikko") is None,
    reason="test requires that Voikko is installed",
)
def test_tokenize_words_operationfailed():
    """A failure inside word normalization surfaces as OperationFailedException
    whose message includes the text that was being tokenized."""
    text = "An error producing sentence."
    analyzer = annif.analyzer.get_analyzer("voikko(fi)")
    # Force the underlying Voikko analyzer to blow up mid-tokenization.
    broken_analyze = mock.patch(
        "voikko.libvoikko.Voikko.analyze",
        side_effect=ValueError,
    )
    expected = "Error in tokenization of text 'An error producing sentence.'"
    with broken_analyze, pytest.raises(OperationFailedException, match=expected):
        analyzer.tokenize_words(text)


def test_english_filter_words_min_token():
analyzer = annif.analyzer.get_analyzer("snowball(english,token_min_length=2)")
text = """Since 2000, a 3D printer can be used to print
Expand Down

0 comments on commit 7560fff

Please sign in to comment.