Skip to content

Commit

Permalink
Automate NLTK datapackage punkt_tab download for Analyzers
Browse files (browse the repository at this point in the history)
  • Loading branch information
juhoinkinen committed Sep 19, 2024
1 parent 53f16b1 commit 642281e
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 0 deletions.
16 changes: 16 additions & 0 deletions annif/analyzer/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
import functools
import unicodedata

import annif

logger = annif.logger

_KEY_TOKEN_MIN_LENGTH = "token_min_length"


Expand All @@ -21,6 +25,18 @@ def __init__(self, **kwargs) -> None:
if _KEY_TOKEN_MIN_LENGTH in kwargs:
self.token_min_length = int(kwargs[_KEY_TOKEN_MIN_LENGTH])

import nltk.data

try:
nltk.data.find("tokenizers/punkt_tab")
except LookupError as err:
logger.debug(str(err))
if "punkt_tab" in str(err): # "punkt_tab" is surrounded by color code tags
logger.warning(
'NLTK datapackage "punkt_tab" not found, downloading it now.'
)
nltk.download("punkt_tab")

def tokenize_sentences(self, text: str) -> list[str]:
"""Tokenize a piece of text (e.g. a document) into sentences."""
import nltk.tokenize
Expand Down
11 changes: 11 additions & 0 deletions tests/test_analyzer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Unit tests for analyzers in Annif"""

from unittest import mock

import pytest

import annif.analyzer
Expand All @@ -15,6 +17,15 @@ def test_get_analyzer_badspec():
annif.analyzer.get_analyzer("()")


@mock.patch(
    "nltk.data.find",
    side_effect=LookupError("Resource punkt_tab not found"),
)
@mock.patch("nltk.download")
def test_nltk_data_missing(download, find):
    """Check that "punkt_tab" is downloaded when the NLTK lookup fails.

    ``nltk.data.find`` is patched to raise ``LookupError`` and
    ``nltk.download`` is patched with a mock, so the test can inspect the
    download request without touching the real NLTK data.
    """
    expected_download = mock.call("punkt_tab")

    annif.analyzer.get_analyzer("snowball(english)")

    # The lookup must have been attempted, and its failure must have led to
    # a download request for exactly the "punkt_tab" package.
    assert find.called
    assert download.called
    assert download.call_args == expected_download


def test_english_analyzer_normalize_word():
analyzer = annif.analyzer.get_analyzer("snowball(english)")
assert analyzer._normalize_word("running") == "run"
Expand Down

0 comments on commit 642281e

Please sign in to comment.