From c047aeaa2b6656a5038eb35f83429bd3c6c3f532 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com>
Date: Thu, 19 Sep 2024 16:30:05 +0300
Subject: [PATCH] Use variable for NLTK tokenizer datapackage name (punkt_tab)

---
 annif/analyzer/analyzer.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/annif/analyzer/analyzer.py b/annif/analyzer/analyzer.py
index d08de648..4b647c18 100644
--- a/annif/analyzer/analyzer.py
+++ b/annif/analyzer/analyzer.py
@@ -11,6 +11,7 @@
 logger = annif.logger
 
 _KEY_TOKEN_MIN_LENGTH = "token_min_length"
+_NLTK_TOKENIZER_DATA = "punkt_tab"
 
 
 class Analyzer(metaclass=abc.ABCMeta):
@@ -28,14 +29,15 @@ def __init__(self, **kwargs) -> None:
         import nltk.data
 
         try:
-            nltk.data.find("tokenizers/punkt_tab")
+            nltk.data.find("tokenizers/" + _NLTK_TOKENIZER_DATA)
         except LookupError as err:
             logger.debug(str(err))
-            if "punkt_tab" in str(err):  # "punkt_tab" is surrounded by color code tags
+            if _NLTK_TOKENIZER_DATA in str(err):
                 logger.warning(
-                    'NLTK datapackage "punkt_tab" not found, downloading it now.'
+                    f'NLTK datapackage "{_NLTK_TOKENIZER_DATA}" not found, '
+                    "downloading it now."
                 )
-                nltk.download("punkt_tab")
+                nltk.download(_NLTK_TOKENIZER_DATA)
 
     def tokenize_sentences(self, text: str) -> list[str]:
         """Tokenize a piece of text (e.g. a document) into sentences."""
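
Note: below is a minimal standalone sketch (not part of the patch) of the
check-and-download pattern that the change factors behind the
_NLTK_TOKENIZER_DATA constant. The ensure_nltk_tokenizer_data() helper and
the __main__ usage are hypothetical; only nltk.data.find(), nltk.download()
and the constant itself come from the patched code.

    import nltk
    import nltk.data

    # Same constant as in the patch: the NLTK datapackage that holds the
    # pretrained sentence tokenizer parameters.
    _NLTK_TOKENIZER_DATA = "punkt_tab"

    def ensure_nltk_tokenizer_data() -> None:
        """Download the NLTK tokenizer data if it is not installed yet."""
        try:
            nltk.data.find("tokenizers/" + _NLTK_TOKENIZER_DATA)
        except LookupError:
            nltk.download(_NLTK_TOKENIZER_DATA)

    if __name__ == "__main__":
        ensure_nltk_tokenizer_data()
        # Once the data is present, sentence tokenization works:
        print(nltk.sent_tokenize("First sentence. Second sentence."))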