From ffab9eae9c13e3e66fcc0cff5cd9391c2be8f79b Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Fri, 3 Sep 2021 13:57:21 +0300
Subject: [PATCH] Don't tokenize sentences with spaCy; NLTK works fine for that

---
 annif/analyzer/spacy.py      | 11 +----------
 tests/test_analyzer_spacy.py | 16 ----------------
 2 files changed, 1 insertion(+), 26 deletions(-)

diff --git a/annif/analyzer/spacy.py b/annif/analyzer/spacy.py
index 31e368cbc..e188e6648 100644
--- a/annif/analyzer/spacy.py
+++ b/annif/analyzer/spacy.py
@@ -1,7 +1,6 @@
 """Simple analyzer for Annif. Only folds words to lower case."""
 
 import spacy
-from spacy.tokens import Doc, Span
 from . import analyzer
 
 
@@ -11,18 +10,10 @@ class SpacyAnalyzer(analyzer.Analyzer):
     def __init__(self, param, **kwargs):
         self.param = param
         self.nlp = spacy.load(param, exclude=['ner', 'parser'])
-        # we need a way to split sentences, now that parser is excluded
-        self.nlp.add_pipe('sentencizer')
         super().__init__(**kwargs)
 
-    def tokenize_sentences(self, text):
-        doc = self.nlp(text)
-        return list(doc.sents)
-
     def tokenize_words(self, text):
-        if not isinstance(text, (Doc, Span)):
-            text = self.nlp(text)
-        return [lemma for lemma in (token.lemma_ for token in text)
+        return [lemma for lemma in (token.lemma_ for token in self.nlp(text))
                 if self.is_valid_token(lemma)]
 
     def normalize_word(self, word):
diff --git a/tests/test_analyzer_spacy.py b/tests/test_analyzer_spacy.py
index 49c8c641e..d06dfc9e7 100644
--- a/tests/test_analyzer_spacy.py
+++ b/tests/test_analyzer_spacy.py
@@ -6,22 +6,6 @@
 spacy = pytest.importorskip("annif.analyzer.spacy")
 
 
-def test_spacy_english_tokenize_sentences():
-    analyzer = annif.analyzer.get_analyzer("spacy(en_core_web_sm)")
-    sentences = analyzer.tokenize_sentences("""
-        The quick brown fox jumps over the lazy dog.
-        The five boxing wizards jump quickly.
-        Pack my box with five dozen liquor jugs.
-        """.strip())
-    assert len(sentences) == 3
-    assert sentences[0].text.strip() == \
-        'The quick brown fox jumps over the lazy dog.'
-    assert sentences[1].text.strip() == \
-        'The five boxing wizards jump quickly.'
-    assert sentences[2].text.strip() == \
-        'Pack my box with five dozen liquor jugs.'
-
-
 def test_spacy_english_tokenize_words():
     analyzer = annif.analyzer.get_analyzer("spacy(en_core_web_sm)")
     words = analyzer.tokenize_words("""