
Commit ffab9ea

Don't tokenize sentences with spaCy; NLTK works fine for that
osma committed Nov 22, 2021
1 parent 07b2733 commit ffab9ea
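
For reference, the commit message points at NLTK for sentence splitting. A minimal standalone sketch of NLTK sentence tokenization (not part of this commit; assumes the 'punkt' tokenizer data has been downloaded, e.g. with nltk.download('punkt')):

import nltk.tokenize

text = ("The quick brown fox jumps over the lazy dog. "
        "The five boxing wizards jump quickly.")
sentences = nltk.tokenize.sent_tokenize(text)
print(sentences)
# ['The quick brown fox jumps over the lazy dog.',
#  'The five boxing wizards jump quickly.']
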
Showing 2 changed files with 1 addition and 26 deletions.
11 changes: 1 addition & 10 deletions annif/analyzer/spacy.py
@@ -1,7 +1,6 @@
 """Simple analyzer for Annif. Only folds words to lower case."""
 
 import spacy
-from spacy.tokens import Doc, Span
 from . import analyzer
 
 
@@ -11,18 +10,10 @@ class SpacyAnalyzer(analyzer.Analyzer):
     def __init__(self, param, **kwargs):
         self.param = param
         self.nlp = spacy.load(param, exclude=['ner', 'parser'])
-        # we need a way to split sentences, now that parser is excluded
-        self.nlp.add_pipe('sentencizer')
         super().__init__(**kwargs)
 
-    def tokenize_sentences(self, text):
-        doc = self.nlp(text)
-        return list(doc.sents)
-
     def tokenize_words(self, text):
-        if not isinstance(text, (Doc, Span)):
-            text = self.nlp(text)
-        return [lemma for lemma in (token.lemma_ for token in text)
+        return [lemma for lemma in (token.lemma_ for token in self.nlp(text))
                 if self.is_valid_token(lemma)]
 
     def normalize_word(self, word):
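
As background (not part of this commit), a standalone sketch of why the sentencizer is no longer needed once sentence splitting moves out of this analyzer, assuming the en_core_web_sm model is installed: lemma-based word tokenization needs no sentence boundaries, while iterating doc.sents with the parser excluded fails.

import spacy

nlp = spacy.load("en_core_web_sm", exclude=["ner", "parser"])
doc = nlp("The quick brown foxes were jumping over the lazy dogs.")

# Lemmas are available without any sentence boundary component.
print([token.lemma_ for token in doc])
# e.g. ['the', 'quick', 'brown', 'fox', 'be', 'jump', 'over', 'the', 'lazy', 'dog', '.']

# By contrast, iterating doc.sents here raises an error (sentence boundaries
# unset), which is why the analyzer previously had to add the sentencizer
# component when it still split sentences itself.
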
16 changes: 0 additions & 16 deletions tests/test_analyzer_spacy.py
@@ -6,22 +6,6 @@
 spacy = pytest.importorskip("annif.analyzer.spacy")
 
 
-def test_spacy_english_tokenize_sentences():
-    analyzer = annif.analyzer.get_analyzer("spacy(en_core_web_sm)")
-    sentences = analyzer.tokenize_sentences("""
-        The quick brown fox jumps over the lazy dog.
-        The five boxing wizards jump quickly.
-        Pack my box with five dozen liquor jugs.
-        """.strip())
-    assert len(sentences) == 3
-    assert sentences[0].text.strip() == \
-        'The quick brown fox jumps over the lazy dog.'
-    assert sentences[1].text.strip() == \
-        'The five boxing wizards jump quickly.'
-    assert sentences[2].text.strip() == \
-        'Pack my box with five dozen liquor jugs.'
-
-
 def test_spacy_english_tokenize_words():
     analyzer = annif.analyzer.get_analyzer("spacy(en_core_web_sm)")
     words = analyzer.tokenize_words("""
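
If coverage for sentence splitting is still wanted, a hypothetical replacement test (not part of this commit) could exercise the inherited tokenize_sentences, on the assumption that the base Analyzer class provides an NLTK-backed implementation as the commit message suggests; it would then yield plain strings rather than spaCy Span objects:

def test_spacy_english_tokenize_sentences_via_nltk():
    # Hypothetical test, not part of this commit: sentence splitting is
    # assumed to fall back to the NLTK-based base Analyzer implementation.
    analyzer = annif.analyzer.get_analyzer("spacy(en_core_web_sm)")
    sentences = analyzer.tokenize_sentences(
        "The quick brown fox jumps over the lazy dog. "
        "The five boxing wizards jump quickly.")
    assert len(sentences) == 2
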
