From ffab9eae9c13e3e66fcc0cff5cd9391c2be8f79b Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Fri, 3 Sep 2021 13:57:21 +0300
Subject: [PATCH] Don't tokenize sentences with spaCy; NLTK works fine for that

---
 annif/analyzer/spacy.py      | 11 +----------
 tests/test_analyzer_spacy.py | 16 ----------------
 2 files changed, 1 insertion(+), 26 deletions(-)

diff --git a/annif/analyzer/spacy.py b/annif/analyzer/spacy.py
index 31e368cbc..e188e6648 100644
--- a/annif/analyzer/spacy.py
+++ b/annif/analyzer/spacy.py
@@ -1,7 +1,6 @@
 """Simple analyzer for Annif. Only folds words to lower case."""
 
 import spacy
-from spacy.tokens import Doc, Span
 from . import analyzer
 
 
@@ -11,18 +10,10 @@ class SpacyAnalyzer(analyzer.Analyzer):
     def __init__(self, param, **kwargs):
         self.param = param
         self.nlp = spacy.load(param, exclude=['ner', 'parser'])
-        # we need a way to split sentences, now that parser is excluded
-        self.nlp.add_pipe('sentencizer')
         super().__init__(**kwargs)
 
-    def tokenize_sentences(self, text):
-        doc = self.nlp(text)
-        return list(doc.sents)
-
     def tokenize_words(self, text):
-        if not isinstance(text, (Doc, Span)):
-            text = self.nlp(text)
-        return [lemma for lemma in (token.lemma_ for token in text)
+        return [lemma for lemma in (token.lemma_ for token in self.nlp(text))
                 if self.is_valid_token(lemma)]
 
     def normalize_word(self, word):
diff --git a/tests/test_analyzer_spacy.py b/tests/test_analyzer_spacy.py
index 49c8c641e..d06dfc9e7 100644
--- a/tests/test_analyzer_spacy.py
+++ b/tests/test_analyzer_spacy.py
@@ -6,22 +6,6 @@
 spacy = pytest.importorskip("annif.analyzer.spacy")
 
 
-def test_spacy_english_tokenize_sentences():
-    analyzer = annif.analyzer.get_analyzer("spacy(en_core_web_sm)")
-    sentences = analyzer.tokenize_sentences("""
-        The quick brown fox jumps over the lazy dog.
-        The five boxing wizards jump quickly.
-        Pack my box with five dozen liquor jugs.
-        """.strip())
-    assert len(sentences) == 3
-    assert sentences[0].text.strip() == \
-        'The quick brown fox jumps over the lazy dog.'
-    assert sentences[1].text.strip() == \
-        'The five boxing wizards jump quickly.'
-    assert sentences[2].text.strip() == \
-        'Pack my box with five dozen liquor jugs.'
-
-
 def test_spacy_english_tokenize_words():
     analyzer = annif.analyzer.get_analyzer("spacy(en_core_web_sm)")
     words = analyzer.tokenize_words("""