
Commit ffab9ea

Don't tokenize sentences with spaCy; NLTK works fine for that
osma committed Nov 22, 2021
1 parent 07b2733 commit ffab9ea
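
For reference, the commit message points at NLTK for sentence splitting. A minimal standalone sketch of NLTK sentence tokenization (not part of this commit; assumes the 'punkt' tokenizer data has been downloaded, e.g. with nltk.download('punkt')):

import nltk.tokenize

text = ("The quick brown fox jumps over the lazy dog. "
        "The five boxing wizards jump quickly.")
sentences = nltk.tokenize.sent_tokenize(text)
print(sentences)
# ['The quick brown fox jumps over the lazy dog.',
#  'The five boxing wizards jump quickly.']
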
Showing 2 changed files with 1 addition and 26 deletions.
11 changes: 1 addition & 10 deletions annif/analyzer/spacy.py
@@ -1,7 +1,6 @@
 """Simple analyzer for Annif. Only folds words to lower case."""
 
 import spacy
-from spacy.tokens import Doc, Span
 from . import analyzer
 
 
@@ -11,18 +10,10 @@ class SpacyAnalyzer(analyzer.Analyzer):
     def __init__(self, param, **kwargs):
         self.param = param
         self.nlp = spacy.load(param, exclude=['ner', 'parser'])
-        # we need a way to split sentences, now that parser is excluded
-        self.nlp.add_pipe('sentencizer')
         super().__init__(**kwargs)
 
-    def tokenize_sentences(self, text):
-        doc = self.nlp(text)
-        return list(doc.sents)
-
     def tokenize_words(self, text):
-        if not isinstance(text, (Doc, Span)):
-            text = self.nlp(text)
-        return [lemma for lemma in (token.lemma_ for token in text)
+        return [lemma for lemma in (token.lemma_ for token in self.nlp(text))
                 if self.is_valid_token(lemma)]
 
     def normalize_word(self, word):
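
As background (not part of this commit), a standalone sketch of why the sentencizer is no longer needed once sentence splitting moves out of this analyzer, assuming the en_core_web_sm model is installed: lemma-based word tokenization needs no sentence boundaries, while iterating doc.sents with the parser excluded fails.

import spacy

nlp = spacy.load("en_core_web_sm", exclude=["ner", "parser"])
doc = nlp("The quick brown foxes were jumping over the lazy dogs.")

# Lemmas are available without any sentence boundary component.
print([token.lemma_ for token in doc])
# e.g. ['the', 'quick', 'brown', 'fox', 'be', 'jump', 'over', 'the', 'lazy', 'dog', '.']

# By contrast, iterating doc.sents here raises an error (sentence boundaries
# unset), which is why the analyzer previously had to add the sentencizer
# component when it still split sentences itself.
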
16 changes: 0 additions & 16 deletions tests/test_analyzer_spacy.py
@@ -6,22 +6,6 @@
 spacy = pytest.importorskip("annif.analyzer.spacy")
 
 
-def test_spacy_english_tokenize_sentences():
-    analyzer = annif.analyzer.get_analyzer("spacy(en_core_web_sm)")
-    sentences = analyzer.tokenize_sentences("""
-        The quick brown fox jumps over the lazy dog.
-        The five boxing wizards jump quickly.
-        Pack my box with five dozen liquor jugs.
-        """.strip())
-    assert len(sentences) == 3
-    assert sentences[0].text.strip() == \
-        'The quick brown fox jumps over the lazy dog.'
-    assert sentences[1].text.strip() == \
-        'The five boxing wizards jump quickly.'
-    assert sentences[2].text.strip() == \
-        'Pack my box with five dozen liquor jugs.'
-
-
 def test_spacy_english_tokenize_words():
     analyzer = annif.analyzer.get_analyzer("spacy(en_core_web_sm)")
     words = analyzer.tokenize_words("""
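
If coverage for sentence splitting is still wanted, a hypothetical replacement test (not part of this commit) could exercise the inherited tokenize_sentences, on the assumption that the base Analyzer class provides an NLTK-backed implementation as the commit message suggests; it would then yield plain strings rather than spaCy Span objects:

def test_spacy_english_tokenize_sentences_via_nltk():
    # Hypothetical test, not part of this commit: sentence splitting is
    # assumed to fall back to the NLTK-based base Analyzer implementation.
    analyzer = annif.analyzer.get_analyzer("spacy(en_core_web_sm)")
    sentences = analyzer.tokenize_sentences(
        "The quick brown fox jumps over the lazy dog. "
        "The five boxing wizards jump quickly.")
    assert len(sentences) == 2
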
