First implementation of LSI backend, with tests. Fixes #201 #219

Closed · wants to merge 1 commit
2 changes: 2 additions & 0 deletions annif/backend/__init__.py
@@ -6,6 +6,7 @@
from . import ensemble
from . import http
from . import tfidf
from . import lsi
from . import fasttext
from . import pav

@@ -28,5 +29,6 @@ def get_backend(backend_id):
register_backend(ensemble.EnsembleBackend)
register_backend(http.HTTPBackend)
register_backend(tfidf.TFIDFBackend)
register_backend(lsi.LSIBackend)
register_backend(fasttext.FastTextBackend)
register_backend(pav.PAVBackend)
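The two new lines register the backend under its `name` attribute so it can be looked up by id. A simplified sketch of the registry pattern these calls rely on (the real implementation lives in the collapsed part of this file, so the details below are assumptions):

```python
# Simplified sketch of the backend registry; not the actual implementation.
_backend_types = {}


def register_backend(backend_type):
    # Key the class by its declared name, e.g. "lsi" for LSIBackend.
    _backend_types[backend_type.name] = backend_type


def get_backend(backend_id):
    try:
        return _backend_types[backend_id]
    except KeyError:
        raise ValueError("No such backend type '{}'".format(backend_id))
```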
76 changes: 76 additions & 0 deletions annif/backend/lsi.py
@@ -0,0 +1,76 @@
"""Backend that returns most similar subjects based on similarity in LSI
vector space"""

import os.path
import gensim.similarities
from gensim.matutils import Sparse2Corpus
from gensim.models import LsiModel
import annif.util
from annif.hit import VectorAnalysisResult
from annif.exception import NotInitializedException
from . import backend


class LSIBackend(backend.AnnifBackend):
"""TF-IDF vector space similarity based backend for Annif"""
name = "lsi"
needs_subject_index = True
needs_subject_vectorizer = True

# defaults for uninitialized instances
_lsi = None
_index = None

MODEL_FILE = 'lsi-model'
INDEX_FILE = 'lsi-index'

def initialize(self):
if self._lsi is None:
path = os.path.join(self._get_datadir(), self.MODEL_FILE)
self.debug('loading LSI model from {}'.format(path))
if os.path.exists(path):
self._lsi = LsiModel.load(path)
else:
raise NotInitializedException(
'LSI model {} not found'.format(path),
backend_id=self.backend_id)
if self._index is None:
path = os.path.join(self._get_datadir(), self.INDEX_FILE)
self.debug('loading similarity index from {}'.format(path))
if os.path.exists(path):
self._index = gensim.similarities.MatrixSimilarity.load(path)
else:
raise NotInitializedException(
'similarity index {} not found'.format(path),
backend_id=self.backend_id)

def load_corpus(self, corpus, project):
self.info('creating LSI model')
veccorpus = project.vectorizer.transform(
(subj.text for subj in corpus.subjects))
gscorpus = Sparse2Corpus(veccorpus, documents_columns=False)
self._lsi = LsiModel(
gscorpus,
num_topics=int(self.params['num_topics']))
annif.util.atomic_save(
self._lsi,
self._get_datadir(),
self.MODEL_FILE)
self.info('creating similarity index')
self._index = gensim.similarities.MatrixSimilarity(
self._lsi[gscorpus])
annif.util.atomic_save(
self._index,
self._get_datadir(),
self.INDEX_FILE)

def _analyze(self, text, project, params):
self.initialize()
self.debug('Analyzing text "{}..." (len={})'.format(
text[:20], len(text)))
vectors = project.vectorizer.transform([text])
corpus = Sparse2Corpus(vectors, documents_columns=False)
lsi_vector = self._lsi[corpus]
docsim = self._index[lsi_vector[0]]
fullresult = VectorAnalysisResult(docsim, project.subjects)
        return fullresult.filter(limit=int(params['limit']))
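Taken together, `load_corpus` trains and persists the model while `_analyze` serves queries against the saved index. A minimal end-to-end sketch built from the calls exercised in the tests below (the `datadir` path and the corpus/project objects are placeholders for the test fixtures):

```python
import annif.backend

# document_corpus and project_with_vectorizer stand in for the fixtures
# defined in tests/conftest.py; the datadir path is illustrative.
lsi_type = annif.backend.get_backend("lsi")
lsi = lsi_type(backend_id='lsi',
               params={'limit': 10, 'num_topics': 100},
               datadir='/tmp/annif-data')

lsi.load_corpus(document_corpus, project_with_vectorizer)  # writes lsi-model and lsi-index
results = lsi.analyze("arkeologia", project_with_vectorizer)
for result in results:
    print(result.uri, result.label)
```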
27 changes: 27 additions & 0 deletions projects.cfg.dist
@@ -24,6 +24,33 @@ analyzer=snowball(english)
limit=100
vocab=yso-en

[lsi-fi]
name=LSI Finnish
language=fi
backends=lsi
analyzer=snowball(finnish)
limit=100
vocab=yso-fi
num_topics=200

[lsi-sv]
name=LSI Swedish
language=sv
backends=lsi
analyzer=snowball(swedish)
limit=100
vocab=yso-sv
num_topics=200

[lsi-en]
name=LSI English
language=en
backends=lsi
analyzer=snowball(english)
limit=100
vocab=yso-en
num_topics=200

[fasttext-fi]
name=fastText Finnish
language=fi
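Each new project section passes `num_topics` through to the backend, where it becomes the `num_topics` argument of gensim's `LsiModel`. A standalone sketch of the gensim pipeline the backend wraps (the toy token lists are made up for illustration):

```python
from gensim.corpora import Dictionary
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity

# Toy stand-in for the vectorized subject corpus.
docs = [["arkeologia", "historia"], ["historia", "tiede"]]
dictionary = Dictionary(docs)
bow = [dictionary.doc2bow(doc) for doc in docs]

lsi = LsiModel(bow, num_topics=2)    # num_topics caps the rank of the LSI projection
index = MatrixSimilarity(lsi[bow])   # dense cosine-similarity index in LSI space

query = lsi[dictionary.doc2bow(["arkeologia"])]
print(list(index[query]))            # one similarity score per indexed document
```

In the backend itself the bag-of-words step is replaced by the project's shared TF-IDF vectorizer plus `Sparse2Corpus`, which is why `needs_subject_vectorizer` is set.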
9 changes: 9 additions & 0 deletions tests/conftest.py
@@ -6,6 +6,7 @@
import py.path
import unittest.mock
import annif
from sklearn.feature_extraction.text import TfidfVectorizer


@pytest.fixture(scope='module')
@@ -79,3 +80,11 @@ def project(document_corpus):
proj.analyzer = annif.analyzer.get_analyzer('snowball(finnish)')
proj.subjects = annif.corpus.SubjectIndex(document_corpus)
return proj


@pytest.fixture(scope='module')
def project_with_vectorizer(project, document_corpus):
project.vectorizer = TfidfVectorizer(
tokenizer=project.analyzer.tokenize_words)
project.vectorizer.fit([subj.text for subj in document_corpus.subjects])
return project
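With the fixture moved into conftest, any test module can request a project whose vectorizer is already fitted on the subject corpus; a minimal usage sketch (the test name is hypothetical, shown for shape only):

```python
def test_vectorizer_fitted(project_with_vectorizer):
    # pytest injects the module-scoped fixture; transform() works
    # immediately because fit() already ran inside the fixture.
    matrix = project_with_vectorizer.vectorizer.transform(["arkeologia"])
    assert matrix.shape[0] == 1  # one row per input document
```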
53 changes: 53 additions & 0 deletions tests/test_backend_lsi.py
@@ -0,0 +1,53 @@
"""Unit tests for the LSI backend in Annif"""

import annif
import annif.backend


def test_lsi_load_documents(
datadir,
document_corpus,
project_with_vectorizer):
lsi_type = annif.backend.get_backend("lsi")
lsi = lsi_type(
backend_id='lsi',
params={'limit': 10, 'num_topics': 100},
datadir=str(datadir))

lsi.load_corpus(document_corpus, project_with_vectorizer)
assert len(lsi._index) > 0
assert datadir.join('lsi-index').exists()
assert datadir.join('lsi-index').size() > 0


def test_lsi_analyze(datadir, project_with_vectorizer):
lsi_type = annif.backend.get_backend("lsi")
lsi = lsi_type(
backend_id='lsi',
params={'limit': 10, 'num_topics': 100},
datadir=str(datadir))

results = lsi.analyze("""Arkeologiaa sanotaan joskus myös
muinaistutkimukseksi tai muinaistieteeksi. Se on humanistinen tiede
tai oikeammin joukko tieteitä, jotka tutkivat ihmisen menneisyyttä.
Tutkimusta tehdään analysoimalla muinaisjäännöksiä eli niitä jälkiä,
joita ihmisten toiminta on jättänyt maaperään tai vesistöjen
pohjaan.""", project_with_vectorizer)

assert len(results) == 10
assert 'http://www.yso.fi/onto/yso/p1265' in [
result.uri for result in results]
assert 'arkeologia' in [result.label for result in results]


def test_lsi_analyze_unknown(datadir, project_with_vectorizer):
lsi_type = annif.backend.get_backend("lsi")
lsi = lsi_type(
backend_id='lsi',
params={'limit': 10},
datadir=str(datadir))

results = lsi.analyze("abcdefghijk",
project_with_vectorizer) # unknown word

assert len(results) == 0
30 changes: 10 additions & 20 deletions tests/test_backend_tfidf.py
@@ -2,36 +2,25 @@

import annif
import annif.backend
import annif.corpus
from sklearn.feature_extraction.text import TfidfVectorizer
import pytest
import unittest.mock


@pytest.fixture(scope='module')
def project(document_corpus):
proj = unittest.mock.Mock()
proj.analyzer = annif.analyzer.get_analyzer('snowball(finnish)')
proj.subjects = annif.corpus.SubjectIndex(document_corpus)
proj.vectorizer = TfidfVectorizer(tokenizer=proj.analyzer.tokenize_words)
proj.vectorizer.fit([subj.text for subj in document_corpus.subjects])
return proj


def test_tfidf_load_documents(datadir, document_corpus, project):
def test_tfidf_load_documents(
datadir,
document_corpus,
project_with_vectorizer):
tfidf_type = annif.backend.get_backend("tfidf")
tfidf = tfidf_type(
backend_id='tfidf',
params={'limit': 10},
datadir=str(datadir))

tfidf.load_corpus(document_corpus, project)
tfidf.load_corpus(document_corpus, project_with_vectorizer)
assert len(tfidf._index) > 0
assert datadir.join('tfidf-index').exists()
assert datadir.join('tfidf-index').size() > 0


def test_tfidf_analyze(datadir, project):
def test_tfidf_analyze(datadir, project_with_vectorizer):
tfidf_type = annif.backend.get_backend("tfidf")
tfidf = tfidf_type(
backend_id='tfidf',
@@ -43,21 +32,22 @@ def test_tfidf_analyze(datadir, project):
tai oikeammin joukko tieteitä, jotka tutkivat ihmisen menneisyyttä.
Tutkimusta tehdään analysoimalla muinaisjäännöksiä eli niitä jälkiä,
joita ihmisten toiminta on jättänyt maaperään tai vesistöjen
pohjaan.""", project)
pohjaan.""", project_with_vectorizer)

assert len(results) == 10
assert 'http://www.yso.fi/onto/yso/p1265' in [
result.uri for result in results]
assert 'arkeologia' in [result.label for result in results]


def test_tfidf_analyze_unknown(datadir, project):
def test_tfidf_analyze_unknown(datadir, project_with_vectorizer):
tfidf_type = annif.backend.get_backend("tfidf")
tfidf = tfidf_type(
backend_id='tfidf',
params={'limit': 10},
datadir=str(datadir))

results = tfidf.analyze("abcdefghijk", project) # unknown word
results = tfidf.analyze("abcdefghijk",
project_with_vectorizer) # unknown word

assert len(results) == 0