Skip to content

Commit

Permalink
Merge pull request #270 from NatLibFi/issue267-cli-analyze-to-suggest
Browse files (browse the repository at this point in the history)
Rename CLI command analyze to suggest and refactor method names
  • Loading branch information
osma authored Apr 17, 2019
2 parents c8c370a + 11c352e commit dee89b4
Show file tree
Hide file tree
Showing 26 changed files with 271 additions and 265 deletions.
12 changes: 6 additions & 6 deletions annif/backend/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,18 @@ def initialize(self):
pass

@abc.abstractmethod
def _analyze(self, text, project, params):
def _suggest(self, text, project, params):
"""This method should be implemented by backends. It implements
the analyze functionality, with pre-processed parameters."""
the suggest functionality, with pre-processed parameters."""
pass # pragma: no cover

def analyze(self, text, project, params=None):
"""Analyze some input text and return a list of subjects represented
as a list of AnalysisHit objects."""
def suggest(self, text, project, params=None):
"""Suggest subjects for the input text and return a list of subjects
represented as a list of SubjectSuggestion objects."""
beparams = dict(self.params)
if params is not None:
beparams.update(params)
return self._analyze(text, project, params=beparams)
return self._suggest(text, project, params=beparams)

def debug(self, message):
"""Log a debug message from this backend"""
Expand Down
12 changes: 6 additions & 6 deletions annif/backend/dummy.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Dummy backend for testing basic interaction of projects and backends"""


from annif.hit import AnalysisHit, ListAnalysisResult
from annif.suggestion import SubjectSuggestion, ListSuggestionResult
from . import backend


Expand All @@ -14,12 +14,12 @@ class DummyBackend(backend.AnnifLearningBackend):
def initialize(self):
self.initialized = True

def _analyze(self, text, project, params):
def _suggest(self, text, project, params):
score = float(params.get('score', 1.0))
return ListAnalysisResult([AnalysisHit(uri=self.uri,
label=self.label,
score=score)],
project.subjects)
return ListSuggestionResult([SubjectSuggestion(uri=self.uri,
label=self.label,
score=score)],
project.subjects)

def learn(self, corpus, project):
# in this dummy backend we "learn" by picking up the URI and label
Expand Down
12 changes: 6 additions & 6 deletions annif/backend/ensemble.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Ensemble backend that combines results from multiple projects"""


import annif.hit
import annif.suggestion
import annif.project
import annif.util
from . import backend
Expand All @@ -16,23 +16,23 @@ def _normalize_hits(self, hits, source_project):
by subclasses."""
return hits

def _analyze_with_sources(self, text, sources):
def _suggest_with_sources(self, text, sources):
hits_from_sources = []
for project_id, weight in sources:
source_project = annif.project.get_project(project_id)
hits = source_project.analyze(text)
hits = source_project.suggest(text)
self.debug(
'Got {} hits from project {}'.format(
len(hits), source_project.project_id))
norm_hits = self._normalize_hits(hits, source_project)
hits_from_sources.append(
annif.hit.WeightedHits(
annif.suggestion.WeightedSuggestion(
hits=norm_hits, weight=weight))
return hits_from_sources

def _analyze(self, text, project, params):
def _suggest(self, text, project, params):
sources = annif.util.parse_sources(params['sources'])
hits_from_sources = self._analyze_with_sources(text, sources)
hits_from_sources = self._suggest_with_sources(text, sources)
merged_hits = annif.util.merge_hits(
hits_from_sources, project.subjects)
self.debug('{} hits after merging'.format(len(merged_hits)))
Expand Down
8 changes: 4 additions & 4 deletions annif/backend/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import collections
import os.path
import annif.util
from annif.hit import AnalysisHit, ListAnalysisResult
from annif.suggestion import SubjectSuggestion, ListSuggestionResult
from annif.exception import NotInitializedException
import fastText
from . import backend
Expand Down Expand Up @@ -112,7 +112,7 @@ def _predict_chunks(self, chunktexts, project, limit):
filter(None, [self._normalize_text(project, chunktext)
for chunktext in chunktexts])), limit)

def _analyze_chunks(self, chunktexts, project):
def _suggest_chunks(self, chunktexts, project):
limit = int(self.params['limit'])
chunklabels, chunkscores = self._predict_chunks(
chunktexts, project, limit)
Expand All @@ -127,8 +127,8 @@ def _analyze_chunks(self, chunktexts, project):
results = []
for score, label in best_labels[:limit]:
subject = self._label_to_subject(project, label)
results.append(AnalysisHit(
results.append(SubjectSuggestion(
uri=subject[0],
label=subject[1],
score=score / len(chunktexts)))
return ListAnalysisResult(results, project.subjects)
return ListSuggestionResult(results, project.subjects)
22 changes: 11 additions & 11 deletions annif/backend/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@

import requests
import requests.exceptions
from annif.hit import AnalysisHit, ListAnalysisResult
from annif.suggestion import SubjectSuggestion, ListSuggestionResult
from . import backend


class HTTPBackend(backend.AnnifBackend):
name = "http"

def _analyze(self, text, project, params):
def _suggest(self, text, project, params):
data = {'text': text}
if 'project' in params:
data['project'] = params['project']
Expand All @@ -21,26 +21,26 @@ def _analyze(self, text, project, params):
req.raise_for_status()
except requests.exceptions.RequestException as err:
self.warning("HTTP request failed: {}".format(err))
return ListAnalysisResult([], project.subjects)
return ListSuggestionResult([], project.subjects)

try:
response = req.json()
except ValueError as err:
self.warning("JSON decode failed: {}".format(err))
return ListAnalysisResult([], project.subjects)
return ListSuggestionResult([], project.subjects)

if 'results' in response:
results = response['results']
else:
results = response

try:
return ListAnalysisResult([AnalysisHit(uri=h['uri'],
label=h['label'],
score=h['score'])
for h in results
if h['score'] > 0.0],
project.subjects)
return ListSuggestionResult([SubjectSuggestion(uri=h['uri'],
label=h['label'],
score=h['score'])
for h in results
if h['score'] > 0.0],
project.subjects)
except (TypeError, ValueError) as err:
self.warning("Problem interpreting JSON data: {}".format(err))
return ListAnalysisResult([], project.subjects)
return ListSuggestionResult([], project.subjects)
19 changes: 10 additions & 9 deletions annif/backend/mixins.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,22 @@


import abc
from annif.hit import ListAnalysisResult
from annif.suggestion import ListSuggestionResult


class ChunkingBackend(metaclass=abc.ABCMeta):
"""Annif backend mixin that implements chunking of input"""

@abc.abstractmethod
def _analyze_chunks(self, chunktexts, project):
"""Analyze the chunked text; should be implemented by the subclass
inheriting this mixin"""
def _suggest_chunks(self, chunktexts, project):
"""Suggest subjects for the chunked text; should be implemented by
the subclass inheriting this mixin"""

pass # pragma: no cover

def _analyze(self, text, project, params):
def _suggest(self, text, project, params):
self.initialize()
self.debug('Analyzing text "{}..." (len={})'.format(
self.debug('Suggesting subjects for text "{}..." (len={})'.format(
text[:20], len(text)))
sentences = project.analyzer.tokenize_sentences(text)
self.debug('Found {} sentences'.format(len(sentences)))
Expand All @@ -26,6 +26,7 @@ def _analyze(self, text, project, params):
for i in range(0, len(sentences), chunksize):
chunktexts.append(' '.join(sentences[i:i + chunksize]))
self.debug('Split sentences into {} chunks'.format(len(chunktexts)))
if len(chunktexts) == 0: # nothing to analyze, empty result
return ListAnalysisResult(hits=[], subject_index=project.subjects)
return self._analyze_chunks(chunktexts, project)
if len(chunktexts) == 0: # no input, empty result
return ListSuggestionResult(
hits=[], subject_index=project.subjects)
return self._suggest_chunks(chunktexts, project)
14 changes: 7 additions & 7 deletions annif/backend/pav.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from sklearn.isotonic import IsotonicRegression
import numpy as np
import annif.corpus
import annif.hit
import annif.suggestion
import annif.project
import annif.util
from annif.exception import NotInitializedException
Expand Down Expand Up @@ -53,20 +53,20 @@ def _normalize_hits(self, hits, source_project):
else: # default to raw score
score = hit.score
pav_result.append(
annif.hit.AnalysisHit(
annif.suggestion.SubjectSuggestion(
uri=hit.uri,
label=hit.label,
score=score))
pav_result.sort(key=lambda hit: hit.score, reverse=True)
return annif.hit.ListAnalysisResult(
return annif.suggestion.ListSuggestionResult(
pav_result, source_project.subjects)

@staticmethod
def _analyze_train_corpus(source_project, corpus):
def _suggest_train_corpus(source_project, corpus):
scores = []
true = []
for doc in corpus.documents:
hits = source_project.analyze(doc.text)
hits = source_project.suggest(doc.text)
scores.append(hits.vector)
subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
true.append(subjects.as_vector(source_project.subjects))
Expand All @@ -76,8 +76,8 @@ def _create_pav_model(self, source_project_id, min_docs, corpus):
self.info("creating PAV model for source {}, min_docs={}".format(
source_project_id, min_docs))
source_project = annif.project.get_project(source_project_id)
# analyze the training corpus
scores, true = self._analyze_train_corpus(source_project, corpus)
# suggest subjects for the training corpus
scores, true = self._suggest_train_corpus(source_project, corpus)
# create the concept-specific PAV regression models
pav_regressions = {}
for cid in range(len(source_project.subjects)):
Expand Down
8 changes: 4 additions & 4 deletions annif/backend/tfidf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import gensim.similarities
from gensim.matutils import Sparse2Corpus
import annif.util
from annif.hit import VectorAnalysisResult
from annif.suggestion import VectorSuggestionResult
from annif.exception import NotInitializedException
from . import backend

Expand Down Expand Up @@ -45,11 +45,11 @@ def train(self, corpus, project):
self.datadir,
self.INDEX_FILE)

def _analyze(self, text, project, params):
def _suggest(self, text, project, params):
self.initialize()
self.debug('Analyzing text "{}..." (len={})'.format(
self.debug('Suggesting subjects for text "{}..." (len={})'.format(
text[:20], len(text)))
vectors = project.vectorizer.transform([text])
docsim = self._index[vectors[0]]
fullresult = VectorAnalysisResult(docsim, project.subjects)
fullresult = VectorSuggestionResult(docsim, project.subjects)
return fullresult.filter(limit=int(self.params['limit']))
13 changes: 7 additions & 6 deletions annif/backend/vw_multi.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
"""Annif backend using the Vorpal Wabbit multiclass and multilabel
"""Annif backend using the Vowpal Wabbit multiclass and multilabel
classifiers"""

import random
import os.path
import annif.util
from vowpalwabbit import pyvw
import numpy as np
from annif.hit import ListAnalysisResult, VectorAnalysisResult
from annif.suggestion import ListSuggestionResult, VectorSuggestionResult
from annif.exception import ConfigurationException, NotInitializedException
from . import backend
from . import mixins
Expand Down Expand Up @@ -112,7 +112,7 @@ def _get_input(self, input, project, text):
return self._normalize_text(project, text)
else:
proj = annif.project.get_project(input)
result = proj.analyze(text)
result = proj.suggest(text)
features = [
'{}:{}'.format(self._cleanup_text(hit.uri), hit.score)
for hit in result.hits]
Expand Down Expand Up @@ -209,7 +209,7 @@ def _convert_result(self, result, project):
# result is a list of scores (probabilities or binary 1/0)
return np.array(result)

def _analyze_chunks(self, chunktexts, project):
def _suggest_chunks(self, chunktexts, project):
results = []
for chunktext in chunktexts:
exampletext = self._inputs_to_exampletext(project, chunktext)
Expand All @@ -219,6 +219,7 @@ def _analyze_chunks(self, chunktexts, project):
result = self._model.predict(example)
results.append(self._convert_result(result, project))
if not results: # empty result
return ListAnalysisResult(hits=[], subject_index=project.subjects)
return VectorAnalysisResult(
return ListSuggestionResult(
hits=[], subject_index=project.subjects)
return VectorSuggestionResult(
np.array(results).mean(axis=0), project.subjects)
Loading

0 comments on commit dee89b4

Please sign in to comment.