Merge pull request #270 from NatLibFi/issue267-cli-analyze-to-suggest

Rename CLI command analyze to suggest and refactor method names
NatLibFi · Apr 17, 2019 · dee89b4 · dee89b4
2 parents c8c370a + 11c352e
commit dee89b4
Show file tree

Hide file tree

Showing 26 changed files with 271 additions and 265 deletions.
diff --git a/annif/backend/backend.py b/annif/backend/backend.py
@@ -30,18 +30,18 @@ def initialize(self):
         pass
 
     @abc.abstractmethod
-    def _analyze(self, text, project, params):
+    def _suggest(self, text, project, params):
         """This method should be implemented by backends. It implements
-        the analyze functionality, with pre-processed parameters."""
+        the suggest functionality, with pre-processed parameters."""
         pass  # pragma: no cover
 
-    def analyze(self, text, project, params=None):
-        """Analyze some input text and return a list of subjects represented
-        as a list of AnalysisHit objects."""
+    def suggest(self, text, project, params=None):
+        """Suggest subjects for the input text and return a list of subjects
+        represented as a list of SubjectSuggestion objects."""
         beparams = dict(self.params)
         if params is not None:
             beparams.update(params)
-        return self._analyze(text, project, params=beparams)
+        return self._suggest(text, project, params=beparams)
 
     def debug(self, message):
         """Log a debug message from this backend"""

diff --git a/annif/backend/dummy.py b/annif/backend/dummy.py
@@ -1,7 +1,7 @@
 """Dummy backend for testing basic interaction of projects and backends"""
 
 
-from annif.hit import AnalysisHit, ListAnalysisResult
+from annif.suggestion import SubjectSuggestion, ListSuggestionResult
 from . import backend
 
 
@@ -14,12 +14,12 @@ class DummyBackend(backend.AnnifLearningBackend):
     def initialize(self):
         self.initialized = True
 
-    def _analyze(self, text, project, params):
+    def _suggest(self, text, project, params):
         score = float(params.get('score', 1.0))
-        return ListAnalysisResult([AnalysisHit(uri=self.uri,
-                                               label=self.label,
-                                               score=score)],
-                                  project.subjects)
+        return ListSuggestionResult([SubjectSuggestion(uri=self.uri,
+                                                       label=self.label,
+                                                       score=score)],
+                                    project.subjects)
 
     def learn(self, corpus, project):
         # in this dummy backend we "learn" by picking up the URI and label

diff --git a/annif/backend/ensemble.py b/annif/backend/ensemble.py
@@ -1,7 +1,7 @@
 """Ensemble backend that combines results from multiple projects"""
 
 
-import annif.hit
+import annif.suggestion
 import annif.project
 import annif.util
 from . import backend
@@ -16,23 +16,23 @@ def _normalize_hits(self, hits, source_project):
         by subclasses."""
         return hits
 
-    def _analyze_with_sources(self, text, sources):
+    def _suggest_with_sources(self, text, sources):
         hits_from_sources = []
         for project_id, weight in sources:
             source_project = annif.project.get_project(project_id)
-            hits = source_project.analyze(text)
+            hits = source_project.suggest(text)
             self.debug(
                 'Got {} hits from project {}'.format(
                     len(hits), source_project.project_id))
             norm_hits = self._normalize_hits(hits, source_project)
             hits_from_sources.append(
-                annif.hit.WeightedHits(
+                annif.suggestion.WeightedSuggestion(
                     hits=norm_hits, weight=weight))
         return hits_from_sources
 
-    def _analyze(self, text, project, params):
+    def _suggest(self, text, project, params):
         sources = annif.util.parse_sources(params['sources'])
-        hits_from_sources = self._analyze_with_sources(text, sources)
+        hits_from_sources = self._suggest_with_sources(text, sources)
         merged_hits = annif.util.merge_hits(
             hits_from_sources, project.subjects)
         self.debug('{} hits after merging'.format(len(merged_hits)))

diff --git a/annif/backend/fasttext.py b/annif/backend/fasttext.py
@@ -3,7 +3,7 @@
 import collections
 import os.path
 import annif.util
-from annif.hit import AnalysisHit, ListAnalysisResult
+from annif.suggestion import SubjectSuggestion, ListSuggestionResult
 from annif.exception import NotInitializedException
 import fastText
 from . import backend
@@ -112,7 +112,7 @@ def _predict_chunks(self, chunktexts, project, limit):
             filter(None, [self._normalize_text(project, chunktext)
                           for chunktext in chunktexts])), limit)
 
-    def _analyze_chunks(self, chunktexts, project):
+    def _suggest_chunks(self, chunktexts, project):
         limit = int(self.params['limit'])
         chunklabels, chunkscores = self._predict_chunks(
             chunktexts, project, limit)
@@ -127,8 +127,8 @@ def _analyze_chunks(self, chunktexts, project):
         results = []
         for score, label in best_labels[:limit]:
             subject = self._label_to_subject(project, label)
-            results.append(AnalysisHit(
+            results.append(SubjectSuggestion(
                 uri=subject[0],
                 label=subject[1],
                 score=score / len(chunktexts)))
-        return ListAnalysisResult(results, project.subjects)
+        return ListSuggestionResult(results, project.subjects)
diff --git a/annif/backend/http.py b/annif/backend/http.py
@@ -4,14 +4,14 @@
 
 import requests
 import requests.exceptions
-from annif.hit import AnalysisHit, ListAnalysisResult
+from annif.suggestion import SubjectSuggestion, ListSuggestionResult
 from . import backend
 
 
 class HTTPBackend(backend.AnnifBackend):
     name = "http"
 
-    def _analyze(self, text, project, params):
+    def _suggest(self, text, project, params):
         data = {'text': text}
         if 'project' in params:
             data['project'] = params['project']
@@ -21,26 +21,26 @@ def _analyze(self, text, project, params):
             req.raise_for_status()
         except requests.exceptions.RequestException as err:
             self.warning("HTTP request failed: {}".format(err))
-            return ListAnalysisResult([], project.subjects)
+            return ListSuggestionResult([], project.subjects)
 
         try:
             response = req.json()
         except ValueError as err:
             self.warning("JSON decode failed: {}".format(err))
-            return ListAnalysisResult([], project.subjects)
+            return ListSuggestionResult([], project.subjects)
 
         if 'results' in response:
             results = response['results']
         else:
             results = response
 
         try:
-            return ListAnalysisResult([AnalysisHit(uri=h['uri'],
-                                                   label=h['label'],
-                                                   score=h['score'])
-                                       for h in results
-                                       if h['score'] > 0.0],
-                                      project.subjects)
+            return ListSuggestionResult([SubjectSuggestion(uri=h['uri'],
+                                                           label=h['label'],
+                                                           score=h['score'])
+                                         for h in results
+                                         if h['score'] > 0.0],
+                                        project.subjects)
         except (TypeError, ValueError) as err:
             self.warning("Problem interpreting JSON data: {}".format(err))
-            return ListAnalysisResult([], project.subjects)
+            return ListSuggestionResult([], project.subjects)
diff --git a/annif/backend/mixins.py b/annif/backend/mixins.py
@@ -2,22 +2,22 @@
 
 
 import abc
-from annif.hit import ListAnalysisResult
+from annif.suggestion import ListSuggestionResult
 
 
 class ChunkingBackend(metaclass=abc.ABCMeta):
     """Annif backend mixin that implements chunking of input"""
 
     @abc.abstractmethod
-    def _analyze_chunks(self, chunktexts, project):
-        """Analyze the chunked text; should be implemented by the subclass
-        inheriting this mixin"""
+    def _suggest_chunks(self, chunktexts, project):
+        """Suggest subjects for the chunked text; should be implemented by
+        the subclass inheriting this mixin"""
 
         pass  # pragma: no cover
 
-    def _analyze(self, text, project, params):
+    def _suggest(self, text, project, params):
         self.initialize()
-        self.debug('Analyzing text "{}..." (len={})'.format(
+        self.debug('Suggesting subjects for text "{}..." (len={})'.format(
             text[:20], len(text)))
         sentences = project.analyzer.tokenize_sentences(text)
         self.debug('Found {} sentences'.format(len(sentences)))
@@ -26,6 +26,7 @@ def _analyze(self, text, project, params):
         for i in range(0, len(sentences), chunksize):
             chunktexts.append(' '.join(sentences[i:i + chunksize]))
         self.debug('Split sentences into {} chunks'.format(len(chunktexts)))
-        if len(chunktexts) == 0:  # nothing to analyze, empty result
-            return ListAnalysisResult(hits=[], subject_index=project.subjects)
-        return self._analyze_chunks(chunktexts, project)
+        if len(chunktexts) == 0:  # no input, empty result
+            return ListSuggestionResult(
+                hits=[], subject_index=project.subjects)
+        return self._suggest_chunks(chunktexts, project)
diff --git a/annif/backend/pav.py b/annif/backend/pav.py
@@ -8,7 +8,7 @@
 from sklearn.isotonic import IsotonicRegression
 import numpy as np
 import annif.corpus
-import annif.hit
+import annif.suggestion
 import annif.project
 import annif.util
 from annif.exception import NotInitializedException
@@ -53,20 +53,20 @@ def _normalize_hits(self, hits, source_project):
             else:  # default to raw score
                 score = hit.score
             pav_result.append(
-                annif.hit.AnalysisHit(
+                annif.suggestion.SubjectSuggestion(
                     uri=hit.uri,
                     label=hit.label,
                     score=score))
         pav_result.sort(key=lambda hit: hit.score, reverse=True)
-        return annif.hit.ListAnalysisResult(
+        return annif.suggestion.ListSuggestionResult(
             pav_result, source_project.subjects)
 
     @staticmethod
-    def _analyze_train_corpus(source_project, corpus):
+    def _suggest_train_corpus(source_project, corpus):
         scores = []
         true = []
         for doc in corpus.documents:
-            hits = source_project.analyze(doc.text)
+            hits = source_project.suggest(doc.text)
             scores.append(hits.vector)
             subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
             true.append(subjects.as_vector(source_project.subjects))
@@ -76,8 +76,8 @@ def _create_pav_model(self, source_project_id, min_docs, corpus):
         self.info("creating PAV model for source {}, min_docs={}".format(
             source_project_id, min_docs))
         source_project = annif.project.get_project(source_project_id)
-        # analyze the training corpus
-        scores, true = self._analyze_train_corpus(source_project, corpus)
+        # suggest subjects for the training corpus
+        scores, true = self._suggest_train_corpus(source_project, corpus)
         # create the concept-specific PAV regression models
         pav_regressions = {}
         for cid in range(len(source_project.subjects)):

diff --git a/annif/backend/tfidf.py b/annif/backend/tfidf.py
@@ -5,7 +5,7 @@
 import gensim.similarities
 from gensim.matutils import Sparse2Corpus
 import annif.util
-from annif.hit import VectorAnalysisResult
+from annif.suggestion import VectorSuggestionResult
 from annif.exception import NotInitializedException
 from . import backend
 
@@ -45,11 +45,11 @@ def train(self, corpus, project):
             self.datadir,
             self.INDEX_FILE)
 
-    def _analyze(self, text, project, params):
+    def _suggest(self, text, project, params):
         self.initialize()
-        self.debug('Analyzing text "{}..." (len={})'.format(
+        self.debug('Suggesting subjects for text "{}..." (len={})'.format(
             text[:20], len(text)))
         vectors = project.vectorizer.transform([text])
         docsim = self._index[vectors[0]]
-        fullresult = VectorAnalysisResult(docsim, project.subjects)
+        fullresult = VectorSuggestionResult(docsim, project.subjects)
         return fullresult.filter(limit=int(self.params['limit']))
diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py
@@ -1,12 +1,12 @@
-"""Annif backend using the Vorpal Wabbit multiclass and multilabel
+"""Annif backend using the Vowpal Wabbit multiclass and multilabel
 classifiers"""
 
 import random
 import os.path
 import annif.util
 from vowpalwabbit import pyvw
 import numpy as np
-from annif.hit import ListAnalysisResult, VectorAnalysisResult
+from annif.suggestion import ListSuggestionResult, VectorSuggestionResult
 from annif.exception import ConfigurationException, NotInitializedException
 from . import backend
 from . import mixins
@@ -112,7 +112,7 @@ def _get_input(self, input, project, text):
             return self._normalize_text(project, text)
         else:
             proj = annif.project.get_project(input)
-            result = proj.analyze(text)
+            result = proj.suggest(text)
             features = [
                 '{}:{}'.format(self._cleanup_text(hit.uri), hit.score)
                 for hit in result.hits]
@@ -209,7 +209,7 @@ def _convert_result(self, result, project):
             # result is a list of scores (probabilities or binary 1/0)
             return np.array(result)
 
-    def _analyze_chunks(self, chunktexts, project):
+    def _suggest_chunks(self, chunktexts, project):
         results = []
         for chunktext in chunktexts:
             exampletext = self._inputs_to_exampletext(project, chunktext)
@@ -219,6 +219,7 @@ def _analyze_chunks(self, chunktexts, project):
             result = self._model.predict(example)
             results.append(self._convert_result(result, project))
         if not results:  # empty result
-            return ListAnalysisResult(hits=[], subject_index=project.subjects)
-        return VectorAnalysisResult(
+            return ListSuggestionResult(
+                hits=[], subject_index=project.subjects)
+        return VectorSuggestionResult(
             np.array(results).mean(axis=0), project.subjects)