From 7c4fad86dea1dd67a8ce79313486d580ba0e84a9 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Tue, 5 Jan 2021 13:45:06 +0200
Subject: [PATCH 01/57] Initial YAKE integration commit

---
 README.md                 |   2 +-
 annif/backend/__init__.py |   6 +
 annif/backend/yake.py     | 249 ++++++++++++++++++++++++++++++++++++++
 projects.cfg.dist         |   8 ++
 setup.py                  |   1 +
 5 files changed, 265 insertions(+), 1 deletion(-)
 create mode 100755 annif/backend/yake.py

diff --git a/README.md b/README.md
index 583c3a6b1..b115f7cbf 100644
--- a/README.md
+++ b/README.md
@@ -133,4 +133,4 @@ Zenodo DOI:
 
 The code in this repository is licensed under Apache License 2.0, except for the
 dependencies included under `annif/static/css` and `annif/static/js`,
-which have their own licenses. See the file headers for details.
+which have their own licenses. See the file headers for details. Using the optional Yake backend may change the licence of Annif to GPLv3, because [YAKE](https://github.com/LIAAD/yake) is licensed under GPLv3.
diff --git a/annif/backend/__init__.py b/annif/backend/__init__.py
index 056a2fa99..bea668138 100644
--- a/annif/backend/__init__.py
+++ b/annif/backend/__init__.py
@@ -60,3 +60,9 @@ def get_backend(backend_id):
     register_backend(omikuji.OmikujiBackend)
 except ImportError:
     annif.logger.debug("Omikuji not available, not enabling omikuji backend")
+
+try:
+    from . import yake
+    register_backend(yake.YakeBackend)
+except ImportError:
+    annif.logger.debug("YAKE not available, not enabling yake backend")
diff --git a/annif/backend/yake.py b/annif/backend/yake.py
new file mode 100755
index 000000000..654e77a22
--- /dev/null
+++ b/annif/backend/yake.py
@@ -0,0 +1,249 @@
+"""Annif backend using Yake keyword extraction"""
+# TODO Mention GPLv3 license also here?
+
+import yake
+import os.path
+import re
+from collections import defaultdict
+from rdflib.namespace import SKOS, RDF, OWL, URIRef
+import rdflib
+from nltk.corpus import stopwords
+from . import backend
+from annif.suggestion import SubjectSuggestion, ListSuggestionResult
+
+
+class YakeBackend(backend.AnnifBackend):
+    """Yake based backend for Annif"""
+    name = "yake"
+    needs_subject_index = False
+
+    # defaults for uninitialized instances
+    _index = None
+    _graph = None
+    INDEX_FILE = 'yake-index'
+
+    DEFAULT_PARAMETERS = {
+        'max_ngram_size': 3,
+        'deduplication_threshold': 0.9,
+        'deduplication_algo': 'levs',
+        'window_size': 1,
+        'num_keywords': 100,
+        'features': None,
+    }
+
+    def default_params(self):
+        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
+        params.update(self.DEFAULT_PARAMETERS)
+        return params
+
+    @property
+    def is_trained(self):
+        return True
+
+    def initialize(self):
+        self._initialize_index()
+        # self.graph
+        self._kw_extractor = yake.KeywordExtractor(
+            lan=self.project.language,
+            n=self.params['max_ngram_size'],
+            dedupLim=self.params['deduplication_threshold'],
+            dedupFunc=self.params['deduplication_algo'],
+            windowsSize=self.params['window_size'],
+            top=self.params['num_keywords'],
+            features=self.params['features'])
+
+    def _initialize_index(self):
+        if self._index is None:
+            path = os.path.join(self.datadir, self.INDEX_FILE)
+            if os.path.exists(path):
+                self.info('Loading index from {}'.format(path))
+                self._index = self._load_index(path)
+                self.info(f'Loaded index with {len(self._index)} labels')
+            else:
+                self.info('Creating index')
+                self._create_index()
+                self._save_index(path)
+                self.info(f'Created index with {len(self._index)} labels')
+
+    @property
+    def graph(self):
+        if self._graph is None:
+            self._graph = rdflib.Graph()
+            path = os.path.join(self.project.vocab.datadir, 'subjects.ttl')
+            self.info('Loading graph from {}'.format(path))
+            self._graph.load(path, format=rdflib.util.guess_format(path))
+        return self._graph
+
+    def _create_index(self):
+        # TODO Should index creation be done on loadvoc command?
+        # TODO American to British labels?
+        index = defaultdict(list)
+        for predicate in [SKOS.prefLabel]:  #, SKOS.altLabel, SKOS.hiddenLabel]:
+            for concept in self.graph.subjects(RDF.type, SKOS.Concept):
+                if (concept, OWL.deprecated, rdflib.Literal(True)) in self.graph:
+                    continue
+                for label in self.graph.objects(concept, predicate):
+                    if not label.language == self.project.language:
+                        continue
+                    uri = str(concept)
+                    label = str(label)
+                    # This really is useful: Disambiguate by dropping ambigious labels
+                    # if label[-1] == ')':
+                        # continue
+                    # label = re.sub(r' \(.*\)', '', label)  # Remove specifier
+                    lemmatized_label = self._lemmatize_phrase(label)
+                    lemmatized_label = self._sort_phrase(lemmatized_label)
+                    index[lemmatized_label].append(uri)
+        index.pop('', None)  # Remove possible empty string entry
+        self._index = dict(index)
+
+    def _save_index(self, path):
+        with open(path, 'w', encoding='utf-8') as indexfile:
+            for label, uris in self._index.items():
+                line = label + '\t' + ' '.join(uris)
+                print(line, file=indexfile)
+
+    def _load_index(self, path):
+        index = dict()
+        with open(path, 'r', encoding='utf-8') as indexfile:
+            for line in indexfile:
+                label, uris = line.strip().split('\t')
+                uris = uris.split()
+                index[label] = uris
+        return index
+
+    def _sort_phrase(self, phrase):
+        words = phrase.split()
+        return ' '.join(sorted(words))
+
+    def _lemmatize_phrase(self, phrase):
+        # if self.project.language == 'fi':
+            # lan_stopwords = set(stopwords.words('finnish'))
+        # elif self.project.language == 'en':
+            # stopwords = set(stopwords.words('english'))
+        normalized = []
+        # phrase = re.sub(r'\W+', '', phrase)
+        for word in phrase.split():
+            # if word in lan_stopwords:
+                # continue
+            normalized.append(
+                self.project.analyzer.normalize_word(word).lower())
+        return ' '.join(normalized)
+
+    def _sort_phrase(self, phrase):
+        words = phrase.split()
+        return ' '.join(sorted(words))
+
+    def _keyphrases2suggestions(self, keyphrases):
+        suggestions = []
+        not_matched = []
+        for kp, score in keyphrases:
+            uris = self._keyphrase2uris(kp)
+            for uri in uris:
+                # Its faster to get label from Annif subject index than from graph (but is even this needed?)
+                label = self.project.subjects.uris_to_labels([uri])[0]
+                suggestions.append(
+                    (uri, label, self._transform_score(score)))
+            if not uris:
+                not_matched.append((kp, self._transform_score(score)))
+        # Remove duplicate uris, combining the scores
+        suggestions = self._combine_suggestions(suggestions)
+        self.debug('Keyphrases not matched:\n' + '\t'.join(
+            [x[0] + ' ' + str(x[1]) for x
+             in sorted(not_matched, reverse=True, key=lambda x: x[1])]))
+        return suggestions
+
+    def _keyphrase2uris(self, keyphrase):
+        keyphrase = self._lemmatize_phrase(keyphrase)
+        keyphrase = self._sort_phrase(keyphrase)
+        uris = []
+        uris.extend(self._index.get(keyphrase, []))
+
+        # Maybe TODO: Search only in hidden labels if not found in pref or alt labels:
+        # if not uris:
+            # uris.extend(hidden_label_index.get(mutated_kp, []))
+
+        # Maybe TODO: if not found, search for part of keyword:
+        # if not uris and ' ' in keyphrase:
+            # words = keyphrase.split()
+            # uris.extend(self._index.get(' '.join(words[:-1]), []))
+            # uris.extend(self._index.get(' '.join(words[1:]), []))
+        return uris
+
+    def _transform_score(self, score):
+        return 1.0 / (3*score + 1)
+
+    def _combine_suggestions(self, suggestions):
+        combined_suggestions = {}
+        for uri, label, score in suggestions:
+            if uri not in combined_suggestions:
+                combined_suggestions[uri] = (label, score)
+            else:
+                old_score = combined_suggestions[uri][1]
+                conflated_score = self._conflate_scores(score, old_score)
+                combined_suggestions[uri] = (label, conflated_score)
+        combined_suggestions = [(uri, *label_score) for uri, label_score
+                                in combined_suggestions.items()]
+        return combined_suggestions
+
+    def _conflate_scores(self, score1, score2):
+        # https://stats.stackexchange.com/questions/194878/combining-two-probability-scores/194884
+        # return min(1, score1 + score2)
+        # return min(1.0, (score1**2 + score2**2)**0.5)
+        # score1 = 0.5 * score1 + 0.5
+        # score2 = 0.5 * score2 + 0.5
+        return score1 * score2 / (score1 * score2 + (1-score1) * (1-score2))
+
+    # def _get_node_degrees(self, suggestions):
+    #     connections = []
+    #     for uri, label, score in suggestions:
+    #         suggestion_neighbours = []
+    #         u = URIRef(uri)
+    #         suggestion_neighbours.extend(
+    #             [o for o in self.graph.objects(u, SKOS.broader)])
+    #         suggestion_neighbours.extend(
+    #             [o for o in self.graph.objects(u, SKOS.narrower)])
+    #         #suggestion_neighbours.extend([o for o in graph.objects(u, SKOS.related)])
+    #         connections.append((u, suggestion_neighbours))
+
+    #     node_degrees = []
+    #     for uri, label, score in suggestions:
+    #         u = URIRef(uri)
+    #         cnt = 0
+    #         for neighbour, suggestion_neighbours in connections:
+    #             if u == neighbour:
+    #                 # print('SELF')
+    #                 continue
+    #             if u in suggestion_neighbours:
+    #                 # print('HIT')
+    #                 cnt += 1
+    #         node_degrees.append(cnt)  # / len(suggestion_neighbours))
+    #     return node_degrees
+
+    # def _modify_scores(self, suggestions, node_degrees, scale):
+    #     modified_suggestions = []
+    #     for suggestion, node_degree in zip(suggestions, node_degrees):
+    #         modified_suggestions.append(
+    #             (suggestion[0], suggestion[1],
+    #              float(suggestion[2]) + scale * node_degree))
+    #     return modified_suggestions
+
+    def _suggest(self, text, params):
+        self.debug(
+            f'Suggesting subjects for text "{text[:20]}..." (len={len(text)})')
+        limit = int(params['limit'])
+
+        keywords = self._kw_extractor.extract_keywords(text)
+        suggestions = self._keyphrases2suggestions(keywords)
+
+        # node_degrees = self._get_node_degrees(suggestions)
+        # suggestions = self._modify_scores(suggestions, node_degrees, scale=0.01)
+
+        subject_suggestions = [SubjectSuggestion(
+                uri=uri,
+                label=label,
+                notation=None,  # TODO Should notation be fetched to here?
+                score=score)
+                for uri, label, score in suggestions[:limit] if score > 0.0]
+        return ListSuggestionResult.create_from_index(subject_suggestions,
+                                                      self.project.subjects)
diff --git a/projects.cfg.dist b/projects.cfg.dist
index 6baac2a6d..fa5625a2f 100644
--- a/projects.cfg.dist
+++ b/projects.cfg.dist
@@ -111,6 +111,14 @@ backend=omikuji
 analyzer=snowball(english)
 vocab=yso-en
 
+[yake-fi]
+name=YAKE Finnish
+language=fi
+backend=yake
+vocab=yso-fi
+analyzer=voikko(fi)
+input_limit=20000
+
 [ensemble-fi]
 name=Ensemble Finnish
 language=fi
diff --git a/setup.py b/setup.py
index 4f41bdcf5..730b736de 100644
--- a/setup.py
+++ b/setup.py
@@ -44,6 +44,7 @@ def read(fname):
         'vw': ['vowpalwabbit==8.8.1'],
         'nn': ['tensorflow-cpu==2.3.1', 'lmdb==1.0.0'],
         'omikuji': ['omikuji==0.3.*'],
+        'yake': ['yake @ git+https://github.com/LIAAD/yake@v0.4.3'],
         'dev': [
             'codecov',
             'pytest-cov',

From 45b18a668989f8795a483058ad8a6d547497fcb8 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Tue, 12 Jan 2021 19:50:51 +0200
Subject: [PATCH 02/57] Cleanup & pep8 fixes

---
 annif/backend/yake.py | 78 ++++---------------------------------------
 1 file changed, 6 insertions(+), 72 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 654e77a22..962677b77 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -3,11 +3,9 @@
 
 import yake
 import os.path
-import re
 from collections import defaultdict
 from rdflib.namespace import SKOS, RDF, OWL, URIRef
 import rdflib
-from nltk.corpus import stopwords
 from . import backend
 from annif.suggestion import SubjectSuggestion, ListSuggestionResult
 
@@ -42,7 +40,6 @@ def is_trained(self):
 
     def initialize(self):
         self._initialize_index()
-        # self.graph
         self._kw_extractor = yake.KeywordExtractor(
             lan=self.project.language,
             n=self.params['max_ngram_size'],
@@ -56,9 +53,9 @@ def _initialize_index(self):
         if self._index is None:
             path = os.path.join(self.datadir, self.INDEX_FILE)
             if os.path.exists(path):
-                self.info('Loading index from {}'.format(path))
                 self._index = self._load_index(path)
-                self.info(f'Loaded index with {len(self._index)} labels')
+                self.info(
+                    f'Loaded index from {path} with {len(self._index)} labels')
             else:
                 self.info('Creating index')
                 self._create_index()
@@ -68,6 +65,8 @@ def _initialize_index(self):
     @property
     def graph(self):
         if self._graph is None:
+            # TODO use as_graph() that is now available
+            # self._graph = vocab.as_graph()
             self._graph = rdflib.Graph()
             path = os.path.join(self.project.vocab.datadir, 'subjects.ttl')
             self.info('Loading graph from {}'.format(path))
@@ -80,17 +79,14 @@ def _create_index(self):
         index = defaultdict(list)
         for predicate in [SKOS.prefLabel]:  #, SKOS.altLabel, SKOS.hiddenLabel]:
             for concept in self.graph.subjects(RDF.type, SKOS.Concept):
-                if (concept, OWL.deprecated, rdflib.Literal(True)) in self.graph:
+                if (concept, OWL.deprecated, rdflib.Literal(True)) \
+                        in self.graph:
                     continue
                 for label in self.graph.objects(concept, predicate):
                     if not label.language == self.project.language:
                         continue
                     uri = str(concept)
                     label = str(label)
-                    # This really is useful: Disambiguate by dropping ambigious labels
-                    # if label[-1] == ')':
-                        # continue
-                    # label = re.sub(r' \(.*\)', '', label)  # Remove specifier
                     lemmatized_label = self._lemmatize_phrase(label)
                     lemmatized_label = self._sort_phrase(lemmatized_label)
                     index[lemmatized_label].append(uri)
@@ -117,30 +113,18 @@ def _sort_phrase(self, phrase):
         return ' '.join(sorted(words))
 
     def _lemmatize_phrase(self, phrase):
-        # if self.project.language == 'fi':
-            # lan_stopwords = set(stopwords.words('finnish'))
-        # elif self.project.language == 'en':
-            # stopwords = set(stopwords.words('english'))
         normalized = []
-        # phrase = re.sub(r'\W+', '', phrase)
         for word in phrase.split():
-            # if word in lan_stopwords:
-                # continue
             normalized.append(
                 self.project.analyzer.normalize_word(word).lower())
         return ' '.join(normalized)
 
-    def _sort_phrase(self, phrase):
-        words = phrase.split()
-        return ' '.join(sorted(words))
-
     def _keyphrases2suggestions(self, keyphrases):
         suggestions = []
         not_matched = []
         for kp, score in keyphrases:
             uris = self._keyphrase2uris(kp)
             for uri in uris:
-                # Its faster to get label from Annif subject index than from graph (but is even this needed?)
                 label = self.project.subjects.uris_to_labels([uri])[0]
                 suggestions.append(
                     (uri, label, self._transform_score(score)))
@@ -159,15 +143,6 @@ def _keyphrase2uris(self, keyphrase):
         uris = []
         uris.extend(self._index.get(keyphrase, []))
 
-        # Maybe TODO: Search only in hidden labels if not found in pref or alt labels:
-        # if not uris:
-            # uris.extend(hidden_label_index.get(mutated_kp, []))
-
-        # Maybe TODO: if not found, search for part of keyword:
-        # if not uris and ' ' in keyphrase:
-            # words = keyphrase.split()
-            # uris.extend(self._index.get(' '.join(words[:-1]), []))
-            # uris.extend(self._index.get(' '.join(words[1:]), []))
         return uris
 
     def _transform_score(self, score):
@@ -188,46 +163,8 @@ def _combine_suggestions(self, suggestions):
 
     def _conflate_scores(self, score1, score2):
         # https://stats.stackexchange.com/questions/194878/combining-two-probability-scores/194884
-        # return min(1, score1 + score2)
-        # return min(1.0, (score1**2 + score2**2)**0.5)
-        # score1 = 0.5 * score1 + 0.5
-        # score2 = 0.5 * score2 + 0.5
         return score1 * score2 / (score1 * score2 + (1-score1) * (1-score2))
 
-    # def _get_node_degrees(self, suggestions):
-    #     connections = []
-    #     for uri, label, score in suggestions:
-    #         suggestion_neighbours = []
-    #         u = URIRef(uri)
-    #         suggestion_neighbours.extend(
-    #             [o for o in self.graph.objects(u, SKOS.broader)])
-    #         suggestion_neighbours.extend(
-    #             [o for o in self.graph.objects(u, SKOS.narrower)])
-    #         #suggestion_neighbours.extend([o for o in graph.objects(u, SKOS.related)])
-    #         connections.append((u, suggestion_neighbours))
-
-    #     node_degrees = []
-    #     for uri, label, score in suggestions:
-    #         u = URIRef(uri)
-    #         cnt = 0
-    #         for neighbour, suggestion_neighbours in connections:
-    #             if u == neighbour:
-    #                 # print('SELF')
-    #                 continue
-    #             if u in suggestion_neighbours:
-    #                 # print('HIT')
-    #                 cnt += 1
-    #         node_degrees.append(cnt)  # / len(suggestion_neighbours))
-    #     return node_degrees
-
-    # def _modify_scores(self, suggestions, node_degrees, scale):
-    #     modified_suggestions = []
-    #     for suggestion, node_degree in zip(suggestions, node_degrees):
-    #         modified_suggestions.append(
-    #             (suggestion[0], suggestion[1],
-    #              float(suggestion[2]) + scale * node_degree))
-    #     return modified_suggestions
-
     def _suggest(self, text, params):
         self.debug(
             f'Suggesting subjects for text "{text[:20]}..." (len={len(text)})')
@@ -236,9 +173,6 @@ def _suggest(self, text, params):
         keywords = self._kw_extractor.extract_keywords(text)
         suggestions = self._keyphrases2suggestions(keywords)
 
-        # node_degrees = self._get_node_degrees(suggestions)
-        # suggestions = self._modify_scores(suggestions, node_degrees, scale=0.01)
-
         subject_suggestions = [SubjectSuggestion(
                 uri=uri,
                 label=label,

From 099553561d65878d9044beaeb3091bd6b5bce151 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Tue, 12 Jan 2021 19:55:08 +0200
Subject: [PATCH 03/57] Increase keyphrase word number to 4

---
 annif/backend/yake.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 962677b77..449c48368 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -21,7 +21,7 @@ class YakeBackend(backend.AnnifBackend):
     INDEX_FILE = 'yake-index'
 
     DEFAULT_PARAMETERS = {
-        'max_ngram_size': 3,
+        'max_ngram_size': 4,
         'deduplication_threshold': 0.9,
         'deduplication_algo': 'levs',
         'window_size': 1,

From 1c6e0e859da838b45e938fa9f3df1045b37fd158 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Tue, 12 Jan 2021 19:57:14 +0200
Subject: [PATCH 04/57] Use sets of uris instead of lists of uris in index

---
 annif/backend/yake.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 449c48368..0ec013603 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -76,8 +76,8 @@ def graph(self):
     def _create_index(self):
         # TODO Should index creation be done on loadvoc command?
         # TODO American to British labels?
-        index = defaultdict(list)
         for predicate in [SKOS.prefLabel]:  #, SKOS.altLabel, SKOS.hiddenLabel]:
+        index = defaultdict(set)
             for concept in self.graph.subjects(RDF.type, SKOS.Concept):
                 if (concept, OWL.deprecated, rdflib.Literal(True)) \
                         in self.graph:
@@ -89,7 +89,7 @@ def _create_index(self):
                     label = str(label)
                     lemmatized_label = self._lemmatize_phrase(label)
                     lemmatized_label = self._sort_phrase(lemmatized_label)
-                    index[lemmatized_label].append(uri)
+                    index[lemmatized_label].add(uri)
         index.pop('', None)  # Remove possible empty string entry
         self._index = dict(index)
 

From 4bf08dac5978c8f6e53a8f51d3e21b074c6af6db Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Tue, 12 Jan 2021 19:59:34 +0200
Subject: [PATCH 05/57] Put also alt and hidden labels in index instead of just
 prefs

---
 annif/backend/yake.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 0ec013603..17ab49487 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -76,8 +76,8 @@ def graph(self):
     def _create_index(self):
         # TODO Should index creation be done on loadvoc command?
         # TODO American to British labels?
-        for predicate in [SKOS.prefLabel]:  #, SKOS.altLabel, SKOS.hiddenLabel]:
         index = defaultdict(set)
+        for predicate in [SKOS.prefLabel, SKOS.altLabel, SKOS.hiddenLabel]:
             for concept in self.graph.subjects(RDF.type, SKOS.Concept):
                 if (concept, OWL.deprecated, rdflib.Literal(True)) \
                         in self.graph:

From 85eb8f7df35686ab96977e87f88f41349d3b1bc4 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Tue, 12 Jan 2021 20:01:48 +0200
Subject: [PATCH 06/57] More straightforward score transformation

---
 annif/backend/yake.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 17ab49487..ad262c4c5 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -146,7 +146,8 @@ def _keyphrase2uris(self, keyphrase):
         return uris
 
     def _transform_score(self, score):
-        return 1.0 / (3*score + 1)
+        # TODO if score<0:
+        return 1.0 / (score + 1)
 
     def _combine_suggestions(self, suggestions):
         combined_suggestions = {}

From 5a97da55a2de60a23eabe93645b831a8a56a9285 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Tue, 12 Jan 2021 20:08:05 +0200
Subject: [PATCH 07/57] Shorten & simplify code

---
 annif/backend/yake.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index ad262c4c5..20e7ec1ea 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -74,8 +74,8 @@ def graph(self):
         return self._graph
 
     def _create_index(self):
-        # TODO Should index creation be done on loadvoc command?
-        # TODO American to British labels?
+        # TODO Should index creation & saving be done on loadvoc command?
+        # Or saving at all? It takes about 1 min to create the index
         index = defaultdict(set)
         for predicate in [SKOS.prefLabel, SKOS.altLabel, SKOS.hiddenLabel]:
             for concept in self.graph.subjects(RDF.type, SKOS.Concept):
@@ -104,8 +104,7 @@ def _load_index(self, path):
         with open(path, 'r', encoding='utf-8') as indexfile:
             for line in indexfile:
                 label, uris = line.strip().split('\t')
-                uris = uris.split()
-                index[label] = uris
+                index[label] = uris.split()
         return index
 
     def _sort_phrase(self, phrase):
@@ -140,10 +139,7 @@ def _keyphrases2suggestions(self, keyphrases):
     def _keyphrase2uris(self, keyphrase):
         keyphrase = self._lemmatize_phrase(keyphrase)
         keyphrase = self._sort_phrase(keyphrase)
-        uris = []
-        uris.extend(self._index.get(keyphrase, []))
-
-        return uris
+        return self._index.get(keyphrase, [])
 
     def _transform_score(self, score):
         # TODO if score<0:

From 583272a84fa6859d72e1ef76edde2425f424a53c Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Tue, 12 Jan 2021 20:57:34 +0200
Subject: [PATCH 08/57] Remove unused import

---
 annif/backend/yake.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 20e7ec1ea..660323f54 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -4,7 +4,7 @@
 import yake
 import os.path
 from collections import defaultdict
-from rdflib.namespace import SKOS, RDF, OWL, URIRef
+from rdflib.namespace import SKOS, RDF, OWL
 import rdflib
 from . import backend
 from annif.suggestion import SubjectSuggestion, ListSuggestionResult

From db9d4b064e8781fe6c24f90ee004e9b53d3f0d80 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Tue, 26 Jan 2021 13:53:50 +0200
Subject: [PATCH 09/57] Load graph using as_graph method of the vocab module

---
 annif/backend/yake.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 660323f54..2e5ee516b 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -65,12 +65,8 @@ def _initialize_index(self):
     @property
     def graph(self):
         if self._graph is None:
-            # TODO use as_graph() that is now available
-            # self._graph = vocab.as_graph()
-            self._graph = rdflib.Graph()
-            path = os.path.join(self.project.vocab.datadir, 'subjects.ttl')
-            self.info('Loading graph from {}'.format(path))
-            self._graph.load(path, format=rdflib.util.guess_format(path))
+            self.info('Loading graph')
+            self._graph = self.project.vocab.as_graph()
         return self._graph
 
     def _create_index(self):

From b6979187b7c4ade05b03f6c8087029136604eb5e Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Tue, 26 Jan 2021 16:02:07 +0200
Subject: [PATCH 10/57] Configurable label types for index creation

---
 annif/backend/yake.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 2e5ee516b..79526b749 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -8,6 +8,7 @@
 import rdflib
 from . import backend
 from annif.suggestion import SubjectSuggestion, ListSuggestionResult
+from annif.exception import ConfigurationException
 
 
 class YakeBackend(backend.AnnifBackend):
@@ -27,6 +28,7 @@ class YakeBackend(backend.AnnifBackend):
         'window_size': 1,
         'num_keywords': 100,
         'features': None,
+        'default_label_types': ['pref', 'alt']
     }
 
     def default_params(self):
@@ -38,6 +40,21 @@ def default_params(self):
     def is_trained(self):
         return True
 
+    @property
+    def label_types(self):
+        mapping = {'pref': SKOS.prefLabel,
+                   'alt': SKOS.altLabel,
+                   'hidden': SKOS.hiddenLabel}
+        if 'label_types' in self.params:
+            lt_entries = self.params['label_types'].split(',')
+            try:
+                return [mapping[lt.strip()] for lt in lt_entries]
+            except KeyError as err:
+                raise ConfigurationException(
+                    f'invalid label type {err}', backend_id=self.backend_id)
+        else:
+            return [mapping[lt] for lt in self.params['default_label_types']]
+
     def initialize(self):
         self._initialize_index()
         self._kw_extractor = yake.KeywordExtractor(
@@ -73,12 +90,12 @@ def _create_index(self):
         # TODO Should index creation & saving be done on loadvoc command?
         # Or saving at all? It takes about 1 min to create the index
         index = defaultdict(set)
-        for predicate in [SKOS.prefLabel, SKOS.altLabel, SKOS.hiddenLabel]:
+        for label_type in self.label_types:
             for concept in self.graph.subjects(RDF.type, SKOS.Concept):
                 if (concept, OWL.deprecated, rdflib.Literal(True)) \
                         in self.graph:
                     continue
-                for label in self.graph.objects(concept, predicate):
+                for label in self.graph.objects(concept, label_type):
                     if not label.language == self.project.language:
                         continue
                     uri = str(concept)

From 630af0476490573562ec020d2ebf280cb3dce6c9 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Tue, 26 Jan 2021 17:18:27 +0200
Subject: [PATCH 11/57] Don't unnecessarily pass a label to SubjectSuggestion

---
 annif/backend/yake.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 79526b749..79a77f49e 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -185,8 +185,8 @@ def _suggest(self, text, params):
 
         subject_suggestions = [SubjectSuggestion(
                 uri=uri,
-                label=label,
-                notation=None,  # TODO Should notation be fetched to here?
+                label=None,
+                notation=None,
                 score=score)
                 for uri, label, score in suggestions[:limit] if score > 0.0]
         return ListSuggestionResult.create_from_index(subject_suggestions,

From 44611e90769ffc384e3796d9c9f3aad08f5f9e67 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Thu, 28 Jan 2021 16:26:52 +0200
Subject: [PATCH 12/57] Replace negative Yake scores with zero

---
 annif/backend/yake.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 79a77f49e..201bbbe57 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -155,7 +155,9 @@ def _keyphrase2uris(self, keyphrase):
         return self._index.get(keyphrase, [])
 
     def _transform_score(self, score):
-        # TODO if score<0:
+        if score < 0:
+            self.debug(f'Replacing negative YAKE score {score} with zero')
+            return 1.0
         return 1.0 / (score + 1)
 
     def _combine_suggestions(self, suggestions):

From 4169945c07b25af801d89f99221bf1b5dd2d7354 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Thu, 28 Jan 2021 17:34:18 +0200
Subject: [PATCH 13/57] Omit processing vocabulary label (it's fetched when
 creating ListSuggResult)

---
 annif/backend/yake.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 201bbbe57..9951f3509 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -137,9 +137,8 @@ def _keyphrases2suggestions(self, keyphrases):
         for kp, score in keyphrases:
             uris = self._keyphrase2uris(kp)
             for uri in uris:
-                label = self.project.subjects.uris_to_labels([uri])[0]
                 suggestions.append(
-                    (uri, label, self._transform_score(score)))
+                    (uri, self._transform_score(score)))
             if not uris:
                 not_matched.append((kp, self._transform_score(score)))
         # Remove duplicate uris, combining the scores
@@ -162,16 +161,14 @@ def _transform_score(self, score):
 
     def _combine_suggestions(self, suggestions):
         combined_suggestions = {}
-        for uri, label, score in suggestions:
+        for uri, score in suggestions:
             if uri not in combined_suggestions:
-                combined_suggestions[uri] = (label, score)
+                combined_suggestions[uri] = score
             else:
-                old_score = combined_suggestions[uri][1]
-                conflated_score = self._conflate_scores(score, old_score)
-                combined_suggestions[uri] = (label, conflated_score)
-        combined_suggestions = [(uri, *label_score) for uri, label_score
-                                in combined_suggestions.items()]
-        return combined_suggestions
+                old_score = combined_suggestions[uri]
+                combined_suggestions[uri] = self._conflate_scores(
+                    score, old_score)
+        return list(combined_suggestions.items())
 
     def _conflate_scores(self, score1, score2):
         # https://stats.stackexchange.com/questions/194878/combining-two-probability-scores/194884
@@ -190,6 +187,6 @@ def _suggest(self, text, params):
                 label=None,
                 notation=None,
                 score=score)
-                for uri, label, score in suggestions[:limit] if score > 0.0]
+                for uri, score in suggestions[:limit] if score > 0.0]
         return ListSuggestionResult.create_from_index(subject_suggestions,
                                                       self.project.subjects)

From ea8d0059fead44ff10374c75fe885c42cc41e528 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Thu, 28 Jan 2021 17:44:33 +0200
Subject: [PATCH 14/57] Improve variable names; remove unnecessary comments

---
 annif/backend/yake.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 9951f3509..40fef2e79 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -141,11 +141,11 @@ def _keyphrases2suggestions(self, keyphrases):
                     (uri, self._transform_score(score)))
             if not uris:
                 not_matched.append((kp, self._transform_score(score)))
-        # Remove duplicate uris, combining the scores
+        # Remove duplicate uris, conflating the scores
         suggestions = self._combine_suggestions(suggestions)
         self.debug('Keyphrases not matched:\n' + '\t'.join(
-            [x[0] + ' ' + str(x[1]) for x
-             in sorted(not_matched, reverse=True, key=lambda x: x[1])]))
+            [kp[0] + ' ' + str(kp[1]) for kp
+             in sorted(not_matched, reverse=True, key=lambda kp: kp[1])]))
         return suggestions
 
     def _keyphrase2uris(self, keyphrase):
@@ -171,7 +171,6 @@ def _combine_suggestions(self, suggestions):
         return list(combined_suggestions.items())
 
     def _conflate_scores(self, score1, score2):
-        # https://stats.stackexchange.com/questions/194878/combining-two-probability-scores/194884
         return score1 * score2 / (score1 * score2 + (1-score1) * (1-score2))
 
     def _suggest(self, text, params):
@@ -179,8 +178,8 @@ def _suggest(self, text, params):
             f'Suggesting subjects for text "{text[:20]}..." (len={len(text)})')
         limit = int(params['limit'])
 
-        keywords = self._kw_extractor.extract_keywords(text)
-        suggestions = self._keyphrases2suggestions(keywords)
+        keyphrases = self._kw_extractor.extract_keywords(text)
+        suggestions = self._keyphrases2suggestions(keyphrases)
 
         subject_suggestions = [SubjectSuggestion(
                 uri=uri,

From 3d0afb90e0edbf32b42d4cb8035c2234be3b95b4 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Fri, 29 Jan 2021 17:26:28 +0200
Subject: [PATCH 15/57] Get language for label picking from params (to ease
 testing)

---
 annif/backend/yake.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 40fef2e79..305890ad3 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -58,7 +58,7 @@ def label_types(self):
     def initialize(self):
         self._initialize_index()
         self._kw_extractor = yake.KeywordExtractor(
-            lan=self.project.language,
+            lan=self.params['language'],
             n=self.params['max_ngram_size'],
             dedupLim=self.params['deduplication_threshold'],
             dedupFunc=self.params['deduplication_algo'],
@@ -96,7 +96,7 @@ def _create_index(self):
                         in self.graph:
                     continue
                 for label in self.graph.objects(concept, label_type):
-                    if not label.language == self.project.language:
+                    if not label.language == self.params['language']:
                         continue
                     uri = str(concept)
                     label = str(label)

From f9354a0afadda5aafb23c474b7025a2b00e2b28b Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Fri, 29 Jan 2021 17:27:01 +0200
Subject: [PATCH 16/57] Config switch for removing a specifier in parentheses
 from index labels

---
 annif/backend/yake.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 305890ad3..2edd67747 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -3,9 +3,11 @@
 
 import yake
 import os.path
+import re
 from collections import defaultdict
 from rdflib.namespace import SKOS, RDF, OWL
 import rdflib
+import annif.util
 from . import backend
 from annif.suggestion import SubjectSuggestion, ListSuggestionResult
 from annif.exception import ConfigurationException
@@ -28,7 +30,8 @@ class YakeBackend(backend.AnnifBackend):
         'window_size': 1,
         'num_keywords': 100,
         'features': None,
-        'default_label_types': ['pref', 'alt']
+        'default_label_types': ['pref', 'alt'],
+        'remove_specifiers': False
     }
 
     def default_params(self):
@@ -100,6 +103,8 @@ def _create_index(self):
                         continue
                     uri = str(concept)
                     label = str(label)
+                    if annif.util.boolean(self.params['remove_specifiers']):
+                        label = re.sub(r' \(.*\)', '', label)
                     lemmatized_label = self._lemmatize_phrase(label)
                     lemmatized_label = self._sort_phrase(lemmatized_label)
                     index[lemmatized_label].add(uri)

From 7116aa00d130007ecd7cbfe7e669de08bcce77da Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Fri, 29 Jan 2021 17:36:53 +0200
Subject: [PATCH 17/57] Test for suggest method of Yake

---
 tests/test_backend_yake.py | 41 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100755 tests/test_backend_yake.py

diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py
new file mode 100755
index 000000000..372c8485f
--- /dev/null
+++ b/tests/test_backend_yake.py
@@ -0,0 +1,41 @@
+"""Unit tests for the Yake backend in Annif"""
+
+import annif
+import pytest
+import os
+from rdflib import Graph
+
+
+@pytest.fixture(scope='module')
+def graph_project(project):
+    _rdf_file_path = os.path.join(
+        os.path.dirname(__file__),
+        'corpora',
+        'archaeology',
+        'yso-archaeology.rdf')
+    g = Graph()
+    g.load(_rdf_file_path)
+    project.vocab.as_graph.return_value = g
+    return project
+
+
+def test_yake_suggest(project, graph_project):
+    yake_type = annif.backend.get_backend('yake')
+    yake = yake_type(
+        backend_id='yake',
+        config_params={'limit': 8, 'language': 'fi'},
+        project=graph_project)
+
+    results = yake.suggest("""Arkeologia on tieteenala, jota sanotaan joskus
+        muinaistutkimukseksi tai muinaistieteeksi. Se on humanistinen tiede
+        tai oikeammin joukko tieteitä, jotka tutkivat ihmisen menneisyyttä.
+        Tutkimusta tehdään analysoimalla muinaisjäännöksiä eli niitä jälkiä,
+        joita ihmisten toiminta on jättänyt maaperään tai vesistöjen
+        pohjaan.""")
+
+    assert len(results) > 0
+    assert len(results) <= 8
+    hits = results.as_list(project.subjects)
+    assert 'http://www.yso.fi/onto/yso/p1265' in [
+        result.uri for result in hits]
+    assert 'arkeologia' in [result.label for result in hits]

From 154ae55142bbb5076cbf293f9b50b4bcadb65135 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Fri, 29 Jan 2021 18:03:45 +0200
Subject: [PATCH 18/57] Skip Yake tests when Yake is not installed (Python 3.7
 in Travis)

---
 tests/test_backend_yake.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py
index 372c8485f..455b37c54 100755
--- a/tests/test_backend_yake.py
+++ b/tests/test_backend_yake.py
@@ -5,6 +5,8 @@
 import os
 from rdflib import Graph
 
+pytest.importorskip("annif.backend.yake")
+
 
 @pytest.fixture(scope='module')
 def graph_project(project):

From 9ce122f0ca855f4928e7e0436474bf32610ddee6 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Mon, 1 Feb 2021 10:04:03 +0200
Subject: [PATCH 19/57] Try to simplify _create_index method; reorder methods

---
 annif/backend/yake.py | 98 +++++++++++++++++++++----------------------
 1 file changed, 49 insertions(+), 49 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 2edd67747..265f5a5ce 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -58,6 +58,13 @@ def label_types(self):
         else:
             return [mapping[lt] for lt in self.params['default_label_types']]
 
+    @property
+    def graph(self):
+        if self._graph is None:
+            self.info('Loading graph')
+            self._graph = self.project.vocab.as_graph()
+        return self._graph
+
     def initialize(self):
         self._initialize_index()
         self._kw_extractor = yake.KeywordExtractor(
@@ -82,35 +89,6 @@ def _initialize_index(self):
                 self._save_index(path)
                 self.info(f'Created index with {len(self._index)} labels')
 
-    @property
-    def graph(self):
-        if self._graph is None:
-            self.info('Loading graph')
-            self._graph = self.project.vocab.as_graph()
-        return self._graph
-
-    def _create_index(self):
-        # TODO Should index creation & saving be done on loadvoc command?
-        # Or saving at all? It takes about 1 min to create the index
-        index = defaultdict(set)
-        for label_type in self.label_types:
-            for concept in self.graph.subjects(RDF.type, SKOS.Concept):
-                if (concept, OWL.deprecated, rdflib.Literal(True)) \
-                        in self.graph:
-                    continue
-                for label in self.graph.objects(concept, label_type):
-                    if not label.language == self.params['language']:
-                        continue
-                    uri = str(concept)
-                    label = str(label)
-                    if annif.util.boolean(self.params['remove_specifiers']):
-                        label = re.sub(r' \(.*\)', '', label)
-                    lemmatized_label = self._lemmatize_phrase(label)
-                    lemmatized_label = self._sort_phrase(lemmatized_label)
-                    index[lemmatized_label].add(uri)
-        index.pop('', None)  # Remove possible empty string entry
-        self._index = dict(index)
-
     def _save_index(self, path):
         with open(path, 'w', encoding='utf-8') as indexfile:
             for label, uris in self._index.items():
@@ -125,9 +103,27 @@ def _load_index(self, path):
                 index[label] = uris.split()
         return index
 
-    def _sort_phrase(self, phrase):
-        words = phrase.split()
-        return ' '.join(sorted(words))
+    def _create_index(self):
+        index = defaultdict(set)
+        for concept in self.graph.subjects(RDF.type, SKOS.Concept):
+            if (concept, OWL.deprecated, rdflib.Literal(True)) in self.graph:
+                continue
+            uri = str(concept)
+            for label_type in self.label_types:
+                for label in self.graph.objects(concept, label_type):
+                    if not label.language == self.params['language']:
+                        continue
+                    label = self._normalize_label(label)
+                    index[label].add(uri)
+        index.pop('', None)  # Remove possible empty string entry
+        self._index = dict(index)
+
+    def _normalize_label(self, label):
+        label = str(label)
+        if annif.util.boolean(self.params['remove_specifiers']):
+            label = re.sub(r' \(.*\)', '', label)
+        lemmatized_label = self._lemmatize_phrase(label)
+        return self._sort_phrase(lemmatized_label)
 
     def _lemmatize_phrase(self, phrase):
         normalized = []
@@ -136,6 +132,27 @@ def _lemmatize_phrase(self, phrase):
                 self.project.analyzer.normalize_word(word).lower())
         return ' '.join(normalized)
 
+    def _sort_phrase(self, phrase):
+        words = phrase.split()
+        return ' '.join(sorted(words))
+
+    def _suggest(self, text, params):
+        self.debug(
+            f'Suggesting subjects for text "{text[:20]}..." (len={len(text)})')
+        limit = int(params['limit'])
+
+        keyphrases = self._kw_extractor.extract_keywords(text)
+        suggestions = self._keyphrases2suggestions(keyphrases)
+
+        subject_suggestions = [SubjectSuggestion(
+                uri=uri,
+                label=None,
+                notation=None,
+                score=score)
+                for uri, score in suggestions[:limit] if score > 0.0]
+        return ListSuggestionResult.create_from_index(subject_suggestions,
+                                                      self.project.subjects)
+
     def _keyphrases2suggestions(self, keyphrases):
         suggestions = []
         not_matched = []
@@ -177,20 +194,3 @@ def _combine_suggestions(self, suggestions):
 
     def _conflate_scores(self, score1, score2):
         return score1 * score2 / (score1 * score2 + (1-score1) * (1-score2))
-
-    def _suggest(self, text, params):
-        self.debug(
-            f'Suggesting subjects for text "{text[:20]}..." (len={len(text)})')
-        limit = int(params['limit'])
-
-        keyphrases = self._kw_extractor.extract_keywords(text)
-        suggestions = self._keyphrases2suggestions(keyphrases)
-
-        subject_suggestions = [SubjectSuggestion(
-                uri=uri,
-                label=None,
-                notation=None,
-                score=score)
-                for uri, score in suggestions[:limit] if score > 0.0]
-        return ListSuggestionResult.create_from_index(subject_suggestions,
-                                                      self.project.subjects)

From 36514889980591fa348255143fc6f09f452a597c Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Wed, 3 Feb 2021 19:19:00 +0200
Subject: [PATCH 20/57] Better name for option (remove_parentheses)

---
 annif/backend/yake.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 265f5a5ce..7ceb57fb9 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -31,7 +31,7 @@ class YakeBackend(backend.AnnifBackend):
         'num_keywords': 100,
         'features': None,
         'default_label_types': ['pref', 'alt'],
-        'remove_specifiers': False
+        'remove_parentheses': False
     }
 
     def default_params(self):
@@ -120,7 +120,7 @@ def _create_index(self):
 
     def _normalize_label(self, label):
         label = str(label)
-        if annif.util.boolean(self.params['remove_specifiers']):
+        if annif.util.boolean(self.params['remove_parentheses']):
             label = re.sub(r' \(.*\)', '', label)
         lemmatized_label = self._lemmatize_phrase(label)
         return self._sort_phrase(lemmatized_label)

From 4b5d73031102064e0b461a59747a43421caa8cb0 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Fri, 5 Feb 2021 11:40:32 +0200
Subject: [PATCH 21/57] Install Yake from PyPI, not from GitHub

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 730b736de..dc7b9d622 100644
--- a/setup.py
+++ b/setup.py
@@ -44,7 +44,7 @@ def read(fname):
         'vw': ['vowpalwabbit==8.8.1'],
         'nn': ['tensorflow-cpu==2.3.1', 'lmdb==1.0.0'],
         'omikuji': ['omikuji==0.3.*'],
-        'yake': ['yake @ git+https://github.com/LIAAD/yake@v0.4.3'],
+        'yake': ['yake==0.4.3'],
         'dev': [
             'codecov',
             'pytest-cov',

From b3578ef0e2bd9db309a31abcbcf18014d61c879f Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Fri, 5 Feb 2021 11:43:50 +0200
Subject: [PATCH 22/57] Get labels of a concept using a helper method in index
 creation

---
 annif/backend/yake.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 7ceb57fb9..dd47411cc 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -109,14 +109,19 @@ def _create_index(self):
             if (concept, OWL.deprecated, rdflib.Literal(True)) in self.graph:
                 continue
             uri = str(concept)
-            for label_type in self.label_types:
-                for label in self.graph.objects(concept, label_type):
-                    if not label.language == self.params['language']:
-                        continue
-                    label = self._normalize_label(label)
-                    index[label].add(uri)
+            labels = self._get_concept_labels(concept, self.label_types)
+            for label in labels:
+                label = self._normalize_label(label)
+                index[label].add(uri)
         index.pop('', None)  # Remove possible empty string entry
         self._index = dict(index)
+    def _get_concept_labels(self, concept, label_types):
+        labels = []
+        for label_type in label_types:
+            for label in self.graph.objects(concept, label_type):
+                if label.language == self.params['language']:
+                    labels.append(label)
+        return labels
 
     def _normalize_label(self, label):
         label = str(label)

From 64e015adeba31021493a6fb28b9d59712d9c64c2 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Fri, 5 Feb 2021 11:46:02 +0200
Subject: [PATCH 23/57] Return index from create_index() instead of setting the
 index field in the method

---
 annif/backend/yake.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index dd47411cc..606ef3533 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -85,7 +85,7 @@ def _initialize_index(self):
                     f'Loaded index from {path} with {len(self._index)} labels')
             else:
                 self.info('Creating index')
-                self._create_index()
+                self._index = self._create_index()
                 self._save_index(path)
                 self.info(f'Created index with {len(self._index)} labels')
 
@@ -114,7 +114,8 @@ def _create_index(self):
                 label = self._normalize_label(label)
                 index[label].add(uri)
         index.pop('', None)  # Remove possible empty string entry
-        self._index = dict(index)
+        return dict(index)
+
     def _get_concept_labels(self, concept, label_types):
         labels = []
         for label_type in label_types:

From 77410974e5e218235ee4e5672997cbd0c8159c7f Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Fri, 5 Feb 2021 13:04:46 +0200
Subject: [PATCH 24/57] Combine scores using "additive conflation"

---
 annif/backend/yake.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 606ef3533..856a52b4b 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -194,9 +194,13 @@ def _combine_suggestions(self, suggestions):
                 combined_suggestions[uri] = score
             else:
                 old_score = combined_suggestions[uri]
-                combined_suggestions[uri] = self._conflate_scores(
+                combined_suggestions[uri] = self._combine_scores(
                     score, old_score)
         return list(combined_suggestions.items())
 
-    def _conflate_scores(self, score1, score2):
-        return score1 * score2 / (score1 * score2 + (1-score1) * (1-score2))
+    def _combine_scores(self, score1, score2):
+        # The result is never smaller than the greater input
+        score1 = score1/2 + 0.5
+        score2 = score2/2 + 0.5
+        confl = score1 * score2 / (score1 * score2 + (1-score1) * (1-score2))
+        return (confl-0.5) * 2

From 5bde900f236a385dc2cb54ca4046902c8082976b Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Fri, 5 Feb 2021 13:13:37 +0200
Subject: [PATCH 25/57] Create Yake object on suggest (allows setting Yake
 params at runtime)

---
 annif/backend/yake.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 856a52b4b..4d6ce74ae 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -67,14 +67,6 @@ def graph(self):
 
     def initialize(self):
         self._initialize_index()
-        self._kw_extractor = yake.KeywordExtractor(
-            lan=self.params['language'],
-            n=self.params['max_ngram_size'],
-            dedupLim=self.params['deduplication_threshold'],
-            dedupFunc=self.params['deduplication_algo'],
-            windowsSize=self.params['window_size'],
-            top=self.params['num_keywords'],
-            features=self.params['features'])
 
     def _initialize_index(self):
         if self._index is None:
@@ -147,6 +139,14 @@ def _suggest(self, text, params):
             f'Suggesting subjects for text "{text[:20]}..." (len={len(text)})')
         limit = int(params['limit'])
 
+        self._kw_extractor = yake.KeywordExtractor(
+            lan=params['language'],
+            n=int(params['max_ngram_size']),
+            dedupLim=float(params['deduplication_threshold']),
+            dedupFunc=params['deduplication_algo'],
+            windowsSize=int(params['window_size']),
+            top=int(params['num_keywords']),
+            features=self.params['features'])
         keyphrases = self._kw_extractor.extract_keywords(text)
         suggestions = self._keyphrases2suggestions(keyphrases)
 

From b2c08cfcfcda1ea4206c72ccff59c54f075583b0 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Fri, 5 Feb 2021 14:26:55 +0200
Subject: [PATCH 26/57] Avoid crash for empty or non-alphanumeric input

---
 annif/backend/yake.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 4d6ce74ae..88130beb1 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -139,6 +139,10 @@ def _suggest(self, text, params):
             f'Suggesting subjects for text "{text[:20]}..." (len={len(text)})')
         limit = int(params['limit'])
 
+        alphanum = re.compile('[^a-zA-Z0-9]')
+        if len(re.sub(alphanum, '', text)) == 0:
+            return ListSuggestionResult([])
+
         self._kw_extractor = yake.KeywordExtractor(
             lan=params['language'],
             n=int(params['max_ngram_size']),

From daee74b2cbdf7ada55bb7798807da11e8a373ae0 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Fri, 5 Feb 2021 14:27:19 +0200
Subject: [PATCH 27/57] Add unit tests

---
 tests/test_backend_yake.py | 108 +++++++++++++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)

diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py
index 455b37c54..aa7e22dba 100755
--- a/tests/test_backend_yake.py
+++ b/tests/test_backend_yake.py
@@ -4,6 +4,8 @@
 import pytest
 import os
 from rdflib import Graph
+import annif.backend
+
 
 pytest.importorskip("annif.backend.yake")
 
@@ -41,3 +43,109 @@ def test_yake_suggest(project, graph_project):
     assert 'http://www.yso.fi/onto/yso/p1265' in [
         result.uri for result in hits]
     assert 'arkeologia' in [result.label for result in hits]
+
+
+def test_yake_suggest_non_alphanum_text(project, graph_project):
+    yake_type = annif.backend.get_backend('yake')
+    yake = yake_type(
+        backend_id='yake',
+        config_params={'limit': 8, 'language': 'fi'},
+        project=graph_project)
+
+    results = yake.suggest(".,!")
+    assert len(results) == 0
+
+
+def test_create_index_preflabels(graph_project):
+    yake_type = annif.backend.get_backend('yake')
+    yake = yake_type(
+        backend_id='yake',
+        config_params={'limit': 8, 'language': 'fi', 'label_types': 'pref'},
+        project=graph_project)
+    index = yake._create_index()
+    # Some of the 130 prefLabels get merged in lemmatization:
+    # assyriologit, assyriologia (assyriolog); arkeologit, arkeologia
+    # (arkeolog); egyptologit, egyptologia (egyptolog)
+    assert len(index) == 127
+    assert 'kalliotaid' in index
+    assert 'luolamaalauks' not in index
+
+
+def test_create_index_altlabels(graph_project):
+    yake_type = annif.backend.get_backend('yake')
+    yake = yake_type(
+        backend_id='yake',
+        config_params={'limit': 8, 'language': 'fi', 'label_types': 'alt'},
+        project=graph_project)
+    index = yake._create_index()
+    assert len(index) == 34
+    assert 'kalliotaid' not in index
+    assert 'luolamaalauks' in index
+
+
+def test_create_index_pref_and_altlabels(graph_project):
+    yake_type = annif.backend.get_backend('yake')
+    yake = yake_type(
+        backend_id='yake',
+        config_params={'limit': 8, 'language': 'fi'},
+        project=graph_project)
+    index = yake._create_index()
+    assert len(index) == 161
+    assert 'kalliotaid' in index
+    assert 'luolamaalauks' in index
+
+
+def test_create_index_label_languages(graph_project):
+    yake_type = annif.backend.get_backend('yake')
+    yake = yake_type(
+        backend_id='yake',
+        config_params={'limit': 8, 'language': 'sv', 'label_types': 'pref'},
+        project=graph_project)
+    index = yake._create_index()
+    assert len(index) == 130
+    assert 'kalliotaid' not in index
+    assert 'bergkonst' in index
+    assert 'rock art' not in index
+
+
+def test_combine_suggestions_different_uris(project):
+    yake_type = annif.backend.get_backend('yake')
+    yake = yake_type(
+        backend_id='yake',
+        config_params={'limit': 8, 'language': 'fi'},
+        project=project)
+
+    suggestions = [('http://www.yso.fi/onto/yso/p1265', 0.75),
+                   ('http://www.yso.fi/onto/yso/p1266', 0.25)]
+    combined = yake._combine_suggestions(suggestions)
+    assert len(combined) == 2
+    assert combined[0] == suggestions[0]
+    assert combined[1] == suggestions[1]
+
+
+def test_combine_suggestions_same_uri(project):
+    yake_type = annif.backend.get_backend('yake')
+    yake = yake_type(
+        backend_id='yake',
+        config_params={'limit': 8, 'language': 'fi'},
+        project=project)
+
+    combined = yake._combine_suggestions(
+        [('http://www.yso.fi/onto/yso/p1265', 0.42),
+         ('http://www.yso.fi/onto/yso/p1265', 0.42)])
+    assert len(combined) == 1
+
+
+def test_combine_scores(project):
+    yake_type = annif.backend.get_backend('yake')
+    yake = yake_type(
+        backend_id='yake',
+        config_params={'limit': 8, 'language': 'fi'},
+        project=project)
+
+    assert yake._combine_scores(0.5, 0.5) == 0.8
+    assert yake._combine_scores(0.75, 0.75) == 0.96
+    assert yake._combine_scores(1.0, 0.424242) == 1.0
+    assert yake._combine_scores(1.0, 0.0) == 1.0
+    assert yake._combine_scores(0.4, 0.3) == 0.625
+    assert yake._combine_scores(0.4, 0.5) == 0.75

From c19e8b76da59cbd61fefc495b6e9ff7538a1d472 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Thu, 11 Feb 2021 10:17:26 +0200
Subject: [PATCH 28/57] Make graph_project a common pytest fixture (move it to
 conftest.py)

---
 tests/conftest.py            | 14 ++++++++++++++
 tests/test_backend_stwfsa.py | 19 -------------------
 tests/test_backend_yake.py   | 15 ---------------
 3 files changed, 14 insertions(+), 34 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 2e3949f47..80a8bfc3b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -5,6 +5,7 @@
 import pytest
 import py.path
 import unittest.mock
+from rdflib import Graph
 import annif
 import annif.analyzer
 import annif.corpus
@@ -116,6 +117,19 @@ def project(subject_index, datadir, registry, vocabulary):
     return proj
 
 
+@pytest.fixture(scope='module')
+def graph_project(project):
+    _rdf_file_path = os.path.join(
+        os.path.dirname(__file__),
+        'corpora',
+        'archaeology',
+        'yso-archaeology.rdf')
+    g = Graph()
+    g.load(_rdf_file_path)
+    project.vocab.as_graph.return_value = g
+    return project
+
+
 @pytest.fixture(scope='module')
 def app_project(app):
     with app.app_context():
diff --git a/tests/test_backend_stwfsa.py b/tests/test_backend_stwfsa.py
index e27f11c6b..858b2371b 100644
--- a/tests/test_backend_stwfsa.py
+++ b/tests/test_backend_stwfsa.py
@@ -1,27 +1,8 @@
-import os
 from annif.backend import get_backend
-from rdflib import Graph
 import annif.corpus
 from annif.backend.stwfsa import StwfsaBackend
 from annif.exception import NotInitializedException, NotSupportedException
-
 import pytest
-from unittest.mock import Mock
-
-
-@pytest.fixture
-def graph_project(project):
-    _rdf_file_path = os.path.join(
-        os.path.dirname(__file__),
-        'corpora',
-        'archaeology',
-        'yso-archaeology.rdf')
-    g = Graph()
-    g.load(_rdf_file_path)
-    mock_vocab = Mock()
-    mock_vocab.as_graph.return_value = g
-    project.vocab = mock_vocab
-    return project
 
 
 _backend_conf = {
diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py
index aa7e22dba..2a79c386e 100755
--- a/tests/test_backend_yake.py
+++ b/tests/test_backend_yake.py
@@ -2,27 +2,12 @@
 
 import annif
 import pytest
-import os
-from rdflib import Graph
 import annif.backend
 
 
 pytest.importorskip("annif.backend.yake")
 
 
-@pytest.fixture(scope='module')
-def graph_project(project):
-    _rdf_file_path = os.path.join(
-        os.path.dirname(__file__),
-        'corpora',
-        'archaeology',
-        'yso-archaeology.rdf')
-    g = Graph()
-    g.load(_rdf_file_path)
-    project.vocab.as_graph.return_value = g
-    return project
-
-
 def test_yake_suggest(project, graph_project):
     yake_type = annif.backend.get_backend('yake')
     yake = yake_type(

From af1129e13cdd4c7112e811641f60a6bb5962fef6 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Thu, 11 Feb 2021 16:14:25 +0200
Subject: [PATCH 29/57] Avoid need for clumsy mapping for labeltypes by using
 directly SKOS names

---
 annif/backend/yake.py      | 22 ++++++++++------------
 tests/test_backend_yake.py |  6 +++---
 2 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 88130beb1..fba3c8863 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -30,7 +30,7 @@ class YakeBackend(backend.AnnifBackend):
         'window_size': 1,
         'num_keywords': 100,
         'features': None,
-        'default_label_types': ['pref', 'alt'],
+        'default_label_types': ['prefLabel', 'altLabel'],
         'remove_parentheses': False
     }
 
@@ -45,18 +45,16 @@ def is_trained(self):
 
     @property
     def label_types(self):
-        mapping = {'pref': SKOS.prefLabel,
-                   'alt': SKOS.altLabel,
-                   'hidden': SKOS.hiddenLabel}
         if 'label_types' in self.params:
-            lt_entries = self.params['label_types'].split(',')
-            try:
-                return [mapping[lt.strip()] for lt in lt_entries]
-            except KeyError as err:
-                raise ConfigurationException(
-                    f'invalid label type {err}', backend_id=self.backend_id)
-        else:
-            return [mapping[lt] for lt in self.params['default_label_types']]
+            lt_entries = [lt.strip() for lt
+                          in self.params['label_types'].split(',')]
+            valid_types = ('prefLabel', 'altLabel', 'hiddenLabel')
+            for lt in lt_entries:
+                if lt not in valid_types:
+                    raise ConfigurationException(
+                        f'invalid label type {lt}', backend_id=self.backend_id)
+            return [getattr(SKOS, lt) for lt in lt_entries]
+        return [getattr(SKOS, lt) for lt in self.params['default_label_types']]
 
     @property
     def graph(self):
diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py
index 2a79c386e..cf077f404 100755
--- a/tests/test_backend_yake.py
+++ b/tests/test_backend_yake.py
@@ -45,7 +45,7 @@ def test_create_index_preflabels(graph_project):
     yake_type = annif.backend.get_backend('yake')
     yake = yake_type(
         backend_id='yake',
-        config_params={'limit': 8, 'language': 'fi', 'label_types': 'pref'},
+        config_params={'language': 'fi', 'label_types': 'prefLabel'},
         project=graph_project)
     index = yake._create_index()
     # Some of the 130 prefLabels get merged in lemmatization:
@@ -60,7 +60,7 @@ def test_create_index_altlabels(graph_project):
     yake_type = annif.backend.get_backend('yake')
     yake = yake_type(
         backend_id='yake',
-        config_params={'limit': 8, 'language': 'fi', 'label_types': 'alt'},
+        config_params={'language': 'fi', 'label_types': 'altLabel'},
         project=graph_project)
     index = yake._create_index()
     assert len(index) == 34
@@ -84,7 +84,7 @@ def test_create_index_label_languages(graph_project):
     yake_type = annif.backend.get_backend('yake')
     yake = yake_type(
         backend_id='yake',
-        config_params={'limit': 8, 'language': 'sv', 'label_types': 'pref'},
+        config_params={'limit': 8, 'language': 'sv', 'label_types': 'prefLabel'},
         project=graph_project)
     index = yake._create_index()
     assert len(index) == 130

From dca4474cee77c8ce7ea35e8234dd38a337c2271d Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Thu, 11 Feb 2021 17:04:00 +0200
Subject: [PATCH 30/57] Avoid need for "default_label_types" name for defaults

---
 annif/backend/yake.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index fba3c8863..10da74409 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -30,7 +30,7 @@ class YakeBackend(backend.AnnifBackend):
         'window_size': 1,
         'num_keywords': 100,
         'features': None,
-        'default_label_types': ['prefLabel', 'altLabel'],
+        'label_types': ['prefLabel', 'altLabel'],
         'remove_parentheses': False
     }
 
@@ -45,16 +45,17 @@ def is_trained(self):
 
     @property
     def label_types(self):
-        if 'label_types' in self.params:
-            lt_entries = [lt.strip() for lt
-                          in self.params['label_types'].split(',')]
+        if type(self.params['label_types']) == str:  # Label types set by user
+            label_types = [lt.strip() for lt
+                           in self.params['label_types'].split(',')]
             valid_types = ('prefLabel', 'altLabel', 'hiddenLabel')
-            for lt in lt_entries:
+            for lt in label_types:
                 if lt not in valid_types:
                     raise ConfigurationException(
                         f'invalid label type {lt}', backend_id=self.backend_id)
-            return [getattr(SKOS, lt) for lt in lt_entries]
-        return [getattr(SKOS, lt) for lt in self.params['default_label_types']]
+        else:
+            label_types = self.params['label_types']  # The defaults
+        return [getattr(SKOS, lt) for lt in label_types]
 
     @property
     def graph(self):

From 2917628f86d8956766c1353de32691df3a4c664c Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Fri, 12 Feb 2021 12:56:36 +0200
Subject: [PATCH 31/57] Refactor attempting to resolve complexity complaints
 by CodeClimate

---
 annif/backend/yake.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 10da74409..dc6db845b 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -48,15 +48,17 @@ def label_types(self):
         if type(self.params['label_types']) == str:  # Label types set by user
             label_types = [lt.strip() for lt
                            in self.params['label_types'].split(',')]
-            valid_types = ('prefLabel', 'altLabel', 'hiddenLabel')
-            for lt in label_types:
-                if lt not in valid_types:
-                    raise ConfigurationException(
-                        f'invalid label type {lt}', backend_id=self.backend_id)
+            self._validate_label_types(label_types)
         else:
             label_types = self.params['label_types']  # The defaults
         return [getattr(SKOS, lt) for lt in label_types]
 
+    def _validate_label_types(self, label_types):
+        for lt in label_types:
+            if lt not in ('prefLabel', 'altLabel', 'hiddenLabel'):
+                raise ConfigurationException(
+                    f'invalid label type {lt}', backend_id=self.backend_id)
+
     @property
     def graph(self):
         if self._graph is None:

From 6a3aebd09aaeb087634a29917667318ab4ace02d Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Fri, 12 Feb 2021 12:59:32 +0200
Subject: [PATCH 32/57] Add test for invalid label types

---
 tests/test_backend_yake.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py
index cf077f404..3390b80fc 100755
--- a/tests/test_backend_yake.py
+++ b/tests/test_backend_yake.py
@@ -1,13 +1,23 @@
 """Unit tests for the Yake backend in Annif"""
 
 import annif
-import pytest
 import annif.backend
-
+import pytest
+from annif.exception import ConfigurationException
 
 pytest.importorskip("annif.backend.yake")
 
 
+def test_invalid_label_type(graph_project):
+    yake_type = annif.backend.get_backend('yake')
+    yake = yake_type(
+        backend_id='yake',
+        config_params={'label_types': 'invalid_type', 'language': 'fi'},
+        project=graph_project)
+    with pytest.raises(ConfigurationException):
+        yake.suggest("example text")
+
+
 def test_yake_suggest(project, graph_project):
     yake_type = annif.backend.get_backend('yake')
     yake = yake_type(

From b06ca9cb898d43d2bc1096ea4232148584767e06 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Fri, 12 Feb 2021 13:54:11 +0200
Subject: [PATCH 33/57] Remove pointless test

---
 tests/test_backend_yake.py | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py
index 3390b80fc..b7f98f9ba 100755
--- a/tests/test_backend_yake.py
+++ b/tests/test_backend_yake.py
@@ -90,19 +90,6 @@ def test_create_index_pref_and_altlabels(graph_project):
     assert 'luolamaalauks' in index
 
 
-def test_create_index_label_languages(graph_project):
-    yake_type = annif.backend.get_backend('yake')
-    yake = yake_type(
-        backend_id='yake',
-        config_params={'limit': 8, 'language': 'sv', 'label_types': 'prefLabel'},
-        project=graph_project)
-    index = yake._create_index()
-    assert len(index) == 130
-    assert 'kalliotaid' not in index
-    assert 'bergkonst' in index
-    assert 'rock art' not in index
-
-
 def test_combine_suggestions_different_uris(project):
     yake_type = annif.backend.get_backend('yake')
     yake = yake_type(

From 79e225b1b92af3d5f40ebeee331ed039ccbdde46 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Fri, 12 Feb 2021 16:34:17 +0200
Subject: [PATCH 34/57] Test for removing parentheses from label when creating
 index

---
 tests/test_backend_yake.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py
index b7f98f9ba..9151177e0 100755
--- a/tests/test_backend_yake.py
+++ b/tests/test_backend_yake.py
@@ -4,6 +4,8 @@
 import annif.backend
 import pytest
 from annif.exception import ConfigurationException
+from rdflib import Graph, URIRef, Literal
+from rdflib.namespace import SKOS, RDF
 
 pytest.importorskip("annif.backend.yake")
 
@@ -90,6 +92,28 @@ def test_create_index_pref_and_altlabels(graph_project):
     assert 'luolamaalauks' in index
 
 
+def test_remove_parentheses(graph_project):
+    graph = Graph()
+    graph.add((
+        URIRef('http://www.yso.fi/onto/yso/p4354'), RDF.type, SKOS.Concept))
+    graph.add((
+        URIRef('http://www.yso.fi/onto/yso/p4354'), SKOS.prefLabel,
+        Literal('lapset (ikäryhmät)', lang='fi')))
+    graph_project.vocab.as_graph.return_value = graph
+
+    yake_type = annif.backend.get_backend('yake')
+    yake = yake_type(
+        backend_id='yake',
+        config_params={'language': 'fi', 'remove_parentheses': True},
+        project=graph_project)
+    index = yake._create_index()
+    assert len(index) == 1
+    assert 'laps' in index
+    assert '(' not in index
+    assert ')' not in index
+    assert 'ikä' not in index
+
+
 def test_combine_suggestions_different_uris(project):
     yake_type = annif.backend.get_backend('yake')
     yake = yake_type(

From f0911191fecfd45410197a38f467b74fd4363075 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Wed, 17 Feb 2021 19:11:44 +0200
Subject: [PATCH 35/57] Add methods for accessing SKOS concepts & labels via
 AnnifVocabulary

---
 annif/corpus/skos.py | 15 +++++++++++++++
 annif/vocab.py       | 20 ++++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/annif/corpus/skos.py b/annif/corpus/skos.py
index 5da101d09..5a3bd88cb 100644
--- a/annif/corpus/skos.py
+++ b/annif/corpus/skos.py
@@ -48,6 +48,21 @@ def subjects(self):
             yield Subject(uri=str(concept), label=label, notation=notation,
                           text=None)
 
+    @property
+    def skos_concepts(self):
+        for concept in self.graph.subjects(RDF.type, SKOS.Concept):
+            if (concept, OWL.deprecated, rdflib.Literal(True)) in self.graph:
+                continue
+            yield concept
+
+    def get_skos_concept_labels(self, concept, label_types, language):
+        labels = []
+        for label_type in label_types:
+            for label in self.graph.objects(concept, label_type):
+                if label.language == language:
+                    labels.append(label)
+        return labels
+
     @staticmethod
     def is_rdf_file(path):
         """return True if the path looks like an RDF file that can be loaded
diff --git a/annif/vocab.py b/annif/vocab.py
index 2355d1f3b..0b06bb38a 100644
--- a/annif/vocab.py
+++ b/annif/vocab.py
@@ -21,6 +21,7 @@ class AnnifVocabulary(DatadirMixin):
     def __init__(self, vocab_id, datadir):
         DatadirMixin.__init__(self, datadir, 'vocabs', vocab_id)
         self.vocab_id = vocab_id
+        self._skos_vocab = None
 
     def _create_subject_index(self, subject_corpus):
         self._subjects = annif.corpus.SubjectIndex(subject_corpus)
@@ -55,6 +56,25 @@ def subjects(self):
                     "subject file {} not found".format(path))
         return self._subjects
 
+    @property
+    def skos_vocab(self):
+        if self._skos_vocab is None:
+            path = os.path.join(self.datadir, 'subjects.ttl')
+            if os.path.exists(path):
+                logger.debug(f'loading graph from {path}')
+                self._skos_vocab = annif.corpus.SubjectFileSKOS(path, None)
+            else:
+                raise NotInitializedException(f'graph file {path} not found')
+        return self._skos_vocab
+
+    @property
+    def skos_concepts(self):
+        return self.skos_vocab.skos_concepts
+
+    def get_skos_concept_labels(self, concept, label_types, language):
+        return self.skos_vocab.get_skos_concept_labels(concept, label_types,
+                                                       language)
+
     def load_vocabulary(self, subject_corpus, language):
         """load subjects from a subject corpus and save them into a
         SKOS/Turtle file for later use"""

From fb0b7b5c3ff533c71c99d8849c31e56ba68e4b03 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Wed, 17 Feb 2021 19:14:15 +0200
Subject: [PATCH 36/57] Access SKOS concepts & labels via AnnifVocabulary in
 Yake

---
 annif/backend/yake.py      | 25 ++++----------------
 tests/conftest.py          |  8 +++++++
 tests/test_backend_yake.py | 48 +++++++++++++++++++++-----------------
 3 files changed, 39 insertions(+), 42 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index dc6db845b..65814d763 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -5,8 +5,7 @@
 import os.path
 import re
 from collections import defaultdict
-from rdflib.namespace import SKOS, RDF, OWL
-import rdflib
+from rdflib.namespace import SKOS
 import annif.util
 from . import backend
 from annif.suggestion import SubjectSuggestion, ListSuggestionResult
@@ -59,13 +58,6 @@ def _validate_label_types(self, label_types):
                 raise ConfigurationException(
                     f'invalid label type {lt}', backend_id=self.backend_id)
 
-    @property
-    def graph(self):
-        if self._graph is None:
-            self.info('Loading graph')
-            self._graph = self.project.vocab.as_graph()
-        return self._graph
-
     def initialize(self):
         self._initialize_index()
 
@@ -98,25 +90,16 @@ def _load_index(self, path):
 
     def _create_index(self):
         index = defaultdict(set)
-        for concept in self.graph.subjects(RDF.type, SKOS.Concept):
-            if (concept, OWL.deprecated, rdflib.Literal(True)) in self.graph:
-                continue
+        for concept in self.project.vocab.skos_concepts:
             uri = str(concept)
-            labels = self._get_concept_labels(concept, self.label_types)
+            labels = self.project.vocab.get_skos_concept_labels(
+                concept, self.label_types, self.params['language'])
             for label in labels:
                 label = self._normalize_label(label)
                 index[label].add(uri)
         index.pop('', None)  # Remove possible empty string entry
         return dict(index)
 
-    def _get_concept_labels(self, concept, label_types):
-        labels = []
-        for label_type in label_types:
-            for label in self.graph.objects(concept, label_type):
-                if label.language == self.params['language']:
-                    labels.append(label)
-        return labels
-
     def _normalize_label(self, label):
         label = str(label)
         if annif.util.boolean(self.params['remove_parentheses']):
diff --git a/tests/conftest.py b/tests/conftest.py
index 80a8bfc3b..f23eb8c0d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -130,6 +130,14 @@ def graph_project(project):
     return project
 
 
+@pytest.fixture(scope='function')
+def skos_project(project, skos_vocabulary):
+    project.vocab.skos_concepts = skos_vocabulary.skos_concepts
+    project.vocab.get_skos_concept_labels = \
+        skos_vocabulary.get_skos_concept_labels
+    return project
+
+
 @pytest.fixture(scope='module')
 def app_project(app):
     with app.app_context():
diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py
index 9151177e0..93304baa9 100755
--- a/tests/test_backend_yake.py
+++ b/tests/test_backend_yake.py
@@ -6,26 +6,27 @@
 from annif.exception import ConfigurationException
 from rdflib import Graph, URIRef, Literal
 from rdflib.namespace import SKOS, RDF
+from copy import copy
 
 pytest.importorskip("annif.backend.yake")
 
 
-def test_invalid_label_type(graph_project):
+def test_invalid_label_type(skos_project):
     yake_type = annif.backend.get_backend('yake')
     yake = yake_type(
         backend_id='yake',
         config_params={'label_types': 'invalid_type', 'language': 'fi'},
-        project=graph_project)
+        project=skos_project)
     with pytest.raises(ConfigurationException):
         yake.suggest("example text")
 
 
-def test_yake_suggest(project, graph_project):
+def test_yake_suggest(project, skos_project):
     yake_type = annif.backend.get_backend('yake')
     yake = yake_type(
         backend_id='yake',
         config_params={'limit': 8, 'language': 'fi'},
-        project=graph_project)
+        project=skos_project)
 
     results = yake.suggest("""Arkeologia on tieteenala, jota sanotaan joskus
         muinaistutkimukseksi tai muinaistieteeksi. Se on humanistinen tiede
@@ -42,23 +43,23 @@ def test_yake_suggest(project, graph_project):
     assert 'arkeologia' in [result.label for result in hits]
 
 
-def test_yake_suggest_non_alphanum_text(project, graph_project):
+def test_yake_suggest_non_alphanum_text(project, skos_project):
     yake_type = annif.backend.get_backend('yake')
     yake = yake_type(
         backend_id='yake',
         config_params={'limit': 8, 'language': 'fi'},
-        project=graph_project)
+        project=skos_project)
 
     results = yake.suggest(".,!")
     assert len(results) == 0
 
 
-def test_create_index_preflabels(graph_project):
+def test_create_index_preflabels(skos_project):
     yake_type = annif.backend.get_backend('yake')
     yake = yake_type(
         backend_id='yake',
         config_params={'language': 'fi', 'label_types': 'prefLabel'},
-        project=graph_project)
+        project=skos_project)
     index = yake._create_index()
     # Some of the 130 prefLabels get merged in lemmatization:
     # assyriologit, assyriologia (assyriolog); arkealogit, arkeologia
@@ -68,44 +69,49 @@ def test_create_index_preflabels(graph_project):
     assert 'luolamaalauks' not in index
 
 
-def test_create_index_altlabels(graph_project):
+def test_create_index_pref_and_altlabels(skos_project):
     yake_type = annif.backend.get_backend('yake')
     yake = yake_type(
         backend_id='yake',
-        config_params={'language': 'fi', 'label_types': 'altLabel'},
-        project=graph_project)
+        config_params={'limit': 8, 'language': 'fi'},
+        project=skos_project)
     index = yake._create_index()
-    assert len(index) == 34
-    assert 'kalliotaid' not in index
+    assert len(index) == 161
+    assert 'kalliotaid' in index
     assert 'luolamaalauks' in index
 
 
-def test_create_index_pref_and_altlabels(graph_project):
+def test_create_index_altlabels(skos_project):
     yake_type = annif.backend.get_backend('yake')
     yake = yake_type(
         backend_id='yake',
-        config_params={'limit': 8, 'language': 'fi'},
-        project=graph_project)
+        config_params={'language': 'fi', 'label_types': 'altLabel'},
+        project=skos_project)
     index = yake._create_index()
-    assert len(index) == 161
-    assert 'kalliotaid' in index
+    assert len(index) == 34
+    assert 'kalliotaid' not in index
     assert 'luolamaalauks' in index
 
 
-def test_remove_parentheses(graph_project):
+def test_remove_parentheses(project, skos_vocabulary):
     graph = Graph()
     graph.add((
         URIRef('http://www.yso.fi/onto/yso/p4354'), RDF.type, SKOS.Concept))
     graph.add((
         URIRef('http://www.yso.fi/onto/yso/p4354'), SKOS.prefLabel,
         Literal('lapset (ikäryhmät)', lang='fi')))
-    graph_project.vocab.as_graph.return_value = graph
+
+    skos_vocabulary = copy(skos_vocabulary)  # Do not modify original fixture
+    skos_vocabulary.graph = graph
+    project.vocab.skos_concepts = skos_vocabulary.skos_concepts
+    project.vocab.get_skos_concept_labels = \
+        skos_vocabulary.get_skos_concept_labels
 
     yake_type = annif.backend.get_backend('yake')
     yake = yake_type(
         backend_id='yake',
         config_params={'language': 'fi', 'remove_parentheses': True},
-        project=graph_project)
+        project=project)
     index = yake._create_index()
     assert len(index) == 1
     assert 'laps' in index

From d1c2af5d35a038ad59879c90566594e042340e91 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Wed, 17 Feb 2021 19:20:52 +0200
Subject: [PATCH 37/57] Access SKOS graph via skos_vocab in AnnifVocabulary

---
 annif/vocab.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/annif/vocab.py b/annif/vocab.py
index 0b06bb38a..328099aaa 100644
--- a/annif/vocab.py
+++ b/annif/vocab.py
@@ -1,7 +1,6 @@
 """Vocabulary management functionality for Annif"""
 
 import os.path
-import rdflib.graph
 import annif
 import annif.corpus
 import annif.util
@@ -93,9 +92,4 @@ def as_skos(self):
 
     def as_graph(self):
         """return the vocabulary as an rdflib graph"""
-        g = rdflib.graph.Graph()
-        g.load(
-            os.path.join(self.datadir, 'subjects.ttl'),
-            format='ttl'
-        )
-        return g
+        return self.skos_vocab.graph

From f080d02af74bd75788753fcc8bf5dade81d516a0 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Wed, 17 Feb 2021 19:26:54 +0200
Subject: [PATCH 38/57] Reduce code duplication by using the skos_concepts
 property

---
 annif/corpus/skos.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/annif/corpus/skos.py b/annif/corpus/skos.py
index 5a3bd88cb..d69a79df6 100644
--- a/annif/corpus/skos.py
+++ b/annif/corpus/skos.py
@@ -35,9 +35,7 @@ def __init__(self, path, language):
 
     @property
     def subjects(self):
-        for concept in self.graph.subjects(RDF.type, SKOS.Concept):
-            if (concept, OWL.deprecated, rdflib.Literal(True)) in self.graph:
-                continue
+        for concept in self.skos_concepts:
             labels = self.graph.preferredLabel(concept, lang=self.language)
             notation = self.graph.value(concept, SKOS.notation, None, any=True)
             if not labels:

From 1fdf80388cb986e981383e2f41e3f10e5d912e0b Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Wed, 10 Mar 2021 11:19:39 +0200
Subject: [PATCH 39/57] Update YAKE to 0.4.5 (eliminates warnings on input with
 no keywords)

---
 annif/backend/yake.py      | 4 ----
 setup.py                   | 2 +-
 tests/test_backend_yake.py | 4 ++--
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 65814d763..2ffe9b3ed 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -123,10 +123,6 @@ def _suggest(self, text, params):
             f'Suggesting subjects for text "{text[:20]}..." (len={len(text)})')
         limit = int(params['limit'])
 
-        alphanum = re.compile('[^a-zA-Z0-9]')
-        if len(re.sub(alphanum, '', text)) == 0:
-            return ListSuggestionResult([])
-
         self._kw_extractor = yake.KeywordExtractor(
             lan=params['language'],
             n=int(params['max_ngram_size']),
diff --git a/setup.py b/setup.py
index dc7b9d622..07ecc70d1 100644
--- a/setup.py
+++ b/setup.py
@@ -44,7 +44,7 @@ def read(fname):
         'vw': ['vowpalwabbit==8.8.1'],
         'nn': ['tensorflow-cpu==2.3.1', 'lmdb==1.0.0'],
         'omikuji': ['omikuji==0.3.*'],
-        'yake': ['yake==0.4.3'],
+        'yake': ['yake==0.4.5'],
         'dev': [
             'codecov',
             'pytest-cov',
diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py
index 93304baa9..86d189764 100755
--- a/tests/test_backend_yake.py
+++ b/tests/test_backend_yake.py
@@ -43,14 +43,14 @@ def test_yake_suggest(project, skos_project):
     assert 'arkeologia' in [result.label for result in hits]
 
 
-def test_yake_suggest_non_alphanum_text(project, skos_project):
+def test_yake_suggest_no_input(project, skos_project):
     yake_type = annif.backend.get_backend('yake')
     yake = yake_type(
         backend_id='yake',
         config_params={'limit': 8, 'language': 'fi'},
         project=skos_project)
 
-    results = yake.suggest(".,!")
+    results = yake.suggest("ja tai .,!")
     assert len(results) == 0
 
 

From 2fb2244f3015a2b34b2c9e1f3a3390fada97b78e Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Wed, 17 Mar 2021 10:29:51 +0200
Subject: [PATCH 40/57] Install Yake in GH Actions jobs for unit tests for
 Python 3.6 & 3.8

---
 .github/workflows/python-package.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index a02dcdac5..cad50c2d0 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -32,9 +32,9 @@ jobs:
         # Install the optional neural network dependencies (TensorFlow and LMDB)
         # - except for one Python version (3.7) so that we can test also without them
         if [[ ${{ matrix.python-version }} != '3.7' ]]; then pip install .[nn]; fi
-        # Install the optional Omikuji dependency
+        # Install the optional Omikuji and YAKE dependencies
         # - except for one Python version (3.7) so that we can test also without them
-        if [[ ${{ matrix.python-version }} != '3.7' ]]; then pip install .[omikuji]; fi
+        if [[ ${{ matrix.python-version }} != '3.7' ]]; then pip install .[omikuji,yake]; fi
         # Install the optional fastText dependencies for Python 3.7 only
         if [[ ${{ matrix.python-version }} == '3.7' ]]; then pip install .[fasttext]; fi
         # For Python 3.6

From 0b1cacd1d630419ba3f9e8fcf697cb899ac85504 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Wed, 17 Mar 2021 11:44:41 +0200
Subject: [PATCH 41/57] Remove condition and debug message for neg. Yake score,
 use max() instead

---
 annif/backend/yake.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 2ffe9b3ed..38ff5899f 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -166,9 +166,7 @@ def _keyphrase2uris(self, keyphrase):
         return self._index.get(keyphrase, [])
 
     def _transform_score(self, score):
-        if score < 0:
-            self.debug(f'Replacing negative YAKE score {score} with zero')
-            return 1.0
+        score = max(score, 0)
         return 1.0 / (score + 1)
 
     def _combine_suggestions(self, suggestions):

From 975b1bc170b93ccb262789c2dc370a07ad8d5e57 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Fri, 30 Apr 2021 11:48:45 +0300
Subject: [PATCH 42/57] Adapt to current master: remove unnecessary
 skos_project fixture

---
 tests/conftest.py          |  8 --------
 tests/test_backend_yake.py | 24 ++++++++++++------------
 2 files changed, 12 insertions(+), 20 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index f23eb8c0d..80a8bfc3b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -130,14 +130,6 @@ def graph_project(project):
     return project
 
 
-@pytest.fixture(scope='function')
-def skos_project(project, skos_vocabulary):
-    project.vocab.skos_concepts = skos_vocabulary.skos_concepts
-    project.vocab.get_skos_concept_labels = \
-        skos_vocabulary.get_skos_concept_labels
-    return project
-
-
 @pytest.fixture(scope='module')
 def app_project(app):
     with app.app_context():
diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py
index 86d189764..7ecbbb8c3 100755
--- a/tests/test_backend_yake.py
+++ b/tests/test_backend_yake.py
@@ -11,22 +11,22 @@
 pytest.importorskip("annif.backend.yake")
 
 
-def test_invalid_label_type(skos_project):
+def test_invalid_label_type(project):
     yake_type = annif.backend.get_backend('yake')
     yake = yake_type(
         backend_id='yake',
         config_params={'label_types': 'invalid_type', 'language': 'fi'},
-        project=skos_project)
+        project=project)
     with pytest.raises(ConfigurationException):
         yake.suggest("example text")
 
 
-def test_yake_suggest(project, skos_project):
+def test_yake_suggest(project):
     yake_type = annif.backend.get_backend('yake')
     yake = yake_type(
         backend_id='yake',
         config_params={'limit': 8, 'language': 'fi'},
-        project=skos_project)
+        project=project)
 
     results = yake.suggest("""Arkeologia on tieteenala, jota sanotaan joskus
         muinaistutkimukseksi tai muinaistieteeksi. Se on humanistinen tiede
@@ -43,23 +43,23 @@ def test_yake_suggest(project, skos_project):
     assert 'arkeologia' in [result.label for result in hits]
 
 
-def test_yake_suggest_no_input(project, skos_project):
+def test_yake_suggest_no_input(project):
     yake_type = annif.backend.get_backend('yake')
     yake = yake_type(
         backend_id='yake',
         config_params={'limit': 8, 'language': 'fi'},
-        project=skos_project)
+        project=project)
 
     results = yake.suggest("ja tai .,!")
     assert len(results) == 0
 
 
-def test_create_index_preflabels(skos_project):
+def test_create_index_preflabels(project):
     yake_type = annif.backend.get_backend('yake')
     yake = yake_type(
         backend_id='yake',
         config_params={'language': 'fi', 'label_types': 'prefLabel'},
-        project=skos_project)
+        project=project)
     index = yake._create_index()
     # Some of the 130 prefLabels get merged in lemmatization:
     # assyriologit, assyriologia (assyriolog); arkealogit, arkeologia
@@ -69,24 +69,24 @@ def test_create_index_preflabels(skos_project):
     assert 'luolamaalauks' not in index
 
 
-def test_create_index_pref_and_altlabels(skos_project):
+def test_create_index_pref_and_altlabels(project):
     yake_type = annif.backend.get_backend('yake')
     yake = yake_type(
         backend_id='yake',
         config_params={'limit': 8, 'language': 'fi'},
-        project=skos_project)
+        project=project)
     index = yake._create_index()
     assert len(index) == 161
     assert 'kalliotaid' in index
     assert 'luolamaalauks' in index
 
 
-def test_create_index_altlabels(skos_project):
+def test_create_index_altlabels(project):
     yake_type = annif.backend.get_backend('yake')
     yake = yake_type(
         backend_id='yake',
         config_params={'language': 'fi', 'label_types': 'altLabel'},
-        project=skos_project)
+        project=project)
     index = yake._create_index()
     assert len(index) == 34
     assert 'kalliotaid' not in index

From f354015ad206ad2e09ff8a67516320e0bd73647c Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Fri, 30 Apr 2021 11:50:21 +0300
Subject: [PATCH 43/57] Adapt to current master: altLabels in archaeology corpus
 have changed

---
 tests/test_backend_yake.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py
index 7ecbbb8c3..696064a1d 100755
--- a/tests/test_backend_yake.py
+++ b/tests/test_backend_yake.py
@@ -76,7 +76,7 @@ def test_create_index_pref_and_altlabels(project):
         config_params={'limit': 8, 'language': 'fi'},
         project=project)
     index = yake._create_index()
-    assert len(index) == 161
+    assert len(index) == 160
     assert 'kalliotaid' in index
     assert 'luolamaalauks' in index
 
@@ -88,7 +88,7 @@ def test_create_index_altlabels(project):
         config_params={'language': 'fi', 'label_types': 'altLabel'},
         project=project)
     index = yake._create_index()
-    assert len(index) == 34
+    assert len(index) == 33
     assert 'kalliotaid' not in index
     assert 'luolamaalauks' in index
 

From d83b9fbf4dadc7faf40a06843f5c82bc383e14be Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Fri, 30 Apr 2021 12:08:47 +0300
Subject: [PATCH 44/57] Adapt to current master: use project fixture in
 test_stwfsa, remove graph_project fixture

---
 tests/conftest.py            | 14 --------------
 tests/test_backend_stwfsa.py |  4 ++--
 2 files changed, 2 insertions(+), 16 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 80a8bfc3b..2e3949f47 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -5,7 +5,6 @@
 import pytest
 import py.path
 import unittest.mock
-from rdflib import Graph
 import annif
 import annif.analyzer
 import annif.corpus
@@ -117,19 +116,6 @@ def project(subject_index, datadir, registry, vocabulary):
     return proj
 
 
-@pytest.fixture(scope='module')
-def graph_project(project):
-    _rdf_file_path = os.path.join(
-        os.path.dirname(__file__),
-        'corpora',
-        'archaeology',
-        'yso-archaeology.rdf')
-    g = Graph()
-    g.load(_rdf_file_path)
-    project.vocab.as_graph.return_value = g
-    return project
-
-
 @pytest.fixture(scope='module')
 def app_project(app):
     with app.app_context():
diff --git a/tests/test_backend_stwfsa.py b/tests/test_backend_stwfsa.py
index 858b2371b..8c5fd329b 100644
--- a/tests/test_backend_stwfsa.py
+++ b/tests/test_backend_stwfsa.py
@@ -54,12 +54,12 @@ def test_stwfsa_not_initialized(project):
         stwfsa.suggest("example text")
 
 
-def test_stwfsa_train(document_corpus, graph_project, datadir):
+def test_stwfsa_train(document_corpus, project, datadir):
     stwfsa_type = get_backend(StwfsaBackend.name)
     stwfsa = stwfsa_type(
         backend_id=StwfsaBackend.name,
         config_params=_backend_conf,
-        project=graph_project)
+        project=project)
     stwfsa.train(document_corpus)
     assert stwfsa._model is not None
     model_file = datadir.join(stwfsa.MODEL_FILE)

From 315b47462b14c7a066d05773fbe8601a3d67bab4 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Fri, 30 Apr 2021 12:14:02 +0300
Subject: [PATCH 45/57] Remove test for removing parentheses from labels

---
 tests/test_backend_yake.py | 30 ------------------------------
 1 file changed, 30 deletions(-)

diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py
index 696064a1d..795973853 100755
--- a/tests/test_backend_yake.py
+++ b/tests/test_backend_yake.py
@@ -4,9 +4,6 @@
 import annif.backend
 import pytest
 from annif.exception import ConfigurationException
-from rdflib import Graph, URIRef, Literal
-from rdflib.namespace import SKOS, RDF
-from copy import copy
 
 pytest.importorskip("annif.backend.yake")
 
@@ -93,33 +90,6 @@ def test_create_index_altlabels(project):
     assert 'luolamaalauks' in index
 
 
-def test_remove_parentheses(project, skos_vocabulary):
-    graph = Graph()
-    graph.add((
-        URIRef('http://www.yso.fi/onto/yso/p4354'), RDF.type, SKOS.Concept))
-    graph.add((
-        URIRef('http://www.yso.fi/onto/yso/p4354'), SKOS.prefLabel,
-        Literal('lapset (ikäryhmät)', lang='fi')))
-
-    skos_vocabulary = copy(skos_vocabulary)  # Do not modify original fixture
-    skos_vocabulary.graph = graph
-    project.vocab.skos_concepts = skos_vocabulary.skos_concepts
-    project.vocab.get_skos_concept_labels = \
-        skos_vocabulary.get_skos_concept_labels
-
-    yake_type = annif.backend.get_backend('yake')
-    yake = yake_type(
-        backend_id='yake',
-        config_params={'language': 'fi', 'remove_parentheses': True},
-        project=project)
-    index = yake._create_index()
-    assert len(index) == 1
-    assert 'laps' in index
-    assert '(' not in index
-    assert ')' not in index
-    assert 'ikä' not in index
-
-
 def test_combine_suggestions_different_uris(project):
     yake_type = annif.backend.get_backend('yake')
     yake = yake_type(

From c797b5640b7cc7ed99f4dcc51788e1ba442adcc2 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Fri, 30 Apr 2021 13:18:04 +0300
Subject: [PATCH 46/57] Implement get_skos_concept_labels using list
 comprehension

---
 annif/corpus/skos.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/annif/corpus/skos.py b/annif/corpus/skos.py
index d69a79df6..7c40805f7 100644
--- a/annif/corpus/skos.py
+++ b/annif/corpus/skos.py
@@ -54,12 +54,10 @@ def skos_concepts(self):
             yield concept
 
     def get_skos_concept_labels(self, concept, label_types, language):
-        labels = []
-        for label_type in label_types:
-            for label in self.graph.objects(concept, label_type):
-                if label.language == language:
-                    labels.append(label)
-        return labels
+        return [str(label)
+                for label_type in label_types
+                for label in self.graph.objects(concept, label_type)
+                if label.language == language]
 
     @staticmethod
     def is_rdf_file(path):

From fcacef8d2addeb4d161a94ce17417b09c75e7a54 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Fri, 30 Apr 2021 13:55:45 +0300
Subject: [PATCH 47/57] Rename & refactor methods for SKOS vocabulary

---
 annif/backend/yake.py | 5 +++--
 annif/corpus/skos.py  | 6 +++---
 annif/vocab.py        | 8 --------
 3 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 38ff5899f..cbde4c4f1 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -90,9 +90,10 @@ def _load_index(self, path):
 
     def _create_index(self):
         index = defaultdict(set)
-        for concept in self.project.vocab.skos_concepts:
+        concepts = self.project.vocab.skos_vocab.concepts
+        for concept in concepts:
             uri = str(concept)
-            labels = self.project.vocab.get_skos_concept_labels(
+            labels = self.project.vocab.skos_vocab.get_concept_labels(
                 concept, self.label_types, self.params['language'])
             for label in labels:
                 label = self._normalize_label(label)
diff --git a/annif/corpus/skos.py b/annif/corpus/skos.py
index 7c40805f7..ba4f04406 100644
--- a/annif/corpus/skos.py
+++ b/annif/corpus/skos.py
@@ -35,7 +35,7 @@ def __init__(self, path, language):
 
     @property
     def subjects(self):
-        for concept in self.skos_concepts:
+        for concept in self.concepts:
             labels = self.graph.preferredLabel(concept, lang=self.language)
             notation = self.graph.value(concept, SKOS.notation, None, any=True)
             if not labels:
@@ -47,13 +47,13 @@ def subjects(self):
                           text=None)
 
     @property
-    def skos_concepts(self):
+    def concepts(self):
         for concept in self.graph.subjects(RDF.type, SKOS.Concept):
             if (concept, OWL.deprecated, rdflib.Literal(True)) in self.graph:
                 continue
             yield concept
 
-    def get_skos_concept_labels(self, concept, label_types, language):
+    def get_concept_labels(self, concept, label_types, language):
         return [str(label)
                 for label_type in label_types
                 for label in self.graph.objects(concept, label_type)
diff --git a/annif/vocab.py b/annif/vocab.py
index 328099aaa..35c4af4e5 100644
--- a/annif/vocab.py
+++ b/annif/vocab.py
@@ -66,14 +66,6 @@ def skos_vocab(self):
                 raise NotInitializedException(f'graph file {path} not found')
         return self._skos_vocab
 
-    @property
-    def skos_concepts(self):
-        return self.skos_vocab.skos_concepts
-
-    def get_skos_concept_labels(self, concept, label_types, language):
-        return self.skos_vocab.get_skos_concept_labels(concept, label_types,
-                                                       language)
-
     def load_vocabulary(self, subject_corpus, language):
         """load subjects from a subject corpus and save them into a
         SKOS/Turtle file for later use"""

From d6c2aa699a6e310361d9bc1ae82fd40e31a261ff Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Fri, 30 Apr 2021 13:57:19 +0300
Subject: [PATCH 48/57] Rename method for accessing SKOS vocab as a file object

---
 annif/backend/maui.py | 2 +-
 annif/vocab.py        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/annif/backend/maui.py b/annif/backend/maui.py
index a40016f0c..bb9894722 100644
--- a/annif/backend/maui.py
+++ b/annif/backend/maui.py
@@ -101,7 +101,7 @@ def _upload_vocabulary(self, params):
         json = {}
         try:
             resp = requests.put(self.tagger_url(params) + '/vocab',
-                                data=self.project.vocab.as_skos())
+                                data=self.project.vocab.as_skos_file())
             try:
                 json = resp.json()
             except ValueError:
diff --git a/annif/vocab.py b/annif/vocab.py
index 35c4af4e5..953e9917a 100644
--- a/annif/vocab.py
+++ b/annif/vocab.py
@@ -78,7 +78,7 @@ def load_vocabulary(self, subject_corpus, language):
         subject_corpus.save_skos(os.path.join(self.datadir, 'subjects.ttl'),
                                  language)
 
-    def as_skos(self):
+    def as_skos_file(self):
         """return the vocabulary as a file object, in SKOS/Turtle syntax"""
         return open(os.path.join(self.datadir, 'subjects.ttl'), 'rb')
 

From 6ddb55708116a38a60cba7b4892e7ef3b895960b Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Fri, 30 Apr 2021 18:04:17 +0300
Subject: [PATCH 49/57] Adjust license explanation

---
 README.md             | 5 ++++-
 annif/backend/yake.py | 4 +++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index b115f7cbf..8eb84eb86 100644
--- a/README.md
+++ b/README.md
@@ -133,4 +133,7 @@ Zenodo DOI:
 
 The code in this repository is licensed under Apache License 2.0, except for the
 dependencies included under `annif/static/css` and `annif/static/js`,
-which have their own licenses. See the file headers for details. Using the optional Yake backend may change the licence of Annif to GPLv3, because [YAKE](https://github.com/LIAAD/yake) is licensed under GPLv3.
+which have their own licenses, see the file headers for details.
+Enabling the optional Yake backend may result in [GPLv3](https://www.gnu.org/licenses/gpl-3.0.txt)
+terms to cover the application, because [YAKE](https://github.com/LIAAD/yake)
+is licensed under GPLv3.
diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index cbde4c4f1..6ccce8f3c 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -1,5 +1,7 @@
 """Annif backend using Yake keyword extraction"""
-# TODO Mention GPLv3 license also here?
+# Enabling this optional backend may result in GPLv3 terms to cover the
+# application, because YAKE (https://github.com/LIAAD/yake) is licensed under
+# GPLv3.
 
 import yake
 import os.path

From 27e0cdc44de8a17b32f9b967b48f28a36d0ea5f9 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Wed, 5 May 2021 10:37:36 +0300
Subject: [PATCH 50/57] Better name and docstring for the property for
 accessing SKOS vocabulary

---
 annif/backend/yake.py | 6 +++---
 annif/vocab.py        | 5 +++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 6ccce8f3c..b763dac36 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -92,10 +92,10 @@ def _load_index(self, path):
 
     def _create_index(self):
         index = defaultdict(set)
-        concepts = self.project.vocab.skos_vocab.concepts
-        for concept in concepts:
+        skos_vocab = self.project.vocab.skos
+        for concept in skos_vocab.concepts:
             uri = str(concept)
-            labels = self.project.vocab.skos_vocab.get_concept_labels(
+            labels = skos_vocab.get_concept_labels(
                 concept, self.label_types, self.params['language'])
             for label in labels:
                 label = self._normalize_label(label)
diff --git a/annif/vocab.py b/annif/vocab.py
index 953e9917a..92f618580 100644
--- a/annif/vocab.py
+++ b/annif/vocab.py
@@ -56,7 +56,8 @@ def subjects(self):
         return self._subjects
 
     @property
-    def skos_vocab(self):
+    def skos(self):
+        """return the subject vocabulary from SKOS file"""
         if self._skos_vocab is None:
             path = os.path.join(self.datadir, 'subjects.ttl')
             if os.path.exists(path):
@@ -84,4 +85,4 @@ def as_skos_file(self):
 
     def as_graph(self):
         """return the vocabulary as an rdflib graph"""
-        return self.skos_vocab.graph
+        return self.skos.graph

From 338c9b6ff2bff33cc42385bae24fbce1d2c1e0a9 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Wed, 5 May 2021 10:41:04 +0300
Subject: [PATCH 51/57] Change log message for loading index to debug level

---
 annif/backend/yake.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index b763dac36..7ebbd7a2c 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -68,7 +68,7 @@ def _initialize_index(self):
             path = os.path.join(self.datadir, self.INDEX_FILE)
             if os.path.exists(path):
                 self._index = self._load_index(path)
-                self.info(
+                self.debug(
                     f'Loaded index from {path} with {len(self._index)} labels')
             else:
                 self.info('Creating index')

From d09e8e30e4daf8a0a33050e28422435436f97752 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Tue, 11 May 2021 17:55:52 +0300
Subject: [PATCH 52/57] Readjust license explanation for Yake backend

---
 README.md | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 8eb84eb86..2e301f69d 100644
--- a/README.md
+++ b/README.md
@@ -134,6 +134,10 @@ Zenodo DOI:
 The code in this repository is licensed under Apache License 2.0, except for the
 dependencies included under `annif/static/css` and `annif/static/js`,
 which have their own licenses, see the file headers for details.
-Enabling the optional Yake backend may result in [GPLv3](https://www.gnu.org/licenses/gpl-3.0.txt)
-terms to cover the application, because [YAKE](https://github.com/LIAAD/yake)
-is licensed under GPLv3.
+Please note that the [YAKE](https://github.com/LIAAD/yake) library is licensed
+under [GPLv3](https://www.gnu.org/licenses/gpl-3.0.txt), while Annif is
+licensed under the Apache License 2.0. The licenses are compatible, but
+depending on legal interpretation, the terms of the GPLv3 (for example the
+requirement to publish corresponding source code when publishing an executable
+application) may be considered to apply to the whole of Annif+Yake if you
+decide to install the optional Yake dependency.

From 06fe6ef335a44b0b45de3fc830718d83b6e14b20 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Tue, 11 May 2021 23:22:41 +0300
Subject: [PATCH 53/57] Pass project's language to AnnifVocabulary and adapt
 fixtures as needed

---
 annif/project.py    | 3 ++-
 annif/vocab.py      | 6 ++++--
 tests/conftest.py   | 2 +-
 tests/test_vocab.py | 2 +-
 4 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/annif/project.py b/annif/project.py
index 8396d1e61..a4677c613 100644
--- a/annif/project.py
+++ b/annif/project.py
@@ -143,7 +143,8 @@ def vocab(self):
                 raise ConfigurationException("vocab setting is missing",
                                              project_id=self.project_id)
             self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id,
-                                                      self._base_datadir)
+                                                      self._base_datadir,
+                                                      self.language)
         return self._vocab
 
     @property
diff --git a/annif/vocab.py b/annif/vocab.py
index 92f618580..b60d3d382 100644
--- a/annif/vocab.py
+++ b/annif/vocab.py
@@ -17,9 +17,10 @@ class AnnifVocabulary(DatadirMixin):
     # defaults for uninitialized instances
     _subjects = None
 
-    def __init__(self, vocab_id, datadir):
+    def __init__(self, vocab_id, datadir, language):
         DatadirMixin.__init__(self, datadir, 'vocabs', vocab_id)
         self.vocab_id = vocab_id
+        self.language = language
         self._skos_vocab = None
 
     def _create_subject_index(self, subject_corpus):
@@ -62,7 +63,8 @@ def skos(self):
             path = os.path.join(self.datadir, 'subjects.ttl')
             if os.path.exists(path):
                 logger.debug(f'loading graph from {path}')
-                self._skos_vocab = annif.corpus.SubjectFileSKOS(path, None)
+                self._skos_vocab = annif.corpus.SubjectFileSKOS(path,
+                                                                self.language)
             else:
                 raise NotInitializedException(f'graph file {path} not found')
         return self._skos_vocab
diff --git a/tests/conftest.py b/tests/conftest.py
index 2e3949f47..f92e40e92 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -68,7 +68,7 @@ def subject_file():
 
 @pytest.fixture(scope='module')
 def vocabulary(datadir):
-    vocab = annif.vocab.AnnifVocabulary('my-vocab', datadir)
+    vocab = annif.vocab.AnnifVocabulary('my-vocab', datadir, 'fi')
     subjfile = os.path.join(
         os.path.dirname(__file__),
         'corpora',
diff --git a/tests/test_vocab.py b/tests/test_vocab.py
index 51a28e2b5..c56de5d51 100755
--- a/tests/test_vocab.py
+++ b/tests/test_vocab.py
@@ -7,7 +7,7 @@
 
 
 def load_dummy_vocab(tmpdir):
-    vocab = annif.vocab.AnnifVocabulary('vocab-id', str(tmpdir))
+    vocab = annif.vocab.AnnifVocabulary('vocab-id', str(tmpdir), 'en')
     subjfile = os.path.join(
         os.path.dirname(__file__),
         'corpora',

From 4d67d4421657f41136d75aa7be210292587b2002 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Wed, 12 May 2021 10:50:23 +0300
Subject: [PATCH 54/57] Rename lemmatize_phrase function to normalize_phrase

---
 annif/backend/yake.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 7ebbd7a2c..725851bd7 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -107,10 +107,10 @@ def _normalize_label(self, label):
         label = str(label)
         if annif.util.boolean(self.params['remove_parentheses']):
             label = re.sub(r' \(.*\)', '', label)
-        lemmatized_label = self._lemmatize_phrase(label)
-        return self._sort_phrase(lemmatized_label)
+        normalized_label = self._normalize_phrase(label)
+        return self._sort_phrase(normalized_label)
 
-    def _lemmatize_phrase(self, phrase):
+    def _normalize_phrase(self, phrase):
         normalized = []
         for word in phrase.split():
             normalized.append(
@@ -164,7 +164,7 @@ def _keyphrases2suggestions(self, keyphrases):
         return suggestions
 
     def _keyphrase2uris(self, keyphrase):
-        keyphrase = self._lemmatize_phrase(keyphrase)
+        keyphrase = self._normalize_phrase(keyphrase)
         keyphrase = self._sort_phrase(keyphrase)
         return self._index.get(keyphrase, [])
 

From 9597526503d677420814d1196719d33a713547ec Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Wed, 12 May 2021 15:30:39 +0300
Subject: [PATCH 55/57] Use atomic_save for saving YAKE index

---
 annif/backend/yake.py | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 725851bd7..3ff62dccf 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -4,6 +4,7 @@
 # GPLv3.
 
 import yake
+import joblib
 import os.path
 import re
 from collections import defaultdict
@@ -67,7 +68,7 @@ def _initialize_index(self):
         if self._index is None:
             path = os.path.join(self.datadir, self.INDEX_FILE)
             if os.path.exists(path):
-                self._index = self._load_index(path)
+                self._index = joblib.load(path)
                 self.debug(
                     f'Loaded index from {path} with {len(self._index)} labels')
             else:
@@ -77,18 +78,11 @@ def _initialize_index(self):
                 self.info(f'Created index with {len(self._index)} labels')
 
     def _save_index(self, path):
-        with open(path, 'w', encoding='utf-8') as indexfile:
-            for label, uris in self._index.items():
-                line = label + '\t' + ' '.join(uris)
-                print(line, file=indexfile)
-
-    def _load_index(self, path):
-        index = dict()
-        with open(path, 'r', encoding='utf-8') as indexfile:
-            for line in indexfile:
-                label, uris = line.strip().split('\t')
-                index[label] = uris.split()
-        return index
+        annif.util.atomic_save(
+            self._index,
+            self.datadir,
+            self.INDEX_FILE,
+            method=joblib.dump)
 
     def _create_index(self):
         index = defaultdict(set)

From 2dafa54abf2863656dcb6b32bb23e1da367079b8 Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Wed, 12 May 2021 16:01:19 +0300
Subject: [PATCH 56/57] Adjust license explanation comment to point to license
 section in README.md

---
 annif/backend/yake.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/annif/backend/yake.py b/annif/backend/yake.py
index 3ff62dccf..de828d54b 100755
--- a/annif/backend/yake.py
+++ b/annif/backend/yake.py
@@ -1,7 +1,6 @@
 """Annif backend using Yake keyword extraction"""
-# Enabling this optional backend may result in GPLv3 terms to cover the
-# application, because YAKE (https://github.com/LIAAD/yake) is licensed under
-# GPLv3.
+# For license remarks of this backend see README.md:
+# https://github.com/NatLibFi/Annif#license.
 
 import yake
 import joblib

From 9a2127a29419ae72bb0b4af439fc8e892a4055fc Mon Sep 17 00:00:00 2001
From: Juho Inkinen <juho.inkinen@helsinki.fi>
Date: Wed, 12 May 2021 16:43:43 +0300
Subject: [PATCH 57/57] Truncate long log messages for objects to be saved

---
 annif/util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/annif/util.py b/annif/util.py
index 042ff97f5..8d5174b4a 100644
--- a/annif/util.py
+++ b/annif/util.py
@@ -19,7 +19,7 @@ def atomic_save(obj, dirname, filename, method=None):
     tempfd, tempfilename = tempfile.mkstemp(
         prefix=prefix, suffix=suffix, dir=dirname)
     os.close(tempfd)
-    logger.debug('saving %s to temporary file %s', str(obj), tempfilename)
+    logger.debug('saving %s to temporary file %s', str(obj)[:90], tempfilename)
     if method is not None:
         method(obj, tempfilename)
     else: