From 7c4fad86dea1dd67a8ce79313486d580ba0e84a9 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Tue, 5 Jan 2021 13:45:06 +0200 Subject: [PATCH 01/57] Initial YAKE integration commit --- README.md | 2 +- annif/backend/__init__.py | 6 + annif/backend/yake.py | 249 ++++++++++++++++++++++++++++++++++++++ projects.cfg.dist | 8 ++ setup.py | 1 + 5 files changed, 265 insertions(+), 1 deletion(-) create mode 100755 annif/backend/yake.py diff --git a/README.md b/README.md index 583c3a6b1..b115f7cbf 100644 --- a/README.md +++ b/README.md @@ -133,4 +133,4 @@ Zenodo DOI: The code in this repository is licensed under Apache License 2.0, except for the dependencies included under `annif/static/css` and `annif/static/js`, -which have their own licenses. See the file headers for details. +which have their own licenses. See the file headers for details. Using the optional Yake backend may change the licence of Annif to GPLv3, because [YAKE](https://github.com/LIAAD/yake) is licensed under GPLv3. diff --git a/annif/backend/__init__.py b/annif/backend/__init__.py index 056a2fa99..bea668138 100644 --- a/annif/backend/__init__.py +++ b/annif/backend/__init__.py @@ -60,3 +60,9 @@ def get_backend(backend_id): register_backend(omikuji.OmikujiBackend) except ImportError: annif.logger.debug("Omikuji not available, not enabling omikuji backend") + +try: + from . import yake + register_backend(yake.YakeBackend) +except ImportError: + annif.logger.debug("YAKE not available, not enabling yake backend") diff --git a/annif/backend/yake.py b/annif/backend/yake.py new file mode 100755 index 000000000..654e77a22 --- /dev/null +++ b/annif/backend/yake.py @@ -0,0 +1,249 @@ +"""Annif backend using Yake keyword extraction""" +# TODO Mention GPLv3 license also here? + +import yake +import os.path +import re +from collections import defaultdict +from rdflib.namespace import SKOS, RDF, OWL, URIRef +import rdflib +from nltk.corpus import stopwords +from . 
import backend +from annif.suggestion import SubjectSuggestion, ListSuggestionResult + + +class YakeBackend(backend.AnnifBackend): + """Yake based backend for Annif""" + name = "yake" + needs_subject_index = False + + # defaults for uninitialized instances + _index = None + _graph = None + INDEX_FILE = 'yake-index' + + DEFAULT_PARAMETERS = { + 'max_ngram_size': 3, + 'deduplication_threshold': 0.9, + 'deduplication_algo': 'levs', + 'window_size': 1, + 'num_keywords': 100, + 'features': None, + } + + def default_params(self): + params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy() + params.update(self.DEFAULT_PARAMETERS) + return params + + @property + def is_trained(self): + return True + + def initialize(self): + self._initialize_index() + # self.graph + self._kw_extractor = yake.KeywordExtractor( + lan=self.project.language, + n=self.params['max_ngram_size'], + dedupLim=self.params['deduplication_threshold'], + dedupFunc=self.params['deduplication_algo'], + windowsSize=self.params['window_size'], + top=self.params['num_keywords'], + features=self.params['features']) + + def _initialize_index(self): + if self._index is None: + path = os.path.join(self.datadir, self.INDEX_FILE) + if os.path.exists(path): + self.info('Loading index from {}'.format(path)) + self._index = self._load_index(path) + self.info(f'Loaded index with {len(self._index)} labels') + else: + self.info('Creating index') + self._create_index() + self._save_index(path) + self.info(f'Created index with {len(self._index)} labels') + + @property + def graph(self): + if self._graph is None: + self._graph = rdflib.Graph() + path = os.path.join(self.project.vocab.datadir, 'subjects.ttl') + self.info('Loading graph from {}'.format(path)) + self._graph.load(path, format=rdflib.util.guess_format(path)) + return self._graph + + def _create_index(self): + # TODO Should index creation be done on loadvoc command? + # TODO American to British labels? 
+ index = defaultdict(list) + for predicate in [SKOS.prefLabel]: #, SKOS.altLabel, SKOS.hiddenLabel]: + for concept in self.graph.subjects(RDF.type, SKOS.Concept): + if (concept, OWL.deprecated, rdflib.Literal(True)) in self.graph: + continue + for label in self.graph.objects(concept, predicate): + if not label.language == self.project.language: + continue + uri = str(concept) + label = str(label) + # This really is useful: Disambiguate by dropping ambigious labels + # if label[-1] == ')': + # continue + # label = re.sub(r' \(.*\)', '', label) # Remove specifier + lemmatized_label = self._lemmatize_phrase(label) + lemmatized_label = self._sort_phrase(lemmatized_label) + index[lemmatized_label].append(uri) + index.pop('', None) # Remove possible empty string entry + self._index = dict(index) + + def _save_index(self, path): + with open(path, 'w', encoding='utf-8') as indexfile: + for label, uris in self._index.items(): + line = label + '\t' + ' '.join(uris) + print(line, file=indexfile) + + def _load_index(self, path): + index = dict() + with open(path, 'r', encoding='utf-8') as indexfile: + for line in indexfile: + label, uris = line.strip().split('\t') + uris = uris.split() + index[label] = uris + return index + + def _sort_phrase(self, phrase): + words = phrase.split() + return ' '.join(sorted(words)) + + def _lemmatize_phrase(self, phrase): + # if self.project.language == 'fi': + # lan_stopwords = set(stopwords.words('finnish')) + # elif self.project.language == 'en': + # stopwords = set(stopwords.words('english')) + normalized = [] + # phrase = re.sub(r'\W+', '', phrase) + for word in phrase.split(): + # if word in lan_stopwords: + # continue + normalized.append( + self.project.analyzer.normalize_word(word).lower()) + return ' '.join(normalized) + + def _sort_phrase(self, phrase): + words = phrase.split() + return ' '.join(sorted(words)) + + def _keyphrases2suggestions(self, keyphrases): + suggestions = [] + not_matched = [] + for kp, score in keyphrases: + 
uris = self._keyphrase2uris(kp) + for uri in uris: + # Its faster to get label from Annif subject index than from graph (but is even this needed?) + label = self.project.subjects.uris_to_labels([uri])[0] + suggestions.append( + (uri, label, self._transform_score(score))) + if not uris: + not_matched.append((kp, self._transform_score(score))) + # Remove duplicate uris, combining the scores + suggestions = self._combine_suggestions(suggestions) + self.debug('Keyphrases not matched:\n' + '\t'.join( + [x[0] + ' ' + str(x[1]) for x + in sorted(not_matched, reverse=True, key=lambda x: x[1])])) + return suggestions + + def _keyphrase2uris(self, keyphrase): + keyphrase = self._lemmatize_phrase(keyphrase) + keyphrase = self._sort_phrase(keyphrase) + uris = [] + uris.extend(self._index.get(keyphrase, [])) + + # Maybe TODO: Search only in hidden labels if not found in pref or alt labels: + # if not uris: + # uris.extend(hidden_label_index.get(mutated_kp, [])) + + # Maybe TODO: if not found, search for part of keyword: + # if not uris and ' ' in keyphrase: + # words = keyphrase.split() + # uris.extend(self._index.get(' '.join(words[:-1]), [])) + # uris.extend(self._index.get(' '.join(words[1:]), [])) + return uris + + def _transform_score(self, score): + return 1.0 / (3*score + 1) + + def _combine_suggestions(self, suggestions): + combined_suggestions = {} + for uri, label, score in suggestions: + if uri not in combined_suggestions: + combined_suggestions[uri] = (label, score) + else: + old_score = combined_suggestions[uri][1] + conflated_score = self._conflate_scores(score, old_score) + combined_suggestions[uri] = (label, conflated_score) + combined_suggestions = [(uri, *label_score) for uri, label_score + in combined_suggestions.items()] + return combined_suggestions + + def _conflate_scores(self, score1, score2): + # https://stats.stackexchange.com/questions/194878/combining-two-probability-scores/194884 + # return min(1, score1 + score2) + # return min(1.0, (score1**2 + 
score2**2)**0.5) + # score1 = 0.5 * score1 + 0.5 + # score2 = 0.5 * score2 + 0.5 + return score1 * score2 / (score1 * score2 + (1-score1) * (1-score2)) + + # def _get_node_degrees(self, suggestions): + # connections = [] + # for uri, label, score in suggestions: + # suggestion_neighbours = [] + # u = URIRef(uri) + # suggestion_neighbours.extend( + # [o for o in self.graph.objects(u, SKOS.broader)]) + # suggestion_neighbours.extend( + # [o for o in self.graph.objects(u, SKOS.narrower)]) + # #suggestion_neighbours.extend([o for o in graph.objects(u, SKOS.related)]) + # connections.append((u, suggestion_neighbours)) + + # node_degrees = [] + # for uri, label, score in suggestions: + # u = URIRef(uri) + # cnt = 0 + # for neighbour, suggestion_neighbours in connections: + # if u == neighbour: + # # print('SELF') + # continue + # if u in suggestion_neighbours: + # # print('HIT') + # cnt += 1 + # node_degrees.append(cnt) # / len(suggestion_neighbours)) + # return node_degrees + + # def _modify_scores(self, suggestions, node_degrees, scale): + # modified_suggestions = [] + # for suggestion, node_degree in zip(suggestions, node_degrees): + # modified_suggestions.append( + # (suggestion[0], suggestion[1], + # float(suggestion[2]) + scale * node_degree)) + # return modified_suggestions + + def _suggest(self, text, params): + self.debug( + f'Suggesting subjects for text "{text[:20]}..." (len={len(text)})') + limit = int(params['limit']) + + keywords = self._kw_extractor.extract_keywords(text) + suggestions = self._keyphrases2suggestions(keywords) + + # node_degrees = self._get_node_degrees(suggestions) + # suggestions = self._modify_scores(suggestions, node_degrees, scale=0.01) + + subject_suggestions = [SubjectSuggestion( + uri=uri, + label=label, + notation=None, # TODO Should notation be fetched to here? 
+ score=score) + for uri, label, score in suggestions[:limit] if score > 0.0] + return ListSuggestionResult.create_from_index(subject_suggestions, + self.project.subjects) diff --git a/projects.cfg.dist b/projects.cfg.dist index 6baac2a6d..fa5625a2f 100644 --- a/projects.cfg.dist +++ b/projects.cfg.dist @@ -111,6 +111,14 @@ backend=omikuji analyzer=snowball(english) vocab=yso-en +[yake-fi] +name=YAKE Finnish +language=fi +backend=yake +vocab=yso-fi +analyzer=voikko(fi) +input_limit=20000 + [ensemble-fi] name=Ensemble Finnish language=fi diff --git a/setup.py b/setup.py index 4f41bdcf5..730b736de 100644 --- a/setup.py +++ b/setup.py @@ -44,6 +44,7 @@ def read(fname): 'vw': ['vowpalwabbit==8.8.1'], 'nn': ['tensorflow-cpu==2.3.1', 'lmdb==1.0.0'], 'omikuji': ['omikuji==0.3.*'], + 'yake': ['yake @ git+https://github.com/LIAAD/yake@v0.4.3'], 'dev': [ 'codecov', 'pytest-cov', From 45b18a668989f8795a483058ad8a6d547497fcb8 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Tue, 12 Jan 2021 19:50:51 +0200 Subject: [PATCH 02/57] Cleanup & pep8 fixes --- annif/backend/yake.py | 78 ++++--------------------------------------- 1 file changed, 6 insertions(+), 72 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 654e77a22..962677b77 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -3,11 +3,9 @@ import yake import os.path -import re from collections import defaultdict from rdflib.namespace import SKOS, RDF, OWL, URIRef import rdflib -from nltk.corpus import stopwords from . 
import backend from annif.suggestion import SubjectSuggestion, ListSuggestionResult @@ -42,7 +40,6 @@ def is_trained(self): def initialize(self): self._initialize_index() - # self.graph self._kw_extractor = yake.KeywordExtractor( lan=self.project.language, n=self.params['max_ngram_size'], @@ -56,9 +53,9 @@ def _initialize_index(self): if self._index is None: path = os.path.join(self.datadir, self.INDEX_FILE) if os.path.exists(path): - self.info('Loading index from {}'.format(path)) self._index = self._load_index(path) - self.info(f'Loaded index with {len(self._index)} labels') + self.info( + f'Loaded index from {path} with {len(self._index)} labels') else: self.info('Creating index') self._create_index() @@ -68,6 +65,8 @@ def _initialize_index(self): @property def graph(self): if self._graph is None: + # TODO use as_graph() that is now available + # self._graph = vocab.as_graph() self._graph = rdflib.Graph() path = os.path.join(self.project.vocab.datadir, 'subjects.ttl') self.info('Loading graph from {}'.format(path)) @@ -80,17 +79,14 @@ def _create_index(self): index = defaultdict(list) for predicate in [SKOS.prefLabel]: #, SKOS.altLabel, SKOS.hiddenLabel]: for concept in self.graph.subjects(RDF.type, SKOS.Concept): - if (concept, OWL.deprecated, rdflib.Literal(True)) in self.graph: + if (concept, OWL.deprecated, rdflib.Literal(True)) \ + in self.graph: continue for label in self.graph.objects(concept, predicate): if not label.language == self.project.language: continue uri = str(concept) label = str(label) - # This really is useful: Disambiguate by dropping ambigious labels - # if label[-1] == ')': - # continue - # label = re.sub(r' \(.*\)', '', label) # Remove specifier lemmatized_label = self._lemmatize_phrase(label) lemmatized_label = self._sort_phrase(lemmatized_label) index[lemmatized_label].append(uri) @@ -117,30 +113,18 @@ def _sort_phrase(self, phrase): return ' '.join(sorted(words)) def _lemmatize_phrase(self, phrase): - # if self.project.language == 
'fi': - # lan_stopwords = set(stopwords.words('finnish')) - # elif self.project.language == 'en': - # stopwords = set(stopwords.words('english')) normalized = [] - # phrase = re.sub(r'\W+', '', phrase) for word in phrase.split(): - # if word in lan_stopwords: - # continue normalized.append( self.project.analyzer.normalize_word(word).lower()) return ' '.join(normalized) - def _sort_phrase(self, phrase): - words = phrase.split() - return ' '.join(sorted(words)) - def _keyphrases2suggestions(self, keyphrases): suggestions = [] not_matched = [] for kp, score in keyphrases: uris = self._keyphrase2uris(kp) for uri in uris: - # Its faster to get label from Annif subject index than from graph (but is even this needed?) label = self.project.subjects.uris_to_labels([uri])[0] suggestions.append( (uri, label, self._transform_score(score))) @@ -159,15 +143,6 @@ def _keyphrase2uris(self, keyphrase): uris = [] uris.extend(self._index.get(keyphrase, [])) - # Maybe TODO: Search only in hidden labels if not found in pref or alt labels: - # if not uris: - # uris.extend(hidden_label_index.get(mutated_kp, [])) - - # Maybe TODO: if not found, search for part of keyword: - # if not uris and ' ' in keyphrase: - # words = keyphrase.split() - # uris.extend(self._index.get(' '.join(words[:-1]), [])) - # uris.extend(self._index.get(' '.join(words[1:]), [])) return uris def _transform_score(self, score): @@ -188,46 +163,8 @@ def _combine_suggestions(self, suggestions): def _conflate_scores(self, score1, score2): # https://stats.stackexchange.com/questions/194878/combining-two-probability-scores/194884 - # return min(1, score1 + score2) - # return min(1.0, (score1**2 + score2**2)**0.5) - # score1 = 0.5 * score1 + 0.5 - # score2 = 0.5 * score2 + 0.5 return score1 * score2 / (score1 * score2 + (1-score1) * (1-score2)) - # def _get_node_degrees(self, suggestions): - # connections = [] - # for uri, label, score in suggestions: - # suggestion_neighbours = [] - # u = URIRef(uri) - # 
suggestion_neighbours.extend( - # [o for o in self.graph.objects(u, SKOS.broader)]) - # suggestion_neighbours.extend( - # [o for o in self.graph.objects(u, SKOS.narrower)]) - # #suggestion_neighbours.extend([o for o in graph.objects(u, SKOS.related)]) - # connections.append((u, suggestion_neighbours)) - - # node_degrees = [] - # for uri, label, score in suggestions: - # u = URIRef(uri) - # cnt = 0 - # for neighbour, suggestion_neighbours in connections: - # if u == neighbour: - # # print('SELF') - # continue - # if u in suggestion_neighbours: - # # print('HIT') - # cnt += 1 - # node_degrees.append(cnt) # / len(suggestion_neighbours)) - # return node_degrees - - # def _modify_scores(self, suggestions, node_degrees, scale): - # modified_suggestions = [] - # for suggestion, node_degree in zip(suggestions, node_degrees): - # modified_suggestions.append( - # (suggestion[0], suggestion[1], - # float(suggestion[2]) + scale * node_degree)) - # return modified_suggestions - def _suggest(self, text, params): self.debug( f'Suggesting subjects for text "{text[:20]}..." 
(len={len(text)})') @@ -236,9 +173,6 @@ def _suggest(self, text, params): keywords = self._kw_extractor.extract_keywords(text) suggestions = self._keyphrases2suggestions(keywords) - # node_degrees = self._get_node_degrees(suggestions) - # suggestions = self._modify_scores(suggestions, node_degrees, scale=0.01) - subject_suggestions = [SubjectSuggestion( uri=uri, label=label, From 099553561d65878d9044beaeb3091bd6b5bce151 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Tue, 12 Jan 2021 19:55:08 +0200 Subject: [PATCH 03/57] Increase keyphrase word number to 4 --- annif/backend/yake.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 962677b77..449c48368 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -21,7 +21,7 @@ class YakeBackend(backend.AnnifBackend): INDEX_FILE = 'yake-index' DEFAULT_PARAMETERS = { - 'max_ngram_size': 3, + 'max_ngram_size': 4, 'deduplication_threshold': 0.9, 'deduplication_algo': 'levs', 'window_size': 1, From 1c6e0e859da838b45e938fa9f3df1045b37fd158 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Tue, 12 Jan 2021 19:57:14 +0200 Subject: [PATCH 04/57] Use sets of uris instead of lists of uris in index --- annif/backend/yake.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 449c48368..0ec013603 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -76,8 +76,8 @@ def graph(self): def _create_index(self): # TODO Should index creation be done on loadvoc command? # TODO American to British labels? 
- index = defaultdict(list) for predicate in [SKOS.prefLabel]: #, SKOS.altLabel, SKOS.hiddenLabel]: + index = defaultdict(set) for concept in self.graph.subjects(RDF.type, SKOS.Concept): if (concept, OWL.deprecated, rdflib.Literal(True)) \ in self.graph: @@ -89,7 +89,7 @@ def _create_index(self): label = str(label) lemmatized_label = self._lemmatize_phrase(label) lemmatized_label = self._sort_phrase(lemmatized_label) - index[lemmatized_label].append(uri) + index[lemmatized_label].add(uri) index.pop('', None) # Remove possible empty string entry self._index = dict(index) From 4bf08dac5978c8f6e53a8f51d3e21b074c6af6db Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Tue, 12 Jan 2021 19:59:34 +0200 Subject: [PATCH 05/57] Put also alt and hidden labels in index instead of just prefs --- annif/backend/yake.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 0ec013603..17ab49487 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -76,8 +76,8 @@ def graph(self): def _create_index(self): # TODO Should index creation be done on loadvoc command? # TODO American to British labels? 
- for predicate in [SKOS.prefLabel]: #, SKOS.altLabel, SKOS.hiddenLabel]: index = defaultdict(set) + for predicate in [SKOS.prefLabel, SKOS.altLabel, SKOS.hiddenLabel]: for concept in self.graph.subjects(RDF.type, SKOS.Concept): if (concept, OWL.deprecated, rdflib.Literal(True)) \ in self.graph: From 85eb8f7df35686ab96977e87f88f41349d3b1bc4 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Tue, 12 Jan 2021 20:01:48 +0200 Subject: [PATCH 06/57] More straightforward score transformation --- annif/backend/yake.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 17ab49487..ad262c4c5 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -146,7 +146,8 @@ def _keyphrase2uris(self, keyphrase): return uris def _transform_score(self, score): - return 1.0 / (3*score + 1) + # TODO if score<0: + return 1.0 / (score + 1) def _combine_suggestions(self, suggestions): combined_suggestions = {} From 5a97da55a2de60a23eabe93645b831a8a56a9285 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Tue, 12 Jan 2021 20:08:05 +0200 Subject: [PATCH 07/57] Shorten&simplify code --- annif/backend/yake.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index ad262c4c5..20e7ec1ea 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -74,8 +74,8 @@ def graph(self): return self._graph def _create_index(self): - # TODO Should index creation be done on loadvoc command? - # TODO American to British labels? + # TODO Should index creation & saving be done on loadvoc command? + # Or saving at all? 
It takes about 1 min to create the index index = defaultdict(set) for predicate in [SKOS.prefLabel, SKOS.altLabel, SKOS.hiddenLabel]: for concept in self.graph.subjects(RDF.type, SKOS.Concept): @@ -104,8 +104,7 @@ def _load_index(self, path): with open(path, 'r', encoding='utf-8') as indexfile: for line in indexfile: label, uris = line.strip().split('\t') - uris = uris.split() - index[label] = uris + index[label] = uris.split() return index def _sort_phrase(self, phrase): @@ -140,10 +139,7 @@ def _keyphrases2suggestions(self, keyphrases): def _keyphrase2uris(self, keyphrase): keyphrase = self._lemmatize_phrase(keyphrase) keyphrase = self._sort_phrase(keyphrase) - uris = [] - uris.extend(self._index.get(keyphrase, [])) - - return uris + return self._index.get(keyphrase, []) def _transform_score(self, score): # TODO if score<0: From 583272a84fa6859d72e1ef76edde2425f424a53c Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Tue, 12 Jan 2021 20:57:34 +0200 Subject: [PATCH 08/57] Remove unused import --- annif/backend/yake.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 20e7ec1ea..660323f54 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -4,7 +4,7 @@ import yake import os.path from collections import defaultdict -from rdflib.namespace import SKOS, RDF, OWL, URIRef +from rdflib.namespace import SKOS, RDF, OWL import rdflib from . 
import backend from annif.suggestion import SubjectSuggestion, ListSuggestionResult From db9d4b064e8781fe6c24f90ee004e9b53d3f0d80 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Tue, 26 Jan 2021 13:53:50 +0200 Subject: [PATCH 09/57] Load graph using as_graph method of the vocab module --- annif/backend/yake.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 660323f54..2e5ee516b 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -65,12 +65,8 @@ def _initialize_index(self): @property def graph(self): if self._graph is None: - # TODO use as_graph() that is now available - # self._graph = vocab.as_graph() - self._graph = rdflib.Graph() - path = os.path.join(self.project.vocab.datadir, 'subjects.ttl') - self.info('Loading graph from {}'.format(path)) - self._graph.load(path, format=rdflib.util.guess_format(path)) + self.info('Loading graph') + self._graph = self.project.vocab.as_graph() return self._graph def _create_index(self): From b6979187b7c4ade05b03f6c8087029136604eb5e Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Tue, 26 Jan 2021 16:02:07 +0200 Subject: [PATCH 10/57] Configurable label types for index creation --- annif/backend/yake.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 2e5ee516b..79526b749 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -8,6 +8,7 @@ import rdflib from . 
import backend from annif.suggestion import SubjectSuggestion, ListSuggestionResult +from annif.exception import ConfigurationException class YakeBackend(backend.AnnifBackend): @@ -27,6 +28,7 @@ class YakeBackend(backend.AnnifBackend): 'window_size': 1, 'num_keywords': 100, 'features': None, + 'default_label_types': ['pref', 'alt'] } def default_params(self): @@ -38,6 +40,21 @@ def default_params(self): def is_trained(self): return True + @property + def label_types(self): + mapping = {'pref': SKOS.prefLabel, + 'alt': SKOS.altLabel, + 'hidden': SKOS.hiddenLabel} + if 'label_types' in self.params: + lt_entries = self.params['label_types'].split(',') + try: + return [mapping[lt.strip()] for lt in lt_entries] + except KeyError as err: + raise ConfigurationException( + f'invalid label type {err}', backend_id=self.backend_id) + else: + return [mapping[lt] for lt in self.params['default_label_types']] + def initialize(self): self._initialize_index() self._kw_extractor = yake.KeywordExtractor( @@ -73,12 +90,12 @@ def _create_index(self): # TODO Should index creation & saving be done on loadvoc command? # Or saving at all? 
It takes about 1 min to create the index index = defaultdict(set) - for predicate in [SKOS.prefLabel, SKOS.altLabel, SKOS.hiddenLabel]: + for label_type in self.label_types: for concept in self.graph.subjects(RDF.type, SKOS.Concept): if (concept, OWL.deprecated, rdflib.Literal(True)) \ in self.graph: continue - for label in self.graph.objects(concept, predicate): + for label in self.graph.objects(concept, label_type): if not label.language == self.project.language: continue uri = str(concept) From 630af0476490573562ec020d2ebf280cb3dce6c9 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Tue, 26 Jan 2021 17:18:27 +0200 Subject: [PATCH 11/57] Dont unnecessarily pass a label to SubjectSuggestion --- annif/backend/yake.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 79526b749..79a77f49e 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -185,8 +185,8 @@ def _suggest(self, text, params): subject_suggestions = [SubjectSuggestion( uri=uri, - label=label, - notation=None, # TODO Should notation be fetched to here? 
+ label=None, + notation=None, score=score) for uri, label, score in suggestions[:limit] if score > 0.0] return ListSuggestionResult.create_from_index(subject_suggestions, From 44611e90769ffc384e3796d9c9f3aad08f5f9e67 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Thu, 28 Jan 2021 16:26:52 +0200 Subject: [PATCH 12/57] Replace negative Yake scores with zero --- annif/backend/yake.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 79a77f49e..201bbbe57 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -155,7 +155,9 @@ def _keyphrase2uris(self, keyphrase): return self._index.get(keyphrase, []) def _transform_score(self, score): - # TODO if score<0: + if score < 0: + self.debug(f'Replacing negative YAKE score {score} with zero') + return 1.0 return 1.0 / (score + 1) def _combine_suggestions(self, suggestions): From 4169945c07b25af801d89f99221bf1b5dd2d7354 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Thu, 28 Jan 2021 17:34:18 +0200 Subject: [PATCH 13/57] Omit processing vocabulary label (it's fetched when creating ListSuggResult) --- annif/backend/yake.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 201bbbe57..9951f3509 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -137,9 +137,8 @@ def _keyphrases2suggestions(self, keyphrases): for kp, score in keyphrases: uris = self._keyphrase2uris(kp) for uri in uris: - label = self.project.subjects.uris_to_labels([uri])[0] suggestions.append( - (uri, label, self._transform_score(score))) + (uri, self._transform_score(score))) if not uris: not_matched.append((kp, self._transform_score(score))) # Remove duplicate uris, combining the scores @@ -162,16 +161,14 @@ def _transform_score(self, score): def _combine_suggestions(self, suggestions): combined_suggestions = {} - for uri, label, score in suggestions: + for uri, score in 
suggestions: if uri not in combined_suggestions: - combined_suggestions[uri] = (label, score) + combined_suggestions[uri] = score else: - old_score = combined_suggestions[uri][1] - conflated_score = self._conflate_scores(score, old_score) - combined_suggestions[uri] = (label, conflated_score) - combined_suggestions = [(uri, *label_score) for uri, label_score - in combined_suggestions.items()] - return combined_suggestions + old_score = combined_suggestions[uri] + combined_suggestions[uri] = self._conflate_scores( + score, old_score) + return list(combined_suggestions.items()) def _conflate_scores(self, score1, score2): # https://stats.stackexchange.com/questions/194878/combining-two-probability-scores/194884 @@ -190,6 +187,6 @@ def _suggest(self, text, params): label=None, notation=None, score=score) - for uri, label, score in suggestions[:limit] if score > 0.0] + for uri, score in suggestions[:limit] if score > 0.0] return ListSuggestionResult.create_from_index(subject_suggestions, self.project.subjects) From ea8d0059fead44ff10374c75fe885c42cc41e528 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Thu, 28 Jan 2021 17:44:33 +0200 Subject: [PATCH 14/57] Improve variable names; remove unnecessary comments --- annif/backend/yake.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 9951f3509..40fef2e79 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -141,11 +141,11 @@ def _keyphrases2suggestions(self, keyphrases): (uri, self._transform_score(score))) if not uris: not_matched.append((kp, self._transform_score(score))) - # Remove duplicate uris, combining the scores + # Remove duplicate uris, conflating the scores suggestions = self._combine_suggestions(suggestions) self.debug('Keyphrases not matched:\n' + '\t'.join( - [x[0] + ' ' + str(x[1]) for x - in sorted(not_matched, reverse=True, key=lambda x: x[1])])) + [kp[0] + ' ' + str(kp[1]) for kp + in sorted(not_matched, 
reverse=True, key=lambda kp: kp[1])])) return suggestions def _keyphrase2uris(self, keyphrase): @@ -171,7 +171,6 @@ def _combine_suggestions(self, suggestions): return list(combined_suggestions.items()) def _conflate_scores(self, score1, score2): - # https://stats.stackexchange.com/questions/194878/combining-two-probability-scores/194884 return score1 * score2 / (score1 * score2 + (1-score1) * (1-score2)) def _suggest(self, text, params): @@ -179,8 +178,8 @@ def _suggest(self, text, params): f'Suggesting subjects for text "{text[:20]}..." (len={len(text)})') limit = int(params['limit']) - keywords = self._kw_extractor.extract_keywords(text) - suggestions = self._keyphrases2suggestions(keywords) + keyphrases = self._kw_extractor.extract_keywords(text) + suggestions = self._keyphrases2suggestions(keyphrases) subject_suggestions = [SubjectSuggestion( uri=uri, From 3d0afb90e0edbf32b42d4cb8035c2234be3b95b4 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Fri, 29 Jan 2021 17:26:28 +0200 Subject: [PATCH 15/57] Get language for label picking from params (to ease testing) --- annif/backend/yake.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 40fef2e79..305890ad3 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -58,7 +58,7 @@ def label_types(self): def initialize(self): self._initialize_index() self._kw_extractor = yake.KeywordExtractor( - lan=self.project.language, + lan=self.params['language'], n=self.params['max_ngram_size'], dedupLim=self.params['deduplication_threshold'], dedupFunc=self.params['deduplication_algo'], @@ -96,7 +96,7 @@ def _create_index(self): in self.graph: continue for label in self.graph.objects(concept, label_type): - if not label.language == self.project.language: + if not label.language == self.params['language']: continue uri = str(concept) label = str(label) From f9354a0afadda5aafb23c474b7025a2b00e2b28b Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: 
Fri, 29 Jan 2021 17:27:01 +0200 Subject: [PATCH 16/57] Config switch for removing a specifier in parenthesis from index labels --- annif/backend/yake.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 305890ad3..2edd67747 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -3,9 +3,11 @@ import yake import os.path +import re from collections import defaultdict from rdflib.namespace import SKOS, RDF, OWL import rdflib +import annif.util from . import backend from annif.suggestion import SubjectSuggestion, ListSuggestionResult from annif.exception import ConfigurationException @@ -28,7 +30,8 @@ class YakeBackend(backend.AnnifBackend): 'window_size': 1, 'num_keywords': 100, 'features': None, - 'default_label_types': ['pref', 'alt'] + 'default_label_types': ['pref', 'alt'], + 'remove_specifiers': False } def default_params(self): @@ -100,6 +103,8 @@ def _create_index(self): continue uri = str(concept) label = str(label) + if annif.util.boolean(self.params['remove_specifiers']): + label = re.sub(r' \(.*\)', '', label) lemmatized_label = self._lemmatize_phrase(label) lemmatized_label = self._sort_phrase(lemmatized_label) index[lemmatized_label].add(uri) From 7116aa00d130007ecd7cbfe7e669de08bcce77da Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Fri, 29 Jan 2021 17:36:53 +0200 Subject: [PATCH 17/57] Test for suggest method of Yake --- tests/test_backend_yake.py | 41 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100755 tests/test_backend_yake.py diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py new file mode 100755 index 000000000..372c8485f --- /dev/null +++ b/tests/test_backend_yake.py @@ -0,0 +1,41 @@ +"""Unit tests for the Yake backend in Annif""" + +import annif +import pytest +import os +from rdflib import Graph + + +@pytest.fixture(scope='module') +def graph_project(project): + _rdf_file_path = os.path.join( + 
os.path.dirname(__file__), + 'corpora', + 'archaeology', + 'yso-archaeology.rdf') + g = Graph() + g.load(_rdf_file_path) + project.vocab.as_graph.return_value = g + return project + + +def test_yake_suggest(project, graph_project): + yake_type = annif.backend.get_backend('yake') + yake = yake_type( + backend_id='yake', + config_params={'limit': 8, 'language': 'fi'}, + project=graph_project) + + results = yake.suggest("""Arkeologia on tieteenala, jota sanotaan joskus + muinaistutkimukseksi tai muinaistieteeksi. Se on humanistinen tiede + tai oikeammin joukko tieteitä, jotka tutkivat ihmisen menneisyyttä. + Tutkimusta tehdään analysoimalla muinaisjäännöksiä eli niitä jälkiä, + joita ihmisten toiminta on jättänyt maaperään tai vesistöjen + pohjaan.""") + + assert len(results) > 0 + assert len(results) <= 8 + hits = results.as_list(project.subjects) + assert 'http://www.yso.fi/onto/yso/p1265' in [ + result.uri for result in hits] + assert 'arkeologia' in [result.label for result in hits] From 154ae55142bbb5076cbf293f9b50b4bcadb65135 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Fri, 29 Jan 2021 18:03:45 +0200 Subject: [PATCH 18/57] Skip Yake tests when Yake is not installed (Python 3.7 in Travis) --- tests/test_backend_yake.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py index 372c8485f..455b37c54 100755 --- a/tests/test_backend_yake.py +++ b/tests/test_backend_yake.py @@ -5,6 +5,8 @@ import os from rdflib import Graph +pytest.importorskip("annif.backend.yake") + @pytest.fixture(scope='module') def graph_project(project): From 9ce122f0ca855f4928e7e0436474bf32610ddee6 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Mon, 1 Feb 2021 10:04:03 +0200 Subject: [PATCH 19/57] Try to simplify _create_index method; reorder methods --- annif/backend/yake.py | 98 +++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py 
index 2edd67747..265f5a5ce 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -58,6 +58,13 @@ def label_types(self): else: return [mapping[lt] for lt in self.params['default_label_types']] + @property + def graph(self): + if self._graph is None: + self.info('Loading graph') + self._graph = self.project.vocab.as_graph() + return self._graph + def initialize(self): self._initialize_index() self._kw_extractor = yake.KeywordExtractor( @@ -82,35 +89,6 @@ def _initialize_index(self): self._save_index(path) self.info(f'Created index with {len(self._index)} labels') - @property - def graph(self): - if self._graph is None: - self.info('Loading graph') - self._graph = self.project.vocab.as_graph() - return self._graph - - def _create_index(self): - # TODO Should index creation & saving be done on loadvoc command? - # Or saving at all? It takes about 1 min to create the index - index = defaultdict(set) - for label_type in self.label_types: - for concept in self.graph.subjects(RDF.type, SKOS.Concept): - if (concept, OWL.deprecated, rdflib.Literal(True)) \ - in self.graph: - continue - for label in self.graph.objects(concept, label_type): - if not label.language == self.params['language']: - continue - uri = str(concept) - label = str(label) - if annif.util.boolean(self.params['remove_specifiers']): - label = re.sub(r' \(.*\)', '', label) - lemmatized_label = self._lemmatize_phrase(label) - lemmatized_label = self._sort_phrase(lemmatized_label) - index[lemmatized_label].add(uri) - index.pop('', None) # Remove possible empty string entry - self._index = dict(index) - def _save_index(self, path): with open(path, 'w', encoding='utf-8') as indexfile: for label, uris in self._index.items(): @@ -125,9 +103,27 @@ def _load_index(self, path): index[label] = uris.split() return index - def _sort_phrase(self, phrase): - words = phrase.split() - return ' '.join(sorted(words)) + def _create_index(self): + index = defaultdict(set) + for concept in 
self.graph.subjects(RDF.type, SKOS.Concept): + if (concept, OWL.deprecated, rdflib.Literal(True)) in self.graph: + continue + uri = str(concept) + for label_type in self.label_types: + for label in self.graph.objects(concept, label_type): + if not label.language == self.params['language']: + continue + label = self._normalize_label(label) + index[label].add(uri) + index.pop('', None) # Remove possible empty string entry + self._index = dict(index) + + def _normalize_label(self, label): + label = str(label) + if annif.util.boolean(self.params['remove_specifiers']): + label = re.sub(r' \(.*\)', '', label) + lemmatized_label = self._lemmatize_phrase(label) + return self._sort_phrase(lemmatized_label) def _lemmatize_phrase(self, phrase): normalized = [] @@ -136,6 +132,27 @@ def _lemmatize_phrase(self, phrase): self.project.analyzer.normalize_word(word).lower()) return ' '.join(normalized) + def _sort_phrase(self, phrase): + words = phrase.split() + return ' '.join(sorted(words)) + + def _suggest(self, text, params): + self.debug( + f'Suggesting subjects for text "{text[:20]}..." (len={len(text)})') + limit = int(params['limit']) + + keyphrases = self._kw_extractor.extract_keywords(text) + suggestions = self._keyphrases2suggestions(keyphrases) + + subject_suggestions = [SubjectSuggestion( + uri=uri, + label=None, + notation=None, + score=score) + for uri, score in suggestions[:limit] if score > 0.0] + return ListSuggestionResult.create_from_index(subject_suggestions, + self.project.subjects) + def _keyphrases2suggestions(self, keyphrases): suggestions = [] not_matched = [] @@ -177,20 +194,3 @@ def _combine_suggestions(self, suggestions): def _conflate_scores(self, score1, score2): return score1 * score2 / (score1 * score2 + (1-score1) * (1-score2)) - - def _suggest(self, text, params): - self.debug( - f'Suggesting subjects for text "{text[:20]}..." 
(len={len(text)})') - limit = int(params['limit']) - - keyphrases = self._kw_extractor.extract_keywords(text) - suggestions = self._keyphrases2suggestions(keyphrases) - - subject_suggestions = [SubjectSuggestion( - uri=uri, - label=None, - notation=None, - score=score) - for uri, score in suggestions[:limit] if score > 0.0] - return ListSuggestionResult.create_from_index(subject_suggestions, - self.project.subjects) From 36514889980591fa348255143fc6f09f452a597c Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Wed, 3 Feb 2021 19:19:00 +0200 Subject: [PATCH 20/57] Better name for option (remove_parentheses) --- annif/backend/yake.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 265f5a5ce..7ceb57fb9 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -31,7 +31,7 @@ class YakeBackend(backend.AnnifBackend): 'num_keywords': 100, 'features': None, 'default_label_types': ['pref', 'alt'], - 'remove_specifiers': False + 'remove_parentheses': False } def default_params(self): @@ -120,7 +120,7 @@ def _create_index(self): def _normalize_label(self, label): label = str(label) - if annif.util.boolean(self.params['remove_specifiers']): + if annif.util.boolean(self.params['remove_parentheses']): label = re.sub(r' \(.*\)', '', label) lemmatized_label = self._lemmatize_phrase(label) return self._sort_phrase(lemmatized_label) From 4b5d73031102064e0b461a59747a43421caa8cb0 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Fri, 5 Feb 2021 11:40:32 +0200 Subject: [PATCH 21/57] Install Yake from PyPI, not from GitHub --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 730b736de..dc7b9d622 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ def read(fname): 'vw': ['vowpalwabbit==8.8.1'], 'nn': ['tensorflow-cpu==2.3.1', 'lmdb==1.0.0'], 'omikuji': ['omikuji==0.3.*'], - 'yake': ['yake @ git+https://github.com/LIAAD/yake@v0.4.3'], + 'yake': 
['yake==0.4.3'], 'dev': [ 'codecov', 'pytest-cov', From b3578ef0e2bd9db309a31abcbcf18014d61c879f Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Fri, 5 Feb 2021 11:43:50 +0200 Subject: [PATCH 22/57] Get labels of a concept using a helper method in index creation --- annif/backend/yake.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 7ceb57fb9..dd47411cc 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -109,14 +109,19 @@ def _create_index(self): if (concept, OWL.deprecated, rdflib.Literal(True)) in self.graph: continue uri = str(concept) - for label_type in self.label_types: - for label in self.graph.objects(concept, label_type): - if not label.language == self.params['language']: - continue - label = self._normalize_label(label) - index[label].add(uri) + labels = self._get_concept_labels(concept, self.label_types) + for label in labels: + label = self._normalize_label(label) + index[label].add(uri) index.pop('', None) # Remove possible empty string entry self._index = dict(index) + def _get_concept_labels(self, concept, label_types): + labels = [] + for label_type in label_types: + for label in self.graph.objects(concept, label_type): + if label.language == self.params['language']: + labels.append(label) + return labels def _normalize_label(self, label): label = str(label) From 64e015adeba31021493a6fb28b9d59712d9c64c2 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Fri, 5 Feb 2021 11:46:02 +0200 Subject: [PATCH 23/57] Return index from create_index() instead of setting the index field in the method --- annif/backend/yake.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index dd47411cc..606ef3533 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -85,7 +85,7 @@ def _initialize_index(self): f'Loaded index from {path} with {len(self._index)} labels') else: self.info('Creating
index') - self._create_index() + self._index = self._create_index() self._save_index(path) self.info(f'Created index with {len(self._index)} labels') @@ -114,7 +114,8 @@ def _create_index(self): label = self._normalize_label(label) index[label].add(uri) index.pop('', None) # Remove possible empty string entry - self._index = dict(index) + return dict(index) + def _get_concept_labels(self, concept, label_types): labels = [] for label_type in label_types: From 77410974e5e218235ee4e5672997cbd0c8159c7f Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Fri, 5 Feb 2021 13:04:46 +0200 Subject: [PATCH 24/57] Combine scores using "additive conflation" --- annif/backend/yake.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 606ef3533..856a52b4b 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -194,9 +194,13 @@ def _combine_suggestions(self, suggestions): combined_suggestions[uri] = score else: old_score = combined_suggestions[uri] - combined_suggestions[uri] = self._conflate_scores( + combined_suggestions[uri] = self._combine_scores( score, old_score) return list(combined_suggestions.items()) - def _conflate_scores(self, score1, score2): - return score1 * score2 / (score1 * score2 + (1-score1) * (1-score2)) + def _combine_scores(self, score1, score2): + # The result is never smaller than the greater input + score1 = score1/2 + 0.5 + score2 = score2/2 + 0.5 + confl = score1 * score2 / (score1 * score2 + (1-score1) * (1-score2)) + return (confl-0.5) * 2 From 5bde900f236a385dc2cb54ca4046902c8082976b Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Fri, 5 Feb 2021 13:13:37 +0200 Subject: [PATCH 25/57] Create Yake object on suggest (allows setting Yake params on runtime) --- annif/backend/yake.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 856a52b4b..4d6ce74ae 100755 --- 
a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -67,14 +67,6 @@ def graph(self): def initialize(self): self._initialize_index() - self._kw_extractor = yake.KeywordExtractor( - lan=self.params['language'], - n=self.params['max_ngram_size'], - dedupLim=self.params['deduplication_threshold'], - dedupFunc=self.params['deduplication_algo'], - windowsSize=self.params['window_size'], - top=self.params['num_keywords'], - features=self.params['features']) def _initialize_index(self): if self._index is None: @@ -147,6 +139,14 @@ def _suggest(self, text, params): f'Suggesting subjects for text "{text[:20]}..." (len={len(text)})') limit = int(params['limit']) + self._kw_extractor = yake.KeywordExtractor( + lan=params['language'], + n=int(params['max_ngram_size']), + dedupLim=float(params['deduplication_threshold']), + dedupFunc=params['deduplication_algo'], + windowsSize=int(params['window_size']), + top=int(params['num_keywords']), + features=self.params['features']) keyphrases = self._kw_extractor.extract_keywords(text) suggestions = self._keyphrases2suggestions(keyphrases) From b2c08cfcfcda1ea4206c72ccff59c54f075583b0 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Fri, 5 Feb 2021 14:26:55 +0200 Subject: [PATCH 26/57] Avoid crash for empty or non-alphanumeric input --- annif/backend/yake.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 4d6ce74ae..88130beb1 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -139,6 +139,10 @@ def _suggest(self, text, params): f'Suggesting subjects for text "{text[:20]}..." 
(len={len(text)})') limit = int(params['limit']) + alphanum = re.compile('[^a-zA-Z0-9]') + if len(re.sub(alphanum, '', text)) == 0: + return ListSuggestionResult([]) + self._kw_extractor = yake.KeywordExtractor( lan=params['language'], n=int(params['max_ngram_size']), From daee74b2cbdf7ada55bb7798807da11e8a373ae0 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Fri, 5 Feb 2021 14:27:19 +0200 Subject: [PATCH 27/57] Add unit tests --- tests/test_backend_yake.py | 108 +++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py index 455b37c54..aa7e22dba 100755 --- a/tests/test_backend_yake.py +++ b/tests/test_backend_yake.py @@ -4,6 +4,8 @@ import pytest import os from rdflib import Graph +import annif.backend + pytest.importorskip("annif.backend.yake") @@ -41,3 +43,109 @@ def test_yake_suggest(project, graph_project): assert 'http://www.yso.fi/onto/yso/p1265' in [ result.uri for result in hits] assert 'arkeologia' in [result.label for result in hits] + + +def test_yake_suggest_non_alphanum_text(project, graph_project): + yake_type = annif.backend.get_backend('yake') + yake = yake_type( + backend_id='yake', + config_params={'limit': 8, 'language': 'fi'}, + project=graph_project) + + results = yake.suggest(".,!") + assert len(results) == 0 + + +def test_create_index_preflabels(graph_project): + yake_type = annif.backend.get_backend('yake') + yake = yake_type( + backend_id='yake', + config_params={'limit': 8, 'language': 'fi', 'label_types': 'pref'}, + project=graph_project) + index = yake._create_index() + # Some of the 130 prefLabels get merged in lemmatization: + # assyriologit, assyriologia (assyriolog); arkealogit, arkeologia + # (arkeolog); egyptologit, egyptologia (egyptolog) + assert len(index) == 127 + assert 'kalliotaid' in index + assert 'luolamaalauks' not in index + + +def test_create_index_altlabels(graph_project): + yake_type = annif.backend.get_backend('yake') + yake = 
yake_type( + backend_id='yake', + config_params={'limit': 8, 'language': 'fi', 'label_types': 'alt'}, + project=graph_project) + index = yake._create_index() + assert len(index) == 34 + assert 'kalliotaid' not in index + assert 'luolamaalauks' in index + + +def test_create_index_pref_and_altlabels(graph_project): + yake_type = annif.backend.get_backend('yake') + yake = yake_type( + backend_id='yake', + config_params={'limit': 8, 'language': 'fi'}, + project=graph_project) + index = yake._create_index() + assert len(index) == 161 + assert 'kalliotaid' in index + assert 'luolamaalauks' in index + + +def test_create_index_label_languages(graph_project): + yake_type = annif.backend.get_backend('yake') + yake = yake_type( + backend_id='yake', + config_params={'limit': 8, 'language': 'sv', 'label_types': 'pref'}, + project=graph_project) + index = yake._create_index() + assert len(index) == 130 + assert 'kalliotaid' not in index + assert 'bergkonst' in index + assert 'rock art' not in index + + +def test_combine_suggestions_different_uris(project): + yake_type = annif.backend.get_backend('yake') + yake = yake_type( + backend_id='yake', + config_params={'limit': 8, 'language': 'fi'}, + project=project) + + suggestions = [('http://www.yso.fi/onto/yso/p1265', 0.75), + ('http://www.yso.fi/onto/yso/p1266', 0.25)] + combined = yake._combine_suggestions(suggestions) + assert len(combined) == 2 + assert combined[0] == suggestions[0] + assert combined[1] == suggestions[1] + + +def test_combine_suggestions_same_uri(project): + yake_type = annif.backend.get_backend('yake') + yake = yake_type( + backend_id='yake', + config_params={'limit': 8, 'language': 'fi'}, + project=project) + + combined = yake._combine_suggestions( + [('http://www.yso.fi/onto/yso/p1265', 0.42), + ('http://www.yso.fi/onto/yso/p1265', 0.42)]) + assert len(combined) == 1 + + +def test_combine_scores(project): + yake_type = annif.backend.get_backend('yake') + yake = yake_type( + backend_id='yake', + 
config_params={'limit': 8, 'language': 'fi'}, + project=project) + + assert yake._combine_scores(0.5, 0.5) == 0.8 + assert yake._combine_scores(0.75, 0.75) == 0.96 + assert yake._combine_scores(1.0, 0.424242) == 1.0 + assert yake._combine_scores(1.0, 0.0) == 1.0 + assert yake._combine_scores(0.4, 0.3) == 0.625 + assert yake._combine_scores(0.4, 0.5) == 0.75 From c19e8b76da59cbd61fefc495b6e9ff7538a1d472 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Thu, 11 Feb 2021 10:17:26 +0200 Subject: [PATCH 28/57] Make graph_project a common pytest fixture (move it to conftest.py) --- tests/conftest.py | 14 ++++++++++++++ tests/test_backend_stwfsa.py | 19 ------------------- tests/test_backend_yake.py | 15 --------------- 3 files changed, 14 insertions(+), 34 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 2e3949f47..80a8bfc3b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,6 +5,7 @@ import pytest import py.path import unittest.mock +from rdflib import Graph import annif import annif.analyzer import annif.corpus @@ -116,6 +117,19 @@ def project(subject_index, datadir, registry, vocabulary): return proj +@pytest.fixture(scope='module') +def graph_project(project): + _rdf_file_path = os.path.join( + os.path.dirname(__file__), + 'corpora', + 'archaeology', + 'yso-archaeology.rdf') + g = Graph() + g.load(_rdf_file_path) + project.vocab.as_graph.return_value = g + return project + + @pytest.fixture(scope='module') def app_project(app): with app.app_context(): diff --git a/tests/test_backend_stwfsa.py b/tests/test_backend_stwfsa.py index e27f11c6b..858b2371b 100644 --- a/tests/test_backend_stwfsa.py +++ b/tests/test_backend_stwfsa.py @@ -1,27 +1,8 @@ -import os from annif.backend import get_backend -from rdflib import Graph import annif.corpus from annif.backend.stwfsa import StwfsaBackend from annif.exception import NotInitializedException, NotSupportedException - import pytest -from unittest.mock import Mock - - -@pytest.fixture -def 
graph_project(project): - _rdf_file_path = os.path.join( - os.path.dirname(__file__), - 'corpora', - 'archaeology', - 'yso-archaeology.rdf') - g = Graph() - g.load(_rdf_file_path) - mock_vocab = Mock() - mock_vocab.as_graph.return_value = g - project.vocab = mock_vocab - return project _backend_conf = { diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py index aa7e22dba..2a79c386e 100755 --- a/tests/test_backend_yake.py +++ b/tests/test_backend_yake.py @@ -2,27 +2,12 @@ import annif import pytest -import os -from rdflib import Graph import annif.backend pytest.importorskip("annif.backend.yake") -@pytest.fixture(scope='module') -def graph_project(project): - _rdf_file_path = os.path.join( - os.path.dirname(__file__), - 'corpora', - 'archaeology', - 'yso-archaeology.rdf') - g = Graph() - g.load(_rdf_file_path) - project.vocab.as_graph.return_value = g - return project - - def test_yake_suggest(project, graph_project): yake_type = annif.backend.get_backend('yake') yake = yake_type( From af1129e13cdd4c7112e811641f60a6bb5962fef6 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Thu, 11 Feb 2021 16:14:25 +0200 Subject: [PATCH 29/57] Avoid need for clumsy mapping for labeltypes by using directly SKOS names --- annif/backend/yake.py | 22 ++++++++++------------ tests/test_backend_yake.py | 6 +++--- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 88130beb1..fba3c8863 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -30,7 +30,7 @@ class YakeBackend(backend.AnnifBackend): 'window_size': 1, 'num_keywords': 100, 'features': None, - 'default_label_types': ['pref', 'alt'], + 'default_label_types': ['prefLabel', 'altLabel'], 'remove_parentheses': False } @@ -45,18 +45,16 @@ def is_trained(self): @property def label_types(self): - mapping = {'pref': SKOS.prefLabel, - 'alt': SKOS.altLabel, - 'hidden': SKOS.hiddenLabel} if 'label_types' in self.params: - lt_entries = 
self.params['label_types'].split(',') - try: - return [mapping[lt.strip()] for lt in lt_entries] - except KeyError as err: - raise ConfigurationException( - f'invalid label type {err}', backend_id=self.backend_id) - else: - return [mapping[lt] for lt in self.params['default_label_types']] + lt_entries = [lt.strip() for lt + in self.params['label_types'].split(',')] + valid_types = ('prefLabel', 'altLabel', 'hiddenLabel') + for lt in lt_entries: + if lt not in valid_types: + raise ConfigurationException( + f'invalid label type {lt}', backend_id=self.backend_id) + return [getattr(SKOS, lt) for lt in lt_entries] + return [getattr(SKOS, lt) for lt in self.params['default_label_types']] @property def graph(self): diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py index 2a79c386e..cf077f404 100755 --- a/tests/test_backend_yake.py +++ b/tests/test_backend_yake.py @@ -45,7 +45,7 @@ def test_create_index_preflabels(graph_project): yake_type = annif.backend.get_backend('yake') yake = yake_type( backend_id='yake', - config_params={'limit': 8, 'language': 'fi', 'label_types': 'pref'}, + config_params={'language': 'fi', 'label_types': 'prefLabel'}, project=graph_project) index = yake._create_index() # Some of the 130 prefLabels get merged in lemmatization: @@ -60,7 +60,7 @@ def test_create_index_altlabels(graph_project): yake_type = annif.backend.get_backend('yake') yake = yake_type( backend_id='yake', - config_params={'limit': 8, 'language': 'fi', 'label_types': 'alt'}, + config_params={'language': 'fi', 'label_types': 'altLabel'}, project=graph_project) index = yake._create_index() assert len(index) == 34 @@ -84,7 +84,7 @@ def test_create_index_label_languages(graph_project): yake_type = annif.backend.get_backend('yake') yake = yake_type( backend_id='yake', - config_params={'limit': 8, 'language': 'sv', 'label_types': 'pref'}, + config_params={'limit': 8, 'language': 'sv', 'label_types': 'prefLabel'}, project=graph_project) index = yake._create_index() 
assert len(index) == 130 From dca4474cee77c8ce7ea35e8234dd38a337c2271d Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Thu, 11 Feb 2021 17:04:00 +0200 Subject: [PATCH 30/57] Avoid need for "default_label_types" name for defaults --- annif/backend/yake.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index fba3c8863..10da74409 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -30,7 +30,7 @@ class YakeBackend(backend.AnnifBackend): 'window_size': 1, 'num_keywords': 100, 'features': None, - 'default_label_types': ['prefLabel', 'altLabel'], + 'label_types': ['prefLabel', 'altLabel'], 'remove_parentheses': False } @@ -45,16 +45,17 @@ def is_trained(self): @property def label_types(self): - if 'label_types' in self.params: - lt_entries = [lt.strip() for lt - in self.params['label_types'].split(',')] + if type(self.params['label_types']) == str: # Label types set by user + label_types = [lt.strip() for lt + in self.params['label_types'].split(',')] valid_types = ('prefLabel', 'altLabel', 'hiddenLabel') - for lt in lt_entries: + for lt in label_types: if lt not in valid_types: raise ConfigurationException( f'invalid label type {lt}', backend_id=self.backend_id) - return [getattr(SKOS, lt) for lt in lt_entries] - return [getattr(SKOS, lt) for lt in self.params['default_label_types']] + else: + label_types = self.params['label_types'] # The defaults + return [getattr(SKOS, lt) for lt in label_types] @property def graph(self): From 2917628f86d8956766c1353de32691df3a4c664c Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Fri, 12 Feb 2021 12:56:36 +0200 Subject: [PATCH 31/57] Refactor attempting to resolve complexity complaints by CodeClimate --- annif/backend/yake.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 10da74409..dc6db845b 100755 --- a/annif/backend/yake.py +++
b/annif/backend/yake.py @@ -48,15 +48,17 @@ def label_types(self): if type(self.params['label_types']) == str: # Label types set by user label_types = [lt.strip() for lt in self.params['label_types'].split(',')] - valid_types = ('prefLabel', 'altLabel', 'hiddenLabel') - for lt in label_types: - if lt not in valid_types: - raise ConfigurationException( - f'invalid label type {lt}', backend_id=self.backend_id) + self._validate_label_types(label_types) else: label_types = self.params['label_types'] # The defaults return [getattr(SKOS, lt) for lt in label_types] + def _validate_label_types(self, label_types): + for lt in label_types: + if lt not in ('prefLabel', 'altLabel', 'hiddenLabel'): + raise ConfigurationException( + f'invalid label type {lt}', backend_id=self.backend_id) + @property def graph(self): if self._graph is None: From 6a3aebd09aaeb087634a29917667318ab4ace02d Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Fri, 12 Feb 2021 12:59:32 +0200 Subject: [PATCH 32/57] Add test for invalid label types --- tests/test_backend_yake.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py index cf077f404..3390b80fc 100755 --- a/tests/test_backend_yake.py +++ b/tests/test_backend_yake.py @@ -1,13 +1,23 @@ """Unit tests for the Yake backend in Annif""" import annif -import pytest import annif.backend - +import pytest +from annif.exception import ConfigurationException pytest.importorskip("annif.backend.yake") +def test_invalid_label_type(graph_project): + yake_type = annif.backend.get_backend('yake') + yake = yake_type( + backend_id='yake', + config_params={'label_types': 'invalid_type', 'language': 'fi'}, + project=graph_project) + with pytest.raises(ConfigurationException): + yake.suggest("example text") + + def test_yake_suggest(project, graph_project): yake_type = annif.backend.get_backend('yake') yake = yake_type( From b06ca9cb898d43d2bc1096ea4232148584767e06 Mon Sep 17 00:00:00 
2001 From: Juho Inkinen Date: Fri, 12 Feb 2021 13:54:11 +0200 Subject: [PATCH 33/57] Remove pointless test --- tests/test_backend_yake.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py index 3390b80fc..b7f98f9ba 100755 --- a/tests/test_backend_yake.py +++ b/tests/test_backend_yake.py @@ -90,19 +90,6 @@ def test_create_index_pref_and_altlabels(graph_project): assert 'luolamaalauks' in index -def test_create_index_label_languages(graph_project): - yake_type = annif.backend.get_backend('yake') - yake = yake_type( - backend_id='yake', - config_params={'limit': 8, 'language': 'sv', 'label_types': 'prefLabel'}, - project=graph_project) - index = yake._create_index() - assert len(index) == 130 - assert 'kalliotaid' not in index - assert 'bergkonst' in index - assert 'rock art' not in index - - def test_combine_suggestions_different_uris(project): yake_type = annif.backend.get_backend('yake') yake = yake_type( From 79e225b1b92af3d5f40ebeee331ed039ccbdde46 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Fri, 12 Feb 2021 16:34:17 +0200 Subject: [PATCH 34/57] Test for removing parentheses from label when creating index --- tests/test_backend_yake.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py index b7f98f9ba..9151177e0 100755 --- a/tests/test_backend_yake.py +++ b/tests/test_backend_yake.py @@ -4,6 +4,8 @@ import annif.backend import pytest from annif.exception import ConfigurationException +from rdflib import Graph, URIRef, Literal +from rdflib.namespace import SKOS, RDF pytest.importorskip("annif.backend.yake") @@ -90,6 +92,28 @@ def test_create_index_pref_and_altlabels(graph_project): assert 'luolamaalauks' in index +def test_remove_parentheses(graph_project): + graph = Graph() + graph.add(( + URIRef('http://www.yso.fi/onto/yso/p4354'), RDF.type, SKOS.Concept)) + graph.add(( + 
URIRef('http://www.yso.fi/onto/yso/p4354'), SKOS.prefLabel, + Literal('lapset (ikäryhmät)', lang='fi'))) + graph_project.vocab.as_graph.return_value = graph + + yake_type = annif.backend.get_backend('yake') + yake = yake_type( + backend_id='yake', + config_params={'language': 'fi', 'remove_parentheses': True}, + project=graph_project) + index = yake._create_index() + assert len(index) == 1 + assert 'laps' in index + assert '(' not in index + assert ')' not in index + assert 'ikä' not in index + + def test_combine_suggestions_different_uris(project): yake_type = annif.backend.get_backend('yake') yake = yake_type( From f0911191fecfd45410197a38f467b74fd4363075 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Wed, 17 Feb 2021 19:11:44 +0200 Subject: [PATCH 35/57] Add methods for accessing SKOS concepts & labels via AnnifVocabulary --- annif/corpus/skos.py | 15 +++++++++++++++ annif/vocab.py | 20 ++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/annif/corpus/skos.py b/annif/corpus/skos.py index 5da101d09..5a3bd88cb 100644 --- a/annif/corpus/skos.py +++ b/annif/corpus/skos.py @@ -48,6 +48,21 @@ def subjects(self): yield Subject(uri=str(concept), label=label, notation=notation, text=None) + @property + def skos_concepts(self): + for concept in self.graph.subjects(RDF.type, SKOS.Concept): + if (concept, OWL.deprecated, rdflib.Literal(True)) in self.graph: + continue + yield concept + + def get_skos_concept_labels(self, concept, label_types, language): + labels = [] + for label_type in label_types: + for label in self.graph.objects(concept, label_type): + if label.language == language: + labels.append(label) + return labels + @staticmethod def is_rdf_file(path): """return True if the path looks like an RDF file that can be loaded diff --git a/annif/vocab.py b/annif/vocab.py index 2355d1f3b..0b06bb38a 100644 --- a/annif/vocab.py +++ b/annif/vocab.py @@ -21,6 +21,7 @@ class AnnifVocabulary(DatadirMixin): def __init__(self, vocab_id, datadir): 
DatadirMixin.__init__(self, datadir, 'vocabs', vocab_id) self.vocab_id = vocab_id + self._skos_vocab = None def _create_subject_index(self, subject_corpus): self._subjects = annif.corpus.SubjectIndex(subject_corpus) @@ -55,6 +56,25 @@ def subjects(self): "subject file {} not found".format(path)) return self._subjects + @property + def skos_vocab(self): + if self._skos_vocab is None: + path = os.path.join(self.datadir, 'subjects.ttl') + if os.path.exists(path): + logger.debug(f'loading graph from {path}') + self._skos_vocab = annif.corpus.SubjectFileSKOS(path, None) + else: + raise NotInitializedException(f'graph file {path} not found') + return self._skos_vocab + + @property + def skos_concepts(self): + return self.skos_vocab.skos_concepts + + def get_skos_concept_labels(self, concept, label_types, language): + return self.skos_vocab.get_skos_concept_labels(concept, label_types, + language) + def load_vocabulary(self, subject_corpus, language): """load subjects from a subject corpus and save them into a SKOS/Turtle file for later use""" From fb0b7b5c3ff533c71c99d8849c31e56ba68e4b03 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Wed, 17 Feb 2021 19:14:15 +0200 Subject: [PATCH 36/57] Access SKOS concepts & labels via AnnifVocabulary in Yake --- annif/backend/yake.py | 25 ++++---------------- tests/conftest.py | 8 +++++++ tests/test_backend_yake.py | 48 +++++++++++++++++++++----------------- 3 files changed, 39 insertions(+), 42 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index dc6db845b..65814d763 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -5,8 +5,7 @@ import os.path import re from collections import defaultdict -from rdflib.namespace import SKOS, RDF, OWL -import rdflib +from rdflib.namespace import SKOS import annif.util from . 
import backend from annif.suggestion import SubjectSuggestion, ListSuggestionResult @@ -59,13 +58,6 @@ def _validate_label_types(self, label_types): raise ConfigurationException( f'invalid label type {lt}', backend_id=self.backend_id) - @property - def graph(self): - if self._graph is None: - self.info('Loading graph') - self._graph = self.project.vocab.as_graph() - return self._graph - def initialize(self): self._initialize_index() @@ -98,25 +90,16 @@ def _load_index(self, path): def _create_index(self): index = defaultdict(set) - for concept in self.graph.subjects(RDF.type, SKOS.Concept): - if (concept, OWL.deprecated, rdflib.Literal(True)) in self.graph: - continue + for concept in self.project.vocab.skos_concepts: uri = str(concept) - labels = self._get_concept_labels(concept, self.label_types) + labels = self.project.vocab.get_skos_concept_labels( + concept, self.label_types, self.params['language']) for label in labels: label = self._normalize_label(label) index[label].add(uri) index.pop('', None) # Remove possible empty string entry return dict(index) - def _get_concept_labels(self, concept, label_types): - labels = [] - for label_type in label_types: - for label in self.graph.objects(concept, label_type): - if label.language == self.params['language']: - labels.append(label) - return labels - def _normalize_label(self, label): label = str(label) if annif.util.boolean(self.params['remove_parentheses']): diff --git a/tests/conftest.py b/tests/conftest.py index 80a8bfc3b..f23eb8c0d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -130,6 +130,14 @@ def graph_project(project): return project +@pytest.fixture(scope='function') +def skos_project(project, skos_vocabulary): + project.vocab.skos_concepts = skos_vocabulary.skos_concepts + project.vocab.get_skos_concept_labels = \ + skos_vocabulary.get_skos_concept_labels + return project + + @pytest.fixture(scope='module') def app_project(app): with app.app_context(): diff --git a/tests/test_backend_yake.py 
b/tests/test_backend_yake.py index 9151177e0..93304baa9 100755 --- a/tests/test_backend_yake.py +++ b/tests/test_backend_yake.py @@ -6,26 +6,27 @@ from annif.exception import ConfigurationException from rdflib import Graph, URIRef, Literal from rdflib.namespace import SKOS, RDF +from copy import copy pytest.importorskip("annif.backend.yake") -def test_invalid_label_type(graph_project): +def test_invalid_label_type(skos_project): yake_type = annif.backend.get_backend('yake') yake = yake_type( backend_id='yake', config_params={'label_types': 'invalid_type', 'language': 'fi'}, - project=graph_project) + project=skos_project) with pytest.raises(ConfigurationException): yake.suggest("example text") -def test_yake_suggest(project, graph_project): +def test_yake_suggest(project, skos_project): yake_type = annif.backend.get_backend('yake') yake = yake_type( backend_id='yake', config_params={'limit': 8, 'language': 'fi'}, - project=graph_project) + project=skos_project) results = yake.suggest("""Arkeologia on tieteenala, jota sanotaan joskus muinaistutkimukseksi tai muinaistieteeksi. 
Se on humanistinen tiede @@ -42,23 +43,23 @@ def test_yake_suggest(project, graph_project): assert 'arkeologia' in [result.label for result in hits] -def test_yake_suggest_non_alphanum_text(project, graph_project): +def test_yake_suggest_non_alphanum_text(project, skos_project): yake_type = annif.backend.get_backend('yake') yake = yake_type( backend_id='yake', config_params={'limit': 8, 'language': 'fi'}, - project=graph_project) + project=skos_project) results = yake.suggest(".,!") assert len(results) == 0 -def test_create_index_preflabels(graph_project): +def test_create_index_preflabels(skos_project): yake_type = annif.backend.get_backend('yake') yake = yake_type( backend_id='yake', config_params={'language': 'fi', 'label_types': 'prefLabel'}, - project=graph_project) + project=skos_project) index = yake._create_index() # Some of the 130 prefLabels get merged in lemmatization: # assyriologit, assyriologia (assyriolog); arkealogit, arkeologia @@ -68,44 +69,49 @@ def test_create_index_preflabels(graph_project): assert 'luolamaalauks' not in index -def test_create_index_altlabels(graph_project): +def test_create_index_pref_and_altlabels(skos_project): yake_type = annif.backend.get_backend('yake') yake = yake_type( backend_id='yake', - config_params={'language': 'fi', 'label_types': 'altLabel'}, - project=graph_project) + config_params={'limit': 8, 'language': 'fi'}, + project=skos_project) index = yake._create_index() - assert len(index) == 34 - assert 'kalliotaid' not in index + assert len(index) == 161 + assert 'kalliotaid' in index assert 'luolamaalauks' in index -def test_create_index_pref_and_altlabels(graph_project): +def test_create_index_altlabels(skos_project): yake_type = annif.backend.get_backend('yake') yake = yake_type( backend_id='yake', - config_params={'limit': 8, 'language': 'fi'}, - project=graph_project) + config_params={'language': 'fi', 'label_types': 'altLabel'}, + project=skos_project) index = yake._create_index() - assert len(index) == 161 - 
assert 'kalliotaid' in index + assert len(index) == 34 + assert 'kalliotaid' not in index assert 'luolamaalauks' in index -def test_remove_parentheses(graph_project): +def test_remove_parentheses(project, skos_vocabulary): graph = Graph() graph.add(( URIRef('http://www.yso.fi/onto/yso/p4354'), RDF.type, SKOS.Concept)) graph.add(( URIRef('http://www.yso.fi/onto/yso/p4354'), SKOS.prefLabel, Literal('lapset (ikäryhmät)', lang='fi'))) - graph_project.vocab.as_graph.return_value = graph + + skos_vocabulary = copy(skos_vocabulary) # Do not modify original fixture + skos_vocabulary.graph = graph + project.vocab.skos_concepts = skos_vocabulary.skos_concepts + project.vocab.get_skos_concept_labels = \ + skos_vocabulary.get_skos_concept_labels yake_type = annif.backend.get_backend('yake') yake = yake_type( backend_id='yake', config_params={'language': 'fi', 'remove_parentheses': True}, - project=graph_project) + project=project) index = yake._create_index() assert len(index) == 1 assert 'laps' in index From d1c2af5d35a038ad59879c90566594e042340e91 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Wed, 17 Feb 2021 19:20:52 +0200 Subject: [PATCH 37/57] Access SKOS graph via skos_vocab in AnnifVocabulary --- annif/vocab.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/annif/vocab.py b/annif/vocab.py index 0b06bb38a..328099aaa 100644 --- a/annif/vocab.py +++ b/annif/vocab.py @@ -1,7 +1,6 @@ """Vocabulary management functionality for Annif""" import os.path -import rdflib.graph import annif import annif.corpus import annif.util @@ -93,9 +92,4 @@ def as_skos(self): def as_graph(self): """return the vocabulary as an rdflib graph""" - g = rdflib.graph.Graph() - g.load( - os.path.join(self.datadir, 'subjects.ttl'), - format='ttl' - ) - return g + return self.skos_vocab.graph From f080d02af74bd75788753fcc8bf5dade81d516a0 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Wed, 17 Feb 2021 19:26:54 +0200 Subject: [PATCH 38/57] Reduce code duplication by using 
the skos_concepts property --- annif/corpus/skos.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/annif/corpus/skos.py b/annif/corpus/skos.py index 5a3bd88cb..d69a79df6 100644 --- a/annif/corpus/skos.py +++ b/annif/corpus/skos.py @@ -35,9 +35,7 @@ def __init__(self, path, language): @property def subjects(self): - for concept in self.graph.subjects(RDF.type, SKOS.Concept): - if (concept, OWL.deprecated, rdflib.Literal(True)) in self.graph: - continue + for concept in self.skos_concepts: labels = self.graph.preferredLabel(concept, lang=self.language) notation = self.graph.value(concept, SKOS.notation, None, any=True) if not labels: From 1fdf80388cb986e981383e2f41e3f10e5d912e0b Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Wed, 10 Mar 2021 11:19:39 +0200 Subject: [PATCH 39/57] Update YAKE to 0.4.5 (eliminates warnings on input with no keywords) --- annif/backend/yake.py | 4 ---- setup.py | 2 +- tests/test_backend_yake.py | 4 ++-- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 65814d763..2ffe9b3ed 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -123,10 +123,6 @@ def _suggest(self, text, params): f'Suggesting subjects for text "{text[:20]}..." 
(len={len(text)})') limit = int(params['limit']) - alphanum = re.compile('[^a-zA-Z0-9]') - if len(re.sub(alphanum, '', text)) == 0: - return ListSuggestionResult([]) - self._kw_extractor = yake.KeywordExtractor( lan=params['language'], n=int(params['max_ngram_size']), diff --git a/setup.py b/setup.py index dc7b9d622..07ecc70d1 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ def read(fname): 'vw': ['vowpalwabbit==8.8.1'], 'nn': ['tensorflow-cpu==2.3.1', 'lmdb==1.0.0'], 'omikuji': ['omikuji==0.3.*'], - 'yake': ['yake==0.4.3'], + 'yake': ['yake==0.4.5'], 'dev': [ 'codecov', 'pytest-cov', diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py index 93304baa9..86d189764 100755 --- a/tests/test_backend_yake.py +++ b/tests/test_backend_yake.py @@ -43,14 +43,14 @@ def test_yake_suggest(project, skos_project): assert 'arkeologia' in [result.label for result in hits] -def test_yake_suggest_non_alphanum_text(project, skos_project): +def test_yake_suggest_no_input(project, skos_project): yake_type = annif.backend.get_backend('yake') yake = yake_type( backend_id='yake', config_params={'limit': 8, 'language': 'fi'}, project=skos_project) - results = yake.suggest(".,!") + results = yake.suggest("ja tai .,!") assert len(results) == 0 From 2fb2244f3015a2b34b2c9e1f3a3390fada97b78e Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Wed, 17 Mar 2021 10:29:51 +0200 Subject: [PATCH 40/57] Install Yake in GH Actions jobs for unit tests for Python 3.6 & 3.8 --- .github/workflows/python-package.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index a02dcdac5..cad50c2d0 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -32,9 +32,9 @@ jobs: # Install the optional neural network dependencies (TensorFlow and LMDB) # - except for one Python version (3.7) so that we can test also without them if [[ ${{ matrix.python-version }} != 
'3.7' ]]; then pip install .[nn]; fi - # Install the optional Omikuji dependency + # Install the optional Omikuji and YAKE dependencies # - except for one Python version (3.7) so that we can test also without them - if [[ ${{ matrix.python-version }} != '3.7' ]]; then pip install .[omikuji]; fi + if [[ ${{ matrix.python-version }} != '3.7' ]]; then pip install .[omikuji,yake]; fi # Install the optional fastText dependencies for Python 3.7 only if [[ ${{ matrix.python-version }} == '3.7' ]]; then pip install .[fasttext]; fi # For Python 3.6 From 0b1cacd1d630419ba3f9e8fcf697cb899ac85504 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Wed, 17 Mar 2021 11:44:41 +0200 Subject: [PATCH 41/57] Remove condition and debug message for neg. Yake score, use max() instead --- annif/backend/yake.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 2ffe9b3ed..38ff5899f 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -166,9 +166,7 @@ def _keyphrase2uris(self, keyphrase): return self._index.get(keyphrase, []) def _transform_score(self, score): - if score < 0: - self.debug(f'Replacing negative YAKE score {score} with zero') - return 1.0 + score = max(score, 0) return 1.0 / (score + 1) def _combine_suggestions(self, suggestions): From 975b1bc170b93ccb262789c2dc370a07ad8d5e57 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Fri, 30 Apr 2021 11:48:45 +0300 Subject: [PATCH 42/57] Adapt to current master: remove unnecessary skos_project fixture --- tests/conftest.py | 8 -------- tests/test_backend_yake.py | 24 ++++++++++++------------ 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index f23eb8c0d..80a8bfc3b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -130,14 +130,6 @@ def graph_project(project): return project -@pytest.fixture(scope='function') -def skos_project(project, skos_vocabulary): - project.vocab.skos_concepts = 
skos_vocabulary.skos_concepts - project.vocab.get_skos_concept_labels = \ - skos_vocabulary.get_skos_concept_labels - return project - - @pytest.fixture(scope='module') def app_project(app): with app.app_context(): diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py index 86d189764..7ecbbb8c3 100755 --- a/tests/test_backend_yake.py +++ b/tests/test_backend_yake.py @@ -11,22 +11,22 @@ pytest.importorskip("annif.backend.yake") -def test_invalid_label_type(skos_project): +def test_invalid_label_type(project): yake_type = annif.backend.get_backend('yake') yake = yake_type( backend_id='yake', config_params={'label_types': 'invalid_type', 'language': 'fi'}, - project=skos_project) + project=project) with pytest.raises(ConfigurationException): yake.suggest("example text") -def test_yake_suggest(project, skos_project): +def test_yake_suggest(project): yake_type = annif.backend.get_backend('yake') yake = yake_type( backend_id='yake', config_params={'limit': 8, 'language': 'fi'}, - project=skos_project) + project=project) results = yake.suggest("""Arkeologia on tieteenala, jota sanotaan joskus muinaistutkimukseksi tai muinaistieteeksi. 
Se on humanistinen tiede @@ -43,23 +43,23 @@ def test_yake_suggest(project, skos_project): assert 'arkeologia' in [result.label for result in hits] -def test_yake_suggest_no_input(project, skos_project): +def test_yake_suggest_no_input(project): yake_type = annif.backend.get_backend('yake') yake = yake_type( backend_id='yake', config_params={'limit': 8, 'language': 'fi'}, - project=skos_project) + project=project) results = yake.suggest("ja tai .,!") assert len(results) == 0 -def test_create_index_preflabels(skos_project): +def test_create_index_preflabels(project): yake_type = annif.backend.get_backend('yake') yake = yake_type( backend_id='yake', config_params={'language': 'fi', 'label_types': 'prefLabel'}, - project=skos_project) + project=project) index = yake._create_index() # Some of the 130 prefLabels get merged in lemmatization: # assyriologit, assyriologia (assyriolog); arkealogit, arkeologia @@ -69,24 +69,24 @@ def test_create_index_preflabels(skos_project): assert 'luolamaalauks' not in index -def test_create_index_pref_and_altlabels(skos_project): +def test_create_index_pref_and_altlabels(project): yake_type = annif.backend.get_backend('yake') yake = yake_type( backend_id='yake', config_params={'limit': 8, 'language': 'fi'}, - project=skos_project) + project=project) index = yake._create_index() assert len(index) == 161 assert 'kalliotaid' in index assert 'luolamaalauks' in index -def test_create_index_altlabels(skos_project): +def test_create_index_altlabels(project): yake_type = annif.backend.get_backend('yake') yake = yake_type( backend_id='yake', config_params={'language': 'fi', 'label_types': 'altLabel'}, - project=skos_project) + project=project) index = yake._create_index() assert len(index) == 34 assert 'kalliotaid' not in index From f354015ad206ad2e09ff8a67516320e0bd73647c Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Fri, 30 Apr 2021 11:50:21 +0300 Subject: [PATCH 43/57] Adapt to current master: altLabels in archaeology corpus have changed
--- tests/test_backend_yake.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py index 7ecbbb8c3..696064a1d 100755 --- a/tests/test_backend_yake.py +++ b/tests/test_backend_yake.py @@ -76,7 +76,7 @@ def test_create_index_pref_and_altlabels(project): config_params={'limit': 8, 'language': 'fi'}, project=project) index = yake._create_index() - assert len(index) == 161 + assert len(index) == 160 assert 'kalliotaid' in index assert 'luolamaalauks' in index @@ -88,7 +88,7 @@ def test_create_index_altlabels(project): config_params={'language': 'fi', 'label_types': 'altLabel'}, project=project) index = yake._create_index() - assert len(index) == 34 + assert len(index) == 33 assert 'kalliotaid' not in index assert 'luolamaalauks' in index From d83b9fbf4dadc7faf40a06843f5c82bc383e14be Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Fri, 30 Apr 2021 12:08:47 +0300 Subject: [PATCH 44/57] Adapt to current master: use project fixture in test_stwfsa, remove graph_project fixture --- tests/conftest.py | 14 -------------- tests/test_backend_stwfsa.py | 4 ++-- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 80a8bfc3b..2e3949f47 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,7 +5,6 @@ import pytest import py.path import unittest.mock -from rdflib import Graph import annif import annif.analyzer import annif.corpus @@ -117,19 +116,6 @@ def project(subject_index, datadir, registry, vocabulary): return proj -@pytest.fixture(scope='module') -def graph_project(project): - _rdf_file_path = os.path.join( - os.path.dirname(__file__), - 'corpora', - 'archaeology', - 'yso-archaeology.rdf') - g = Graph() - g.load(_rdf_file_path) - project.vocab.as_graph.return_value = g - return project - - @pytest.fixture(scope='module') def app_project(app): with app.app_context(): diff --git a/tests/test_backend_stwfsa.py b/tests/test_backend_stwfsa.py index 
858b2371b..8c5fd329b 100644 --- a/tests/test_backend_stwfsa.py +++ b/tests/test_backend_stwfsa.py @@ -54,12 +54,12 @@ def test_stwfsa_not_initialized(project): stwfsa.suggest("example text") -def test_stwfsa_train(document_corpus, graph_project, datadir): +def test_stwfsa_train(document_corpus, project, datadir): stwfsa_type = get_backend(StwfsaBackend.name) stwfsa = stwfsa_type( backend_id=StwfsaBackend.name, config_params=_backend_conf, - project=graph_project) + project=project) stwfsa.train(document_corpus) assert stwfsa._model is not None model_file = datadir.join(stwfsa.MODEL_FILE) From 315b47462b14c7a066d05773fbe8601a3d67bab4 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Fri, 30 Apr 2021 12:14:02 +0300 Subject: [PATCH 45/57] Remove test for removing parentheses from labels --- tests/test_backend_yake.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/tests/test_backend_yake.py b/tests/test_backend_yake.py index 696064a1d..795973853 100755 --- a/tests/test_backend_yake.py +++ b/tests/test_backend_yake.py @@ -4,9 +4,6 @@ import annif.backend import pytest from annif.exception import ConfigurationException -from rdflib import Graph, URIRef, Literal -from rdflib.namespace import SKOS, RDF -from copy import copy pytest.importorskip("annif.backend.yake") @@ -93,33 +90,6 @@ def test_create_index_altlabels(project): assert 'luolamaalauks' in index -def test_remove_parentheses(project, skos_vocabulary): - graph = Graph() - graph.add(( - URIRef('http://www.yso.fi/onto/yso/p4354'), RDF.type, SKOS.Concept)) - graph.add(( - URIRef('http://www.yso.fi/onto/yso/p4354'), SKOS.prefLabel, - Literal('lapset (ikäryhmät)', lang='fi'))) - - skos_vocabulary = copy(skos_vocabulary) # Do not modify original fixture - skos_vocabulary.graph = graph - project.vocab.skos_concepts = skos_vocabulary.skos_concepts - project.vocab.get_skos_concept_labels = \ - skos_vocabulary.get_skos_concept_labels - - yake_type = annif.backend.get_backend('yake') - yake 
= yake_type( - backend_id='yake', - config_params={'language': 'fi', 'remove_parentheses': True}, - project=project) - index = yake._create_index() - assert len(index) == 1 - assert 'laps' in index - assert '(' not in index - assert ')' not in index - assert 'ikä' not in index - - def test_combine_suggestions_different_uris(project): yake_type = annif.backend.get_backend('yake') yake = yake_type( From c797b5640b7cc7ed99f4dcc51788e1ba442adcc2 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Fri, 30 Apr 2021 13:18:04 +0300 Subject: [PATCH 46/57] Implement get_skos_concept_labels using list comprehension --- annif/corpus/skos.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/annif/corpus/skos.py b/annif/corpus/skos.py index d69a79df6..7c40805f7 100644 --- a/annif/corpus/skos.py +++ b/annif/corpus/skos.py @@ -54,12 +54,10 @@ def skos_concepts(self): yield concept def get_skos_concept_labels(self, concept, label_types, language): - labels = [] - for label_type in label_types: - for label in self.graph.objects(concept, label_type): - if label.language == language: - labels.append(label) - return labels + return [str(label) + for label_type in label_types + for label in self.graph.objects(concept, label_type) + if label.language == language] @staticmethod def is_rdf_file(path): From fcacef8d2addeb4d161a94ce17417b09c75e7a54 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Fri, 30 Apr 2021 13:55:45 +0300 Subject: [PATCH 47/57] Rename & refactor methods for SKOS vocabulary --- annif/backend/yake.py | 5 +++-- annif/corpus/skos.py | 6 +++--- annif/vocab.py | 8 -------- 3 files changed, 6 insertions(+), 13 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 38ff5899f..cbde4c4f1 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -90,9 +90,10 @@ def _load_index(self, path): def _create_index(self): index = defaultdict(set) - for concept in self.project.vocab.skos_concepts: + concepts = 
self.project.vocab.skos_vocab.concepts + for concept in concepts: uri = str(concept) - labels = self.project.vocab.get_skos_concept_labels( + labels = self.project.vocab.skos_vocab.get_concept_labels( concept, self.label_types, self.params['language']) for label in labels: label = self._normalize_label(label) diff --git a/annif/corpus/skos.py b/annif/corpus/skos.py index 7c40805f7..ba4f04406 100644 --- a/annif/corpus/skos.py +++ b/annif/corpus/skos.py @@ -35,7 +35,7 @@ def __init__(self, path, language): @property def subjects(self): - for concept in self.skos_concepts: + for concept in self.concepts: labels = self.graph.preferredLabel(concept, lang=self.language) notation = self.graph.value(concept, SKOS.notation, None, any=True) if not labels: @@ -47,13 +47,13 @@ def subjects(self): text=None) @property - def skos_concepts(self): + def concepts(self): for concept in self.graph.subjects(RDF.type, SKOS.Concept): if (concept, OWL.deprecated, rdflib.Literal(True)) in self.graph: continue yield concept - def get_skos_concept_labels(self, concept, label_types, language): + def get_concept_labels(self, concept, label_types, language): return [str(label) for label_type in label_types for label in self.graph.objects(concept, label_type) diff --git a/annif/vocab.py b/annif/vocab.py index 328099aaa..35c4af4e5 100644 --- a/annif/vocab.py +++ b/annif/vocab.py @@ -66,14 +66,6 @@ def skos_vocab(self): raise NotInitializedException(f'graph file {path} not found') return self._skos_vocab - @property - def skos_concepts(self): - return self.skos_vocab.skos_concepts - - def get_skos_concept_labels(self, concept, label_types, language): - return self.skos_vocab.get_skos_concept_labels(concept, label_types, - language) - def load_vocabulary(self, subject_corpus, language): """load subjects from a subject corpus and save them into a SKOS/Turtle file for later use""" From d6c2aa699a6e310361d9bc1ae82fd40e31a261ff Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Fri, 30 Apr 2021 
13:57:19 +0300 Subject: [PATCH 48/57] Rename method for accessing SKOS vocab as a file object --- annif/backend/maui.py | 2 +- annif/vocab.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/annif/backend/maui.py b/annif/backend/maui.py index a40016f0c..bb9894722 100644 --- a/annif/backend/maui.py +++ b/annif/backend/maui.py @@ -101,7 +101,7 @@ def _upload_vocabulary(self, params): json = {} try: resp = requests.put(self.tagger_url(params) + '/vocab', - data=self.project.vocab.as_skos()) + data=self.project.vocab.as_skos_file()) try: json = resp.json() except ValueError: diff --git a/annif/vocab.py b/annif/vocab.py index 35c4af4e5..953e9917a 100644 --- a/annif/vocab.py +++ b/annif/vocab.py @@ -78,7 +78,7 @@ def load_vocabulary(self, subject_corpus, language): subject_corpus.save_skos(os.path.join(self.datadir, 'subjects.ttl'), language) - def as_skos(self): + def as_skos_file(self): """return the vocabulary as a file object, in SKOS/Turtle syntax""" return open(os.path.join(self.datadir, 'subjects.ttl'), 'rb') From 6ddb55708116a38a60cba7b4892e7ef3b895960b Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Fri, 30 Apr 2021 18:04:17 +0300 Subject: [PATCH 49/57] Adjust license explanation --- README.md | 5 ++++- annif/backend/yake.py | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b115f7cbf..8eb84eb86 100644 --- a/README.md +++ b/README.md @@ -133,4 +133,7 @@ Zenodo DOI: The code in this repository is licensed under Apache License 2.0, except for the dependencies included under `annif/static/css` and `annif/static/js`, -which have their own licenses. See the file headers for details. Using the optional Yake backend may change the licence of Annif to GPLv3, because [YAKE](https://github.com/LIAAD/yake) is licensed under GPLv3. +which have their own licenses, see the file headers for details. 
+Enabling the optional Yake backend may result in [GPLv3](https://www.gnu.org/licenses/gpl-3.0.txt) +terms to cover the application, because [YAKE](https://github.com/LIAAD/yake) +is licensed under GPLv3. diff --git a/annif/backend/yake.py b/annif/backend/yake.py index cbde4c4f1..6ccce8f3c 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -1,5 +1,7 @@ """Annif backend using Yake keyword extraction""" -# TODO Mention GPLv3 license also here? +# Enabling this optional backend may result in GPLv3 terms to cover the +# application, because YAKE (https://github.com/LIAAD/yake) is licensed under +# GPLv3. import yake import os.path From 27e0cdc44de8a17b32f9b967b48f28a36d0ea5f9 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Wed, 5 May 2021 10:37:36 +0300 Subject: [PATCH 50/57] Better name and docstring for the property for accessing SKOS vocabulary --- annif/backend/yake.py | 6 +++--- annif/vocab.py | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 6ccce8f3c..b763dac36 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -92,10 +92,10 @@ def _load_index(self, path): def _create_index(self): index = defaultdict(set) - concepts = self.project.vocab.skos_vocab.concepts - for concept in concepts: + skos_vocab = self.project.vocab.skos + for concept in skos_vocab.concepts: uri = str(concept) - labels = self.project.vocab.skos_vocab.get_concept_labels( + labels = skos_vocab.get_concept_labels( concept, self.label_types, self.params['language']) for label in labels: label = self._normalize_label(label) diff --git a/annif/vocab.py b/annif/vocab.py index 953e9917a..92f618580 100644 --- a/annif/vocab.py +++ b/annif/vocab.py @@ -56,7 +56,8 @@ def subjects(self): return self._subjects @property - def skos_vocab(self): + def skos(self): + """return the subject vocabulary from SKOS file""" if self._skos_vocab is None: path = os.path.join(self.datadir, 'subjects.ttl') if 
os.path.exists(path): @@ -84,4 +85,4 @@ def as_skos_file(self): def as_graph(self): """return the vocabulary as an rdflib graph""" - return self.skos_vocab.graph + return self.skos.graph From 338c9b6ff2bff33cc42385bae24fbce1d2c1e0a9 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Wed, 5 May 2021 10:41:04 +0300 Subject: [PATCH 51/57] Change log message for loading index to debug level --- annif/backend/yake.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index b763dac36..7ebbd7a2c 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -68,7 +68,7 @@ def _initialize_index(self): path = os.path.join(self.datadir, self.INDEX_FILE) if os.path.exists(path): self._index = self._load_index(path) - self.info( + self.debug( f'Loaded index from {path} with {len(self._index)} labels') else: self.info('Creating index') From d09e8e30e4daf8a0a33050e28422435436f97752 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Tue, 11 May 2021 17:55:52 +0300 Subject: [PATCH 52/57] Readjust license explanation for Yake backend --- README.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8eb84eb86..2e301f69d 100644 --- a/README.md +++ b/README.md @@ -134,6 +134,10 @@ Zenodo DOI: The code in this repository is licensed under Apache License 2.0, except for the dependencies included under `annif/static/css` and `annif/static/js`, which have their own licenses, see the file headers for details. -Enabling the optional Yake backend may result in [GPLv3](https://www.gnu.org/licenses/gpl-3.0.txt) -terms to cover the application, because [YAKE](https://github.com/LIAAD/yake) -is licensed under GPLv3. +Please note that the [YAKE](https://github.com/LIAAD/yake) library is licensed +under [GPLv3](https://www.gnu.org/licenses/gpl-3.0.txt), while Annif is +licensed under the Apache License 2.0.
The licenses are compatible, but +depending on legal interpretation, the terms of the GPLv3 (for example the +requirement to publish corresponding source code when publishing an executable +application) may be considered to apply to the whole of Annif+Yake if you +decide to install the optional Yake dependency. From 06fe6ef335a44b0b45de3fc830718d83b6e14b20 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Tue, 11 May 2021 23:22:41 +0300 Subject: [PATCH 53/57] Pass project's language to AnnifVocabulary and adapt fixtures as needed --- annif/project.py | 3 ++- annif/vocab.py | 6 ++++-- tests/conftest.py | 2 +- tests/test_vocab.py | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/annif/project.py b/annif/project.py index 8396d1e61..a4677c613 100644 --- a/annif/project.py +++ b/annif/project.py @@ -143,7 +143,8 @@ def vocab(self): raise ConfigurationException("vocab setting is missing", project_id=self.project_id) self._vocab = annif.vocab.AnnifVocabulary(self.vocab_id, - self._base_datadir) + self._base_datadir, + self.language) return self._vocab @property diff --git a/annif/vocab.py b/annif/vocab.py index 92f618580..b60d3d382 100644 --- a/annif/vocab.py +++ b/annif/vocab.py @@ -17,9 +17,10 @@ class AnnifVocabulary(DatadirMixin): # defaults for uninitialized instances _subjects = None - def __init__(self, vocab_id, datadir): + def __init__(self, vocab_id, datadir, language): DatadirMixin.__init__(self, datadir, 'vocabs', vocab_id) self.vocab_id = vocab_id + self.language = language self._skos_vocab = None def _create_subject_index(self, subject_corpus): @@ -62,7 +63,8 @@ def skos(self): path = os.path.join(self.datadir, 'subjects.ttl') if os.path.exists(path): logger.debug(f'loading graph from {path}') - self._skos_vocab = annif.corpus.SubjectFileSKOS(path, None) + self._skos_vocab = annif.corpus.SubjectFileSKOS(path, + self.language) else: raise NotInitializedException(f'graph file {path} not found') return self._skos_vocab diff --git 
a/tests/conftest.py b/tests/conftest.py index 2e3949f47..f92e40e92 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -68,7 +68,7 @@ def subject_file(): @pytest.fixture(scope='module') def vocabulary(datadir): - vocab = annif.vocab.AnnifVocabulary('my-vocab', datadir) + vocab = annif.vocab.AnnifVocabulary('my-vocab', datadir, 'fi') subjfile = os.path.join( os.path.dirname(__file__), 'corpora', diff --git a/tests/test_vocab.py b/tests/test_vocab.py index 51a28e2b5..c56de5d51 100755 --- a/tests/test_vocab.py +++ b/tests/test_vocab.py @@ -7,7 +7,7 @@ def load_dummy_vocab(tmpdir): - vocab = annif.vocab.AnnifVocabulary('vocab-id', str(tmpdir)) + vocab = annif.vocab.AnnifVocabulary('vocab-id', str(tmpdir), 'en') subjfile = os.path.join( os.path.dirname(__file__), 'corpora', From 4d67d4421657f41136d75aa7be210292587b2002 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Wed, 12 May 2021 10:50:23 +0300 Subject: [PATCH 54/57] Rename lemmatize_phrase function to normalize_phrase --- annif/backend/yake.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 7ebbd7a2c..725851bd7 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -107,10 +107,10 @@ def _normalize_label(self, label): label = str(label) if annif.util.boolean(self.params['remove_parentheses']): label = re.sub(r' \(.*\)', '', label) - lemmatized_label = self._lemmatize_phrase(label) - return self._sort_phrase(lemmatized_label) + normalized_label = self._normalize_phrase(label) + return self._sort_phrase(normalized_label) - def _lemmatize_phrase(self, phrase): + def _normalize_phrase(self, phrase): normalized = [] for word in phrase.split(): normalized.append( @@ -164,7 +164,7 @@ def _keyphrases2suggestions(self, keyphrases): return suggestions def _keyphrase2uris(self, keyphrase): - keyphrase = self._lemmatize_phrase(keyphrase) + keyphrase = self._normalize_phrase(keyphrase) keyphrase = self._sort_phrase(keyphrase) return 
self._index.get(keyphrase, []) From 9597526503d677420814d1196719d33a713547ec Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Wed, 12 May 2021 15:30:39 +0300 Subject: [PATCH 55/57] Use atomic_save for saving YAKE index --- annif/backend/yake.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 725851bd7..3ff62dccf 100755 --- a/annif/backend/yake.py +++ b/annif/backend/yake.py @@ -4,6 +4,7 @@ # GPLv3. import yake +import joblib import os.path import re from collections import defaultdict @@ -67,7 +68,7 @@ def _initialize_index(self): if self._index is None: path = os.path.join(self.datadir, self.INDEX_FILE) if os.path.exists(path): - self._index = self._load_index(path) + self._index = joblib.load(path) self.debug( f'Loaded index from {path} with {len(self._index)} labels') else: @@ -77,18 +78,11 @@ def _initialize_index(self): self.info(f'Created index with {len(self._index)} labels') def _save_index(self, path): - with open(path, 'w', encoding='utf-8') as indexfile: - for label, uris in self._index.items(): - line = label + '\t' + ' '.join(uris) - print(line, file=indexfile) - - def _load_index(self, path): - index = dict() - with open(path, 'r', encoding='utf-8') as indexfile: - for line in indexfile: - label, uris = line.strip().split('\t') - index[label] = uris.split() - return index + annif.util.atomic_save( + self._index, + self.datadir, + self.INDEX_FILE, + method=joblib.dump) def _create_index(self): index = defaultdict(set) From 2dafa54abf2863656dcb6b32bb23e1da367079b8 Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Wed, 12 May 2021 16:01:19 +0300 Subject: [PATCH 56/57] Adjust license explanation comment to point to license section in README.md --- annif/backend/yake.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/annif/backend/yake.py b/annif/backend/yake.py index 3ff62dccf..de828d54b 100755 --- a/annif/backend/yake.py +++ 
b/annif/backend/yake.py @@ -1,7 +1,6 @@ """Annif backend using Yake keyword extraction""" -# Enabling this optional backend may result in GPLv3 terms to cover the -# application, because YAKE (https://github.com/LIAAD/yake) is licensed under -# GPLv3. +# For license remarks of this backend see README.md: +# https://github.com/NatLibFi/Annif#license. import yake import joblib From 9a2127a29419ae72bb0b4af439fc8e892a4055fc Mon Sep 17 00:00:00 2001 From: Juho Inkinen Date: Wed, 12 May 2021 16:43:43 +0300 Subject: [PATCH 57/57] Truncate long log messages for objects to be saved --- annif/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annif/util.py b/annif/util.py index 042ff97f5..8d5174b4a 100644 --- a/annif/util.py +++ b/annif/util.py @@ -19,7 +19,7 @@ def atomic_save(obj, dirname, filename, method=None): tempfd, tempfilename = tempfile.mkstemp( prefix=prefix, suffix=suffix, dir=dirname) os.close(tempfd) - logger.debug('saving %s to temporary file %s', str(obj), tempfilename) + logger.debug('saving %s to temporary file %s', str(obj)[:90], tempfilename) if method is not None: method(obj, tempfilename) else: