Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simplify project configuration by allowing only one backend per project #232

Merged
1 commit merged on Jan 15, 2019
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
76 changes: 29 additions & 47 deletions annif/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def __init__(self, project_id, config, datadir):
self.vocab_id = config.get('vocab', None)
self._base_datadir = datadir
self._datadir = os.path.join(datadir, 'projects', self.project_id)
self.backends = self._setup_backends(config)
self.backend = self._setup_backend(config)

def _get_datadir(self):
"""return the path of the directory where this project can store its
Expand All @@ -45,16 +45,10 @@ def _get_datadir(self):
os.makedirs(self._datadir)
return self._datadir

def _setup_backends(self, config):
backends = []
for backend_id, weight in annif.util.parse_sources(config['backends']):
backend_type = annif.backend.get_backend(backend_id)
backend = backend_type(
backend_id,
params=config,
datadir=self._datadir)
backends.append((backend, weight))
return backends
def _setup_backend(self, config):
backend_id = config['backend']
backend_type = annif.backend.get_backend(backend_id)
return backend_type(backend_id, params=config, datadir=self._datadir)

def _initialize_analyzer(self):
analyzer = self.analyzer
Expand All @@ -80,40 +74,34 @@ def _initialize_vectorizer(self):
except AnnifException as err:
logger.warning(err.format_message())

def _initialize_backends(self):
logger.debug("Project '%s': initializing backends", self.project_id)
for backend, _ in self.backends:
try:
backend.initialize()
except AnnifException as err:
logger.warning(err.format_message())
def _initialize_backend(self):
logger.debug("Project '%s': initializing backend", self.project_id)
try:
self.backend.initialize()
except AnnifException as err:
logger.warning(err.format_message())

def initialize(self):
"""initialize this project and all backends so that they are ready to
"""initialize this project and its backend so that they are ready to
analyze"""
logger.debug("Initializing project '%s'", self.project_id)

self._initialize_analyzer()
self._initialize_subjects()
self._initialize_vectorizer()
self._initialize_backends()
self._initialize_backend()

self.initialized = True

def _analyze_with_backends(self, text, backend_params):
hits_from_backends = []
def _analyze_with_backend(self, text, backend_params):
if backend_params is None:
backend_params = {}
for backend, weight in self.backends:
beparams = backend_params.get(backend.backend_id, {})
hits = backend.analyze(text, project=self, params=beparams)
logger.debug(
'Got %d hits from backend %s',
len(hits), backend.backend_id)
hits_from_backends.append(
annif.hit.WeightedHits(
hits=hits, weight=weight))
return hits_from_backends
beparams = backend_params.get(self.backend.backend_id, {})
hits = self.backend.analyze(text, project=self, params=beparams)
logger.debug(
'Got %d hits from backend %s',
len(hits), self.backend.backend_id)
return hits

@property
def analyzer(self):
Expand Down Expand Up @@ -149,21 +137,18 @@ def vectorizer(self):
return self._vectorizer

def analyze(self, text, backend_params=None):
"""Analyze the given text by passing it to backends and joining the
results. Returns a list of AnalysisHit objects ordered by decreasing
score."""
"""Analyze the given text by passing it to the backend. Returns a
list of AnalysisHit objects ordered by decreasing score."""

logger.debug('Analyzing text "%s..." (len=%d)',
text[:20], len(text))
hits_from_backends = self._analyze_with_backends(text, backend_params)
merged_hits = annif.util.merge_hits(hits_from_backends, self.subjects)
logger.debug('%d hits after merging', len(merged_hits))
return merged_hits
hits = self._analyze_with_backend(text, backend_params)
logger.debug('%d hits from backend', len(hits))
return hits

def _create_vectorizer(self, subjectcorpus):
if True not in [
be[0].needs_subject_vectorizer for be in self.backends]:
logger.debug('not creating vectorizer: not needed by any backend')
if not self.backend.needs_subject_vectorizer:
logger.debug('not creating vectorizer: not needed by backend')
return
logger.info('creating vectorizer')
self._vectorizer = TfidfVectorizer(
Expand All @@ -180,17 +165,14 @@ def load_documents(self, corpus):

corpus.set_subject_index(self.subjects)
self._create_vectorizer(corpus)

for backend, _ in self.backends:
backend.load_corpus(corpus, project=self)
self.backend.load_corpus(corpus, project=self)

def dump(self):
"""return this project as a dict"""
return {'project_id': self.project_id,
'name': self.name,
'language': self.language,
'backends': [{'backend_id': be[0].backend_id,
'weight': be[1]} for be in self.backends]
'backend': {'backend_id': self.backend.backend_id}
}


Expand Down
12 changes: 3 additions & 9 deletions annif/swagger/annif.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -97,12 +97,8 @@ definitions:
backend_id:
type: string
example: my-backend
weight:
type: number
example: 1.0
required:
- backend_id
- weight
Project:
description: A project definition
properties:
Expand All @@ -115,15 +111,13 @@ definitions:
language:
type: string
example: en
backends:
type: array
items:
$ref: '#/definitions/ProjectBackend'
backend:
$ref: '#/definitions/ProjectBackend'
required:
- project_id
- name
- language
- backends
- backend
ProjectList:
description: A list of projects
properties:
Expand Down
24 changes: 12 additions & 12 deletions projects.cfg.dist
Original file line number Diff line number Diff line change
Expand Up @@ -3,31 +3,31 @@
[tfidf-fi]
name=TF-IDF Finnish
language=fi
backends=tfidf
backend=tfidf
analyzer=voikko(fi)
limit=100
vocab=yso-fi

[tfidf-sv]
name=TF-IDF Swedish
language=sv
backends=tfidf
backend=tfidf
analyzer=snowball(swedish)
limit=100
vocab=yso-sv

[tfidf-en]
name=TF-IDF English
language=en
backends=tfidf
backend=tfidf
analyzer=snowball(english)
limit=100
vocab=yso-en

[fasttext-fi]
name=fastText Finnish
language=fi
backends=fasttext
backend=fasttext
analyzer=voikko(fi)
dim=500
lr=0.25
Expand All @@ -40,7 +40,7 @@ vocab=yso-fi
[fasttext-sv]
name=fastText Swedish
language=sv
backends=fasttext
backend=fasttext
analyzer=snowball(swedish)
dim=500
lr=0.25
Expand All @@ -53,7 +53,7 @@ vocab=yso-sv
[fasttext-en]
name=fastText English
language=en
backends=fasttext
backend=fasttext
analyzer=snowball(english)
dim=500
lr=0.25
Expand All @@ -66,44 +66,44 @@ vocab=yso-en
[maui-fi]
name=Maui Finnish
language=fi
backends=http
backend=http
endpoint=http://localhost:8080/maui/jyu-fin/analyze
vocab=yso-fi

[maui-sv]
name=Maui Swedish
language=sv
backends=http
backend=http
endpoint=http://localhost:8080/maui/jyu-swe/analyze
vocab=yso-sv

[maui-en]
name=Maui English
language=en
backends=http
backend=http
endpoint=http://localhost:8080/maui/jyu-eng/analyze
vocab=yso-en

[annif-api-fi]
name=Annif prototype API Finnish
language=fi
backends=http
backend=http
endpoint=http://api.annif.org/v0/autoindex
project=yso-finna-fi
vocab=yso-fi

[annif-api-sv]
name=Annif prototype API Swedish
language=sv
backends=http
backend=http
endpoint=http://api.annif.org/v0/autoindex
project=yso-finna-sv
vocab=yso-sv

[annif-api-en]
name=Annif prototype API English
language=en
backends=http
backend=http
endpoint=http://api.annif.org/v0/autoindex
project=yso-finna-en
vocab=yso-en
22 changes: 11 additions & 11 deletions tests/projects.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -3,71 +3,71 @@
[dummy-fi]
name=Dummy Finnish
language=fi
backends=dummy
backend=dummy
analyzer=snowball(finnish)
key=value
vocab=dummy

[dummy-en]
name=Dummy English
language=en
backends=dummy:0.5
backend=dummy
analyzer=snowball(english)
vocab=dummy

[dummydummy]
name=Dummy+Dummy combination
language=en
backends=dummy:2,dummy
backend=dummy
analyzer=snowball(english)
vocab=dummy

[ensemble]
name=Ensemble
language=en
backends=ensemble
backend=ensemble
sources=dummy-en,dummydummy
vocab=dummy

[noanalyzer]
name=Dummy with no analyzer
language=en
backends=dummy
backend=dummy
vocab=dummy

[novocab]
name=TFIDF with no vocab
language=en
backends=tfidf
backend=tfidf
analyzer=snowball(english)

[pav]
name=PAV Ensemble Finnish
language=fi
backends=pav
backend=pav
sources=tfidf-fi,fasttext-fi
vocab=yso-fi

[tfidf-fi]
name=TF-IDF Finnish
language=fi
backends=tfidf
backend=tfidf
analyzer=snowball(finnish)
limit=10
vocab=yso-fi

[tfidf-en]
name=TF-IDF English
language=en
backends=tfidf
backend=tfidf
analyzer=snowball(english)
limit=10
vocab=yso-en

[fasttext-en]
name=fastText English
language=en
backends=fasttext
backend=fasttext
analyzer=snowball(english)
dim=100
lr=0.25
Expand All @@ -80,7 +80,7 @@ vocab=yso-en
[fasttext-fi]
name=fastText Finnish
language=fi
backends=fasttext
backend=fasttext
analyzer=snowball(finnish)
dim=100
lr=0.25
Expand Down
Loading