Skip to content

Commit

Permalink
Simplify project configuration by allowing only one backend per project. Fixes #218
Browse files Browse the repository at this point in the history
  • Loading branch information
osma committed Jan 15, 2019
1 parent 9a63b09 commit c758985
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 92 deletions.
76 changes: 29 additions & 47 deletions annif/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def __init__(self, project_id, config, datadir):
self.vocab_id = config.get('vocab', None)
self._base_datadir = datadir
self._datadir = os.path.join(datadir, 'projects', self.project_id)
self.backends = self._setup_backends(config)
self.backend = self._setup_backend(config)

def _get_datadir(self):
"""return the path of the directory where this project can store its
Expand All @@ -45,16 +45,10 @@ def _get_datadir(self):
os.makedirs(self._datadir)
return self._datadir

def _setup_backends(self, config):
backends = []
for backend_id, weight in annif.util.parse_sources(config['backends']):
backend_type = annif.backend.get_backend(backend_id)
backend = backend_type(
backend_id,
params=config,
datadir=self._datadir)
backends.append((backend, weight))
return backends
def _setup_backend(self, config):
backend_id = config['backend']
backend_type = annif.backend.get_backend(backend_id)
return backend_type(backend_id, params=config, datadir=self._datadir)

def _initialize_analyzer(self):
analyzer = self.analyzer
Expand All @@ -80,40 +74,34 @@ def _initialize_vectorizer(self):
except AnnifException as err:
logger.warning(err.format_message())

def _initialize_backends(self):
logger.debug("Project '%s': initializing backends", self.project_id)
for backend, _ in self.backends:
try:
backend.initialize()
except AnnifException as err:
logger.warning(err.format_message())
def _initialize_backend(self):
logger.debug("Project '%s': initializing backend", self.project_id)
try:
self.backend.initialize()
except AnnifException as err:
logger.warning(err.format_message())

def initialize(self):
"""initialize this project and all backends so that they are ready to
"""initialize this project and its backend so that they are ready to
analyze"""
logger.debug("Initializing project '%s'", self.project_id)

self._initialize_analyzer()
self._initialize_subjects()
self._initialize_vectorizer()
self._initialize_backends()
self._initialize_backend()

self.initialized = True

def _analyze_with_backends(self, text, backend_params):
hits_from_backends = []
def _analyze_with_backend(self, text, backend_params):
if backend_params is None:
backend_params = {}
for backend, weight in self.backends:
beparams = backend_params.get(backend.backend_id, {})
hits = backend.analyze(text, project=self, params=beparams)
logger.debug(
'Got %d hits from backend %s',
len(hits), backend.backend_id)
hits_from_backends.append(
annif.hit.WeightedHits(
hits=hits, weight=weight))
return hits_from_backends
beparams = backend_params.get(self.backend.backend_id, {})
hits = self.backend.analyze(text, project=self, params=beparams)
logger.debug(
'Got %d hits from backend %s',
len(hits), self.backend.backend_id)
return hits

@property
def analyzer(self):
Expand Down Expand Up @@ -149,21 +137,18 @@ def vectorizer(self):
return self._vectorizer

def analyze(self, text, backend_params=None):
"""Analyze the given text by passing it to backends and joining the
results. Returns a list of AnalysisHit objects ordered by decreasing
score."""
"""Analyze the given text by passing it to the backend. Returns a
list of AnalysisHit objects ordered by decreasing score."""

logger.debug('Analyzing text "%s..." (len=%d)',
text[:20], len(text))
hits_from_backends = self._analyze_with_backends(text, backend_params)
merged_hits = annif.util.merge_hits(hits_from_backends, self.subjects)
logger.debug('%d hits after merging', len(merged_hits))
return merged_hits
hits = self._analyze_with_backend(text, backend_params)
logger.debug('%d hits from backend', len(hits))
return hits

def _create_vectorizer(self, subjectcorpus):
if True not in [
be[0].needs_subject_vectorizer for be in self.backends]:
logger.debug('not creating vectorizer: not needed by any backend')
if not self.backend.needs_subject_vectorizer:
logger.debug('not creating vectorizer: not needed by backend')
return
logger.info('creating vectorizer')
self._vectorizer = TfidfVectorizer(
Expand All @@ -180,17 +165,14 @@ def load_documents(self, corpus):

corpus.set_subject_index(self.subjects)
self._create_vectorizer(corpus)

for backend, _ in self.backends:
backend.load_corpus(corpus, project=self)
self.backend.load_corpus(corpus, project=self)

def dump(self):
"""return this project as a dict"""
return {'project_id': self.project_id,
'name': self.name,
'language': self.language,
'backends': [{'backend_id': be[0].backend_id,
'weight': be[1]} for be in self.backends]
'backend': {'backend_id': self.backend.backend_id}
}


Expand Down
12 changes: 3 additions & 9 deletions annif/swagger/annif.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -97,12 +97,8 @@ definitions:
backend_id:
type: string
example: my-backend
weight:
type: number
example: 1.0
required:
- backend_id
- weight
Project:
description: A project definition
properties:
Expand All @@ -115,15 +111,13 @@ definitions:
language:
type: string
example: en
backends:
type: array
items:
$ref: '#/definitions/ProjectBackend'
backend:
$ref: '#/definitions/ProjectBackend'
required:
- project_id
- name
- language
- backends
- backend
ProjectList:
description: A list of projects
properties:
Expand Down
24 changes: 12 additions & 12 deletions projects.cfg.dist
Original file line number Diff line number Diff line change
Expand Up @@ -3,31 +3,31 @@
[tfidf-fi]
name=TF-IDF Finnish
language=fi
backends=tfidf
backend=tfidf
analyzer=voikko(fi)
limit=100
vocab=yso-fi

[tfidf-sv]
name=TF-IDF Swedish
language=sv
backends=tfidf
backend=tfidf
analyzer=snowball(swedish)
limit=100
vocab=yso-sv

[tfidf-en]
name=TF-IDF English
language=en
backends=tfidf
backend=tfidf
analyzer=snowball(english)
limit=100
vocab=yso-en

[fasttext-fi]
name=fastText Finnish
language=fi
backends=fasttext
backend=fasttext
analyzer=voikko(fi)
dim=500
lr=0.25
Expand All @@ -40,7 +40,7 @@ vocab=yso-fi
[fasttext-sv]
name=fastText Swedish
language=sv
backends=fasttext
backend=fasttext
analyzer=snowball(swedish)
dim=500
lr=0.25
Expand All @@ -53,7 +53,7 @@ vocab=yso-sv
[fasttext-en]
name=fastText English
language=en
backends=fasttext
backend=fasttext
analyzer=snowball(english)
dim=500
lr=0.25
Expand All @@ -66,44 +66,44 @@ vocab=yso-en
[maui-fi]
name=Maui Finnish
language=fi
backends=http
backend=http
endpoint=http://localhost:8080/maui/jyu-fin/analyze
vocab=yso-fi

[maui-sv]
name=Maui Swedish
language=sv
backends=http
backend=http
endpoint=http://localhost:8080/maui/jyu-swe/analyze
vocab=yso-sv

[maui-en]
name=Maui English
language=en
backends=http
backend=http
endpoint=http://localhost:8080/maui/jyu-eng/analyze
vocab=yso-en

[annif-api-fi]
name=Annif prototype API Finnish
language=fi
backends=http
backend=http
endpoint=http://api.annif.org/v0/autoindex
project=yso-finna-fi
vocab=yso-fi

[annif-api-sv]
name=Annif prototype API Swedish
language=sv
backends=http
backend=http
endpoint=http://api.annif.org/v0/autoindex
project=yso-finna-sv
vocab=yso-sv

[annif-api-en]
name=Annif prototype API English
language=en
backends=http
backend=http
endpoint=http://api.annif.org/v0/autoindex
project=yso-finna-en
vocab=yso-en
22 changes: 11 additions & 11 deletions tests/projects.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -3,71 +3,71 @@
[dummy-fi]
name=Dummy Finnish
language=fi
backends=dummy
backend=dummy
analyzer=snowball(finnish)
key=value
vocab=dummy

[dummy-en]
name=Dummy English
language=en
backends=dummy:0.5
backend=dummy
analyzer=snowball(english)
vocab=dummy

[dummydummy]
name=Dummy+Dummy combination
language=en
backends=dummy:2,dummy
backend=dummy
analyzer=snowball(english)
vocab=dummy

[ensemble]
name=Ensemble
language=en
backends=ensemble
backend=ensemble
sources=dummy-en,dummydummy
vocab=dummy

[noanalyzer]
name=Dummy with no analyzer
language=en
backends=dummy
backend=dummy
vocab=dummy

[novocab]
name=TFIDF with no vocab
language=en
backends=tfidf
backend=tfidf
analyzer=snowball(english)

[pav]
name=PAV Ensemble Finnish
language=fi
backends=pav
backend=pav
sources=tfidf-fi,fasttext-fi
vocab=yso-fi

[tfidf-fi]
name=TF-IDF Finnish
language=fi
backends=tfidf
backend=tfidf
analyzer=snowball(finnish)
limit=10
vocab=yso-fi

[tfidf-en]
name=TF-IDF English
language=en
backends=tfidf
backend=tfidf
analyzer=snowball(english)
limit=10
vocab=yso-en

[fasttext-en]
name=fastText English
language=en
backends=fasttext
backend=fasttext
analyzer=snowball(english)
dim=100
lr=0.25
Expand All @@ -80,7 +80,7 @@ vocab=yso-en
[fasttext-fi]
name=fastText Finnish
language=fi
backends=fasttext
backend=fasttext
analyzer=snowball(finnish)
dim=100
lr=0.25
Expand Down
Loading

0 comments on commit c758985

Please sign in to comment.