From e007c71b60bcbfb7ade7c0b06fc1177c66b0cad2 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 24 Sep 2019 16:28:02 +0300 Subject: [PATCH 1/2] Merge subjects in CombinedCorpus instead of concatenating them. Fixes #332 --- annif/corpus/combine.py | 13 ++++++++++--- tests/test_corpus.py | 16 ++++++++++++++++ 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/annif/corpus/combine.py b/annif/corpus/combine.py index f59e3bc11..18e65071c 100644 --- a/annif/corpus/combine.py +++ b/annif/corpus/combine.py @@ -1,7 +1,7 @@ """Class for combining multiple corpora so they behave like a single corpus""" import itertools -from .types import DocumentCorpus, SubjectCorpus +from .types import DocumentCorpus, SubjectCorpus, Subject class CombinedCorpus(SubjectCorpus, DocumentCorpus): @@ -13,8 +13,15 @@ def __init__(self, corpora): @property def subjects(self): - return itertools.chain.from_iterable( - [corpus.subjects for corpus in self._corpora]) + for source_subjects in zip(*[corpus.subjects for corpus in self._corpora]): + uri = None + label = None + texts = [] + for subject in source_subjects: + uri = subject.uri + label = subject.label + texts.append(subject.text) + yield Subject(uri=uri, label=label, text=" ".join(texts)) @property def documents(self): diff --git a/tests/test_corpus.py b/tests/test_corpus.py index 4fecf04b2..1b2b81501 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -250,3 +250,19 @@ def test_docfile_is_empty(tmpdir): empty_file = tmpdir.ensure('empty.tsv') docs = annif.corpus.DocumentFile(str(empty_file)) assert docs.is_empty() + + +def test_combinedcorpus(tmpdir, subject_index): + docfile = tmpdir.join('documents.tsv') + docfile.write("""Läntinen\t + Oulunlinnan\t + Harald Hirmuinen\t""") + + corpus1 = annif.corpus.DocumentFile(str(docfile)) + corpus2 = annif.corpus.DocumentFile(str(docfile)) + + combined = annif.corpus.CombinedCorpus([corpus1, corpus2]) + combined.set_subject_index(subject_index) + + assert len(list(combined.documents)) == 6 + assert len(list(combined.subjects)) == len(list(corpus1.subjects)) From d495acc98e6fde63023694c9fd092de4c6c3c656 Mon Sep 17 00:00:00 2001 From: Osma Suominen Date: Tue, 24 Sep 2019 16:33:04 +0300 Subject: [PATCH 2/2] fix too long line --- annif/corpus/combine.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/annif/corpus/combine.py b/annif/corpus/combine.py index 18e65071c..bb9109acb 100644 --- a/annif/corpus/combine.py +++ b/annif/corpus/combine.py @@ -13,7 +13,8 @@ def __init__(self, corpora): @property def subjects(self): - for source_subjects in zip(*[corpus.subjects for corpus in self._corpora]): + for source_subjects in zip( + *[corpus.subjects for corpus in self._corpora]): uri = None label = None texts = []