diff --git a/annif/corpus/combine.py b/annif/corpus/combine.py
index f59e3bc11..bb9109acb 100644
--- a/annif/corpus/combine.py
+++ b/annif/corpus/combine.py
@@ -1,7 +1,7 @@
 """Class for combining multiple corpora so they behave like a single corpus"""
 
 import itertools
-from .types import DocumentCorpus, SubjectCorpus
+from .types import DocumentCorpus, SubjectCorpus, Subject
 
 
 class CombinedCorpus(SubjectCorpus, DocumentCorpus):
@@ -13,8 +13,16 @@ def __init__(self, corpora):
 
     @property
     def subjects(self):
-        return itertools.chain.from_iterable(
-            [corpus.subjects for corpus in self._corpora])
+        for source_subjects in zip(
+                *[corpus.subjects for corpus in self._corpora]):
+            uri = None
+            label = None
+            texts = []
+            for subject in source_subjects:
+                uri = subject.uri
+                label = subject.label
+                texts.append(subject.text)
+            yield Subject(uri=uri, label=label, text=" ".join(texts))
 
     @property
     def documents(self):
diff --git a/tests/test_corpus.py b/tests/test_corpus.py
index 4fecf04b2..1b2b81501 100644
--- a/tests/test_corpus.py
+++ b/tests/test_corpus.py
@@ -250,3 +250,19 @@ def test_docfile_is_empty(tmpdir):
     empty_file = tmpdir.ensure('empty.tsv')
     docs = annif.corpus.DocumentFile(str(empty_file))
     assert docs.is_empty()
+
+
+def test_combinedcorpus(tmpdir, subject_index):
+    docfile = tmpdir.join('documents.tsv')
+    docfile.write("""Läntinen\t
+        Oulunlinnan\t
+        Harald Hirmuinen\t""")
+
+    corpus1 = annif.corpus.DocumentFile(str(docfile))
+    corpus2 = annif.corpus.DocumentFile(str(docfile))
+
+    combined = annif.corpus.CombinedCorpus([corpus1, corpus2])
+    combined.set_subject_index(subject_index)
+
+    assert len(list(combined.documents)) == 6
+    assert len(list(combined.subjects)) == len(list(corpus1.subjects))