Skip to content

Commit

Permalink
Merge subjects in CombinedCorpus instead of concatenating them. Fixes #…
Browse files Browse the repository at this point in the history
  • Loading branch information
osma committed Sep 24, 2019
1 parent 81994c4 commit e007c71
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 3 deletions.
13 changes: 10 additions & 3 deletions annif/corpus/combine.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Class for combining multiple corpora so they behave like a single corpus"""

import itertools
from .types import DocumentCorpus, SubjectCorpus
from .types import DocumentCorpus, SubjectCorpus, Subject


class CombinedCorpus(SubjectCorpus, DocumentCorpus):
Expand All @@ -13,8 +13,15 @@ def __init__(self, corpora):

@property
def subjects(self):
    """Yield merged subjects, one per position across all source corpora.

    The subject streams of the individual corpora are walked in
    lockstep. For every position a single Subject is produced whose
    text is the space-joined text of the subjects found at that
    position, and whose uri and label are taken from the last corpus.

    Iteration ends as soon as the shortest subject stream is
    exhausted; nothing is yielded when there are no corpora.
    """
    # NOTE(review): merging by position assumes every corpus yields its
    # subjects in the same order -- confirm against the corpus classes.
    subject_streams = [corpus.subjects for corpus in self._corpora]
    for source_subjects in zip(*subject_streams):
        # zip() only yields non-empty tuples, so [-1] is always valid
        last = source_subjects[-1]
        text = " ".join(subject.text for subject in source_subjects)
        yield Subject(uri=last.uri, label=last.label, text=text)

@property
def documents(self):
Expand Down
16 changes: 16 additions & 0 deletions tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,3 +250,19 @@ def test_docfile_is_empty(tmpdir):
empty_file = tmpdir.ensure('empty.tsv')
docs = annif.corpus.DocumentFile(str(empty_file))
assert docs.is_empty()


def test_combinedcorpus(tmpdir, subject_index):
    """Combine two corpora read from the same TSV file.

    Documents are expected to be concatenated (3 rows x 2 corpora),
    while the number of subjects must equal that of a single corpus.
    """
    tsv_path = tmpdir.join('documents.tsv')
    tsv_path.write("""Läntinen\t<http://www.yso.fi/onto/yso/p2557>
Oulunlinnan\t<http://www.yso.fi/onto/yso/p7346>
Harald Hirmuinen\t<http://www.yso.fi/onto/yso/p6479>""")

    first = annif.corpus.DocumentFile(str(tsv_path))
    second = annif.corpus.DocumentFile(str(tsv_path))

    merged = annif.corpus.CombinedCorpus([first, second])
    merged.set_subject_index(subject_index)

    document_count = len(list(merged.documents))
    assert document_count == 6

    merged_subject_count = len(list(merged.subjects))
    single_subject_count = len(list(first.subjects))
    assert merged_subject_count == single_subject_count

0 comments on commit e007c71

Please sign in to comment.