Skip to content

Commit

Permalink
Merge pull request #313 from NatLibFi/issue309-key-files-with-labels-only
Browse files (browse the repository at this point in the history)

Issue309 key files with labels only
  • Loading branch information
osma authored Aug 9, 2019
2 parents 1877f40 + 0678998 commit a8999e7
Show file tree
Hide file tree
Showing 5 changed files with 62 additions and 12 deletions.
7 changes: 0 additions & 7 deletions annif/corpus/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
class DocumentToSubjectCorpusMixin(SubjectCorpus):
"""Mixin class for enabling a DocumentCorpus to act as a SubjectCorpus"""

_subject_index = None
_subject_corpus = None
_temp_directory = None

Expand All @@ -19,12 +18,6 @@ def subjects(self):
self._generate_corpus_from_documents()
return self._subject_corpus.subjects

def set_subject_index(self, subject_index):
"""Set a subject index for looking up labels that are necessary for
conversion"""

self._subject_index = subject_index

def _subject_filename(self, subject_id):
filename = '{:08d}.txt'.format(subject_id)
return os.path.join(self._temp_directory.name, filename)
Expand Down
9 changes: 6 additions & 3 deletions annif/corpus/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,9 @@ def documents(self):
text = docfile.read()
with open(keyfilename, encoding='utf-8') as keyfile:
subjects = SubjectSet.from_string(keyfile.read())
yield Document(text=text, uris=subjects.subject_uris,
labels=subjects.subject_labels)
yield self._create_document(text=text,
uris=subjects.subject_uris,
labels=subjects.subject_labels)


class DocumentFile(DocumentCorpus, DocumentToSubjectCorpusMixin):
Expand All @@ -66,7 +67,9 @@ def opener(path):
text, uris = line.split('\t', maxsplit=1)
subjects = [annif.util.cleanup_uri(uri)
for uri in uris.split()]
yield Document(text=text, uris=subjects, labels=[])
yield self._create_document(text=text,
uris=subjects,
labels=[])


class DocumentList(DocumentCorpus, DocumentToSubjectCorpusMixin):
Expand Down
16 changes: 16 additions & 0 deletions annif/corpus/subject.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,22 @@ def by_label(self, label):
logger.warning('Unknown subject label "%s"', label)
return None

def uris_to_labels(self, uris):
    """Return a list with the label of each given URI, in input order.

    Every URI is looked up with ``by_uri``; URIs for which that lookup
    yields None are left out of the result.
    """

    labels = []
    for uri in uris:
        subject_id = self.by_uri(uri)
        # compare against None, not truthiness: subject ID 0 is valid
        if subject_id is None:
            continue
        labels.append(self[subject_id][1])
    return labels

def labels_to_uris(self, labels):
    """Return a list with the URI of each given label, in input order.

    Every label is looked up with ``by_label``; labels for which that
    lookup yields None are left out of the result.
    """

    uris = []
    for label in labels:
        subject_id = self.by_label(label)
        # compare against None, not truthiness: subject ID 0 is valid
        if subject_id is None:
            continue
        uris.append(self[subject_id][0])
    return uris

def save(self, path):
"""Save this subject index into a file."""

Expand Down
21 changes: 21 additions & 0 deletions annif/corpus/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,33 @@
class DocumentCorpus(metaclass=abc.ABCMeta):
    """Abstract base class for document corpora"""

    # Subject index used to translate between URIs and labels; remains None
    # until set_subject_index() has been called.
    _subject_index = None

    @property
    @abc.abstractmethod
    def documents(self):
        """Iterate through the document corpus, yielding Document objects."""
        pass  # pragma: no cover

    def set_subject_index(self, subject_index):
        """Set a subject index for looking up labels that are necessary for
        conversion"""

        self._subject_index = subject_index

    def _create_document(self, text, uris, labels):
        """Create a new Document instance from possibly incomplete
        information.

        If a subject index has been set and exactly one of `uris` and
        `labels` is empty, the empty one is filled in as a set looked up
        from the other via the subject index. Without a subject index, or
        when both are given or both are empty, the values are passed
        through unchanged.
        """

        # Test for None explicitly instead of relying on truthiness: the
        # index is a container-like object, so an index that happens to be
        # falsy (e.g. one reporting length 0) must still count as "set".
        if self._subject_index is not None:
            # The two cases are mutually exclusive, hence if/elif.
            if not uris and labels:
                uris = set(self._subject_index.labels_to_uris(labels))
            elif not labels and uris:
                labels = set(self._subject_index.uris_to_labels(uris))

        return Document(text=text, uris=uris, labels=labels)


Subject = collections.namedtuple('Subject', 'uri label text')

Expand Down
21 changes: 19 additions & 2 deletions tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,11 @@ def test_docdir_tsv_require_subjects(tmpdir):
assert files[1][1] == str(tmpdir.join('doc2.tsv'))


def test_docdir_as_doccorpus(tmpdir):
def test_docdir_tsv_as_doccorpus(tmpdir):
tmpdir.join('doc1.txt').write('doc1')
tmpdir.join('doc1.tsv').write('<http://example.org/subj1>\tsubj1')
tmpdir.join('doc2.txt').write('doc2')
tmpdir.join('doc2.tsv').write('<http://example.org/subj2>\tsubj1')
tmpdir.join('doc2.tsv').write('<http://example.org/subj2>\tsubj2')
tmpdir.join('doc3.txt').write('doc3')

docdir = annif.corpus.DocumentDirectory(str(tmpdir), require_subjects=True)
Expand All @@ -131,6 +131,23 @@ def test_docdir_as_doccorpus(tmpdir):
assert docs[1].uris == {'http://example.org/subj2'}


def test_docdir_key_as_doccorpus(tmpdir, subject_index):
    """A directory whose .key files contain only labels should yield
    documents whose URIs are resolved through the subject index."""

    # doc3 has no .key file; with require_subjects=True only doc1 and doc2
    # are expected to come out of the corpus.
    fixture_files = (
        ('doc1.txt', 'doc1'),
        ('doc1.key', 'arkeologit'),
        ('doc2.txt', 'doc2'),
        ('doc2.key', 'kalliotaide'),
        ('doc3.txt', 'doc3'),
    )
    for filename, content in fixture_files:
        tmpdir.join(filename).write(content)

    docdir = annif.corpus.DocumentDirectory(str(tmpdir), require_subjects=True)
    docdir.set_subject_index(subject_index)
    docs = list(docdir.documents)
    assert len(docs) == 2

    first, second = docs
    assert first.text == 'doc1'
    assert first.uris == {'http://www.yso.fi/onto/yso/p10849'}
    assert second.text == 'doc2'
    assert second.uris == {'http://www.yso.fi/onto/yso/p13027'}

def test_subjdir(tmpdir):
tmpdir.join('subj1.txt').write("""http://example.org/subj1 subject one
first subject
Expand Down

0 comments on commit a8999e7

Please sign in to comment.