Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue309 key files with labels only #313

Merged
merged 6 commits into from
Aug 9, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions annif/corpus/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
class DocumentToSubjectCorpusMixin(SubjectCorpus):
"""Mixin class for enabling a DocumentCorpus to act as a SubjectCorpus"""

_subject_index = None
_subject_corpus = None
_temp_directory = None

Expand All @@ -19,12 +18,6 @@ def subjects(self):
self._generate_corpus_from_documents()
return self._subject_corpus.subjects

def set_subject_index(self, subject_index):
    """Store the subject index that is consulted for the label lookups
    needed during conversion."""

    # Plain attribute assignment; nothing is validated or copied here.
    self._subject_index = subject_index

def _subject_filename(self, subject_id):
filename = '{:08d}.txt'.format(subject_id)
return os.path.join(self._temp_directory.name, filename)
Expand Down
9 changes: 6 additions & 3 deletions annif/corpus/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,9 @@ def documents(self):
text = docfile.read()
with open(keyfilename, encoding='utf-8') as keyfile:
subjects = SubjectSet.from_string(keyfile.read())
yield Document(text=text, uris=subjects.subject_uris,
labels=subjects.subject_labels)
yield self._create_document(text=text,
uris=subjects.subject_uris,
labels=subjects.subject_labels)


class DocumentFile(DocumentCorpus, DocumentToSubjectCorpusMixin):
Expand All @@ -66,7 +67,9 @@ def opener(path):
text, uris = line.split('\t', maxsplit=1)
subjects = [annif.util.cleanup_uri(uri)
for uri in uris.split()]
yield Document(text=text, uris=subjects, labels=[])
yield self._create_document(text=text,
uris=subjects,
labels=[])


class DocumentList(DocumentCorpus, DocumentToSubjectCorpusMixin):
Expand Down
16 changes: 16 additions & 0 deletions annif/corpus/subject.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,22 @@ def by_label(self, label):
logger.warning('Unknown subject label "%s"', label)
return None

def uris_to_labels(self, uris):
    """return the labels for the given URIs as a list, in input order;
    URIs for which by_uri() gives None are left out"""

    labels = []
    for uri in uris:
        subject_id = self.by_uri(uri)
        # Compare against None explicitly: a subject ID of 0 is valid.
        if subject_id is None:
            continue
        # Index 1 of a subject entry is used as its label.
        labels.append(self[subject_id][1])
    return labels

def labels_to_uris(self, labels):
    """return the URIs for the given labels as a list, in input order;
    labels for which by_label() gives None are left out"""

    uris = []
    for label in labels:
        subject_id = self.by_label(label)
        # Compare against None explicitly: a subject ID of 0 is valid.
        if subject_id is None:
            continue
        # Index 0 of a subject entry is used as its URI.
        uris.append(self[subject_id][0])
    return uris

def save(self, path):
"""Save this subject index into a file."""

Expand Down
21 changes: 21 additions & 0 deletions annif/corpus/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,33 @@
class DocumentCorpus(metaclass=abc.ABCMeta):
    """Abstract base class for document corpora"""

    # Subject index used for URI <-> label lookups; None until one is
    # supplied through set_subject_index().
    _subject_index = None

    @property
    @abc.abstractmethod
    def documents(self):
        """Iterate through the document corpus, yielding Document objects."""
        pass  # pragma: no cover

    def set_subject_index(self, subject_index):
        """Store the subject index that is consulted for the label lookups
        needed during conversion."""

        self._subject_index = subject_index

    def _create_document(self, text, uris, labels):
        """Build a Document from information that may be incomplete.

        When a (truthy) subject index has been set and exactly one of
        uris/labels is empty, the empty one is filled in as a set looked
        up from the other. Otherwise the values are passed on unchanged."""

        index = self._subject_index
        if index:
            if not uris and labels:
                # Only labels were given: derive the URIs from them.
                uris = set(index.labels_to_uris(labels))
            elif not labels and uris:
                # Only URIs were given: derive the labels from them.
                labels = set(index.uris_to_labels(uris))

        return Document(text=text, uris=uris, labels=labels)


Subject = collections.namedtuple('Subject', 'uri label text')

Expand Down
21 changes: 19 additions & 2 deletions tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,11 @@ def test_docdir_tsv_require_subjects(tmpdir):
assert files[1][1] == str(tmpdir.join('doc2.tsv'))


def test_docdir_as_doccorpus(tmpdir):
def test_docdir_tsv_as_doccorpus(tmpdir):
tmpdir.join('doc1.txt').write('doc1')
tmpdir.join('doc1.tsv').write('<http://example.org/subj1>\tsubj1')
tmpdir.join('doc2.txt').write('doc2')
tmpdir.join('doc2.tsv').write('<http://example.org/subj2>\tsubj1')
tmpdir.join('doc2.tsv').write('<http://example.org/subj2>\tsubj2')
tmpdir.join('doc3.txt').write('doc3')

docdir = annif.corpus.DocumentDirectory(str(tmpdir), require_subjects=True)
Expand All @@ -131,6 +131,23 @@ def test_docdir_as_doccorpus(tmpdir):
assert docs[1].uris == {'http://example.org/subj2'}


def test_docdir_key_as_doccorpus(tmpdir, subject_index):
    """Documents whose .key files hold only labels should come out with
    URIs once a subject index is set; doc3 has no key file and, with
    require_subjects=True, is expected to be left out."""

    # Files are written in the same order as before: text, then key file.
    fixture_files = [
        ('doc1.txt', 'doc1'),
        ('doc1.key', 'arkeologit'),
        ('doc2.txt', 'doc2'),
        ('doc2.key', 'kalliotaide'),
        ('doc3.txt', 'doc3'),
    ]
    for filename, content in fixture_files:
        tmpdir.join(filename).write(content)

    docdir = annif.corpus.DocumentDirectory(str(tmpdir), require_subjects=True)
    docdir.set_subject_index(subject_index)
    docs = list(docdir.documents)
    assert len(docs) == 2

    first, second = docs
    assert first.text == 'doc1'
    assert first.uris == {'http://www.yso.fi/onto/yso/p10849'}
    assert second.text == 'doc2'
    assert second.uris == {'http://www.yso.fi/onto/yso/p13027'}


def test_subjdir(tmpdir):
tmpdir.join('subj1.txt').write("""http://example.org/subj1 subject one
first subject
Expand Down