NatLibFi · osma · Aug 9, 2019 · Aug 8, 2019 · Aug 9, 2019 · Aug 9, 2019
diff --git a/annif/corpus/convert.py b/annif/corpus/convert.py
@@ -9,7 +9,6 @@
 class DocumentToSubjectCorpusMixin(SubjectCorpus):
     """Mixin class for enabling a DocumentCorpus to act as a SubjectCorpus"""
 
-    _subject_index = None
     _subject_corpus = None
     _temp_directory = None
 
@@ -19,12 +18,6 @@ def subjects(self):
             self._generate_corpus_from_documents()
         return self._subject_corpus.subjects
 
-    def set_subject_index(self, subject_index):
-        """Set a subject index for looking up labels that are necessary for
-        conversion"""
-
-        self._subject_index = subject_index
-
     def _subject_filename(self, subject_id):
         filename = '{:08d}.txt'.format(subject_id)
         return os.path.join(self._temp_directory.name, filename)

diff --git a/annif/corpus/document.py b/annif/corpus/document.py
@@ -42,8 +42,9 @@ def documents(self):
                 text = docfile.read()
             with open(keyfilename, encoding='utf-8') as keyfile:
                 subjects = SubjectSet.from_string(keyfile.read())
-            yield Document(text=text, uris=subjects.subject_uris,
-                           labels=subjects.subject_labels)
+            yield self._create_document(text=text,
+                                        uris=subjects.subject_uris,
+                                        labels=subjects.subject_labels)
 
 
 class DocumentFile(DocumentCorpus, DocumentToSubjectCorpusMixin):
@@ -66,7 +67,9 @@ def opener(path):
                 text, uris = line.split('\t', maxsplit=1)
                 subjects = [annif.util.cleanup_uri(uri)
                             for uri in uris.split()]
-                yield Document(text=text, uris=subjects, labels=[])
+                yield self._create_document(text=text,
+                                            uris=subjects,
+                                            labels=[])
 
 
 class DocumentList(DocumentCorpus, DocumentToSubjectCorpusMixin):

diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py
@@ -78,6 +78,22 @@ def by_label(self, label):
             logger.warning('Unknown subject label "%s"', label)
             return None
 
+    def uris_to_labels(self, uris):
+        """Return a list of labels for the given URIs, in input order. URIs
+        for which by_uri() returns None are skipped."""
+
+        return [self[subject_id][1]
+                for subject_id in (self.by_uri(uri) for uri in uris)
+                if subject_id is not None]
+
+    def labels_to_uris(self, labels):
+        """Return a list of URIs for the given labels, in input order. Labels
+        for which by_label() returns None are skipped."""
+
+        return [self[subject_id][0]
+                for subject_id in (self.by_label(label) for label in labels)
+                if subject_id is not None]
+
     def save(self, path):
         """Save this subject index into a file."""
 

diff --git a/annif/corpus/types.py b/annif/corpus/types.py
@@ -10,12 +10,33 @@
 class DocumentCorpus(metaclass=abc.ABCMeta):
     """Abstract base class for document corpora"""
 
+    _subject_index = None
+
     @property
     @abc.abstractmethod
     def documents(self):
         """Iterate through the document corpus, yielding Document objects."""
         pass  # pragma: no cover
 
+    def set_subject_index(self, subject_index):
+        """Set the subject index used for URI/label lookups, both when
+        creating documents and for conversion."""
+
+        self._subject_index = subject_index
+
+    def _create_document(self, text, uris, labels):
+        """Create a Document from possibly incomplete information. If a
+        truthy subject index is set, missing URIs are looked up from labels
+        and missing labels from URIs; looked-up values are sets."""
+
+        if self._subject_index:
+            if not uris and labels:
+                uris = set((self._subject_index.labels_to_uris(labels)))
+            if not labels and uris:
+                labels = set((self._subject_index.uris_to_labels(uris)))
+
+        return Document(text=text, uris=uris, labels=labels)
+
 
 Subject = collections.namedtuple('Subject', 'uri label text')
 

diff --git a/tests/test_corpus.py b/tests/test_corpus.py
@@ -115,11 +115,11 @@ def test_docdir_tsv_require_subjects(tmpdir):
     assert files[1][1] == str(tmpdir.join('doc2.tsv'))
 
 
-def test_docdir_as_doccorpus(tmpdir):
+def test_docdir_tsv_as_doccorpus(tmpdir):
     tmpdir.join('doc1.txt').write('doc1')
     tmpdir.join('doc1.tsv').write('<http://example.org/subj1>\tsubj1')
     tmpdir.join('doc2.txt').write('doc2')
-    tmpdir.join('doc2.tsv').write('<http://example.org/subj2>\tsubj1')
+    tmpdir.join('doc2.tsv').write('<http://example.org/subj2>\tsubj2')
     tmpdir.join('doc3.txt').write('doc3')
 
     docdir = annif.corpus.DocumentDirectory(str(tmpdir), require_subjects=True)
@@ -131,6 +131,23 @@ def test_docdir_as_doccorpus(tmpdir):
     assert docs[1].uris == {'http://example.org/subj2'}
 
 
+def test_docdir_key_as_doccorpus(tmpdir, subject_index):
+    tmpdir.join('doc1.txt').write('doc1')
+    tmpdir.join('doc1.key').write('arkeologit')
+    tmpdir.join('doc2.txt').write('doc2')
+    tmpdir.join('doc2.key').write('kalliotaide')
+    tmpdir.join('doc3.txt').write('doc3')  # no .key file for doc3
+
+    docdir = annif.corpus.DocumentDirectory(str(tmpdir), require_subjects=True)
+    docdir.set_subject_index(subject_index)  # needed to map labels to URIs
+    docs = list(docdir.documents)
+    assert len(docs) == 2
+    assert docs[0].text == 'doc1'
+    assert docs[0].uris == {'http://www.yso.fi/onto/yso/p10849'}
+    assert docs[1].text == 'doc2'
+    assert docs[1].uris == {'http://www.yso.fi/onto/yso/p13027'}
+
+
 def test_subjdir(tmpdir):
     tmpdir.join('subj1.txt').write("""http://example.org/subj1 subject one
         first subject