NatLibFi · juhoinkinen · Sep 2, 2019 · Jul 17, 2019 · Jul 17, 2019 · Jul 17, 2019
diff --git a/annif/corpus/document.py b/annif/corpus/document.py
@@ -8,6 +8,9 @@
 from .types import Document, DocumentCorpus
 from .convert import DocumentToSubjectCorpusMixin
 from .subject import SubjectSet
+from ..exception import AnnifException
+
+logger = annif.logger
 
 
 class DocumentDirectory(DocumentCorpus, DocumentToSubjectCorpusMixin):
@@ -60,13 +63,26 @@ def opener(path):
                 return gzip.open(path, mode='rt')
         else:
             opener = open
-
-        with opener(self.path) as tsvfile:
-            for line in tsvfile:
-                text, uris = line.split('\t', maxsplit=1)
-                subjects = [annif.util.cleanup_uri(uri)
-                            for uri in uris.split()]
-                yield Document(text=text, uris=subjects, labels=[])
+        try:
+            with opener(self.path) as tsvfile:
+                for line in tsvfile:
+                    yield from self._parse_tsv_line(line)
+        except FileNotFoundError as err:
+            raise AnnifException(str(err))
+
+    @staticmethod
+    def _parse_tsv_line(line):
+        """Yield the Document on one TSV line; log a warning and yield
+        nothing when the line contains no tab separator."""
+        # Test for the separator itself rather than matching the wording of
+        # the ValueError that a failed two-value unpacking would raise.
+        text, tab, uris = line.partition('\t')
+        if not tab:
+            msg = 'Skipping invalid line (missing tab): "%s"'
+            logger.warning(msg, line.rstrip())
+            return
+        subjects = [annif.util.cleanup_uri(u) for u in uris.split()]
+        yield Document(text=text, uris=subjects, labels=[])
 
 
 class DocumentList(DocumentCorpus, DocumentToSubjectCorpusMixin):

diff --git a/tests/test_corpus.py b/tests/test_corpus.py
@@ -2,6 +2,8 @@
 
 import gzip
 import annif.corpus
+import pytest
+from annif.exception import AnnifException
 
 
 def test_subjectset_uris():
@@ -201,6 +203,33 @@ def test_docfile_plain(tmpdir):
     assert len(list(docs.documents)) == 3
 
 
+def test_docfile_nonexistent(tmpdir):
+    """Reading a missing TSV file raises AnnifException naming the path."""
+    missing = tmpdir.join('documents_nonexistent.tsv')
+    with pytest.raises(AnnifException) as err:
+        list(annif.corpus.DocumentFile(str(missing)).documents)
+    assert "No such file or directory: '{}'".format(missing) in str(err.value)
+
+
+def test_docfile_plain_invalid_lines(tmpdir, caplog, capsys):
+    tsv_path = tmpdir.join('documents_invalid.tsv')
+    tsv_path.write("""Läntinen\t<http://www.yso.fi/onto/yso/p2557>
+
+        Oulunlinnan\t<http://www.yso.fi/onto/yso/p7346>
+        A line with no tabs
+        Harald Hirmuinen\t<http://www.yso.fi/onto/yso/p6479>""")
+    corpus = annif.corpus.DocumentFile(str(tsv_path))
+    assert len(list(corpus.documents)) == 3
+    # Inspect the log capture together with stdout/stderr: run on its own,
+    # this test logs the warnings, but in the full suite they reach stderr.
+    logged = caplog.text
+    out, err = capsys.readouterr()
+    emitted = (logged + out + err).splitlines()
+    assert len(emitted) == 2
+    assert all('Skipping invalid line (missing tab):' in row
+               for row in emitted)
+
+
 def test_docfile_gzipped(tmpdir):
     docfile = tmpdir.join('documents.tsv.gz')
     with gzip.open(str(docfile), 'wt') as gzf: