diff --git a/annif/corpus/document.py b/annif/corpus/document.py index 157d033de..f29fc2ddb 100644 --- a/annif/corpus/document.py +++ b/annif/corpus/document.py @@ -5,10 +5,12 @@ import re import gzip import annif.util -from .types import Document, DocumentCorpus +from .types import DocumentCorpus from .convert import DocumentToSubjectCorpusMixin from .subject import SubjectSet +logger = annif.logger + class DocumentDirectory(DocumentCorpus, DocumentToSubjectCorpusMixin): """A directory of files as a full text document corpus""" @@ -61,15 +63,21 @@ def opener(path): return gzip.open(path, mode='rt') else: opener = open - with opener(self.path) as tsvfile: for line in tsvfile: - text, uris = line.split('\t', maxsplit=1) - subjects = [annif.util.cleanup_uri(uri) - for uri in uris.split()] - yield self._create_document(text=text, - uris=subjects, - labels=[]) + yield from self._parse_tsv_line(line) + + def _parse_tsv_line(self, line): + if '\t' in line: + text, uris = line.split('\t', maxsplit=1) + subjects = [annif.util.cleanup_uri(uri) + for uri in uris.split()] + yield self._create_document(text=text, + uris=subjects, + labels=[]) + else: + logger.warning('Skipping invalid line (missing tab): "%s"', + line.rstrip()) class DocumentList(DocumentCorpus, DocumentToSubjectCorpusMixin): diff --git a/tests/test_corpus.py b/tests/test_corpus.py index 9d8e2971e..4f7928aff 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -218,6 +218,23 @@ def test_docfile_plain(tmpdir): assert len(list(docs.documents)) == 3 +def test_docfile_plain_invalid_lines(tmpdir, caplog): + logger = annif.logger + logger.propagate = True + docfile = tmpdir.join('documents_invalid.tsv') + docfile.write("""Läntinen\t + + Oulunlinnan\t + A line with no tabs + Harald Hirmuinen\t""") + docs = annif.corpus.DocumentFile(str(docfile)) + assert len(list(docs.documents)) == 3 + assert len(caplog.records) == 2 + expected_msg = 'Skipping invalid line (missing tab):' + for record in caplog.records: + assert expected_msg in record.message + + def test_docfile_gzipped(tmpdir): docfile = tmpdir.join('documents.tsv.gz') with gzip.open(str(docfile), 'wt') as gzf: