Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Issue281 handle invalid lines in training data tsv file #299

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 16 additions & 8 deletions annif/corpus/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@
import re
import gzip
import annif.util
from .types import Document, DocumentCorpus
from .types import DocumentCorpus
from .convert import DocumentToSubjectCorpusMixin
from .subject import SubjectSet

logger = annif.logger


class DocumentDirectory(DocumentCorpus, DocumentToSubjectCorpusMixin):
"""A directory of files as a full text document corpus"""
Expand Down Expand Up @@ -61,15 +63,21 @@ def opener(path):
return gzip.open(path, mode='rt')
else:
opener = open

with opener(self.path) as tsvfile:
for line in tsvfile:
text, uris = line.split('\t', maxsplit=1)
subjects = [annif.util.cleanup_uri(uri)
for uri in uris.split()]
yield self._create_document(text=text,
uris=subjects,
labels=[])
yield from self._parse_tsv_line(line)

def _parse_tsv_line(self, line):
    """Parse a single TSV line, yielding at most one document.

    A usable line consists of the document text, a tab character and
    then whitespace-separated subject URIs. A line that contains no tab
    is logged as a warning and produces nothing.
    """
    if '\t' not in line:
        logger.warning('Skipping invalid line (missing tab): "%s"',
                       line.rstrip())
        return

    # Only the first tab separates the text from the URI field; any
    # later tabs are treated as whitespace between URIs.
    text, _, uri_field = line.partition('\t')
    cleaned_uris = list(map(annif.util.cleanup_uri, uri_field.split()))
    yield self._create_document(text=text,
                                uris=cleaned_uris,
                                labels=[])


class DocumentList(DocumentCorpus, DocumentToSubjectCorpusMixin):
Expand Down
17 changes: 17 additions & 0 deletions tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,23 @@ def test_docfile_plain(tmpdir):
assert len(list(docs.documents)) == 3


def test_docfile_plain_invalid_lines(tmpdir, caplog):
    """Tab-less lines in a plain TSV file are skipped with a warning."""
    # NOTE(review): presumably enabled so that caplog can see records
    # emitted through annif.logger -- confirm against logger setup.
    annif.logger.propagate = True

    # Fixture: three text<TAB>uri lines, one empty line and one line
    # that has no tab at all.
    tsv_path = tmpdir.join('documents_invalid.tsv')
    tsv_path.write("""Läntinen\t<http://www.yso.fi/onto/yso/p2557>

Oulunlinnan\t<http://www.yso.fi/onto/yso/p7346>
A line with no tabs
Harald Hirmuinen\t<http://www.yso.fi/onto/yso/p6479>""")

    corpus = annif.corpus.DocumentFile(str(tsv_path))
    parsed = list(corpus.documents)
    assert len(parsed) == 3

    captured = caplog.records
    assert len(captured) == 2
    assert all('Skipping invalid line (missing tab):' in rec.message
               for rec in captured)


def test_docfile_gzipped(tmpdir):
docfile = tmpdir.join('documents.tsv.gz')
with gzip.open(str(docfile), 'wt') as gzf:
Expand Down