Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue281 handle invalid lines in training data tsv file #299

Merged
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 23 additions & 7 deletions annif/corpus/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
from .types import Document, DocumentCorpus
from .convert import DocumentToSubjectCorpusMixin
from .subject import SubjectSet
from ..exception import AnnifException

logger = annif.logger


class DocumentDirectory(DocumentCorpus, DocumentToSubjectCorpusMixin):
Expand Down Expand Up @@ -60,13 +63,26 @@ def opener(path):
return gzip.open(path, mode='rt')
else:
opener = open

with opener(self.path) as tsvfile:
for line in tsvfile:
text, uris = line.split('\t', maxsplit=1)
subjects = [annif.util.cleanup_uri(uri)
for uri in uris.split()]
yield Document(text=text, uris=subjects, labels=[])
try:
with opener(self.path) as tsvfile:
for line in tsvfile:
yield from self._parse_tsv_line(line)
except FileNotFoundError as err:
raise AnnifException(str(err))
juhoinkinen marked this conversation as resolved.
Show resolved Hide resolved

@staticmethod
def _parse_tsv_line(line):
    """Parse a single line of a TSV corpus file.

    A valid line consists of the document text, a tab character, and a
    whitespace-separated list of subject URIs.

    Args:
        line: one raw line read from the TSV file.

    Yields:
        A single Document built from the line. Nothing is yielded when the
        line contains no tab; a warning is logged and the line is skipped.
    """
    # Test for the separator explicitly instead of catching the unpacking
    # ValueError and matching on its message text: that text is an
    # interpreter detail, and an unrelated ValueError raised while cleaning
    # URIs or building the Document must not be mistaken for a missing tab.
    text, separator, uris = line.partition('\t')
    if not separator:
        msg = 'Skipping invalid line (missing tab): "%s"'
        logger.warning(msg, line.rstrip())
        return
    subjects = [annif.util.cleanup_uri(uri) for uri in uris.split()]
    yield Document(text=text, uris=subjects, labels=[])


class DocumentList(DocumentCorpus, DocumentToSubjectCorpusMixin):
Expand Down
29 changes: 29 additions & 0 deletions tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import gzip
import annif.corpus
import pytest
from annif.exception import AnnifException


def test_subjectset_uris():
Expand Down Expand Up @@ -201,6 +203,33 @@ def test_docfile_plain(tmpdir):
assert len(list(docs.documents)) == 3


def test_docfile_nonexistent(tmpdir):
    """Reading a missing corpus file raises AnnifException with the path."""
    missing_path = tmpdir.join('documents_nonexistent.tsv')
    with pytest.raises(AnnifException) as excinfo:
        corpus = annif.corpus.DocumentFile(str(missing_path))
        # Consume the documents so that the file is actually opened.
        list(corpus.documents)
    expected = "No such file or directory: '{}'".format(missing_path)
    assert expected in str(excinfo.value)


def test_docfile_plain_invalid_lines(tmpdir, caplog, capsys):
    """Lines without a tab are skipped with a warning; valid ones load."""
    docfile = tmpdir.join('documents_invalid.tsv')
    docfile.write("""Läntinen\t<http://www.yso.fi/onto/yso/p2557>

Oulunlinnan\t<http://www.yso.fi/onto/yso/p7346>
A line with no tabs
Harald Hirmuinen\t<http://www.yso.fi/onto/yso/p6479>""")
    corpus = annif.corpus.DocumentFile(str(docfile))
    assert len(list(corpus.documents)) == 3
    # The warning ends up in the captured log when this test is run on its
    # own, but on stderr when the full suite is run, so gather every stream.
    log_text = caplog.text
    captured = capsys.readouterr()
    emitted = (log_text + captured.out + captured.err).splitlines()
    assert len(emitted) == 2
    expected_msg = 'Skipping invalid line (missing tab):'
    for message in emitted:
        assert expected_msg in message


def test_docfile_gzipped(tmpdir):
docfile = tmpdir.join('documents.tsv.gz')
with gzip.open(str(docfile), 'wt') as gzf:
Expand Down