From 099c4aaa0f4146eae61e17071a735e77841d51af Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Mon, 2 Sep 2019 15:37:26 +0300
Subject: [PATCH] Always use UTF-8 encoding when parsing TSV file document corpora

---
 annif/corpus/document.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/annif/corpus/document.py b/annif/corpus/document.py
index f29fc2ddb..773dfc623 100644
--- a/annif/corpus/document.py
+++ b/annif/corpus/document.py
@@ -58,12 +58,10 @@ def __init__(self, path):
 
     @property
     def documents(self):
         if self.path.endswith('.gz'):
-            def opener(path):
-                """open a gzip compressed file in text mode"""
-                return gzip.open(path, mode='rt')
+            opener = gzip.open
         else:
             opener = open
-        with opener(self.path) as tsvfile:
+        with opener(self.path, mode='rt', encoding='utf-8') as tsvfile:
             for line in tsvfile:
                 yield from self._parse_tsv_line(line)