Merge pull request #263 from Veldhoen/pull-request

Explicit character encoding for all calls to open(). Fixes #262.
NatLibFi · Mar 6, 2019 · 75c51d0 · 75c51d0
2 parents d832514 + a0afde3
commit 75c51d0
Show file tree

Hide file tree

Showing 7 changed files with 13 additions and 12 deletions.
diff --git a/annif/backend/fasttext.py b/annif/backend/fasttext.py
@@ -63,7 +63,7 @@ def _label_to_subject(project, label):
         return project.subjects[subject_id]
 
     def _write_train_file(self, doc_subjects, filename):
-        with open(filename, 'w') as trainfile:
+        with open(filename, 'w', encoding='utf-8') as trainfile:
             for doc, subject_ids in doc_subjects.items():
                 labels = [self._id_to_label(sid) for sid in subject_ids
                           if sid is not None]

diff --git a/annif/backend/vw_multi.py b/annif/backend/vw_multi.py
@@ -86,7 +86,7 @@ def _normalize_text(project, text):
 
     @staticmethod
     def _write_train_file(examples, filename):
-        with open(filename, 'w') as trainfile:
+        with open(filename, 'w', encoding='utf-8') as trainfile:
             for ex in examples:
                 print(ex, file=trainfile)
 

diff --git a/annif/cli.py b/annif/cli.py
@@ -193,15 +193,15 @@ def run_analyzedir(project_id, directory, suffix, force,
 
     for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
             directory, require_subjects=False):
-        with open(docfilename) as docfile:
+        with open(docfilename, encoding='utf-8') as docfile:
             text = docfile.read()
         subjectfilename = re.sub(r'\.txt$', suffix, docfilename)
         if os.path.exists(subjectfilename) and not force:
             click.echo(
                 "Not overwriting {} (use --force to override)".format(
                     subjectfilename))
             continue
-        with open(subjectfilename, 'w') as subjfile:
+        with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
             results = project.analyze(text, backend_params)
             for hit in hit_filter(results):
                 line = "<{}>\t{}\t{}".format(hit.uri, hit.label, hit.score)

diff --git a/annif/corpus/convert.py b/annif/corpus/convert.py
@@ -31,12 +31,12 @@ def _subject_filename(self, subject_id):
 
     def _create_subject(self, subject_id, uri, label):
         filename = self._subject_filename(subject_id)
-        with open(filename, 'w') as subjfile:
+        with open(filename, 'w', encoding='utf-8') as subjfile:
             print("{} {}".format(uri, label), file=subjfile)
 
     def _add_text_to_subject(self, subject_id, text):
         filename = self._subject_filename(subject_id)
-        with open(filename, 'a') as subjfile:
+        with open(filename, 'a', encoding='utf-8') as subjfile:
             print(text, file=subjfile)
 
     def _generate_corpus_from_documents(self):

diff --git a/annif/corpus/document.py b/annif/corpus/document.py
@@ -37,9 +37,10 @@ def __iter__(self):
     @property
     def documents(self):
         for docfilename, keyfilename in self:
-            with open(docfilename, errors='replace') as docfile:
+            with open(docfilename, errors='replace',
+                      encoding='utf-8') as docfile:
                 text = docfile.read()
-            with open(keyfilename) as keyfile:
+            with open(keyfilename, encoding='utf-8') as keyfile:
                 subjects = SubjectSet.from_string(keyfile.read())
             yield Document(text=text, uris=subjects.subject_uris,
                            labels=subjects.subject_labels)

diff --git a/annif/corpus/subject.py b/annif/corpus/subject.py
@@ -19,7 +19,7 @@ def __init__(self, path):
     @property
     def subjects(self):
         for filename in self._filenames:
-            with open(filename) as subjfile:
+            with open(filename, encoding='utf-8') as subjfile:
                 uri, label = subjfile.readline().strip().split(' ', 1)
                 text = ' '.join(subjfile.readlines())
                 yield Subject(uri=uri, label=label, text=text)
@@ -33,7 +33,7 @@ def __init__(self, path):
 
     @property
     def subjects(self):
-        with open(self.path) as subjfile:
+        with open(self.path, encoding='utf-8') as subjfile:
             for line in subjfile:
                 uri, label = line.strip().split(None, 1)
                 clean_uri = annif.util.cleanup_uri(uri)
@@ -81,7 +81,7 @@ def by_label(self, label):
     def save(self, path):
         """Save this subject index into a file."""
 
-        with open(path, 'w') as subjfile:
+        with open(path, 'w', encoding='utf-8') as subjfile:
             for subject_id in range(len(self)):
                 line = "<{}>\t{}".format(
                     self._uris[subject_id], self._labels[subject_id])

diff --git a/annif/project.py b/annif/project.py
@@ -227,7 +227,7 @@ def _create_projects(projects_file, datadir, init_projects):
 
     config = configparser.ConfigParser()
     config.optionxform = lambda option: option
-    with open(projects_file) as projf:
+    with open(projects_file, encoding='utf-8') as projf:
         config.read_file(projf)
 
     # create AnnifProject objects from the configuration file