Skip to content

Commit

Permalink
Merge pull request #263 from Veldhoen/pull-request
Browse files (browse the repository at this point in the history)
Explicit character encoding for all calls to open()
Fixes #262
  • Loading branch information
osma authored Mar 6, 2019
2 parents d832514 + a0afde3 commit 75c51d0
Show file tree
Hide file tree
Showing 7 changed files with 13 additions and 12 deletions.
2 changes: 1 addition & 1 deletion annif/backend/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def _label_to_subject(project, label):
return project.subjects[subject_id]

def _write_train_file(self, doc_subjects, filename):
with open(filename, 'w') as trainfile:
with open(filename, 'w', encoding='utf-8') as trainfile:
for doc, subject_ids in doc_subjects.items():
labels = [self._id_to_label(sid) for sid in subject_ids
if sid is not None]
Expand Down
2 changes: 1 addition & 1 deletion annif/backend/vw_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def _normalize_text(project, text):

@staticmethod
def _write_train_file(examples, filename):
with open(filename, 'w') as trainfile:
with open(filename, 'w', encoding='utf-8') as trainfile:
for ex in examples:
print(ex, file=trainfile)

Expand Down
4 changes: 2 additions & 2 deletions annif/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,15 +193,15 @@ def run_analyzedir(project_id, directory, suffix, force,

for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
directory, require_subjects=False):
with open(docfilename) as docfile:
with open(docfilename, encoding='utf-8') as docfile:
text = docfile.read()
subjectfilename = re.sub(r'\.txt$', suffix, docfilename)
if os.path.exists(subjectfilename) and not force:
click.echo(
"Not overwriting {} (use --force to override)".format(
subjectfilename))
continue
with open(subjectfilename, 'w') as subjfile:
with open(subjectfilename, 'w', encoding='utf-8') as subjfile:
results = project.analyze(text, backend_params)
for hit in hit_filter(results):
line = "<{}>\t{}\t{}".format(hit.uri, hit.label, hit.score)
Expand Down
4 changes: 2 additions & 2 deletions annif/corpus/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,12 @@ def _subject_filename(self, subject_id):

def _create_subject(self, subject_id, uri, label):
filename = self._subject_filename(subject_id)
with open(filename, 'w') as subjfile:
with open(filename, 'w', encoding='utf-8') as subjfile:
print("{} {}".format(uri, label), file=subjfile)

def _add_text_to_subject(self, subject_id, text):
filename = self._subject_filename(subject_id)
with open(filename, 'a') as subjfile:
with open(filename, 'a', encoding='utf-8') as subjfile:
print(text, file=subjfile)

def _generate_corpus_from_documents(self):
Expand Down
5 changes: 3 additions & 2 deletions annif/corpus/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,10 @@ def __iter__(self):
@property
def documents(self):
for docfilename, keyfilename in self:
with open(docfilename, errors='replace') as docfile:
with open(docfilename, errors='replace',
encoding='utf-8') as docfile:
text = docfile.read()
with open(keyfilename) as keyfile:
with open(keyfilename, encoding='utf-8') as keyfile:
subjects = SubjectSet.from_string(keyfile.read())
yield Document(text=text, uris=subjects.subject_uris,
labels=subjects.subject_labels)
Expand Down
6 changes: 3 additions & 3 deletions annif/corpus/subject.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def __init__(self, path):
@property
def subjects(self):
for filename in self._filenames:
with open(filename) as subjfile:
with open(filename, encoding='utf-8') as subjfile:
uri, label = subjfile.readline().strip().split(' ', 1)
text = ' '.join(subjfile.readlines())
yield Subject(uri=uri, label=label, text=text)
Expand All @@ -33,7 +33,7 @@ def __init__(self, path):

@property
def subjects(self):
with open(self.path) as subjfile:
with open(self.path, encoding='utf-8') as subjfile:
for line in subjfile:
uri, label = line.strip().split(None, 1)
clean_uri = annif.util.cleanup_uri(uri)
Expand Down Expand Up @@ -81,7 +81,7 @@ def by_label(self, label):
def save(self, path):
"""Save this subject index into a file."""

with open(path, 'w') as subjfile:
with open(path, 'w', encoding='utf-8') as subjfile:
for subject_id in range(len(self)):
line = "<{}>\t{}".format(
self._uris[subject_id], self._labels[subject_id])
Expand Down
2 changes: 1 addition & 1 deletion annif/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ def _create_projects(projects_file, datadir, init_projects):

config = configparser.ConfigParser()
config.optionxform = lambda option: option
with open(projects_file) as projf:
with open(projects_file, encoding='utf-8') as projf:
config.read_file(projf)

# create AnnifProject objects from the configuration file
Expand Down

0 comments on commit 75c51d0

Please sign in to comment.