Skip to content

Commit

Permalink
BUG: closes #705, csv is encoded utf-8 and then decoded on the read side
Browse files Browse the repository at this point in the history
  • Loading branch information
adamklein committed Jan 27, 2012
1 parent 163d8b4 commit c0fc368
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 6 deletions.
39 changes: 39 additions & 0 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@

import pandas._tseries as lib
from pandas.util import py3compat
import codecs
import csv

# XXX: HACK for NumPy 1.5.1 to suppress warnings
try:
Expand Down Expand Up @@ -828,3 +830,40 @@ def console_encode(value):
return value.encode(sys.stdin.encoding, 'replace')
except (AttributeError, TypeError):
return value.encode('ascii', 'replace')

def csv_encode(value):
    """
    Prepare a single value for writing through the ``csv`` module.

    A ``unicode`` instance is returned encoded as a UTF-8 byte string when
    ``py3compat.PY3`` is false. Any other value, and every value when
    ``py3compat.PY3`` is true, is handed back untouched.
    """
    # ``not PY3`` is tested first so that the ``unicode`` name is never
    # evaluated when PY3 is true.
    needs_encoding = not py3compat.PY3 and isinstance(value, unicode)
    if needs_encoding:
        return value.encode('UTF-8', 'replace')

    return value

class UTF8Recoder:
    """
    Iterator that reads an encoded stream and reencodes the input to UTF-8

    Parameters
    ----------
    f : file-like object
        Byte stream whose content is encoded in `encoding`.
    encoding : str
        Codec name; it is passed to ``codecs.getreader``.

    Each iteration step yields the next chunk produced by the codec stream
    reader, encoded as UTF-8.
    """

    def __init__(self, f, encoding):
        # Wrap the raw stream in a decoding reader for the source encoding.
        self.reader = codecs.getreader(encoding)(f)

    def __iter__(self):
        return self

    def next(self):
        # The builtin next() dispatches to .next() on Python 2 and to
        # .__next__() on Python 3; calling self.reader.next() directly only
        # works on Python 2. StopIteration from the reader propagates and
        # terminates iteration.
        return next(self.reader).encode("utf-8")

    # Python 3 spelling of the iterator protocol; ``next`` here is the method
    # defined just above in the class namespace, not the builtin.
    __next__ = next

class UnicodeReader:
    """
    CSV reader over the file-like object "f", whose content is stored in
    the given encoding. Every row is returned as a list of unicode strings.

    The input is passed through UTF8Recoder before being handed to
    ``csv.reader``, and each parsed field is decoded from UTF-8 again on
    the way out.

    NOTE(review): ``next`` uses the ``unicode`` builtin and the reader's
    ``.next()`` method, both Python 2 spellings -- confirm how this class
    is meant to be used under Python 3.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        recoded = UTF8Recoder(f, encoding)
        self.reader = csv.reader(recoded, dialect=dialect, **kwds)

    def __iter__(self):
        return self

    def next(self):
        # Pull one parsed row of UTF-8 byte strings, then decode field by
        # field.
        decoded = []
        for field in self.reader.next():
            decoded.append(unicode(field, "utf-8"))
        return decoded

13 changes: 9 additions & 4 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
import numpy.ma as ma

from pandas.core.common import (isnull, notnull, PandasError, _try_sort,
_default_index, _stringify)
_default_index, _stringify, csv_encode)
from pandas.core.daterange import DateRange
from pandas.core.generic import NDFrame
from pandas.core.index import Index, MultiIndex, NULL_INDEX, _ensure_index
Expand Down Expand Up @@ -890,9 +890,13 @@ def to_csv(self, path, sep=",", na_rep='', cols=None, header=True,
elif not isinstance(index_label, (list, tuple, np.ndarray)):
# given a string for a DF with Index
index_label = [index_label]
csvout.writerow(list(index_label) + list(cols))

encoded_labels = [csv_encode(val) for val in index_label]
encoded_cols = [csv_encode(val) for val in cols]
csvout.writerow(encoded_labels + encoded_cols)
else:
csvout.writerow(cols)
encoded_cols = [csv_encode(val) for val in cols]
csvout.writerow(encoded_cols)

nlevels = getattr(self.index, 'nlevels', 1)
for idx in self.index:
Expand All @@ -909,7 +913,8 @@ def to_csv(self, path, sep=",", na_rep='', cols=None, header=True,

row_fields.append(val)

csvout.writerow(row_fields)
encoded_rows = [csv_encode(val) for val in row_fields]
csvout.writerow(encoded_rows)

f.close()

Expand Down
5 changes: 3 additions & 2 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,8 +261,9 @@ def _make_reader(self, f):
self.pos += 1
sniffed = csv.Sniffer().sniff(line)
dia.delimiter = sniffed.delimiter
self.buf.extend(list(csv.reader(StringIO(line), dialect=dia)))
reader = csv.reader(f, dialect=dia)
self.buf.extend(list(com.UnicodeReader(StringIO(line),
dialect=dia)))
reader = com.UnicodeReader(f, dialect=dia)
else:
reader = (re.split(sep, line.strip()) for line in f)

Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2482,6 +2482,15 @@ def test_to_csv_bug(self):

os.remove(path)

def test_to_csv_unicode(self):
    # Round-trip a frame whose column label contains a non-ASCII character
    # through to_csv / read_csv and check that it comes back unchanged.
    from pandas import read_csv
    path = '__tmp__.csv'
    df = DataFrame({u'c/\u03c3': [1, 2, 3]})
    try:
        df.to_csv(path)
        df2 = read_csv(path, index_col=0)
        assert_frame_equal(df, df2)
    finally:
        # Remove the scratch file even when writing, reading or the
        # comparison fails, so a failing run does not leave it behind.
        if os.path.exists(path):
            os.remove(path)

def test_info(self):
io = StringIO()
self.frame.info(buf=io)
Expand Down

0 comments on commit c0fc368

Please sign in to comment.