Skip to content

Commit

Permalink
BUG: fix utf-16 handling re: #2418, #2298
Browse files Browse the repository at this point in the history
  • Loading branch information
wesm committed Dec 6, 2012
1 parent 67ef67f commit f985aa1
Show file tree
Hide file tree
Showing 5 changed files with 31 additions and 5 deletions.
12 changes: 11 additions & 1 deletion pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1066,6 +1066,8 @@ def load(path):
f.close()




class UTF8Recoder:
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
Expand All @@ -1076,6 +1078,12 @@ def __init__(self, f, encoding):
# Iterator protocol: the recoder is its own iterator, so iter(obj) hands
# back the same object.
def __iter__(self):
return self

def read(self, bytes=-1):
    """
    Forward `bytes` (default -1) to the wrapped reader's read() and
    return whatever it yields, encoded as UTF-8.
    """
    decoded = self.reader.read(bytes)
    return decoded.encode('utf-8')

def readline(self):
    """
    Fetch one line from the wrapped reader via its readline() and return
    it encoded as UTF-8.
    """
    decoded_line = self.reader.readline()
    return decoded_line.encode('utf-8')

def next(self):
    """
    Advance the wrapped reader by calling its next() method and return
    the item it produces, encoded as UTF-8.
    """
    item = self.reader.next()
    return item.encode("utf-8")

Expand All @@ -1088,6 +1096,8 @@ def _get_handle(path, mode, encoding=None):
f = open(path, mode, errors='replace')
else:
f = open(path, mode)
if encoding is not None and 'r' in mode:
f = UTF8Recoder(f, encoding)
return f

if py3compat.PY3: # pragma: no cover
Expand All @@ -1108,7 +1118,7 @@ class UnicodeReader:
"""

def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
f = UTF8Recoder(f, encoding)
# f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)

def next(self):
Expand Down
15 changes: 11 additions & 4 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -841,6 +841,12 @@ def __init__(self, src, **kwds):

ParserBase.__init__(self, kwds)

if 'utf-16' in (kwds.get('encoding') or ''):
if isinstance(src, basestring):
src = open(src, 'rb')
src = com.UTF8Recoder(src, kwds['encoding'])
kwds['encoding'] = 'utf-8'

self._reader = _parser.TextReader(src, **kwds)

# XXX
Expand Down Expand Up @@ -1078,11 +1084,11 @@ def __init__(self, f, **kwds):


if isinstance(f, basestring):
try:
if self.encoding is None:
# universal newline mode
f = com._get_handle(f, 'U', encoding=self.encoding)
except Exception: # pragma: no cover
f = com._get_handle(f, 'r', encoding=self.encoding)
f = com._get_handle(f, 'U')
else:
f = com._get_handle(f, 'rb', encoding=self.encoding)

if hasattr(f, 'readline'):
self._make_reader(f)
Expand Down Expand Up @@ -1150,6 +1156,7 @@ class MyDialect(csv.Dialect):
else:
reader = csv.reader(f, dialect=dia,
strict=True)

else:
reader = (re.split(sep, line.strip()) for line in f)

Expand Down
6 changes: 6 additions & 0 deletions pandas/io/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1633,6 +1633,12 @@ def test_utf16_bom_skiprows(self):
except os.error:
pass

def test_utf16_example(self):
    """Parse the bundled UTF-16 sample file and check its row count."""
    path = os.path.join(self.dirpath, 'utf16_ex.txt')

    # Smoke test: the file must parse with encoding='utf-16' and yield
    # the expected number of rows.
    result = self.read_table(path, encoding='utf-16')
    # assertEqual is the supported spelling; assertEquals is a deprecated
    # alias that newer unittest releases no longer provide.
    self.assertEqual(len(result), 50)

class TestCParserHighMemory(ParserTests, unittest.TestCase):

Expand Down
Binary file added pandas/io/tests/utf16_ex.txt
Binary file not shown.
3 changes: 3 additions & 0 deletions pandas/src/parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ cimport util
import pandas.lib as lib

import time
import os

cnp.import_array()

Expand Down Expand Up @@ -485,6 +486,8 @@ cdef class TextReader:
self.parser.cb_cleanup = &del_file_source

if ptr == NULL:
if not os.path.exists(source):
raise Exception('File %s does not exist' % source)
raise Exception('Initializing from file failed')

self.parser.source = ptr
Expand Down

0 comments on commit f985aa1

Please sign in to comment.