diff --git a/pandas/core/common.py b/pandas/core/common.py index d63029b447705..b16c4d445a256 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -1066,6 +1066,8 @@ def load(path): f.close() + + class UTF8Recoder: """ Iterator that reads an encoded stream and reencodes the input to UTF-8 @@ -1076,6 +1078,12 @@ def __init__(self, f, encoding): def __iter__(self): return self + def read(self, bytes=-1): + return self.reader.read(bytes).encode('utf-8') + + def readline(self): + return self.reader.readline().encode('utf-8') + def next(self): return self.reader.next().encode("utf-8") @@ -1088,6 +1096,8 @@ def _get_handle(path, mode, encoding=None): f = open(path, mode, errors='replace') else: f = open(path, mode) + if encoding is not None and 'r' in mode: + f = UTF8Recoder(f, encoding) return f if py3compat.PY3: # pragma: no cover @@ -1108,7 +1118,7 @@ class UnicodeReader: """ def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds): - f = UTF8Recoder(f, encoding) + # f = UTF8Recoder(f, encoding) self.reader = csv.reader(f, dialect=dialect, **kwds) def next(self): diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 44c7be853b6a0..4fd2dca2e4843 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -841,6 +841,12 @@ def __init__(self, src, **kwds): ParserBase.__init__(self, kwds) + if 'utf-16' in (kwds.get('encoding') or ''): + if isinstance(src, basestring): + src = open(src, 'rb') + src = com.UTF8Recoder(src, kwds['encoding']) + kwds['encoding'] = 'utf-8' + self._reader = _parser.TextReader(src, **kwds) # XXX @@ -1078,11 +1084,11 @@ def __init__(self, f, **kwds): if isinstance(f, basestring): - try: + if self.encoding is None: # universal newline mode - f = com._get_handle(f, 'U', encoding=self.encoding) - except Exception: # pragma: no cover - f = com._get_handle(f, 'r', encoding=self.encoding) + f = com._get_handle(f, 'U') + else: + f = com._get_handle(f, 'rb', encoding=self.encoding) if hasattr(f, 'readline'): self._make_reader(f) @@ -1150,6 +1156,7 @@ class MyDialect(csv.Dialect): else: reader = csv.reader(f, dialect=dia, strict=True) + else: reader = (re.split(sep, line.strip()) for line in f) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index d3fc6049b02c6..f48b561ff56e8 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -1633,6 +1633,12 @@ def test_utf16_bom_skiprows(self): except os.error: pass + def test_utf16_example(self): + path = os.path.join(self.dirpath, 'utf16_ex.txt') + + # it works! and is the right length + result = self.read_table(path, encoding='utf-16') + self.assertEquals(len(result), 50) class TestCParserHighMemory(ParserTests, unittest.TestCase): diff --git a/pandas/io/tests/utf16_ex.txt b/pandas/io/tests/utf16_ex.txt new file mode 100644 index 0000000000000..f0b452a2bd5ff Binary files /dev/null and b/pandas/io/tests/utf16_ex.txt differ diff --git a/pandas/src/parser.pyx b/pandas/src/parser.pyx index b6518701eae68..420d5f4033e0f 100644 --- a/pandas/src/parser.pyx +++ b/pandas/src/parser.pyx @@ -30,6 +30,7 @@ cimport util import pandas.lib as lib import time +import os cnp.import_array() @@ -485,6 +486,8 @@ cdef class TextReader: self.parser.cb_cleanup = &del_file_source if ptr == NULL: + if not os.path.exists(source): + raise Exception('File %s does not exist' % source) raise Exception('Initializing from file failed') self.parser.source = ptr