BUG: Ignore the BOM in BOM UTF-8 CSV files
gfyoung committed Aug 5, 2016
1 parent 2beab41 commit 34bc8e5
Showing 4 changed files with 136 additions and 1 deletion.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.19.0.txt
@@ -797,6 +797,7 @@ Bug Fixes

- Bug in ``groupby().shift()``, which could cause a segfault or corruption in rare circumstances when grouping by columns with missing values (:issue:`13813`)
- Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`)
- Bug in ``pd.read_csv()``, which caused BOM files to be incorrectly parsed by not ignoring the BOM (:issue:`4793`)
- Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`)
- Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`)
- Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`)
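The whatsnew entry above summarizes the user-visible change. As a minimal sketch of the fixed behavior (a hypothetical two-column file, assuming a pandas build that includes this commit):

import pandas as pd
from io import BytesIO

# UTF-8 CSV data whose first three bytes are the BOM (EF BB BF).
data = u'\ufeffa,b\n1,2'.encode('utf-8')

# Before this commit, the BOM could be left attached to the first
# column name (u'\ufeffa'); with the fix, the header parses as
# plain 'a'.
df = pd.read_csv(BytesIO(data), encoding='utf-8')
print(df.columns.tolist())  # ['a', 'b']
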
76 changes: 75 additions & 1 deletion pandas/io/parsers.py
@@ -11,7 +11,8 @@
import numpy as np

from pandas import compat
from pandas.compat import range, lrange, StringIO, lzip, zip, string_types, map
from pandas.compat import (range, lrange, StringIO, lzip,
                           zip, string_types, map, u)
from pandas.types.common import (is_integer, _ensure_object,
                                 is_list_like, is_integer_dtype,
                                 is_float,
@@ -40,6 +41,12 @@
    'N/A', 'NA', '#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan', ''
])

# BOM character (byte order mark)
# This may exist at the beginning of a file (stream) to signal the
# file's encoding; in UTF-8 it carries no byte-order information.
# Unfortunately, this marker screws up parsing, so we need to
# remove it if we see it.
_BOM = u('\ufeff')

_parser_params = """Also supports optionally iterating or breaking of the file
into chunks.
@@ -2161,6 +2168,67 @@ def _buffered_line(self):
        else:
            return self._next_line()

    def _check_for_bom(self, first_row):
        """
        Checks whether the file begins with the BOM character.
        If it does, remove it. In addition, if there is quoting
        immediately after the BOM, remove that quoting as well,
        because it logically belongs at the very start of the
        field name, not in the middle of it.
        """
        # first_row will be a list, so we need to check
        # that the list is not empty before proceeding.
        if not first_row:
            return first_row

        # The first element of this row is the one that could have the
        # BOM that we want to remove. Check that the first element is a
        # string before proceeding.
        if not isinstance(first_row[0], compat.string_types):
            return first_row

        # Check that the string is not empty, as that would
        # obviously not have a BOM at the start of it.
        if not first_row[0]:
            return first_row

        # Since the string is non-empty, check that it does
        # in fact begin with a BOM.
        first_elt = first_row[0][0]

        # This is to avoid the warnings we get in Python 2.x if
        # we find ourselves comparing with non-Unicode strings.
        if compat.PY2 and not isinstance(first_elt, unicode):  # noqa
            try:
                first_elt = u(first_elt)
            except UnicodeDecodeError:
                return first_row

        if first_elt != _BOM:
            return first_row

        first_row = first_row[0]

        if len(first_row) > 1 and first_row[1] == self.quotechar:
            start = 2
            quote = first_row[1]
            end = first_row[2:].index(quote) + 2

            # Extract the data between the quotation marks.
            new_row = first_row[start:end]

            # Extract any remaining data after the second
            # quotation mark.
            if len(first_row) > end + 1:
                new_row += first_row[end + 1:]
            return [new_row]
        elif len(first_row) > 1:
            return [first_row[1:]]
        else:
            # The first row is just the BOM, so we
            # return a row with a single empty field.
            return [""]

    def _empty(self, line):
        return not line or all(not x for x in line)

@@ -2212,6 +2280,12 @@ def _next_line(self):
                        line = ret[0]
                        break

        # This was the first line of the file,
        # which could contain the BOM at the
        # beginning of it.
        if self.pos == 1:
            line = self._check_for_bom(line)

        self.line_pos += 1
        self.buf.append(line)
        return line
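The trickiest branch in _check_for_bom above is the quoted-header case. Here is a minimal standalone sketch of that logic, assuming '"' as the quote character (strip_bom is a hypothetical helper for illustration, not part of the commit):

def strip_bom(cell, quotechar='"'):
    # Remove a leading BOM; if quoting immediately follows it,
    # unwrap the quotes too, since they belong at the start of
    # the field name rather than its middle.
    bom = u'\ufeff'
    if not cell.startswith(bom):
        return cell
    cell = cell[1:]
    if cell[:1] == quotechar:
        end = cell.index(quotechar, 1)
        return cell[1:end] + cell[end + 1:]
    return cell

strip_bom(u'\ufeff"a"')  # -> u'a'
strip_bom(u'\ufeffa')    # -> u'a'
strip_bom(u'\ufeff')     # -> u''
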
51 changes: 51 additions & 0 deletions pandas/io/tests/parser/common.py
@@ -1517,3 +1517,54 @@ def test_null_byte_char(self):
        msg = "NULL byte detected"
        with tm.assertRaisesRegexp(csv.Error, msg):
            self.read_csv(StringIO(data), names=cols)

    def test_utf8_bom(self):
        # see gh-4793
        bom = u('\ufeff')
        utf8 = 'utf-8'

        def _encode_data_with_bom(_data):
            bom_data = (bom + _data).encode(utf8)
            return BytesIO(bom_data)

        # basic test
        data = 'a\n1'
        expected = DataFrame({'a': [1]})

        out = self.read_csv(_encode_data_with_bom(data),
                            encoding=utf8)
        tm.assert_frame_equal(out, expected)

        # test with "regular" quoting
        data = '"a"\n1'
        expected = DataFrame({'a': [1]})

        out = self.read_csv(_encode_data_with_bom(data),
                            encoding=utf8, quotechar='"')
        tm.assert_frame_equal(out, expected)

        # test in a data row instead of header
        data = 'b\n1'
        expected = DataFrame({'a': ['b', '1']})

        out = self.read_csv(_encode_data_with_bom(data),
                            encoding=utf8, names=['a'])
        tm.assert_frame_equal(out, expected)

        # test in empty data row with skipping
        data = '\n1'
        expected = DataFrame({'a': [1]})

        out = self.read_csv(_encode_data_with_bom(data),
                            encoding=utf8, names=['a'],
                            skip_blank_lines=True)
        tm.assert_frame_equal(out, expected)

        # test in empty data row without skipping
        data = '\n1'
        expected = DataFrame({'a': [np.nan, 1.0]})

        out = self.read_csv(_encode_data_with_bom(data),
                            encoding=utf8, names=['a'],
                            skip_blank_lines=False)
        tm.assert_frame_equal(out, expected)
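
As an aside, callers stuck on older pandas versions can sidestep this bug by decoding with Python's built-in utf-8-sig codec, which strips a leading BOM before the parser ever sees it (a general Python behavior, not something this commit adds):

raw = b'\xef\xbb\xbfa\n1'
print(repr(raw.decode('utf-8')))      # u'\ufeffa\n1' -- BOM survives
print(repr(raw.decode('utf-8-sig')))  # u'a\n1' -- BOM stripped by the codec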
9 changes: 9 additions & 0 deletions pandas/src/parser/tokenizer.c
@@ -704,6 +704,11 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
    self->datapos = i; \
    TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, self->datalen));

#define CHECK_FOR_BOM() \
    if (*buf == '\xef' && *(buf + 1) == '\xbb' && *(buf + 2) == '\xbf') { \
        buf += 3; \
        self->datapos += 3; \
    }

int skip_this_line(parser_t *self, int64_t rownum) {
    if (self->skipset != NULL) {
@@ -736,6 +741,10 @@ int tokenize_bytes(parser_t *self, size_t line_limit)

    TRACE(("%s\n", buf));

    if (self->file_lines == 0) {
        CHECK_FOR_BOM();
    }

    for (i = self->datapos; i < self->datalen; ++i)
    {
        // next character in file
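The CHECK_FOR_BOM macro above does a raw three-byte comparison at the start of the C engine's buffer, advancing buf and self->datapos past the marker. The same check, sketched in Python purely for illustration:

BOM_BYTES = b'\xef\xbb\xbf'  # the UTF-8 encoding of U+FEFF

def skip_bom(buf):
    # Drop the three BOM bytes when the buffer starts with them,
    # mirroring the macro's advance of the buffer pointer.
    return buf[len(BOM_BYTES):] if buf.startswith(BOM_BYTES) else buf

assert skip_bom(b'\xef\xbb\xbfa,b\n1,2\n') == b'a,b\n1,2\n'
assert skip_bom(b'a,b\n1,2\n') == b'a,b\n1,2\n'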
