diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 59a106291dad8..9b976c9a7e4da 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -43,8 +43,8 @@ The following are now part of this API: .. _whatsnew_0190.enhancements.asof_merge: -:func:`merge_asof` for asof-style time-series joining -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``merge_asof`` for asof-style time-series joining +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ A long-time requested feature has been added through the :func:`merge_asof` function, to support asof style joining of time-series. (:issue:`1870`, :issue:`13695`, :issue:`13709`). Full documentation is @@ -192,8 +192,8 @@ default of the index) in a DataFrame. .. _whatsnew_0190.enhancements.read_csv_dupe_col_names_support: -:func:`read_csv` has improved support for duplicate column names -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``read_csv`` has improved support for duplicate column names +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ :ref:`Duplicate column names <io.dupe_names>` are now supported in :func:`read_csv` whether they are in the file or passed in as the ``names`` parameter (:issue:`7160`, :issue:`9424`) @@ -307,48 +307,6 @@ Google BigQuery Enhancements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - The :func:`pandas.io.gbq.read_gbq` method has gained the ``dialect`` argument to allow users to specify whether to use BigQuery's legacy SQL or BigQuery's standard SQL. See the :ref:`docs <io.bigquery_reader>` for more details (:issue:`13615`). -.. _whatsnew_0190.sparse: - -Sparse changes -~~~~~~~~~~~~~~ - -These changes allow pandas to handle sparse data with more dtypes, and for work to make a smoother experience with data handling. - -- Sparse data structure now can preserve ``dtype`` after arithmetic ops (:issue:`13848`) - -.. 
ipython:: python - - s = pd.SparseSeries([0, 2, 0, 1], fill_value=0, dtype=np.int64) - s.dtype - - s + 1 - -- Sparse data structure now support ``astype`` to convert internal ``dtype`` (:issue:`13900`) - -.. ipython:: python - - s = pd.SparseSeries([1., 0., 2., 0.], fill_value=0) - s - s.astype(np.int64) - -``astype`` fails if data contains values which cannot be converted to specified ``dtype``. -Note that the limitation is applied to ``fill_value`` which default is ``np.nan``. - -.. code-block:: ipython - - In [7]: pd.SparseSeries([1., np.nan, 2., np.nan], fill_value=np.nan).astype(np.int64) - Out[7]: - ValueError: unable to coerce current fill_value nan to int64 dtype - -- Subclassed ``SparseDataFrame`` and ``SparseSeries`` now preserve class types when slicing or transposing. (:issue:`13787`) -- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) -- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) -- Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) -- Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`) -- Bug in ``SparseDataFrame`` doesn't respect passed ``SparseArray`` or ``SparseSeries`` 's dtype and ``fill_value`` (:issue:`13866`) -- Bug in ``SparseArray`` and ``SparseSeries`` don't apply ufunc to ``fill_value`` (:issue:`13853`) -- Bug in ``SparseSeries.abs`` incorrectly keeps negative ``fill_value`` (:issue:`13853`) - .. _whatsnew_0190.enhancements.other: Other enhancements @@ -684,8 +642,8 @@ New Behavior: .. 
_whatsnew_0190.api.autogenerated_chunksize_index: -:func:`read_csv` called with ``chunksize`` will progressively enumerate chunks -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``read_csv`` will progressively enumerate chunks +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ When :func:`read_csv` is called with ``chunksize='n'`` and without specifying an index, each chunk used to have an independently generated index from `0`` to ``n-1``. @@ -716,10 +674,52 @@ New behaviour: pd.concat(pd.read_csv(StringIO(data), chunksize=2)) +.. _whatsnew_0190.sparse: + +Sparse Changes +^^^^^^^^^^^^^^ + +These changes allow pandas to handle sparse data with more dtypes, and make for a smoother experience with data handling. + +- Sparse data structures can now preserve ``dtype`` after arithmetic ops (:issue:`13848`) + +.. ipython:: python + + s = pd.SparseSeries([0, 2, 0, 1], fill_value=0, dtype=np.int64) + s.dtype + + s + 1 + +- Sparse data structures now support ``astype`` to convert internal ``dtype`` (:issue:`13900`) + +.. ipython:: python + + s = pd.SparseSeries([1., 0., 2., 0.], fill_value=0) + s + s.astype(np.int64) + +``astype`` fails if data contains values which cannot be converted to the specified ``dtype``. +Note that this limitation also applies to ``fill_value``, whose default is ``np.nan``. + +.. code-block:: ipython + + In [7]: pd.SparseSeries([1., np.nan, 2., np.nan], fill_value=np.nan).astype(np.int64) + Out[7]: + ValueError: unable to coerce current fill_value nan to int64 dtype + +- Subclassed ``SparseDataFrame`` and ``SparseSeries`` now preserve class types when slicing or transposing. 
(:issue:`13787`) +- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) +- Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) +- Bug in ``SparseDataFrame`` in which ``axis=None`` did not default to ``axis=0`` (:issue:`13048`) +- Bug in ``SparseSeries`` and ``SparseDataFrame`` creation with ``object`` dtype may raise ``TypeError`` (:issue:`11633`) +- Bug in ``SparseDataFrame`` doesn't respect passed ``SparseArray`` or ``SparseSeries`` 's dtype and ``fill_value`` (:issue:`13866`) +- Bug in ``SparseArray`` and ``SparseSeries`` don't apply ufunc to ``fill_value`` (:issue:`13853`) +- Bug in ``SparseSeries.abs`` incorrectly keeps negative ``fill_value`` (:issue:`13853`) + .. _whatsnew_0190.deprecations: Deprecations -^^^^^^^^^^^^ +~~~~~~~~~~~~ - ``Categorical.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) - ``Series.reshape`` has been deprecated and will be removed in a subsequent release (:issue:`12882`) @@ -738,7 +738,7 @@ Deprecations .. 
_whatsnew_0190.prior_deprecations: Removal of prior version deprecations/changes -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - The ``SparsePanel`` class has been removed (:issue:`13778`) - The ``pd.sandbox`` module has been removed in favor of the external library ``pandas-qt`` (:issue:`13670`) - The ``pandas.io.data`` and ``pandas.io.wb`` modules are removed in favor of @@ -797,6 +797,7 @@ Bug Fixes - Bug in ``groupby().shift()``, which could cause a segfault or corruption in rare circumstances when grouping by columns with missing values (:issue:`13813`) - Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`) +- Bug in ``pd.read_csv()``, which caused BOM files to be incorrectly parsed by not ignoring the BOM (:issue:`4793`) - Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) - Bug when passing a not-default-indexed ``Series`` as ``xerr`` or ``yerr`` in ``.plot()`` (:issue:`11858`) - Bug in matplotlib ``AutoDataFormatter``; this restores the second scaled formatting and re-adds micro-second scaled formatting (:issue:`13131`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8c615741679b5..7846ccd1a6660 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -11,7 +11,8 @@ import numpy as np from pandas import compat -from pandas.compat import range, lrange, StringIO, lzip, zip, string_types, map +from pandas.compat import (range, lrange, StringIO, lzip, + zip, string_types, map, u) from pandas.types.common import (is_integer, _ensure_object, is_list_like, is_integer_dtype, is_float, @@ -40,6 +41,12 @@ 'N/A', 'NA', '#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan', '' ]) +# BOM character (byte order mark) +# This exists at the beginning of a file to indicate endianness +# of a file (stream). 
Unfortunately, this marker screws up parsing,
+# so we need to remove it if we see it.
+_BOM = u('\ufeff')
+
 _parser_params = """Also supports optionally iterating or breaking of the file
 into chunks.
 
@@ -2161,6 +2168,74 @@ def _buffered_line(self):
         else:
             return self._next_line()
 
+    def _check_for_bom(self, first_row):
+        """
+        Checks whether the file begins with the BOM character.
+        If it does, remove it. In addition, if there is quoting
+        in the field subsequent to the BOM, remove it as well
+        because it technically takes place at the beginning of
+        the name, not the middle of it.
+
+        Only the first field of the row is rewritten; any fields
+        that follow it are returned unchanged, so the row keeps
+        its original width.
+        """
+        # first_row will be a list, so we need to check
+        # that that list is not empty before proceeding.
+        if not first_row:
+            return first_row
+
+        # The first element of this row is the one that could have the
+        # BOM that we want to remove. Check that the first element is a
+        # string before proceeding.
+        if not isinstance(first_row[0], compat.string_types):
+            return first_row
+
+        # Check that the string is not empty, as that would
+        # obviously not have a BOM at the start of it.
+        if not first_row[0]:
+            return first_row
+
+        # Since the string is non-empty, check that it does
+        # in fact begin with a BOM.
+        first_elt = first_row[0][0]
+
+        # This is to avoid warnings we get in Python 2.x if
+        # we find ourselves comparing with non-Unicode
+        if compat.PY2 and not isinstance(first_elt, unicode):  # noqa
+            try:
+                first_elt = u(first_elt)
+            except UnicodeDecodeError:
+                return first_row
+
+        if first_elt != _BOM:
+            return first_row
+
+        first_row_bom = first_row[0]
+
+        if len(first_row_bom) > 1 and first_row_bom[1] == self.quotechar:
+            start = 2
+            quote = first_row_bom[1]
+            end = first_row_bom[2:].index(quote) + 2
+
+            # Extract the data between the quotation marks
+            new_row = first_row_bom[start:end]
+
+            # Extract any remaining data after the second
+            # quotation mark.
+            if len(first_row_bom) > end + 1:
+                new_row += first_row_bom[end + 1:]
+        elif len(first_row_bom) > 1:
+            new_row = first_row_bom[1:]
+        else:
+            # The first field is just the BOM, so
+            # it becomes an empty string.
+            new_row = ""
+
+        # Put back the fields after the first one; returning only
+        # the cleaned field would silently drop those columns.
+        return [new_row] + list(first_row[1:])
+
     def _empty(self, line):
         return not line or all(not x for x in line)
 
@@ -2212,6 +2287,12 @@ def _next_line(self):
                         line = ret[0]
                         break
 
+        # This was the first line of the file,
+        # which could contain the BOM at the
+        # beginning of it.
+        if self.pos == 1:
+            line = self._check_for_bom(line)
+
         self.line_pos += 1
         self.buf.append(line)
         return line
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
index 129e925e38d5b..7558e4bb63226 100644
--- a/pandas/io/tests/parser/common.py
+++ b/pandas/io/tests/parser/common.py
@@ -1517,3 +1517,63 @@ def test_null_byte_char(self):
         msg = "NULL byte detected"
         with tm.assertRaisesRegexp(csv.Error, msg):
             self.read_csv(StringIO(data), names=cols)
+
+    def test_utf8_bom(self):
+        # see gh-4793
+        bom = u('\ufeff')
+        utf8 = 'utf-8'
+
+        def _encode_data_with_bom(_data):
+            bom_data = (bom + _data).encode(utf8)
+            return BytesIO(bom_data)
+
+        # basic test
+        data = 'a\n1'
+        expected = DataFrame({'a': [1]})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8)
+        tm.assert_frame_equal(out, expected)
+
+        # test with "regular" quoting
+        data = '"a"\n1'
+        expected = DataFrame({'a': [1]})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8, quotechar='"')
+        tm.assert_frame_equal(out, expected)
+
+        # test in a data row instead of header
+        data = 'b\n1'
+        expected = DataFrame({'a': ['b', '1']})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8, names=['a'])
+        tm.assert_frame_equal(out, expected)
+
+        # test with several columns: only the first field carries
+        # the BOM, the remaining column names must survive
+        data = 'a,b\n1,2'
+        expected = DataFrame({'a': [1], 'b': [2]})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8)
+        tm.assert_frame_equal(out, expected)
+
+        # test in empty data row with skipping
+        data = '\n1'
+        expected = DataFrame({'a': [1]})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8, names=['a'],
+                            skip_blank_lines=True)
+        tm.assert_frame_equal(out, expected)
+
+        # test in empty data
row without skipping
+        data = '\n1'
+        expected = DataFrame({'a': [np.nan, 1.0]})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8, names=['a'],
+                            skip_blank_lines=False)
+        tm.assert_frame_equal(out, expected)
diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c
index cc89fc51792dd..3c09933b3ec87 100644
--- a/pandas/src/parser/tokenizer.c
+++ b/pandas/src/parser/tokenizer.c
@@ -704,6 +704,16 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
     self->datapos = i; \
     TRACE(("_TOKEN_CLEANUP: datapos: %d, datalen: %d\n", self->datapos, self->datalen));
 
+// Skip a UTF-8 byte order mark (EF BB BF) at the current position.
+// Expects `buf` and `self` in scope at the expansion site. The length
+// guard stops the look-ahead from reading past datalen; it assumes
+// buf == self->data + self->datapos (TODO confirm in tokenize_bytes).
+#define CHECK_FOR_BOM()                                                   \
+    if (self->datapos + 3 <= self->datalen &&                             \
+        *buf == '\xef' && *(buf + 1) == '\xbb' && *(buf + 2) == '\xbf') { \
+        buf += 3;                                                         \
+        self->datapos += 3;                                               \
+    }
 
 int skip_this_line(parser_t *self, int64_t rownum) {
     if (self->skipset != NULL) {
@@ -736,6 +746,10 @@ int tokenize_bytes(parser_t *self, size_t line_limit)
 
     TRACE(("%s\n", buf));
 
+    if (self->file_lines == 0) {
+        CHECK_FOR_BOM();
+    }
+
     for (i = self->datapos; i < self->datalen; ++i) {
         // next character in file