From fcb0263762a31724ba6db39bf1564569dda068a0 Mon Sep 17 00:00:00 2001 From: Lucas Kushner Date: Tue, 18 Jul 2017 00:01:26 -0500 Subject: [PATCH] DOC, TST: Clarify whitespace behavior in read_fwf documentation (#16950) Closes gh-16772 --- doc/source/io.rst | 6 ++++- pandas/io/parsers.py | 13 ++++++----- pandas/tests/io/parser/test_read_fwf.py | 29 +++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 7 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 9bf84e5419ffa..495d4e9c3a5a3 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1258,7 +1258,8 @@ Files with Fixed Width Columns While ``read_csv`` reads delimited data, the :func:`read_fwf` function works with data files that have known and fixed column widths. The function parameters -to ``read_fwf`` are largely the same as `read_csv` with two extra parameters: +to ``read_fwf`` are largely the same as ``read_csv`` with two extra parameters, and +a different usage of the ``delimiter`` parameter: - ``colspecs``: A list of pairs (tuples) giving the extents of the fixed-width fields of each line as half-open intervals (i.e., [from, to[ ). @@ -1267,6 +1268,9 @@ to ``read_fwf`` are largely the same as `read_csv` with two extra parameters: behaviour, if not specified, is to infer. - ``widths``: A list of field widths which can be used instead of 'colspecs' if the intervals are contiguous. + - ``delimiter``: Characters to consider as filler characters in the fixed-width file. + Can be used to specify the filler character of the fields + if it is not spaces (e.g., '~'). .. ipython:: python :suppress: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 343bc7a74fde8..1e7d9d420b35d 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -63,8 +63,6 @@ file. For file URLs, a host is expected. For instance, a local file could be file ://localhost/path/to/table.csv %s -delimiter : str, default ``None`` - Alternative argument name for sep. 
delim_whitespace : boolean, default False Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be used as the sep. Equivalent to setting ``sep='\s+'``. If this option @@ -316,7 +314,9 @@ be used automatically. In addition, separators longer than 1 character and different from ``'\s+'`` will be interpreted as regular expressions and will also force the use of the Python parsing engine. Note that regex - delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``""" + delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'`` +delimiter : str, default ``None`` + Alternative argument name for sep.""" _read_csv_doc = """ Read CSV (comma-separated) file into DataFrame @@ -341,15 +341,16 @@ widths : list of ints. optional A list of field widths which can be used instead of 'colspecs' if the intervals are contiguous. +delimiter : str, default ``'\t' + ' '`` + Characters to consider as filler characters in the fixed-width file. + Can be used to specify the filler character of the fields + if it is not spaces (e.g., '~'). """ _read_fwf_doc = """ Read a table of fixed-width formatted lines into DataFrame %s - -Also, 'delimiter' is used to specify the filler character of the -fields if it is not spaces (e.g., '~'). 
""" % (_parser_params % (_fwf_widths, '')) diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 0bfeb5215f370..ec1d1a2a51cdc 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -405,3 +405,32 @@ def test_skiprows_inference_empty(self): with pytest.raises(EmptyDataError): read_fwf(StringIO(test), skiprows=3) + + def test_whitespace_preservation(self): + # Addresses Issue #16772 + data_expected = """ + a ,bbb + cc,dd """ + expected = read_csv(StringIO(data_expected), header=None) + + test_data = """ + a bbb + ccdd """ + result = read_fwf(StringIO(test_data), widths=[3, 3], + header=None, skiprows=[0], delimiter="\n\t") + + tm.assert_frame_equal(result, expected) + + def test_default_delimiter(self): + data_expected = """ +a,bbb +cc,dd""" + expected = read_csv(StringIO(data_expected), header=None) + + test_data = """ +a \tbbb +cc\tdd """ + result = read_fwf(StringIO(test_data), widths=[3, 3], + header=None, skiprows=[0]) + + tm.assert_frame_equal(result, expected)