diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 9159f7c8056ec..1940f22fd0661 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -235,6 +235,97 @@ If installed, we now require:
 | scipy           | 0.18.1          |          |
 +-----------------+-----------------+----------+
 
+.. _whatsnew_0240.api_breaking.csv_line_terminator:
+
+``os.linesep`` is used for ``line_terminator`` of ``DataFrame.to_csv``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:func:`DataFrame.to_csv` now uses :func:`os.linesep` rather than ``'\n'``
+for the default line terminator (:issue:`20353`).
+This change only affects Windows, where ``'\r\n'`` was written as the line terminator
+even when ``'\n'`` was passed in ``line_terminator``.
+
+Previous Behavior on Windows:
+
+.. code-block:: ipython
+
+    In [1]: data = pd.DataFrame({
+       ...:     "string_with_lf": ["a\nbc"],
+       ...:     "string_with_crlf": ["a\r\nbc"]
+       ...: })
+
+    In [2]: # When passing a file PATH to to_csv,
+       ...: # line_terminator does not work, and the CSV is saved with '\r\n'.
+       ...: # Also, this converts all '\n's in the data to '\r\n'.
+       ...: data.to_csv("test.csv", index=False, line_terminator='\n')
+
+    In [3]: with open("test.csv", mode='rb') as f:
+       ...:     print(f.read())
+    b'string_with_lf,string_with_crlf\r\n"a\r\nbc","a\r\r\nbc"\r\n'
+
+    In [4]: # When passing a file OBJECT with the newline option to to_csv,
+       ...: # line_terminator works.
+       ...: with open("test2.csv", mode='w', newline='\n') as f:
+       ...:     data.to_csv(f, index=False, line_terminator='\n')
+
+    In [5]: with open("test2.csv", mode='rb') as f:
+       ...:     print(f.read())
+    b'string_with_lf,string_with_crlf\n"a\nbc","a\r\nbc"\n'
+
+New Behavior on Windows:
+
+- Passing ``line_terminator`` explicitly sets the line terminator to that character.
+- The value of ``line_terminator`` only affects the line terminator of the CSV,
+  so it does not change the values inside the data.
+
+.. code-block:: ipython
+
+    In [1]: data = pd.DataFrame({
+       ...:     "string_with_lf": ["a\nbc"],
+       ...:     "string_with_crlf": ["a\r\nbc"]
+       ...: })
+
+    In [2]: data.to_csv("test.csv", index=False, line_terminator='\n')
+
+    In [3]: with open("test.csv", mode='rb') as f:
+       ...:     print(f.read())
+    b'string_with_lf,string_with_crlf\n"a\nbc","a\r\nbc"\n'
+
+- On Windows, the value of ``os.linesep`` is ``'\r\n'``,
+  so if ``line_terminator`` is not set, ``'\r\n'`` is used as the line terminator.
+- Again, this does not affect the values inside the data.
+
+.. code-block:: ipython
+
+    In [1]: data = pd.DataFrame({
+       ...:     "string_with_lf": ["a\nbc"],
+       ...:     "string_with_crlf": ["a\r\nbc"]
+       ...: })
+
+    In [2]: data.to_csv("test.csv", index=False)
+
+    In [3]: with open("test.csv", mode='rb') as f:
+       ...:     print(f.read())
+    b'string_with_lf,string_with_crlf\r\n"a\nbc","a\r\nbc"\r\n'
+
+- For file objects, specifying ``newline`` is not sufficient to set the line terminator.
+  You must pass ``line_terminator`` explicitly, even in this case.
+
+.. code-block:: ipython
+
+    In [1]: data = pd.DataFrame({
+       ...:     "string_with_lf": ["a\nbc"],
+       ...:     "string_with_crlf": ["a\r\nbc"]
+       ...: })
+
+    In [2]: with open("test2.csv", mode='w', newline='\n') as f:
+       ...:     data.to_csv(f, index=False)
+
+    In [3]: with open("test2.csv", mode='rb') as f:
+       ...:     print(f.read())
+    b'string_with_lf,string_with_crlf\r\n"a\nbc","a\r\nbc"\r\n'
+
 .. _whatsnew_0240.api_breaking.interval_values:
 
 ``IntervalIndex.values`` is now an ``IntervalArray``
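Editor's note, a short usage sketch (not part of the patch): with the new default following ``os.linesep``, code that needs byte-identical CSV output on every platform should pin the terminator explicitly.

.. code-block:: python

    # Sketch only: pass line_terminator explicitly when the output must be
    # identical across platforms; otherwise the default follows os.linesep.
    import pandas as pd

    data = pd.DataFrame({"string_with_lf": ["a\nbc"]})
    data.to_csv("test.csv", index=False, line_terminator='\n')  # always LF rows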
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index ba050bfc8db77..e12a3f0d225eb 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -9518,7 +9518,7 @@ def last_valid_index(self):
     def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
                columns=None, header=True, index=True, index_label=None,
                mode='w', encoding=None, compression='infer', quoting=None,
-               quotechar='"', line_terminator='\n', chunksize=None,
+               quotechar='"', line_terminator=None, chunksize=None,
                tupleize_cols=None, date_format=None, doublequote=True,
                escapechar=None, decimal='.'):
         r"""
@@ -9583,9 +9583,12 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
             will treat them as non-numeric.
         quotechar : str, default '\"'
             String of length 1. Character used to quote fields.
-        line_terminator : string, default ``'\n'``
+        line_terminator : string, optional
             The newline character or character sequence to use in the output
-            file.
+            file. Defaults to `os.linesep`, which depends on the OS in which
+            this method is called (e.g. '\n' for Linux, '\r\n' for Windows).
+
+            .. versionchanged:: 0.24.0
         chunksize : int or None
             Rows to write at a time.
         tupleize_cols : bool, default False
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 9bf7c5af2cd3a..2056c25ddc5f4 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -417,13 +417,14 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
     elif is_path:
         if compat.PY2:
             # Python 2
+            mode = "wb" if mode == "w" else mode
             f = open(path_or_buf, mode)
         elif encoding:
             # Python 3 and encoding
-            f = open(path_or_buf, mode, encoding=encoding)
+            f = open(path_or_buf, mode, encoding=encoding, newline="")
         elif is_text:
             # Python 3 and no explicit encoding
-            f = open(path_or_buf, mode, errors='replace')
+            f = open(path_or_buf, mode, errors='replace', newline="")
         else:
             # Python 3 and binary mode
             f = open(path_or_buf, mode)
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index 0344689183dbb..115e885a23b96 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -11,6 +11,7 @@
 from zipfile import ZipFile
 
 import numpy as np
+import os
 
 from pandas._libs import writers as libwriters
 
@@ -73,7 +74,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
         self.doublequote = doublequote
         self.escapechar = escapechar
 
-        self.line_terminator = line_terminator
+        self.line_terminator = line_terminator or os.linesep
 
         self.date_format = date_format
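Editor's note: the ``newline=""`` passed to ``open`` above is what keeps the terminator chosen by ``to_csv`` intact; with the default ``newline=None``, Python's text layer re-translates every ``'\n'`` on write, which is where the doubled ``'\r\r\n'`` in the old Windows behavior came from. A minimal standard-library sketch of the mechanism (illustration only, not part of the patch):

.. code-block:: python

    # Sketch: newline='' disables universal-newline translation, so the
    # terminator emitted by csv.writer reaches the file byte-for-byte.
    import csv
    import os
    import tempfile

    with tempfile.NamedTemporaryFile(mode='w', newline='', suffix='.csv',
                                     delete=False) as f:
        csv.writer(f, lineterminator='\r\n').writerow(['a', 'b'])
        path = f.name

    with open(path, 'rb') as f:
        # b'a,b\r\n' on every OS; with newline=None it would end in
        # b'\r\r\n' on Windows because '\n' is translated a second time.
        print(f.read())

    os.remove(path)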
diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
index e1c3c29ef2846..aa91b7510a2b5 100644
--- a/pandas/tests/frame/test_to_csv.py
+++ b/pandas/tests/frame/test_to_csv.py
@@ -2,6 +2,7 @@
 
 from __future__ import print_function
 
+import os
 import csv
 import pytest
 
@@ -841,11 +842,11 @@ def test_to_csv_unicodewriter_quoting(self):
                    encoding='utf-8')
 
         result = buf.getvalue()
-        expected = ('"A","B"\n'
-                    '1,"foo"\n'
-                    '2,"bar"\n'
-                    '3,"baz"\n')
-
+        expected_rows = ['"A","B"',
+                         '1,"foo"',
+                         '2,"bar"',
+                         '3,"baz"']
+        expected = tm.convert_rows_list_to_csv_str(expected_rows)
         assert result == expected
 
     def test_to_csv_quote_none(self):
@@ -855,8 +856,12 @@ def test_to_csv_quote_none(self):
             buf = StringIO()
             df.to_csv(buf, quoting=csv.QUOTE_NONE,
                       encoding=encoding, index=False)
+
             result = buf.getvalue()
-            expected = 'A\nhello\n{"hello"}\n'
+            expected_rows = ['A',
+                             'hello',
+                             '{"hello"}']
+            expected = tm.convert_rows_list_to_csv_str(expected_rows)
             assert result == expected
 
     def test_to_csv_index_no_leading_comma(self):
@@ -865,31 +870,44 @@ def test_to_csv_index_no_leading_comma(self):
 
         buf = StringIO()
         df.to_csv(buf, index_label=False)
-        expected = ('A,B\n'
-                    'one,1,4\n'
-                    'two,2,5\n'
-                    'three,3,6\n')
+
+        expected_rows = ['A,B',
+                         'one,1,4',
+                         'two,2,5',
+                         'three,3,6']
+        expected = tm.convert_rows_list_to_csv_str(expected_rows)
         assert buf.getvalue() == expected
 
     def test_to_csv_line_terminators(self):
+        # see gh-20353
         df = DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
                        index=['one', 'two', 'three'])
 
-        buf = StringIO()
-        df.to_csv(buf, line_terminator='\r\n')
-        expected = (',A,B\r\n'
-                    'one,1,4\r\n'
-                    'two,2,5\r\n'
-                    'three,3,6\r\n')
-        assert buf.getvalue() == expected
+        with ensure_clean() as path:
+            # case 1: CRLF as line terminator
+            df.to_csv(path, line_terminator='\r\n')
+            expected = b',A,B\r\none,1,4\r\ntwo,2,5\r\nthree,3,6\r\n'
 
-        buf = StringIO()
-        df.to_csv(buf)  # The default line terminator remains \n
-        expected = (',A,B\n'
-                    'one,1,4\n'
-                    'two,2,5\n'
-                    'three,3,6\n')
-        assert buf.getvalue() == expected
+            with open(path, mode='rb') as f:
+                assert f.read() == expected
+
+        with ensure_clean() as path:
+            # case 2: LF as line terminator
+            df.to_csv(path, line_terminator='\n')
+            expected = b',A,B\none,1,4\ntwo,2,5\nthree,3,6\n'
+
+            with open(path, mode='rb') as f:
+                assert f.read() == expected
+
+        with ensure_clean() as path:
+            # case 3: the default line terminator (= os.linesep) (gh-21406)
+            df.to_csv(path)
+            os_linesep = os.linesep.encode('utf-8')
+            expected = (b',A,B' + os_linesep + b'one,1,4' + os_linesep +
+                        b'two,2,5' + os_linesep + b'three,3,6' + os_linesep)
+
+            with open(path, mode='rb') as f:
+                assert f.read() == expected
 
     def test_to_csv_from_csv_categorical(self):
@@ -1069,35 +1087,39 @@ def test_to_csv_quoting(self):
             'c_string': ['a', 'b,c'],
         })
 
-        expected = """\
-,c_bool,c_float,c_int,c_string
-0,True,1.0,42.0,a
-1,False,3.2,,"b,c"
-"""
+        expected_rows = [',c_bool,c_float,c_int,c_string',
+                         '0,True,1.0,42.0,a',
+                         '1,False,3.2,,"b,c"']
+        expected = tm.convert_rows_list_to_csv_str(expected_rows)
+
         result = df.to_csv()
         assert result == expected
 
         result = df.to_csv(quoting=None)
         assert result == expected
 
+        expected_rows = [',c_bool,c_float,c_int,c_string',
+                         '0,True,1.0,42.0,a',
+                         '1,False,3.2,,"b,c"']
+        expected = tm.convert_rows_list_to_csv_str(expected_rows)
+
         result = df.to_csv(quoting=csv.QUOTE_MINIMAL)
         assert result == expected
 
-        expected = """\
-"","c_bool","c_float","c_int","c_string"
-"0","True","1.0","42.0","a"
-"1","False","3.2","","b,c"
-"""
+        expected_rows = ['"","c_bool","c_float","c_int","c_string"',
+                         '"0","True","1.0","42.0","a"',
+                         '"1","False","3.2","","b,c"']
+        expected = tm.convert_rows_list_to_csv_str(expected_rows)
+
         result = df.to_csv(quoting=csv.QUOTE_ALL)
         assert result == expected
 
         # see gh-12922, gh-13259: make sure changes to
         # the formatters do not break this behaviour
-        expected = """\
-"","c_bool","c_float","c_int","c_string"
-0,True,1.0,42.0,"a"
-1,False,3.2,"","b,c"
-"""
+        expected_rows = ['"","c_bool","c_float","c_int","c_string"',
+                         '0,True,1.0,42.0,"a"',
+                         '1,False,3.2,"","b,c"']
+        expected = tm.convert_rows_list_to_csv_str(expected_rows)
         result = df.to_csv(quoting=csv.QUOTE_NONNUMERIC)
         assert result == expected
 
@@ -1108,28 +1130,29 @@ def test_to_csv_quoting(self):
                       quoting=csv.QUOTE_NONE,
                       escapechar=None)
 
-        expected = """\
-,c_bool,c_float,c_int,c_string
-0,True,1.0,42.0,a
-1,False,3.2,,b!,c
-"""
+        expected_rows = [',c_bool,c_float,c_int,c_string',
+                         '0,True,1.0,42.0,a',
+                         '1,False,3.2,,b!,c']
+        expected = tm.convert_rows_list_to_csv_str(expected_rows)
 
         result = df.to_csv(quoting=csv.QUOTE_NONE,
                            escapechar='!')
         assert result == expected
 
-        expected = """\
-,c_bool,c_ffloat,c_int,c_string
-0,True,1.0,42.0,a
-1,False,3.2,,bf,c
-"""
+        expected_rows = [',c_bool,c_ffloat,c_int,c_string',
+                         '0,True,1.0,42.0,a',
+                         '1,False,3.2,,bf,c']
+        expected = tm.convert_rows_list_to_csv_str(expected_rows)
 
         result = df.to_csv(quoting=csv.QUOTE_NONE,
                            escapechar='f')
         assert result == expected
 
         # see gh-3503: quoting Windows line terminators
         # presents with encoding?
-        text = 'a,b,c\n1,"test \r\n",3\n'
+        text_rows = ['a,b,c',
+                     '1,"test \r\n",3']
+        text = tm.convert_rows_list_to_csv_str(text_rows)
         df = pd.read_csv(StringIO(text))
+
         buf = StringIO()
         df.to_csv(buf, encoding='utf-8', index=False)
         assert buf.getvalue() == text
@@ -1138,7 +1161,11 @@ def test_to_csv_quoting(self):
         # with multi-indexes
         df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
         df = df.set_index(['a', 'b'])
-        expected = '"a","b","c"\n"1","3","5"\n"2","4","6"\n'
+
+        expected_rows = ['"a","b","c"',
+                         '"1","3","5"',
+                         '"2","4","6"']
+        expected = tm.convert_rows_list_to_csv_str(expected_rows)
         assert df.to_csv(quoting=csv.QUOTE_ALL) == expected
 
     def test_period_index_date_overflow(self):
@@ -1150,13 +1177,21 @@ def test_period_index_date_overflow(self):
         df = pd.DataFrame([4, 5, 6], index=index)
 
         result = df.to_csv()
-        expected = ',0\n1990-01-01,4\n2000-01-01,5\n3005-01-01,6\n'
+
+        expected_rows = [',0',
+                         '1990-01-01,4',
+                         '2000-01-01,5',
+                         '3005-01-01,6']
+        expected = tm.convert_rows_list_to_csv_str(expected_rows)
         assert result == expected
 
         date_format = "%m-%d-%Y"
         result = df.to_csv(date_format=date_format)
 
-        expected = ',0\n01-01-1990,4\n01-01-2000,5\n01-01-3005,6\n'
+        expected_rows = [',0',
+                         '01-01-1990,4',
+                         '01-01-2000,5',
+                         '01-01-3005,6']
+        expected = tm.convert_rows_list_to_csv_str(expected_rows)
         assert result == expected
 
         # Overflow with pd.NaT
@@ -1166,7 +1201,11 @@ def test_period_index_date_overflow(self):
         df = pd.DataFrame([4, 5, 6], index=index)
 
         result = df.to_csv()
-        expected = ',0\n1990-01-01,4\n,5\n3005-01-01,6\n'
+
+        expected_rows = [',0',
+                         '1990-01-01,4',
+                         ',5',
+                         '3005-01-01,6']
+        expected = tm.convert_rows_list_to_csv_str(expected_rows)
         assert result == expected
 
     def test_multi_index_header(self):
@@ -1179,5 +1218,8 @@ def test_multi_index_header(self):
         header = ["a", "b", "c", "d"]
         result = df.to_csv(header=header)
 
-        expected = ",a,b,c,d\n0,1,2,3,4\n1,5,6,7,8\n"
+        expected_rows = [',a,b,c,d',
+                         '0,1,2,3,4',
+                         '1,5,6,7,8']
+        expected = tm.convert_rows_list_to_csv_str(expected_rows)
         assert result == expected
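Editor's note: the rewritten expectations above all follow one pattern; a minimal usage sketch of the new ``tm.convert_rows_list_to_csv_str`` helper (defined at the end of this patch), valid once the patch is applied:

.. code-block:: python

    # Sketch only: expected CSV output is written as a list of rows and the
    # helper joins them with os.linesep, so the same test passes on any OS.
    import pandas as pd
    import pandas.util.testing as tm

    df = pd.DataFrame({'col': [1, 2]})
    expected_rows = [',col',
                     '0,1',
                     '1,2']
    expected = tm.convert_rows_list_to_csv_str(expected_rows)
    assert df.to_csv() == expected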
diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py
index ea0b5f5cc0c66..7042cae526207 100644
--- a/pandas/tests/io/formats/test_to_csv.py
+++ b/pandas/tests/io/formats/test_to_csv.py
@@ -4,10 +4,11 @@
 
 import pytest
 
+import os
 import numpy as np
 
 import pandas as pd
-from pandas import DataFrame
+from pandas import DataFrame, compat
 from pandas.util import testing as tm
 
@@ -132,29 +133,46 @@ def test_to_csv_escapechar(self):
 
     def test_csv_to_string(self):
         df = DataFrame({'col': [1, 2]})
-        expected = ',col\n0,1\n1,2\n'
+
+        expected_rows = [',col',
+                         '0,1',
+                         '1,2']
+        expected = tm.convert_rows_list_to_csv_str(expected_rows)
         assert df.to_csv() == expected
 
     def test_to_csv_decimal(self):
-        # GH 781
+        # see gh-781
        df = DataFrame({'col1': [1], 'col2': ['a'], 'col3': [10.1]})
 
-        expected_default = ',col1,col2,col3\n0,1,a,10.1\n'
+        expected_rows = [',col1,col2,col3',
+                         '0,1,a,10.1']
+        expected_default = tm.convert_rows_list_to_csv_str(expected_rows)
         assert df.to_csv() == expected_default
 
-        expected_european_excel = ';col1;col2;col3\n0;1;a;10,1\n'
+        expected_rows = [';col1;col2;col3',
+                         '0;1;a;10,1']
+        expected_european_excel = tm.convert_rows_list_to_csv_str(
+            expected_rows)
         assert df.to_csv(decimal=',', sep=';') == expected_european_excel
 
-        expected_float_format_default = ',col1,col2,col3\n0,1,a,10.10\n'
+        expected_rows = [',col1,col2,col3',
+                         '0,1,a,10.10']
+        expected_float_format_default = tm.convert_rows_list_to_csv_str(
+            expected_rows)
         assert df.to_csv(float_format='%.2f') == expected_float_format_default
 
-        expected_float_format = ';col1;col2;col3\n0;1;a;10,10\n'
+        expected_rows = [';col1;col2;col3',
+                         '0;1;a;10,10']
+        expected_float_format = tm.convert_rows_list_to_csv_str(expected_rows)
         assert df.to_csv(decimal=',', sep=';',
                          float_format='%.2f') == expected_float_format
 
-        # GH 11553: testing if decimal is taken into account for '0.0'
+        # see gh-11553: testing if decimal is taken into account for '0.0'
         df = pd.DataFrame({'a': [0, 1.1], 'b': [2.2, 3.3], 'c': 1})
-        expected = 'a,b,c\n0^0,2^2,1\n1^1,3^3,1\n'
+
+        expected_rows = ['a,b,c',
+                         '0^0,2^2,1',
+                         '1^1,3^3,1']
+        expected = tm.convert_rows_list_to_csv_str(expected_rows)
         assert df.to_csv(index=False, decimal='^') == expected
 
         # same but for an index
@@ -167,7 +185,11 @@ def test_to_csv_float_format(self):
         # testing if float_format is taken into account for the index
         # GH 11553
         df = pd.DataFrame({'a': [0, 1], 'b': [2.2, 3.3], 'c': 1})
-        expected = 'a,b,c\n0,2.20,1\n1,3.30,1\n'
+
+        expected_rows = ['a,b,c',
+                         '0,2.20,1',
+                         '1,3.30,1']
+        expected = tm.convert_rows_list_to_csv_str(expected_rows)
         assert df.set_index('a').to_csv(float_format='%.2f') == expected
 
         # same for a multi-index
@@ -175,22 +197,35 @@ def test_to_csv_float_format(self):
                          float_format='%.2f') == expected
 
     def test_to_csv_na_rep(self):
-        # testing if NaN values are correctly represented in the index
-        # GH 11553
+        # see gh-11553
+        #
+        # Testing if NaN values are correctly represented in the index.
         df = DataFrame({'a': [0, np.NaN], 'b': [0, 1], 'c': [2, 3]})
-        expected = "a,b,c\n0.0,0,2\n_,1,3\n"
+        expected_rows = ['a,b,c',
+                         '0.0,0,2',
+                         '_,1,3']
+        expected = tm.convert_rows_list_to_csv_str(expected_rows)
+
         assert df.set_index('a').to_csv(na_rep='_') == expected
         assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected
 
         # now with an index containing only NaNs
         df = DataFrame({'a': np.NaN, 'b': [0, 1], 'c': [2, 3]})
-        expected = "a,b,c\n_,0,2\n_,1,3\n"
+        expected_rows = ['a,b,c',
+                         '_,0,2',
+                         '_,1,3']
+        expected = tm.convert_rows_list_to_csv_str(expected_rows)
+
         assert df.set_index('a').to_csv(na_rep='_') == expected
         assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected
 
         # check if na_rep parameter does not break anything when no NaN
         df = DataFrame({'a': 0, 'b': [0, 1], 'c': [2, 3]})
-        expected = "a,b,c\n0,0,2\n0,1,3\n"
+        expected_rows = ['a,b,c',
+                         '0,0,2',
+                         '0,1,3']
+        expected = tm.convert_rows_list_to_csv_str(expected_rows)
+
         assert df.set_index('a').to_csv(na_rep='_') == expected
         assert df.set_index(['a', 'b']).to_csv(na_rep='_') == expected
 
@@ -201,63 +236,93 @@ def test_to_csv_date_format(self):
         df_day = DataFrame({'A': pd.date_range('20130101', periods=5,
                                                freq='d')
                             })
 
-        expected_default_sec = (',A\n0,2013-01-01 00:00:00\n1,'
-                                '2013-01-01 00:00:01\n2,2013-01-01 00:00:02'
-                                '\n3,2013-01-01 00:00:03\n4,'
-                                '2013-01-01 00:00:04\n')
+        expected_rows = [',A',
+                         '0,2013-01-01 00:00:00',
+                         '1,2013-01-01 00:00:01',
+                         '2,2013-01-01 00:00:02',
+                         '3,2013-01-01 00:00:03',
+                         '4,2013-01-01 00:00:04']
+        expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows)
         assert df_sec.to_csv() == expected_default_sec
 
-        expected_ymdhms_day = (',A\n0,2013-01-01 00:00:00\n1,'
-                               '2013-01-02 00:00:00\n2,2013-01-03 00:00:00'
-                               '\n3,2013-01-04 00:00:00\n4,'
-                               '2013-01-05 00:00:00\n')
+        expected_rows = [',A',
+                         '0,2013-01-01 00:00:00',
+                         '1,2013-01-02 00:00:00',
+                         '2,2013-01-03 00:00:00',
+                         '3,2013-01-04 00:00:00',
+                         '4,2013-01-05 00:00:00']
+        expected_ymdhms_day = tm.convert_rows_list_to_csv_str(expected_rows)
         assert (df_day.to_csv(date_format='%Y-%m-%d %H:%M:%S') ==
                 expected_ymdhms_day)
 
-        expected_ymd_sec = (',A\n0,2013-01-01\n1,2013-01-01\n2,'
-                            '2013-01-01\n3,2013-01-01\n4,2013-01-01\n')
+        expected_rows = [',A',
+                         '0,2013-01-01',
+                         '1,2013-01-01',
+                         '2,2013-01-01',
+                         '3,2013-01-01',
+                         '4,2013-01-01']
+        expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)
         assert df_sec.to_csv(date_format='%Y-%m-%d') == expected_ymd_sec
 
-        expected_default_day = (',A\n0,2013-01-01\n1,2013-01-02\n2,'
-                                '2013-01-03\n3,2013-01-04\n4,2013-01-05\n')
+        expected_rows = [',A',
+                         '0,2013-01-01',
+                         '1,2013-01-02',
+                         '2,2013-01-03',
+                         '3,2013-01-04',
+                         '4,2013-01-05']
+        expected_default_day = tm.convert_rows_list_to_csv_str(expected_rows)
         assert df_day.to_csv() == expected_default_day
         assert df_day.to_csv(date_format='%Y-%m-%d') == expected_default_day
 
-        # testing if date_format parameter is taken into account for
-        # multi-indexed dataframes (GH 7791)
+        # see gh-7791
+        #
+        # Testing if date_format parameter is taken into account
+        # for multi-indexed DataFrames.
         df_sec['B'] = 0
         df_sec['C'] = 1
-        expected_ymd_sec = 'A,B,C\n2013-01-01,0,1\n'
+
+        expected_rows = ['A,B,C',
+                         '2013-01-01,0,1']
+        expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)
+
         df_sec_grouped = df_sec.groupby([pd.Grouper(key='A', freq='1h'), 'B'])
         assert (df_sec_grouped.mean().to_csv(date_format='%Y-%m-%d') ==
                 expected_ymd_sec)
 
     def test_to_csv_multi_index(self):
-        # GH 6618
+        # see gh-6618
         df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]))
 
-        exp = ",1\n,2\n0,1\n"
+        exp_rows = [',1',
+                    ',2',
+                    '0,1']
+        exp = tm.convert_rows_list_to_csv_str(exp_rows)
         assert df.to_csv() == exp
 
-        exp = "1\n2\n1\n"
+        exp_rows = ['1', '2', '1']
+        exp = tm.convert_rows_list_to_csv_str(exp_rows)
         assert df.to_csv(index=False) == exp
 
         df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]),
                        index=pd.MultiIndex.from_arrays([[1], [2]]))
 
-        exp = ",,1\n,,2\n1,2,1\n"
+        exp_rows = [',,1', ',,2', '1,2,1']
+        exp = tm.convert_rows_list_to_csv_str(exp_rows)
         assert df.to_csv() == exp
 
-        exp = "1\n2\n1\n"
+        exp_rows = ['1', '2', '1']
+        exp = tm.convert_rows_list_to_csv_str(exp_rows)
         assert df.to_csv(index=False) == exp
 
         df = DataFrame(
             [1], columns=pd.MultiIndex.from_arrays([['foo'], ['bar']]))
 
-        exp = ",foo\n,bar\n0,1\n"
+        exp_rows = [',foo', ',bar', '0,1']
+        exp = tm.convert_rows_list_to_csv_str(exp_rows)
         assert df.to_csv() == exp
 
-        exp = "foo\nbar\n1\n"
+        exp_rows = ['foo', 'bar', '1']
+        exp = tm.convert_rows_list_to_csv_str(exp_rows)
         assert df.to_csv(index=False) == exp
 
     def test_to_csv_string_array_ascii(self):
@@ -289,21 +354,113 @@ def test_to_csv_string_array_utf8(self):
         with open(path, 'r') as f:
             assert f.read() == expected_utf8
 
+    def test_to_csv_string_with_lf(self):
+        # GH 20353
+        data = {
+            'int': [1, 2, 3],
+            'str_lf': ['abc', 'd\nef', 'g\nh\n\ni']
+        }
+        df = pd.DataFrame(data)
+        with tm.ensure_clean('lf_test.csv') as path:
+            # case 1: the default line terminator (= os.linesep) (PR 21406)
+            os_linesep = os.linesep.encode('utf-8')
+            expected_noarg = (
+                b'int,str_lf' + os_linesep +
+                b'1,abc' + os_linesep +
+                b'2,"d\nef"' + os_linesep +
+                b'3,"g\nh\n\ni"' + os_linesep
+            )
+            df.to_csv(path, index=False)
+            with open(path, 'rb') as f:
+                assert f.read() == expected_noarg
+        with tm.ensure_clean('lf_test.csv') as path:
+            # case 2: LF as line terminator
+            expected_lf = (
+                b'int,str_lf\n'
+                b'1,abc\n'
+                b'2,"d\nef"\n'
+                b'3,"g\nh\n\ni"\n'
+            )
+            df.to_csv(path, line_terminator='\n', index=False)
+            with open(path, 'rb') as f:
+                assert f.read() == expected_lf
+        with tm.ensure_clean('lf_test.csv') as path:
+            # case 3: CRLF as line terminator
+            # 'line_terminator' should not change inner element
+            expected_crlf = (
+                b'int,str_lf\r\n'
+                b'1,abc\r\n'
+                b'2,"d\nef"\r\n'
+                b'3,"g\nh\n\ni"\r\n'
+            )
+            df.to_csv(path, line_terminator='\r\n', index=False)
+            with open(path, 'rb') as f:
+                assert f.read() == expected_crlf
+
+    def test_to_csv_string_with_crlf(self):
+        # GH 20353
+        data = {
+            'int': [1, 2, 3],
+            'str_crlf': ['abc', 'd\r\nef', 'g\r\nh\r\n\r\ni']
+        }
+        df = pd.DataFrame(data)
+        with tm.ensure_clean('crlf_test.csv') as path:
+            # case 1: the default line terminator (= os.linesep) (PR 21406)
+            os_linesep = os.linesep.encode('utf-8')
+            expected_noarg = (
+                b'int,str_crlf' + os_linesep +
+                b'1,abc' + os_linesep +
+                b'2,"d\r\nef"' + os_linesep +
+                b'3,"g\r\nh\r\n\r\ni"' + os_linesep
+            )
+            df.to_csv(path, index=False)
+            with open(path, 'rb') as f:
+                assert f.read() == expected_noarg
+        with tm.ensure_clean('crlf_test.csv') as path:
+            # case 2: LF as line terminator
+            expected_lf = (
+                b'int,str_crlf\n'
+                b'1,abc\n'
+                b'2,"d\r\nef"\n'
+                b'3,"g\r\nh\r\n\r\ni"\n'
+            )
+            df.to_csv(path, line_terminator='\n', index=False)
+            with open(path, 'rb') as f:
+                assert f.read() == expected_lf
+        with tm.ensure_clean('crlf_test.csv') as path:
+            # case 3: CRLF as line terminator
+            # 'line_terminator' should not change inner element
+            expected_crlf = (
+                b'int,str_crlf\r\n'
+                b'1,abc\r\n'
+                b'2,"d\r\nef"\r\n'
+                b'3,"g\r\nh\r\n\r\ni"\r\n'
+            )
+            df.to_csv(path, line_terminator='\r\n', index=False)
+            with open(path, 'rb') as f:
+                assert f.read() == expected_crlf
+
     @tm.capture_stdout
     def test_to_csv_stdout_file(self):
         # GH 21561
         df = pd.DataFrame([['foo', 'bar'], ['baz', 'qux']],
                           columns=['name_1', 'name_2'])
-        expected_ascii = '''\
-,name_1,name_2
-0,foo,bar
-1,baz,qux
-'''
+        expected_rows = [',name_1,name_2',
+                         '0,foo,bar',
+                         '1,baz,qux']
+        expected_ascii = tm.convert_rows_list_to_csv_str(expected_rows)
+
         df.to_csv(sys.stdout, encoding='ascii')
         output = sys.stdout.getvalue()
+
         assert output == expected_ascii
         assert not sys.stdout.closed
 
+    @pytest.mark.xfail(
+        compat.is_platform_windows(),
+        reason=("Especially on Windows, a file stream should not be passed "
+                "to the csv writer without the newline='' option "
+                "(https://docs.python.org/3.6/library/csv.html#csv.writer)."))
     def test_to_csv_write_to_open_file(self):
         # GH 21696
         df = pd.DataFrame({'a': ['x', 'y', 'z']})
@@ -320,6 +477,42 @@ def test_to_csv_write_to_open_file(self):
             with open(path, 'r') as f:
                 assert f.read() == expected
 
+    @pytest.mark.skipif(compat.PY2, reason="Test case for python3")
+    def test_to_csv_write_to_open_file_with_newline_py3(self):
+        # see gh-21696
+        # see gh-20353
+        df = pd.DataFrame({'a': ['x', 'y', 'z']})
+        expected_rows = ["x",
+                         "y",
+                         "z"]
+        expected = ("manual header\n" +
+                    tm.convert_rows_list_to_csv_str(expected_rows))
+        with tm.ensure_clean('test.txt') as path:
+            with open(path, 'w', newline='') as f:
+                f.write('manual header\n')
+                df.to_csv(f, header=None, index=None)
+
+            with open(path, 'rb') as f:
+                assert f.read() == bytes(expected, 'utf-8')
+
+    @pytest.mark.skipif(compat.PY3, reason="Test case for python2")
+    def test_to_csv_write_to_open_file_with_newline_py2(self):
+        # see gh-21696
+        # see gh-20353
+        df = pd.DataFrame({'a': ['x', 'y', 'z']})
+        expected_rows = ["x",
+                         "y",
+                         "z"]
+        expected = ("manual header\n" +
+                    tm.convert_rows_list_to_csv_str(expected_rows))
+        with tm.ensure_clean('test.txt') as path:
+            with open(path, 'wb') as f:
+                f.write('manual header\n')
+                df.to_csv(f, header=None, index=None)
+
+            with open(path, 'rb') as f:
+                assert f.read() == expected
+
     @pytest.mark.parametrize("to_infer", [True, False])
     @pytest.mark.parametrize("read_infer", [True, False])
     def test_to_csv_compression(self, compression_only,
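Editor's note: a small usage sketch distilled from the open-file tests above (assumes Python 3; the file name is arbitrary). When handing an already-open file object to ``to_csv``, open it with ``newline=''`` so the csv writer controls the line terminator; otherwise Windows' text-mode translation doubles the ``'\r'``.

.. code-block:: python

    # Sketch only: mirrors test_to_csv_write_to_open_file_with_newline_py3.
    import pandas as pd

    df = pd.DataFrame({'a': ['x', 'y', 'z']})
    with open('test.txt', 'w', newline='') as f:
        f.write('manual header\n')
        df.to_csv(f, header=None, index=None)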
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index e08899a03d2d7..b748e9aa5ef5b 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -270,7 +270,7 @@ def test_invalid_url(self):
                 self.read_html('http://www.a23950sdfa908sd.com',
                                match='.*Water.*')
             except ValueError as e:
-                assert str(e) == 'No tables found'
+                assert 'No tables found' in str(e)
 
     @pytest.mark.slow
     def test_file_url(self):
diff --git a/pandas/tests/util/test_testing.py b/pandas/tests/util/test_testing.py
index da84973274933..d968005a25006 100644
--- a/pandas/tests/util/test_testing.py
+++ b/pandas/tests/util/test_testing.py
@@ -12,6 +12,7 @@
     assert_index_equal, assert_series_equal, assert_frame_equal,
     assert_numpy_array_equal, RNGContext)
+from pandas import compat
 
 
 class TestAssertAlmostEqual(object):
@@ -164,6 +165,17 @@ def test_raise_with_traceback(self):
             _, _, traceback = sys.exc_info()
             raise_with_traceback(e, traceback)
 
+    def test_convert_rows_list_to_csv_str(self):
+        rows_list = ["aaa", "bbb", "ccc"]
+        ret = tm.convert_rows_list_to_csv_str(rows_list)
+
+        if compat.is_platform_windows():
+            expected = "aaa\r\nbbb\r\nccc\r\n"
+        else:
+            expected = "aaa\nbbb\nccc\n"
+
+        assert ret == expected
+
 
 class TestAssertNumpyArrayEqual(object):
diff --git a/pandas/util/testing.py b/pandas/util/testing.py
index 1bd9043f42634..b5ec0912c5c26 100644
--- a/pandas/util/testing.py
+++ b/pandas/util/testing.py
@@ -2983,3 +2983,24 @@ def skipna_wrapper(x):
                 return alternative(nona)
 
         return skipna_wrapper
+
+
+def convert_rows_list_to_csv_str(rows_list):
+    """
+    Convert list of CSV rows to single CSV-formatted string for current OS.
+
+    This method is used for creating the expected value of to_csv().
+
+    Parameters
+    ----------
+    rows_list : list
+        List of strings. Each element represents one row of a CSV.
+
+    Returns
+    -------
+    expected : string
+        Expected output of to_csv() on the current OS.
+    """
+    sep = os.linesep
+    expected = sep.join(rows_list) + sep
+    return expected
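Editor's note, for reference: the helper's per-platform output, mirroring ``test_convert_rows_list_to_csv_str`` above (illustration only).

.. code-block:: python

    # Sketch: the helper simply joins the rows with os.linesep and appends one.
    import os

    rows = ["aaa", "bbb", "ccc"]
    result = os.linesep.join(rows) + os.linesep
    # Linux/macOS -> "aaa\nbbb\nccc\n"
    # Windows     -> "aaa\r\nbbb\r\nccc\r\n"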