Skip to content

Commit

Permalink
TEST-modin-project#2295: add Quoting, Compression, and File Format tests
Browse files Browse the repository at this point in the history
Signed-off-by: Alexander Myskov <alexander.myskov@intel.com>
  • Loading branch information
amyskov committed Dec 3, 2020
1 parent f273121 commit f64d142
Showing 1 changed file with 135 additions and 68 deletions.
203 changes: 135 additions & 68 deletions modin/pandas/test/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,16 @@

# Number of rows in the test file; falls back to the "Small" dataset size
# when the configured TestDatasetSize value is not a key of DATASET_SIZE_DICT.
NROWS = DATASET_SIZE_DICT.get(TestDatasetSize.get(), DATASET_SIZE_DICT["Small"])

# Maps a `compression` argument value to the extension appended to the name
# of a generated CSV file ("infer" is special-cased by users of this mapping
# and adds no extension).
comp_to_ext = {"infer": "", "gzip": "gz", "bz2": "bz2", "xz": "xz", "zip": "zip"}

# Formatting parameters of the custom csv dialect used by the read_csv tests.
# The same dict is also unpacked into the CSV file maker so that the files
# written for the dialect tests use matching formatting.
test_csv_dialect_params = {
    "delimiter": "_",
    "doublequote": False,
    "escapechar": "d",
    "quotechar": "d",
    "quoting": csv.QUOTE_ALL,
}
csv.register_dialect("test_csv_dialect", **test_csv_dialect_params)

# Make sure IO_OPS_DATA_DIR exists. `exist_ok=True` replaces the former
# "check, then mkdir" sequence, which could raise FileExistsError when several
# test processes import this module at the same time.
os.makedirs(IO_OPS_DATA_DIR, exist_ok=True)
Expand Down Expand Up @@ -235,11 +245,12 @@ def _csv_file_maker(
df["col6"] = df["col6"].apply(
lambda x: f"{x:,f}".replace(",", thousands_separator)
)
filename = (
f"{filename}.{comp_to_ext[compression]}"
if compression != "infer"
else filename
)

if compression == "gzip":
filename = "{}.gz".format(filename)
elif compression == "zip" or compression == "xz" or compression == "bz2":
filename = "{fname}.{comp}".format(fname=filename, comp=compression)
df.to_csv(
filename,
sep=delimiter,
Expand Down Expand Up @@ -865,6 +876,126 @@ def test_read_csv_iteration(self, make_csv_file, iterator):
pd_df = pd_reader.read()

df_equals(modin_df, pd_df)
# Quoting, Compression, and File Format parameters tests
@pytest.mark.parametrize("compression", ["infer", "gzip", "bz2", "xz", "zip"])
@pytest.mark.parametrize(
    "encoding",
    [None, "latin8", "ISO-8859-1", "latin1", "iso-8859-1", "cp1252", "utf8"],
)
@pytest.mark.parametrize("engine", [None, "python", "c"])
def test_read_csv_compression(self, make_csv_file, compression, encoding, engine):
    """Run ``read_csv`` through ``eval_io`` on a compressed CSV file.

    A CSV file is generated with the requested ``compression`` and
    ``encoding`` and then read back with the same values plus ``engine``.
    """
    unique_filename = get_unique_filename()
    make_csv_file(
        filename=unique_filename, encoding=encoding, compression=compression
    )

    # Rebuild the name of the file to read: the compression extension is
    # appended for every compression value except "infer".
    if compression == "infer":
        path_to_read = unique_filename
    else:
        path_to_read = f"{unique_filename}.{comp_to_ext[compression]}"

    eval_io(
        fn_name="read_csv",
        # read_csv kwargs
        filepath_or_buffer=path_to_read,
        compression=compression,
        encoding=encoding,
        engine=engine,
    )

@pytest.mark.parametrize("thousands", [None, ",", "_", " "])
@pytest.mark.parametrize("decimal", [".", "_"])
@pytest.mark.parametrize("lineterminator", [None, "x", "\n"])
@pytest.mark.parametrize("escapechar", [None, "d", "x"])
@pytest.mark.parametrize("dialect", ["test_csv_dialect", None])
def test_read_csv_file_format(
    self,
    request,
    make_csv_file,
    thousands,
    decimal,
    lineterminator,
    escapechar,
    dialect,
):
    """Run ``read_csv`` through ``eval_io`` with file-format parameters.

    Covers ``thousands``, ``decimal``, ``lineterminator``, ``escapechar``
    and ``dialect``. Parameter combinations tied to known issues are
    xfailed before any file is written.
    """
    # `pytest.xfail` raises, so each guard below ends the test on its own;
    # the guards are checked in priority order.
    if request.config.getoption("--simulate-cloud").lower() != "off" and dialect:
        pytest.xfail(
            "The reason of tests fail in `cloud` mode is unknown for now - issue #2340"
        )
    if Engine.get() != "Python" and lineterminator == "x":
        pytest.xfail("read_csv with Ray engine outputs empty frame - issue #2493")
    if Engine.get() != "Python" and escapechar:
        pytest.xfail(
            "read_csv with Ray engine fails with some 'escapechar' parameter - issue #2494"
        )

    unique_filename = get_unique_filename()

    # With a dialect, the file is written using the dialect's own formatting
    # parameters; otherwise the individual test parameters are used.
    if dialect:
        maker_kwargs = test_csv_dialect_params
    else:
        maker_kwargs = {
            "thousands_separator": thousands,
            "decimal_separator": decimal,
            "escapechar": escapechar,
            "line_terminator": lineterminator,
        }
    make_csv_file(filename=unique_filename, **maker_kwargs)

    eval_io(
        check_exception_type=None,  # issue #2320
        raising_exceptions=None,
        fn_name="read_csv",
        # read_csv kwargs
        filepath_or_buffer=unique_filename,
        thousands=thousands,
        decimal=decimal,
        lineterminator=lineterminator,
        escapechar=escapechar,
        dialect=dialect,
    )

@pytest.mark.parametrize(
    "quoting",
    [csv.QUOTE_ALL, csv.QUOTE_MINIMAL, csv.QUOTE_NONNUMERIC, csv.QUOTE_NONE],
)
@pytest.mark.parametrize("quotechar", ['"', "_", "d"])
@pytest.mark.parametrize("doublequote", [True, False])
@pytest.mark.parametrize("comment", [None, "#", "x"])
def test_read_csv_quoting(
    self,
    make_csv_file,
    quoting,
    quotechar,
    doublequote,
    comment,
):
    """Run ``read_csv`` through ``eval_io`` with quoting-related parameters.

    The CSV file is written and read back with the same ``quoting``,
    ``quotechar``, ``doublequote`` and escape character; ``comment`` is the
    comment character used on both sides.
    """
    # In these cases an escapechar must be set, otherwise an error occurs:
    # "_csv.Error: need to escape, but no escapechar set"
    escapechar = None
    if not doublequote and quotechar != '"' and quoting != csv.QUOTE_NONE:
        escapechar = "\\"
    unique_filename = get_unique_filename()

    # Quoting options shared by the file writer and by read_csv.
    quoting_kwargs = dict(
        quoting=quoting,
        quotechar=quotechar,
        doublequote=doublequote,
        escapechar=escapechar,
    )

    make_csv_file(
        filename=unique_filename,
        **quoting_kwargs,
        comment_col_char=comment,
    )

    eval_io(
        fn_name="read_csv",
        # read_csv kwargs
        filepath_or_buffer=unique_filename,
        **quoting_kwargs,
        comment=comment,
    )

# Error Handling parameters tests
@pytest.mark.xfail(
Expand Down Expand Up @@ -1288,58 +1419,6 @@ def test_from_csv_categories():
df_equals(modin_df, pandas_df)


def test_from_csv_gzip(make_csv_file):
    """Compare modin and pandas ``read_csv`` results on a gzip-compressed CSV."""
    make_csv_file(compression="gzip")
    archive_path = "{}.gz".format(TEST_CSV_FILENAME)

    # Read once without passing `compression`, then with it set explicitly.
    for read_kwargs in ({}, {"compression": "gzip"}):
        pandas_df = pandas.read_csv(archive_path, **read_kwargs)
        modin_df = pd.read_csv(archive_path, **read_kwargs)
        df_equals(modin_df, pandas_df)


def test_from_csv_bz2(make_csv_file):
    """Compare modin and pandas ``read_csv`` results on a bz2-compressed CSV."""
    make_csv_file(compression="bz2")
    archive_path = "{}.bz2".format(TEST_CSV_FILENAME)

    # Read once without passing `compression`, then with it set explicitly.
    for read_kwargs in ({}, {"compression": "bz2"}):
        pandas_df = pandas.read_csv(archive_path, **read_kwargs)
        modin_df = pd.read_csv(archive_path, **read_kwargs)
        df_equals(modin_df, pandas_df)


def test_from_csv_xz(make_csv_file):
    """Compare modin and pandas ``read_csv`` results on an xz-compressed CSV."""
    make_csv_file(compression="xz")
    archive_path = "{}.xz".format(TEST_CSV_FILENAME)

    # Read once without passing `compression`, then with it set explicitly.
    for read_kwargs in ({}, {"compression": "xz"}):
        pandas_df = pandas.read_csv(archive_path, **read_kwargs)
        modin_df = pd.read_csv(archive_path, **read_kwargs)
        df_equals(modin_df, pandas_df)


def test_from_csv_zip(make_csv_file):
    """Compare modin and pandas ``read_csv`` results on a zip-compressed CSV."""
    make_csv_file(compression="zip")
    archive_path = "{}.zip".format(TEST_CSV_FILENAME)

    # Read once without passing `compression`, then with it set explicitly.
    for read_kwargs in ({}, {"compression": "zip"}):
        pandas_df = pandas.read_csv(archive_path, **read_kwargs)
        modin_df = pd.read_csv(archive_path, **read_kwargs)
        df_equals(modin_df, pandas_df)


def test_parse_dates_read_csv():
pandas_df = pandas.read_csv("modin/pandas/test/data/test_time_parsing.csv")
modin_df = pd.read_csv("modin/pandas/test/data/test_time_parsing.csv")
Expand Down Expand Up @@ -1525,18 +1604,6 @@ def test_from_csv_skiprows_names(names, skiprows):
df_equals(pandas_df, modin_df)


@pytest.mark.parametrize(
    "encoding", ["latin8", "ISO-8859-1", "latin1", "iso-8859-1", "cp1252", "utf8"]
)
def test_from_csv_encoding(make_csv_file, encoding):
    """Compare modin and pandas ``read_csv`` results for a given ``encoding``."""
    make_csv_file(encoding=encoding)

    # The same encoding is used to write the file and to read it back.
    read_kwargs = {"encoding": encoding}
    expected_df = pandas.read_csv(TEST_CSV_FILENAME, **read_kwargs)
    actual_df = pd.read_csv(TEST_CSV_FILENAME, **read_kwargs)

    df_equals(actual_df, expected_df)


def test_from_csv_default_to_pandas_behavior(make_csv_file):
make_csv_file()

Expand Down

0 comments on commit f64d142

Please sign in to comment.