From 8eeb7d320452ac9966e23ae2dd1fefdd8bd8ed37 Mon Sep 17 00:00:00 2001 From: katyak Date: Tue, 15 Jun 2021 13:18:30 +0300 Subject: [PATCH 1/2] ML-655: none value should remain None in CsvSource --- storey/sources.py | 2 ++ tests/test_flow.py | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/storey/sources.py b/storey/sources.py index 1a776e58..eeb499a8 100644 --- a/storey/sources.py +++ b/storey/sources.py @@ -544,6 +544,8 @@ def _infer_type(self, value): def _parse_field(self, field, index): typ = self._types[index] if typ == 's': + if field == '': + return None return field if typ == 'f': return float(field) if field != '' else math.nan diff --git a/tests/test_flow.py b/tests/test_flow.py index cae0cf15..f97c52c6 100644 --- a/tests/test_flow.py +++ b/tests/test_flow.py @@ -2589,3 +2589,27 @@ def test_csv_none_value_first_row(tmpdir): for c in columns: assert read_back_df.dtypes.to_dict()[c] == data.dtypes.to_dict()[c] + + +def test_csv_none_value_string(tmpdir): + out_file_par = f'{tmpdir}/test_csv_none_value_first_row_{uuid.uuid4().hex}.parquet' + out_file_csv = f'{tmpdir}/test_csv_none_value_first_row_{uuid.uuid4().hex}.csv' + + columns = ['first_name', "str"] + data = pd.DataFrame([['katya', "strrrr"], ['dina', None]], + columns=columns) + data.to_csv(out_file_csv) + + controller = build_flow([ + CSVSource(out_file_csv, header=True, key_field='first_name', build_dict=True), + ParquetTarget(out_file_par) + ]).run() + + controller.await_termination() + read_back_df = pd.read_parquet(out_file_par) + + u = pd.read_csv(out_file_csv) + u.to_parquet(out_file_par) + r2 = pd.read_parquet(out_file_par) + + assert r2["str"].compare(read_back_df["str"]).empty From 4a6edb4accec5cd180258c555fa08656f1c7123d Mon Sep 17 00:00:00 2001 From: katyak Date: Tue, 15 Jun 2021 14:53:58 +0300 Subject: [PATCH 2/2] pr comments --- tests/test_flow.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_flow.py b/tests/test_flow.py index f97c52c6..e5ec851d 100644 --- a/tests/test_flow.py +++ b/tests/test_flow.py @@ -2595,8 +2595,8 @@ def test_csv_none_value_string(tmpdir): out_file_par = f'{tmpdir}/test_csv_none_value_first_row_{uuid.uuid4().hex}.parquet' out_file_csv = f'{tmpdir}/test_csv_none_value_first_row_{uuid.uuid4().hex}.csv' - columns = ['first_name', "str"] - data = pd.DataFrame([['katya', "strrrr"], ['dina', None]], + columns = ['first_name', 'str'] + data = pd.DataFrame([['katya', 'strrrr'], ['dina', None]], columns=columns) data.to_csv(out_file_csv) @@ -2612,4 +2612,4 @@ def test_csv_none_value_string(tmpdir): u.to_parquet(out_file_par) r2 = pd.read_parquet(out_file_par) - assert r2["str"].compare(read_back_df["str"]).empty + assert r2['str'].compare(read_back_df['str']).empty