From 8eeb7d320452ac9966e23ae2dd1fefdd8bd8ed37 Mon Sep 17 00:00:00 2001
From: katyak <katyak@iguazio.com>
Date: Tue, 15 Jun 2021 13:18:30 +0300
Subject: [PATCH 1/2] ML-655: None value should remain None in CSVSource

---
 storey/sources.py  |  2 ++
 tests/test_flow.py | 24 ++++++++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/storey/sources.py b/storey/sources.py
index 1a776e58..eeb499a8 100644
--- a/storey/sources.py
+++ b/storey/sources.py
@@ -544,6 +544,8 @@ def _infer_type(self, value):
     def _parse_field(self, field, index):
         typ = self._types[index]
         if typ == 's':
+            if field == '':
+                return None
             return field
         if typ == 'f':
             return float(field) if field != '' else math.nan
diff --git a/tests/test_flow.py b/tests/test_flow.py
index cae0cf15..f97c52c6 100644
--- a/tests/test_flow.py
+++ b/tests/test_flow.py
@@ -2589,3 +2589,27 @@ def test_csv_none_value_first_row(tmpdir):
 
     for c in columns:
         assert read_back_df.dtypes.to_dict()[c] == data.dtypes.to_dict()[c]
+
+
+def test_csv_none_value_string(tmpdir):
+    out_file_par = f'{tmpdir}/test_csv_none_value_first_row_{uuid.uuid4().hex}.parquet'
+    out_file_csv = f'{tmpdir}/test_csv_none_value_first_row_{uuid.uuid4().hex}.csv'
+
+    columns = ['first_name', "str"]
+    data = pd.DataFrame([['katya', "strrrr"], ['dina', None]],
+                        columns=columns)
+    data.to_csv(out_file_csv)
+
+    controller = build_flow([
+        CSVSource(out_file_csv, header=True, key_field='first_name', build_dict=True),
+        ParquetTarget(out_file_par)
+    ]).run()
+
+    controller.await_termination()
+    read_back_df = pd.read_parquet(out_file_par)
+
+    u = pd.read_csv(out_file_csv)
+    u.to_parquet(out_file_par)
+    r2 = pd.read_parquet(out_file_par)
+
+    assert r2["str"].compare(read_back_df["str"]).empty

From 4a6edb4accec5cd180258c555fa08656f1c7123d Mon Sep 17 00:00:00 2001
From: katyak <katyak@iguazio.com>
Date: Tue, 15 Jun 2021 14:53:58 +0300
Subject: [PATCH 2/2] Address PR comments: use single quotes in test_csv_none_value_string

---
 tests/test_flow.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_flow.py b/tests/test_flow.py
index f97c52c6..e5ec851d 100644
--- a/tests/test_flow.py
+++ b/tests/test_flow.py
@@ -2595,8 +2595,8 @@ def test_csv_none_value_string(tmpdir):
     out_file_par = f'{tmpdir}/test_csv_none_value_first_row_{uuid.uuid4().hex}.parquet'
     out_file_csv = f'{tmpdir}/test_csv_none_value_first_row_{uuid.uuid4().hex}.csv'
 
-    columns = ['first_name', "str"]
-    data = pd.DataFrame([['katya', "strrrr"], ['dina', None]],
+    columns = ['first_name', 'str']
+    data = pd.DataFrame([['katya', 'strrrr'], ['dina', None]],
                         columns=columns)
     data.to_csv(out_file_csv)
 
@@ -2612,4 +2612,4 @@ def test_csv_none_value_string(tmpdir):
     u.to_parquet(out_file_par)
     r2 = pd.read_parquet(out_file_par)
 
-    assert r2["str"].compare(read_back_df["str"]).empty
+    assert r2['str'].compare(read_back_df['str']).empty