FIX-#976: add encoding parameter to read_csv call (#2593)

* FIX-#976: add failed test

  Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>

* FIX-#976: add encoding parameter to read_csv call

  Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>

* FIX-#976: fix test in experimental mode

  Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
modin-project · Jan 13, 2021 · 477c5f6 · 477c5f6
1 parent ff9bdbf
commit 477c5f6
Show file tree

Hide file tree

Showing 3 changed files with 27 additions and 1 deletion.
diff --git a/modin/engines/base/io/text/csv_dispatcher.py b/modin/engines/base/io/text/csv_dispatcher.py
@@ -59,6 +59,7 @@ def _read(cls, filepath_or_buffer, **kwargs):
         names = kwargs.get("names", None)
         index_col = kwargs.get("index_col", None)
         usecols = kwargs.get("usecols", None)
+        encoding = kwargs.get("encoding", None)
         if names is None:
             # For the sake of the empty df, we assume no `index_col` to get the correct
             # column names before we build the index. Because we pass `names` in, this
@@ -71,7 +72,9 @@ def _read(cls, filepath_or_buffer, **kwargs):
         elif index_col is None and not usecols:
             # When names is set to some list that is smaller than the number of columns
             # in the file, the first columns are built as a hierarchical index.
-            empty_pd_df = pandas.read_csv(filepath_or_buffer, nrows=0)
+            empty_pd_df = pandas.read_csv(
+                filepath_or_buffer, nrows=0, encoding=encoding
+            )
             num_cols = len(empty_pd_df.columns)
             if num_cols > len(names):
                 index_col = list(range(num_cols - len(names)))

diff --git a/modin/pandas/test/data/issue_976.csv b/modin/pandas/test/data/issue_976.csv
@@ -0,0 +1,5 @@
+1;11800000560005;11800000560005;������� ����� ����������;;-;���. ����;�. �������i���;������������� �����;����������� �������;105.6000
+1;10200007400477;10200007400477;�������� ����� ����������;;-;���. ������;����������;³����������� �����;����������� �������;696.6400
+1;11100008540930;11100008540930;���������� ������� ��������;2;9;���. ������;�.�������;����������� �����;����������� �������;124.4800
+1;12300000051493;12300000051493;���������� ����� ����������;;50;���. ����������;��.���������;���'�����-���������� �����;����������� �������;-0.4700
+1;12300000117460;12300000117460;����� ³���� ���������;;60;���. ���������;������;���'�����-���������� �����;����������� �������;221.0400
diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py
@@ -818,6 +818,24 @@ def test_read_csv_iteration(self, iterator):
 
         df_equals(modin_df, pd_df)
 
+    def test_read_csv_encoding_976(self):  # regression test for issue #976
+        file_name = "modin/pandas/test/data/issue_976.csv"
+        names = [str(i) for i in range(11)]  # fixture rows have 11 fields
+
+        kwargs = {
+            "sep": ";",
+            "names": names,
+            "encoding": "windows-1251",
+        }
+        df1 = pd.read_csv(file_name, **kwargs)  # presumably modin.pandas; verify
+        df2 = pandas.read_csv(file_name, **kwargs)
+        # Columns "4" and "5" hold values of differing types across partitions,
+        # so they are dropped before the comparison; see #1931 for details.
+        df1 = df1.drop(["4", "5"], axis=1)
+        df2 = df2.drop(["4", "5"], axis=1)
+
+        df_equals(df1, df2)
+
     # Quoting, Compression, and File Format parameters tests
     @pytest.mark.parametrize("compression", ["infer", "gzip", "bz2", "xz", "zip"])
     @pytest.mark.parametrize(