Skip to content

Commit

Permalink
FIX-#976: add encoding parameter to read_csv call (#2593)
Browse files: browse the repository at this point in the history
* FIX-#976: add failing test

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>

* FIX-#976: add encoding parameter to read_csv call

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>

* FIX-#976: fix test in experimental mode

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
  • Loading branch information
anmyachev committed Jan 13, 2021
1 parent ff9bdbf commit 477c5f6
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 1 deletion.
5 changes: 4 additions & 1 deletion modin/engines/base/io/text/csv_dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def _read(cls, filepath_or_buffer, **kwargs):
names = kwargs.get("names", None)
index_col = kwargs.get("index_col", None)
usecols = kwargs.get("usecols", None)
encoding = kwargs.get("encoding", None)
if names is None:
# For the sake of the empty df, we assume no `index_col` to get the correct
# column names before we build the index. Because we pass `names` in, this
Expand All @@ -71,7 +72,9 @@ def _read(cls, filepath_or_buffer, **kwargs):
elif index_col is None and not usecols:
# When names is set to some list that is smaller than the number of columns
# in the file, the first columns are built as a hierarchical index.
empty_pd_df = pandas.read_csv(filepath_or_buffer, nrows=0)
empty_pd_df = pandas.read_csv(
filepath_or_buffer, nrows=0, encoding=encoding
)
num_cols = len(empty_pd_df.columns)
if num_cols > len(names):
index_col = list(range(num_cols - len(names)))
Expand Down
5 changes: 5 additions & 0 deletions modin/pandas/test/data/issue_976.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
1;11800000560005;11800000560005;������� ����� ����������;;-;���. ����;�. �������i���;������������� �����;����������� �������;105.6000
1;10200007400477;10200007400477;�������� ����� ����������;;-;���. ������;����������;³����������� �����;����������� �������;696.6400
1;11100008540930;11100008540930;���������� ������� ��������;2;9;���. ������;�.�������;����������� �����;����������� �������;124.4800
1;12300000051493;12300000051493;���������� ����� ����������;;50;���. ����������;��.���������;���'�����-���������� �����;����������� �������;-0.4700
1;12300000117460;12300000117460;����� ³���� ���������;;60;���. ���������;������;���'�����-���������� �����;����������� �������;221.0400
18 changes: 18 additions & 0 deletions modin/pandas/test/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -818,6 +818,24 @@ def test_read_csv_iteration(self, iterator):

df_equals(modin_df, pd_df)

def test_read_csv_encoding_976(self):
    """Regression test for issue #976: ``read_csv`` with an explicit encoding.

    Reads the semicolon-separated fixture with eleven explicit column
    names and ``windows-1251`` encoding through both ``pd.read_csv`` and
    ``pandas.read_csv``, then checks the two results with ``df_equals``.
    NOTE(review): ``pd`` is presumably ``modin.pandas`` -- confirm against
    the module imports.
    """
    csv_path = "modin/pandas/test/data/issue_976.csv"
    read_options = dict(
        sep=";",
        names=[str(i) for i in range(11)],
        encoding="windows-1251",
    )

    pd_frame = pd.read_csv(csv_path, **read_options)
    pandas_frame = pandas.read_csv(csv_path, **read_options)

    # Columns "4" and "5" contain data of various types in partitions
    # (see #1931 for details), so they are dropped before comparing.
    skipped_columns = ["4", "5"]
    pd_frame = pd_frame.drop(skipped_columns, axis=1)
    pandas_frame = pandas_frame.drop(skipped_columns, axis=1)

    df_equals(pd_frame, pandas_frame)


# Quoting, Compression, and File Format parameters tests
@pytest.mark.parametrize("compression", ["infer", "gzip", "bz2", "xz", "zip"])
@pytest.mark.parametrize(
Expand Down

0 comments on commit 477c5f6

Please sign in to comment.