Skip to content

Commit

Permalink
FIX-#2239: Compute row index start using pandas (#2240)
Browse files: browse the repository at this point in the history
* FIX-#2239: Compute row index start using pandas

Signed-off-by: Devin Petersohn <devin.petersohn@gmail.com>

* FIX-#2239: Documentation

Signed-off-by: Devin Petersohn <devin.petersohn@gmail.com>

* FIX-#2239: Improve testing for this case

Signed-off-by: Devin Petersohn <devin.petersohn@gmail.com>
  • Loading branch information
devin-petersohn authored Oct 27, 2020
1 parent 8866ca8 commit a7d3093
Show file tree
Hide file tree
Showing 3 changed files with 172 additions and 6 deletions.
5 changes: 0 additions & 5 deletions modin/engines/base/io/text/csv_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,11 +180,6 @@ def _read(cls, filepath_or_buffer, **kwargs):
if index_col is None:
row_lengths = cls.materialize(index_ids)
new_index = pandas.RangeIndex(sum(row_lengths))
# pandas has a really weird edge case here.
if kwargs.get("names", None) is not None and skiprows > 1:
new_index = pandas.RangeIndex(
skiprows - 1, new_index.stop + skiprows - 1
)
else:
index_objs = cls.materialize(index_ids)
row_lengths = [len(o) for o in index_objs]
Expand Down
146 changes: 146 additions & 0 deletions modin/pandas/test/data/issue_2239.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
1585542839.000000, 1585542839.000000, 1585542839.000000
32.000000, 32.000000, 32.000000
-38,-14,51
-38,-13,51
-38,-14,51
-38,-14,50
-38,-13,51
-38,-14,50
-38,-14,51
-38,-13,51
-38,-14,51
-38,-13,51
-38,-14,51
-38,-14,50
-38,-13,51
-38,-14,50
-38,-14,51
-38,-13,51
-38,-14,51
-38,-13,51
-38,-14,51
-38,-14,50
-38,-13,51
-38,-14,50
-38,-14,51
-38,-13,51
-38,-14,51
-38,-13,51
-38,-14,51
-38,-14,50
-38,-13,51
-38,-14,50
-38,-14,51
-38,-13,51
-38,-14,51
-38,-13,51
-38,-14,51
-38,-14,50
-38,-13,51
-38,-14,50
-38,-14,51
-38,-13,51
-38,-14,51
-38,-13,51
-38,-14,51
-38,-14,50
-38,-13,51
-38,-14,50
-38,-14,51
-38,-13,51
-38,-14,51
-38,-13,51
-38,-14,51
-38,-14,50
-38,-13,51
-38,-14,50
-38,-14,51
-38,-13,51
-38,-14,51
-38,-13,51
-38,-14,51
-38,-14,50
-38,-13,51
-38,-14,50
-38,-14,51
-38,-13,51
-38,-14,51
-38,-13,51
-38,-14,51
-38,-14,50
-38,-13,51
-38,-14,50
-38,-14,51
-38,-13,51
-38,-14,51
-38,-13,51
-38,-14,51
-38,-14,50
-38,-13,51
-38,-14,50
-38,-14,51
-38,-13,51
-38,-14,51
-38,-13,51
-38,-14,51
-38,-14,50
-38,-13,51
-38,-14,50
-38,-14,51
-38,-13,51
-38,-14,51
-38,-13,51
-38,-14,51
-38,-14,50
-38,-13,51
-38,-14,50
-38,-14,51
-38,-13,51
-38,-14,51
-38,-13,51
-38,-14,51
-38,-14,50
-38,-13,51
-38,-14,50
-38,-14,51
-38,-13,51
-38,-14,51
-38,-13,51
-38,-14,51
-38,-14,50
-38,-13,51
-38,-14,50
-38,-14,51
-38,-13,51
-38,-14,51
-38,-13,51
-38,-14,51
-38,-14,50
-38,-13,51
-38,-14,50
-38,-14,51
-38,-13,51
-38,-14,51
-38,-13,51
-38,-14,51
-38,-14,50
-38,-13,51
-38,-14,50
-38,-14,51
-38,-13,51
-38,-14,51
-38,-13,51
-38,-14,51
-38,-14,50
-38,-13,51
-38,-14,50
-38,-14,51
-38,-13,51
-38,-14,51
-38,-13,51
-38,-14,51
-38,-14,50
-38,-13,51
-38,-14,50
-38,-14,51
-38,-13,51
27 changes: 26 additions & 1 deletion modin/pandas/test/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -1113,7 +1113,7 @@ def test_from_csv_chunksize(make_csv_file):
df_equals(modin_df, pd_df)


@pytest.mark.parametrize("nrows", [123, None])
@pytest.mark.parametrize("nrows", [1, 2, 123, None])
def test_from_csv_skiprows(make_csv_file, nrows):
make_csv_file()

Expand All @@ -1129,6 +1129,22 @@ def test_from_csv_skiprows(make_csv_file, nrows):
)
df_equals(modin_df, pandas_df)

pandas_df = pandas.read_csv(
TEST_CSV_FILENAME,
header=None,
names=["c1", "c2", "c3", "c4"],
skiprows=2,
nrows=nrows,
)
modin_df = pd.read_csv(
TEST_CSV_FILENAME,
header=None,
names=["c1", "c2", "c3", "c4"],
skiprows=2,
nrows=nrows,
)
df_equals(modin_df, pandas_df)

pandas_df = pandas.read_csv(
TEST_CSV_FILENAME,
names=["c1", "c2", "c3", "c4"],
Expand All @@ -1144,6 +1160,15 @@ def test_from_csv_skiprows(make_csv_file, nrows):
df_equals(modin_df, pandas_df)


@pytest.mark.parametrize("names", [list("XYZ"), None])
@pytest.mark.parametrize("skiprows", [1, 2, 3, 4, None])
def test_from_csv_skiprows_names(names, skiprows):
    """Compare modin and pandas ``read_csv`` on the issue #2239 fixture.

    Runs once for every parametrized ``names``/``skiprows`` combination and
    requires both libraries to produce equal frames.
    """
    csv_path = "modin/pandas/test/data/issue_2239.csv"
    # Both readers receive exactly the same keyword arguments.
    read_kwargs = {"names": names, "skiprows": skiprows}
    expected = pandas.read_csv(csv_path, **read_kwargs)
    actual = pd.read_csv(csv_path, **read_kwargs)
    df_equals(expected, actual)


@pytest.mark.parametrize(
"encoding", ["latin8", "ISO-8859-1", "latin1", "iso-8859-1", "cp1252", "utf8"]
)
Expand Down

1 comment on commit a7d3093

@modin-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This commit has been mentioned on Modin Discuss. There might be relevant details there:

https://discuss.modin.org/t/modin-errors-out-on-pytz-timezone/119/50

Please sign in to comment.