modin-project · devin-petersohn · Feb 23, 2022 · Feb 4, 2022 · Feb 4, 2022 · Feb 4, 2022
@@ -14,6 +14,7 @@ Key Features and Updates
   * FIX-#4177: Support read_feather from pathlike objects (#4177)
   * FIX-#4234: Upgrade pandas to 1.4.1 (#4235)
   * FIX-#4057: Allow reading an empty parquet file (#4075)  
+  * FIX-#3884: Fix read_excel() dropping empty rows (#4161)
 * Performance enhancements
   * FIX-#4138, FIX-#4009: remove redundant sorting in the internal '.mask()' flow (#4140)
 * Benchmarking enhancements
@@ -63,3 +64,4 @@ Contributors
 @dchigarev
 @Garra1980
 @mvashishtha
+@naren-ponder
@@ -564,12 +564,14 @@ def update_row_nums(match):
             has_index_names=is_list_like(header) and len(header) > 1,
             skiprows=skiprows,
             usecols=usecols,
+            skip_blank_lines=False,
             **kwargs,
         )
-        # In excel if you create a row with only a border (no values), this parser will
-        # interpret that as a row of NaN values. pandas discards these values, so we
-        # also must discard these values.
-        pandas_df = parser.read().dropna(how="all")
+        pandas_df = parser.read()
+        if len(pandas_df) > 1 and pandas_df.isnull().all().all():
+            # The whole frame is NaN: treat it as trailing blank rows and drop them all
+            pandas_df = pandas.DataFrame(columns=pandas_df.columns)
+
         # Since we know the number of rows that occur before this partition, we can
         # correctly assign the index in cases of RangeIndex. If it is not a RangeIndex,
         # the index is already correct because it came from the data.

@@ -1600,6 +1600,30 @@ def test_excel_empty_line(self):
         modin_df = pd.read_excel(path)
         assert str(modin_df)
 
+    @check_file_leaks
+    def test_read_excel_empty_rows(self):
+        # Test that empty rows in the middle of an Excel sheet are parsed as NaN rows
+        eval_io(
+            fn_name="read_excel",
+            io="modin/pandas/test/data/test_empty_rows.xlsx",
+        )
+
+    @check_file_leaks
+    def test_read_excel_border_rows(self):
+        # Test that border-only rows in an Excel sheet are parsed as NaN rows
+        eval_io(
+            fn_name="read_excel",
+            io="modin/pandas/test/data/test_border_rows.xlsx",
+        )
+
+    @check_file_leaks
+    def test_read_excel_every_other_nan(self):
+        # Test reading an Excel sheet in which every other row is NaN
+        eval_io(
+            fn_name="read_excel",
+            io="modin/pandas/test/data/every_other_row_nan.xlsx",
+        )
+
     @pytest.mark.parametrize(
         "sheet_name",
         [