From d710a16c2896024fe5b7c743555935e973b4cfaf Mon Sep 17 00:00:00 2001
From: Vasily Litvinov <45396231+vnlitvinov@users.noreply.github.com>
Date: Tue, 8 Dec 2020 14:37:59 +0300
Subject: [PATCH] Ensure excel reader closes file if it is passed as path
 (#2514)

Signed-off-by: Vasilij Litvinov <vasilij.n.litvinov@intel.com>
---
 modin/config/envvars.py                       | 13 +++++++
 .../engines/base/io/text/excel_dispatcher.py  | 24 ++++++++----
 modin/pandas/test/test_io.py                  |  7 ++++
 modin/pandas/test/utils.py                    | 39 ++++++++++++++++++-
 4 files changed, 75 insertions(+), 8 deletions(-)

diff --git a/modin/config/envvars.py b/modin/config/envvars.py
index e3ea3d1814f..4a7a5d9f664 100644
--- a/modin/config/envvars.py
+++ b/modin/config/envvars.py
@@ -12,6 +12,7 @@
 # governing permissions and limitations under the License.
 
 import os
+import sys
 from textwrap import dedent
 import warnings
 from packaging import version
@@ -208,6 +209,18 @@ class TestDatasetSize(EnvironmentVariable, type=str):
     choices = ("Small", "Normal", "Big")
 
 
+class TrackFileLeaks(EnvironmentVariable, type=bool):
+    """
+    Whether to track for open file handles leakage during testing
+    """
+
+    varname = "MODIN_TEST_TRACK_FILE_LEAKS"
+    # Turn off tracking on Windows by default because
+    # psutil's open_files() can be extremely slow on Windows (up to adding a few hours).
+    # see https://github.com/giampaolo/psutil/pull/597
+    default = sys.platform != "win32"
+
+
 def _check_vars():
     """
     Look out for any environment variables that start with "MODIN_" prefix
diff --git a/modin/engines/base/io/text/excel_dispatcher.py b/modin/engines/base/io/text/excel_dispatcher.py
index c48872ae61e..15dff6f1786 100644
--- a/modin/engines/base/io/text/excel_dispatcher.py
+++ b/modin/engines/base/io/text/excel_dispatcher.py
@@ -41,7 +41,6 @@ def _read(cls, io, **kwargs):
             return cls.single_worker_read(io, **kwargs)
 
         from zipfile import ZipFile
-        from openpyxl import load_workbook
         from openpyxl.worksheet.worksheet import Worksheet
         from openpyxl.worksheet._reader import WorksheetReader
         from openpyxl.reader.excel import ExcelReader
@@ -59,12 +58,23 @@ def _read(cls, io, **kwargs):
             "Parallel `read_excel` is a new feature! Please email "
             "bug_reports@modin.org if you run into any problems."
         )
-        wb = load_workbook(filename=io, read_only=True)
-        # Get shared strings
-        ex = ExcelReader(io, read_only=True)
-        ex.read_manifest()
-        ex.read_strings()
-        ws = Worksheet(wb)
+
+        # NOTE: ExcelReader() in read-only mode does not close file handle by itself
+        # work around that by passing file object if we received some path
+        io_file = open(io, "rb") if isinstance(io, str) else io
+        try:
+            ex = ExcelReader(io_file, read_only=True)
+            ex.read()
+            wb = ex.wb
+
+            # Get shared strings
+            ex.read_manifest()
+            ex.read_strings()
+            ws = Worksheet(wb)
+        finally:
+            if isinstance(io, str):
+                # close only if it were us who opened the object
+                io_file.close()
 
         with ZipFile(io) as z:
             from io import BytesIO
diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py
index 9e6aa8a95b1..c91092c3de0 100644
--- a/modin/pandas/test/test_io.py
+++ b/modin/pandas/test/test_io.py
@@ -28,6 +28,7 @@
 import csv
 
 from .utils import (
+    check_file_leaks,
     df_equals,
     json_short_string,
     json_short_bytes,
@@ -1260,6 +1261,7 @@ def test_from_clipboard():
 
 
 @pytest.mark.xfail(reason="read_excel is broken for now, see #1733 for details")
+@check_file_leaks
 def test_from_excel():
     setup_excel_file(NROWS)
 
@@ -1271,6 +1273,7 @@ def test_from_excel():
     teardown_excel_file()
 
 
+@check_file_leaks
 def test_from_excel_engine():
     setup_excel_file(NROWS)
 
@@ -1283,6 +1286,7 @@ def test_from_excel_engine():
     teardown_excel_file()
 
 
+@check_file_leaks
 def test_from_excel_index_col():
     setup_excel_file(NROWS)
 
@@ -1295,6 +1299,7 @@ def test_from_excel_index_col():
     teardown_excel_file()
 
 
+@check_file_leaks
 def test_from_excel_all_sheets():
     setup_excel_file(NROWS)
 
@@ -1312,6 +1317,7 @@ def test_from_excel_all_sheets():
     teardown_excel_file()
 
 
+@check_file_leaks
 def test_from_excel_sheetname_title():
     path = "modin/pandas/test/data/excel_sheetname_title.xlsx"
     modin_df = pd.read_excel(path)
@@ -1323,6 +1329,7 @@ def test_from_excel_sheetname_title():
     "sheet_name",
     ["Sheet1", "AnotherSpecialName", "SpecialName", "SecondSpecialName", 0, 1, 2, 3],
 )
+@check_file_leaks
 def test_from_excel_sheet_name(sheet_name):
     fname = "modin/pandas/test/data/modin_error_book.xlsx"
     modin_df = pd.read_excel(fname, sheet_name=sheet_name)
diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py
index b7ea0162df2..17ac0ca20d9 100644
--- a/modin/pandas/test/utils.py
+++ b/modin/pandas/test/utils.py
@@ -23,11 +23,13 @@
 )
 import modin.pandas as pd
 from modin.utils import to_pandas
-from modin.config import TestDatasetSize
+from modin.config import TestDatasetSize, TrackFileLeaks
 from io import BytesIO
 import os
 from string import ascii_letters
 import csv
+import psutil
+import functools
 
 random_state = np.random.RandomState(seed=42)
 
@@ -1005,3 +1007,38 @@ def insert_lines_to_csv(
             **csv_reader_writer_params,
         )
         writer.writerows(lines)
+
+
+def _get_open_files():
+    """
+    psutil open_files() can return a lot of extra information that we can allow to
+    be different, like file position; for simplicity we care about path and fd only.
+    """
+    return sorted((info.path, info.fd) for info in psutil.Process().open_files())
+
+
+def check_file_leaks(func):
+    """
+    A decorator that ensures that no *newly* opened file handles are left
+    after decorated function is finished.
+    """
+    if not TrackFileLeaks.get():
+        return func
+
+    @functools.wraps(func)
+    def check(*a, **kw):
+        fstart = _get_open_files()
+        try:
+            return func(*a, **kw)
+        finally:
+            leaks = []
+            for item in _get_open_files():
+                try:
+                    fstart.remove(item)
+                except ValueError:
+                    leaks.append(item)
+            assert (
+                not leaks
+            ), f"Unexpected open handles left for: {', '.join(item[0] for item in leaks)}"
+
+    return check