From d710a16c2896024fe5b7c743555935e973b4cfaf Mon Sep 17 00:00:00 2001 From: Vasily Litvinov <45396231+vnlitvinov@users.noreply.github.com> Date: Tue, 8 Dec 2020 14:37:59 +0300 Subject: [PATCH] Ensure excel reader closes file if it is passed as path (#2514) Signed-off-by: Vasilij Litvinov --- modin/config/envvars.py | 13 +++++++ .../engines/base/io/text/excel_dispatcher.py | 24 ++++++++---- modin/pandas/test/test_io.py | 7 ++++ modin/pandas/test/utils.py | 39 ++++++++++++++++++- 4 files changed, 75 insertions(+), 8 deletions(-) diff --git a/modin/config/envvars.py b/modin/config/envvars.py index e3ea3d1814f..4a7a5d9f664 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -12,6 +12,7 @@ # governing permissions and limitations under the License. import os +import sys from textwrap import dedent import warnings from packaging import version @@ -208,6 +209,18 @@ class TestDatasetSize(EnvironmentVariable, type=str): choices = ("Small", "Normal", "Big") +class TrackFileLeaks(EnvironmentVariable, type=bool): + """ + Whether to track for open file handles leakage during testing + """ + + varname = "MODIN_TEST_TRACK_FILE_LEAKS" + # Turn off tracking on Windows by default because + # psutil's open_files() can be extremely slow on Windows (up to adding a few hours). + # see https://github.com/giampaolo/psutil/pull/597 + default = sys.platform != "win32" + + def _check_vars(): """ Look out for any environment variables that start with "MODIN_" prefix diff --git a/modin/engines/base/io/text/excel_dispatcher.py b/modin/engines/base/io/text/excel_dispatcher.py index c48872ae61e..15dff6f1786 100644 --- a/modin/engines/base/io/text/excel_dispatcher.py +++ b/modin/engines/base/io/text/excel_dispatcher.py @@ -41,7 +41,6 @@ def _read(cls, io, **kwargs): return cls.single_worker_read(io, **kwargs) from zipfile import ZipFile - from openpyxl import load_workbook from openpyxl.worksheet.worksheet import Worksheet from openpyxl.worksheet._reader import WorksheetReader from openpyxl.reader.excel import ExcelReader @@ -59,12 +58,23 @@ def _read(cls, io, **kwargs): "Parallel `read_excel` is a new feature! Please email " "bug_reports@modin.org if you run into any problems." ) - wb = load_workbook(filename=io, read_only=True) - # Get shared strings - ex = ExcelReader(io, read_only=True) - ex.read_manifest() - ex.read_strings() - ws = Worksheet(wb) + + # NOTE: ExcelReader() in read-only mode does not close file handle by itself + # work around that by passing file object if we received some path + io_file = open(io, "rb") if isinstance(io, str) else io + try: + ex = ExcelReader(io_file, read_only=True) + ex.read() + wb = ex.wb + + # Get shared strings + ex.read_manifest() + ex.read_strings() + ws = Worksheet(wb) + finally: + if isinstance(io, str): + # close only if it were us who opened the object + io_file.close() with ZipFile(io) as z: from io import BytesIO diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index 9e6aa8a95b1..c91092c3de0 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -28,6 +28,7 @@ import csv from .utils import ( + check_file_leaks, df_equals, json_short_string, json_short_bytes, @@ -1260,6 +1261,7 @@ def test_from_clipboard(): @pytest.mark.xfail(reason="read_excel is broken for now, see #1733 for details") +@check_file_leaks def test_from_excel(): setup_excel_file(NROWS) @@ -1271,6 +1273,7 @@ def test_from_excel(): teardown_excel_file() +@check_file_leaks def test_from_excel_engine(): setup_excel_file(NROWS) @@ -1283,6 +1286,7 @@ def test_from_excel_engine(): teardown_excel_file() +@check_file_leaks def test_from_excel_index_col(): setup_excel_file(NROWS) @@ -1295,6 +1299,7 @@ def test_from_excel_index_col(): teardown_excel_file() +@check_file_leaks def test_from_excel_all_sheets(): setup_excel_file(NROWS) @@ -1312,6 +1317,7 @@ def test_from_excel_all_sheets(): teardown_excel_file() +@check_file_leaks def test_from_excel_sheetname_title(): path = "modin/pandas/test/data/excel_sheetname_title.xlsx" modin_df = pd.read_excel(path) @@ -1323,6 +1329,7 @@ def test_from_excel_sheetname_title(): "sheet_name", ["Sheet1", "AnotherSpecialName", "SpecialName", "SecondSpecialName", 0, 1, 2, 3], ) +@check_file_leaks def test_from_excel_sheet_name(sheet_name): fname = "modin/pandas/test/data/modin_error_book.xlsx" modin_df = pd.read_excel(fname, sheet_name=sheet_name) diff --git a/modin/pandas/test/utils.py b/modin/pandas/test/utils.py index b7ea0162df2..17ac0ca20d9 100644 --- a/modin/pandas/test/utils.py +++ b/modin/pandas/test/utils.py @@ -23,11 +23,13 @@ ) import modin.pandas as pd from modin.utils import to_pandas -from modin.config import TestDatasetSize +from modin.config import TestDatasetSize, TrackFileLeaks from io import BytesIO import os from string import ascii_letters import csv +import psutil +import functools random_state = np.random.RandomState(seed=42) @@ -1005,3 +1007,38 @@ def insert_lines_to_csv( **csv_reader_writer_params, ) writer.writerows(lines) + + +def _get_open_files(): + """ + psutil open_files() can return a lot of extra information that we can allow to + be different, like file position; for simplicity we care about path and fd only. + """ + return sorted((info.path, info.fd) for info in psutil.Process().open_files()) + + +def check_file_leaks(func): + """ + A decorator that ensures that no *newly* opened file handles are left + after decorated function is finished. + """ + if not TrackFileLeaks.get(): + return func + + @functools.wraps(func) + def check(*a, **kw): + fstart = _get_open_files() + try: + return func(*a, **kw) + finally: + leaks = [] + for item in _get_open_files(): + try: + fstart.remove(item) + except ValueError: + leaks.append(item) + assert ( + not leaks + ), f"Unexpected open handles left for: {', '.join(item[0] for item in leaks)}" + + return check