Skip to content

Commit

Permalink
Ensure excel reader closes file if it is passed as path (#2514)
Browse files Browse the repository at this point in the history
Signed-off-by: Vasilij Litvinov <vasilij.n.litvinov@intel.com>
  • Loading branch information
vnlitvinov committed Dec 8, 2020
1 parent 031f444 commit d710a16
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 8 deletions.
13 changes: 13 additions & 0 deletions modin/config/envvars.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# governing permissions and limitations under the License.

import os
import sys
from textwrap import dedent
import warnings
from packaging import version
Expand Down Expand Up @@ -208,6 +209,18 @@ class TestDatasetSize(EnvironmentVariable, type=str):
choices = ("Small", "Normal", "Big")


class TrackFileLeaks(EnvironmentVariable, type=bool):
"""
Whether to track for open file handles leakage during testing
"""

varname = "MODIN_TEST_TRACK_FILE_LEAKS"
# Turn off tracking on Windows by default because
# psutil's open_files() can be extremely slow on Windows (up to adding a few hours).
# see https://github.com/giampaolo/psutil/pull/597
default = sys.platform != "win32"


def _check_vars():
"""
Look out for any environment variables that start with "MODIN_" prefix
Expand Down
24 changes: 17 additions & 7 deletions modin/engines/base/io/text/excel_dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ def _read(cls, io, **kwargs):
return cls.single_worker_read(io, **kwargs)

from zipfile import ZipFile
from openpyxl import load_workbook
from openpyxl.worksheet.worksheet import Worksheet
from openpyxl.worksheet._reader import WorksheetReader
from openpyxl.reader.excel import ExcelReader
Expand All @@ -59,12 +58,23 @@ def _read(cls, io, **kwargs):
"Parallel `read_excel` is a new feature! Please email "
"bug_reports@modin.org if you run into any problems."
)
wb = load_workbook(filename=io, read_only=True)
# Get shared strings
ex = ExcelReader(io, read_only=True)
ex.read_manifest()
ex.read_strings()
ws = Worksheet(wb)

# NOTE: ExcelReader() in read-only mode does not close file handle by itself
# work around that by passing file object if we received some path
io_file = open(io, "rb") if isinstance(io, str) else io
try:
ex = ExcelReader(io_file, read_only=True)
ex.read()
wb = ex.wb

# Get shared strings
ex.read_manifest()
ex.read_strings()
ws = Worksheet(wb)
finally:
if isinstance(io, str):
# close only if it were us who opened the object
io_file.close()

with ZipFile(io) as z:
from io import BytesIO
Expand Down
7 changes: 7 additions & 0 deletions modin/pandas/test/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import csv

from .utils import (
check_file_leaks,
df_equals,
json_short_string,
json_short_bytes,
Expand Down Expand Up @@ -1260,6 +1261,7 @@ def test_from_clipboard():


@pytest.mark.xfail(reason="read_excel is broken for now, see #1733 for details")
@check_file_leaks
def test_from_excel():
setup_excel_file(NROWS)

Expand All @@ -1271,6 +1273,7 @@ def test_from_excel():
teardown_excel_file()


@check_file_leaks
def test_from_excel_engine():
setup_excel_file(NROWS)

Expand All @@ -1283,6 +1286,7 @@ def test_from_excel_engine():
teardown_excel_file()


@check_file_leaks
def test_from_excel_index_col():
setup_excel_file(NROWS)

Expand All @@ -1295,6 +1299,7 @@ def test_from_excel_index_col():
teardown_excel_file()


@check_file_leaks
def test_from_excel_all_sheets():
setup_excel_file(NROWS)

Expand All @@ -1312,6 +1317,7 @@ def test_from_excel_all_sheets():
teardown_excel_file()


@check_file_leaks
def test_from_excel_sheetname_title():
path = "modin/pandas/test/data/excel_sheetname_title.xlsx"
modin_df = pd.read_excel(path)
Expand All @@ -1323,6 +1329,7 @@ def test_from_excel_sheetname_title():
"sheet_name",
["Sheet1", "AnotherSpecialName", "SpecialName", "SecondSpecialName", 0, 1, 2, 3],
)
@check_file_leaks
def test_from_excel_sheet_name(sheet_name):
fname = "modin/pandas/test/data/modin_error_book.xlsx"
modin_df = pd.read_excel(fname, sheet_name=sheet_name)
Expand Down
39 changes: 38 additions & 1 deletion modin/pandas/test/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,13 @@
)
import modin.pandas as pd
from modin.utils import to_pandas
from modin.config import TestDatasetSize
from modin.config import TestDatasetSize, TrackFileLeaks
from io import BytesIO
import os
from string import ascii_letters
import csv
import psutil
import functools

random_state = np.random.RandomState(seed=42)

Expand Down Expand Up @@ -1005,3 +1007,38 @@ def insert_lines_to_csv(
**csv_reader_writer_params,
)
writer.writerows(lines)


def _get_open_files():
"""
psutil open_files() can return a lot of extra information that we can allow to
be different, like file position; for simplicity we care about path and fd only.
"""
return sorted((info.path, info.fd) for info in psutil.Process().open_files())


def check_file_leaks(func):
"""
A decorator that ensures that no *newly* opened file handles are left
after decorated function is finished.
"""
if not TrackFileLeaks.get():
return func

@functools.wraps(func)
def check(*a, **kw):
fstart = _get_open_files()
try:
return func(*a, **kw)
finally:
leaks = []
for item in _get_open_files():
try:
fstart.remove(item)
except ValueError:
leaks.append(item)
assert (
not leaks
), f"Unexpected open handles left for: {', '.join(item[0] for item in leaks)}"

return check

0 comments on commit d710a16

Please sign in to comment.