Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure excel reader closes file if it is passed as path #2514

Merged
merged 4 commits into from
Dec 8, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions modin/config/envvars.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# governing permissions and limitations under the License.

import os
import sys
from textwrap import dedent
import warnings
from packaging import version
Expand Down Expand Up @@ -208,6 +209,18 @@ class TestDatasetSize(EnvironmentVariable, type=str):
choices = ("Small", "Normal", "Big")


class TrackFileLeaks(EnvironmentVariable, type=bool):
"""
Whether to track for open file handles leakage during testing
"""

varname = "MODIN_TEST_TRACK_FILE_LEAKS"
# Turn off tracking on Windows by default because
# psutil's open_files() can be extremely slow on Windows (up to adding a few hours).
# see https://github.com/giampaolo/psutil/pull/597
default = sys.platform != "win32"


def _check_vars():
"""
Look out for any environment variables that start with "MODIN_" prefix
Expand Down
24 changes: 17 additions & 7 deletions modin/engines/base/io/text/excel_dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ def _read(cls, io, **kwargs):
return cls.single_worker_read(io, **kwargs)

from zipfile import ZipFile
from openpyxl import load_workbook
from openpyxl.worksheet.worksheet import Worksheet
from openpyxl.worksheet._reader import WorksheetReader
from openpyxl.reader.excel import ExcelReader
Expand All @@ -59,12 +58,23 @@ def _read(cls, io, **kwargs):
"Parallel `read_excel` is a new feature! Please email "
"bug_reports@modin.org if you run into any problems."
)
wb = load_workbook(filename=io, read_only=True)
# Get shared strings
ex = ExcelReader(io, read_only=True)
ex.read_manifest()
ex.read_strings()
ws = Worksheet(wb)

# NOTE: ExcelReader() in read-only mode does not close file handle by itself
# work around that by passing file object if we received some path
io_file = open(io, "rb") if isinstance(io, str) else io
try:
ex = ExcelReader(io_file, read_only=True)
ex.read()
wb = ex.wb

# Get shared strings
ex.read_manifest()
ex.read_strings()
ws = Worksheet(wb)
finally:
if isinstance(io, str):
# close only if it were us who opened the object
io_file.close()

with ZipFile(io) as z:
from io import BytesIO
Expand Down
7 changes: 7 additions & 0 deletions modin/pandas/test/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import csv

from .utils import (
check_file_leaks,
df_equals,
json_short_string,
json_short_bytes,
Expand Down Expand Up @@ -1184,6 +1185,7 @@ def test_from_clipboard():


@pytest.mark.xfail(reason="read_excel is broken for now, see #1733 for details")
@check_file_leaks
def test_from_excel():
setup_excel_file(NROWS)

Expand All @@ -1195,6 +1197,7 @@ def test_from_excel():
teardown_excel_file()


@check_file_leaks
def test_from_excel_engine():
setup_excel_file(NROWS)

Expand All @@ -1207,6 +1210,7 @@ def test_from_excel_engine():
teardown_excel_file()


@check_file_leaks
def test_from_excel_index_col():
setup_excel_file(NROWS)

Expand All @@ -1219,6 +1223,7 @@ def test_from_excel_index_col():
teardown_excel_file()


@check_file_leaks
def test_from_excel_all_sheets():
setup_excel_file(NROWS)

Expand All @@ -1236,6 +1241,7 @@ def test_from_excel_all_sheets():
teardown_excel_file()


@check_file_leaks
def test_from_excel_sheetname_title():
path = "modin/pandas/test/data/excel_sheetname_title.xlsx"
modin_df = pd.read_excel(path)
Expand All @@ -1247,6 +1253,7 @@ def test_from_excel_sheetname_title():
"sheet_name",
["Sheet1", "AnotherSpecialName", "SpecialName", "SecondSpecialName", 0, 1, 2, 3],
)
@check_file_leaks
def test_from_excel_sheet_name(sheet_name):
fname = "modin/pandas/test/data/modin_error_book.xlsx"
modin_df = pd.read_excel(fname, sheet_name=sheet_name)
Expand Down
39 changes: 38 additions & 1 deletion modin/pandas/test/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,13 @@
)
import modin.pandas as pd
from modin.utils import to_pandas
from modin.config import TestDatasetSize
from modin.config import TestDatasetSize, TrackFileLeaks
from io import BytesIO
import os
from string import ascii_letters
import csv
import psutil
import functools

random_state = np.random.RandomState(seed=42)

Expand Down Expand Up @@ -1001,3 +1003,38 @@ def insert_lines_to_csv(
**csv_reader_writer_params,
)
writer.writerows(lines)


def _get_open_files():
"""
psutil open_files() can return a lot of extra information that we can allow to
be different, like file position; for simplicity we care about path and fd only.
"""
return sorted((info.path, info.fd) for info in psutil.Process().open_files())


def check_file_leaks(func):
"""
A decorator that ensures that no *newly* opened file handles are left
after decorated function is finished.
"""
if not TrackFileLeaks.get():
return func

@functools.wraps(func)
def check(*a, **kw):
fstart = _get_open_files()
try:
return func(*a, **kw)
finally:
leaks = []
for item in _get_open_files():
try:
fstart.remove(item)
except ValueError:
leaks.append(item)
assert (
not leaks
), f"Unexpected open handles left for: {', '.join(item[0] for item in leaks)}"

return check