Skip to content

Commit

Permalink
REG/REF: close file handles engine-independently in read_csv (#45389)
Browse files Browse the repository at this point in the history
  • Loading branch information
twoertwein authored Jan 16, 2022
1 parent 3743dbc commit 6cc5584
Show file tree
Hide file tree
Showing 9 changed files with 118 additions and 107 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -933,6 +933,7 @@ I/O
- Bug in :func:`read_csv` when passing a ``tempfile.SpooledTemporaryFile`` opened in binary mode (:issue:`44748`)
- Bug in :func:`read_json` raising ``ValueError`` when attempting to parse json strings containing "://" (:issue:`36271`)
- Bug in :func:`read_csv` when ``engine="c"`` and ``encoding_errors=None`` which caused a segfault (:issue:`45180`)
- Bug in :func:`read_csv` where an invalid value of ``usecols`` led to an unclosed file handle (:issue:`45384`)

Period
^^^^^^
Expand Down
3 changes: 3 additions & 0 deletions pandas/_typing.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,3 +292,6 @@ def closed(self) -> bool:

# Windowing rank methods
WindowingRankType = Literal["average", "min", "max"]

# read_csv engines
CSVEngine = Literal["c", "python", "pyarrow", "python-fwf"]
30 changes: 11 additions & 19 deletions pandas/io/parsers/arrow_parser_wrapper.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,12 @@
from __future__ import annotations

from pandas._typing import (
FilePath,
ReadBuffer,
)
from pandas._typing import ReadBuffer
from pandas.compat._optional import import_optional_dependency

from pandas.core.dtypes.inference import is_integer

from pandas.core.frame import DataFrame

from pandas.io.common import get_handle
from pandas.io.parsers.base_parser import ParserBase


Expand All @@ -19,12 +15,11 @@ class ArrowParserWrapper(ParserBase):
Wrapper for the pyarrow engine for read_csv()
"""

def __init__(self, src: FilePath | ReadBuffer[bytes], **kwds):
def __init__(self, src: ReadBuffer[bytes], **kwds):
super().__init__(kwds)
self.kwds = kwds
self.src = src

ParserBase.__init__(self, kwds)

self._parse_kwds()

def _parse_kwds(self):
Expand Down Expand Up @@ -151,15 +146,12 @@ def read(self) -> DataFrame:
pyarrow_csv = import_optional_dependency("pyarrow.csv")
self._get_pyarrow_options()

with get_handle(
self.src, "rb", encoding=self.encoding, is_text=False
) as handles:
table = pyarrow_csv.read_csv(
handles.handle,
read_options=pyarrow_csv.ReadOptions(**self.read_options),
parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
convert_options=pyarrow_csv.ConvertOptions(**self.convert_options),
)
table = pyarrow_csv.read_csv(
self.src,
read_options=pyarrow_csv.ReadOptions(**self.read_options),
parse_options=pyarrow_csv.ParseOptions(**self.parse_options),
convert_options=pyarrow_csv.ConvertOptions(**self.convert_options),
)

frame = table.to_pandas()
return self._finalize_output(frame)
frame = table.to_pandas()
return self._finalize_output(frame)
30 changes: 1 addition & 29 deletions pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from enum import Enum
import itertools
from typing import (
Any,
Callable,
DefaultDict,
Hashable,
Expand All @@ -32,8 +31,6 @@
from pandas._typing import (
ArrayLike,
DtypeArg,
FilePath,
ReadCsvBuffer,
)
from pandas.errors import (
ParserError,
Expand Down Expand Up @@ -71,10 +68,6 @@
from pandas.core.series import Series
from pandas.core.tools import datetimes as tools

from pandas.io.common import (
IOHandles,
get_handle,
)
from pandas.io.date_converters import generic_parser


Expand Down Expand Up @@ -176,30 +169,10 @@ def __init__(self, kwds):

self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])

self.handles: IOHandles[str] | None = None

# Fallback to error to pass a sketchy test(test_override_set_noconvert_columns)
# Normally, this arg would get pre-processed earlier on
self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR)

def _open_handles(
self,
src: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
kwds: dict[str, Any],
) -> None:
"""
Let the readers open IOHandles after they are done with their potential raises.
"""
self.handles = get_handle(
src,
"r",
encoding=kwds.get("encoding", None),
compression=kwds.get("compression", None),
memory_map=kwds.get("memory_map", False),
storage_options=kwds.get("storage_options", None),
errors=kwds.get("encoding_errors", "strict"),
)

def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterable:
"""
Check if parse_dates are in columns.
Expand Down Expand Up @@ -262,8 +235,7 @@ def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterabl
]

def close(self):
if self.handles is not None:
self.handles.close()
pass

@final
@property
Expand Down
21 changes: 4 additions & 17 deletions pandas/io/parsers/c_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
ArrayLike,
DtypeArg,
DtypeObj,
FilePath,
ReadCsvBuffer,
)
from pandas.errors import DtypeWarning
Expand Down Expand Up @@ -43,12 +42,10 @@ class CParserWrapper(ParserBase):
low_memory: bool
_reader: parsers.TextReader

def __init__(
self, src: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], **kwds
):
def __init__(self, src: ReadCsvBuffer[str], **kwds):
super().__init__(kwds)
self.kwds = kwds
kwds = kwds.copy()
ParserBase.__init__(self, kwds)

self.low_memory = kwds.pop("low_memory", False)

Expand All @@ -61,10 +58,6 @@ def __init__(
# GH20529, validate usecol arg before TextReader
kwds["usecols"] = self.usecols

# open handles
self._open_handles(src, kwds)
assert self.handles is not None

# Have to pass int, would break tests using TextReader directly otherwise :(
kwds["on_bad_lines"] = self.on_bad_lines.value

Expand All @@ -79,11 +72,7 @@ def __init__(
kwds.pop(key, None)

kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
try:
self._reader = parsers.TextReader(self.handles.handle, **kwds)
except Exception:
self.handles.close()
raise
self._reader = parsers.TextReader(src, **kwds)

self.unnamed_cols = self._reader.unnamed_cols

Expand Down Expand Up @@ -196,9 +185,7 @@ def __init__(
self._implicit_index = self._reader.leading_cols > 0

def close(self) -> None:
super().close()

# close additional handles opened by C parser
# close handles opened by C parser
try:
self._reader.close()
except ValueError:
Expand Down
35 changes: 10 additions & 25 deletions pandas/io/parsers/python_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
import pandas._libs.lib as lib
from pandas._typing import (
ArrayLike,
FilePath,
ReadCsvBuffer,
Scalar,
)
Expand All @@ -51,13 +50,11 @@


class PythonParser(ParserBase):
def __init__(
self, f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list, **kwds
):
def __init__(self, f: ReadCsvBuffer[str] | list, **kwds):
"""
Workhorse function for processing nested list into DataFrame
"""
ParserBase.__init__(self, kwds)
super().__init__(kwds)

self.data: Iterator[str] | None = None
self.buf: list = []
Expand Down Expand Up @@ -104,28 +101,18 @@ def __init__(
# read_excel: f is a list
self.data = cast(Iterator[str], f)
else:
self._open_handles(f, kwds)
assert self.handles is not None
assert hasattr(self.handles.handle, "readline")
try:
self._make_reader(self.handles.handle)
except (csv.Error, UnicodeDecodeError):
self.close()
raise
assert hasattr(f, "readline")
self._make_reader(f)

# Get columns in two steps: infer from data, then
# infer column indices from self.usecols if it is specified.
self._col_indices: list[int] | None = None
columns: list[list[Scalar | None]]
try:
(
columns,
self.num_original_columns,
self.unnamed_cols,
) = self._infer_columns()
except (TypeError, ValueError):
self.close()
raise
(
columns,
self.num_original_columns,
self.unnamed_cols,
) = self._infer_columns()

# Now self.columns has the set of columns that we will process.
# The original set is stored in self.original_columns.
Expand Down Expand Up @@ -1259,9 +1246,7 @@ class FixedWidthFieldParser(PythonParser):
See PythonParser for details.
"""

def __init__(
self, f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], **kwds
) -> None:
def __init__(self, f: ReadCsvBuffer[str], **kwds) -> None:
# Support iterators, convert to a list.
self.colspecs = kwds.pop("colspecs")
self.infer_nrows = kwds.pop("infer_nrows")
Expand Down
Loading

0 comments on commit 6cc5584

Please sign in to comment.