Skip to content

Commit

Permalink
pythongh-117151: IO performance improvement, increase io.DEFAULT_BUFF…
Browse files Browse the repository at this point in the history
…ER_SIZE to 128k, adjust open() to use max(st_blksize, io.DEFAULT_BUFFER_SIZE)
  • Loading branch information
rmmancom committed Feb 6, 2025
1 parent 63f0406 commit 1d85458
Show file tree
Hide file tree
Showing 6 changed files with 38 additions and 16 deletions.
15 changes: 8 additions & 7 deletions Lib/_pyio.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@
valid_seek_flags.add(os.SEEK_HOLE)
valid_seek_flags.add(os.SEEK_DATA)

# open() uses st_blksize whenever we can
DEFAULT_BUFFER_SIZE = 8 * 1024 # bytes
# open() uses max(st_blksize, io.DEFAULT_BUFFER_SIZE) when st_blksize is available
DEFAULT_BUFFER_SIZE = 128 * 1024 # bytes

# NOTE: Base classes defined here are registered with the "official" ABCs
# defined in io.py. We don't use real inheritance though, because we don't want
Expand Down Expand Up @@ -123,10 +123,11 @@ def open(file, mode="r", buffering=-1, encoding=None, errors=None,
the size of a fixed-size chunk buffer. When no buffering argument is
given, the default buffering policy works as follows:
* Binary files are buffered in fixed-size chunks; the size of the buffer
is chosen using a heuristic trying to determine the underlying device's
"block size" and falling back on `io.DEFAULT_BUFFER_SIZE`.
On many systems, the buffer will typically be 4096 or 8192 bytes long.
* Binary files are buffered in fixed-size chunks; the size of the buffer
is set to `max(io.DEFAULT_BUFFER_SIZE, st_blksize)` using a heuristic
trying to determine the underlying device's "block size" when available
and falling back on `io.DEFAULT_BUFFER_SIZE`.
On most systems, the buffer will typically be 131072 bytes long.
* "Interactive" text files (files for which isatty() returns True)
use line buffering. Other text files use the policy described above
Expand Down Expand Up @@ -242,7 +243,7 @@ def open(file, mode="r", buffering=-1, encoding=None, errors=None,
buffering = -1
line_buffering = True
if buffering < 0:
buffering = raw._blksize
buffering = max(raw._blksize, DEFAULT_BUFFER_SIZE)
if buffering < 0:
raise ValueError("invalid buffering size")
if buffering == 0:
Expand Down
12 changes: 12 additions & 0 deletions Lib/test/test_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,18 @@ def testSetBufferSize(self):
with self.assertWarnsRegex(RuntimeWarning, 'line buffering'):
self._checkBufferSize(1)

def testDefaultBufferSize(self):
    """Check the buffer size open() picks when no buffering is requested.

    A file larger than the expected buffer is written, then reopened in
    binary mode.  The test expects a single read1() call with no size
    argument to return exactly max(raw._blksize, io.DEFAULT_BUFFER_SIZE)
    bytes.
    """
    # Use context managers so the handles are closed even when a write,
    # read or assertion fails; a leaked handle can keep TESTFN from being
    # removed afterwards on some platforms.
    with self.open(TESTFN, 'wb') as f:
        blksize = f.raw._blksize
        # bytes(n) produces n zero bytes directly, without first building
        # a five-million-element list.
        f.write(bytes(5_000_000))

    with self.open(TESTFN, 'rb') as f:
        data = f.read1()
    expected_size = max(blksize, io.DEFAULT_BUFFER_SIZE)
    self.assertEqual(len(data), expected_size)

def testTruncateOnWindows(self):
# SF bug <https://bugs.python.org/issue801631>
# "file.truncate fault on windows"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Increase ``io.DEFAULT_BUFFER_SIZE`` from 8 KiB to 128 KiB. On platforms where
``fstat`` provides a ``st_blksize`` field (such as Linux), :func:`open` now uses
``max(io.DEFAULT_BUFFER_SIZE, st_blksize)`` as its default buffer size rather
than always using the device block size. This should improve I/O performance.
Patch by Romain Morotti.
11 changes: 7 additions & 4 deletions Modules/_io/_iomodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,10 @@ the size of a fixed-size chunk buffer. When no buffering argument is
given, the default buffering policy works as follows:
* Binary files are buffered in fixed-size chunks; the size of the buffer
is chosen using a heuristic trying to determine the underlying device's
"block size" and falling back on `io.DEFAULT_BUFFER_SIZE`.
On many systems, the buffer will typically be 4096 or 8192 bytes long.
is set to `max(io.DEFAULT_BUFFER_SIZE, st_blksize)` using a heuristic
trying to determine the underlying device's "block size" when available
and falling back on `io.DEFAULT_BUFFER_SIZE`.
On most systems, the buffer will typically be 131072 bytes long.
* "Interactive" text files (files for which isatty() returns True)
use line buffering. Other text files use the policy described above
Expand Down Expand Up @@ -200,7 +201,7 @@ static PyObject *
_io_open_impl(PyObject *module, PyObject *file, const char *mode,
int buffering, const char *encoding, const char *errors,
const char *newline, int closefd, PyObject *opener)
/*[clinic end generated code: output=aefafc4ce2b46dc0 input=cd034e7cdfbf4e78]*/
/*[clinic end generated code: output=aefafc4ce2b46dc0 input=bac1cd70f431fe9a]*/
{
size_t i;

Expand Down Expand Up @@ -368,6 +369,8 @@ _io_open_impl(PyObject *module, PyObject *file, const char *mode,
if (blksize_obj == NULL)
    goto error;
buffering = PyLong_AsLong(blksize_obj);
Py_DECREF(blksize_obj);
/* Check for a conversion failure BEFORE clamping: PyLong_AsLong() signals
   an error by returning -1 with an exception set, and clamping first would
   overwrite that -1 and let the pending exception go unnoticed. */
if (buffering == -1 && PyErr_Occurred())
    goto error;
/* Never use a default buffer smaller than DEFAULT_BUFFER_SIZE. */
if (buffering < DEFAULT_BUFFER_SIZE)
    buffering = DEFAULT_BUFFER_SIZE;
Expand Down
2 changes: 1 addition & 1 deletion Modules/_io/_iomodule.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ extern Py_ssize_t _PyIO_find_line_ending(
*/
extern int _PyIO_trap_eintr(void);

#define DEFAULT_BUFFER_SIZE (8 * 1024) /* bytes */
#define DEFAULT_BUFFER_SIZE (128 * 1024) /* bytes */

/*
* Offset type for positioning.
Expand Down
9 changes: 5 additions & 4 deletions Modules/_io/clinic/_iomodule.c.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 1d85458

Please sign in to comment.