diff --git a/Lib/_pyio.py b/Lib/_pyio.py index b3a8f37d68acdb..5402653bb1b362 100644 --- a/Lib/_pyio.py +++ b/Lib/_pyio.py @@ -23,8 +23,8 @@ valid_seek_flags.add(os.SEEK_HOLE) valid_seek_flags.add(os.SEEK_DATA) -# open() uses st_blksize whenever we can -DEFAULT_BUFFER_SIZE = 8 * 1024 # bytes +# open() uses max(st_blksize, io.DEFAULT_BUFFER_SIZE) when st_blksize is available +DEFAULT_BUFFER_SIZE = 128 * 1024 # bytes # NOTE: Base classes defined here are registered with the "official" ABCs # defined in io.py. We don't use real inheritance though, because we don't want @@ -123,10 +123,11 @@ def open(file, mode="r", buffering=-1, encoding=None, errors=None, the size of a fixed-size chunk buffer. When no buffering argument is given, the default buffering policy works as follows: - * Binary files are buffered in fixed-size chunks; the size of the buffer - is chosen using a heuristic trying to determine the underlying device's - "block size" and falling back on `io.DEFAULT_BUFFER_SIZE`. - On many systems, the buffer will typically be 4096 or 8192 bytes long. + * Binary files are buffered in fixed-size chunks; the size of the buffer + is set to `max(io.DEFAULT_BUFFER_SIZE, st_blksize)` using a heuristic + trying to determine the underlying device's "block size" when available + and falling back on `io.DEFAULT_BUFFER_SIZE`. + On most systems, the buffer will typically be 131072 bytes long. * "Interactive" text files (files for which isatty() returns True) use line buffering. 
Other text files use the policy described above @@ -242,7 +243,7 @@ def open(file, mode="r", buffering=-1, encoding=None, errors=None, buffering = -1 line_buffering = True if buffering < 0: - buffering = raw._blksize + buffering = max(raw._blksize, DEFAULT_BUFFER_SIZE) if buffering < 0: raise ValueError("invalid buffering size") if buffering == 0: diff --git a/Lib/test/test_file.py b/Lib/test/test_file.py index 1206032a93566e..70c5256b9a562e 100644 --- a/Lib/test/test_file.py +++ b/Lib/test/test_file.py @@ -216,6 +216,16 @@ def testSetBufferSize(self): with self.assertWarnsRegex(RuntimeWarning, 'line buffering'): self._checkBufferSize(1) + def testDefaultBufferSize(self): + with self.open(TESTFN, 'wb') as f: + blksize = f.raw._blksize + f.write(bytes(5_000_000)) + + with self.open(TESTFN, 'rb') as f: + data = f.read1() + expected_size = max(blksize, io.DEFAULT_BUFFER_SIZE) + self.assertEqual(len(data), expected_size) + def testTruncateOnWindows(self): # SF bug # "file.truncate fault on windows" diff --git a/Misc/NEWS.d/next/Library/2024-04-30-14-03-09.gh-issue-117151.yt2H8c.rst b/Misc/NEWS.d/next/Library/2024-04-30-14-03-09.gh-issue-117151.yt2H8c.rst new file mode 100644 index 00000000000000..7140c5762e2279 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-04-30-14-03-09.gh-issue-117151.yt2H8c.rst @@ -0,0 +1,5 @@ +Increase ``io.DEFAULT_BUFFER_SIZE`` from 8k to 128k and adjust :func:`open` on +platforms where ``fstat`` provides a ``st_blksize`` field (such as Linux) to use +``max(io.DEFAULT_BUFFER_SIZE, device block size)`` rather than always using the +device block size. This should improve I/O performance. +Patch by Romain Morotti. diff --git a/Modules/_io/_iomodule.c b/Modules/_io/_iomodule.c index 6622f2cabb908b..b5a16eb1abe8a9 100644 --- a/Modules/_io/_iomodule.c +++ b/Modules/_io/_iomodule.c @@ -132,9 +132,10 @@ the size of a fixed-size chunk buffer. 
When no buffering argument is given, the default buffering policy works as follows: * Binary files are buffered in fixed-size chunks; the size of the buffer - is chosen using a heuristic trying to determine the underlying device's - "block size" and falling back on `io.DEFAULT_BUFFER_SIZE`. - On many systems, the buffer will typically be 4096 or 8192 bytes long. + is set to `max(io.DEFAULT_BUFFER_SIZE, st_blksize)` using a heuristic + trying to determine the underlying device's "block size" when available + and falling back on `io.DEFAULT_BUFFER_SIZE`. + On most systems, the buffer will typically be 131072 bytes long. * "Interactive" text files (files for which isatty() returns True) use line buffering. Other text files use the policy described above @@ -200,7 +201,7 @@ static PyObject * _io_open_impl(PyObject *module, PyObject *file, const char *mode, int buffering, const char *encoding, const char *errors, const char *newline, int closefd, PyObject *opener) -/*[clinic end generated code: output=aefafc4ce2b46dc0 input=cd034e7cdfbf4e78]*/ +/*[clinic end generated code: output=aefafc4ce2b46dc0 input=bac1cd70f431fe9a]*/ { size_t i; @@ -368,6 +369,9 @@ _io_open_impl(PyObject *module, PyObject *file, const char *mode, if (blksize_obj == NULL) goto error; buffering = PyLong_AsLong(blksize_obj); + /* Clamp only on success; the error check below must still see -1. */ + if (buffering < DEFAULT_BUFFER_SIZE && !PyErr_Occurred()) + buffering = DEFAULT_BUFFER_SIZE; Py_DECREF(blksize_obj); if (buffering == -1 && PyErr_Occurred()) goto error; diff --git a/Modules/_io/_iomodule.h b/Modules/_io/_iomodule.h index afd638a120ba08..18cf20edf26f7d 100644 --- a/Modules/_io/_iomodule.h +++ b/Modules/_io/_iomodule.h @@ -78,7 +78,7 @@ extern Py_ssize_t _PyIO_find_line_ending( */ extern int _PyIO_trap_eintr(void); -#define DEFAULT_BUFFER_SIZE (8 * 1024) /* bytes */ +#define DEFAULT_BUFFER_SIZE (128 * 1024) /* bytes */ /* * Offset type for positioning. 
diff --git a/Modules/_io/clinic/_iomodule.c.h b/Modules/_io/clinic/_iomodule.c.h index 82932a23331ab6..1fe0b1de36342c 100644 --- a/Modules/_io/clinic/_iomodule.c.h +++ b/Modules/_io/clinic/_iomodule.c.h @@ -64,9 +64,10 @@ PyDoc_STRVAR(_io_open__doc__, "given, the default buffering policy works as follows:\n" "\n" "* Binary files are buffered in fixed-size chunks; the size of the buffer\n" -" is chosen using a heuristic trying to determine the underlying device\'s\n" -" \"block size\" and falling back on `io.DEFAULT_BUFFER_SIZE`.\n" -" On many systems, the buffer will typically be 4096 or 8192 bytes long.\n" +" is set to `max(io.DEFAULT_BUFFER_SIZE, st_blksize)` using a heuristic\n" +" trying to determine the underlying device\'s \"block size\" when available\n" +" and falling back on `io.DEFAULT_BUFFER_SIZE`.\n" +" On most systems, the buffer will typically be 131072 bytes long.\n" "\n" "* \"Interactive\" text files (files for which isatty() returns True)\n" " use line buffering. Other text files use the policy described above\n" @@ -406,4 +407,4 @@ _io_open_code(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObjec exit: return return_value; } -/*[clinic end generated code: output=ec1df2ff5265ab16 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=ec27299cc4de03e3 input=a9049054013a1b77]*/