Skip to content

Commit

Permalink
vdev_disk: make "classic" BIO submission available
Browse files Browse the repository at this point in the history
The old submission code has issues, but its also been in use for a long
time. The new code should fix these things and be an overall improvement
in many areas, but does not have anywhere near the amount of testing and
real-world use to give the same confidence and understanding.

This commit adds a load-time option `zfs_vdev_disk_classic` that, when
enabled, uses the old code instead. This will allow us to backport these
changes to 2.2 and set zfs_vdev_disk_classic=1 by default, and then
recommend the option to users that need it now to fix problems or want
to try it out, while not accidentally breaking things for anyone else.

Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
  • Loading branch information
robn committed Dec 19, 2023
1 parent 367317a commit e59cf87
Show file tree
Hide file tree
Showing 4 changed files with 306 additions and 2 deletions.
2 changes: 2 additions & 0 deletions include/sys/abd.h
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,8 @@ void abd_fini(void);

/*
* Linux ABD bio functions
* Note: these are only needed to support vdev_classic. See comment in
* vdev_disk.c.
*/
#if defined(__linux__) && defined(_KERNEL)
unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t);
Expand Down
16 changes: 16 additions & 0 deletions man/man4/zfs.4
Original file line number Diff line number Diff line change
Expand Up @@ -1360,6 +1360,22 @@ If this is higher than the maximum allowed by the device queue or the kernel
itself, it will be clamped.
Setting it to zero will cause the kernel's ideal size to be used.
This parameter only applies on Linux.
This parameter is ignored if
.Sy zfs_vdev_disk_classic Ns = Ns Sy 1 .
.
.It Sy zfs_vdev_disk_classic Ns = Ns Sy 0 Ns | Ns 1 Pq uint
If set to 1, OpenZFS will submit IO to Linux using the method it used in 2.2
and earlier.
This "classic" method has known issues with highly fragmented IO requests and
is slower on many workloads, but it has been in use for many years and is known
to be very stable.
If you set this parameter, please also open a bug report why you did so,
including the workload involved and any error messages.
.Pp
This parameter and the classic submission method will be removed once we have
total confidence in the new method.
.Pp
This parameter only applies on Linux, and can only be set at module load time.
.
.It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int
Time before expiring
Expand Down
5 changes: 5 additions & 0 deletions module/os/linux/zfs/abd_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -1029,6 +1029,11 @@ abd_cache_reap_now(void)
}

#if defined(_KERNEL)
/*
* Note: ABD BIO functions only needed to support vdev_classic. See comments in
* vdev_disk.c.
*/

/*
* bio_nr_pages for ABD.
* @off is the offset in @abd
Expand Down
285 changes: 283 additions & 2 deletions module/os/linux/zfs/vdev_disk.c
Original file line number Diff line number Diff line change
Expand Up @@ -1091,6 +1091,240 @@ BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error)
zio_interrupt(zio);
}

/* ========== */

/*
* This is the classic, battle-tested BIO submission code. Until we're totally
* sure that the new code is safe and correct in all cases, this will remain
* available and can be enabled by setting zfs_vdev_disk_classic=1 at module
* load time.
*
* These functions have been renamed to vdev_classic_* to make it clear what
* they belong to, but their implementations are unchanged.
*/

/*
* Virtual device vector for disks.
*/
typedef struct dio_request {
zio_t *dr_zio; /* Parent ZIO */
atomic_t dr_ref; /* References */
int dr_error; /* Bio error */
int dr_bio_count; /* Count of bio's */
struct bio *dr_bio[]; /* Attached bio's */
} dio_request_t;

static dio_request_t *
vdev_classic_dio_alloc(int bio_count)
{
dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) +
sizeof (struct bio *) * bio_count, KM_SLEEP);
atomic_set(&dr->dr_ref, 0);
dr->dr_bio_count = bio_count;
dr->dr_error = 0;

for (int i = 0; i < dr->dr_bio_count; i++)
dr->dr_bio[i] = NULL;

return (dr);
}

static void
vdev_classic_dio_free(dio_request_t *dr)
{
int i;

for (i = 0; i < dr->dr_bio_count; i++)
if (dr->dr_bio[i])
bio_put(dr->dr_bio[i]);

kmem_free(dr, sizeof (dio_request_t) +
sizeof (struct bio *) * dr->dr_bio_count);
}

static void
vdev_classic_dio_get(dio_request_t *dr)
{
atomic_inc(&dr->dr_ref);
}

static void
vdev_classic_dio_put(dio_request_t *dr)
{
int rc = atomic_dec_return(&dr->dr_ref);

/*
* Free the dio_request when the last reference is dropped and
* ensure zio_interpret is called only once with the correct zio
*/
if (rc == 0) {
zio_t *zio = dr->dr_zio;
int error = dr->dr_error;

vdev_classic_dio_free(dr);

if (zio) {
zio->io_error = error;
ASSERT3S(zio->io_error, >=, 0);
if (zio->io_error)
vdev_disk_error(zio);

zio_delay_interrupt(zio);
}
}
}

BIO_END_IO_PROTO(vdev_classic_physio_completion, bio, error)
{
dio_request_t *dr = bio->bi_private;

if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
dr->dr_error = BIO_END_IO_ERROR(bio);
#else
if (error)
dr->dr_error = -(error);
else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
dr->dr_error = EIO;
#endif
}

/* Drop reference acquired by vdev_classic_physio */
vdev_classic_dio_put(dr);
}

static inline unsigned int
vdev_classic_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset)
{
unsigned long nr_segs = abd_nr_pages_off(zio->io_abd,
bio_size, abd_offset);

#ifdef HAVE_BIO_MAX_SEGS
return (bio_max_segs(nr_segs));
#else
return (MIN(nr_segs, BIO_MAX_PAGES));
#endif
}

static int
vdev_classic_physio(zio_t *zio)
{
vdev_t *v = zio->io_vd;
vdev_disk_t *vd = v->vdev_tsd;
struct block_device *bdev = vd->vd_bdev;
size_t io_size = zio->io_size;
uint64_t io_offset = zio->io_offset;
int rw = zio->io_type == ZIO_TYPE_READ ? READ : WRITE;
int flags = 0;

dio_request_t *dr;
uint64_t abd_offset;
uint64_t bio_offset;
int bio_size;
int bio_count = 16;
int error = 0;
struct blk_plug plug;
unsigned short nr_vecs;

/*
* Accessing outside the block device is never allowed.
*/
if (io_offset + io_size > bdev->bd_inode->i_size) {
vdev_dbgmsg(zio->io_vd,
"Illegal access %llu size %llu, device size %llu",
(u_longlong_t)io_offset,
(u_longlong_t)io_size,
(u_longlong_t)i_size_read(bdev->bd_inode));
return (SET_ERROR(EIO));
}

retry:
dr = vdev_classic_dio_alloc(bio_count);

if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) &&
zio->io_vd->vdev_failfast == B_TRUE) {
bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1,
zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4);
}

dr->dr_zio = zio;

/*
* Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which
* is at least 512 bytes and at most PAGESIZE (typically 4K), one bio
* can cover at least 128KB and at most 1MB. When the required number
* of iovec's exceeds this, we are forced to break the IO in multiple
* bio's and wait for them all to complete. This is likely if the
* recordsize property is increased beyond 1MB. The default
* bio_count=16 should typically accommodate the maximum-size zio of
* 16MB.
*/

abd_offset = 0;
bio_offset = io_offset;
bio_size = io_size;
for (int i = 0; i <= dr->dr_bio_count; i++) {

/* Finished constructing bio's for given buffer */
if (bio_size <= 0)
break;

/*
* If additional bio's are required, we have to retry, but
* this should be rare - see the comment above.
*/
if (dr->dr_bio_count == i) {
vdev_classic_dio_free(dr);
bio_count *= 2;
goto retry;
}

nr_vecs = vdev_classic_bio_max_segs(zio, bio_size, abd_offset);
dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs);
if (unlikely(dr->dr_bio[i] == NULL)) {
vdev_classic_dio_free(dr);
return (SET_ERROR(ENOMEM));
}

/* Matching put called by vdev_classic_physio_completion */
vdev_classic_dio_get(dr);

BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9;
dr->dr_bio[i]->bi_end_io = vdev_classic_physio_completion;
dr->dr_bio[i]->bi_private = dr;
bio_set_op_attrs(dr->dr_bio[i], rw, flags);

/* Remaining size is returned to become the new size */
bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd,
bio_size, abd_offset);

/* Advance in buffer and construct another bio if needed */
abd_offset += BIO_BI_SIZE(dr->dr_bio[i]);
bio_offset += BIO_BI_SIZE(dr->dr_bio[i]);
}

/* Extra reference to protect dio_request during vdev_submit_bio */
vdev_classic_dio_get(dr);

if (dr->dr_bio_count > 1)
blk_start_plug(&plug);

/* Submit all bio's associated with this dio */
for (int i = 0; i < dr->dr_bio_count; i++) {
if (dr->dr_bio[i])
vdev_submit_bio(dr->dr_bio[i]);
}

if (dr->dr_bio_count > 1)
blk_finish_plug(&plug);

vdev_classic_dio_put(dr);

return (error);
}

/* ========== */

static int
vdev_disk_io_flush(struct block_device *bdev, zio_t *zio)
{
Expand Down Expand Up @@ -1141,6 +1375,8 @@ vdev_disk_io_trim(zio_t *zio)
#endif
}

int (*vdev_disk_io_rw_fn)(zio_t *zio) = NULL;

static void
vdev_disk_io_start(zio_t *zio)
{
Expand Down Expand Up @@ -1219,7 +1455,7 @@ vdev_disk_io_start(zio_t *zio)
case ZIO_TYPE_READ:
case ZIO_TYPE_WRITE:
zio->io_target_timestamp = zio_handle_io_delay(zio);
error = vdev_disk_io_rw(zio);
error = vdev_disk_io_rw_fn(zio);
rw_exit(&vd->vd_lock);
if (error) {
zio->io_error = error;
Expand Down Expand Up @@ -1292,8 +1528,49 @@ vdev_disk_rele(vdev_t *vd)
/* XXX: Implement me as a vnode rele for the device */
}

/*
* BIO submission method. See comment above about vdev_classic.
* Set zfs_vdev_disk_classic=0 for new, =1 for classic
*/
static uint_t zfs_vdev_disk_classic = 0; /* default new */

/* Set submission function from module parameter */
static int
vdev_disk_param_set_classic(const char *buf, zfs_kernel_param_t *kp)
{
int err = param_set_uint(buf, kp);
if (err < 0)
return (SET_ERROR(err));

vdev_disk_io_rw_fn =
zfs_vdev_disk_classic ? vdev_classic_physio : vdev_disk_io_rw;

printk(KERN_INFO "ZFS: forcing %s BIO submission\n",
zfs_vdev_disk_classic ? "classic" : "new");

return (0);
}

/*
* At first use vdev use, set the submission function from the default value if
* it hasn't been set already.
*/
static int
vdev_disk_init(spa_t *spa, nvlist_t *nv, void **tsd)
{
(void) spa;
(void) nv;
(void) tsd;

if (vdev_disk_io_rw_fn == NULL)
vdev_disk_io_rw_fn = zfs_vdev_disk_classic ?
vdev_classic_physio : vdev_disk_io_rw;

return (0);
}

vdev_ops_t vdev_disk_ops = {
.vdev_op_init = NULL,
.vdev_op_init = vdev_disk_init,
.vdev_op_fini = NULL,
.vdev_op_open = vdev_disk_open,
.vdev_op_close = vdev_disk_close,
Expand Down Expand Up @@ -1389,3 +1666,7 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,

ZFS_MODULE_PARAM(zfs_vdev_disk, zfs_vdev_disk_, max_segs, UINT, ZMOD_RW,
"Maximum number of data segments to add to an IO request (min 4)");

ZFS_MODULE_PARAM_CALL(zfs_vdev_disk, zfs_vdev_disk_, classic,
vdev_disk_param_set_classic, param_get_uint, ZMOD_RD,
"Use classic BIO submission method");

0 comments on commit e59cf87

Please sign in to comment.