Skip to content

Commit

Permalink
abd: add page iterator
Browse files Browse the repository at this point in the history
The regular ABD iterators yield data buffers, so they have to map and
unmap pages into kernel memory. vdev_disk only needs the page pointers,
so it then has to reverse the math to recover them, and the map/unmap is
just unnecessary overhead.

This adds abd_iterate_page_func, which yields unmapped struct pages
instead, and then updates vdev_disk to use it.

Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
  • Loading branch information
robn committed Dec 17, 2023
1 parent c22ae5b commit ac214be
Show file tree
Hide file tree
Showing 5 changed files with 153 additions and 88 deletions.
7 changes: 7 additions & 0 deletions include/sys/abd.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ typedef struct abd {

typedef int abd_iter_func_t(void *buf, size_t len, void *priv);
typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv);
#if defined(__linux__) && defined(_KERNEL)
typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
#endif

extern int zfs_abd_scatter_enabled;

Expand Down Expand Up @@ -125,6 +128,10 @@ void abd_release_ownership_of_buf(abd_t *);
int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
abd_iter_func2_t *, void *);
#if defined(__linux__) && defined(_KERNEL)
int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
void *);
#endif
void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
Expand Down
9 changes: 8 additions & 1 deletion include/sys/abd_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@ typedef enum abd_stats_op {
ABDSTAT_DECR /* Decrease abdstat values */
} abd_stats_op_t;

struct scatterlist; /* forward declaration */
/* forward declarations */
struct scatterlist;
struct page;

struct abd_iter {
/* public interface */
Expand All @@ -51,6 +53,10 @@ struct abd_iter {
size_t iter_offset; /* offset in current sg/abd_buf, */
/* abd_offset included */
struct scatterlist *iter_sg; /* current sg */

struct page *iter_page; /* current page */
size_t iter_page_dsize; /* size of data in page */
size_t iter_page_doff; /* offset of data in page */
};

extern abd_t *abd_zero_scatter;
Expand Down Expand Up @@ -79,6 +85,7 @@ boolean_t abd_iter_at_end(struct abd_iter *);
void abd_iter_advance(struct abd_iter *, size_t);
void abd_iter_map(struct abd_iter *);
void abd_iter_unmap(struct abd_iter *);
void abd_iter_page(struct abd_iter *);

/*
* Helper macros
Expand Down
95 changes: 91 additions & 4 deletions module/os/linux/zfs/abd_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2019 by Delphix. All rights reserved.
* Copyright (c) 2023, Klara Inc.
*/

/*
Expand Down Expand Up @@ -59,6 +60,7 @@
#include <sys/zfs_znode.h>
#ifdef _KERNEL
#include <linux/kmap_compat.h>
#include <linux/mm_compat.h>
#include <linux/scatterlist.h>
#else
#define MAX_ORDER 1
Expand Down Expand Up @@ -926,6 +928,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
}
aiter->iter_page = NULL;
aiter->iter_page_doff = 0;
aiter->iter_page_dsize = 0;
}

/*
Expand All @@ -935,6 +940,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
return (aiter->iter_pos == aiter->iter_abd->abd_size);
}

Expand Down Expand Up @@ -1028,10 +1034,90 @@ abd_cache_reap_now(void)
{
}

#if defined(_KERNEL)
#ifdef _KERNEL
void
abd_iter_page(struct abd_iter *aiter)
{
if (abd_iter_at_end(aiter)) {
aiter->iter_page = NULL;
aiter->iter_page_doff = 0;
aiter->iter_page_dsize = 0;
return;
}

struct page *page;
size_t doff, dsize;

if (abd_is_linear(aiter->iter_abd)) {
ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);

/* memory address at iter_pos */
void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;

/* struct page for address */
page = is_vmalloc_addr(paddr) ?
vmalloc_to_page(paddr) : virt_to_page(paddr);

/* offset of address within the page */
doff = offset_in_page(paddr);

/* total data remaining in abd from this position */
dsize = aiter->iter_abd->abd_size - aiter->iter_offset;
} else {
ASSERT(!abd_is_gang(aiter->iter_abd));

/* current scatter page */
page = sg_page(aiter->iter_sg);

/* position within page */
doff = aiter->iter_offset;

/* remaining data in scatterlist */
dsize = MIN(aiter->iter_sg->length - aiter->iter_offset,
aiter->iter_abd->abd_size - aiter->iter_pos);
}
ASSERT(page);

if (PageTail(page)) {

/*
* This page is part of a "compound page", which is a group of
* pages that can be referenced from a single struct page *.
* Its organised as a "head" page, followed by a series of
* "tail" pages.
*
* In OpenZFS, compound pages are allocated using the
* __GFP_COMP flag, which we get from scatter ABDs and SPL
* vmalloc slabs (ie >16K allocations). So a great many of the
* IO buffers we get are going to be of this type.
*
* The tail pages are just regular PAGE_SIZE pages, and can be
* safely used as-is. However, the head page has length
* covering itself and all the tail pages. If this ABD chunk
* spans multiple pages, then we can use the head page and a
* >PAGE_SIZE length, which is far more efficient.
*
* To do this, we need to adjust the offset to be counted from
* the head page. struct page for compound pages are stored
* contiguously, so we can just adjust by a simple offset.
*/
struct page *head = compound_head(page);
doff += ((page - head) * PAGESIZE);
page = head;
}

/* final page and position within it */
aiter->iter_page = page;
aiter->iter_page_doff = doff;

/* amount of data in the chunk, up to the end of the page */
aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
}


/*
* Note: ABD BIO functions only needed to support vdev_classic. See comments in
* vdev_disk.c.
* Note: The below ABD BIO functions only needed to support vdev_classic. See
* comments in vdev_disk.c.
*/

/*
Expand Down Expand Up @@ -1188,4 +1274,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size,
module_param(zfs_abd_scatter_max_order, uint, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
"Maximum order allocation used for a scatter ABD.");
#endif

#endif /* _KERNEL */
92 changes: 9 additions & 83 deletions module/os/linux/zfs/vdev_disk.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
#include <linux/blkpg.h>
#include <linux/msdos_fs.h>
#include <linux/vfs_compat.h>
#include <linux/mm_compat.h>
#ifdef HAVE_LINUX_BLK_CGROUP_HEADER
#include <linux/blk-cgroup.h>
#endif
Expand Down Expand Up @@ -847,45 +846,6 @@ BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error)
vbio_put(vbio);
}

/*
 * Translate a kernel virtual address into the struct page that backs it and
 * the offset of the address within that page. For a compound page, the head
 * page and an offset relative to the head are returned.
 *
 * buf    address to translate
 * pagep  out: the backing page (the compound head, if compound)
 * offp   out: offset of buf within *pagep
 */
static inline void
_buf_to_page_and_offset(const void *buf, struct page **pagep, uint_t *offp)
{
	struct page *pg;

	if (is_vmalloc_addr(buf))
		pg = vmalloc_to_page(buf);
	else
		pg = virt_to_page(buf);

	if (PageCompound(pg)) {
		/*
		 * This page is part of a "compound page", which is a group of
		 * pages that can be referenced from a single struct page *.
		 * It's organised as a "head" page, followed by a series of
		 * "tail" pages.
		 *
		 * In OpenZFS, compound pages are allocated using the
		 * __GFP_COMP flag, which we get from scatter ABDs and SPL
		 * vmalloc slabs (ie >16K allocations). So a great many of the
		 * IO buffers we get are going to be of this type.
		 *
		 * The tail pages are just regular PAGE_SIZE pages, and we
		 * could load them into the BIO the same as non-compound
		 * pages, and it all works just fine. However, the head page
		 * has a length covering itself and all the tail pages. If our
		 * buffer spans multiple pages, then we can load the head page
		 * and a >PAGE_SIZE length into the BIO, which is far more
		 * efficient.
		 *
		 * To do this, we need the offset of the buffer from the head
		 * page, so the address is masked with the compound page's
		 * full size rather than with PAGE_SIZE (offset_in_page() only
		 * gives the offset within a single PAGE_SIZE'd page, ie a
		 * simple (PAGE_SIZE-1) mask).
		 */
		struct page *head = compound_head(pg);

		*pagep = head;
		*offp = (uint_t)((uintptr_t)(buf) & (page_size(head)-1));
		return;
	}

	/* Single boring page, nothing more to see */
	*pagep = pg;
	*offp = offset_in_page(buf);
}

/*
* Iterator callback to count ABD pages and check their size & alignment.
*
Expand All @@ -902,13 +862,10 @@ typedef struct {
} vdev_disk_check_pages_t;

static int
vdev_disk_check_pages_cb(void *buf, size_t len, void *priv)
vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv)
{
vdev_disk_check_pages_t *s = priv;

struct page *page;
uint_t off;

/*
* If we didn't finish on a block size boundary last time, then there
* would be a gap if we tried to use this ABD as-is, so abort.
Expand All @@ -922,24 +879,11 @@ vdev_disk_check_pages_cb(void *buf, size_t len, void *priv)
*/
s->end = len & s->bmask;

while (len > 0) {
_buf_to_page_and_offset(buf, &page, &off);

/*
* All blocks after the first must start on a block size
* boundary.
*/
if (s->npages != 0 && (off & s->bmask) != 0)
return (1);

uint_t take = MIN(len, page_size(page)-off);

buf += take;
len -= take;

s->npages++;
}
/* All blocks after the first must start on a block size boundary. */
if (s->npages != 0 && (off & s->bmask) != 0)
return (1);

s->npages++;
return (0);
}

Expand All @@ -956,36 +900,18 @@ vdev_disk_check_pages(abd_t *abd, uint64_t size, uint_t lbs)
.end = 0
};

if (abd_iterate_func(abd, 0, size, vdev_disk_check_pages_cb, &s))
if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s))
return (0);

return (s.npages);
}

/* Iterator callback to submit ABD pages to the vbio. */
static int
vdev_disk_fill_vbio_cb(void *buf, size_t len, void *priv)
vdev_disk_fill_vbio_cb(struct page *page, size_t off, size_t len, void *priv)
{
vbio_t *vbio = priv;
int err;

struct page *page;
uint_t off;

while (len > 0) {
_buf_to_page_and_offset(buf, &page, &off);

uint_t take = MIN(len, page_size(page)-off);

err = vbio_add_page(vbio, page, take, off);
if (err != 0)
return (err);

buf += take;
len -= take;
}

return (0);
return (vbio_add_page(vbio, page, len, off));
}

static int
Expand Down Expand Up @@ -1063,7 +989,7 @@ vdev_disk_io_rw(zio_t *zio)
vbio->vbio_abd = abd;

/* Fill it with pages */
error = abd_iterate_func(abd, 0, zio->io_size,
error = abd_iterate_page_func(abd, 0, zio->io_size,
vdev_disk_fill_vbio_cb, vbio);
if (error != 0) {
vbio_free(vbio);
Expand Down
38 changes: 38 additions & 0 deletions module/zfs/abd.c
Original file line number Diff line number Diff line change
Expand Up @@ -828,6 +828,44 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size,
return (ret);
}

#if defined(__linux__) && defined(_KERNEL)
/*
 * Walk the range [off, off + size) of an ABD one page chunk at a time,
 * without mapping the data. For each chunk, func is called with the
 * struct page, the offset of the data within that page, the length of the
 * chunk and the caller's private pointer.
 *
 * Iteration stops at the first non-zero value returned by func, and that
 * value is returned. Returns 0 if every call returned 0, or if size is 0.
 */
int
abd_iterate_page_func(abd_t *abd, size_t off, size_t size,
    abd_iter_page_func_t *func, void *private)
{
	struct abd_iter aiter;
	abd_t *c_abd;

	if (size == 0)
		return (0);

	abd_verify(abd);
	ASSERT3U(off + size, <=, abd->abd_size);

	c_abd = abd_init_abd_iter(abd, &aiter, off);

	for (size_t left = size; left > 0; ) {
		IMPLY(abd_is_gang(abd), c_abd != NULL);

		/* look up the page at the current position */
		abd_iter_page(&aiter);

		/* never hand out more than the caller asked for */
		size_t len = MIN(aiter.iter_page_dsize, left);
		ASSERT3U(len, >, 0);

		int ret = func(aiter.iter_page, aiter.iter_page_doff,
		    len, private);
		if (ret != 0)
			return (ret);

		left -= len;
		c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len);
	}

	return (0);
}
#endif

struct buf_arg {
void *arg_buf;
};
Expand Down

0 comments on commit ac214be

Please sign in to comment.