diff --git a/include/sys/abd.h b/include/sys/abd.h index 16e5ca68d5ee..19fe96292d5f 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -79,6 +79,9 @@ typedef struct abd { typedef int abd_iter_func_t(void *buf, size_t len, void *priv); typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv); +#if defined(__linux__) && defined(_KERNEL) +typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *); +#endif extern int zfs_abd_scatter_enabled; @@ -125,6 +128,10 @@ void abd_release_ownership_of_buf(abd_t *); int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *); int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t, abd_iter_func2_t *, void *); +#if defined(__linux__) && defined(_KERNEL) +int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *, + void *); +#endif void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t); void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t); void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h index 78316ecb0a48..3bf2f78f8a0d 100644 --- a/include/sys/abd_impl.h +++ b/include/sys/abd_impl.h @@ -38,12 +38,30 @@ typedef enum abd_stats_op { ABDSTAT_DECR /* Decrease abdstat values */ } abd_stats_op_t; -struct scatterlist; /* forward declaration */ +/* forward declarations */ +struct scatterlist; +struct page; struct abd_iter { /* public interface */ - void *iter_mapaddr; /* addr corresponding to iter_pos */ - size_t iter_mapsize; /* length of data valid at mapaddr */ + union { + /* for abd_iter_map()/abd_iter_unmap() */ + struct { + /* addr corresponding to iter_pos */ + void *iter_mapaddr; + /* length of data valid at mapaddr */ + size_t iter_mapsize; + }; + /* for abd_iter_page() */ + struct { + /* current page */ + struct page *iter_page; + /* offset of data in page */ + size_t iter_page_doff; + /* size of data in page */ + size_t iter_page_dsize; + }; + }; /* private */ abd_t 
*iter_abd; /* ABD being iterated through */ @@ -79,6 +97,7 @@ boolean_t abd_iter_at_end(struct abd_iter *); void abd_iter_advance(struct abd_iter *, size_t); void abd_iter_map(struct abd_iter *); void abd_iter_unmap(struct abd_iter *); +void abd_iter_page(struct abd_iter *); /* * Helper macros diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c index 7602f96fc1d4..2fc71f31c6b2 100644 --- a/module/os/freebsd/zfs/abd_os.c +++ b/module/os/freebsd/zfs/abd_os.c @@ -446,10 +446,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) { ASSERT(!abd_is_gang(abd)); abd_verify(abd); + memset(aiter, 0, sizeof (struct abd_iter)); aiter->iter_abd = abd; - aiter->iter_pos = 0; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; } /* diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 171786d254dd..4123d93abae2 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 by Delphix. All rights reserved. + * Copyright (c) 2023, Klara Inc. 
*/ /* @@ -59,6 +60,7 @@ #include #ifdef _KERNEL #include +#include #include #else #define MAX_ORDER 1 @@ -915,14 +917,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) { ASSERT(!abd_is_gang(abd)); abd_verify(abd); + memset(aiter, 0, sizeof (struct abd_iter)); aiter->iter_abd = abd; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; - aiter->iter_pos = 0; - if (abd_is_linear(abd)) { - aiter->iter_offset = 0; - aiter->iter_sg = NULL; - } else { + if (!abd_is_linear(abd)) { aiter->iter_offset = ABD_SCATTER(abd).abd_offset; aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; } @@ -935,6 +932,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd) boolean_t abd_iter_at_end(struct abd_iter *aiter) { + ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size); return (aiter->iter_pos == aiter->iter_abd->abd_size); } @@ -1028,10 +1026,90 @@ abd_cache_reap_now(void) { } -#if defined(_KERNEL) +#ifdef _KERNEL +void +abd_iter_page(struct abd_iter *aiter) +{ + if (abd_iter_at_end(aiter)) { + aiter->iter_page = NULL; + aiter->iter_page_doff = 0; + aiter->iter_page_dsize = 0; + return; + } + + struct page *page; + size_t doff, dsize; + + if (abd_is_linear(aiter->iter_abd)) { + ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); + + /* memory address at iter_pos */ + void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos; + + /* struct page for address */ + page = is_vmalloc_addr(paddr) ? 
+ vmalloc_to_page(paddr) : virt_to_page(paddr); + + /* offset of address within the page */ + doff = offset_in_page(paddr); + + /* total data remaining in abd from this position */ + dsize = aiter->iter_abd->abd_size - aiter->iter_offset; + } else { + ASSERT(!abd_is_gang(aiter->iter_abd)); + + /* current scatter page */ + page = sg_page(aiter->iter_sg); + + /* position within page */ + doff = aiter->iter_offset; + + /* remaining data in this sg entry, capped at the abd end */ + dsize = MIN(aiter->iter_sg->length - aiter->iter_offset, + aiter->iter_abd->abd_size - aiter->iter_pos); + } + ASSERT(page); + + if (PageTail(page)) { + + /* + * This page is part of a "compound page", which is a group of + * pages that can be referenced from a single struct page *. + * It is organised as a "head" page, followed by a series of + * "tail" pages. + * + * In OpenZFS, compound pages are allocated using the + * __GFP_COMP flag, which we get from scatter ABDs and SPL + * vmalloc slabs (ie >16K allocations). So a great many of the + * IO buffers we get are going to be of this type. + * + * The tail pages are just regular PAGE_SIZE pages, and can be + * safely used as-is. However, the head page has length + * covering itself and all the tail pages. If this ABD chunk + * spans multiple pages, then we can use the head page and a + * >PAGE_SIZE length, which is far more efficient. + * + * To do this, we need to adjust the offset to be counted from + * the head page. struct pages for a compound page are stored + * contiguously, so we can just adjust by a simple offset. + */ + struct page *head = compound_head(page); + doff += ((page - head) * PAGESIZE); + page = head; + } + + /* final page and position within it */ + aiter->iter_page = page; + aiter->iter_page_doff = doff; + + /* amount of data in the chunk, up to the end of the page */ + aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff); +} + + /* - * Note: ABD BIO functions only needed to support vdev_classic. See comments in - * vdev_disk.c. 
+ * Note: The below ABD BIO functions are only needed to support vdev_classic. + * See comments in vdev_disk.c. */ /* @@ -1188,4 +1266,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size, module_param(zfs_abd_scatter_max_order, uint, 0644); MODULE_PARM_DESC(zfs_abd_scatter_max_order, "Maximum order allocation used for a scatter ABD."); -#endif + +#endif /* _KERNEL */ diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 1c5f0ab40b91..599b294af9a0 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -38,7 +38,6 @@ #include #include #include -#include #ifdef HAVE_LINUX_BLK_CGROUP_HEADER #include #endif @@ -845,45 +844,6 @@ BIO_END_IO_PROTO(vdev_disk_io_rw_completion, bio, error) vbio_put(vbio); } -static inline void -_buf_to_page_and_offset(const void *buf, struct page **pagep, uint_t *offp) -{ - struct page *page = is_vmalloc_addr(buf) ? - vmalloc_to_page(buf) : virt_to_page(buf); - - if (!PageCompound(page)) { - /* Single boring page, nothing more to see */ - *pagep = page; - *offp = offset_in_page(buf); - return; - } - - /* - * This page is part of a "compound page", which is a group of pages - * that can be referenced from a single struct page *. Its organised as - * a "head" page, followed by a series of "tail" pages. - * - * In OpenZFS, compound pages are allocated using the __GFP_COMP flag, - * which we get from scatter ABDs and SPL vmalloc slabs (ie >16K - * allocations). So a great many of the IO buffers we get are going to - * be of this type. - * - * The tail pages are just regular PAGE_SIZE pages, and we can just - * load them into the BIO the same as we would for non-compound pages - * above, and it all works just fine. However, the head page has length - * covering itself and all the tail pages. If our buffer spans multiple - * pages, then we can load the head page and a >PAGE_SIZE length into - * the BIO, which is far more efficient. 
- * - * To do this, we need to calculate the offset of the buffer from the - * head page (offset_in_page() is the offset within its PAGE_SIZE'd - * page ie just a simple ~(PAGE_SIZE-1) mask). - */ - - *pagep = compound_head(page); - *offp = (uint_t)((uintptr_t)(buf) & (page_size(*pagep)-1)); -} - /* * Iterator callback to count ABD pages and check their size & alignment. * @@ -900,13 +860,10 @@ typedef struct { } vdev_disk_check_pages_t; static int -vdev_disk_check_pages_cb(void *buf, size_t len, void *priv) +vdev_disk_check_pages_cb(struct page *page, size_t off, size_t len, void *priv) { vdev_disk_check_pages_t *s = priv; - struct page *page; - uint_t off; - /* * If we didn't finish on a block size boundary last time, then there * would be a gap if we tried to use this ABD as-is, so abort. @@ -920,24 +877,11 @@ vdev_disk_check_pages_cb(void *buf, size_t len, void *priv) */ s->end = len & s->bmask; - while (len > 0) { - _buf_to_page_and_offset(buf, &page, &off); - - /* - * All blocks after the first must start on a block size - * boundary. - */ - if (s->npages != 0 && (off & s->bmask) != 0) - return (1); - - uint_t take = MIN(len, page_size(page)-off); - - buf += take; - len -= take; - - s->npages++; - } + /* All blocks after the first must start on a block size boundary. */ + if (s->npages != 0 && (off & s->bmask) != 0) + return (1); + s->npages++; return (0); } @@ -954,7 +898,7 @@ vdev_disk_check_pages(abd_t *abd, uint64_t size, uint_t lbs) .end = 0 }; - if (abd_iterate_func(abd, 0, size, vdev_disk_check_pages_cb, &s)) + if (abd_iterate_page_func(abd, 0, size, vdev_disk_check_pages_cb, &s)) return (0); return (s.npages); @@ -962,28 +906,10 @@ vdev_disk_check_pages(abd_t *abd, uint64_t size, uint_t lbs) /* Iterator callback to submit ABD pages to the vbio. 
*/ static int -vdev_disk_fill_vbio_cb(void *buf, size_t len, void *priv) +vdev_disk_fill_vbio_cb(struct page *page, size_t off, size_t len, void *priv) { vbio_t *vbio = priv; - int err; - - struct page *page; - uint_t off; - - while (len > 0) { - _buf_to_page_and_offset(buf, &page, &off); - - uint_t take = MIN(len, page_size(page)-off); - - err = vbio_add_page(vbio, page, take, off); - if (err != 0) - return (err); - - buf += take; - len -= take; - } - - return (0); + return (vbio_add_page(vbio, page, len, off)); } static int @@ -1061,7 +987,7 @@ vdev_disk_io_rw(zio_t *zio) vbio->vbio_abd = abd; /* Fill it with pages */ - error = abd_iterate_func(abd, 0, zio->io_size, + error = abd_iterate_page_func(abd, 0, zio->io_size, vdev_disk_fill_vbio_cb, vbio); if (error != 0) { vbio_free(vbio); diff --git a/module/zfs/abd.c b/module/zfs/abd.c index c7d9ef6899c2..7cd9728abae2 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -828,6 +828,44 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size, return (ret); } +#if defined(__linux__) && defined(_KERNEL) +int +abd_iterate_page_func(abd_t *abd, size_t off, size_t size, + abd_iter_page_func_t *func, void *private) +{ + struct abd_iter aiter; + int ret = 0; + + if (size == 0) + return (0); + + abd_verify(abd); + ASSERT3U(off + size, <=, abd->abd_size); + + abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); + + while (size > 0) { + IMPLY(abd_is_gang(abd), c_abd != NULL); + + abd_iter_page(&aiter); + + size_t len = MIN(aiter.iter_page_dsize, size); + ASSERT3U(len, >, 0); + + ret = func(aiter.iter_page, aiter.iter_page_doff, + len, private); + + if (ret != 0) + break; + + size -= len; + c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len); + } + + return (ret); +} +#endif + struct buf_arg { void *arg_buf; };