Skip to content

Commit

Permalink
abd: add page iterator
Browse files Browse the repository at this point in the history
The regular ABD iterators yield data buffers, so they have to map and
unmap pages into kernel memory. If the caller only wants to count
chunks, or can use page pointers directly, then the map/unmap is just
unnecessary overhead.

This adds abd_iterate_page_func, which yields unmapped struct page
instead.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Rob Norris <rob.norris@klarasystems.com>
Sponsored-by: Klara, Inc.
Sponsored-by: Wasabi Technology, Inc.
Closes #15533
Closes #15588
  • Loading branch information
robn authored and behlendorf committed Mar 25, 2024
1 parent df04efe commit 390b448
Show file tree
Hide file tree
Showing 5 changed files with 169 additions and 14 deletions.
7 changes: 7 additions & 0 deletions include/sys/abd.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ typedef struct abd {

typedef int abd_iter_func_t(void *buf, size_t len, void *priv);
typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv);
#if defined(__linux__) && defined(_KERNEL)
/*
 * Callback type for abd_iterate_page_func(). It is called once per chunk
 * with, in order: the struct page, the offset of the data within that page,
 * the length of the data, and the caller's private pointer. Returning
 * non-zero stops the iteration; that value is returned to the caller.
 */
typedef int abd_iter_page_func_t(struct page *, size_t, size_t, void *);
#endif

extern int zfs_abd_scatter_enabled;

Expand Down Expand Up @@ -125,6 +128,10 @@ void abd_release_ownership_of_buf(abd_t *);
int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *);
int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t,
abd_iter_func2_t *, void *);
#if defined(__linux__) && defined(_KERNEL)
/*
 * Walk a range of the ABD (arguments: abd, offset, size, callback, private
 * pointer), handing the callback unmapped struct pages rather than mapped
 * data buffers. Returns 0, or the first non-zero value the callback returns.
 */
int abd_iterate_page_func(abd_t *, size_t, size_t, abd_iter_page_func_t *,
void *);
#endif
void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t);
void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t);
void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t);
Expand Down
26 changes: 23 additions & 3 deletions include/sys/abd_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2016, 2019 by Delphix. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
*/

#ifndef _ABD_IMPL_H
Expand All @@ -38,12 +39,30 @@ typedef enum abd_stats_op {
ABDSTAT_DECR /* Decrease abdstat values */
} abd_stats_op_t;

struct scatterlist; /* forward declaration */
/* forward declarations */
struct scatterlist;
struct page;

struct abd_iter {
/* public interface */
void *iter_mapaddr; /* addr corresponding to iter_pos */
size_t iter_mapsize; /* length of data valid at mapaddr */
union {
/* for abd_iter_map()/abd_iter_unmap() */
struct {
/* addr corresponding to iter_pos */
void *iter_mapaddr;
/* length of data valid at mapaddr */
size_t iter_mapsize;
};
/* for abd_iter_page() */
struct {
/* current page */
struct page *iter_page;
/* offset of data in page */
size_t iter_page_doff;
/* size of data in page */
size_t iter_page_dsize;
};
};

/* private */
abd_t *iter_abd; /* ABD being iterated through */
Expand Down Expand Up @@ -78,6 +97,7 @@ boolean_t abd_iter_at_end(struct abd_iter *);
void abd_iter_advance(struct abd_iter *, size_t);
void abd_iter_map(struct abd_iter *);
void abd_iter_unmap(struct abd_iter *);
void abd_iter_page(struct abd_iter *);

/*
* Helper macros
Expand Down
4 changes: 1 addition & 3 deletions module/os/freebsd/zfs/abd_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -417,10 +417,8 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
ASSERT(!abd_is_gang(abd));
abd_verify(abd);
memset(aiter, 0, sizeof (struct abd_iter));
aiter->iter_abd = abd;
aiter->iter_pos = 0;
aiter->iter_mapaddr = NULL;
aiter->iter_mapsize = 0;
}

/*
Expand Down
104 changes: 96 additions & 8 deletions module/os/linux/zfs/abd_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
/*
* Copyright (c) 2014 by Chunwei Chen. All rights reserved.
* Copyright (c) 2019 by Delphix. All rights reserved.
* Copyright (c) 2023, 2024, Klara Inc.
*/

/*
Expand Down Expand Up @@ -59,6 +60,7 @@
#include <sys/zfs_znode.h>
#ifdef _KERNEL
#include <linux/kmap_compat.h>
#include <linux/mm_compat.h>
#include <linux/scatterlist.h>
#endif

Expand Down Expand Up @@ -895,14 +897,9 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
{
ASSERT(!abd_is_gang(abd));
abd_verify(abd);
memset(aiter, 0, sizeof (struct abd_iter));
aiter->iter_abd = abd;
aiter->iter_mapaddr = NULL;
aiter->iter_mapsize = 0;
aiter->iter_pos = 0;
if (abd_is_linear(abd)) {
aiter->iter_offset = 0;
aiter->iter_sg = NULL;
} else {
if (!abd_is_linear(abd)) {
aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
}
Expand All @@ -915,6 +912,7 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd)
boolean_t
abd_iter_at_end(struct abd_iter *aiter)
{
ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
return (aiter->iter_pos == aiter->iter_abd->abd_size);
}

Expand All @@ -926,8 +924,15 @@ abd_iter_at_end(struct abd_iter *aiter)
void
abd_iter_advance(struct abd_iter *aiter, size_t amount)
{
/*
* Ensure that last chunk is not in use. abd_iterate_*() must clear
* this state (directly or abd_iter_unmap()) before advancing.
*/
ASSERT3P(aiter->iter_mapaddr, ==, NULL);
ASSERT0(aiter->iter_mapsize);
ASSERT3P(aiter->iter_page, ==, NULL);
ASSERT0(aiter->iter_page_doff);
ASSERT0(aiter->iter_page_dsize);

/* There's nothing left to advance to, so do nothing */
if (abd_iter_at_end(aiter))
Expand Down Expand Up @@ -1009,6 +1014,88 @@ abd_cache_reap_now(void)
}

#if defined(_KERNEL)
/*
* Yield the next page struct and data offset and size within it, without
* mapping it into the address space.
*/
void
abd_iter_page(struct abd_iter *aiter)
{
if (abd_iter_at_end(aiter)) {
aiter->iter_page = NULL;
aiter->iter_page_doff = 0;
aiter->iter_page_dsize = 0;
return;
}

struct page *page;
size_t doff, dsize;

if (abd_is_linear(aiter->iter_abd)) {
ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);

/* memory address at iter_pos */
void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;

/* struct page for address */
page = is_vmalloc_addr(paddr) ?
vmalloc_to_page(paddr) : virt_to_page(paddr);

/* offset of address within the page */
doff = offset_in_page(paddr);

/* total data remaining in abd from this position */
dsize = aiter->iter_abd->abd_size - aiter->iter_offset;
} else {
ASSERT(!abd_is_gang(aiter->iter_abd));

/* current scatter page */
page = sg_page(aiter->iter_sg);

/* position within page */
doff = aiter->iter_offset;

/* remaining data in scatterlist */
dsize = MIN(aiter->iter_sg->length - aiter->iter_offset,
aiter->iter_abd->abd_size - aiter->iter_pos);
}
ASSERT(page);

if (PageTail(page)) {
/*
* This page is part of a "compound page", which is a group of
* pages that can be referenced from a single struct page *.
* Its organised as a "head" page, followed by a series of
* "tail" pages.
*
* In OpenZFS, compound pages are allocated using the
* __GFP_COMP flag, which we get from scatter ABDs and SPL
* vmalloc slabs (ie >16K allocations). So a great many of the
* IO buffers we get are going to be of this type.
*
* The tail pages are just regular PAGE_SIZE pages, and can be
* safely used as-is. However, the head page has length
* covering itself and all the tail pages. If this ABD chunk
* spans multiple pages, then we can use the head page and a
* >PAGE_SIZE length, which is far more efficient.
*
* To do this, we need to adjust the offset to be counted from
* the head page. struct page for compound pages are stored
* contiguously, so we can just adjust by a simple offset.
*/
struct page *head = compound_head(page);
doff += ((page - head) * PAGESIZE);
page = head;
}

/* final page and position within it */
aiter->iter_page = page;
aiter->iter_page_doff = doff;

/* amount of data in the chunk, up to the end of the page */
aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
}

/*
* bio_nr_pages for ABD.
* @off is the offset in @abd
Expand Down Expand Up @@ -1163,4 +1250,5 @@ MODULE_PARM_DESC(zfs_abd_scatter_min_size,
module_param(zfs_abd_scatter_max_order, uint, 0644);
MODULE_PARM_DESC(zfs_abd_scatter_max_order,
"Maximum order allocation used for a scatter ABD.");
#endif

#endif /* _KERNEL */
42 changes: 42 additions & 0 deletions module/zfs/abd.c
Original file line number Diff line number Diff line change
Expand Up @@ -826,6 +826,48 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size,
return (ret);
}

#if defined(__linux__) && defined(_KERNEL)
/*
 * Call func once for each chunk of pages backing the range
 * [off, off + size) of abd, passing the struct page, the offset of the data
 * within it, the data length and the private pointer. Pages are never mapped.
 * Iteration stops early at the first non-zero return from func, and that
 * value is returned; otherwise returns 0.
 */
int
abd_iterate_page_func(abd_t *abd, size_t off, size_t size,
    abd_iter_page_func_t *func, void *private)
{
	struct abd_iter aiter;
	abd_t *c_abd;
	int err = 0;

	/* An empty range has nothing to visit. */
	if (size == 0)
		return (0);

	abd_verify(abd);
	ASSERT3U(off + size, <=, abd->abd_size);

	c_abd = abd_init_abd_iter(abd, &aiter, off);

	/* size is known to be non-zero here, so test it after each pass */
	do {
		size_t chunk;

		IMPLY(abd_is_gang(abd), c_abd != NULL);

		/* load the current page into the iterator */
		abd_iter_page(&aiter);

		chunk = MIN(aiter.iter_page_dsize, size);
		ASSERT3U(chunk, >, 0);

		err = func(aiter.iter_page, aiter.iter_page_doff,
		    chunk, private);

		/*
		 * Release the page state before the iterator is advanced;
		 * abd_iter_advance() asserts that it has been cleared.
		 */
		aiter.iter_page = NULL;
		aiter.iter_page_doff = 0;
		aiter.iter_page_dsize = 0;

		if (err != 0)
			break;

		size -= chunk;
		c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, chunk);
	} while (size > 0);

	return (err);
}
#endif

/*
 * Carries a single untyped buffer pointer.
 * NOTE(review): the users of this struct are outside the visible hunk;
 * presumably it is the private argument for the buffer-based iterator
 * callbacks that follow -- confirm against them.
 */
struct buf_arg {
void *arg_buf;
};
Expand Down

0 comments on commit 390b448

Please sign in to comment.