From e7c2d157e29cabd66518f9539e3395f3080bc8a0 Mon Sep 17 00:00:00 2001 From: Brian Date: Thu, 27 Feb 2020 13:06:02 -0700 Subject: [PATCH] Multi ABD Type Adding the multi ABD type, which allows for linear and scatter ABDs to be chained together into a single ABD. As part of this PR, the ABD code has been reorganized so the OS-dependent ABD code has been split into its own independent files. The shared ABD code is now under: module/zfs/abd.c With the independent OS code in: module/os/linux/zfs/abd_os.c module/os/freebsd/zfs/abd_os.c Signed-off-by: Brian Atkinson Co-authored-by: Mark Maybee Co-authored-by: Brian Atkinson --- include/sys/abd.h | 103 ++- lib/libzpool/Makefile.am | 1 + module/Makefile.bsd | 5 +- module/os/freebsd/zfs/abd_os.c | 589 ++++++++++++ module/os/linux/zfs/Makefile.in | 2 +- module/os/linux/zfs/{abd.c => abd_os.c} | 1086 +++++++---------------- module/os/linux/zfs/vdev_disk.c | 50 +- module/zfs/Makefile.in | 1 + module/{os/freebsd => }/zfs/abd.c | 712 ++++++--------- module/zfs/vdev_queue.c | 50 +- 10 files changed, 1333 insertions(+), 1266 deletions(-) create mode 100644 module/os/freebsd/zfs/abd_os.c rename module/os/linux/zfs/{abd.c => abd_os.c} (58%) rename module/{os/freebsd => }/zfs/abd.c (56%) diff --git a/include/sys/abd.h b/include/sys/abd.h index 82b73589bbe..603900741d6 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -36,19 +36,29 @@ extern "C" { #endif typedef enum abd_flags { - ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */ - ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */ - ABD_FLAG_META = 1 << 2, /* does this represent FS metadata? */ - ABD_FLAG_MULTI_ZONE = 1 << 3, /* pages split over memory zones */ - ABD_FLAG_MULTI_CHUNK = 1 << 4, /* pages split over multiple chunks */ - ABD_FLAG_LINEAR_PAGE = 1 << 5, /* linear but allocd from page */ + ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */ + ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? 
*/ + ABD_FLAG_META = 1 << 2, /* does this represent FS metadata? */ + ABD_FLAG_MULTI_ZONE = 1 << 3, /* pages split over memory zones */ + ABD_FLAG_MULTI_CHUNK = 1 << 4, /* pages split over multiple chunks */ + ABD_FLAG_LINEAR_PAGE = 1 << 5, /* linear but allocd from page */ + ABD_FLAG_MULTI_LIST = 1 << 6, /* mult ABDs chained together */ + ABD_FLAG_MULTI_FREE = 1 << 7, /* mult ABD is responsible for mem */ + ABD_FLAG_ZEROS = 1 << 8, /* ABD for zero-filled buffer */ } abd_flags_t; +typedef enum abd_stats_op { + ABDSTAT_INCR, /* Increase abdstat values */ + ABDSTAT_DECR /* Decrease abdstat values */ +} abd_stats_op_t; + typedef struct abd { abd_flags_t abd_flags; uint_t abd_size; /* excludes scattered abd_offset */ + list_node_t abd_multi_link; struct abd *abd_parent; zfs_refcount_t abd_children; + kmutex_t abd_mtx; union { struct abd_scatter { uint_t abd_offset; @@ -64,13 +74,37 @@ typedef struct abd { void *abd_buf; struct scatterlist *abd_sgl; /* for LINEAR_PAGE */ } abd_linear; + struct abd_multi { + list_t abd_chain; + } abd_multi; } abd_u; } abd_t; +#if defined(_KERNEL) +unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t); +unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t); +#endif + +struct scatterlist; /* forward declaration */ + +struct abd_iter { + /* public interface */ + void *iter_mapaddr; /* addr corresponding to iter_pos */ + size_t iter_mapsize; /* length of data valid at mapaddr */ + + /* private */ + abd_t *iter_abd; /* ABD being iterated through */ + size_t iter_pos; + size_t iter_offset; /* offset in current sg/abd_buf, */ + /* abd_offset included */ + struct scatterlist *iter_sg; /* current sg */ +}; + typedef int abd_iter_func_t(void *buf, size_t len, void *private); typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *private); extern int zfs_abd_scatter_enabled; +extern abd_t *abd_zero_scatter; static inline boolean_t abd_is_linear(abd_t *abd) @@ -78,6 +112,13 @@ abd_is_linear(abd_t *abd) 
return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0 ? B_TRUE : B_FALSE); } +static inline boolean_t +abd_is_multi(abd_t *abd) +{ + return ((abd->abd_flags & ABD_FLAG_MULTI_LIST) != 0 ? B_TRUE : + B_FALSE); +} + static inline boolean_t abd_is_linear_page(abd_t *abd) { @@ -91,13 +132,16 @@ abd_is_linear_page(abd_t *abd) abd_t *abd_alloc(size_t, boolean_t); abd_t *abd_alloc_linear(size_t, boolean_t); +abd_t *abd_alloc_multi(void); abd_t *abd_alloc_for_io(size_t, boolean_t); abd_t *abd_alloc_sametype(abd_t *, size_t); +void abd_add_child(abd_t *, abd_t *, boolean_t); +abd_t *abd_find_child_off(abd_t *, size_t *); void abd_free(abd_t *); abd_t *abd_get_offset(abd_t *, size_t); abd_t *abd_get_offset_size(abd_t *, size_t, size_t); +abd_t *abd_get_zeros(size_t); abd_t *abd_get_from_buf(void *, size_t); -void abd_put(abd_t *); /* * Conversion to and from a normal buffer @@ -124,12 +168,7 @@ void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); int abd_cmp(abd_t *, abd_t *); int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t); void abd_zero_off(abd_t *, size_t, size_t); - -#if defined(_KERNEL) -unsigned int abd_scatter_bio_map_off(struct bio *, abd_t *, unsigned int, - size_t); -unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t); -#endif +void abd_verify(abd_t *); void abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, ssize_t csize, ssize_t dsize, const unsigned parity, @@ -174,13 +213,51 @@ abd_zero(abd_t *abd, size_t size) abd_zero_off(abd, 0, size); } +/* + * OS specific functions + */ + +void abd_put(abd_t *); +abd_t *abd_alloc_struct(size_t size); +void abd_free_struct(abd_t *abd); +void abd_alloc_pages(abd_t *abd, size_t size); +void abd_free_pages(abd_t *abd); +boolean_t abd_size_alloc_linear(size_t size); +void abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op); +void abd_update_linear_stats(abd_t *abd, abd_stats_op_t op); +void abd_verify_scatter(abd_t *abd); +void abd_free_linear_page(abd_t *abd); +abd_t *abd_get_offset_impl(abd_t *sabd, 
size_t off, size_t size); +void abd_enter_critical(unsigned long flags); +void abd_exit_critical(unsigned long flags); +/* OS specific abd_iter functions */ +void abd_iter_init(struct abd_iter *aiter, abd_t *abd); +boolean_t abd_iter_at_end(struct abd_iter *aiter); +void abd_iter_advance(struct abd_iter *aiter, size_t amount); +void abd_iter_map(struct abd_iter *aiter); +void abd_iter_unmap(struct abd_iter *aiter); + /* * Module lifecycle + * Defined in each specific OS's abd.c */ void abd_init(void); void abd_fini(void); +/* + * Helper macros + */ +#define ABDSTAT(stat) (abd_stats.stat.value.ui64) +#define ABDSTAT_INCR(stat, val) \ + atomic_add_64(&abd_stats.stat.value.ui64, (val)) +#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) +#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) + +#define ABD_SCATTER(abd) (abd->abd_u.abd_scatter) +#define ABD_LINEAR_BUF(abd) (abd->abd_u.abd_linear.abd_buf) +#define ABD_MULTI(abd) (abd->abd_u.abd_multi) + #ifdef __cplusplus } #endif diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index a9396105bc6..0e6a1058ec2 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -39,6 +39,7 @@ KERNEL_C = \ zpool_prop.c \ zprop_common.c \ abd.c \ + abd_os.c \ aggsum.c \ arc.c \ arc_os.c \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 6d76796f51e..92b5c1906c1 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -127,7 +127,7 @@ SRCS+= spl_atomic.c .endif #os/freebsd/zfs -SRCS+= abd.c \ +SRCS+= abd_os.c \ crypto_os.c \ dmu_os.c \ hkdf.c \ @@ -169,7 +169,8 @@ SRCS+= zfeature_common.c \ zprop_common.c #zfs -SRCS+= aggsum.c \ +SRCS+= abd.c \ + aggsum.c \ arc.c \ arc_os.c \ blkptr.c \ diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c new file mode 100644 index 00000000000..637f9acecb5 --- /dev/null +++ b/module/os/freebsd/zfs/abd_os.c @@ -0,0 +1,589 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution 
License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +/* + * ARC buffer data (ABD). + * + * ABDs are an abstract data structure for the ARC which can use two + * different ways of storing the underlying data: + * + * (a) Linear buffer. In this case, all the data in the ABD is stored in one + * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache). + * + * +-------------------+ + * | ABD (linear) | + * | abd_flags = ... | + * | abd_size = ... | +--------------------------------+ + * | abd_buf ------------->| raw buffer of size abd_size | + * +-------------------+ +--------------------------------+ + * no abd_chunks + * + * (b) Scattered buffer. In this case, the data in the ABD is split into + * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers + * to the chunks recorded in an array at the end of the ABD structure. + * + * +-------------------+ + * | ABD (scattered) | + * | abd_flags = ... | + * | abd_size = ... | + * | abd_offset = 0 | +-----------+ + * | abd_chunks[0] ----------------------------->| chunk 0 | + * | abd_chunks[1] ---------------------+ +-----------+ + * | ... | | +-----------+ + * | abd_chunks[N-1] ---------+ +------->| chunk 1 | + * +-------------------+ | +-----------+ + * | ... + * | +-----------+ + * +----------------->| chunk N-1 | + * +-----------+ + * + * Using a large proportion of scattered ABDs decreases ARC fragmentation since + * when we are at the limit of allocatable space, using equal-size chunks will + * allow us to quickly reclaim enough space for a new large allocation (assuming + * it is also scattered). 
+ * + * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to + * B_FALSE. + */ + +#include +#include +#include +#include +#include + +typedef struct abd_stats { + kstat_named_t abdstat_struct_size; + kstat_named_t abdstat_scatter_cnt; + kstat_named_t abdstat_scatter_data_size; + kstat_named_t abdstat_scatter_chunk_waste; + kstat_named_t abdstat_linear_cnt; + kstat_named_t abdstat_linear_data_size; +} abd_stats_t; + +static abd_stats_t abd_stats = { + /* Amount of memory occupied by all of the abd_t struct allocations */ + { "struct_size", KSTAT_DATA_UINT64 }, + /* + * The number of scatter ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset()). + */ + { "scatter_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ + { "scatter_data_size", KSTAT_DATA_UINT64 }, + /* + * The amount of space wasted at the end of the last chunk across all + * scatter ABDs tracked by scatter_cnt. + */ + { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, + /* + * The number of linear ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset() and abd_get_from_buf()). If an + * ABD takes ownership of its buf then it will become tracked. + */ + { "linear_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all linear ABDs tracked by linear_cnt */ + { "linear_data_size", KSTAT_DATA_UINT64 }, +}; + +/* + * It is possible to make all future ABDs be linear by setting this to B_FALSE. + * Otherwise, ABDs are allocated scattered by default unless the caller uses + * abd_alloc_linear(). + */ +boolean_t zfs_abd_scatter_enabled = B_TRUE; + +/* + * The size of the chunks ABD allocates. Because the sizes allocated from the + * kmem_cache can't change, this tunable can only be modified at boot. 
Changing + * it at runtime would cause ABD iteration to work incorrectly for ABDs which + * were allocated with the old size, so a safeguard has been put in place which + * will cause the machine to panic if you change it and try to access the data + * within a scattered ABD. + */ +size_t zfs_abd_chunk_size = 4096; + +#if defined(_KERNEL) +SYSCTL_DECL(_vfs_zfs); + +SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN, + &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers"); +SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN, + &zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates"); +#endif + +kmem_cache_t *abd_chunk_cache; +static kstat_t *abd_ksp; + +/* + * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose chunks are + * just a single zero'd sized zfs_abd_chunk_size buffer. This + * allows us to conserve memory by only using a single zero buffer + * for the scatter chunks. + */ +abd_t *abd_zero_scatter = NULL; +static char *abd_zero_buf = NULL; + +#define ABD_ZERO_PAGE (abd_zero_buf) + +extern inline boolean_t abd_is_linear(abd_t *abd); +extern inline void abd_copy(abd_t *dabd, abd_t *sabd, size_t size); +extern inline void abd_copy_from_buf(abd_t *abd, const void *buf, size_t size); +extern inline void abd_copy_to_buf(void* buf, abd_t *abd, size_t size); +extern inline int abd_cmp_buf(abd_t *abd, const void *buf, size_t size); +extern inline void abd_zero(abd_t *abd, size_t size); + +static void +abd_free_chunk(void *c) +{ + kmem_cache_free(abd_chunk_cache, c); +} + +static inline size_t +abd_chunkcnt_for_bytes(size_t size) +{ + return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size); +} + +static inline size_t +abd_scatter_chunkcnt(abd_t *abd) +{ + ASSERT(!abd_is_linear(abd)); + return (abd_chunkcnt_for_bytes( + ABD_SCATTER(abd).abd_offset + abd->abd_size)); +} + +boolean_t +abd_size_alloc_linear(size_t size) +{ + return (size <= zfs_abd_chunk_size ? 
B_TRUE : B_FALSE); +} + +void +abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) +{ + size_t n = abd_scatter_chunkcnt(abd); + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); + if (op == ABDSTAT_INCR) { + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + n * zfs_abd_chunk_size - abd->abd_size); + } else { + ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + abd->abd_size - n * zfs_abd_chunk_size); + } +} + +void +abd_update_linear_stats(abd_t *abd, abd_stats_op_t op) +{ + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); + if (op == ABDSTAT_INCR) { + ABDSTAT_BUMP(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); + } else { + ABDSTAT_BUMPDOWN(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); + } +} + + +void +abd_verify_scatter(abd_t *abd) +{ + ASSERT3U(ABD_SCATTER(abd).abd_offset, <, + zfs_abd_chunk_size); + size_t n = abd_scatter_chunkcnt(abd); + for (int i = 0; i < n; i++) { + ASSERT3P( + ABD_SCATTER(abd).abd_chunks[i], !=, NULL); + } +} + +void +abd_alloc_pages(abd_t *abd, size_t size) +{ + size_t n = abd_chunkcnt_for_bytes(size); + for (int i = 0; i < n; i++) { + void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); + ASSERT3P(c, !=, NULL); + ABD_SCATTER(abd).abd_chunks[i] = c; + } + ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size; +} + +void +abd_free_pages(abd_t *abd) +{ + size_t n = abd_scatter_chunkcnt(abd); + for (int i = 0; i < n; i++) { + abd_free_chunk(ABD_SCATTER(abd).abd_chunks[i]); + } +} + +abd_t * +abd_alloc_struct(size_t size) +{ + size_t n = abd_chunkcnt_for_bytes(size); + size_t abd_size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[n]); + abd_t *abd = kmem_alloc(abd_size, KM_PUSHPAGE); + ASSERT3P(abd, !=, NULL); + list_link_init(&abd->abd_multi_link); + mutex_init(&abd->abd_mtx, 
NULL, MUTEX_DEFAULT, NULL); + ABDSTAT_INCR(abdstat_struct_size, abd_size); + + return (abd); +} + +void +abd_free_struct(abd_t *abd) +{ + size_t chunkcnt = abd_is_linear(abd) ? 0 : abd_scatter_chunkcnt(abd); + int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); + mutex_destroy(&abd->abd_mtx); + ASSERT(!list_link_active(&abd->abd_multi_link)); + kmem_free(abd, size); + ABDSTAT_INCR(abdstat_struct_size, -size); +} + +/* + * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where + * each chunk pointer will be set to ABD_ZERO_PAGE. + */ +static void +abd_alloc_zero_scatter(void) +{ + size_t n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); + abd_zero_buf = kmem_zalloc(zfs_abd_chunk_size, KM_SLEEP); + abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); + + abd_zero_scatter->abd_flags = ABD_FLAG_OWNER; + abd_zero_scatter->abd_flags |= ABD_FLAG_ZEROS; + abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; + abd_zero_scatter->abd_parent = NULL; + zfs_refcount_create(&abd_zero_scatter->abd_children); + + ABD_SCATTER(abd_zero_scatter).abd_offset = 0; + ABD_SCATTER(abd_zero_scatter).abd_chunk_size = + zfs_abd_chunk_size; + + for (int i = 0; i < n; i++) { + ABD_SCATTER(abd_zero_scatter).abd_chunks[i] = + ABD_ZERO_PAGE; + } + + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, zfs_abd_chunk_size); +} + +static void +abd_free_zero_scatter(void) +{ + zfs_refcount_destroy(&abd_zero_scatter->abd_children); + ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)zfs_abd_chunk_size); + + abd_free_struct(abd_zero_scatter); + abd_zero_scatter = NULL; + kmem_free(abd_zero_buf, zfs_abd_chunk_size); +} + +void +abd_init(void) +{ + abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0, + NULL, NULL, NULL, NULL, 0, KMC_NOTOUCH | KMC_NODEBUG); + + abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, + sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (abd_ksp != NULL) { + 
abd_ksp->ks_data = &abd_stats; + kstat_install(abd_ksp); + } + + abd_alloc_zero_scatter(); +} + +void +abd_fini(void) +{ + abd_free_zero_scatter(); + + if (abd_ksp != NULL) { + kstat_delete(abd_ksp); + abd_ksp = NULL; + } + + kmem_cache_destroy(abd_chunk_cache); + abd_chunk_cache = NULL; +} + +void +abd_free_linear_page(abd_t *abd) +{ + /* + * FreeBSD does not have scatter linear pages, + * so reaching this function is an error. + */ + ASSERT(!"cannot reach"); +} + +/* + * If we're going to use this ABD for doing I/O using the block layer, the + * consumer of the ABD data doesn't care if it's scattered or not, and we don't + * plan to store this ABD in memory for a long period of time, we should + * allocate the ABD type that requires the least data copying to do the I/O. + * + * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os + * using a scatter/gather list we should switch to that and replace this call + * with vanilla abd_alloc(). + */ +abd_t * +abd_alloc_for_io(size_t size, boolean_t is_metadata) +{ + return (abd_alloc_linear(size, is_metadata)); +} + +/* + * Allocate a new ABD to point to offset off of sabd. It shares the underlying + * buffer data with sabd. Use abd_put() to free. sabd must not be freed while + * any derived ABDs exist. + */ +/* ARGSUSED */ +abd_t * +abd_get_offset_impl(abd_t *sabd, size_t off, size_t size) +{ + abd_t *abd; + + abd_verify(sabd); + ASSERT3U(off, <=, sabd->abd_size); + + if (abd_is_linear(sabd)) { + abd = abd_alloc_struct(0); + + /* + * Even if this buf is filesystem metadata, we only track that + * if we own the underlying data buffer, which is not true in + * this case. Therefore, we don't ever use ABD_FLAG_META here. 
+ */ + abd->abd_flags = ABD_FLAG_LINEAR; + + ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off; + } else if (abd_is_multi(sabd)) { + size_t left = size; + abd = abd_alloc_multi(); + + for (abd_t *cabd = abd_find_child_off(sabd, &off); + cabd != NULL && left > 0; + cabd = list_next(&ABD_MULTI(sabd).abd_chain, cabd)) { + int csize = MIN(left, cabd->abd_size - off); + + abd_t *nabd = abd_get_offset_impl(cabd, off, csize); + abd_add_child(abd, nabd, B_TRUE); + left -= csize; + off = 0; + } + ASSERT3U(left, ==, 0); + } else { + size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; + size_t chunkcnt = abd_scatter_chunkcnt(sabd) - + (new_offset / zfs_abd_chunk_size); + + abd = abd_alloc_struct(chunkcnt * zfs_abd_chunk_size); + + /* + * Even if this buf is filesystem metadata, we only track that + * if we own the underlying data buffer, which is not true in + * this case. Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = 0; + + ABD_SCATTER(abd).abd_offset = new_offset % zfs_abd_chunk_size; + ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size; + + /* Copy the scatterlist starting at the correct offset */ + (void) memcpy(&ABD_SCATTER(abd).abd_chunks, + &ABD_SCATTER(sabd).abd_chunks[new_offset / + zfs_abd_chunk_size], + chunkcnt * sizeof (void *)); + } + + if (size == 0) + abd->abd_size = sabd->abd_size - off; + else + abd->abd_size = size; + abd->abd_parent = sabd; + zfs_refcount_create(&abd->abd_children); + (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd); + + return (abd); +} + +abd_t * +abd_get_offset(abd_t *sabd, size_t off) +{ + + return (abd_get_offset_impl(sabd, off, 0)); +} + +/* + * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not + * free the underlying scatterlist or buffer. 
+ */ +void +abd_put(abd_t *abd) +{ + if (abd == NULL) + return; + abd_verify(abd); + ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); + + if (abd->abd_parent != NULL) { + (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children, + abd->abd_size, abd); + } + + zfs_refcount_destroy(&abd->abd_children); + abd_free_struct(abd); +} + +static inline size_t +abd_iter_scatter_chunk_offset(struct abd_iter *aiter) +{ + ASSERT(!abd_is_linear(aiter->iter_abd)); + return ((ABD_SCATTER(aiter->iter_abd).abd_offset + + aiter->iter_pos) % zfs_abd_chunk_size); +} + +static inline size_t +abd_iter_scatter_chunk_index(struct abd_iter *aiter) +{ + ASSERT(!abd_is_linear(aiter->iter_abd)); + return ((ABD_SCATTER(aiter->iter_abd).abd_offset + + aiter->iter_pos) / zfs_abd_chunk_size); +} + +/* + * Initialize the abd_iter. + */ +void +abd_iter_init(struct abd_iter *aiter, abd_t *abd) +{ + ASSERT(!abd_is_multi(abd)); + abd_verify(abd); + aiter->iter_abd = abd; + aiter->iter_pos = 0; + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +/* + * This is just a helper function to see if we have exhausted the + * abd_iter and reached the end. + */ +boolean_t +abd_iter_at_end(struct abd_iter *aiter) +{ + return (aiter->iter_pos == aiter->iter_abd->abd_size); +} + +/* + * Advance the iterator by a certain amount. Cannot be called when a chunk is + * in use. This can be safely called when the aiter has already exhausted, in + * which case this does nothing. + */ +void +abd_iter_advance(struct abd_iter *aiter, size_t amount) +{ + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* There's nothing left to advance to, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + aiter->iter_pos += amount; +} + +/* + * Map the current chunk into aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. 
+ */ +void +abd_iter_map(struct abd_iter *aiter) +{ + void *paddr; + size_t offset = 0; + + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* Panic if someone has changed zfs_abd_chunk_size */ + IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size == + ABD_SCATTER(aiter->iter_abd).abd_chunk_size); + + /* There's nothing left to iterate over, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + if (abd_is_linear(aiter->iter_abd)) { + offset = aiter->iter_pos; + aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; + paddr = ABD_LINEAR_BUF(aiter->iter_abd); + } else { + size_t index = abd_iter_scatter_chunk_index(aiter); + offset = abd_iter_scatter_chunk_offset(aiter); + aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset, + aiter->iter_abd->abd_size - aiter->iter_pos); + paddr = ABD_SCATTER(aiter->iter_abd).abd_chunks[index]; + } + aiter->iter_mapaddr = (char *)paddr + offset; +} + +/* + * Unmap the current chunk from aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. 
+ */ +void +abd_iter_unmap(struct abd_iter *aiter) +{ + /* There's nothing left to unmap, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + ASSERT3P(aiter->iter_mapaddr, !=, NULL); + ASSERT3U(aiter->iter_mapsize, >, 0); + + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +void +abd_enter_critical(unsigned long flags) +{ + critical_enter(); +} + +void +abd_exit_critical(unsigned long flags) +{ + critical_exit(); +} diff --git a/module/os/linux/zfs/Makefile.in b/module/os/linux/zfs/Makefile.in index 8c11a1ee6f5..cb4edbbc1a3 100644 --- a/module/os/linux/zfs/Makefile.in +++ b/module/os/linux/zfs/Makefile.in @@ -7,7 +7,7 @@ ccflags-$(CONFIG_SPARC64) += -Wno-unused-value ccflags-y += -I@abs_top_srcdir@/module/os/linux/zfs -$(MODULE)-objs += ../os/linux/zfs/abd.o +$(MODULE)-objs += ../os/linux/zfs/abd_os.o $(MODULE)-objs += ../os/linux/zfs/arc_os.o $(MODULE)-objs += ../os/linux/zfs/mmp_os.o $(MODULE)-objs += ../os/linux/zfs/policy.o diff --git a/module/os/linux/zfs/abd.c b/module/os/linux/zfs/abd_os.c similarity index 58% rename from module/os/linux/zfs/abd.c rename to module/os/linux/zfs/abd_os.c index bc6f81000d4..e8a8228f485 100644 --- a/module/os/linux/zfs/abd.c +++ b/module/os/linux/zfs/abd_os.c @@ -85,35 +85,6 @@ * * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to * B_FALSE. - * - * In addition to directly allocating a linear or scattered ABD, it is also - * possible to create an ABD by requesting the "sub-ABD" starting at an offset - * within an existing ABD. In linear buffers this is simple (set abd_buf of - * the new ABD to the starting point within the original raw buffer), but - * scattered ABDs are a little more complex. The new ABD makes a copy of the - * relevant abd_chunks pointers (but not the underlying data). 
However, to - * provide arbitrary rather than only chunk-aligned starting offsets, it also - * tracks an abd_offset field which represents the starting point of the data - * within the first chunk in abd_chunks. For both linear and scattered ABDs, - * creating an offset ABD marks the original ABD as the offset's parent, and the - * original ABD's abd_children refcount is incremented. This data allows us to - * ensure the root ABD isn't deleted before its children. - * - * Most consumers should never need to know what type of ABD they're using -- - * the ABD public API ensures that it's possible to transparently switch from - * using a linear ABD to a scattered one when doing so would be beneficial. - * - * If you need to use the data within an ABD directly, if you know it's linear - * (because you allocated it) you can use abd_to_buf() to access the underlying - * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions - * which will allocate a raw buffer if necessary. Use the abd_return_buf* - * functions to return any raw buffers that are no longer necessary when you're - * done using them. - * - * There are a variety of ABD APIs that implement basic buffer operations: - * compare, copy, read, write, and fill with zeroes. If you need a custom - * function which progressively accesses the whole ABD, use the abd_iterate_* - * functions. 
*/ #include @@ -122,8 +93,8 @@ #include #include #ifdef _KERNEL -#include #include +#include #else #define MAX_ORDER 1 #endif @@ -196,14 +167,6 @@ static abd_stats_t abd_stats = { { "scatter_sg_table_retry", KSTAT_DATA_UINT64 }, }; -#define ABDSTAT(stat) (abd_stats.stat.value.ui64) -#define ABDSTAT_INCR(stat, val) \ - atomic_add_64(&abd_stats.stat.value.ui64, (val)) -#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) -#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) - -#define ABD_SCATTER(abd) (abd->abd_u.abd_scatter) -#define ABD_BUF(abd) (abd->abd_u.abd_linear.abd_buf) #define abd_for_each_sg(abd, sg, n, i) \ for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i) @@ -235,6 +198,13 @@ unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1; */ int zfs_abd_scatter_min_size = 512 * 3; +/* + * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose pages are + * just a single zero'd page. This allows us to conserve memory by + * only using a single zero page for the scatterlist. + */ +abd_t *abd_zero_scatter = NULL; + static kmem_cache_t *abd_cache = NULL; static kstat_t *abd_ksp; @@ -244,6 +214,27 @@ abd_chunkcnt_for_bytes(size_t size) return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE); } +abd_t * +abd_alloc_struct(size_t size) +{ + abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE); + ASSERT3P(abd, !=, NULL); + list_link_init(&abd->abd_multi_link); + mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL); + ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t)); + + return (abd); +} + +void +abd_free_struct(abd_t *abd) +{ + mutex_destroy(&abd->abd_mtx); + ASSERT(!list_link_active(&abd->abd_multi_link)); + kmem_cache_free(abd_cache, abd); + ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t)); +} + #ifdef _KERNEL /* * Mark zfs data pages so they can be excluded from kernel crash dumps @@ -284,7 +275,7 @@ abd_unmark_zfs_page(struct page *page) * reclaim or compaction. 
When necessary this function will degenerate to * allocating individual pages and allowing reclaim to satisfy allocations. */ -static void +void abd_alloc_pages(abd_t *abd, size_t size) { struct list_head pages; @@ -383,8 +374,7 @@ abd_alloc_pages(abd_t *abd, size_t size) abd->abd_flags |= ABD_FLAG_LINEAR; abd->abd_flags |= ABD_FLAG_LINEAR_PAGE; abd->abd_u.abd_linear.abd_sgl = table.sgl; - abd->abd_u.abd_linear.abd_buf = - page_address(sg_page(table.sgl)); + ABD_LINEAR_BUF(abd) = page_address(sg_page(table.sgl)); } else if (table.nents > 1) { ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; @@ -404,7 +394,7 @@ abd_alloc_pages(abd_t *abd, size_t size) * makes no attempt to request contiguous pages and requires the minimal * number of kernel interfaces. It's designed for maximum compatibility. */ -static void +void abd_alloc_pages(abd_t *abd, size_t size) { struct scatterlist *sg = NULL; @@ -441,11 +431,24 @@ abd_alloc_pages(abd_t *abd, size_t size) } #endif /* !CONFIG_HIGHMEM */ +/* + * This must be called if any of the sg_table allocation functions + * are called. 
+ */ static void +abd_free_sg_table(abd_t *abd) +{ + struct sg_table table; + + table.sgl = ABD_SCATTER(abd).abd_sgl; + table.nents = table.orig_nents = ABD_SCATTER(abd).abd_nents; + sg_free_table(&table); +} + +void abd_free_pages(abd_t *abd) { struct scatterlist *sg = NULL; - struct sg_table table; struct page *page; int nr_pages = ABD_SCATTER(abd).abd_nents; int order, i = 0; @@ -464,14 +467,62 @@ abd_free_pages(abd_t *abd) ASSERT3U(sg->length, <=, PAGE_SIZE << order); ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]); } + abd_free_sg_table(abd); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + (int)abd->abd_size - (int)P2ROUNDUP(abd->abd_size, PAGESIZE)); +} - table.sgl = ABD_SCATTER(abd).abd_sgl; - table.nents = table.orig_nents = nr_pages; - sg_free_table(&table); +#define ABD_ZERO_PAGE (ZERO_PAGE(0)) + +/* + * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where + * each page in the scatterlist will be set to ABD_ZERO_PAGE. + */ +static void +abd_alloc_zero_scatter(void) +{ + struct scatterlist *sg = NULL; + struct sg_table table; + gfp_t gfp = __GFP_NOWARN | GFP_NOIO; + int nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); + int i = 0; + + while (sg_alloc_table(&table, nr_pages, gfp)) { + ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); + schedule_timeout_interruptible(1); + } + ASSERT3U(table.nents, ==, nr_pages); + + abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); + abd_zero_scatter->abd_flags = ABD_FLAG_OWNER; + ABD_SCATTER(abd_zero_scatter).abd_offset = 0; + ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl; + ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages; + abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; + abd_zero_scatter->abd_parent = NULL; + abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK + | ABD_FLAG_ZEROS; + zfs_refcount_create(&abd_zero_scatter->abd_children); + + abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) { + sg_set_page(sg, ABD_ZERO_PAGE, PAGESIZE, 0); + } + + ABDSTAT_BUMP(abdstat_scatter_cnt); + 
ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE); + ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); } #else /* _KERNEL */ +/* + * In the kernel the macro ZERO_PAGE(0) returns a global shared page + * that is always zero. In the case of user space we will just + * return the allocated zero'd page abd_zero_buf. + */ +static char *abd_zero_buf = NULL; +#define ABD_ZERO_PAGE ((struct page *)abd_zero_buf) + #ifndef PAGE_SHIFT #define PAGE_SHIFT (highbit64(PAGESIZE)-1) #endif @@ -498,6 +549,18 @@ sg_init_table(struct scatterlist *sg, int nr) sg[nr - 1].end = 1; } +/* + * This must be called if any of the sg_table allocation functions + * are called. + */ +static void +abd_free_sg_table(abd_t *abd) +{ + int nents = ABD_SCATTER(abd).abd_nents; + vmem_free(ABD_SCATTER(abd).abd_sgl, + nents * sizeof (struct scatterlist)); +} + #define for_each_sg(sgl, sg, nr, i) \ for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg)) @@ -526,7 +589,7 @@ sg_next(struct scatterlist *sg) return (sg + 1); } -static void +void abd_alloc_pages(abd_t *abd, size_t size) { unsigned nr_pages = abd_chunkcnt_for_bytes(size); @@ -544,7 +607,7 @@ abd_alloc_pages(abd_t *abd, size_t size) ABD_SCATTER(abd).abd_nents = nr_pages; } -static void +void abd_free_pages(abd_t *abd) { int i, n = ABD_SCATTER(abd).abd_nents; @@ -556,11 +619,112 @@ abd_free_pages(abd_t *abd) umem_free(p, PAGESIZE); } } + abd_free_sg_table(abd); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + (int)abd->abd_size - (int)P2ROUNDUP(abd->abd_size, PAGESIZE)); +} - vmem_free(ABD_SCATTER(abd).abd_sgl, n * sizeof (struct scatterlist)); +static void +abd_alloc_zero_scatter(void) +{ + unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); + struct scatterlist *sg; + int i; + + abd_zero_buf = umem_zalloc(PAGESIZE, KM_SLEEP); + abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); + abd_zero_scatter->abd_flags = ABD_FLAG_OWNER; + abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK + | ABD_FLAG_ZEROS; + 
ABD_SCATTER(abd_zero_scatter).abd_offset = 0; + ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages; + abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; + abd_zero_scatter->abd_parent = NULL; + zfs_refcount_create(&abd_zero_scatter->abd_children); + ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages * + sizeof (struct scatterlist), KM_SLEEP); + + sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages); + + abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) { + sg_set_page(sg, ABD_ZERO_PAGE, PAGESIZE, 0); + } + + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE); + ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); +} + +#endif /* _KERNEL */ + +boolean_t +abd_size_alloc_linear(size_t size) +{ + return (size < zfs_abd_scatter_min_size ? B_TRUE : B_FALSE); } +void +abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) +{ + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); + if (op == ABDSTAT_INCR) { + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size); + } else { + ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + (int)abd->abd_size + -(int)P2ROUNDUP(abd->abd_size, PAGESIZE)); + } +} + +void +abd_update_linear_stats(abd_t *abd, abd_stats_op_t op) +{ + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); + if (op == ABDSTAT_INCR) { + ABDSTAT_BUMP(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); + } else { + ABDSTAT_BUMPDOWN(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); + } +} + +void +abd_verify_scatter(abd_t *abd) +{ + size_t n; + int i = 0; + struct scatterlist *sg = NULL; + + ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0); + ASSERT3U(ABD_SCATTER(abd).abd_offset, <, + ABD_SCATTER(abd).abd_sgl->length); + n = 
ABD_SCATTER(abd).abd_nents; + abd_for_each_sg(abd, sg, n, i) { + ASSERT3P(sg_page(sg), !=, NULL); + } +} + +static void +abd_free_zero_scatter(void) +{ + zfs_refcount_destroy(&abd_zero_scatter->abd_children); + ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGESIZE); + ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); + + abd_free_sg_table(abd_zero_scatter); + abd_free_struct(abd_zero_scatter); + abd_zero_scatter = NULL; +#if !defined(_KERNEL) + umem_free(abd_zero_buf, PAGESIZE); #endif /* _KERNEL */ +} void abd_init(void) @@ -582,11 +746,15 @@ abd_init(void) abd_ksp->ks_data = &abd_stats; kstat_install(abd_ksp); } + + abd_alloc_zero_scatter(); } void abd_fini(void) { + abd_free_zero_scatter(); + if (abd_ksp != NULL) { kstat_delete(abd_ksp); abd_ksp = NULL; @@ -598,189 +766,23 @@ abd_fini(void) } } -static inline void -abd_verify(abd_t *abd) -{ - ASSERT3U(abd->abd_size, >, 0); - ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); - ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | - ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | - ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE)); - IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); - IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); - if (abd_is_linear(abd)) { - ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL); - } else { - size_t n; - int i = 0; - struct scatterlist *sg = NULL; - - ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0); - ASSERT3U(ABD_SCATTER(abd).abd_offset, <, - ABD_SCATTER(abd).abd_sgl->length); - n = ABD_SCATTER(abd).abd_nents; - abd_for_each_sg(abd, sg, n, i) { - ASSERT3P(sg_page(sg), !=, NULL); - } - } -} - -static inline abd_t * -abd_alloc_struct(void) -{ - abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE); - - ASSERT3P(abd, !=, NULL); - ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t)); - - return (abd); -} - -static inline void -abd_free_struct(abd_t *abd) -{ - kmem_cache_free(abd_cache, abd); - 
ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t)); -} - -/* - * Allocate an ABD, along with its own underlying data buffers. Use this if you - * don't care whether the ABD is linear or not. - */ -abd_t * -abd_alloc(size_t size, boolean_t is_metadata) -{ - /* see the comment above zfs_abd_scatter_min_size */ - if (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size) - return (abd_alloc_linear(size, is_metadata)); - - VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - - abd_t *abd = abd_alloc_struct(); - abd->abd_flags = ABD_FLAG_OWNER; - abd->abd_u.abd_scatter.abd_offset = 0; - abd_alloc_pages(abd, size); - - if (is_metadata) { - abd->abd_flags |= ABD_FLAG_META; - } - abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); - - ABDSTAT_BUMP(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, size); - ABDSTAT_INCR(abdstat_scatter_chunk_waste, - P2ROUNDUP(size, PAGESIZE) - size); - - return (abd); -} - -static void -abd_free_scatter(abd_t *abd) +void +abd_free_linear_page(abd_t *abd) { + /* Transform it back into a scatter ABD for freeing */ + struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl; + abd->abd_flags &= ~ABD_FLAG_LINEAR; + abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE; + ABD_SCATTER(abd).abd_nents = 1; + ABD_SCATTER(abd).abd_offset = 0; + ABD_SCATTER(abd).abd_sgl = sg; abd_free_pages(abd); zfs_refcount_destroy(&abd->abd_children); - ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); - ABDSTAT_INCR(abdstat_scatter_chunk_waste, - (int)abd->abd_size - (int)P2ROUNDUP(abd->abd_size, PAGESIZE)); - + abd_update_scatter_stats(abd, ABDSTAT_DECR); abd_free_struct(abd); } -/* - * Allocate an ABD that must be linear, along with its own underlying data - * buffer. Only use this when it would be very annoying to write your ABD - * consumer with a scattered ABD. 
- */ -abd_t * -abd_alloc_linear(size_t size, boolean_t is_metadata) -{ - abd_t *abd = abd_alloc_struct(); - - VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - - abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER; - if (is_metadata) { - abd->abd_flags |= ABD_FLAG_META; - } - abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); - - if (is_metadata) { - abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size); - } else { - abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size); - } - - ABDSTAT_BUMP(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, size); - - return (abd); -} - -static void -abd_free_linear(abd_t *abd) -{ - if (abd_is_linear_page(abd)) { - /* Transform it back into a scatter ABD for freeing */ - struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl; - abd->abd_flags &= ~ABD_FLAG_LINEAR; - abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE; - ABD_SCATTER(abd).abd_nents = 1; - ABD_SCATTER(abd).abd_offset = 0; - ABD_SCATTER(abd).abd_sgl = sg; - abd_free_scatter(abd); - return; - } - if (abd->abd_flags & ABD_FLAG_META) { - zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); - } else { - zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); - } - - zfs_refcount_destroy(&abd->abd_children); - ABDSTAT_BUMPDOWN(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); - - abd_free_struct(abd); -} - -/* - * Free an ABD. Only use this on ABDs allocated with abd_alloc() or - * abd_alloc_linear(). - */ -void -abd_free(abd_t *abd) -{ - abd_verify(abd); - ASSERT3P(abd->abd_parent, ==, NULL); - ASSERT(abd->abd_flags & ABD_FLAG_OWNER); - if (abd_is_linear(abd)) - abd_free_linear(abd); - else - abd_free_scatter(abd); -} - -/* - * Allocate an ABD of the same format (same metadata flag, same scatterize - * setting) as another ABD. 
- */ -abd_t * -abd_alloc_sametype(abd_t *sabd, size_t size) -{ - boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0; - if (abd_is_linear(sabd) && - !abd_is_linear_page(sabd)) { - return (abd_alloc_linear(size, is_metadata)); - } else { - return (abd_alloc(size, is_metadata)); - } -} - /* * If we're going to use this ABD for doing I/O using the block layer, the * consumer of the ABD data doesn't care if it's scattered or not, and we don't @@ -807,16 +809,16 @@ abd_alloc_for_io(size_t size, boolean_t is_metadata) * buffer data with sabd. Use abd_put() to free. sabd must not be freed while * any derived ABDs exist. */ -static inline abd_t * +abd_t * abd_get_offset_impl(abd_t *sabd, size_t off, size_t size) { - abd_t *abd; + abd_t *abd = NULL; abd_verify(sabd); ASSERT3U(off, <=, sabd->abd_size); if (abd_is_linear(sabd)) { - abd = abd_alloc_struct(); + abd = abd_alloc_struct(0); /* * Even if this buf is filesystem metadata, we only track that @@ -825,14 +827,27 @@ abd_get_offset_impl(abd_t *sabd, size_t off, size_t size) */ abd->abd_flags = ABD_FLAG_LINEAR; - abd->abd_u.abd_linear.abd_buf = - (char *)sabd->abd_u.abd_linear.abd_buf + off; + ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off; + } else if (abd_is_multi(sabd)) { + size_t left = size; + abd = abd_alloc_multi(); + for (abd_t *cabd = abd_find_child_off(sabd, &off); + cabd != NULL && left > 0; + cabd = list_next(&ABD_MULTI(sabd).abd_chain, cabd)) { + int csize = MIN(left, cabd->abd_size - off); + + abd_t *nabd = abd_get_offset_impl(cabd, off, csize); + abd_add_child(abd, nabd, B_TRUE); + left -= csize; + off = 0; + } + ASSERT3U(left, ==, 0); } else { int i = 0; struct scatterlist *sg = NULL; - size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off; + size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; - abd = abd_alloc_struct(); + abd = abd_alloc_struct(0); /* * Even if this buf is filesystem metadata, we only track that @@ -856,7 +871,6 @@ abd_get_offset_impl(abd_t *sabd, size_t 
off, size_t size) abd->abd_parent = sabd; zfs_refcount_create(&abd->abd_children); (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd); - return (abd); } @@ -870,40 +884,6 @@ abd_get_offset(abd_t *sabd, size_t off) return (abd_get_offset_impl(sabd, off, size)); } -abd_t * -abd_get_offset_size(abd_t *sabd, size_t off, size_t size) -{ - ASSERT3U(off + size, <=, sabd->abd_size); - - return (abd_get_offset_impl(sabd, off, size)); -} - -/* - * Allocate a linear ABD structure for buf. You must free this with abd_put() - * since the resulting ABD doesn't own its own buffer. - */ -abd_t * -abd_get_from_buf(void *buf, size_t size) -{ - abd_t *abd = abd_alloc_struct(); - - VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - - /* - * Even if this buf is filesystem metadata, we only track that if we - * own the underlying data buffer, which is not true in this case. - * Therefore, we don't ever use ABD_FLAG_META here. - */ - abd->abd_flags = ABD_FLAG_LINEAR; - abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); - - abd->abd_u.abd_linear.abd_buf = buf; - - return (abd); -} - /* * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not * free the underlying scatterlist or buffer. @@ -923,100 +903,6 @@ abd_put(abd_t *abd) abd_free_struct(abd); } -/* - * Get the raw buffer associated with a linear ABD. - */ -void * -abd_to_buf(abd_t *abd) -{ - ASSERT(abd_is_linear(abd)); - abd_verify(abd); - return (abd->abd_u.abd_linear.abd_buf); -} - -/* - * Borrow a raw buffer from an ABD without copying the contents of the ABD - * into the buffer. If the ABD is scattered, this will allocate a raw buffer - * whose contents are undefined. To copy over the existing data in the ABD, use - * abd_borrow_buf_copy() instead. 
- */ -void * -abd_borrow_buf(abd_t *abd, size_t n) -{ - void *buf; - abd_verify(abd); - ASSERT3U(abd->abd_size, >=, n); - if (abd_is_linear(abd)) { - buf = abd_to_buf(abd); - } else { - buf = zio_buf_alloc(n); - } - (void) zfs_refcount_add_many(&abd->abd_children, n, buf); - - return (buf); -} - -void * -abd_borrow_buf_copy(abd_t *abd, size_t n) -{ - void *buf = abd_borrow_buf(abd, n); - if (!abd_is_linear(abd)) { - abd_copy_to_buf(buf, abd, n); - } - return (buf); -} - -/* - * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will - * not change the contents of the ABD and will ASSERT that you didn't modify - * the buffer since it was borrowed. If you want any changes you made to buf to - * be copied back to abd, use abd_return_buf_copy() instead. - */ -void -abd_return_buf(abd_t *abd, void *buf, size_t n) -{ - abd_verify(abd); - ASSERT3U(abd->abd_size, >=, n); - if (abd_is_linear(abd)) { - ASSERT3P(buf, ==, abd_to_buf(abd)); - } else { - ASSERT0(abd_cmp_buf(abd, buf, n)); - zio_buf_free(buf, n); - } - (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); -} - -void -abd_return_buf_copy(abd_t *abd, void *buf, size_t n) -{ - if (!abd_is_linear(abd)) { - abd_copy_from_buf(abd, buf, n); - } - abd_return_buf(abd, buf, n); -} - -/* - * Give this ABD ownership of the buffer that it's storing. Can only be used on - * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated - * with abd_alloc_linear() which subsequently released ownership of their buf - * with abd_release_ownership_of_buf(). 
- */ -void -abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata) -{ - ASSERT(abd_is_linear(abd)); - ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); - abd_verify(abd); - - abd->abd_flags |= ABD_FLAG_OWNER; - if (is_metadata) { - abd->abd_flags |= ABD_FLAG_META; - } - - ABDSTAT_BUMP(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); -} - void abd_release_ownership_of_buf(abd_t *abd) { @@ -1042,25 +928,13 @@ abd_release_ownership_of_buf(abd_t *abd) ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); } -struct abd_iter { - /* public interface */ - void *iter_mapaddr; /* addr corresponding to iter_pos */ - size_t iter_mapsize; /* length of data valid at mapaddr */ - - /* private */ - abd_t *iter_abd; /* ABD being iterated through */ - size_t iter_pos; - size_t iter_offset; /* offset in current sg/abd_buf, */ - /* abd_offset included */ - struct scatterlist *iter_sg; /* current sg */ -}; - /* * Initialize the abd_iter. */ -static void -abd_iter_init(struct abd_iter *aiter, abd_t *abd, int km_type) +void +abd_iter_init(struct abd_iter *aiter, abd_t *abd) { + ASSERT(!abd_is_multi(abd)); abd_verify(abd); aiter->iter_abd = abd; aiter->iter_mapaddr = NULL; @@ -1075,19 +949,29 @@ abd_iter_init(struct abd_iter *aiter, abd_t *abd, int km_type) } } +/* + * This is just a helper function to see if we have exhausted the + * abd_iter and reached the end. + */ +boolean_t +abd_iter_at_end(struct abd_iter *aiter) +{ + return (aiter->iter_pos == aiter->iter_abd->abd_size); +} + /* * Advance the iterator by a certain amount. Cannot be called when a chunk is * in use. This can be safely called when the aiter has already exhausted, in * which case this does nothing. 
*/ -static void +void abd_iter_advance(struct abd_iter *aiter, size_t amount) { ASSERT3P(aiter->iter_mapaddr, ==, NULL); ASSERT0(aiter->iter_mapsize); /* There's nothing left to advance to, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) + if (abd_iter_at_end(aiter)) return; aiter->iter_pos += amount; @@ -1108,7 +992,7 @@ abd_iter_advance(struct abd_iter *aiter, size_t amount) * Map the current chunk into aiter. This can be safely called when the aiter * has already exhausted, in which case this does nothing. */ -static void +void abd_iter_map(struct abd_iter *aiter) { void *paddr; @@ -1118,14 +1002,14 @@ abd_iter_map(struct abd_iter *aiter) ASSERT0(aiter->iter_mapsize); /* There's nothing left to iterate over, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) + if (abd_iter_at_end(aiter)) return; if (abd_is_linear(aiter->iter_abd)) { ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); offset = aiter->iter_offset; aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; - paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf; + paddr = ABD_LINEAR_BUF(aiter->iter_abd); } else { offset = aiter->iter_offset; aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset, @@ -1142,11 +1026,11 @@ abd_iter_map(struct abd_iter *aiter) * Unmap the current chunk from aiter. This can be safely called when the aiter * has already exhausted, in which case this does nothing. 
*/ -static void +void abd_iter_unmap(struct abd_iter *aiter) { /* There's nothing left to unmap, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) + if (abd_iter_at_end(aiter)) return; if (!abd_is_linear(aiter->iter_abd)) { @@ -1162,419 +1046,123 @@ abd_iter_unmap(struct abd_iter *aiter) aiter->iter_mapsize = 0; } -int -abd_iterate_func(abd_t *abd, size_t off, size_t size, - abd_iter_func_t *func, void *private) -{ - int ret = 0; - struct abd_iter aiter; - - abd_verify(abd); - ASSERT3U(off + size, <=, abd->abd_size); - - abd_iter_init(&aiter, abd, 0); - abd_iter_advance(&aiter, off); - - while (size > 0) { - abd_iter_map(&aiter); - - size_t len = MIN(aiter.iter_mapsize, size); - ASSERT3U(len, >, 0); - - ret = func(aiter.iter_mapaddr, len, private); - - abd_iter_unmap(&aiter); - - if (ret != 0) - break; - - size -= len; - abd_iter_advance(&aiter, len); - } - - return (ret); -} - -struct buf_arg { - void *arg_buf; -}; - -static int -abd_copy_to_buf_off_cb(void *buf, size_t size, void *private) -{ - struct buf_arg *ba_ptr = private; - - (void) memcpy(ba_ptr->arg_buf, buf, size); - ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; - - return (0); -} - -/* - * Copy abd to buf. (off is the offset in abd.) - */ void -abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size) +abd_enter_critical(unsigned long flags) { - struct buf_arg ba_ptr = { buf }; - - (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb, - &ba_ptr); -} - -static int -abd_cmp_buf_off_cb(void *buf, size_t size, void *private) -{ - int ret; - struct buf_arg *ba_ptr = private; - - ret = memcmp(buf, ba_ptr->arg_buf, size); - ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; - - return (ret); -} - -/* - * Compare the contents of abd to buf. (off is the offset in abd.) 
- */ -int -abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) -{ - struct buf_arg ba_ptr = { (void *) buf }; - - return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr)); -} - -static int -abd_copy_from_buf_off_cb(void *buf, size_t size, void *private) -{ - struct buf_arg *ba_ptr = private; - - (void) memcpy(buf, ba_ptr->arg_buf, size); - ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; - - return (0); -} - -/* - * Copy from buf to abd. (off is the offset in abd.) - */ -void -abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) -{ - struct buf_arg ba_ptr = { (void *) buf }; - - (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb, - &ba_ptr); -} - -/*ARGSUSED*/ -static int -abd_zero_off_cb(void *buf, size_t size, void *private) -{ - (void) memset(buf, 0, size); - return (0); + local_irq_save(flags); } -/* - * Zero out the abd from a particular offset to the end. - */ void -abd_zero_off(abd_t *abd, size_t off, size_t size) +abd_exit_critical(unsigned long flags) { - (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL); + local_irq_restore(flags); } +#if defined(_KERNEL) /* - * Iterate over two ABDs and call func incrementally on the two ABDs' data in - * equal-sized chunks (passed to func as raw buffers). func could be called many - * times during this iteration. + * bio_nr_pages for ABD. 
+ * @off is the offset in @abd */ -int -abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, - size_t size, abd_iter_func2_t *func, void *private) -{ - int ret = 0; - struct abd_iter daiter, saiter; - - abd_verify(dabd); - abd_verify(sabd); - - ASSERT3U(doff + size, <=, dabd->abd_size); - ASSERT3U(soff + size, <=, sabd->abd_size); - - abd_iter_init(&daiter, dabd, 0); - abd_iter_init(&saiter, sabd, 1); - abd_iter_advance(&daiter, doff); - abd_iter_advance(&saiter, soff); - - while (size > 0) { - abd_iter_map(&daiter); - abd_iter_map(&saiter); - - size_t dlen = MIN(daiter.iter_mapsize, size); - size_t slen = MIN(saiter.iter_mapsize, size); - size_t len = MIN(dlen, slen); - ASSERT(dlen > 0 || slen > 0); - - ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len, - private); - - abd_iter_unmap(&saiter); - abd_iter_unmap(&daiter); - - if (ret != 0) - break; - - size -= len; - abd_iter_advance(&daiter, len); - abd_iter_advance(&saiter, len); - } - - return (ret); -} - -/*ARGSUSED*/ -static int -abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private) +unsigned long +abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off) { - (void) memcpy(dbuf, sbuf, size); - return (0); -} + unsigned long pos; -/* - * Copy from sabd to dabd starting from soff and doff. - */ -void -abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size) -{ - (void) abd_iterate_func2(dabd, sabd, doff, soff, size, - abd_copy_off_cb, NULL); -} + while (abd_is_multi(abd)) + abd = abd_find_child_off(abd, &off); -/*ARGSUSED*/ -static int -abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private) -{ - return (memcmp(bufa, bufb, size)); -} + ASSERT(!abd_is_multi(abd)); + if (abd_is_linear(abd)) + pos = (unsigned long)abd_to_buf(abd) + off; + else + pos = ABD_SCATTER(abd).abd_offset + off; -/* - * Compares the contents of two ABDs. 
- */ -int -abd_cmp(abd_t *dabd, abd_t *sabd) -{ - ASSERT3U(dabd->abd_size, ==, sabd->abd_size); - return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size, - abd_cmp_cb, NULL)); + return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) - + (pos >> PAGE_SHIFT); } -/* - * Iterate over code ABDs and a data ABD and call @func_raidz_gen. - * - * @cabds parity ABDs, must have equal size - * @dabd data ABD. Can be NULL (in this case @dsize = 0) - * @func_raidz_gen should be implemented so that its behaviour - * is the same when taking linear and when taking scatter - */ -void -abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, - ssize_t csize, ssize_t dsize, const unsigned parity, - void (*func_raidz_gen)(void **, const void *, size_t, size_t)) +static unsigned int +bio_map(struct bio *bio, void *buf_ptr, unsigned int bio_size) { - int i; - ssize_t len, dlen; - struct abd_iter caiters[3]; - struct abd_iter daiter = {0}; - void *caddrs[3]; - unsigned long flags; - - ASSERT3U(parity, <=, 3); - - for (i = 0; i < parity; i++) - abd_iter_init(&caiters[i], cabds[i], i); - - if (dabd) - abd_iter_init(&daiter, dabd, i); - - ASSERT3S(dsize, >=, 0); - - local_irq_save(flags); - while (csize > 0) { - len = csize; - - if (dabd && dsize > 0) - abd_iter_map(&daiter); + unsigned int offset, size, i; + struct page *page; - for (i = 0; i < parity; i++) { - abd_iter_map(&caiters[i]); - caddrs[i] = caiters[i].iter_mapaddr; - } + offset = offset_in_page(buf_ptr); + for (i = 0; i < bio->bi_max_vecs; i++) { + size = PAGE_SIZE - offset; - switch (parity) { - case 3: - len = MIN(caiters[2].iter_mapsize, len); - /* falls through */ - case 2: - len = MIN(caiters[1].iter_mapsize, len); - /* falls through */ - case 1: - len = MIN(caiters[0].iter_mapsize, len); - } + if (bio_size <= 0) + break; - /* must be progressive */ - ASSERT3S(len, >, 0); + if (size > bio_size) + size = bio_size; - if (dabd && dsize > 0) { - /* this needs precise iter.length */ - len = MIN(daiter.iter_mapsize, len); - dlen = len; - 
} else - dlen = 0; + if (is_vmalloc_addr(buf_ptr)) + page = vmalloc_to_page(buf_ptr); + else + page = virt_to_page(buf_ptr); - /* must be progressive */ - ASSERT3S(len, >, 0); /* - * The iterated function likely will not do well if each - * segment except the last one is not multiple of 512 (raidz). + * Some network related block device uses tcp_sendpage, which + * doesn't behave well when using 0-count page, this is a + * safety net to catch them. */ - ASSERT3U(((uint64_t)len & 511ULL), ==, 0); - - func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen); - - for (i = parity-1; i >= 0; i--) { - abd_iter_unmap(&caiters[i]); - abd_iter_advance(&caiters[i], len); - } - - if (dabd && dsize > 0) { - abd_iter_unmap(&daiter); - abd_iter_advance(&daiter, dlen); - dsize -= dlen; - } - - csize -= len; - - ASSERT3S(dsize, >=, 0); - ASSERT3S(csize, >=, 0); - } - local_irq_restore(flags); -} + ASSERT3S(page_count(page), >, 0); -/* - * Iterate over code ABDs and data reconstruction target ABDs and call - * @func_raidz_rec. Function maps at most 6 pages atomically. - * - * @cabds parity ABDs, must have equal size - * @tabds rec target ABDs, at most 3 - * @tsize size of data target columns - * @func_raidz_rec expects syndrome data in target columns. Function - * reconstructs data and overwrites target columns. 
- */ -void -abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, - ssize_t tsize, const unsigned parity, - void (*func_raidz_rec)(void **t, const size_t tsize, void **c, - const unsigned *mul), - const unsigned *mul) -{ - int i; - ssize_t len; - struct abd_iter citers[3]; - struct abd_iter xiters[3]; - void *caddrs[3], *xaddrs[3]; - unsigned long flags; - - ASSERT3U(parity, <=, 3); + if (bio_add_page(bio, page, size, offset) != size) + break; - for (i = 0; i < parity; i++) { - abd_iter_init(&citers[i], cabds[i], 2*i); - abd_iter_init(&xiters[i], tabds[i], 2*i+1); + buf_ptr += size; + bio_size -= size; + offset = 0; } - local_irq_save(flags); - while (tsize > 0) { - - for (i = 0; i < parity; i++) { - abd_iter_map(&citers[i]); - abd_iter_map(&xiters[i]); - caddrs[i] = citers[i].iter_mapaddr; - xaddrs[i] = xiters[i].iter_mapaddr; - } - - len = tsize; - switch (parity) { - case 3: - len = MIN(xiters[2].iter_mapsize, len); - len = MIN(citers[2].iter_mapsize, len); - /* falls through */ - case 2: - len = MIN(xiters[1].iter_mapsize, len); - len = MIN(citers[1].iter_mapsize, len); - /* falls through */ - case 1: - len = MIN(xiters[0].iter_mapsize, len); - len = MIN(citers[0].iter_mapsize, len); - } - /* must be progressive */ - ASSERT3S(len, >, 0); - /* - * The iterated function likely will not do well if each - * segment except the last one is not multiple of 512 (raidz). - */ - ASSERT3U(((uint64_t)len & 511ULL), ==, 0); - - func_raidz_rec(xaddrs, len, caddrs, mul); - - for (i = parity-1; i >= 0; i--) { - abd_iter_unmap(&xiters[i]); - abd_iter_unmap(&citers[i]); - abd_iter_advance(&xiters[i], len); - abd_iter_advance(&citers[i], len); - } - - tsize -= len; - ASSERT3S(tsize, >=, 0); - } - local_irq_restore(flags); + return (bio_size); } -#if defined(_KERNEL) /* - * bio_nr_pages for ABD. - * @off is the offset in @abd + * bio_map for multi_list ABD. 
*/ -unsigned long -abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off) +static unsigned int +abd_multi_bio_map_off(struct bio *bio, abd_t *abd, + unsigned int io_size, size_t off) { - unsigned long pos; - - if (abd_is_linear(abd)) - pos = (unsigned long)abd_to_buf(abd) + off; - else - pos = abd->abd_u.abd_scatter.abd_offset + off; - - return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) - - (pos >> PAGE_SHIFT); + ASSERT(abd_is_multi(abd)); + + for (abd_t *cabd = abd_find_child_off(abd, &off); + cabd != NULL; cabd = list_next(&ABD_MULTI(abd).abd_chain, cabd)) { + int size = MIN(io_size, cabd->abd_size - off); + int remainder = abd_bio_map_off(bio, cabd, size, off); + io_size -= (size - remainder); + if (io_size == 0 || remainder > 0) + return (io_size); + off = 0; + } + ASSERT(io_size == 0); + return (io_size); } /* - * bio_map for scatter ABD. + * bio_map for ABD. * @off is the offset in @abd * Remaining IO size is returned */ unsigned int -abd_scatter_bio_map_off(struct bio *bio, abd_t *abd, +abd_bio_map_off(struct bio *bio, abd_t *abd, unsigned int io_size, size_t off) { int i; struct abd_iter aiter; - ASSERT(!abd_is_linear(abd)); ASSERT3U(io_size, <=, abd->abd_size - off); + if (abd_is_linear(abd)) + return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, io_size)); + + ASSERT(!abd_is_linear(abd)); + if (abd_is_multi(abd)) + return (abd_multi_bio_map_off(bio, abd, io_size, off)); - abd_iter_init(&aiter, abd, 0); + abd_iter_init(&aiter, abd); abd_iter_advance(&aiter, off); for (i = 0; i < bio->bi_max_vecs; i++) { diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 66e408c6c98..b514df3bc17 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -396,54 +396,6 @@ BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) rc = vdev_disk_dio_put(dr); } -static unsigned int -bio_map(struct bio *bio, void *bio_ptr, unsigned int bio_size) -{ - unsigned int offset, size, i; - struct page *page; - - offset 
= offset_in_page(bio_ptr); - for (i = 0; i < bio->bi_max_vecs; i++) { - size = PAGE_SIZE - offset; - - if (bio_size <= 0) - break; - - if (size > bio_size) - size = bio_size; - - if (is_vmalloc_addr(bio_ptr)) - page = vmalloc_to_page(bio_ptr); - else - page = virt_to_page(bio_ptr); - - /* - * Some network related block device uses tcp_sendpage, which - * doesn't behave well when using 0-count page, this is a - * safety net to catch them. - */ - ASSERT3S(page_count(page), >, 0); - - if (bio_add_page(bio, page, size, offset) != size) - break; - - bio_ptr += size; - bio_size -= size; - offset = 0; - } - - return (bio_size); -} - -static unsigned int -bio_map_abd_off(struct bio *bio, abd_t *abd, unsigned int size, size_t off) -{ - if (abd_is_linear(abd)) - return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, size)); - - return (abd_scatter_bio_map_off(bio, abd, size, off)); -} - static inline void vdev_submit_bio_impl(struct bio *bio) { @@ -603,7 +555,7 @@ __vdev_disk_physio(struct block_device *bdev, zio_t *zio, bio_set_op_attrs(dr->dr_bio[i], rw, flags); /* Remaining size is returned to become the new size */ - bio_size = bio_map_abd_off(dr->dr_bio[i], zio->io_abd, + bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd, bio_size, abd_offset); /* Advance in buffer and construct another bio if needed */ diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 6737336caef..3a966399703 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -14,6 +14,7 @@ ccflags-y += $(NO_UNUSED_BUT_SET_VARIABLE) # Suppress unused-value warnings in sparc64 architecture headers ccflags-$(CONFIG_SPARC64) += -Wno-unused-value +$(MODULE)-objs += abd.o $(MODULE)-objs += aggsum.o $(MODULE)-objs += arc.o $(MODULE)-objs += blkptr.o diff --git a/module/os/freebsd/zfs/abd.c b/module/zfs/abd.c similarity index 56% rename from module/os/freebsd/zfs/abd.c rename to module/zfs/abd.c index 888a113a429..bbbd55ec596 100644 --- a/module/os/freebsd/zfs/abd.c +++ b/module/zfs/abd.c 
@@ -1,17 +1,26 @@ /* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. + * CDDL HEADER START * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END */ - /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. - * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright (c) 2019 by Delphix. All rights reserved. */ /* @@ -50,11 +59,6 @@ * +----------------->| chunk N-1 | * +-----------+ * - * Using a large proportion of scattered ABDs decreases ARC fragmentation since - * when we are at the limit of allocatable space, using equal-size chunks will - * allow us to quickly reclaim enough space for a new large allocation (assuming - * it is also scattered). - * * In addition to directly allocating a linear or scattered ABD, it is also * possible to create an ABD by requesting the "sub-ABD" starting at an offset * within an existing ABD. 
In linear buffers this is simple (set abd_buf of @@ -83,6 +87,13 @@ * compare, copy, read, write, and fill with zeroes. If you need a custom * function which progressively accesses the whole ABD, use the abd_iterate_* * functions. + * + * As an additional feature, linear and scatter ABD's can be stitched together + * by using the multilist ABD type (abd_alloc_multi()). This allows for + * multiple ABD's to be viewed as a singular ABD. + * + * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to + * B_FALSE. */ #include @@ -91,180 +102,30 @@ #include #include -typedef struct abd_stats { - kstat_named_t abdstat_struct_size; - kstat_named_t abdstat_scatter_cnt; - kstat_named_t abdstat_scatter_data_size; - kstat_named_t abdstat_scatter_chunk_waste; - kstat_named_t abdstat_linear_cnt; - kstat_named_t abdstat_linear_data_size; -} abd_stats_t; - -static abd_stats_t abd_stats = { - /* Amount of memory occupied by all of the abd_t struct allocations */ - { "struct_size", KSTAT_DATA_UINT64 }, - /* - * The number of scatter ABDs which are currently allocated, excluding - * ABDs which don't own their data (for instance the ones which were - * allocated through abd_get_offset()). - */ - { "scatter_cnt", KSTAT_DATA_UINT64 }, - /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ - { "scatter_data_size", KSTAT_DATA_UINT64 }, - /* - * The amount of space wasted at the end of the last chunk across all - * scatter ABDs tracked by scatter_cnt. - */ - { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, - /* - * The number of linear ABDs which are currently allocated, excluding - * ABDs which don't own their data (for instance the ones which were - * allocated through abd_get_offset() and abd_get_from_buf()). If an - * ABD takes ownership of its buf then it will become tracked.
- */ - { "linear_cnt", KSTAT_DATA_UINT64 }, - /* Amount of data stored in all linear ABDs tracked by linear_cnt */ - { "linear_data_size", KSTAT_DATA_UINT64 }, -}; - -#define ABDSTAT(stat) (abd_stats.stat.value.ui64) -#define ABDSTAT_INCR(stat, val) \ - atomic_add_64(&abd_stats.stat.value.ui64, (val)) -#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) -#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) - -/* - * It is possible to make all future ABDs be linear by setting this to B_FALSE. - * Otherwise, ABDs are allocated scattered by default unless the caller uses - * abd_alloc_linear(). - */ -boolean_t zfs_abd_scatter_enabled = B_TRUE; - -/* - * The size of the chunks ABD allocates. Because the sizes allocated from the - * kmem_cache can't change, this tunable can only be modified at boot. Changing - * it at runtime would cause ABD iteration to work incorrectly for ABDs which - * were allocated with the old size, so a safeguard has been put in place which - * will cause the machine to panic if you change it and try to access the data - * within a scattered ABD. 
- */ -size_t zfs_abd_chunk_size = 4096; - -#if defined(_KERNEL) -SYSCTL_DECL(_vfs_zfs); - -SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN, - &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers"); -SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN, - &zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates"); -#endif - -kmem_cache_t *abd_chunk_cache; -static kstat_t *abd_ksp; - -extern inline boolean_t abd_is_linear(abd_t *abd); -extern inline void abd_copy(abd_t *dabd, abd_t *sabd, size_t size); -extern inline void abd_copy_from_buf(abd_t *abd, const void *buf, size_t size); -extern inline void abd_copy_to_buf(void* buf, abd_t *abd, size_t size); -extern inline int abd_cmp_buf(abd_t *abd, const void *buf, size_t size); -extern inline void abd_zero(abd_t *abd, size_t size); - -static void * -abd_alloc_chunk() -{ - void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); - ASSERT3P(c, !=, NULL); - return (c); -} - -static void -abd_free_chunk(void *c) -{ - kmem_cache_free(abd_chunk_cache, c); -} - -void -abd_init(void) -{ - abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0, - NULL, NULL, NULL, NULL, 0, KMC_NOTOUCH | KMC_NODEBUG); - - abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, - sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); - if (abd_ksp != NULL) { - abd_ksp->ks_data = &abd_stats; - kstat_install(abd_ksp); - } -} - void -abd_fini(void) -{ - if (abd_ksp != NULL) { - kstat_delete(abd_ksp); - abd_ksp = NULL; - } - - kmem_cache_destroy(abd_chunk_cache); - abd_chunk_cache = NULL; -} - -static inline size_t -abd_chunkcnt_for_bytes(size_t size) -{ - return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size); -} - -static inline size_t -abd_scatter_chunkcnt(abd_t *abd) -{ - ASSERT(!abd_is_linear(abd)); - return (abd_chunkcnt_for_bytes( - abd->abd_u.abd_scatter.abd_offset + abd->abd_size)); -} - -static inline void abd_verify(abd_t *abd) { 
ASSERT3U(abd->abd_size, >, 0); ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | - ABD_FLAG_OWNER | ABD_FLAG_META)); + ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | + ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_MULTI_LIST | + ABD_FLAG_MULTI_FREE | ABD_FLAG_ZEROS)); IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); if (abd_is_linear(abd)) { ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL); - } else { - ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <, - zfs_abd_chunk_size); - size_t n = abd_scatter_chunkcnt(abd); - for (int i = 0; i < n; i++) { - ASSERT3P( - abd->abd_u.abd_scatter.abd_chunks[i], !=, NULL); + } else if (abd_is_multi(abd)) { + for (abd_t *cabd = list_head(&ABD_MULTI(abd).abd_chain); + cabd != NULL; + cabd = list_next(&ABD_MULTI(abd).abd_chain, cabd)) { + abd_verify(cabd); } + } else { + abd_verify_scatter(abd); } } -static inline abd_t * -abd_alloc_struct(size_t chunkcnt) -{ - size_t size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); - abd_t *abd = kmem_alloc(size, KM_PUSHPAGE); - ASSERT3P(abd, !=, NULL); - ABDSTAT_INCR(abdstat_struct_size, size); - - return (abd); -} - -static inline void -abd_free_struct(abd_t *abd) -{ - size_t chunkcnt = abd_is_linear(abd) ? 0 : abd_scatter_chunkcnt(abd); - int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); - kmem_free(abd, size); - ABDSTAT_INCR(abdstat_struct_size, -size); -} - /* * Allocate an ABD, along with its own underlying data buffers. Use this if you * don't care whether the ABD is linear or not. 
@@ -272,15 +133,16 @@ abd_free_struct(abd_t *abd) abd_t * abd_alloc(size_t size, boolean_t is_metadata) { - if (!zfs_abd_scatter_enabled || size <= zfs_abd_chunk_size) + if (!zfs_abd_scatter_enabled || abd_size_alloc_linear(size)) return (abd_alloc_linear(size, is_metadata)); VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - size_t n = abd_chunkcnt_for_bytes(size); - abd_t *abd = abd_alloc_struct(n); - + abd_t *abd = abd_alloc_struct(size); abd->abd_flags = ABD_FLAG_OWNER; + abd->abd_u.abd_scatter.abd_offset = 0; + abd_alloc_pages(abd, size); + if (is_metadata) { abd->abd_flags |= ABD_FLAG_META; } @@ -288,19 +150,7 @@ abd_alloc(size_t size, boolean_t is_metadata) abd->abd_parent = NULL; zfs_refcount_create(&abd->abd_children); - abd->abd_u.abd_scatter.abd_offset = 0; - abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size; - - for (int i = 0; i < n; i++) { - void *c = abd_alloc_chunk(); - ASSERT3P(c, !=, NULL); - abd->abd_u.abd_scatter.abd_chunks[i] = c; - } - - ABDSTAT_BUMP(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, size); - ABDSTAT_INCR(abdstat_scatter_chunk_waste, - n * zfs_abd_chunk_size - size); + abd_update_scatter_stats(abd, ABDSTAT_INCR); return (abd); } @@ -308,17 +158,10 @@ abd_alloc(size_t size, boolean_t is_metadata) static void abd_free_scatter(abd_t *abd) { - size_t n = abd_scatter_chunkcnt(abd); - for (int i = 0; i < n; i++) { - abd_free_chunk(abd->abd_u.abd_scatter.abd_chunks[i]); - } + abd_free_pages(abd); zfs_refcount_destroy(&abd->abd_children); - ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); - ABDSTAT_INCR(abdstat_scatter_chunk_waste, - abd->abd_size - n * zfs_abd_chunk_size); - + abd_update_scatter_stats(abd, ABDSTAT_DECR); abd_free_struct(abd); } @@ -348,8 +191,7 @@ abd_alloc_linear(size_t size, boolean_t is_metadata) abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size); } - ABDSTAT_BUMP(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, size); + 
abd_update_linear_stats(abd, ABDSTAT_INCR); return (abd); } @@ -357,6 +199,10 @@ abd_alloc_linear(size_t size, boolean_t is_metadata) static void abd_free_linear(abd_t *abd) { + if (abd_is_linear_page(abd)) { + abd_free_linear_page(abd); + return; + } if (abd->abd_flags & ABD_FLAG_META) { zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); } else { @@ -364,12 +210,32 @@ abd_free_linear(abd_t *abd) } zfs_refcount_destroy(&abd->abd_children); - ABDSTAT_BUMPDOWN(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); + abd_update_linear_stats(abd, ABDSTAT_DECR); abd_free_struct(abd); } +static void +abd_free_multi(abd_t *abd) +{ + ASSERT(abd_is_multi(abd)); + abd_t *cabd; + + while ((cabd = list_remove_head(&ABD_MULTI(abd).abd_chain)) != NULL) { + abd->abd_size -= cabd->abd_size; + if (cabd->abd_flags & ABD_FLAG_MULTI_FREE) { + if (cabd->abd_flags & ABD_FLAG_OWNER) + abd_free(cabd); + else + abd_put(cabd); + } + } + ASSERT0(abd->abd_size); + list_destroy(&ABD_MULTI(abd).abd_chain); + zfs_refcount_destroy(&abd->abd_children); + abd_free_struct(abd); +} + /* * Free an ABD. Only use this on ABDs allocated with abd_alloc() or * abd_alloc_linear(). 
@@ -377,14 +243,13 @@ abd_free_linear(abd_t *abd) void abd_free(abd_t *abd) { - if (abd == NULL) - return; - abd_verify(abd); ASSERT3P(abd->abd_parent, ==, NULL); ASSERT(abd->abd_flags & ABD_FLAG_OWNER); if (abd_is_linear(abd)) abd_free_linear(abd); + else if (abd_is_multi(abd)) + abd_free_multi(abd); else abd_free_scatter(abd); } @@ -397,106 +262,126 @@ abd_t * abd_alloc_sametype(abd_t *sabd, size_t size) { boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0; - if (abd_is_linear(sabd)) { + if (abd_is_linear(sabd) && + !abd_is_linear_page(sabd)) { return (abd_alloc_linear(size, is_metadata)); } else { return (abd_alloc(size, is_metadata)); } } + /* - * If we're going to use this ABD for doing I/O using the block layer, the - * consumer of the ABD data doesn't care if it's scattered or not, and we don't - * plan to store this ABD in memory for a long period of time, we should - * allocate the ABD type that requires the least data copying to do the I/O. - * - * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os - * using a scatter/gather list we should switch to that and replace this call - * with vanilla abd_alloc(). + * Create an ABD that will be the head of a list of ABD's. This is used + * to "chain" scatter/gather lists together when constructing aggregated + * IO's. To free this abd, abd_free() must be called. */ abd_t * -abd_alloc_for_io(size_t size, boolean_t is_metadata) +abd_alloc_multi(void) { - return (abd_alloc_linear(size, is_metadata)); + abd_t *abd; + + abd = abd_alloc_struct(0); + abd->abd_flags = ABD_FLAG_MULTI_LIST | ABD_FLAG_OWNER; + abd->abd_size = 0; + abd->abd_parent = NULL; + list_create(&ABD_MULTI(abd).abd_chain, + sizeof (abd_t), offsetof(abd_t, abd_multi_link)); + zfs_refcount_create(&abd->abd_children); + return (abd); } /* - * Allocate a new ABD to point to offset off of sabd. It shares the underlying - * buffer data with sabd. Use abd_put() to free. 
sabd must not be freed while - * any derived ABDs exist. + * Add a child ABD to a chained list of ABD's. */ -/* ARGSUSED */ -static inline abd_t * -abd_get_offset_impl(abd_t *sabd, size_t off, size_t size) +void +abd_add_child(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) { - abd_t *abd; - - abd_verify(sabd); - ASSERT3U(off, <=, sabd->abd_size); - - if (abd_is_linear(sabd)) { - abd = abd_alloc_struct(0); + ASSERT(abd_is_multi(pabd)); + abd_t *child_abd = NULL; + mutex_enter(&cabd->abd_mtx); + if (list_link_active(&cabd->abd_multi_link)) { /* - * Even if this buf is filesystem metadata, we only track that - * if we own the underlying data buffer, which is not true in - * this case. Therefore, we don't ever use ABD_FLAG_META here. + * If the child ABD is already part of another + * multilist ABD then we must allocate a new + * ABD to use a separate link. We mark the newly + * allocated ABD with ABD_FLAG_MULTI_FREE, before + * adding it to the multilist, to make the multilist + * aware that it is its responsibility to call + * abd_put(). We use abd_get_offset() in order to + * just allocate a new ABD but avoid copying the data + * over into the newly allocated ABD. + * + * Cases where an ABD may be part of multiple + * multilist ABD's are ditto blocks and when + * vdev_label_write() is called (see vdev_label.c). + * + * The ASSERT below is to make sure that if + * free_on_free is passed as B_TRUE, the ABD can + * not be in multiple multilist ABD's. The multilist + * can not be responsible for cleaning up the child + * ABD memory allocation if the ABD can be in + * multiple multilist ABD's at one time.
*/ - abd->abd_flags = ABD_FLAG_LINEAR; - - abd->abd_u.abd_linear.abd_buf = - (char *)sabd->abd_u.abd_linear.abd_buf + off; + ASSERT3U(free_on_free, ==, B_FALSE); + child_abd = abd_get_offset(cabd, 0); + child_abd->abd_flags |= ABD_FLAG_MULTI_FREE; } else { - size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off; - size_t chunkcnt = abd_scatter_chunkcnt(sabd) - - (new_offset / zfs_abd_chunk_size); - - abd = abd_alloc_struct(chunkcnt); - - /* - * Even if this buf is filesystem metadata, we only track that - * if we own the underlying data buffer, which is not true in - * this case. Therefore, we don't ever use ABD_FLAG_META here. - */ - abd->abd_flags = 0; - - abd->abd_u.abd_scatter.abd_offset = - new_offset % zfs_abd_chunk_size; - abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size; - - /* Copy the scatterlist starting at the correct offset */ - (void) memcpy(&abd->abd_u.abd_scatter.abd_chunks, - &sabd->abd_u.abd_scatter.abd_chunks[new_offset / - zfs_abd_chunk_size], - chunkcnt * sizeof (void *)); + child_abd = cabd; + if (free_on_free) + child_abd->abd_flags |= ABD_FLAG_MULTI_FREE; } + ASSERT3P(child_abd, !=, NULL); - if (size == 0) - abd->abd_size = sabd->abd_size - off; - else - abd->abd_size = size; - abd->abd_parent = sabd; - zfs_refcount_create(&abd->abd_children); - (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd); - - return (abd); + mutex_enter(&pabd->abd_mtx); + list_insert_tail(&ABD_MULTI(pabd).abd_chain, child_abd); + pabd->abd_size += child_abd->abd_size; + mutex_exit(&pabd->abd_mtx); + mutex_exit(&cabd->abd_mtx); } +/* + * Locate the child abd for the supplied offset. + * Return a new offset relative to the child. 
+ */ abd_t * -abd_get_offset(abd_t *sabd, size_t off) +abd_find_child_off(abd_t *abd, size_t *off) { + abd_t *cabd; + + ASSERT(abd_is_multi(abd)); + ASSERT3U(*off, <, abd->abd_size); + for (cabd = list_head(&ABD_MULTI(abd).abd_chain); cabd != NULL; + cabd = list_next(&ABD_MULTI(abd).abd_chain, cabd)) { - return (abd_get_offset_impl(sabd, off, 0)); + if (*off >= cabd->abd_size) + *off -= cabd->abd_size; + else + return (cabd); + } + ASSERT(!"not reached"); + return (cabd); } abd_t * abd_get_offset_size(abd_t *sabd, size_t off, size_t size) { ASSERT3U(off + size, <=, sabd->abd_size); - return (abd_get_offset_impl(sabd, off, size)); } +/* + * Return a zero-filled scatter ABD of the requested size. In order to free + * the returned ABD abd_put() must be called. + */ +abd_t * +abd_get_zeros(size_t size) +{ + ASSERT3P(abd_zero_scatter, !=, NULL); + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + return (abd_get_offset_size(abd_zero_scatter, 0, size)); +} /* * Allocate a linear ABD structure for buf. You must free this with abd_put() @@ -524,27 +409,6 @@ abd_get_from_buf(void *buf, size_t size) return (abd); } -/* - * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not - * free the underlying scatterlist or buffer. - */ -void -abd_put(abd_t *abd) -{ - if (abd == NULL) - return; - abd_verify(abd); - ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); - - if (abd->abd_parent != NULL) { - (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children, - abd->abd_size, abd); - } - - zfs_refcount_destroy(&abd->abd_children); - abd_free_struct(abd); -} - /* * Get the raw buffer associated with a linear ABD.
*/ @@ -574,7 +438,6 @@ abd_borrow_buf(abd_t *abd, size_t n) buf = zio_buf_alloc(n); } (void) zfs_refcount_add_many(&abd->abd_children, n, buf); - return (buf); } @@ -635,130 +498,50 @@ abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata) abd->abd_flags |= ABD_FLAG_META; } - ABDSTAT_BUMP(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); -} - -void -abd_release_ownership_of_buf(abd_t *abd) -{ - ASSERT(abd_is_linear(abd)); - ASSERT(abd->abd_flags & ABD_FLAG_OWNER); - abd_verify(abd); - - abd->abd_flags &= ~ABD_FLAG_OWNER; - /* Disable this flag since we no longer own the data buffer */ - abd->abd_flags &= ~ABD_FLAG_META; - - ABDSTAT_BUMPDOWN(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); -} - -struct abd_iter { - abd_t *iter_abd; /* ABD being iterated through */ - size_t iter_pos; /* position (relative to abd_offset) */ - void *iter_mapaddr; /* addr corresponding to iter_pos */ - size_t iter_mapsize; /* length of data valid at mapaddr */ -}; - -static inline size_t -abd_iter_scatter_chunk_offset(struct abd_iter *aiter) -{ - ASSERT(!abd_is_linear(aiter->iter_abd)); - return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset + - aiter->iter_pos) % zfs_abd_chunk_size); -} - -static inline size_t -abd_iter_scatter_chunk_index(struct abd_iter *aiter) -{ - ASSERT(!abd_is_linear(aiter->iter_abd)); - return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset + - aiter->iter_pos) / zfs_abd_chunk_size); + abd_update_linear_stats(abd, ABDSTAT_INCR); } /* - * Initialize the abd_iter. + * Initializes an abd_iter based on whether the abd is a chain of ABD's + * or just a single ABD. */ -static void -abd_iter_init(struct abd_iter *aiter, abd_t *abd) -{ - abd_verify(abd); - aiter->iter_abd = abd; - aiter->iter_pos = 0; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; -} - -/* - * Advance the iterator by a certain amount. Cannot be called when a chunk is - * in use. 
This can be safely called when the aiter has already exhausted, in - * which case this does nothing. - */ -static void -abd_iter_advance(struct abd_iter *aiter, size_t amount) -{ - ASSERT3P(aiter->iter_mapaddr, ==, NULL); - ASSERT0(aiter->iter_mapsize); - - /* There's nothing left to advance to, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - aiter->iter_pos += amount; -} - -/* - * Map the current chunk into aiter. This can be safely called when the aiter - * has already exhausted, in which case this does nothing. - */ -static void -abd_iter_map(struct abd_iter *aiter) +static inline abd_t * +abd_init_abd_iter(abd_t *abd, struct abd_iter *aiter, size_t off) { - void *paddr; - size_t offset = 0; + abd_t *cabd = NULL; - ASSERT3P(aiter->iter_mapaddr, ==, NULL); - ASSERT0(aiter->iter_mapsize); - - /* Panic if someone has changed zfs_abd_chunk_size */ - IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size == - aiter->iter_abd->abd_u.abd_scatter.abd_chunk_size); - - /* There's nothing left to iterate over, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - if (abd_is_linear(aiter->iter_abd)) { - offset = aiter->iter_pos; - aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; - paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf; + if (abd_is_multi(abd)) { + cabd = abd_find_child_off(abd, &off); + if (cabd) { + abd_iter_init(aiter, cabd); + abd_iter_advance(aiter, off); + } } else { - size_t index = abd_iter_scatter_chunk_index(aiter); - offset = abd_iter_scatter_chunk_offset(aiter); - aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset, - aiter->iter_abd->abd_size - aiter->iter_pos); - paddr = aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index]; + abd_iter_init(aiter, abd); + abd_iter_advance(aiter, off); } - aiter->iter_mapaddr = (char *)paddr + offset; + return (cabd); } /* - * Unmap the current chunk from aiter. 
This can be safely called when the aiter - * has already exhausted, in which case this does nothing. + * Advances an abd_iter. We have to be careful with chains of ABD's as + * advancing could mean that we are at the end of a particular ABD and + * must grab the next one from the chain. */ -static void -abd_iter_unmap(struct abd_iter *aiter) -{ - /* There's nothing left to unmap, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - ASSERT3P(aiter->iter_mapaddr, !=, NULL); - ASSERT3U(aiter->iter_mapsize, >, 0); - - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; +static inline abd_t * +abd_advance_abd_iter(abd_t *abd, abd_t *cabd, struct abd_iter *aiter, + size_t len) +{ + abd_iter_advance(aiter, len); + if (abd_is_multi(abd) && abd_iter_at_end(aiter)) { + ASSERT3P(cabd, !=, NULL); + cabd = list_next(&ABD_MULTI(abd).abd_chain, cabd); + if (cabd) { + abd_iter_init(aiter, cabd); + abd_iter_advance(aiter, 0); + } + } + return (cabd); } int @@ -767,14 +550,20 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size, { int ret = 0; struct abd_iter aiter; + boolean_t abd_multi; + abd_t *c_abd; abd_verify(abd); ASSERT3U(off + size, <=, abd->abd_size); - abd_iter_init(&aiter, abd); - abd_iter_advance(&aiter, off); + abd_multi = abd_is_multi(abd); + c_abd = abd_init_abd_iter(abd, &aiter, off); while (size > 0) { + /* If we are at the end of multi chain abd we are done */ + if (abd_multi && !c_abd) + break; + abd_iter_map(&aiter); size_t len = MIN(aiter.iter_mapsize, size); @@ -788,7 +577,7 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size, break; size -= len; - abd_iter_advance(&aiter, len); + c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len); } return (ret); @@ -895,6 +684,8 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, { int ret = 0; struct abd_iter daiter, saiter; + boolean_t dabd_is_multi, sabd_is_multi; + abd_t *c_dabd, *c_sabd; abd_verify(dabd); abd_verify(sabd); @@ -902,12 +693,17 @@ 
abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, ASSERT3U(doff + size, <=, dabd->abd_size); ASSERT3U(soff + size, <=, sabd->abd_size); - abd_iter_init(&daiter, dabd); - abd_iter_init(&saiter, sabd); - abd_iter_advance(&daiter, doff); - abd_iter_advance(&saiter, soff); + dabd_is_multi = abd_is_multi(dabd); + sabd_is_multi = abd_is_multi(sabd); + c_dabd = abd_init_abd_iter(dabd, &daiter, doff); + c_sabd = abd_init_abd_iter(sabd, &saiter, soff); while (size > 0) { + /* if we are at the end of a multi abd chain we are done */ + if ((dabd_is_multi && !c_dabd) || + (sabd_is_multi && !c_sabd)) + break; + abd_iter_map(&daiter); abd_iter_map(&saiter); @@ -926,8 +722,10 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, break; size -= len; - abd_iter_advance(&daiter, len); - abd_iter_advance(&saiter, len); + c_dabd = + abd_advance_abd_iter(dabd, c_dabd, &daiter, len); + c_sabd = + abd_advance_abd_iter(sabd, c_sabd, &saiter, len); } return (ret); @@ -987,34 +785,55 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, struct abd_iter caiters[3]; struct abd_iter daiter = {0}; void *caddrs[3]; + unsigned long flags = 0; + abd_t *c_cabds[3]; + abd_t *c_dabd = NULL; + boolean_t cabds_is_multi[3]; + boolean_t dabd_is_multi = B_FALSE; ASSERT3U(parity, <=, 3); - for (i = 0; i < parity; i++) - abd_iter_init(&caiters[i], cabds[i]); + for (i = 0; i < parity; i++) { + cabds_is_multi[i] = abd_is_multi(cabds[i]); + c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], 0); + } - if (dabd) - abd_iter_init(&daiter, dabd); + if (dabd) { + dabd_is_multi = abd_is_multi(dabd); + c_dabd = abd_init_abd_iter(dabd, &daiter, 0); + } ASSERT3S(dsize, >=, 0); - critical_enter(); + abd_enter_critical(flags); while (csize > 0) { - len = csize; - - if (dabd && dsize > 0) - abd_iter_map(&daiter); + /* if we are at the end of a multi abd chain we are done */ + if (dabd_is_multi && !c_dabd) + break; for (i = 0; i < parity; i++) { + /* + * If we are at the end of a 
multi abd chain we are + * done. + */ + if (cabds_is_multi[i] && !c_cabds[i]) + break; abd_iter_map(&caiters[i]); caddrs[i] = caiters[i].iter_mapaddr; } + len = csize; + + if (dabd && dsize > 0) + abd_iter_map(&daiter); + switch (parity) { case 3: len = MIN(caiters[2].iter_mapsize, len); + /* falls through */ case 2: len = MIN(caiters[1].iter_mapsize, len); + /* falls through */ case 1: len = MIN(caiters[0].iter_mapsize, len); } @@ -1041,12 +860,16 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, for (i = parity-1; i >= 0; i--) { abd_iter_unmap(&caiters[i]); - abd_iter_advance(&caiters[i], len); + c_cabds[i] = + abd_advance_abd_iter(cabds[i], c_cabds[i], + &caiters[i], len); } if (dabd && dsize > 0) { abd_iter_unmap(&daiter); - abd_iter_advance(&daiter, dlen); + c_dabd = + abd_advance_abd_iter(dabd, c_dabd, &daiter, + dlen); dsize -= dlen; } @@ -1055,7 +878,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, ASSERT3S(dsize, >=, 0); ASSERT3S(csize, >=, 0); } - critical_exit(); + abd_exit_critical(flags); } /* @@ -1080,18 +903,35 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, struct abd_iter citers[3]; struct abd_iter xiters[3]; void *caddrs[3], *xaddrs[3]; + unsigned long flags = 0; + boolean_t cabds_is_multi[3]; + boolean_t tabds_is_multi[3]; + abd_t *c_cabds[3]; + abd_t *c_tabds[3]; ASSERT3U(parity, <=, 3); for (i = 0; i < parity; i++) { - abd_iter_init(&citers[i], cabds[i]); - abd_iter_init(&xiters[i], tabds[i]); + cabds_is_multi[i] = abd_is_multi(cabds[i]); + tabds_is_multi[i] = abd_is_multi(tabds[i]); + c_cabds[i] = + abd_init_abd_iter(cabds[i], &citers[i], 0); + c_tabds[i] = + abd_init_abd_iter(tabds[i], &xiters[i], 0); } - critical_enter(); + abd_enter_critical(flags); while (tsize > 0) { for (i = 0; i < parity; i++) { + /* + * If we are at the end of a multi abd chain we + * are done. 
+ */ + if (cabds_is_multi[i] && !c_cabds[i]) + break; + if (tabds_is_multi[i] && !c_tabds[i]) + break; abd_iter_map(&citers[i]); abd_iter_map(&xiters[i]); caddrs[i] = citers[i].iter_mapaddr; @@ -1103,9 +943,11 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, case 3: len = MIN(xiters[2].iter_mapsize, len); len = MIN(citers[2].iter_mapsize, len); + /* falls through */ case 2: len = MIN(xiters[1].iter_mapsize, len); len = MIN(citers[1].iter_mapsize, len); + /* falls through */ case 1: len = MIN(xiters[0].iter_mapsize, len); len = MIN(citers[0].iter_mapsize, len); @@ -1123,12 +965,16 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, for (i = parity-1; i >= 0; i--) { abd_iter_unmap(&xiters[i]); abd_iter_unmap(&citers[i]); - abd_iter_advance(&xiters[i], len); - abd_iter_advance(&citers[i], len); + c_tabds[i] = + abd_advance_abd_iter(tabds[i], c_tabds[i], + &xiters[i], len); + c_cabds[i] = + abd_advance_abd_iter(cabds[i], c_cabds[i], + &citers[i], len); } tsize -= len; ASSERT3S(tsize, >=, 0); } - critical_exit(); + abd_exit_critical(flags); } diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index e156e2b0139..3091a8b8869 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -535,15 +535,6 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) static void vdev_queue_agg_io_done(zio_t *aio) { - if (aio->io_type == ZIO_TYPE_READ) { - zio_t *pio; - zio_link_t *zl = NULL; - while ((pio = zio_walk_parents(aio, &zl)) != NULL) { - abd_copy_off(pio->io_abd, aio->io_abd, - 0, pio->io_offset - aio->io_offset, pio->io_size); - } - } - abd_free(aio->io_abd); } @@ -568,6 +559,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) boolean_t stretch = B_FALSE; avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type); enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; + uint64_t next_offset; abd_t *abd; maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa); @@ -695,7 +687,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) size = 
IO_SPAN(first, last); ASSERT3U(size, <=, maxblocksize); - abd = abd_alloc_for_io(size, B_TRUE); + abd = abd_alloc_multi(); if (abd == NULL) return (NULL); @@ -706,12 +698,41 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) aio->io_timestamp = first->io_timestamp; nio = first; + next_offset = first->io_offset; do { dio = nio; nio = AVL_NEXT(t, dio); zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); + + if (dio->io_offset != next_offset) { + /* allocate a buffer for a read gap */ + ASSERT3U(dio->io_type, ==, ZIO_TYPE_READ); + ASSERT3U(dio->io_offset, >, next_offset); + abd = abd_alloc_for_io( + dio->io_offset - next_offset, B_TRUE); + abd_add_child(aio->io_abd, abd, B_TRUE); + } + if (dio->io_abd && (dio->io_size != dio->io_abd->abd_size)) { + /* abd size not the same as IO size */ + ASSERT3U(dio->io_abd->abd_size, >, dio->io_size); + abd = abd_get_offset_size(dio->io_abd, 0, dio->io_size); + abd_add_child(aio->io_abd, abd, B_TRUE); + } else { + if (dio->io_flags & ZIO_FLAG_NODATA) { + /* allocate a buffer for a write gap */ + ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); + ASSERT3P(dio->io_abd, ==, NULL); + abd_add_child(aio->io_abd, + abd_get_zeros(dio->io_size), B_TRUE); + } else { + abd_add_child(aio->io_abd, dio->io_abd, + B_FALSE); + } + } + next_offset = dio->io_offset + dio->io_size; } while (dio != last); + ASSERT3U(aio->io_abd->abd_size, ==, aio->io_size); /* * We need to drop the vdev queue's lock during zio_execute() to @@ -723,15 +744,6 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) while ((dio = zio_walk_parents(aio, &zl)) != NULL) { ASSERT3U(dio->io_type, ==, aio->io_type); - if (dio->io_flags & ZIO_FLAG_NODATA) { - ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); - abd_zero_off(aio->io_abd, - dio->io_offset - aio->io_offset, dio->io_size); - } else if (dio->io_type == ZIO_TYPE_WRITE) { - abd_copy_off(aio->io_abd, dio->io_abd, - dio->io_offset - aio->io_offset, 0, dio->io_size); - } - zio_vdev_io_bypass(dio); zio_execute(dio); }