diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 5ab13b470dc0..61f1258f72b9 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -326,7 +326,7 @@ sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle)
 	int err;
 	struct sublivelist_verify *sv = args;
 
-	zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare,
+	zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, NULL,
 	    sizeof (sublivelist_verify_block_refcnt_t));
 
 	err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr,
@@ -390,7 +390,7 @@ sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle)
 {
 	(void) args;
 	sublivelist_verify_t sv;
-	zfs_btree_create(&sv.sv_leftover, livelist_block_compare,
+	zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL,
 	    sizeof (sublivelist_verify_block_t));
 	int err = sublivelist_verify_func(&sv, dle);
 	zfs_btree_clear(&sv.sv_leftover);
@@ -682,7 +682,7 @@ livelist_metaslab_validate(spa_t *spa)
 	(void) printf("Verifying deleted livelist entries\n");
 
 	sublivelist_verify_t sv;
-	zfs_btree_create(&sv.sv_leftover, livelist_block_compare,
+	zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL,
 	    sizeof (sublivelist_verify_block_t));
 	iterate_deleted_livelists(spa, livelist_verify, &sv);
 
@@ -716,7 +716,7 @@ livelist_metaslab_validate(spa_t *spa)
 			mv.mv_start = m->ms_start;
 			mv.mv_end = m->ms_start + m->ms_size;
 			zfs_btree_create(&mv.mv_livelist_allocs,
-			    livelist_block_compare,
+			    livelist_block_compare, NULL,
 			    sizeof (sublivelist_verify_block_t));
 
 			mv_populate_livelist_allocs(&mv, &sv);
diff --git a/include/sys/btree.h b/include/sys/btree.h
index 883abb5181c9..6e05eee8f01d 100644
--- a/include/sys/btree.h
+++ b/include/sys/btree.h
@@ -105,8 +105,13 @@ typedef struct zfs_btree_index {
 	boolean_t	bti_before;
 } zfs_btree_index_t;
 
-typedef struct btree {
+typedef struct btree zfs_btree_t;
+typedef void * (*bt_find_in_buf_f) (zfs_btree_t *, uint8_t *, uint32_t,
+    const void *, zfs_btree_index_t *);
+
+struct btree {
 	int (*bt_compar) (const void *, const void *);
+	bt_find_in_buf_f bt_find_in_buf;
 	size_t bt_elem_size;
 	size_t bt_leaf_size;
 	uint32_t bt_leaf_cap;
@@ -115,7 +120,54 @@ typedef struct btree {
 	uint64_t bt_num_nodes;
 	zfs_btree_hdr_t *bt_root;
 	zfs_btree_leaf_t *bt_bulk; // non-null if bulk loading
-} zfs_btree_t;
+};
+
+/*
+ * Implementation of Shar's algorithm designed to accelerate binary search by
+ * eliminating impossible-to-predict branches.
+ *
+ * For optimality, this should be used to generate the search function in the
+ * same file as the comparator and the comparator should be marked
+ * `__attribute__((always_inline)) inline` so that the compiler will inline it.
+ *
+ * Arguments are:
+ *
+ * NAME - The function name for this instance of the search function. Use it
+ *        in a subsequent call to zfs_btree_create().
+ * T - The element type stored inside the B-Tree.
+ * COMP - A comparator to compare two nodes, it must return exactly: -1, 0,
+ *        or +1. -1 for <, 0 for ==, and +1 for >. For trivial comparisons,
+ *        TREE_CMP() from avl.h can be used in a boilerplate function.
+ */
+/* BEGIN CSTYLED */
+#define ZFS_BTREE_FIND_IN_BUF_FUNC(NAME, T, COMP)	\
+_Pragma("GCC diagnostic push")	\
+_Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"")	\
+static void *	\
+NAME(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,	\
+    const void *value, zfs_btree_index_t *where)	\
+{	\
+	T *i = (T *)buf;	\
+	(void) tree;	\
+	_Pragma("GCC unroll 9")	\
+	while (nelems > 1) {	\
+		uint32_t half = nelems / 2;	\
+		nelems -= half;	\
+		i += (COMP(&i[half - 1], value) < 0) * half;	\
+	}	\
+	\
+	int comp = COMP(i, value);	\
+	where->bti_offset = (i - (T *)buf) + (comp < 0);	\
+	where->bti_before = (comp != 0);	\
+	\
+	if (comp == 0) {	\
+		return (i);	\
+	}	\
+	\
+	return (NULL);	\
+}	\
+_Pragma("GCC diagnostic pop")
+/* END CSTYLED */
 
 /*
  * Allocate and deallocate caches for btree nodes.
@@ -129,13 +181,19 @@ void zfs_btree_fini(void);
  * tree - the tree to be initialized
  * compar - function to compare two nodes, it must return exactly: -1, 0, or +1
  *          -1 for <, 0 for ==, and +1 for >
+ * find - optional function to accelerate searches inside B-Tree nodes
+ *        through Shar's algorithm and comparator inlining. Setting this to
+ *        NULL will use a generic function. The function should be created
+ *        using ZFS_BTREE_FIND_IN_BUF_FUNC() in the same file as compar.
+ *        compar should be marked `__attribute__((always_inline)) inline` or
+ *        performance is unlikely to improve very much.
 * size - the value of sizeof(struct my_type)
 * lsize - custom leaf size
 */
 void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *),
-    size_t);
+    bt_find_in_buf_f, size_t);
 void zfs_btree_create_custom(zfs_btree_t *, int (*)(const void *, const void *),
-    size_t, size_t);
+    bt_find_in_buf_f, size_t, size_t);
 
 /*
  * Find a node with a matching value in the tree. Returns the matching node
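The header above carries the whole contract: a consumer writes an always_inline comparator, instantiates ZFS_BTREE_FIND_IN_BUF_FUNC() next to it, and hands the generated function to zfs_btree_create(). A minimal sketch of that wiring follows; the type my_node_t and the names my_compare, my_find_in_buf, and my_tree_init are illustrative only and not part of this change (the real callers are updated later in this patch):

/* Hypothetical element type, for illustration only. */
typedef struct {
	uint64_t key;
} my_node_t;

__attribute__((always_inline)) inline static int
my_compare(const void *xa, const void *xb)
{
	const my_node_t *a = xa;
	const my_node_t *b = xb;

	/* Must return exactly -1, 0, or +1; TREE_CMP() satisfies that. */
	return (TREE_CMP(a->key, b->key));
}

/* Generates my_find_in_buf() with my_compare() inlined into the search loop. */
ZFS_BTREE_FIND_IN_BUF_FUNC(my_find_in_buf, my_node_t, my_compare)

static void
my_tree_init(zfs_btree_t *t)
{
	/* Passing NULL instead of my_find_in_buf selects the generic search. */
	zfs_btree_create(t, my_compare, my_find_in_buf, sizeof (my_node_t));
}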
diff --git a/module/Kbuild.in b/module/Kbuild.in
index 8d29f56c2fb8..29a55c9778b1 100644
--- a/module/Kbuild.in
+++ b/module/Kbuild.in
@@ -34,6 +34,20 @@ ifeq ($(CONFIG_KASAN),y)
 ZFS_MODULE_CFLAGS += -Wno-error=frame-larger-than=
 endif
 
+# Generated binary search code is particularly bad with this optimization.
+# Oddly, range_tree.c is not affected when unrolling is not done and dsl_scan.c
+# is not affected when unrolling is done.
+# Disable it until the following upstream issue is resolved:
+# https://github.com/llvm/llvm-project/issues/62790
+ifeq ($(CONFIG_X86),y)
+ifeq ($(CONFIG_CC_IS_CLANG),y)
+CFLAGS_zfs/dsl_scan.o += -mllvm -x86-cmov-converter=false
+CFLAGS_zfs/metaslab.o += -mllvm -x86-cmov-converter=false
+CFLAGS_zfs/range_tree.o += -mllvm -x86-cmov-converter=false
+CFLAGS_zfs/zap_micro.o += -mllvm -x86-cmov-converter=false
+endif
+endif
+
 ifneq ($(KBUILD_EXTMOD),)
 @CONFIG_QAT_TRUE@ZFS_MODULE_CFLAGS += -I@QAT_SRC@/include
 @CONFIG_QAT_TRUE@KBUILD_EXTRA_SYMBOLS += @QAT_SYMBOLS@
diff --git a/module/Makefile.bsd b/module/Makefile.bsd
index 365609fb8585..9464223f6ca6 100644
--- a/module/Makefile.bsd
+++ b/module/Makefile.bsd
@@ -400,6 +400,20 @@ beforeinstall:
 
 .include <bsd.kmod.mk>
 
+# Generated binary search code is particularly bad with this optimization.
+# Oddly, range_tree.c is not affected when unrolling is not done and dsl_scan.c
+# is not affected when unrolling is done.
+# Disable it until the following upstream issue is resolved:
+# https://github.com/llvm/llvm-project/issues/62790
+.if ${CC} == "clang"
+.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "amd64"
+CFLAGS.dsl_scan.c= -mllvm -x86-cmov-converter=false
+CFLAGS.metaslab.c= -mllvm -x86-cmov-converter=false
+CFLAGS.range_tree.c= -mllvm -x86-cmov-converter=false
+CFLAGS.zap_micro.c= -mllvm -x86-cmov-converter=false
+.endif
+.endif
+
 CFLAGS.sysctl_os.c= -include ../zfs_config.h
 CFLAGS.xxhash.c+= -include ${SYSDIR}/sys/_null.h
diff --git a/module/zfs/btree.c b/module/zfs/btree.c
index 4c25afaa8199..af2b94a850be 100644
--- a/module/zfs/btree.c
+++ b/module/zfs/btree.c
@@ -193,14 +193,20 @@ zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr)
 
 void
 zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
-    size_t size)
+    bt_find_in_buf_f bt_find_in_buf, size_t size)
 {
-	zfs_btree_create_custom(tree, compar, size, BTREE_LEAF_SIZE);
+	zfs_btree_create_custom(tree, compar, bt_find_in_buf, size,
+	    BTREE_LEAF_SIZE);
 }
 
+static void *
+zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
+    const void *value, zfs_btree_index_t *where);
+
 void
 zfs_btree_create_custom(zfs_btree_t *tree,
     int (*compar) (const void *, const void *),
+    bt_find_in_buf_f bt_find_in_buf,
     size_t size, size_t lsize)
 {
 	size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems);
@@ -208,6 +214,8 @@ zfs_btree_create_custom(zfs_btree_t *tree,
 	ASSERT3U(size, <=, esize / 2);
 	memset(tree, 0, sizeof (*tree));
 	tree->bt_compar = compar;
+	tree->bt_find_in_buf = (bt_find_in_buf == NULL) ?
+	    zfs_btree_find_in_buf : bt_find_in_buf;
 	tree->bt_elem_size = size;
 	tree->bt_leaf_size = lsize;
 	tree->bt_leaf_cap = P2ALIGN(esize / size, 2);
@@ -303,7 +311,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
 		 * element in the last leaf, it's in the last leaf or
 		 * it's not in the tree.
 		 */
-		void *d = zfs_btree_find_in_buf(tree,
+		void *d = tree->bt_find_in_buf(tree,
 		    last_leaf->btl_elems + last_leaf->btl_hdr.bth_first * size,
 		    last_leaf->btl_hdr.bth_count, value, &idx);
 
@@ -327,7 +335,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where)
 	for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height;
 	    node = (zfs_btree_core_t *)node->btc_children[child], depth++) {
 		ASSERT3P(node, !=, NULL);
-		void *d = zfs_btree_find_in_buf(tree, node->btc_elems,
+		void *d = tree->bt_find_in_buf(tree, node->btc_elems,
 		    node->btc_hdr.bth_count, value, &idx);
 		EQUIV(d != NULL, !idx.bti_before);
 		if (d != NULL) {
@@ -347,7 +355,7 @@
 	 */
 	zfs_btree_leaf_t *leaf = (depth == 0 ?
 	    (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node);
-	void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems +
+	void *d = tree->bt_find_in_buf(tree, leaf->btl_elems +
 	    leaf->btl_hdr.bth_first * size,
 	    leaf->btl_hdr.bth_count, value, &idx);
 
@@ -671,7 +679,7 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node,
 	zfs_btree_hdr_t *par_hdr = &parent->btc_hdr;
 	zfs_btree_index_t idx;
 	ASSERT(zfs_btree_is_core(par_hdr));
-	VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
+	VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
 	    par_hdr->bth_count, buf, &idx), ==, NULL);
 	ASSERT(idx.bti_before);
 	uint32_t offset = idx.bti_offset;
@@ -897,7 +905,7 @@ zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
 	}
 	zfs_btree_index_t idx;
 	zfs_btree_core_t *parent = hdr->bth_parent;
-	VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
+	VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems,
 	    parent->btc_hdr.bth_count, buf, &idx), ==, NULL);
 	ASSERT(idx.bti_before);
 	ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count);
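When NULL is passed, tree->bt_find_in_buf points at the pre-existing static helper in btree.c: a conventional binary search that reaches the comparator through the bt_compar pointer. It behaves roughly as sketched below (paraphrased for reference, not copied verbatim); the two data-dependent branches and the indirect call per probe are exactly what the macro-generated variant removes.

static void *
generic_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems,
    const void *value, zfs_btree_index_t *where)
{
	uint32_t max = nelems;
	uint32_t min = 0;

	while (max > min) {
		uint32_t idx = (min + max) / 2;
		uint8_t *cur = buf + idx * tree->bt_elem_size;
		/* Indirect call; the comparator cannot be inlined here. */
		int comp = tree->bt_compar(cur, value);

		if (comp < 0) {
			min = idx + 1;
		} else if (comp > 0) {
			max = idx;
		} else {
			where->bti_offset = idx;
			where->bti_before = B_FALSE;
			return (cur);
		}
	}
	where->bti_offset = max;
	where->bti_before = B_TRUE;
	return (NULL);
}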
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c
index 6cad339104a4..9ee719a5eef6 100644
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -4877,6 +4877,7 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
 * with single operation. Plus it makes scrubs more sequential and reduces
 * chances that minor extent change move it within the B-tree.
 */
+__attribute__((always_inline)) inline
 static int
 ext_size_compare(const void *x, const void *y)
 {
@@ -4885,13 +4886,17 @@ ext_size_compare(const void *x, const void *y)
 	return (TREE_CMP(*a, *b));
 }
 
+ZFS_BTREE_FIND_IN_BUF_FUNC(ext_size_find_in_buf, uint64_t,
+    ext_size_compare)
+
 static void
 ext_size_create(range_tree_t *rt, void *arg)
 {
 	(void) rt;
 	zfs_btree_t *size_tree = arg;
 
-	zfs_btree_create(size_tree, ext_size_compare, sizeof (uint64_t));
+	zfs_btree_create(size_tree, ext_size_compare, ext_size_find_in_buf,
+	    sizeof (uint64_t));
 }
 
 static void
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
index 24d52a74933f..94b131fcdb79 100644
--- a/module/zfs/metaslab.c
+++ b/module/zfs/metaslab.c
@@ -1342,6 +1342,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
 * Comparison function for the private size-ordered tree using 32-bit
 * ranges. Tree is sorted by size, larger sizes at the end of the tree.
 */
+__attribute__((always_inline)) inline
 static int
 metaslab_rangesize32_compare(const void *x1, const void *x2)
 {
@@ -1352,16 +1353,15 @@ metaslab_rangesize32_compare(const void *x1, const void *x2)
 	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
 
 	int cmp = TREE_CMP(rs_size1, rs_size2);
-	if (likely(cmp))
-		return (cmp);
 
-	return (TREE_CMP(r1->rs_start, r2->rs_start));
+	return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
 }
 
 /*
 * Comparison function for the private size-ordered tree using 64-bit
 * ranges. Tree is sorted by size, larger sizes at the end of the tree.
 */
+__attribute__((always_inline)) inline
 static int
 metaslab_rangesize64_compare(const void *x1, const void *x2)
 {
@@ -1372,11 +1372,10 @@ metaslab_rangesize64_compare(const void *x1, const void *x2)
 	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
 
 	int cmp = TREE_CMP(rs_size1, rs_size2);
-	if (likely(cmp))
-		return (cmp);
 
-	return (TREE_CMP(r1->rs_start, r2->rs_start));
+	return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start));
 }
 
+
 typedef struct metaslab_rt_arg {
 	zfs_btree_t *mra_bt;
 	uint32_t mra_floor_shift;
@@ -1412,6 +1411,13 @@ metaslab_size_tree_full_load(range_tree_t *rt)
 	range_tree_walk(rt, metaslab_size_sorted_add, &arg);
 }
 
+
+ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf,
+    range_seg32_t, metaslab_rangesize32_compare)
+
+ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf,
+    range_seg64_t, metaslab_rangesize64_compare)
+
 /*
 * Create any block allocator specific components. The current allocators
 * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
@@ -1424,19 +1430,22 @@ metaslab_rt_create(range_tree_t *rt, void *arg)
 
 	size_t size;
 	int (*compare) (const void *, const void *);
+	bt_find_in_buf_f bt_find;
 	switch (rt->rt_type) {
 	case RANGE_SEG32:
 		size = sizeof (range_seg32_t);
 		compare = metaslab_rangesize32_compare;
+		bt_find = metaslab_rt_find_rangesize32_in_buf;
 		break;
 	case RANGE_SEG64:
 		size = sizeof (range_seg64_t);
 		compare = metaslab_rangesize64_compare;
+		bt_find = metaslab_rt_find_rangesize64_in_buf;
 		break;
 	default:
 		panic("Invalid range seg type %d", rt->rt_type);
 	}
-	zfs_btree_create(size_tree, compare, size);
+	zfs_btree_create(size_tree, compare, bt_find, size);
 	mrap->mra_floor_shift = metaslab_by_size_min_shift;
 }
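The comparator rewrite above leans on the documented contract that TREE_CMP() yields exactly -1, 0, or +1: when cmp is nonzero, !cmp is 0 and the tiebreaker term vanishes, and when cmp is 0, !cmp is 1 and the start-offset comparison is returned, so the branch on likely(cmp) can be dropped without changing the ordering. A small standalone check of that identity (hypothetical test code, not part of the patch; TREE_CMP is restated here with the same contract as the avl.h macro):

#include <assert.h>
#include <stdint.h>

#define TREE_CMP(a, b)	(((a) > (b)) - ((a) < (b)))

static int
branchless(uint64_t sz1, uint64_t sz2, uint64_t st1, uint64_t st2)
{
	int cmp = TREE_CMP(sz1, sz2);
	return (cmp + !cmp * TREE_CMP(st1, st2));
}

static int
branchy(uint64_t sz1, uint64_t sz2, uint64_t st1, uint64_t st2)
{
	int cmp = TREE_CMP(sz1, sz2);
	if (cmp)
		return (cmp);
	return (TREE_CMP(st1, st2));
}

int
main(void)
{
	/* Sizes differ: the tiebreaker term is multiplied by zero. */
	assert(branchless(4, 8, 100, 0) == branchy(4, 8, 100, 0));
	/* Equal sizes: the start offsets break the tie. */
	assert(branchless(8, 8, 100, 50) == branchy(8, 8, 100, 50));
	assert(branchless(8, 8, 50, 50) == 0);
	return (0);
}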
diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c
index 894c30fcae16..5174e2c46633 100644
--- a/module/zfs/range_tree.c
+++ b/module/zfs/range_tree.c
@@ -151,6 +151,7 @@ range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs)
 	rt->rt_histogram[idx]--;
 }
 
+__attribute__((always_inline)) inline
 static int
 range_tree_seg32_compare(const void *x1, const void *x2)
 {
@@ -163,6 +164,7 @@ range_tree_seg32_compare(const void *x1, const void *x2)
 	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
 }
 
+__attribute__((always_inline)) inline
 static int
 range_tree_seg64_compare(const void *x1, const void *x2)
 {
@@ -175,6 +177,7 @@ range_tree_seg64_compare(const void *x1, const void *x2)
 	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
 }
 
+__attribute__((always_inline)) inline
 static int
 range_tree_seg_gap_compare(const void *x1, const void *x2)
 {
@@ -187,6 +190,15 @@ range_tree_seg_gap_compare(const void *x1, const void *x2)
 	return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start));
 }
 
+ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg32_find_in_buf, range_seg32_t,
+    range_tree_seg32_compare)
+
+ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg64_find_in_buf, range_seg64_t,
+    range_tree_seg64_compare)
+
+ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg_gap_find_in_buf, range_seg_gap_t,
+    range_tree_seg_gap_compare)
+
 range_tree_t *
 range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type,
     void *arg, uint64_t start, uint64_t shift, uint64_t gap)
@@ -197,23 +209,27 @@ range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type,
 	ASSERT3U(type, <=, RANGE_SEG_NUM_TYPES);
 	size_t size;
 	int (*compare) (const void *, const void *);
+	bt_find_in_buf_f bt_find;
 	switch (type) {
 	case RANGE_SEG32:
 		size = sizeof (range_seg32_t);
 		compare = range_tree_seg32_compare;
+		bt_find = range_tree_seg32_find_in_buf;
 		break;
 	case RANGE_SEG64:
 		size = sizeof (range_seg64_t);
 		compare = range_tree_seg64_compare;
+		bt_find = range_tree_seg64_find_in_buf;
 		break;
 	case RANGE_SEG_GAP:
 		size = sizeof (range_seg_gap_t);
 		compare = range_tree_seg_gap_compare;
+		bt_find = range_tree_seg_gap_find_in_buf;
 		break;
 	default:
 		panic("Invalid range seg type %d", type);
 	}
-	zfs_btree_create(&rt->rt_root, compare, size);
+	zfs_btree_create(&rt->rt_root, compare, bt_find, size);
 
 	rt->rt_ops = ops;
 	rt->rt_gap = gap;
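For intuition, the loop that ZFS_BTREE_FIND_IN_BUF_FUNC() expands to can be modeled on a plain sorted array: each iteration halves the window and advances the cursor by a computed amount instead of branching on the comparison result, which the x86 backend can lower to a conditional move (the behavior the cmov-converter workaround above is protecting). An illustrative, self-contained model follows; it uses a direct < in place of COMP and none of these names come from the patch:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static uint64_t *
shar_search(uint64_t *buf, uint32_t nelems, uint64_t value)
{
	uint64_t *i = buf;

	/* Halve the window each step; no hard-to-predict branch inside. */
	while (nelems > 1) {
		uint32_t half = nelems / 2;
		nelems -= half;
		i += (i[half - 1] < value) * half;
	}
	return (*i == value ? i : NULL);
}

int
main(void)
{
	uint64_t a[] = { 1, 3, 5, 7, 11, 13 };

	assert(shar_search(a, 6, 7) == &a[3]);
	assert(shar_search(a, 6, 13) == &a[5]);
	assert(shar_search(a, 6, 4) == NULL);
	return (0);
}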
diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
index d6ad8b2b8bc5..085d9cd8b4b6 100644
--- a/module/zfs/zap_micro.c
+++ b/module/zfs/zap_micro.c
@@ -285,6 +285,7 @@ zap_byteswap(void *buf, size_t size)
 	}
 }
 
+__attribute__((always_inline)) inline
 static int
 mze_compare(const void *arg1, const void *arg2)
 {
@@ -295,6 +296,9 @@ mze_compare(const void *arg1, const void *arg2)
 	    (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd));
 }
 
+ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t,
+    mze_compare)
+
 static void
 mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash)
 {
@@ -461,7 +465,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
 	 * 62 entries before we have to add 2KB B-tree core node.
 	 */
 	zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare,
-	    sizeof (mzap_ent_t), 512);
+	    mze_find_in_buf, sizeof (mzap_ent_t), 512);
 
 	zap_name_t *zn = zap_name_alloc(zap);
 	for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) {
diff --git a/tests/zfs-tests/cmd/btree_test.c b/tests/zfs-tests/cmd/btree_test.c
index 9a34bf559be0..fda9229915ce 100644
--- a/tests/zfs-tests/cmd/btree_test.c
+++ b/tests/zfs-tests/cmd/btree_test.c
@@ -501,7 +501,7 @@ main(int argc, char *argv[])
 	srandom(seed);
 
 	zfs_btree_init();
-	zfs_btree_create(&bt, zfs_btree_compare, sizeof (uint64_t));
+	zfs_btree_create(&bt, zfs_btree_compare, NULL, sizeof (uint64_t));
 
 	/*
 	 * This runs the named negative test. None of them should