diff --git a/cmd/arcstat/arcstat b/cmd/arcstat/arcstat index 66d7e3a8da7a..df2f62d6b29f 100755 --- a/cmd/arcstat/arcstat +++ b/cmd/arcstat/arcstat @@ -1,20 +1,25 @@ #!/usr/bin/env python3 # # Print out ZFS ARC Statistics exported via kstat(1) -# For a definition of fields, or usage, use arctstat.pl -v +# For a definition of fields, or usage, use arcstat -v # -# This script is a fork of the original arcstat.pl (0.1) by -# Neelakanth Nadgir, originally published on his Sun blog on +# This script was originally a fork of the original arcstat.pl (0.1) +# by Neelakanth Nadgir, originally published on his Sun blog on # 09/18/2007 # http://blogs.sun.com/realneel/entry/zfs_arc_statistics # -# This version aims to improve upon the original by adding features -# and fixing bugs as needed. This version is maintained by -# Mike Harsch and is hosted in a public open source repository: +# A new version aimed to improve upon the original by adding features +# and fixing bugs as needed. This version was maintained by Mike +# Harsch and was hosted in a public open source repository: # http://github.com/mharsch/arcstat # -# Comments, Questions, or Suggestions are always welcome. -# Contact the maintainer at ( mike at harschsystems dot com ) +# but has since moved to the illumos-gate repository. +# +# This Python port was written by John Hixson for FreeNAS, introduced +# in commit e2c29f: +# https://github.com/freenas/freenas +# +# and has been improved by many people since. # # CDDL HEADER START # @@ -60,7 +65,7 @@ cols = { "hits": [4, 1000, "ARC reads per second"], "miss": [4, 1000, "ARC misses per second"], "read": [4, 1000, "Total ARC accesses per second"], - "hit%": [4, 100, "ARC Hit percentage"], + "hit%": [4, 100, "ARC hit percentage"], "miss%": [5, 100, "ARC miss percentage"], "dhit": [4, 1000, "Demand hits per second"], "dmis": [4, 1000, "Demand misses per second"], @@ -75,12 +80,12 @@ cols = { "mread": [5, 1000, "Metadata accesses per second"], "mh%": [3, 100, "Metadata hit percentage"], "mm%": [3, 100, "Metadata miss percentage"], - "arcsz": [5, 1024, "ARC Size"], - "c": [4, 1024, "ARC Target Size"], - "mfu": [4, 1000, "MFU List hits per second"], - "mru": [4, 1000, "MRU List hits per second"], - "mfug": [4, 1000, "MFU Ghost List hits per second"], - "mrug": [4, 1000, "MRU Ghost List hits per second"], + "arcsz": [5, 1024, "ARC size"], + "c": [4, 1024, "ARC target size"], + "mfu": [4, 1000, "MFU list hits per second"], + "mru": [4, 1000, "MRU list hits per second"], + "mfug": [4, 1000, "MFU ghost list hits per second"], + "mrug": [4, 1000, "MRU ghost list hits per second"], "eskip": [5, 1000, "evict_skip per second"], "mtxmis": [6, 1000, "mutex_miss per second"], "dread": [5, 1000, "Demand accesses per second"], @@ -92,10 +97,10 @@ cols = { "l2miss%": [7, 100, "L2ARC access miss percentage"], "l2asize": [7, 1024, "Actual (compressed) size of the L2ARC"], "l2size": [6, 1024, "Size of the L2ARC"], - "l2bytes": [7, 1024, "bytes read per second from the L2ARC"], - "grow": [4, 1000, "ARC Grow disabled"], - "need": [4, 1024, "ARC Reclaim need"], - "free": [4, 1024, "ARC Free memory"], + "l2bytes": [7, 1024, "Bytes read per second from the L2ARC"], + "grow": [4, 1000, "ARC grow disabled"], + "need": [4, 1024, "ARC reclaim need"], + "free": [4, 1024, "ARC free memory"], } v = {} diff --git a/cmd/mount_zfs/Makefile.am b/cmd/mount_zfs/Makefile.am index 1ffeef7fe189..ddacf32c6dd5 100644 --- a/cmd/mount_zfs/Makefile.am +++ b/cmd/mount_zfs/Makefile.am @@ -14,4 +14,5 @@ mount_zfs_SOURCES = \ mount_zfs_LDADD = \ 
$(top_builddir)/lib/libnvpair/libnvpair.la \ + $(top_builddir)/lib/libzfs_core/libzfs_core.la \ $(top_builddir)/lib/libzfs/libzfs.la diff --git a/cmd/raidz_test/Makefile.am b/cmd/raidz_test/Makefile.am index c04d101b885b..0b173ed505fd 100644 --- a/cmd/raidz_test/Makefile.am +++ b/cmd/raidz_test/Makefile.am @@ -14,6 +14,7 @@ raidz_test_SOURCES = \ raidz_bench.c raidz_test_LDADD = \ + $(top_builddir)/lib/libnvpair/libnvpair.la \ $(top_builddir)/lib/libzpool/libzpool.la raidz_test_LDADD += -lm -ldl diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index f4b4b454b44d..00258799bb04 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -3493,12 +3493,13 @@ print_l2arc_log_blocks(void) static void dump_l2arc_log_entries(uint64_t log_entries, - l2arc_log_ent_phys_t *le, int i) + l2arc_log_ent_phys_t *le, uint64_t i) { for (int j = 0; j < log_entries; j++) { dva_t dva = le[j].le_dva; - (void) printf("lb[%4d]\tle[%4d]\tDVA asize: %llu, " - "vdev: %llu, offset: %llu\n", i, j + 1, + (void) printf("lb[%4llu]\tle[%4d]\tDVA asize: %llu, " + "vdev: %llu, offset: %llu\n", + (u_longlong_t)i, j + 1, (u_longlong_t)DVA_GET_ASIZE(&dva), (u_longlong_t)DVA_GET_VDEV(&dva), (u_longlong_t)DVA_GET_OFFSET(&dva)); @@ -3533,7 +3534,7 @@ dump_l2arc_log_blkptr(l2arc_log_blkptr_t lbps) (u_longlong_t)lbps.lbp_payload_start); (void) printf("|\t\tlsize: %llu\n", (u_longlong_t)L2BLK_GET_LSIZE((&lbps)->lbp_prop)); - (void) printf("|\t\tpsize: %llu\n", + (void) printf("|\t\tasize: %llu\n", (u_longlong_t)L2BLK_GET_PSIZE((&lbps)->lbp_prop)); (void) printf("|\t\tcompralgo: %llu\n", (u_longlong_t)L2BLK_GET_COMPRESS((&lbps)->lbp_prop)); @@ -3543,17 +3544,19 @@ dump_l2arc_log_blkptr(l2arc_log_blkptr_t lbps) } static void -dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr) +dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr, + l2arc_dev_hdr_phys_t *rebuild) { l2arc_log_blk_phys_t this_lb; - uint64_t psize; + uint64_t asize; l2arc_log_blkptr_t lbps[2]; abd_t *abd; zio_cksum_t cksum; - int i = 0, failed = 0; + int failed = 0; l2arc_dev_t dev; - print_l2arc_log_blocks(); + if (!dump_opt['q']) + print_l2arc_log_blocks(); bcopy((&l2dhdr)->dh_start_lbps, lbps, sizeof (lbps)); dev.l2ad_evict = l2dhdr.dh_evict; @@ -3562,8 +3565,10 @@ dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr) if (l2dhdr.dh_start_lbps[0].lbp_daddr == 0) { /* no log blocks to read */ - (void) printf("No log blocks to read\n"); - (void) printf("\n"); + if (!dump_opt['q']) { + (void) printf("No log blocks to read\n"); + (void) printf("\n"); + } return; } else { dev.l2ad_hand = lbps[0].lbp_daddr + @@ -3576,17 +3581,23 @@ dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr) if (!l2arc_log_blkptr_valid(&dev, &lbps[0])) break; - psize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); - if (pread64(fd, &this_lb, psize, lbps[0].lbp_daddr) != psize) { - (void) printf("Error while reading next log block\n\n"); + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); + if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) { + if (!dump_opt['q']) { + (void) printf("Error while reading next log " + "block\n\n"); + } break; } - fletcher_4_native_varsize(&this_lb, psize, &cksum); + fletcher_4_native_varsize(&this_lb, asize, &cksum); if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) { failed++; - (void) printf("Invalid cksum\n"); - dump_l2arc_log_blkptr(lbps[0]); + if (!dump_opt['q']) { + (void) printf("Invalid cksum\n"); + dump_l2arc_log_blkptr(lbps[0]); + } break; } @@ -3594,11 +3605,11 @@ dump_l2arc_log_blocks(int 
fd, l2arc_dev_hdr_phys_t l2dhdr) case ZIO_COMPRESS_OFF: break; case ZIO_COMPRESS_LZ4: - abd = abd_alloc_for_io(psize, B_TRUE); - abd_copy_from_buf_off(abd, &this_lb, 0, psize); + abd = abd_alloc_for_io(asize, B_TRUE); + abd_copy_from_buf_off(abd, &this_lb, 0, asize); zio_decompress_data(L2BLK_GET_COMPRESS( (&lbps[0])->lbp_prop), abd, &this_lb, - psize, sizeof (this_lb)); + asize, sizeof (this_lb)); abd_free(abd); break; default: @@ -3608,39 +3619,52 @@ dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr) if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) byteswap_uint64_array(&this_lb, sizeof (this_lb)); if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) { - (void) printf("Invalid log block magic\n\n"); + if (!dump_opt['q']) + (void) printf("Invalid log block magic\n\n"); break; } - i++; - if (dump_opt['l'] > 1) { - (void) printf("lb[%4d]\tmagic: %llu\n", i, + rebuild->dh_lb_count++; + rebuild->dh_lb_asize += asize; + if (dump_opt['l'] > 1 && !dump_opt['q']) { + (void) printf("lb[%4llu]\tmagic: %llu\n", + (u_longlong_t)rebuild->dh_lb_count, (u_longlong_t)this_lb.lb_magic); dump_l2arc_log_blkptr(lbps[0]); } - if (dump_opt['l'] > 2) - dump_l2arc_log_entries(l2dhdr.dh_log_blk_ent, - this_lb.lb_entries, i); + if (dump_opt['l'] > 2 && !dump_opt['q']) + dump_l2arc_log_entries(l2dhdr.dh_log_entries, + this_lb.lb_entries, + rebuild->dh_lb_count); - if (l2arc_range_check_overlap(lbps[1].lbp_daddr, - lbps[0].lbp_daddr, dev.l2ad_evict) && !dev.l2ad_first) + if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, + lbps[0].lbp_payload_start, dev.l2ad_evict) && + !dev.l2ad_first) break; lbps[0] = lbps[1]; lbps[1] = this_lb.lb_prev_lbp; } - (void) printf("log_blk_count:\t %d with valid cksum\n", i); - (void) printf("\t\t %d with invalid cksum\n\n", failed); + if (!dump_opt['q']) { + (void) printf("log_blk_count:\t %llu with valid cksum\n", + (u_longlong_t)rebuild->dh_lb_count); + (void) printf("\t\t %d with invalid cksum\n", failed); + (void) printf("log_blk_asize:\t %llu\n\n", + (u_longlong_t)rebuild->dh_lb_asize); + } } -static void +static int dump_l2arc_header(int fd) { - l2arc_dev_hdr_phys_t l2dhdr; + l2arc_dev_hdr_phys_t l2dhdr, rebuild; int error = B_FALSE; + bzero(&l2dhdr, sizeof (l2dhdr)); + bzero(&rebuild, sizeof (rebuild)); + if (pread64(fd, &l2dhdr, sizeof (l2dhdr), VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) { error = B_TRUE; @@ -3654,6 +3678,8 @@ dump_l2arc_header(int fd) if (error) { (void) printf("L2ARC device header not found\n\n"); + /* Do not return an error here for backward compatibility */ + return (0); } else if (!dump_opt['q']) { print_l2arc_header(); @@ -3672,16 +3698,39 @@ dump_l2arc_header(int fd) (u_longlong_t) l2dhdr.dh_start_lbps[1].lbp_daddr); (void) printf(" log_blk_ent: %llu\n", - (u_longlong_t)l2dhdr.dh_log_blk_ent); + (u_longlong_t)l2dhdr.dh_log_entries); (void) printf(" start: %llu\n", (u_longlong_t)l2dhdr.dh_start); (void) printf(" end: %llu\n", (u_longlong_t)l2dhdr.dh_end); - (void) printf(" evict: %llu\n\n", + (void) printf(" evict: %llu\n", (u_longlong_t)l2dhdr.dh_evict); - - dump_l2arc_log_blocks(fd, l2dhdr); + (void) printf(" lb_asize_refcount: %llu\n", + (u_longlong_t)l2dhdr.dh_lb_asize); + (void) printf(" lb_count_refcount: %llu\n\n", + (u_longlong_t)l2dhdr.dh_lb_count); } + + dump_l2arc_log_blocks(fd, l2dhdr, &rebuild); + /* + * The total aligned size of log blocks and the number of log blocks + * reported in the header of the device may be less than what zdb + * reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild(). 
+ * This happens because dump_l2arc_log_blocks() lacks the memory + * pressure valve that l2arc_rebuild() has. Thus, if we are on a system + * with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize + * and dh_lb_count will be lower to begin with than what exists on the + * device. This is normal and zdb should not exit with an error. The + * opposite case should never happen though, the values reported in the + * header should never be higher than what dump_l2arc_log_blocks() and + * l2arc_rebuild() report. If this happens there is a leak in the + * accounting of log blocks. + */ + if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize || + l2dhdr.dh_lb_count > rebuild.dh_lb_count) + return (1); + + return (0); } static void @@ -4009,7 +4058,7 @@ dump_label(const char *dev) * Dump the L2ARC header, if existent. */ if (read_l2arc_header) - dump_l2arc_header(fd); + error |= dump_l2arc_header(fd); cookie = NULL; while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL) diff --git a/cmd/zed/Makefile.am b/cmd/zed/Makefile.am index 40c0834af0a2..82b000ce162e 100644 --- a/cmd/zed/Makefile.am +++ b/cmd/zed/Makefile.am @@ -38,6 +38,7 @@ zed_SOURCES = $(ZED_SRC) $(FMA_SRC) zed_LDADD = \ $(top_builddir)/lib/libnvpair/libnvpair.la \ $(top_builddir)/lib/libuutil/libuutil.la \ + $(top_builddir)/lib/libzfs_core/libzfs_core.la \ $(top_builddir)/lib/libzfs/libzfs.la zed_LDADD += -lrt diff --git a/cmd/zinject/Makefile.am b/cmd/zinject/Makefile.am index 71b48255e66b..b056a6db545e 100644 --- a/cmd/zinject/Makefile.am +++ b/cmd/zinject/Makefile.am @@ -9,4 +9,5 @@ zinject_SOURCES = \ zinject_LDADD = \ $(top_builddir)/lib/libnvpair/libnvpair.la \ + $(top_builddir)/lib/libzfs_core/libzfs_core.la \ $(top_builddir)/lib/libzfs/libzfs.la diff --git a/cmd/zinject/translate.c b/cmd/zinject/translate.c index 8542d37c50bd..4939c0b85b5f 100644 --- a/cmd/zinject/translate.c +++ b/cmd/zinject/translate.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. */ #include @@ -388,7 +388,7 @@ translate_device(const char *pool, const char *device, err_type_t label_type, record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1; break; case TYPE_LABEL_PAD2: - record->zi_start = offsetof(vdev_label_t, vl_pad2); + record->zi_start = offsetof(vdev_label_t, vl_be); record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1; break; } diff --git a/cmd/zpool/Makefile.am b/cmd/zpool/Makefile.am index 7b25726f498e..5efa1318440d 100644 --- a/cmd/zpool/Makefile.am +++ b/cmd/zpool/Makefile.am @@ -22,6 +22,7 @@ endif zpool_LDADD = \ $(top_builddir)/lib/libnvpair/libnvpair.la \ $(top_builddir)/lib/libuutil/libuutil.la \ + $(top_builddir)/lib/libzfs_core/libzfs_core.la \ $(top_builddir)/lib/libzfs/libzfs.la if BUILD_FREEBSD diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index d62a6eb3c9a2..e154e40e482b 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -5148,22 +5148,48 @@ print_zpool_script_list(char *subcommand) /* * Set the minimum pool/vdev name column width. The width must be at least 10, * but may be as large as the column width - 42 so it still fits on one line. 
+ * NOTE: 42 is the width of the default capacity/operations/bandwidth output */ static int get_namewidth_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; - int width, columns; + int width, available_width; + /* + * get_namewidth() returns the maximum width of any name in that column + * for any pool/vdev/device line that will be output. + */ width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags, cb->cb_verbose); - columns = get_columns(); + /* + * The width we are calculating is the width of the header and also the + * padding width for names that are less than maximum width. The stats + * take up 42 characters, so the width available for names is: + */ + available_width = get_columns() - 42; + + /* + * If the maximum width fits on a screen, then great! Make everything + * line up by justifying all lines to the same width. If that max + * width is larger than what's available, the name plus stats won't fit + * on one line, and justifying to that width would cause every line to + * wrap on the screen. We only want lines with long names to wrap. + * Limit the padding to what won't wrap. + */ + if (width > available_width) + width = available_width; + + /* + * And regardless of whatever the screen width is (get_columns can + * return 0 if the width is not known or less than 42 for a narrow + * terminal) have the width be a minimum of 10. + */ if (width < 10) width = 10; - if (width > columns - 42) - width = columns - 42; + /* Save the calculated width */ cb->cb_namewidth = width; return (0); diff --git a/cmd/zstream/Makefile.am b/cmd/zstream/Makefile.am index 892e1583072c..ebc07d2eaa04 100644 --- a/cmd/zstream/Makefile.am +++ b/cmd/zstream/Makefile.am @@ -10,4 +10,5 @@ zstream_SOURCES = \ zstream_LDADD = \ $(top_builddir)/lib/libnvpair/libnvpair.la \ + $(top_builddir)/lib/libzfs_core/libzfs_core.la \ $(top_builddir)/lib/libzfs/libzfs.la diff --git a/configure.ac b/configure.ac index 902108f3649d..9f20b2bea1b7 100644 --- a/configure.ac +++ b/configure.ac @@ -92,6 +92,8 @@ AC_CONFIG_FILES([ contrib/dracut/90zfs/Makefile contrib/dracut/Makefile contrib/initramfs/Makefile + contrib/initramfs/conf.d/Makefile + contrib/initramfs/conf-hooks.d/Makefile contrib/initramfs/hooks/Makefile contrib/initramfs/scripts/Makefile contrib/initramfs/scripts/local-top/Makefile diff --git a/contrib/initramfs/Makefile.am b/contrib/initramfs/Makefile.am index 849b1d83cc5b..ee17de0965ca 100644 --- a/contrib/initramfs/Makefile.am +++ b/contrib/initramfs/Makefile.am @@ -1,23 +1,9 @@ initrddir = /usr/share/initramfs-tools -initrd_SCRIPTS = \ - conf.d/zfs conf-hooks.d/zfs hooks/zfs scripts/zfs scripts/local-top/zfs +dist_initrd_SCRIPTS = \ + zfsunlock -SUBDIRS = hooks scripts +SUBDIRS = conf.d conf-hooks.d hooks scripts EXTRA_DIST = \ - $(top_srcdir)/contrib/initramfs/conf.d/zfs \ - $(top_srcdir)/contrib/initramfs/conf-hooks.d/zfs \ $(top_srcdir)/contrib/initramfs/README.initramfs.markdown - -install-initrdSCRIPTS: $(EXTRA_DIST) - for d in conf.d conf-hooks.d scripts/local-top; do \ - $(MKDIR_P) $(DESTDIR)$(initrddir)/$$d; \ - cp $(top_srcdir)/contrib/initramfs/$$d/zfs \ - $(DESTDIR)$(initrddir)/$$d/; \ - done - for d in hooks scripts; do \ - $(MKDIR_P) $(DESTDIR)$(initrddir)/$$d; \ - cp $(top_builddir)/contrib/initramfs/$$d/zfs \ - $(DESTDIR)$(initrddir)/$$d/; \ - done diff --git a/contrib/initramfs/README.initramfs.markdown b/contrib/initramfs/README.initramfs.markdown index fa19f001af9c..ddae71a2e295 100644 --- a/contrib/initramfs/README.initramfs.markdown +++ 
b/contrib/initramfs/README.initramfs.markdown @@ -1,94 +1,86 @@ -DESCRIPTION - These scripts are intended to be used with initramfs-tools, which is a similar - software product to "dracut" (which is used in RedHat based distributions), - and is mainly used by Debian GNU/Linux and derivatives to create an initramfs - so that the system can be booted off a ZFS filesystem. If you have no need or - interest in this, then it can safely be ignored. - - These script were written with the primary intention of being portable and - usable on as many systems as possible. - - This is, in practice, usually not possible. But the intention is there. - And it is a good one. - - They have been tested successfully on: - - * Debian GNU/Linux Wheezy - * Debian GNU/Linux Jessie - - It uses some functionality common with the SYSV init scripts, primarily - the "/etc/zfs/zfs-functions" script. - -FUNCTIONALITY - * Supports booting of a ZFS snapshot. - Do this by cloning the snapshot into a dataset. If this, the resulting - dataset, already exists, destroy it. Then mount it as the root filesystem. - * If snapshot does not exist, use base dataset (the part before '@') - as boot filesystem instead. - * Clone with 'mountpoint=none' and 'canmount=noauto' - we mount manually - and explicitly. - * Allow rollback of snapshots instead of clone it and boot from the clone. - * If no snapshot is specified on the 'root=' kernel command line, but - there is an '@', then get a list of snapshots below that filesystem - and ask the user which to use. - - * Support all currently used kernel command line arguments - * Core options: - All the different distributions have their own standard on what to specify - on the kernel command line to boot of a ZFS filesystem. - - Supports the following kernel command line argument combinations - (in this order - first match win): - * rpool= (tries to finds bootfs automatically) - * bootfs=/ (uses this for rpool - first part) - * rpool= bootfs=/ - * -B zfs-bootfs=/ (uses this for rpool - first part) - * rpool=rpool (default if none of the above is used) - * root=/ (uses this for rpool - first part) - * root=ZFS=/ (uses this for rpool - first part, without 'ZFS=') - * root=zfs:AUTO (tries to detect both pool and rootfs - * root=zfs:/ (uses this for rpool - first part, without 'zfs:') - - Option could also be - * Extra (control) options: - * zfsdebug=(on,yes,1) Show extra debugging information - * zfsforce=(on,yes,1) Force import the pool - * rollback=(on,yes,1) Rollback (instead of clone) the snapshot - - * 'Smarter' way to import pools. Don't just try cache file or /dev. - * Try to use /dev/disk/by-vdev (if /etc/zfs/vdev_id.conf exists), - * Try /dev/mapper (to be able to use LUKS backed pools as well as - multi-path devices). - * /dev/disk/by-id and any other /dev/disk/by-* directory that may exist. - * Use /dev as a last ditch attempt. - * Fallback to using the cache file if that exist if nothing else worked. - * Only try to import pool if it haven't already been imported - * This will negate the need to force import a pool that have not been - exported cleanly. - * Support exclusion of pools to import by setting ZFS_POOL_EXCEPTIONS - in /etc/default/zfs. - - Controlling in which order devices is searched for is controlled by - ZPOOL_IMPORT_PATH variable set in /etc/defaults/zfs. - - * Support additional configuration variable ZFS_INITRD_ADDITIONAL_DATASETS - to mount additional filesystems not located under your root dataset. 
- - For example, if the root fs is specified as 'rpool/ROOT/rootfs', it will - automatically and without specific configuration mount any filesystems - below this on the mount point specified in the 'mountpoint' property. - Such as 'rpool/root/rootfs/var', 'rpool/root/rootfs/usr' etc) - - However, if one prefer to have separate filesystems, not located below - the root fs (such as 'rpool/var', 'rpool/ROOT/opt' etc), special - configuration needs to be done. This is what the variable, set in - /etc/defaults/zfs file, needs to be configured. The 'mountpoint' - property needs to be correct for this to work though. - - * Allows mounting a rootfs with mountpoint=legacy set. - - * Include /etc/modprobe.d/{zfs,spl}.conf in the initrd if it/they exist. - - * Include the udev rule to use by-vdev for pool imports. - - * Include the /etc/default/zfs file to the initrd. +## Description + +These scripts are intended to be used with `initramfs-tools`, which is a +similar software product to `dracut` (which is used in Red Hat based +distributions), and is mainly used by Debian GNU/Linux and derivatives. + +These scripts share some common functionality with the SysV init scripts, +primarily the `/etc/zfs/zfs-functions` script. + +## Configuration + +### Root pool/filesystem + +Different distributions have their own standard on what to specify on the +kernel command line to boot off a ZFS filesystem. + +This script supports the following kernel command line argument combinations +(in this order - first match wins): + +* `rpool=<pool>` +* `bootfs=<pool>/<dataset>` +* `rpool=<pool> bootfs=<pool>/<dataset>` +* `-B zfs-bootfs=<pool>/<dataset>` +* `root=<pool>/<dataset>` +* `root=ZFS=<pool>/<dataset>` +* `root=zfs:AUTO` +* `root=zfs:<pool>/<dataset>` +* `rpool=rpool` + +If a pool is specified, it will be used. Otherwise, in `AUTO` mode, all pools +will be searched. Pools may be excluded from the search by listing them in +`ZFS_POOL_EXCEPTIONS` in `/etc/default/zfs`. + +Pools will be imported as follows: + +* Try `/dev/disk/by-vdev` if it exists; see `/etc/zfs/vdev_id.conf`. +* Try `/dev/disk/by-id` and any other `/dev/disk/by-*` directories. +* Try `/dev`. +* Use the cache file if nothing else worked. + +This order may be modified by setting `ZPOOL_IMPORT_PATH` in +`/etc/default/zfs`. + +If a dataset is specified, it will be used as the root filesystem. Otherwise, +this script will attempt to find a root filesystem automatically (in the +specified pool or all pools, as described above). + +Filesystems below the root filesystem will be automatically mounted with no +additional configuration necessary. For example, if the root filesystem is +`rpool/ROOT/rootfs`, `rpool/root/rootfs/var`, `rpool/root/rootfs/usr`, etc. +will be mounted (if they exist). Additional filesystems (that are not located +under the root filesystem) can be mounted by listing them in +`ZFS_INITRD_ADDITIONAL_DATASETS` in `/etc/default/zfs`. + +### Snapshots + +The `<dataset>` can be a snapshot. In this case, the snapshot will be cloned +and the clone used as the root filesystem. Note: + +* If the snapshot does not exist, the base dataset (the part before `@`) is + used as the boot filesystem instead. +* If the resulting clone dataset already exists, it is destroyed. +* The clone is created with `mountpoint=none` and `canmount=noauto`. The root + filesystem is mounted manually by the initramfs script. +* If no snapshot is specified on the `root=` kernel command line, but + there is an `@`, the user will be prompted to choose a snapshot to use.
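As a concrete illustration of the snapshot boot described above (a minimal sketch; the pool, dataset, and snapshot names are hypothetical), the `root=` argument for booting from a snapshot might look like this (`rollback` is described under Extra options below):

```sh
# Clone rpool/ROOT/debian@pristine and mount the clone as the root filesystem
root=ZFS=rpool/ROOT/debian@pristine

# Roll the dataset back to the snapshot instead of cloning it (destructive)
root=ZFS=rpool/ROOT/debian@pristine rollback=1
```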
+ +### Extra options + +The following kernel command line arguments are supported: + +* `zfsdebug=(on,yes,1)`: Show extra debugging information +* `zfsforce=(on,yes,1)`: Force import the pool +* `rollback=(on,yes,1)`: Rollback to (instead of clone) the snapshot + +### Unlocking a ZFS encrypted root over SSH + +To use this feature: + +1. Install the `dropbear-initramfs` package. You may wish to uninstall the + `cryptsetup-initramfs` package to avoid warnings. +2. Add your SSH key(s) to `/etc/dropbear-initramfs/authorized_keys`. Note + that Dropbear does not support ed25519 keys; use RSA (2048-bit or more) + instead. +3. Rebuild the initramfs with your keys: `update-initramfs -u` +4. During the system boot, login via SSH and run: `zfsunlock` diff --git a/contrib/initramfs/conf-hooks.d/Makefile.am b/contrib/initramfs/conf-hooks.d/Makefile.am new file mode 100644 index 000000000000..f84ba5cc7e37 --- /dev/null +++ b/contrib/initramfs/conf-hooks.d/Makefile.am @@ -0,0 +1,4 @@ +confhooksddir = /usr/share/initramfs-tools/conf-hooks.d + +dist_confhooksd_DATA = \ + zfs diff --git a/contrib/initramfs/conf.d/Makefile.am b/contrib/initramfs/conf.d/Makefile.am new file mode 100644 index 000000000000..5ef27e0aa1ce --- /dev/null +++ b/contrib/initramfs/conf.d/Makefile.am @@ -0,0 +1,4 @@ +confddir = /usr/share/initramfs-tools/conf.d + +dist_confd_DATA = \ + zfs diff --git a/contrib/initramfs/hooks/.gitignore b/contrib/initramfs/hooks/.gitignore index 73304bc2cd4a..4e1604e18899 100644 --- a/contrib/initramfs/hooks/.gitignore +++ b/contrib/initramfs/hooks/.gitignore @@ -1 +1,2 @@ zfs +zfsunlock diff --git a/contrib/initramfs/hooks/Makefile.am b/contrib/initramfs/hooks/Makefile.am index 3d8ef627ed47..9b20c080a655 100644 --- a/contrib/initramfs/hooks/Makefile.am +++ b/contrib/initramfs/hooks/Makefile.am @@ -1,23 +1,20 @@ hooksdir = /usr/share/initramfs-tools/hooks hooks_SCRIPTS = \ - zfs + zfs \ + zfsunlock EXTRA_DIST = \ - $(top_srcdir)/contrib/initramfs/hooks/zfs.in + $(top_srcdir)/contrib/initramfs/hooks/zfs.in \ + $(top_srcdir)/contrib/initramfs/hooks/zfsunlock.in -$(hooks_SCRIPTS):%:%.in +$(hooks_SCRIPTS):%:%.in Makefile -$(SED) -e 's,@sbindir\@,$(sbindir),g' \ -e 's,@sysconfdir\@,$(sysconfdir),g' \ -e 's,@udevdir\@,$(udevdir),g' \ -e 's,@udevruledir\@,$(udevruledir),g' \ -e 's,@mounthelperdir\@,$(mounthelperdir),g' \ + -e 's,@DEFAULT_INITCONF_DIR\@,$(DEFAULT_INITCONF_DIR),g' \ $< >'$@' -# Double-colon rules are allowed; there are multiple independent definitions. -clean-local:: - -$(RM) $(hooks_SCRIPTS) - -# Double-colon rules are allowed; there are multiple independent definitions. -distclean-local:: - -$(RM) $(hooks_SCRIPTS) +CLEANFILES = $(hooks_SCRIPTS) diff --git a/contrib/initramfs/hooks/zfs.in b/contrib/initramfs/hooks/zfs.in index 15f23c908b23..ff7e49f12537 100755 --- a/contrib/initramfs/hooks/zfs.in +++ b/contrib/initramfs/hooks/zfs.in @@ -21,6 +21,7 @@ COPY_FILE_LIST="$COPY_FILE_LIST @udevruledir@/69-vdev.rules" # These prerequisites are provided by the base system. COPY_EXEC_LIST="$COPY_EXEC_LIST /usr/bin/dirname /bin/hostname /sbin/blkid" COPY_EXEC_LIST="$COPY_EXEC_LIST /usr/bin/env" +COPY_EXEC_LIST="$COPY_EXEC_LIST $(which systemd-ask-password)" # Explicitly specify all kernel modules because automatic dependency resolution # is unreliable on many systems. 
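The README above describes unlocking an encrypted root over SSH; here is a minimal usage sketch of that flow, assuming the machine is reachable as `root@server` while it waits in the initramfs (the hostname is a placeholder):

```sh
# From another machine, once Dropbear is listening in the initramfs:
ssh root@server   # authenticates with a key from authorized_keys (step 2)
zfsunlock         # prompts for the passphrase; boot resumes on success
```

The `zfsunlock` helper installed by the hook below feeds the entered passphrase to `zfs load-key`, kills the console password prompt, and signals the boot scripts once every encryption root is unlocked.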
diff --git a/contrib/initramfs/hooks/zfsunlock.in b/contrib/initramfs/hooks/zfsunlock.in new file mode 100644 index 000000000000..c8ae86363981 --- /dev/null +++ b/contrib/initramfs/hooks/zfsunlock.in @@ -0,0 +1,18 @@ +#!/bin/sh + +PREREQ="dropbear" + +prereqs() { + echo "$PREREQ" +} + +case "$1" in + prereqs) + prereqs + exit 0 + ;; +esac + +. /usr/share/initramfs-tools/hook-functions + +copy_exec /usr/share/initramfs-tools/zfsunlock /usr/bin diff --git a/contrib/initramfs/scripts/local-top/Makefile.am b/contrib/initramfs/scripts/local-top/Makefile.am index c820325947b0..1523a907c860 100644 --- a/contrib/initramfs/scripts/local-top/Makefile.am +++ b/contrib/initramfs/scripts/local-top/Makefile.am @@ -1,3 +1,4 @@ localtopdir = /usr/share/initramfs-tools/scripts/local-top -EXTRA_DIST = zfs +dist_localtop_SCRIPTS = \ + zfs diff --git a/contrib/initramfs/scripts/zfs b/contrib/initramfs/scripts/zfs index dbc4e253f113..a795fd39f605 100644 --- a/contrib/initramfs/scripts/zfs +++ b/contrib/initramfs/scripts/zfs @@ -405,6 +405,8 @@ decrypt_fs() ENCRYPTIONROOT="$(get_fs_value "${fs}" encryptionroot)" KEYLOCATION="$(get_fs_value "${ENCRYPTIONROOT}" keylocation)" + echo "${ENCRYPTIONROOT}" > /run/zfs_fs_name + # If root dataset is encrypted... if ! [ "${ENCRYPTIONROOT}" = "-" ]; then KEYSTATUS="$(get_fs_value "${ENCRYPTIONROOT}" keystatus)" @@ -418,6 +420,7 @@ decrypt_fs() # Prompt with plymouth, if active elif [ -e /bin/plymouth ] && /bin/plymouth --ping 2>/dev/null; then + echo "plymouth" > /run/zfs_console_askpwd_cmd while [ $TRY_COUNT -gt 0 ]; do plymouth ask-for-password --prompt "Encrypted ZFS password for ${ENCRYPTIONROOT}" | \ $ZFS load-key "${ENCRYPTIONROOT}" && break @@ -426,6 +429,7 @@ decrypt_fs() # Prompt with systemd, if active elif [ -e /run/systemd/system ]; then + echo "systemd-ask-password" > /run/zfs_console_askpwd_cmd while [ $TRY_COUNT -gt 0 ]; do systemd-ask-password "Encrypted ZFS password for ${ENCRYPTIONROOT}" --no-tty | \ $ZFS load-key "${ENCRYPTIONROOT}" && break @@ -434,7 +438,8 @@ decrypt_fs() # Prompt with ZFS tty, otherwise else - # Setting "printk" temporarily to "7" will allow prompt even if kernel option "quiet" + # Temporarily setting "printk" to "7" allows the prompt to appear even when the "quiet" kernel option has been used + echo "load-key" > /run/zfs_console_askpwd_cmd storeprintk="$(awk '{print $1}' /proc/sys/kernel/printk)" echo 7 > /proc/sys/kernel/printk $ZFS load-key "${ENCRYPTIONROOT}" @@ -964,6 +969,11 @@ mountroot() mount_fs "$fs" done + touch /run/zfs_unlock_complete + if [ -e /run/zfs_unlock_complete_notify ]; then + read zfs_unlock_complete_notify < /run/zfs_unlock_complete_notify + fi + # ------------ # Debugging information if [ -n "${ZFS_DEBUG}" ] diff --git a/contrib/initramfs/zfsunlock b/contrib/initramfs/zfsunlock new file mode 100755 index 000000000000..f6b6b9dbe14d --- /dev/null +++ b/contrib/initramfs/zfsunlock @@ -0,0 +1,42 @@ +#!/bin/sh + +set -eu +if [ ! -e /run/zfs_fs_name ]; then + echo "Wait for the root pool to be imported or press Ctrl-C to exit." +fi +while [ ! -e /run/zfs_fs_name ]; do + if [ -e /run/zfs_unlock_complete ]; then + exit 0 + fi + sleep 0.5 +done +echo +echo "Unlocking encrypted ZFS filesystems..." +echo "Enter the password or press Ctrl-C to exit." +echo +zfs_fs_name="" +if [ ! -e /run/zfs_unlock_complete_notify ]; then + mkfifo /run/zfs_unlock_complete_notify +fi +while [ ! 
-e /run/zfs_unlock_complete ]; do + zfs_fs_name=$(cat /run/zfs_fs_name) + zfs_console_askpwd_cmd=$(cat /run/zfs_console_askpwd_cmd) + systemd-ask-password "Encrypted ZFS password for ${zfs_fs_name}:" | \ + /sbin/zfs load-key "$zfs_fs_name" || true + if [ "$(/sbin/zfs get -H -ovalue keystatus "$zfs_fs_name" 2> /dev/null)" = "available" ]; then + echo "Password for $zfs_fs_name accepted." + zfs_console_askpwd_pid=$(ps | awk '!'"/awk/ && /$zfs_console_askpwd_cmd/ { print \$1; exit }") + if [ -n "$zfs_console_askpwd_pid" ]; then + kill "$zfs_console_askpwd_pid" + fi + # Wait for another filesystem to unlock. + while [ "$(cat /run/zfs_fs_name)" = "$zfs_fs_name" ] && [ ! -e /run/zfs_unlock_complete ]; do + sleep 0.5 + done + else + echo "Wrong password. Try again." + fi +done +echo "Unlocking complete. Resuming boot sequence..." +echo "Please reconnect in a while." +echo "ok" > /run/zfs_unlock_complete_notify diff --git a/include/libzfs.h b/include/libzfs.h index 404ff85015d1..caae1115fc38 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -869,6 +869,8 @@ extern int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **, * Label manipulation. */ extern int zpool_clear_label(int); +extern int zpool_set_bootenv(zpool_handle_t *, const char *); +extern int zpool_get_bootenv(zpool_handle_t *, char *, size_t, off_t); /* * Management interfaces for SMB ACL files diff --git a/include/libzfs_core.h b/include/libzfs_core.h index 18ce6994a0ab..e69fe32cd0a1 100644 --- a/include/libzfs_core.h +++ b/include/libzfs_core.h @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2017 Datto Inc. * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. @@ -135,6 +135,8 @@ int lzc_wait(const char *, zpool_wait_activity_t, boolean_t *); int lzc_wait_tag(const char *, zpool_wait_activity_t, uint64_t, boolean_t *); int lzc_wait_fs(const char *, zfs_wait_activity_t, boolean_t *); +int lzc_set_bootenv(const char *, const char *); +int lzc_get_bootenv(const char *, nvlist_t **); #ifdef __cplusplus } #endif diff --git a/include/os/freebsd/spl/sys/proc.h b/include/os/freebsd/spl/sys/proc.h index 07201dd6a095..fdb2126d6a3c 100644 --- a/include/os/freebsd/spl/sys/proc.h +++ b/include/os/freebsd/spl/sys/proc.h @@ -63,28 +63,12 @@ typedef struct proc proc_t; extern struct proc *zfsproc; -struct thread_wrap { - void *tw_arg; - void (*tw_proc)(void*); -}; - -static __inline void -solthread_wrapper(void *arg) -{ - struct thread_wrap *tw = arg; - - tw->tw_proc(tw->tw_arg); - free(tw, M_SOLARIS); - kthread_exit(); -} - static __inline kthread_t * do_thread_create(caddr_t stk, size_t stksize, void (*proc)(void *), void *arg, size_t len, proc_t *pp, int state, pri_t pri) { kthread_t *td = NULL; int error; - struct thread_wrap *tw; /* * Be sure there are no surprises. 
@@ -92,11 +76,8 @@ do_thread_create(caddr_t stk, size_t stksize, void (*proc)(void *), void *arg, ASSERT(stk == NULL); ASSERT(len == 0); ASSERT(state == TS_RUN); - tw = malloc(sizeof (*tw), M_SOLARIS, M_WAITOK); - tw->tw_proc = proc; - tw->tw_arg = arg; - error = kproc_kthread_add(solthread_wrapper, tw, &zfsproc, &td, + error = kproc_kthread_add(proc, arg, &zfsproc, &td, RFSTOPPED, stksize / PAGE_SIZE, "zfskern", "solthread %p", proc); if (error == 0) { thread_lock(td); @@ -105,8 +86,6 @@ do_thread_create(caddr_t stk, size_t stksize, void (*proc)(void *), void *arg, #if __FreeBSD_version < 1300068 thread_unlock(td); #endif - } else { - free(tw, M_SOLARIS); } return (td); } diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index 82165170a386..17d78658c653 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -2,6 +2,7 @@ SUBDIRS = fm fs crypto lua sysevent COMMON_H = \ $(top_srcdir)/include/sys/abd.h \ + $(top_srcdir)/include/sys/abd_impl.h \ $(top_srcdir)/include/sys/aggsum.h \ $(top_srcdir)/include/sys/arc.h \ $(top_srcdir)/include/sys/arc_impl.h \ diff --git a/include/sys/abd.h b/include/sys/abd.h index 82b73589bbef..df4234f3c794 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -35,56 +35,14 @@ extern "C" { #endif -typedef enum abd_flags { - ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */ - ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */ - ABD_FLAG_META = 1 << 2, /* does this represent FS metadata? */ - ABD_FLAG_MULTI_ZONE = 1 << 3, /* pages split over memory zones */ - ABD_FLAG_MULTI_CHUNK = 1 << 4, /* pages split over multiple chunks */ - ABD_FLAG_LINEAR_PAGE = 1 << 5, /* linear but allocd from page */ -} abd_flags_t; - -typedef struct abd { - abd_flags_t abd_flags; - uint_t abd_size; /* excludes scattered abd_offset */ - struct abd *abd_parent; - zfs_refcount_t abd_children; - union { - struct abd_scatter { - uint_t abd_offset; -#if defined(__FreeBSD__) && defined(_KERNEL) - uint_t abd_chunk_size; - void *abd_chunks[]; -#else - uint_t abd_nents; - struct scatterlist *abd_sgl; -#endif - } abd_scatter; - struct abd_linear { - void *abd_buf; - struct scatterlist *abd_sgl; /* for LINEAR_PAGE */ - } abd_linear; - } abd_u; -} abd_t; +struct abd; /* forward declaration */ +typedef struct abd abd_t; typedef int abd_iter_func_t(void *buf, size_t len, void *private); typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *private); extern int zfs_abd_scatter_enabled; -static inline boolean_t -abd_is_linear(abd_t *abd) -{ - return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0 ? B_TRUE : B_FALSE); -} - -static inline boolean_t -abd_is_linear_page(abd_t *abd) -{ - return ((abd->abd_flags & ABD_FLAG_LINEAR_PAGE) != 0 ? 
- B_TRUE : B_FALSE); -} - /* * Allocations and deallocations */ @@ -124,12 +82,8 @@ void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); int abd_cmp(abd_t *, abd_t *); int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t); void abd_zero_off(abd_t *, size_t, size_t); - -#if defined(_KERNEL) -unsigned int abd_scatter_bio_map_off(struct bio *, abd_t *, unsigned int, - size_t); -unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t); -#endif +void abd_verify(abd_t *); +uint_t abd_get_size(abd_t *); void abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, ssize_t csize, ssize_t dsize, const unsigned parity, @@ -174,13 +128,29 @@ abd_zero(abd_t *abd, size_t size) abd_zero_off(abd, 0, size); } +/* + * ABD type check functions + */ +boolean_t abd_is_linear(abd_t *); +boolean_t abd_is_linear_page(abd_t *); + /* * Module lifecycle + * Defined in each specific OS's abd_os.c */ void abd_init(void); void abd_fini(void); +/* + * Linux ABD bio functions + */ +#if defined(__linux__) && defined(_KERNEL) +unsigned int abd_scatter_bio_map_off(struct bio *, abd_t *, unsigned int, + size_t); +unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t); +#endif + #ifdef __cplusplus } #endif diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h new file mode 100644 index 000000000000..5aee772b1e04 --- /dev/null +++ b/include/sys/abd_impl.h @@ -0,0 +1,137 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + */ + +#ifndef _ABD_IMPL_H +#define _ABD_IMPL_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum abd_flags { + ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */ + ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */ + ABD_FLAG_META = 1 << 2, /* does this represent FS metadata? 
*/ + ABD_FLAG_MULTI_ZONE = 1 << 3, /* pages split over memory zones */ + ABD_FLAG_MULTI_CHUNK = 1 << 4, /* pages split over multiple chunks */ + ABD_FLAG_LINEAR_PAGE = 1 << 5, /* linear but allocd from page */ +} abd_flags_t; + +typedef enum abd_stats_op { + ABDSTAT_INCR, /* Increase abdstat values */ + ABDSTAT_DECR /* Decrease abdstat values */ +} abd_stats_op_t; + +struct abd { + abd_flags_t abd_flags; + uint_t abd_size; /* excludes scattered abd_offset */ + struct abd *abd_parent; + zfs_refcount_t abd_children; + union { + struct abd_scatter { + uint_t abd_offset; +#if defined(__FreeBSD__) && defined(_KERNEL) + uint_t abd_chunk_size; + void *abd_chunks[]; +#else + uint_t abd_nents; + struct scatterlist *abd_sgl; +#endif + } abd_scatter; + struct abd_linear { + void *abd_buf; + struct scatterlist *abd_sgl; /* for LINEAR_PAGE */ + } abd_linear; + } abd_u; +}; + +struct scatterlist; /* forward declaration */ + +struct abd_iter { + /* public interface */ + void *iter_mapaddr; /* addr corresponding to iter_pos */ + size_t iter_mapsize; /* length of data valid at mapaddr */ + + /* private */ + abd_t *iter_abd; /* ABD being iterated through */ + size_t iter_pos; + size_t iter_offset; /* offset in current sg/abd_buf, */ + /* abd_offset included */ + struct scatterlist *iter_sg; /* current sg */ +}; + +/* + * OS specific functions + */ + +abd_t *abd_alloc_struct(size_t); +abd_t *abd_get_offset_scatter(abd_t *, size_t); +void abd_free_struct(abd_t *); +void abd_alloc_chunks(abd_t *, size_t); +void abd_free_chunks(abd_t *); +boolean_t abd_size_alloc_linear(size_t); +void abd_update_scatter_stats(abd_t *, abd_stats_op_t); +void abd_update_linear_stats(abd_t *, abd_stats_op_t); +void abd_verify_scatter(abd_t *); +void abd_free_linear_page(abd_t *); +/* OS specific abd_iter functions */ +void abd_iter_init(struct abd_iter *, abd_t *); +boolean_t abd_iter_at_end(struct abd_iter *); +void abd_iter_advance(struct abd_iter *, size_t); +void abd_iter_map(struct abd_iter *); +void abd_iter_unmap(struct abd_iter *); + +/* + * Helper macros + */ +#define ABDSTAT(stat) (abd_stats.stat.value.ui64) +#define ABDSTAT_INCR(stat, val) \ + atomic_add_64(&abd_stats.stat.value.ui64, (val)) +#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) +#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) + +#define ABD_SCATTER(abd) (abd->abd_u.abd_scatter) +#define ABD_LINEAR_BUF(abd) (abd->abd_u.abd_linear.abd_buf) + +#if defined(_KERNEL) +#if defined(__FreeBSD__) +#define abd_enter_critical(flags) critical_enter() +#define abd_exit_critical(flags) critical_exit() +#else +#define abd_enter_critical(flags) local_irq_save(flags) +#define abd_exit_critical(flags) local_irq_restore(flags) +#endif +#else /* !_KERNEL */ +#define abd_enter_critical(flags) ((void)0) +#define abd_exit_critical(flags) ((void)0) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _ABD_IMPL_H */ diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 928b7232556f..e8c944ce8369 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -200,7 +200,7 @@ typedef struct l2arc_log_blkptr { /* * lbp_prop has the following format: * * logical size (in bytes) - * * physical (compressed) size (in bytes) + * * aligned (after compression) size (in bytes) * * compression algorithm (we always LZ4-compress l2arc logs) * * checksum algorithm (used for lbp_cksum) */ @@ -221,22 +221,26 @@ typedef struct l2arc_dev_hdr_phys { */ uint64_t dh_spa_guid; uint64_t dh_vdev_guid; - uint64_t dh_log_blk_ent; /* entries per log blk */ + uint64_t dh_log_entries; /* 
mirror of l2ad_log_entries */ uint64_t dh_evict; /* evicted offset in bytes */ uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */ /* * Used in zdb.c for determining if a log block is valid, in the same * way that l2arc_rebuild() does. */ - uint64_t dh_start; - uint64_t dh_end; - + uint64_t dh_start; /* mirror of l2ad_start */ + uint64_t dh_end; /* mirror of l2ad_end */ /* * Start of log block chain. [0] -> newest log, [1] -> one older (used * for initiating prefetch). */ l2arc_log_blkptr_t dh_start_lbps[2]; - const uint64_t dh_pad[34]; /* pad to 512 bytes */ + /* + * Aligned size of all log blocks as accounted by vdev_space_update(). + */ + uint64_t dh_lb_asize; /* mirror of l2ad_lb_asize */ + uint64_t dh_lb_count; /* mirror of l2ad_lb_count */ + const uint64_t dh_pad[32]; /* pad to 512 bytes */ zio_eck_t dh_tail; } l2arc_dev_hdr_phys_t; CTASSERT_GLOBAL(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE); @@ -387,6 +391,14 @@ typedef struct l2arc_dev { uint64_t l2ad_evict; /* evicted offset in bytes */ /* List of pointers to log blocks present in the L2ARC device */ list_t l2ad_lbptr_list; + /* + * Aligned size of all log blocks as accounted by vdev_space_update(). + */ + zfs_refcount_t l2ad_lb_asize; + /* + * Number of log blocks present on the device. + */ + zfs_refcount_t l2ad_lb_count; } l2arc_dev_t; /* @@ -738,14 +750,18 @@ typedef struct arc_stats { */ kstat_named_t arcstat_l2_log_blk_writes; /* - * Moving average of the physical size of the L2ARC log blocks, in + * Moving average of the aligned size of the L2ARC log blocks, in * bytes. Updated during L2ARC rebuild and during writing of L2ARC * log blocks. */ - kstat_named_t arcstat_l2_log_blk_avg_size; + kstat_named_t arcstat_l2_log_blk_avg_asize; + /* Aligned size of L2ARC log blocks on L2ARC devices. */ + kstat_named_t arcstat_l2_log_blk_asize; + /* Number of L2ARC log blocks present on L2ARC devices. */ + kstat_named_t arcstat_l2_log_blk_count; /* - * Moving average of the physical size of L2ARC restored data, in bytes, - * to the physical size of their metadata in ARC, in bytes. + * Moving average of the aligned size of L2ARC restored data, in bytes, + * to the aligned size of their metadata in L2ARC, in bytes. * Updated during L2ARC rebuild and during writing of L2ARC log blocks. */ kstat_named_t arcstat_l2_data_to_meta_ratio; @@ -780,6 +796,8 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_rebuild_abort_lowmem; /* Logical size of L2ARC restored data, in bytes. */ kstat_named_t arcstat_l2_rebuild_size; + /* Aligned size of L2ARC restored data, in bytes. */ + kstat_named_t arcstat_l2_rebuild_asize; /* * Number of L2ARC log entries (buffers) that were successfully * restored in ARC. @@ -790,8 +808,6 @@ typedef struct arc_stats { * were not restored again. */ kstat_named_t arcstat_l2_rebuild_bufs_precached; - /* Physical size of L2ARC restored data, in bytes. */ - kstat_named_t arcstat_l2_rebuild_psize; /* * Number of L2ARC log blocks that were restored successfully. Each * log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers. diff --git a/include/sys/dmu_recv.h b/include/sys/dmu_recv.h index c0a562115a20..ed041ba70b73 100644 --- a/include/sys/dmu_recv.h +++ b/include/sys/dmu_recv.h @@ -73,7 +73,6 @@ typedef struct dmu_recv_cookie { struct receive_record_arg *drc_next_rrd; zio_cksum_t drc_cksum; zio_cksum_t drc_prev_cksum; - int drc_err; /* Sorted list of objects not to issue prefetches for. 
*/ objlist_t *drc_ignore_objlist; } dmu_recv_cookie_t; diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index f5aced0da7cd..ecdfd42d01a8 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013, 2017 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] @@ -1153,9 +1153,6 @@ typedef struct ddt_histogram { #define ZFS_SUPER_MAGIC 0x2fc12fc1 -/* general zvol path */ -#define ZVOL_DIR "/dev" - #define ZVOL_MAJOR 230 #define ZVOL_MINOR_BITS 4 #define ZVOL_MINOR_MASK ((1U << ZVOL_MINOR_BITS) - 1) @@ -1290,7 +1287,7 @@ typedef enum zfs_ioc { ZFS_IOC_WAIT_FS, /* 0x5a54 */ /* - * Per-platform (Optional) - 6/128 numbers reserved. + * Per-platform (Optional) - 8/128 numbers reserved. */ ZFS_IOC_PLATFORM = ZFS_IOC_FIRST + 0x80, ZFS_IOC_EVENTS_NEXT, /* 0x81 (Linux) */ @@ -1299,6 +1296,8 @@ typedef enum zfs_ioc { ZFS_IOC_NEXTBOOT, /* 0x84 (FreeBSD) */ ZFS_IOC_JAIL, /* 0x85 (FreeBSD) */ ZFS_IOC_UNJAIL, /* 0x86 (FreeBSD) */ + ZFS_IOC_SET_BOOTENV, /* 0x87 (Linux) */ + ZFS_IOC_GET_BOOTENV, /* 0x88 (Linux) */ ZFS_IOC_LAST } zfs_ioc_t; diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 56a869fec62a..c4ef479b5faf 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. */ @@ -179,6 +179,8 @@ extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **); extern void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv); extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags); +extern int vdev_label_read_bootenv(vdev_t *, nvlist_t *); +extern int vdev_label_write_bootenv(vdev_t *, char *); typedef enum { VDEV_LABEL_CREATE, /* create/add a new device */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index b55871a5d242..96546ac35078 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ @@ -414,7 +414,7 @@ struct vdev { #define VDEV_RAIDZ_MAXPARITY 3 #define VDEV_PAD_SIZE (8 << 10) -/* 2 padding areas (vl_pad1 and vl_pad2) to skip */ +/* 2 padding areas (vl_pad1 and vl_be) to skip */ #define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 #define VDEV_PHYS_SIZE (112 << 10) #define VDEV_UBERBLOCK_RING (128 << 10) @@ -441,12 +441,32 @@ typedef struct vdev_phys { zio_eck_t vp_zbt; } vdev_phys_t; +typedef enum vbe_vers { + /* The bootenv file is stored as ascii text in the envblock */ + VB_RAW = 0, + + /* + * The bootenv file is converted to an nvlist and then packed into the + * envblock. 
+ */ + VB_NVLIST = 1 +} vbe_vers_t; + +typedef struct vdev_boot_envblock { + uint64_t vbe_version; + char vbe_bootenv[VDEV_PAD_SIZE - sizeof (uint64_t) - + sizeof (zio_eck_t)]; + zio_eck_t vbe_zbt; +} vdev_boot_envblock_t; + +CTASSERT_GLOBAL(sizeof (vdev_boot_envblock_t) == VDEV_PAD_SIZE); + typedef struct vdev_label { char vl_pad1[VDEV_PAD_SIZE]; /* 8K */ - char vl_pad2[VDEV_PAD_SIZE]; /* 8K */ + vdev_boot_envblock_t vl_be; /* 8K */ vdev_phys_t vl_vdev_phys; /* 112K */ char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ -} vdev_label_t; /* 256K total */ +} vdev_label_t; /* 256K total */ /* * vdev_dirty() flags diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 06c85f145f82..2b21787eef42 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -22,7 +22,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2018 Datto Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. @@ -429,7 +429,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, * Assuming bootfs is a valid dataset name. */ static boolean_t -bootfs_name_valid(const char *pool, char *bootfs) +bootfs_name_valid(const char *pool, const char *bootfs) { int len = strlen(pool); if (bootfs[0] == '\0') @@ -3452,7 +3452,13 @@ zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot, lastlog = 0; verify(nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type) == 0); - if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { + + if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) { + vdev = child[c]; + if (nvlist_dup(vdev, &varray[vcount++], 0) != 0) + goto out; + continue; + } else if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Source pool must be composed only of mirrors\n")); retval = zfs_error(hdl, EZFS_INVALCONFIG, msg); @@ -4453,3 +4459,39 @@ zpool_wait_status(zpool_handle_t *zhp, zpool_wait_activity_t activity, return (error); } + +int +zpool_set_bootenv(zpool_handle_t *zhp, const char *envmap) +{ + int error = lzc_set_bootenv(zhp->zpool_name, envmap); + if (error != 0) { + (void) zpool_standard_error_fmt(zhp->zpool_hdl, error, + dgettext(TEXT_DOMAIN, + "error setting bootenv in pool '%s'"), zhp->zpool_name); + } + + return (error); +} + +int +zpool_get_bootenv(zpool_handle_t *zhp, char *outbuf, size_t size, off_t offset) +{ + nvlist_t *nvl = NULL; + int error = lzc_get_bootenv(zhp->zpool_name, &nvl); + if (error != 0) { + (void) zpool_standard_error_fmt(zhp->zpool_hdl, error, + dgettext(TEXT_DOMAIN, + "error getting bootenv in pool '%s'"), zhp->zpool_name); + return (-1); + } + char *envmap = fnvlist_lookup_string(nvl, "envmap"); + if (offset >= strlen(envmap)) { + fnvlist_free(nvl); + return (0); + } + + strlcpy(outbuf, envmap + offset, size); + int bytes = MIN(strlen(envmap + offset), size); + fnvlist_free(nvl); + return (bytes); +} diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c index 4e83b624b261..22996fc9be5f 100644 --- a/lib/libzfs_core/libzfs_core.c +++ b/lib/libzfs_core/libzfs_core.c @@ -1619,3 +1619,25 @@ lzc_wait_fs(const char *fs, zfs_wait_activity_t activity, boolean_t *waited) return (error); } + +/* + * Set the bootenv contents for the given pool. 
+ */ +int +lzc_set_bootenv(const char *pool, const char *env) +{ + nvlist_t *args = fnvlist_alloc(); + fnvlist_add_string(args, "envmap", env); + int error = lzc_ioctl(ZFS_IOC_SET_BOOTENV, pool, args, NULL); + fnvlist_free(args); + return (error); +} + +/* + * Get the contents of the bootenv of the given pool. + */ +int +lzc_get_bootenv(const char *pool, nvlist_t **outnvl) +{ + return (lzc_ioctl(ZFS_IOC_GET_BOOTENV, pool, NULL, outnvl)); +} diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index a9396105bc6b..0e6a1058ec2f 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -39,6 +39,7 @@ KERNEL_C = \ zpool_prop.c \ zprop_common.c \ abd.c \ + abd_os.c \ aggsum.c \ arc.c \ arc_os.c \ diff --git a/man/man1/Makefile.am b/man/man1/Makefile.am index 2af917fa5c2e..de54328617eb 100644 --- a/man/man1/Makefile.am +++ b/man/man1/Makefile.am @@ -1,4 +1,4 @@ -dist_man_MANS = zhack.1 ztest.1 raidz_test.1 zvol_wait.1 +dist_man_MANS = zhack.1 ztest.1 raidz_test.1 zvol_wait.1 arcstat.1 EXTRA_DIST = cstyle.1 install-data-local: diff --git a/man/man1/arcstat.1 b/man/man1/arcstat.1 new file mode 100644 index 000000000000..6dcc39b67f2f --- /dev/null +++ b/man/man1/arcstat.1 @@ -0,0 +1,482 @@ +.\" +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.\" +.\" +.\" Copyright 2014 Adam Stevko. All rights reserved. +.\" Copyright (c) 2015 by Delphix. All rights reserved. +.\" Copyright (c) 2020 by AJ Jordan. All rights reserved. +.\" +.TH ARCSTAT 1 "May 7, 2020" +.SH NAME +arcstat \- report ZFS ARC and L2ARC statistics +.SH SYNOPSIS +.LP +.nf +\fBarcstat\fR [\fB-hvx\fR] [\fB-f field[,field]...\fR] [\fB-o file\fR] [\fB-s string\fR] [\fBinterval\fR [\fBcount\fR]] +.fi + +.SH DESCRIPTION +.LP +The \fBarcstat\fR utility prints various ZFS ARC and L2ARC statistics in +vmstat-like fashion.
+.sp + +.sp +.LP +The \fBarcstat\fR command reports the following information: +.sp +.ne 2 + +.\" +.sp +.ne 1 +.na +\fBc \fR +.ad +.RS 14n +ARC target size +.RE + +.sp +.ne 2 +.na +\fBdh% \fR +.ad +.RS 14n +Demand data hit percentage +.RE + +.sp +.ne 2 +.na +\fBdm% \fR +.ad +.RS 14n +Demand data miss percentage +.RE + +.sp +.ne 2 +.na +\fBmfu \fR +.ad +.RS 14n +MFU list hits per second +.RE + +.sp +.ne 2 +.na +\fBmh% \fR +.ad +.RS 14n +Metadata hit percentage +.RE + +.sp +.ne 2 +.na +\fBmm% \fR +.ad +.RS 14n +Metadata miss percentage +.RE + +.sp +.ne 2 +.na +\fBmru \fR +.ad +.RS 14n +MRU list hits per second +.RE + +.sp +.ne 2 +.na +\fBph% \fR +.ad +.RS 14n +Prefetch hits percentage +.RE + +.sp +.ne 2 +.na +\fBpm% \fR +.ad +.RS 14n +Prefetch miss percentage +.RE + +.sp +.ne 2 +.na +\fBdhit \fR +.ad +.RS 14n +Demand data hits per second +.RE + +.sp +.ne 2 +.na +\fBdmis \fR +.ad +.RS 14n +Demand data misses per second +.RE + +.sp +.ne 2 +.na +\fBhit% \fR +.ad +.RS 14n +ARC hit percentage +.RE + +.sp +.ne 2 +.na +\fBhits \fR +.ad +.RS 14n +ARC reads per second +.RE + +.sp +.ne 2 +.na +\fBmfug \fR +.ad +.RS 14n +MFU ghost list hits per second +.RE + +.sp +.ne 2 +.na +\fBmhit \fR +.ad +.RS 14n +Metadata hits per second +.RE + +.sp +.ne 2 +.na +\fBmiss \fR +.ad +.RS 14n +ARC misses per second +.RE + +.sp +.ne 2 +.na +\fBmmis \fR +.ad +.RS 14n +Metadata misses per second +.RE + +.sp +.ne 2 +.na +\fBmrug \fR +.ad +.RS 14n +MRU ghost list hits per second +.RE + +.sp +.ne 2 +.na +\fBphit \fR +.ad +.RS 14n +Prefetch hits per second +.RE + +.sp +.ne 2 +.na +\fBpmis \fR +.ad +.RS 14n +Prefetch misses per second +.RE + +.sp +.ne 2 +.na +\fBread \fR +.ad +.RS 14n +Total ARC accesses per second +.RE + +.sp +.ne 2 +.na +\fBtime \fR +.ad +.RS 14n +Time +.RE + +.sp +.ne 2 +.na +\fBarcsz \fR +.ad +.RS 14n +ARC size +.RE + +.sp +.ne 2 +.na +\fBdread \fR +.ad +.RS 14n +Demand data accesses per second +.RE + +.sp +.ne 2 +.na +\fBeskip \fR +.ad +.RS 14n +evict_skip per second +.RE + +.sp +.ne 2 +.na +\fBmiss% \fR +.ad +.RS 14n +ARC miss percentage +.RE + +.sp +.ne 2 +.na +\fBmread \fR +.ad +.RS 14n +Metadata accesses per second +.RE + +.sp +.ne 2 +.na +\fBpread \fR +.ad +.RS 14n +Prefetch accesses per second +.RE + +.sp +.ne 2 +.na +\fBl2hit% \fR +.ad +.RS 14n +L2ARC access hit percentage +.RE + +.sp +.ne 2 +.na +\fBl2hits \fR +.ad +.RS 14n +L2ARC hits per second +.RE + +.sp +.ne 2 +.na +\fBl2miss \fR +.ad +.RS 14n +L2ARC misses per second +.RE + +.sp +.ne 2 +.na +\fBl2read \fR +.ad +.RS 14n +Total L2ARC accesses per second +.RE + +.sp +.ne 2 +.na +\fBl2size \fR +.ad +.RS 14n +Size of the L2ARC +.RE + +.sp +.ne 2 +.na +\fBmtxmis \fR +.ad +.RS 14n +mutex_miss per second +.RE + +.sp +.ne 2 +.na +\fBl2bytes \fR +.ad +.RS 14n +Bytes read per second from the L2ARC +.RE + +.sp +.ne 2 +.na +\fBl2miss% \fR +.ad +.RS 14n +L2ARC access miss percentage +.RE + +.sp +.ne 2 +.na +\fBl2asize \fR +.ad +.RS 14n +Actual (compressed) size of the L2ARC +.RE + +.sp +.ne 2 +.na +\fBgrow \fR +.ad +.RS 14n +ARC grow disabled +.RE + +.sp +.ne 2 +.na +\fBneed \fR +.ad +.RS 14n +ARC reclaim needed +.RE + +.sp +.ne 2 +.na +\fBfree \fR +.ad +.RS 14n +ARC free memory +.RE +.\" + +.SH OPTIONS +.LP +The following options are supported: + +.sp +.ne 2 +.na +\fB\fB-f\fR\fR +.ad +.RS 12n +Display only specific fields. See \fBDESCRIPTION\fR for supported statistics. +.RE + +.sp +.ne 2 +.na +\fB\fB-h\fR\fR +.ad +.RS 12n +Display help message. 
+.RE + +.sp +.ne 2 +.na +\fB\fB-o\fR\fR +.ad +.RS 12n +Report statistics to a file instead of the standard output. +.RE + +.sp +.ne 2 +.na +\fB\fB-s\fR\fR +.ad +.RS 12n +Display data with a specified separator (default: 2 spaces). +.RE + +.sp +.ne 2 +.na +\fB\fB-x\fR\fR +.ad +.RS 12n +Print extended stats (same as -f time,mfu,mru,mfug,mrug,eskip,mtxmis,dread,pread,read). +.RE + +.sp +.ne 2 +.na +\fB\fB-v\fR\fR +.ad +.RS 12n +Show field headers and definitions +.RE + +.SH OPERANDS +.LP +The following operands are supported: +.sp +.ne 2 +.na +\fB\fIcount\fR\fR +.ad +.RS 12n +Display only \fIcount\fR reports. +.RE + +.sp +.ne 2 +.na +\fB\fIinterval\fR\fR +.ad +.RS 12n +Specify the sampling interval in seconds. +.RE + +.SH AUTHORS +.LP +arcstat was originally written in Perl by Neelakanth Nadgir and supported only ZFS ARC statistics. +Mike Harsch updated it to support L2ARC statistics. +John Hixson ported it to Python for FreeNAS over some beer, after which many individuals from the OpenZFS community continued to maintain and improve it. diff --git a/man/man8/zdb.8 b/man/man8/zdb.8 index 3915be3f8ef9..01ad95b3e900 100644 --- a/man/man8/zdb.8 +++ b/man/man8/zdb.8 @@ -216,7 +216,10 @@ Read the vdev labels and L2ARC header from the specified device. .Nm Fl l will return 0 if valid label was found, 1 if error occurred, and 2 if no valid labels were found. The presence of L2ARC header is indicated by a specific -sequence (L2ARC_DEV_HDR_MAGIC). Each unique configuration is displayed only +sequence (L2ARC_DEV_HDR_MAGIC). If there is an accounting error in the size +or the number of L2ARC log blocks +.Nm Fl l +will return 1. Each unique configuration is displayed only once. .It Fl ll Ar device In addition display label space usage stats. If a valid L2ARC header was found diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 6d76796f51ed..92b5c1906c1c 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -127,7 +127,7 @@ SRCS+= spl_atomic.c .endif #os/freebsd/zfs -SRCS+= abd.c \ +SRCS+= abd_os.c \ crypto_os.c \ dmu_os.c \ hkdf.c \ @@ -169,7 +169,8 @@ SRCS+= zfeature_common.c \ zprop_common.c #zfs -SRCS+= aggsum.c \ +SRCS+= abd.c \ + aggsum.c \ arc.c \ arc_os.c \ blkptr.c \ diff --git a/module/os/freebsd/spl/spl_taskq.c b/module/os/freebsd/spl/spl_taskq.c index b6a501f6773f..28cebc5dcbde 100644 --- a/module/os/freebsd/spl/spl_taskq.c +++ b/module/os/freebsd/spl/spl_taskq.c @@ -97,6 +97,7 @@ static void system_taskq_fini(void *arg) { + taskq_destroy(system_delay_taskq); taskq_destroy(system_taskq); uma_zdestroy(taskq_zone); tsd_destroy(&taskq_tsd); diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c new file mode 100644 index 000000000000..6b967bc070cb --- /dev/null +++ b/module/os/freebsd/zfs/abd_os.c @@ -0,0 +1,421 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +/* + * See abd.c for a general overview of the arc buffered data (ABD). 
+ *
+ * Using a large proportion of scattered ABDs decreases ARC fragmentation since
+ * when we are at the limit of allocatable space, using equal-size chunks will
+ * allow us to quickly reclaim enough space for a new large allocation (assuming
+ * it is also scattered).
+ *
+ * ABDs are allocated scattered by default unless the caller uses
+ * abd_alloc_linear() or zfs_abd_scatter_enabled is disabled.
+ */
+
+#include <sys/abd_impl.h>
+#include <sys/param.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_znode.h>
+
+typedef struct abd_stats {
+	kstat_named_t abdstat_struct_size;
+	kstat_named_t abdstat_scatter_cnt;
+	kstat_named_t abdstat_scatter_data_size;
+	kstat_named_t abdstat_scatter_chunk_waste;
+	kstat_named_t abdstat_linear_cnt;
+	kstat_named_t abdstat_linear_data_size;
+} abd_stats_t;
+
+static abd_stats_t abd_stats = {
+	/* Amount of memory occupied by all of the abd_t struct allocations */
+	{ "struct_size",			KSTAT_DATA_UINT64 },
+	/*
+	 * The number of scatter ABDs which are currently allocated, excluding
+	 * ABDs which don't own their data (for instance the ones which were
+	 * allocated through abd_get_offset()).
+	 */
+	{ "scatter_cnt",			KSTAT_DATA_UINT64 },
+	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
+	{ "scatter_data_size",			KSTAT_DATA_UINT64 },
+	/*
+	 * The amount of space wasted at the end of the last chunk across all
+	 * scatter ABDs tracked by scatter_cnt.
+	 */
+	{ "scatter_chunk_waste",		KSTAT_DATA_UINT64 },
+	/*
+	 * The number of linear ABDs which are currently allocated, excluding
+	 * ABDs which don't own their data (for instance the ones which were
+	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
+	 * ABD takes ownership of its buf then it will become tracked.
+	 */
+	{ "linear_cnt",				KSTAT_DATA_UINT64 },
+	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
+	{ "linear_data_size",			KSTAT_DATA_UINT64 },
+};
+
+/*
+ * The size of the chunks ABD allocates. Because the sizes allocated from the
+ * kmem_cache can't change, this tunable can only be modified at boot. Changing
+ * it at runtime would cause ABD iteration to work incorrectly for ABDs which
+ * were allocated with the old size, so a safeguard has been put in place which
+ * will cause the machine to panic if you change it and try to access the data
+ * within a scattered ABD.
+ */
+size_t zfs_abd_chunk_size = 4096;
+
+#if defined(_KERNEL)
+SYSCTL_DECL(_vfs_zfs);
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN,
+	&zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers");
+SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN,
+	&zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates");
+#endif
+
+kmem_cache_t *abd_chunk_cache;
+static kstat_t *abd_ksp;
+
+static void
+abd_free_chunk(void *c)
+{
+	kmem_cache_free(abd_chunk_cache, c);
+}
+
+static size_t
+abd_chunkcnt_for_bytes(size_t size)
+{
+	return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size);
+}
+
+static inline size_t
+abd_scatter_chunkcnt(abd_t *abd)
+{
+	ASSERT(!abd_is_linear(abd));
+	return (abd_chunkcnt_for_bytes(
+	    ABD_SCATTER(abd).abd_offset + abd->abd_size));
+}
+
+boolean_t
+abd_size_alloc_linear(size_t size)
+{
+	return (size <= zfs_abd_chunk_size ? B_TRUE : B_FALSE);
+}
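To make the chunk arithmetic above concrete: abd_chunkcnt_for_bytes() rounds the request up to a whole number of zfs_abd_chunk_size chunks, and the difference is exactly what abd_update_scatter_stats() below books into scatter_chunk_waste. A minimal standalone sketch of that math, illustrative only and not part of the patch (P2ROUNDUP is reproduced from the kernel's sysmacros and assumes a power-of-two alignment):

#include <stdio.h>

/* Same definition as the kernel's P2ROUNDUP(); power-of-2 align only. */
#define	P2ROUNDUP(x, align)	((((x) - 1) | ((align) - 1)) + 1)

int
main(void)
{
	unsigned long chunk = 4096;	/* default zfs_abd_chunk_size */
	unsigned long sizes[] = { 512, 4096, 10240, 131072 };

	for (unsigned long i = 0; i < 4; i++) {
		unsigned long n = P2ROUNDUP(sizes[i], chunk) / chunk;
		printf("%6lu bytes -> %2lu chunk(s), %4lu bytes of waste\n",
		    sizes[i], n, n * chunk - sizes[i]);
	}
	return (0);
}

A 10240-byte scatter ABD, for example, occupies three 4 KB chunks and contributes 2048 bytes to scatter_chunk_waste.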
+
+void
+abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
+{
+	size_t n = abd_scatter_chunkcnt(abd);
+	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
+	if (op == ABDSTAT_INCR) {
+		ABDSTAT_BUMP(abdstat_scatter_cnt);
+		ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
+		ABDSTAT_INCR(abdstat_scatter_chunk_waste,
+		    n * zfs_abd_chunk_size - abd->abd_size);
+	} else {
+		ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
+		ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
+		ABDSTAT_INCR(abdstat_scatter_chunk_waste,
+		    abd->abd_size - n * zfs_abd_chunk_size);
+	}
+}
+
+void
+abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
+{
+	ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
+	if (op == ABDSTAT_INCR) {
+		ABDSTAT_BUMP(abdstat_linear_cnt);
+		ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
+	} else {
+		ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
+		ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
+	}
+}
+
+void
+abd_verify_scatter(abd_t *abd)
+{
+	/*
+	 * There are no scatter linear pages in FreeBSD, so it is an
+	 * error if the ABD has been marked as a linear page.
+	 */
+	VERIFY(!abd_is_linear_page(abd));
+	ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
+	    zfs_abd_chunk_size);
+	size_t n = abd_scatter_chunkcnt(abd);
+	for (int i = 0; i < n; i++) {
+		ASSERT3P(
+		    ABD_SCATTER(abd).abd_chunks[i], !=, NULL);
+	}
+}
+
+void
+abd_alloc_chunks(abd_t *abd, size_t size)
+{
+	size_t n = abd_chunkcnt_for_bytes(size);
+	for (int i = 0; i < n; i++) {
+		void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
+		ASSERT3P(c, !=, NULL);
+		ABD_SCATTER(abd).abd_chunks[i] = c;
+	}
+	ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size;
+}
+
+void
+abd_free_chunks(abd_t *abd)
+{
+	size_t n = abd_scatter_chunkcnt(abd);
+	for (int i = 0; i < n; i++) {
+		abd_free_chunk(ABD_SCATTER(abd).abd_chunks[i]);
+	}
+}
+
+abd_t *
+abd_alloc_struct(size_t size)
+{
+	size_t chunkcnt = abd_chunkcnt_for_bytes(size);
+	size_t abd_size = offsetof(abd_t,
+	    abd_u.abd_scatter.abd_chunks[chunkcnt]);
+	abd_t *abd = kmem_alloc(abd_size, KM_PUSHPAGE);
+	ASSERT3P(abd, !=, NULL);
+	ABDSTAT_INCR(abdstat_struct_size, abd_size);
+
+	return (abd);
+}
+
+void
+abd_free_struct(abd_t *abd)
+{
+	size_t chunkcnt = abd_is_linear(abd) ? 0 : abd_scatter_chunkcnt(abd);
+	int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
+	kmem_free(abd, size);
+	ABDSTAT_INCR(abdstat_struct_size, -size);
+}
+
+void
+abd_init(void)
+{
+	abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0,
+	    NULL, NULL, NULL, NULL, 0, KMC_NOTOUCH | KMC_NODEBUG);
+
+	abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
+	    sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+	if (abd_ksp != NULL) {
+		abd_ksp->ks_data = &abd_stats;
+		kstat_install(abd_ksp);
+	}
+}
+
+void
+abd_fini(void)
+{
+	if (abd_ksp != NULL) {
+		kstat_delete(abd_ksp);
+		abd_ksp = NULL;
+	}
+
+	kmem_cache_destroy(abd_chunk_cache);
+	abd_chunk_cache = NULL;
+}
+
+void
+abd_free_linear_page(abd_t *abd)
+{
+	/*
+	 * FreeBSD does not have scatter linear pages,
+	 * so this is an error.
+	 */
+	VERIFY(0);
+}
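A note on the sizing idiom used by abd_alloc_struct() and abd_free_struct() above: the chunk-pointer array is a trailing variable-length member, so the struct is allocated at exactly offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]) bytes, i.e. the fixed header plus chunkcnt pointers. A self-contained sketch of the same pattern (struct chunked and its fields are hypothetical, for illustration only):

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

/* Hypothetical stand-in for abd_t with its trailing chunk array. */
struct chunked {
	size_t	c_nchunks;
	void	*c_chunks[];	/* flexible array member */
};

int
main(void)
{
	size_t n = 3;
	/* Header plus exactly n pointers, as the offsetof() form computes. */
	size_t sz = offsetof(struct chunked, c_chunks) + n * sizeof (void *);
	struct chunked *c = malloc(sz);

	if (c == NULL)
		return (1);
	c->c_nchunks = n;
	printf("%zu bytes for a %zu-chunk struct\n", sz, n);
	free(c);
	return (0);
}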
+
+/*
+ * If we're going to use this ABD for doing I/O using the block layer, the
+ * consumer of the ABD data doesn't care if it's scattered or not, and we don't
+ * plan to store this ABD in memory for a long period of time, we should
+ * allocate the ABD type that requires the least data copying to do the I/O.
+ *
+ * Currently this is linear ABDs; however, if ldi_strategy() can ever issue I/Os
+ * using a scatter/gather list, we should switch to that and replace this call
+ * with vanilla abd_alloc().
+ */
+abd_t *
+abd_alloc_for_io(size_t size, boolean_t is_metadata)
+{
+	return (abd_alloc_linear(size, is_metadata));
+}
+
+/*
+ * This is just a helper function for abd_get_offset_scatter() to allocate a
+ * scatter ABD using the chunk count calculated from the offset within the
+ * parent ABD.
+ */
+static abd_t *
+abd_alloc_scatter_offset_chunkcnt(size_t chunkcnt)
+{
+	size_t abd_size = offsetof(abd_t,
+	    abd_u.abd_scatter.abd_chunks[chunkcnt]);
+	abd_t *abd = kmem_alloc(abd_size, KM_PUSHPAGE);
+	ASSERT3P(abd, !=, NULL);
+	ABDSTAT_INCR(abdstat_struct_size, abd_size);
+
+	return (abd);
+}
+
+
+abd_t *
+abd_get_offset_scatter(abd_t *sabd, size_t off)
+{
+	abd_t *abd = NULL;
+
+	abd_verify(sabd);
+	ASSERT3U(off, <=, sabd->abd_size);
+
+	size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
+	size_t chunkcnt = abd_scatter_chunkcnt(sabd) -
+	    (new_offset / zfs_abd_chunk_size);
+
+	abd = abd_alloc_scatter_offset_chunkcnt(chunkcnt);
+
+	/*
+	 * Even if this buf is filesystem metadata, we only track that
+	 * if we own the underlying data buffer, which is not true in
+	 * this case. Therefore, we don't ever use ABD_FLAG_META here.
+	 */
+	abd->abd_flags = 0;
+
+	ABD_SCATTER(abd).abd_offset = new_offset % zfs_abd_chunk_size;
+	ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size;
+
+	/* Copy the scatterlist starting at the correct offset */
+	(void) memcpy(&ABD_SCATTER(abd).abd_chunks,
+	    &ABD_SCATTER(sabd).abd_chunks[new_offset /
+	    zfs_abd_chunk_size],
+	    chunkcnt * sizeof (void *));
+
+	return (abd);
+}
+
+static inline size_t
+abd_iter_scatter_chunk_offset(struct abd_iter *aiter)
+{
+	ASSERT(!abd_is_linear(aiter->iter_abd));
+	return ((ABD_SCATTER(aiter->iter_abd).abd_offset +
+	    aiter->iter_pos) % zfs_abd_chunk_size);
+}
+
+static inline size_t
+abd_iter_scatter_chunk_index(struct abd_iter *aiter)
+{
+	ASSERT(!abd_is_linear(aiter->iter_abd));
+	return ((ABD_SCATTER(aiter->iter_abd).abd_offset +
+	    aiter->iter_pos) / zfs_abd_chunk_size);
+}
+
+/*
+ * Initialize the abd_iter.
+ */
+void
+abd_iter_init(struct abd_iter *aiter, abd_t *abd)
+{
+	abd_verify(abd);
+	aiter->iter_abd = abd;
+	aiter->iter_pos = 0;
+	aiter->iter_mapaddr = NULL;
+	aiter->iter_mapsize = 0;
+}
+
+/*
+ * This is just a helper function to see if we have exhausted the
+ * abd_iter and reached the end.
+ */
+boolean_t
+abd_iter_at_end(struct abd_iter *aiter)
+{
+	return (aiter->iter_pos == aiter->iter_abd->abd_size);
+}
+
+/*
+ * Advance the iterator by a certain amount. This cannot be called while a
+ * chunk is mapped. It can safely be called once the aiter has already been
+ * exhausted, in which case it does nothing.
+ */
+void
+abd_iter_advance(struct abd_iter *aiter, size_t amount)
+{
+	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+	ASSERT0(aiter->iter_mapsize);
+
+	/* There's nothing left to advance to, so do nothing */
+	if (abd_iter_at_end(aiter))
+		return;
+
+	aiter->iter_pos += amount;
+}
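Taken together, abd_iter_init(), abd_iter_advance(), and the map/unmap pair below support the classic chunked-iteration loop. The following sketch is hypothetical and only illustrates the contract; it is essentially the loop the common abd_iterate_func() in abd.c builds from these primitives, and it assumes the struct abd_iter declaration from sys/abd_impl.h:

/* Illustrative only: visit [off, off + size) of an ABD chunk by chunk. */
static int
abd_visit(abd_t *abd, size_t off, size_t size,
    int (*func)(void *buf, size_t len, void *priv), void *priv)
{
	struct abd_iter aiter;
	int ret = 0;

	abd_iter_init(&aiter, abd);
	abd_iter_advance(&aiter, off);	/* skip to the requested offset */

	while (size > 0) {
		abd_iter_map(&aiter);	/* expose one contiguous range */

		size_t len = MIN(aiter.iter_mapsize, size);
		ret = func(aiter.iter_mapaddr, len, priv);

		abd_iter_unmap(&aiter);	/* must unmap before advancing */
		if (ret != 0)
			break;

		size -= len;
		abd_iter_advance(&aiter, len);
	}
	return (ret);
}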
+
+/*
+ * Map the current chunk into aiter. This can safely be called when the aiter
+ * has already been exhausted, in which case it does nothing.
+ */
+void
+abd_iter_map(struct abd_iter *aiter)
+{
+	void *paddr;
+	size_t offset = 0;
+
+	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+	ASSERT0(aiter->iter_mapsize);
+
+	/* Panic if someone has changed zfs_abd_chunk_size */
+	IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size ==
+	    ABD_SCATTER(aiter->iter_abd).abd_chunk_size);
+
+	/* There's nothing left to iterate over, so do nothing */
+	if (abd_iter_at_end(aiter))
+		return;
+
+	if (abd_is_linear(aiter->iter_abd)) {
+		offset = aiter->iter_pos;
+		aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
+		paddr = ABD_LINEAR_BUF(aiter->iter_abd);
+	} else {
+		size_t index = abd_iter_scatter_chunk_index(aiter);
+		offset = abd_iter_scatter_chunk_offset(aiter);
+		aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset,
+		    aiter->iter_abd->abd_size - aiter->iter_pos);
+		paddr = ABD_SCATTER(aiter->iter_abd).abd_chunks[index];
+	}
+	aiter->iter_mapaddr = (char *)paddr + offset;
+}
+
+/*
+ * Unmap the current chunk from aiter. This can safely be called when the aiter
+ * has already been exhausted, in which case it does nothing.
+ */
+void
+abd_iter_unmap(struct abd_iter *aiter)
+{
+	/* There's nothing left to unmap, so do nothing */
+	if (abd_iter_at_end(aiter))
+		return;
+
+	ASSERT3P(aiter->iter_mapaddr, !=, NULL);
+	ASSERT3U(aiter->iter_mapsize, >, 0);
+
+	aiter->iter_mapaddr = NULL;
+	aiter->iter_mapsize = 0;
+}
diff --git a/module/os/freebsd/zfs/vdev_label_os.c b/module/os/freebsd/zfs/vdev_label_os.c
index e734a2af8370..97cb201934dc 100644
--- a/module/os/freebsd/zfs/vdev_label_os.c
+++ b/module/os/freebsd/zfs/vdev_label_os.c
@@ -61,7 +61,7 @@ vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size)
 retry:
 	zio = zio_root(spa, NULL, NULL, flags);
 	vdev_label_write(zio, vd, 0, pad2,
-	    offsetof(vdev_label_t, vl_pad2),
+	    offsetof(vdev_label_t, vl_be),
 	    VDEV_PAD_SIZE, NULL, NULL, flags);
 	error = zio_wait(zio);
 	if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
diff --git a/module/os/freebsd/zfs/zfs_vnops.c b/module/os/freebsd/zfs/zfs_vnops.c
index 711e0b1f75a8..817c3bb9b46f 100644
--- a/module/os/freebsd/zfs/zfs_vnops.c
+++ b/module/os/freebsd/zfs/zfs_vnops.c
@@ -743,6 +743,12 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr)
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
+	/* We don't copy out anything useful for directories.
*/ + if (vp->v_type == VDIR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EISDIR)); + } + if (zp->z_pflags & ZFS_AV_QUARANTINED) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); @@ -5941,7 +5947,7 @@ zfs_getextattr(struct vop_getextattr_args *ap) flags = FREAD; NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td); - error = vn_open_cred(&nd, &flags, VN_OPEN_INVFS, 0, ap->a_cred, NULL); + error = vn_open_cred(&nd, &flags, 0, VN_OPEN_INVFS, ap->a_cred, NULL); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index bef97a9b34ab..caef0b55b371 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -98,6 +98,7 @@ #include "zfs_namecheck.h" +#define ZVOL_DIR "/dev/zvol/" #define ZVOL_DUMPSIZE "dumpsize" #ifdef ZVOL_LOCK_DEBUG diff --git a/module/os/linux/zfs/Makefile.in b/module/os/linux/zfs/Makefile.in index 8c11a1ee6f58..cb4edbbc1a33 100644 --- a/module/os/linux/zfs/Makefile.in +++ b/module/os/linux/zfs/Makefile.in @@ -7,7 +7,7 @@ ccflags-$(CONFIG_SPARC64) += -Wno-unused-value ccflags-y += -I@abs_top_srcdir@/module/os/linux/zfs -$(MODULE)-objs += ../os/linux/zfs/abd.o +$(MODULE)-objs += ../os/linux/zfs/abd_os.o $(MODULE)-objs += ../os/linux/zfs/arc_os.o $(MODULE)-objs += ../os/linux/zfs/mmp_os.o $(MODULE)-objs += ../os/linux/zfs/policy.o diff --git a/module/os/linux/zfs/abd.c b/module/os/linux/zfs/abd.c deleted file mode 100644 index bc6f81000d48..000000000000 --- a/module/os/linux/zfs/abd.c +++ /dev/null @@ -1,1616 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2014 by Chunwei Chen. All rights reserved. - * Copyright (c) 2019 by Delphix. All rights reserved. - */ - -/* - * ARC buffer data (ABD). - * - * ABDs are an abstract data structure for the ARC which can use two - * different ways of storing the underlying data: - * - * (a) Linear buffer. In this case, all the data in the ABD is stored in one - * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache). - * - * +-------------------+ - * | ABD (linear) | - * | abd_flags = ... | - * | abd_size = ... | +--------------------------------+ - * | abd_buf ------------->| raw buffer of size abd_size | - * +-------------------+ +--------------------------------+ - * no abd_chunks - * - * (b) Scattered buffer. In this case, the data in the ABD is split into - * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers - * to the chunks recorded in an array at the end of the ABD structure. - * - * +-------------------+ - * | ABD (scattered) | - * | abd_flags = ... | - * | abd_size = ... 
| - * | abd_offset = 0 | +-----------+ - * | abd_chunks[0] ----------------------------->| chunk 0 | - * | abd_chunks[1] ---------------------+ +-----------+ - * | ... | | +-----------+ - * | abd_chunks[N-1] ---------+ +------->| chunk 1 | - * +-------------------+ | +-----------+ - * | ... - * | +-----------+ - * +----------------->| chunk N-1 | - * +-----------+ - * - * Linear buffers act exactly like normal buffers and are always mapped into the - * kernel's virtual memory space, while scattered ABD data chunks are allocated - * as physical pages and then mapped in only while they are actually being - * accessed through one of the abd_* library functions. Using scattered ABDs - * provides several benefits: - * - * (1) They avoid use of kmem_*, preventing performance problems where running - * kmem_reap on very large memory systems never finishes and causes - * constant TLB shootdowns. - * - * (2) Fragmentation is less of an issue since when we are at the limit of - * allocatable space, we won't have to search around for a long free - * hole in the VA space for large ARC allocations. Each chunk is mapped in - * individually, so even if we are using HIGHMEM (see next point) we - * wouldn't need to worry about finding a contiguous address range. - * - * (3) If we are not using HIGHMEM, then all physical memory is always - * mapped into the kernel's address space, so we also avoid the map / - * unmap costs on each ABD access. - * - * If we are not using HIGHMEM, scattered buffers which have only one chunk - * can be treated as linear buffers, because they are contiguous in the - * kernel's virtual address space. See abd_alloc_pages() for details. - * - * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to - * B_FALSE. - * - * In addition to directly allocating a linear or scattered ABD, it is also - * possible to create an ABD by requesting the "sub-ABD" starting at an offset - * within an existing ABD. In linear buffers this is simple (set abd_buf of - * the new ABD to the starting point within the original raw buffer), but - * scattered ABDs are a little more complex. The new ABD makes a copy of the - * relevant abd_chunks pointers (but not the underlying data). However, to - * provide arbitrary rather than only chunk-aligned starting offsets, it also - * tracks an abd_offset field which represents the starting point of the data - * within the first chunk in abd_chunks. For both linear and scattered ABDs, - * creating an offset ABD marks the original ABD as the offset's parent, and the - * original ABD's abd_children refcount is incremented. This data allows us to - * ensure the root ABD isn't deleted before its children. - * - * Most consumers should never need to know what type of ABD they're using -- - * the ABD public API ensures that it's possible to transparently switch from - * using a linear ABD to a scattered one when doing so would be beneficial. - * - * If you need to use the data within an ABD directly, if you know it's linear - * (because you allocated it) you can use abd_to_buf() to access the underlying - * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions - * which will allocate a raw buffer if necessary. Use the abd_return_buf* - * functions to return any raw buffers that are no longer necessary when you're - * done using them. - * - * There are a variety of ABD APIs that implement basic buffer operations: - * compare, copy, read, write, and fill with zeroes. 
If you need a custom
- * function which progressively accesses the whole ABD, use the abd_iterate_*
- * functions.
- */
-
-#include <sys/abd.h>
-#include <sys/param.h>
-#include <sys/zio.h>
-#include <sys/zfs_context.h>
-#include <sys/zfs_znode.h>
-#ifdef _KERNEL
-#include <linux/scatterlist.h>
-#include <linux/kmap_compat.h>
-#else
-#define	MAX_ORDER	1
-#endif
-
-typedef struct abd_stats {
-	kstat_named_t abdstat_struct_size;
-	kstat_named_t abdstat_linear_cnt;
-	kstat_named_t abdstat_linear_data_size;
-	kstat_named_t abdstat_scatter_cnt;
-	kstat_named_t abdstat_scatter_data_size;
-	kstat_named_t abdstat_scatter_chunk_waste;
-	kstat_named_t abdstat_scatter_orders[MAX_ORDER];
-	kstat_named_t abdstat_scatter_page_multi_chunk;
-	kstat_named_t abdstat_scatter_page_multi_zone;
-	kstat_named_t abdstat_scatter_page_alloc_retry;
-	kstat_named_t abdstat_scatter_sg_table_retry;
-} abd_stats_t;
-
-static abd_stats_t abd_stats = {
-	/* Amount of memory occupied by all of the abd_t struct allocations */
-	{ "struct_size",			KSTAT_DATA_UINT64 },
-	/*
-	 * The number of linear ABDs which are currently allocated, excluding
-	 * ABDs which don't own their data (for instance the ones which were
-	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
-	 * ABD takes ownership of its buf then it will become tracked.
-	 */
-	{ "linear_cnt",				KSTAT_DATA_UINT64 },
-	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
-	{ "linear_data_size",			KSTAT_DATA_UINT64 },
-	/*
-	 * The number of scatter ABDs which are currently allocated, excluding
-	 * ABDs which don't own their data (for instance the ones which were
-	 * allocated through abd_get_offset()).
-	 */
-	{ "scatter_cnt",			KSTAT_DATA_UINT64 },
-	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
-	{ "scatter_data_size",			KSTAT_DATA_UINT64 },
-	/*
-	 * The amount of space wasted at the end of the last chunk across all
-	 * scatter ABDs tracked by scatter_cnt.
-	 */
-	{ "scatter_chunk_waste",		KSTAT_DATA_UINT64 },
-	/*
-	 * The number of compound allocations of a given order. These
-	 * allocations are spread over all currently allocated ABDs, and
-	 * act as a measure of memory fragmentation.
-	 */
-	{ { "scatter_order_N",			KSTAT_DATA_UINT64 } },
-	/*
-	 * The number of scatter ABDs which contain multiple chunks.
-	 * ABDs are preferentially allocated from the minimum number of
-	 * contiguous multi-page chunks, a single chunk is optimal.
-	 */
-	{ "scatter_page_multi_chunk",		KSTAT_DATA_UINT64 },
-	/*
-	 * The number of scatter ABDs which are split across memory zones.
-	 * ABDs are preferentially allocated using pages from a single zone.
-	 */
-	{ "scatter_page_multi_zone",		KSTAT_DATA_UINT64 },
-	/*
-	 * The total number of retries encountered when attempting to
-	 * allocate the pages to populate the scatter ABD.
-	 */
-	{ "scatter_page_alloc_retry",		KSTAT_DATA_UINT64 },
-	/*
-	 * The total number of retries encountered when attempting to
-	 * allocate the sg table for an ABD.
- */ - { "scatter_sg_table_retry", KSTAT_DATA_UINT64 }, -}; - -#define ABDSTAT(stat) (abd_stats.stat.value.ui64) -#define ABDSTAT_INCR(stat, val) \ - atomic_add_64(&abd_stats.stat.value.ui64, (val)) -#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) -#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) - -#define ABD_SCATTER(abd) (abd->abd_u.abd_scatter) -#define ABD_BUF(abd) (abd->abd_u.abd_linear.abd_buf) -#define abd_for_each_sg(abd, sg, n, i) \ - for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i) - -/* see block comment above for description */ -int zfs_abd_scatter_enabled = B_TRUE; -unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1; - -/* - * zfs_abd_scatter_min_size is the minimum allocation size to use scatter - * ABD's. Smaller allocations will use linear ABD's which uses - * zio_[data_]buf_alloc(). - * - * Scatter ABD's use at least one page each, so sub-page allocations waste - * some space when allocated as scatter (e.g. 2KB scatter allocation wastes - * half of each page). Using linear ABD's for small allocations means that - * they will be put on slabs which contain many allocations. This can - * improve memory efficiency, but it also makes it much harder for ARC - * evictions to actually free pages, because all the buffers on one slab need - * to be freed in order for the slab (and underlying pages) to be freed. - * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's - * possible for them to actually waste more memory than scatter (one page per - * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th). - * - * Spill blocks are typically 512B and are heavily used on systems running - * selinux with the default dnode size and the `xattr=sa` property set. - * - * By default we use linear allocations for 512B and 1KB, and scatter - * allocations for larger (1.5KB and up). - */ -int zfs_abd_scatter_min_size = 512 * 3; - -static kmem_cache_t *abd_cache = NULL; -static kstat_t *abd_ksp; - -static inline size_t -abd_chunkcnt_for_bytes(size_t size) -{ - return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE); -} - -#ifdef _KERNEL -/* - * Mark zfs data pages so they can be excluded from kernel crash dumps - */ -#ifdef _LP64 -#define ABD_FILE_CACHE_PAGE 0x2F5ABDF11ECAC4E - -static inline void -abd_mark_zfs_page(struct page *page) -{ - get_page(page); - SetPagePrivate(page); - set_page_private(page, ABD_FILE_CACHE_PAGE); -} - -static inline void -abd_unmark_zfs_page(struct page *page) -{ - set_page_private(page, 0UL); - ClearPagePrivate(page); - put_page(page); -} -#else -#define abd_mark_zfs_page(page) -#define abd_unmark_zfs_page(page) -#endif /* _LP64 */ - -#ifndef CONFIG_HIGHMEM - -#ifndef __GFP_RECLAIM -#define __GFP_RECLAIM __GFP_WAIT -#endif - -/* - * The goal is to minimize fragmentation by preferentially populating ABDs - * with higher order compound pages from a single zone. Allocation size is - * progressively decreased until it can be satisfied without performing - * reclaim or compaction. When necessary this function will degenerate to - * allocating individual pages and allowing reclaim to satisfy allocations. 
- */ -static void -abd_alloc_pages(abd_t *abd, size_t size) -{ - struct list_head pages; - struct sg_table table; - struct scatterlist *sg; - struct page *page, *tmp_page = NULL; - gfp_t gfp = __GFP_NOWARN | GFP_NOIO; - gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM; - int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1); - int nr_pages = abd_chunkcnt_for_bytes(size); - int chunks = 0, zones = 0; - size_t remaining_size; - int nid = NUMA_NO_NODE; - int alloc_pages = 0; - - INIT_LIST_HEAD(&pages); - - while (alloc_pages < nr_pages) { - unsigned chunk_pages; - int order; - - order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order); - chunk_pages = (1U << order); - - page = alloc_pages_node(nid, order ? gfp_comp : gfp, order); - if (page == NULL) { - if (order == 0) { - ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); - schedule_timeout_interruptible(1); - } else { - max_order = MAX(0, order - 1); - } - continue; - } - - list_add_tail(&page->lru, &pages); - - if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid)) - zones++; - - nid = page_to_nid(page); - ABDSTAT_BUMP(abdstat_scatter_orders[order]); - chunks++; - alloc_pages += chunk_pages; - } - - ASSERT3S(alloc_pages, ==, nr_pages); - - while (sg_alloc_table(&table, chunks, gfp)) { - ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); - schedule_timeout_interruptible(1); - } - - sg = table.sgl; - remaining_size = size; - list_for_each_entry_safe(page, tmp_page, &pages, lru) { - size_t sg_size = MIN(PAGESIZE << compound_order(page), - remaining_size); - sg_set_page(sg, page, sg_size, 0); - abd_mark_zfs_page(page); - remaining_size -= sg_size; - - sg = sg_next(sg); - list_del(&page->lru); - } - - /* - * These conditions ensure that a possible transformation to a linear - * ABD would be valid. - */ - ASSERT(!PageHighMem(sg_page(table.sgl))); - ASSERT0(ABD_SCATTER(abd).abd_offset); - - if (table.nents == 1) { - /* - * Since there is only one entry, this ABD can be represented - * as a linear buffer. All single-page (4K) ABD's can be - * represented this way. Some multi-page ABD's can also be - * represented this way, if we were able to allocate a single - * "chunk" (higher-order "page" which represents a power-of-2 - * series of physically-contiguous pages). This is often the - * case for 2-page (8K) ABD's. - * - * Representing a single-entry scatter ABD as a linear ABD - * has the performance advantage of avoiding the copy (and - * allocation) in abd_borrow_buf_copy / abd_return_buf_copy. - * A performance increase of around 5% has been observed for - * ARC-cached reads (of small blocks which can take advantage - * of this). - * - * Note that this optimization is only possible because the - * pages are always mapped into the kernel's address space. - * This is not the case for highmem pages, so the - * optimization can not be made there. - */ - abd->abd_flags |= ABD_FLAG_LINEAR; - abd->abd_flags |= ABD_FLAG_LINEAR_PAGE; - abd->abd_u.abd_linear.abd_sgl = table.sgl; - abd->abd_u.abd_linear.abd_buf = - page_address(sg_page(table.sgl)); - } else if (table.nents > 1) { - ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); - abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; - - if (zones) { - ABDSTAT_BUMP(abdstat_scatter_page_multi_zone); - abd->abd_flags |= ABD_FLAG_MULTI_ZONE; - } - - ABD_SCATTER(abd).abd_sgl = table.sgl; - ABD_SCATTER(abd).abd_nents = table.nents; - } -} -#else -/* - * Allocate N individual pages to construct a scatter ABD. 
This function - * makes no attempt to request contiguous pages and requires the minimal - * number of kernel interfaces. It's designed for maximum compatibility. - */ -static void -abd_alloc_pages(abd_t *abd, size_t size) -{ - struct scatterlist *sg = NULL; - struct sg_table table; - struct page *page; - gfp_t gfp = __GFP_NOWARN | GFP_NOIO; - int nr_pages = abd_chunkcnt_for_bytes(size); - int i = 0; - - while (sg_alloc_table(&table, nr_pages, gfp)) { - ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); - schedule_timeout_interruptible(1); - } - - ASSERT3U(table.nents, ==, nr_pages); - ABD_SCATTER(abd).abd_sgl = table.sgl; - ABD_SCATTER(abd).abd_nents = nr_pages; - - abd_for_each_sg(abd, sg, nr_pages, i) { - while ((page = __page_cache_alloc(gfp)) == NULL) { - ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); - schedule_timeout_interruptible(1); - } - - ABDSTAT_BUMP(abdstat_scatter_orders[0]); - sg_set_page(sg, page, PAGESIZE, 0); - abd_mark_zfs_page(page); - } - - if (nr_pages > 1) { - ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); - abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; - } -} -#endif /* !CONFIG_HIGHMEM */ - -static void -abd_free_pages(abd_t *abd) -{ - struct scatterlist *sg = NULL; - struct sg_table table; - struct page *page; - int nr_pages = ABD_SCATTER(abd).abd_nents; - int order, i = 0; - - if (abd->abd_flags & ABD_FLAG_MULTI_ZONE) - ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone); - - if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK) - ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); - - abd_for_each_sg(abd, sg, nr_pages, i) { - page = sg_page(sg); - abd_unmark_zfs_page(page); - order = compound_order(page); - __free_pages(page, order); - ASSERT3U(sg->length, <=, PAGE_SIZE << order); - ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]); - } - - table.sgl = ABD_SCATTER(abd).abd_sgl; - table.nents = table.orig_nents = nr_pages; - sg_free_table(&table); -} - -#else /* _KERNEL */ - -#ifndef PAGE_SHIFT -#define PAGE_SHIFT (highbit64(PAGESIZE)-1) -#endif - -struct page; - -#define zfs_kmap_atomic(chunk, km) ((void *)chunk) -#define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0) -#define local_irq_save(flags) do { (void)(flags); } while (0) -#define local_irq_restore(flags) do { (void)(flags); } while (0) -#define nth_page(pg, i) \ - ((struct page *)((void *)(pg) + (i) * PAGESIZE)) - -struct scatterlist { - struct page *page; - int length; - int end; -}; - -static void -sg_init_table(struct scatterlist *sg, int nr) -{ - memset(sg, 0, nr * sizeof (struct scatterlist)); - sg[nr - 1].end = 1; -} - -#define for_each_sg(sgl, sg, nr, i) \ - for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg)) - -static inline void -sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, - unsigned int offset) -{ - /* currently we don't use offset */ - ASSERT(offset == 0); - sg->page = page; - sg->length = len; -} - -static inline struct page * -sg_page(struct scatterlist *sg) -{ - return (sg->page); -} - -static inline struct scatterlist * -sg_next(struct scatterlist *sg) -{ - if (sg->end) - return (NULL); - - return (sg + 1); -} - -static void -abd_alloc_pages(abd_t *abd, size_t size) -{ - unsigned nr_pages = abd_chunkcnt_for_bytes(size); - struct scatterlist *sg; - int i; - - ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages * - sizeof (struct scatterlist), KM_SLEEP); - sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages); - - abd_for_each_sg(abd, sg, nr_pages, i) { - struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP); - sg_set_page(sg, p, PAGESIZE, 0); - } - 
ABD_SCATTER(abd).abd_nents = nr_pages; -} - -static void -abd_free_pages(abd_t *abd) -{ - int i, n = ABD_SCATTER(abd).abd_nents; - struct scatterlist *sg; - - abd_for_each_sg(abd, sg, n, i) { - for (int j = 0; j < sg->length; j += PAGESIZE) { - struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT); - umem_free(p, PAGESIZE); - } - } - - vmem_free(ABD_SCATTER(abd).abd_sgl, n * sizeof (struct scatterlist)); -} - -#endif /* _KERNEL */ - -void -abd_init(void) -{ - int i; - - abd_cache = kmem_cache_create("abd_t", sizeof (abd_t), - 0, NULL, NULL, NULL, NULL, NULL, 0); - - abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, - sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); - if (abd_ksp != NULL) { - for (i = 0; i < MAX_ORDER; i++) { - snprintf(abd_stats.abdstat_scatter_orders[i].name, - KSTAT_STRLEN, "scatter_order_%d", i); - abd_stats.abdstat_scatter_orders[i].data_type = - KSTAT_DATA_UINT64; - } - abd_ksp->ks_data = &abd_stats; - kstat_install(abd_ksp); - } -} - -void -abd_fini(void) -{ - if (abd_ksp != NULL) { - kstat_delete(abd_ksp); - abd_ksp = NULL; - } - - if (abd_cache) { - kmem_cache_destroy(abd_cache); - abd_cache = NULL; - } -} - -static inline void -abd_verify(abd_t *abd) -{ - ASSERT3U(abd->abd_size, >, 0); - ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); - ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | - ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | - ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE)); - IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); - IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); - if (abd_is_linear(abd)) { - ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL); - } else { - size_t n; - int i = 0; - struct scatterlist *sg = NULL; - - ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0); - ASSERT3U(ABD_SCATTER(abd).abd_offset, <, - ABD_SCATTER(abd).abd_sgl->length); - n = ABD_SCATTER(abd).abd_nents; - abd_for_each_sg(abd, sg, n, i) { - ASSERT3P(sg_page(sg), !=, NULL); - } - } -} - -static inline abd_t * -abd_alloc_struct(void) -{ - abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE); - - ASSERT3P(abd, !=, NULL); - ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t)); - - return (abd); -} - -static inline void -abd_free_struct(abd_t *abd) -{ - kmem_cache_free(abd_cache, abd); - ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t)); -} - -/* - * Allocate an ABD, along with its own underlying data buffers. Use this if you - * don't care whether the ABD is linear or not. 
- */ -abd_t * -abd_alloc(size_t size, boolean_t is_metadata) -{ - /* see the comment above zfs_abd_scatter_min_size */ - if (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size) - return (abd_alloc_linear(size, is_metadata)); - - VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - - abd_t *abd = abd_alloc_struct(); - abd->abd_flags = ABD_FLAG_OWNER; - abd->abd_u.abd_scatter.abd_offset = 0; - abd_alloc_pages(abd, size); - - if (is_metadata) { - abd->abd_flags |= ABD_FLAG_META; - } - abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); - - ABDSTAT_BUMP(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, size); - ABDSTAT_INCR(abdstat_scatter_chunk_waste, - P2ROUNDUP(size, PAGESIZE) - size); - - return (abd); -} - -static void -abd_free_scatter(abd_t *abd) -{ - abd_free_pages(abd); - - zfs_refcount_destroy(&abd->abd_children); - ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); - ABDSTAT_INCR(abdstat_scatter_chunk_waste, - (int)abd->abd_size - (int)P2ROUNDUP(abd->abd_size, PAGESIZE)); - - abd_free_struct(abd); -} - -/* - * Allocate an ABD that must be linear, along with its own underlying data - * buffer. Only use this when it would be very annoying to write your ABD - * consumer with a scattered ABD. - */ -abd_t * -abd_alloc_linear(size_t size, boolean_t is_metadata) -{ - abd_t *abd = abd_alloc_struct(); - - VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - - abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER; - if (is_metadata) { - abd->abd_flags |= ABD_FLAG_META; - } - abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); - - if (is_metadata) { - abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size); - } else { - abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size); - } - - ABDSTAT_BUMP(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, size); - - return (abd); -} - -static void -abd_free_linear(abd_t *abd) -{ - if (abd_is_linear_page(abd)) { - /* Transform it back into a scatter ABD for freeing */ - struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl; - abd->abd_flags &= ~ABD_FLAG_LINEAR; - abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE; - ABD_SCATTER(abd).abd_nents = 1; - ABD_SCATTER(abd).abd_offset = 0; - ABD_SCATTER(abd).abd_sgl = sg; - abd_free_scatter(abd); - return; - } - if (abd->abd_flags & ABD_FLAG_META) { - zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); - } else { - zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); - } - - zfs_refcount_destroy(&abd->abd_children); - ABDSTAT_BUMPDOWN(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); - - abd_free_struct(abd); -} - -/* - * Free an ABD. Only use this on ABDs allocated with abd_alloc() or - * abd_alloc_linear(). - */ -void -abd_free(abd_t *abd) -{ - abd_verify(abd); - ASSERT3P(abd->abd_parent, ==, NULL); - ASSERT(abd->abd_flags & ABD_FLAG_OWNER); - if (abd_is_linear(abd)) - abd_free_linear(abd); - else - abd_free_scatter(abd); -} - -/* - * Allocate an ABD of the same format (same metadata flag, same scatterize - * setting) as another ABD. 
- */ -abd_t * -abd_alloc_sametype(abd_t *sabd, size_t size) -{ - boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0; - if (abd_is_linear(sabd) && - !abd_is_linear_page(sabd)) { - return (abd_alloc_linear(size, is_metadata)); - } else { - return (abd_alloc(size, is_metadata)); - } -} - -/* - * If we're going to use this ABD for doing I/O using the block layer, the - * consumer of the ABD data doesn't care if it's scattered or not, and we don't - * plan to store this ABD in memory for a long period of time, we should - * allocate the ABD type that requires the least data copying to do the I/O. - * - * On Illumos this is linear ABDs, however if ldi_strategy() can ever issue I/Os - * using a scatter/gather list we should switch to that and replace this call - * with vanilla abd_alloc(). - * - * On Linux the optimal thing to do would be to use abd_get_offset() and - * construct a new ABD which shares the original pages thereby eliminating - * the copy. But for the moment a new linear ABD is allocated until this - * performance optimization can be implemented. - */ -abd_t * -abd_alloc_for_io(size_t size, boolean_t is_metadata) -{ - return (abd_alloc(size, is_metadata)); -} - -/* - * Allocate a new ABD to point to offset off of sabd. It shares the underlying - * buffer data with sabd. Use abd_put() to free. sabd must not be freed while - * any derived ABDs exist. - */ -static inline abd_t * -abd_get_offset_impl(abd_t *sabd, size_t off, size_t size) -{ - abd_t *abd; - - abd_verify(sabd); - ASSERT3U(off, <=, sabd->abd_size); - - if (abd_is_linear(sabd)) { - abd = abd_alloc_struct(); - - /* - * Even if this buf is filesystem metadata, we only track that - * if we own the underlying data buffer, which is not true in - * this case. Therefore, we don't ever use ABD_FLAG_META here. - */ - abd->abd_flags = ABD_FLAG_LINEAR; - - abd->abd_u.abd_linear.abd_buf = - (char *)sabd->abd_u.abd_linear.abd_buf + off; - } else { - int i = 0; - struct scatterlist *sg = NULL; - size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off; - - abd = abd_alloc_struct(); - - /* - * Even if this buf is filesystem metadata, we only track that - * if we own the underlying data buffer, which is not true in - * this case. Therefore, we don't ever use ABD_FLAG_META here. - */ - abd->abd_flags = 0; - - abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) { - if (new_offset < sg->length) - break; - new_offset -= sg->length; - } - - ABD_SCATTER(abd).abd_sgl = sg; - ABD_SCATTER(abd).abd_offset = new_offset; - ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i; - } - - abd->abd_size = size; - abd->abd_parent = sabd; - zfs_refcount_create(&abd->abd_children); - (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd); - - return (abd); -} - -abd_t * -abd_get_offset(abd_t *sabd, size_t off) -{ - size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0; - - VERIFY3U(size, >, 0); - - return (abd_get_offset_impl(sabd, off, size)); -} - -abd_t * -abd_get_offset_size(abd_t *sabd, size_t off, size_t size) -{ - ASSERT3U(off + size, <=, sabd->abd_size); - - return (abd_get_offset_impl(sabd, off, size)); -} - -/* - * Allocate a linear ABD structure for buf. You must free this with abd_put() - * since the resulting ABD doesn't own its own buffer. 
- */ -abd_t * -abd_get_from_buf(void *buf, size_t size) -{ - abd_t *abd = abd_alloc_struct(); - - VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - - /* - * Even if this buf is filesystem metadata, we only track that if we - * own the underlying data buffer, which is not true in this case. - * Therefore, we don't ever use ABD_FLAG_META here. - */ - abd->abd_flags = ABD_FLAG_LINEAR; - abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); - - abd->abd_u.abd_linear.abd_buf = buf; - - return (abd); -} - -/* - * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not - * free the underlying scatterlist or buffer. - */ -void -abd_put(abd_t *abd) -{ - abd_verify(abd); - ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); - - if (abd->abd_parent != NULL) { - (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children, - abd->abd_size, abd); - } - - zfs_refcount_destroy(&abd->abd_children); - abd_free_struct(abd); -} - -/* - * Get the raw buffer associated with a linear ABD. - */ -void * -abd_to_buf(abd_t *abd) -{ - ASSERT(abd_is_linear(abd)); - abd_verify(abd); - return (abd->abd_u.abd_linear.abd_buf); -} - -/* - * Borrow a raw buffer from an ABD without copying the contents of the ABD - * into the buffer. If the ABD is scattered, this will allocate a raw buffer - * whose contents are undefined. To copy over the existing data in the ABD, use - * abd_borrow_buf_copy() instead. - */ -void * -abd_borrow_buf(abd_t *abd, size_t n) -{ - void *buf; - abd_verify(abd); - ASSERT3U(abd->abd_size, >=, n); - if (abd_is_linear(abd)) { - buf = abd_to_buf(abd); - } else { - buf = zio_buf_alloc(n); - } - (void) zfs_refcount_add_many(&abd->abd_children, n, buf); - - return (buf); -} - -void * -abd_borrow_buf_copy(abd_t *abd, size_t n) -{ - void *buf = abd_borrow_buf(abd, n); - if (!abd_is_linear(abd)) { - abd_copy_to_buf(buf, abd, n); - } - return (buf); -} - -/* - * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will - * not change the contents of the ABD and will ASSERT that you didn't modify - * the buffer since it was borrowed. If you want any changes you made to buf to - * be copied back to abd, use abd_return_buf_copy() instead. - */ -void -abd_return_buf(abd_t *abd, void *buf, size_t n) -{ - abd_verify(abd); - ASSERT3U(abd->abd_size, >=, n); - if (abd_is_linear(abd)) { - ASSERT3P(buf, ==, abd_to_buf(abd)); - } else { - ASSERT0(abd_cmp_buf(abd, buf, n)); - zio_buf_free(buf, n); - } - (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); -} - -void -abd_return_buf_copy(abd_t *abd, void *buf, size_t n) -{ - if (!abd_is_linear(abd)) { - abd_copy_from_buf(abd, buf, n); - } - abd_return_buf(abd, buf, n); -} - -/* - * Give this ABD ownership of the buffer that it's storing. Can only be used on - * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated - * with abd_alloc_linear() which subsequently released ownership of their buf - * with abd_release_ownership_of_buf(). - */ -void -abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata) -{ - ASSERT(abd_is_linear(abd)); - ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); - abd_verify(abd); - - abd->abd_flags |= ABD_FLAG_OWNER; - if (is_metadata) { - abd->abd_flags |= ABD_FLAG_META; - } - - ABDSTAT_BUMP(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); -} - -void -abd_release_ownership_of_buf(abd_t *abd) -{ - ASSERT(abd_is_linear(abd)); - ASSERT(abd->abd_flags & ABD_FLAG_OWNER); - - /* - * abd_free() needs to handle LINEAR_PAGE ABD's specially. 
- * Since that flag does not survive the - * abd_release_ownership_of_buf() -> abd_get_from_buf() -> - * abd_take_ownership_of_buf() sequence, we don't allow releasing - * these "linear but not zio_[data_]buf_alloc()'ed" ABD's. - */ - ASSERT(!abd_is_linear_page(abd)); - - abd_verify(abd); - - abd->abd_flags &= ~ABD_FLAG_OWNER; - /* Disable this flag since we no longer own the data buffer */ - abd->abd_flags &= ~ABD_FLAG_META; - - ABDSTAT_BUMPDOWN(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); -} - -struct abd_iter { - /* public interface */ - void *iter_mapaddr; /* addr corresponding to iter_pos */ - size_t iter_mapsize; /* length of data valid at mapaddr */ - - /* private */ - abd_t *iter_abd; /* ABD being iterated through */ - size_t iter_pos; - size_t iter_offset; /* offset in current sg/abd_buf, */ - /* abd_offset included */ - struct scatterlist *iter_sg; /* current sg */ -}; - -/* - * Initialize the abd_iter. - */ -static void -abd_iter_init(struct abd_iter *aiter, abd_t *abd, int km_type) -{ - abd_verify(abd); - aiter->iter_abd = abd; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; - aiter->iter_pos = 0; - if (abd_is_linear(abd)) { - aiter->iter_offset = 0; - aiter->iter_sg = NULL; - } else { - aiter->iter_offset = ABD_SCATTER(abd).abd_offset; - aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; - } -} - -/* - * Advance the iterator by a certain amount. Cannot be called when a chunk is - * in use. This can be safely called when the aiter has already exhausted, in - * which case this does nothing. - */ -static void -abd_iter_advance(struct abd_iter *aiter, size_t amount) -{ - ASSERT3P(aiter->iter_mapaddr, ==, NULL); - ASSERT0(aiter->iter_mapsize); - - /* There's nothing left to advance to, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - aiter->iter_pos += amount; - aiter->iter_offset += amount; - if (!abd_is_linear(aiter->iter_abd)) { - while (aiter->iter_offset >= aiter->iter_sg->length) { - aiter->iter_offset -= aiter->iter_sg->length; - aiter->iter_sg = sg_next(aiter->iter_sg); - if (aiter->iter_sg == NULL) { - ASSERT0(aiter->iter_offset); - break; - } - } - } -} - -/* - * Map the current chunk into aiter. This can be safely called when the aiter - * has already exhausted, in which case this does nothing. - */ -static void -abd_iter_map(struct abd_iter *aiter) -{ - void *paddr; - size_t offset = 0; - - ASSERT3P(aiter->iter_mapaddr, ==, NULL); - ASSERT0(aiter->iter_mapsize); - - /* There's nothing left to iterate over, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - if (abd_is_linear(aiter->iter_abd)) { - ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); - offset = aiter->iter_offset; - aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; - paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf; - } else { - offset = aiter->iter_offset; - aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset, - aiter->iter_abd->abd_size - aiter->iter_pos); - - paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg), - km_table[aiter->iter_km]); - } - - aiter->iter_mapaddr = (char *)paddr + offset; -} - -/* - * Unmap the current chunk from aiter. This can be safely called when the aiter - * has already exhausted, in which case this does nothing. 
- */ -static void -abd_iter_unmap(struct abd_iter *aiter) -{ - /* There's nothing left to unmap, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - if (!abd_is_linear(aiter->iter_abd)) { - /* LINTED E_FUNC_SET_NOT_USED */ - zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset, - km_table[aiter->iter_km]); - } - - ASSERT3P(aiter->iter_mapaddr, !=, NULL); - ASSERT3U(aiter->iter_mapsize, >, 0); - - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; -} - -int -abd_iterate_func(abd_t *abd, size_t off, size_t size, - abd_iter_func_t *func, void *private) -{ - int ret = 0; - struct abd_iter aiter; - - abd_verify(abd); - ASSERT3U(off + size, <=, abd->abd_size); - - abd_iter_init(&aiter, abd, 0); - abd_iter_advance(&aiter, off); - - while (size > 0) { - abd_iter_map(&aiter); - - size_t len = MIN(aiter.iter_mapsize, size); - ASSERT3U(len, >, 0); - - ret = func(aiter.iter_mapaddr, len, private); - - abd_iter_unmap(&aiter); - - if (ret != 0) - break; - - size -= len; - abd_iter_advance(&aiter, len); - } - - return (ret); -} - -struct buf_arg { - void *arg_buf; -}; - -static int -abd_copy_to_buf_off_cb(void *buf, size_t size, void *private) -{ - struct buf_arg *ba_ptr = private; - - (void) memcpy(ba_ptr->arg_buf, buf, size); - ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; - - return (0); -} - -/* - * Copy abd to buf. (off is the offset in abd.) - */ -void -abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size) -{ - struct buf_arg ba_ptr = { buf }; - - (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb, - &ba_ptr); -} - -static int -abd_cmp_buf_off_cb(void *buf, size_t size, void *private) -{ - int ret; - struct buf_arg *ba_ptr = private; - - ret = memcmp(buf, ba_ptr->arg_buf, size); - ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; - - return (ret); -} - -/* - * Compare the contents of abd to buf. (off is the offset in abd.) - */ -int -abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) -{ - struct buf_arg ba_ptr = { (void *) buf }; - - return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr)); -} - -static int -abd_copy_from_buf_off_cb(void *buf, size_t size, void *private) -{ - struct buf_arg *ba_ptr = private; - - (void) memcpy(buf, ba_ptr->arg_buf, size); - ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; - - return (0); -} - -/* - * Copy from buf to abd. (off is the offset in abd.) - */ -void -abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) -{ - struct buf_arg ba_ptr = { (void *) buf }; - - (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb, - &ba_ptr); -} - -/*ARGSUSED*/ -static int -abd_zero_off_cb(void *buf, size_t size, void *private) -{ - (void) memset(buf, 0, size); - return (0); -} - -/* - * Zero out the abd from a particular offset to the end. - */ -void -abd_zero_off(abd_t *abd, size_t off, size_t size) -{ - (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL); -} - -/* - * Iterate over two ABDs and call func incrementally on the two ABDs' data in - * equal-sized chunks (passed to func as raw buffers). func could be called many - * times during this iteration. 
- */ -int -abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, - size_t size, abd_iter_func2_t *func, void *private) -{ - int ret = 0; - struct abd_iter daiter, saiter; - - abd_verify(dabd); - abd_verify(sabd); - - ASSERT3U(doff + size, <=, dabd->abd_size); - ASSERT3U(soff + size, <=, sabd->abd_size); - - abd_iter_init(&daiter, dabd, 0); - abd_iter_init(&saiter, sabd, 1); - abd_iter_advance(&daiter, doff); - abd_iter_advance(&saiter, soff); - - while (size > 0) { - abd_iter_map(&daiter); - abd_iter_map(&saiter); - - size_t dlen = MIN(daiter.iter_mapsize, size); - size_t slen = MIN(saiter.iter_mapsize, size); - size_t len = MIN(dlen, slen); - ASSERT(dlen > 0 || slen > 0); - - ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len, - private); - - abd_iter_unmap(&saiter); - abd_iter_unmap(&daiter); - - if (ret != 0) - break; - - size -= len; - abd_iter_advance(&daiter, len); - abd_iter_advance(&saiter, len); - } - - return (ret); -} - -/*ARGSUSED*/ -static int -abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private) -{ - (void) memcpy(dbuf, sbuf, size); - return (0); -} - -/* - * Copy from sabd to dabd starting from soff and doff. - */ -void -abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size) -{ - (void) abd_iterate_func2(dabd, sabd, doff, soff, size, - abd_copy_off_cb, NULL); -} - -/*ARGSUSED*/ -static int -abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private) -{ - return (memcmp(bufa, bufb, size)); -} - -/* - * Compares the contents of two ABDs. - */ -int -abd_cmp(abd_t *dabd, abd_t *sabd) -{ - ASSERT3U(dabd->abd_size, ==, sabd->abd_size); - return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size, - abd_cmp_cb, NULL)); -} - -/* - * Iterate over code ABDs and a data ABD and call @func_raidz_gen. - * - * @cabds parity ABDs, must have equal size - * @dabd data ABD. Can be NULL (in this case @dsize = 0) - * @func_raidz_gen should be implemented so that its behaviour - * is the same when taking linear and when taking scatter - */ -void -abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, - ssize_t csize, ssize_t dsize, const unsigned parity, - void (*func_raidz_gen)(void **, const void *, size_t, size_t)) -{ - int i; - ssize_t len, dlen; - struct abd_iter caiters[3]; - struct abd_iter daiter = {0}; - void *caddrs[3]; - unsigned long flags; - - ASSERT3U(parity, <=, 3); - - for (i = 0; i < parity; i++) - abd_iter_init(&caiters[i], cabds[i], i); - - if (dabd) - abd_iter_init(&daiter, dabd, i); - - ASSERT3S(dsize, >=, 0); - - local_irq_save(flags); - while (csize > 0) { - len = csize; - - if (dabd && dsize > 0) - abd_iter_map(&daiter); - - for (i = 0; i < parity; i++) { - abd_iter_map(&caiters[i]); - caddrs[i] = caiters[i].iter_mapaddr; - } - - switch (parity) { - case 3: - len = MIN(caiters[2].iter_mapsize, len); - /* falls through */ - case 2: - len = MIN(caiters[1].iter_mapsize, len); - /* falls through */ - case 1: - len = MIN(caiters[0].iter_mapsize, len); - } - - /* must be progressive */ - ASSERT3S(len, >, 0); - - if (dabd && dsize > 0) { - /* this needs precise iter.length */ - len = MIN(daiter.iter_mapsize, len); - dlen = len; - } else - dlen = 0; - - /* must be progressive */ - ASSERT3S(len, >, 0); - /* - * The iterated function likely will not do well if each - * segment except the last one is not multiple of 512 (raidz). 
- */ - ASSERT3U(((uint64_t)len & 511ULL), ==, 0); - - func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen); - - for (i = parity-1; i >= 0; i--) { - abd_iter_unmap(&caiters[i]); - abd_iter_advance(&caiters[i], len); - } - - if (dabd && dsize > 0) { - abd_iter_unmap(&daiter); - abd_iter_advance(&daiter, dlen); - dsize -= dlen; - } - - csize -= len; - - ASSERT3S(dsize, >=, 0); - ASSERT3S(csize, >=, 0); - } - local_irq_restore(flags); -} - -/* - * Iterate over code ABDs and data reconstruction target ABDs and call - * @func_raidz_rec. Function maps at most 6 pages atomically. - * - * @cabds parity ABDs, must have equal size - * @tabds rec target ABDs, at most 3 - * @tsize size of data target columns - * @func_raidz_rec expects syndrome data in target columns. Function - * reconstructs data and overwrites target columns. - */ -void -abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, - ssize_t tsize, const unsigned parity, - void (*func_raidz_rec)(void **t, const size_t tsize, void **c, - const unsigned *mul), - const unsigned *mul) -{ - int i; - ssize_t len; - struct abd_iter citers[3]; - struct abd_iter xiters[3]; - void *caddrs[3], *xaddrs[3]; - unsigned long flags; - - ASSERT3U(parity, <=, 3); - - for (i = 0; i < parity; i++) { - abd_iter_init(&citers[i], cabds[i], 2*i); - abd_iter_init(&xiters[i], tabds[i], 2*i+1); - } - - local_irq_save(flags); - while (tsize > 0) { - - for (i = 0; i < parity; i++) { - abd_iter_map(&citers[i]); - abd_iter_map(&xiters[i]); - caddrs[i] = citers[i].iter_mapaddr; - xaddrs[i] = xiters[i].iter_mapaddr; - } - - len = tsize; - switch (parity) { - case 3: - len = MIN(xiters[2].iter_mapsize, len); - len = MIN(citers[2].iter_mapsize, len); - /* falls through */ - case 2: - len = MIN(xiters[1].iter_mapsize, len); - len = MIN(citers[1].iter_mapsize, len); - /* falls through */ - case 1: - len = MIN(xiters[0].iter_mapsize, len); - len = MIN(citers[0].iter_mapsize, len); - } - /* must be progressive */ - ASSERT3S(len, >, 0); - /* - * The iterated function likely will not do well if each - * segment except the last one is not multiple of 512 (raidz). - */ - ASSERT3U(((uint64_t)len & 511ULL), ==, 0); - - func_raidz_rec(xaddrs, len, caddrs, mul); - - for (i = parity-1; i >= 0; i--) { - abd_iter_unmap(&xiters[i]); - abd_iter_unmap(&citers[i]); - abd_iter_advance(&xiters[i], len); - abd_iter_advance(&citers[i], len); - } - - tsize -= len; - ASSERT3S(tsize, >=, 0); - } - local_irq_restore(flags); -} - -#if defined(_KERNEL) -/* - * bio_nr_pages for ABD. - * @off is the offset in @abd - */ -unsigned long -abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off) -{ - unsigned long pos; - - if (abd_is_linear(abd)) - pos = (unsigned long)abd_to_buf(abd) + off; - else - pos = abd->abd_u.abd_scatter.abd_offset + off; - - return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) - - (pos >> PAGE_SHIFT); -} - -/* - * bio_map for scatter ABD. 
- * @off is the offset in @abd - * Remaining IO size is returned - */ -unsigned int -abd_scatter_bio_map_off(struct bio *bio, abd_t *abd, - unsigned int io_size, size_t off) -{ - int i; - struct abd_iter aiter; - - ASSERT(!abd_is_linear(abd)); - ASSERT3U(io_size, <=, abd->abd_size - off); - - abd_iter_init(&aiter, abd, 0); - abd_iter_advance(&aiter, off); - - for (i = 0; i < bio->bi_max_vecs; i++) { - struct page *pg; - size_t len, sgoff, pgoff; - struct scatterlist *sg; - - if (io_size <= 0) - break; - - sg = aiter.iter_sg; - sgoff = aiter.iter_offset; - pgoff = sgoff & (PAGESIZE - 1); - len = MIN(io_size, PAGESIZE - pgoff); - ASSERT(len > 0); - - pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT); - if (bio_add_page(bio, pg, len, pgoff) != len) - break; - - io_size -= len; - abd_iter_advance(&aiter, len); - } - - return (io_size); -} - -/* Tunable Parameters */ -module_param(zfs_abd_scatter_enabled, int, 0644); -MODULE_PARM_DESC(zfs_abd_scatter_enabled, - "Toggle whether ABD allocations must be linear."); -module_param(zfs_abd_scatter_min_size, int, 0644); -MODULE_PARM_DESC(zfs_abd_scatter_min_size, - "Minimum size of scatter allocations."); -/* CSTYLED */ -module_param(zfs_abd_scatter_max_order, uint, 0644); -MODULE_PARM_DESC(zfs_abd_scatter_max_order, - "Maximum order allocation used for a scatter ABD."); -#endif diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c new file mode 100644 index 000000000000..a8e8f404dd2d --- /dev/null +++ b/module/os/linux/zfs/abd_os.c @@ -0,0 +1,879 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2019 by Delphix. All rights reserved. + */ + +/* + * See abd.c for a general overview of the arc buffered data (ABD). + * + * Linear buffers act exactly like normal buffers and are always mapped into the + * kernel's virtual memory space, while scattered ABD data chunks are allocated + * as physical pages and then mapped in only while they are actually being + * accessed through one of the abd_* library functions. Using scattered ABDs + * provides several benefits: + * + * (1) They avoid use of kmem_*, preventing performance problems where running + * kmem_reap on very large memory systems never finishes and causes + * constant TLB shootdowns. + * + * (2) Fragmentation is less of an issue since when we are at the limit of + * allocatable space, we won't have to search around for a long free + * hole in the VA space for large ARC allocations. Each chunk is mapped in + * individually, so even if we are using HIGHMEM (see next point) we + * wouldn't need to worry about finding a contiguous address range.
+ * + * (3) If we are not using HIGHMEM, then all physical memory is always + * mapped into the kernel's address space, so we also avoid the map / + * unmap costs on each ABD access. + * + * If we are not using HIGHMEM, scattered buffers which have only one chunk + * can be treated as linear buffers, because they are contiguous in the + * kernel's virtual address space. See abd_alloc_chunks() for details. + */ + +#include <sys/abd.h> +#include <sys/param.h> +#include <sys/zio.h> +#include <sys/zfs_context.h> +#include <sys/zfs_znode.h> +#ifdef _KERNEL +#include <linux/kmap_compat.h> +#include <linux/scatterlist.h> +#else +#define MAX_ORDER 1 +#endif + +typedef struct abd_stats { + kstat_named_t abdstat_struct_size; + kstat_named_t abdstat_linear_cnt; + kstat_named_t abdstat_linear_data_size; + kstat_named_t abdstat_scatter_cnt; + kstat_named_t abdstat_scatter_data_size; + kstat_named_t abdstat_scatter_chunk_waste; + kstat_named_t abdstat_scatter_orders[MAX_ORDER]; + kstat_named_t abdstat_scatter_page_multi_chunk; + kstat_named_t abdstat_scatter_page_multi_zone; + kstat_named_t abdstat_scatter_page_alloc_retry; + kstat_named_t abdstat_scatter_sg_table_retry; +} abd_stats_t; + +static abd_stats_t abd_stats = { + /* Amount of memory occupied by all of the abd_t struct allocations */ + { "struct_size", KSTAT_DATA_UINT64 }, + /* + * The number of linear ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset() and abd_get_from_buf()). If an + * ABD takes ownership of its buf then it will become tracked. + */ + { "linear_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all linear ABDs tracked by linear_cnt */ + { "linear_data_size", KSTAT_DATA_UINT64 }, + /* + * The number of scatter ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset()). + */ + { "scatter_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ + { "scatter_data_size", KSTAT_DATA_UINT64 }, + /* + * The amount of space wasted at the end of the last chunk across all + * scatter ABDs tracked by scatter_cnt. + */ + { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, + /* + * The number of compound allocations of a given order. These + * allocations are spread over all currently allocated ABDs, and + * act as a measure of memory fragmentation. + */ + { { "scatter_order_N", KSTAT_DATA_UINT64 } }, + /* + * The number of scatter ABDs which contain multiple chunks. + * ABDs are preferentially allocated from the minimum number of + * contiguous multi-page chunks, a single chunk is optimal. + */ + { "scatter_page_multi_chunk", KSTAT_DATA_UINT64 }, + /* + * The number of scatter ABDs which are split across memory zones. + * ABDs are preferentially allocated using pages from a single zone. + */ + { "scatter_page_multi_zone", KSTAT_DATA_UINT64 }, + /* + * The total number of retries encountered when attempting to + * allocate the pages to populate the scatter ABD. + */ + { "scatter_page_alloc_retry", KSTAT_DATA_UINT64 }, + /* + * The total number of retries encountered when attempting to + * allocate the sg table for an ABD. + */ + { "scatter_sg_table_retry", KSTAT_DATA_UINT64 }, +}; + +#define abd_for_each_sg(abd, sg, n, i) \ + for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i) + +unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1; + +/* + * zfs_abd_scatter_min_size is the minimum allocation size to use scatter + * ABD's. Smaller allocations will use linear ABD's which use + * zio_[data_]buf_alloc().
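The scatter-versus-linear cutoff that this comment goes on to quantify follows directly from abd_chunkcnt_for_bytes() below: a scatter ABD always occupies whole pages, so the slack tracked by abdstat_scatter_chunk_waste is simply the round-up remainder. A quick check of the arithmetic, assuming 4 KiB pages:

/*
 * waste = P2ROUNDUP(size, PAGESIZE) - size
 *   size =  512: 4096 -  512 = 3584 bytes lost (7/8 of the page)
 *   size = 1024: 4096 - 1024 = 3072 bytes lost (3/4 of the page)
 *   size = 1536: 4096 - 1536 = 2560 bytes lost
 * which is why sizes below 1536 (512 * 3, the default set just below)
 * stay linear, and anything at or above it goes scatter.
 */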
+ * + * Scatter ABD's use at least one page each, so sub-page allocations waste + * some space when allocated as scatter (e.g. 2KB scatter allocation wastes + * half of each page). Using linear ABD's for small allocations means that + * they will be put on slabs which contain many allocations. This can + * improve memory efficiency, but it also makes it much harder for ARC + * evictions to actually free pages, because all the buffers on one slab need + * to be freed in order for the slab (and underlying pages) to be freed. + * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's + * possible for them to actually waste more memory than scatter (one page per + * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th). + * + * Spill blocks are typically 512B and are heavily used on systems running + * selinux with the default dnode size and the `xattr=sa` property set. + * + * By default we use linear allocations for 512B and 1KB, and scatter + * allocations for larger (1.5KB and up). + */ +int zfs_abd_scatter_min_size = 512 * 3; + +static kmem_cache_t *abd_cache = NULL; +static kstat_t *abd_ksp; + +static size_t +abd_chunkcnt_for_bytes(size_t size) +{ + return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE); +} + +abd_t * +abd_alloc_struct(size_t size) +{ + /* + * In Linux we do not use the size passed in during ABD + * allocation, so we just ignore it. + */ + abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE); + ASSERT3P(abd, !=, NULL); + ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t)); + + return (abd); +} + +void +abd_free_struct(abd_t *abd) +{ + kmem_cache_free(abd_cache, abd); + ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t)); +} + +#ifdef _KERNEL +/* + * Mark zfs data pages so they can be excluded from kernel crash dumps + */ +#ifdef _LP64 +#define ABD_FILE_CACHE_PAGE 0x2F5ABDF11ECAC4E + +static inline void +abd_mark_zfs_page(struct page *page) +{ + get_page(page); + SetPagePrivate(page); + set_page_private(page, ABD_FILE_CACHE_PAGE); +} + +static inline void +abd_unmark_zfs_page(struct page *page) +{ + set_page_private(page, 0UL); + ClearPagePrivate(page); + put_page(page); +} +#else +#define abd_mark_zfs_page(page) +#define abd_unmark_zfs_page(page) +#endif /* _LP64 */ + +#ifndef CONFIG_HIGHMEM + +#ifndef __GFP_RECLAIM +#define __GFP_RECLAIM __GFP_WAIT +#endif + +/* + * The goal is to minimize fragmentation by preferentially populating ABDs + * with higher order compound pages from a single zone. Allocation size is + * progressively decreased until it can be satisfied without performing + * reclaim or compaction. When necessary this function will degenerate to + * allocating individual pages and allowing reclaim to satisfy allocations. + */ +void +abd_alloc_chunks(abd_t *abd, size_t size) +{ + struct list_head pages; + struct sg_table table; + struct scatterlist *sg; + struct page *page, *tmp_page = NULL; + gfp_t gfp = __GFP_NOWARN | GFP_NOIO; + gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM; + int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1); + int nr_pages = abd_chunkcnt_for_bytes(size); + int chunks = 0, zones = 0; + size_t remaining_size; + int nid = NUMA_NO_NODE; + int alloc_pages = 0; + + INIT_LIST_HEAD(&pages); + + while (alloc_pages < nr_pages) { + unsigned chunk_pages; + int order; + + order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order); + chunk_pages = (1U << order); + + page = alloc_pages_node(nid, order ? 
gfp_comp : gfp, order); + if (page == NULL) { + if (order == 0) { + ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); + schedule_timeout_interruptible(1); + } else { + max_order = MAX(0, order - 1); + } + continue; + } + + list_add_tail(&page->lru, &pages); + + if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid)) + zones++; + + nid = page_to_nid(page); + ABDSTAT_BUMP(abdstat_scatter_orders[order]); + chunks++; + alloc_pages += chunk_pages; + } + + ASSERT3S(alloc_pages, ==, nr_pages); + + while (sg_alloc_table(&table, chunks, gfp)) { + ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); + schedule_timeout_interruptible(1); + } + + sg = table.sgl; + remaining_size = size; + list_for_each_entry_safe(page, tmp_page, &pages, lru) { + size_t sg_size = MIN(PAGESIZE << compound_order(page), + remaining_size); + sg_set_page(sg, page, sg_size, 0); + abd_mark_zfs_page(page); + remaining_size -= sg_size; + + sg = sg_next(sg); + list_del(&page->lru); + } + + /* + * These conditions ensure that a possible transformation to a linear + * ABD would be valid. + */ + ASSERT(!PageHighMem(sg_page(table.sgl))); + ASSERT0(ABD_SCATTER(abd).abd_offset); + + if (table.nents == 1) { + /* + * Since there is only one entry, this ABD can be represented + * as a linear buffer. All single-page (4K) ABD's can be + * represented this way. Some multi-page ABD's can also be + * represented this way, if we were able to allocate a single + * "chunk" (higher-order "page" which represents a power-of-2 + * series of physically-contiguous pages). This is often the + * case for 2-page (8K) ABD's. + * + * Representing a single-entry scatter ABD as a linear ABD + * has the performance advantage of avoiding the copy (and + * allocation) in abd_borrow_buf_copy / abd_return_buf_copy. + * A performance increase of around 5% has been observed for + * ARC-cached reads (of small blocks which can take advantage + * of this). + * + * Note that this optimization is only possible because the + * pages are always mapped into the kernel's address space. + * This is not the case for highmem pages, so the + * optimization can not be made there. + */ + abd->abd_flags |= ABD_FLAG_LINEAR; + abd->abd_flags |= ABD_FLAG_LINEAR_PAGE; + abd->abd_u.abd_linear.abd_sgl = table.sgl; + ABD_LINEAR_BUF(abd) = page_address(sg_page(table.sgl)); + } else if (table.nents > 1) { + ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); + abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; + + if (zones) { + ABDSTAT_BUMP(abdstat_scatter_page_multi_zone); + abd->abd_flags |= ABD_FLAG_MULTI_ZONE; + } + + ABD_SCATTER(abd).abd_sgl = table.sgl; + ABD_SCATTER(abd).abd_nents = table.nents; + } +} +#else + +/* + * Allocate N individual pages to construct a scatter ABD. This function + * makes no attempt to request contiguous pages and requires the minimal + * number of kernel interfaces. It's designed for maximum compatibility. 
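Stepping back to the order-selection loop at the top of this abd_alloc_chunks() variant: orders are chosen greedily from highbit64(), and a failed compound allocation only lowers max_order rather than aborting. A worked trace with illustrative numbers:

/*
 * nr_pages = 13, max_order not clamping:
 *   remaining = 13 -> order = highbit64(13) - 1 = 3 -> 8-page chunk
 *   remaining =  5 -> order = highbit64(5)  - 1 = 2 -> 4-page chunk
 *   remaining =  1 -> order = highbit64(1)  - 1 = 0 -> single page
 * Three chunks (and three sg entries) satisfy the 13-page request;
 * under memory pressure the same request degrades toward thirteen
 * order-0 pages, with reclaim permitted only at order 0.
 */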
+ */ +void +abd_alloc_chunks(abd_t *abd, size_t size) +{ + struct scatterlist *sg = NULL; + struct sg_table table; + struct page *page; + gfp_t gfp = __GFP_NOWARN | GFP_NOIO; + int nr_pages = abd_chunkcnt_for_bytes(size); + int i = 0; + + while (sg_alloc_table(&table, nr_pages, gfp)) { + ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); + schedule_timeout_interruptible(1); + } + + ASSERT3U(table.nents, ==, nr_pages); + ABD_SCATTER(abd).abd_sgl = table.sgl; + ABD_SCATTER(abd).abd_nents = nr_pages; + + abd_for_each_sg(abd, sg, nr_pages, i) { + while ((page = __page_cache_alloc(gfp)) == NULL) { + ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); + schedule_timeout_interruptible(1); + } + + ABDSTAT_BUMP(abdstat_scatter_orders[0]); + sg_set_page(sg, page, PAGESIZE, 0); + abd_mark_zfs_page(page); + } + + if (nr_pages > 1) { + ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); + abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; + } +} +#endif /* !CONFIG_HIGHMEM */ + +/* + * This must be called if any of the sg_table allocation functions + * are called. + */ +static void +abd_free_sg_table(abd_t *abd) +{ + struct sg_table table; + + table.sgl = ABD_SCATTER(abd).abd_sgl; + table.nents = table.orig_nents = ABD_SCATTER(abd).abd_nents; + sg_free_table(&table); +} + +void +abd_free_chunks(abd_t *abd) +{ + struct scatterlist *sg = NULL; + struct page *page; + int nr_pages = ABD_SCATTER(abd).abd_nents; + int order, i = 0; + + if (abd->abd_flags & ABD_FLAG_MULTI_ZONE) + ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone); + + if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK) + ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); + + abd_for_each_sg(abd, sg, nr_pages, i) { + page = sg_page(sg); + abd_unmark_zfs_page(page); + order = compound_order(page); + __free_pages(page, order); + ASSERT3U(sg->length, <=, PAGE_SIZE << order); + ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]); + } + abd_free_sg_table(abd); +} + +#else /* _KERNEL */ + +#ifndef PAGE_SHIFT +#define PAGE_SHIFT (highbit64(PAGESIZE)-1) +#endif + +struct page; + +#define zfs_kmap_atomic(chunk, km) ((void *)chunk) +#define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0) +#define local_irq_save(flags) do { (void)(flags); } while (0) +#define local_irq_restore(flags) do { (void)(flags); } while (0) +#define nth_page(pg, i) \ + ((struct page *)((void *)(pg) + (i) * PAGESIZE)) + +struct scatterlist { + struct page *page; + int length; + int end; +}; + +static void +sg_init_table(struct scatterlist *sg, int nr) +{ + memset(sg, 0, nr * sizeof (struct scatterlist)); + sg[nr - 1].end = 1; +} + +/* + * This must be called if any of the sg_table allocation functions + * are called. 
+ */ +static void +abd_free_sg_table(abd_t *abd) +{ + int nents = ABD_SCATTER(abd).abd_nents; + vmem_free(ABD_SCATTER(abd).abd_sgl, + nents * sizeof (struct scatterlist)); +} + +#define for_each_sg(sgl, sg, nr, i) \ + for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg)) + +static inline void +sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, + unsigned int offset) +{ + /* currently we don't use offset */ + ASSERT(offset == 0); + sg->page = page; + sg->length = len; +} + +static inline struct page * +sg_page(struct scatterlist *sg) +{ + return (sg->page); +} + +static inline struct scatterlist * +sg_next(struct scatterlist *sg) +{ + if (sg->end) + return (NULL); + + return (sg + 1); +} + +void +abd_alloc_chunks(abd_t *abd, size_t size) +{ + unsigned nr_pages = abd_chunkcnt_for_bytes(size); + struct scatterlist *sg; + int i; + + ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages * + sizeof (struct scatterlist), KM_SLEEP); + sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages); + + abd_for_each_sg(abd, sg, nr_pages, i) { + struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP); + sg_set_page(sg, p, PAGESIZE, 0); + } + ABD_SCATTER(abd).abd_nents = nr_pages; +} + +void +abd_free_chunks(abd_t *abd) +{ + int i, n = ABD_SCATTER(abd).abd_nents; + struct scatterlist *sg; + + abd_for_each_sg(abd, sg, n, i) { + for (int j = 0; j < sg->length; j += PAGESIZE) { + struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT); + umem_free(p, PAGESIZE); + } + } + abd_free_sg_table(abd); +} + +#endif /* _KERNEL */ + +boolean_t +abd_size_alloc_linear(size_t size) +{ + return (size < zfs_abd_scatter_min_size ? B_TRUE : B_FALSE); +} + +void +abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) +{ + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); + if (op == ABDSTAT_INCR) { + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size); + } else { + ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + (int)abd->abd_size + -(int)P2ROUNDUP(abd->abd_size, PAGESIZE)); + } +} + +void +abd_update_linear_stats(abd_t *abd, abd_stats_op_t op) +{ + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); + if (op == ABDSTAT_INCR) { + ABDSTAT_BUMP(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); + } else { + ABDSTAT_BUMPDOWN(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); + } +} + +void +abd_verify_scatter(abd_t *abd) +{ + size_t n; + int i = 0; + struct scatterlist *sg = NULL; + + ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0); + ASSERT3U(ABD_SCATTER(abd).abd_offset, <, + ABD_SCATTER(abd).abd_sgl->length); + n = ABD_SCATTER(abd).abd_nents; + abd_for_each_sg(abd, sg, n, i) { + ASSERT3P(sg_page(sg), !=, NULL); + } +} + +void +abd_init(void) +{ + int i; + + abd_cache = kmem_cache_create("abd_t", sizeof (abd_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + + abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, + sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (abd_ksp != NULL) { + for (i = 0; i < MAX_ORDER; i++) { + snprintf(abd_stats.abdstat_scatter_orders[i].name, + KSTAT_STRLEN, "scatter_order_%d", i); + abd_stats.abdstat_scatter_orders[i].data_type = + KSTAT_DATA_UINT64; + } + abd_ksp->ks_data = &abd_stats; + kstat_install(abd_ksp); + } +} + +void +abd_fini(void) +{ + if 
(abd_ksp != NULL) { + kstat_delete(abd_ksp); + abd_ksp = NULL; + } + + if (abd_cache) { + kmem_cache_destroy(abd_cache); + abd_cache = NULL; + } +} + +void +abd_free_linear_page(abd_t *abd) +{ + /* Transform it back into a scatter ABD for freeing */ + struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl; + abd->abd_flags &= ~ABD_FLAG_LINEAR; + abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE; + ABD_SCATTER(abd).abd_nents = 1; + ABD_SCATTER(abd).abd_offset = 0; + ABD_SCATTER(abd).abd_sgl = sg; + abd_free_chunks(abd); + + zfs_refcount_destroy(&abd->abd_children); + abd_update_scatter_stats(abd, ABDSTAT_DECR); + abd_free_struct(abd); +} + +/* + * If we're going to use this ABD for doing I/O using the block layer, the + * consumer of the ABD data doesn't care if it's scattered or not, and we don't + * plan to store this ABD in memory for a long period of time, we should + * allocate the ABD type that requires the least data copying to do the I/O. + * + * On Linux the optimal thing to do would be to use abd_get_offset() and + * construct a new ABD which shares the original pages thereby eliminating + * the copy. But for the moment a new linear ABD is allocated until this + * performance optimization can be implemented. + */ +abd_t * +abd_alloc_for_io(size_t size, boolean_t is_metadata) +{ + return (abd_alloc(size, is_metadata)); +} + +abd_t * +abd_get_offset_scatter(abd_t *sabd, size_t off) +{ + abd_t *abd = NULL; + int i = 0; + struct scatterlist *sg = NULL; + + abd_verify(sabd); + ASSERT3U(off, <=, sabd->abd_size); + + size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; + + abd = abd_alloc_struct(0); + + /* + * Even if this buf is filesystem metadata, we only track that + * if we own the underlying data buffer, which is not true in + * this case. Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = 0; + + abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) { + if (new_offset < sg->length) + break; + new_offset -= sg->length; + } + + ABD_SCATTER(abd).abd_sgl = sg; + ABD_SCATTER(abd).abd_offset = new_offset; + ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i; + + return (abd); +} + +/* + * Initialize the abd_iter. + */ +void +abd_iter_init(struct abd_iter *aiter, abd_t *abd) +{ + abd_verify(abd); + aiter->iter_abd = abd; + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; + aiter->iter_pos = 0; + if (abd_is_linear(abd)) { + aiter->iter_offset = 0; + aiter->iter_sg = NULL; + } else { + aiter->iter_offset = ABD_SCATTER(abd).abd_offset; + aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; + } +} + +/* + * This is just a helper function to see if we have exhausted the + * abd_iter and reached the end. + */ +boolean_t +abd_iter_at_end(struct abd_iter *aiter) +{ + return (aiter->iter_pos == aiter->iter_abd->abd_size); +} + +/* + * Advance the iterator by a certain amount. Cannot be called when a chunk is + * in use. This can be safely called when the aiter has already exhausted, in + * which case this does nothing. 
+ */ +void +abd_iter_advance(struct abd_iter *aiter, size_t amount) +{ + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* There's nothing left to advance to, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + aiter->iter_pos += amount; + aiter->iter_offset += amount; + if (!abd_is_linear(aiter->iter_abd)) { + while (aiter->iter_offset >= aiter->iter_sg->length) { + aiter->iter_offset -= aiter->iter_sg->length; + aiter->iter_sg = sg_next(aiter->iter_sg); + if (aiter->iter_sg == NULL) { + ASSERT0(aiter->iter_offset); + break; + } + } + } +} + +/* + * Map the current chunk into aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. + */ +void +abd_iter_map(struct abd_iter *aiter) +{ + void *paddr; + size_t offset = 0; + + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* There's nothing left to iterate over, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + if (abd_is_linear(aiter->iter_abd)) { + ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); + offset = aiter->iter_offset; + aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; + paddr = ABD_LINEAR_BUF(aiter->iter_abd); + } else { + offset = aiter->iter_offset; + aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset, + aiter->iter_abd->abd_size - aiter->iter_pos); + + paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg), + km_table[aiter->iter_km]); + } + + aiter->iter_mapaddr = (char *)paddr + offset; +} + +/* + * Unmap the current chunk from aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. + */ +void +abd_iter_unmap(struct abd_iter *aiter) +{ + /* There's nothing left to unmap, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + if (!abd_is_linear(aiter->iter_abd)) { + /* LINTED E_FUNC_SET_NOT_USED */ + zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset, + km_table[aiter->iter_km]); + } + + ASSERT3P(aiter->iter_mapaddr, !=, NULL); + ASSERT3U(aiter->iter_mapsize, >, 0); + + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +#if defined(_KERNEL) +/* + * bio_nr_pages for ABD. + * @off is the offset in @abd + */ +unsigned long +abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off) +{ + unsigned long pos; + + if (abd_is_linear(abd)) + pos = (unsigned long)abd_to_buf(abd) + off; + else + pos = ABD_SCATTER(abd).abd_offset + off; + + return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) - + (pos >> PAGE_SHIFT); +} + +/* + * bio_map for scatter ABD. 
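The span computation in abd_nr_pages_off() just above is ceil((pos + size) / PAGESIZE) - floor(pos / PAGESIZE), i.e. the number of distinct pages the byte range touches. A quick check, assuming 4 KiB pages (PAGE_SHIFT = 12):

/*
 * pos = 3000, size = 5000 -> bytes [3000, 8000) -> pages 0..1:
 *   ((3000 + 5000 + 4095) >> 12) - (3000 >> 12) = 2 - 0 = 2
 * pos = 4000, size = 5000 -> bytes [4000, 9000) -> pages 0..2:
 *   ((4000 + 5000 + 4095) >> 12) - (4000 >> 12) = 3 - 0 = 3
 */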
+ * @off is the offset in @abd + * Remaining IO size is returned + */ +unsigned int +abd_scatter_bio_map_off(struct bio *bio, abd_t *abd, + unsigned int io_size, size_t off) +{ + int i; + struct abd_iter aiter; + + ASSERT(!abd_is_linear(abd)); + ASSERT3U(io_size, <=, abd->abd_size - off); + + abd_iter_init(&aiter, abd); + abd_iter_advance(&aiter, off); + + for (i = 0; i < bio->bi_max_vecs; i++) { + struct page *pg; + size_t len, sgoff, pgoff; + struct scatterlist *sg; + + if (io_size <= 0) + break; + + sg = aiter.iter_sg; + sgoff = aiter.iter_offset; + pgoff = sgoff & (PAGESIZE - 1); + len = MIN(io_size, PAGESIZE - pgoff); + ASSERT(len > 0); + + pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT); + if (bio_add_page(bio, pg, len, pgoff) != len) + break; + + io_size -= len; + abd_iter_advance(&aiter, len); + } + + return (io_size); +} + +/* Tunable Parameters */ +module_param(zfs_abd_scatter_enabled, int, 0644); +MODULE_PARM_DESC(zfs_abd_scatter_enabled, + "Toggle whether ABD allocations must be linear."); +module_param(zfs_abd_scatter_min_size, int, 0644); +MODULE_PARM_DESC(zfs_abd_scatter_min_size, + "Minimum size of scatter allocations."); +/* CSTYLED */ +module_param(zfs_abd_scatter_max_order, uint, 0644); +MODULE_PARM_DESC(zfs_abd_scatter_max_order, + "Maximum order allocation used for a scatter ABD."); +#endif diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 6737336caef9..3a9663997033 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -14,6 +14,7 @@ ccflags-y += $(NO_UNUSED_BUT_SET_VARIABLE) # Suppress unused-value warnings in sparc64 architecture headers ccflags-$(CONFIG_SPARC64) += -Wno-unused-value +$(MODULE)-objs += abd.o $(MODULE)-objs += aggsum.o $(MODULE)-objs += arc.o $(MODULE)-objs += blkptr.o diff --git a/module/os/freebsd/zfs/abd.c b/module/zfs/abd.c similarity index 61% rename from module/os/freebsd/zfs/abd.c rename to module/zfs/abd.c index 888a113a4291..abb5d5f2ed38 100644 --- a/module/os/freebsd/zfs/abd.c +++ b/module/zfs/abd.c @@ -1,17 +1,26 @@ /* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. + * CDDL HEADER START * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END */ - /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. - * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright (c) 2019 by Delphix. All rights reserved. 
*/ /* @@ -50,11 +59,6 @@ * +----------------->| chunk N-1 | * +-----------+ * - * Using a large proportion of scattered ABDs decreases ARC fragmentation since - * when we are at the limit of allocatable space, using equal-size chunks will - * allow us to quickly reclaim enough space for a new large allocation (assuming - * it is also scattered). - * * In addition to directly allocating a linear or scattered ABD, it is also * possible to create an ABD by requesting the "sub-ABD" starting at an offset * within an existing ABD. In linear buffers this is simple (set abd_buf of @@ -83,186 +87,55 @@ * compare, copy, read, write, and fill with zeroes. If you need a custom * function which progressively accesses the whole ABD, use the abd_iterate_* * functions. + * + * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to + * B_FALSE. */ -#include +#include #include #include #include #include -typedef struct abd_stats { - kstat_named_t abdstat_struct_size; - kstat_named_t abdstat_scatter_cnt; - kstat_named_t abdstat_scatter_data_size; - kstat_named_t abdstat_scatter_chunk_waste; - kstat_named_t abdstat_linear_cnt; - kstat_named_t abdstat_linear_data_size; -} abd_stats_t; - -static abd_stats_t abd_stats = { - /* Amount of memory occupied by all of the abd_t struct allocations */ - { "struct_size", KSTAT_DATA_UINT64 }, - /* - * The number of scatter ABDs which are currently allocated, excluding - * ABDs which don't own their data (for instance the ones which were - * allocated through abd_get_offset()). - */ - { "scatter_cnt", KSTAT_DATA_UINT64 }, - /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ - { "scatter_data_size", KSTAT_DATA_UINT64 }, - /* - * The amount of space wasted at the end of the last chunk across all - * scatter ABDs tracked by scatter_cnt. - */ - { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, - /* - * The number of linear ABDs which are currently allocated, excluding - * ABDs which don't own their data (for instance the ones which were - * allocated through abd_get_offset() and abd_get_from_buf()). If an - * ABD takes ownership of its buf then it will become tracked. - */ - { "linear_cnt", KSTAT_DATA_UINT64 }, - /* Amount of data stored in all linear ABDs tracked by linear_cnt */ - { "linear_data_size", KSTAT_DATA_UINT64 }, -}; - -#define ABDSTAT(stat) (abd_stats.stat.value.ui64) -#define ABDSTAT_INCR(stat, val) \ - atomic_add_64(&abd_stats.stat.value.ui64, (val)) -#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) -#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) - -/* - * It is possible to make all future ABDs be linear by setting this to B_FALSE. - * Otherwise, ABDs are allocated scattered by default unless the caller uses - * abd_alloc_linear(). - */ -boolean_t zfs_abd_scatter_enabled = B_TRUE; - -/* - * The size of the chunks ABD allocates. Because the sizes allocated from the - * kmem_cache can't change, this tunable can only be modified at boot. Changing - * it at runtime would cause ABD iteration to work incorrectly for ABDs which - * were allocated with the old size, so a safeguard has been put in place which - * will cause the machine to panic if you change it and try to access the data - * within a scattered ABD. 
- */ -size_t zfs_abd_chunk_size = 4096; - -#if defined(_KERNEL) -SYSCTL_DECL(_vfs_zfs); - -SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN, - &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers"); -SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN, - &zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates"); -#endif - -kmem_cache_t *abd_chunk_cache; -static kstat_t *abd_ksp; - -extern inline boolean_t abd_is_linear(abd_t *abd); -extern inline void abd_copy(abd_t *dabd, abd_t *sabd, size_t size); -extern inline void abd_copy_from_buf(abd_t *abd, const void *buf, size_t size); -extern inline void abd_copy_to_buf(void* buf, abd_t *abd, size_t size); -extern inline int abd_cmp_buf(abd_t *abd, const void *buf, size_t size); -extern inline void abd_zero(abd_t *abd, size_t size); +/* see block comment above for description */ +int zfs_abd_scatter_enabled = B_TRUE; -static void * -abd_alloc_chunk() +boolean_t +abd_is_linear(abd_t *abd) { - void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); - ASSERT3P(c, !=, NULL); - return (c); + return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0 ? B_TRUE : B_FALSE); } -static void -abd_free_chunk(void *c) +boolean_t +abd_is_linear_page(abd_t *abd) { - kmem_cache_free(abd_chunk_cache, c); + return ((abd->abd_flags & ABD_FLAG_LINEAR_PAGE) != 0 ? + B_TRUE : B_FALSE); } void -abd_init(void) -{ - abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0, - NULL, NULL, NULL, NULL, 0, KMC_NOTOUCH | KMC_NODEBUG); - - abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, - sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); - if (abd_ksp != NULL) { - abd_ksp->ks_data = &abd_stats; - kstat_install(abd_ksp); - } -} - -void -abd_fini(void) -{ - if (abd_ksp != NULL) { - kstat_delete(abd_ksp); - abd_ksp = NULL; - } - - kmem_cache_destroy(abd_chunk_cache); - abd_chunk_cache = NULL; -} - -static inline size_t -abd_chunkcnt_for_bytes(size_t size) -{ - return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size); -} - -static inline size_t -abd_scatter_chunkcnt(abd_t *abd) -{ - ASSERT(!abd_is_linear(abd)); - return (abd_chunkcnt_for_bytes( - abd->abd_u.abd_scatter.abd_offset + abd->abd_size)); -} - -static inline void abd_verify(abd_t *abd) { ASSERT3U(abd->abd_size, >, 0); ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | - ABD_FLAG_OWNER | ABD_FLAG_META)); + ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | + ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE)); IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); if (abd_is_linear(abd)) { - ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL); + ASSERT3P(ABD_LINEAR_BUF(abd), !=, NULL); } else { - ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <, - zfs_abd_chunk_size); - size_t n = abd_scatter_chunkcnt(abd); - for (int i = 0; i < n; i++) { - ASSERT3P( - abd->abd_u.abd_scatter.abd_chunks[i], !=, NULL); - } + abd_verify_scatter(abd); } } -static inline abd_t * -abd_alloc_struct(size_t chunkcnt) -{ - size_t size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); - abd_t *abd = kmem_alloc(size, KM_PUSHPAGE); - ASSERT3P(abd, !=, NULL); - ABDSTAT_INCR(abdstat_struct_size, size); - - return (abd); -} - -static inline void -abd_free_struct(abd_t *abd) +uint_t +abd_get_size(abd_t *abd) { - size_t chunkcnt = abd_is_linear(abd) ? 
0 : abd_scatter_chunkcnt(abd); - int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); - kmem_free(abd, size); - ABDSTAT_INCR(abdstat_struct_size, -size); + abd_verify(abd); + return (abd->abd_size); } /* @@ -272,15 +145,16 @@ abd_free_struct(abd_t *abd) abd_t * abd_alloc(size_t size, boolean_t is_metadata) { - if (!zfs_abd_scatter_enabled || size <= zfs_abd_chunk_size) + if (!zfs_abd_scatter_enabled || abd_size_alloc_linear(size)) return (abd_alloc_linear(size, is_metadata)); VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - size_t n = abd_chunkcnt_for_bytes(size); - abd_t *abd = abd_alloc_struct(n); - + abd_t *abd = abd_alloc_struct(size); abd->abd_flags = ABD_FLAG_OWNER; + abd->abd_u.abd_scatter.abd_offset = 0; + abd_alloc_chunks(abd, size); + if (is_metadata) { abd->abd_flags |= ABD_FLAG_META; } @@ -288,19 +162,7 @@ abd_alloc(size_t size, boolean_t is_metadata) abd->abd_parent = NULL; zfs_refcount_create(&abd->abd_children); - abd->abd_u.abd_scatter.abd_offset = 0; - abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size; - - for (int i = 0; i < n; i++) { - void *c = abd_alloc_chunk(); - ASSERT3P(c, !=, NULL); - abd->abd_u.abd_scatter.abd_chunks[i] = c; - } - - ABDSTAT_BUMP(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, size); - ABDSTAT_INCR(abdstat_scatter_chunk_waste, - n * zfs_abd_chunk_size - size); + abd_update_scatter_stats(abd, ABDSTAT_INCR); return (abd); } @@ -308,17 +170,32 @@ abd_alloc(size_t size, boolean_t is_metadata) static void abd_free_scatter(abd_t *abd) { - size_t n = abd_scatter_chunkcnt(abd); - for (int i = 0; i < n; i++) { - abd_free_chunk(abd->abd_u.abd_scatter.abd_chunks[i]); - } + abd_free_chunks(abd); zfs_refcount_destroy(&abd->abd_children); - ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); - ABDSTAT_INCR(abdstat_scatter_chunk_waste, - abd->abd_size - n * zfs_abd_chunk_size); + abd_update_scatter_stats(abd, ABDSTAT_DECR); + abd_free_struct(abd); +} +/* + * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not + * free the underlying scatterlist or buffer. 
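abd_put(), described in the comment above, tears down only the wrapper, which is what makes cheap zero-copy views possible. A short usage sketch; `parent' is a placeholder for any live 8K ABD:

/* View the second half of `parent' without copying. */
abd_t *view = abd_get_offset_size(parent, 4096, 4096);
/* ... read or write through `view'; `parent' must stay alive ... */
abd_put(view);	/* frees the wrapper only; parent's data is untouched */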
+ */ +void +abd_put(abd_t *abd) +{ + if (abd == NULL) + return; + + abd_verify(abd); + ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); + + if (abd->abd_parent != NULL) { + (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children, + abd->abd_size, abd); + } + + zfs_refcount_destroy(&abd->abd_children); abd_free_struct(abd); } @@ -343,13 +220,12 @@ abd_alloc_linear(size_t size, boolean_t is_metadata) zfs_refcount_create(&abd->abd_children); if (is_metadata) { - abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size); + ABD_LINEAR_BUF(abd) = zio_buf_alloc(size); } else { - abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size); + ABD_LINEAR_BUF(abd) = zio_data_buf_alloc(size); } - ABDSTAT_BUMP(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, size); + abd_update_linear_stats(abd, ABDSTAT_INCR); return (abd); } @@ -357,15 +233,18 @@ abd_alloc_linear(size_t size, boolean_t is_metadata) static void abd_free_linear(abd_t *abd) { + if (abd_is_linear_page(abd)) { + abd_free_linear_page(abd); + return; + } if (abd->abd_flags & ABD_FLAG_META) { - zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); + zio_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size); } else { - zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); + zio_data_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size); } zfs_refcount_destroy(&abd->abd_children); - ABDSTAT_BUMPDOWN(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); + abd_update_linear_stats(abd, ABDSTAT_DECR); abd_free_struct(abd); } @@ -397,39 +276,23 @@ abd_t * abd_alloc_sametype(abd_t *sabd, size_t size) { boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0; - if (abd_is_linear(sabd)) { + if (abd_is_linear(sabd) && + !abd_is_linear_page(sabd)) { return (abd_alloc_linear(size, is_metadata)); } else { return (abd_alloc(size, is_metadata)); } } -/* - * If we're going to use this ABD for doing I/O using the block layer, the - * consumer of the ABD data doesn't care if it's scattered or not, and we don't - * plan to store this ABD in memory for a long period of time, we should - * allocate the ABD type that requires the least data copying to do the I/O. - * - * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os - * using a scatter/gather list we should switch to that and replace this call - * with vanilla abd_alloc(). - */ -abd_t * -abd_alloc_for_io(size_t size, boolean_t is_metadata) -{ - return (abd_alloc_linear(size, is_metadata)); -} - /* * Allocate a new ABD to point to offset off of sabd. It shares the underlying * buffer data with sabd. Use abd_put() to free. sabd must not be freed while * any derived ABDs exist. */ -/* ARGSUSED */ -static inline abd_t * +static abd_t * abd_get_offset_impl(abd_t *sabd, size_t off, size_t size) { - abd_t *abd; + abd_t *abd = NULL; abd_verify(sabd); ASSERT3U(off, <=, sabd->abd_size); @@ -444,60 +307,33 @@ abd_get_offset_impl(abd_t *sabd, size_t off, size_t size) */ abd->abd_flags = ABD_FLAG_LINEAR; - abd->abd_u.abd_linear.abd_buf = - (char *)sabd->abd_u.abd_linear.abd_buf + off; + ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off; } else { - size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off; - size_t chunkcnt = abd_scatter_chunkcnt(sabd) - - (new_offset / zfs_abd_chunk_size); - - abd = abd_alloc_struct(chunkcnt); - - /* - * Even if this buf is filesystem metadata, we only track that - * if we own the underlying data buffer, which is not true in - * this case. Therefore, we don't ever use ABD_FLAG_META here. 
- */ - abd->abd_flags = 0; - - abd->abd_u.abd_scatter.abd_offset = - new_offset % zfs_abd_chunk_size; - abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size; - - /* Copy the scatterlist starting at the correct offset */ - (void) memcpy(&abd->abd_u.abd_scatter.abd_chunks, - &sabd->abd_u.abd_scatter.abd_chunks[new_offset / - zfs_abd_chunk_size], - chunkcnt * sizeof (void *)); + abd = abd_get_offset_scatter(sabd, off); } - if (size == 0) - abd->abd_size = sabd->abd_size - off; - else - abd->abd_size = size; + abd->abd_size = size; abd->abd_parent = sabd; zfs_refcount_create(&abd->abd_children); (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd); - return (abd); } abd_t * abd_get_offset(abd_t *sabd, size_t off) { - - return (abd_get_offset_impl(sabd, off, 0)); + size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0; + VERIFY3U(size, >, 0); + return (abd_get_offset_impl(sabd, off, size)); } abd_t * abd_get_offset_size(abd_t *sabd, size_t off, size_t size) { ASSERT3U(off + size, <=, sabd->abd_size); - return (abd_get_offset_impl(sabd, off, size)); } - /* * Allocate a linear ABD structure for buf. You must free this with abd_put() * since the resulting ABD doesn't own its own buffer. @@ -519,32 +355,11 @@ abd_get_from_buf(void *buf, size_t size) abd->abd_parent = NULL; zfs_refcount_create(&abd->abd_children); - abd->abd_u.abd_linear.abd_buf = buf; + ABD_LINEAR_BUF(abd) = buf; return (abd); } -/* - * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not - * free the underlying scatterlist or buffer. - */ -void -abd_put(abd_t *abd) -{ - if (abd == NULL) - return; - abd_verify(abd); - ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); - - if (abd->abd_parent != NULL) { - (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children, - abd->abd_size, abd); - } - - zfs_refcount_destroy(&abd->abd_children); - abd_free_struct(abd); -} - /* * Get the raw buffer associated with a linear ABD. */ @@ -553,7 +368,7 @@ abd_to_buf(abd_t *abd) { ASSERT(abd_is_linear(abd)); abd_verify(abd); - return (abd->abd_u.abd_linear.abd_buf); + return (ABD_LINEAR_BUF(abd)); } /* @@ -574,7 +389,6 @@ abd_borrow_buf(abd_t *abd, size_t n) buf = zio_buf_alloc(n); } (void) zfs_refcount_add_many(&abd->abd_children, n, buf); - return (buf); } @@ -617,148 +431,50 @@ abd_return_buf_copy(abd_t *abd, void *buf, size_t n) abd_return_buf(abd, buf, n); } -/* - * Give this ABD ownership of the buffer that it's storing. Can only be used on - * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated - * with abd_alloc_linear() which subsequently released ownership of their buf - * with abd_release_ownership_of_buf(). - */ -void -abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata) -{ - ASSERT(abd_is_linear(abd)); - ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); - abd_verify(abd); - - abd->abd_flags |= ABD_FLAG_OWNER; - if (is_metadata) { - abd->abd_flags |= ABD_FLAG_META; - } - - ABDSTAT_BUMP(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); -} - void abd_release_ownership_of_buf(abd_t *abd) { ASSERT(abd_is_linear(abd)); ASSERT(abd->abd_flags & ABD_FLAG_OWNER); + + /* + * abd_free() needs to handle LINEAR_PAGE ABD's specially. + * Since that flag does not survive the + * abd_release_ownership_of_buf() -> abd_get_from_buf() -> + * abd_take_ownership_of_buf() sequence, we don't allow releasing + * these "linear but not zio_[data_]buf_alloc()'ed" ABD's. 
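The ownership rules spelled out in these two comments compose into a round trip. A hedged sketch of the legal sequence, assuming `size' is a valid zio buffer size (the LINEAR_PAGE case called out above is exactly what this sequence must avoid):

/* Wrap an existing zio buffer without copying it. */
void *buf = zio_data_buf_alloc(size);
abd_t *abd = abd_get_from_buf(buf, size);

abd_take_ownership_of_buf(abd, B_FALSE);	/* now in linear stats */
/* ... use `abd' as if it had been abd_alloc_linear()'ed ... */
abd_release_ownership_of_buf(abd);	/* required before abd_put() */

abd_put(abd);			/* asserts the ABD no longer owns buf */
zio_data_buf_free(buf, size);	/* the caller owns the buffer again */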
+ */ + ASSERT(!abd_is_linear_page(abd)); + abd_verify(abd); abd->abd_flags &= ~ABD_FLAG_OWNER; /* Disable this flag since we no longer own the data buffer */ abd->abd_flags &= ~ABD_FLAG_META; - ABDSTAT_BUMPDOWN(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); + abd_update_linear_stats(abd, ABDSTAT_DECR); } -struct abd_iter { - abd_t *iter_abd; /* ABD being iterated through */ - size_t iter_pos; /* position (relative to abd_offset) */ - void *iter_mapaddr; /* addr corresponding to iter_pos */ - size_t iter_mapsize; /* length of data valid at mapaddr */ -}; - -static inline size_t -abd_iter_scatter_chunk_offset(struct abd_iter *aiter) -{ - ASSERT(!abd_is_linear(aiter->iter_abd)); - return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset + - aiter->iter_pos) % zfs_abd_chunk_size); -} - -static inline size_t -abd_iter_scatter_chunk_index(struct abd_iter *aiter) -{ - ASSERT(!abd_is_linear(aiter->iter_abd)); - return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset + - aiter->iter_pos) / zfs_abd_chunk_size); -} /* - * Initialize the abd_iter. + * Give this ABD ownership of the buffer that it's storing. Can only be used on + * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated + * with abd_alloc_linear() which subsequently released ownership of their buf + * with abd_release_ownership_of_buf(). */ -static void -abd_iter_init(struct abd_iter *aiter, abd_t *abd) +void +abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata) { + ASSERT(abd_is_linear(abd)); + ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); abd_verify(abd); - aiter->iter_abd = abd; - aiter->iter_pos = 0; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; -} - -/* - * Advance the iterator by a certain amount. Cannot be called when a chunk is - * in use. This can be safely called when the aiter has already exhausted, in - * which case this does nothing. - */ -static void -abd_iter_advance(struct abd_iter *aiter, size_t amount) -{ - ASSERT3P(aiter->iter_mapaddr, ==, NULL); - ASSERT0(aiter->iter_mapsize); - - /* There's nothing left to advance to, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - aiter->iter_pos += amount; -} - -/* - * Map the current chunk into aiter. This can be safely called when the aiter - * has already exhausted, in which case this does nothing. - */ -static void -abd_iter_map(struct abd_iter *aiter) -{ - void *paddr; - size_t offset = 0; - - ASSERT3P(aiter->iter_mapaddr, ==, NULL); - ASSERT0(aiter->iter_mapsize); - - /* Panic if someone has changed zfs_abd_chunk_size */ - IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size == - aiter->iter_abd->abd_u.abd_scatter.abd_chunk_size); - - /* There's nothing left to iterate over, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - if (abd_is_linear(aiter->iter_abd)) { - offset = aiter->iter_pos; - aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; - paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf; - } else { - size_t index = abd_iter_scatter_chunk_index(aiter); - offset = abd_iter_scatter_chunk_offset(aiter); - aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset, - aiter->iter_abd->abd_size - aiter->iter_pos); - paddr = aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index]; + abd->abd_flags |= ABD_FLAG_OWNER; + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; } - aiter->iter_mapaddr = (char *)paddr + offset; -} - -/* - * Unmap the current chunk from aiter. 
This can be safely called when the aiter - * has already exhausted, in which case this does nothing. - */ -static void -abd_iter_unmap(struct abd_iter *aiter) -{ - /* There's nothing left to unmap, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - ASSERT3P(aiter->iter_mapaddr, !=, NULL); - ASSERT3U(aiter->iter_mapsize, >, 0); - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; + abd_update_linear_stats(abd, ABDSTAT_INCR); } int @@ -987,6 +703,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, struct abd_iter caiters[3]; struct abd_iter daiter = {0}; void *caddrs[3]; + unsigned long flags __maybe_unused = 0; ASSERT3U(parity, <=, 3); @@ -998,7 +715,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, ASSERT3S(dsize, >=, 0); - critical_enter(); + abd_enter_critical(flags); while (csize > 0) { len = csize; @@ -1010,11 +727,14 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, caddrs[i] = caiters[i].iter_mapaddr; } + switch (parity) { case 3: len = MIN(caiters[2].iter_mapsize, len); + /* falls through */ case 2: len = MIN(caiters[1].iter_mapsize, len); + /* falls through */ case 1: len = MIN(caiters[0].iter_mapsize, len); } @@ -1055,7 +775,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, ASSERT3S(dsize, >=, 0); ASSERT3S(csize, >=, 0); } - critical_exit(); + abd_exit_critical(flags); } /* @@ -1080,6 +800,7 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, struct abd_iter citers[3]; struct abd_iter xiters[3]; void *caddrs[3], *xaddrs[3]; + unsigned long flags __maybe_unused = 0; ASSERT3U(parity, <=, 3); @@ -1088,7 +809,7 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, abd_iter_init(&xiters[i], tabds[i]); } - critical_enter(); + abd_enter_critical(flags); while (tsize > 0) { for (i = 0; i < parity; i++) { @@ -1103,9 +824,11 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, case 3: len = MIN(xiters[2].iter_mapsize, len); len = MIN(citers[2].iter_mapsize, len); + /* falls through */ case 2: len = MIN(xiters[1].iter_mapsize, len); len = MIN(citers[1].iter_mapsize, len); + /* falls through */ case 1: len = MIN(xiters[0].iter_mapsize, len); len = MIN(citers[0].iter_mapsize, len); @@ -1130,5 +853,5 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, tsize -= len; ASSERT3S(tsize, >=, 0); } - critical_exit(); + abd_exit_critical(flags); } diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 5b34d4abd4ac..a6b739ec377a 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -530,7 +530,9 @@ arc_stats_t arc_stats = { { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, { "l2_log_blk_writes", KSTAT_DATA_UINT64 }, - { "l2_log_blk_avg_size", KSTAT_DATA_UINT64 }, + { "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 }, + { "l2_log_blk_asize", KSTAT_DATA_UINT64 }, + { "l2_log_blk_count", KSTAT_DATA_UINT64 }, { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 }, { "l2_rebuild_success", KSTAT_DATA_UINT64 }, { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 }, @@ -539,9 +541,9 @@ arc_stats_t arc_stats = { { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 }, { "l2_rebuild_size", KSTAT_DATA_UINT64 }, + { "l2_rebuild_asize", KSTAT_DATA_UINT64 }, { "l2_rebuild_bufs", KSTAT_DATA_UINT64 }, { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 }, - { "l2_rebuild_psize", KSTAT_DATA_UINT64 }, { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "memory_direct_count", KSTAT_DATA_UINT64 }, @@ -895,7 +897,7 @@ static void l2arc_log_blk_fetch_abort(zio_t *zio); /* L2ARC 
persistence block restoration routines. */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, - const l2arc_log_blk_phys_t *lb, uint64_t lb_psize, uint64_t lb_daddr); + const l2arc_log_blk_phys_t *lb, uint64_t lb_asize, uint64_t lb_daddr); static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev); @@ -7864,6 +7866,7 @@ l2arc_write_done(zio_t *zio) l2arc_lb_abd_buf_t *abd_buf; l2arc_lb_ptr_buf_t *lb_ptr_buf; l2arc_dev_t *dev; + l2arc_dev_hdr_phys_t *l2dhdr; list_t *buflist; arc_buf_hdr_t *head, *hdr, *hdr_prev; kmutex_t *hash_lock; @@ -7872,6 +7875,7 @@ l2arc_write_done(zio_t *zio) cb = zio->io_private; ASSERT3P(cb, !=, NULL); dev = cb->l2wcb_dev; + l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev, !=, NULL); head = cb->l2wcb_head; ASSERT3P(head, !=, NULL); @@ -7975,8 +7979,18 @@ l2arc_write_done(zio_t *zio) zio_buf_free(abd_buf, sizeof (*abd_buf)); if (zio->io_error != 0) { lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list); - bytes_dropped += + /* + * L2BLK_GET_PSIZE returns aligned size for log + * blocks. + */ + uint64_t asize = L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop); + bytes_dropped += asize; + ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); + ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); + zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, + lb_ptr_buf); + zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); @@ -7984,6 +7998,17 @@ l2arc_write_done(zio_t *zio) } list_destroy(&cb->l2wcb_abd_list); + if (zio->io_error != 0) { + /* restore the lbps array in the header to its previous state */ + lb_ptr_buf = list_head(&dev->l2ad_lbptr_list); + for (int i = 0; i < 2; i++) { + bcopy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[i], + sizeof (l2arc_log_blkptr_t)); + lb_ptr_buf = list_next(&dev->l2ad_lbptr_list, + lb_ptr_buf); + } + } + atomic_inc_64(&l2arc_writes_done); list_remove(buflist, head); ASSERT(!HDR_HAS_L1HDR(head)); @@ -8277,21 +8302,21 @@ l2arc_sublist_lock(int list_num) /* * Calculates the maximum overhead of L2ARC metadata log blocks for a given - * L2ARC write size. l2arc_evict and l2arc_write_buffers need to include this + * L2ARC write size. l2arc_evict and l2arc_write_size need to include this * overhead in processing to make sure there is enough headroom available * when writing buffers. */ static inline uint64_t l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev) { - if (dev->l2ad_dev_hdr->dh_log_blk_ent == 0) { + if (dev->l2ad_log_entries == 0) { return (0); } else { uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT; uint64_t log_blocks = (log_entries + - dev->l2ad_dev_hdr->dh_log_blk_ent - 1) / - dev->l2ad_dev_hdr->dh_log_blk_ent; + dev->l2ad_log_entries - 1) / + dev->l2ad_log_entries; return (vdev_psize_to_asize(dev->l2ad_vdev, sizeof (l2arc_log_blk_phys_t)) * log_blocks); @@ -8373,17 +8398,24 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf); + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + uint64_t asize = L2BLK_GET_PSIZE( + (lb_ptr_buf->lb_ptr)->lbp_prop); + /* * We don't worry about log blocks left behind (ie - * lbp_daddr + psize < l2ad_hand) because l2arc_write_buffers() + * lbp_payload_start < l2ad_hand) because l2arc_write_buffers() * will never write more than l2arc_evict() evicts. 
*/ if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) { break; } else { - vdev_space_update(dev->l2ad_vdev, - -L2BLK_GET_PSIZE( - (lb_ptr_buf->lb_ptr)->lbp_prop), 0, 0); + vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); + ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); + ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); + zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, + lb_ptr_buf); + zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); @@ -8475,6 +8507,10 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) dev->l2ad_first = B_FALSE; goto top; } + + ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); + if (!dev->l2ad_first) + ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict); } /* @@ -8777,6 +8813,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; + /* + * Create a list to save allocated abd buffers + * for l2arc_log_blk_commit(). + */ list_create(&cb->l2wcb_abd_list, sizeof (l2arc_lb_abd_buf_t), offsetof(l2arc_lb_abd_buf_t, node)); @@ -8846,6 +8886,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) return (0); } + if (!dev->l2ad_first) + ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); + ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); @@ -9036,6 +9079,8 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); zfs_refcount_create(&adddev->l2ad_alloc); + zfs_refcount_create(&adddev->l2ad_lb_asize); + zfs_refcount_create(&adddev->l2ad_lb_count); /* * Add device to global list @@ -9059,7 +9104,7 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) uint64_t l2dhdr_asize; spa_t *spa; int err; - boolean_t rebuild = B_TRUE; + boolean_t l2dhdr_valid = B_TRUE; dev = l2arc_vdev_get(vd); ASSERT3P(dev, !=, NULL); @@ -9089,9 +9134,9 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) * Read the device header, if an error is returned do not rebuild L2ARC. */ if ((err = l2arc_dev_hdr_read(dev)) != 0) - rebuild = B_FALSE; + l2dhdr_valid = B_FALSE; - if (rebuild && l2dhdr->dh_log_blk_ent > 0) { + if (l2dhdr_valid && dev->l2ad_log_entries > 0) { /* * If we are onlining a cache device (vdev_reopen) that was * still present (l2arc_vdev_present()) and rebuild is enabled, @@ -9117,12 +9162,10 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) * async task which will call l2arc_spa_rebuild_start. */ dev->l2ad_rebuild = B_TRUE; - } else if (!rebuild && spa_writeable(spa)) { + } else if (spa_writeable(spa)) { /* - * The boolean rebuild is false if reading the device header - * returned an error. In this case create a new header. We - * zero out the memory holding the header to reset - * dh_start_lbps. + * In this case create a new header. We zero out the memory + * holding the header to reset dh_start_lbps. 
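+ * With dh_start_lbps zeroed, a future import will find no log block + * pointers and will not try to rebuild from stale ones.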
*/ bzero(l2dhdr, l2dhdr_asize); l2arc_dev_hdr_update(dev); @@ -9172,6 +9215,8 @@ l2arc_remove_vdev(vdev_t *vd) list_destroy(&remdev->l2ad_lbptr_list); mutex_destroy(&remdev->l2ad_mtx); zfs_refcount_destroy(&remdev->l2ad_alloc); + zfs_refcount_destroy(&remdev->l2ad_lb_asize); + zfs_refcount_destroy(&remdev->l2ad_lb_count); kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); vmem_free(remdev, sizeof (l2arc_dev_t)); } @@ -9309,7 +9354,7 @@ l2arc_rebuild(l2arc_dev_t *dev) { vdev_t *vd = dev->l2ad_vdev; spa_t *spa = vd->vdev_spa; - int i = 0, err = 0; + int err = 0; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; l2arc_log_blk_phys_t *this_lb, *next_lb; zio_t *this_io = NULL, *next_io = NULL; @@ -9332,6 +9377,7 @@ l2arc_rebuild(l2arc_dev_t *dev) /* * Retrieve the persistent L2ARC device state. + * L2BLK_GET_PSIZE returns aligned size for log blocks. */ dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start); dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr + @@ -9381,11 +9427,10 @@ l2arc_rebuild(l2arc_dev_t *dev) /* * Now that we know that the next_lb checks out alright, we * can start reconstruction from this log block. + * L2BLK_GET_PSIZE returns aligned size for log blocks. */ - l2arc_log_blk_restore(dev, this_lb, - L2BLK_GET_PSIZE((&lbps[0])->lbp_prop), - lbps[0].lbp_daddr); - i++; + uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); + l2arc_log_blk_restore(dev, this_lb, asize, lbps[0].lbp_daddr); /* * log block restored, include its pointer in the list of @@ -9398,9 +9443,12 @@ l2arc_rebuild(l2arc_dev_t *dev) sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf); + ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); + ARCSTAT_BUMP(arcstat_l2_log_blk_count); + zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); + zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); - vdev_space_update(vd, - L2BLK_GET_PSIZE((&lbps[0])->lbp_prop), 0, 0); + vdev_space_update(vd, asize, 0, 0); /* * Protection against loops of log blocks: @@ -9417,13 +9465,16 @@ l2arc_rebuild(l2arc_dev_t *dev) * l2arc_log_blkptr_valid() but the log block should not be * restored as it is overwritten by the payload of log block * (0). Only log blocks (0)-(3) should be restored. We check - * whether l2ad_evict lies in between the next log block - * offset (lbps[1].lbp_daddr) and the present log block offset - * (lbps[0].lbp_daddr). If true and this isn't the first pass, - * we are looping from the beginning and we should stop. + * whether l2ad_evict lies in between the payload starting + * offset of the next log block (lbps[1].lbp_payload_start) + * and the payload starting offset of the present log block + * (lbps[0].lbp_payload_start). If true and this isn't the + * first pass, we are looping from the beginning and we should + * stop. 
*/ - if (l2arc_range_check_overlap(lbps[1].lbp_daddr, - lbps[0].lbp_daddr, dev->l2ad_evict) && !dev->l2ad_first) + if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, + lbps[0].lbp_payload_start, dev->l2ad_evict) && + !dev->l2ad_first) goto out; for (;;) { @@ -9470,14 +9521,27 @@ l2arc_rebuild(l2arc_dev_t *dev) vmem_free(next_lb, sizeof (*next_lb)); if (!l2arc_rebuild_enabled) { - zfs_dbgmsg("L2ARC rebuild disabled"); - } else if (err == 0 && i > 0) { + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "disabled"); + } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_success); - zfs_dbgmsg("L2ARC successfully rebuilt, " - "restored %d blocks", i); + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "successful, restored %llu blocks", + (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); + } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) { + /* + * No error but also nothing restored, meaning the lbps array + * in the device header points to invalid/non-present log + * blocks. Reset the header. + */ + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "no valid log blocks"); + bzero(l2dhdr, dev->l2ad_dev_hdr_asize); + l2arc_dev_hdr_update(dev); } else if (err != 0) { - zfs_dbgmsg("L2ARC rebuild aborted, " - "restored %d blocks", i); + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "aborted, restored %llu blocks", + (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } if (lock_held) @@ -9527,7 +9591,7 @@ l2arc_dev_hdr_read(l2arc_dev_t *dev) l2dhdr->dh_spa_guid != guid || l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid || l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION || - l2dhdr->dh_log_blk_ent != dev->l2ad_log_entries || + l2dhdr->dh_log_entries != dev->l2ad_log_entries || l2dhdr->dh_end != dev->l2ad_end || !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end, l2dhdr->dh_evict)) { @@ -9578,7 +9642,7 @@ l2arc_log_blk_read(l2arc_dev_t *dev, int err = 0; zio_cksum_t cksum; abd_t *abd = NULL; - uint64_t psize; + uint64_t asize; ASSERT(this_lbp != NULL && next_lbp != NULL); ASSERT(this_lb != NULL && next_lb != NULL); @@ -9616,9 +9680,12 @@ l2arc_log_blk_read(l2arc_dev_t *dev, goto cleanup; } - /* Make sure the buffer checks out */ - psize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop); - fletcher_4_native(this_lb, psize, NULL, &cksum); + /* + * Make sure the buffer checks out. + * L2BLK_GET_PSIZE returns aligned size for log blocks. 
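+ * Log blocks are allocated with vdev_psize_to_asize() alignment, so + * the size stored in lbp_prop is already the allocated size and the + * checksum below covers exactly that many bytes.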
+ */ + asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop); + fletcher_4_native(this_lb, asize, NULL, &cksum); if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors); zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, " @@ -9634,11 +9701,11 @@ l2arc_log_blk_read(l2arc_dev_t *dev, case ZIO_COMPRESS_OFF: break; case ZIO_COMPRESS_LZ4: - abd = abd_alloc_for_io(psize, B_TRUE); - abd_copy_from_buf_off(abd, this_lb, 0, psize); + abd = abd_alloc_for_io(asize, B_TRUE); + abd_copy_from_buf_off(abd, this_lb, 0, asize); if ((err = zio_decompress_data( L2BLK_GET_COMPRESS((this_lbp)->lbp_prop), - abd, this_lb, psize, sizeof (*this_lb))) != 0) { + abd, this_lb, asize, sizeof (*this_lb))) != 0) { err = SET_ERROR(EINVAL); goto cleanup; } @@ -9672,10 +9739,10 @@ l2arc_log_blk_read(l2arc_dev_t *dev, */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, - uint64_t lb_psize, uint64_t lb_daddr) + uint64_t lb_asize, uint64_t lb_daddr) { - uint64_t size = 0, psize = 0; - uint64_t log_entries = dev->l2ad_dev_hdr->dh_log_blk_ent; + uint64_t size = 0, asize = 0; + uint64_t log_entries = dev->l2ad_log_entries; for (int i = log_entries - 1; i >= 0; i--) { /* @@ -9692,27 +9759,28 @@ l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, * ^ ^ * | | * | | - * l2arc_fill_thread l2arc_rebuild - * places new bufs here restores bufs here + * l2arc_feed_thread l2arc_rebuild + * will place new bufs here restores bufs here * - * This also works when the restored bufs get evicted at any - * point during the rebuild. + * During l2arc_rebuild() the device is not used by + * l2arc_feed_thread() as dev->l2ad_rebuild is set to true. */ size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop); - psize += L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop); + asize += vdev_psize_to_asize(dev->l2ad_vdev, + L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop)); l2arc_hdr_restore(&lb->lb_entries[i], dev); } /* * Record rebuild stats: * size Logical size of restored buffers in the L2ARC - * psize Physical size of restored buffers in the L2ARC + * asize Aligned size of restored buffers in the L2ARC */ ARCSTAT_INCR(arcstat_l2_rebuild_size, size); - ARCSTAT_INCR(arcstat_l2_rebuild_psize, psize); + ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize); ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries); - ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, lb_psize); - ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, psize / lb_psize); + ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize); + ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize); ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks); } @@ -9800,18 +9868,20 @@ static zio_t * l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp, l2arc_log_blk_phys_t *lb) { - uint32_t psize; + uint32_t asize; zio_t *pio; l2arc_read_callback_t *cb; - psize = L2BLK_GET_PSIZE((lbp)->lbp_prop); - ASSERT(psize <= sizeof (l2arc_log_blk_phys_t)); + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); + ASSERT(asize <= sizeof (l2arc_log_blk_phys_t)); + cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); - cb->l2rcb_abd = abd_get_from_buf(lb, psize); + cb->l2rcb_abd = abd_get_from_buf(lb, asize); pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); - (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, psize, + (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize, 
cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); @@ -9841,14 +9911,18 @@ l2arc_dev_hdr_update(l2arc_dev_t *dev) abd_t *abd; int err; + VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER)); + l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC; l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION; l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa); l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid; - l2dhdr->dh_log_blk_ent = dev->l2ad_log_entries; + l2dhdr->dh_log_entries = dev->l2ad_log_entries; l2dhdr->dh_evict = dev->l2ad_evict; l2dhdr->dh_start = dev->l2ad_start; l2dhdr->dh_end = dev->l2ad_end; + l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize); + l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count); l2dhdr->dh_flags = 0; if (dev->l2ad_first) l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST; @@ -9884,7 +9958,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) uint8_t *tmpbuf; l2arc_lb_ptr_buf_t *lb_ptr_buf; - VERIFY3S(dev->l2ad_log_ent_idx, ==, l2dhdr->dh_log_blk_ent); + VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries); tmpbuf = zio_buf_alloc(sizeof (*lb)); abd_buf = zio_buf_alloc(sizeof (*abd_buf)); @@ -9896,8 +9970,14 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1]; lb->lb_magic = L2ARC_LOG_BLK_MAGIC; - /* try to compress the buffer */ + /* + * l2arc_log_blk_commit() may be called multiple times during a single + * l2arc_write_buffers() call. Save the allocated abd buffers in a list + * so we can free them in l2arc_write_done() later on. + */ list_insert_tail(&cb->l2wcb_abd_list, abd_buf); + + /* try to compress the buffer */ psize = zio_compress_data(ZIO_COMPRESS_LZ4, abd_buf->abd, tmpbuf, sizeof (*lb)); @@ -9962,13 +10042,17 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf); + ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); + ARCSTAT_BUMP(arcstat_l2_log_blk_count); + zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); + zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); /* bump the kstats */ ARCSTAT_INCR(arcstat_l2_write_bytes, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_writes); - ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, asize); + ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize); ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, dev->l2ad_log_blk_payload_asize / asize); @@ -9985,8 +10069,9 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp) { - uint64_t psize = L2BLK_GET_PSIZE((lbp)->lbp_prop); - uint64_t end = lbp->lbp_daddr + psize - 1; + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); + uint64_t end = lbp->lbp_daddr + asize - 1; uint64_t start = lbp->lbp_payload_start; boolean_t evicted = B_FALSE; @@ -10017,7 +10102,7 @@ l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp) l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end); return (start >= dev->l2ad_start && end <= dev->l2ad_end && - psize > 0 && psize <= sizeof (l2arc_log_blk_phys_t) && + asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) && (!evicted 
|| dev->l2ad_first)); } @@ -10032,14 +10117,13 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; l2arc_log_ent_phys_t *le; - l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; - if (l2dhdr->dh_log_blk_ent == 0) + if (dev->l2ad_log_entries == 0) return (B_FALSE); int index = dev->l2ad_log_ent_idx++; - ASSERT3S(index, <, l2dhdr->dh_log_blk_ent); + ASSERT3S(index, <, dev->l2ad_log_entries); ASSERT(HDR_HAS_L2HDR(hdr)); le = &lb->lb_entries[index]; @@ -10059,7 +10143,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, HDR_GET_PSIZE(hdr)); - return (dev->l2ad_log_ent_idx == l2dhdr->dh_log_blk_ent); + return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries); } /* diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index aaeaa4c34f7a..29fbe854d793 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -1217,10 +1217,8 @@ receive_read(dmu_recv_cookie_t *drc, int len, void *buf) while (done < len) { ssize_t resid; - zfs_file_t *fp; - - fp = drc->drc_fp; - drc->drc_err = zfs_file_read(fp, (char *)buf + done, + zfs_file_t *fp = drc->drc_fp; + int err = zfs_file_read(fp, (char *)buf + done, len - done, &resid); if (resid == len - done) { /* @@ -1228,12 +1226,12 @@ receive_read(dmu_recv_cookie_t *drc, int len, void *buf) * that the receive was interrupted and can * potentially be resumed. */ - drc->drc_err = SET_ERROR(ZFS_ERR_STREAM_TRUNCATED); + err = SET_ERROR(ZFS_ERR_STREAM_TRUNCATED); } drc->drc_voff += len - done - resid; done = len - resid; - if (drc->drc_err != 0) - return (drc->drc_err); + if (err != 0) + return (err); } drc->drc_bytes_read += len; @@ -2574,7 +2572,8 @@ receive_writer_thread(void *arg) * free it. */ if (err != EAGAIN) { - rwa->err = err; + if (rwa->err == 0) + rwa->err = err; kmem_free(rrd, sizeof (*rrd)); } } diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 9069f7e7d095..a5df78edd855 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -1319,6 +1319,8 @@ redact_list_thread(void *arg) record = range_alloc(DATA, 0, 0, 0, B_TRUE); bqueue_enqueue_flush(&rlt_arg->q, record, sizeof (*record)); spl_fstrans_unmark(cookie); + + thread_exit(); } /* diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 74ef2e15569a..f095017936c0 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -542,6 +542,22 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) zfs_dbgmsg("new-style scrub was modified " "by old software; restarting in txg %llu", (longlong_t)scn->scn_restart_txg); + } else if (dsl_scan_resilvering(dp)) { + /* + * If a resilver is in progress and there are already + * errors, restart it instead of finishing this scan and + * then restarting it. If there haven't been any errors + * then remember that the incore DTL is valid. 
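+ * (spa_scrub_started is the flag vdev_dtl_reassess() consults when + * deciding whether DTL_MISSING may be excised at scan completion.)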
+ */ + if (scn->scn_phys.scn_errors > 0) { + scn->scn_restart_txg = txg; + zfs_dbgmsg("resilver can't excise DTL_MISSING " + "when finished; restarting in txg %llu", + (u_longlong_t)scn->scn_restart_txg); + } else { + /* it's safe to excise DTL when finished */ + spa->spa_scrub_started = B_TRUE; + } } } @@ -887,7 +903,6 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa)); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { - spa->spa_scrub_started = B_FALSE; spa->spa_scrub_active = B_FALSE; /* @@ -914,6 +929,12 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) } spa_errlog_rotate(spa); + /* + * Don't clear flag until after vdev_dtl_reassess to ensure that + * DTL_MISSING will get updated when possible. + */ + spa->spa_scrub_started = B_FALSE; + /* * We may have finished replacing a device. * Let the async thread assess this and handle the detach. diff --git a/module/zfs/spa.c b/module/zfs/spa.c index bd1e091cadcb..73d63f849ee0 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -7297,7 +7297,8 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, vdev_t *vd = rvd->vdev_child[c]; /* don't count the holes & logs as children */ - if (vd->vdev_islog || !vdev_is_concrete(vd)) { + if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops && + !vdev_is_concrete(vd))) { if (lastlog == 0) lastlog = c; continue; @@ -7333,6 +7334,11 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, } } + /* deal with indirect vdevs */ + if (spa->spa_root_vdev->vdev_child[c]->vdev_ops == + &vdev_indirect_ops) + continue; + /* which disk is going to be split? */ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, &glist[c]) != 0) { @@ -7460,7 +7466,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, offsetof(vdev_t, vdev_trim_node)); for (c = 0; c < children; c++) { - if (vml[c] != NULL) { + if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { mutex_enter(&vml[c]->vdev_initialize_lock); vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE, &vd_initialize_list); @@ -7521,7 +7527,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, if (error != 0) dmu_tx_abort(tx); for (c = 0; c < children; c++) { - if (vml[c] != NULL) { + if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { vdev_t *tvd = vml[c]->vdev_top; /* diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 59147ce31d36..923bf2e336a6 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. 
* Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome @@ -1554,7 +1554,7 @@ vdev_probe(vdev_t *vd, zio_t *zio) for (int l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, - offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE, + offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE, abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); @@ -2583,7 +2583,6 @@ vdev_dtl_should_excise(vdev_t *vd) spa_t *spa = vd->vdev_spa; dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; - ASSERT0(scn->scn_phys.scn_errors); ASSERT0(vd->vdev_children); if (vd->vdev_state < VDEV_STATE_DEGRADED) @@ -2634,6 +2633,7 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + boolean_t wasempty = B_TRUE; mutex_enter(&vd->vdev_dtl_lock); @@ -2643,6 +2643,18 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) if (zfs_scan_ignore_errors && scn) scn->scn_phys.scn_errors = 0; + if (scrub_txg != 0 && + !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { + wasempty = B_FALSE; + zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d " + "dtl:%llu/%llu errors:%llu", + (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg, + (u_longlong_t)scrub_txg, spa->spa_scrub_started, + (u_longlong_t)vdev_dtl_min(vd), + (u_longlong_t)vdev_dtl_max(vd), + (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0)); + } + /* * If we've completed a scan cleanly then determine * if this vdev should remove any DTLs. We only want to @@ -2679,6 +2691,14 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) space_reftree_generate_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); + + if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { + zfs_dbgmsg("update DTL_MISSING:%llu/%llu", + (u_longlong_t)vdev_dtl_min(vd), + (u_longlong_t)vdev_dtl_max(vd)); + } else if (!wasempty) { + zfs_dbgmsg("DTL_MISSING is now empty"); + } } range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); range_tree_walk(vd->vdev_dtl[DTL_MISSING], diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 078ba8bab69b..bfae40b6d76a 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -1638,7 +1638,7 @@ vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio) if (ic->ic_data == NULL) continue; - abd_zero(ic->ic_data, ic->ic_data->abd_size); + abd_zero(ic->ic_data, abd_get_size(ic->ic_data)); } iv->iv_attempts_max *= 2; diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index 5899af9fc67b..0d45d9958ef3 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -558,6 +558,8 @@ vdev_initialize_thread(void *arg) vd->vdev_initialize_thread = NULL; cv_broadcast(&vd->vdev_initialize_cv); mutex_exit(&vd->vdev_initialize_lock); + + thread_exit(); } /* diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index aeee6499bfaa..844bca79c9cf 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. 
*/ @@ -957,7 +957,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) nvlist_t *label; vdev_phys_t *vp; abd_t *vp_abd; - abd_t *pad2; + abd_t *bootenv; uberblock_t *ub; abd_t *ub_abd; zio_t *zio; @@ -1118,8 +1118,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) ub->ub_txg = 0; /* Initialize the 2nd padding area. */ - pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); - abd_zero(pad2, VDEV_PAD_SIZE); + bootenv = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); + abd_zero(bootenv, VDEV_PAD_SIZE); /* * Write everything in parallel. @@ -1138,8 +1138,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) * Zero out the 2nd padding area where it might have * left over data from previous filesystem format. */ - vdev_label_write(zio, vd, l, pad2, - offsetof(vdev_label_t, vl_pad2), + vdev_label_write(zio, vd, l, bootenv, + offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, NULL, NULL, flags); vdev_label_write(zio, vd, l, ub_abd, @@ -1155,7 +1155,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) } nvlist_free(label); - abd_free(pad2); + abd_free(bootenv); abd_free(ub_abd); abd_free(vp_abd); @@ -1178,6 +1178,148 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) return (error); } +/* + * Done callback for vdev_label_read_bootenv_impl. If this is the first + * callback to finish, store our abd in the callback pointer. Otherwise, we + * just free our abd and return. + */ +static void +vdev_label_read_bootenv_done(zio_t *zio) +{ + zio_t *rio = zio->io_private; + abd_t **cbp = rio->io_private; + + ASSERT3U(zio->io_size, ==, VDEV_PAD_SIZE); + + if (zio->io_error == 0) { + mutex_enter(&rio->io_lock); + if (*cbp == NULL) { + /* Will free this buffer in vdev_label_read_bootenv. */ + *cbp = zio->io_abd; + } else { + abd_free(zio->io_abd); + } + mutex_exit(&rio->io_lock); + } else { + abd_free(zio->io_abd); + } +} + +static void +vdev_label_read_bootenv_impl(zio_t *zio, vdev_t *vd, int flags) +{ + for (int c = 0; c < vd->vdev_children; c++) + vdev_label_read_bootenv_impl(zio, vd->vdev_child[c], flags); + + /* + * We just use the first label that has a correct checksum; the + * bootloader should have rewritten them all to be the same on boot, + * and any changes we made since boot have been the same across all + * labels. + * + * While grub supports writing to all four labels, other bootloaders + * don't, so we only use the first two labels to store boot + * information. 
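+ * (Labels 0 and 1 are the two stored at the front of the device, so + * the boot data stays at a fixed offset a bootloader can locate early.)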
+ */ + if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + for (int l = 0; l < VDEV_LABELS / 2; l++) { + vdev_label_read(zio, vd, l, + abd_alloc_linear(VDEV_PAD_SIZE, B_FALSE), + offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, + vdev_label_read_bootenv_done, zio, flags); + } + } +} + +int +vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *command) +{ + spa_t *spa = rvd->vdev_spa; + abd_t *abd = NULL; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; + + ASSERT(command); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + zio_t *zio = zio_root(spa, NULL, &abd, flags); + vdev_label_read_bootenv_impl(zio, rvd, flags); + int err = zio_wait(zio); + + if (abd != NULL) { + vdev_boot_envblock_t *vbe = abd_to_buf(abd); + if (vbe->vbe_version != VB_RAW) { + abd_free(abd); + return (SET_ERROR(ENOTSUP)); + } + vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0'; + fnvlist_add_string(command, "envmap", vbe->vbe_bootenv); + /* abd was allocated in vdev_label_read_bootenv_impl() */ + abd_free(abd); + /* If we managed to read any successfully, return success. */ + return (0); + } + return (err); +} + +int +vdev_label_write_bootenv(vdev_t *vd, char *envmap) +{ + zio_t *zio; + spa_t *spa = vd->vdev_spa; + vdev_boot_envblock_t *bootenv; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + int error = ENXIO; + + if (strlen(envmap) >= sizeof (bootenv->vbe_bootenv)) { + return (SET_ERROR(E2BIG)); + } + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + for (int c = 0; c < vd->vdev_children; c++) { + int child_err = vdev_label_write_bootenv(vd->vdev_child[c], + envmap); + /* + * As long as any of the disks managed to write all of their + * labels successfully, return success. + */ + if (child_err == 0) + error = child_err; + } + + if (!vd->vdev_ops->vdev_op_leaf || vdev_is_dead(vd) || + !vdev_writeable(vd)) { + return (error); + } + ASSERT3U(sizeof (*bootenv), ==, VDEV_PAD_SIZE); + abd_t *abd = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); + abd_zero(abd, VDEV_PAD_SIZE); + bootenv = abd_borrow_buf_copy(abd, VDEV_PAD_SIZE); + + char *buf = bootenv->vbe_bootenv; + (void) strlcpy(buf, envmap, sizeof (bootenv->vbe_bootenv)); + bootenv->vbe_version = VB_RAW; + abd_return_buf_copy(abd, bootenv, VDEV_PAD_SIZE); + +retry: + zio = zio_root(spa, NULL, NULL, flags); + for (int l = 0; l < VDEV_LABELS / 2; l++) { + vdev_label_write(zio, vd, l, abd, + offsetof(vdev_label_t, vl_be), + VDEV_PAD_SIZE, NULL, NULL, flags); + } + + error = zio_wait(zio); + if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { + flags |= ZIO_FLAG_TRYHARD; + goto retry; + } + + abd_free(abd); + return (error); +} + /* * ========================================================================== * uberblock load/sync diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index dee46f4b3d3d..3f4f9091f43d 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -1595,6 +1595,8 @@ spa_vdev_remove_thread(void *arg) ASSERT0(range_tree_space(svr->svr_allocd_segs)); vdev_remove_complete(spa); } + + thread_exit(); } void diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c index 7170f7013608..ce79f7c73f64 100644 --- a/module/zfs/vdev_root.c +++ b/module/zfs/vdev_root.c @@ -98,7 +98,8 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; - if (cvd->vdev_open_error && !cvd->vdev_islog) { + if (cvd->vdev_open_error && !cvd->vdev_islog && + cvd->vdev_ops != 
&vdev_indirect_ops) { lasterror = cvd->vdev_open_error; numerrors++; } } diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 137ba83dff27..b0cd40f68765 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -896,6 +896,8 @@ vdev_trim_thread(void *arg) vd->vdev_trim_thread = NULL; cv_broadcast(&vd->vdev_trim_cv); mutex_exit(&vd->vdev_trim_lock); + + thread_exit(); } /* diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 2104bef714c2..d55ce20ef76f 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -3512,6 +3512,58 @@ zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) return (error); } +/* + * This ioctl is used to set the bootenv configuration on the current + * pool. This configuration is stored in the second padding area of the label, + * and it is used by the GRUB bootloader on Linux to store the contents + * of the grubenv file. The file is stored as raw ASCII, and is protected by + * an embedded checksum. By default, GRUB will check if the boot filesystem + * supports storing the environment data in a special location, and if so, + * will invoke filesystem-specific logic to retrieve it. This can be + * overridden by a variable, should the user so desire. + */ +/* ARGSUSED */ +static const zfs_ioc_key_t zfs_keys_set_bootenv[] = { + {"envmap", DATA_TYPE_STRING, 0}, +}; + +static int +zfs_ioc_set_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) +{ + char *envmap; + int error; + spa_t *spa; + + envmap = fnvlist_lookup_string(innvl, "envmap"); + if ((error = spa_open(name, &spa, FTAG)) != 0) + return (error); + spa_vdev_state_enter(spa, SCL_ALL); + error = vdev_label_write_bootenv(spa->spa_root_vdev, envmap); + (void) spa_vdev_state_exit(spa, NULL, 0); + spa_close(spa, FTAG); + return (error); +} + +static const zfs_ioc_key_t zfs_keys_get_bootenv[] = { + /* no nvl keys */ +}; + +/* ARGSUSED */ +static int +zfs_ioc_get_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) +{ + spa_t *spa; + int error; + + if ((error = spa_open(name, &spa, FTAG)) != 0) + return (error); + spa_vdev_state_enter(spa, SCL_ALL); + error = vdev_label_read_bootenv(spa->spa_root_vdev, outnvl); + (void) spa_vdev_state_exit(spa, NULL, 0); + spa_close(spa, FTAG); + return (error); +} + /* * The dp_config_rwlock must not be held when calling this, because the * unmount may need to write out data. 
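A minimal sketch of how a userland caller might drive the two ioctls added above, under stated assumptions: send_ioctl() is a hypothetical stand-in for whatever transport the caller uses (for example a libzfs_core wrapper), and only the "envmap" nvlist key and the ioctl names come from the patch itself.

#include <string.h>
#include <libnvpair.h>
#include <sys/fs/zfs.h>

/* Hypothetical transport; a real consumer would use a libzfs_core wrapper. */
extern int send_ioctl(const char *pool, int ioc, nvlist_t *innvl,
    nvlist_t **outnvl);

static int
example_set_bootenv(const char *pool, const char *grubenv_text)
{
	nvlist_t *innvl = fnvlist_alloc();

	/* zfs_keys_set_bootenv requires exactly one string key, "envmap". */
	fnvlist_add_string(innvl, "envmap", grubenv_text);
	int err = send_ioctl(pool, ZFS_IOC_SET_BOOTENV, innvl, NULL);
	fnvlist_free(innvl);
	return (err);
}

static int
example_get_bootenv(const char *pool, char **envmap_out)
{
	nvlist_t *outnvl = NULL;

	/* ZFS_IOC_GET_BOOTENV takes no input keys; the reply carries "envmap". */
	int err = send_ioctl(pool, ZFS_IOC_GET_BOOTENV, NULL, &outnvl);
	if (err == 0) {
		*envmap_out = strdup(fnvlist_lookup_string(outnvl, "envmap"));
		nvlist_free(outnvl);
	}
	return (err);
}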
@@ -6981,6 +7033,16 @@ zfs_ioctl_init(void) POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_fs_wait, ARRAY_SIZE(zfs_keys_fs_wait)); + zfs_ioctl_register("set_bootenv", ZFS_IOC_SET_BOOTENV, + zfs_ioc_set_bootenv, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, + zfs_keys_set_bootenv, ARRAY_SIZE(zfs_keys_set_bootenv)); + + zfs_ioctl_register("get_bootenv", ZFS_IOC_GET_BOOTENV, + zfs_ioc_get_bootenv, zfs_secpolicy_none, POOL_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, B_TRUE, + zfs_keys_get_bootenv, ARRAY_SIZE(zfs_keys_get_bootenv)); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, diff --git a/rpm/generic/zfs-dkms.spec.in b/rpm/generic/zfs-dkms.spec.in index f849125a4a7c..29e8fd7dde0d 100644 --- a/rpm/generic/zfs-dkms.spec.in +++ b/rpm/generic/zfs-dkms.spec.in @@ -93,7 +93,7 @@ fi CONFIG_H="/var/lib/dkms/%{module}/%{version}/*/*/%{module}_config.h" SPEC_META_ALIAS="@PACKAGE@-@VERSION@-@RELEASE@" DKMS_META_ALIAS=`cat $CONFIG_H 2>/dev/null | - awk -F'"' '/META_ALIAS/ { print $2; exit 0 }'` + awk -F'"' '/META_ALIAS\s+"/ { print $2; exit 0 }'` if [ "$SPEC_META_ALIAS" = "$DKMS_META_ALIAS" ]; then echo -e echo -e "Uninstall of %{module} module ($SPEC_META_ALIAS) beginning:" diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index a475db297ccc..a9bede475361 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -442,7 +442,7 @@ tags = ['functional', 'cli_root', 'zpool_set'] [tests/functional/cli_root/zpool_split] tests = ['zpool_split_cliargs', 'zpool_split_devices', 'zpool_split_encryption', 'zpool_split_props', 'zpool_split_vdevs', - 'zpool_split_resilver'] + 'zpool_split_resilver', 'zpool_split_indirect'] tags = ['functional', 'cli_root', 'zpool_split'] [tests/functional/cli_root/zpool_status] @@ -758,7 +758,7 @@ tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos', tags = ['functional', 'reservation'] [tests/functional/resilver] -tests = ['resilver_restart_001'] +tests = ['resilver_restart_001', 'resilver_restart_002'] tags = ['functional', 'resilver'] [tests/functional/rootpool] diff --git a/tests/test-runner/bin/zts-report.py b/tests/test-runner/bin/zts-report.py index d74aa9d7aef8..5420ff9d29c0 100755 --- a/tests/test-runner/bin/zts-report.py +++ b/tests/test-runner/bin/zts-report.py @@ -272,8 +272,8 @@ def process_results(pathname): pattern_log = r'^\s*Log directory:\s*(\S*)' d = {} - for l in f.readlines(): - m = re.match(pattern, l) + for line in f.readlines(): + m = re.match(pattern, line) if m and len(m.groups()) == 4: summary['total'] += 1 if m.group(4) == "PASS": @@ -281,7 +281,7 @@ def process_results(pathname): d[m.group(1)] = m.group(4) continue - m = re.match(pattern_log, l) + m = re.match(pattern_log, line) if m: summary['logfile'] = m.group(1) diff --git a/tests/zfs-tests/cmd/btree_test/Makefile.am b/tests/zfs-tests/cmd/btree_test/Makefile.am index 632f0472668c..bf09cdb82da4 100644 --- a/tests/zfs-tests/cmd/btree_test/Makefile.am +++ b/tests/zfs-tests/cmd/btree_test/Makefile.am @@ -29,4 +29,5 @@ btree_test_SOURCES = btree_test.c btree_test_LDADD = \ $(top_builddir)/lib/libavl/libavl.la \ + $(top_builddir)/lib/libnvpair/libnvpair.la \ $(top_builddir)/lib/libzpool/libzpool.la diff --git a/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c b/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c index 3f6147509fc9..00924dda9158 100644 --- 
a/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c +++ b/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c @@ -751,6 +751,24 @@ test_wait_fs(const char *dataset) nvlist_free(required); } +static void +test_get_bootenv(const char *pool) +{ + IOC_INPUT_TEST(ZFS_IOC_GET_BOOTENV, pool, NULL, NULL, 0); +} + +static void +test_set_bootenv(const char *pool) +{ + nvlist_t *required = fnvlist_alloc(); + + fnvlist_add_string(required, "envmap", "test"); + + IOC_INPUT_TEST(ZFS_IOC_SET_BOOTENV, pool, required, NULL, 0); + + nvlist_free(required); +} + static void zfs_ioc_input_tests(const char *pool) { @@ -840,6 +858,9 @@ zfs_ioc_input_tests(const char *pool) test_wait(pool); test_wait_fs(dataset); + test_set_bootenv(pool); + test_get_bootenv(pool); + /* * cleanup */ @@ -1000,6 +1021,8 @@ validate_ioc_values(void) CHECK(ZFS_IOC_PLATFORM_BASE + 4 == ZFS_IOC_NEXTBOOT); CHECK(ZFS_IOC_PLATFORM_BASE + 5 == ZFS_IOC_JAIL); CHECK(ZFS_IOC_PLATFORM_BASE + 6 == ZFS_IOC_UNJAIL); + CHECK(ZFS_IOC_PLATFORM_BASE + 7 == ZFS_IOC_SET_BOOTENV); + CHECK(ZFS_IOC_PLATFORM_BASE + 8 == ZFS_IOC_GET_BOOTENV); #undef CHECK diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index f85b156586be..efbcc09e7eb4 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -59,6 +59,7 @@ OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_esti REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment RESILVER_MIN_TIME_MS resilver_min_time_ms zfs_resilver_min_time_ms +SCAN_LEGACY scan_legacy zfs_scan_legacy SCAN_SUSPEND_PROGRESS scan_suspend_progress zfs_scan_suspend_progress SCAN_VDEV_LIMIT scan_vdev_limit zfs_scan_vdev_limit SEND_HOLES_WITHOUT_BIRTH_TIME send_holes_without_birth_time send_holes_without_birth_time diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_split/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_split/Makefile.am index d00f39d35d77..1ca05a4e8e8d 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_split/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_split/Makefile.am @@ -11,7 +11,8 @@ dist_pkgdata_SCRIPTS = \ zpool_split_props.ksh \ zpool_split_vdevs.ksh \ zpool_split_resilver.ksh \ - zpool_split_wholedisk.ksh + zpool_split_wholedisk.ksh \ + zpool_split_indirect.ksh dist_pkgdata_DATA = \ zpool_split.cfg diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_indirect.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_indirect.ksh new file mode 100755 index 000000000000..13f0d08b7f20 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_indirect.ksh @@ -0,0 +1,69 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# Copyright (c) 2020 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/removal/removal.kshlib + +# +# DESCRIPTION: +# 'zpool split' should succeed on pools with indirect vdevs. +# +# STRATEGY: +# Create a mirrored pool, add a single device, remove it. `zpool split` +# should succeed. +# + +verify_runnable "global" + +log_assert "'zpool split' works on pools with indirect VDEVs." + +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + if poolexists $TESTPOOL2 ; then + destroy_pool $TESTPOOL2 + fi + rm -f $VDEV_TEMP $VDEV_M1 $VDEV_M2 +} +log_onexit cleanup + +typeset vdev_m12_mb=400 +typeset vdev_temp_mb=$(( floor($vdev_m12_mb / 2) )) +typeset VDEV_TEMP="$TEST_BASE_DIR/vdev_temp" +typeset VDEV_M1="$TEST_BASE_DIR/vdev_m1" +typeset VDEV_M2="$TEST_BASE_DIR/vdev_m2" +typeset altroot="$TESTDIR/altroot-$TESTPOOL2" + +log_must truncate -s ${vdev_temp_mb}M $VDEV_TEMP +log_must truncate -s ${vdev_m12_mb}M $VDEV_M1 +log_must truncate -s ${vdev_m12_mb}M $VDEV_M2 + +log_must zpool create -f $TESTPOOL $VDEV_TEMP +log_must zpool add -f $TESTPOOL mirror $VDEV_M1 $VDEV_M2 +log_must zpool remove $TESTPOOL $VDEV_TEMP +log_must wait_for_removal $TESTPOOL +log_must zpool split -R $altroot $TESTPOOL $TESTPOOL2 +log_must poolexists $TESTPOOL2 +log_must test "$(get_pool_prop 'altroot' $TESTPOOL2)" == "$altroot" + +log_pass "'zpool split' works on pools with indirect VDEVs." diff --git a/tests/zfs-tests/tests/functional/hkdf/Makefile.am b/tests/zfs-tests/tests/functional/hkdf/Makefile.am index c1266214fefd..378bcf531d84 100644 --- a/tests/zfs-tests/tests/functional/hkdf/Makefile.am +++ b/tests/zfs-tests/tests/functional/hkdf/Makefile.am @@ -1,6 +1,8 @@ include $(top_srcdir)/config/Rules.am -LDADD = $(top_builddir)/lib/libzpool/libzpool.la +LDADD = \ + $(top_builddir)/lib/libnvpair/libnvpair.la \ + $(top_builddir)/lib/libzpool/libzpool.la AUTOMAKE_OPTIONS = subdir-objects diff --git a/tests/zfs-tests/tests/functional/libzfs/Makefile.am b/tests/zfs-tests/tests/functional/libzfs/Makefile.am index e9a703f4902d..545af77e7d12 100644 --- a/tests/zfs-tests/tests/functional/libzfs/Makefile.am +++ b/tests/zfs-tests/tests/functional/libzfs/Makefile.am @@ -10,6 +10,8 @@ dist_pkgdata_SCRIPTS = \ libzfs_input.ksh many_fds_LDADD = \ + $(top_builddir)/lib/libnvpair/libnvpair.la \ + $(top_builddir)/lib/libzfs_core/libzfs_core.la \ $(top_builddir)/lib/libzfs/libzfs.la pkgexec_PROGRAMS = many_fds diff --git a/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh b/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh index b202fac40aca..f313923d1469 100755 --- a/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh @@ -99,7 +99,7 @@ typeset l2_rebuild_log_blk_end=$(get_arcstat l2_rebuild_log_blks) log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start )) log_must test $l2_dh_log_blk -gt 0 -log_must zdb -lq $VDEV_CACHE +log_must zdb -lll $VDEV_CACHE log_must zpool destroy -f $TESTPOOL diff --git a/tests/zfs-tests/tests/functional/refreserv/refreserv_005_pos.ksh b/tests/zfs-tests/tests/functional/refreserv/refreserv_005_pos.ksh index 8c044eca59d5..1ccc9828d4f7 100755 --- a/tests/zfs-tests/tests/functional/refreserv/refreserv_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/refreserv/refreserv_005_pos.ksh @@ -45,9 +45,9 @@ verify_runnable "global" function cleanup { - log_must zfs destroy -rf $TESTPOOL/$TESTFS - log_must zfs create $TESTPOOL/$TESTFS - log_must zfs set 
mountpoint=$TESTDIR $TESTPOOL/$TESTFS + destroy_dataset "$fs" "-rf" + log_must zfs create $fs + log_must zfs set mountpoint=$TESTDIR $fs } log_assert "Volume (ref)reservation is not limited by volsize" diff --git a/tests/zfs-tests/tests/functional/resilver/Makefile.am b/tests/zfs-tests/tests/functional/resilver/Makefile.am index 465d8f3a3a31..38136a843aac 100644 --- a/tests/zfs-tests/tests/functional/resilver/Makefile.am +++ b/tests/zfs-tests/tests/functional/resilver/Makefile.am @@ -2,7 +2,8 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/resilver dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ - resilver_restart_001.ksh + resilver_restart_001.ksh \ + resilver_restart_002.ksh dist_pkgdata_DATA = \ resilver.cfg diff --git a/tests/zfs-tests/tests/functional/resilver/resilver_restart_002.ksh b/tests/zfs-tests/tests/functional/resilver/resilver_restart_002.ksh new file mode 100755 index 000000000000..ebe5e693b2cf --- /dev/null +++ b/tests/zfs-tests/tests/functional/resilver/resilver_restart_002.ksh @@ -0,0 +1,102 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, Datto Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/resilver/resilver.cfg + +# +# DESCRIPTION: +# Test that resilver completes when scan errors are encountered, but the +# relevant DTLs have not been lost. +# +# STRATEGY: +# 1. Create a pool (1k recordsize) +# 2. Create a 32M file (32k records) +# 3. Inject an error halfway through the file +# 4. 
Start a resilver, ensure the error is triggered and that the resilver +# does not restart after finishing +# +# NB: use legacy scanning to ensure scan of specific block causes error +# + +function cleanup +{ + log_must zinject -c all + destroy_pool $TESTPOOL + rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE + log_must set_tunable32 SCAN_LEGACY $ORIG_SCAN_LEGACY +} + +log_assert "Check for resilver restarts caused by scan errors" + +ORIG_SCAN_LEGACY=$(get_tunable SCAN_LEGACY) + +log_onexit cleanup + +# use legacy scan to ensure injected error will be triggered +log_must set_tunable32 SCAN_LEGACY 1 + +# create the pool and a 32M file (32k blocks) +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[0]} $SPARE_VDEV_FILE +log_must zpool create -f -O recordsize=1k $TESTPOOL ${VDEV_FILES[0]} +log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=1M count=32 > /dev/null 2>&1 + +# determine objset/object +objset=$(zdb -d $TESTPOOL/ | sed -ne 's/.*ID \([0-9]*\).*/\1/p') +object=$(ls -i /$TESTPOOL/file | awk '{print $1}') + +# inject event to cause error during resilver +log_must zinject -b `printf "%x:%x:0:3fff" $objset $object` $TESTPOOL + +# clear events and start resilver +log_must zpool events -c +log_must zpool attach $TESTPOOL ${VDEV_FILES[0]} $SPARE_VDEV_FILE + +log_note "waiting for read errors to start showing up" +for iter in {0..59} +do + zpool sync $TESTPOOL + err=$(zpool status $TESTPOOL | grep ${VDEV_FILES[0]} | awk '{print $3}') + (( $err > 0 )) && break + sleep 1 +done + +(( $err == 0 )) && log_fail "Unable to induce errors in resilver" + +log_note "waiting for resilver to finish" +for iter in {0..59} +do + finish=$(zpool events | grep "sysevent.fs.zfs.resilver_finish" | wc -l) + (( $finish > 0 )) && break + sleep 1 +done + +(( $finish == 0 )) && log_fail "resilver took too long to finish" + +# wait a few syncs to ensure that zfs does not restart the resilver +log_must zpool sync $TESTPOOL +log_must zpool sync $TESTPOOL + +# check if resilver was restarted +start=$(zpool events | grep "sysevent.fs.zfs.resilver_start" | wc -l) +(( $start != 1 )) && log_fail "resilver restarted unnecessarily" + +log_pass "Resilver did not restart unnecessarily from scan errors"
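To close, a worked example of the headroom math introduced in the l2arc_log_blk_overhead() hunk earlier in this patch. This is a minimal sketch: the entries-per-log-block and aligned log-block size below are stand-in values chosen purely for illustration (the kernel takes them from dev->l2ad_log_entries and vdev_psize_to_asize()), while the rounding formula itself is the one from the patch.

#include <stdint.h>
#include <stdio.h>

#define	SPA_MINBLOCKSHIFT	9	/* 512-byte minimum block size */

int
main(void)
{
	uint64_t write_sz = 8 << 20;		/* 8 MiB write target */
	uint64_t log_entries_per_blk = 1022;	/* assumed value */
	uint64_t log_blk_asize = 64 << 10;	/* assumed aligned size */

	/* Worst case: one log entry per SPA_MINBLOCKSIZE of payload. */
	uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT;

	/* Round up to whole log blocks, as the patch does. */
	uint64_t log_blocks = (log_entries + log_entries_per_blk - 1) /
	    log_entries_per_blk;

	/* 16384 entries -> 17 log blocks -> ~1 MiB of headroom here. */
	printf("%llu log blocks, %llu bytes of overhead\n",
	    (unsigned long long)log_blocks,
	    (unsigned long long)(log_blocks * log_blk_asize));
	return (0);
}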