From ddc7a2dd3b099c280b4f3ed978f16fa6bd7012c0 Mon Sep 17 00:00:00 2001 From: Ryan Moeller Date: Tue, 5 May 2020 12:36:41 -0400 Subject: [PATCH 01/27] taskq: Don't leak system_delay_taskq on FreeBSD Adds a missing taskq_destroy() call. Reported by: Jorgen Lundman Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Ryan Moeller Closes #10292 --- module/os/freebsd/spl/spl_taskq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/module/os/freebsd/spl/spl_taskq.c b/module/os/freebsd/spl/spl_taskq.c index b6a501f6773f..28cebc5dcbde 100644 --- a/module/os/freebsd/spl/spl_taskq.c +++ b/module/os/freebsd/spl/spl_taskq.c @@ -97,6 +97,7 @@ static void system_taskq_fini(void *arg) { + taskq_destroy(system_delay_taskq); taskq_destroy(system_taskq); uma_zdestroy(taskq_zone); tsd_destroy(&taskq_tsd); From 1b664952ae34d28c0408c20947f8aa5420c4ab63 Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Wed, 6 May 2020 13:32:28 -0400 Subject: [PATCH 02/27] Enable splitting mirrors with indirect vdevs When a top-level vdev is removed from a pool it is converted to an indirect vdev. Until now splitting such mirrored pools was not possible with zpool split. This patch enables handling of indirect vdevs and splitting of those pools with zpool split. Reviewed-by: Matthew Ahrens Reviewed by: Brian Behlendorf Signed-off-by: George Amanakis Closes #10283 --- lib/libzfs/libzfs_pool.c | 8 ++- module/zfs/spa.c | 12 +++- module/zfs/vdev_root.c | 3 +- tests/runfiles/common.run | 2 +- .../cli_root/zpool_split/Makefile.am | 3 +- .../zpool_split/zpool_split_indirect.ksh | 68 +++++++++++++++++++ 6 files changed, 89 insertions(+), 7 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_indirect.ksh diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 06c85f145f82..9122b3ee1258 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -3452,7 +3452,13 @@ zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot, lastlog = 0; verify(nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type) == 0); - if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { + + if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) { + vdev = child[c]; + if (nvlist_dup(vdev, &varray[vcount++], 0) != 0) + goto out; + continue; + } else if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Source pool must be composed only of mirrors\n")); retval = zfs_error(hdl, EZFS_INVALCONFIG, msg); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index bd1e091cadcb..73d63f849ee0 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -7297,7 +7297,8 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, vdev_t *vd = rvd->vdev_child[c]; /* don't count the holes & logs as children */ - if (vd->vdev_islog || !vdev_is_concrete(vd)) { + if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops && + !vdev_is_concrete(vd))) { if (lastlog == 0) lastlog = c; continue; @@ -7333,6 +7334,11 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, } } + /* deal with indirect vdevs */ + if (spa->spa_root_vdev->vdev_child[c]->vdev_ops == + &vdev_indirect_ops) + continue; + /* which disk is going to be split? 
*/ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, &glist[c]) != 0) { @@ -7460,7 +7466,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, offsetof(vdev_t, vdev_trim_node)); for (c = 0; c < children; c++) { - if (vml[c] != NULL) { + if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { mutex_enter(&vml[c]->vdev_initialize_lock); vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE, &vd_initialize_list); @@ -7521,7 +7527,7 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, if (error != 0) dmu_tx_abort(tx); for (c = 0; c < children; c++) { - if (vml[c] != NULL) { + if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { vdev_t *tvd = vml[c]->vdev_top; /* diff --git a/module/zfs/vdev_root.c b/module/zfs/vdev_root.c index 7170f7013608..ce79f7c73f64 100644 --- a/module/zfs/vdev_root.c +++ b/module/zfs/vdev_root.c @@ -98,7 +98,8 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; - if (cvd->vdev_open_error && !cvd->vdev_islog) { + if (cvd->vdev_open_error && !cvd->vdev_islog && + cvd->vdev_ops != &vdev_indirect_ops) { lasterror = cvd->vdev_open_error; numerrors++; } diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index a475db297ccc..2fcde83b3c39 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -442,7 +442,7 @@ tags = ['functional', 'cli_root', 'zpool_set'] [tests/functional/cli_root/zpool_split] tests = ['zpool_split_cliargs', 'zpool_split_devices', 'zpool_split_encryption', 'zpool_split_props', 'zpool_split_vdevs', - 'zpool_split_resilver'] + 'zpool_split_resilver', 'zpool_split_indirect'] tags = ['functional', 'cli_root', 'zpool_split'] [tests/functional/cli_root/zpool_status] diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_split/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zpool_split/Makefile.am index d00f39d35d77..1ca05a4e8e8d 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_split/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_split/Makefile.am @@ -11,7 +11,8 @@ dist_pkgdata_SCRIPTS = \ zpool_split_props.ksh \ zpool_split_vdevs.ksh \ zpool_split_resilver.ksh \ - zpool_split_wholedisk.ksh + zpool_split_wholedisk.ksh \ + zpool_split_indirect.ksh dist_pkgdata_DATA = \ zpool_split.cfg diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_indirect.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_indirect.ksh new file mode 100755 index 000000000000..d6b0e7358ed7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_indirect.ksh @@ -0,0 +1,68 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/removal/removal.kshlib + +# +# DESCRIPTION: +# 'zpool split' should succeed on pools with indirect vdevs. +# +# STRATEGY: +# Create a mirrored pool, add a single device, remove it. `zpool split` +# should succeed. 
+# + +verify_runnable "global" + +log_assert "'zpool split' works on pools with indirect VDEVs." + +function cleanup +{ + if poolexists $TESTPOOL ; then + destroy_pool $TESTPOOL + fi + if poolexists $TESTPOOL2 ; then + destroy_pool $TESTPOOL2 + fi + rm -f $VDEV_* +} +log_onexit cleanup + +typeset vdev_m12_mb=400 +typeset vdev_temp_mb=$(( floor($vdev_m12_mb / 2) )) +typeset VDEV_TEMP="$TEST_BASE_DIR/vdev_temp" +typeset VDEV_M1="$TEST_BASE_DIR/vdev_m1" +typeset VDEV_M2="$TEST_BASE_DIR/vdev_m2" +typeset altroot="$TESTDIR/altroot-$TESTPOOL2" + +log_must truncate -s ${vdev_temp_mb}M $VDEV_TEMP +log_must truncate -s ${vdev_m12_mb}M $VDEV_M1 +log_must truncate -s ${vdev_m12_mb}M $VDEV_M2 + +log_must zpool create -f $TESTPOOL $VDEV_TEMP +log_must zpool add -f $TESTPOOL mirror $VDEV_M1 $VDEV_M2 +log_must zpool remove $TESTPOOL $VDEV_TEMP +log_must wait_for_removal $TESTPOOL +log_must zpool split -R $altroot $TESTPOOL $TESTPOOL2 +log_must poolexists $TESTPOOL2 +log_must test "$(get_pool_prop 'altroot' $TESTPOOL2)" == "$altroot" + +log_pass "'zpool split' works on pools with indirect VDEVs." From a36bad17596e5cbc472a0d1fecb200a6b2e3530d Mon Sep 17 00:00:00 2001 From: Philip Pokorny Date: Wed, 6 May 2020 17:17:38 -0700 Subject: [PATCH 03/27] Fix column width calculation issue with certain terminal widths If the reported terminal width is 0 or less than 42, the signed variable width was set to a negative number that was then assigned to the unsigned column width becoming a huge number. Add comments and change logic to better explain what's happening. Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Philip Pokorny Closes #10247 --- cmd/zpool/zpool_main.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index d62a6eb3c9a2..e154e40e482b 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -5148,22 +5148,48 @@ print_zpool_script_list(char *subcommand) /* * Set the minimum pool/vdev name column width. The width must be at least 10, * but may be as large as the column width - 42 so it still fits on one line. + * NOTE: 42 is the width of the default capacity/operations/bandwidth output */ static int get_namewidth_iostat(zpool_handle_t *zhp, void *data) { iostat_cbdata_t *cb = data; - int width, columns; + int width, available_width; + /* + * get_namewidth() returns the maximum width of any name in that column + * for any pool/vdev/device line that will be output. + */ width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags, cb->cb_verbose); - columns = get_columns(); + /* + * The width we are calculating is the width of the header and also the + * padding width for names that are less than maximum width. The stats + * take up 42 characters, so the width available for names is: + */ + available_width = get_columns() - 42; + + /* + * If the maximum width fits on a screen, then great! Make everything + * line up by justifying all lines to the same width. If that max + * width is larger than what's available, the name plus stats won't fit + * on one line, and justifying to that width would cause every line to + * wrap on the screen. We only want lines with long names to wrap. + * Limit the padding to what won't wrap. + */ + if (width > available_width) + width = available_width; + + /* + * And regardless of whatever the screen width is (get_columns can + * return 0 if the width is not known or less than 42 for a narrow + * terminal) have the width be a minimum of 10. 
+ */ if (width < 10) width = 10; - if (width > columns - 42) - width = columns - 42; + /* Save the calculated width */ cb->cb_namewidth = width; return (0); From 108a454a4604df6ea3be817f3cf076726df2c67a Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Thu, 7 May 2020 09:36:33 -0700 Subject: [PATCH 04/27] Add support for boot environment data to be stored in the label Modern bootloaders leverage data stored in the root filesystem to enable some of their powerful features. GRUB specifically has a grubenv file which can store large amounts of configuration data that can be read and written at boot time and during normal operation. This allows sysadmins to configure useful features like automated failover after failed boot attempts. Unfortunately, due to the Copy-on-Write nature of ZFS, the standard behavior of these tools cannot handle writing to ZFS files safely at boot time. We need an alternative way to store data that allows the bootloader to make changes to the data. This work is very similar to work that was done on Illumos to enable similar functionality in the FreeBSD bootloader. This patch is different in that the data being stored is a raw grubenv file; this file can store arbitrary variables and values, and the scripting provided by grub is powerful enough that special structures are not required to implement advanced behavior. We repurpose the second padding area in each label to store the grubenv file, protected by an embedded checksum. We add two ioctls to get and set this data, and libzfs_core and libzfs functions to access them more easily. There are no direct command line interfaces to these functions; these will be added directly to the bootloader utilities. Reviewed-by: Pavel Zakharov Reviewed-by: Matthew Ahrens Reviewed-by: Brian Behlendorf Signed-off-by: Paul Dagnelie Closes #10009 --- cmd/zinject/translate.c | 4 +- include/libzfs.h | 2 + include/libzfs_core.h | 4 +- include/sys/fs/zfs.h | 6 +- include/sys/vdev.h | 4 +- include/sys/vdev_impl.h | 28 +++- lib/libzfs/libzfs_pool.c | 40 ++++- lib/libzfs_core/libzfs_core.c | 22 +++ module/os/freebsd/zfs/vdev_label_os.c | 2 +- module/zfs/vdev.c | 4 +- module/zfs/vdev_label.c | 156 +++++++++++++++++- module/zfs/zfs_ioctl.c | 62 +++++++ .../libzfs_input_check/libzfs_input_check.c | 23 +++ 13 files changed, 335 insertions(+), 22 deletions(-) diff --git a/cmd/zinject/translate.c b/cmd/zinject/translate.c index 8542d37c50bd..4939c0b85b5f 100644 --- a/cmd/zinject/translate.c +++ b/cmd/zinject/translate.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. */ #include @@ -388,7 +388,7 @@ translate_device(const char *pool, const char *device, err_type_t label_type, record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1; break; case TYPE_LABEL_PAD2: - record->zi_start = offsetof(vdev_label_t, vl_pad2); + record->zi_start = offsetof(vdev_label_t, vl_be); record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1; break; } diff --git a/include/libzfs.h b/include/libzfs.h index c4f08882ed69..dd013ad0c47f 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -862,6 +862,8 @@ extern int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **, * Label manipulation. 
*/ extern int zpool_clear_label(int); +extern int zpool_set_bootenv(zpool_handle_t *, const char *); +extern int zpool_get_bootenv(zpool_handle_t *, char *, size_t, off_t); /* * Management interfaces for SMB ACL files diff --git a/include/libzfs_core.h b/include/libzfs_core.h index 18ce6994a0ab..e69fe32cd0a1 100644 --- a/include/libzfs_core.h +++ b/include/libzfs_core.h @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2017 Datto Inc. * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. @@ -135,6 +135,8 @@ int lzc_wait(const char *, zpool_wait_activity_t, boolean_t *); int lzc_wait_tag(const char *, zpool_wait_activity_t, uint64_t, boolean_t *); int lzc_wait_fs(const char *, zfs_wait_activity_t, boolean_t *); +int lzc_set_bootenv(const char *, const char *); +int lzc_get_bootenv(const char *, nvlist_t **); #ifdef __cplusplus } #endif diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index f5aced0da7cd..39be630d8b3b 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013, 2017 Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] @@ -1290,7 +1290,7 @@ typedef enum zfs_ioc { ZFS_IOC_WAIT_FS, /* 0x5a54 */ /* - * Per-platform (Optional) - 6/128 numbers reserved. + * Per-platform (Optional) - 8/128 numbers reserved. */ ZFS_IOC_PLATFORM = ZFS_IOC_FIRST + 0x80, ZFS_IOC_EVENTS_NEXT, /* 0x81 (Linux) */ @@ -1299,6 +1299,8 @@ typedef enum zfs_ioc { ZFS_IOC_NEXTBOOT, /* 0x84 (FreeBSD) */ ZFS_IOC_JAIL, /* 0x85 (FreeBSD) */ ZFS_IOC_UNJAIL, /* 0x86 (FreeBSD) */ + ZFS_IOC_SET_BOOTENV, /* 0x87 (Linux) */ + ZFS_IOC_GET_BOOTENV, /* 0x88 (Linux) */ ZFS_IOC_LAST } zfs_ioc_t; diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 56a869fec62a..c4ef479b5faf 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. */ @@ -179,6 +179,8 @@ extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **); extern void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv); extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, int flags); +extern int vdev_label_read_bootenv(vdev_t *, nvlist_t *); +extern int vdev_label_write_bootenv(vdev_t *, char *); typedef enum { VDEV_LABEL_CREATE, /* create/add a new device */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index b55871a5d242..96546ac35078 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. 
*/ @@ -414,7 +414,7 @@ struct vdev { #define VDEV_RAIDZ_MAXPARITY 3 #define VDEV_PAD_SIZE (8 << 10) -/* 2 padding areas (vl_pad1 and vl_pad2) to skip */ +/* 2 padding areas (vl_pad1 and vl_be) to skip */ #define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 #define VDEV_PHYS_SIZE (112 << 10) #define VDEV_UBERBLOCK_RING (128 << 10) @@ -441,12 +441,32 @@ typedef struct vdev_phys { zio_eck_t vp_zbt; } vdev_phys_t; +typedef enum vbe_vers { + /* The bootenv file is stored as ascii text in the envblock */ + VB_RAW = 0, + + /* + * The bootenv file is converted to an nvlist and then packed into the + * envblock. + */ + VB_NVLIST = 1 +} vbe_vers_t; + +typedef struct vdev_boot_envblock { + uint64_t vbe_version; + char vbe_bootenv[VDEV_PAD_SIZE - sizeof (uint64_t) - + sizeof (zio_eck_t)]; + zio_eck_t vbe_zbt; +} vdev_boot_envblock_t; + +CTASSERT_GLOBAL(sizeof (vdev_boot_envblock_t) == VDEV_PAD_SIZE); + typedef struct vdev_label { char vl_pad1[VDEV_PAD_SIZE]; /* 8K */ - char vl_pad2[VDEV_PAD_SIZE]; /* 8K */ + vdev_boot_envblock_t vl_be; /* 8K */ vdev_phys_t vl_vdev_phys; /* 112K */ char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ -} vdev_label_t; /* 256K total */ +} vdev_label_t; /* 256K total */ /* * vdev_dirty() flags diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 9122b3ee1258..2b21787eef42 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -22,7 +22,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2016 Igor Kozhukhov * Copyright (c) 2018 Datto Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. @@ -429,7 +429,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, * Assuming bootfs is a valid dataset name. */ static boolean_t -bootfs_name_valid(const char *pool, char *bootfs) +bootfs_name_valid(const char *pool, const char *bootfs) { int len = strlen(pool); if (bootfs[0] == '\0') @@ -4459,3 +4459,39 @@ zpool_wait_status(zpool_handle_t *zhp, zpool_wait_activity_t activity, return (error); } + +int +zpool_set_bootenv(zpool_handle_t *zhp, const char *envmap) +{ + int error = lzc_set_bootenv(zhp->zpool_name, envmap); + if (error != 0) { + (void) zpool_standard_error_fmt(zhp->zpool_hdl, error, + dgettext(TEXT_DOMAIN, + "error setting bootenv in pool '%s'"), zhp->zpool_name); + } + + return (error); +} + +int +zpool_get_bootenv(zpool_handle_t *zhp, char *outbuf, size_t size, off_t offset) +{ + nvlist_t *nvl = NULL; + int error = lzc_get_bootenv(zhp->zpool_name, &nvl); + if (error != 0) { + (void) zpool_standard_error_fmt(zhp->zpool_hdl, error, + dgettext(TEXT_DOMAIN, + "error getting bootenv in pool '%s'"), zhp->zpool_name); + return (-1); + } + char *envmap = fnvlist_lookup_string(nvl, "envmap"); + if (offset >= strlen(envmap)) { + fnvlist_free(nvl); + return (0); + } + + strlcpy(outbuf, envmap + offset, size); + int bytes = MIN(strlen(envmap + offset), size); + fnvlist_free(nvl); + return (bytes); +} diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c index 4e83b624b261..22996fc9be5f 100644 --- a/lib/libzfs_core/libzfs_core.c +++ b/lib/libzfs_core/libzfs_core.c @@ -1619,3 +1619,25 @@ lzc_wait_fs(const char *fs, zfs_wait_activity_t activity, boolean_t *waited) return (error); } + +/* + * Set the bootenv contents for the given pool. 
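A rough sketch of how a consumer might drive the two new libzfs_core calls
defined just below (hypothetical caller; assumes libzfs_core_init() has
already been called, and the env contents shown are made up):

	#include <stdio.h>
	#include <libzfs_core.h>
	#include <libnvpair.h>

	/* Store a grubenv-style blob in the label and read it back. */
	static int
	bootenv_roundtrip(const char *pool)
	{
		nvlist_t *outnvl = NULL;
		int err;

		/* The payload is raw ASCII text, as GRUB's grubenv expects. */
		err = lzc_set_bootenv(pool, "saved_entry=linux-5.4\n");
		if (err != 0)
			return (err);

		err = lzc_get_bootenv(pool, &outnvl);
		if (err == 0) {
			/* The text comes back under the "envmap" key. */
			(void) printf("%s",
			    fnvlist_lookup_string(outnvl, "envmap"));
			nvlist_free(outnvl);
		}
		return (err);
	}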
+ */ +int +lzc_set_bootenv(const char *pool, const char *env) +{ + nvlist_t *args = fnvlist_alloc(); + fnvlist_add_string(args, "envmap", env); + int error = lzc_ioctl(ZFS_IOC_SET_BOOTENV, pool, args, NULL); + fnvlist_free(args); + return (error); +} + +/* + * Get the contents of the bootenv of the given pool. + */ +int +lzc_get_bootenv(const char *pool, nvlist_t **outnvl) +{ + return (lzc_ioctl(ZFS_IOC_GET_BOOTENV, pool, NULL, outnvl)); +} diff --git a/module/os/freebsd/zfs/vdev_label_os.c b/module/os/freebsd/zfs/vdev_label_os.c index e734a2af8370..97cb201934dc 100644 --- a/module/os/freebsd/zfs/vdev_label_os.c +++ b/module/os/freebsd/zfs/vdev_label_os.c @@ -61,7 +61,7 @@ vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size) retry: zio = zio_root(spa, NULL, NULL, flags); vdev_label_write(zio, vd, 0, pad2, - offsetof(vdev_label_t, vl_pad2), + offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, NULL, NULL, flags); error = zio_wait(zio); if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 59147ce31d36..3c2135029bd0 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome @@ -1554,7 +1554,7 @@ vdev_probe(vdev_t *vd, zio_t *zio) for (int l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, - offsetof(vdev_label_t, vl_pad2)), VDEV_PAD_SIZE, + offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE, abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index aeee6499bfaa..844bca79c9cf 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2017, Intel Corporation. */ @@ -957,7 +957,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) nvlist_t *label; vdev_phys_t *vp; abd_t *vp_abd; - abd_t *pad2; + abd_t *bootenv; uberblock_t *ub; abd_t *ub_abd; zio_t *zio; @@ -1118,8 +1118,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) ub->ub_txg = 0; /* Initialize the 2nd padding area. */ - pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); - abd_zero(pad2, VDEV_PAD_SIZE); + bootenv = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); + abd_zero(bootenv, VDEV_PAD_SIZE); /* * Write everything in parallel. @@ -1138,8 +1138,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) * Zero out the 2nd padding area where it might have * left over data from previous filesystem format. 
*/ - vdev_label_write(zio, vd, l, pad2, - offsetof(vdev_label_t, vl_pad2), + vdev_label_write(zio, vd, l, bootenv, + offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, NULL, NULL, flags); vdev_label_write(zio, vd, l, ub_abd, @@ -1155,7 +1155,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) } nvlist_free(label); - abd_free(pad2); + abd_free(bootenv); abd_free(ub_abd); abd_free(vp_abd); @@ -1178,6 +1178,148 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) return (error); } +/* + * Done callback for vdev_label_read_bootenv_impl. If this is the first + * callback to finish, store our abd in the callback pointer. Otherwise, we + * just free our abd and return. + */ +static void +vdev_label_read_bootenv_done(zio_t *zio) +{ + zio_t *rio = zio->io_private; + abd_t **cbp = rio->io_private; + + ASSERT3U(zio->io_size, ==, VDEV_PAD_SIZE); + + if (zio->io_error == 0) { + mutex_enter(&rio->io_lock); + if (*cbp == NULL) { + /* Will free this buffer in vdev_label_read_bootenv. */ + *cbp = zio->io_abd; + } else { + abd_free(zio->io_abd); + } + mutex_exit(&rio->io_lock); + } else { + abd_free(zio->io_abd); + } +} + +static void +vdev_label_read_bootenv_impl(zio_t *zio, vdev_t *vd, int flags) +{ + for (int c = 0; c < vd->vdev_children; c++) + vdev_label_read_bootenv_impl(zio, vd->vdev_child[c], flags); + + /* + * We just use the first label that has a correct checksum; the + * bootloader should have rewritten them all to be the same on boot, + * and any changes we made since boot have been the same across all + * labels. + * + * While grub supports writing to all four labels, other bootloaders + * don't, so we only use the first two labels to store boot + * information. + */ + if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + for (int l = 0; l < VDEV_LABELS / 2; l++) { + vdev_label_read(zio, vd, l, + abd_alloc_linear(VDEV_PAD_SIZE, B_FALSE), + offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, + vdev_label_read_bootenv_done, zio, flags); + } + } +} + +int +vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *command) +{ + spa_t *spa = rvd->vdev_spa; + abd_t *abd = NULL; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; + + ASSERT(command); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + zio_t *zio = zio_root(spa, NULL, &abd, flags); + vdev_label_read_bootenv_impl(zio, rvd, flags); + int err = zio_wait(zio); + + if (abd != NULL) { + vdev_boot_envblock_t *vbe = abd_to_buf(abd); + if (vbe->vbe_version != VB_RAW) { + abd_free(abd); + return (SET_ERROR(ENOTSUP)); + } + vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0'; + fnvlist_add_string(command, "envmap", vbe->vbe_bootenv); + /* abd was allocated in vdev_label_read_bootenv_impl() */ + abd_free(abd); + /* If we managed to read any successfully, return success. */ + return (0); + } + return (err); +} + +int +vdev_label_write_bootenv(vdev_t *vd, char *envmap) +{ + zio_t *zio; + spa_t *spa = vd->vdev_spa; + vdev_boot_envblock_t *bootenv; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + int error = ENXIO; + + if (strlen(envmap) >= sizeof (bootenv->vbe_bootenv)) { + return (SET_ERROR(E2BIG)); + } + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + for (int c = 0; c < vd->vdev_children; c++) { + int child_err = vdev_label_write_bootenv(vd->vdev_child[c], + envmap); + /* + * As long as any of the disks managed to write all of their + * labels successfully, return success. 
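For scale, the capacity enforced by the E2BIG check above follows directly
from the vdev_boot_envblock_t definition earlier in the patch; a quick
self-contained check (8 KiB pad assumed, and zio_eck_t assumed to be five
64-bit words, i.e. 40 bytes):

	#include <stdio.h>
	#include <stdint.h>

	#define	VDEV_PAD_SIZE	(8 << 10)		/* 8 KiB envblock */
	#define	ZIO_ECK_SIZE	(5 * sizeof (uint64_t))	/* assumed 40 bytes */

	int
	main(void)
	{
		size_t cap = VDEV_PAD_SIZE - sizeof (uint64_t) - ZIO_ECK_SIZE;

		/* 8192 - 8 - 40 = 8144 bytes for vbe_bootenv, NUL included. */
		(void) printf("vbe_bootenv capacity: %zu bytes\n", cap);
		return (0);
	}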
+ */ + if (child_err == 0) + error = child_err; + } + + if (!vd->vdev_ops->vdev_op_leaf || vdev_is_dead(vd) || + !vdev_writeable(vd)) { + return (error); + } + ASSERT3U(sizeof (*bootenv), ==, VDEV_PAD_SIZE); + abd_t *abd = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); + abd_zero(abd, VDEV_PAD_SIZE); + bootenv = abd_borrow_buf_copy(abd, VDEV_PAD_SIZE); + + char *buf = bootenv->vbe_bootenv; + (void) strlcpy(buf, envmap, sizeof (bootenv->vbe_bootenv)); + bootenv->vbe_version = VB_RAW; + abd_return_buf_copy(abd, bootenv, VDEV_PAD_SIZE); + +retry: + zio = zio_root(spa, NULL, NULL, flags); + for (int l = 0; l < VDEV_LABELS / 2; l++) { + vdev_label_write(zio, vd, l, abd, + offsetof(vdev_label_t, vl_be), + VDEV_PAD_SIZE, NULL, NULL, flags); + } + + error = zio_wait(zio); + if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { + flags |= ZIO_FLAG_TRYHARD; + goto retry; + } + + abd_free(abd); + return (error); +} + /* * ========================================================================== * uberblock load/sync diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 2104bef714c2..d55ce20ef76f 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -3512,6 +3512,58 @@ zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) return (error); } +/* + * This ioctl is used to set the bootenv configuration on the current + * pool. This configuration is stored in the second padding area of the label, + * and it is used by the GRUB bootloader used on Linux to store the contents + * of the grubenv file. The file is stored as raw ASCII, and is protected by + * an embedded checksum. By default, GRUB will check if the boot filesystem + * supports storing the environment data in a special location, and if so, + * will invoke filesystem specific logic to retrieve it. This can be overriden + * by a variable, should the user so desire. + */ +/* ARGSUSED */ +static const zfs_ioc_key_t zfs_keys_set_bootenv[] = { + {"envmap", DATA_TYPE_STRING, 0}, +}; + +static int +zfs_ioc_set_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) +{ + char *envmap; + int error; + spa_t *spa; + + envmap = fnvlist_lookup_string(innvl, "envmap"); + if ((error = spa_open(name, &spa, FTAG)) != 0) + return (error); + spa_vdev_state_enter(spa, SCL_ALL); + error = vdev_label_write_bootenv(spa->spa_root_vdev, envmap); + (void) spa_vdev_state_exit(spa, NULL, 0); + spa_close(spa, FTAG); + return (error); +} + +static const zfs_ioc_key_t zfs_keys_get_bootenv[] = { + /* no nvl keys */ +}; + +/* ARGSUSED */ +static int +zfs_ioc_get_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) +{ + spa_t *spa; + int error; + + if ((error = spa_open(name, &spa, FTAG)) != 0) + return (error); + spa_vdev_state_enter(spa, SCL_ALL); + error = vdev_label_read_bootenv(spa->spa_root_vdev, outnvl); + (void) spa_vdev_state_exit(spa, NULL, 0); + spa_close(spa, FTAG); + return (error); +} + /* * The dp_config_rwlock must not be held when calling this, because the * unmount may need to write out data. 
@@ -6981,6 +7033,16 @@ zfs_ioctl_init(void) POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_fs_wait, ARRAY_SIZE(zfs_keys_fs_wait)); + zfs_ioctl_register("set_bootenv", ZFS_IOC_SET_BOOTENV, + zfs_ioc_set_bootenv, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, + zfs_keys_set_bootenv, ARRAY_SIZE(zfs_keys_set_bootenv)); + + zfs_ioctl_register("get_bootenv", ZFS_IOC_GET_BOOTENV, + zfs_ioc_get_bootenv, zfs_secpolicy_none, POOL_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, B_TRUE, + zfs_keys_get_bootenv, ARRAY_SIZE(zfs_keys_get_bootenv)); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, diff --git a/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c b/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c index 3f6147509fc9..00924dda9158 100644 --- a/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c +++ b/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c @@ -751,6 +751,24 @@ test_wait_fs(const char *dataset) nvlist_free(required); } +static void +test_get_bootenv(const char *pool) +{ + IOC_INPUT_TEST(ZFS_IOC_GET_BOOTENV, pool, NULL, NULL, 0); +} + +static void +test_set_bootenv(const char *pool) +{ + nvlist_t *required = fnvlist_alloc(); + + fnvlist_add_string(required, "envmap", "test"); + + IOC_INPUT_TEST(ZFS_IOC_SET_BOOTENV, pool, required, NULL, 0); + + nvlist_free(required); +} + static void zfs_ioc_input_tests(const char *pool) { @@ -840,6 +858,9 @@ zfs_ioc_input_tests(const char *pool) test_wait(pool); test_wait_fs(dataset); + test_set_bootenv(pool); + test_get_bootenv(pool); + /* * cleanup */ @@ -1000,6 +1021,8 @@ validate_ioc_values(void) CHECK(ZFS_IOC_PLATFORM_BASE + 4 == ZFS_IOC_NEXTBOOT); CHECK(ZFS_IOC_PLATFORM_BASE + 5 == ZFS_IOC_JAIL); CHECK(ZFS_IOC_PLATFORM_BASE + 6 == ZFS_IOC_UNJAIL); + CHECK(ZFS_IOC_PLATFORM_BASE + 7 == ZFS_IOC_SET_BOOTENV); + CHECK(ZFS_IOC_PLATFORM_BASE + 8 == ZFS_IOC_GET_BOOTENV); #undef CHECK From 657fd33bcff17e44ad55dffdf294d7c107b4bf5d Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Thu, 7 May 2020 19:34:03 -0400 Subject: [PATCH 05/27] Improvements on persistent L2ARC Functional changes: We implement refcounts of log blocks and their aligned size on the cache device along with two corresponding arcstats. The refcounts are reflected in the header of the device and provide valuable information as to whether log blocks are accounted for correctly. These are dynamically adjusted as log blocks are committed/evicted. zdb also uses this information in the device header and compares it to the corresponding values as reported by dump_l2arc_log_blocks() which emulates l2arc_rebuild(). If the refcounts saved in the device header report higher values, zdb exits with an error. For this feature to work correctly there should be no active writes on the device. This is also employed in the tests of persistent L2ARC. We extend the structure of the cache device header by adding the two new variables mirroring the refcounts after the existing variables to preserve backward compatibility in terms of persistent L2ARC. 1) a new arcstat "l2_log_blk_asize" and refcount "l2ad_lb_asize" which reflect the total aligned size of log blocks on the device. This is also reflected in the header of the cache device as "dh_lb_asize". 2) a new arcstat "l2arc_log_blk_count" and refcount "l2ad_lb_count" which reflect the total number of L2ARC log blocks present on cache devices. 
It is also reflected in the header of the cache device as "dh_lb_count". In l2arc_rebuild_vdev() if the amount of committed log entries in a log block is 0 and the device header is valid we update the device header. This will facilitate trimming of the whole device in this case when TRIM for L2ARC is implemented. Improve loop protection in l2arc_rebuild() by using the starting offset of the payload of each log block instead of the starting offset of the log block. If the zio in l2arc_write_buffers() fails, restore the lbps array in the header of the device to its previous state in l2arc_write_done(). If l2arc_rebuild() ends the rebuild process without restoring any L2ARC log blocks in ARC and without any other error, this means that the lbps array in the header is pointing to non-existent or invalid log blocks. Reset the device header in this case. In l2arc_rebuild() change the zfs_dbgmsg messages to spa_history_log_internal() making them user visible with zpool history command. Non-functional changes: Make the first test in persistent L2ARC use `zdb -lll` to increase coverage in `zdb.c`. Rename psize with asize when referring to log blocks, since L2ARC_SET_PSIZE stores the vdev aligned size for log blocks. Also rename dh_log_blk_entries to dh_log_entries to make it clear that it is a mirror of l2ad_log_entries. Added comments for both changes. Fix inaccurate comments for example in l2arc_log_blk_restore(). Add asserts at the end in l2arc_evict() and l2arc_write_buffers(). Reviewed-by: Brian Behlendorf Signed-off-by: George Amanakis Closes #10228 --- cmd/zdb/zdb.c | 123 ++++++--- include/sys/arc_impl.h | 40 ++- man/man8/zdb.8 | 5 +- module/zfs/arc.c | 234 ++++++++++++------ .../persist_l2arc/persist_l2arc_001_pos.ksh | 2 +- 5 files changed, 278 insertions(+), 126 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index f4b4b454b44d..00258799bb04 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -3493,12 +3493,13 @@ print_l2arc_log_blocks(void) static void dump_l2arc_log_entries(uint64_t log_entries, - l2arc_log_ent_phys_t *le, int i) + l2arc_log_ent_phys_t *le, uint64_t i) { for (int j = 0; j < log_entries; j++) { dva_t dva = le[j].le_dva; - (void) printf("lb[%4d]\tle[%4d]\tDVA asize: %llu, " - "vdev: %llu, offset: %llu\n", i, j + 1, + (void) printf("lb[%4llu]\tle[%4d]\tDVA asize: %llu, " + "vdev: %llu, offset: %llu\n", + (u_longlong_t)i, j + 1, (u_longlong_t)DVA_GET_ASIZE(&dva), (u_longlong_t)DVA_GET_VDEV(&dva), (u_longlong_t)DVA_GET_OFFSET(&dva)); @@ -3533,7 +3534,7 @@ dump_l2arc_log_blkptr(l2arc_log_blkptr_t lbps) (u_longlong_t)lbps.lbp_payload_start); (void) printf("|\t\tlsize: %llu\n", (u_longlong_t)L2BLK_GET_LSIZE((&lbps)->lbp_prop)); - (void) printf("|\t\tpsize: %llu\n", + (void) printf("|\t\tasize: %llu\n", (u_longlong_t)L2BLK_GET_PSIZE((&lbps)->lbp_prop)); (void) printf("|\t\tcompralgo: %llu\n", (u_longlong_t)L2BLK_GET_COMPRESS((&lbps)->lbp_prop)); @@ -3543,17 +3544,19 @@ dump_l2arc_log_blkptr(l2arc_log_blkptr_t lbps) } static void -dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr) +dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr, + l2arc_dev_hdr_phys_t *rebuild) { l2arc_log_blk_phys_t this_lb; - uint64_t psize; + uint64_t asize; l2arc_log_blkptr_t lbps[2]; abd_t *abd; zio_cksum_t cksum; - int i = 0, failed = 0; + int failed = 0; l2arc_dev_t dev; - print_l2arc_log_blocks(); + if (!dump_opt['q']) + print_l2arc_log_blocks(); bcopy((&l2dhdr)->dh_start_lbps, lbps, sizeof (lbps)); dev.l2ad_evict = l2dhdr.dh_evict; @@ -3562,8 +3565,10 @@ dump_l2arc_log_blocks(int 
fd, l2arc_dev_hdr_phys_t l2dhdr) if (l2dhdr.dh_start_lbps[0].lbp_daddr == 0) { /* no log blocks to read */ - (void) printf("No log blocks to read\n"); - (void) printf("\n"); + if (!dump_opt['q']) { + (void) printf("No log blocks to read\n"); + (void) printf("\n"); + } return; } else { dev.l2ad_hand = lbps[0].lbp_daddr + @@ -3576,17 +3581,23 @@ dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr) if (!l2arc_log_blkptr_valid(&dev, &lbps[0])) break; - psize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); - if (pread64(fd, &this_lb, psize, lbps[0].lbp_daddr) != psize) { - (void) printf("Error while reading next log block\n\n"); + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); + if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) { + if (!dump_opt['q']) { + (void) printf("Error while reading next log " + "block\n\n"); + } break; } - fletcher_4_native_varsize(&this_lb, psize, &cksum); + fletcher_4_native_varsize(&this_lb, asize, &cksum); if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) { failed++; - (void) printf("Invalid cksum\n"); - dump_l2arc_log_blkptr(lbps[0]); + if (!dump_opt['q']) { + (void) printf("Invalid cksum\n"); + dump_l2arc_log_blkptr(lbps[0]); + } break; } @@ -3594,11 +3605,11 @@ dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr) case ZIO_COMPRESS_OFF: break; case ZIO_COMPRESS_LZ4: - abd = abd_alloc_for_io(psize, B_TRUE); - abd_copy_from_buf_off(abd, &this_lb, 0, psize); + abd = abd_alloc_for_io(asize, B_TRUE); + abd_copy_from_buf_off(abd, &this_lb, 0, asize); zio_decompress_data(L2BLK_GET_COMPRESS( (&lbps[0])->lbp_prop), abd, &this_lb, - psize, sizeof (this_lb)); + asize, sizeof (this_lb)); abd_free(abd); break; default: @@ -3608,39 +3619,52 @@ dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr) if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) byteswap_uint64_array(&this_lb, sizeof (this_lb)); if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) { - (void) printf("Invalid log block magic\n\n"); + if (!dump_opt['q']) + (void) printf("Invalid log block magic\n\n"); break; } - i++; - if (dump_opt['l'] > 1) { - (void) printf("lb[%4d]\tmagic: %llu\n", i, + rebuild->dh_lb_count++; + rebuild->dh_lb_asize += asize; + if (dump_opt['l'] > 1 && !dump_opt['q']) { + (void) printf("lb[%4llu]\tmagic: %llu\n", + (u_longlong_t)rebuild->dh_lb_count, (u_longlong_t)this_lb.lb_magic); dump_l2arc_log_blkptr(lbps[0]); } - if (dump_opt['l'] > 2) - dump_l2arc_log_entries(l2dhdr.dh_log_blk_ent, - this_lb.lb_entries, i); + if (dump_opt['l'] > 2 && !dump_opt['q']) + dump_l2arc_log_entries(l2dhdr.dh_log_entries, + this_lb.lb_entries, + rebuild->dh_lb_count); - if (l2arc_range_check_overlap(lbps[1].lbp_daddr, - lbps[0].lbp_daddr, dev.l2ad_evict) && !dev.l2ad_first) + if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, + lbps[0].lbp_payload_start, dev.l2ad_evict) && + !dev.l2ad_first) break; lbps[0] = lbps[1]; lbps[1] = this_lb.lb_prev_lbp; } - (void) printf("log_blk_count:\t %d with valid cksum\n", i); - (void) printf("\t\t %d with invalid cksum\n\n", failed); + if (!dump_opt['q']) { + (void) printf("log_blk_count:\t %llu with valid cksum\n", + (u_longlong_t)rebuild->dh_lb_count); + (void) printf("\t\t %d with invalid cksum\n", failed); + (void) printf("log_blk_asize:\t %llu\n\n", + (u_longlong_t)rebuild->dh_lb_asize); + } } -static void +static int dump_l2arc_header(int fd) { - l2arc_dev_hdr_phys_t l2dhdr; + l2arc_dev_hdr_phys_t l2dhdr, rebuild; int error = B_FALSE; + bzero(&l2dhdr, sizeof (l2dhdr)); + bzero(&rebuild, 
sizeof (rebuild)); + if (pread64(fd, &l2dhdr, sizeof (l2dhdr), VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) { error = B_TRUE; @@ -3654,6 +3678,8 @@ dump_l2arc_header(int fd) if (error) { (void) printf("L2ARC device header not found\n\n"); + /* Do not return an error here for backward compatibility */ + return (0); } else if (!dump_opt['q']) { print_l2arc_header(); @@ -3672,16 +3698,39 @@ dump_l2arc_header(int fd) (u_longlong_t) l2dhdr.dh_start_lbps[1].lbp_daddr); (void) printf(" log_blk_ent: %llu\n", - (u_longlong_t)l2dhdr.dh_log_blk_ent); + (u_longlong_t)l2dhdr.dh_log_entries); (void) printf(" start: %llu\n", (u_longlong_t)l2dhdr.dh_start); (void) printf(" end: %llu\n", (u_longlong_t)l2dhdr.dh_end); - (void) printf(" evict: %llu\n\n", + (void) printf(" evict: %llu\n", (u_longlong_t)l2dhdr.dh_evict); - - dump_l2arc_log_blocks(fd, l2dhdr); + (void) printf(" lb_asize_refcount: %llu\n", + (u_longlong_t)l2dhdr.dh_lb_asize); + (void) printf(" lb_count_refcount: %llu\n\n", + (u_longlong_t)l2dhdr.dh_lb_count); } + + dump_l2arc_log_blocks(fd, l2dhdr, &rebuild); + /* + * The total aligned size of log blocks and the number of log blocks + * reported in the header of the device may be less than what zdb + * reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild(). + * This happens because dump_l2arc_log_blocks() lacks the memory + * pressure valve that l2arc_rebuild() has. Thus, if we are on a system + * with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize + * and dh_lb_count will be lower to begin with than what exists on the + * device. This is normal and zdb should not exit with an error. The + * opposite case should never happen though, the values reported in the + * header should never be higher than what dump_l2arc_log_blocks() and + * l2arc_rebuild() report. If this happens there is a leak in the + * accounting of log blocks. + */ + if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize || + l2dhdr.dh_lb_count > rebuild.dh_lb_count) + return (1); + + return (0); } static void @@ -4009,7 +4058,7 @@ dump_label(const char *dev) * Dump the L2ARC header, if existent. */ if (read_l2arc_header) - dump_l2arc_header(fd); + error |= dump_l2arc_header(fd); cookie = NULL; while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL) diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index 928b7232556f..e8c944ce8369 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -200,7 +200,7 @@ typedef struct l2arc_log_blkptr { /* * lbp_prop has the following format: * * logical size (in bytes) - * * physical (compressed) size (in bytes) + * * aligned (after compression) size (in bytes) * * compression algorithm (we always LZ4-compress l2arc logs) * * checksum algorithm (used for lbp_cksum) */ @@ -221,22 +221,26 @@ typedef struct l2arc_dev_hdr_phys { */ uint64_t dh_spa_guid; uint64_t dh_vdev_guid; - uint64_t dh_log_blk_ent; /* entries per log blk */ + uint64_t dh_log_entries; /* mirror of l2ad_log_entries */ uint64_t dh_evict; /* evicted offset in bytes */ uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */ /* * Used in zdb.c for determining if a log block is valid, in the same * way that l2arc_rebuild() does. */ - uint64_t dh_start; - uint64_t dh_end; - + uint64_t dh_start; /* mirror of l2ad_start */ + uint64_t dh_end; /* mirror of l2ad_end */ /* * Start of log block chain. [0] -> newest log, [1] -> one older (used * for initiating prefetch). 
*/ l2arc_log_blkptr_t dh_start_lbps[2]; - const uint64_t dh_pad[34]; /* pad to 512 bytes */ + /* + * Aligned size of all log blocks as accounted by vdev_space_update(). + */ + uint64_t dh_lb_asize; /* mirror of l2ad_lb_asize */ + uint64_t dh_lb_count; /* mirror of l2ad_lb_count */ + const uint64_t dh_pad[32]; /* pad to 512 bytes */ zio_eck_t dh_tail; } l2arc_dev_hdr_phys_t; CTASSERT_GLOBAL(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE); @@ -387,6 +391,14 @@ typedef struct l2arc_dev { uint64_t l2ad_evict; /* evicted offset in bytes */ /* List of pointers to log blocks present in the L2ARC device */ list_t l2ad_lbptr_list; + /* + * Aligned size of all log blocks as accounted by vdev_space_update(). + */ + zfs_refcount_t l2ad_lb_asize; + /* + * Number of log blocks present on the device. + */ + zfs_refcount_t l2ad_lb_count; } l2arc_dev_t; /* @@ -738,14 +750,18 @@ typedef struct arc_stats { */ kstat_named_t arcstat_l2_log_blk_writes; /* - * Moving average of the physical size of the L2ARC log blocks, in + * Moving average of the aligned size of the L2ARC log blocks, in * bytes. Updated during L2ARC rebuild and during writing of L2ARC * log blocks. */ - kstat_named_t arcstat_l2_log_blk_avg_size; + kstat_named_t arcstat_l2_log_blk_avg_asize; + /* Aligned size of L2ARC log blocks on L2ARC devices. */ + kstat_named_t arcstat_l2_log_blk_asize; + /* Number of L2ARC log blocks present on L2ARC devices. */ + kstat_named_t arcstat_l2_log_blk_count; /* - * Moving average of the physical size of L2ARC restored data, in bytes, - * to the physical size of their metadata in ARC, in bytes. + * Moving average of the aligned size of L2ARC restored data, in bytes, + * to the aligned size of their metadata in L2ARC, in bytes. * Updated during L2ARC rebuild and during writing of L2ARC log blocks. */ kstat_named_t arcstat_l2_data_to_meta_ratio; @@ -780,6 +796,8 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_rebuild_abort_lowmem; /* Logical size of L2ARC restored data, in bytes. */ kstat_named_t arcstat_l2_rebuild_size; + /* Aligned size of L2ARC restored data, in bytes. */ + kstat_named_t arcstat_l2_rebuild_asize; /* * Number of L2ARC log entries (buffers) that were successfully * restored in ARC. @@ -790,8 +808,6 @@ typedef struct arc_stats { * were not restored again. */ kstat_named_t arcstat_l2_rebuild_bufs_precached; - /* Physical size of L2ARC restored data, in bytes. */ - kstat_named_t arcstat_l2_rebuild_psize; /* * Number of L2ARC log blocks that were restored successfully. Each * log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers. diff --git a/man/man8/zdb.8 b/man/man8/zdb.8 index 3915be3f8ef9..01ad95b3e900 100644 --- a/man/man8/zdb.8 +++ b/man/man8/zdb.8 @@ -216,7 +216,10 @@ Read the vdev labels and L2ARC header from the specified device. .Nm Fl l will return 0 if valid label was found, 1 if error occurred, and 2 if no valid labels were found. The presence of L2ARC header is indicated by a specific -sequence (L2ARC_DEV_HDR_MAGIC). Each unique configuration is displayed only +sequence (L2ARC_DEV_HDR_MAGIC). If there is an accounting error in the size +or the number of L2ARC log blocks +.Nm Fl l +will return 1. Each unique configuration is displayed only once. .It Fl ll Ar device In addition display label space usage stats. 
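Looping back to the l2arc_dev_hdr_phys_t changes above: a back-of-the-envelope
check that the reshaped header still fills exactly one 512-byte block, since
the two new refcount mirrors displace exactly two pad words. All sizes here
are assumptions for illustration (including the dh_magic and dh_version words
that precede the excerpt, 64 bytes per l2arc_log_blkptr_t, and a 40-byte
zio_eck_t):

	#include <assert.h>
	#include <stdio.h>

	int
	main(void)
	{
		size_t u64s = 9 * 8;	/* dh_magic through dh_end, assumed */
		size_t lbps = 2 * 64;	/* dh_start_lbps[2], assumed */
		size_t mirrors = 2 * 8;	/* dh_lb_asize + dh_lb_count */
		size_t pad = 32 * 8;	/* dh_pad[32], down from 34 */
		size_t tail = 40;	/* zio_eck_t dh_tail, assumed */

		/* Mirrors the CTASSERT against SPA_MINBLOCKSIZE. */
		assert(u64s + lbps + mirrors + pad + tail == 512);
		(void) printf("l2arc_dev_hdr_phys_t: 512 bytes\n");
		return (0);
	}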
If a valid L2ARC header was found diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 5b34d4abd4ac..a6b739ec377a 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -530,7 +530,9 @@ arc_stats_t arc_stats = { { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, { "l2_log_blk_writes", KSTAT_DATA_UINT64 }, - { "l2_log_blk_avg_size", KSTAT_DATA_UINT64 }, + { "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 }, + { "l2_log_blk_asize", KSTAT_DATA_UINT64 }, + { "l2_log_blk_count", KSTAT_DATA_UINT64 }, { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 }, { "l2_rebuild_success", KSTAT_DATA_UINT64 }, { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 }, @@ -539,9 +541,9 @@ arc_stats_t arc_stats = { { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 }, { "l2_rebuild_size", KSTAT_DATA_UINT64 }, + { "l2_rebuild_asize", KSTAT_DATA_UINT64 }, { "l2_rebuild_bufs", KSTAT_DATA_UINT64 }, { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 }, - { "l2_rebuild_psize", KSTAT_DATA_UINT64 }, { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "memory_direct_count", KSTAT_DATA_UINT64 }, @@ -895,7 +897,7 @@ static void l2arc_log_blk_fetch_abort(zio_t *zio); /* L2ARC persistence block restoration routines. */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, - const l2arc_log_blk_phys_t *lb, uint64_t lb_psize, uint64_t lb_daddr); + const l2arc_log_blk_phys_t *lb, uint64_t lb_asize, uint64_t lb_daddr); static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev); @@ -7864,6 +7866,7 @@ l2arc_write_done(zio_t *zio) l2arc_lb_abd_buf_t *abd_buf; l2arc_lb_ptr_buf_t *lb_ptr_buf; l2arc_dev_t *dev; + l2arc_dev_hdr_phys_t *l2dhdr; list_t *buflist; arc_buf_hdr_t *head, *hdr, *hdr_prev; kmutex_t *hash_lock; @@ -7872,6 +7875,7 @@ l2arc_write_done(zio_t *zio) cb = zio->io_private; ASSERT3P(cb, !=, NULL); dev = cb->l2wcb_dev; + l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev, !=, NULL); head = cb->l2wcb_head; ASSERT3P(head, !=, NULL); @@ -7975,8 +7979,18 @@ l2arc_write_done(zio_t *zio) zio_buf_free(abd_buf, sizeof (*abd_buf)); if (zio->io_error != 0) { lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list); - bytes_dropped += + /* + * L2BLK_GET_PSIZE returns aligned size for log + * blocks. + */ + uint64_t asize = L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop); + bytes_dropped += asize; + ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); + ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); + zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, + lb_ptr_buf); + zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); @@ -7984,6 +7998,17 @@ l2arc_write_done(zio_t *zio) } list_destroy(&cb->l2wcb_abd_list); + if (zio->io_error != 0) { + /* restore the lbps array in the header to its previous state */ + lb_ptr_buf = list_head(&dev->l2ad_lbptr_list); + for (int i = 0; i < 2; i++) { + bcopy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[i], + sizeof (l2arc_log_blkptr_t)); + lb_ptr_buf = list_next(&dev->l2ad_lbptr_list, + lb_ptr_buf); + } + } + atomic_inc_64(&l2arc_writes_done); list_remove(buflist, head); ASSERT(!HDR_HAS_L1HDR(head)); @@ -8277,21 +8302,21 @@ l2arc_sublist_lock(int list_num) /* * Calculates the maximum overhead of L2ARC metadata log blocks for a given - * L2ARC write size. l2arc_evict and l2arc_write_buffers need to include this + * L2ARC write size. 
l2arc_evict and l2arc_write_size need to include this * overhead in processing to make sure there is enough headroom available * when writing buffers. */ static inline uint64_t l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev) { - if (dev->l2ad_dev_hdr->dh_log_blk_ent == 0) { + if (dev->l2ad_log_entries == 0) { return (0); } else { uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT; uint64_t log_blocks = (log_entries + - dev->l2ad_dev_hdr->dh_log_blk_ent - 1) / - dev->l2ad_dev_hdr->dh_log_blk_ent; + dev->l2ad_log_entries - 1) / + dev->l2ad_log_entries; return (vdev_psize_to_asize(dev->l2ad_vdev, sizeof (l2arc_log_blk_phys_t)) * log_blocks); @@ -8373,17 +8398,24 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf); + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + uint64_t asize = L2BLK_GET_PSIZE( + (lb_ptr_buf->lb_ptr)->lbp_prop); + /* * We don't worry about log blocks left behind (ie - * lbp_daddr + psize < l2ad_hand) because l2arc_write_buffers() + * lbp_payload_start < l2ad_hand) because l2arc_write_buffers() * will never write more than l2arc_evict() evicts. */ if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) { break; } else { - vdev_space_update(dev->l2ad_vdev, - -L2BLK_GET_PSIZE( - (lb_ptr_buf->lb_ptr)->lbp_prop), 0, 0); + vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); + ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); + ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); + zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, + lb_ptr_buf); + zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); @@ -8475,6 +8507,10 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) dev->l2ad_first = B_FALSE; goto top; } + + ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); + if (!dev->l2ad_first) + ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict); } /* @@ -8777,6 +8813,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; + /* + * Create a list to save allocated abd buffers + * for l2arc_log_blk_commit(). + */ list_create(&cb->l2wcb_abd_list, sizeof (l2arc_lb_abd_buf_t), offsetof(l2arc_lb_abd_buf_t, node)); @@ -8846,6 +8886,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) return (0); } + if (!dev->l2ad_first) + ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); + ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); @@ -9036,6 +9079,8 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); zfs_refcount_create(&adddev->l2ad_alloc); + zfs_refcount_create(&adddev->l2ad_lb_asize); + zfs_refcount_create(&adddev->l2ad_lb_count); /* * Add device to global list @@ -9059,7 +9104,7 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) uint64_t l2dhdr_asize; spa_t *spa; int err; - boolean_t rebuild = B_TRUE; + boolean_t l2dhdr_valid = B_TRUE; dev = l2arc_vdev_get(vd); ASSERT3P(dev, !=, NULL); @@ -9089,9 +9134,9 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) * Read the device header, if an error is returned do not rebuild L2ARC. 
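To put numbers on l2arc_log_blk_overhead() from earlier in this hunk: an
8 MiB write budgets one log entry per 512-byte minimum-sized block, and the
entries are packed into log blocks with the round-up division shown above
(1022 entries per log block is assumed here for illustration; the real value
comes from l2ad_log_entries):

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint64_t write_sz = 8 << 20;		/* 8 MiB */
		uint64_t log_entries = write_sz >> 9;	/* SPA_MINBLOCKSHIFT */
		uint64_t per_blk = 1022;		/* assumed */
		uint64_t log_blocks = (log_entries + per_blk - 1) / per_blk;

		/* 16384 entries -> 17 log blocks of headroom. */
		(void) printf("%llu\n", (unsigned long long)log_blocks);
		return (0);
	}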
*/ if ((err = l2arc_dev_hdr_read(dev)) != 0) - rebuild = B_FALSE; + l2dhdr_valid = B_FALSE; - if (rebuild && l2dhdr->dh_log_blk_ent > 0) { + if (l2dhdr_valid && dev->l2ad_log_entries > 0) { /* * If we are onlining a cache device (vdev_reopen) that was * still present (l2arc_vdev_present()) and rebuild is enabled, @@ -9117,12 +9162,10 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) * async task which will call l2arc_spa_rebuild_start. */ dev->l2ad_rebuild = B_TRUE; - } else if (!rebuild && spa_writeable(spa)) { + } else if (spa_writeable(spa)) { /* - * The boolean rebuild is false if reading the device header - * returned an error. In this case create a new header. We - * zero out the memory holding the header to reset - * dh_start_lbps. + * In this case create a new header. We zero out the memory + * holding the header to reset dh_start_lbps. */ bzero(l2dhdr, l2dhdr_asize); l2arc_dev_hdr_update(dev); @@ -9172,6 +9215,8 @@ l2arc_remove_vdev(vdev_t *vd) list_destroy(&remdev->l2ad_lbptr_list); mutex_destroy(&remdev->l2ad_mtx); zfs_refcount_destroy(&remdev->l2ad_alloc); + zfs_refcount_destroy(&remdev->l2ad_lb_asize); + zfs_refcount_destroy(&remdev->l2ad_lb_count); kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); vmem_free(remdev, sizeof (l2arc_dev_t)); } @@ -9309,7 +9354,7 @@ l2arc_rebuild(l2arc_dev_t *dev) { vdev_t *vd = dev->l2ad_vdev; spa_t *spa = vd->vdev_spa; - int i = 0, err = 0; + int err = 0; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; l2arc_log_blk_phys_t *this_lb, *next_lb; zio_t *this_io = NULL, *next_io = NULL; @@ -9332,6 +9377,7 @@ l2arc_rebuild(l2arc_dev_t *dev) /* * Retrieve the persistent L2ARC device state. + * L2BLK_GET_PSIZE returns aligned size for log blocks. */ dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start); dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr + @@ -9381,11 +9427,10 @@ l2arc_rebuild(l2arc_dev_t *dev) /* * Now that we know that the next_lb checks out alright, we * can start reconstruction from this log block. + * L2BLK_GET_PSIZE returns aligned size for log blocks. */ - l2arc_log_blk_restore(dev, this_lb, - L2BLK_GET_PSIZE((&lbps[0])->lbp_prop), - lbps[0].lbp_daddr); - i++; + uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); + l2arc_log_blk_restore(dev, this_lb, asize, lbps[0].lbp_daddr); /* * log block restored, include its pointer in the list of @@ -9398,9 +9443,12 @@ l2arc_rebuild(l2arc_dev_t *dev) sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf); + ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); + ARCSTAT_BUMP(arcstat_l2_log_blk_count); + zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); + zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); - vdev_space_update(vd, - L2BLK_GET_PSIZE((&lbps[0])->lbp_prop), 0, 0); + vdev_space_update(vd, asize, 0, 0); /* * Protection against loops of log blocks: @@ -9417,13 +9465,16 @@ l2arc_rebuild(l2arc_dev_t *dev) * l2arc_log_blkptr_valid() but the log block should not be * restored as it is overwritten by the payload of log block * (0). Only log blocks (0)-(3) should be restored. We check - * whether l2ad_evict lies in between the next log block - * offset (lbps[1].lbp_daddr) and the present log block offset - * (lbps[0].lbp_daddr). If true and this isn't the first pass, - * we are looping from the beginning and we should stop. 
+ * whether l2ad_evict lies in between the payload starting + * offset of the next log block (lbps[1].lbp_payload_start) + * and the payload starting offset of the present log block + * (lbps[0].lbp_payload_start). If true and this isn't the + * first pass, we are looping from the beginning and we should + * stop. */ - if (l2arc_range_check_overlap(lbps[1].lbp_daddr, - lbps[0].lbp_daddr, dev->l2ad_evict) && !dev->l2ad_first) + if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, + lbps[0].lbp_payload_start, dev->l2ad_evict) && + !dev->l2ad_first) goto out; for (;;) { @@ -9470,14 +9521,27 @@ l2arc_rebuild(l2arc_dev_t *dev) vmem_free(next_lb, sizeof (*next_lb)); if (!l2arc_rebuild_enabled) { - zfs_dbgmsg("L2ARC rebuild disabled"); - } else if (err == 0 && i > 0) { + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "disabled"); + } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_success); - zfs_dbgmsg("L2ARC successfully rebuilt, " - "restored %d blocks", i); + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "successful, restored %llu blocks", + (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); + } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) { + /* + * No error but also nothing restored, meaning the lbps array + * in the device header points to invalid/non-present log + * blocks. Reset the header. + */ + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "no valid log blocks"); + bzero(l2dhdr, dev->l2ad_dev_hdr_asize); + l2arc_dev_hdr_update(dev); } else if (err != 0) { - zfs_dbgmsg("L2ARC rebuild aborted, " - "restored %d blocks", i); + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "aborted, restored %llu blocks", + (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } if (lock_held) @@ -9527,7 +9591,7 @@ l2arc_dev_hdr_read(l2arc_dev_t *dev) l2dhdr->dh_spa_guid != guid || l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid || l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION || - l2dhdr->dh_log_blk_ent != dev->l2ad_log_entries || + l2dhdr->dh_log_entries != dev->l2ad_log_entries || l2dhdr->dh_end != dev->l2ad_end || !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end, l2dhdr->dh_evict)) { @@ -9578,7 +9642,7 @@ l2arc_log_blk_read(l2arc_dev_t *dev, int err = 0; zio_cksum_t cksum; abd_t *abd = NULL; - uint64_t psize; + uint64_t asize; ASSERT(this_lbp != NULL && next_lbp != NULL); ASSERT(this_lb != NULL && next_lb != NULL); @@ -9616,9 +9680,12 @@ l2arc_log_blk_read(l2arc_dev_t *dev, goto cleanup; } - /* Make sure the buffer checks out */ - psize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop); - fletcher_4_native(this_lb, psize, NULL, &cksum); + /* + * Make sure the buffer checks out. + * L2BLK_GET_PSIZE returns aligned size for log blocks. 
+ */ + asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop); + fletcher_4_native(this_lb, asize, NULL, &cksum); if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors); zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, " @@ -9634,11 +9701,11 @@ l2arc_log_blk_read(l2arc_dev_t *dev, case ZIO_COMPRESS_OFF: break; case ZIO_COMPRESS_LZ4: - abd = abd_alloc_for_io(psize, B_TRUE); - abd_copy_from_buf_off(abd, this_lb, 0, psize); + abd = abd_alloc_for_io(asize, B_TRUE); + abd_copy_from_buf_off(abd, this_lb, 0, asize); if ((err = zio_decompress_data( L2BLK_GET_COMPRESS((this_lbp)->lbp_prop), - abd, this_lb, psize, sizeof (*this_lb))) != 0) { + abd, this_lb, asize, sizeof (*this_lb))) != 0) { err = SET_ERROR(EINVAL); goto cleanup; } @@ -9672,10 +9739,10 @@ l2arc_log_blk_read(l2arc_dev_t *dev, */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, - uint64_t lb_psize, uint64_t lb_daddr) + uint64_t lb_asize, uint64_t lb_daddr) { - uint64_t size = 0, psize = 0; - uint64_t log_entries = dev->l2ad_dev_hdr->dh_log_blk_ent; + uint64_t size = 0, asize = 0; + uint64_t log_entries = dev->l2ad_log_entries; for (int i = log_entries - 1; i >= 0; i--) { /* @@ -9692,27 +9759,28 @@ l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, * ^ ^ * | | * | | - * l2arc_fill_thread l2arc_rebuild - * places new bufs here restores bufs here + * l2arc_feed_thread l2arc_rebuild + * will place new bufs here restores bufs here * - * This also works when the restored bufs get evicted at any - * point during the rebuild. + * During l2arc_rebuild() the device is not used by + * l2arc_feed_thread() as dev->l2ad_rebuild is set to true. */ size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop); - psize += L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop); + asize += vdev_psize_to_asize(dev->l2ad_vdev, + L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop)); l2arc_hdr_restore(&lb->lb_entries[i], dev); } /* * Record rebuild stats: * size Logical size of restored buffers in the L2ARC - * psize Physical size of restored buffers in the L2ARC + * asize Aligned size of restored buffers in the L2ARC */ ARCSTAT_INCR(arcstat_l2_rebuild_size, size); - ARCSTAT_INCR(arcstat_l2_rebuild_psize, psize); + ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize); ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries); - ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, lb_psize); - ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, psize / lb_psize); + ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize); + ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize); ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks); } @@ -9800,18 +9868,20 @@ static zio_t * l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp, l2arc_log_blk_phys_t *lb) { - uint32_t psize; + uint32_t asize; zio_t *pio; l2arc_read_callback_t *cb; - psize = L2BLK_GET_PSIZE((lbp)->lbp_prop); - ASSERT(psize <= sizeof (l2arc_log_blk_phys_t)); + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); + ASSERT(asize <= sizeof (l2arc_log_blk_phys_t)); + cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); - cb->l2rcb_abd = abd_get_from_buf(lb, psize); + cb->l2rcb_abd = abd_get_from_buf(lb, asize); pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); - (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, psize, + (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize, 
cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); @@ -9841,14 +9911,18 @@ l2arc_dev_hdr_update(l2arc_dev_t *dev) abd_t *abd; int err; + VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER)); + l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC; l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION; l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa); l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid; - l2dhdr->dh_log_blk_ent = dev->l2ad_log_entries; + l2dhdr->dh_log_entries = dev->l2ad_log_entries; l2dhdr->dh_evict = dev->l2ad_evict; l2dhdr->dh_start = dev->l2ad_start; l2dhdr->dh_end = dev->l2ad_end; + l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize); + l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count); l2dhdr->dh_flags = 0; if (dev->l2ad_first) l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST; @@ -9884,7 +9958,7 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) uint8_t *tmpbuf; l2arc_lb_ptr_buf_t *lb_ptr_buf; - VERIFY3S(dev->l2ad_log_ent_idx, ==, l2dhdr->dh_log_blk_ent); + VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries); tmpbuf = zio_buf_alloc(sizeof (*lb)); abd_buf = zio_buf_alloc(sizeof (*abd_buf)); @@ -9896,8 +9970,14 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1]; lb->lb_magic = L2ARC_LOG_BLK_MAGIC; - /* try to compress the buffer */ + /* + * l2arc_log_blk_commit() may be called multiple times during a single + * l2arc_write_buffers() call. Save the allocated abd buffers in a list + * so we can free them in l2arc_write_done() later on. + */ list_insert_tail(&cb->l2wcb_abd_list, abd_buf); + + /* try to compress the buffer */ psize = zio_compress_data(ZIO_COMPRESS_LZ4, abd_buf->abd, tmpbuf, sizeof (*lb)); @@ -9962,13 +10042,17 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf); + ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); + ARCSTAT_BUMP(arcstat_l2_log_blk_count); + zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); + zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); /* bump the kstats */ ARCSTAT_INCR(arcstat_l2_write_bytes, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_writes); - ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, asize); + ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize); ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, dev->l2ad_log_blk_payload_asize / asize); @@ -9985,8 +10069,9 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp) { - uint64_t psize = L2BLK_GET_PSIZE((lbp)->lbp_prop); - uint64_t end = lbp->lbp_daddr + psize - 1; + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); + uint64_t end = lbp->lbp_daddr + asize - 1; uint64_t start = lbp->lbp_payload_start; boolean_t evicted = B_FALSE; @@ -10017,7 +10102,7 @@ l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp) l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end); return (start >= dev->l2ad_start && end <= dev->l2ad_end && - psize > 0 && psize <= sizeof (l2arc_log_blk_phys_t) && + asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) && (!evicted 
|| dev->l2ad_first)); } @@ -10032,14 +10117,13 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; l2arc_log_ent_phys_t *le; - l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; - if (l2dhdr->dh_log_blk_ent == 0) + if (dev->l2ad_log_entries == 0) return (B_FALSE); int index = dev->l2ad_log_ent_idx++; - ASSERT3S(index, <, l2dhdr->dh_log_blk_ent); + ASSERT3S(index, <, dev->l2ad_log_entries); ASSERT(HDR_HAS_L2HDR(hdr)); le = &lb->lb_entries[index]; @@ -10059,7 +10143,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, HDR_GET_PSIZE(hdr)); - return (dev->l2ad_log_ent_idx == l2dhdr->dh_log_blk_ent); + return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries); } /* diff --git a/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh b/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh index b202fac40aca..f313923d1469 100755 --- a/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh @@ -99,7 +99,7 @@ typeset l2_rebuild_log_blk_end=$(get_arcstat l2_rebuild_log_blks) log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - $l2_rebuild_log_blk_start )) log_must test $l2_dh_log_blk -gt 0 -log_must zdb -lq $VDEV_CACHE +log_must zdb -lll $VDEV_CACHE log_must zpool destroy -f $TESTPOOL From 4cfd339ce4ae1fb199fad04ff826e67b221cfb41 Mon Sep 17 00:00:00 2001 From: Richard Laager Date: Sat, 2 May 2020 18:46:46 -0500 Subject: [PATCH 06/27] Cleanup contrib/initramfs automake The initramfs hook scripts depend on Makefile. This way, if the substitution code is changed, they should update. This brings it in line with etc/init.d (which was modified to match the example in the automake docs). The initramfs hook script cleaning now matches etc/init.d. There was a mix of SUBDIRS recursion and custom install rules for files in subdirectories. This was duplicated for the "hooks" and "scripts" subdirectories. Now everything uses SUBDIRS. I fixed the substitution of DEFAULT_INITCONF_DIR for hooks/zfs. 
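As context for the rules below: each hook script is generated from its
.in template by a sed substitution pass. Conceptually it amounts to the
following (a minimal sketch, with /usr/sbin and /etc/default standing
in for the configure-time values of $(sbindir) and
$(DEFAULT_INITCONF_DIR)):

    sed -e 's,@sbindir@,/usr/sbin,g' \
        -e 's,@DEFAULT_INITCONF_DIR@,/etc/default,g' \
        hooks/zfs.in > hooks/zfs

With Makefile listed as a prerequisite of that rule, changing the
substitution list now causes the generated scripts to be rebuilt.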
Reviewed-By: Andrey Prokopenko Reviewed-By: Brian Behlendorf Reviewed-By: Tom Caputi Signed-off-by: Richard Laager Closes #10027 --- configure.ac | 2 ++ contrib/initramfs/Makefile.am | 19 +------------------ contrib/initramfs/conf-hooks.d/Makefile.am | 4 ++++ contrib/initramfs/conf.d/Makefile.am | 4 ++++ contrib/initramfs/hooks/Makefile.am | 11 +++-------- .../initramfs/scripts/local-top/Makefile.am | 3 ++- 6 files changed, 16 insertions(+), 27 deletions(-) create mode 100644 contrib/initramfs/conf-hooks.d/Makefile.am create mode 100644 contrib/initramfs/conf.d/Makefile.am diff --git a/configure.ac b/configure.ac index 902108f3649d..9f20b2bea1b7 100644 --- a/configure.ac +++ b/configure.ac @@ -92,6 +92,8 @@ AC_CONFIG_FILES([ contrib/dracut/90zfs/Makefile contrib/dracut/Makefile contrib/initramfs/Makefile + contrib/initramfs/conf.d/Makefile + contrib/initramfs/conf-hooks.d/Makefile contrib/initramfs/hooks/Makefile contrib/initramfs/scripts/Makefile contrib/initramfs/scripts/local-top/Makefile diff --git a/contrib/initramfs/Makefile.am b/contrib/initramfs/Makefile.am index 849b1d83cc5b..7d85973129b6 100644 --- a/contrib/initramfs/Makefile.am +++ b/contrib/initramfs/Makefile.am @@ -1,23 +1,6 @@ initrddir = /usr/share/initramfs-tools -initrd_SCRIPTS = \ - conf.d/zfs conf-hooks.d/zfs hooks/zfs scripts/zfs scripts/local-top/zfs - -SUBDIRS = hooks scripts +SUBDIRS = conf.d conf-hooks.d hooks scripts EXTRA_DIST = \ - $(top_srcdir)/contrib/initramfs/conf.d/zfs \ - $(top_srcdir)/contrib/initramfs/conf-hooks.d/zfs \ $(top_srcdir)/contrib/initramfs/README.initramfs.markdown - -install-initrdSCRIPTS: $(EXTRA_DIST) - for d in conf.d conf-hooks.d scripts/local-top; do \ - $(MKDIR_P) $(DESTDIR)$(initrddir)/$$d; \ - cp $(top_srcdir)/contrib/initramfs/$$d/zfs \ - $(DESTDIR)$(initrddir)/$$d/; \ - done - for d in hooks scripts; do \ - $(MKDIR_P) $(DESTDIR)$(initrddir)/$$d; \ - cp $(top_builddir)/contrib/initramfs/$$d/zfs \ - $(DESTDIR)$(initrddir)/$$d/; \ - done diff --git a/contrib/initramfs/conf-hooks.d/Makefile.am b/contrib/initramfs/conf-hooks.d/Makefile.am new file mode 100644 index 000000000000..f84ba5cc7e37 --- /dev/null +++ b/contrib/initramfs/conf-hooks.d/Makefile.am @@ -0,0 +1,4 @@ +confhooksddir = /usr/share/initramfs-tools/conf-hooks.d + +dist_confhooksd_DATA = \ + zfs diff --git a/contrib/initramfs/conf.d/Makefile.am b/contrib/initramfs/conf.d/Makefile.am new file mode 100644 index 000000000000..5ef27e0aa1ce --- /dev/null +++ b/contrib/initramfs/conf.d/Makefile.am @@ -0,0 +1,4 @@ +confddir = /usr/share/initramfs-tools/conf.d + +dist_confd_DATA = \ + zfs diff --git a/contrib/initramfs/hooks/Makefile.am b/contrib/initramfs/hooks/Makefile.am index 3d8ef627ed47..d768454943f8 100644 --- a/contrib/initramfs/hooks/Makefile.am +++ b/contrib/initramfs/hooks/Makefile.am @@ -6,18 +6,13 @@ hooks_SCRIPTS = \ EXTRA_DIST = \ $(top_srcdir)/contrib/initramfs/hooks/zfs.in -$(hooks_SCRIPTS):%:%.in +$(hooks_SCRIPTS):%:%.in Makefile -$(SED) -e 's,@sbindir\@,$(sbindir),g' \ -e 's,@sysconfdir\@,$(sysconfdir),g' \ -e 's,@udevdir\@,$(udevdir),g' \ -e 's,@udevruledir\@,$(udevruledir),g' \ -e 's,@mounthelperdir\@,$(mounthelperdir),g' \ + -e 's,@DEFAULT_INITCONF_DIR\@,$(DEFAULT_INITCONF_DIR),g' \ $< >'$@' -# Double-colon rules are allowed; there are multiple independent definitions. -clean-local:: - -$(RM) $(hooks_SCRIPTS) - -# Double-colon rules are allowed; there are multiple independent definitions. 
-distclean-local:: - -$(RM) $(hooks_SCRIPTS) +CLEANFILES = $(hooks_SCRIPTS) diff --git a/contrib/initramfs/scripts/local-top/Makefile.am b/contrib/initramfs/scripts/local-top/Makefile.am index c820325947b0..1523a907c860 100644 --- a/contrib/initramfs/scripts/local-top/Makefile.am +++ b/contrib/initramfs/scripts/local-top/Makefile.am @@ -1,3 +1,4 @@ localtopdir = /usr/share/initramfs-tools/scripts/local-top -EXTRA_DIST = zfs +dist_localtop_SCRIPTS = \ + zfs From 746d22ee02d2617ee982f1620b06f882b924ce8e Mon Sep 17 00:00:00 2001 From: Richard Laager Date: Sat, 2 May 2020 18:16:46 -0500 Subject: [PATCH 07/27] Rework README.initramfs.markdown This file is listed as being in Markdown format, but it didn't really use much Markdown. I have added a fair amount of formatting. I have reordered and reworded things to improve the flow of the text. Reviewed-By: Andrey Prokopenko Reviewed-By: Brian Behlendorf Reviewed-By: Tom Caputi Signed-off-by: Richard Laager Closes #10027 --- contrib/initramfs/README.initramfs.markdown | 168 +++++++++----------- 1 file changed, 74 insertions(+), 94 deletions(-) diff --git a/contrib/initramfs/README.initramfs.markdown b/contrib/initramfs/README.initramfs.markdown index fa19f001af9c..c8bc9f4bdc97 100644 --- a/contrib/initramfs/README.initramfs.markdown +++ b/contrib/initramfs/README.initramfs.markdown @@ -1,94 +1,74 @@ -DESCRIPTION - These scripts are intended to be used with initramfs-tools, which is a similar - software product to "dracut" (which is used in RedHat based distributions), - and is mainly used by Debian GNU/Linux and derivatives to create an initramfs - so that the system can be booted off a ZFS filesystem. If you have no need or - interest in this, then it can safely be ignored. - - These script were written with the primary intention of being portable and - usable on as many systems as possible. - - This is, in practice, usually not possible. But the intention is there. - And it is a good one. - - They have been tested successfully on: - - * Debian GNU/Linux Wheezy - * Debian GNU/Linux Jessie - - It uses some functionality common with the SYSV init scripts, primarily - the "/etc/zfs/zfs-functions" script. - -FUNCTIONALITY - * Supports booting of a ZFS snapshot. - Do this by cloning the snapshot into a dataset. If this, the resulting - dataset, already exists, destroy it. Then mount it as the root filesystem. - * If snapshot does not exist, use base dataset (the part before '@') - as boot filesystem instead. - * Clone with 'mountpoint=none' and 'canmount=noauto' - we mount manually - and explicitly. - * Allow rollback of snapshots instead of clone it and boot from the clone. - * If no snapshot is specified on the 'root=' kernel command line, but - there is an '@', then get a list of snapshots below that filesystem - and ask the user which to use. - - * Support all currently used kernel command line arguments - * Core options: - All the different distributions have their own standard on what to specify - on the kernel command line to boot of a ZFS filesystem. 
- - Supports the following kernel command line argument combinations - (in this order - first match win): - * rpool= (tries to finds bootfs automatically) - * bootfs=/ (uses this for rpool - first part) - * rpool= bootfs=/ - * -B zfs-bootfs=/ (uses this for rpool - first part) - * rpool=rpool (default if none of the above is used) - * root=/ (uses this for rpool - first part) - * root=ZFS=/ (uses this for rpool - first part, without 'ZFS=') - * root=zfs:AUTO (tries to detect both pool and rootfs - * root=zfs:/ (uses this for rpool - first part, without 'zfs:') - - Option could also be - * Extra (control) options: - * zfsdebug=(on,yes,1) Show extra debugging information - * zfsforce=(on,yes,1) Force import the pool - * rollback=(on,yes,1) Rollback (instead of clone) the snapshot - - * 'Smarter' way to import pools. Don't just try cache file or /dev. - * Try to use /dev/disk/by-vdev (if /etc/zfs/vdev_id.conf exists), - * Try /dev/mapper (to be able to use LUKS backed pools as well as - multi-path devices). - * /dev/disk/by-id and any other /dev/disk/by-* directory that may exist. - * Use /dev as a last ditch attempt. - * Fallback to using the cache file if that exist if nothing else worked. - * Only try to import pool if it haven't already been imported - * This will negate the need to force import a pool that have not been - exported cleanly. - * Support exclusion of pools to import by setting ZFS_POOL_EXCEPTIONS - in /etc/default/zfs. - - Controlling in which order devices is searched for is controlled by - ZPOOL_IMPORT_PATH variable set in /etc/defaults/zfs. - - * Support additional configuration variable ZFS_INITRD_ADDITIONAL_DATASETS - to mount additional filesystems not located under your root dataset. - - For example, if the root fs is specified as 'rpool/ROOT/rootfs', it will - automatically and without specific configuration mount any filesystems - below this on the mount point specified in the 'mountpoint' property. - Such as 'rpool/root/rootfs/var', 'rpool/root/rootfs/usr' etc) - - However, if one prefer to have separate filesystems, not located below - the root fs (such as 'rpool/var', 'rpool/ROOT/opt' etc), special - configuration needs to be done. This is what the variable, set in - /etc/defaults/zfs file, needs to be configured. The 'mountpoint' - property needs to be correct for this to work though. - - * Allows mounting a rootfs with mountpoint=legacy set. - - * Include /etc/modprobe.d/{zfs,spl}.conf in the initrd if it/they exist. - - * Include the udev rule to use by-vdev for pool imports. - - * Include the /etc/default/zfs file to the initrd. +## Description + +These scripts are intended to be used with `initramfs-tools`, which is a +similar software product to `dracut` (which is used in Red Hat based +distributions), and is mainly used by Debian GNU/Linux and derivatives. + +These scripts share some common functionality with the SysV init scripts, +primarily the `/etc/zfs/zfs-functions` script. + +## Configuration + +### Root pool/filesystem + +Different distributions have their own standard on what to specify on the +kernel command line to boot off a ZFS filesystem. + +This script supports the following kernel command line argument combinations +(in this order - first match wins): + +* `rpool=` +* `bootfs=/` +* `rpool= bootfs=/` +* `-B zfs-bootfs=/` +* `root=/` +* `root=ZFS=/` +* `root=zfs:AUTO` +* `root=zfs:/` +* `rpool=rpool` + +If a pool is specified, it will be used. Otherwise, in `AUTO` mode, all pools +will be searched. 
Pools may be excluded from the search by listing them in +`ZFS_POOL_EXCEPTIONS` in `/etc/default/zfs`. + +Pools will be imported as follows: + +* Try `/dev/disk/by-vdev` if it exists; see `/etc/zfs/vdev_id.conf`. +* Try `/dev/disk/by-id` and any other `/dev/disk/by-*` directories. +* Try `/dev`. +* Use the cache file if nothing else worked. + +This order may be modified by setting `ZPOOL_IMPORT_PATH` in +`/etc/default/zfs`. + +If a dataset is specified, it will be used as the root filesystem. Otherwise, +this script will attempt to find a root filesystem automatically (in the +specified pool or all pools, as described above). + +Filesystems below the root filesystem will be automatically mounted with no +additional configuration necessary. For example, if the root filesystem is +`rpool/ROOT/rootfs`, `rpool/root/rootfs/var`, `rpool/root/rootfs/usr`, etc. +will be mounted (if they exist). Additional filesystems (that are not located +under the root filesystem) can be mounted by listing them in +`ZFS_INITRD_ADDITIONAL_DATASETS` in `/etc/default/zfs`. + +### Snapshots + +The `` can be a snapshot. In this case, the snapshot will be cloned +and the clone used as the root filesystem. Note: + +* If the snapshot does not exist, the base dataset (the part before `@`) is + used as the boot filesystem instead. +* If the resulting clone dataset already exists, it is destroyed. +* The clone is created with `mountpoint=none` and `canmount=noauto`. The root + filesystem is mounted manually by the initramfs script. +* If no snapshot is specified on the `root=` kernel command line, but + there is an `@`, the user will be prompted to choose a snapshot to use. + +### Extra options + +The following kernel command line arguments are supported: + +* `zfsdebug=(on,yes,1)`: Show extra debugging information +* `zfsforce=(on,yes,1)`: Force import the pool +* `rollback=(on,yes,1)`: Rollback to (instead of clone) the snapshot From 1cc635a2dd0379181950a1458255ea8ae8b9c1e0 Mon Sep 17 00:00:00 2001 From: Andrey Prokopenko Date: Sun, 3 May 2020 00:43:42 -0500 Subject: [PATCH 08/27] Unlock encrypted root partition over SSH This commit add a new feature for Debian-based distributions to unlock encrypted root partition over SSH. This feature is very handy on headless NAS or VPS cloud servers. To use this feature, you will need to install the dropbear-initramfs package. 
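A typical setup and unlock session looks like the sketch below (the
host address and key file are placeholders; the exact steps are
documented in the README changes in this patch):

    # On the installed system:
    apt install dropbear-initramfs
    cat ~/.ssh/id_rsa.pub >> /etc/dropbear-initramfs/authorized_keys
    update-initramfs -u

    # While the target system waits at boot for the encryption key,
    # from another machine:
    ssh root@192.168.1.10
    zfsunlock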
Reviewed-By: Brian Behlendorf Reviewed-By: Tom Caputi Signed-off-by: Andrey Prokopenko Signed-off-by: Richard Laager Closes #10027 --- contrib/initramfs/Makefile.am | 3 ++ contrib/initramfs/README.initramfs.markdown | 12 ++++++ contrib/initramfs/hooks/.gitignore | 1 + contrib/initramfs/hooks/Makefile.am | 6 ++- contrib/initramfs/hooks/zfs.in | 1 + contrib/initramfs/hooks/zfsunlock.in | 18 +++++++++ contrib/initramfs/scripts/zfs | 12 +++++- contrib/initramfs/zfsunlock | 42 +++++++++++++++++++++ 8 files changed, 92 insertions(+), 3 deletions(-) create mode 100644 contrib/initramfs/hooks/zfsunlock.in create mode 100755 contrib/initramfs/zfsunlock diff --git a/contrib/initramfs/Makefile.am b/contrib/initramfs/Makefile.am index 7d85973129b6..ee17de0965ca 100644 --- a/contrib/initramfs/Makefile.am +++ b/contrib/initramfs/Makefile.am @@ -1,5 +1,8 @@ initrddir = /usr/share/initramfs-tools +dist_initrd_SCRIPTS = \ + zfsunlock + SUBDIRS = conf.d conf-hooks.d hooks scripts EXTRA_DIST = \ diff --git a/contrib/initramfs/README.initramfs.markdown b/contrib/initramfs/README.initramfs.markdown index c8bc9f4bdc97..ddae71a2e295 100644 --- a/contrib/initramfs/README.initramfs.markdown +++ b/contrib/initramfs/README.initramfs.markdown @@ -72,3 +72,15 @@ The following kernel command line arguments are supported: * `zfsdebug=(on,yes,1)`: Show extra debugging information * `zfsforce=(on,yes,1)`: Force import the pool * `rollback=(on,yes,1)`: Rollback to (instead of clone) the snapshot + +### Unlocking a ZFS encrypted root over SSH + +To use this feature: + +1. Install the `dropbear-initramfs` package. You may wish to uninstall the + `cryptsetup-initramfs` package to avoid warnings. +2. Add your SSH key(s) to `/etc/dropbear-initramfs/authorized_keys`. Note + that Dropbear does not support ed25519 keys; use RSA (2048-bit or more) + instead. +3. Rebuild the initramfs with your keys: `update-initramfs -u` +4. During the system boot, login via SSH and run: `zfsunlock` diff --git a/contrib/initramfs/hooks/.gitignore b/contrib/initramfs/hooks/.gitignore index 73304bc2cd4a..4e1604e18899 100644 --- a/contrib/initramfs/hooks/.gitignore +++ b/contrib/initramfs/hooks/.gitignore @@ -1 +1,2 @@ zfs +zfsunlock diff --git a/contrib/initramfs/hooks/Makefile.am b/contrib/initramfs/hooks/Makefile.am index d768454943f8..9b20c080a655 100644 --- a/contrib/initramfs/hooks/Makefile.am +++ b/contrib/initramfs/hooks/Makefile.am @@ -1,10 +1,12 @@ hooksdir = /usr/share/initramfs-tools/hooks hooks_SCRIPTS = \ - zfs + zfs \ + zfsunlock EXTRA_DIST = \ - $(top_srcdir)/contrib/initramfs/hooks/zfs.in + $(top_srcdir)/contrib/initramfs/hooks/zfs.in \ + $(top_srcdir)/contrib/initramfs/hooks/zfsunlock.in $(hooks_SCRIPTS):%:%.in Makefile -$(SED) -e 's,@sbindir\@,$(sbindir),g' \ diff --git a/contrib/initramfs/hooks/zfs.in b/contrib/initramfs/hooks/zfs.in index 15f23c908b23..ff7e49f12537 100755 --- a/contrib/initramfs/hooks/zfs.in +++ b/contrib/initramfs/hooks/zfs.in @@ -21,6 +21,7 @@ COPY_FILE_LIST="$COPY_FILE_LIST @udevruledir@/69-vdev.rules" # These prerequisites are provided by the base system. COPY_EXEC_LIST="$COPY_EXEC_LIST /usr/bin/dirname /bin/hostname /sbin/blkid" COPY_EXEC_LIST="$COPY_EXEC_LIST /usr/bin/env" +COPY_EXEC_LIST="$COPY_EXEC_LIST $(which systemd-ask-password)" # Explicitly specify all kernel modules because automatic dependency resolution # is unreliable on many systems. 
diff --git a/contrib/initramfs/hooks/zfsunlock.in b/contrib/initramfs/hooks/zfsunlock.in new file mode 100644 index 000000000000..c8ae86363981 --- /dev/null +++ b/contrib/initramfs/hooks/zfsunlock.in @@ -0,0 +1,18 @@ +#!/bin/sh + +PREREQ="dropbear" + +prereqs() { + echo "$PREREQ" +} + +case "$1" in + prereqs) + prereqs + exit 0 + ;; +esac + +. /usr/share/initramfs-tools/hook-functions + +copy_exec /usr/share/initramfs-tools/zfsunlock /usr/bin diff --git a/contrib/initramfs/scripts/zfs b/contrib/initramfs/scripts/zfs index dbc4e253f113..a795fd39f605 100644 --- a/contrib/initramfs/scripts/zfs +++ b/contrib/initramfs/scripts/zfs @@ -405,6 +405,8 @@ decrypt_fs() ENCRYPTIONROOT="$(get_fs_value "${fs}" encryptionroot)" KEYLOCATION="$(get_fs_value "${ENCRYPTIONROOT}" keylocation)" + echo "${ENCRYPTIONROOT}" > /run/zfs_fs_name + # If root dataset is encrypted... if ! [ "${ENCRYPTIONROOT}" = "-" ]; then KEYSTATUS="$(get_fs_value "${ENCRYPTIONROOT}" keystatus)" @@ -418,6 +420,7 @@ decrypt_fs() # Prompt with plymouth, if active elif [ -e /bin/plymouth ] && /bin/plymouth --ping 2>/dev/null; then + echo "plymouth" > /run/zfs_console_askpwd_cmd while [ $TRY_COUNT -gt 0 ]; do plymouth ask-for-password --prompt "Encrypted ZFS password for ${ENCRYPTIONROOT}" | \ $ZFS load-key "${ENCRYPTIONROOT}" && break @@ -426,6 +429,7 @@ decrypt_fs() # Prompt with systemd, if active elif [ -e /run/systemd/system ]; then + echo "systemd-ask-password" > /run/zfs_console_askpwd_cmd while [ $TRY_COUNT -gt 0 ]; do systemd-ask-password "Encrypted ZFS password for ${ENCRYPTIONROOT}" --no-tty | \ $ZFS load-key "${ENCRYPTIONROOT}" && break @@ -434,7 +438,8 @@ decrypt_fs() # Prompt with ZFS tty, otherwise else - # Setting "printk" temporarily to "7" will allow prompt even if kernel option "quiet" + # Temporarily setting "printk" to "7" allows the prompt to appear even when the "quiet" kernel option has been used + echo "load-key" > /run/zfs_console_askpwd_cmd storeprintk="$(awk '{print $1}' /proc/sys/kernel/printk)" echo 7 > /proc/sys/kernel/printk $ZFS load-key "${ENCRYPTIONROOT}" @@ -964,6 +969,11 @@ mountroot() mount_fs "$fs" done + touch /run/zfs_unlock_complete + if [ -e /run/zfs_unlock_complete_notify ]; then + read zfs_unlock_complete_notify < /run/zfs_unlock_complete_notify + fi + # ------------ # Debugging information if [ -n "${ZFS_DEBUG}" ] diff --git a/contrib/initramfs/zfsunlock b/contrib/initramfs/zfsunlock new file mode 100755 index 000000000000..1202a144deac --- /dev/null +++ b/contrib/initramfs/zfsunlock @@ -0,0 +1,42 @@ +#!/bin/sh + +set -eu +if [ ! -e /run/zfs_fs_name ]; then + echo "Wait for the root pool to be imported or press Ctrl-C to exit." +fi +while [ ! -e /run/zfs_fs_name ]; do + if [ -e /run/zfs_unlock_complete ]; then + exit 0 + fi + sleep 0.5 +done +echo +echo "Unlocking encrypted ZFS filesystems..." +echo "Enter the password or press Ctrl-C to exit." +echo +zfs_fs_name="" +if [ ! -e /run/zfs_unlock_complete_notify ]; then + mkfifo /run/zfs_unlock_complete_notify +fi +while [ ! -e /run/zfs_unlock_complete ]; do + zfs_fs_name=$(cat /run/zfs_fs_name) + zfs_console_askpwd_cmd=$(cat /run/zfs_console_askpwd_cmd) + systemd-ask-password "Encrypted ZFS password for ${zfs_fs_name}:" | \ + /sbin/zfs load-key "$zfs_fs_name" || true + if [ "$(/sbin/zfs get -H -ovalue keystatus "$zfs_fs_name" 2> /dev/null)" = "available" ]; then + echo "Password for $zfs_fs_name accepted." 
+ zfs_console_askpwd_pid=$(ps a -o pid= -o args | grep -v grep | grep "$zfs_console_askpwd_cmd" | cut -d ' ' -f3 | sort -n | head -n1) + if [ -n "$zfs_console_askpwd_pid" ]; then + kill "$zfs_console_askpwd_pid" + fi + # Wait for another filesystem to unlock. + while [ "$(cat /run/zfs_fs_name)" = "$zfs_fs_name" ] && [ ! -e /run/zfs_unlock_complete ]; do + sleep 0.5 + done + else + echo "Wrong password. Try again." + fi +done +echo "Unlocking complete. Resuming boot sequence..." +echo "Please reconnect in a while." +echo "ok" > /run/zfs_unlock_complete_notify From d775c86dd4ced233ac0636607706cc5a08bcb923 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 8 May 2020 13:50:02 -0700 Subject: [PATCH 09/27] ZTS: refreserv_005_pos.ksh When recursively destroying the dataset it's possible for the dataset volume to be open by an unrelated process, like blkid. Use the destroy_dataset() which will retry when this occurs. Reviewed-by: John Kennedy Reviewed-by: George Melikov Signed-off-by: Brian Behlendorf Closes #10305 --- .../tests/functional/refreserv/refreserv_005_pos.ksh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/zfs-tests/tests/functional/refreserv/refreserv_005_pos.ksh b/tests/zfs-tests/tests/functional/refreserv/refreserv_005_pos.ksh index 8c044eca59d5..1ccc9828d4f7 100755 --- a/tests/zfs-tests/tests/functional/refreserv/refreserv_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/refreserv/refreserv_005_pos.ksh @@ -45,9 +45,9 @@ verify_runnable "global" function cleanup { - log_must zfs destroy -rf $TESTPOOL/$TESTFS - log_must zfs create $TESTPOOL/$TESTFS - log_must zfs set mountpoint=$TESTDIR $TESTPOOL/$TESTFS + destroy_dataset "$fs" "-rf" + log_must zfs create $fs + log_must zfs set mountpoint=$TESTDIR $fs } log_assert "Volume (ref)reservation is not limited by volsize" From bd95f00d4b6aa185bf508f12d2cdbfbb2350b80e Mon Sep 17 00:00:00 2001 From: Petros Koutoupis Date: Sat, 9 May 2020 12:17:08 -0500 Subject: [PATCH 10/27] Fixed LDADD library links in Makefiles for cross compilation builds When building on native dev system, there are no issues but when cross-compiling for target system, some linker errors are observed. The only way to avoid these errors is by adjusting the Makefile.am of those various components to add the library dependencies. 
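For illustration, the failure mode looks roughly like this in a cross
build (a hypothetical session; the target triplet and the first
undefined symbol reported depend on the toolchain):

    ./configure --host=arm-linux-gnueabihf
    make
    # ld: raidz_test.o: undefined reference to `nvlist_free'

    # Fix: name the dependency explicitly in the component's
    # Makefile.am, for example:
    #   raidz_test_LDADD = \
    #       $(top_builddir)/lib/libnvpair/libnvpair.la \
    #       $(top_builddir)/lib/libzpool/libzpool.la

Native builds often mask this underlinking because the host linker can
resolve such symbols transitively through the shared libraries already
named on the link line.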
Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Signed-off-by: Petros Koutoupis Closes #10304 --- cmd/mount_zfs/Makefile.am | 1 + cmd/raidz_test/Makefile.am | 1 + cmd/zed/Makefile.am | 1 + cmd/zinject/Makefile.am | 1 + cmd/zpool/Makefile.am | 1 + cmd/zstream/Makefile.am | 1 + tests/zfs-tests/cmd/btree_test/Makefile.am | 1 + tests/zfs-tests/tests/functional/hkdf/Makefile.am | 4 +++- tests/zfs-tests/tests/functional/libzfs/Makefile.am | 2 ++ 9 files changed, 12 insertions(+), 1 deletion(-) diff --git a/cmd/mount_zfs/Makefile.am b/cmd/mount_zfs/Makefile.am index 1ffeef7fe189..ddacf32c6dd5 100644 --- a/cmd/mount_zfs/Makefile.am +++ b/cmd/mount_zfs/Makefile.am @@ -14,4 +14,5 @@ mount_zfs_SOURCES = \ mount_zfs_LDADD = \ $(top_builddir)/lib/libnvpair/libnvpair.la \ + $(top_builddir)/lib/libzfs_core/libzfs_core.la \ $(top_builddir)/lib/libzfs/libzfs.la diff --git a/cmd/raidz_test/Makefile.am b/cmd/raidz_test/Makefile.am index c04d101b885b..0b173ed505fd 100644 --- a/cmd/raidz_test/Makefile.am +++ b/cmd/raidz_test/Makefile.am @@ -14,6 +14,7 @@ raidz_test_SOURCES = \ raidz_bench.c raidz_test_LDADD = \ + $(top_builddir)/lib/libnvpair/libnvpair.la \ $(top_builddir)/lib/libzpool/libzpool.la raidz_test_LDADD += -lm -ldl diff --git a/cmd/zed/Makefile.am b/cmd/zed/Makefile.am index 40c0834af0a2..82b000ce162e 100644 --- a/cmd/zed/Makefile.am +++ b/cmd/zed/Makefile.am @@ -38,6 +38,7 @@ zed_SOURCES = $(ZED_SRC) $(FMA_SRC) zed_LDADD = \ $(top_builddir)/lib/libnvpair/libnvpair.la \ $(top_builddir)/lib/libuutil/libuutil.la \ + $(top_builddir)/lib/libzfs_core/libzfs_core.la \ $(top_builddir)/lib/libzfs/libzfs.la zed_LDADD += -lrt diff --git a/cmd/zinject/Makefile.am b/cmd/zinject/Makefile.am index 71b48255e66b..b056a6db545e 100644 --- a/cmd/zinject/Makefile.am +++ b/cmd/zinject/Makefile.am @@ -9,4 +9,5 @@ zinject_SOURCES = \ zinject_LDADD = \ $(top_builddir)/lib/libnvpair/libnvpair.la \ + $(top_builddir)/lib/libzfs_core/libzfs_core.la \ $(top_builddir)/lib/libzfs/libzfs.la diff --git a/cmd/zpool/Makefile.am b/cmd/zpool/Makefile.am index 7b25726f498e..5efa1318440d 100644 --- a/cmd/zpool/Makefile.am +++ b/cmd/zpool/Makefile.am @@ -22,6 +22,7 @@ endif zpool_LDADD = \ $(top_builddir)/lib/libnvpair/libnvpair.la \ $(top_builddir)/lib/libuutil/libuutil.la \ + $(top_builddir)/lib/libzfs_core/libzfs_core.la \ $(top_builddir)/lib/libzfs/libzfs.la if BUILD_FREEBSD diff --git a/cmd/zstream/Makefile.am b/cmd/zstream/Makefile.am index 892e1583072c..ebc07d2eaa04 100644 --- a/cmd/zstream/Makefile.am +++ b/cmd/zstream/Makefile.am @@ -10,4 +10,5 @@ zstream_SOURCES = \ zstream_LDADD = \ $(top_builddir)/lib/libnvpair/libnvpair.la \ + $(top_builddir)/lib/libzfs_core/libzfs_core.la \ $(top_builddir)/lib/libzfs/libzfs.la diff --git a/tests/zfs-tests/cmd/btree_test/Makefile.am b/tests/zfs-tests/cmd/btree_test/Makefile.am index 632f0472668c..bf09cdb82da4 100644 --- a/tests/zfs-tests/cmd/btree_test/Makefile.am +++ b/tests/zfs-tests/cmd/btree_test/Makefile.am @@ -29,4 +29,5 @@ btree_test_SOURCES = btree_test.c btree_test_LDADD = \ $(top_builddir)/lib/libavl/libavl.la \ + $(top_builddir)/lib/libnvpair/libnvpair.la \ $(top_builddir)/lib/libzpool/libzpool.la diff --git a/tests/zfs-tests/tests/functional/hkdf/Makefile.am b/tests/zfs-tests/tests/functional/hkdf/Makefile.am index c1266214fefd..378bcf531d84 100644 --- a/tests/zfs-tests/tests/functional/hkdf/Makefile.am +++ b/tests/zfs-tests/tests/functional/hkdf/Makefile.am @@ -1,6 +1,8 @@ include $(top_srcdir)/config/Rules.am -LDADD = $(top_builddir)/lib/libzpool/libzpool.la +LDADD = \ 
+ $(top_builddir)/lib/libnvpair/libnvpair.la \ + $(top_builddir)/lib/libzpool/libzpool.la AUTOMAKE_OPTIONS = subdir-objects diff --git a/tests/zfs-tests/tests/functional/libzfs/Makefile.am b/tests/zfs-tests/tests/functional/libzfs/Makefile.am index e9a703f4902d..545af77e7d12 100644 --- a/tests/zfs-tests/tests/functional/libzfs/Makefile.am +++ b/tests/zfs-tests/tests/functional/libzfs/Makefile.am @@ -10,6 +10,8 @@ dist_pkgdata_SCRIPTS = \ libzfs_input.ksh many_fds_LDADD = \ + $(top_builddir)/lib/libnvpair/libnvpair.la \ + $(top_builddir)/lib/libzfs_core/libzfs_core.la \ $(top_builddir)/lib/libzfs/libzfs.la pkgexec_PROGRAMS = many_fds From fc551d7efbbf26cc1671ddb51f2f0df57ec53ee7 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Sun, 10 May 2020 13:23:52 -0600 Subject: [PATCH 11/27] Combine OS-independent ABD Code into Common Source File Reorganizing ABD code base so OS-independent ABD code has been placed into a common abd.c file. OS-dependent ABD code has been left in each OS's ABD source files, and these source files have been renamed to abd_os. The OS-independent ABD code is now under: module/zfs/abd.c With the OS-dependent code in: module/os/linux/zfs/abd_os.c module/os/freebsd/zfs/abd_os.c Reviewed-by: Matthew Ahrens Reviewed-by: Brian Behlendorf Signed-off-by: Brian Atkinson Closes #10293 --- include/sys/Makefile.am | 1 + include/sys/abd.h | 70 +- include/sys/abd_impl.h | 126 +++ lib/libzpool/Makefile.am | 1 + module/Makefile.bsd | 5 +- module/os/freebsd/zfs/abd_os.c | 433 ++++++++ module/os/linux/zfs/Makefile.in | 2 +- module/os/linux/zfs/abd.c | 1616 ----------------------------- module/os/linux/zfs/abd_os.c | 891 ++++++++++++++++ module/zfs/Makefile.in | 1 + module/{os/freebsd => }/zfs/abd.c | 525 +++------- module/zfs/vdev_indirect.c | 2 +- 12 files changed, 1602 insertions(+), 2071 deletions(-) create mode 100644 include/sys/abd_impl.h create mode 100644 module/os/freebsd/zfs/abd_os.c delete mode 100644 module/os/linux/zfs/abd.c create mode 100644 module/os/linux/zfs/abd_os.c rename module/{os/freebsd => }/zfs/abd.c (61%) diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am index 82165170a386..17d78658c653 100644 --- a/include/sys/Makefile.am +++ b/include/sys/Makefile.am @@ -2,6 +2,7 @@ SUBDIRS = fm fs crypto lua sysevent COMMON_H = \ $(top_srcdir)/include/sys/abd.h \ + $(top_srcdir)/include/sys/abd_impl.h \ $(top_srcdir)/include/sys/aggsum.h \ $(top_srcdir)/include/sys/arc.h \ $(top_srcdir)/include/sys/arc_impl.h \ diff --git a/include/sys/abd.h b/include/sys/abd.h index 82b73589bbef..df4234f3c794 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -35,56 +35,14 @@ extern "C" { #endif -typedef enum abd_flags { - ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */ - ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */ - ABD_FLAG_META = 1 << 2, /* does this represent FS metadata? 
*/ - ABD_FLAG_MULTI_ZONE = 1 << 3, /* pages split over memory zones */ - ABD_FLAG_MULTI_CHUNK = 1 << 4, /* pages split over multiple chunks */ - ABD_FLAG_LINEAR_PAGE = 1 << 5, /* linear but allocd from page */ -} abd_flags_t; - -typedef struct abd { - abd_flags_t abd_flags; - uint_t abd_size; /* excludes scattered abd_offset */ - struct abd *abd_parent; - zfs_refcount_t abd_children; - union { - struct abd_scatter { - uint_t abd_offset; -#if defined(__FreeBSD__) && defined(_KERNEL) - uint_t abd_chunk_size; - void *abd_chunks[]; -#else - uint_t abd_nents; - struct scatterlist *abd_sgl; -#endif - } abd_scatter; - struct abd_linear { - void *abd_buf; - struct scatterlist *abd_sgl; /* for LINEAR_PAGE */ - } abd_linear; - } abd_u; -} abd_t; +struct abd; /* forward declaration */ +typedef struct abd abd_t; typedef int abd_iter_func_t(void *buf, size_t len, void *private); typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *private); extern int zfs_abd_scatter_enabled; -static inline boolean_t -abd_is_linear(abd_t *abd) -{ - return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0 ? B_TRUE : B_FALSE); -} - -static inline boolean_t -abd_is_linear_page(abd_t *abd) -{ - return ((abd->abd_flags & ABD_FLAG_LINEAR_PAGE) != 0 ? - B_TRUE : B_FALSE); -} - /* * Allocations and deallocations */ @@ -124,12 +82,8 @@ void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); int abd_cmp(abd_t *, abd_t *); int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t); void abd_zero_off(abd_t *, size_t, size_t); - -#if defined(_KERNEL) -unsigned int abd_scatter_bio_map_off(struct bio *, abd_t *, unsigned int, - size_t); -unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t); -#endif +void abd_verify(abd_t *); +uint_t abd_get_size(abd_t *); void abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, ssize_t csize, ssize_t dsize, const unsigned parity, @@ -174,13 +128,29 @@ abd_zero(abd_t *abd, size_t size) abd_zero_off(abd, 0, size); } +/* + * ABD type check functions + */ +boolean_t abd_is_linear(abd_t *); +boolean_t abd_is_linear_page(abd_t *); + /* * Module lifecycle + * Defined in each specific OS's abd_os.c */ void abd_init(void); void abd_fini(void); +/* + * Linux ABD bio functions + */ +#if defined(__linux__) && defined(_KERNEL) +unsigned int abd_scatter_bio_map_off(struct bio *, abd_t *, unsigned int, + size_t); +unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t); +#endif + #ifdef __cplusplus } #endif diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h new file mode 100644 index 000000000000..6027678af15a --- /dev/null +++ b/include/sys/abd_impl.h @@ -0,0 +1,126 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. 
+ * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + */ + +#ifndef _ABD_IMPL_H +#define _ABD_IMPL_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum abd_flags { + ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */ + ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */ + ABD_FLAG_META = 1 << 2, /* does this represent FS metadata? */ + ABD_FLAG_MULTI_ZONE = 1 << 3, /* pages split over memory zones */ + ABD_FLAG_MULTI_CHUNK = 1 << 4, /* pages split over multiple chunks */ + ABD_FLAG_LINEAR_PAGE = 1 << 5, /* linear but allocd from page */ +} abd_flags_t; + +typedef enum abd_stats_op { + ABDSTAT_INCR, /* Increase abdstat values */ + ABDSTAT_DECR /* Decrease abdstat values */ +} abd_stats_op_t; + +struct abd { + abd_flags_t abd_flags; + uint_t abd_size; /* excludes scattered abd_offset */ + struct abd *abd_parent; + zfs_refcount_t abd_children; + union { + struct abd_scatter { + uint_t abd_offset; +#if defined(__FreeBSD__) && defined(_KERNEL) + uint_t abd_chunk_size; + void *abd_chunks[]; +#else + uint_t abd_nents; + struct scatterlist *abd_sgl; +#endif + } abd_scatter; + struct abd_linear { + void *abd_buf; + struct scatterlist *abd_sgl; /* for LINEAR_PAGE */ + } abd_linear; + } abd_u; +}; + +struct scatterlist; /* forward declaration */ + +struct abd_iter { + /* public interface */ + void *iter_mapaddr; /* addr corresponding to iter_pos */ + size_t iter_mapsize; /* length of data valid at mapaddr */ + + /* private */ + abd_t *iter_abd; /* ABD being iterated through */ + size_t iter_pos; + size_t iter_offset; /* offset in current sg/abd_buf, */ + /* abd_offset included */ + struct scatterlist *iter_sg; /* current sg */ +}; + +/* + * OS specific functions + */ + +abd_t *abd_alloc_struct(size_t); +abd_t *abd_get_offset_scatter(abd_t *, size_t); +void abd_free_struct(abd_t *); +void abd_alloc_chunks(abd_t *, size_t); +void abd_free_chunks(abd_t *); +boolean_t abd_size_alloc_linear(size_t); +void abd_update_scatter_stats(abd_t *, abd_stats_op_t); +void abd_update_linear_stats(abd_t *, abd_stats_op_t); +void abd_verify_scatter(abd_t *); +void abd_free_linear_page(abd_t *); +void abd_enter_critical(unsigned long); +void abd_exit_critical(unsigned long); +/* OS specific abd_iter functions */ +void abd_iter_init(struct abd_iter *, abd_t *); +boolean_t abd_iter_at_end(struct abd_iter *); +void abd_iter_advance(struct abd_iter *, size_t); +void abd_iter_map(struct abd_iter *); +void abd_iter_unmap(struct abd_iter *); + +/* + * Helper macros + */ +#define ABDSTAT(stat) (abd_stats.stat.value.ui64) +#define ABDSTAT_INCR(stat, val) \ + atomic_add_64(&abd_stats.stat.value.ui64, (val)) +#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) +#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) + +#define ABD_SCATTER(abd) (abd->abd_u.abd_scatter) +#define ABD_LINEAR_BUF(abd) (abd->abd_u.abd_linear.abd_buf) + +#ifdef __cplusplus +} +#endif + +#endif /* _ABD_IMPL_H */ diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index a9396105bc6b..0e6a1058ec2f 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -39,6 +39,7 @@ KERNEL_C = \ zpool_prop.c \ zprop_common.c \ abd.c \ + abd_os.c \ aggsum.c \ arc.c \ arc_os.c \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 6d76796f51ed..92b5c1906c1c 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -127,7 +127,7 @@ SRCS+= spl_atomic.c .endif #os/freebsd/zfs -SRCS+= abd.c \ +SRCS+= abd_os.c \ crypto_os.c \ dmu_os.c \ hkdf.c \ @@ -169,7 +169,8 @@ SRCS+= zfeature_common.c \ 
zprop_common.c #zfs -SRCS+= aggsum.c \ +SRCS+= abd.c \ + aggsum.c \ arc.c \ arc_os.c \ blkptr.c \ diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c new file mode 100644 index 000000000000..f438841cd411 --- /dev/null +++ b/module/os/freebsd/zfs/abd_os.c @@ -0,0 +1,433 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +/* + * See abd.c for a general overview of the arc buffered data (ABD). + * + * Using a large proportion of scattered ABDs decreases ARC fragmentation since + * when we are at the limit of allocatable space, using equal-size chunks will + * allow us to quickly reclaim enough space for a new large allocation (assuming + * it is also scattered). + * + * ABDs are allocated scattered by default unless the caller uses + * abd_alloc_linear() or zfs_abd_scatter_enabled is disabled. + */ + +#include +#include +#include +#include +#include + +typedef struct abd_stats { + kstat_named_t abdstat_struct_size; + kstat_named_t abdstat_scatter_cnt; + kstat_named_t abdstat_scatter_data_size; + kstat_named_t abdstat_scatter_chunk_waste; + kstat_named_t abdstat_linear_cnt; + kstat_named_t abdstat_linear_data_size; +} abd_stats_t; + +static abd_stats_t abd_stats = { + /* Amount of memory occupied by all of the abd_t struct allocations */ + { "struct_size", KSTAT_DATA_UINT64 }, + /* + * The number of scatter ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset()). + */ + { "scatter_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ + { "scatter_data_size", KSTAT_DATA_UINT64 }, + /* + * The amount of space wasted at the end of the last chunk across all + * scatter ABDs tracked by scatter_cnt. + */ + { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, + /* + * The number of linear ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset() and abd_get_from_buf()). If an + * ABD takes ownership of its buf then it will become tracked. + */ + { "linear_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all linear ABDs tracked by linear_cnt */ + { "linear_data_size", KSTAT_DATA_UINT64 }, +}; + +/* + * The size of the chunks ABD allocates. Because the sizes allocated from the + * kmem_cache can't change, this tunable can only be modified at boot. Changing + * it at runtime would cause ABD iteration to work incorrectly for ABDs which + * were allocated with the old size, so a safeguard has been put in place which + * will cause the machine to panic if you change it and try to access the data + * within a scattered ABD. 
+ */ +size_t zfs_abd_chunk_size = 4096; + +#if defined(_KERNEL) +SYSCTL_DECL(_vfs_zfs); + +SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN, + &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers"); +SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN, + &zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates"); +#endif + +kmem_cache_t *abd_chunk_cache; +static kstat_t *abd_ksp; + +static void +abd_free_chunk(void *c) +{ + kmem_cache_free(abd_chunk_cache, c); +} + +static size_t +abd_chunkcnt_for_bytes(size_t size) +{ + return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size); +} + +static inline size_t +abd_scatter_chunkcnt(abd_t *abd) +{ + ASSERT(!abd_is_linear(abd)); + return (abd_chunkcnt_for_bytes( + ABD_SCATTER(abd).abd_offset + abd->abd_size)); +} + +boolean_t +abd_size_alloc_linear(size_t size) +{ + return (size <= zfs_abd_chunk_size ? B_TRUE : B_FALSE); +} + +void +abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) +{ + size_t n = abd_scatter_chunkcnt(abd); + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); + if (op == ABDSTAT_INCR) { + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + n * zfs_abd_chunk_size - abd->abd_size); + } else { + ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + abd->abd_size - n * zfs_abd_chunk_size); + } +} + +void +abd_update_linear_stats(abd_t *abd, abd_stats_op_t op) +{ + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); + if (op == ABDSTAT_INCR) { + ABDSTAT_BUMP(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); + } else { + ABDSTAT_BUMPDOWN(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); + } +} + +void +abd_verify_scatter(abd_t *abd) +{ + /* + * There is no scatter linear pages in FreeBSD so there is an + * if an error if the ABD has been marked as a linear page. + */ + VERIFY(!abd_is_linear_page(abd)); + ASSERT3U(ABD_SCATTER(abd).abd_offset, <, + zfs_abd_chunk_size); + size_t n = abd_scatter_chunkcnt(abd); + for (int i = 0; i < n; i++) { + ASSERT3P( + ABD_SCATTER(abd).abd_chunks[i], !=, NULL); + } +} + +void +abd_alloc_chunks(abd_t *abd, size_t size) +{ + size_t n = abd_chunkcnt_for_bytes(size); + for (int i = 0; i < n; i++) { + void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); + ASSERT3P(c, !=, NULL); + ABD_SCATTER(abd).abd_chunks[i] = c; + } + ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size; +} + +void +abd_free_chunks(abd_t *abd) +{ + size_t n = abd_scatter_chunkcnt(abd); + for (int i = 0; i < n; i++) { + abd_free_chunk(ABD_SCATTER(abd).abd_chunks[i]); + } +} + +abd_t * +abd_alloc_struct(size_t size) +{ + size_t chunkcnt = abd_chunkcnt_for_bytes(size); + size_t abd_size = offsetof(abd_t, + abd_u.abd_scatter.abd_chunks[chunkcnt]); + abd_t *abd = kmem_alloc(abd_size, KM_PUSHPAGE); + ASSERT3P(abd, !=, NULL); + ABDSTAT_INCR(abdstat_struct_size, abd_size); + + return (abd); +} + +void +abd_free_struct(abd_t *abd) +{ + size_t chunkcnt = abd_is_linear(abd) ? 
0 : abd_scatter_chunkcnt(abd); + int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); + kmem_free(abd, size); + ABDSTAT_INCR(abdstat_struct_size, -size); +} + +void +abd_init(void) +{ + abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0, + NULL, NULL, NULL, NULL, 0, KMC_NOTOUCH | KMC_NODEBUG); + + abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, + sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (abd_ksp != NULL) { + abd_ksp->ks_data = &abd_stats; + kstat_install(abd_ksp); + } +} + +void +abd_fini(void) +{ + if (abd_ksp != NULL) { + kstat_delete(abd_ksp); + abd_ksp = NULL; + } + + kmem_cache_destroy(abd_chunk_cache); + abd_chunk_cache = NULL; +} + +void +abd_free_linear_page(abd_t *abd) +{ + /* + * FreeBSD does not have have scatter linear pages + * so there is an error. + */ + VERIFY(0); +} + +/* + * If we're going to use this ABD for doing I/O using the block layer, the + * consumer of the ABD data doesn't care if it's scattered or not, and we don't + * plan to store this ABD in memory for a long period of time, we should + * allocate the ABD type that requires the least data copying to do the I/O. + * + * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os + * using a scatter/gather list we should switch to that and replace this call + * with vanilla abd_alloc(). + */ +abd_t * +abd_alloc_for_io(size_t size, boolean_t is_metadata) +{ + return (abd_alloc_linear(size, is_metadata)); +} + +/* + * This is just a helper function to abd_get_offset_scatter() to alloc a + * scatter ABD using the calculated chunkcnt based on the offset within the + * parent ABD. + */ +static abd_t * +abd_alloc_scatter_offset_chunkcnt(size_t chunkcnt) +{ + size_t abd_size = offsetof(abd_t, + abd_u.abd_scatter.abd_chunks[chunkcnt]); + abd_t *abd = kmem_alloc(abd_size, KM_PUSHPAGE); + ASSERT3P(abd, !=, NULL); + ABDSTAT_INCR(abdstat_struct_size, abd_size); + + return (abd); +} + + +abd_t * +abd_get_offset_scatter(abd_t *sabd, size_t off) +{ + abd_t *abd = NULL; + + abd_verify(sabd); + ASSERT3U(off, <=, sabd->abd_size); + + size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; + size_t chunkcnt = abd_scatter_chunkcnt(sabd) - + (new_offset / zfs_abd_chunk_size); + + abd = abd_alloc_scatter_offset_chunkcnt(chunkcnt); + + /* + * Even if this buf is filesystem metadata, we only track that + * if we own the underlying data buffer, which is not true in + * this case. Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = 0; + + ABD_SCATTER(abd).abd_offset = new_offset % zfs_abd_chunk_size; + ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size; + + /* Copy the scatterlist starting at the correct offset */ + (void) memcpy(&ABD_SCATTER(abd).abd_chunks, + &ABD_SCATTER(sabd).abd_chunks[new_offset / + zfs_abd_chunk_size], + chunkcnt * sizeof (void *)); + + return (abd); +} + +static inline size_t +abd_iter_scatter_chunk_offset(struct abd_iter *aiter) +{ + ASSERT(!abd_is_linear(aiter->iter_abd)); + return ((ABD_SCATTER(aiter->iter_abd).abd_offset + + aiter->iter_pos) % zfs_abd_chunk_size); +} + +static inline size_t +abd_iter_scatter_chunk_index(struct abd_iter *aiter) +{ + ASSERT(!abd_is_linear(aiter->iter_abd)); + return ((ABD_SCATTER(aiter->iter_abd).abd_offset + + aiter->iter_pos) / zfs_abd_chunk_size); +} + +/* + * Initialize the abd_iter. 
diff --git a/module/os/linux/zfs/Makefile.in b/module/os/linux/zfs/Makefile.in
index 8c11a1ee6f58..cb4edbbc1a33 100644
--- a/module/os/linux/zfs/Makefile.in
+++ b/module/os/linux/zfs/Makefile.in
@@ -7,7 +7,7 @@ ccflags-$(CONFIG_SPARC64) += -Wno-unused-value
 ccflags-y += -I@abs_top_srcdir@/module/os/linux/zfs
-$(MODULE)-objs += ../os/linux/zfs/abd.o
+$(MODULE)-objs += ../os/linux/zfs/abd_os.o
 $(MODULE)-objs += ../os/linux/zfs/arc_os.o
 $(MODULE)-objs += ../os/linux/zfs/mmp_os.o
 $(MODULE)-objs += ../os/linux/zfs/policy.o
diff --git a/module/os/linux/zfs/abd.c b/module/os/linux/zfs/abd.c
deleted file mode 100644
index bc6f81000d48..000000000000
--- a/module/os/linux/zfs/abd.c
+++ /dev/null
@@ -1,1616 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright (c) 2014 by Chunwei Chen. All rights reserved. - * Copyright (c) 2019 by Delphix. All rights reserved. - */ - -/* - * ARC buffer data (ABD). - * - * ABDs are an abstract data structure for the ARC which can use two - * different ways of storing the underlying data: - * - * (a) Linear buffer. In this case, all the data in the ABD is stored in one - * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache). - * - * +-------------------+ - * | ABD (linear) | - * | abd_flags = ... | - * | abd_size = ... | +--------------------------------+ - * | abd_buf ------------->| raw buffer of size abd_size | - * +-------------------+ +--------------------------------+ - * no abd_chunks - * - * (b) Scattered buffer. In this case, the data in the ABD is split into - * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers - * to the chunks recorded in an array at the end of the ABD structure. - * - * +-------------------+ - * | ABD (scattered) | - * | abd_flags = ... | - * | abd_size = ... | - * | abd_offset = 0 | +-----------+ - * | abd_chunks[0] ----------------------------->| chunk 0 | - * | abd_chunks[1] ---------------------+ +-----------+ - * | ... | | +-----------+ - * | abd_chunks[N-1] ---------+ +------->| chunk 1 | - * +-------------------+ | +-----------+ - * | ... - * | +-----------+ - * +----------------->| chunk N-1 | - * +-----------+ - * - * Linear buffers act exactly like normal buffers and are always mapped into the - * kernel's virtual memory space, while scattered ABD data chunks are allocated - * as physical pages and then mapped in only while they are actually being - * accessed through one of the abd_* library functions. Using scattered ABDs - * provides several benefits: - * - * (1) They avoid use of kmem_*, preventing performance problems where running - * kmem_reap on very large memory systems never finishes and causes - * constant TLB shootdowns. - * - * (2) Fragmentation is less of an issue since when we are at the limit of - * allocatable space, we won't have to search around for a long free - * hole in the VA space for large ARC allocations. Each chunk is mapped in - * individually, so even if we are using HIGHMEM (see next point) we - * wouldn't need to worry about finding a contiguous address range. - * - * (3) If we are not using HIGHMEM, then all physical memory is always - * mapped into the kernel's address space, so we also avoid the map / - * unmap costs on each ABD access. - * - * If we are not using HIGHMEM, scattered buffers which have only one chunk - * can be treated as linear buffers, because they are contiguous in the - * kernel's virtual address space. See abd_alloc_pages() for details. - * - * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to - * B_FALSE. 
- * - * In addition to directly allocating a linear or scattered ABD, it is also - * possible to create an ABD by requesting the "sub-ABD" starting at an offset - * within an existing ABD. In linear buffers this is simple (set abd_buf of - * the new ABD to the starting point within the original raw buffer), but - * scattered ABDs are a little more complex. The new ABD makes a copy of the - * relevant abd_chunks pointers (but not the underlying data). However, to - * provide arbitrary rather than only chunk-aligned starting offsets, it also - * tracks an abd_offset field which represents the starting point of the data - * within the first chunk in abd_chunks. For both linear and scattered ABDs, - * creating an offset ABD marks the original ABD as the offset's parent, and the - * original ABD's abd_children refcount is incremented. This data allows us to - * ensure the root ABD isn't deleted before its children. - * - * Most consumers should never need to know what type of ABD they're using -- - * the ABD public API ensures that it's possible to transparently switch from - * using a linear ABD to a scattered one when doing so would be beneficial. - * - * If you need to use the data within an ABD directly, if you know it's linear - * (because you allocated it) you can use abd_to_buf() to access the underlying - * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions - * which will allocate a raw buffer if necessary. Use the abd_return_buf* - * functions to return any raw buffers that are no longer necessary when you're - * done using them. - * - * There are a variety of ABD APIs that implement basic buffer operations: - * compare, copy, read, write, and fill with zeroes. If you need a custom - * function which progressively accesses the whole ABD, use the abd_iterate_* - * functions. - */ - -#include -#include -#include -#include -#include -#ifdef _KERNEL -#include -#include -#else -#define MAX_ORDER 1 -#endif - -typedef struct abd_stats { - kstat_named_t abdstat_struct_size; - kstat_named_t abdstat_linear_cnt; - kstat_named_t abdstat_linear_data_size; - kstat_named_t abdstat_scatter_cnt; - kstat_named_t abdstat_scatter_data_size; - kstat_named_t abdstat_scatter_chunk_waste; - kstat_named_t abdstat_scatter_orders[MAX_ORDER]; - kstat_named_t abdstat_scatter_page_multi_chunk; - kstat_named_t abdstat_scatter_page_multi_zone; - kstat_named_t abdstat_scatter_page_alloc_retry; - kstat_named_t abdstat_scatter_sg_table_retry; -} abd_stats_t; - -static abd_stats_t abd_stats = { - /* Amount of memory occupied by all of the abd_t struct allocations */ - { "struct_size", KSTAT_DATA_UINT64 }, - /* - * The number of linear ABDs which are currently allocated, excluding - * ABDs which don't own their data (for instance the ones which were - * allocated through abd_get_offset() and abd_get_from_buf()). If an - * ABD takes ownership of its buf then it will become tracked. - */ - { "linear_cnt", KSTAT_DATA_UINT64 }, - /* Amount of data stored in all linear ABDs tracked by linear_cnt */ - { "linear_data_size", KSTAT_DATA_UINT64 }, - /* - * The number of scatter ABDs which are currently allocated, excluding - * ABDs which don't own their data (for instance the ones which were - * allocated through abd_get_offset()). - */ - { "scatter_cnt", KSTAT_DATA_UINT64 }, - /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ - { "scatter_data_size", KSTAT_DATA_UINT64 }, - /* - * The amount of space wasted at the end of the last chunk across all - * scatter ABDs tracked by scatter_cnt. 
- */ - { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, - /* - * The number of compound allocations of a given order. These - * allocations are spread over all currently allocated ABDs, and - * act as a measure of memory fragmentation. - */ - { { "scatter_order_N", KSTAT_DATA_UINT64 } }, - /* - * The number of scatter ABDs which contain multiple chunks. - * ABDs are preferentially allocated from the minimum number of - * contiguous multi-page chunks, a single chunk is optimal. - */ - { "scatter_page_multi_chunk", KSTAT_DATA_UINT64 }, - /* - * The number of scatter ABDs which are split across memory zones. - * ABDs are preferentially allocated using pages from a single zone. - */ - { "scatter_page_multi_zone", KSTAT_DATA_UINT64 }, - /* - * The total number of retries encountered when attempting to - * allocate the pages to populate the scatter ABD. - */ - { "scatter_page_alloc_retry", KSTAT_DATA_UINT64 }, - /* - * The total number of retries encountered when attempting to - * allocate the sg table for an ABD. - */ - { "scatter_sg_table_retry", KSTAT_DATA_UINT64 }, -}; - -#define ABDSTAT(stat) (abd_stats.stat.value.ui64) -#define ABDSTAT_INCR(stat, val) \ - atomic_add_64(&abd_stats.stat.value.ui64, (val)) -#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) -#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) - -#define ABD_SCATTER(abd) (abd->abd_u.abd_scatter) -#define ABD_BUF(abd) (abd->abd_u.abd_linear.abd_buf) -#define abd_for_each_sg(abd, sg, n, i) \ - for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i) - -/* see block comment above for description */ -int zfs_abd_scatter_enabled = B_TRUE; -unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1; - -/* - * zfs_abd_scatter_min_size is the minimum allocation size to use scatter - * ABD's. Smaller allocations will use linear ABD's which uses - * zio_[data_]buf_alloc(). - * - * Scatter ABD's use at least one page each, so sub-page allocations waste - * some space when allocated as scatter (e.g. 2KB scatter allocation wastes - * half of each page). Using linear ABD's for small allocations means that - * they will be put on slabs which contain many allocations. This can - * improve memory efficiency, but it also makes it much harder for ARC - * evictions to actually free pages, because all the buffers on one slab need - * to be freed in order for the slab (and underlying pages) to be freed. - * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's - * possible for them to actually waste more memory than scatter (one page per - * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th). - * - * Spill blocks are typically 512B and are heavily used on systems running - * selinux with the default dnode size and the `xattr=sa` property set. - * - * By default we use linear allocations for 512B and 1KB, and scatter - * allocations for larger (1.5KB and up). 
- */ -int zfs_abd_scatter_min_size = 512 * 3; - -static kmem_cache_t *abd_cache = NULL; -static kstat_t *abd_ksp; - -static inline size_t -abd_chunkcnt_for_bytes(size_t size) -{ - return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE); -} - -#ifdef _KERNEL -/* - * Mark zfs data pages so they can be excluded from kernel crash dumps - */ -#ifdef _LP64 -#define ABD_FILE_CACHE_PAGE 0x2F5ABDF11ECAC4E - -static inline void -abd_mark_zfs_page(struct page *page) -{ - get_page(page); - SetPagePrivate(page); - set_page_private(page, ABD_FILE_CACHE_PAGE); -} - -static inline void -abd_unmark_zfs_page(struct page *page) -{ - set_page_private(page, 0UL); - ClearPagePrivate(page); - put_page(page); -} -#else -#define abd_mark_zfs_page(page) -#define abd_unmark_zfs_page(page) -#endif /* _LP64 */ - -#ifndef CONFIG_HIGHMEM - -#ifndef __GFP_RECLAIM -#define __GFP_RECLAIM __GFP_WAIT -#endif - -/* - * The goal is to minimize fragmentation by preferentially populating ABDs - * with higher order compound pages from a single zone. Allocation size is - * progressively decreased until it can be satisfied without performing - * reclaim or compaction. When necessary this function will degenerate to - * allocating individual pages and allowing reclaim to satisfy allocations. - */ -static void -abd_alloc_pages(abd_t *abd, size_t size) -{ - struct list_head pages; - struct sg_table table; - struct scatterlist *sg; - struct page *page, *tmp_page = NULL; - gfp_t gfp = __GFP_NOWARN | GFP_NOIO; - gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM; - int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1); - int nr_pages = abd_chunkcnt_for_bytes(size); - int chunks = 0, zones = 0; - size_t remaining_size; - int nid = NUMA_NO_NODE; - int alloc_pages = 0; - - INIT_LIST_HEAD(&pages); - - while (alloc_pages < nr_pages) { - unsigned chunk_pages; - int order; - - order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order); - chunk_pages = (1U << order); - - page = alloc_pages_node(nid, order ? gfp_comp : gfp, order); - if (page == NULL) { - if (order == 0) { - ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); - schedule_timeout_interruptible(1); - } else { - max_order = MAX(0, order - 1); - } - continue; - } - - list_add_tail(&page->lru, &pages); - - if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid)) - zones++; - - nid = page_to_nid(page); - ABDSTAT_BUMP(abdstat_scatter_orders[order]); - chunks++; - alloc_pages += chunk_pages; - } - - ASSERT3S(alloc_pages, ==, nr_pages); - - while (sg_alloc_table(&table, chunks, gfp)) { - ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); - schedule_timeout_interruptible(1); - } - - sg = table.sgl; - remaining_size = size; - list_for_each_entry_safe(page, tmp_page, &pages, lru) { - size_t sg_size = MIN(PAGESIZE << compound_order(page), - remaining_size); - sg_set_page(sg, page, sg_size, 0); - abd_mark_zfs_page(page); - remaining_size -= sg_size; - - sg = sg_next(sg); - list_del(&page->lru); - } - - /* - * These conditions ensure that a possible transformation to a linear - * ABD would be valid. - */ - ASSERT(!PageHighMem(sg_page(table.sgl))); - ASSERT0(ABD_SCATTER(abd).abd_offset); - - if (table.nents == 1) { - /* - * Since there is only one entry, this ABD can be represented - * as a linear buffer. All single-page (4K) ABD's can be - * represented this way. Some multi-page ABD's can also be - * represented this way, if we were able to allocate a single - * "chunk" (higher-order "page" which represents a power-of-2 - * series of physically-contiguous pages). 
This is often the - * case for 2-page (8K) ABD's. - * - * Representing a single-entry scatter ABD as a linear ABD - * has the performance advantage of avoiding the copy (and - * allocation) in abd_borrow_buf_copy / abd_return_buf_copy. - * A performance increase of around 5% has been observed for - * ARC-cached reads (of small blocks which can take advantage - * of this). - * - * Note that this optimization is only possible because the - * pages are always mapped into the kernel's address space. - * This is not the case for highmem pages, so the - * optimization can not be made there. - */ - abd->abd_flags |= ABD_FLAG_LINEAR; - abd->abd_flags |= ABD_FLAG_LINEAR_PAGE; - abd->abd_u.abd_linear.abd_sgl = table.sgl; - abd->abd_u.abd_linear.abd_buf = - page_address(sg_page(table.sgl)); - } else if (table.nents > 1) { - ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); - abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; - - if (zones) { - ABDSTAT_BUMP(abdstat_scatter_page_multi_zone); - abd->abd_flags |= ABD_FLAG_MULTI_ZONE; - } - - ABD_SCATTER(abd).abd_sgl = table.sgl; - ABD_SCATTER(abd).abd_nents = table.nents; - } -} -#else -/* - * Allocate N individual pages to construct a scatter ABD. This function - * makes no attempt to request contiguous pages and requires the minimal - * number of kernel interfaces. It's designed for maximum compatibility. - */ -static void -abd_alloc_pages(abd_t *abd, size_t size) -{ - struct scatterlist *sg = NULL; - struct sg_table table; - struct page *page; - gfp_t gfp = __GFP_NOWARN | GFP_NOIO; - int nr_pages = abd_chunkcnt_for_bytes(size); - int i = 0; - - while (sg_alloc_table(&table, nr_pages, gfp)) { - ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); - schedule_timeout_interruptible(1); - } - - ASSERT3U(table.nents, ==, nr_pages); - ABD_SCATTER(abd).abd_sgl = table.sgl; - ABD_SCATTER(abd).abd_nents = nr_pages; - - abd_for_each_sg(abd, sg, nr_pages, i) { - while ((page = __page_cache_alloc(gfp)) == NULL) { - ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); - schedule_timeout_interruptible(1); - } - - ABDSTAT_BUMP(abdstat_scatter_orders[0]); - sg_set_page(sg, page, PAGESIZE, 0); - abd_mark_zfs_page(page); - } - - if (nr_pages > 1) { - ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); - abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; - } -} -#endif /* !CONFIG_HIGHMEM */ - -static void -abd_free_pages(abd_t *abd) -{ - struct scatterlist *sg = NULL; - struct sg_table table; - struct page *page; - int nr_pages = ABD_SCATTER(abd).abd_nents; - int order, i = 0; - - if (abd->abd_flags & ABD_FLAG_MULTI_ZONE) - ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone); - - if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK) - ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); - - abd_for_each_sg(abd, sg, nr_pages, i) { - page = sg_page(sg); - abd_unmark_zfs_page(page); - order = compound_order(page); - __free_pages(page, order); - ASSERT3U(sg->length, <=, PAGE_SIZE << order); - ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]); - } - - table.sgl = ABD_SCATTER(abd).abd_sgl; - table.nents = table.orig_nents = nr_pages; - sg_free_table(&table); -} - -#else /* _KERNEL */ - -#ifndef PAGE_SHIFT -#define PAGE_SHIFT (highbit64(PAGESIZE)-1) -#endif - -struct page; - -#define zfs_kmap_atomic(chunk, km) ((void *)chunk) -#define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0) -#define local_irq_save(flags) do { (void)(flags); } while (0) -#define local_irq_restore(flags) do { (void)(flags); } while (0) -#define nth_page(pg, i) \ - ((struct page *)((void *)(pg) + (i) * PAGESIZE)) - -struct scatterlist { - struct page 
*page; - int length; - int end; -}; - -static void -sg_init_table(struct scatterlist *sg, int nr) -{ - memset(sg, 0, nr * sizeof (struct scatterlist)); - sg[nr - 1].end = 1; -} - -#define for_each_sg(sgl, sg, nr, i) \ - for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg)) - -static inline void -sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, - unsigned int offset) -{ - /* currently we don't use offset */ - ASSERT(offset == 0); - sg->page = page; - sg->length = len; -} - -static inline struct page * -sg_page(struct scatterlist *sg) -{ - return (sg->page); -} - -static inline struct scatterlist * -sg_next(struct scatterlist *sg) -{ - if (sg->end) - return (NULL); - - return (sg + 1); -} - -static void -abd_alloc_pages(abd_t *abd, size_t size) -{ - unsigned nr_pages = abd_chunkcnt_for_bytes(size); - struct scatterlist *sg; - int i; - - ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages * - sizeof (struct scatterlist), KM_SLEEP); - sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages); - - abd_for_each_sg(abd, sg, nr_pages, i) { - struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP); - sg_set_page(sg, p, PAGESIZE, 0); - } - ABD_SCATTER(abd).abd_nents = nr_pages; -} - -static void -abd_free_pages(abd_t *abd) -{ - int i, n = ABD_SCATTER(abd).abd_nents; - struct scatterlist *sg; - - abd_for_each_sg(abd, sg, n, i) { - for (int j = 0; j < sg->length; j += PAGESIZE) { - struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT); - umem_free(p, PAGESIZE); - } - } - - vmem_free(ABD_SCATTER(abd).abd_sgl, n * sizeof (struct scatterlist)); -} - -#endif /* _KERNEL */ - -void -abd_init(void) -{ - int i; - - abd_cache = kmem_cache_create("abd_t", sizeof (abd_t), - 0, NULL, NULL, NULL, NULL, NULL, 0); - - abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, - sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); - if (abd_ksp != NULL) { - for (i = 0; i < MAX_ORDER; i++) { - snprintf(abd_stats.abdstat_scatter_orders[i].name, - KSTAT_STRLEN, "scatter_order_%d", i); - abd_stats.abdstat_scatter_orders[i].data_type = - KSTAT_DATA_UINT64; - } - abd_ksp->ks_data = &abd_stats; - kstat_install(abd_ksp); - } -} - -void -abd_fini(void) -{ - if (abd_ksp != NULL) { - kstat_delete(abd_ksp); - abd_ksp = NULL; - } - - if (abd_cache) { - kmem_cache_destroy(abd_cache); - abd_cache = NULL; - } -} - -static inline void -abd_verify(abd_t *abd) -{ - ASSERT3U(abd->abd_size, >, 0); - ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); - ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | - ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | - ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE)); - IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); - IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); - if (abd_is_linear(abd)) { - ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL); - } else { - size_t n; - int i = 0; - struct scatterlist *sg = NULL; - - ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0); - ASSERT3U(ABD_SCATTER(abd).abd_offset, <, - ABD_SCATTER(abd).abd_sgl->length); - n = ABD_SCATTER(abd).abd_nents; - abd_for_each_sg(abd, sg, n, i) { - ASSERT3P(sg_page(sg), !=, NULL); - } - } -} - -static inline abd_t * -abd_alloc_struct(void) -{ - abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE); - - ASSERT3P(abd, !=, NULL); - ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t)); - - return (abd); -} - -static inline void -abd_free_struct(abd_t *abd) -{ - kmem_cache_free(abd_cache, abd); - ABDSTAT_INCR(abdstat_struct_size, 
-(int)sizeof (abd_t)); -} - -/* - * Allocate an ABD, along with its own underlying data buffers. Use this if you - * don't care whether the ABD is linear or not. - */ -abd_t * -abd_alloc(size_t size, boolean_t is_metadata) -{ - /* see the comment above zfs_abd_scatter_min_size */ - if (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size) - return (abd_alloc_linear(size, is_metadata)); - - VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - - abd_t *abd = abd_alloc_struct(); - abd->abd_flags = ABD_FLAG_OWNER; - abd->abd_u.abd_scatter.abd_offset = 0; - abd_alloc_pages(abd, size); - - if (is_metadata) { - abd->abd_flags |= ABD_FLAG_META; - } - abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); - - ABDSTAT_BUMP(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, size); - ABDSTAT_INCR(abdstat_scatter_chunk_waste, - P2ROUNDUP(size, PAGESIZE) - size); - - return (abd); -} - -static void -abd_free_scatter(abd_t *abd) -{ - abd_free_pages(abd); - - zfs_refcount_destroy(&abd->abd_children); - ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); - ABDSTAT_INCR(abdstat_scatter_chunk_waste, - (int)abd->abd_size - (int)P2ROUNDUP(abd->abd_size, PAGESIZE)); - - abd_free_struct(abd); -} - -/* - * Allocate an ABD that must be linear, along with its own underlying data - * buffer. Only use this when it would be very annoying to write your ABD - * consumer with a scattered ABD. - */ -abd_t * -abd_alloc_linear(size_t size, boolean_t is_metadata) -{ - abd_t *abd = abd_alloc_struct(); - - VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - - abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER; - if (is_metadata) { - abd->abd_flags |= ABD_FLAG_META; - } - abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); - - if (is_metadata) { - abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size); - } else { - abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size); - } - - ABDSTAT_BUMP(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, size); - - return (abd); -} - -static void -abd_free_linear(abd_t *abd) -{ - if (abd_is_linear_page(abd)) { - /* Transform it back into a scatter ABD for freeing */ - struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl; - abd->abd_flags &= ~ABD_FLAG_LINEAR; - abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE; - ABD_SCATTER(abd).abd_nents = 1; - ABD_SCATTER(abd).abd_offset = 0; - ABD_SCATTER(abd).abd_sgl = sg; - abd_free_scatter(abd); - return; - } - if (abd->abd_flags & ABD_FLAG_META) { - zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); - } else { - zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); - } - - zfs_refcount_destroy(&abd->abd_children); - ABDSTAT_BUMPDOWN(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); - - abd_free_struct(abd); -} - -/* - * Free an ABD. Only use this on ABDs allocated with abd_alloc() or - * abd_alloc_linear(). - */ -void -abd_free(abd_t *abd) -{ - abd_verify(abd); - ASSERT3P(abd->abd_parent, ==, NULL); - ASSERT(abd->abd_flags & ABD_FLAG_OWNER); - if (abd_is_linear(abd)) - abd_free_linear(abd); - else - abd_free_scatter(abd); -} - -/* - * Allocate an ABD of the same format (same metadata flag, same scatterize - * setting) as another ABD. 
- */ -abd_t * -abd_alloc_sametype(abd_t *sabd, size_t size) -{ - boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0; - if (abd_is_linear(sabd) && - !abd_is_linear_page(sabd)) { - return (abd_alloc_linear(size, is_metadata)); - } else { - return (abd_alloc(size, is_metadata)); - } -} - -/* - * If we're going to use this ABD for doing I/O using the block layer, the - * consumer of the ABD data doesn't care if it's scattered or not, and we don't - * plan to store this ABD in memory for a long period of time, we should - * allocate the ABD type that requires the least data copying to do the I/O. - * - * On Illumos this is linear ABDs, however if ldi_strategy() can ever issue I/Os - * using a scatter/gather list we should switch to that and replace this call - * with vanilla abd_alloc(). - * - * On Linux the optimal thing to do would be to use abd_get_offset() and - * construct a new ABD which shares the original pages thereby eliminating - * the copy. But for the moment a new linear ABD is allocated until this - * performance optimization can be implemented. - */ -abd_t * -abd_alloc_for_io(size_t size, boolean_t is_metadata) -{ - return (abd_alloc(size, is_metadata)); -} - -/* - * Allocate a new ABD to point to offset off of sabd. It shares the underlying - * buffer data with sabd. Use abd_put() to free. sabd must not be freed while - * any derived ABDs exist. - */ -static inline abd_t * -abd_get_offset_impl(abd_t *sabd, size_t off, size_t size) -{ - abd_t *abd; - - abd_verify(sabd); - ASSERT3U(off, <=, sabd->abd_size); - - if (abd_is_linear(sabd)) { - abd = abd_alloc_struct(); - - /* - * Even if this buf is filesystem metadata, we only track that - * if we own the underlying data buffer, which is not true in - * this case. Therefore, we don't ever use ABD_FLAG_META here. - */ - abd->abd_flags = ABD_FLAG_LINEAR; - - abd->abd_u.abd_linear.abd_buf = - (char *)sabd->abd_u.abd_linear.abd_buf + off; - } else { - int i = 0; - struct scatterlist *sg = NULL; - size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off; - - abd = abd_alloc_struct(); - - /* - * Even if this buf is filesystem metadata, we only track that - * if we own the underlying data buffer, which is not true in - * this case. Therefore, we don't ever use ABD_FLAG_META here. - */ - abd->abd_flags = 0; - - abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) { - if (new_offset < sg->length) - break; - new_offset -= sg->length; - } - - ABD_SCATTER(abd).abd_sgl = sg; - ABD_SCATTER(abd).abd_offset = new_offset; - ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i; - } - - abd->abd_size = size; - abd->abd_parent = sabd; - zfs_refcount_create(&abd->abd_children); - (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd); - - return (abd); -} - -abd_t * -abd_get_offset(abd_t *sabd, size_t off) -{ - size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0; - - VERIFY3U(size, >, 0); - - return (abd_get_offset_impl(sabd, off, size)); -} - -abd_t * -abd_get_offset_size(abd_t *sabd, size_t off, size_t size) -{ - ASSERT3U(off + size, <=, sabd->abd_size); - - return (abd_get_offset_impl(sabd, off, size)); -} - -/* - * Allocate a linear ABD structure for buf. You must free this with abd_put() - * since the resulting ABD doesn't own its own buffer. 
- */ -abd_t * -abd_get_from_buf(void *buf, size_t size) -{ - abd_t *abd = abd_alloc_struct(); - - VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - - /* - * Even if this buf is filesystem metadata, we only track that if we - * own the underlying data buffer, which is not true in this case. - * Therefore, we don't ever use ABD_FLAG_META here. - */ - abd->abd_flags = ABD_FLAG_LINEAR; - abd->abd_size = size; - abd->abd_parent = NULL; - zfs_refcount_create(&abd->abd_children); - - abd->abd_u.abd_linear.abd_buf = buf; - - return (abd); -} - -/* - * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not - * free the underlying scatterlist or buffer. - */ -void -abd_put(abd_t *abd) -{ - abd_verify(abd); - ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); - - if (abd->abd_parent != NULL) { - (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children, - abd->abd_size, abd); - } - - zfs_refcount_destroy(&abd->abd_children); - abd_free_struct(abd); -} - -/* - * Get the raw buffer associated with a linear ABD. - */ -void * -abd_to_buf(abd_t *abd) -{ - ASSERT(abd_is_linear(abd)); - abd_verify(abd); - return (abd->abd_u.abd_linear.abd_buf); -} - -/* - * Borrow a raw buffer from an ABD without copying the contents of the ABD - * into the buffer. If the ABD is scattered, this will allocate a raw buffer - * whose contents are undefined. To copy over the existing data in the ABD, use - * abd_borrow_buf_copy() instead. - */ -void * -abd_borrow_buf(abd_t *abd, size_t n) -{ - void *buf; - abd_verify(abd); - ASSERT3U(abd->abd_size, >=, n); - if (abd_is_linear(abd)) { - buf = abd_to_buf(abd); - } else { - buf = zio_buf_alloc(n); - } - (void) zfs_refcount_add_many(&abd->abd_children, n, buf); - - return (buf); -} - -void * -abd_borrow_buf_copy(abd_t *abd, size_t n) -{ - void *buf = abd_borrow_buf(abd, n); - if (!abd_is_linear(abd)) { - abd_copy_to_buf(buf, abd, n); - } - return (buf); -} - -/* - * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will - * not change the contents of the ABD and will ASSERT that you didn't modify - * the buffer since it was borrowed. If you want any changes you made to buf to - * be copied back to abd, use abd_return_buf_copy() instead. - */ -void -abd_return_buf(abd_t *abd, void *buf, size_t n) -{ - abd_verify(abd); - ASSERT3U(abd->abd_size, >=, n); - if (abd_is_linear(abd)) { - ASSERT3P(buf, ==, abd_to_buf(abd)); - } else { - ASSERT0(abd_cmp_buf(abd, buf, n)); - zio_buf_free(buf, n); - } - (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); -} - -void -abd_return_buf_copy(abd_t *abd, void *buf, size_t n) -{ - if (!abd_is_linear(abd)) { - abd_copy_from_buf(abd, buf, n); - } - abd_return_buf(abd, buf, n); -} - -/* - * Give this ABD ownership of the buffer that it's storing. Can only be used on - * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated - * with abd_alloc_linear() which subsequently released ownership of their buf - * with abd_release_ownership_of_buf(). - */ -void -abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata) -{ - ASSERT(abd_is_linear(abd)); - ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); - abd_verify(abd); - - abd->abd_flags |= ABD_FLAG_OWNER; - if (is_metadata) { - abd->abd_flags |= ABD_FLAG_META; - } - - ABDSTAT_BUMP(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); -} - -void -abd_release_ownership_of_buf(abd_t *abd) -{ - ASSERT(abd_is_linear(abd)); - ASSERT(abd->abd_flags & ABD_FLAG_OWNER); - - /* - * abd_free() needs to handle LINEAR_PAGE ABD's specially. 
- * Since that flag does not survive the - * abd_release_ownership_of_buf() -> abd_get_from_buf() -> - * abd_take_ownership_of_buf() sequence, we don't allow releasing - * these "linear but not zio_[data_]buf_alloc()'ed" ABD's. - */ - ASSERT(!abd_is_linear_page(abd)); - - abd_verify(abd); - - abd->abd_flags &= ~ABD_FLAG_OWNER; - /* Disable this flag since we no longer own the data buffer */ - abd->abd_flags &= ~ABD_FLAG_META; - - ABDSTAT_BUMPDOWN(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); -} - -struct abd_iter { - /* public interface */ - void *iter_mapaddr; /* addr corresponding to iter_pos */ - size_t iter_mapsize; /* length of data valid at mapaddr */ - - /* private */ - abd_t *iter_abd; /* ABD being iterated through */ - size_t iter_pos; - size_t iter_offset; /* offset in current sg/abd_buf, */ - /* abd_offset included */ - struct scatterlist *iter_sg; /* current sg */ -}; - -/* - * Initialize the abd_iter. - */ -static void -abd_iter_init(struct abd_iter *aiter, abd_t *abd, int km_type) -{ - abd_verify(abd); - aiter->iter_abd = abd; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; - aiter->iter_pos = 0; - if (abd_is_linear(abd)) { - aiter->iter_offset = 0; - aiter->iter_sg = NULL; - } else { - aiter->iter_offset = ABD_SCATTER(abd).abd_offset; - aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; - } -} - -/* - * Advance the iterator by a certain amount. Cannot be called when a chunk is - * in use. This can be safely called when the aiter has already exhausted, in - * which case this does nothing. - */ -static void -abd_iter_advance(struct abd_iter *aiter, size_t amount) -{ - ASSERT3P(aiter->iter_mapaddr, ==, NULL); - ASSERT0(aiter->iter_mapsize); - - /* There's nothing left to advance to, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - aiter->iter_pos += amount; - aiter->iter_offset += amount; - if (!abd_is_linear(aiter->iter_abd)) { - while (aiter->iter_offset >= aiter->iter_sg->length) { - aiter->iter_offset -= aiter->iter_sg->length; - aiter->iter_sg = sg_next(aiter->iter_sg); - if (aiter->iter_sg == NULL) { - ASSERT0(aiter->iter_offset); - break; - } - } - } -} - -/* - * Map the current chunk into aiter. This can be safely called when the aiter - * has already exhausted, in which case this does nothing. - */ -static void -abd_iter_map(struct abd_iter *aiter) -{ - void *paddr; - size_t offset = 0; - - ASSERT3P(aiter->iter_mapaddr, ==, NULL); - ASSERT0(aiter->iter_mapsize); - - /* There's nothing left to iterate over, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - if (abd_is_linear(aiter->iter_abd)) { - ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); - offset = aiter->iter_offset; - aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; - paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf; - } else { - offset = aiter->iter_offset; - aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset, - aiter->iter_abd->abd_size - aiter->iter_pos); - - paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg), - km_table[aiter->iter_km]); - } - - aiter->iter_mapaddr = (char *)paddr + offset; -} - -/* - * Unmap the current chunk from aiter. This can be safely called when the aiter - * has already exhausted, in which case this does nothing. 
- */ -static void -abd_iter_unmap(struct abd_iter *aiter) -{ - /* There's nothing left to unmap, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - if (!abd_is_linear(aiter->iter_abd)) { - /* LINTED E_FUNC_SET_NOT_USED */ - zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset, - km_table[aiter->iter_km]); - } - - ASSERT3P(aiter->iter_mapaddr, !=, NULL); - ASSERT3U(aiter->iter_mapsize, >, 0); - - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; -} - -int -abd_iterate_func(abd_t *abd, size_t off, size_t size, - abd_iter_func_t *func, void *private) -{ - int ret = 0; - struct abd_iter aiter; - - abd_verify(abd); - ASSERT3U(off + size, <=, abd->abd_size); - - abd_iter_init(&aiter, abd, 0); - abd_iter_advance(&aiter, off); - - while (size > 0) { - abd_iter_map(&aiter); - - size_t len = MIN(aiter.iter_mapsize, size); - ASSERT3U(len, >, 0); - - ret = func(aiter.iter_mapaddr, len, private); - - abd_iter_unmap(&aiter); - - if (ret != 0) - break; - - size -= len; - abd_iter_advance(&aiter, len); - } - - return (ret); -} - -struct buf_arg { - void *arg_buf; -}; - -static int -abd_copy_to_buf_off_cb(void *buf, size_t size, void *private) -{ - struct buf_arg *ba_ptr = private; - - (void) memcpy(ba_ptr->arg_buf, buf, size); - ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; - - return (0); -} - -/* - * Copy abd to buf. (off is the offset in abd.) - */ -void -abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size) -{ - struct buf_arg ba_ptr = { buf }; - - (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb, - &ba_ptr); -} - -static int -abd_cmp_buf_off_cb(void *buf, size_t size, void *private) -{ - int ret; - struct buf_arg *ba_ptr = private; - - ret = memcmp(buf, ba_ptr->arg_buf, size); - ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; - - return (ret); -} - -/* - * Compare the contents of abd to buf. (off is the offset in abd.) - */ -int -abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) -{ - struct buf_arg ba_ptr = { (void *) buf }; - - return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr)); -} - -static int -abd_copy_from_buf_off_cb(void *buf, size_t size, void *private) -{ - struct buf_arg *ba_ptr = private; - - (void) memcpy(buf, ba_ptr->arg_buf, size); - ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; - - return (0); -} - -/* - * Copy from buf to abd. (off is the offset in abd.) - */ -void -abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) -{ - struct buf_arg ba_ptr = { (void *) buf }; - - (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb, - &ba_ptr); -} - -/*ARGSUSED*/ -static int -abd_zero_off_cb(void *buf, size_t size, void *private) -{ - (void) memset(buf, 0, size); - return (0); -} - -/* - * Zero out the abd from a particular offset to the end. - */ -void -abd_zero_off(abd_t *abd, size_t off, size_t size) -{ - (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL); -} - -/* - * Iterate over two ABDs and call func incrementally on the two ABDs' data in - * equal-sized chunks (passed to func as raw buffers). func could be called many - * times during this iteration. 
- */ -int -abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, - size_t size, abd_iter_func2_t *func, void *private) -{ - int ret = 0; - struct abd_iter daiter, saiter; - - abd_verify(dabd); - abd_verify(sabd); - - ASSERT3U(doff + size, <=, dabd->abd_size); - ASSERT3U(soff + size, <=, sabd->abd_size); - - abd_iter_init(&daiter, dabd, 0); - abd_iter_init(&saiter, sabd, 1); - abd_iter_advance(&daiter, doff); - abd_iter_advance(&saiter, soff); - - while (size > 0) { - abd_iter_map(&daiter); - abd_iter_map(&saiter); - - size_t dlen = MIN(daiter.iter_mapsize, size); - size_t slen = MIN(saiter.iter_mapsize, size); - size_t len = MIN(dlen, slen); - ASSERT(dlen > 0 || slen > 0); - - ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len, - private); - - abd_iter_unmap(&saiter); - abd_iter_unmap(&daiter); - - if (ret != 0) - break; - - size -= len; - abd_iter_advance(&daiter, len); - abd_iter_advance(&saiter, len); - } - - return (ret); -} - -/*ARGSUSED*/ -static int -abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private) -{ - (void) memcpy(dbuf, sbuf, size); - return (0); -} - -/* - * Copy from sabd to dabd starting from soff and doff. - */ -void -abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size) -{ - (void) abd_iterate_func2(dabd, sabd, doff, soff, size, - abd_copy_off_cb, NULL); -} - -/*ARGSUSED*/ -static int -abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private) -{ - return (memcmp(bufa, bufb, size)); -} - -/* - * Compares the contents of two ABDs. - */ -int -abd_cmp(abd_t *dabd, abd_t *sabd) -{ - ASSERT3U(dabd->abd_size, ==, sabd->abd_size); - return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size, - abd_cmp_cb, NULL)); -} - -/* - * Iterate over code ABDs and a data ABD and call @func_raidz_gen. - * - * @cabds parity ABDs, must have equal size - * @dabd data ABD. Can be NULL (in this case @dsize = 0) - * @func_raidz_gen should be implemented so that its behaviour - * is the same when taking linear and when taking scatter - */ -void -abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, - ssize_t csize, ssize_t dsize, const unsigned parity, - void (*func_raidz_gen)(void **, const void *, size_t, size_t)) -{ - int i; - ssize_t len, dlen; - struct abd_iter caiters[3]; - struct abd_iter daiter = {0}; - void *caddrs[3]; - unsigned long flags; - - ASSERT3U(parity, <=, 3); - - for (i = 0; i < parity; i++) - abd_iter_init(&caiters[i], cabds[i], i); - - if (dabd) - abd_iter_init(&daiter, dabd, i); - - ASSERT3S(dsize, >=, 0); - - local_irq_save(flags); - while (csize > 0) { - len = csize; - - if (dabd && dsize > 0) - abd_iter_map(&daiter); - - for (i = 0; i < parity; i++) { - abd_iter_map(&caiters[i]); - caddrs[i] = caiters[i].iter_mapaddr; - } - - switch (parity) { - case 3: - len = MIN(caiters[2].iter_mapsize, len); - /* falls through */ - case 2: - len = MIN(caiters[1].iter_mapsize, len); - /* falls through */ - case 1: - len = MIN(caiters[0].iter_mapsize, len); - } - - /* must be progressive */ - ASSERT3S(len, >, 0); - - if (dabd && dsize > 0) { - /* this needs precise iter.length */ - len = MIN(daiter.iter_mapsize, len); - dlen = len; - } else - dlen = 0; - - /* must be progressive */ - ASSERT3S(len, >, 0); - /* - * The iterated function likely will not do well if each - * segment except the last one is not multiple of 512 (raidz). 
- */ - ASSERT3U(((uint64_t)len & 511ULL), ==, 0); - - func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen); - - for (i = parity-1; i >= 0; i--) { - abd_iter_unmap(&caiters[i]); - abd_iter_advance(&caiters[i], len); - } - - if (dabd && dsize > 0) { - abd_iter_unmap(&daiter); - abd_iter_advance(&daiter, dlen); - dsize -= dlen; - } - - csize -= len; - - ASSERT3S(dsize, >=, 0); - ASSERT3S(csize, >=, 0); - } - local_irq_restore(flags); -} - -/* - * Iterate over code ABDs and data reconstruction target ABDs and call - * @func_raidz_rec. Function maps at most 6 pages atomically. - * - * @cabds parity ABDs, must have equal size - * @tabds rec target ABDs, at most 3 - * @tsize size of data target columns - * @func_raidz_rec expects syndrome data in target columns. Function - * reconstructs data and overwrites target columns. - */ -void -abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, - ssize_t tsize, const unsigned parity, - void (*func_raidz_rec)(void **t, const size_t tsize, void **c, - const unsigned *mul), - const unsigned *mul) -{ - int i; - ssize_t len; - struct abd_iter citers[3]; - struct abd_iter xiters[3]; - void *caddrs[3], *xaddrs[3]; - unsigned long flags; - - ASSERT3U(parity, <=, 3); - - for (i = 0; i < parity; i++) { - abd_iter_init(&citers[i], cabds[i], 2*i); - abd_iter_init(&xiters[i], tabds[i], 2*i+1); - } - - local_irq_save(flags); - while (tsize > 0) { - - for (i = 0; i < parity; i++) { - abd_iter_map(&citers[i]); - abd_iter_map(&xiters[i]); - caddrs[i] = citers[i].iter_mapaddr; - xaddrs[i] = xiters[i].iter_mapaddr; - } - - len = tsize; - switch (parity) { - case 3: - len = MIN(xiters[2].iter_mapsize, len); - len = MIN(citers[2].iter_mapsize, len); - /* falls through */ - case 2: - len = MIN(xiters[1].iter_mapsize, len); - len = MIN(citers[1].iter_mapsize, len); - /* falls through */ - case 1: - len = MIN(xiters[0].iter_mapsize, len); - len = MIN(citers[0].iter_mapsize, len); - } - /* must be progressive */ - ASSERT3S(len, >, 0); - /* - * The iterated function likely will not do well if each - * segment except the last one is not multiple of 512 (raidz). - */ - ASSERT3U(((uint64_t)len & 511ULL), ==, 0); - - func_raidz_rec(xaddrs, len, caddrs, mul); - - for (i = parity-1; i >= 0; i--) { - abd_iter_unmap(&xiters[i]); - abd_iter_unmap(&citers[i]); - abd_iter_advance(&xiters[i], len); - abd_iter_advance(&citers[i], len); - } - - tsize -= len; - ASSERT3S(tsize, >=, 0); - } - local_irq_restore(flags); -} - -#if defined(_KERNEL) -/* - * bio_nr_pages for ABD. - * @off is the offset in @abd - */ -unsigned long -abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off) -{ - unsigned long pos; - - if (abd_is_linear(abd)) - pos = (unsigned long)abd_to_buf(abd) + off; - else - pos = abd->abd_u.abd_scatter.abd_offset + off; - - return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) - - (pos >> PAGE_SHIFT); -} - -/* - * bio_map for scatter ABD. 
- * @off is the offset in @abd
- * Remaining IO size is returned
- */
-unsigned int
-abd_scatter_bio_map_off(struct bio *bio, abd_t *abd,
-    unsigned int io_size, size_t off)
-{
-	int i;
-	struct abd_iter aiter;
-
-	ASSERT(!abd_is_linear(abd));
-	ASSERT3U(io_size, <=, abd->abd_size - off);
-
-	abd_iter_init(&aiter, abd, 0);
-	abd_iter_advance(&aiter, off);
-
-	for (i = 0; i < bio->bi_max_vecs; i++) {
-		struct page *pg;
-		size_t len, sgoff, pgoff;
-		struct scatterlist *sg;
-
-		if (io_size <= 0)
-			break;
-
-		sg = aiter.iter_sg;
-		sgoff = aiter.iter_offset;
-		pgoff = sgoff & (PAGESIZE - 1);
-		len = MIN(io_size, PAGESIZE - pgoff);
-		ASSERT(len > 0);
-
-		pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT);
-		if (bio_add_page(bio, pg, len, pgoff) != len)
-			break;
-
-		io_size -= len;
-		abd_iter_advance(&aiter, len);
-	}
-
-	return (io_size);
-}
-
-/* Tunable Parameters */
-module_param(zfs_abd_scatter_enabled, int, 0644);
-MODULE_PARM_DESC(zfs_abd_scatter_enabled,
-	"Toggle whether ABD allocations must be linear.");
-module_param(zfs_abd_scatter_min_size, int, 0644);
-MODULE_PARM_DESC(zfs_abd_scatter_min_size,
-	"Minimum size of scatter allocations.");
-/* CSTYLED */
-module_param(zfs_abd_scatter_max_order, uint, 0644);
-MODULE_PARM_DESC(zfs_abd_scatter_max_order,
-	"Maximum order allocation used for a scatter ABD.");
-#endif
diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c
new file mode 100644
index 000000000000..57e415ef31ae
--- /dev/null
+++ b/module/os/linux/zfs/abd_os.c
@@ -0,0 +1,891 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2019 by Delphix. All rights reserved.
+ */
+
+/*
+ * See abd.c for a general overview of the ARC buffered data (ABD).
+ *
+ * Linear buffers act exactly like normal buffers and are always mapped into the
+ * kernel's virtual memory space, while scattered ABD data chunks are allocated
+ * as physical pages and then mapped in only while they are actually being
+ * accessed through one of the abd_* library functions. Using scattered ABDs
+ * provides several benefits:
+ *
+ * (1) They avoid use of kmem_*, preventing performance problems where running
+ *     kmem_reap on very large memory systems never finishes and causes
+ *     constant TLB shootdowns.
+ *
+ * (2) Fragmentation is less of an issue since, when we are at the limit of
+ *     allocatable space, we won't have to search around for a long free
+ *     hole in the VA space for large ARC allocations. Each chunk is mapped in
+ *     individually, so even if we are using HIGHMEM (see next point) we
+ *     wouldn't need to worry about finding a contiguous address range.
+ *
+ * (3) If we are not using HIGHMEM, then all physical memory is always
+ *     mapped into the kernel's address space, so we also avoid the map /
+ *     unmap costs on each ABD access.
+ *
+ * If we are not using HIGHMEM, scattered buffers which have only one chunk
+ * can be treated as linear buffers, because they are contiguous in the
+ * kernel's virtual address space. See abd_alloc_chunks() for details.
+ */
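Before diving into the implementation, it may help to recall how consumers see
ABDs through the portable API in abd.c. The snippet below is a hypothetical
consumer for illustration only, not part of this file; it uses abd_alloc(),
abd_borrow_buf_copy(), abd_return_buf_copy() and abd_free() as shown in the
common code above:

	/* Likely a scatter ABD at this size (see zfs_abd_scatter_min_size) */
	abd_t *abd = abd_alloc(16384, B_FALSE);

	/* Borrow a linear view; for scatter ABDs this allocates and copies */
	char *buf = abd_borrow_buf_copy(abd, 16384);
	/* ... read or modify buf ... */
	abd_return_buf_copy(abd, buf, 16384);	/* copy changes back */

	abd_free(abd);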
+
+#include <sys/abd_impl.h>
+#include <sys/param.h>
+#include <sys/zio.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_znode.h>
+#ifdef _KERNEL
+#include <linux/kmap_compat.h>
+#include <linux/scatterlist.h>
+#else
+#define	MAX_ORDER	1
+#endif
+
+typedef struct abd_stats {
+	kstat_named_t abdstat_struct_size;
+	kstat_named_t abdstat_linear_cnt;
+	kstat_named_t abdstat_linear_data_size;
+	kstat_named_t abdstat_scatter_cnt;
+	kstat_named_t abdstat_scatter_data_size;
+	kstat_named_t abdstat_scatter_chunk_waste;
+	kstat_named_t abdstat_scatter_orders[MAX_ORDER];
+	kstat_named_t abdstat_scatter_page_multi_chunk;
+	kstat_named_t abdstat_scatter_page_multi_zone;
+	kstat_named_t abdstat_scatter_page_alloc_retry;
+	kstat_named_t abdstat_scatter_sg_table_retry;
+} abd_stats_t;
+
+static abd_stats_t abd_stats = {
+	/* Amount of memory occupied by all of the abd_t struct allocations */
+	{ "struct_size",			KSTAT_DATA_UINT64 },
+	/*
+	 * The number of linear ABDs which are currently allocated, excluding
+	 * ABDs which don't own their data (for instance the ones which were
+	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
+	 * ABD takes ownership of its buf then it will become tracked.
+	 */
+	{ "linear_cnt",				KSTAT_DATA_UINT64 },
+	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
+	{ "linear_data_size",			KSTAT_DATA_UINT64 },
+	/*
+	 * The number of scatter ABDs which are currently allocated, excluding
+	 * ABDs which don't own their data (for instance the ones which were
+	 * allocated through abd_get_offset()).
+	 */
+	{ "scatter_cnt",			KSTAT_DATA_UINT64 },
+	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
+	{ "scatter_data_size",			KSTAT_DATA_UINT64 },
+	/*
+	 * The amount of space wasted at the end of the last chunk across all
+	 * scatter ABDs tracked by scatter_cnt.
+	 */
+	{ "scatter_chunk_waste",		KSTAT_DATA_UINT64 },
+	/*
+	 * The number of compound allocations of a given order. These
+	 * allocations are spread over all currently allocated ABDs, and
+	 * act as a measure of memory fragmentation.
+	 */
+	{ { "scatter_order_N",			KSTAT_DATA_UINT64 } },
+	/*
+	 * The number of scatter ABDs which contain multiple chunks.
+	 * ABDs are preferentially allocated from the minimum number of
+	 * contiguous multi-page chunks; a single chunk is optimal.
+	 */
+	{ "scatter_page_multi_chunk",		KSTAT_DATA_UINT64 },
+	/*
+	 * The number of scatter ABDs which are split across memory zones.
+	 * ABDs are preferentially allocated using pages from a single zone.
+	 */
+	{ "scatter_page_multi_zone",		KSTAT_DATA_UINT64 },
+	/*
+	 * The total number of retries encountered when attempting to
+	 * allocate the pages to populate the scatter ABD.
+	 */
+	{ "scatter_page_alloc_retry",		KSTAT_DATA_UINT64 },
+	/*
+	 * The total number of retries encountered when attempting to
+	 * allocate the sg table for an ABD.
+	 */
+	{ "scatter_sg_table_retry",		KSTAT_DATA_UINT64 },
+};
+
+#define	abd_for_each_sg(abd, sg, n, i)	\
+	for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)
+
+unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1;
+
+/*
+ * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
+ * ABD's. Smaller allocations will use linear ABD's which use
+ * zio_[data_]buf_alloc().
+ *
+ * Scatter ABD's use at least one page each, so sub-page allocations waste
+ * some space when allocated as scatter (e.g. 2KB scatter allocation wastes
+ * half of each page). Using linear ABD's for small allocations means that
+ * they will be put on slabs which contain many allocations. This can
+ * improve memory efficiency, but it also makes it much harder for ARC
+ * evictions to actually free pages, because all the buffers on one slab need
+ * to be freed in order for the slab (and underlying pages) to be freed.
+ * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
+ * possible for them to actually waste more memory than scatter (one page per
+ * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
+ *
+ * Spill blocks are typically 512B and are heavily used on systems running
+ * selinux with the default dnode size and the `xattr=sa` property set.
+ *
+ * By default we use linear allocations for 512B and 1KB, and scatter
+ * allocations for larger (1.5KB and up).
+ */
+int zfs_abd_scatter_min_size = 512 * 3;
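The 3/4, 7/8th and 15/16th figures in the comment above are easy to
sanity-check. A standalone sketch, illustrative only, assuming 4 KiB pages
and the 16-buffer slabs mentioned above:

#include <stdio.h>

int
main(void)
{
	unsigned page = 4096;
	unsigned sizes[] = { 512, 1024, 1536 };

	for (int i = 0; i < 3; i++) {
		unsigned sz = sizes[i];
		/* Scatter: one whole page per buffer, the tail is unused. */
		double scatter_waste = (double)(page - sz) / page;
		/* Linear: worst case, one live buf pins a 16-buf slab. */
		double slab_waste = 15.0 / 16;
		printf("%4uB buf: scatter wastes %.1f%%, "
		    "slab worst case %.1f%%\n",
		    sz, scatter_waste * 100, slab_waste * 100);
	}
	return (0);
}

For 512B the scatter waste is 87.5% (7/8), for 1024B it is 75% (3/4), and by
1536B (the default zfs_abd_scatter_min_size) scatter becomes the better
tradeoff.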
+
+static kmem_cache_t *abd_cache = NULL;
+static kstat_t *abd_ksp;
+
+static size_t
+abd_chunkcnt_for_bytes(size_t size)
+{
+	return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
+}
+
+abd_t *
+abd_alloc_struct(size_t size)
+{
+	/*
+	 * In Linux we do not use the size passed in during ABD
+	 * allocation, so we just ignore it.
+	 */
+	abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
+	ASSERT3P(abd, !=, NULL);
+	ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t));
+
+	return (abd);
+}
+
+void
+abd_free_struct(abd_t *abd)
+{
+	kmem_cache_free(abd_cache, abd);
+	ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
+}
+
+#ifdef _KERNEL
+/*
+ * Mark zfs data pages so they can be excluded from kernel crash dumps.
+ */
+#ifdef _LP64
+#define	ABD_FILE_CACHE_PAGE	0x2F5ABDF11ECAC4E
+
+static inline void
+abd_mark_zfs_page(struct page *page)
+{
+	get_page(page);
+	SetPagePrivate(page);
+	set_page_private(page, ABD_FILE_CACHE_PAGE);
+}
+
+static inline void
+abd_unmark_zfs_page(struct page *page)
+{
+	set_page_private(page, 0UL);
+	ClearPagePrivate(page);
+	put_page(page);
+}
+#else
+#define	abd_mark_zfs_page(page)
+#define	abd_unmark_zfs_page(page)
+#endif /* _LP64 */
+
+#ifndef CONFIG_HIGHMEM
+
+#ifndef __GFP_RECLAIM
+#define	__GFP_RECLAIM		__GFP_WAIT
+#endif
+
+/*
+ * The goal is to minimize fragmentation by preferentially populating ABDs
+ * with higher order compound pages from a single zone. Allocation size is
+ * progressively decreased until it can be satisfied without performing
+ * reclaim or compaction. When necessary this function will degenerate to
+ * allocating individual pages and allowing reclaim to satisfy allocations.
+ */
+void
+abd_alloc_chunks(abd_t *abd, size_t size)
+{
+	struct list_head pages;
+	struct sg_table table;
+	struct scatterlist *sg;
+	struct page *page, *tmp_page = NULL;
+	gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
+	gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
+	int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1);
+	int nr_pages = abd_chunkcnt_for_bytes(size);
+	int chunks = 0, zones = 0;
+	size_t remaining_size;
+	int nid = NUMA_NO_NODE;
+	int alloc_pages = 0;
+
+	INIT_LIST_HEAD(&pages);
+
+	while (alloc_pages < nr_pages) {
+		unsigned chunk_pages;
+		int order;
+
+		order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
+		chunk_pages = (1U << order);
+
+		page = alloc_pages_node(nid, order ?
gfp_comp : gfp, order); + if (page == NULL) { + if (order == 0) { + ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); + schedule_timeout_interruptible(1); + } else { + max_order = MAX(0, order - 1); + } + continue; + } + + list_add_tail(&page->lru, &pages); + + if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid)) + zones++; + + nid = page_to_nid(page); + ABDSTAT_BUMP(abdstat_scatter_orders[order]); + chunks++; + alloc_pages += chunk_pages; + } + + ASSERT3S(alloc_pages, ==, nr_pages); + + while (sg_alloc_table(&table, chunks, gfp)) { + ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); + schedule_timeout_interruptible(1); + } + + sg = table.sgl; + remaining_size = size; + list_for_each_entry_safe(page, tmp_page, &pages, lru) { + size_t sg_size = MIN(PAGESIZE << compound_order(page), + remaining_size); + sg_set_page(sg, page, sg_size, 0); + abd_mark_zfs_page(page); + remaining_size -= sg_size; + + sg = sg_next(sg); + list_del(&page->lru); + } + + /* + * These conditions ensure that a possible transformation to a linear + * ABD would be valid. + */ + ASSERT(!PageHighMem(sg_page(table.sgl))); + ASSERT0(ABD_SCATTER(abd).abd_offset); + + if (table.nents == 1) { + /* + * Since there is only one entry, this ABD can be represented + * as a linear buffer. All single-page (4K) ABD's can be + * represented this way. Some multi-page ABD's can also be + * represented this way, if we were able to allocate a single + * "chunk" (higher-order "page" which represents a power-of-2 + * series of physically-contiguous pages). This is often the + * case for 2-page (8K) ABD's. + * + * Representing a single-entry scatter ABD as a linear ABD + * has the performance advantage of avoiding the copy (and + * allocation) in abd_borrow_buf_copy / abd_return_buf_copy. + * A performance increase of around 5% has been observed for + * ARC-cached reads (of small blocks which can take advantage + * of this). + * + * Note that this optimization is only possible because the + * pages are always mapped into the kernel's address space. + * This is not the case for highmem pages, so the + * optimization can not be made there. + */ + abd->abd_flags |= ABD_FLAG_LINEAR; + abd->abd_flags |= ABD_FLAG_LINEAR_PAGE; + abd->abd_u.abd_linear.abd_sgl = table.sgl; + ABD_LINEAR_BUF(abd) = page_address(sg_page(table.sgl)); + } else if (table.nents > 1) { + ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); + abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; + + if (zones) { + ABDSTAT_BUMP(abdstat_scatter_page_multi_zone); + abd->abd_flags |= ABD_FLAG_MULTI_ZONE; + } + + ABD_SCATTER(abd).abd_sgl = table.sgl; + ABD_SCATTER(abd).abd_nents = table.nents; + } +} +#else + +/* + * Allocate N individual pages to construct a scatter ABD. This function + * makes no attempt to request contiguous pages and requires the minimal + * number of kernel interfaces. It's designed for maximum compatibility. 
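+ *
+ * As an illustration (assuming 4K pages; not taken from real output):
+ * a 16K request becomes nr_pages = 4 and is satisfied here by four
+ * independent order-0 page allocations, whereas the !CONFIG_HIGHMEM
+ * path above would first attempt a single order-2 compound page and
+ * only fall back to smaller orders if that allocation failed.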
+ */ +void +abd_alloc_chunks(abd_t *abd, size_t size) +{ + struct scatterlist *sg = NULL; + struct sg_table table; + struct page *page; + gfp_t gfp = __GFP_NOWARN | GFP_NOIO; + int nr_pages = abd_chunkcnt_for_bytes(size); + int i = 0; + + while (sg_alloc_table(&table, nr_pages, gfp)) { + ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); + schedule_timeout_interruptible(1); + } + + ASSERT3U(table.nents, ==, nr_pages); + ABD_SCATTER(abd).abd_sgl = table.sgl; + ABD_SCATTER(abd).abd_nents = nr_pages; + + abd_for_each_sg(abd, sg, nr_pages, i) { + while ((page = __page_cache_alloc(gfp)) == NULL) { + ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); + schedule_timeout_interruptible(1); + } + + ABDSTAT_BUMP(abdstat_scatter_orders[0]); + sg_set_page(sg, page, PAGESIZE, 0); + abd_mark_zfs_page(page); + } + + if (nr_pages > 1) { + ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); + abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; + } +} +#endif /* !CONFIG_HIGHMEM */ + +/* + * This must be called if any of the sg_table allocation functions + * are called. + */ +static void +abd_free_sg_table(abd_t *abd) +{ + struct sg_table table; + + table.sgl = ABD_SCATTER(abd).abd_sgl; + table.nents = table.orig_nents = ABD_SCATTER(abd).abd_nents; + sg_free_table(&table); +} + +void +abd_free_chunks(abd_t *abd) +{ + struct scatterlist *sg = NULL; + struct page *page; + int nr_pages = ABD_SCATTER(abd).abd_nents; + int order, i = 0; + + if (abd->abd_flags & ABD_FLAG_MULTI_ZONE) + ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone); + + if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK) + ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); + + abd_for_each_sg(abd, sg, nr_pages, i) { + page = sg_page(sg); + abd_unmark_zfs_page(page); + order = compound_order(page); + __free_pages(page, order); + ASSERT3U(sg->length, <=, PAGE_SIZE << order); + ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]); + } + abd_free_sg_table(abd); +} + +#else /* _KERNEL */ + +#ifndef PAGE_SHIFT +#define PAGE_SHIFT (highbit64(PAGESIZE)-1) +#endif + +struct page; + +#define zfs_kmap_atomic(chunk, km) ((void *)chunk) +#define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0) +#define local_irq_save(flags) do { (void)(flags); } while (0) +#define local_irq_restore(flags) do { (void)(flags); } while (0) +#define nth_page(pg, i) \ + ((struct page *)((void *)(pg) + (i) * PAGESIZE)) + +struct scatterlist { + struct page *page; + int length; + int end; +}; + +static void +sg_init_table(struct scatterlist *sg, int nr) +{ + memset(sg, 0, nr * sizeof (struct scatterlist)); + sg[nr - 1].end = 1; +} + +/* + * This must be called if any of the sg_table allocation functions + * are called. 
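+ * (In this userspace emulation that allocation is the vmem_alloc()
+ * of the scatterlist array in abd_alloc_chunks() below, so the two
+ * must stay paired.)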
+ */ +static void +abd_free_sg_table(abd_t *abd) +{ + int nents = ABD_SCATTER(abd).abd_nents; + vmem_free(ABD_SCATTER(abd).abd_sgl, + nents * sizeof (struct scatterlist)); +} + +#define for_each_sg(sgl, sg, nr, i) \ + for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg)) + +static inline void +sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, + unsigned int offset) +{ + /* currently we don't use offset */ + ASSERT(offset == 0); + sg->page = page; + sg->length = len; +} + +static inline struct page * +sg_page(struct scatterlist *sg) +{ + return (sg->page); +} + +static inline struct scatterlist * +sg_next(struct scatterlist *sg) +{ + if (sg->end) + return (NULL); + + return (sg + 1); +} + +void +abd_alloc_chunks(abd_t *abd, size_t size) +{ + unsigned nr_pages = abd_chunkcnt_for_bytes(size); + struct scatterlist *sg; + int i; + + ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages * + sizeof (struct scatterlist), KM_SLEEP); + sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages); + + abd_for_each_sg(abd, sg, nr_pages, i) { + struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP); + sg_set_page(sg, p, PAGESIZE, 0); + } + ABD_SCATTER(abd).abd_nents = nr_pages; +} + +void +abd_free_chunks(abd_t *abd) +{ + int i, n = ABD_SCATTER(abd).abd_nents; + struct scatterlist *sg; + + abd_for_each_sg(abd, sg, n, i) { + for (int j = 0; j < sg->length; j += PAGESIZE) { + struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT); + umem_free(p, PAGESIZE); + } + } + abd_free_sg_table(abd); +} + +#endif /* _KERNEL */ + +boolean_t +abd_size_alloc_linear(size_t size) +{ + return (size < zfs_abd_scatter_min_size ? B_TRUE : B_FALSE); +} + +void +abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) +{ + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); + if (op == ABDSTAT_INCR) { + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size); + } else { + ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, + (int)abd->abd_size + -(int)P2ROUNDUP(abd->abd_size, PAGESIZE)); + } +} + +void +abd_update_linear_stats(abd_t *abd, abd_stats_op_t op) +{ + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); + if (op == ABDSTAT_INCR) { + ABDSTAT_BUMP(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); + } else { + ABDSTAT_BUMPDOWN(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); + } +} + +void +abd_verify_scatter(abd_t *abd) +{ + size_t n; + int i = 0; + struct scatterlist *sg = NULL; + + ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0); + ASSERT3U(ABD_SCATTER(abd).abd_offset, <, + ABD_SCATTER(abd).abd_sgl->length); + n = ABD_SCATTER(abd).abd_nents; + abd_for_each_sg(abd, sg, n, i) { + ASSERT3P(sg_page(sg), !=, NULL); + } +} + +void +abd_init(void) +{ + int i; + + abd_cache = kmem_cache_create("abd_t", sizeof (abd_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + + abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, + sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (abd_ksp != NULL) { + for (i = 0; i < MAX_ORDER; i++) { + snprintf(abd_stats.abdstat_scatter_orders[i].name, + KSTAT_STRLEN, "scatter_order_%d", i); + abd_stats.abdstat_scatter_orders[i].data_type = + KSTAT_DATA_UINT64; + } + abd_ksp->ks_data = &abd_stats; + kstat_install(abd_ksp); + } +} + +void +abd_fini(void) +{ + if 
(abd_ksp != NULL) { + kstat_delete(abd_ksp); + abd_ksp = NULL; + } + + if (abd_cache) { + kmem_cache_destroy(abd_cache); + abd_cache = NULL; + } +} + +void +abd_free_linear_page(abd_t *abd) +{ + /* Transform it back into a scatter ABD for freeing */ + struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl; + abd->abd_flags &= ~ABD_FLAG_LINEAR; + abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE; + ABD_SCATTER(abd).abd_nents = 1; + ABD_SCATTER(abd).abd_offset = 0; + ABD_SCATTER(abd).abd_sgl = sg; + abd_free_chunks(abd); + + zfs_refcount_destroy(&abd->abd_children); + abd_update_scatter_stats(abd, ABDSTAT_DECR); + abd_free_struct(abd); +} + +/* + * If we're going to use this ABD for doing I/O using the block layer, the + * consumer of the ABD data doesn't care if it's scattered or not, and we don't + * plan to store this ABD in memory for a long period of time, we should + * allocate the ABD type that requires the least data copying to do the I/O. + * + * On Linux the optimal thing to do would be to use abd_get_offset() and + * construct a new ABD which shares the original pages thereby eliminating + * the copy. But for the moment a new linear ABD is allocated until this + * performance optimization can be implemented. + */ +abd_t * +abd_alloc_for_io(size_t size, boolean_t is_metadata) +{ + return (abd_alloc(size, is_metadata)); +} + +abd_t * +abd_get_offset_scatter(abd_t *sabd, size_t off) +{ + abd_t *abd = NULL; + int i = 0; + struct scatterlist *sg = NULL; + + abd_verify(sabd); + ASSERT3U(off, <=, sabd->abd_size); + + size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; + + abd = abd_alloc_struct(0); + + /* + * Even if this buf is filesystem metadata, we only track that + * if we own the underlying data buffer, which is not true in + * this case. Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = 0; + + abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) { + if (new_offset < sg->length) + break; + new_offset -= sg->length; + } + + ABD_SCATTER(abd).abd_sgl = sg; + ABD_SCATTER(abd).abd_offset = new_offset; + ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i; + + return (abd); +} + +/* + * Initialize the abd_iter. + */ +void +abd_iter_init(struct abd_iter *aiter, abd_t *abd) +{ + abd_verify(abd); + aiter->iter_abd = abd; + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; + aiter->iter_pos = 0; + if (abd_is_linear(abd)) { + aiter->iter_offset = 0; + aiter->iter_sg = NULL; + } else { + aiter->iter_offset = ABD_SCATTER(abd).abd_offset; + aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; + } +} + +/* + * This is just a helper function to see if we have exhausted the + * abd_iter and reached the end. + */ +boolean_t +abd_iter_at_end(struct abd_iter *aiter) +{ + return (aiter->iter_pos == aiter->iter_abd->abd_size); +} + +/* + * Advance the iterator by a certain amount. Cannot be called when a chunk is + * in use. This can be safely called when the aiter has already exhausted, in + * which case this does nothing. 
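+ *
+ * A sketch of the expected calling pattern (roughly what the common
+ * abd_iterate_func() code does; len is the number of bytes of the
+ * mapped chunk actually consumed):
+ *
+ *   abd_iter_init(&aiter, abd);
+ *   while (!abd_iter_at_end(&aiter)) {
+ *           abd_iter_map(&aiter);
+ *           len = MIN(aiter.iter_mapsize, remaining);
+ *           ... operate on len bytes at aiter.iter_mapaddr ...
+ *           abd_iter_unmap(&aiter);
+ *           abd_iter_advance(&aiter, len);
+ *   }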
+ */ +void +abd_iter_advance(struct abd_iter *aiter, size_t amount) +{ + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* There's nothing left to advance to, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + aiter->iter_pos += amount; + aiter->iter_offset += amount; + if (!abd_is_linear(aiter->iter_abd)) { + while (aiter->iter_offset >= aiter->iter_sg->length) { + aiter->iter_offset -= aiter->iter_sg->length; + aiter->iter_sg = sg_next(aiter->iter_sg); + if (aiter->iter_sg == NULL) { + ASSERT0(aiter->iter_offset); + break; + } + } + } +} + +/* + * Map the current chunk into aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. + */ +void +abd_iter_map(struct abd_iter *aiter) +{ + void *paddr; + size_t offset = 0; + + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* There's nothing left to iterate over, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + if (abd_is_linear(aiter->iter_abd)) { + ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); + offset = aiter->iter_offset; + aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; + paddr = ABD_LINEAR_BUF(aiter->iter_abd); + } else { + offset = aiter->iter_offset; + aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset, + aiter->iter_abd->abd_size - aiter->iter_pos); + + paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg), + km_table[aiter->iter_km]); + } + + aiter->iter_mapaddr = (char *)paddr + offset; +} + +/* + * Unmap the current chunk from aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. + */ +void +abd_iter_unmap(struct abd_iter *aiter) +{ + /* There's nothing left to unmap, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + if (!abd_is_linear(aiter->iter_abd)) { + /* LINTED E_FUNC_SET_NOT_USED */ + zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset, + km_table[aiter->iter_km]); + } + + ASSERT3P(aiter->iter_mapaddr, !=, NULL); + ASSERT3U(aiter->iter_mapsize, >, 0); + + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +void +abd_enter_critical(unsigned long flags) +{ + local_irq_save(flags); +} + +void +abd_exit_critical(unsigned long flags) +{ + local_irq_restore(flags); +} + +#if defined(_KERNEL) +/* + * bio_nr_pages for ABD. + * @off is the offset in @abd + */ +unsigned long +abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off) +{ + unsigned long pos; + + if (abd_is_linear(abd)) + pos = (unsigned long)abd_to_buf(abd) + off; + else + pos = ABD_SCATTER(abd).abd_offset + off; + + return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) - + (pos >> PAGE_SHIFT); +} + +/* + * bio_map for scatter ABD. 
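+ * (Illustrative sizing, assuming 4K pages: for a scatter ABD with
+ * abd_offset 512, an 8K I/O touches three pages, matching what
+ * abd_nr_pages_off() above returns:
+ * ((512 + 8192 + 4095) >> 12) - (512 >> 12) == 3.)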
+ * @off is the offset in @abd + * Remaining IO size is returned + */ +unsigned int +abd_scatter_bio_map_off(struct bio *bio, abd_t *abd, + unsigned int io_size, size_t off) +{ + int i; + struct abd_iter aiter; + + ASSERT(!abd_is_linear(abd)); + ASSERT3U(io_size, <=, abd->abd_size - off); + + abd_iter_init(&aiter, abd); + abd_iter_advance(&aiter, off); + + for (i = 0; i < bio->bi_max_vecs; i++) { + struct page *pg; + size_t len, sgoff, pgoff; + struct scatterlist *sg; + + if (io_size <= 0) + break; + + sg = aiter.iter_sg; + sgoff = aiter.iter_offset; + pgoff = sgoff & (PAGESIZE - 1); + len = MIN(io_size, PAGESIZE - pgoff); + ASSERT(len > 0); + + pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT); + if (bio_add_page(bio, pg, len, pgoff) != len) + break; + + io_size -= len; + abd_iter_advance(&aiter, len); + } + + return (io_size); +} + +/* Tunable Parameters */ +module_param(zfs_abd_scatter_enabled, int, 0644); +MODULE_PARM_DESC(zfs_abd_scatter_enabled, + "Toggle whether ABD allocations must be linear."); +module_param(zfs_abd_scatter_min_size, int, 0644); +MODULE_PARM_DESC(zfs_abd_scatter_min_size, + "Minimum size of scatter allocations."); +/* CSTYLED */ +module_param(zfs_abd_scatter_max_order, uint, 0644); +MODULE_PARM_DESC(zfs_abd_scatter_max_order, + "Maximum order allocation used for a scatter ABD."); +#endif diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in index 6737336caef9..3a9663997033 100644 --- a/module/zfs/Makefile.in +++ b/module/zfs/Makefile.in @@ -14,6 +14,7 @@ ccflags-y += $(NO_UNUSED_BUT_SET_VARIABLE) # Suppress unused-value warnings in sparc64 architecture headers ccflags-$(CONFIG_SPARC64) += -Wno-unused-value +$(MODULE)-objs += abd.o $(MODULE)-objs += aggsum.o $(MODULE)-objs += arc.o $(MODULE)-objs += blkptr.o diff --git a/module/os/freebsd/zfs/abd.c b/module/zfs/abd.c similarity index 61% rename from module/os/freebsd/zfs/abd.c rename to module/zfs/abd.c index 888a113a4291..2e4554da7a62 100644 --- a/module/os/freebsd/zfs/abd.c +++ b/module/zfs/abd.c @@ -1,17 +1,26 @@ /* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. + * CDDL HEADER START * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END */ - /* * Copyright (c) 2014 by Chunwei Chen. All rights reserved. - * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright (c) 2019 by Delphix. All rights reserved. 
*/ /* @@ -50,11 +59,6 @@ * +----------------->| chunk N-1 | * +-----------+ * - * Using a large proportion of scattered ABDs decreases ARC fragmentation since - * when we are at the limit of allocatable space, using equal-size chunks will - * allow us to quickly reclaim enough space for a new large allocation (assuming - * it is also scattered). - * * In addition to directly allocating a linear or scattered ABD, it is also * possible to create an ABD by requesting the "sub-ABD" starting at an offset * within an existing ABD. In linear buffers this is simple (set abd_buf of @@ -83,186 +87,55 @@ * compare, copy, read, write, and fill with zeroes. If you need a custom * function which progressively accesses the whole ABD, use the abd_iterate_* * functions. + * + * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to + * B_FALSE. */ -#include +#include #include #include #include #include -typedef struct abd_stats { - kstat_named_t abdstat_struct_size; - kstat_named_t abdstat_scatter_cnt; - kstat_named_t abdstat_scatter_data_size; - kstat_named_t abdstat_scatter_chunk_waste; - kstat_named_t abdstat_linear_cnt; - kstat_named_t abdstat_linear_data_size; -} abd_stats_t; - -static abd_stats_t abd_stats = { - /* Amount of memory occupied by all of the abd_t struct allocations */ - { "struct_size", KSTAT_DATA_UINT64 }, - /* - * The number of scatter ABDs which are currently allocated, excluding - * ABDs which don't own their data (for instance the ones which were - * allocated through abd_get_offset()). - */ - { "scatter_cnt", KSTAT_DATA_UINT64 }, - /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ - { "scatter_data_size", KSTAT_DATA_UINT64 }, - /* - * The amount of space wasted at the end of the last chunk across all - * scatter ABDs tracked by scatter_cnt. - */ - { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, - /* - * The number of linear ABDs which are currently allocated, excluding - * ABDs which don't own their data (for instance the ones which were - * allocated through abd_get_offset() and abd_get_from_buf()). If an - * ABD takes ownership of its buf then it will become tracked. - */ - { "linear_cnt", KSTAT_DATA_UINT64 }, - /* Amount of data stored in all linear ABDs tracked by linear_cnt */ - { "linear_data_size", KSTAT_DATA_UINT64 }, -}; - -#define ABDSTAT(stat) (abd_stats.stat.value.ui64) -#define ABDSTAT_INCR(stat, val) \ - atomic_add_64(&abd_stats.stat.value.ui64, (val)) -#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) -#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) - -/* - * It is possible to make all future ABDs be linear by setting this to B_FALSE. - * Otherwise, ABDs are allocated scattered by default unless the caller uses - * abd_alloc_linear(). - */ -boolean_t zfs_abd_scatter_enabled = B_TRUE; - -/* - * The size of the chunks ABD allocates. Because the sizes allocated from the - * kmem_cache can't change, this tunable can only be modified at boot. Changing - * it at runtime would cause ABD iteration to work incorrectly for ABDs which - * were allocated with the old size, so a safeguard has been put in place which - * will cause the machine to panic if you change it and try to access the data - * within a scattered ABD. 
- */ -size_t zfs_abd_chunk_size = 4096; - -#if defined(_KERNEL) -SYSCTL_DECL(_vfs_zfs); - -SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN, - &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers"); -SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN, - &zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates"); -#endif - -kmem_cache_t *abd_chunk_cache; -static kstat_t *abd_ksp; - -extern inline boolean_t abd_is_linear(abd_t *abd); -extern inline void abd_copy(abd_t *dabd, abd_t *sabd, size_t size); -extern inline void abd_copy_from_buf(abd_t *abd, const void *buf, size_t size); -extern inline void abd_copy_to_buf(void* buf, abd_t *abd, size_t size); -extern inline int abd_cmp_buf(abd_t *abd, const void *buf, size_t size); -extern inline void abd_zero(abd_t *abd, size_t size); +/* see block comment above for description */ +int zfs_abd_scatter_enabled = B_TRUE; -static void * -abd_alloc_chunk() +boolean_t +abd_is_linear(abd_t *abd) { - void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); - ASSERT3P(c, !=, NULL); - return (c); + return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0 ? B_TRUE : B_FALSE); } -static void -abd_free_chunk(void *c) +boolean_t +abd_is_linear_page(abd_t *abd) { - kmem_cache_free(abd_chunk_cache, c); + return ((abd->abd_flags & ABD_FLAG_LINEAR_PAGE) != 0 ? + B_TRUE : B_FALSE); } void -abd_init(void) -{ - abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0, - NULL, NULL, NULL, NULL, 0, KMC_NOTOUCH | KMC_NODEBUG); - - abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, - sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); - if (abd_ksp != NULL) { - abd_ksp->ks_data = &abd_stats; - kstat_install(abd_ksp); - } -} - -void -abd_fini(void) -{ - if (abd_ksp != NULL) { - kstat_delete(abd_ksp); - abd_ksp = NULL; - } - - kmem_cache_destroy(abd_chunk_cache); - abd_chunk_cache = NULL; -} - -static inline size_t -abd_chunkcnt_for_bytes(size_t size) -{ - return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size); -} - -static inline size_t -abd_scatter_chunkcnt(abd_t *abd) -{ - ASSERT(!abd_is_linear(abd)); - return (abd_chunkcnt_for_bytes( - abd->abd_u.abd_scatter.abd_offset + abd->abd_size)); -} - -static inline void abd_verify(abd_t *abd) { ASSERT3U(abd->abd_size, >, 0); ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | - ABD_FLAG_OWNER | ABD_FLAG_META)); + ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | + ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE)); IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); if (abd_is_linear(abd)) { - ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL); + ASSERT3P(ABD_LINEAR_BUF(abd), !=, NULL); } else { - ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <, - zfs_abd_chunk_size); - size_t n = abd_scatter_chunkcnt(abd); - for (int i = 0; i < n; i++) { - ASSERT3P( - abd->abd_u.abd_scatter.abd_chunks[i], !=, NULL); - } + abd_verify_scatter(abd); } } -static inline abd_t * -abd_alloc_struct(size_t chunkcnt) -{ - size_t size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); - abd_t *abd = kmem_alloc(size, KM_PUSHPAGE); - ASSERT3P(abd, !=, NULL); - ABDSTAT_INCR(abdstat_struct_size, size); - - return (abd); -} - -static inline void -abd_free_struct(abd_t *abd) +uint_t +abd_get_size(abd_t *abd) { - size_t chunkcnt = abd_is_linear(abd) ? 
0 : abd_scatter_chunkcnt(abd); - int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]); - kmem_free(abd, size); - ABDSTAT_INCR(abdstat_struct_size, -size); + abd_verify(abd); + return (abd->abd_size); } /* @@ -272,15 +145,16 @@ abd_free_struct(abd_t *abd) abd_t * abd_alloc(size_t size, boolean_t is_metadata) { - if (!zfs_abd_scatter_enabled || size <= zfs_abd_chunk_size) + if (!zfs_abd_scatter_enabled || abd_size_alloc_linear(size)) return (abd_alloc_linear(size, is_metadata)); VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); - size_t n = abd_chunkcnt_for_bytes(size); - abd_t *abd = abd_alloc_struct(n); - + abd_t *abd = abd_alloc_struct(size); abd->abd_flags = ABD_FLAG_OWNER; + abd->abd_u.abd_scatter.abd_offset = 0; + abd_alloc_chunks(abd, size); + if (is_metadata) { abd->abd_flags |= ABD_FLAG_META; } @@ -288,19 +162,7 @@ abd_alloc(size_t size, boolean_t is_metadata) abd->abd_parent = NULL; zfs_refcount_create(&abd->abd_children); - abd->abd_u.abd_scatter.abd_offset = 0; - abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size; - - for (int i = 0; i < n; i++) { - void *c = abd_alloc_chunk(); - ASSERT3P(c, !=, NULL); - abd->abd_u.abd_scatter.abd_chunks[i] = c; - } - - ABDSTAT_BUMP(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, size); - ABDSTAT_INCR(abdstat_scatter_chunk_waste, - n * zfs_abd_chunk_size - size); + abd_update_scatter_stats(abd, ABDSTAT_INCR); return (abd); } @@ -308,17 +170,32 @@ abd_alloc(size_t size, boolean_t is_metadata) static void abd_free_scatter(abd_t *abd) { - size_t n = abd_scatter_chunkcnt(abd); - for (int i = 0; i < n; i++) { - abd_free_chunk(abd->abd_u.abd_scatter.abd_chunks[i]); - } + abd_free_chunks(abd); zfs_refcount_destroy(&abd->abd_children); - ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); - ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); - ABDSTAT_INCR(abdstat_scatter_chunk_waste, - abd->abd_size - n * zfs_abd_chunk_size); + abd_update_scatter_stats(abd, ABDSTAT_DECR); + abd_free_struct(abd); +} +/* + * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not + * free the underlying scatterlist or buffer. 
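+ * The parent ABD must remain valid for as long as any ABD derived
+ * from it is alive, since the derived ABD borrows the parent's
+ * pages or buffer.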
+ */ +void +abd_put(abd_t *abd) +{ + if (abd == NULL) + return; + + abd_verify(abd); + ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); + + if (abd->abd_parent != NULL) { + (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children, + abd->abd_size, abd); + } + + zfs_refcount_destroy(&abd->abd_children); abd_free_struct(abd); } @@ -343,13 +220,12 @@ abd_alloc_linear(size_t size, boolean_t is_metadata) zfs_refcount_create(&abd->abd_children); if (is_metadata) { - abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size); + ABD_LINEAR_BUF(abd) = zio_buf_alloc(size); } else { - abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size); + ABD_LINEAR_BUF(abd) = zio_data_buf_alloc(size); } - ABDSTAT_BUMP(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, size); + abd_update_linear_stats(abd, ABDSTAT_INCR); return (abd); } @@ -357,15 +233,18 @@ abd_alloc_linear(size_t size, boolean_t is_metadata) static void abd_free_linear(abd_t *abd) { + if (abd_is_linear_page(abd)) { + abd_free_linear_page(abd); + return; + } if (abd->abd_flags & ABD_FLAG_META) { - zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); + zio_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size); } else { - zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size); + zio_data_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size); } zfs_refcount_destroy(&abd->abd_children); - ABDSTAT_BUMPDOWN(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); + abd_update_linear_stats(abd, ABDSTAT_DECR); abd_free_struct(abd); } @@ -397,39 +276,23 @@ abd_t * abd_alloc_sametype(abd_t *sabd, size_t size) { boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0; - if (abd_is_linear(sabd)) { + if (abd_is_linear(sabd) && + !abd_is_linear_page(sabd)) { return (abd_alloc_linear(size, is_metadata)); } else { return (abd_alloc(size, is_metadata)); } } -/* - * If we're going to use this ABD for doing I/O using the block layer, the - * consumer of the ABD data doesn't care if it's scattered or not, and we don't - * plan to store this ABD in memory for a long period of time, we should - * allocate the ABD type that requires the least data copying to do the I/O. - * - * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os - * using a scatter/gather list we should switch to that and replace this call - * with vanilla abd_alloc(). - */ -abd_t * -abd_alloc_for_io(size_t size, boolean_t is_metadata) -{ - return (abd_alloc_linear(size, is_metadata)); -} - /* * Allocate a new ABD to point to offset off of sabd. It shares the underlying * buffer data with sabd. Use abd_put() to free. sabd must not be freed while * any derived ABDs exist. */ -/* ARGSUSED */ -static inline abd_t * +static abd_t * abd_get_offset_impl(abd_t *sabd, size_t off, size_t size) { - abd_t *abd; + abd_t *abd = NULL; abd_verify(sabd); ASSERT3U(off, <=, sabd->abd_size); @@ -444,60 +307,33 @@ abd_get_offset_impl(abd_t *sabd, size_t off, size_t size) */ abd->abd_flags = ABD_FLAG_LINEAR; - abd->abd_u.abd_linear.abd_buf = - (char *)sabd->abd_u.abd_linear.abd_buf + off; + ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off; } else { - size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off; - size_t chunkcnt = abd_scatter_chunkcnt(sabd) - - (new_offset / zfs_abd_chunk_size); - - abd = abd_alloc_struct(chunkcnt); - - /* - * Even if this buf is filesystem metadata, we only track that - * if we own the underlying data buffer, which is not true in - * this case. Therefore, we don't ever use ABD_FLAG_META here. 
- */ - abd->abd_flags = 0; - - abd->abd_u.abd_scatter.abd_offset = - new_offset % zfs_abd_chunk_size; - abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size; - - /* Copy the scatterlist starting at the correct offset */ - (void) memcpy(&abd->abd_u.abd_scatter.abd_chunks, - &sabd->abd_u.abd_scatter.abd_chunks[new_offset / - zfs_abd_chunk_size], - chunkcnt * sizeof (void *)); + abd = abd_get_offset_scatter(sabd, off); } - if (size == 0) - abd->abd_size = sabd->abd_size - off; - else - abd->abd_size = size; + abd->abd_size = size; abd->abd_parent = sabd; zfs_refcount_create(&abd->abd_children); (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd); - return (abd); } abd_t * abd_get_offset(abd_t *sabd, size_t off) { - - return (abd_get_offset_impl(sabd, off, 0)); + size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0; + VERIFY3U(size, >, 0); + return (abd_get_offset_impl(sabd, off, size)); } abd_t * abd_get_offset_size(abd_t *sabd, size_t off, size_t size) { ASSERT3U(off + size, <=, sabd->abd_size); - return (abd_get_offset_impl(sabd, off, size)); } - /* * Allocate a linear ABD structure for buf. You must free this with abd_put() * since the resulting ABD doesn't own its own buffer. @@ -519,32 +355,11 @@ abd_get_from_buf(void *buf, size_t size) abd->abd_parent = NULL; zfs_refcount_create(&abd->abd_children); - abd->abd_u.abd_linear.abd_buf = buf; + ABD_LINEAR_BUF(abd) = buf; return (abd); } -/* - * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not - * free the underlying scatterlist or buffer. - */ -void -abd_put(abd_t *abd) -{ - if (abd == NULL) - return; - abd_verify(abd); - ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); - - if (abd->abd_parent != NULL) { - (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children, - abd->abd_size, abd); - } - - zfs_refcount_destroy(&abd->abd_children); - abd_free_struct(abd); -} - /* * Get the raw buffer associated with a linear ABD. */ @@ -553,7 +368,7 @@ abd_to_buf(abd_t *abd) { ASSERT(abd_is_linear(abd)); abd_verify(abd); - return (abd->abd_u.abd_linear.abd_buf); + return (ABD_LINEAR_BUF(abd)); } /* @@ -574,7 +389,6 @@ abd_borrow_buf(abd_t *abd, size_t n) buf = zio_buf_alloc(n); } (void) zfs_refcount_add_many(&abd->abd_children, n, buf); - return (buf); } @@ -617,148 +431,50 @@ abd_return_buf_copy(abd_t *abd, void *buf, size_t n) abd_return_buf(abd, buf, n); } -/* - * Give this ABD ownership of the buffer that it's storing. Can only be used on - * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated - * with abd_alloc_linear() which subsequently released ownership of their buf - * with abd_release_ownership_of_buf(). - */ -void -abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata) -{ - ASSERT(abd_is_linear(abd)); - ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); - abd_verify(abd); - - abd->abd_flags |= ABD_FLAG_OWNER; - if (is_metadata) { - abd->abd_flags |= ABD_FLAG_META; - } - - ABDSTAT_BUMP(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); -} - void abd_release_ownership_of_buf(abd_t *abd) { ASSERT(abd_is_linear(abd)); ASSERT(abd->abd_flags & ABD_FLAG_OWNER); + + /* + * abd_free() needs to handle LINEAR_PAGE ABD's specially. + * Since that flag does not survive the + * abd_release_ownership_of_buf() -> abd_get_from_buf() -> + * abd_take_ownership_of_buf() sequence, we don't allow releasing + * these "linear but not zio_[data_]buf_alloc()'ed" ABD's. 
+ */ + ASSERT(!abd_is_linear_page(abd)); + abd_verify(abd); abd->abd_flags &= ~ABD_FLAG_OWNER; /* Disable this flag since we no longer own the data buffer */ abd->abd_flags &= ~ABD_FLAG_META; - ABDSTAT_BUMPDOWN(abdstat_linear_cnt); - ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); + abd_update_linear_stats(abd, ABDSTAT_DECR); } -struct abd_iter { - abd_t *iter_abd; /* ABD being iterated through */ - size_t iter_pos; /* position (relative to abd_offset) */ - void *iter_mapaddr; /* addr corresponding to iter_pos */ - size_t iter_mapsize; /* length of data valid at mapaddr */ -}; - -static inline size_t -abd_iter_scatter_chunk_offset(struct abd_iter *aiter) -{ - ASSERT(!abd_is_linear(aiter->iter_abd)); - return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset + - aiter->iter_pos) % zfs_abd_chunk_size); -} - -static inline size_t -abd_iter_scatter_chunk_index(struct abd_iter *aiter) -{ - ASSERT(!abd_is_linear(aiter->iter_abd)); - return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset + - aiter->iter_pos) / zfs_abd_chunk_size); -} /* - * Initialize the abd_iter. + * Give this ABD ownership of the buffer that it's storing. Can only be used on + * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated + * with abd_alloc_linear() which subsequently released ownership of their buf + * with abd_release_ownership_of_buf(). */ -static void -abd_iter_init(struct abd_iter *aiter, abd_t *abd) +void +abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata) { + ASSERT(abd_is_linear(abd)); + ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); abd_verify(abd); - aiter->iter_abd = abd; - aiter->iter_pos = 0; - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; -} - -/* - * Advance the iterator by a certain amount. Cannot be called when a chunk is - * in use. This can be safely called when the aiter has already exhausted, in - * which case this does nothing. - */ -static void -abd_iter_advance(struct abd_iter *aiter, size_t amount) -{ - ASSERT3P(aiter->iter_mapaddr, ==, NULL); - ASSERT0(aiter->iter_mapsize); - - /* There's nothing left to advance to, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - aiter->iter_pos += amount; -} - -/* - * Map the current chunk into aiter. This can be safely called when the aiter - * has already exhausted, in which case this does nothing. - */ -static void -abd_iter_map(struct abd_iter *aiter) -{ - void *paddr; - size_t offset = 0; - - ASSERT3P(aiter->iter_mapaddr, ==, NULL); - ASSERT0(aiter->iter_mapsize); - - /* Panic if someone has changed zfs_abd_chunk_size */ - IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size == - aiter->iter_abd->abd_u.abd_scatter.abd_chunk_size); - - /* There's nothing left to iterate over, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - if (abd_is_linear(aiter->iter_abd)) { - offset = aiter->iter_pos; - aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; - paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf; - } else { - size_t index = abd_iter_scatter_chunk_index(aiter); - offset = abd_iter_scatter_chunk_offset(aiter); - aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset, - aiter->iter_abd->abd_size - aiter->iter_pos); - paddr = aiter->iter_abd->abd_u.abd_scatter.abd_chunks[index]; + abd->abd_flags |= ABD_FLAG_OWNER; + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; } - aiter->iter_mapaddr = (char *)paddr + offset; -} - -/* - * Unmap the current chunk from aiter. 
This can be safely called when the aiter - * has already exhausted, in which case this does nothing. - */ -static void -abd_iter_unmap(struct abd_iter *aiter) -{ - /* There's nothing left to unmap, so do nothing */ - if (aiter->iter_pos == aiter->iter_abd->abd_size) - return; - - ASSERT3P(aiter->iter_mapaddr, !=, NULL); - ASSERT3U(aiter->iter_mapsize, >, 0); - aiter->iter_mapaddr = NULL; - aiter->iter_mapsize = 0; + abd_update_linear_stats(abd, ABDSTAT_INCR); } int @@ -987,6 +703,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, struct abd_iter caiters[3]; struct abd_iter daiter = {0}; void *caddrs[3]; + unsigned long flags = 0; ASSERT3U(parity, <=, 3); @@ -998,7 +715,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, ASSERT3S(dsize, >=, 0); - critical_enter(); + abd_enter_critical(flags); while (csize > 0) { len = csize; @@ -1010,11 +727,14 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, caddrs[i] = caiters[i].iter_mapaddr; } + switch (parity) { case 3: len = MIN(caiters[2].iter_mapsize, len); + /* falls through */ case 2: len = MIN(caiters[1].iter_mapsize, len); + /* falls through */ case 1: len = MIN(caiters[0].iter_mapsize, len); } @@ -1055,7 +775,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, ASSERT3S(dsize, >=, 0); ASSERT3S(csize, >=, 0); } - critical_exit(); + abd_exit_critical(flags); } /* @@ -1080,6 +800,7 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, struct abd_iter citers[3]; struct abd_iter xiters[3]; void *caddrs[3], *xaddrs[3]; + unsigned long flags = 0; ASSERT3U(parity, <=, 3); @@ -1088,7 +809,7 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, abd_iter_init(&xiters[i], tabds[i]); } - critical_enter(); + abd_enter_critical(flags); while (tsize > 0) { for (i = 0; i < parity; i++) { @@ -1103,9 +824,11 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, case 3: len = MIN(xiters[2].iter_mapsize, len); len = MIN(citers[2].iter_mapsize, len); + /* falls through */ case 2: len = MIN(xiters[1].iter_mapsize, len); len = MIN(citers[1].iter_mapsize, len); + /* falls through */ case 1: len = MIN(xiters[0].iter_mapsize, len); len = MIN(citers[0].iter_mapsize, len); @@ -1130,5 +853,5 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, tsize -= len; ASSERT3S(tsize, >=, 0); } - critical_exit(); + abd_exit_critical(flags); } diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 078ba8bab69b..bfae40b6d76a 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -1638,7 +1638,7 @@ vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio) if (ic->ic_data == NULL) continue; - abd_zero(ic->ic_data, ic->ic_data->abd_size); + abd_zero(ic->ic_data, abd_get_size(ic->ic_data)); } iv->iv_attempts_max *= 2; From 7fcf82451c4b75afe327c77683f66bf0c6396a48 Mon Sep 17 00:00:00 2001 From: Richard Laager Date: Sun, 10 May 2020 14:26:08 -0500 Subject: [PATCH 12/27] Change zfsunlock for better busybox compatibility It turns out that there are two versions of Busybox, at least on Ubuntu 18.04. If you have the busybox-static package installed, you get a busybox that supports `ps a` and `head`. If you only have busybox-initramfs, you don't. Either way, you have `awk`. This change should also make this compatible with GNU ps, if you somehow end up with that in the initramfs environment. 
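To illustrate the portable pattern (askpwd-cmd is a placeholder for
whatever $zfs_console_askpwd_cmd expands to):

    ps | awk '!/awk/ && /askpwd-cmd/ { print $1; exit }'

Both busybox ps and procps ps print the PID in the first column of
their default output, and the !/awk/ guard keeps awk from matching
its own command line, so grep, head, cut, and sort are no longer
needed.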
Reviewed-by: Tom Caputi Reviewed-by: Andrey Prokopenko Signed-off-by: Richard Laager Closes #10307 --- contrib/initramfs/zfsunlock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/initramfs/zfsunlock b/contrib/initramfs/zfsunlock index 1202a144deac..f6b6b9dbe14d 100755 --- a/contrib/initramfs/zfsunlock +++ b/contrib/initramfs/zfsunlock @@ -25,7 +25,7 @@ while [ ! -e /run/zfs_unlock_complete ]; do /sbin/zfs load-key "$zfs_fs_name" || true if [ "$(/sbin/zfs get -H -ovalue keystatus "$zfs_fs_name" 2> /dev/null)" = "available" ]; then echo "Password for $zfs_fs_name accepted." - zfs_console_askpwd_pid=$(ps a -o pid= -o args | grep -v grep | grep "$zfs_console_askpwd_cmd" | cut -d ' ' -f3 | sort -n | head -n1) + zfs_console_askpwd_pid=$(ps | awk '!'"/awk/ && /$zfs_console_askpwd_cmd/ { print \$1; exit }") if [ -n "$zfs_console_askpwd_pid" ]; then kill "$zfs_console_askpwd_pid" fi From 2b21da4f763a076e5ea16cf758849503126c5d6c Mon Sep 17 00:00:00 2001 From: AJ Jordan Date: Thu, 7 May 2020 17:49:00 -0400 Subject: [PATCH 13/27] Fix inconsistent capitalization in `arcstat -v` Reviewed-by: Richard Laager Reviewed-by: Brian Behlendorf Signed-off-by: AJ Jordan Closes #10288 --- cmd/arcstat/arcstat | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/cmd/arcstat/arcstat b/cmd/arcstat/arcstat index 66d7e3a8da7a..ecdbfafe3a3f 100755 --- a/cmd/arcstat/arcstat +++ b/cmd/arcstat/arcstat @@ -60,7 +60,7 @@ cols = { "hits": [4, 1000, "ARC reads per second"], "miss": [4, 1000, "ARC misses per second"], "read": [4, 1000, "Total ARC accesses per second"], - "hit%": [4, 100, "ARC Hit percentage"], + "hit%": [4, 100, "ARC hit percentage"], "miss%": [5, 100, "ARC miss percentage"], "dhit": [4, 1000, "Demand hits per second"], "dmis": [4, 1000, "Demand misses per second"], @@ -75,12 +75,12 @@ cols = { "mread": [5, 1000, "Metadata accesses per second"], "mh%": [3, 100, "Metadata hit percentage"], "mm%": [3, 100, "Metadata miss percentage"], - "arcsz": [5, 1024, "ARC Size"], - "c": [4, 1024, "ARC Target Size"], - "mfu": [4, 1000, "MFU List hits per second"], - "mru": [4, 1000, "MRU List hits per second"], - "mfug": [4, 1000, "MFU Ghost List hits per second"], - "mrug": [4, 1000, "MRU Ghost List hits per second"], + "arcsz": [5, 1024, "ARC size"], + "c": [4, 1024, "ARC target size"], + "mfu": [4, 1000, "MFU list hits per second"], + "mru": [4, 1000, "MRU list hits per second"], + "mfug": [4, 1000, "MFU ghost list hits per second"], + "mrug": [4, 1000, "MRU ghost list hits per second"], "eskip": [5, 1000, "evict_skip per second"], "mtxmis": [6, 1000, "mutex_miss per second"], "dread": [5, 1000, "Demand accesses per second"], @@ -92,10 +92,10 @@ cols = { "l2miss%": [7, 100, "L2ARC access miss percentage"], "l2asize": [7, 1024, "Actual (compressed) size of the L2ARC"], "l2size": [6, 1024, "Size of the L2ARC"], - "l2bytes": [7, 1024, "bytes read per second from the L2ARC"], - "grow": [4, 1000, "ARC Grow disabled"], - "need": [4, 1024, "ARC Reclaim need"], - "free": [4, 1024, "ARC Free memory"], + "l2bytes": [7, 1024, "Bytes read per second from the L2ARC"], + "grow": [4, 1000, "ARC grow disabled"], + "need": [4, 1024, "ARC reclaim need"], + "free": [4, 1024, "ARC free memory"], } v = {} From ac806a25591d5419f819405199b1962b84bad60c Mon Sep 17 00:00:00 2001 From: AJ Jordan Date: Sun, 3 May 2020 22:23:46 -0400 Subject: [PATCH 14/27] Import the arcstat(1m) manpage from illumos And move it from section 1m to section 1 for consistency. 
Imported from illumos commit f34d737f. Reviewed-by: Richard Laager Reviewed-by: Brian Behlendorf Signed-off-by: AJ Jordan Closes #10288 --- man/man1/arcstat.1 | 455 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 455 insertions(+) create mode 100644 man/man1/arcstat.1 diff --git a/man/man1/arcstat.1 b/man/man1/arcstat.1 new file mode 100644 index 000000000000..f65095fe57dc --- /dev/null +++ b/man/man1/arcstat.1 @@ -0,0 +1,455 @@ +.\" +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.\" +.\" +.\" Copyright 2014 Adam Stevko. All rights reserved. +.\" Copyright (c) 2015 by Delphix. All rights reserved. +.\" +.TH ARCSTAT 1M "Feb 4, 2014" +.SH NAME +arcstat \- report ZFS ARC and L2ARC statistics +.SH SYNOPSIS +.LP +.nf +\fBarcstat\fR [\fB-hvxr\fR] [\fB-f field[,field]...\fR] [\fB-o file\fR] [\fB-s string\fR] + [\fBinterval\fR [\fBcount\fR]] +.fi + +.SH DESCRIPTION +.LP +The \fBarcstat\fR utility print various ZFS ARC and L2ARC statistics in +vmstat-like fashion. +.sp + +.sp +.LP +The \fBarcstat\fR command reports the following information: +.sp +.ne 2 + +.\" +.sp +.ne 1 +.na +\fBc \fR +.ad +.RS 14n +ARC Target Size +.RE + +.sp +.ne 2 +.na +\fBdh% \fR +.ad +.RS 14n +Demand Data hit percentage +.RE + +.sp +.ne 2 +.na +\fBdm% \fR +.ad +.RS 14n +Demand Data miss percentage +.RE + +.sp +.ne 2 +.na +\fBmfu \fR +.ad +.RS 14n +MFU List hits per second +.RE + +.sp +.ne 2 +.na +\fBmh% \fR +.ad +.RS 14n +Metadata hit percentage +.RE + +.sp +.ne 2 +.na +\fBmm% \fR +.ad +.RS 14n +Metadata miss percentage +.RE + +.sp +.ne 2 +.na +\fBmru \fR +.ad +.RS 14n +MRU List hits per second +.RE + +.sp +.ne 2 +.na +\fBph% \fR +.ad +.RS 14n +Prefetch hits percentage +.RE + +.sp +.ne 2 +.na +\fBpm% \fR +.ad +.RS 14n +Prefetch miss percentage +.RE + +.sp +.ne 2 +.na +\fBdhit \fR +.ad +.RS 14n +Demand Data hits per second +.RE + +.sp +.ne 2 +.na +\fBdmis \fR +.ad +.RS 14n +Demand Data misses per second +.RE + +.sp +.ne 2 +.na +\fBhit% \fR +.ad +.RS 14n +ARC Hit percentage +.RE + +.sp +.ne 2 +.na +\fBhits \fR +.ad +.RS 14n +ARC reads per second +.RE + +.sp +.ne 2 +.na +\fBmfug \fR +.ad +.RS 14n +MFU Ghost List hits per second +.RE + +.sp +.ne 2 +.na +\fBmhit \fR +.ad +.RS 14n +Metadata hits per second +.RE + +.sp +.ne 2 +.na +\fBmiss \fR +.ad +.RS 14n +ARC misses per second +.RE + +.sp +.ne 2 +.na +\fBmmis \fR +.ad +.RS 14n +Metadata misses per second +.RE + +.sp +.ne 2 +.na +\fBmrug \fR +.ad +.RS 14n +MRU Ghost List hits per second +.RE + +.sp +.ne 2 +.na +\fBphit \fR +.ad +.RS 14n +Prefetch hits per second +.RE + +.sp +.ne 2 +.na +\fBpmis \fR +.ad +.RS 14n +Prefetch misses per second +.RE + +.sp +.ne 2 +.na +\fBread \fR +.ad +.RS 14n +Total ARC accesses per second +.RE + +.sp +.ne 2 +.na +\fBtime \fR +.ad +.RS 14n +Time +.RE + +.sp +.ne 2 +.na +\fBarcsz \fR +.ad +.RS 14n +ARC Size +.RE + +.sp +.ne 2 +.na +\fBdread \fR +.ad +.RS 14n +Demand data accesses per second +.RE + +.sp +.ne 2 +.na +\fBeskip \fR +.ad +.RS 14n +evict_skip per second +.RE + +.sp +.ne 2 +.na +\fBmiss% \fR +.ad +.RS 14n +ARC miss percentage +.RE + +.sp +.ne 2 +.na +\fBmread \fR +.ad +.RS 14n +Metadata accesses per second +.RE + +.sp +.ne 2 +.na +\fBpread \fR +.ad +.RS 14n 
+Prefetch accesses per second +.RE + +.sp +.ne 2 +.na +\fBl2hit% \fR +.ad +.RS 14n +L2ARC access hit percentage +.RE + +.sp +.ne 2 +.na +\fBl2hits \fR +.ad +.RS 14n +L2ARC hits per second +.RE + +.sp +.ne 2 +.na +\fBl2miss \fR +.ad +.RS 14n +L2ARC misses per second +.RE + +.sp +.ne 2 +.na +\fBl2read \fR +.ad +.RS 14n +Total L2ARC accesses per second +.RE + +.sp +.ne 2 +.na +\fBl2size \fR +.ad +.RS 14n +Size of the L2ARC +.RE + +.sp +.ne 2 +.na +\fBmtxmis \fR +.ad +.RS 14n +mutex_miss per second +.RE + +.sp +.ne 2 +.na +\fBl2bytes \fR +.ad +.RS 14n +bytes read per second from the L2ARC +.RE + +.sp +.ne 2 +.na +\fBl2miss% \fR +.ad +.RS 14n +L2ARC access miss percentage +.RE + +.sp +.ne 2 +.na +\fBl2asize \fR +.ad +.RS 14n +Actual (compressed) size of the L2ARC +.RE +.\" + +.SH OPTIONS +.LP +The following options are supported: + +.sp +.ne 2 +.na +\fB\fB-f\fR\fR +.ad +.RS 12n +Display only specific fields. See \fBDESCRIPTION\fR for supported statistics. +.RE + +.sp +.ne 2 +.na +\fB\fB-h\fR\fR +.ad +.RS 12n +Display help message. +.RE + +.sp +.ne 2 +.na +\fB\fB-o\fR\fR +.ad +.RS 12n +Report statistics to a file instead of the standard output. +.RE + +.sp +.ne 2 +.na +\fB\fB-s\fR\fR +.ad +.RS 12n +Display data with a specified separator (default: 2 spaces). +.RE + +.sp +.ne 2 +.na +\fB\fB-x\fR\fR + +.ad +.RS 12n +Print extended stats (same as -f time,mfu,mru,mfug,mrug,eskip,mtxmis,dread,pread,read). +.RE + +.sp +.ne 2 +.na +\fB\fB-v\fR\fR +.ad +.RS 12n +Show field headers and definitions +.RE + +.SH OPERANDS +.LP +The following operands are supported: +.sp +.ne 2 +.na +\fB\fIcount\fR\fR +.ad +.RS 12n +Display only \fIcount\fR reports. +.RE + +.sp +.ne 2 +.na +\fB\fIinterval\fR\fR +.ad +.RS 12n +Specify the sampling interval in seconds. +.RE + +.SH AUTHORS +.LP +arcstat was originally written by Neelakanth Nadgir and supported only ZFS ARC statistics. +Mike Harsch updated it to support L2ARC statistics. From 5a04b177175607d1ac2ab7d43c80ce1a033584dc Mon Sep 17 00:00:00 2001 From: AJ Jordan Date: Mon, 4 May 2020 03:49:33 -0400 Subject: [PATCH 15/27] Fix up arcstat(1) to match our version Turns out the illumos manpage, which is what this originates from, was written for the original Perl version of the utility which is not the version in the OpenZFS tree. *That* version originates from a Python rewrite that was done for FreeNAS. So fix up the manpage to match what we actually ship (and fix a few typos in the process). Reviewed-by: Richard Laager Reviewed-by: Brian Behlendorf Signed-off-by: AJ Jordan Closes #10288 --- man/man1/Makefile.am | 2 +- man/man1/arcstat.1 | 63 +++++++++++++++++++++++++++++++------------- 2 files changed, 46 insertions(+), 19 deletions(-) diff --git a/man/man1/Makefile.am b/man/man1/Makefile.am index 2af917fa5c2e..de54328617eb 100644 --- a/man/man1/Makefile.am +++ b/man/man1/Makefile.am @@ -1,4 +1,4 @@ -dist_man_MANS = zhack.1 ztest.1 raidz_test.1 zvol_wait.1 +dist_man_MANS = zhack.1 ztest.1 raidz_test.1 zvol_wait.1 arcstat.1 EXTRA_DIST = cstyle.1 install-data-local: diff --git a/man/man1/arcstat.1 b/man/man1/arcstat.1 index f65095fe57dc..6dcc39b67f2f 100644 --- a/man/man1/arcstat.1 +++ b/man/man1/arcstat.1 @@ -11,15 +11,15 @@ .\" .\" Copyright 2014 Adam Stevko. All rights reserved. .\" Copyright (c) 2015 by Delphix. All rights reserved. +.\" Copyright (c) 2020 by AJ Jordan. All rights reserved. 
.\" -.TH ARCSTAT 1M "Feb 4, 2014" +.TH ARCSTAT 1 "May 7, 2020" .SH NAME arcstat \- report ZFS ARC and L2ARC statistics .SH SYNOPSIS .LP .nf -\fBarcstat\fR [\fB-hvxr\fR] [\fB-f field[,field]...\fR] [\fB-o file\fR] [\fB-s string\fR] - [\fBinterval\fR [\fBcount\fR]] +\fBarcstat\fR [\fB-hvx\fR] [\fB-f field[,field]...\fR] [\fB-o file\fR] [\fB-s string\fR] [\fBinterval\fR [\fBcount\fR]] .fi .SH DESCRIPTION @@ -36,12 +36,12 @@ The \fBarcstat\fR command reports the following information: .\" .sp -.ne 1 +.ne 1 .na \fBc \fR .ad .RS 14n -ARC Target Size +ARC target size .RE .sp @@ -50,7 +50,7 @@ ARC Target Size \fBdh% \fR .ad .RS 14n -Demand Data hit percentage +Demand data hit percentage .RE .sp @@ -59,7 +59,7 @@ Demand Data hit percentage \fBdm% \fR .ad .RS 14n -Demand Data miss percentage +Demand data miss percentage .RE .sp @@ -68,7 +68,7 @@ Demand Data miss percentage \fBmfu \fR .ad .RS 14n -MFU List hits per second +MFU list hits per second .RE .sp @@ -95,7 +95,7 @@ Metadata miss percentage \fBmru \fR .ad .RS 14n -MRU List hits per second +MRU list hits per second .RE .sp @@ -122,7 +122,7 @@ Prefetch miss percentage \fBdhit \fR .ad .RS 14n -Demand Data hits per second +Demand data hits per second .RE .sp @@ -131,7 +131,7 @@ Demand Data hits per second \fBdmis \fR .ad .RS 14n -Demand Data misses per second +Demand data misses per second .RE .sp @@ -140,7 +140,7 @@ Demand Data misses per second \fBhit% \fR .ad .RS 14n -ARC Hit percentage +ARC hit percentage .RE .sp @@ -158,7 +158,7 @@ ARC reads per second \fBmfug \fR .ad .RS 14n -MFU Ghost List hits per second +MFU ghost list hits per second .RE .sp @@ -194,7 +194,7 @@ Metadata misses per second \fBmrug \fR .ad .RS 14n -MRU Ghost List hits per second +MRU ghost list hits per second .RE .sp @@ -239,7 +239,7 @@ Time \fBarcsz \fR .ad .RS 14n -ARC Size +ARC size .RE .sp @@ -347,7 +347,7 @@ mutex_miss per second \fBl2bytes \fR .ad .RS 14n -bytes read per second from the L2ARC +Bytes read per second from the L2ARC .RE .sp @@ -367,6 +367,33 @@ L2ARC access miss percentage .RS 14n Actual (compressed) size of the L2ARC .RE + +.sp +.ne 2 +.na +\fBgrow \fR +.ad +.RS 14n +ARC grow disabled +.RE + +.sp +.ne 2 +.na +\fBneed \fR +.ad +.RS 14n +ARC reclaim needed +.RE + +.sp +.ne 2 +.na +\fBfree \fR +.ad +.RS 14n +ARC free memory +.RE .\" .SH OPTIONS @@ -413,7 +440,6 @@ Display data with a specified separator (default: 2 spaces). .ne 2 .na \fB\fB-x\fR\fR - .ad .RS 12n Print extended stats (same as -f time,mfu,mru,mfug,mrug,eskip,mtxmis,dread,pread,read). @@ -451,5 +477,6 @@ Specify the sampling interval in seconds. .SH AUTHORS .LP -arcstat was originally written by Neelakanth Nadgir and supported only ZFS ARC statistics. +arcstat was originally written in Perl by Neelakanth Nadgir and supported only ZFS ARC statistics. Mike Harsch updated it to support L2ARC statistics. +John Hixson ported it to Python for FreeNAS over some beer, after which many individuals from the OpenZFS community continued to maintain and improve it. 
From b29e31d80d6cb78dbd889e9b529333944b4c3ba1 Mon Sep 17 00:00:00 2001
From: AJ Jordan 
Date: Mon, 4 May 2020 04:00:59 -0400
Subject: [PATCH 16/27] Fix outdated comment header

Reviewed-by: Richard Laager 
Reviewed-by: Brian Behlendorf 
Signed-off-by: AJ Jordan 
Closes #10288
---
 cmd/arcstat/arcstat | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/cmd/arcstat/arcstat b/cmd/arcstat/arcstat
index ecdbfafe3a3f..df2f62d6b29f 100755
--- a/cmd/arcstat/arcstat
+++ b/cmd/arcstat/arcstat
@@ -1,20 +1,25 @@
 #!/usr/bin/env python3
 #
 # Print out ZFS ARC Statistics exported via kstat(1)
-# For a definition of fields, or usage, use arctstat.pl -v
+# For a definition of fields, or usage, use arcstat -v
 #
-# This script is a fork of the original arcstat.pl (0.1) by
-# Neelakanth Nadgir, originally published on his Sun blog on
+# This script was originally a fork of the original arcstat.pl (0.1)
+# by Neelakanth Nadgir, originally published on his Sun blog on
 # 09/18/2007
 # http://blogs.sun.com/realneel/entry/zfs_arc_statistics
 #
-# This version aims to improve upon the original by adding features
-# and fixing bugs as needed. This version is maintained by
-# Mike Harsch and is hosted in a public open source repository:
+# A new version aimed to improve upon the original by adding features
+# and fixing bugs as needed. This version was maintained by Mike
+# Harsch and was hosted in a public open source repository:
 # http://github.com/mharsch/arcstat
 #
-# Comments, Questions, or Suggestions are always welcome.
-# Contact the maintainer at ( mike at harschsystems dot com )
+# but has since moved to the illumos-gate repository.
+#
+# This Python port was written by John Hixson for FreeNAS, introduced
+# in commit e2c29f:
+# https://github.com/freenas/freenas
+#
+# and has been improved by many people since.
 #
 # CDDL HEADER START
 #

From 41035a049643ff7083a6cb6cd43b8eb70a7d18a1 Mon Sep 17 00:00:00 2001
From: John Poduska 
Date: Wed, 13 May 2020 13:54:27 -0400
Subject: [PATCH 17/27] Resilver restarts unnecessarily when it encounters
 errors

When a resilver finishes, vdev_dtl_reassess is called to hopefully
excise DTL_MISSING (amongst other things). If there are errors
during the resilver, they are tracked in DTL_SCRUB, as spelled out
in the block comment in vdev.c. DTL_SCRUB is in-core only, so it
can only be used if the pool was online for the whole resilver.
This state is tracked with the spa_scrub_started flag, which only
gets set when the scan is initialized. Unfortunately, this flag gets
cleared right before vdev_dtl_reassess gets called, so if there are
any errors during the scan, DTL_MISSING will never get excised and
the resilver will just continually restart. This fix simply delays
clearing that flag until after the call to vdev_dtl_reassess.

In addition, if a pool is imported and already has scn_errors > 0,
this change will restart the resilver immediately instead of doing
the rest of the scan and then restarting it from the beginning. On
the other hand, if scn_errors == 0 at import, then no errors have
been encountered so far, so the spa_scrub_started flag can be safely
set.

A test has been added to verify that resilver does not restart when
relevant DTL's are available. 
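In sketch form (condensed from the dsl_scan_init() hunk below), the
new import-time check is:

    if (scn->scn_phys.scn_errors > 0)
        scn->scn_restart_txg = txg;      /* in-core DTL can't be trusted */
    else
        spa->spa_scrub_started = B_TRUE; /* safe to excise when finished */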
Reviewed-by: Brian Behlendorf Reviewed-by: Paul Zuchowski Signed-off-by: John Poduska Closes #10291 --- module/zfs/dsl_scan.c | 23 +++- module/zfs/vdev.c | 22 +++- tests/runfiles/common.run | 2 +- tests/zfs-tests/include/tunables.cfg | 1 + .../tests/functional/resilver/Makefile.am | 3 +- .../resilver/resilver_restart_002.ksh | 102 ++++++++++++++++++ 6 files changed, 149 insertions(+), 4 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/resilver/resilver_restart_002.ksh diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 74ef2e15569a..f095017936c0 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -542,6 +542,22 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) zfs_dbgmsg("new-style scrub was modified " "by old software; restarting in txg %llu", (longlong_t)scn->scn_restart_txg); + } else if (dsl_scan_resilvering(dp)) { + /* + * If a resilver is in progress and there are already + * errors, restart it instead of finishing this scan and + * then restarting it. If there haven't been any errors + * then remember that the incore DTL is valid. + */ + if (scn->scn_phys.scn_errors > 0) { + scn->scn_restart_txg = txg; + zfs_dbgmsg("resilver can't excise DTL_MISSING " + "when finished; restarting in txg %llu", + (u_longlong_t)scn->scn_restart_txg); + } else { + /* it's safe to excise DTL when finished */ + spa->spa_scrub_started = B_TRUE; + } } } @@ -887,7 +903,6 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa)); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { - spa->spa_scrub_started = B_FALSE; spa->spa_scrub_active = B_FALSE; /* @@ -914,6 +929,12 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) } spa_errlog_rotate(spa); + /* + * Don't clear flag until after vdev_dtl_reassess to ensure that + * DTL_MISSING will get updated when possible. + */ + spa->spa_scrub_started = B_FALSE; + /* * We may have finished replacing a device. * Let the async thread assess this and handle the detach. diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 3c2135029bd0..923bf2e336a6 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -2583,7 +2583,6 @@ vdev_dtl_should_excise(vdev_t *vd) spa_t *spa = vd->vdev_spa; dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; - ASSERT0(scn->scn_phys.scn_errors); ASSERT0(vd->vdev_children); if (vd->vdev_state < VDEV_STATE_DEGRADED) @@ -2634,6 +2633,7 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + boolean_t wasempty = B_TRUE; mutex_enter(&vd->vdev_dtl_lock); @@ -2643,6 +2643,18 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) if (zfs_scan_ignore_errors && scn) scn->scn_phys.scn_errors = 0; + if (scrub_txg != 0 && + !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { + wasempty = B_FALSE; + zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d " + "dtl:%llu/%llu errors:%llu", + (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg, + (u_longlong_t)scrub_txg, spa->spa_scrub_started, + (u_longlong_t)vdev_dtl_min(vd), + (u_longlong_t)vdev_dtl_max(vd), + (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0)); + } + /* * If we've completed a scan cleanly then determine * if this vdev should remove any DTLs. 
We only want to @@ -2679,6 +2691,14 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) space_reftree_generate_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); + + if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { + zfs_dbgmsg("update DTL_MISSING:%llu/%llu", + (u_longlong_t)vdev_dtl_min(vd), + (u_longlong_t)vdev_dtl_max(vd)); + } else if (!wasempty) { + zfs_dbgmsg("DTL_MISSING is now empty"); + } } range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); range_tree_walk(vd->vdev_dtl[DTL_MISSING], diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 2fcde83b3c39..a9bede475361 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -758,7 +758,7 @@ tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos', tags = ['functional', 'reservation'] [tests/functional/resilver] -tests = ['resilver_restart_001'] +tests = ['resilver_restart_001', 'resilver_restart_002'] tags = ['functional', 'resilver'] [tests/functional/rootpool] diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index f85b156586be..efbcc09e7eb4 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -59,6 +59,7 @@ OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_esti REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment RESILVER_MIN_TIME_MS resilver_min_time_ms zfs_resilver_min_time_ms +SCAN_LEGACY scan_legacy zfs_scan_legacy SCAN_SUSPEND_PROGRESS scan_suspend_progress zfs_scan_suspend_progress SCAN_VDEV_LIMIT scan_vdev_limit zfs_scan_vdev_limit SEND_HOLES_WITHOUT_BIRTH_TIME send_holes_without_birth_time send_holes_without_birth_time diff --git a/tests/zfs-tests/tests/functional/resilver/Makefile.am b/tests/zfs-tests/tests/functional/resilver/Makefile.am index 465d8f3a3a31..38136a843aac 100644 --- a/tests/zfs-tests/tests/functional/resilver/Makefile.am +++ b/tests/zfs-tests/tests/functional/resilver/Makefile.am @@ -2,7 +2,8 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/resilver dist_pkgdata_SCRIPTS = \ setup.ksh \ cleanup.ksh \ - resilver_restart_001.ksh + resilver_restart_001.ksh \ + resilver_restart_002.ksh dist_pkgdata_DATA = \ resilver.cfg diff --git a/tests/zfs-tests/tests/functional/resilver/resilver_restart_002.ksh b/tests/zfs-tests/tests/functional/resilver/resilver_restart_002.ksh new file mode 100755 index 000000000000..ebe5e693b2cf --- /dev/null +++ b/tests/zfs-tests/tests/functional/resilver/resilver_restart_002.ksh @@ -0,0 +1,102 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2020, Datto Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/resilver/resilver.cfg + +# +# DESCRIPTION: +# Testing resilver completes when scan errors are encountered, but relevant +# DTL's have not been lost. +# +# STRATEGY: +# 1. Create a pool (1k recordsize) +# 2. Create a 32m file (32k records) +# 3. 
Inject an error halfway through the file +# 4. Start a resilver, ensure the error is triggered and that the resilver +# does not restart after finishing +# +# NB: use legacy scanning to ensure scan of specific block causes error +# + +function cleanup +{ + log_must zinject -c all + destroy_pool $TESTPOOL + rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE + log_must set_tunable32 SCAN_LEGACY $ORIG_SCAN_LEGACY +} + +log_assert "Check for resilver restarts caused by scan errors" + +ORIG_SCAN_LEGACY=$(get_tunable SCAN_LEGACY) + +log_onexit cleanup + +# use legacy scan to ensure injected error will be triggered +log_must set_tunable32 SCAN_LEGACY 1 + + # create the pool and a 32M file (32k blocks) +log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[0]} $SPARE_VDEV_FILE +log_must zpool create -f -O recordsize=1k $TESTPOOL ${VDEV_FILES[0]} +log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=1M count=32 > /dev/null 2>&1 + +# determine objset/object +objset=$(zdb -d $TESTPOOL/ | sed -ne 's/.*ID \([0-9]*\).*/\1/p') +object=$(ls -i /$TESTPOOL/file | awk '{print $1}') + +# inject event to cause error during resilver +log_must zinject -b `printf "%x:%x:0:3fff" $objset $object` $TESTPOOL + +# clear events and start resilver +log_must zpool events -c +log_must zpool attach $TESTPOOL ${VDEV_FILES[0]} $SPARE_VDEV_FILE + +log_note "waiting for read errors to start showing up" +for iter in {0..59} +do + zpool sync $TESTPOOL + err=$(zpool status $TESTPOOL | grep ${VDEV_FILES[0]} | awk '{print $3}') + (( $err > 0 )) && break + sleep 1 +done + +(( $err == 0 )) && log_fail "Unable to induce errors in resilver" + +log_note "waiting for resilver to finish" +for iter in {0..59} +do + finish=$(zpool events | grep "sysevent.fs.zfs.resilver_finish" | wc -l) + (( $finish > 0 )) && break + sleep 1 +done + +(( $finish == 0 )) && log_fail "resilver took too long to finish" + +# wait a few syncs to ensure that zfs does not restart the resilver +log_must zpool sync $TESTPOOL +log_must zpool sync $TESTPOOL + +# check if resilver was restarted +start=$(zpool events | grep "sysevent.fs.zfs.resilver_start" | wc -l) +(( $start != 1 )) && log_fail "resilver restarted unnecessarily" + +log_pass "Resilver did not restart unnecessarily from scan errors" From e1fcd940e7a550e0631fb7e80c5df6d84c446d28 Mon Sep 17 00:00:00 2001 From: John Wren Kennedy Date: Thu, 14 May 2020 10:39:47 -0600 Subject: [PATCH 18/27] ZTS: zpool_split_indirect deletes zfstest log file The cleanup routine for this test attempts to remove some temporary files with `rm -f $VDEV_*`, but VDEV_ is undefined. As a result, all files in the current working directory (/var/tmp/test_results/current) get removed instead. This includes the complete log file of all tests. Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Reviewed-by: George Amanakis Reviewed-by: Ryan Moeller Signed-off-by: John Kennedy Closes #10324 --- .../functional/cli_root/zpool_split/zpool_split_indirect.ksh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_indirect.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_indirect.ksh index d6b0e7358ed7..13f0d08b7f20 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_indirect.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_indirect.ksh @@ -16,6 +16,7 @@ # # Copyright (c) 2020, George Amanakis. All rights reserved. +# Copyright (c) 2020 by Delphix. All rights reserved. # . 
$STF_SUITE/include/libtest.shlib @@ -42,7 +43,7 @@ function cleanup if poolexists $TESTPOOL2 ; then destroy_pool $TESTPOOL2 fi - rm -f $VDEV_* + rm -f $VDEV_TEMP $VDEV_M1 $VDEV_M2 } log_onexit cleanup From c87f9586687d53bc7c3f2e8887841267a3960269 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 14 May 2020 09:41:29 -0700 Subject: [PATCH 19/27] flake8 E741 variable name warning Update the zts-report.py script to conform to the flake8 E741 rule. "Variables named I, O, and l can be very hard to read. This is because the letter I and the letter l are easily confused, and the letter O and the number 0 can be easily confused." - https://www.flake8rules.com/rules/E741.html Reviewed-by: George Melikov Reviewed-by: Ryan Moeller Reviewed-by: John Kennedy Signed-off-by: Brian Behlendorf Closes #10323 --- tests/test-runner/bin/zts-report.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test-runner/bin/zts-report.py b/tests/test-runner/bin/zts-report.py index d74aa9d7aef8..5420ff9d29c0 100755 --- a/tests/test-runner/bin/zts-report.py +++ b/tests/test-runner/bin/zts-report.py @@ -272,8 +272,8 @@ def process_results(pathname): pattern_log = r'^\s*Log directory:\s*(\S*)' d = {} - for l in f.readlines(): - m = re.match(pattern, l) + for line in f.readlines(): + m = re.match(pattern, line) if m and len(m.groups()) == 4: summary['total'] += 1 if m.group(4) == "PASS": @@ -281,7 +281,7 @@ def process_results(pathname): d[m.group(1)] = m.group(4) continue - m = re.match(pattern_log, l) + m = re.match(pattern_log, line) if m: summary['logfile'] = m.group(1) From 8b240f14f93822129ab9fb0674fc27f6353b0a2d Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Thu, 14 May 2020 12:10:29 -0700 Subject: [PATCH 20/27] remove unneeded member drc_err of dmu_recv_cookie_t The member drc_err of dmu_recv_cookie_t is used only locally in receive_read, so we can replace it with a local variable. Reviewed-by: Brian Behlendorf Signed-off-by: Matthew Ahrens Closes #10319 --- include/sys/dmu_recv.h | 1 - module/zfs/dmu_recv.c | 12 +++++------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/include/sys/dmu_recv.h b/include/sys/dmu_recv.h index c0a562115a20..ed041ba70b73 100644 --- a/include/sys/dmu_recv.h +++ b/include/sys/dmu_recv.h @@ -73,7 +73,6 @@ typedef struct dmu_recv_cookie { struct receive_record_arg *drc_next_rrd; zio_cksum_t drc_cksum; zio_cksum_t drc_prev_cksum; - int drc_err; /* Sorted list of objects not to issue prefetches for. */ objlist_t *drc_ignore_objlist; } dmu_recv_cookie_t; diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index aaeaa4c34f7a..ed52b25e6187 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -1217,10 +1217,8 @@ receive_read(dmu_recv_cookie_t *drc, int len, void *buf) while (done < len) { ssize_t resid; - zfs_file_t *fp; - - fp = drc->drc_fp; - drc->drc_err = zfs_file_read(fp, (char *)buf + done, + zfs_file_t *fp = drc->drc_fp; + int err = zfs_file_read(fp, (char *)buf + done, len - done, &resid); if (resid == len - done) { /* @@ -1228,12 +1226,12 @@ receive_read(dmu_recv_cookie_t *drc, int len, void *buf) * that the receive was interrupted and can * potentially be resumed. 
			 */
-			drc->drc_err = SET_ERROR(ZFS_ERR_STREAM_TRUNCATED);
+			err = SET_ERROR(ZFS_ERR_STREAM_TRUNCATED);
 		}
 		drc->drc_voff += len - done - resid;
 		done = len - resid;
-		if (drc->drc_err != 0)
-			return (drc->drc_err);
+		if (err != 0)
+			return (err);
 	}
 
 	drc->drc_bytes_read += len;

From eeb8fae9c7dc9a116f061ee8b266f0a51fd6c8ad Mon Sep 17 00:00:00 2001
From: Jorgen Lundman
Date: Fri, 15 May 2020 07:58:09 +0900
Subject: [PATCH 21/27] Upstream: add missing thread_exit()

Undo the FreeBSD wrapper for thread_create() that was added to call
thread_exit() on the thread's behalf; the missing thread_exit() calls
are instead added to the affected thread functions themselves.

Reviewed-by: Brian Behlendorf
Reviewed-by: Ryan Moeller
Signed-off-by: Jorgen Lundman
Closes #10314
---
 include/os/freebsd/spl/sys/proc.h | 23 +----------------------
 module/zfs/dmu_send.c             |  2 ++
 module/zfs/vdev_initialize.c      |  2 ++
 module/zfs/vdev_removal.c         |  2 ++
 module/zfs/vdev_trim.c            |  2 ++
 5 files changed, 9 insertions(+), 22 deletions(-)

diff --git a/include/os/freebsd/spl/sys/proc.h b/include/os/freebsd/spl/sys/proc.h
index 07201dd6a095..fdb2126d6a3c 100644
--- a/include/os/freebsd/spl/sys/proc.h
+++ b/include/os/freebsd/spl/sys/proc.h
@@ -63,28 +63,12 @@ typedef struct proc proc_t;
 
 extern struct proc *zfsproc;
 
-struct thread_wrap {
-	void *tw_arg;
-	void (*tw_proc)(void*);
-};
-
-static __inline void
-solthread_wrapper(void *arg)
-{
-	struct thread_wrap *tw = arg;
-
-	tw->tw_proc(tw->tw_arg);
-	free(tw, M_SOLARIS);
-	kthread_exit();
-}
-
 static __inline kthread_t *
 do_thread_create(caddr_t stk, size_t stksize, void (*proc)(void *), void *arg,
     size_t len, proc_t *pp, int state, pri_t pri)
 {
 	kthread_t *td = NULL;
 	int error;
-	struct thread_wrap *tw;
 
 	/*
 	 * Be sure there are no surprises.
@@ -92,11 +76,8 @@ do_thread_create(caddr_t stk, size_t stksize, void (*proc)(void *), void *arg,
 	ASSERT(stk == NULL);
 	ASSERT(len == 0);
 	ASSERT(state == TS_RUN);
-	tw = malloc(sizeof (*tw), M_SOLARIS, M_WAITOK);
-	tw->tw_proc = proc;
-	tw->tw_arg = arg;
 
-	error = kproc_kthread_add(solthread_wrapper, tw, &zfsproc, &td,
+	error = kproc_kthread_add(proc, arg, &zfsproc, &td,
 	    RFSTOPPED, stksize / PAGE_SIZE, "zfskern", "solthread %p", proc);
 	if (error == 0) {
 		thread_lock(td);
@@ -105,8 +86,6 @@ do_thread_create(caddr_t stk, size_t stksize, void (*proc)(void *), void *arg,
 #if __FreeBSD_version < 1300068
 		thread_unlock(td);
 #endif
-	} else {
-		free(tw, M_SOLARIS);
 	}
 	return (td);
 }
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index 9069f7e7d095..a5df78edd855 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -1319,6 +1319,8 @@ redact_list_thread(void *arg)
 	record = range_alloc(DATA, 0, 0, 0, B_TRUE);
 	bqueue_enqueue_flush(&rlt_arg->q, record, sizeof (*record));
 	spl_fstrans_unmark(cookie);
+
+	thread_exit();
 }
 
 /*
diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c
index 5899af9fc67b..0d45d9958ef3 100644
--- a/module/zfs/vdev_initialize.c
+++ b/module/zfs/vdev_initialize.c
@@ -558,6 +558,8 @@ vdev_initialize_thread(void *arg)
 	vd->vdev_initialize_thread = NULL;
 	cv_broadcast(&vd->vdev_initialize_cv);
 	mutex_exit(&vd->vdev_initialize_lock);
+
+	thread_exit();
 }
 
 /*
diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c
index dee46f4b3d3d..3f4f9091f43d 100644
--- a/module/zfs/vdev_removal.c
+++ b/module/zfs/vdev_removal.c
@@ -1595,6 +1595,8 @@ spa_vdev_remove_thread(void *arg)
 		ASSERT0(range_tree_space(svr->svr_allocd_segs));
 		vdev_remove_complete(spa);
 	}
+
+	thread_exit();
 }
 
 void
diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c
index 137ba83dff27..b0cd40f68765 100644
--- a/module/zfs/vdev_trim.c
+++ b/module/zfs/vdev_trim.c
@@ 
-896,6 +896,8 @@ vdev_trim_thread(void *arg) vd->vdev_trim_thread = NULL; cv_broadcast(&vd->vdev_trim_cv); mutex_exit(&vd->vdev_trim_lock); + + thread_exit(); } /* From 2ade659eb4f836931f10b69477657a054d743894 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 14 May 2020 20:45:16 -0700 Subject: [PATCH 22/27] Fix abd_enter/exit_critical wrappers Commit fc551d7 introduced the wrappers abd_enter_critical() and abd_exit_critical() to mark critical sections. On Linux these are implemented with the local_irq_save() and local_irq_restore() macros which set the 'flags' argument when saving. By wrapping them with a function the local variable is no longer set by the macro and is no longer properly restored. Convert abd_enter_critical() and abd_exit_critical() to macros to resolve this issue and ensure the flags are properly restored. Reviewed-by: Matthew Ahrens Reviewed-by: Brian Atkinson Signed-off-by: Brian Behlendorf Closes #10332 --- include/sys/abd_impl.h | 15 +++++++++++++-- module/os/freebsd/zfs/abd_os.c | 12 ------------ module/os/linux/zfs/abd_os.c | 12 ------------ module/zfs/abd.c | 4 ++-- 4 files changed, 15 insertions(+), 28 deletions(-) diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h index 6027678af15a..5aee772b1e04 100644 --- a/include/sys/abd_impl.h +++ b/include/sys/abd_impl.h @@ -98,8 +98,6 @@ void abd_update_scatter_stats(abd_t *, abd_stats_op_t); void abd_update_linear_stats(abd_t *, abd_stats_op_t); void abd_verify_scatter(abd_t *); void abd_free_linear_page(abd_t *); -void abd_enter_critical(unsigned long); -void abd_exit_critical(unsigned long); /* OS specific abd_iter functions */ void abd_iter_init(struct abd_iter *, abd_t *); boolean_t abd_iter_at_end(struct abd_iter *); @@ -119,6 +117,19 @@ void abd_iter_unmap(struct abd_iter *); #define ABD_SCATTER(abd) (abd->abd_u.abd_scatter) #define ABD_LINEAR_BUF(abd) (abd->abd_u.abd_linear.abd_buf) +#if defined(_KERNEL) +#if defined(__FreeBSD__) +#define abd_enter_critical(flags) critical_enter() +#define abd_exit_critical(flags) critical_exit() +#else +#define abd_enter_critical(flags) local_irq_save(flags) +#define abd_exit_critical(flags) local_irq_restore(flags) +#endif +#else /* !_KERNEL */ +#define abd_enter_critical(flags) ((void)0) +#define abd_exit_critical(flags) ((void)0) +#endif + #ifdef __cplusplus } #endif diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c index f438841cd411..6b967bc070cb 100644 --- a/module/os/freebsd/zfs/abd_os.c +++ b/module/os/freebsd/zfs/abd_os.c @@ -419,15 +419,3 @@ abd_iter_unmap(struct abd_iter *aiter) aiter->iter_mapaddr = NULL; aiter->iter_mapsize = 0; } - -void -abd_enter_critical(unsigned long flags) -{ - critical_enter(); -} - -void -abd_exit_critical(unsigned long flags) -{ - critical_exit(); -} diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 57e415ef31ae..a8e8f404dd2d 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -803,18 +803,6 @@ abd_iter_unmap(struct abd_iter *aiter) aiter->iter_mapsize = 0; } -void -abd_enter_critical(unsigned long flags) -{ - local_irq_save(flags); -} - -void -abd_exit_critical(unsigned long flags) -{ - local_irq_restore(flags); -} - #if defined(_KERNEL) /* * bio_nr_pages for ABD. 
diff --git a/module/zfs/abd.c b/module/zfs/abd.c index 2e4554da7a62..abb5d5f2ed38 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -703,7 +703,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, struct abd_iter caiters[3]; struct abd_iter daiter = {0}; void *caddrs[3]; - unsigned long flags = 0; + unsigned long flags __maybe_unused = 0; ASSERT3U(parity, <=, 3); @@ -800,7 +800,7 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, struct abd_iter citers[3]; struct abd_iter xiters[3]; void *caddrs[3], *xaddrs[3]; - unsigned long flags = 0; + unsigned long flags __maybe_unused = 0; ASSERT3U(parity, <=, 3); From cdcce2f0190e05127d304fcc261cfdb5284ab621 Mon Sep 17 00:00:00 2001 From: yparitcher Date: Thu, 14 May 2020 22:47:14 -0500 Subject: [PATCH 23/27] Fix VN_OPEN_INVFS typo The VN_OPEN_INVFS literal is in the wrong field. Reviewed-by: Matt Macy Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: yparitcher Closes #10322 --- module/os/freebsd/zfs/zfs_vnops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/os/freebsd/zfs/zfs_vnops.c b/module/os/freebsd/zfs/zfs_vnops.c index 711e0b1f75a8..9f5e58446ff7 100644 --- a/module/os/freebsd/zfs/zfs_vnops.c +++ b/module/os/freebsd/zfs/zfs_vnops.c @@ -5941,7 +5941,7 @@ zfs_getextattr(struct vop_getextattr_args *ap) flags = FREAD; NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td); - error = vn_open_cred(&nd, &flags, VN_OPEN_INVFS, 0, ap->a_cred, NULL); + error = vn_open_cred(&nd, &flags, 0, VN_OPEN_INVFS, ap->a_cred, NULL); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { From 1b9cd1a9d9ebd213df1427d38e75d3233c873bde Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Thu, 14 May 2020 20:48:29 -0700 Subject: [PATCH 24/27] Fix error handling in receive_writer_thread() If `receive_writer_thread()` gets an error from `receive_process_record()`, it should be saved in `rwa->err` so that we will stop processing records, and the main thread will notice that the receive has failed. When an error is first encountered, this happens correctly. However, if there are more records to dequeue, the next time through the loop we will reset `rwa->err` to zero, allowing us to try to process the following record (2 after the failed record). Depending on what types of records remain, we may incorrectly complete the receive "successfully", but without actually having processed all the records. The fix is to only set `rwa->err` if we got a *non-zero* error. This bug was introduced by #10099 "Improve zfs receive performance by batching writes". Reviewed-by: Brian Behlendorf Reviewed-by: Paul Dagnelie Signed-off-by: Matthew Ahrens Closes #10320 --- module/zfs/dmu_recv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index ed52b25e6187..29fbe854d793 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -2572,7 +2572,8 @@ receive_writer_thread(void *arg) * free it. */ if (err != EAGAIN) { - rwa->err = err; + if (rwa->err == 0) + rwa->err = err; kmem_free(rrd, sizeof (*rrd)); } } From 4d6043f2b74ded58d36d3254acd12da16023c844 Mon Sep 17 00:00:00 2001 From: ColMelvin Date: Thu, 14 May 2020 22:51:33 -0500 Subject: [PATCH 25/27] RPM: Remove old versions of DKMS on upgrade Due to a mismatch between the text and a regex looking for that text, the `%preuninstall` script would never run the `dkms remove` command necessary to avoid corrupting the DKMS data configuration. Increase regex specificity to avoid this issue. 
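To see why the tighter pattern below matters, consider a minimal sketch. The
header contents here are hypothetical, and \s is a GNU awk extension, which
the updated spec file assumes:

    # Build a stand-in for %{module}_config.h:
    printf '%s\n' '/* META_ALIAS is generated at configure time */' \
        '#define ZFS_META_ALIAS "zfs-0.8.4-1"' > /tmp/zfs_config.h

    # The loose pattern matches the comment line first and, because of the
    # 'exit 0', prints only that line's empty second quote-delimited field,
    # so the version comparison in %preuninstall can never succeed:
    awk -F'"' '/META_ALIAS/ { print $2; exit 0 }' /tmp/zfs_config.h

    # Requiring whitespace plus an opening quote matches only the #define,
    # printing zfs-0.8.4-1 as intended:
    awk -F'"' '/META_ALIAS\s+"/ { print $2; exit 0 }' /tmp/zfs_config.h
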
Reviewed-by: Tony Hutter
Reviewed-by: Brian Behlendorf
Signed-off-by: Chris Lindee
Closes: #9891
Closes #10327
---
 rpm/generic/zfs-dkms.spec.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rpm/generic/zfs-dkms.spec.in b/rpm/generic/zfs-dkms.spec.in
index f849125a4a7c..29e8fd7dde0d 100644
--- a/rpm/generic/zfs-dkms.spec.in
+++ b/rpm/generic/zfs-dkms.spec.in
@@ -93,7 +93,7 @@ fi
 CONFIG_H="/var/lib/dkms/%{module}/%{version}/*/*/%{module}_config.h"
 SPEC_META_ALIAS="@PACKAGE@-@VERSION@-@RELEASE@"
 DKMS_META_ALIAS=`cat $CONFIG_H 2>/dev/null |
-	awk -F'"' '/META_ALIAS/ { print $2; exit 0 }'`
+	awk -F'"' '/META_ALIAS\s+"/ { print $2; exit 0 }'`
 if [ "$SPEC_META_ALIAS" = "$DKMS_META_ALIAS" ]; then
 	echo -e
 	echo -e "Uninstall of %{module} module ($SPEC_META_ALIAS) beginning:"

From d2782af461c62350abb5abe439b57961c72f9cee Mon Sep 17 00:00:00 2001
From: Ryan Moeller
Date: Sat, 16 May 2020 13:10:38 -0400
Subject: [PATCH 26/27] Fix ZVOL_DIR

We only use ZVOL_DIR on FreeBSD, and on FreeBSD it isn't correct.
Move the definition to the file where it is needed, and define it as
/dev/zvol/.

Reviewed-by: Brian Behlendorf
Signed-off-by: Ryan Moeller
Closes #10337
---
 include/sys/fs/zfs.h            | 3 ---
 module/os/freebsd/zfs/zvol_os.c | 1 +
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 39be630d8b3b..ecdfd42d01a8 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -1153,9 +1153,6 @@ typedef struct ddt_histogram {
 
 #define	ZFS_SUPER_MAGIC	0x2fc12fc1
 
-/* general zvol path */
-#define	ZVOL_DIR	"/dev"
-
 #define	ZVOL_MAJOR		230
 #define	ZVOL_MINOR_BITS	4
 #define	ZVOL_MINOR_MASK	((1U << ZVOL_MINOR_BITS) - 1)
diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c
index bef97a9b34ab..caef0b55b371 100644
--- a/module/os/freebsd/zfs/zvol_os.c
+++ b/module/os/freebsd/zfs/zvol_os.c
@@ -98,6 +98,7 @@
 
 #include "zfs_namecheck.h"
 
+#define	ZVOL_DIR	"/dev/zvol/"
 #define	ZVOL_DUMPSIZE	"dumpsize"
 
 #ifdef ZVOL_LOCK_DEBUG

From 5b090d57d4da09a3372b9566c46dfc51a13433c6 Mon Sep 17 00:00:00 2001
From: Kyle Evans
Date: Sat, 16 May 2020 12:12:01 -0500
Subject: [PATCH 27/27] freebsd: return EISDIR for read(2) on directories

This is arguably a change for internal consistency within OpenZFS, as
the Linux implementation will reject read(2) on directories with
EISDIR. It's not unreasonable for read(2) to do something here on
FreeBSD, but we don't currently copy out anything useful anyway, so
start rejecting it with the appropriate error.

Reviewed-by: Ryan Moeller
Reviewed-by: Alexander Motin
Reviewed-by: Brian Behlendorf
Signed-off-by: Kyle Evans
Closes #10338
---
 module/os/freebsd/zfs/zfs_vnops.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/module/os/freebsd/zfs/zfs_vnops.c b/module/os/freebsd/zfs/zfs_vnops.c
index 9f5e58446ff7..817c3bb9b46f 100644
--- a/module/os/freebsd/zfs/zfs_vnops.c
+++ b/module/os/freebsd/zfs/zfs_vnops.c
@@ -743,6 +743,12 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr)
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
+	/* We don't copy out anything useful for directories. */
+	if (vp->v_type == VDIR) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EISDIR));
+	}
+
 	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EACCES));
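
The user-visible effect of this last change can be sketched as follows. The
dataset path is hypothetical, and per the commit message the pre-patch read
succeeded without copying out anything useful; dd(1) is assumed to report the
errno it gets back from read(2):

    # On a patched FreeBSD system, reading a directory now fails explicitly,
    # with output along the lines of "dd: /tank/home: Is a directory":
    dd if=/tank/home of=/dev/null count=1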