From 6e5eb6d925d2424aeb978a81438f5f81cb7b0e49 Mon Sep 17 00:00:00 2001
From: Chunwei Chen
Date: Tue, 30 Jan 2018 13:39:11 -0800
Subject: [PATCH 1/6] Fix zdb -c traverse stop on damaged objset root

If a corruption happens to be on a root block of an objset, zdb -c will
not correctly report the error, and it will not traverse the datasets
that come after it. This is because traverse_visitbp, which issues the
callback and resets the error for TRAVERSE_HARD, is skipped when
traversing the ZIL fails in traverse_impl.

Here's an example of what the 'zdb -eLcc' output looks like on a pool
with a damaged objset root:

== before patch:

Traversing all blocks to verify checksums ...

Error counts:

	errno  count

	block traversal size 379392 != alloc 33987072 (unreachable 33607680)

	bp count:            172
	ganged count:          0
	bp logical:      1678336    avg:   9757
	bp physical:      130560    avg:    759    compression: 12.85
	bp allocated:     379392    avg:   2205    compression:  4.42
	bp deduped:            0    ref>1:     0   deduplication: 1.00
	SPA allocated:  33987072    used:  0.80%

	additional, non-pointer bps of type 0:     71
	Dittoed blocks on same vdev: 101

== after patch:

Traversing all blocks to verify checksums ...

zdb_blkptr_cb: Got error 52 reading <54, 0, -1, 0> -- skipping

Error counts:

	errno  count
	   52      1

	block traversal size 33963520 != alloc 33987072 (unreachable 23552)

	bp count:            447
	ganged count:          0
	bp logical:     36093440    avg:  80745
	bp physical:    33699840    avg:  75391    compression:  1.07
	bp allocated:   33963520    avg:  75981    compression:  1.06
	bp deduped:            0    ref>1:     0   deduplication: 1.00
	SPA allocated:  33987072    used:  0.80%

	additional, non-pointer bps of type 0:     76
	Dittoed blocks on same vdev: 115

==

Signed-off-by: Chunwei Chen
---
 module/zfs/dmu_traverse.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c
index 280e0ee347e9..15d29198fb7e 100644
--- a/module/zfs/dmu_traverse.c
+++ b/module/zfs/dmu_traverse.c
@@ -634,12 +634,20 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
 
 		err = arc_read(NULL, td->td_spa, rootbp, arc_getbuf_func,
 		    &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, czb);
-		if (err != 0)
-			return (err);
-
-		osp = buf->b_data;
-		traverse_zil(td, &osp->os_zil_header);
-		arc_buf_destroy(buf, &buf);
+		if (err != 0) {
+			/*
+			 * If both TRAVERSE_HARD and TRAVERSE_PRE are set,
+			 * continue to visitbp so that td_func can be called
+			 * in the pre stage, and err will be reset to zero.
+			 */
+			if (!(td->td_flags & TRAVERSE_HARD) ||
+			    !(td->td_flags & TRAVERSE_PRE))
+				return (err);
+		} else {
+			osp = buf->b_data;
+			traverse_zil(td, &osp->os_zil_header);
+			arc_buf_destroy(buf, &buf);
+		}
 	}
 
 	if (!(flags & TRAVERSE_PREFETCH_DATA) ||

From 5d95c958b35ed0e7ed5b09a47b99f252ebc04861 Mon Sep 17 00:00:00 2001
From: Chunwei Chen
Date: Thu, 1 Feb 2018 15:41:05 -0800
Subject: [PATCH 2/6] Fix zle_decompress out-of-bounds access

Signed-off-by: Chunwei Chen
---
 module/zfs/zle.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/module/zfs/zle.c b/module/zfs/zle.c
index 13c5673fbe26..613607faaa97 100644
--- a/module/zfs/zle.c
+++ b/module/zfs/zle.c
@@ -74,10 +74,14 @@ zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
 	while (src < s_end && dst < d_end) {
 		int len = 1 + *src++;
 		if (len <= n) {
+			if (src + len > s_end || dst + len > d_end)
+				return (-1);
 			while (len-- != 0)
 				*dst++ = *src++;
 		} else {
 			len -= n;
+			if (dst + len > d_end)
+				return (-1);
 			while (len-- != 0)
 				*dst++ = 0;
 		}

From 9bafb7627d2d293f97159a8aa5f6057a08a9ba15 Mon Sep 17 00:00:00 2001
From: Chunwei Chen
Date: Thu, 1 Feb 2018 15:42:41 -0800
Subject: [PATCH 3/6] Fix racy assignment of zcb.zcb_haderrors

zcb_haderrors is modified in zdb_blkptr_done, which runs asynchronously,
so we must move this assignment to after zio_wait.

Signed-off-by: Chunwei Chen
---
 cmd/zdb/zdb.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 3719d22e14ed..c381d02a39b3 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -3388,7 +3388,7 @@ dump_block_stats(spa_t *spa)
 	int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
 	    TRAVERSE_NO_DECRYPT | TRAVERSE_HARD;
 	boolean_t leaks = B_FALSE;
-	int e, c;
+	int e, c, err;
 	bp_embedded_type_t i;
 
 	bzero(&zcb, sizeof (zcb));
@@ -3430,7 +3430,7 @@ dump_block_stats(spa_t *spa)
 	zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa));
 	zcb.zcb_start = zcb.zcb_lastprint = gethrtime();
 
-	zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
+	err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb);
 
 	/*
 	 * If we've traversed the data blocks then we need to wait for those
@@ -3446,6 +3446,12 @@ dump_block_stats(spa_t *spa)
 		}
 	}
 
+	/*
+	 * Done after zio_wait() since zcb_haderrors is modified in
+	 * zdb_blkptr_done()
+	 */
+	zcb.zcb_haderrors |= err;
+
 	if (zcb.zcb_haderrors) {
 		(void) printf("\nError counts:\n\n");
 		(void) printf("\t%5s %s\n", "errno", "count");

From a514714bde777641beef569988517ac2eeac8255 Mon Sep 17 00:00:00 2001
From: Chunwei Chen
Date: Thu, 1 Feb 2018 16:19:36 -0800
Subject: [PATCH 4/6] Fix zdb -R decompression

There are some issues in the zdb -R decompression implementation.

The first is that ZLE can easily decompress non-ZLE streams, so we add
a ZDB_NO_ZLE environment variable to make zdb skip ZLE when guessing.

The second is the random bytes appended to pabd and pbuf2. This serves
no purpose at all; those bytes shouldn't be read during decompression
anyway. Instead, we randomize lbuf2 so that we can make sure
decompression fills exactly to lsize, by comparing lbuf and lbuf2 with
bcmp.

The last one is that the condition used to detect failure is wrong.
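
Usage illustration only (the pool name, vdev index, offset, and size
below are made-up placeholder values, not taken from this patch): the
new environment variable is intended to be used when the 'd' flag asks
zdb -R to guess and decompress a block, since ZLE would otherwise
"decompress" almost any stream and mask the real algorithm:

    # Skip the false-positive-prone ZLE guesses while decompressing.
    ZDB_NO_ZLE=1 zdb -R tank 0:400000:20000:d
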
Signed-off-by: Chunwei Chen
---
 cmd/zdb/zdb.c  | 38 ++++++++++++++++++--------------------
 man/man8/zdb.8 |  4 +++-
 2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index c381d02a39b3..4c0a8245fb9e 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -3984,13 +3984,6 @@ zdb_vdev_lookup(vdev_t *vdev, const char *path)
 	return (NULL);
 }
 
-/* ARGSUSED */
-static int
-random_get_pseudo_bytes_cb(void *buf, size_t len, void *unused)
-{
-	return (random_get_pseudo_bytes(buf, len));
-}
-
 /*
  * Read a block from a pool and print it out. The syntax of the
  * block descriptor is:
@@ -4160,17 +4153,8 @@ zdb_read_block(char *thing, spa_t *spa)
 	 * every decompress function at every inflated blocksize.
 	 */
 	enum zio_compress c;
-	void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 	void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
 
-	abd_copy_to_buf(pbuf2, pabd, psize);
-
-	VERIFY0(abd_iterate_func(pabd, psize, SPA_MAXBLOCKSIZE - psize,
-	    random_get_pseudo_bytes_cb, NULL));
-
-	VERIFY0(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize,
-	    SPA_MAXBLOCKSIZE - psize));
-
 	/*
 	 * XXX - On the one hand, with SPA_MAXBLOCKSIZE at 16MB,
 	 * this could take a while and we should let the user know
@@ -4180,13 +4164,29 @@
 	for (lsize = psize + SPA_MINBLOCKSIZE; lsize <= SPA_MAXBLOCKSIZE;
 	    lsize += SPA_MINBLOCKSIZE) {
 		for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) {
+			/*
+			 * ZLE can easily decompress a non-ZLE stream,
+			 * so have an option to disable it.
+			 */
+			if (c == ZIO_COMPRESS_ZLE &&
+			    getenv("ZDB_NO_ZLE"))
+				continue;
+
 			(void) fprintf(stderr,
 			    "Trying %05llx -> %05llx (%s)\n",
 			    (u_longlong_t)psize, (u_longlong_t)lsize,
 			    zio_compress_table[c].ci_name);
+
+			/*
+			 * We randomize lbuf2, and decompress to both
+			 * lbuf and lbuf2. This way, we will know if
+			 * decompression fills exactly to lsize.
+			 */
+			VERIFY0(random_get_pseudo_bytes(lbuf2, lsize));
+
 			if (zio_decompress_data(c, pabd,
 			    lbuf, psize, lsize) == 0 &&
-			    zio_decompress_data_buf(c, pbuf2,
+			    zio_decompress_data(c, pabd,
 			    lbuf2, psize, lsize) == 0 &&
 			    bcmp(lbuf, lbuf2, lsize) == 0)
 				break;
@@ -4194,11 +4194,9 @@
 			if (c != ZIO_COMPRESS_FUNCTIONS)
 				break;
 		}
-
-	umem_free(pbuf2, SPA_MAXBLOCKSIZE);
 	umem_free(lbuf2, SPA_MAXBLOCKSIZE);
 
-	if (lsize <= psize) {
+	if (lsize > SPA_MAXBLOCKSIZE) {
 		(void) printf("Decompress of %s failed\n", thing);
 		goto out;
 	}
diff --git a/man/man8/zdb.8 b/man/man8/zdb.8
index d69d6f529d13..0ce4d852d890 100644
--- a/man/man8/zdb.8
+++ b/man/man8/zdb.8
@@ -248,7 +248,9 @@ and, optionally,
 .It Sy b Ar offset
 Print block pointer
 .It Sy d
-Decompress the block
+Decompress the block. Set environment variable
+.Nm ZDB_NO_ZLE
+to skip ZLE when guessing.
 .It Sy e
 Byte swap the block
 .It Sy g

From 14b919cfbfef147d68ad7619cf7e833bcf630664 Mon Sep 17 00:00:00 2001
From: Chunwei Chen
Date: Thu, 1 Feb 2018 16:28:11 -0800
Subject: [PATCH 5/6] Fix zdb -E segfault

SPA_MAXBLOCKSIZE (16MB) is too large for a stack allocation, so allocate
the buffer from the heap instead.

Signed-off-by: Chunwei Chen
---
 cmd/zdb/zdb.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 4c0a8245fb9e..7f2cab382500 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -4235,9 +4235,11 @@ zdb_embedded_block(char *thing)
 {
 	blkptr_t bp;
 	unsigned long long *words = (void *)&bp;
-	char buf[SPA_MAXBLOCKSIZE];
+	char *buf;
 	int err;
 
+	buf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+
 	bzero(&bp, sizeof (bp));
 	err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:"
 	    "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx",
@@ -4256,6 +4258,7 @@ zdb_embedded_block(char *thing)
 		exit(1);
 	}
 	zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0);
+	umem_free(buf, SPA_MAXBLOCKSIZE);
 }
 
 int

From cdf827520c81a2adbc208381f4675cd1ef7204f5 Mon Sep 17 00:00:00 2001
From: Chunwei Chen
Date: Thu, 1 Feb 2018 16:36:40 -0800
Subject: [PATCH 6/6] Fix zdb -ed on objset for exported pool

Running zdb -ed on an objset of an exported pool would fail with:

  failed to own dataset 'qq/fs0': No such file or directory

The reason is that zdb passes the objset name to spa_import, which uses
that name to create the spa. Later, when dmu_objset_own tries to look
up the spa using the real pool name, it can't find one. We fix this by
making sure we pass the pool name, rather than the objset name, to
spa_import.

Signed-off-by: Chunwei Chen
---
 cmd/zdb/zdb.c                                 | 32 ++++++----
 tests/runfiles/linux.run                      |  2 +-
 .../tests/functional/clean_mirror/cleanup.ksh |  4 +-
 .../tests/functional/cli_root/zdb/Makefile.am |  3 +-
 .../functional/cli_root/zdb/zdb_006_pos.ksh   | 64 +++++++++++++++++++
 5 files changed, 90 insertions(+), 15 deletions(-)
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_006_pos.ksh

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 7f2cab382500..063358a0432c 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -4273,7 +4273,7 @@ main(int argc, char **argv)
 	int error = 0;
 	char **searchdirs = NULL;
 	int nsearch = 0;
-	char *target;
+	char *target, *target_pool;
 	nvlist_t *policy = NULL;
 	uint64_t max_txg = UINT64_MAX;
 	int flags = ZFS_IMPORT_MISSING_LOG;
@@ -4476,6 +4476,20 @@ main(int argc, char **argv)
 	error = 0;
 	target = argv[0];
 
+	if (strpbrk(target, "/@") != NULL) {
+		size_t targetlen;
+
+		target_pool = strdup(target);
+		*strpbrk(target_pool, "/@") = '\0';
+
+		target_is_spa = B_FALSE;
+		targetlen = strlen(target);
+		if (targetlen && target[targetlen - 1] == '/')
+			target[targetlen - 1] = '\0';
+	} else {
+		target_pool = target;
+	}
+
 	if (dump_opt['e']) {
 		importargs_t args = { 0 };
 		nvlist_t *cfg = NULL;
@@ -4484,8 +4498,10 @@ main(int argc, char **argv)
 		args.path = searchdirs;
 		args.can_be_active = B_TRUE;
 
-		error = zpool_tryimport(g_zfs, target, &cfg, &args);
+		error = zpool_tryimport(g_zfs, target_pool, &cfg, &args);
+
 		if (error == 0) {
+
 			if (nvlist_add_nvlist(cfg, ZPOOL_REWIND_POLICY,
 			    policy) != 0) {
 				fatal("can't open '%s': %s",
@@ -4500,19 +4516,13 @@ main(int argc, char **argv)
 				(void) printf("\nConfiguration for import:\n");
 				dump_nvlist(cfg, 8);
 			}
-			error = spa_import(target, cfg, NULL,
+			error = spa_import(target_pool, cfg, NULL,
 			    flags | ZFS_IMPORT_SKIP_MMP);
 		}
 	}
 
-	if (strpbrk(target, "/@") != NULL) {
-		size_t targetlen;
-
-		target_is_spa = B_FALSE;
-		targetlen = strlen(target);
-		if (targetlen && target[targetlen - 1] == '/')
-			target[targetlen - 1] = '\0';
-	}
+	if (target_pool != target)
+		free(target_pool);
 
 	if (error == 0) {
 		if (target_is_spa || dump_opt['R']) {
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 16c10d41a7ff..6c60e0ba8631 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -77,7 +77,7 @@ tags = ['functional', 'clean_mirror']
 
 [tests/functional/cli_root/zdb]
 tests = ['zdb_001_neg', 'zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos',
-    'zdb_005_pos']
+    'zdb_005_pos', 'zdb_006_pos']
 pre =
 post =
 tags = ['functional', 'cli_root', 'zdb']
diff --git a/tests/zfs-tests/tests/functional/clean_mirror/cleanup.ksh b/tests/zfs-tests/tests/functional/clean_mirror/cleanup.ksh
index ac3bfbca8940..fb0db312ebba 100755
--- a/tests/zfs-tests/tests/functional/clean_mirror/cleanup.ksh
+++ b/tests/zfs-tests/tests/functional/clean_mirror/cleanup.ksh
@@ -38,10 +38,10 @@ df -F zfs -h | grep "$TESTFS " >/dev/null
 [[ $? == 0 ]] && log_must zfs umount -f $TESTDIR
 destroy_pool $TESTPOOL
 
-if is_mpath_device $MIRROR_PRIMARY; then
+if ( is_mpath_device $MIRROR_PRIMARY || is_loop_device $MIRROR_PRIMARY); then
 	parted $DEV_DSKDIR/$MIRROR_PRIMARY -s rm 1
 fi
-if is_mpath_device $MIRROR_SECONDARY; then
+if ( is_mpath_device $MIRROR_SECONDARY || is_loop_device $MIRROR_SECONDARY); then
 	parted $DEV_DSKDIR/$MIRROR_SECONDARY -s rm 1
 fi
 # recreate and destroy a zpool over the disks to restore the partitions to
diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zdb/Makefile.am
index 51170fbc894d..d37bcf607f46 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zdb/Makefile.am
+++ b/tests/zfs-tests/tests/functional/cli_root/zdb/Makefile.am
@@ -4,4 +4,5 @@ dist_pkgdata_SCRIPTS = \
 	zdb_002_pos.ksh \
 	zdb_003_pos.ksh \
 	zdb_004_pos.ksh \
-	zdb_005_pos.ksh
+	zdb_005_pos.ksh \
+	zdb_006_pos.ksh
diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_006_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_006_pos.ksh
new file mode 100755
index 000000000000..97b00e9e1996
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_006_pos.ksh
@@ -0,0 +1,64 @@
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2018 by Nutanix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# Description:
+# zdb -d will work on imported/exported pool with pool/dataset argument
+#
+# Strategy:
+# 1. Create a pool
+# 2. Run zdb -d with pool and dataset arguments.
+# 3. Export the pool
+# 4. Run zdb -ed with pool and dataset arguments.
+#
+
+function cleanup
+{
+	datasetexists $TESTPOOL && destroy_pool $TESTPOOL
+	for DISK in $DISKS; do
+		zpool labelclear -f $DEV_RDSKDIR/$DISK
+	done
+}
+
+log_assert "Verify zdb -d works on imported/exported pool with pool/dataset argument"
+log_onexit cleanup
+
+verify_runnable "global"
+verify_disk_count "$DISKS" 2
+
+default_mirror_setup_noexit $DISKS
+log_must zfs snap $TESTPOOL/$TESTFS@snap
+
+log_must zdb -d $TESTPOOL
+log_must zdb -d $TESTPOOL/
+log_must zdb -d $TESTPOOL/$TESTFS
+log_must zdb -d $TESTPOOL/$TESTFS@snap
+
+log_must zpool export $TESTPOOL
+
+log_must zdb -ed $TESTPOOL
+log_must zdb -ed $TESTPOOL/
+log_must zdb -ed $TESTPOOL/$TESTFS
+log_must zdb -ed $TESTPOOL/$TESTFS@snap
+
+log_must zpool import $TESTPOOL
+
+cleanup
+
+log_pass "zdb -d works on imported/exported pool with pool/dataset argument"
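
For reference, a minimal by-hand reproduction of the failure fixed by
the last patch (the pool layout, device names, and dataset below are
made-up examples; the error text is the one quoted in that patch's
commit message):

    zpool create qq mirror /dev/sdb /dev/sdc   # example pool, any layout works
    zfs create qq/fs0
    zpool export qq
    # Before this series: "failed to own dataset 'qq/fs0': No such file
    # or directory".  With it, the dataset of the exported pool dumps.
    zdb -ed qq/fs0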