diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index c940c8f462c5..20182fb265e8 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -130,6 +130,8 @@ static int zpool_do_version(int, char **);
 
 static int zpool_do_wait(int, char **);
 
+static int zpool_do_ddt_prune(int, char **);
+
 static int zpool_do_help(int argc, char **argv);
 
 static zpool_compat_status_t zpool_do_load_compat(
@@ -167,6 +169,7 @@ typedef enum {
 	HELP_CLEAR,
 	HELP_CREATE,
 	HELP_CHECKPOINT,
+	HELP_DDT_PRUNE,
 	HELP_DESTROY,
 	HELP_DETACH,
 	HELP_EXPORT,
@@ -343,6 +346,8 @@ static zpool_command_t command_table[] = {
 	{ "sync",	zpool_do_sync,		HELP_SYNC		},
 	{ NULL },
 	{ "wait",	zpool_do_wait,		HELP_WAIT		},
+	{ NULL },
+	{ "ddtprune",	zpool_do_ddt_prune,	HELP_DDT_PRUNE		},
 };
 
 #define	NCOMMAND	(ARRAY_SIZE(command_table))
@@ -456,6 +461,8 @@ get_usage(zpool_help_t idx)
 	case HELP_WAIT:
 		return (gettext("\twait [-Hp] [-T d|u] [-t <activity>[,...]] "
 		    "<pool> [interval]\n"));
+	case HELP_DDT_PRUNE:
+		return (gettext("\tddtprune -d|-p <amount> <pool>\n"));
 	default:
 		__builtin_unreachable();
 	}
@@ -11660,6 +11667,89 @@ found:;
 	return (error);
 }
 
+/*
+ * zpool ddtprune -d|-p <amount> <pool>
+ *
+ *       -d <days>	Prune entries <days> old and older
+ *       -p <percent>	Prune <percent> amount of entries
+ *
+ * Prune entries from DDT that have only a single reference to
+ * satisfy the amount specified.
+ */
+int
+zpool_do_ddt_prune(int argc, char **argv)
+{
+	zpool_ddt_prune_unit_t unit = ZPOOL_DDT_PRUNE_NONE;
+	uint64_t amount = 0;	/* strtoull() result; must not truncate */
+	zpool_handle_t *zhp;
+	char *endptr;
+	int c;
+
+	while ((c = getopt(argc, argv, "d:p:")) != -1) {
+		switch (c) {
+		case 'd':
+			if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) {
+				(void) fprintf(stderr, gettext("-d cannot be "
+				    "combined with -p option\n"));
+				usage(B_FALSE);
+			}
+			errno = 0;
+			amount = strtoull(optarg, &endptr, 0);
+			if (errno != 0 || *endptr != '\0' || amount == 0) {
+				(void) fprintf(stderr,
+				    gettext("invalid days value\n"));
+				usage(B_FALSE);
+			}
+			amount *= 86400;	/* convert days to seconds */
+			unit = ZPOOL_DDT_PRUNE_AGE;
+			break;
+		case 'p':
+			if (unit == ZPOOL_DDT_PRUNE_AGE) {
+				(void) fprintf(stderr, gettext("-p cannot be "
+				    "combined with -d option\n"));
+				usage(B_FALSE);
+			}
+			errno = 0;
+			amount = strtoull(optarg, &endptr, 0);
+			if (errno != 0 || *endptr != '\0' ||
+			    amount == 0 || amount > 100) {
+				(void) fprintf(stderr,
+				    gettext("invalid percentage value\n"));
+				usage(B_FALSE);
+			}
+			unit = ZPOOL_DDT_PRUNE_PERCENTAGE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	if (unit == ZPOOL_DDT_PRUNE_NONE) {
+		(void) fprintf(stderr,
+		    gettext("missing amount option (-d|-p <value>)\n"));
+		usage(B_FALSE);
+	} else if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool argument\n"));
+		usage(B_FALSE);
+	} else if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+	zhp = zpool_open(g_zfs, argv[0]);
+	if (zhp == NULL)
+		return (-1);
+
+	int error = zpool_ddt_prune(zhp, unit, amount);
+
+	zpool_close(zhp);
+
+	return (error);
+}
+
 static int
 find_command_idx(const char *command, int *idx)
 {
diff --git a/cmd/ztest.c b/cmd/ztest.c
index 6a9264ddcc4c..ee6c808413ae 100644
--- a/cmd/ztest.c
+++ b/cmd/ztest.c
@@ -446,6 +446,7 @@ ztest_func_t ztest_fletcher;
 ztest_func_t
ztest_fletcher_incr;
 ztest_func_t ztest_verify_dnode_bt;
 ztest_func_t ztest_pool_prefetch_ddt;
+ztest_func_t ztest_ddt_prune;
 
 static uint64_t zopt_always = 0ULL * NANOSEC;		/* all the time */
 static uint64_t zopt_incessant = 1ULL * NANOSEC / 10;	/* every 1/10 second */
@@ -502,6 +503,7 @@ static ztest_info_t ztest_info[] = {
 	ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
 	ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
 	ZTI_INIT(ztest_pool_prefetch_ddt, 1, &zopt_rarely),
+	ZTI_INIT(ztest_ddt_prune, 1, &zopt_rarely),
 };
 
 #define	ZTEST_FUNCS	(sizeof (ztest_info) / sizeof (ztest_info_t))
@@ -7288,6 +7290,21 @@ ztest_trim(ztest_ds_t *zd, uint64_t id)
 	mutex_exit(&ztest_vdev_lock);
 }
 
+/*
+ * Ask the DDT to prune ztest_random(15) + 1 percent of its unique
+ * entries; the result is ignored.
+ */
+void
+ztest_ddt_prune(ztest_ds_t *zd, uint64_t id)
+{
+	(void) zd, (void) id;
+
+	spa_t *spa = ztest_spa;
+	int32_t pct = ztest_random(15) + 1;
+
+	(void) ddt_prune_unique_entries(spa, ZPOOL_DDT_PRUNE_PERCENTAGE, pct);
+}
+
 /*
  * Verify pool integrity by running zdb.
  */
diff --git a/include/libzfs.h b/include/libzfs.h
index 979b919ce2fa..96bce6e8ea4b 100644
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -304,6 +304,9 @@ _LIBZFS_H int zpool_reopen_one(zpool_handle_t *, void *);
 
 _LIBZFS_H int zpool_sync_one(zpool_handle_t *, void *);
 
+_LIBZFS_H int zpool_ddt_prune(zpool_handle_t *, zpool_ddt_prune_unit_t,
+    uint64_t);
+
 _LIBZFS_H int zpool_vdev_online(zpool_handle_t *, const char *, int,
     vdev_state_t *);
 _LIBZFS_H int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t);
diff --git a/include/libzfs_core.h b/include/libzfs_core.h
index 8acb39171ae0..56273abbfed3 100644
--- a/include/libzfs_core.h
+++ b/include/libzfs_core.h
@@ -160,6 +160,9 @@ _LIBZFS_CORE_H int lzc_set_vdev_prop(const char *, nvlist_t *, nvlist_t **);
 
 _LIBZFS_CORE_H int lzc_scrub(zfs_ioc_t, const char *, nvlist_t *, nvlist_t **);
 
+_LIBZFS_CORE_H int lzc_ddt_prune(const char *, zpool_ddt_prune_unit_t,
+    uint64_t);
+
 #ifdef	__cplusplus
 }
 #endif
diff --git a/include/sys/ddt.h b/include/sys/ddt.h
index
7ef51d8ad90f..444f0b2d8003 100644
--- a/include/sys/ddt.h
+++ b/include/sys/ddt.h
@@ -143,6 +143,7 @@ typedef struct {
 	dva_t		ddp_dva[SPA_DVAS_PER_BP];
 	uint64_t	ddp_refcnt;
 	uint64_t	ddp_phys_birth;
+	uint64_t	ddp_class_birth;
 } ddt_phys_t;
 
 typedef struct {
@@ -176,6 +177,7 @@ typedef struct {
 #define	DDE_FLAG_LOADED		(1 << 0)	/* entry ready for use */
 #define	DDE_FLAG_OVERQUOTA	(1 << 1)	/* entry unusable, no space */
 #define	DDE_FLAG_LOGGED		(1 << 2)	/* loaded from log */
+#define	DDE_FLAG_PRUNE_WANTED	(1 << 3)	/* prune has been requested */
 
 /*
  * Additional data to support entry update or repair. This is fixed size
@@ -362,6 +364,9 @@ extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb,
 
 extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp);
 
+extern int ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit,
+    uint64_t amount);
+
 #ifdef	__cplusplus
 }
 #endif
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
index 4358654c7484..f3ab435d78f0 100644
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -1419,7 +1419,7 @@ typedef enum {
  */
 typedef enum zfs_ioc {
 	/*
-	 * Core features - 88/128 numbers reserved.
+	 * Core features - 89/128 numbers reserved.
 	 */
 #ifdef __FreeBSD__
 	ZFS_IOC_FIRST =	0,
@@ -1516,6 +1516,7 @@ typedef enum zfs_ioc {
 	ZFS_IOC_VDEV_SET_PROPS,			/* 0x5a56 */
 	ZFS_IOC_POOL_SCRUB,			/* 0x5a57 */
 	ZFS_IOC_POOL_PREFETCH,			/* 0x5a58 */
+	ZFS_IOC_DDT_PRUNE,			/* 0x5a59 */
 
 	/*
 	 * Per-platform (Optional) - 8/128 numbers reserved.
@@ -1652,6 +1653,12 @@ typedef enum {
 	ZPOOL_PREFETCH_DDT
 } zpool_prefetch_type_t;
 
+typedef enum {
+	ZPOOL_DDT_PRUNE_NONE,
+	ZPOOL_DDT_PRUNE_AGE,			/* in seconds */
+	ZPOOL_DDT_PRUNE_PERCENTAGE,		/* 1 - 100 */
+} zpool_ddt_prune_unit_t;
+
 /*
  * Bookmark name values.
  */
@@ -1745,6 +1752,12 @@ typedef enum {
  */
 #define	ZPOOL_PREFETCH_TYPE	"prefetch_type"
 
+/*
+ * The following are names used when invoking ZFS_IOC_DDT_PRUNE.
+ */
+#define	DDT_PRUNE_UNIT		"ddt_prune_unit"
+#define	DDT_PRUNE_AMOUNT	"ddt_prune_amount"
+
 /*
  * Flags for ZFS_IOC_VDEV_SET_STATE
  */
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 47f349327461..8cf1b808c960 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -411,6 +411,7 @@ struct spa {
 	uint64_t	spa_dedup_dspace;	/* Cache get_dedup_dspace() */
 	uint64_t	spa_dedup_checksum;	/* default dedup checksum */
 	uint64_t	spa_dspace;		/* dspace in normal class */
+	boolean_t	spa_active_ddt_prune;	/* ddt prune process active */
 	struct brt	*spa_brt;		/* in-core BRT */
 	kmutex_t	spa_vdev_top_lock;	/* dueling offline/remove */
 	kmutex_t	spa_proc_lock;		/* protects spa_proc* */
diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c
index 4782606b5492..145ac15bb47a 100644
--- a/lib/libzfs/libzfs_pool.c
+++ b/lib/libzfs/libzfs_pool.c
@@ -5549,3 +5549,36 @@ zpool_set_vdev_prop(zpool_handle_t *zhp, const char *vdevname,
 
 	return (ret);
 }
+
+/*
+ * Prune entries from the dedup table of the pool 'zhp' by calling
+ * lzc_ddt_prune() with the given unit and amount.
+ *
+ * Returns 0 on success or when lzc_ddt_prune() returns ENODATA.  Any
+ * other error is reported through the libzfs handle and -1 is returned.
+ */
+int
+zpool_ddt_prune(zpool_handle_t *zhp, zpool_ddt_prune_unit_t unit,
+    uint64_t amount)
+{
+	int error = lzc_ddt_prune(zhp->zpool_name, unit, amount);
+	if (error != 0 && error != ENODATA) {
+		libzfs_handle_t *hdl = zhp->zpool_hdl;
+		char errbuf[ERRBUFLEN];
+
+		(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+		    "cannot prune dedup table on '%s'"), zhp->zpool_name);
+
+		if (error == EALREADY) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "a prune operation is already in progress"));
+			(void) zfs_error(hdl, EZFS_BUSY, errbuf);
+		} else {
+			/* errno may be stale by now; use lzc's result */
+			(void) zpool_standard_error(hdl, error, errbuf);
+		}
+		return (-1);
+	}
+
+	return (0);
+}
diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c
index 0cec82e5662c..706081c5b976 100644
--- a/lib/libzfs_core/libzfs_core.c
+++ b/lib/libzfs_core/libzfs_core.c
@@ -1921,3 +1921,25 @@ lzc_get_bootenv(const char *pool, nvlist_t **outnvl)
 {
 	return (lzc_ioctl(ZFS_IOC_GET_BOOTENV, pool, NULL, outnvl));
 }
+
+/*
+ * Prune the specified
amount from the pool's dedup table.
+ */
+int
+lzc_ddt_prune(const char *pool, zpool_ddt_prune_unit_t unit, uint64_t amount)
+{
+	int error;
+
+	nvlist_t *result = NULL;
+	nvlist_t *args = fnvlist_alloc();
+
+	fnvlist_add_int32(args, DDT_PRUNE_UNIT, unit);
+	fnvlist_add_uint64(args, DDT_PRUNE_AMOUNT, amount);
+
+	error = lzc_ioctl(ZFS_IOC_DDT_PRUNE, pool, args, &result);
+
+	fnvlist_free(args);
+	fnvlist_free(result);
+
+	return (error);
+}
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index b90306c28f1d..f5a5dcb9ddae 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -226,6 +226,17 @@ int zfs_dedup_prefetch = 0;
  */
 uint_t dedup_class_wait_txgs = 5;
 
+/*
+ * How many DDT prune entries to add to the DDT sync AVL tree.
+ * Note these additional entries have a memory footprint of a
+ * ddt_entry_t (448 bytes).
+ */
+static uint32_t zfs_ddt_prunes_per_txg = 50000;
+
+/*
+ * For testing, synthesize aged DDT entries
+ */
+static boolean_t ddt_prune_artificial_age = B_FALSE;
 
 /*
  * Don't do more than this many incremental flush rounds per txg.
@@ -678,10 +689,38 @@ ddt_phys_clear(ddt_phys_t *ddp)
 	memset(ddp, 0, sizeof (*ddp));
 }
 
+static uint64_t
+ddt_class_birth(void)
+{
+	uint64_t birth = gethrestime_sec();
+
+	if (ddt_prune_artificial_age) {
+		/*
+		 * debug aid -- simulate a wider distribution
+		 * so we don't have to wait for an aged DDT
+		 * to test prune.
+		 */
+		uint64_t one_day_old = 60 * 60 * 24;
+		int days = random_in_range(60);
+
+		if (random_in_range(100) < 10)
+			days += random_in_range(30);
+		if (random_in_range(100) < 5)
+			days += random_in_range(10);
+		birth -= one_day_old * days;
+	}
+	return (birth);
+}
+
 void
 ddt_phys_addref(ddt_phys_t *ddp)
 {
 	ddp->ddp_refcnt++;
+
+	if (ddp->ddp_refcnt == 1)
+		ddp->ddp_class_birth = ddt_class_birth();
+	else
+		ddp->ddp_class_birth = 0;
 }
 
 void
@@ -690,6 +729,11 @@ ddt_phys_decref(ddt_phys_t *ddp)
 	if (ddp) {
 		ASSERT3U(ddp->ddp_refcnt, >, 0);
 		ddp->ddp_refcnt--;
+
+		if (ddp->ddp_refcnt == 1)
+			ddp->ddp_class_birth = ddt_class_birth();
+		else
+			ddp->ddp_class_birth = 0;
 	}
 }
 
@@ -1721,6 +1765,18 @@ ddt_sync_flush_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
 	ddt_class_t nclass =
 	    (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE;
 
+	/*
+	 * When a prune is requested and the entry is still in the unique
+	 * class, then treat as having no references and remove it.
+	 */
+/* XXX prune rework -- robn, 2024-02-08
+	if ((dde->dde_flags & DDE_FLAG_PRUNE_WANTED) &&
+	    oclass == DDT_CLASS_UNIQUE &&
+	    total_refcnt == 1) {
+		total_refcnt = 0;
+	}
+*/
+
 	/*
 	 * If an existing entry changed type or class, or its refcount reached
 	 * zero, delete it from the DDT object
@@ -2192,8 +2248,8 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe)
  * counter for the DDT entry if the block is already in DDT.
  *
  * Return false if the block, despite having the D bit set, is not present
- * in the DDT. Currently this is not possible but might be in the future.
- * See the comment below.
+ * in the DDT. This is possible when the DDT has been pruned by an admin
+ * or by the DDT quota mechanism.
*/ boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp) @@ -2230,12 +2286,9 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) result = B_TRUE; } else { /* - * At the time of implementating this if the block has the - * DEDUP flag set it must exist in the DEDUP table, but - * there are many advocates that want ability to remove - * entries from DDT with refcnt=1. If this will happen, - * we may have a block with the DEDUP set, but which doesn't - * have a corresponding entry in the DDT. Be ready. + * If the block has the DEDUP flag set it still might not + * exist in the DEDUP table due to DDT pruning of entries + * where refcnt=1. */ ddt_remove(ddt, dde); result = B_FALSE; @@ -2247,6 +2300,256 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) return (result); } +typedef struct ddt_prune_entry { + ddt_t *dpe_ddt; + ddt_key_t dpe_key; + ddt_phys_t dpe_phys; + list_node_t dpe_node; +} ddt_prune_entry_t; + +typedef struct ddt_prune_info { + spa_t *dpi_spa; + int dpi_txg_syncs; + int dpi_pruned; + list_t dpi_candidates; +} ddt_prune_info_t; + +/* + * Add prune candidates for ddt_sync during spa_sync + */ +static void +prune_candidates_sync(void *arg, dmu_tx_t *tx) +{ + (void) tx; + ddt_prune_info_t *dpi = arg; + ddt_prune_entry_t *dpe; + int count = 0; + + spa_config_enter(dpi->dpi_spa, SCL_ZIO, FTAG, RW_READER); + + /* Process the prune candidates collected so far */ + while ((dpe = list_remove_head(&dpi->dpi_candidates)) != NULL) { + blkptr_t blk; + ddt_t *ddt = dpe->dpe_ddt; + + ddt_bp_create(ddt->ddt_checksum, &dpe->dpe_key, + &dpe->dpe_phys, &blk); + + ddt_enter(ddt); + ddt_entry_t *dde = ddt_lookup(ddt, &blk, B_TRUE); + if (dde != NULL) { + ASSERT(dde->dde_flags & DDE_FLAG_LOADED); + /* + * Inform ddt_sync_entry() that we want to + * prune this entry + */ + dde->dde_flags |= DDE_FLAG_PRUNE_WANTED; + count++; + } + ddt_exit(ddt); + + kmem_free(dpe, sizeof (*dpe)); + dpi->dpi_pruned++; + } + + spa_config_exit(dpi->dpi_spa, SCL_ZIO, FTAG); + dpi->dpi_txg_syncs++; +} + +/* + 
* Prune candidates are collected in open context and processed + * in sync context as part of ddt_sync_table(). + */ +static void +ddt_prune_entry(list_t *list, ddt_t *ddt, const ddt_key_t *ddk, + const ddt_phys_t *ddp) +{ + ddt_prune_entry_t *dpe = kmem_alloc(sizeof (*dpe), KM_SLEEP); + + dpe->dpe_ddt = ddt; + dpe->dpe_key = *ddk; + dpe->dpe_phys = *ddp; + list_insert_head(list, dpe); +} + +/* + * We use a histogram to convert a percentage request into a + * cutoff value where entries older than the cutoff get pruned. + * + * The histogram bins represent hours in power-of-two increments. + * 16 bins covers up to four years. + */ +#define HIST_BINS 16 + +typedef struct ddt_age_histo { + uint64_t dah_entries; + uint64_t dah_age_histo[HIST_BINS]; +} ddt_age_histo_t; + +/* + * Interate over all the entries in the DDT unique class. + * The walk will perform one of the following operations: + * (a) build a histogram than can be used when pruning + * (b) prune entries older than the cutoff + */ +static void +ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram) +{ + ddt_bookmark_t ddb = { + .ddb_class = DDT_CLASS_UNIQUE, + .ddb_type = 0, + .ddb_checksum = 0, + .ddb_cursor = 0 + }; + ddt_lightweight_entry_t ddlwe = {0}; + int error; + int total = 0, valid = 0; + int candidates = 0; + uint64_t today = gethrestime_sec(); + ddt_prune_info_t dpi; + boolean_t pruning = (cutoff != 0); + + if (pruning) { + dpi.dpi_txg_syncs = 0; + dpi.dpi_pruned = 0; + dpi.dpi_spa = spa; + list_create(&dpi.dpi_candidates, sizeof (ddt_prune_entry_t), + offsetof(ddt_prune_entry_t, dpe_node)); + } + + if (histogram != NULL) + memset(histogram, 0, sizeof (ddt_age_histo_t)); + + while ((error = ddt_walk(spa, &ddb, &ddlwe)) == 0) { + ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; + VERIFY(ddt); + + if (spa_shutting_down(spa) || + (issig(JUSTLOOKING) && issig(FORREAL))) { + break; + } + total++; + + for (int p = 0; p < ddlwe.ddlwe_nphys; p++) { + const ddt_phys_t *ddp = 
&ddlwe.ddlwe_phys[p]; + + if (ddp->ddp_class_birth == 0) + continue; + + ASSERT(ddp->ddp_refcnt == 1); + + /* prune older entries */ + if (pruning && ddp->ddp_class_birth < cutoff) { + if (candidates++ >= zfs_ddt_prunes_per_txg) { + /* sync prune candidates in batches */ + VERIFY0(dsl_sync_task(spa_name(spa), + NULL, prune_candidates_sync, + &dpi, 0, ZFS_SPACE_CHECK_NONE)); + candidates = 1; + } + ddt_prune_entry(&dpi.dpi_candidates, ddt, + &ddlwe.ddlwe_key, ddp); + } + + /* build a histogram */ + if (histogram != NULL) { + uint64_t age = + (today - ddp->ddp_class_birth) / 3600; + int bin = + MIN(highbit64(age) - 1, HIST_BINS - 1); + histogram->dah_entries++; + histogram->dah_age_histo[bin]++; + } + + valid++; + } + } + + if (pruning) { + if (!list_is_empty(&dpi.dpi_candidates)) { + /* sync out final batch of prune candidates */ + VERIFY0(dsl_sync_task(spa_name(spa), NULL, + prune_candidates_sync, &dpi, 0, + ZFS_SPACE_CHECK_NONE)); + } + list_destroy(&dpi.dpi_candidates); + + zfs_dbgmsg("pruned %d entries (%d%%) across %d txg syncs", + dpi.dpi_pruned, (dpi.dpi_pruned * 100) / valid, + dpi.dpi_txg_syncs); + } +} + +static uint64_t +ddt_total_entries(spa_t *spa) +{ + ddt_object_t ddo; + ddt_get_dedup_object_stats(spa, &ddo); + + return (ddo.ddo_count); +} + +int +ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit, + uint64_t amount) +{ + uint64_t cutoff; + uint64_t start_time = gethrtime(); + + if (spa->spa_active_ddt_prune) + return (EALREADY); + if (ddt_total_entries(spa) == 0) + return (ENODATA); + + spa->spa_active_ddt_prune = B_TRUE; + + zfs_dbgmsg("prune %llu %s", (u_longlong_t)amount, + unit == ZPOOL_DDT_PRUNE_PERCENTAGE ? 
"%" : "seconds old or older"); + + if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) { + ddt_age_histo_t histogram; + uint64_t oldest = 0; + + /* Make a pass over DDT to build a histogram */ + ddt_prune_walk(spa, 0, &histogram); + + int target = (histogram.dah_entries * amount) / 100; + + /* + * Figure out our cutoff date + * (i.e., which bins to prune from) + */ + for (int i = HIST_BINS - 1; i >= 0 && target > 0; i--) { + if (histogram.dah_age_histo[i] != 0) { + /* less than this bucket remaining */ + if (target < histogram.dah_age_histo[i]) { + oldest = MAX(1, (1< 0 && !spa_shutting_down(spa) && + !(issig(JUSTLOOKING) && issig(FORREAL))) { + /* Traverse DDT to prune entries older that our cuttoff */ + ddt_prune_walk(spa, cutoff, NULL); + } + + zfs_dbgmsg("%s: prune completed in %llu seconds", + spa_name(spa), (u_longlong_t)NSEC2SEC(gethrtime() - start_time)); + + spa->spa_active_ddt_prune = B_FALSE; + return (0); +} + ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW, "Enable prefetching dedup-ed blks"); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index eca1720c92b3..1194ba4f9219 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -4314,6 +4314,46 @@ zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) return (total_errors > 0 ? 
SET_ERROR(EINVAL) : 0);
 }
 
+#define	DDT_PRUNE_UNIT		"ddt_prune_unit"
+#define	DDT_PRUNE_AMOUNT	"ddt_prune_amount"
+
+/*
+ * innvl: {
+ *	"ddt_prune_unit" -> int32_t
+ *	"ddt_prune_amount" -> uint64_t
+ * }
+ *
+ * outnvl: unused
+ */
+static const zfs_ioc_key_t zfs_keys_ddt_prune[] = {
+	{DDT_PRUNE_UNIT,	DATA_TYPE_INT32,	0},
+	{DDT_PRUNE_AMOUNT,	DATA_TYPE_UINT64,	0},
+};
+
+static int
+zfs_ioc_ddt_prune(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+	int32_t unit;
+	uint64_t amount;
+
+	if (nvlist_lookup_int32(innvl, DDT_PRUNE_UNIT, &unit) != 0 ||
+	    nvlist_lookup_uint64(innvl, DDT_PRUNE_AMOUNT, &amount) != 0) {
+		return (SET_ERROR(EINVAL));
+	}
+
+	spa_t *spa;
+	int error = spa_open(poolname, &spa, FTAG);
+	if (error != 0)
+		return (error);
+
+	error = ddt_prune_unique_entries(spa, (zpool_ddt_prune_unit_t)unit,
+	    amount);
+
+	spa_close(spa, FTAG);
+
+	return (error);
+}
+
 /*
  * This ioctl waits for activity of a particular type to complete. If there is
  * no activity of that type in progress, it returns immediately, and the
@@ -7402,6 +7442,11 @@ zfs_ioctl_init(void)
 	    POOL_CHECK_NONE, B_FALSE, B_FALSE,
 	    zfs_keys_get_props, ARRAY_SIZE(zfs_keys_get_props));
 
+	zfs_ioctl_register("zpool_ddt_prune", ZFS_IOC_DDT_PRUNE,
+	    zfs_ioc_ddt_prune, zfs_secpolicy_config, POOL_NAME,
+	    POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE,
+	    zfs_keys_ddt_prune, ARRAY_SIZE(zfs_keys_ddt_prune));
+
 	/* IOCTLS that use the legacy function signature */
 
 	zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 4e25db3e9297..43675cf89e73 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -3817,7 +3817,7 @@ zio_ddt_free(zio_t *zio)
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	ddt_t *ddt = ddt_select(spa, bp);
-	ddt_entry_t *dde;
+	ddt_entry_t *dde = NULL;
 	ddt_phys_t *ddp;
 
 	ASSERT(BP_GET_DEDUP(bp));
@@ -3832,6 +3832,18 @@ zio_ddt_free(zio_t *zio)
 	}
 	ddt_exit(ddt);
 
+	/*
+	 * When no entry was found, it must have been pruned,
+	 * so we can free it now instead of decrementing the
+	 * refcount in the DDT.
+	 */
+	if (dde != NULL &&
+	    dde->dde_type == DDT_TYPES &&
+	    dde->dde_class == DDT_CLASSES) {
+		BP_SET_DEDUP(bp, 0);
+		zio->io_pipeline |= ZIO_STAGE_DVA_FREE;
+	}
+
 	return (zio);
 }