Skip to content

Commit

Permalink
Add DDT prune command
Browse files Browse the repository at this point in the history
Requires the new 'flat' physical data which has the start
time for a class entry.

The amount to prune can be based on a target percentage of
the unique entries or based on the age (i.e., every entry
older than N days).

Sponsored-by: Klara, Inc.
Sponsored-by: iXsystems, Inc.

Signed-off-by: Don Brady <don.brady@klarasystems.com>
  • Loading branch information
don-brady committed Jun 24, 2024
1 parent 0715a69 commit f010187
Show file tree
Hide file tree
Showing 19 changed files with 830 additions and 61 deletions.
29 changes: 29 additions & 0 deletions cmd/zdb/zdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -2050,6 +2050,32 @@ dump_all_ddts(spa_t *spa)
}

dump_dedup_ratio(&dds_total);

/*
* Dump a histogram of unique class entry age
*/
if (dump_opt['D'] == 3 && getenv("ZDB_DDT_UNIQUE_AGE_HIST") != NULL) {
ddt_age_histo_t histogram;

(void) printf("DDT walk unique, building age histogram...\n");
ddt_prune_walk(spa, 0, &histogram);

/*
* print out histogram for unique entry class birth
*/
if (histogram.dah_entries > 0) {
(void) printf("%5s %9s %4s\n",
"age", "blocks", "amnt");
(void) printf("%5s %9s %4s\n",
"-----", "---------", "----");
for (int i = 0; i < HIST_BINS; i++) {
(void) printf("%5d %9d %4d%%\n", 1 << i,
(int)histogram.dah_age_histo[i],
(int)((histogram.dah_age_histo[i] * 100) /
histogram.dah_entries));
}
}
}
}

static void
Expand Down Expand Up @@ -5808,6 +5834,9 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
ddt_exit(ddt);
}

/*
* XXX - djb this fails during ztest runs (ENOENT)
*/
VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa,
refcnt ? 0 : spa_min_claim_txg(zcb->zcb_spa),
bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0);
Expand Down
89 changes: 89 additions & 0 deletions cmd/zpool/zpool_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,8 @@ static int zpool_do_version(int, char **);

static int zpool_do_wait(int, char **);

static int zpool_do_ddt_prune(int, char **);

static int zpool_do_help(int argc, char **argv);

static zpool_compat_status_t zpool_do_load_compat(
Expand Down Expand Up @@ -167,6 +169,7 @@ typedef enum {
HELP_CLEAR,
HELP_CREATE,
HELP_CHECKPOINT,
HELP_DDT_PRUNE,
HELP_DESTROY,
HELP_DETACH,
HELP_EXPORT,
Expand Down Expand Up @@ -343,6 +346,8 @@ static zpool_command_t command_table[] = {
{ "sync", zpool_do_sync, HELP_SYNC },
{ NULL },
{ "wait", zpool_do_wait, HELP_WAIT },
{ NULL },
{ "ddtprune", zpool_do_ddt_prune, HELP_DDT_PRUNE },
};

#define NCOMMAND (ARRAY_SIZE(command_table))
Expand Down Expand Up @@ -456,6 +461,8 @@ get_usage(zpool_help_t idx)
case HELP_WAIT:
return (gettext("\twait [-Hp] [-T d|u] [-t <activity>[,...]] "
"<pool> [interval]\n"));
case HELP_DDT_PRUNE:
return (gettext("\tddtprune -d|-p <amount> <pool>\n"));
default:
__builtin_unreachable();
}
Expand Down Expand Up @@ -11667,6 +11674,88 @@ found:;
return (error);
}

/*
 * zpool ddtprune -d|-p <amount> <pool>
 *
 *       -d <days>	Prune entries <days> old and older
 *       -p <percent>	Prune <percent> amount of entries
 *
 * Prune single reference entries from DDT to satisfy the amount specified.
 *
 * -d and -p are mutually exclusive and one of them is required.  A days
 * value is converted to seconds before it is passed to zpool_ddt_prune();
 * a percentage must be in the range 1-100.
 *
 * Returns -1 if the pool cannot be opened, otherwise the value returned
 * by zpool_ddt_prune().
 */
static int
zpool_do_ddt_prune(int argc, char **argv)
{
	const uint64_t secs_per_day = 86400;
	zpool_ddt_prune_unit_t unit = ZPOOL_DDT_PRUNE_NONE;
	uint64_t amount = 0;
	zpool_handle_t *zhp;
	char *endptr;
	int c;

	while ((c = getopt(argc, argv, "d:p:")) != -1) {
		switch (c) {
		case 'd':
			if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) {
				(void) fprintf(stderr, gettext("-d cannot be "
				    "combined with -p option\n"));
				usage(B_FALSE);
			}
			errno = 0;
			amount = strtoull(optarg, &endptr, 0);
			/*
			 * Besides malformed and zero input, reject any value
			 * whose conversion to seconds would wrap a uint64_t.
			 * This also catches negative input, which strtoull()
			 * converts to a very large unsigned number.
			 */
			if (errno != 0 || *endptr != '\0' || amount == 0 ||
			    amount > UINT64_MAX / secs_per_day) {
				(void) fprintf(stderr,
				    gettext("invalid days value\n"));
				usage(B_FALSE);
			}
			amount *= secs_per_day;	/* convert days to seconds */
			unit = ZPOOL_DDT_PRUNE_AGE;
			break;
		case 'p':
			if (unit == ZPOOL_DDT_PRUNE_AGE) {
				(void) fprintf(stderr, gettext("-p cannot be "
				    "combined with -d option\n"));
				usage(B_FALSE);
			}
			errno = 0;
			amount = strtoull(optarg, &endptr, 0);
			if (errno != 0 || *endptr != '\0' ||
			    amount == 0 || amount > 100) {
				(void) fprintf(stderr,
				    gettext("invalid percentage value\n"));
				usage(B_FALSE);
			}
			unit = ZPOOL_DDT_PRUNE_PERCENTAGE;
			break;
		case '?':
			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
			    optopt);
			usage(B_FALSE);
		}
	}
	argc -= optind;
	argv += optind;

	/* Exactly one unit option and exactly one pool name are required. */
	if (unit == ZPOOL_DDT_PRUNE_NONE) {
		(void) fprintf(stderr,
		    gettext("missing amount option (-d|-p <value>)\n"));
		usage(B_FALSE);
	} else if (argc < 1) {
		(void) fprintf(stderr, gettext("missing pool argument\n"));
		usage(B_FALSE);
	} else if (argc > 1) {
		(void) fprintf(stderr, gettext("too many arguments\n"));
		usage(B_FALSE);
	}
	zhp = zpool_open(g_zfs, argv[0]);
	if (zhp == NULL)
		return (-1);

	int error = zpool_ddt_prune(zhp, unit, amount);

	zpool_close(zhp);

	return (error);
}

static int
find_command_idx(const char *command, int *idx)
{
Expand Down
30 changes: 29 additions & 1 deletion cmd/ztest.c
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,8 @@ extern unsigned long zio_decompress_fail_fraction;
extern unsigned long zfs_reconstruct_indirect_damage_fraction;
extern uint64_t raidz_expand_max_reflow_bytes;
extern uint_t raidz_expand_pause_point;
extern boolean_t ddt_prune_artificial_age;
extern boolean_t ddt_dump_prune_histogram;


static ztest_shared_opts_t *ztest_shared_opts;
Expand Down Expand Up @@ -446,6 +448,7 @@ ztest_func_t ztest_fletcher;
ztest_func_t ztest_fletcher_incr;
ztest_func_t ztest_verify_dnode_bt;
ztest_func_t ztest_pool_prefetch_ddt;
ztest_func_t ztest_ddt_prune;

static uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */
static uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */
Expand Down Expand Up @@ -502,6 +505,7 @@ static ztest_info_t ztest_info[] = {
ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
ZTI_INIT(ztest_pool_prefetch_ddt, 1, &zopt_rarely),
ZTI_INIT(ztest_ddt_prune, 1, &zopt_rarely),
};

#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t))
Expand Down Expand Up @@ -7288,6 +7292,17 @@ ztest_trim(ztest_ds_t *zd, uint64_t id)
mutex_exit(&ztest_vdev_lock);
}

/*
 * ztest operation: ask the DDT code to prune a randomly chosen percentage
 * of unique entries from the pool under test.  The result of the prune
 * request is intentionally ignored.
 */
void
ztest_ddt_prune(ztest_ds_t *zd, uint64_t id)
{
	(void) zd;
	(void) id;

	/* presumably yields 1..15 percent; verify against ztest_random() */
	uint64_t percent = 1 + ztest_random(15);

	(void) ddt_prune_unique_entries(ztest_spa,
	    ZPOOL_DDT_PRUNE_PERCENTAGE, percent);
}

/*
* Verify pool integrity by running zdb.
*/
Expand Down Expand Up @@ -7469,6 +7484,13 @@ ztest_resume_thread(void *arg)
{
spa_t *spa = arg;

/*
* Synthesize aged DDT entries for ddt prune testing
*/
ddt_prune_artificial_age = B_TRUE;
if (ztest_opts.zo_verbose >= 3)
ddt_dump_prune_histogram = B_TRUE;

while (!ztest_exiting) {
if (spa_suspended(spa))
ztest_resume(spa);
Expand Down Expand Up @@ -8586,7 +8608,13 @@ ztest_init(ztest_shared_t *zs)
*/
if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0)
continue;

#if 0
/*
* split between legacy and fast dedup
*/
if (i == SPA_FEATURE_FAST_DEDUP && ztest_random(2) != 0)
continue;
#endif
VERIFY3S(-1, !=, asprintf(&buf, "feature@%s",
spa_feature_table[i].fi_uname));
fnvlist_add_uint64(props, buf, 0);
Expand Down
1 change: 1 addition & 0 deletions contrib/debian/openzfs-zfsutils.install
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ usr/share/man/man8/zpool-clear.8
usr/share/man/man8/zpool-create.8
usr/share/man/man8/zpool-destroy.8
usr/share/man/man8/zpool-detach.8
usr/share/man/man8/zpool-ddtprune.8
usr/share/man/man8/zpool-events.8
usr/share/man/man8/zpool-export.8
usr/share/man/man8/zpool-get.8
Expand Down
3 changes: 3 additions & 0 deletions include/libzfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,9 @@ _LIBZFS_H int zpool_reopen_one(zpool_handle_t *, void *);

_LIBZFS_H int zpool_sync_one(zpool_handle_t *, void *);

_LIBZFS_H int zpool_ddt_prune(zpool_handle_t *, zpool_ddt_prune_unit_t,
uint64_t);

_LIBZFS_H int zpool_vdev_online(zpool_handle_t *, const char *, int,
vdev_state_t *);
_LIBZFS_H int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t);
Expand Down
3 changes: 3 additions & 0 deletions include/libzfs_core.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,9 @@ _LIBZFS_CORE_H int lzc_set_vdev_prop(const char *, nvlist_t *, nvlist_t **);

_LIBZFS_CORE_H int lzc_scrub(zfs_ioc_t, const char *, nvlist_t *, nvlist_t **);

_LIBZFS_CORE_H int lzc_ddt_prune(const char *, zpool_ddt_prune_unit_t,
uint64_t);

#ifdef __cplusplus
}
#endif
Expand Down
3 changes: 3 additions & 0 deletions include/sys/ddt.h
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,9 @@ extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb,

extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp);

extern int ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit,
uint64_t amount);

#ifdef __cplusplus
}
#endif
Expand Down
45 changes: 43 additions & 2 deletions include/sys/ddt_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,11 @@ extern void ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu);

extern boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl,
ddt_lightweight_entry_t *ddlwe);
extern boolean_t ddt_log_take_key(ddt_t *ddt, ddt_log_t *ddl,
const ddt_key_t *ddk, ddt_lightweight_entry_t *ddlwe);

extern boolean_t ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk,
ddt_lightweight_entry_t *ddlwe);
extern boolean_t ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl,
const ddt_key_t *ddk);

extern void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe,
dmu_tx_t *tx);
Expand All @@ -213,6 +216,44 @@ extern void ddt_log_fini(void);
* them up.
*/

/*
* We use a histogram to convert a percentage request into a
* cutoff value where entries older than the cutoff get pruned.
*
* The histogram bins represent hours in power-of-two increments.
* 16 bins covers up to four years.
*/
/* Number of bins in the age histogram (see the comment above). */
#define HIST_BINS 16

/*
 * Age histogram filled in by ddt_prune_walk() and printed by the
 * histogram dump code.
 */
typedef struct ddt_age_histo {
/* total entry count; used as the denominator for per-bin percentages */
uint64_t dah_entries;
/* per-bin entry counts; bin i is reported under the age label (1 << i) */
uint64_t dah_age_histo[HIST_BINS];
} ddt_age_histo_t;

void ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram);

#if defined(_KERNEL) || !defined(ZFS_DEBUG)
/* Kernel and non-debug builds: the histogram dump compiles to nothing. */
#define ddt_dump_age_histogram(histo, cutoff) ((void)0)
#else
/*
 * Debug aid: print the age histogram as a table with one row per bin,
 * showing the bin's age label, its count, and its share of all entries.
 * Prints nothing when the histogram holds no entries.
 */
static inline void
ddt_dump_age_histogram(ddt_age_histo_t *histogram, uint64_t cutoff)
{
	const uint64_t total = histogram->dah_entries;

	/* Empty histogram: nothing to show, and avoids a divide by zero. */
	if (total == 0)
		return;

	u_longlong_t cutoff_hours =
	    (u_longlong_t)(gethrestime_sec() - cutoff) / 3600;

	(void) printf("DDT prune unique class age, %llu hour cutoff\n",
	    cutoff_hours);
	(void) printf("%5s %9s %4s\n", "age", "blocks", "amnt");
	(void) printf("%5s %9s %4s\n", "-----", "---------", "----");

	for (int bin = 0; bin < HIST_BINS; bin++) {
		uint64_t count = histogram->dah_age_histo[bin];
		int pct = (int)((count * 100) / total);

		(void) printf("%5d %9llu %4d%%\n", 1 << bin,
		    (u_longlong_t)count, pct);
	}
}
#endif

/*
* Enough room to expand DMU_POOL_DDT format for all possible DDT
* checksum/class/type combinations.
Expand Down
15 changes: 14 additions & 1 deletion include/sys/fs/zfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -1419,7 +1419,7 @@ typedef enum {
*/
typedef enum zfs_ioc {
/*
* Core features - 88/128 numbers reserved.
* Core features - 89/128 numbers reserved.
*/
#ifdef __FreeBSD__
ZFS_IOC_FIRST = 0,
Expand Down Expand Up @@ -1516,6 +1516,7 @@ typedef enum zfs_ioc {
ZFS_IOC_VDEV_SET_PROPS, /* 0x5a56 */
ZFS_IOC_POOL_SCRUB, /* 0x5a57 */
ZFS_IOC_POOL_PREFETCH, /* 0x5a58 */
ZFS_IOC_DDT_PRUNE, /* 0x5a59 */

/*
* Per-platform (Optional) - 8/128 numbers reserved.
Expand Down Expand Up @@ -1652,6 +1653,12 @@ typedef enum {
ZPOOL_PREFETCH_DDT
} zpool_prefetch_type_t;

/*
 * Unit that qualifies the amount passed with a DDT prune request
 * (zpool_ddt_prune(), lzc_ddt_prune(), ddt_prune_unique_entries()).
 * ZPOOL_DDT_PRUNE_NONE means no unit was selected; "zpool ddtprune"
 * treats it as a missing -d/-p option.
 */
typedef enum {
ZPOOL_DDT_PRUNE_NONE,
ZPOOL_DDT_PRUNE_AGE, /* in seconds */
ZPOOL_DDT_PRUNE_PERCENTAGE, /* 1 - 100 */
} zpool_ddt_prune_unit_t;

/*
* Bookmark name values.
*/
Expand Down Expand Up @@ -1745,6 +1752,12 @@ typedef enum {
*/
#define ZPOOL_PREFETCH_TYPE "prefetch_type"

/*
* The following are names used when invoking ZFS_IOC_DDT_PRUNE.
*/
#define DDT_PRUNE_UNIT "ddt_prune_unit"
#define DDT_PRUNE_AMOUNT "ddt_prune_amount"

/*
* Flags for ZFS_IOC_VDEV_SET_STATE
*/
Expand Down
1 change: 1 addition & 0 deletions include/sys/spa_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,7 @@ struct spa {
uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */
uint64_t spa_dedup_checksum; /* default dedup checksum */
uint64_t spa_dspace; /* dspace in normal class */
boolean_t spa_active_ddt_prune; /* ddt prune process active */
struct brt *spa_brt; /* in-core BRT */
kmutex_t spa_vdev_top_lock; /* dueling offline/remove */
kmutex_t spa_proc_lock; /* protects spa_proc* */
Expand Down
Loading

0 comments on commit f010187

Please sign in to comment.