Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

raidz time-dependent geometry #11

Merged
merged 18 commits into from
Nov 16, 2020
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion include/sys/fs/zfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -686,8 +686,8 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_SPARES "spares"
#define ZPOOL_CONFIG_IS_SPARE "is_spare"
#define ZPOOL_CONFIG_NPARITY "nparity"
#define ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH "raidz_logical_width"
#define ZPOOL_CONFIG_RAIDZ_EXPAND_OFFSET "raidz_expand_offset"
#define ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS "raidz_expand_txgs"
#define ZPOOL_CONFIG_HOSTID "hostid"
#define ZPOOL_CONFIG_HOSTNAME "hostname"
#define ZPOOL_CONFIG_LOADED_TIME "initial_load_time"
Expand Down
10 changes: 9 additions & 1 deletion include/sys/vdev_raidz.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ extern "C" {
struct zio;
struct raidz_row;
struct raidz_map;
struct vdev_raidz;
#if !defined(_KERNEL)
struct kernel_param {};
#endif
Expand All @@ -47,6 +48,7 @@ struct raidz_map *vdev_raidz_map_alloc(struct zio *, uint64_t, uint64_t,
struct raidz_map *vdev_raidz_map_alloc_expanded(abd_t *, uint64_t, uint64_t,
uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t);
void vdev_raidz_map_free(struct raidz_map *);
void vdev_raidz_free(struct vdev_raidz *);
void vdev_raidz_generate_parity(struct raidz_map *);
void vdev_raidz_reconstruct(struct raidz_map *, const int *, int);

Expand Down Expand Up @@ -92,9 +94,15 @@ typedef struct vdev_raidz_expand {
} vdev_raidz_expand_t;

typedef struct vdev_raidz {
int vd_logical_width;
int vd_original_width;
int vd_physical_width;
int vd_nparity;

/*
* AVL tree of reflow_node_t's, keyed by re_txg.
*/
avl_tree_t vd_expand_txgs;

/*
* If this vdev is being expanded, spa_raidz_expand is set to this
*/
Expand Down
11 changes: 11 additions & 0 deletions include/sys/vdev_raidz_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,17 @@ typedef struct raidz_map {
raidz_row_t *rm_row[0]; /* flexible array of rows */
} raidz_map_t;

/*
 * Nodes in vdev_raidz_t:vd_expand_txgs.
 * Blocks with physical birth time of re_txg or later have the specified
 * logical width (until the next node).
 *
 * Nodes are ordered in the tree by re_txg.
 */
typedef struct reflow_node {
uint64_t re_txg; /* first txg to which re_logical_width applies */
uint64_t re_logical_width; /* logical width from re_txg onward */
avl_node_t re_link; /* linkage in vd_expand_txgs */
} reflow_node_t;


#define RAIDZ_ORIGINAL_IMPL (INT_MAX)

Expand Down
7 changes: 3 additions & 4 deletions module/zfs/vdev.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
#include <sys/abd.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_trim.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zfs_ratelimit.h>

Expand Down Expand Up @@ -909,10 +910,8 @@ vdev_free(vdev_t *vd)
ASSERT(vd->vdev_child == NULL);
ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

if (vd->vdev_ops == &vdev_raidz_ops) {
vdev_raidz_t *rz = vd->vdev_tsd;
kmem_free(rz, sizeof (*rz));
}
if (vd->vdev_ops == &vdev_raidz_ops)
vdev_raidz_free(vd->vdev_tsd);

/*
* Discard allocation state.
Expand Down
125 changes: 105 additions & 20 deletions module/zfs/vdev_raidz.c
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,27 @@ vdev_raidz_map_free_vsd(zio_t *zio)
}
}

/*
 * Comparison callback for the vd_expand_txgs AVL tree: orders two
 * reflow_node_t entries by their re_txg.
 *
 * NOTE(review): the "vedv_" prefix looks like a typo for "vdev_"; the name
 * is kept as-is because avl_create() references it.
 */
static int
vedv_raidz_reflow_compare(const void *x1, const void *x2)
{
	/* void pointers convert implicitly; no cast keeps const intact. */
	const reflow_node_t *lhs = x1;
	const reflow_node_t *rhs = x2;

	return (TREE_CMP(lhs->re_txg, rhs->re_txg));
}

/*
 * Tear down a vdev_raidz_t: free every reflow_node_t still linked into
 * vd_expand_txgs, destroy the (now empty) tree, and finally free the
 * vdev_raidz_t itself.
 */
void
vdev_raidz_free(vdev_raidz_t *vdrz)
{
	avl_tree_t *txgs = &vdrz->vd_expand_txgs;
	void *cookie = NULL;

	/* avl_destroy_nodes() hands back one node per call until empty. */
	for (reflow_node_t *node = avl_destroy_nodes(txgs, &cookie);
	    node != NULL; node = avl_destroy_nodes(txgs, &cookie)) {
		kmem_free(node, sizeof (*node));
	}

	avl_destroy(txgs);
	kmem_free(vdrz, sizeof (*vdrz));
}

/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
Expand Down Expand Up @@ -1905,13 +1926,20 @@ vdev_raidz_asize(vdev_t *vd, uint64_t psize)
vdev_raidz_t *vdrz = vd->vdev_tsd;
uint64_t asize;
uint64_t ashift = vd->vdev_top->vdev_ashift;
uint64_t cols = vdrz->vd_logical_width;
/* XXX if this is a new write, need to use new logical width */
uint64_t cols = vdrz->vd_original_width;
uint64_t nparity = vdrz->vd_nparity;

asize = ((psize - 1) >> ashift) + 1;
asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
asize = roundup(asize, nparity + 1) << ashift;

uint64_t asize_new = ((psize - 1) >> ashift) + 1;
uint64_t ncols_new = vdrz->vd_physical_width;
asize_new += nparity * ((asize_new + ncols_new - nparity - 1) / (ncols_new - nparity));
asize_new = roundup(asize_new, nparity + 1) << ashift;
VERIFY3U(asize_new, <=, asize);

return (asize);
}

Expand Down Expand Up @@ -2064,6 +2092,29 @@ vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
}
}

/*
 * Return the logical width to use for the block that this zio refers to.
 *
 * The lookup key is the physical birth txg of zio->io_bp.  vd_expand_txgs
 * is searched for a node whose re_txg equals that txg; if there is none,
 * the nearest node before it is used.  If no such node exists either,
 * vd_original_width is returned.
 */
static uint64_t
vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, zio_t *zio)
{
	reflow_node_t lookup = {
		.re_txg = BP_PHYSICAL_BIRTH(zio->io_bp)
	};
	avl_index_t where;

	reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
	/* No exact match: fall back to the closest earlier node, if any. */
	if (re == NULL)
		re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);

	return (re != NULL ? re->re_logical_width : vdrz->vd_original_width);
}

/*
* Start an IO operation on a RAIDZ VDev
*
Expand All @@ -2089,15 +2140,16 @@ vdev_raidz_io_start(zio_t *zio)
vdev_raidz_t *vdrz = vd->vdev_tsd;
raidz_map_t *rm;

uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, zio);
zfs_dbgmsg("zio=%llx bm=%llu/%llu/%llu/%llu phys_birth=%llu logical_width=%llu",
zio,
zio->io_bookmark.zb_objset,
zio->io_bookmark.zb_object,
zio->io_bookmark.zb_level,
zio->io_bookmark.zb_blkid,
BP_PHYSICAL_BIRTH(zio->io_bp),
vdrz->vd_logical_width);
if (vdrz->vd_logical_width != vdrz->vd_physical_width) {
logical_width);
if (logical_width != vdrz->vd_physical_width) {
/* XXX rangelock not needed after expansion completes */
zfs_locked_range_t *lr =
zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
Expand All @@ -2112,14 +2164,13 @@ vdev_raidz_io_start(zio_t *zio)
rm = vdev_raidz_map_alloc_expanded(zio->io_abd,
zio->io_size, zio->io_offset,
tvd->vdev_ashift, vdrz->vd_physical_width,
vdrz->vd_logical_width, vdrz->vd_nparity,
logical_width, vdrz->vd_nparity,
vdrz->vn_vre.vre_offset_phys,
vdrz->vn_vre.vre_offset);
rm->rm_lr = lr;
} else {
rm = vdev_raidz_map_alloc(zio,
tvd->vdev_ashift, vdrz->vd_logical_width,
vdrz->vd_nparity);
tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
}

zio->io_vsd = rm;
Expand Down Expand Up @@ -2386,7 +2437,7 @@ raidz_simulate_failure(vdev_raidz_t *vdrz, int ashift, int i, raidz_col_t *rc)
zfs_dbgmsg("raidz_simulate_failure(pw=%u lw=%u ashift=%u i=%u "
"rc_offset=%llx rc_devidx=%u sector_id=%u",
vdrz->vd_physical_width,
vdrz->vd_logical_width,
vdrz->vd_original_width,
ashift,
i,
(long long)rc->rc_offset,
Expand All @@ -2395,7 +2446,7 @@ raidz_simulate_failure(vdev_raidz_t *vdrz, int ashift, int i, raidz_col_t *rc)
#endif

for (int w = vdrz->vd_physical_width;
w >= vdrz->vd_logical_width; w--) {
w >= vdrz->vd_original_width; w--) {
if (i < w) {
return (sector_id % w == i);
} else {
Expand Down Expand Up @@ -2559,7 +2610,7 @@ vdev_raidz_combrec(zio_t *zio)
/* Determine number of logical children, n */
int n = 0;
for (int w = vdrz->vd_physical_width;
w >= vdrz->vd_logical_width; w--) {
w >= vdrz->vd_original_width; w--) {
n += w;
}

Expand Down Expand Up @@ -3043,12 +3094,19 @@ raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
{
spa_t *spa = arg;
vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
vdev_raidz_t *vdrz = raidvd->vdev_tsd;

for (int i = 0; i < TXG_SIZE; i++)
ASSERT0(vre->vre_offset_pertxg[i]);

vre->vre_offset_phys = UINT64_MAX;

reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
re->re_txg = tx->tx_txg + 1;
re->re_logical_width = vdrz->vd_physical_width;
avl_add(&vdrz->vd_expand_txgs, re);

/*
* vre_offset_phys will be removed from the on-disk config by
* vdev_raidz_config_generate().
Expand Down Expand Up @@ -3424,7 +3482,7 @@ vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
vdev_raidz_t *vdrz = raidvd->vdev_tsd;
ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
ASSERT3P(raidvd->vdev_top, ==, raidvd);
ASSERT3U(raidvd->vdev_children, >, vdrz->vd_logical_width);
ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
new_child);
Expand Down Expand Up @@ -3488,12 +3546,27 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
* it.
*/
fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
fnvlist_add_uint64(nv, ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH,
vdrz->vd_logical_width);
if (vdrz->vn_vre.vre_offset_phys != UINT64_MAX) {
fnvlist_add_uint64(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_OFFSET,
vdrz->vn_vre.vre_offset_phys);
}

if (!avl_is_empty(&vdrz->vd_expand_txgs)) {
uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs);
uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count,
KM_SLEEP);
uint64_t i = 0;

for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs);
re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) {
txgs[i++] = re->re_txg;
}

fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
txgs, count);

kmem_free(txgs, sizeof (uint64_t) * count);
}
}

/*
Expand All @@ -3504,7 +3577,8 @@ vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
void *
vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv)
{
uint64_t nparity, lw;
uint64_t nparity, *txgs;
uint_t txgs_size;
vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);

vdrz->vn_vre.vre_vdev_id = -1;
Expand All @@ -3521,14 +3595,9 @@ vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv)
if (error != 0)
goto out;

vdrz->vd_logical_width = children;
vdrz->vd_original_width = children;
vdrz->vd_physical_width = children;

if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH,
&lw) == 0) {
vdrz->vd_logical_width = lw;
}

/* note, the ID does not exist when creating a pool */
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
&vdrz->vn_vre.vre_vdev_id);
Expand All @@ -3542,6 +3611,22 @@ vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv)
*/
}

avl_create(&vdrz->vd_expand_txgs, vedv_raidz_reflow_compare,
sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));

error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
ahrens marked this conversation as resolved.
Show resolved Hide resolved
&txgs, &txgs_size);
if (error == 0) {
for (int i = 0; i < txgs_size; i++) {
reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
re->re_txg = txgs[txgs_size - i - 1];
re->re_logical_width = vdrz->vd_physical_width - i;
avl_add(&vdrz->vd_expand_txgs, re);
}

vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
}

if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
&nparity) == 0) {
if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
Expand Down Expand Up @@ -3571,7 +3656,7 @@ vdev_raidz_get_tsd(spa_t *spa, nvlist_t *nv)
vdrz->vd_nparity = nparity;
return (vdrz);
out:
kmem_free(vdrz, sizeof (*vdrz));
vdev_raidz_free(vdrz);
return (NULL);
}

Expand Down
3 changes: 2 additions & 1 deletion tests/runfiles/common.run
Original file line number Diff line number Diff line change
Expand Up @@ -708,8 +708,9 @@ tags = ['functional', 'redacted_send']

[tests/functional/raidz]
tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_003_pos', 'raidz_004_pos',
'raidz_expand.ksh']
'raidz_expand_001_pos', 'raidz_expand_002_pos']
tags = ['functional', 'raidz']
timeout = 1200

[tests/functional/redundancy]
tests = ['redundancy_001_pos', 'redundancy_002_pos', 'redundancy_003_pos',
Expand Down
3 changes: 2 additions & 1 deletion tests/zfs-tests/tests/functional/raidz/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ dist_pkgdata_SCRIPTS = \
raidz_002_pos.ksh \
raidz_003_pos.ksh \
raidz_004_pos.ksh \
raidz_expand.ksh
raidz_expand_001_pos.ksh \
raidz_expand_002_pos.ksh
Loading