Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

raidz time-dependent geometry #11

Merged
merged 18 commits into from
Nov 16, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion include/sys/fs/zfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -686,8 +686,8 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_SPARES "spares"
#define ZPOOL_CONFIG_IS_SPARE "is_spare"
#define ZPOOL_CONFIG_NPARITY "nparity"
#define ZPOOL_CONFIG_RAIDZ_LOGICAL_WIDTH "raidz_logical_width"
#define ZPOOL_CONFIG_RAIDZ_EXPAND_OFFSET "raidz_expand_offset"
#define ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS "raidz_expand_txgs"
#define ZPOOL_CONFIG_HOSTID "hostid"
#define ZPOOL_CONFIG_HOSTNAME "hostname"
#define ZPOOL_CONFIG_LOADED_TIME "initial_load_time"
Expand Down
6 changes: 6 additions & 0 deletions include/sys/spa.h
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,12 @@ typedef struct blkptr {
/*
* Macros to get and set fields in a bp or DVA.
*/

/*
* Note, for gang blocks, DVA_GET_ASIZE() is the total space allocated for
* this gang DVA including its children BP's. The space allocated at this
* DVA's vdev/offset is vdev_gang_header_asize(vdev).
*/
#define DVA_GET_ASIZE(dva) \
BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0)
#define DVA_SET_ASIZE(dva, x) \
Expand Down
13 changes: 13 additions & 0 deletions include/sys/vdev.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,21 @@ extern void vdev_space_update(vdev_t *vd,

extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space);

extern uint64_t vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize,
uint64_t txg);
extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);

/*
 * Allocated size of a gang block header on this vdev.  Callers have no
 * physical birth txg to supply, so the size is computed for txg 0 and must
 * be constant for a given vdev (e.g. raidz expansion can't change it).
 */
static inline uint64_t
vdev_gang_header_asize(vdev_t *vd)
{
	uint64_t header_asize =
	    vdev_psize_to_asize_txg(vd, SPA_GANGBLOCKSIZE, 0);

	return (header_asize);
}

extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux);
extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux);
extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
Expand Down
4 changes: 2 additions & 2 deletions include/sys/vdev_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ extern uint32_t zfs_vdev_async_write_max_active;
typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size,
uint64_t *ashift);
typedef void vdev_close_func_t(vdev_t *vd);
typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize, uint64_t txg);
typedef void vdev_io_start_func_t(zio_t *zio);
typedef void vdev_io_done_func_t(zio_t *zio);
typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
Expand Down Expand Up @@ -561,7 +561,7 @@ extern vdev_ops_t vdev_indirect_ops;
*/
extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *in,
range_seg64_t *out);
extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg);
extern uint64_t vdev_get_min_asize(vdev_t *vd);
extern void vdev_set_min_asize(vdev_t *vd);

Expand Down
11 changes: 10 additions & 1 deletion include/sys/vdev_raidz.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ extern "C" {
struct zio;
struct raidz_row;
struct raidz_map;
struct vdev_raidz;
#if !defined(_KERNEL)
struct kernel_param {};
#endif
Expand All @@ -47,6 +48,7 @@ struct raidz_map *vdev_raidz_map_alloc(struct zio *, uint64_t, uint64_t,
struct raidz_map *vdev_raidz_map_alloc_expanded(abd_t *, uint64_t, uint64_t,
uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t);
void vdev_raidz_map_free(struct raidz_map *);
void vdev_raidz_free(struct vdev_raidz *);
void vdev_raidz_generate_parity(struct raidz_map *);
void vdev_raidz_reconstruct(struct raidz_map *, const int *, int);

Expand Down Expand Up @@ -92,9 +94,16 @@ typedef struct vdev_raidz_expand {
} vdev_raidz_expand_t;

typedef struct vdev_raidz {
int vd_logical_width;
int vd_original_width;
int vd_physical_width;
int vd_nparity;

/*
 * AVL tree of reflow_node_t's (vd_expand_txgs).  vd_expand_lock protects
 * only the AVL tree.
 */
kmutex_t vd_expand_lock;
avl_tree_t vd_expand_txgs;

/*
* If this vdev is being expanded, spa_raidz_expand is set to this
*/
Expand Down
11 changes: 11 additions & 0 deletions include/sys/vdev_raidz_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,17 @@ typedef struct raidz_map {
raidz_row_t *rm_row[0]; /* flexible array of rows */
} raidz_map_t;

/*
 * Nodes in vdev_raidz_t:vd_expand_txgs.
 * Blocks with physical birth time of re_txg or later have the specified
 * logical width (until the next node).
 */
typedef struct reflow_node {
uint64_t re_txg; /* first physical birth txg this node applies to */
uint64_t re_logical_width; /* logical width for blocks born from re_txg */
avl_node_t re_link; /* AVL linkage for this node */
} reflow_node_t;


#define RAIDZ_ORIGINAL_IMPL (INT_MAX)

Expand Down
10 changes: 5 additions & 5 deletions module/zfs/metaslab.c
Original file line number Diff line number Diff line change
Expand Up @@ -5173,7 +5173,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,

ASSERT(mg->mg_class == mc);

uint64_t asize = vdev_psize_to_asize(vd, psize);
uint64_t asize = vdev_psize_to_asize_txg(vd, psize, txg);
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);

/*
Expand Down Expand Up @@ -5520,7 +5520,7 @@ metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);

if (DVA_GET_GANG(dva))
size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
size = vdev_gang_header_asize(vd);

msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];

Expand Down Expand Up @@ -5555,7 +5555,7 @@ metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);

if (DVA_GET_GANG(dva)) {
size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
size = vdev_gang_header_asize(vd);
}

metaslab_free_impl(vd, offset, size, checkpoint);
Expand Down Expand Up @@ -5747,7 +5747,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
ASSERT(DVA_IS_VALID(dva));

if (DVA_GET_GANG(dva))
size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
size = vdev_gang_header_asize(vd);

return (metaslab_claim_impl(vd, offset, size, txg));
}
Expand Down Expand Up @@ -6013,7 +6013,7 @@ metaslab_check_free(spa_t *spa, const blkptr_t *bp)
uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);

if (DVA_GET_GANG(&bp->blk_dva[i]))
size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
size = vdev_gang_header_asize(vd);

ASSERT3P(vd, !=, NULL);

Expand Down
37 changes: 25 additions & 12 deletions module/zfs/vdev.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
#include <sys/abd.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_trim.h>
#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zfs_ratelimit.h>

Expand Down Expand Up @@ -249,13 +250,13 @@ vdev_derive_alloc_bias(const char *bias)
* all children. This is what's used by anything other than RAID-Z.
*/
uint64_t
vdev_default_asize(vdev_t *vd, uint64_t psize)
vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
{
uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
uint64_t csize;

for (int c = 0; c < vd->vdev_children; c++) {
csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg);
asize = MAX(asize, csize);
}

Expand Down Expand Up @@ -909,10 +910,8 @@ vdev_free(vdev_t *vd)
ASSERT(vd->vdev_child == NULL);
ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

if (vd->vdev_ops == &vdev_raidz_ops) {
vdev_raidz_t *rz = vd->vdev_tsd;
kmem_free(rz, sizeof (*rz));
}
if (vd->vdev_ops == &vdev_raidz_ops)
vdev_raidz_free(vd->vdev_tsd);

/*
* Discard allocation state.
Expand Down Expand Up @@ -1632,17 +1631,19 @@ vdev_open_children(vdev_t *vd)
}

/*
 * Compute the raidz-deflation ratio.  Note, we hard-code 128k (1 << 17)
 * because it is the "typical" blocksize.  Even though SPA_MAXBLOCKSIZE
 * changed, this algorithm can not change, otherwise it would inconsistently
 * account for existing bp's.  We also hard-code txg 0 for the same reason
 * (expanded RAIDZ vdevs can use different asize for different birth txg's).
 *
 * The ratio is only set for a top-level, non-hole vdev with a nonzero
 * ashift; any other vdev is left untouched.
 */
static void
vdev_set_deflate_ratio(vdev_t *vd)
{
	if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) {
		vd->vdev_deflate_ratio = (1 << 17) /
		    (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >>
		    SPA_MINBLOCKSHIFT);
	}
}

Expand Down Expand Up @@ -3426,10 +3427,22 @@ vdev_sync(vdev_t *vd, uint64_t txg)
dmu_tx_commit(tx);
}

/*
 * Map a psize (compressed block size) to the amount of space that should
 * be (or was) allocated for it in the given TXG, as reported by the vdev's
 * vdev_op_asize operation.  Note that for expanded RAIDZ vdevs, the size
 * allocated for older BP's may be larger.  See vdev_raidz_asize().
 */
uint64_t
vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg)
{
	uint64_t asize = vd->vdev_ops->vdev_op_asize(vd, psize, txg);

	return (asize);
}

/*
 * Variant of vdev_psize_to_asize_txg() for callers that have no txg to
 * supply: returns the allocated size for psize as computed for txg 0.
 */
uint64_t
vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
{
	return (vdev_psize_to_asize_txg(vd, psize, 0));
}

/*
Expand Down
Loading