Skip to content

Commit

Permalink
[TA2547] feat(snaprebuild): ioseq in STARTREBUILD request (openzfs#120)
Browse files Browse the repository at this point in the history
Bump up in replication version number

Signed-off-by: Vishnu Itta <vitta@mayadata.io>
  • Loading branch information
vishnuitta authored Sep 24, 2018
1 parent afa3feb commit f95b41e
Show file tree
Hide file tree
Showing 8 changed files with 181 additions and 80 deletions.
4 changes: 1 addition & 3 deletions cmd/uzfs_test/zrepl_utest.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ char *tgt_port3 = "99161";
char *ds1 = "ds1";
char *ds2 = "ds2";
char *ds3 = "ds3";
static uint64_t last_io_seq_sent;

struct data_io {
zvol_io_hdr_t hdr;
Expand Down Expand Up @@ -418,7 +417,7 @@ writer_thread(void *arg)
while (i < warg->max_iops) {
io->hdr.version = REPLICA_VERSION;
io->hdr.opcode = ZVOL_OPCODE_WRITE;
io->hdr.checkpointed_io_seq = io->hdr.io_seq = i + 1;
io->hdr.io_seq = i + 1;
io->hdr.len = sizeof (struct zvol_io_rw_hdr) +
warg->io_block_size;
io->hdr.status = 0;
Expand Down Expand Up @@ -455,7 +454,6 @@ writer_thread(void *arg)
}
nbytes += warg->io_block_size;
i++;
last_io_seq_sent = io->hdr.checkpointed_io_seq;
}

io->hdr.version = REPLICA_VERSION;
Expand Down
5 changes: 3 additions & 2 deletions include/sys/uzfs_zvol.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,9 @@ typedef struct zvol_state zvol_state_t;
#define UZFS_IO_MREAD_FAIL 3

/*
 * True when the zvol's zv_status is ZVOL_STATUS_DEGRADED.
 * The argument is parenthesized so that any pointer expression
 * (e.g. `p + 1` or `&arr[i]`) can be passed safely.
 */
#define	ZVOL_IS_DEGRADED(zv)	((zv)->zv_status == ZVOL_STATUS_DEGRADED)
#define ZVOL_IS_REBUILDING(zv) \
(zv->rebuild_info.zv_rebuild_status == ZVOL_REBUILDING_SNAP)
/*
 * True while the zvol's rebuild status is either ZVOL_REBUILDING_SNAP or
 * ZVOL_REBUILDING_AFS.
 * The argument is parenthesized so that any pointer expression can be
 * passed; note that it is evaluated twice, so it must not have side effects.
 */
#define	ZVOL_IS_REBUILDING(zv)	\
	(((zv)->rebuild_info.zv_rebuild_status == ZVOL_REBUILDING_SNAP) || \
	((zv)->rebuild_info.zv_rebuild_status == ZVOL_REBUILDING_AFS))
/*
 * True when the zvol's rebuild status is ZVOL_REBUILDING_DONE.
 * The argument is parenthesized so that any pointer expression
 * (e.g. `p + 1` or `&arr[i]`) can be passed safely.
 */
#define	ZVOL_IS_REBUILDED(zv)	\
	((zv)->rebuild_info.zv_rebuild_status == ZVOL_REBUILDING_DONE)
#define ZVOL_IS_REBUILDING_ERRORED(zv) \
Expand Down
20 changes: 11 additions & 9 deletions include/zrepl_prot.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ extern "C" {
* properly aligned (and packed).
*/

#define REPLICA_VERSION 1
#define REPLICA_VERSION 2
#define MAX_NAME_LEN 256
#define MAX_IP_LEN 64
#define TARGET_PORT 6060
Expand Down Expand Up @@ -109,8 +109,6 @@ struct zvol_io_hdr {
* meta data.
*/
uint64_t len;
uint64_t checkpointed_io_seq;
uint64_t checkpointed_degraded_io_seq;
} __attribute__((packed));

typedef struct zvol_io_hdr zvol_io_hdr_t;
Expand All @@ -128,12 +126,16 @@ typedef struct zvol_op_open_data zvol_op_open_data_t;
* IP, port where replica listens for data connection to zvol.
*/
struct mgmt_ack {
uint64_t pool_guid;
uint64_t zvol_guid;
uint16_t port;
char ip[MAX_IP_LEN];
char volname[MAX_NAME_LEN]; // zvol helping rebuild
char dw_volname[MAX_NAME_LEN]; // zvol being rebuilt
uint64_t pool_guid;
uint64_t zvol_guid;
uint16_t port;
char ip[MAX_IP_LEN];
char volname[MAX_NAME_LEN]; // zvol helping rebuild
char dw_volname[MAX_NAME_LEN]; // zvol being rebuilt
// checkpointed io_seq when vol is healthy
uint64_t checkpointed_io_seq;
// checkpointed io_seq when vol is in degraded state
uint64_t checkpointed_degraded_io_seq;
} __attribute__((packed));

typedef struct mgmt_ack mgmt_ack_t;
Expand Down
10 changes: 9 additions & 1 deletion lib/fio/replica.c
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ typedef struct io_list_entry {
/*
 * State kept by the replica fio engine; it is zeroed and its members are
 * initialized in fio_repl_setup() and released in fio_repl_cleanup().
 */
struct netio_data {
/* head of the singly linked list of IOs queued and awaiting a reply */
io_list_entry_t *io_inprog;
/* array of td->o.iodepth io_u pointers, allocated with calloc() */
struct io_u **io_completed;
/* held while the io_inprog list is walked or modified */
pthread_mutex_t mtx;
};

// global because mgmt conn must be shared by all data connections
Expand Down Expand Up @@ -579,6 +580,7 @@ static int fio_repl_setup(struct thread_data *td)
memset(nd, 0, sizeof (*nd));
nd->io_inprog = NULL;
nd->io_completed = calloc(td->o.iodepth, sizeof (struct io_u *));
pthread_mutex_init(&nd->mtx, NULL);

// only create mgmt conn if it is needed
if (!o->port && mgmt_conn < 0) {
Expand All @@ -600,6 +602,7 @@ static void fio_repl_cleanup(struct thread_data *td)

if (nd) {
free(nd->io_completed);
pthread_mutex_destroy(&nd->mtx);
free(nd);
}
if (mgmt_conn >= 0) {
Expand Down Expand Up @@ -645,7 +648,7 @@ static enum fio_q_status fio_repl_queue(struct thread_data *td,
hdr.len = io_u->xfer_buflen;
hdr.status = 0;
hdr.flags = 0;
hdr.checkpointed_io_seq = 0;
hdr.version = REPLICA_VERSION;

if (io_u->ddir == DDIR_WRITE) {
hdr.opcode = ZVOL_OPCODE_WRITE;
Expand Down Expand Up @@ -688,8 +691,10 @@ static enum fio_q_status fio_repl_queue(struct thread_data *td,
td_verror(td, io_u->error, "xfer");
return (FIO_Q_COMPLETED);
}
pthread_mutex_lock(&nd->mtx);
io_ent->io_next = nd->io_inprog;
nd->io_inprog = io_ent;
pthread_mutex_unlock(&nd->mtx);

return (FIO_Q_QUEUED);
}
Expand All @@ -710,6 +715,7 @@ static io_list_entry_t *read_repl_reply(struct thread_data *td, int fd)
return (NULL);
}

pthread_mutex_lock(&nd->mtx);
iter = nd->io_inprog;
last = NULL;
while (iter != NULL) {
Expand All @@ -719,6 +725,7 @@ static io_list_entry_t *read_repl_reply(struct thread_data *td, int fd)
iter = iter->io_next;
}
if (iter == NULL) {
pthread_mutex_unlock(&nd->mtx);
td_verror(td, ENOENT, "unknown IO number");
return (NULL);
}
Expand All @@ -727,6 +734,7 @@ static io_list_entry_t *read_repl_reply(struct thread_data *td, int fd)
else
last->io_next = iter->io_next;
iter->io_next = NULL;
pthread_mutex_unlock(&nd->mtx);

if (hdr.status != ZVOL_OP_STATUS_OK) {
iter->io_u->error = EIO;
Expand Down
7 changes: 3 additions & 4 deletions lib/libzrepl/data_conn.c
Original file line number Diff line number Diff line change
Expand Up @@ -572,7 +572,7 @@ uzfs_zvol_rebuild_dw_replica(void *arg)
hdr.status = ZVOL_OP_STATUS_OK;
hdr.version = REPLICA_VERSION;
hdr.opcode = ZVOL_OPCODE_REBUILD_STEP;
hdr.checkpointed_io_seq = checkpointed_ionum;
hdr.io_seq = checkpointed_ionum;
hdr.offset = offset;
if ((offset + zvol_rebuild_step_size) >
ZVOL_VOLUME_SIZE(zvol_state))
Expand Down Expand Up @@ -1215,7 +1215,7 @@ uzfs_zvol_rebuild_scanner(void *arg)

case ZVOL_OPCODE_REBUILD_STEP:

metadata.io_num = hdr.checkpointed_io_seq;
metadata.io_num = hdr.io_seq;
rebuild_req_offset = hdr.offset;
rebuild_req_len = hdr.len;

Expand Down Expand Up @@ -1688,8 +1688,7 @@ uzfs_zvol_io_receiver(void *arg)
LOG_INFO("Data connection associated with zvol %s fd: %d",
zinfo->name, fd);

while ((rc = uzfs_zvol_socket_read(fd, (char *)&hdr, sizeof (hdr))) ==
0) {
while ((rc = uzfs_zvol_read_header(fd, &hdr)) == 0) {
if ((zinfo->state == ZVOL_INFO_STATE_OFFLINE))
break;

Expand Down
129 changes: 86 additions & 43 deletions lib/libzrepl/mgmt_conn.c
Original file line number Diff line number Diff line change
Expand Up @@ -451,32 +451,22 @@ uzfs_zvol_get_ip(char *host, size_t host_len)
return (rc);
}

/*
* This function is supposed to look up the zvol list to find out whether the
* LUN presented for identification is available/online. It also needs to send
* back the IP address of the replica along with the port so that the ISTGT
* controller can open a connection for IOs.
*/
static int
uzfs_zvol_mgmt_do_handshake(uzfs_mgmt_conn_t *conn, zvol_io_hdr_t *hdrp,
const char *name, zvol_info_t *zinfo)
uzfs_zvol_mgmt_get_handshake_info(zvol_io_hdr_t *in_hdr, const char *name,
zvol_info_t *zinfo, zvol_io_hdr_t *out_hdr, mgmt_ack_t *mgmt_ack)
{
zvol_state_t *zv = zinfo->main_zv;
mgmt_ack_t mgmt_ack;
zvol_io_hdr_t hdr;
int error1, error2;

bzero(&mgmt_ack, sizeof (mgmt_ack));
if (uzfs_zvol_get_ip(mgmt_ack.ip, MAX_IP_LEN) == -1) {
bzero(mgmt_ack, sizeof (*mgmt_ack));
if (uzfs_zvol_get_ip(mgmt_ack->ip, MAX_IP_LEN) == -1) {
LOG_ERRNO("Unable to get IP");
return (reply_nodata(conn, ZVOL_OP_STATUS_FAILED, hdrp->opcode,
hdrp->io_seq));
return (-1);
}

strlcpy(mgmt_ack.volname, name, sizeof (mgmt_ack.volname));
mgmt_ack.port = (hdrp->opcode == ZVOL_OPCODE_PREPARE_FOR_REBUILD) ?
strlcpy(mgmt_ack->volname, name, sizeof (mgmt_ack->volname));
mgmt_ack->port = (in_hdr->opcode == ZVOL_OPCODE_PREPARE_FOR_REBUILD) ?
REBUILD_IO_SERVER_PORT : IO_SERVER_PORT;
mgmt_ack.pool_guid = spa_guid(zv->zv_spa);
mgmt_ack->pool_guid = spa_guid(zv->zv_spa);

/*
* hold dataset during handshake if objset is NULL
Expand All @@ -485,8 +475,7 @@ uzfs_zvol_mgmt_do_handshake(uzfs_mgmt_conn_t *conn, zvol_io_hdr_t *hdrp,
if (zv->zv_objset == NULL) {
if (uzfs_hold_dataset(zv) != 0) {
LOG_ERR("Failed to hold zvol %s", zinfo->name);
return (reply_nodata(conn, ZVOL_OP_STATUS_FAILED,
hdrp->opcode, hdrp->io_seq));
return (-1);
}
}

Expand All @@ -496,35 +485,54 @@ uzfs_zvol_mgmt_do_handshake(uzfs_mgmt_conn_t *conn, zvol_io_hdr_t *hdrp,
&zinfo->degraded_checkpointed_ionum);
if ((error1 != 0) || (error2 != 0)) {
LOG_ERR("Failed to read io_seqnum %s", zinfo->name);
return (reply_nodata(conn, ZVOL_OP_STATUS_FAILED,
hdrp->opcode, hdrp->io_seq));
return (-1);
}

/*
* We don't use fsid_guid because that one is not guaranteed
* to stay the same (it is changed in case of conflicts).
*/
mgmt_ack.zvol_guid = dsl_dataset_phys(
mgmt_ack->zvol_guid = dsl_dataset_phys(
zv->zv_objset->os_dsl_dataset)->ds_guid;
if (zinfo->zvol_guid == 0)
zinfo->zvol_guid = mgmt_ack.zvol_guid;
zinfo->zvol_guid = mgmt_ack->zvol_guid;
LOG_INFO("Volume:%s has zvol_guid:%lu", zinfo->name, zinfo->zvol_guid);

bzero(&hdr, sizeof (hdr));
hdr.version = REPLICA_VERSION;
hdr.opcode = hdrp->opcode; // HANDSHAKE or PREPARE_FOR_REBUILD
hdr.io_seq = hdrp->io_seq;
hdr.len = sizeof (mgmt_ack);
hdr.status = ZVOL_OP_STATUS_OK;
bzero(out_hdr, sizeof (*out_hdr));
out_hdr->version = REPLICA_VERSION;
out_hdr->opcode = in_hdr->opcode; // HANDSHAKE or PREPARE_FOR_REBUILD
out_hdr->io_seq = in_hdr->io_seq;
out_hdr->len = sizeof (*mgmt_ack);
out_hdr->status = ZVOL_OP_STATUS_OK;

zinfo->stored_healthy_ionum = zinfo->checkpointed_ionum;
zinfo->running_ionum = zinfo->degraded_checkpointed_ionum;
LOG_INFO("IO sequence number:%lu Degraded IO sequence number:%lu",
zinfo->checkpointed_ionum, zinfo->degraded_checkpointed_ionum);

hdr.checkpointed_io_seq = zinfo->checkpointed_ionum;
hdr.checkpointed_degraded_io_seq = zinfo->degraded_checkpointed_ionum;
mgmt_ack->checkpointed_io_seq = zinfo->checkpointed_ionum;
mgmt_ack->checkpointed_degraded_io_seq =
zinfo->degraded_checkpointed_ionum;

return (0);
}

/*
 * Reply to a handshake-type management request (the reply header echoes the
 * request opcode, i.e. HANDSHAKE or PREPARE_FOR_REBUILD).
 *
 * The replica's details are collected by uzfs_zvol_mgmt_get_handshake_info()
 * into a mgmt_ack_t and sent back so that the ISTGT controller can open a
 * connection for IOs. If collecting them fails, a data-less reply carrying
 * ZVOL_OP_STATUS_FAILED is sent instead.
 *
 * Returns the result of reply_data() or reply_nodata().
 */
static int
uzfs_zvol_mgmt_do_handshake(uzfs_mgmt_conn_t *conn, zvol_io_hdr_t *hdrp,
    const char *name, zvol_info_t *zinfo)
{
	zvol_io_hdr_t reply_hdr;
	mgmt_ack_t ack;
	int info_rc;

	info_rc = uzfs_zvol_mgmt_get_handshake_info(hdrp, name, zinfo,
	    &reply_hdr, &ack);

	/* Success: ship the filled-in header together with the ack payload. */
	if (info_rc == 0)
		return (reply_data(conn, &reply_hdr, &ack, sizeof (ack)));

	/* Failure: answer with the request's opcode and io_seq, no payload. */
	return (reply_nodata(conn, ZVOL_OP_STATUS_FAILED, hdrp->opcode,
	    hdrp->io_seq));
}

Expand Down Expand Up @@ -1151,26 +1159,20 @@ handle_start_rebuild_req(uzfs_mgmt_conn_t *conn, zvol_io_hdr_t *hdrp,
if (uzfs_zvol_get_rebuild_status(zinfo->main_zv) !=
ZVOL_REBUILDING_INIT) {
mutex_exit(&zinfo->main_zv->rebuild_mtx);
uzfs_zinfo_drop_refcnt(zinfo);
LOG_ERR("rebuilding failed for %s due to improper rebuild "
"status", zinfo->name);
uzfs_zinfo_drop_refcnt(zinfo);
rc = reply_nodata(conn, ZVOL_OP_STATUS_FAILED,
hdrp->opcode, hdrp->io_seq);
goto end;
}

memset(&zinfo->main_zv->rebuild_info, 0,
sizeof (zvol_rebuild_info_t));
int rebuild_op_cnt = (payload_size / sizeof (mgmt_ack_t));
/* Track # of rebuilds we are initializing on replica */
zinfo->main_zv->rebuild_info.rebuild_cnt = rebuild_op_cnt;

/*
* Case where just one replica is being used by customer
*/
if ((strcmp(mack->volname, "")) == 0) {
zinfo->main_zv->rebuild_info.rebuild_cnt = 0;
zinfo->main_zv->rebuild_info.rebuild_done_cnt = 0;
memset(&zinfo->main_zv->rebuild_info, 0,
sizeof (zvol_rebuild_info_t));
/* Mark replica healthy now */
uzfs_zvol_set_rebuild_status(zinfo->main_zv,
ZVOL_REBUILDING_DONE);
Expand All @@ -1185,8 +1187,49 @@ handle_start_rebuild_req(uzfs_mgmt_conn_t *conn, zvol_io_hdr_t *hdrp,
hdrp->opcode, hdrp->io_seq);
goto end;
}
uzfs_zvol_set_rebuild_status(zinfo->main_zv,
ZVOL_REBUILDING_SNAP);

int rebuild_op_cnt = (payload_size / sizeof (mgmt_ack_t));
int loop_cnt;
uint64_t max_ioseq;
for (loop_cnt = 0, max_ioseq = 0, mack = payload;
loop_cnt < rebuild_op_cnt; loop_cnt++, mack++)
if (max_ioseq < mack->checkpointed_io_seq)
max_ioseq = mack->checkpointed_io_seq;
#if 0
mack = malloc(payload_size + sizeof (mgmt_ack_t));
memcpy(mack, payload, payload_size);
self_mack = (char *)mack + payload_size;
hdr1.opcode = ZVOL_OPCODE_PREPARE_FOR_REBUILD;
hdr1.io_seq = 0;

uzfs_zvol_mgmt_get_handshake_info(&hdr1, name, zinfo, &hdr2, self_mack);
#endif
if ((zinfo->checkpointed_ionum < max_ioseq) &&
(rebuild_op_cnt != 1)) {
mutex_exit(&zinfo->main_zv->rebuild_mtx);
LOG_ERR("rebuilding failed for %s due to rebuild_op_cnt"
"(%d) is not one when checkpointed num (%lu) is "
"less than max_ioseq(%lu)", zinfo->name,
rebuild_op_cnt, zinfo->checkpointed_ionum,
max_ioseq);
uzfs_zinfo_drop_refcnt(zinfo);
rc = reply_nodata(conn, ZVOL_OP_STATUS_FAILED,
hdrp->opcode, hdrp->io_seq);
goto end;
}

memset(&zinfo->main_zv->rebuild_info, 0,
sizeof (zvol_rebuild_info_t));
if (zinfo->checkpointed_ionum >= max_ioseq)
uzfs_zvol_set_rebuild_status(zinfo->main_zv,
ZVOL_REBUILDING_AFS);
else
uzfs_zvol_set_rebuild_status(zinfo->main_zv,
ZVOL_REBUILDING_SNAP);

/* Track # of rebuilds we are initializing on replica */
zinfo->main_zv->rebuild_info.rebuild_cnt = rebuild_op_cnt;

mutex_exit(&zinfo->main_zv->rebuild_mtx);

DBGCONN(conn, "Rebuild start command");
Expand Down
Loading

0 comments on commit f95b41e

Please sign in to comment.