From 9b3f6b5df91353dfc5a7c4572eff646a9c034e62 Mon Sep 17 00:00:00 2001 From: mayank Date: Mon, 26 Mar 2018 21:32:49 +0530 Subject: [PATCH 1/4] - API to get modified data from metadata - Handling of rebuild IO from other replica according to metadata - Removing code based on txg number Signed-off-by: mayank --- .travis.yml | 2 - cmd/uzfs_test/Makefile.am | 4 +- cmd/uzfs_test/uzfs_test.c | 3 - ...fs_rebuilding.c => uzfs_test_rebuilding.c} | 188 +++---- cmd/uzfs_test/uzfs_txg_diff.c | 362 ------------- cmd/zrepl/zrepl.c | 2 +- include/Makefile.am | 2 +- include/sys/uzfs_zvol.h | 7 +- include/uzfs_io.h | 5 + include/uzfs_mtree.h | 67 --- include/uzfs_rebuilding.h | 48 ++ include/uzfs_test.h | 2 - include/zrepl_mgmt.h | 7 +- lib/libzpool/Makefile.am | 2 +- lib/libzpool/uzfs_io.c | 67 ++- lib/libzpool/uzfs_mgmt.c | 5 - lib/libzpool/uzfs_mtree.c | 478 ------------------ lib/libzpool/uzfs_rebuilding.c | 260 ++++++++++ lib/libzpool/zrepl_mgmt.c | 4 +- tests/cbtest/script/test_uzfs.sh | 46 +- 20 files changed, 462 insertions(+), 1099 deletions(-) rename cmd/uzfs_test/{uzfs_rebuilding.c => uzfs_test_rebuilding.c} (80%) delete mode 100644 cmd/uzfs_test/uzfs_txg_diff.c delete mode 100644 include/uzfs_mtree.h create mode 100644 include/uzfs_rebuilding.h delete mode 100644 lib/libzpool/uzfs_mtree.c create mode 100644 lib/libzpool/uzfs_rebuilding.c diff --git a/.travis.yml b/.travis.yml index 796786b3214b..540ed22a0d16 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,8 +14,6 @@ env: ZFS_BUILD_TAGS=0 - ZFS_TEST_TAGS=rebuild_test ZFS_BUILD_TAGS=0 - - ZFS_TEST_TAGS=txg_diff_test - ZFS_BUILD_TAGS=0 - ZFS_TEST_TAGS=fio_test ZFS_BUILD_TAGS=0 - ZFS_TEST_TAGS=zrepl_test diff --git a/cmd/uzfs_test/Makefile.am b/cmd/uzfs_test/Makefile.am index ff8682e9bb16..049614b3efd4 100644 --- a/cmd/uzfs_test/Makefile.am +++ b/cmd/uzfs_test/Makefile.am @@ -15,10 +15,8 @@ uzfs_test_SOURCES = \ uzfs_test.c \ uzfs_test_sync.c \ uzfs_zvol_zap.c \ - uzfs_txg_diff.c \ zrepl_utest.c \ - uzfs_rebuilding.c - + 
uzfs_test_rebuilding.c uzfs_test_LDADD = \ $(top_builddir)/lib/libnvpair/libnvpair.la \ diff --git a/cmd/uzfs_test/uzfs_test.c b/cmd/uzfs_test/uzfs_test.c index b97f20041e40..88dc28a3c4e4 100644 --- a/cmd/uzfs_test/uzfs_test.c +++ b/cmd/uzfs_test/uzfs_test.c @@ -50,9 +50,6 @@ uzfs_test_info_t uzfs_tests[] = { { uzfs_zvol_zap_operation, "uzfs zap operation test" }, { replay_fn, "zvol replay test" }, { unit_test_fn, "zvol read/write verification test"}, - { uzfs_txg_diff_verifcation_test, - "test to verify modified blocks between two txg for zvol" }, - { uzfs_txg_diff_tree_test, "txg_diff_tree functionality test" }, { uzfs_rebuild_test, "uzfs rebuild pool test"}, { zrepl_utest, "ZFS replication test" }, { uzfs_test_get_metablk_details, "Tests offset,len calculations of"\ diff --git a/cmd/uzfs_test/uzfs_rebuilding.c b/cmd/uzfs_test/uzfs_test_rebuilding.c similarity index 80% rename from cmd/uzfs_test/uzfs_rebuilding.c rename to cmd/uzfs_test/uzfs_test_rebuilding.c index 89962df096bb..7c183bbcae94 100644 --- a/cmd/uzfs_test/uzfs_rebuilding.c +++ b/cmd/uzfs_test/uzfs_test_rebuilding.c @@ -20,19 +20,20 @@ */ #include -#include -#include +#include #include -#include #include -#include #include +#include +#include +#include #include extern void make_vdev(char *path); extern void populate_string(char *buf, uint64_t size); -void *spa1, *spa2, *zvol1, *zvol2; +spa_t *spa1, *spa2; +zvol_state_t *zvol1, *zvol2; uint32_t f_index = 0; #define POOL_NAME "testpool" @@ -42,6 +43,7 @@ uint32_t f_index = 0; struct rebuilding_info { zvol_state_t *to_zvol; zvol_state_t *from_zvol; + uint64_t base_io_num; kmutex_t mtx; kcondvar_t cv; int active; @@ -58,8 +60,7 @@ typedef struct uzfs_rebuild_data { struct rebuilding_data { uzfs_rebuild_data_t *r_data; zvol_state_t *zvol; - uint64_t start_txg; - uint64_t end_txg; + uint64_t base_io; }; struct replica_read_data { @@ -101,6 +102,8 @@ replica_reader_thread(void *arg) uint64_t block_size = warg->io_block_size; uint64_t len1 = 0, len2 = 0; 
uint64_t mismatch_count = 0; + uint64_t mdlen1, mdlen2; + void *md1, *md2; for (j = 0; j < 15; j++) { buf1[j] = (char *)umem_alloc(sizeof (char)*(j+1)* block_size, @@ -117,6 +120,7 @@ replica_reader_thread(void *arg) while (1) { idx = uzfs_random(15); + idx = 0; len1 = 0; len2 = 0; @@ -128,13 +132,13 @@ replica_reader_thread(void *arg) len = end - offset; err = uzfs_read_data(zvol1, buf1[idx], offset, - len, NULL, NULL); + len, &md1, &mdlen1); if (err != 0) printf("IO error at offset: %lu len: %lu\n", offset, len); err = uzfs_read_data(zvol2, buf2[idx], offset, - len, NULL, NULL); + len, &md2, &mdlen2); if (err != 0) printf("IO error at offset: %lu len: %lu\n", offset, len); @@ -142,9 +146,15 @@ replica_reader_thread(void *arg) uint64_t mismatch; mismatch = verify_replica_data(buf1[idx], buf2[idx], len); mismatch_count += mismatch; - if (mismatch) + if (mdlen1 != mdlen2) { + printf("this should not happen\n"); + exit(3); + } + + if (mismatch) { printf("verification error at %lu, mismatch:%lu\n", offset, mismatch); + } iops += (idx + 1); offset += len; @@ -170,8 +180,8 @@ replica_reader_thread(void *arg) } static int -uzfs_test_txg_diff_traverse_cb(off_t offset, size_t len, - uint64_t blkid, void *arg) +uzfs_test_meta_diff_traverse_cb(off_t offset, size_t len, + blk_metadata_t *md, void *arg) { uzfs_rebuild_data_t *r_data = (uzfs_rebuild_data_t *)arg; uzfs_io_chunk_list_t *io; @@ -180,6 +190,7 @@ uzfs_test_txg_diff_traverse_cb(off_t offset, size_t len, io = umem_alloc(sizeof (*io), UMEM_NOFAIL); io->offset = offset; io->len = len; + io->io_number = md->io_num; io->buf = umem_alloc(len, UMEM_NOFAIL); err = uzfs_read_data(r_data->zvol, io->buf, offset, len, @@ -205,14 +216,18 @@ fetch_modified_data(void *arg) struct rebuilding_data *repl_data = arg; uzfs_rebuild_data_t *r_data = repl_data->r_data; int err; + blk_metadata_t io_number; printf("fetching modified data\n"); + io_number.io_num = repl_data->base_io; - err = uzfs_get_txg_diff(repl_data->zvol, 
repl_data->start_txg, - repl_data->end_txg, uzfs_test_txg_diff_traverse_cb, r_data); + err = uzfs_get_io_diff(repl_data->zvol, &io_number, + uzfs_test_meta_diff_traverse_cb, r_data); - if (err) + if (err) { printf("error(%d)... while fetching modified data\n", err); + exit(1); + } printf("finished fetching modified data\n"); @@ -228,7 +243,6 @@ rebuild_replica_thread(void *arg) { struct rebuilding_info *r_info = arg; r_info->active = B_TRUE; - uint64_t start_txg, end_txg; zvol_state_t *from_zvol = r_info->from_zvol; zvol_state_t *to_zvol = r_info->to_zvol; list_t *io_list; @@ -238,15 +252,13 @@ rebuild_replica_thread(void *arg) uzfs_io_chunk_list_t *node = NULL; int err = 0; uint64_t diff_data = 0; - - start_txg = 0; - txg_wait_synced(spa_get_dsl(to_zvol->zv_spa), 0); + uint64_t latest_io; uzfs_zvol_set_rebuild_status(to_zvol, ZVOL_REBUILDING_INIT); - txg_wait_synced(spa_get_dsl(from_zvol->zv_spa), 0); - end_txg = spa_last_synced_txg(from_zvol->zv_spa); - + uzfs_zvol_get_last_committed_io_no(from_zvol, &latest_io); + printf("io number... 
healthy replica:%lu degraded replica:%lu\n", + latest_io, r_info->base_io_num); uzfs_zvol_set_rebuild_status(to_zvol, ZVOL_REBUILDING_IN_PROGRESS); mutex_enter(&r_info->mtx); @@ -263,19 +275,18 @@ rebuild_replica_thread(void *arg) mutex_init(&r_data.mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&r_data.cv, NULL, CV_DEFAULT, NULL); - repl_data.start_txg = start_txg; - repl_data.end_txg = end_txg; repl_data.r_data = &r_data; + repl_data.base_io = r_info->base_io_num; repl_data.zvol = from_zvol; - printf("difference : start(%lu) end(%lu)\n", start_txg, end_txg); - tid = zk_thread_create(NULL, 0, (thread_func_t)fetch_modified_data, &repl_data, 0, NULL, TS_RUN, 0, PTHREAD_CREATE_DETACHED); mutex_enter(&r_data.mtx); while (!r_data.done || (node = list_remove_head(io_list)) != NULL) { + blk_metadata_t temp_metadata; + if (!node) { mutex_exit(&r_data.mtx); // sleep for some time here @@ -285,8 +296,10 @@ rebuild_replica_thread(void *arg) } mutex_exit(&r_data.mtx); + temp_metadata.io_num = node->io_number; + err = uzfs_write_data(to_zvol, node->buf, node->offset, - node->len, NULL, B_TRUE); + node->len, &temp_metadata, B_TRUE); if (err) { printf("IO error during rebuilding offset:%lu," "len:%lu\n", node->offset, node->len); @@ -310,7 +323,7 @@ rebuild_replica_thread(void *arg) uzfs_zvol_set_rebuild_status(to_zvol, ZVOL_REBUILDING_DONE); /* - * We finished rebuilding data in degraded zvol so set status to + * Degraded replica has finished rebuilding.. 
setting status to * ZVOL_STATUS_HEALTHY */ uzfs_zvol_set_status(to_zvol, ZVOL_STATUS_HEALTHY); @@ -343,6 +356,7 @@ replica_writer_thread(void *arg) struct rebuilding_info rebuild_info; kthread_t *rebuilding_thread = NULL; uint64_t mismatch_count = 0; + uint64_t last_io_num = 0; for (j = 0; j < 15; j++) buf[j] = (char *)umem_alloc(sizeof (char)*(j+1)*block_size, @@ -385,14 +399,24 @@ replica_writer_thread(void *arg) populate_string(buf[idx], (idx + 1) * block_size); err = uzfs_write_data(zvol1, buf[idx], offset, - (idx + 1) * block_size, NULL, B_FALSE); + (idx + 1) * block_size, &io_num, B_FALSE); if (err != 0) printf("IO error at offset: %lu len: %lu\n", offset, (idx + 1) * block_size); + /* + * update ZAP entries for io_number frequently. + */ + if (!(io_num % 30)) { + uzfs_zvol_store_last_committed_io_no(zvol1, io_num); + if (replica_active) + uzfs_zvol_store_last_committed_io_no(zvol2, + io_num); + } + if (replica_active) { err = uzfs_write_data(zvol2, buf[idx], offset, - (idx + 1) * block_size, NULL, B_FALSE); + (idx + 1) * block_size, &io_num, B_FALSE); if (err != 0) printf("IO error at offset: %lu len: %lu\n", offset, (idx + 1) * block_size); @@ -409,7 +433,19 @@ replica_writer_thread(void *arg) replica_active = B_FALSE; } else if (now > replica_start_time && !replica_active) { uzfs_zvol_set_status(zvol2, ZVOL_STATUS_DEGRADED); + replica_active = B_TRUE; + printf("other replica missed %lu bytes during " + "downtime\n", mismatch_count); + + /* + * For testing purpose, we will copy last committed + * io_number from degraded replica to some variable + * and continue to update last_committed_io_number in + * degraded replica. + */ + uzfs_zvol_get_last_committed_io_no(zvol2, &last_io_num); + rebuild_info.base_io_num = last_io_num; } else if (now > replica_rebuild_start_time && !rebuilding_started) { mutex_enter(&rebuild_info.mtx); @@ -436,9 +472,6 @@ replica_writer_thread(void *arg) if (silent == 0) printf("Stopping write.. 
ios done: %lu\n", iops); - printf("other replica missed %lu bytes during downtime\n", - mismatch_count); - mutex_enter(&rebuild_info.mtx); while (rebuild_info.active) cv_wait(&rebuild_info.cv, &rebuild_info.mtx); @@ -456,7 +489,7 @@ replica_writer_thread(void *arg) } void -create_and_init_pool(void **spa, void **zvol) +create_and_init_pool(spa_t **spa, zvol_state_t **zvol) { int err; char *pool_name, *file_name, *zvol_name; @@ -494,84 +527,6 @@ create_and_init_pool(void **spa, void **zvol) strfree(zvol_name); } -void -verify_overlaps(void *zv, uint64_t offset, uint64_t len, int num_args, ...) -{ - list_t *list, check_list; - va_list arg_list; - int count, num_chunks; - uzfs_io_chunk_list_t *node1, *node2; - uzfs_io_chunk_list_t *chunk_io; - int i; - - num_chunks = num_args / 2; - count = uzfs_search_incoming_io_tree(zv, offset, len, (void **)&list); - - if (count != num_chunks) { - printf("argument mismatch\n"); - exit(1); - } - - list_create(&check_list, sizeof (uzfs_io_chunk_list_t), - offsetof(uzfs_io_chunk_list_t, link)); - - va_start(arg_list, num_args); - for (i = 0; i < num_args; i += 2) { - chunk_io = umem_alloc(sizeof (*chunk_io), UMEM_NOFAIL); - chunk_io->offset = va_arg(arg_list, int); - chunk_io->len = va_arg(arg_list, int); - list_insert_tail(&check_list, chunk_io); - } - - va_end(arg_list); - - while ((node1 = list_remove_head(list))) { - for (node2 = list_head(&check_list); node2 != NULL; - node2 = list_next(&check_list, node2)) { - if (node2->offset == node1->offset && - node2->len == node1->len) { - list_remove(&check_list, node2); - umem_free(node2, sizeof (*node2)); - break; - } - } - umem_free(node1, sizeof (*node1)); - } - - VERIFY(list_is_empty(list)); - VERIFY(list_is_empty(&check_list)); - - list_destroy(list); - list_destroy(&check_list); - umem_free(list, sizeof (*list)); -} - -void -uzfs_test_check_overlaps(void *zv) -{ - uzfs_add_to_incoming_io_tree(zv, 100, 150); - uzfs_add_to_incoming_io_tree(zv, 300, 450); - 
uzfs_add_to_incoming_io_tree(zv, 800, 100); - - // offset == entry offset - verify_overlaps(zv, 100, 120, 0); - verify_overlaps(zv, 100, 150, 0); - verify_overlaps(zv, 100, 160, 2, 250, 10); - verify_overlaps(zv, 100, 800, 4, 250, 50, 750, 50); - - // offset > entry offset - verify_overlaps(zv, 120, 130, 0); - verify_overlaps(zv, 120, 100, 0); - verify_overlaps(zv, 120, 150, 2, 250, 20); - verify_overlaps(zv, 120, 300, 2, 250, 50); - - // offset < entry offset - verify_overlaps(zv, 50, 60, 2, 50, 50); - verify_overlaps(zv, 50, 100, 2, 50, 50); - verify_overlaps(zv, 50, 220, 4, 50, 50, 250, 20); - verify_overlaps(zv, 50, 1000, 8, 50, 50, 250, 50, 750, 50, 900, 150); -} - void uzfs_rebuild_test(void *arg) { @@ -582,7 +537,7 @@ uzfs_rebuild_test(void *arg) kthread_t *writer, *reader; worker_args_t writer_args, **reader_args; int reader_count; - int n = 0, check = 0; + int n = 0; printf("starting %s\n", test_info->name); @@ -662,13 +617,6 @@ uzfs_rebuild_test(void *arg) cv_destroy(&cv); mutex_destroy(&mtx); - /* - * This is a small test only to verify most possibilities - * of overlapping IO - */ - if (!check++) - uzfs_test_check_overlaps(zvol1); - uzfs_close_dataset(zvol1); uzfs_close_dataset(zvol2); uzfs_close_pool(spa1); diff --git a/cmd/uzfs_test/uzfs_txg_diff.c b/cmd/uzfs_test/uzfs_txg_diff.c deleted file mode 100644 index e782af063091..000000000000 --- a/cmd/uzfs_test/uzfs_txg_diff.c +++ /dev/null @@ -1,362 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. 
- * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -extern void populate_data(char *buf, uint64_t offset, int idx, - uint64_t block_size); -static int uzfs_test_txg_diff_traverse_cb(off_t offset, size_t len, - uint64_t blkid, void *arg); -static int del_from_txg_diff_tree(avl_tree_t *tree, uint64_t b_offset, - uint64_t b_len); -static int uzfs_search_txg_diff_tree(avl_tree_t *tree, uint64_t offset, - uint64_t *len); - -/* - * Delete entry (offset, len) from tree. - * Note : As of now, this API is used in testing code only. To use it in - * library, move this API to library code. - */ -int -del_from_txg_diff_tree(avl_tree_t *tree, uint64_t offset, uint64_t len) -{ - uint64_t new_offset, new_len, b_end, b_offset, b_len; - uint64_t entry_len, entry_offset; - uzfs_zvol_blk_phy_t *entry, *new_entry, *b_entry; - uzfs_zvol_blk_phy_t f_entry; - avl_index_t where; - int err = 0; - - new_offset = offset; - new_len = len; - - f_entry.offset = new_offset; - f_entry.len = new_len; - entry = avl_find(tree, &f_entry, &where); - - // found entry whose offset matches with f_entry's offset - if (entry != NULL) { - // entry's len doesn't match with f_entry's len - if (entry->len < new_len) { - err = -1; - goto done; - } - - /* - * entry's length is not lesser than f_entry. - * If entry's len is greater than f_entry, then - * update entry's offset to (f_entry's end) and len to - * entry's len - f_entry's len. 
- */ - entry_offset = entry->offset; - entry_len = entry->len; - avl_remove(tree, entry); - umem_free(entry, sizeof (*entry)); - - if (entry_len > new_len) { - new_entry = umem_alloc(sizeof (uzfs_zvol_blk_phy_t), - UMEM_NOFAIL); - new_entry->offset = entry_offset + new_len; - new_entry->len = (entry_len - new_len); - avl_add(tree, new_entry); - } - goto done; - } - - /* - * Search for nearest entry whose offset is lesser than - * f_entry's offset - */ - b_entry = avl_nearest(tree, where, AVL_BEFORE); - if (b_entry) { - b_end = (b_entry->offset + b_entry->len); - - // b_entry ends before f_entry ends - if (b_end < (new_offset + new_len)) { - err = -1; - goto done; - } - - b_offset = b_entry->offset; - b_len = b_entry->len; - avl_remove(tree, b_entry); - - umem_free(b_entry, sizeof (*b_entry)); - - new_entry = umem_alloc(sizeof (uzfs_zvol_blk_phy_t), - UMEM_NOFAIL); - new_entry->offset = b_offset; - new_entry->len = (new_offset - b_offset); - avl_add(tree, new_entry); - - if (b_end > (new_offset + new_len)) { - new_entry = umem_alloc(sizeof (uzfs_zvol_blk_phy_t), - UMEM_NOFAIL); - new_entry->offset = new_offset + new_len; - new_entry->len = (b_end - new_entry->offset); - avl_add(tree, new_entry); - } - goto done; - } else { - err = -1; - goto done; - } - -done: - return (err); -} - -int -uzfs_search_txg_diff_tree(avl_tree_t *tree, uint64_t offset, uint64_t *len) -{ - uzfs_zvol_blk_phy_t tofind; - avl_index_t where; - uzfs_zvol_blk_phy_t *entry; - - tofind.offset = offset; - tofind.len = 0; - - entry = avl_find(tree, &tofind, &where); - if (entry == NULL) - return (0); - - *len = entry->len; - return (1); -} - -static int -uzfs_test_txg_diff_traverse_cb(off_t offset, size_t len, uint64_t blkid, - void *arg) -{ - avl_tree_t *tree = (avl_tree_t *)arg; - add_to_txg_diff_tree(tree, offset, len); - return (0); -} - -void -uzfs_txg_diff_verifcation_test(void *arg) -{ - uzfs_test_info_t *test_info = (uzfs_test_info_t *)arg; - avl_tree_t *modified_block_tree; - uint64_t 
first_txg, last_txg; - hrtime_t end, now; - uint64_t blk_offset, offset, vol_blocks; - uint64_t io_num = 0; - spa_t *spa; - zvol_state_t *zvol; - void *cookie = NULL; - char *buf; - int max_io, count, i = 0; - avl_tree_t *write_io_tree; - avl_index_t where; - uzfs_zvol_blk_phy_t *blk_info, temp_blk_info; - - setup_unit_test(); - unit_test_create_pool_ds(); - open_pool(&spa); - open_ds(spa, &zvol); - - vol_blocks = active_size / block_size; - buf = umem_alloc(block_size, UMEM_NOFAIL); - - uzfs_create_txg_diff_tree((void **)&write_io_tree); - uzfs_create_txg_diff_tree((void **)&modified_block_tree); - - now = gethrtime(); - end = now + (hrtime_t)(total_time_in_sec * (hrtime_t)(NANOSEC)); - - while (i++ < 5) { - count = 0; - cookie = NULL; - // Here, consider test_iterations as number of ios - max_io = test_iterations; - io_num = 0; - - txg_wait_synced(spa_get_dsl(spa), 0); - first_txg = spa_last_synced_txg(spa); - - while (count++ < max_io) { - where = 0; - blk_offset = uzfs_random(vol_blocks - 16); - /* - * make sure offset is aligned to block size - */ - offset = blk_offset * block_size; - - temp_blk_info.offset = offset; - if (avl_find(write_io_tree, &temp_blk_info, &where)) - continue; - - populate_data(buf, offset, 0, block_size); - - if (uzfs_write_data(zvol, buf, offset, uzfs_random(1) ? 
- block_size : io_block_size, &io_num, B_FALSE)) - printf("IO error at offset: %lu len: %lu\n", - offset, block_size); - - blk_info = umem_alloc(sizeof (uzfs_zvol_blk_phy_t), - UMEM_NOFAIL); - blk_info->offset = offset; - blk_info->len = block_size; - avl_insert(write_io_tree, blk_info, where); - blk_info = NULL; - io_num++; - } - - txg_wait_synced(spa_get_dsl(spa), 0); - last_txg = spa_last_synced_txg(spa); - - uzfs_get_txg_diff(zvol, first_txg, last_txg, - uzfs_test_txg_diff_traverse_cb, - (void *)modified_block_tree); - - while ((blk_info = avl_destroy_nodes(write_io_tree, - &cookie)) != NULL) { - VERIFY0(del_from_txg_diff_tree(modified_block_tree, - blk_info->offset, blk_info->len)); - umem_free(blk_info, sizeof (uzfs_zvol_blk_phy_t)); - } - - VERIFY0(avl_numnodes(modified_block_tree)); - VERIFY0(avl_numnodes(write_io_tree)); - printf("%s : pass:%d\n", test_info->name, i); - } - - uzfs_close_dataset(zvol); - uzfs_close_pool(spa); - uzfs_destroy_txg_diff_tree(write_io_tree); - uzfs_destroy_txg_diff_tree(modified_block_tree); - umem_free(buf, block_size); -} - -static void -check_tree(avl_tree_t *tree, uint64_t offset, uint64_t len, uint64_t exp_off, - uint64_t exp_len, int exp_ret) -{ - int ret; - uint64_t len1 = 0; - - ret = uzfs_search_txg_diff_tree(tree, exp_off, &len1); - - VERIFY(ret == exp_ret); - - if (ret) - VERIFY(exp_len == len1); -} - -static void -add_and_check_tree(avl_tree_t *tree, uint64_t offset, uint64_t len, - uint64_t exp_off, uint64_t exp_len, int exp_ret) -{ - add_to_txg_diff_tree(tree, offset, len); - check_tree(tree, offset, len, exp_off, exp_len, exp_ret); -} - -static void -delete_and_check_tree(avl_tree_t *tree, uint64_t offset, uint64_t len, - uint64_t exp_off, uint64_t exp_len, int exp_ret) -{ - del_from_txg_diff_tree(tree, offset, len); - check_tree(tree, offset, len, exp_off, exp_len, exp_ret); -} - -void -uzfs_txg_diff_tree_test(void *arg) -{ - uzfs_test_info_t *test_info = (uzfs_test_info_t *)arg; - avl_tree_t *tree; - uint64_t 
blksz = io_block_size; - - uzfs_create_txg_diff_tree((void **)&tree); - - add_and_check_tree(tree, 100, 50, 100, 50, 1); - add_and_check_tree(tree, 150, 50, 100, 100, 1); - add_and_check_tree(tree, 5 * blksz, blksz, 5 * blksz, blksz, 1); - add_and_check_tree(tree, 4 * blksz, blksz, 4 * blksz, - 2 * blksz, 1); - check_tree(tree, 4 * blksz, blksz, 5 * blksz, blksz, 0); - add_and_check_tree(tree, 2 * blksz, blksz, 2 * blksz, blksz, 1); - check_tree(tree, 2 * blksz, blksz, 4 * blksz, 2 * blksz, 1); - check_tree(tree, 2 * blksz, blksz, 5 * blksz, blksz, 0); - add_and_check_tree(tree, 2 * blksz, blksz, 2 * blksz, blksz, 1); - check_tree(tree, 2 * blksz, blksz, 4 * blksz, 2 * blksz, 1); - check_tree(tree, 2 * blksz, blksz, 5 * blksz, blksz, 0); - add_and_check_tree(tree, blksz, blksz, blksz, 2 * blksz, 1); - check_tree(tree, blksz, blksz, 2 * blksz, blksz, 0); - check_tree(tree, blksz, blksz, 4 * blksz, 2 * blksz, 1); - check_tree(tree, blksz, blksz, 5 * blksz, blksz, 0); - add_and_check_tree(tree, 3 * blksz, blksz, blksz, 5 * blksz, 1); - check_tree(tree, 3 * blksz, blksz, 2 * blksz, blksz, 0); - check_tree(tree, 3 * blksz, blksz, 3 * blksz, blksz, 0); - check_tree(tree, 3 * blksz, blksz, 4 * blksz, 2 * blksz, 0); - check_tree(tree, 3 * blksz, blksz, 5 * blksz, blksz, 0); - add_and_check_tree(tree, blksz, blksz, blksz, 5 * blksz, 1); - check_tree(tree, blksz, blksz, 2 * blksz, blksz, 0); - check_tree(tree, blksz, blksz, 3 * blksz, blksz, 0); - check_tree(tree, blksz, blksz, 4 * blksz, 2 * blksz, 0); - check_tree(tree, blksz, blksz, 5 * blksz, blksz, 0); - add_and_check_tree(tree, 3 * blksz, blksz, blksz, 5 * blksz, 1); - check_tree(tree, 3 * blksz, blksz, 2 * blksz, blksz, 0); - check_tree(tree, 3 * blksz, blksz, 3 * blksz, blksz, 0); - check_tree(tree, 3 * blksz, blksz, 4 * blksz, 2 * blksz, 0); - check_tree(tree, 3 * blksz, blksz, 5 * blksz, blksz, 0); - delete_and_check_tree(tree, blksz, blksz, 2 * blksz, - 4 * blksz, 1); - check_tree(tree, blksz, blksz, blksz, blksz, 
0); - check_tree(tree, blksz, blksz, 3 * blksz, blksz, 0); - check_tree(tree, blksz, blksz, 4 * blksz, 2 * blksz, 0); - check_tree(tree, blksz, blksz, 5 * blksz, blksz, 0); - delete_and_check_tree(tree, 2 * blksz, blksz, 3 * blksz, - 3 * blksz, 1); - check_tree(tree, 2 * blksz, blksz, blksz, blksz, 0); - check_tree(tree, 2 * blksz, blksz, 2 * blksz, blksz, 0); - check_tree(tree, 2 * blksz, blksz, 4 * blksz, 2 * blksz, 0); - check_tree(tree, 2 * blksz, blksz, 5 * blksz, blksz, 0); - delete_and_check_tree(tree, 110, 10, 100, 10, 1); - check_tree(tree, 120, 30, 120, 80, 1); - add_and_check_tree(tree, 60, 20, 50, 0, 0); - add_and_check_tree(tree, 60, 20, 60, 20, 1); - delete_and_check_tree(tree, 60, 10, 70, 10, 1); - add_and_check_tree(tree, 80, 20, 70, 40, 1); - add_and_check_tree(tree, 80, 20, 90, 0, 0); - add_and_check_tree(tree, 80, 20, 100, 0, 0); - add_and_check_tree(tree, 200, 50, 200, 0, 0); - add_and_check_tree(tree, 200, 50, 70, 40, 1); - delete_and_check_tree(tree, 230, 20, 200, 0, 0); - add_and_check_tree(tree, 40, 45, 40, 70, 1); - add_and_check_tree(tree, 40, 45, 70, 40, 0); - add_and_check_tree(tree, 150, 50, 120, 110, 1); - add_and_check_tree(tree, 130, 140, 120, 150, 1); - add_and_check_tree(tree, 30, 270, 30, 270, 1); - add_and_check_tree(tree, 30, 270, 40, 0, 0); - add_and_check_tree(tree, 130, 140, 130, 140, 0); - add_and_check_tree(tree, 130, 140, 120, 150, 0); - - uzfs_destroy_txg_diff_tree((void *)tree); - printf("%s pass\n", test_info->name); -} diff --git a/cmd/zrepl/zrepl.c b/cmd/zrepl/zrepl.c index e1dad8586a58..996640bd91ed 100644 --- a/cmd/zrepl/zrepl.c +++ b/cmd/zrepl/zrepl.c @@ -509,7 +509,7 @@ uzfs_zvol_mgmt_do_handshake(zvol_io_hdr_t *hdr, int sfd, char *name) } if (zinfo != NULL) { - uzfs_zvol_get_last_committed_io_no(zinfo, + uzfs_zvol_get_last_committed_io_no(zinfo->zv, &hdr->checkpointed_io_seq); uzfs_zinfo_drop_refcnt(zinfo, B_FALSE); } diff --git a/include/Makefile.am b/include/Makefile.am index c90345685657..4113ba623adc 100644 
--- a/include/Makefile.am +++ b/include/Makefile.am @@ -36,7 +36,7 @@ USER_H = \ $(top_srcdir)/include/uzfs_zap.h \ $(top_srcdir)/include/uzfs.h \ $(top_srcdir)/include/uzfs_task.h \ - $(top_srcdir)/include/uzfs_mtree.h + $(top_srcdir)/include/uzfs_rebuilding.h EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) diff --git a/include/sys/uzfs_zvol.h b/include/sys/uzfs_zvol.h index 670da473db28..9d761dd58772 100644 --- a/include/sys/uzfs_zvol.h +++ b/include/sys/uzfs_zvol.h @@ -53,9 +53,7 @@ typedef struct metaobj_blk_offset { */ typedef struct zvol_rebuild_data { /* mutex to synchronize io tree operation */ - kmutex_t io_tree_mtx; uint64_t rebuild_bytes; - avl_tree_t *incoming_io_tree; /* incoming io tree */ } zvol_rebuild_data_t; /* @@ -149,11 +147,12 @@ typedef struct uzfs_zvol_blk_phy { typedef struct uzfs_io_chunk_list { uint64_t offset; uint64_t len; + uint64_t io_number; char *buf; list_node_t link; } uzfs_io_chunk_list_t; -typedef int (uzfs_txg_diff_traverse_cb_t)(off_t offset, size_t len, - uint64_t blkid, void *arg); +typedef int (uzfs_get_io_diff_cb_t)(off_t offset, size_t len, + void *io_number, void *arg); #endif #endif diff --git a/include/uzfs_io.h b/include/uzfs_io.h index 95ec4d631ec4..0ee5e4a399f6 100644 --- a/include/uzfs_io.h +++ b/include/uzfs_io.h @@ -56,4 +56,9 @@ extern int uzfs_zvol_get_rebuild_status(void *zv); extern void uzfs_zvol_set_status(void *zv, int status); extern int uzfs_zvol_get_status(void *zv); +/* + * API to read metadata + */ +extern int uzfs_read_metadata(void *zv, char *buf, uint64_t offset, + uint64_t len, uint64_t *r); #endif diff --git a/include/uzfs_mtree.h b/include/uzfs_mtree.h deleted file mode 100644 index 46c7ff00dbfb..000000000000 --- a/include/uzfs_mtree.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. 
- * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -#ifndef _UZFS_MTREE_H -#define _UZFS_MTREE_H - -/* - * API to get modified block details between start_txg and end_txg - * Note: Caller needs to pass a callback function which will be called - * for each modified block with (offset, length and blockId) - */ -extern int uzfs_get_txg_diff(void *zv, uint64_t start_txg, - uint64_t end_txg, void *func, void *arg); - -/* - * dump_txg_diff_tree will print all entries (offset:length) to stdout - */ -extern void dump_txg_diff_tree(void *tree); - -/* - * dump_io_incoming_tree will print all entries from incoming io tree - */ -extern void dump_io_incoming_tree(void *zv); - -/* - * uzfs_create_txg_diff_tree will create avl tree to store incoming io's - * during rebuilding - */ -extern void uzfs_create_txg_diff_tree(void **tree); -extern void uzfs_destroy_txg_diff_tree(void *tree); - -extern int add_to_txg_diff_tree(void *tree, uint64_t offset, uint64_t size); - -/* - * to add incoming io's details in io_tree - */ -extern void uzfs_add_to_incoming_io_tree(void *zv, uint64_t offset, - uint64_t len); - -/* - * API to search non-overlapping segment for rebuilding io - * It will create linked list with non-overlapping segment - * entries (i.e offset and length) - */ -extern int uzfs_search_incoming_io_tree(void *zv, uint64_t offset, - uint64_t len, void **list); - -extern int uzfs_txg_diff_tree_compare(const void *arg1, const 
void *arg2); -#endif diff --git a/include/uzfs_rebuilding.h b/include/uzfs_rebuilding.h new file mode 100644 index 000000000000..a93ef4238415 --- /dev/null +++ b/include/uzfs_rebuilding.h @@ -0,0 +1,48 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _UZFS_REBUILDING_H +#define _UZFS_REBUILDING_H + +/* + * API to compare metadata + * return : + * -1 : if first < second + * 0 : if first == second + * 1 : if first > second + */ +int compare_blk_metadata(void *first_md, void *second_md); + +/* + * API to access data whose metadata is higer than base_metadata + */ +int uzfs_get_io_diff(void *zv, void *base_metadata, void *cb_func, void *arg); + +/* + * uzfs_search_nonoverlapping_io will check on_disk metadata with w_metadata and + * will populate list with non-overlapping segment(offset,len). + * IO's will be compared by meta_vol_block_size. If on_disk metadata is greater + * than w_metadata then that part of IO's will be discarded else it will be + * added to list. 
+ */ +int uzfs_search_nonoverlapping_io(zvol_state_t *zv, uint64_t offset, + uint64_t len, void *w_metadata, void **list); +#endif diff --git a/include/uzfs_test.h b/include/uzfs_test.h index e52d4a3b05d3..1a08a0be6d20 100644 --- a/include/uzfs_test.h +++ b/include/uzfs_test.h @@ -71,8 +71,6 @@ typedef struct uzfs_test_info { void uzfs_zvol_zap_operation(void *arg); void unit_test_fn(void *arg); -void uzfs_txg_diff_tree_test(void *arg); -void uzfs_txg_diff_verifcation_test(void *arg); void zrepl_utest(void *arg); void uzfs_rebuild_test(void *arg); #endif diff --git a/include/zrepl_mgmt.h b/include/zrepl_mgmt.h index 2b5083dacb0f..995eed36b2a1 100644 --- a/include/zrepl_mgmt.h +++ b/include/zrepl_mgmt.h @@ -27,6 +27,7 @@ #include #include +#include #include "zrepl_prot.h" #ifdef __cplusplus @@ -56,7 +57,7 @@ typedef struct zvol_info_s { /* Logical Unit related fields */ zvol_info_state_t state; char name[MAXPATHLEN]; - void *zv; + zvol_state_t *zv; int refcnt; int is_io_ack_sender_created; uint64_t checkpointed_io_seq; @@ -105,8 +106,10 @@ extern void uzfs_zinfo_take_refcnt(zvol_info_t *zinfo, int locked); extern void uzfs_zinfo_replay_zil_all(void); extern int uzfs_zinfo_destroy(const char *ds_name); extern void uzfs_zinfo_update_io_seq_for_all_volumes(void); -extern void uzfs_zvol_get_last_committed_io_no(zvol_info_t *z, +void uzfs_zvol_get_last_committed_io_no(zvol_state_t *zv, uint64_t *io_seq); +void uzfs_zvol_store_last_committed_io_no(zvol_state_t *zv, + uint64_t io_seq); extern int create_and_bind(const char *port, int bind_needed); #define ZREPL_LOG(fmt, ...) 
syslog(LOG_NOTICE, \ diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 048ddc63b719..1d4f6a90eef9 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -21,7 +21,7 @@ USER_C = \ uzfs_io.c \ uzfs_task.c \ uzfs_mgmt.c \ - uzfs_mtree.c \ + uzfs_rebuilding.c \ uzfs_test_mgmt.c \ uzfs_zap.c \ vdev_disk_aio.c \ diff --git a/lib/libzpool/uzfs_io.c b/lib/libzpool/uzfs_io.c index d1c436f0acbc..1ebc017b2296 100644 --- a/lib/libzpool/uzfs_io.c +++ b/lib/libzpool/uzfs_io.c @@ -21,7 +21,7 @@ #include #include -#include +#include #define GET_NEXT_CHUNK(chunk_io, offset, len, end) \ do { \ @@ -99,15 +99,12 @@ uzfs_write_data(zvol_state_t *zv, char *buf, uint64_t offset, uint64_t len, rl = zfs_range_lock(&zv->zv_range_lock, offset, len, RL_WRITER); - if (!is_rebuild && (zv->zv_status & ZVOL_STATUS_DEGRADED)) - uzfs_add_to_incoming_io_tree(zv, offset, len); - - if (zv->zv_rebuild_status & ZVOL_REBUILDING_IN_PROGRESS) { - if (is_rebuild) { - count = uzfs_search_incoming_io_tree(zv, offset, - len, (void **)&chunk_io); - if (!count) - goto exit_with_error; + if ((zv->zv_status & ZVOL_REBUILDING_IN_PROGRESS) && + is_rebuild) { + count = uzfs_search_nonoverlapping_io(zv, offset, + len, metadata, (void **)&chunk_io); + if (!count) + goto exit_with_error; chunk_io: GET_NEXT_CHUNK(chunk_io, offset, len, end); wrote = offset - orig_offset; @@ -117,7 +114,6 @@ uzfs_write_data(zvol_state_t *zv, char *buf, uint64_t offset, uint64_t len, zv->rebuild_data.rebuild_bytes += len; count--; - } } else { VERIFY(is_rebuild == 0); } @@ -310,3 +306,52 @@ uzfs_zvol_get_rebuild_status(zvol_state_t *zv) { return (zv->zv_rebuild_status); } + +/* + * This assumes snapshot, so, no need of range lock + */ +int +uzfs_read_metadata(zvol_state_t *zv, char *buf, uint64_t offset, uint64_t len, + uint64_t *r) +{ + uint64_t blocksize = zv->zv_volmetablocksize; + uint64_t len_in_first_aligned_block, bytes, read = 0; + uint64_t end = offset + len; + uint64_t metaobjectsize = 
P2ALIGN(zv->zv_volsize, + zv->zv_metavolblocksize); + uint64_t r_offset = P2ALIGN(offset, blocksize); + int ret = 0; + + len_in_first_aligned_block = (blocksize - (offset - r_offset)); + + if (len_in_first_aligned_block > len) + len_in_first_aligned_block = len; + + while ((offset < end) && (offset < metaobjectsize)) { + if (len_in_first_aligned_block != 0) { + bytes = len_in_first_aligned_block; + len_in_first_aligned_block = 0; + } else { + bytes = (len < blocksize) ? len : blocksize; + } + + if (bytes > (metaobjectsize - offset)) + bytes = metaobjectsize - offset; + + ret = dmu_read(zv->zv_objset, ZVOL_META_OBJ, offset, bytes, + buf + read, 0); + if (ret) { + ret = UZFS_IO_READ_FAIL; + break; + } + + offset += bytes; + read += bytes; + len -= bytes; + } + + if (r) + *r = read; + + return (ret); +} diff --git a/lib/libzpool/uzfs_mgmt.c b/lib/libzpool/uzfs_mgmt.c index d30164b9f4a3..5fa7444a99ef 100644 --- a/lib/libzpool/uzfs_mgmt.c +++ b/lib/libzpool/uzfs_mgmt.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -290,8 +289,6 @@ uzfs_open_dataset_init(const char *ds_name, zvol_state_t **z) zv->zv_spa = spa; zfs_rlock_init(&zv->zv_range_lock); zfs_rlock_init(&zv->zv_mrange_lock); - mutex_init(&zv->rebuild_data.io_tree_mtx, NULL, MUTEX_DEFAULT, NULL); - uzfs_create_txg_diff_tree((void **)&zv->rebuild_data.incoming_io_tree); strlcpy(zv->zv_name, ds_name, MAXNAMELEN); @@ -454,8 +451,6 @@ uzfs_close_dataset(zvol_state_t *zv) zil_close(zv->zv_zilog); dnode_rele(zv->zv_dn, zv); dmu_objset_disown(zv->zv_objset, zv); - mutex_destroy(&zv->rebuild_data.io_tree_mtx); - uzfs_destroy_txg_diff_tree(zv->rebuild_data.incoming_io_tree); zfs_rlock_destroy(&zv->zv_range_lock); zfs_rlock_destroy(&zv->zv_mrange_lock); spa_close(zv->zv_spa, zv); diff --git a/lib/libzpool/uzfs_mtree.c b/lib/libzpool/uzfs_mtree.c deleted file mode 100644 index 26783081bc09..000000000000 --- a/lib/libzpool/uzfs_mtree.c +++ /dev/null @@ -1,478 +0,0 @@ -/* - * CDDL HEADER 
START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ - -#include -#include -#include -#include -#include -#include -#include - -#define TXG_DIFF_SNAPNAME "tsnap" - -/* - * TXG_SNAPLEN = snapname length + dataset name len + - * max hrtime_t print size - */ -#define TXG_SNAPLEN sizeof (TXG_DIFF_SNAPNAME) + \ - ZFS_MAX_DATASET_NAME_LEN + 21 - -#define ADD_TO_IO_CHUNK_LIST(list, e_offset, e_len, node, count) \ - do { \ - node = umem_alloc(sizeof (*node), UMEM_NOFAIL); \ - node->offset = e_offset; \ - node->len = e_len; \ - list_insert_tail(list, node); \ - count++; \ - } while (0) - -typedef struct uzfs_txg_diff_cb_args { - uzfs_txg_diff_traverse_cb_t *func; - uint64_t start_txg; - uint64_t end_txg; - void *arg_data; -} uzfs_txg_diff_cb_args_t; - -/* - * Add entry with (offset, len) to tree. - * Merge new entry with an existing entry if new entry overlaps with - * existing entry. 
- */ -void -add_to_txg_diff_tree(avl_tree_t *tree, uint64_t boffset, uint64_t blen) -{ - uint64_t new_offset, new_len, b_end, a_end; - uzfs_zvol_blk_phy_t *entry, *new_node, *b_entry, *a_entry; - uzfs_zvol_blk_phy_t tofind; - avl_index_t where; - - new_offset = boffset; - new_len = blen; - -find: - tofind.offset = new_offset; - tofind.len = new_len; - entry = avl_find(tree, &tofind, &where); - - /* - * new_offset is available in tree. - * If entry->len is greater than or equal to new_len then skip adding - * a new_entry else remove entry and search again for new entry. - */ - if (entry != NULL) { - if (entry->len >= new_len) { - return; - } else { - avl_remove(tree, entry); - umem_free(entry, sizeof (*entry)); - goto find; - } - } - - // search for nearest entry whose offset is lesser than new_offset - b_entry = avl_nearest(tree, where, AVL_BEFORE); - if (b_entry) { - b_end = (b_entry->offset + b_entry->len); - - /* - * If new entry doesn't overlap with new_entry then search - * for after and entry whose offset is greater than - * new_entry's offset - */ - if (b_end < new_offset) - goto after; - - /* - * If new_entry's offset and b_entry's end are same, then - * remove b_entry and add new entry whose offset = - * (b_entry's offset) and length = (b_entry's len + - * new entry's len). - */ - if (b_end == new_offset) { - new_len += (b_entry->len); - new_offset = b_entry->offset; - avl_remove(tree, b_entry); - umem_free(b_entry, sizeof (*b_entry)); - goto find; - } - - /* - * If new_entry overlaps with b_entry, then remove b_entry and - * add new entry whose offset = (b_entry's offset) and len = - * ("b_entry's len" + "new_entry's len" - "overlap len"). 
- */ - if (b_end < (new_offset + new_len)) { - new_len += (new_offset - b_entry->offset); - new_offset = b_entry->offset; - avl_remove(tree, b_entry); - umem_free(b_entry, sizeof (*b_entry)); - goto find; - } - - // new_entry overlaps with b_entry completely - if (b_end >= (new_offset + new_len)) - return; - } - -after: - /* - * search for nearest entry whose offset is greater than new_offset - * Here, If we can not find any entry which overlaps with new_entry then - * we will add new_entry to tree else merge new_entry with nearest - * entry. - */ - a_entry = avl_nearest(tree, where, AVL_AFTER); - - if (a_entry) { - a_end = (a_entry->offset + a_entry->len); - - // new_entry doesn't overlap with a_entry - if ((new_offset + new_len) < a_entry->offset) - goto doadd; - - // new_entry's end and a_entry's offset are same - if ((new_offset + new_len) == a_entry->offset) { - new_len += a_entry->len; - avl_remove(tree, a_entry); - umem_free(a_entry, sizeof (*a_entry)); - goto find; - } - - /* - * new_entry overlaps with a_entry and new_entry's end is - * lesser or equal to a_entry's end - */ - if ((new_offset + new_len) <= (a_end)) { - new_len = (a_entry->len) + - (a_entry->offset - new_offset); - avl_remove(tree, a_entry); - umem_free(a_entry, sizeof (*a_entry)); - goto find; - } - - /* - * new_entry overlaps with a_entry and new_entry's end is - * greater than a_entry's end - */ - if ((new_offset + new_len) > (a_end)) { - avl_remove(tree, a_entry); - umem_free(a_entry, sizeof (*a_entry)); - goto find; - } - } - -doadd: - new_node = umem_alloc(sizeof (uzfs_zvol_blk_phy_t), UMEM_NOFAIL); - new_node->offset = new_offset; - new_node->len = new_len; - avl_insert(tree, new_node, where); -} - -void -dump_txg_diff_tree(avl_tree_t *tree) -{ - uzfs_zvol_blk_phy_t *blk; - - for (blk = avl_first(tree); blk; blk = AVL_NEXT(tree, blk)) { - printf("offset:%lu, length:%lu\n", blk->offset, blk->len); - } -} - -void -dump_incoming_io_tree(zvol_state_t *zv) -{ - 
mutex_enter(&zv->rebuild_data.io_tree_mtx); - dump_txg_diff_tree(zv->rebuild_data.incoming_io_tree); - mutex_exit(&zv->rebuild_data.io_tree_mtx); -} - -int -uzfs_txg_diff_cb(spa_t *spa, zilog_t *zillog, const blkptr_t *bp, - const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) -{ - uint64_t blksz; - uzfs_txg_diff_cb_args_t *diff_blk_info = (uzfs_txg_diff_cb_args_t *)arg; - - if ((bp == NULL) || (BP_IS_HOLE(bp)) || (zb->zb_object != ZVOL_OBJ) || - (zb->zb_level != 0)) - return (0); - - if (bp->blk_birth > diff_blk_info->end_txg || - bp->blk_birth < diff_blk_info->start_txg) - return (0); - - blksz = BP_GET_LSIZE(bp); - - return diff_blk_info->func(zb->zb_blkid * blksz, blksz, zb->zb_blkid, - diff_blk_info->arg_data); -} - -int -uzfs_txg_diff_tree_compare(const void *arg1, const void *arg2) -{ - uzfs_zvol_blk_phy_t *node1 = (uzfs_zvol_blk_phy_t *)arg1; - uzfs_zvol_blk_phy_t *node2 = (uzfs_zvol_blk_phy_t *)arg2; - - return (AVL_CMP(node1->offset, node2->offset)); -} - -int -uzfs_get_txg_diff(zvol_state_t *zv, uint64_t start_txg, uint64_t end_txg, - uzfs_txg_diff_traverse_cb_t *func, void *arg) -{ - int error; - char snapname[TXG_SNAPLEN]; - uzfs_txg_diff_cb_args_t diff_blk; - hrtime_t now; - dsl_pool_t *dp; - dsl_dataset_t *ds_snap; - - now = gethrtime(); - snprintf(snapname, sizeof (snapname), "%s%llu", TXG_DIFF_SNAPNAME, now); - - error = dmu_objset_snapshot_one(zv->zv_name, snapname); - if (error) { - printf("failed to create snapshot for %s\n", zv->zv_name); - return (error); - } - - memset(snapname, 0, sizeof (snapname)); - snprintf(snapname, sizeof (snapname), "%s@%s%llu", zv->zv_name, - TXG_DIFF_SNAPNAME, now); - - error = dsl_pool_hold(snapname, FTAG, &dp); - if (error != 0) - return (error); - - error = dsl_dataset_hold(dp, snapname, FTAG, &ds_snap); - if (error != 0) { - dsl_pool_rele(dp, FTAG); - return (error); - } - - dsl_dataset_long_hold(ds_snap, FTAG); - - memset(&diff_blk, 0, sizeof (diff_blk)); - - diff_blk.func = func; - 
diff_blk.start_txg = start_txg; - diff_blk.end_txg = end_txg; - diff_blk.arg_data = arg; - - error = traverse_dataset(ds_snap, start_txg, - TRAVERSE_PRE, uzfs_txg_diff_cb, &diff_blk); - - dsl_dataset_long_rele(ds_snap, FTAG); - dsl_dataset_rele(ds_snap, FTAG); - dsl_pool_rele(dp, FTAG); - - /* - * TODO: if we failed to destroy snapshot here then - * this should be handled separately from application. - */ - (void) dsl_destroy_snapshot(snapname, B_FALSE); - return (error); -} - -void -uzfs_create_txg_diff_tree(void **tree) -{ - avl_tree_t *temp_tree; - - temp_tree = umem_alloc(sizeof (avl_tree_t), UMEM_NOFAIL); - avl_create(temp_tree, uzfs_txg_diff_tree_compare, - sizeof (uzfs_zvol_blk_phy_t), - offsetof(uzfs_zvol_blk_phy_t, uzb_link)); - *tree = temp_tree; -} - -void -uzfs_destroy_txg_diff_tree(void *tree) -{ - avl_tree_t *temp_tree = tree; - uzfs_zvol_blk_phy_t *node; - void *cookie = NULL; - - while ((node = avl_destroy_nodes(temp_tree, &cookie)) != NULL) { - umem_free(node, sizeof (*node)); - } - - avl_destroy(temp_tree); - umem_free(temp_tree, sizeof (*temp_tree)); -} - -void -uzfs_add_to_incoming_io_tree(zvol_state_t *zv, uint64_t offset, uint64_t len) -{ - /* - * Here: Handling of incoming_io_tree creation is for error case only. 
- * It should be handled by replica or caller of uzfs_write_data - */ - mutex_enter(&zv->rebuild_data.io_tree_mtx); - - if (!zv->rebuild_data.incoming_io_tree) - uzfs_create_txg_diff_tree((void **) - &zv->rebuild_data.incoming_io_tree); - - add_to_txg_diff_tree(zv->rebuild_data.incoming_io_tree, offset, len); - mutex_exit(&zv->rebuild_data.io_tree_mtx); -} - -/* - * API to search non-overlapping segment for rebuilding io - * It will create linked list with non-overlapping segment - * entries (i.e offset and length) - */ -uint32_t -uzfs_search_incoming_io_tree(zvol_state_t *zv, uint64_t offset, uint64_t len, - list_t **list) -{ - avl_tree_t *tree = zv->rebuild_data.incoming_io_tree; - uint32_t count = 0; - uzfs_zvol_blk_phy_t *b_entry, *a_entry, *entry; - avl_index_t where; - uzfs_zvol_blk_phy_t tofind; - uint64_t a_end, b_end; - list_t *chunk_list; - uzfs_io_chunk_list_t *node; - - if (!tree) - return (0); - - chunk_list = umem_alloc(sizeof (*chunk_list), UMEM_NOFAIL); - list_create(chunk_list, sizeof (uzfs_io_chunk_list_t), - offsetof(uzfs_io_chunk_list_t, link)); - - mutex_enter(&zv->rebuild_data.io_tree_mtx); - -again: - tofind.offset = offset; - tofind.len = len; - - // Check for exact match - entry = avl_find(tree, &tofind, &where); - if (entry) { - /* - * Here, added entry length is greater or equals to rebuild - * io len - */ - if (entry->len >= len) - goto done; - - /* - * Here added entry length is smaller than rebuild io len - * so make offset to added offset + length and length to - * len - rebuild io len - */ - if (entry->len < len) { - offset = entry->offset + entry->len; - len = len - entry->len; - goto again; - } - } - - /* - * Check for entry whose offset is lesser than to_find.offset - * (or search_entry's offset) - */ - b_entry = avl_nearest(tree, where, AVL_BEFORE); - if (b_entry) { - b_end = b_entry->offset + b_entry->len; - a_end = offset + len; - - // If b_entry is not overlapping with search_entry - // like (b_entry->offset, b_end, offset, 
a_end) - if (b_end <= offset) - goto after; - - // If b_entry ends after search_entry - // like (b_entry->offset, offset, a_end, b_end) - if (a_end <= b_end) { - goto done; - } - - /* - * If search_entry ends before b_entry then change - * search_entry's offset to before_entry's end and - * length to length - (overlapping length) - * like.. (b_entry->offset, offset, b_end, a_end) - */ - len = len - (b_end - offset); - offset = b_end; - goto again; - } - -after: - a_entry = avl_nearest(tree, where, AVL_AFTER); - if (a_entry) { - a_end = a_entry->offset + a_entry->len; - b_end = offset + len; - - /* - * if search_entry ends <= a_entry's offset then add - * search_entry to io_chunk_list - * like (offset, b_end, a_entry->offset, a_end) - */ - if (b_end <= a_entry->offset) { - ADD_TO_IO_CHUNK_LIST(chunk_list, offset, len, node, - count); - goto done; - } - - /* - * If search_entry end <= a_entry end, then change - * search_entry's length to ( len - overlapping length) and - * add it to io_chunk_list - * like (offset, a_entry->offset, b_end, a_end) - */ - if (b_end <= a_end) { - ADD_TO_IO_CHUNK_LIST(chunk_list, offset, - len - (b_end - a_entry->offset), node, count); - goto done; - } - - /* - * if search_entry end > a_entry end, then divide search entry - * in three parts. like, (offset, a_entry->offset, a_end, b_end) - * for example, - * search_entry = offset : 400, len : 100 and - * a_entry = offset : 450, len : 20 - * then, divide search entry to , A (off:400, len:50), - * B (off:450, len:20) and C (offset:470, len:30). 
- * A entry : add this entry to chunk list - * B entry : ignore this entry, as it overlaps with - * search_entry - * C entry : search aggain for overlapping whith this entry - */ - ADD_TO_IO_CHUNK_LIST(chunk_list, offset, - a_entry->offset - offset, node, count); - len = b_end - a_end; - offset = a_end; - goto again; - } - - ADD_TO_IO_CHUNK_LIST(chunk_list, offset, len, node, count); -done: - mutex_exit(&zv->rebuild_data.io_tree_mtx); - *list = chunk_list; - return (count); -} diff --git a/lib/libzpool/uzfs_rebuilding.c b/lib/libzpool/uzfs_rebuilding.c new file mode 100644 index 000000000000..1fcb83732b9e --- /dev/null +++ b/lib/libzpool/uzfs_rebuilding.c @@ -0,0 +1,260 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define IO_DIFF_SNAPNAME ".io_snap" + +#define ADD_TO_IO_CHUNK_LIST(list, e_offset, e_len, count) \ + do { \ + uzfs_io_chunk_list_t *node; \ + node = umem_alloc(sizeof (*node), UMEM_NOFAIL); \ + node->offset = e_offset; \ + node->len = e_len; \ + list_insert_tail(list, node); \ + count++; \ + } while (0) + +int +compare_blk_metadata(blk_metadata_t *first, blk_metadata_t *second) +{ + if (first->io_num < second->io_num) + return (-1); + if (first->io_num == second->io_num) + return (0); + return (1); +} + +boolean_t +iszero(blk_metadata_t *md) +{ + if (md->io_num == 0) + return (B_TRUE); + return (B_FALSE); +} + +#define EXECUTE_DIFF_CALLBACK(last_lun_offset, diff_count, buf, \ + last_index, arg, last_md, zv, func) \ + do { \ + func(last_lun_offset, diff_count * \ + zv->zv_metavolblocksize, (blk_metadata_t *) \ + (buf + last_index), arg); \ + diff_count = 0; \ + last_index = 0; \ + last_md = NULL; \ + diff_count = 0; \ + } while (0) + +int +uzfs_get_io_diff(zvol_state_t *zv, blk_metadata_t *low, + uzfs_get_io_diff_cb_t *func, void *arg) +{ + uint64_t blocksize = zv->zv_volmetablocksize; + uint64_t metadata_read_chunk_size = 10 * blocksize; + uint64_t metaobjectsize = (zv->zv_volsize / zv->zv_metavolblocksize) * + zv->zv_volmetadatasize; + uint64_t metadatasize = zv->zv_volmetadatasize; + char *buf; + uint64_t lun_offset, len, i, read, offset; + int ret = 0; + char *snap_name, *dataset; + hrtime_t now; + dsl_pool_t *dp; + dsl_dataset_t *ds_snap; + int diff_count = 0, last_index = 0; + uint64_t last_lun_offset = 0; + blk_metadata_t *last_md; + + if (!func) + return (EINVAL); + + now = gethrtime(); + + snap_name = kmem_asprintf("%s%llu", 
IO_DIFF_SNAPNAME, now); + + ret = dmu_objset_snapshot_one(zv->zv_name, snap_name); + if (ret) { + printf("failed to create snapshot for %s\n", zv->zv_name); + strfree(snap_name); + return (ret); + } + + strfree(snap_name); + + dataset = kmem_asprintf("%s@%s%llu", zv->zv_name, + IO_DIFF_SNAPNAME, now); + + ret = dsl_pool_hold(dataset, FTAG, &dp); + if (ret) { + (void) dsl_destroy_snapshot(dataset, B_FALSE); + strfree(dataset); + return (ret); + } + + ret = dsl_dataset_hold(dp, dataset, FTAG, &ds_snap); + if (ret) { + (void) dsl_destroy_snapshot(dataset, B_FALSE); + dsl_pool_rele(dp, FTAG); + strfree(dataset); + return (ret); + } + + dsl_dataset_long_hold(ds_snap, FTAG); + + metadata_read_chunk_size = (metadata_read_chunk_size / metadatasize) * + metadatasize; + buf = umem_alloc(metadata_read_chunk_size, KM_SLEEP); + len = metadata_read_chunk_size; + + for (offset = 0; offset < metaobjectsize; offset += len) { + read = 0; + len = metadata_read_chunk_size; + + if ((offset + len) > metaobjectsize) + len = (metaobjectsize - offset); + + ret = uzfs_read_metadata(zv, buf, offset, len, &read); + + if (read != len || ret) + break; + + lun_offset = (offset / metadatasize) * zv->zv_metavolblocksize; + for (i = 0; i < len; i += sizeof (blk_metadata_t)) { + if (!iszero((blk_metadata_t *)(buf+i)) && + (compare_blk_metadata((blk_metadata_t *)(buf + i), + low) > 0)) { + if (diff_count == 0) { + last_lun_offset = lun_offset; + last_md = (blk_metadata_t *)(buf+i); + last_index = i; + } + diff_count++; + if (last_md != NULL && + compare_blk_metadata((blk_metadata_t *) + (buf + i), last_md) != 0) { + EXECUTE_DIFF_CALLBACK(last_lun_offset, + diff_count, buf, last_index, arg, + last_md, zv, func); + } + } else if (diff_count) { + EXECUTE_DIFF_CALLBACK(last_lun_offset, + diff_count, buf, last_index, arg, last_md, + zv, func); + } + + lun_offset += zv->zv_metavolblocksize; + } + + if (diff_count) { + EXECUTE_DIFF_CALLBACK(last_lun_offset, diff_count, buf, + last_index, arg, last_md, zv, 
func); + } + } + + dsl_dataset_long_rele(ds_snap, FTAG); + dsl_dataset_rele(ds_snap, FTAG); + dsl_pool_rele(dp, FTAG); + + /* + * TODO: if we failed to destroy snapshot here then + * this should be handled separately from application. + */ + (void) dsl_destroy_snapshot(dataset, B_FALSE); + umem_free(buf, metadata_read_chunk_size); + strfree(dataset); + return (ret); +} + +int +uzfs_search_nonoverlapping_io(zvol_state_t *zv, uint64_t offset, uint64_t len, + blk_metadata_t *metadata, void **list) +{ + char *rd_metadata_buf; + uint64_t rd_rlen; + metaobj_blk_offset_t rd_metablk; + blk_metadata_t *rd_metadata; + int diff_count = 0; + int count = 0; + int ret = 0; + int i = 0; + uint64_t lun_offset = 0, last_lun_offset = 0; + list_t *chunk_list = NULL; + uint64_t metavolblocksize = zv->zv_metavolblocksize; + uint64_t metadatasize = zv->zv_volmetadatasize; + + get_zv_metaobj_block_details(&rd_metablk, zv, offset, len); + rd_metadata_buf = umem_alloc(rd_metablk.m_len, UMEM_NOFAIL); + + ret = uzfs_read_metadata(zv, rd_metadata_buf, rd_metablk.m_offset, + rd_metablk.m_len, &rd_rlen); + if (ret || rd_rlen != rd_metablk.m_len) { + printf("failed to read metadata\n"); + goto exit; + } + + chunk_list = umem_alloc(sizeof (*chunk_list), UMEM_NOFAIL); + list_create(chunk_list, sizeof (uzfs_io_chunk_list_t), + offsetof(uzfs_io_chunk_list_t, link)); + + for (i = 0; i < rd_metablk.m_len; i += sizeof (blk_metadata_t)) { + rd_metadata = (blk_metadata_t *)(rd_metadata_buf + i); + lun_offset = ((rd_metablk.m_offset + i) * metavolblocksize) / + metadatasize; + ret = compare_blk_metadata(rd_metadata, metadata); + if (ret == -1) { + // old io number < new io number + if (diff_count == 0) { + last_lun_offset = lun_offset; + } + diff_count++; + } else if (!ret) { + // old io number == new io number + if (diff_count == 0) { + last_lun_offset = lun_offset; + } + diff_count++; + } else { + // old io number > new io number + if (diff_count != 0) { + ADD_TO_IO_CHUNK_LIST(chunk_list, + 
last_lun_offset, diff_count * + metavolblocksize, count); + diff_count = 0; + } + } + } + + if (diff_count != 0) + ADD_TO_IO_CHUNK_LIST(chunk_list, last_lun_offset, + diff_count * metavolblocksize, count); + +exit: + umem_free(rd_metadata_buf, rd_metablk.m_len); + *list = chunk_list; + return (count); +} diff --git a/lib/libzpool/zrepl_mgmt.c b/lib/libzpool/zrepl_mgmt.c index dd1a357c6a7f..879ed0df10cc 100644 --- a/lib/libzpool/zrepl_mgmt.c +++ b/lib/libzpool/zrepl_mgmt.c @@ -251,14 +251,14 @@ uzfs_zinfo_free(zvol_info_t *zinfo) } void -uzfs_zvol_get_last_committed_io_no(zvol_info_t *zinfo, uint64_t *io_seq) +uzfs_zvol_get_last_committed_io_no(zvol_state_t *zv, uint64_t *io_seq) { uzfs_zap_kv_t zap; zap.key = "io_seq"; zap.value = 0; zap.size = sizeof (*io_seq); - uzfs_read_zap_entry(zinfo->zv, &zap); + uzfs_read_zap_entry(zv, &zap); *io_seq = zap.value; } diff --git a/tests/cbtest/script/test_uzfs.sh b/tests/cbtest/script/test_uzfs.sh index bcdd8c00e5a3..dedf975e3b12 100755 --- a/tests/cbtest/script/test_uzfs.sh +++ b/tests/cbtest/script/test_uzfs.sh @@ -665,7 +665,7 @@ run_zrepl_uzfs_test() log_must $ZFS set sync=$3 $UZFS_TEST_POOL/$UZFS_TEST_VOL log_must_not $UZFS_TEST - log_must $UZFS_TEST -T 6 + log_must $UZFS_TEST -T 4 sleep 5 log_must kill -SIGKILL $TGT_PID2 @@ -686,15 +686,15 @@ run_uzfs_test() { log_must_not $UZFS_TEST - log_must $UZFS_TEST -t 30 -c -v $UZFS_TEST_VOLSIZE_IN_NUM -a $UZFS_TEST_VOLSIZE_IN_NUM -T 8 - #log_must $UZFS_TEST -t 30 -c -v $UZFS_TEST_VOLSIZE_IN_NUM -a $UZFS_TEST_VOLSIZE_IN_NUM -s -T 8 - log_must $UZFS_TEST -t 30 -c -v $UZFS_TEST_VOLSIZE_IN_NUM -a $UZFS_TEST_VOLSIZE_IN_NUM -l -T 8 - #log_must $UZFS_TEST -t 30 -c -v $UZFS_TEST_VOLSIZE_IN_NUM -a $UZFS_TEST_VOLSIZE_IN_NUM -s -l -T 8 + log_must $UZFS_TEST -t 30 -c -v $UZFS_TEST_VOLSIZE_IN_NUM -a $UZFS_TEST_VOLSIZE_IN_NUM -T 6 + #log_must $UZFS_TEST -t 30 -c -v $UZFS_TEST_VOLSIZE_IN_NUM -a $UZFS_TEST_VOLSIZE_IN_NUM -s -T 6 + log_must $UZFS_TEST -t 30 -c -v $UZFS_TEST_VOLSIZE_IN_NUM -a 
$UZFS_TEST_VOLSIZE_IN_NUM -l -T 6 + #log_must $UZFS_TEST -t 30 -c -v $UZFS_TEST_VOLSIZE_IN_NUM -a $UZFS_TEST_VOLSIZE_IN_NUM -s -l -T 6 - log_must $UZFS_TEST -t 30 -c -v $UZFS_TEST_VOLSIZE_IN_NUM -a $UZFS_TEST_VOLSIZE_IN_NUM -i 8192 -b 65536 -T 8 - #log_must $UZFS_TEST -t 30 -c -v $UZFS_TEST_VOLSIZE_IN_NUM -a $UZFS_TEST_VOLSIZE_IN_NUM -s -i 8192 -b 65536 -T 8 - log_must $UZFS_TEST -t 30 -c -v $UZFS_TEST_VOLSIZE_IN_NUM -a $UZFS_TEST_VOLSIZE_IN_NUM -l -i 8192 -b 65536 -T 8 - #log_must $UZFS_TEST -t 30 -c -v $UZFS_TEST_VOLSIZE_IN_NUM -a $UZFS_TEST_VOLSIZE_IN_NUM -s -l -i 8192 -b 65536 -T 8 + log_must $UZFS_TEST -t 30 -c -v $UZFS_TEST_VOLSIZE_IN_NUM -a $UZFS_TEST_VOLSIZE_IN_NUM -i 8192 -b 65536 -T 6 + #log_must $UZFS_TEST -t 30 -c -v $UZFS_TEST_VOLSIZE_IN_NUM -a $UZFS_TEST_VOLSIZE_IN_NUM -s -i 8192 -b 65536 -T 6 + log_must $UZFS_TEST -t 30 -c -v $UZFS_TEST_VOLSIZE_IN_NUM -a $UZFS_TEST_VOLSIZE_IN_NUM -l -i 8192 -b 65536 -T 6 + #log_must $UZFS_TEST -t 30 -c -v $UZFS_TEST_VOLSIZE_IN_NUM -a $UZFS_TEST_VOLSIZE_IN_NUM -s -l -i 8192 -b 65536 -T 6 log_must truncate -s 2G "$TMPDIR/uztest.1a" log_must truncate -s 2G "$TMPDIR/uztest.log" @@ -754,17 +754,6 @@ run_uzfs_test() log_must setup_uzfs_test log 65536 standard log_must $UZFS_TEST -t 30 -v $UZFS_TEST_VOLSIZE_IN_NUM -a $UZFS_TEST_VOLSIZE_IN_NUM -l -i 8192 -b 65536 -T 2 - log_must $UZFS_TEST -T 7 - - K=1024 - M=$(( 1024 * 1024 )) - G=$(( 1024 * 1024 * 1024 )) - - log_must $UZFS_TEST -t 10 -a $(( 50 * 1024 * 1024 )) -T 3 -n 10000 - log_must $UZFS_TEST -t 10 -a $(( 100 * 1024 * 1024 )) -T 3 -n 10000 - log_must $UZFS_TEST -t 10 -a $(( 1000 * 1024 * 1024 )) -T 3 -n 10000 - log_must $UZFS_TEST -t 10 -T 4 - log_must $UZFS_TEST -t 10 -T 0 -n 10 log_must . 
$UZFS_TEST_SYNC_SH @@ -800,7 +789,6 @@ test_type : - pool_test (verify pool create/destroy functionality) - zvol_test (zvol sync test, read/write and replay tests) - rebuild_test (zvol rebuild related tests) - - txg_diff_test (txg diff API test) - fio_test - zrepl_test EOF @@ -861,21 +849,9 @@ run_zvol_test() run_rebuild_test() { - log_must $UZFS_TEST -T 5 -t 60 -n 3 - log_must $UZFS_TEST -T 5 -t 120 -n 3 - -} - -run_txg_diff_test() -{ - K=1024 - M=$(( 1024 * 1024 )) - G=$(( 1024 * 1024 * 1024 )) + log_must $UZFS_TEST -T 3 -t 60 -n 3 + log_must $UZFS_TEST -T 3 -t 120 -n 3 - log_must $UZFS_TEST -a $(( 100 * $M )) -T 3 -n 10000 - log_must $UZFS_TEST -a $(( 1 * $G)) -T 3 -n 10000 - - log_must $UZFS_TEST -T 4 } test_func="run_${test_type}" From fe582490d94854c881e3d9ba68291616ec22b744 Mon Sep 17 00:00:00 2001 From: mayank Date: Wed, 28 Mar 2018 03:58:19 +0530 Subject: [PATCH 2/4] - Changes in uzfs_get_io_diff to get metadata based on offset and length Snapshot created with provided metadata will be removed if (offset + length) == (end of metadata) - Replaced dsl_pool_hold with dmu_object_own to hold snapshot in uzfs_get_io_diff Since, it was causing deadlock - Functionality test for partial read from uzfs_get_io_diff Signed-off-by: mayank --- cmd/uzfs_test/uzfs_test_rebuilding.c | 59 ++++++++++-- include/sys/uzfs_zvol.h | 2 +- include/uzfs_rebuilding.h | 9 +- lib/libzpool/uzfs_io.c | 6 +- lib/libzpool/uzfs_rebuilding.c | 138 +++++++++++++++++---------- 5 files changed, 150 insertions(+), 64 deletions(-) diff --git a/cmd/uzfs_test/uzfs_test_rebuilding.c b/cmd/uzfs_test/uzfs_test_rebuilding.c index 7c183bbcae94..892ae47c6bbc 100644 --- a/cmd/uzfs_test/uzfs_test_rebuilding.c +++ b/cmd/uzfs_test/uzfs_test_rebuilding.c @@ -181,7 +181,7 @@ replica_reader_thread(void *arg) static int uzfs_test_meta_diff_traverse_cb(off_t offset, size_t len, - blk_metadata_t *md, void *arg) + blk_metadata_t *md, objset_t *snap_obj, void *arg) { uzfs_rebuild_data_t *r_data = 
(uzfs_rebuild_data_t *)arg; uzfs_io_chunk_list_t *io; @@ -193,8 +193,8 @@ uzfs_test_meta_diff_traverse_cb(off_t offset, size_t len, io->io_number = md->io_num; io->buf = umem_alloc(len, UMEM_NOFAIL); - err = uzfs_read_data(r_data->zvol, io->buf, offset, len, - NULL, NULL); + err = dmu_read(snap_obj, ZVOL_OBJ, offset, len, + io->buf, 0); if (err) { umem_free(io, sizeof (*io)); @@ -210,25 +210,70 @@ uzfs_test_meta_diff_traverse_cb(off_t offset, size_t len, return (err); } +void +check_snapshot(zvol_state_t *zv, blk_metadata_t *md, boolean_t err) +{ + objset_t *s_obj; + char *dataset; + int ret = 0; + + dataset = kmem_asprintf("%s@%s%lu", zv->zv_name, + IO_DIFF_SNAPNAME, md->io_num); + + ret = dmu_objset_own(dataset, DMU_OST_ANY, B_TRUE, zv, &s_obj); + if ((ret != 0 && err) || + (!err && ret == 0)) { + printf("ret:%d\n", ret); + printf("snapshot %s %s\n", dataset, + (err) ? "should not be removed" : "should be removed"); + exit(1); + } + + if (ret == 0) + dmu_objset_disown(s_obj, zv); +} + void fetch_modified_data(void *arg) { struct rebuilding_data *repl_data = arg; uzfs_rebuild_data_t *r_data = repl_data->r_data; int err; - blk_metadata_t io_number; + blk_metadata_t md; + off_t offset, end; + size_t len; + int max_count = 4; printf("fetching modified data\n"); - io_number.io_num = repl_data->base_io; + md.io_num = repl_data->base_io; + + len = r_data->zvol->zv_volsize / max_count; - err = uzfs_get_io_diff(repl_data->zvol, &io_number, - uzfs_test_meta_diff_traverse_cb, r_data); + for (offset = 0; offset < r_data->zvol->zv_volsize; ) { + end = offset + len; + if (end > r_data->zvol->zv_volsize) + len = r_data->zvol->zv_volsize - offset; + + err = uzfs_get_io_diff(repl_data->zvol, &md, + uzfs_test_meta_diff_traverse_cb, offset, len, + r_data); + if (err) + break; + + offset += len; + if (offset != r_data->zvol->zv_volsize) + check_snapshot(repl_data->zvol, &md, B_TRUE); + else + break; + } if (err) { printf("error(%d)... 
while fetching modified data\n", err); exit(1); } + check_snapshot(repl_data->zvol, &md, B_FALSE); + printf("finished fetching modified data\n"); mutex_enter(&r_data->mtx); diff --git a/include/sys/uzfs_zvol.h b/include/sys/uzfs_zvol.h index 9d761dd58772..7ba48645c9f6 100644 --- a/include/sys/uzfs_zvol.h +++ b/include/sys/uzfs_zvol.h @@ -153,6 +153,6 @@ typedef struct uzfs_io_chunk_list { } uzfs_io_chunk_list_t; typedef int (uzfs_get_io_diff_cb_t)(off_t offset, size_t len, - void *io_number, void *arg); + blk_metadata_t *metadata, objset_t *obj, void *arg); #endif #endif diff --git a/include/uzfs_rebuilding.h b/include/uzfs_rebuilding.h index a93ef4238415..e59cd99d439b 100644 --- a/include/uzfs_rebuilding.h +++ b/include/uzfs_rebuilding.h @@ -22,6 +22,8 @@ #ifndef _UZFS_REBUILDING_H #define _UZFS_REBUILDING_H +#define IO_DIFF_SNAPNAME ".io_snap" + /* * API to compare metadata * return : @@ -29,12 +31,13 @@ * 0 : if first == second * 1 : if first > second */ -int compare_blk_metadata(void *first_md, void *second_md); +int compare_blk_metadata(blk_metadata_t *first_md, blk_metadata_t *second_md); /* * API to access data whose metadata is higer than base_metadata */ -int uzfs_get_io_diff(void *zv, void *base_metadata, void *cb_func, void *arg); +int uzfs_get_io_diff(zvol_state_t *zv, blk_metadata_t *base_metadata, + uzfs_get_io_diff_cb_t *cb_func, off_t offset, size_t len, void *arg); /* * uzfs_search_nonoverlapping_io will check on_disk metadata with w_metadata and @@ -44,5 +47,5 @@ int uzfs_get_io_diff(void *zv, void *base_metadata, void *cb_func, void *arg); * added to list. 
*/ int uzfs_search_nonoverlapping_io(zvol_state_t *zv, uint64_t offset, - uint64_t len, void *w_metadata, void **list); + uint64_t len, blk_metadata_t *w_metadata, void **list); #endif diff --git a/lib/libzpool/uzfs_io.c b/lib/libzpool/uzfs_io.c index 1ebc017b2296..28395897f637 100644 --- a/lib/libzpool/uzfs_io.c +++ b/lib/libzpool/uzfs_io.c @@ -99,7 +99,8 @@ uzfs_write_data(zvol_state_t *zv, char *buf, uint64_t offset, uint64_t len, rl = zfs_range_lock(&zv->zv_range_lock, offset, len, RL_WRITER); - if ((zv->zv_status & ZVOL_REBUILDING_IN_PROGRESS) && + if (zv->zv_status == ZVOL_STATUS_DEGRADED && + zv->zv_rebuild_status == ZVOL_REBUILDING_IN_PROGRESS && is_rebuild) { count = uzfs_search_nonoverlapping_io(zv, offset, len, metadata, (void **)&chunk_io); @@ -162,7 +163,8 @@ uzfs_write_data(zvol_state_t *zv, char *buf, uint64_t offset, uint64_t len, } exit_with_error: - if ((zv->zv_rebuild_status & ZVOL_REBUILDING_IN_PROGRESS) && + if (zv->zv_status == ZVOL_STATUS_DEGRADED && + zv->zv_rebuild_status == ZVOL_REBUILDING_IN_PROGRESS && is_rebuild && count && !ret) goto chunk_io; diff --git a/lib/libzpool/uzfs_rebuilding.c b/lib/libzpool/uzfs_rebuilding.c index 1fcb83732b9e..d7d745d0794d 100644 --- a/lib/libzpool/uzfs_rebuilding.c +++ b/lib/libzpool/uzfs_rebuilding.c @@ -27,8 +27,7 @@ #include #include #include - -#define IO_DIFF_SNAPNAME ".io_snap" +#include #define ADD_TO_IO_CHUNK_LIST(list, e_offset, e_len, count) \ do { \ @@ -63,16 +62,74 @@ iszero(blk_metadata_t *md) do { \ func(last_lun_offset, diff_count * \ zv->zv_metavolblocksize, (blk_metadata_t *) \ - (buf + last_index), arg); \ + (buf + last_index), zv->zv_objset, arg); \ diff_count = 0; \ last_index = 0; \ last_md = NULL; \ diff_count = 0; \ } while (0) +int +get_metadata_snapshot_info(zvol_state_t *zv, blk_metadata_t *md, + zvol_state_t **snap_zv) +{ + char *snap_name, *dataset; + int ret = 0; + zvol_state_t *s_zv; + objset_t *s_obj; + + dataset = kmem_asprintf("%s@%s%lu", zv->zv_name, + IO_DIFF_SNAPNAME, 
md->io_num); + + ret = dmu_objset_own(dataset, DMU_OST_ANY, B_TRUE, snap_zv, &s_obj); + if (ret == ENOENT) { + snap_name = kmem_asprintf("%s%llu", IO_DIFF_SNAPNAME, + md->io_num); + + ret = dmu_objset_snapshot_one(zv->zv_name, snap_name); + if (ret) { + printf("Failed to create snapshot for %s\n", + zv->zv_name); + strfree(dataset); + strfree(snap_name); + return (ret); + } + + strfree(snap_name); + ret = dmu_objset_own(dataset, DMU_OST_ANY, B_TRUE, snap_zv, + &s_obj); + } + + if (ret != 0) { + strfree(dataset); + printf("Failed to own snapshot.. err(%d)\n", ret); + return (ret); + } + + s_zv = umem_alloc(sizeof (*s_zv), KM_SLEEP); + memcpy(s_zv, zv, sizeof (zvol_state_t)); + s_zv->zv_objset = s_obj; + *snap_zv = s_zv; + + strfree(dataset); + return (ret); +} + +void +destroy_metadata_snapshot(zvol_state_t *zv, blk_metadata_t *md) +{ + char *dataset; + + dataset = kmem_asprintf("%s@%s%lu", zv->zv_name, + IO_DIFF_SNAPNAME, md->io_num); + (void) dsl_destroy_snapshot(dataset, B_FALSE); + strfree(dataset); +} + int uzfs_get_io_diff(zvol_state_t *zv, blk_metadata_t *low, - uzfs_get_io_diff_cb_t *func, void *arg) + uzfs_get_io_diff_cb_t *func, off_t zvol_offset, size_t zvol_len, + void *arg) { uint64_t blocksize = zv->zv_volmetablocksize; uint64_t metadata_read_chunk_size = 10 * blocksize; @@ -80,65 +137,44 @@ uzfs_get_io_diff(zvol_state_t *zv, blk_metadata_t *low, zv->zv_volmetadatasize; uint64_t metadatasize = zv->zv_volmetadatasize; char *buf; - uint64_t lun_offset, len, i, read, offset; + uint64_t lun_offset, i, read; + uint64_t offset, len, end; int ret = 0; - char *snap_name, *dataset; - hrtime_t now; - dsl_pool_t *dp; - dsl_dataset_t *ds_snap; int diff_count = 0, last_index = 0; uint64_t last_lun_offset = 0; blk_metadata_t *last_md; + zvol_state_t *snap_zv; + metaobj_blk_offset_t snap_metablk; - if (!func) + if (!func || (zvol_offset + zvol_len) > zv->zv_volsize) return (EINVAL); - now = gethrtime(); - - snap_name = kmem_asprintf("%s%llu", IO_DIFF_SNAPNAME, now); 
+ get_zv_metaobj_block_details(&snap_metablk, zv, zvol_offset, zvol_len); + offset = snap_metablk.m_offset; + end = snap_metablk.m_offset + snap_metablk.m_len; + if (end > metaobjectsize) + end = metaobjectsize; - ret = dmu_objset_snapshot_one(zv->zv_name, snap_name); - if (ret) { - printf("failed to create snapshot for %s\n", zv->zv_name); - strfree(snap_name); + ret = get_metadata_snapshot_info(zv, low, &snap_zv); + if (ret != 0) { + printf("failed to get snapshot info for %s io_num:%lu\n", + zv->zv_name, low->io_num); return (ret); } - strfree(snap_name); - - dataset = kmem_asprintf("%s@%s%llu", zv->zv_name, - IO_DIFF_SNAPNAME, now); - - ret = dsl_pool_hold(dataset, FTAG, &dp); - if (ret) { - (void) dsl_destroy_snapshot(dataset, B_FALSE); - strfree(dataset); - return (ret); - } - - ret = dsl_dataset_hold(dp, dataset, FTAG, &ds_snap); - if (ret) { - (void) dsl_destroy_snapshot(dataset, B_FALSE); - dsl_pool_rele(dp, FTAG); - strfree(dataset); - return (ret); - } - - dsl_dataset_long_hold(ds_snap, FTAG); - metadata_read_chunk_size = (metadata_read_chunk_size / metadatasize) * metadatasize; buf = umem_alloc(metadata_read_chunk_size, KM_SLEEP); len = metadata_read_chunk_size; - for (offset = 0; offset < metaobjectsize; offset += len) { + for (; offset < end; offset += len) { read = 0; len = metadata_read_chunk_size; - if ((offset + len) > metaobjectsize) - len = (metaobjectsize - offset); + if ((offset + len) > end) + len = (end - offset); - ret = uzfs_read_metadata(zv, buf, offset, len, &read); + ret = uzfs_read_metadata(snap_zv, buf, offset, len, &read); if (read != len || ret) break; @@ -159,12 +195,12 @@ uzfs_get_io_diff(zvol_state_t *zv, blk_metadata_t *low, (buf + i), last_md) != 0) { EXECUTE_DIFF_CALLBACK(last_lun_offset, diff_count, buf, last_index, arg, - last_md, zv, func); + last_md, snap_zv, func); } } else if (diff_count) { EXECUTE_DIFF_CALLBACK(last_lun_offset, diff_count, buf, last_index, arg, last_md, - zv, func); + snap_zv, func); } lun_offset += 
zv->zv_metavolblocksize; @@ -172,21 +208,21 @@ uzfs_get_io_diff(zvol_state_t *zv, blk_metadata_t *low, if (diff_count) { EXECUTE_DIFF_CALLBACK(last_lun_offset, diff_count, buf, - last_index, arg, last_md, zv, func); + last_index, arg, last_md, snap_zv, func); } } - dsl_dataset_long_rele(ds_snap, FTAG); - dsl_dataset_rele(ds_snap, FTAG); - dsl_pool_rele(dp, FTAG); + dmu_objset_disown(snap_zv->zv_objset, &snap_zv); + umem_free(snap_zv, sizeof (*snap_zv)); /* * TODO: if we failed to destroy snapshot here then * this should be handled separately from application. */ - (void) dsl_destroy_snapshot(dataset, B_FALSE); + if (end == metaobjectsize) + destroy_metadata_snapshot(zv, low); + umem_free(buf, metadata_read_chunk_size); - strfree(dataset); return (ret); } From 4d15b29b775adc39d72c1f41ca265ebdd0374b1d Mon Sep 17 00:00:00 2001 From: mayank Date: Tue, 3 Apr 2018 12:48:31 +0530 Subject: [PATCH 3/4] - fix in uzfs_get_io_diff API - changes according to review comments - uzfs_open_dataset_init changed to uzfs_own_dataset for better readability Signed-off-by: mayank --- lib/libzpool/uzfs_io.c | 6 ++++++ lib/libzpool/uzfs_mgmt.c | 6 +++--- lib/libzpool/uzfs_rebuilding.c | 21 ++++++++++++++------- tests/cbtest/script/test_uzfs.sh | 4 ++-- 4 files changed, 25 insertions(+), 12 deletions(-) diff --git a/lib/libzpool/uzfs_io.c b/lib/libzpool/uzfs_io.c index 804704e6b036..18c9cc6ed827 100644 --- a/lib/libzpool/uzfs_io.c +++ b/lib/libzpool/uzfs_io.c @@ -76,6 +76,9 @@ uzfs_write_data(zvol_state_t *zv, char *buf, uint64_t offset, uint64_t len, uint64_t orig_offset = offset; char *mdata = NULL, *tmdata = NULL, *tmdataend = NULL; + VERIFY(IS_P2ALIGNED(offset, zv->zv_metavolblocksize) && + IS_P2ALIGNED(len, zv->zv_metavolblocksize)); + sync = (dmu_objset_syncprop(os) == ZFS_SYNC_ALWAYS) ? 
1 : 0; ASSERT3P(zv->zv_volmetablocksize, !=, 0); @@ -198,6 +201,9 @@ uzfs_read_data(zvol_state_t *zv, char *buf, uint64_t offset, uint64_t len, blk_metadata_t *metadata; int nmetas; + VERIFY(IS_P2ALIGNED(offset, zv->zv_metavolblocksize) && + IS_P2ALIGNED(len, zv->zv_metavolblocksize)); + ASSERT3P(zv->zv_volmetadatasize, ==, sizeof (blk_metadata_t)); /* init metadata in case caller wants to receive that info */ diff --git a/lib/libzpool/uzfs_mgmt.c b/lib/libzpool/uzfs_mgmt.c index 14b56a8825b1..ec087cfe01f1 100644 --- a/lib/libzpool/uzfs_mgmt.c +++ b/lib/libzpool/uzfs_mgmt.c @@ -283,7 +283,7 @@ uzfs_objset_create_cb(objset_t *new_os, void *arg, cred_t *cr, dmu_tx_t *tx) /* owns objset with name 'ds_name' in pool 'spa' */ static int -uzfs_open_dataset_init(const char *ds_name, zvol_state_t **z) +uzfs_own_dataset(const char *ds_name, zvol_state_t **z) { zvol_state_t *zv = NULL; int error = -1; @@ -386,7 +386,7 @@ uzfs_open_dataset(spa_t *spa, const char *ds_name, zvol_state_t **z) return (error); (void) snprintf(name, sizeof (name), "%s/%s", spa_name(spa), ds_name); - error = uzfs_open_dataset_init(name, z); + error = uzfs_own_dataset(name, z); return (error); } @@ -438,7 +438,7 @@ uzfs_zvol_create_cb(const char *ds_name, void *arg) printf("ds_name %s\n", ds_name); - error = uzfs_open_dataset_init(ds_name, &zv); + error = uzfs_own_dataset(ds_name, &zv); if (error) { printf("Failed to open dataset: %s\n", ds_name); return (error); diff --git a/lib/libzpool/uzfs_rebuilding.c b/lib/libzpool/uzfs_rebuilding.c index 3f80965dc2a0..e369a2dec7c3 100644 --- a/lib/libzpool/uzfs_rebuilding.c +++ b/lib/libzpool/uzfs_rebuilding.c @@ -67,7 +67,6 @@ iszero(blk_metadata_t *md) diff_count = 0; \ last_index = 0; \ last_md = NULL; \ - diff_count = 0; \ } while (0) int @@ -186,19 +185,27 @@ uzfs_get_io_diff(zvol_state_t *zv, blk_metadata_t *low, last_index = i; } - // Increase diff_count if on_disk io number is - // same as last one. 
- diff_count++; - - if (compare_blk_metadata((blk_metadata_t *) + if (diff_count && + compare_blk_metadata((blk_metadata_t *) (buf + i), last_md) != 0) { /* * Execute callback function with last - * compared metadata and diff_count + * metadata and diff_count if + * last compared metadata is changed */ EXECUTE_DIFF_CALLBACK(last_lun_offset, diff_count, buf, last_index, arg, last_md, snap_zv, func, ret); + last_lun_offset = lun_offset; + last_md = (blk_metadata_t *)(buf+i); + last_index = i; + diff_count++; + } else { + /* + * increment diff_count by 1 if + * metadata is same + */ + diff_count++; } } else if (diff_count) { EXECUTE_DIFF_CALLBACK(last_lun_offset, diff --git a/tests/cbtest/script/test_uzfs.sh b/tests/cbtest/script/test_uzfs.sh index 57cb803dc13a..42081e71b9cf 100755 --- a/tests/cbtest/script/test_uzfs.sh +++ b/tests/cbtest/script/test_uzfs.sh @@ -852,8 +852,8 @@ run_zvol_test() run_rebuild_test() { - log_must $UZFS_TEST -T 3 -t 60 -n 3 - log_must $UZFS_TEST -T 3 -t 120 -n 3 + log_must $UZFS_TEST -T 3 -t 60 -n 3 -a 419430400 + log_must $UZFS_TEST -T 3 -t 120 -n 3 -a 629145600 } From 38f3c287291348b72805de6b355de43dee2b341a Mon Sep 17 00:00:00 2001 From: mayank Date: Tue, 3 Apr 2018 14:23:41 +0530 Subject: [PATCH 4/4] - reducing number of tests for rebuilding - setting metavolblocksize to 512 bytes when creating volume through CLI Signed-off-by: mayank --- module/zfs/zvol.c | 8 +++++++- tests/cbtest/script/test_uzfs.sh | 4 ++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index ea48b1928ec5..77d065a60007 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -322,8 +322,14 @@ zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) ASSERT(error == 0); #if !defined(_KERNEL) + /* + * By default we are setting metavolblocksize to 512 bytes + * till we have CLI support for metavolblocksize property. 
+ */ + uint64_t metavolblocksize = 512; + VERIFY(uzfs_zvol_create_meta(os, volblocksize, volblocksize, - volblocksize, tx) == 0); + metavolblocksize, tx) == 0); #endif } diff --git a/tests/cbtest/script/test_uzfs.sh b/tests/cbtest/script/test_uzfs.sh index 42081e71b9cf..79149fbacf8b 100755 --- a/tests/cbtest/script/test_uzfs.sh +++ b/tests/cbtest/script/test_uzfs.sh @@ -852,8 +852,8 @@ run_zvol_test() run_rebuild_test() { - log_must $UZFS_TEST -T 3 -t 60 -n 3 -a 419430400 - log_must $UZFS_TEST -T 3 -t 120 -n 3 -a 629145600 + log_must $UZFS_TEST -T 3 -t 60 -n 2 -a 419430400 + log_must $UZFS_TEST -T 3 -t 60 -n 2 -a 629145600 }