From 95784ca9fae140cbd8ae5696accc41bc4e4dc8c0 Mon Sep 17 00:00:00 2001 From: Manoj Joseph Date: Tue, 7 Dec 2021 12:32:13 -0800 Subject: [PATCH] DOSE-751 Handle pool creation failure (#4) --- cmd/zfs_object_agent/zettaobject/src/pool.rs | 29 ++- .../zettaobject/src/root_connection.rs | 16 +- include/sys/vdev_object_store.h | 1 - module/os/linux/zfs/vdev_object_store.c | 219 +++++++++--------- 4 files changed, 143 insertions(+), 122 deletions(-) diff --git a/cmd/zfs_object_agent/zettaobject/src/pool.rs b/cmd/zfs_object_agent/zettaobject/src/pool.rs index 3e871b0cd537..2a136e54b660 100644 --- a/cmd/zfs_object_agent/zettaobject/src/pool.rs +++ b/cmd/zfs_object_agent/zettaobject/src/pool.rs @@ -94,6 +94,8 @@ lazy_static! { pub static ref CLAIM_DURATION: Duration = Duration::from_secs(get_tunable("claim_duration_secs", 2)); + pub static ref CREATE_WAIT_DURATION: Duration = Duration::from_secs(get_tunable("create_wait_duration_secs", 30)); + // By default, retain metadata for as long as we would return Uberblocks in a block-based pool static ref METADATA_RETENTION_TXGS: u64 = get_tunable("metadata_retention_txgs", 128); @@ -311,6 +313,24 @@ impl PoolPhys { ) .await; } + + pub async fn put_timed( + &self, + object_access: &ObjectAccess, + timeout: Option, + ) -> Result> { + maybe_die_with(|| format!("before putting {:#?}", self)); + debug!("putting {:#?}", self); + let buf = serde_json::to_vec(&self).unwrap(); + object_access + .put_object_timed( + Self::key(self.guid), + buf.into(), + ObjectAccessStatType::MetadataPut, + timeout, + ) + .await + } } impl UberblockPhys { @@ -806,7 +826,11 @@ impl Pool { Ok(nvl) } - pub async fn create(object_access: &ObjectAccess, name: &str, guid: PoolGuid) { + pub async fn create( + object_access: &ObjectAccess, + name: &str, + guid: PoolGuid, + ) -> Result> { let phys = PoolPhys { guid, name: name.to_string(), @@ -815,7 +839,8 @@ impl Pool { checkpoint_txg: None, }; // XXX make sure it doesn't already exist - phys.put(object_access).await; + phys.put_timed(object_access, Some(*CREATE_WAIT_DURATION)) + .await } async fn open_from_txg( diff --git a/cmd/zfs_object_agent/zettaobject/src/root_connection.rs b/cmd/zfs_object_agent/zettaobject/src/root_connection.rs index 2756c780edd6..45609ea7a5e5 100644 --- a/cmd/zfs_object_agent/zettaobject/src/root_connection.rs +++ b/cmd/zfs_object_agent/zettaobject/src/root_connection.rs @@ -106,11 +106,17 @@ impl RootConnectionState { let name = nvl.lookup_string("name")?; let object_access = Self::get_object_access(&nvl)?; - Pool::create(&object_access, name.to_str()?, guid).await; let mut response = NvList::new_unique_names(); response.insert("Type", "pool create done").unwrap(); response.insert("GUID", &guid.0).unwrap(); + if let Err(err) = Pool::create(&object_access, name.to_str()?, guid).await { + error!("pool create failed: {:?}", &err); + response + .insert("cause", err.to_string().replace('\n', "").as_str()) + .unwrap(); + } + maybe_die_with(|| format!("before sending response: {:?}", response)); debug!("sending response: {:?}", response); Ok(Some(response)) @@ -128,6 +134,8 @@ impl RootConnectionState { let txg = nvl.lookup_uint64("TXG").ok().map(Txg); let syncing_txg = nvl.lookup_uint64("syncing_txg").ok().map(Txg); let mut response = NvList::new_unique_names(); + response.insert("Type", "pool open done").unwrap(); + response.insert("GUID", &guid.0).unwrap(); let (pool, phys_opt, next_block) = match Pool::open( object_access, @@ -141,14 +149,12 @@ impl RootConnectionState { .await { Err(PoolOpenError::Mmp(hostname)) => { - response.insert("Type", "pool open failed").unwrap(); response.insert("cause", "MMP").unwrap(); response.insert("hostname", hostname.as_str()).unwrap(); debug!("sending response: {:?}", response); return Ok(Some(response)); } Err(PoolOpenError::Feature(FeatureError { features, readonly })) => { - response.insert("Type", "pool open failed").unwrap(); response.insert("cause", "feature").unwrap(); let mut feature_nvl = NvList::new_unique_names(); for feature in features { @@ -174,7 +180,6 @@ impl RootConnectionState { * then, we just pass the root cause error message back to the kernel, and * hope that it can present a usable error to the user. */ - response.insert("Type", "pool open failed").unwrap(); response.insert("cause", "IO").unwrap(); response .insert("message", e.root_cause().to_string().as_str()) @@ -183,7 +188,6 @@ impl RootConnectionState { return Ok(Some(response)); } Err(PoolOpenError::NoCheckpoint) => { - response.insert("Type", "pool open failed").unwrap(); response.insert("cause", "checkpoint").unwrap(); debug!("sending response: {:?}", response); return Ok(Some(response)); @@ -191,8 +195,6 @@ impl RootConnectionState { Ok(x) => x, }; - response.insert("Type", "pool open done").unwrap(); - response.insert("GUID", &guid.0).unwrap(); if let Some(phys) = phys_opt { response .insert("uberblock", &phys.get_zfs_uberblock()[..]) diff --git a/include/sys/vdev_object_store.h b/include/sys/vdev_object_store.h index ceda16857f88..9f1e72722ea8 100644 --- a/include/sys/vdev_object_store.h +++ b/include/sys/vdev_object_store.h @@ -26,7 +26,6 @@ #define AGENT_TYPE_CREATE_POOL_DONE "pool create done" #define AGENT_TYPE_OPEN_POOL "open pool" #define AGENT_TYPE_OPEN_POOL_DONE "pool open done" -#define AGENT_TYPE_OPEN_POOL_FAILED "pool open failed" #define AGENT_TYPE_READ_BLOCK "read block" #define AGENT_TYPE_READ_DONE "read done" #define AGENT_TYPE_WRITE_BLOCK "write block" diff --git a/module/os/linux/zfs/vdev_object_store.c b/module/os/linux/zfs/vdev_object_store.c index 4e956099f298..cf4979f91e01 100644 --- a/module/os/linux/zfs/vdev_object_store.c +++ b/module/os/linux/zfs/vdev_object_store.c @@ -469,6 +469,16 @@ agent_wait_serial(vdev_object_store_t *vos, vos_serial_types_t wait_type) mutex_exit(&vos->vos_outstanding_lock); } +static void +agent_serial_done(vdev_object_store_t *vos, vos_serial_types_t wait_type) +{ + mutex_enter(&vos->vos_outstanding_lock); + ASSERT(!vos->vos_serial_done[wait_type]); + vos->vos_serial_done[wait_type] = B_TRUE; + cv_broadcast(&vos->vos_outstanding_cv); + mutex_exit(&vos->vos_outstanding_lock); +} + static nvlist_t * agent_io_block_alloc(zio_t *zio) { @@ -1032,7 +1042,7 @@ agent_resume(void *arg) zfs_dbgmsg("agent_resume completed"); } -static void +static uint64_t object_store_create_pool(vdev_t *vd) { ASSERT(vdev_is_object_based(vd)); @@ -1042,6 +1052,7 @@ object_store_create_pool(vdev_t *vd) mutex_exit(&vos->vos_sock_lock); agent_wait_serial(vos, VOS_SERIAL_CREATE_POOL); + return (vos->vos_result); } void @@ -1285,6 +1296,7 @@ agent_reader(void *arg) { vdev_object_store_t *vos = arg; uint64_t nvlist_len = 0; + char *cause = NULL; int err = agent_read_all(vos, &nvlist_len, sizeof (nvlist_len)); if (err != 0) { zfs_dbgmsg("agent_reader(%px) got err %d", curthread, err); @@ -1314,11 +1326,11 @@ agent_reader(void *arg) } // XXX debug message the nvlist if (strcmp(type, AGENT_TYPE_CREATE_POOL_DONE) == 0) { - mutex_enter(&vos->vos_outstanding_lock); - ASSERT(!vos->vos_serial_done[VOS_SERIAL_CREATE_POOL]); - vos->vos_serial_done[VOS_SERIAL_CREATE_POOL] = B_TRUE; - cv_broadcast(&vos->vos_outstanding_cv); - mutex_exit(&vos->vos_outstanding_lock); + if (nvlist_lookup_string(nv, AGENT_CAUSE, &cause) == 0) { + zfs_dbgmsg("got %s cause=\"%s\"", type, cause); + vos->vos_result = SET_ERROR(EACCES); + } + agent_serial_done(vos, VOS_SERIAL_CREATE_POOL); } else if (strcmp(type, AGENT_TYPE_END_TXG_DONE) == 0) { mutex_enter(&vos->vos_stats_lock); vos->vos_stats.voss_blocks_count = @@ -1347,100 +1359,84 @@ agent_reader(void *arg) update_features(vos->vos_vdev->vdev_spa, fnvlist_lookup_nvlist(nv, AGENT_FEATURES)); - mutex_enter(&vos->vos_outstanding_lock); - ASSERT(!vos->vos_serial_done[VOS_SERIAL_END_TXG]); - vos->vos_serial_done[VOS_SERIAL_END_TXG] = B_TRUE; - cv_broadcast(&vos->vos_outstanding_cv); - mutex_exit(&vos->vos_outstanding_lock); + agent_serial_done(vos, VOS_SERIAL_END_TXG); } else if (strcmp(type, AGENT_TYPE_OPEN_POOL_DONE) == 0) { - uint_t len; - uint8_t *arr; - int err = nvlist_lookup_uint8_array(nv, AGENT_UBERBLOCK, - &arr, &len); - if (err == 0) { - ASSERT3U(len, <=, sizeof (uberblock_t)); - bcopy(arr, &vos->vos_uberblock, len); - - /* - * We may be opening an uberblock from a pool - * with an older on-disk format. To handle this, - * we just zero out any uberblock members that - * did not exist when the uberblock was written. - */ - if (len < sizeof (uberblock_t)) { - bzero(&vos->vos_uberblock + len, - sizeof (uberblock_t) - len); - } - VERIFY0(nvlist_lookup_uint8_array(nv, - AGENT_CONFIG, &arr, &len)); - vos->vos_config = fnvlist_unpack((char *)arr, len); - - update_features(vos->vos_vdev->vdev_spa, - fnvlist_lookup_nvlist(nv, AGENT_FEATURES)); - } - - uint64_t next_block = fnvlist_lookup_uint64(nv, - AGENT_NEXT_BLOCK); - vos->vos_next_block = next_block; - - zfs_dbgmsg("got pool open done len=%u block=%llu", - len, (u_longlong_t)next_block); - - fnvlist_free(nv); - mutex_enter(&vos->vos_outstanding_lock); - ASSERT(!vos->vos_serial_done[VOS_SERIAL_OPEN_POOL]); - vos->vos_serial_done[VOS_SERIAL_OPEN_POOL] = B_TRUE; - vos->vos_open_completed = B_TRUE; - cv_broadcast(&vos->vos_outstanding_cv); - mutex_exit(&vos->vos_outstanding_lock); - } else if (strcmp(type, AGENT_TYPE_OPEN_POOL_FAILED) == 0) { - char *cause = fnvlist_lookup_string(nv, AGENT_CAUSE); - spa_t *spa = vos->vos_vdev->vdev_spa; - zfs_dbgmsg("got %s cause=\"%s\"", type, cause); - if (strcmp(cause, "MMP") == 0) { - fnvlist_add_string(spa->spa_load_info, - ZPOOL_CONFIG_MMP_HOSTNAME, fnvlist_lookup_string(nv, - AGENT_HOSTNAME)); - fnvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); - fnvlist_add_uint64(spa->spa_load_info, - ZPOOL_CONFIG_MMP_TXG, 0); - mutex_enter(&vos->vos_outstanding_lock); - vos->vos_result = SET_ERROR(EREMOTEIO); - } else if (strcmp(cause, "IO") == 0) { - char *message = fnvlist_lookup_string(nv, - AGENT_MESSAGE); - zfs_dbgmsg("message=\"%s\"", message); - mutex_enter(&vos->vos_outstanding_lock); - if (strstr(message, "does not exist") != NULL) { - vos->vos_result = SET_ERROR(ENOENT); + if (nvlist_lookup_string(nv, AGENT_CAUSE, &cause) == 0) { + spa_t *spa = vos->vos_vdev->vdev_spa; + zfs_dbgmsg("got %s cause=\"%s\"", type, cause); + if (strcmp(cause, "MMP") == 0) { + fnvlist_add_string(spa->spa_load_info, + ZPOOL_CONFIG_MMP_HOSTNAME, + fnvlist_lookup_string(nv, AGENT_HOSTNAME)); + fnvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); + fnvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_MMP_TXG, 0); + vos->vos_result = SET_ERROR(EREMOTEIO); + } else if (strcmp(cause, "IO") == 0) { + char *message = fnvlist_lookup_string(nv, + AGENT_MESSAGE); + zfs_dbgmsg("message=\"%s\"", message); + if (strstr(message, "does not exist") != NULL) { + vos->vos_result = SET_ERROR(ENOENT); + } else { + vos->vos_result = SET_ERROR(EIO); + } + } else if (strcmp(cause, "checkpoint") == 0) { + zfs_dbgmsg("Failed to find checkpoint when " + "attempting to rewind pool"); + vos->vos_result = + SET_ERROR(ZFS_ERR_NO_CHECKPOINT); } else { - vos->vos_result = SET_ERROR(EIO); + ASSERT0(strcmp(cause, "feature")); + fnvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_UNSUP_FEAT, + fnvlist_lookup_nvlist(nv, AGENT_FEATURES)); + if (fnvlist_lookup_boolean_value(nv, + AGENT_CAN_READONLY)) { + fnvlist_add_boolean(spa->spa_load_info, + ZPOOL_CONFIG_CAN_RDONLY); + } + vos->vos_result = SET_ERROR(ENOTSUP); } - } else if (strcmp(cause, "checkpoint") == 0) { - zfs_dbgmsg("Failed to find checkpoint when attempting " - "to rewind pool"); - vos->vos_result = SET_ERROR(ZFS_ERR_NO_CHECKPOINT); } else { - ASSERT0(strcmp(cause, "feature")); - fnvlist_add_nvlist(spa->spa_load_info, - ZPOOL_CONFIG_UNSUP_FEAT, fnvlist_lookup_nvlist(nv, - AGENT_FEATURES)); - if (fnvlist_lookup_boolean_value(nv, - AGENT_CAN_READONLY)) { - fnvlist_add_boolean(spa->spa_load_info, - ZPOOL_CONFIG_CAN_RDONLY); + uint_t len; + uint8_t *arr; + int err = nvlist_lookup_uint8_array(nv, + AGENT_UBERBLOCK, &arr, &len); + if (err == 0) { + ASSERT3U(len, <=, sizeof (uberblock_t)); + bcopy(arr, &vos->vos_uberblock, len); + + /* + * We may be opening an uberblock from a pool + * with an older on-disk format. To handle + * this, we just zero out any uberblock members + * that did not exist when the uberblock was + * written. + */ + if (len < sizeof (uberblock_t)) { + bzero(&vos->vos_uberblock + len, + sizeof (uberblock_t) - len); + } + VERIFY0(nvlist_lookup_uint8_array(nv, + AGENT_CONFIG, &arr, &len)); + vos->vos_config = fnvlist_unpack((char *)arr, + len); + + update_features(vos->vos_vdev->vdev_spa, + fnvlist_lookup_nvlist(nv, AGENT_FEATURES)); } - mutex_enter(&vos->vos_outstanding_lock); - vos->vos_result = SET_ERROR(ENOTSUP); - } + uint64_t next_block = fnvlist_lookup_uint64(nv, + AGENT_NEXT_BLOCK); + vos->vos_next_block = next_block; - ASSERT(!vos->vos_serial_done[VOS_SERIAL_OPEN_POOL]); - vos->vos_serial_done[VOS_SERIAL_OPEN_POOL] = B_TRUE; - cv_broadcast(&vos->vos_outstanding_cv); - mutex_exit(&vos->vos_outstanding_lock); - fnvlist_free(nv); + zfs_dbgmsg("got pool open done len=%u block=%llu", + len, (u_longlong_t)next_block); + } + vos->vos_open_completed = B_TRUE; + agent_serial_done(vos, VOS_SERIAL_OPEN_POOL); } else if (strcmp(type, AGENT_TYPE_READ_DONE) == 0) { uint64_t req = fnvlist_lookup_uint64(nv, AGENT_REQUEST_ID); @@ -1459,7 +1455,6 @@ agent_reader(void *arg) VERIFY3U(len, ==, zio->io_size); VERIFY3U(len, ==, abd_get_size(zio->io_abd)); abd_copy_from_buf(zio->io_abd, data, len); - fnvlist_free(nv); zio_delay_interrupt(zio); } else if (strcmp(type, AGENT_TYPE_WRITE_DONE) == 0) { uint64_t req = fnvlist_lookup_uint64(nv, @@ -1472,25 +1467,16 @@ agent_reader(void *arg) zio_t *zio = agent_complete_zio(vos, req, token); VERIFY3U(fnvlist_lookup_uint64(nv, AGENT_BLKID), ==, zio->io_offset >> SPA_MINBLOCKSHIFT); - fnvlist_free(nv); zio_delay_interrupt(zio); } else if (strcmp(type, AGENT_TYPE_CLOSE_POOL_DONE) == 0) { zfs_dbgmsg("got %s", type); - mutex_enter(&vos->vos_outstanding_lock); - ASSERT(!vos->vos_serial_done[VOS_SERIAL_CLOSE_POOL]); - vos->vos_serial_done[VOS_SERIAL_CLOSE_POOL] = B_TRUE; - cv_broadcast(&vos->vos_outstanding_cv); - mutex_exit(&vos->vos_outstanding_lock); + agent_serial_done(vos, VOS_SERIAL_CLOSE_POOL); mutex_enter(&vos->vos_lock); vos->vos_agent_thread_exit = B_TRUE; mutex_exit(&vos->vos_lock); } else if (strcmp(type, AGENT_TYPE_ENABLE_FEATURE_DONE) == 0) { - mutex_enter(&vos->vos_outstanding_lock); - ASSERT(!vos->vos_serial_done[VOS_SERIAL_ENABLE_FEATURE]); - vos->vos_serial_done[VOS_SERIAL_ENABLE_FEATURE] = B_TRUE; - cv_broadcast(&vos->vos_outstanding_cv); vos->vos_feature_enable = NULL; - mutex_exit(&vos->vos_outstanding_lock); + agent_serial_done(vos, VOS_SERIAL_ENABLE_FEATURE); } else if (strcmp(type, AGENT_TYPE_GET_STATS_DONE) == 0) { object_store_stats_call_t *caller, search; @@ -1517,10 +1503,11 @@ agent_reader(void *arg) zfs_dbgmsg("unexpected get stats done response: " "owner 0x%llx", (longlong_t)search.oss_owner); } - fnvlist_free(nv); } else { zfs_dbgmsg("unrecognized response type!"); } + + fnvlist_free(nv); return (0); } @@ -1766,15 +1753,21 @@ vdev_object_store_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, vd, 0, &p0, TS_RUN, defclsyspri); if (vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE) { - object_store_create_pool(vd); + error = object_store_create_pool(vd); + if (error != 0) { + zfs_dbgmsg("agent_create_pool failed with %d", error); + goto sock_ready; + } } error = agent_open_pool(vd, vos, vdev_object_store_open_mode(spa_mode(vd->vdev_spa)), B_FALSE); if (error != 0) { ASSERT3U(vd->vdev_spa->spa_load_state, !=, SPA_LOAD_CREATE); - return (error); + goto sock_ready; } +sock_ready: + /* * Socket is now ready for communication, wake up * anyone waiting. @@ -1790,11 +1783,13 @@ vdev_object_store_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, * XXX - We can only support ~1EB since the metaslab weights * use some of the high order bits. */ - *max_psize = *psize = (1ULL << 60) - 1; - *logical_ashift = vdev_object_store_logical_ashift; - *physical_ashift = vdev_object_store_physical_ashift; + if (!error) { + *max_psize = *psize = (1ULL << 60) - 1; + *logical_ashift = vdev_object_store_logical_ashift; + *physical_ashift = vdev_object_store_physical_ashift; + } - return (0); + return (error); } static void