Skip to content

Commit

Permalink
DOSE-751 Handle pool creation failure (openzfs#4)
Browse files Browse the repository at this point in the history
  • Loading branch information
manoj-joseph authored Dec 7, 2021
1 parent 9d2d9f1 commit 95784ca
Show file tree
Hide file tree
Showing 4 changed files with 143 additions and 122 deletions.
29 changes: 27 additions & 2 deletions cmd/zfs_object_agent/zettaobject/src/pool.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ lazy_static! {

pub static ref CLAIM_DURATION: Duration = Duration::from_secs(get_tunable("claim_duration_secs", 2));

pub static ref CREATE_WAIT_DURATION: Duration = Duration::from_secs(get_tunable("create_wait_duration_secs", 30));

// By default, retain metadata for as long as we would return Uberblocks in a block-based pool
static ref METADATA_RETENTION_TXGS: u64 = get_tunable("metadata_retention_txgs", 128);

Expand Down Expand Up @@ -311,6 +313,24 @@ impl PoolPhys {
)
.await;
}

/// Serializes this `PoolPhys` as JSON and writes it to the object store.
///
/// The object is stored under `Self::key(self.guid)` and the request is
/// accounted as `ObjectAccessStatType::MetadataPut`. `timeout` is passed
/// straight through to `ObjectAccess::put_object_timed`; presumably `None`
/// means "no deadline" -- confirm against that method.
///
/// # Errors
///
/// Returns whatever error `put_object_timed` reports, unchanged.
///
/// # Panics
///
/// Panics if `serde_json::to_vec` fails to serialize `self`.
pub async fn put_timed(
    &self,
    object_access: &ObjectAccess,
    timeout: Option<Duration>,
) -> Result<rusoto_s3::PutObjectOutput, OAError<rusoto_s3::PutObjectError>> {
    // NOTE(review): `maybe_die_with` looks like a fault-injection hook that may
    // abort here for testing -- confirm against its definition.
    maybe_die_with(|| format!("before putting {:#?}", self));
    debug!("putting {:#?}", self);

    // Serialize first, then derive the key, matching the original ordering.
    let serialized = serde_json::to_vec(&self).unwrap();
    let key = Self::key(self.guid);

    // Building the future does no work on its own; the put runs when awaited.
    let pending_put = object_access.put_object_timed(
        key,
        serialized.into(),
        ObjectAccessStatType::MetadataPut,
        timeout,
    );
    pending_put.await
}
}

impl UberblockPhys {
Expand Down Expand Up @@ -806,7 +826,11 @@ impl Pool {
Ok(nvl)
}

pub async fn create(object_access: &ObjectAccess, name: &str, guid: PoolGuid) {
pub async fn create(
object_access: &ObjectAccess,
name: &str,
guid: PoolGuid,
) -> Result<rusoto_s3::PutObjectOutput, OAError<rusoto_s3::PutObjectError>> {
let phys = PoolPhys {
guid,
name: name.to_string(),
Expand All @@ -815,7 +839,8 @@ impl Pool {
checkpoint_txg: None,
};
// XXX make sure it doesn't already exist
phys.put(object_access).await;
phys.put_timed(object_access, Some(*CREATE_WAIT_DURATION))
.await
}

async fn open_from_txg(
Expand Down
16 changes: 9 additions & 7 deletions cmd/zfs_object_agent/zettaobject/src/root_connection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,11 +106,17 @@ impl RootConnectionState {
let name = nvl.lookup_string("name")?;
let object_access = Self::get_object_access(&nvl)?;

Pool::create(&object_access, name.to_str()?, guid).await;
let mut response = NvList::new_unique_names();
response.insert("Type", "pool create done").unwrap();
response.insert("GUID", &guid.0).unwrap();

if let Err(err) = Pool::create(&object_access, name.to_str()?, guid).await {
error!("pool create failed: {:?}", &err);
response
.insert("cause", err.to_string().replace('\n', "").as_str())
.unwrap();
}

maybe_die_with(|| format!("before sending response: {:?}", response));
debug!("sending response: {:?}", response);
Ok(Some(response))
Expand All @@ -128,6 +134,8 @@ impl RootConnectionState {
let txg = nvl.lookup_uint64("TXG").ok().map(Txg);
let syncing_txg = nvl.lookup_uint64("syncing_txg").ok().map(Txg);
let mut response = NvList::new_unique_names();
response.insert("Type", "pool open done").unwrap();
response.insert("GUID", &guid.0).unwrap();

let (pool, phys_opt, next_block) = match Pool::open(
object_access,
Expand All @@ -141,14 +149,12 @@ impl RootConnectionState {
.await
{
Err(PoolOpenError::Mmp(hostname)) => {
response.insert("Type", "pool open failed").unwrap();
response.insert("cause", "MMP").unwrap();
response.insert("hostname", hostname.as_str()).unwrap();
debug!("sending response: {:?}", response);
return Ok(Some(response));
}
Err(PoolOpenError::Feature(FeatureError { features, readonly })) => {
response.insert("Type", "pool open failed").unwrap();
response.insert("cause", "feature").unwrap();
let mut feature_nvl = NvList::new_unique_names();
for feature in features {
Expand All @@ -174,7 +180,6 @@ impl RootConnectionState {
* then, we just pass the root cause error message back to the kernel, and
* hope that it can present a usable error to the user.
*/
response.insert("Type", "pool open failed").unwrap();
response.insert("cause", "IO").unwrap();
response
.insert("message", e.root_cause().to_string().as_str())
Expand All @@ -183,16 +188,13 @@ impl RootConnectionState {
return Ok(Some(response));
}
Err(PoolOpenError::NoCheckpoint) => {
response.insert("Type", "pool open failed").unwrap();
response.insert("cause", "checkpoint").unwrap();
debug!("sending response: {:?}", response);
return Ok(Some(response));
}
Ok(x) => x,
};

response.insert("Type", "pool open done").unwrap();
response.insert("GUID", &guid.0).unwrap();
if let Some(phys) = phys_opt {
response
.insert("uberblock", &phys.get_zfs_uberblock()[..])
Expand Down
1 change: 0 additions & 1 deletion include/sys/vdev_object_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
#define AGENT_TYPE_CREATE_POOL_DONE "pool create done"
#define AGENT_TYPE_OPEN_POOL "open pool"
#define AGENT_TYPE_OPEN_POOL_DONE "pool open done"
#define AGENT_TYPE_OPEN_POOL_FAILED "pool open failed"
#define AGENT_TYPE_READ_BLOCK "read block"
#define AGENT_TYPE_READ_DONE "read done"
#define AGENT_TYPE_WRITE_BLOCK "write block"
Expand Down
Loading

0 comments on commit 95784ca

Please sign in to comment.