diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 9597e2790756..87deae2928bf 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -4487,7 +4487,7 @@ zfs_do_receive(int argc, char **argv) nomem(); /* check options */ - while ((c = getopt(argc, argv, ":o:x:dehnuvFsA")) != -1) { + while ((c = getopt(argc, argv, ":o:x:dehnuvFsAc")) != -1) { switch (c) { case 'o': if (!parseprop(props, optarg)) { @@ -4540,6 +4540,9 @@ zfs_do_receive(int argc, char **argv) case 'A': abort_resumable = B_TRUE; break; + case 'c': + flags.heal = B_TRUE; + break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); diff --git a/include/libzfs.h b/include/libzfs.h index 8e9f6fb3fc1b..c1d468c07ffe 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -751,6 +751,9 @@ typedef struct recvflags { /* mount the filesystem unless nomount is specified */ boolean_t domount; + + /* use this recv to check (and heal if needed) an existing snapshot */ + boolean_t heal; } recvflags_t; extern int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *, diff --git a/include/libzfs_core.h b/include/libzfs_core.h index bd0b0c4f73d3..21f158b4f6fc 100644 --- a/include/libzfs_core.h +++ b/include/libzfs_core.h @@ -21,9 +21,9 @@ /* * Copyright (c) 2012, 2018 by Delphix. All rights reserved. - * Copyright (c) 2017 Datto Inc. * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + * Copyright (c) 2019 Datto Inc. 
*/ #ifndef _LIBZFS_CORE_H @@ -105,6 +105,10 @@ int lzc_receive_with_cmdprops(const char *, nvlist_t *, nvlist_t *, uint8_t *, uint_t, const char *, boolean_t, boolean_t, boolean_t, int, const struct dmu_replay_record *, int, uint64_t *, uint64_t *, uint64_t *, nvlist_t **); +int lzc_receive_with_heal(const char *, nvlist_t *, nvlist_t *, + uint8_t *, uint_t, const char *, boolean_t, boolean_t, boolean_t, boolean_t, + int, const struct dmu_replay_record *, int, uint64_t *, uint64_t *, + uint64_t *, nvlist_t **); int lzc_send_space(const char *, const char *, enum lzc_send_flags, uint64_t *); int lzc_send_space_resume_redacted(const char *, const char *, enum lzc_send_flags, uint64_t, uint64_t, uint64_t, const char *, diff --git a/include/sys/dmu_recv.h b/include/sys/dmu_recv.h index 1a7347d66e8f..36c2037a0d36 100644 --- a/include/sys/dmu_recv.h +++ b/include/sys/dmu_recv.h @@ -24,6 +24,7 @@ * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2019 Datto Inc. */ #ifndef _DMU_RECV_H @@ -48,6 +49,7 @@ typedef struct dmu_recv_cookie { boolean_t drc_byteswap; uint64_t drc_featureflags; boolean_t drc_force; + boolean_t drc_heal; boolean_t drc_resumable; boolean_t drc_raw; boolean_t drc_clone; @@ -80,7 +82,7 @@ typedef struct dmu_recv_cookie { } dmu_recv_cookie_t; int dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, - boolean_t force, boolean_t resumable, nvlist_t *localprops, + boolean_t force, boolean_t heal, boolean_t resumable, nvlist_t *localprops, nvlist_t *hidden_args, char *origin, dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp); int dmu_recv_stream(dmu_recv_cookie_t *drc, int cleanup_fd, diff --git a/include/sys/spa.h b/include/sys/spa.h index 3780fdae0fd3..205c928e441c 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -26,8 +26,8 @@ * Copyright 2013 Saso Kiselkov. All rights reserved. 
* Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Joyent, Inc. - * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2019 Datto Inc. */ #ifndef _SYS_SPA_H @@ -1123,6 +1123,7 @@ extern const char *spa_state_to_name(spa_t *spa); /* error handling */ struct zbookmark_phys; extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb); +extern void spa_remove_error(spa_t *spa, zbookmark_phys_t *zb); extern int zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, uint64_t length); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 8dfd46431613..a7606900631f 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -25,8 +25,8 @@ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. - * Copyright (c) 2017 Datto Inc. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2019 Datto Inc. */ #ifndef _SYS_SPA_IMPL_H @@ -341,6 +341,7 @@ struct spa { kmutex_t spa_errlist_lock; /* error list/ereport lock */ avl_tree_t spa_errlist_last; /* last error list */ avl_tree_t spa_errlist_scrub; /* scrub error list */ + avl_tree_t spa_errlist_healed; /* list of healed blocks */ uint64_t spa_deflate; /* should we deflate? 
*/ uint64_t spa_history; /* history object */ kmutex_t spa_history_lock; /* history lock */ diff --git a/include/sys/zio.h b/include/sys/zio.h index 23a4a6ea93cc..f4bee5309316 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -520,6 +520,8 @@ extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, extern zio_t *zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags); +extern void zio_destroy(zio_t *zio); + extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, struct abd *data, uint64_t lsize, zio_done_func_t *done, void *private, zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 20d29f48c6d8..5fb297e31690 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -816,6 +816,29 @@ send_iterate_prop(zfs_handle_t *zhp, boolean_t received_only, nvlist_t *nv) } } +/* + * Returns the guid (ZFS_PROP_GUID) of the snapshot named "fs@snap". + * Returns 0 if fs or snap is NULL/empty or the snapshot cannot be opened. + */ +static uint64_t +get_snap_guid(libzfs_handle_t *hdl, const char *fs, const char *snap) +{ + char name[MAXPATHLEN + 1]; + uint64_t guid = 0; + + if (fs == NULL || fs[0] == '\0' || snap == NULL || snap[0] == '\0') + return (guid); + + (void) snprintf(name, sizeof (name), "%s@%s", fs, snap); + zfs_handle_t *zhp = zfs_open(hdl, name, ZFS_TYPE_SNAPSHOT); + if (zhp != NULL) { + guid = zfs_prop_get_int(zhp, ZFS_PROP_GUID); + zfs_close(zhp); + } + + return (guid); +} + /* * returns snapshot creation txg * and returns 0 if the snapshot does not exist @@ -4655,9 +4678,34 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, redacted = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_REDACTED; - if (zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) { + if (flags->heal) { + if (flags->isprefix || flags->istail || flags->force || + flags->canmountoff || flags->resumable || flags->nomount || + flags->skipholds) { + zfs_error_aux(hdl, 
dgettext(TEXT_DOMAIN, + "corrective recv can not be used when combined with" + " this flag")); + err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); + goto out; + } + uint64_t guid = + get_snap_guid(hdl, name, strchr(destsnap, '@') + 1); + if (guid == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "corrective recv must specify an existing snapshot" + " to heal")); + err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); + goto out; + } else if (guid != drrb->drr_toguid) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "local snapshot doesn't match the snapshot" + " in the provided stream")); + err = zfs_error(hdl, EZFS_WRONG_PARENT, errbuf); + goto out; + } + } else if (zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) { zfs_cmd_t zc = {"\0"}; - zfs_handle_t *zhp; + zfs_handle_t *zhp = NULL; boolean_t encrypted; (void) strcpy(zc.zc_name, name); @@ -4850,8 +4898,9 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, } if (flags->verbose) { - (void) printf("%s %s stream of %s into %s\n", + (void) printf("%s %s%s stream of %s into %s\n", flags->dryrun ? "would receive" : "receiving", + flags->heal ? " corrective" : "", drrb->drr_fromguid ? 
"incremental" : "full", drrb->drr_toname, destsnap); (void) fflush(stdout); @@ -4921,10 +4970,18 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, zfs_prop_to_name(ZFS_PROP_ENCRYPTION), ZIO_CRYPT_OFF); } - err = ioctl_err = lzc_receive_with_cmdprops(destsnap, rcvprops, - oxprops, wkeydata, wkeylen, origin, flags->force, flags->resumable, - raw, infd, drr_noswap, cleanup_fd, &read_bytes, &errflags, - action_handlep, &prop_errors); + if (flags->heal) { + err = ioctl_err = lzc_receive_with_heal(destsnap, rcvprops, + oxprops, wkeydata, wkeylen, origin, flags->force, + flags->heal, flags->resumable, raw, infd, drr_noswap, + cleanup_fd, &read_bytes, &errflags, action_handlep, + &prop_errors); + } else { + err = ioctl_err = lzc_receive_with_cmdprops(destsnap, rcvprops, + oxprops, wkeydata, wkeylen, origin, flags->force, + flags->resumable, raw, infd, drr_noswap, cleanup_fd, + &read_bytes, &errflags, action_handlep, &prop_errors); + } ioctl_errno = ioctl_err; prop_errflags = errflags; @@ -5048,7 +5105,11 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); break; case EACCES: - if (raw && stream_wantsnewfs) { + if (flags->heal) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "key must be loaded to do a non-raw correc" + "tive recv on an encrypted dataset.")); + } else if (raw && stream_wantsnewfs) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to create encryption key")); } else if (raw && !stream_wantsnewfs) { @@ -5087,12 +5148,24 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; case ECKSUM: - recv_ecksum_set_aux(hdl, destsnap, flags->resumable); + if (flags->heal) + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "corrective receive was not able to recon" + "struct the data needed for healing.")); + else + recv_ecksum_set_aux(hdl, destsnap, + flags->resumable); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); 
break; case ENOTSUP: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool must be upgraded to receive this stream.")); + if (flags->heal) + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "stream is not compatible with the " + "data in the pool.")); + else + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded to receive this " + "stream.")); (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); break; case EDQUOT: diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c index 7430a845a43b..d893e0a687da 100644 --- a/lib/libzfs_core/libzfs_core.c +++ b/lib/libzfs_core/libzfs_core.c @@ -22,9 +22,9 @@ /* * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. - * Copyright (c) 2017 Datto Inc. * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + * Copyright (c) 2019 Datto Inc. */ /* @@ -778,7 +778,7 @@ recv_read(int fd, void *buf, int ilen) static int recv_impl(const char *snapname, nvlist_t *recvdprops, nvlist_t *localprops, uint8_t *wkeydata, uint_t wkeylen, const char *origin, boolean_t force, - boolean_t resumable, boolean_t raw, int input_fd, + boolean_t heal, boolean_t resumable, boolean_t raw, int input_fd, const dmu_replay_record_t *begin_record, int cleanup_fd, uint64_t *read_bytes, uint64_t *errflags, uint64_t *action_handle, nvlist_t **errors) @@ -824,7 +824,7 @@ recv_impl(const char *snapname, nvlist_t *recvdprops, nvlist_t *localprops, /* * All receives with a payload should use the new interface. 
*/ - if (resumable || raw || wkeydata != NULL || payload) { + if (resumable || heal || raw || wkeydata != NULL || payload) { nvlist_t *outnvl = NULL; nvlist_t *innvl = fnvlist_alloc(); @@ -864,6 +864,9 @@ recv_impl(const char *snapname, nvlist_t *recvdprops, nvlist_t *localprops, if (resumable) fnvlist_add_boolean(innvl, "resumable"); + if (heal) + fnvlist_add_boolean(innvl, "heal"); + if (cleanup_fd >= 0) fnvlist_add_int32(innvl, "cleanup_fd", cleanup_fd); @@ -982,7 +985,7 @@ lzc_receive(const char *snapname, nvlist_t *props, const char *origin, boolean_t force, boolean_t raw, int fd) { return (recv_impl(snapname, props, NULL, NULL, 0, origin, force, - B_FALSE, raw, fd, NULL, -1, NULL, NULL, NULL, NULL)); + B_FALSE, B_FALSE, raw, fd, NULL, -1, NULL, NULL, NULL, NULL)); } /* @@ -996,7 +999,7 @@ lzc_receive_resumable(const char *snapname, nvlist_t *props, const char *origin, boolean_t force, boolean_t raw, int fd) { return (recv_impl(snapname, props, NULL, NULL, 0, origin, force, - B_TRUE, raw, fd, NULL, -1, NULL, NULL, NULL, NULL)); + B_FALSE, B_TRUE, raw, fd, NULL, -1, NULL, NULL, NULL, NULL)); } /* @@ -1019,7 +1022,8 @@ lzc_receive_with_header(const char *snapname, nvlist_t *props, return (EINVAL); return (recv_impl(snapname, props, NULL, NULL, 0, origin, force, - resumable, raw, fd, begin_record, -1, NULL, NULL, NULL, NULL)); + B_FALSE, resumable, raw, fd, begin_record, -1, NULL, NULL, NULL, + NULL)); } /* @@ -1049,8 +1053,8 @@ int lzc_receive_one(const char *snapname, nvlist_t *props, nvlist_t **errors) { return (recv_impl(snapname, props, NULL, NULL, 0, origin, force, - resumable, raw, input_fd, begin_record, cleanup_fd, read_bytes, - errflags, action_handle, errors)); + B_FALSE, resumable, raw, input_fd, begin_record, cleanup_fd, + read_bytes, errflags, action_handle, errors)); } /* @@ -1063,13 +1067,32 @@ int lzc_receive_one(const char *snapname, nvlist_t *props, */ int lzc_receive_with_cmdprops(const char *snapname, nvlist_t *props, nvlist_t *cmdprops, 
uint8_t *wkeydata, uint_t wkeylen, const char *origin, - boolean_t force, boolean_t resumable, boolean_t raw, int input_fd, - const dmu_replay_record_t *begin_record, int cleanup_fd, + boolean_t force, boolean_t resumable, boolean_t raw, + int input_fd, const dmu_replay_record_t *begin_record, int cleanup_fd, + uint64_t *read_bytes, uint64_t *errflags, uint64_t *action_handle, + nvlist_t **errors) +{ + return (recv_impl(snapname, props, cmdprops, wkeydata, wkeylen, origin, + force, B_FALSE, resumable, raw, input_fd, begin_record, cleanup_fd, + read_bytes, errflags, action_handle, errors)); +} + +/* + * Like lzc_receive_with_cmdprops, but allows the caller to pass an additional + * 'heal' argument. + * + * The heal argument tells us to heal the provided snapshot using the provided + * send stream. + */ +int lzc_receive_with_heal(const char *snapname, nvlist_t *props, + nvlist_t *cmdprops, uint8_t *wkeydata, uint_t wkeylen, const char *origin, + boolean_t force, boolean_t heal, boolean_t resumable, boolean_t raw, + int input_fd, const dmu_replay_record_t *begin_record, int cleanup_fd, uint64_t *read_bytes, uint64_t *errflags, uint64_t *action_handle, nvlist_t **errors) { return (recv_impl(snapname, props, cmdprops, wkeydata, wkeylen, origin, - force, resumable, raw, input_fd, begin_record, cleanup_fd, + force, heal, resumable, raw, input_fd, begin_record, cleanup_fd, read_bytes, errflags, action_handle, errors)); } diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 1c773435c9fb..d013fa396ca3 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -2846,6 +2846,19 @@ estimates. Default value: \fB0\fR. .RE +.sp +.ne 2 +.na +\fBzfs_recv_best_effort_corrective\fR (int) +.ad +.RS 12n +When this variable is set to non-zero a corrective receive will not stop healing +and will continue going through the provided stream if a healing error is +encountered. +.sp +Default value: \fB0\fR. 
+.RE + .sp .ne 2 .na diff --git a/man/man8/zfs.8 b/man/man8/zfs.8 index e391b9810629..58c4eb0e9731 100644 --- a/man/man8/zfs.8 +++ b/man/man8/zfs.8 @@ -29,8 +29,9 @@ .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. +.\" Copyright 2019 Datto Inc. .\" -.Dd June 30, 2019 +.Dd Sept 09, 2019 .Dt ZFS 8 SMM .Os Linux .Sh NAME @@ -234,6 +235,11 @@ .Fl A .Ar filesystem Ns | Ns Ar volume .Nm +.Cm receive +.Fl c +.Op Fl vn +.Ar snapshot +.Nm .Cm redact .Ar snapshot redaction_bookmark .Ar redaction_snapshot Ns ... @@ -4361,6 +4367,22 @@ Abort an interrupted deleting its saved partially received state. .It Xo .Nm +.Cm receive +.Fl c +.Op Fl vn +.Ar snapshot +.Xc +Attempt to correct data corruption in the specified +.Nm snapshot, +by using the provided stream as the source of healthy data. This method of +healing can only heal data blocks present in the stream. Metadata is not +able to be healed by corrective receive. Running a scrub is recommended post +healing to ensure all corruption has been healed. It's important to consider +why corruption has happened in the first place, since if you have slowly failing +hardware, periodically healing the data is not going to save you from data loss +later on when the hardware fails completely. +.It Xo +.Nm .Cm redact .Ar snapshot redaction_bookmark .Ar redaction_snapshot Ns ... 
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 78c67343316c..80d50b0a6ea4 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -509,6 +509,9 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH; + if ((flags & DMU_READ_NO_DECRYPT) != 0) + dbuf_flags |= DB_RF_NO_DECRYPT; + rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 48c3705c65a6..62cb39ccfaeb 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -25,8 +25,10 @@ * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. * Copyright (c) 2018, loli10K . All rights reserved. + * Copyright (c) 2019 Datto Inc. */ +#include #include #include #include @@ -64,6 +66,7 @@ int zfs_recv_queue_length = SPA_MAXBLOCKSIZE; int zfs_recv_queue_ff = 20; +int zfs_recv_best_effort_corrective = 0; static char *dmu_recv_tag = "dmu_recv_tag"; const char *recv_clone_name = "%recv"; @@ -101,6 +104,8 @@ struct receive_writer_arg { int err; /* A map from guid to dataset to help handle dedup'd streams. 
*/ avl_tree_t *guid_to_ds_map; + const char *tofs; + boolean_t heal; boolean_t resumable; boolean_t raw; /* DMU_BACKUP_FEATURE_RAW set */ boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */ @@ -117,6 +122,7 @@ struct receive_writer_arg { uint8_t or_iv[ZIO_DATA_IV_LEN]; uint8_t or_mac[ZIO_DATA_MAC_LEN]; boolean_t or_byteorder; + zio_t *heal_pio; }; typedef struct guid_map_entry { @@ -342,9 +348,10 @@ static int recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, uint64_t fromguid, uint64_t featureflags) { - uint64_t val; + uint64_t obj; uint64_t children; int error; + dsl_dataset_t *snap; dsl_pool_t *dp = ds->ds_dir->dd_pool; boolean_t encrypted = ds->ds_dir->dd_crypto_obj != 0; boolean_t raw = (featureflags & DMU_BACKUP_FEATURE_RAW) != 0; @@ -353,7 +360,7 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, /* Temporary clone name must not exist. */ error = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name, - 8, 1, &val); + 8, 1, &obj); if (error != ENOENT) return (error == 0 ? SET_ERROR(EBUSY) : error); @@ -361,12 +368,16 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, if (dsl_dataset_has_resume_receive_state(ds)) return (SET_ERROR(EBUSY)); - /* New snapshot name must not exist. */ + /* New snapshot name must not exist if we're not healing it. */ error = zap_lookup(dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, - drba->drba_cookie->drc_tosnap, 8, 1, &val); - if (error != ENOENT) + drba->drba_cookie->drc_tosnap, 8, 1, &obj); + if (drba->drba_cookie->drc_heal) { + if (error != 0) + return (error); + } else if (error != ENOENT) { return (error == 0 ? SET_ERROR(EEXIST) : error); + } /* Must not have children if receiving a ZVOL. 
*/ error = zap_count(dp->dp_meta_objset, @@ -391,8 +402,38 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, if (error != 0) return (error); - if (fromguid != 0) { - dsl_dataset_t *snap; + if (drba->drba_cookie->drc_heal) { + /* Encryption is incompatible with embedded data. */ + if (encrypted && embed) + return (SET_ERROR(EINVAL)); + + /* Healing is not supported when in 'force' mode. */ + if (drba->drba_cookie->drc_force) + return (SET_ERROR(EINVAL)); + + /* Must have keys loaded if doing encrypted non-raw recv. */ + if (encrypted && !raw) { + if (spa_keystore_lookup_key(dp->dp_spa, ds->ds_object, + NULL, NULL) != 0) + return (SET_ERROR(EACCES)); + } + + error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap); + if (error != 0) + return (error); + + /* + * Healing can only be done if the send stream is for the same + * snapshot as the one we are trying to heal. + */ + if (drba->drba_cookie->drc_drrb->drr_toguid != + dsl_dataset_phys(snap)->ds_guid) { + dsl_dataset_rele(snap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + dsl_dataset_rele(snap, FTAG); + } else if (fromguid != 0) { + /* Sanity check the incremental recv */ uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; /* Can't perform a raw receive on top of a non-raw receive */ @@ -452,7 +493,7 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, dsl_dataset_rele(snap, FTAG); } else { - /* if full, then must be forced */ + /* If full and not healing then must be forced. 
*/ if (!drba->drba_cookie->drc_force) return (SET_ERROR(EEXIST)); @@ -780,7 +821,7 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); if (error == 0) { - /* create temporary clone */ + /* Create temporary clone */ dsl_dataset_t *snap = NULL; if (drba->drba_cookie->drc_fromsnapobj != 0) { @@ -788,8 +829,15 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) drba->drba_cookie->drc_fromsnapobj, FTAG, &snap)); ASSERT3P(dcp, ==, NULL); } - dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, - snap, crflags, drba->drba_cred, dcp, tx); + if (drc->drc_heal) { + /* When healing we want to use the provided snapshot */ + VERIFY0(dsl_dataset_snap_lookup(ds, drc->drc_tosnap, + &dsobj)); + } else { + dsobj = dsl_dataset_create_sync(ds->ds_dir, + recv_clone_name, snap, crflags, drba->drba_cred, + dcp, tx); + } if (drba->drba_cookie->drc_fromsnapobj != 0) dsl_dataset_rele(snap, FTAG); dsl_dataset_rele_flags(ds, dsflags, FTAG); @@ -907,7 +955,8 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) */ rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG); if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds)) && - (featureflags & DMU_BACKUP_FEATURE_RAW) == 0) { + (featureflags & DMU_BACKUP_FEATURE_RAW) == 0 && + !drc->drc_heal) { (void) dmu_objset_create_impl(dp->dp_spa, newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); } @@ -1102,7 +1151,7 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) */ int dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, - boolean_t force, boolean_t resumable, nvlist_t *localprops, + boolean_t force, boolean_t heal, boolean_t resumable, nvlist_t *localprops, nvlist_t *hidden_args, char *origin, dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp) { @@ -1115,6 +1164,7 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, drc->drc_tosnap = tosnap; drc->drc_tofs = tofs; drc->drc_force = force; + drc->drc_heal = heal; drc->drc_resumable = resumable; 
drc->drc_cred = CRED(); drc->drc_clone = (origin != NULL); @@ -1203,6 +1253,261 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, return (err); } +/* + * Holds data needed by the corrective recv callback + */ +typedef struct cr_cb_data { + uint64_t size; + zbookmark_phys_t zb; + spa_t *spa; +} cr_cb_data_t; + +static void +corrective_read_done(zio_t *zio) +{ + cr_cb_data_t *data = zio->io_private; + /* Corruption corrected; update error log if needed */ + if (zio->io_error == 0) + spa_remove_error(data->spa, &data->zb); + kmem_free(data, sizeof (cr_cb_data_t)); + abd_free(zio->io_abd); +} + +/* + * zio_rewrite the data pointed to by bp with the data from the arc buf. + */ +static int +do_corrective_recv(struct receive_writer_arg *rwa, uint64_t obj, + arc_buf_t *arc_buf, uint64_t lsize, blkptr_t *bp, uint64_t blkid, + uint64_t offset) +{ + int err; + abd_t *abd; + zio_t *io; + zbookmark_phys_t zb; + void *buf = NULL, *enc_buf = NULL; + uint64_t size = arc_buf_size(arc_buf); + zio_cksum_t bp_cksum = bp->blk_cksum; + enum zio_flag flags = ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL; + + SET_BOOKMARK(&zb, dmu_objset_id(rwa->os), obj, 0, blkid); + + if (BP_USES_CRYPT(bp) && !rwa->raw) { + /* + * If we're healing an encrypted block we should be using a + * raw send or have the encryption keys loaded + */ + dsl_dataset_t *ds; + dsl_pool_t *dp = rwa->os->os_dsl_dataset->ds_dir->dd_pool; + + /* get the dataset that the snap to be healed is in */ + dsl_pool_config_enter(dp, FTAG); + /* load the key for this dataset */ + err = dsl_dataset_hold_flags(dp, rwa->tofs, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); + if (err != 0) { + dsl_pool_config_exit(dp, FTAG); + return (err); + } + err = spa_keystore_lookup_key(rwa->os->os_spa, + dmu_objset_id(rwa->os), NULL, NULL); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + dsl_pool_config_exit(dp, FTAG); + if (err != 0) + return (SET_ERROR(EACCES)); + } + + /* + * Raw 
healing is possible when the data is unchanged between the + * send stream and on-disk + */ + if (rwa->raw) { + if (arc_is_encrypted(arc_buf) != BP_IS_ENCRYPTED(bp)) + return (SET_ERROR(ENOTSUP)); + if (arc_get_compression(arc_buf) != BP_GET_COMPRESS(bp)) + return (SET_ERROR(ENOTSUP)); + flags |= ZIO_FLAG_RAW; + } + + /* Get the good data from the recv record */ + abd = abd_get_from_buf(arc_buf->b_data, arc_buf_size(arc_buf)); + abd_take_ownership_of_buf(abd, B_FALSE); + + if ((arc_get_compression(arc_buf) != BP_GET_COMPRESS(bp)) || + (!arc_is_encrypted(arc_buf) && BP_USES_CRYPT(bp))) { + /* + * The compression and/or encryption in the stream doesn't match + * what we have on disk; we need to re-compress/re-encrypt the + * buf to match what was written to disk previously. + */ + ASSERT(!rwa->raw); + if (arc_get_compression(arc_buf) != ZIO_COMPRESS_OFF) { + /* Decompress the stream data (XXX: buf is NULL here) */ + err = zio_decompress_data(arc_get_compression(arc_buf), + abd, buf, arc_buf_size(arc_buf), lsize); + abd_release_ownership_of_buf(abd); + abd_put(abd); + if (err != 0) + return (err); + /* Swap in the newly decompressed data into the abd */ + abd = abd_get_from_buf(buf, lsize); + abd_take_ownership_of_buf(abd, B_FALSE); + } + + /* + * The stream is not encrypted but the data on-disk is. + * We need to re-encrypt the buf using the same + * encryption type, salt, iv, and mac that was used to encrypt + * the block previously. 
+ */ + if (!arc_is_encrypted(arc_buf) && BP_USES_CRYPT(bp)) { + dsl_dataset_t *ds; + dsl_crypto_key_t *dck = NULL; + uint8_t salt[ZIO_DATA_SALT_LEN]; + uint8_t iv[ZIO_DATA_IV_LEN]; + uint8_t mac[ZIO_DATA_MAC_LEN]; + boolean_t no_crypt = B_FALSE; + dsl_pool_t *dp = + rwa->os->os_dsl_dataset->ds_dir->dd_pool; + enc_buf = zio_data_buf_alloc(lsize); + abd_t *eabd = abd_get_from_buf(enc_buf, lsize); + abd_take_ownership_of_buf(eabd, B_FALSE); + zio_crypt_decode_params_bp(bp, salt, iv); + zio_crypt_decode_mac_bp(bp, mac); + + dsl_pool_config_enter(dp, FTAG); + err = dsl_dataset_hold_flags(dp, rwa->tofs, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); + if (err != 0) { + dsl_pool_config_exit(dp, FTAG); + abd_release_ownership_of_buf(abd); + abd_put(abd); + if (buf != NULL) + abd_free(abd); + abd_release_ownership_of_buf(eabd); + abd_put(eabd); + abd_free(eabd); + zio_data_buf_free(enc_buf, lsize); + return (SET_ERROR(EACCES)); + } + + /* Look up the key from the spa's keystore */ + err = spa_keystore_lookup_key(rwa->os->os_spa, + zb.zb_objset, FTAG, &dck); + if (err != 0) { + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, + FTAG); + dsl_pool_config_exit(dp, FTAG); + abd_release_ownership_of_buf(abd); + abd_put(abd); + if (buf != NULL) + abd_free(abd); + abd_release_ownership_of_buf(eabd); + abd_put(eabd); + abd_free(eabd); + zio_data_buf_free(enc_buf, lsize); + return (SET_ERROR(EACCES)); + } + + err = zio_do_crypt_abd(B_TRUE, &dck->dck_key, + BP_GET_TYPE(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, + mac, lsize, abd, eabd, &no_crypt); + + spa_keystore_dsl_key_rele(rwa->os->os_spa, dck, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + dsl_pool_config_exit(dp, FTAG); + + if (err != 0 || no_crypt) { + abd_release_ownership_of_buf(abd); + abd_put(abd); + if (buf != NULL) + abd_free(abd); + abd_release_ownership_of_buf(eabd); + abd_put(eabd); + abd_free(eabd); + zio_data_buf_free(enc_buf, lsize); + return (err == 0 ? 
SET_ERROR(ENOTSUP) : err); + } + + /* Swap in the newly encrypted data into the abd */ + abd_release_ownership_of_buf(abd); + abd_put(abd); + if (buf != NULL) { + abd_free(abd); + buf = NULL; + } + abd = eabd; + + /* + * We want to prevent zio_rewrite() from trying to + * encrypt the data again + */ + flags |= ZIO_FLAG_RAW; + } + + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { + /* Recompress the data (XXX: buf is NULL here) */ + if (buf != NULL) + abd_free(abd); + size = zio_compress_data(BP_GET_COMPRESS(bp), abd, buf, + lsize); + } + } + + io = zio_rewrite(NULL, rwa->os->os_spa, 0, bp, abd, + size, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, flags, &zb); + + /* compute new bp checksum value and make sure it matches the old one */ + zio_checksum_compute(io, BP_GET_CHECKSUM(bp), abd, size); + if (size != BP_GET_PSIZE(bp) || + !ZIO_CHECKSUM_EQUAL(bp_cksum, io->io_bp->blk_cksum)) { + zfs_dbgmsg("corrected block checksum is wrong or %lld != %lld", + size, BP_GET_PSIZE(bp)); + abd_release_ownership_of_buf(abd); + abd_put(abd); + if (buf != NULL) + abd_free(abd); + if (enc_buf != NULL) + zio_data_buf_free(enc_buf, lsize); + zio_destroy(io); + + if (zfs_recv_best_effort_corrective != 0) { + dmu_return_arcbuf(arc_buf); + return (0); + } + return (SET_ERROR(ECKSUM)); + } + + /* Correct the corruption in place */ + err = zio_wait(io); + if (err == 0) { + cr_cb_data_t *cb_data = + kmem_alloc(sizeof (cr_cb_data_t), KM_SLEEP); + cb_data->spa = rwa->os->os_spa; + cb_data->size = size; + cb_data->zb = zb; + /* Test if healing worked by re-reading the bp */ + zio_nowait(zio_read(rwa->heal_pio, rwa->os->os_spa, bp, + abd_alloc_for_io(size, B_FALSE), size, corrective_read_done, + cb_data, ZIO_PRIORITY_ASYNC_READ, flags, NULL)); + } else if (zfs_recv_best_effort_corrective != 0) { + err = 0; + } + + abd_release_ownership_of_buf(abd); + abd_put(abd); + if (buf != NULL) + abd_free(abd); + if(enc_buf != NULL) + zio_data_buf_free(enc_buf, lsize); + + if (err == 0) + dmu_return_arcbuf(arc_buf); + + return (err); +} + 
static int guid_compare(const void *arg1, const void *arg2) { @@ -1722,6 +2027,56 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); + if (rwa->byteswap && !arc_is_encrypted(abuf) && + arc_get_compression(abuf) == ZIO_COMPRESS_OFF) { + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drrw->drr_type); + dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, + DRR_WRITE_PAYLOAD_SIZE(drrw)); + } + + if (rwa->heal) { + uint64_t blkid; + blkptr_t *bp; + dmu_buf_t *dbp; + dnode_t *dn; + int flags = DMU_READ_PREFETCH; + int buf_size = MIN(drrw->drr_logical_size, 32); + void *buf = kmem_alloc(buf_size, KM_SLEEP); + if (buf == NULL) + return (ENOMEM); + + if (rwa->raw) + flags |= DMU_READ_NO_DECRYPT; + + /* Try to read the object to see if it needs healing */ + err = dmu_read(rwa->os, drrw->drr_object, drrw->drr_offset, + buf_size, buf, flags); + kmem_free(buf, buf_size); + /* + * We only try to heal when dmu_read() returns a ECKSUMs. 
+ * Other errors (even EIO) get returned to caller + */ + if (err != ECKSUM) { + if (err == 0) + dmu_return_arcbuf(abuf); + return (err); + } + err = dmu_buf_hold_noread(rwa->os, drrw->drr_object, + drrw->drr_offset, FTAG, &dbp); + if (err != 0) + return (err); + + /* Get the block pointer for the corrupted block */ + dn = dmu_buf_dnode_enter(dbp); + bp = dmu_buf_get_blkptr(dbp); + blkid = dbuf_whichblock(dn, 0, drrw->drr_offset); + dmu_buf_dnode_exit(dbp); + dmu_buf_rele(dbp, FTAG); + return (do_corrective_recv(rwa, drrw->drr_object, abuf, + drrw->drr_logical_size, bp, blkid, drrw->drr_offset)); + } + tx = dmu_tx_create(rwa->os); dmu_tx_hold_write(tx, drrw->drr_object, drrw->drr_offset, drrw->drr_logical_size); @@ -1731,14 +2086,6 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, return (err); } - if (rwa->byteswap && !arc_is_encrypted(abuf) && - arc_get_compression(abuf) == ZIO_COMPRESS_OFF) { - dmu_object_byteswap_t byteswap = - DMU_OT_BYTESWAP(drrw->drr_type); - dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, - DRR_WRITE_PAYLOAD_SIZE(drrw)); - } - /* use the bonus buf to look up the dnode in dmu_assign_arcbuf */ VERIFY0(dnode_hold(rwa->os, drrw->drr_object, FTAG, &dn)); err = dmu_assign_arcbuf_by_dnode(dn, drrw->drr_offset, abuf, tx); @@ -1917,11 +2264,35 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, if (drrs->drr_object > rwa->max_object) rwa->max_object = drrs->drr_object; + if (rwa->byteswap && !arc_is_encrypted(abuf) && + arc_get_compression(abuf) == ZIO_COMPRESS_OFF) { + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drrs->drr_type); + dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, + DRR_SPILL_PAYLOAD_SIZE(drrs)); + } + VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db)); if ((err = dmu_spill_hold_by_bonus(db, DMU_READ_NO_DECRYPT, FTAG, &db_spill)) != 0) { + if (rwa->heal && err == ECKSUM) { + /* Get the block pointer to the corrupted spill block */ + blkptr_t *bp; + dnode_t *dn = 
dmu_buf_dnode_enter(db); + bp = DN_SPILL_BLKPTR(dn->dn_phys); + dmu_buf_dnode_exit(db); + dmu_buf_rele(db, FTAG); + return (do_corrective_recv(rwa, drrs->drr_object, + abuf, drrs->drr_length, bp, 0, 0)); + } dmu_buf_rele(db, FTAG); return (err); + } else if (rwa->heal) { + /* no corruption found */ + dmu_buf_rele(db, FTAG); + dmu_buf_rele(db_spill, FTAG); + dmu_return_arcbuf(abuf); + return (0); } tx = dmu_tx_create(rwa->os); @@ -1947,14 +2318,6 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, drrs->drr_length, tx)); } - if (rwa->byteswap && !arc_is_encrypted(abuf) && - arc_get_compression(abuf) == ZIO_COMPRESS_OFF) { - dmu_object_byteswap_t byteswap = - DMU_OT_BYTESWAP(drrs->drr_type); - dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, - DRR_SPILL_PAYLOAD_SIZE(drrs)); - } - dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx); dmu_buf_rele(db, FTAG); @@ -2077,7 +2440,8 @@ dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) rrw_exit(&ds->ds_bp_rwlock, FTAG); dsl_dataset_name(ds, name); dsl_dataset_disown(ds, dsflags, dmu_recv_tag); - (void) dsl_destroy_head(name); + if (!drc->drc_heal) + (void) dsl_destroy_head(name); } } @@ -2476,6 +2840,19 @@ receive_process_record(struct receive_writer_arg *rwa, ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read); rwa->bytes_read = rrd->bytes_read; + /* We can only heal write and spill records; other ones get ignored */ + if (rwa->heal && + (rrd->header.drr_type != DRR_WRITE && + rrd->header.drr_type != DRR_SPILL)) { + if (rrd->arc_buf != NULL) + dmu_return_arcbuf(rrd->arc_buf); + else if (rrd->payload != NULL) + kmem_free(rrd->payload, rrd->payload_size); + rrd->arc_buf = NULL; + rrd->payload = NULL; + return (0); + } + switch (rrd->header.drr_type) { case DRR_OBJECT: { @@ -2589,6 +2966,7 @@ receive_writer_thread(void *arg) kmem_free(rrd, sizeof (*rrd)); } kmem_free(rrd, sizeof (*rrd)); + zio_wait(rwa->heal_pio); mutex_enter(&rwa->mutex); rwa->done = B_TRUE; cv_signal(&rwa->cv); @@ -2718,11 +3096,13 @@ 
dmu_recv_stream(dmu_recv_cookie_t *drc, int cleanup_fd, * are sure the rest of the receive succeeded so we stash * the keynvl away until then. */ - err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa), - drc->drc_ds->ds_object, drc->drc_fromsnapobj, - drc->drc_drrb->drr_type, keynvl, drc->drc_newfs); - if (err != 0) - goto out; + if (!drc->drc_heal) { + err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa), + drc->drc_ds->ds_object, drc->drc_fromsnapobj, + drc->drc_drrb->drr_type, keynvl, drc->drc_newfs); + if (err != 0) + goto out; + } /* see comment in dmu_recv_end_sync() */ drc->drc_ivset_guid = 0; @@ -2746,10 +3126,14 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, int cleanup_fd, mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL); rwa->os = drc->drc_os; rwa->byteswap = drc->drc_byteswap; + rwa->heal = drc->drc_heal; + rwa->tofs = drc->drc_tofs; rwa->resumable = drc->drc_resumable; rwa->raw = drc->drc_raw; rwa->spill = drc->drc_spill; rwa->os->os_raw_receive = drc->drc_raw; + rwa->heal_pio = zio_root(drc->drc_os->os_spa, NULL, NULL, + ZIO_FLAG_GODFATHER); (void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc, TS_RUN, minclsyspri); @@ -2879,7 +3263,9 @@ dmu_recv_end_check(void *arg, dmu_tx_t *tx) ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); - if (!drc->drc_newfs) { + if (drc->drc_heal) { + error = 0; + } else if (!drc->drc_newfs) { dsl_dataset_t *origin_head; error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); @@ -2958,7 +3344,12 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) tx, "snap=%s", drc->drc_tosnap); drc->drc_ds->ds_objset->os_raw_receive = B_FALSE; - if (!drc->drc_newfs) { + if (drc->drc_heal) { + if (drc->drc_keynvl != NULL) { + nvlist_free(drc->drc_keynvl); + drc->drc_keynvl = NULL; + } + } else if (!drc->drc_newfs) { dsl_dataset_t *origin_head; VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, @@ -3066,7 +3457,7 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) * tunable is set, in which case we will leave the 
newly-generated * value. */ - if (drc->drc_raw && drc->drc_ivset_guid != 0) { + if (!drc->drc_heal && drc->drc_raw && drc->drc_ivset_guid != 0) { dmu_object_zapify(dp->dp_meta_objset, drc->drc_newsnapobj, DMU_OT_DSL_DATASET, tx); VERIFY0(zap_update(dp->dp_meta_objset, drc->drc_newsnapobj, @@ -3074,7 +3465,11 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) &drc->drc_ivset_guid, tx)); } - zvol_create_minors(dp->dp_spa, drc->drc_tofs, B_TRUE); + /* + * If we're healing there is nothing to do as the zvol already exists. + */ + if (!drc->drc_heal) + zvol_create_minors(dp->dp_spa, drc->drc_tofs, B_TRUE); /* * Release the hold from dmu_recv_begin. This must be done before @@ -3207,4 +3602,7 @@ ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_length, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_ff, INT, ZMOD_RW, "Receive queue fill fraction"); + +ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, best_effort_corrective, INT, ZMOD_RW, + "Ignore errors during corrective receive"); /* END CSTYLED */ diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 9bfd24d98e68..6a58ee9b1a46 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -29,9 +29,9 @@ * Copyright 2016 Toomas Soome * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright 2018 Joyent, Inc. - * Copyright (c) 2017 Datto Inc. * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2019 Datto Inc. 
*/ /* @@ -1264,6 +1264,9 @@ spa_activate(spa_t *spa, int mode) avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); + avl_create(&spa->spa_errlist_healed, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); spa_keystore_init(&spa->spa_keystore); @@ -1369,6 +1372,7 @@ spa_deactivate(spa_t *spa) spa_errlog_drain(spa); avl_destroy(&spa->spa_errlist_scrub); avl_destroy(&spa->spa_errlist_last); + avl_destroy(&spa->spa_errlist_healed); spa_keystore_fini(&spa->spa_keystore); diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index fa5120eb61b3..376d70372357 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2019 Datto Inc. */ /* @@ -54,6 +55,7 @@ #include #include +#define NAME_MAX_LEN 64 /* * Convert a bookmark to a string. @@ -128,6 +130,102 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb) mutex_exit(&spa->spa_errlist_lock); } +/* + * If a healed bookmark matches an entry in the error log we stash it in a tree + * so that we can later remove the related log entries in sync context. 
+ */
+static void
+spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb)
+{
+	char name[NAME_MAX_LEN];
+
+	/* An object number of 0 leaves no log to search. */
+	if (obj == 0)
+		return;
+
+	bookmark_to_name(healed_zb, name, sizeof (name));
+	mutex_enter(&spa->spa_errlog_lock);
+	if (zap_contains(spa->spa_meta_objset, obj, name) == 0) {
+		/*
+		 * Found an error matching healed zb, add zb to our
+		 * tree of healed errors unless it is already there.
+		 */
+		avl_tree_t *tree = &spa->spa_errlist_healed;
+		spa_error_entry_t search;
+		spa_error_entry_t *new;
+		avl_index_t where;
+
+		search.se_bookmark = *healed_zb;
+		mutex_enter(&spa->spa_errlist_lock);
+		if (avl_find(tree, &search, &where) == NULL) {
+			new = kmem_zalloc(sizeof (spa_error_entry_t),
+			    KM_SLEEP);
+			new->se_bookmark = *healed_zb;
+			avl_insert(tree, new, where);
+		}
+		mutex_exit(&spa->spa_errlist_lock);
+	}
+	/* Single exit path: spa_errlog_lock is always dropped. */
+	mutex_exit(&spa->spa_errlog_lock);
+}
+
+/*
+ * If this error exists in the given tree remove it.
+ */
+static void
+remove_error_from_list(spa_t *spa, avl_tree_t *t, const zbookmark_phys_t *zb)
+{
+	spa_error_entry_t search, *found;
+	avl_index_t where;
+
+	mutex_enter(&spa->spa_errlist_lock);
+	search.se_bookmark = *zb;
+	if ((found = avl_find(t, &search, &where)) != NULL) {
+		avl_remove(t, found);
+		kmem_free(found, sizeof (spa_error_entry_t));
+	}
+	mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Removes all of the recv healed errors from both on-disk error logs
+ * and from the two in-core trees passed in by the caller (s and l).
+ * The caller must hold spa_errlog_lock.
+ */
+static void
+spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx)
+{
+	char name[NAME_MAX_LEN];
+	spa_error_entry_t *se;
+	void *cookie = NULL;
+
+	ASSERT(MUTEX_HELD(&spa->spa_errlog_lock));
+
+	while ((se = avl_destroy_nodes(&spa->spa_errlist_healed,
+	    &cookie)) != NULL) {
+		remove_error_from_list(spa, s, &se->se_bookmark);
+		remove_error_from_list(spa, l, &se->se_bookmark);
+		bookmark_to_name(&se->se_bookmark, name, sizeof (name));
+		kmem_free(se, sizeof (spa_error_entry_t));
+		(void) zap_remove(spa->spa_meta_objset,
+		    spa->spa_errlog_last, name, tx);
+		(void) zap_remove(spa->spa_meta_objset,
+		    spa->spa_errlog_scrub, name, tx);
+	}
+}
+
+/*
+ * Stash away healed bookmarks to remove them from the on-disk error logs
+ * later in spa_remove_healed_errors().
+ */
+void
+spa_remove_error(spa_t *spa, zbookmark_phys_t *zb)
+{
+	/* spa_add_healed_error() converts zb to its log name itself. */
+	spa_add_healed_error(spa, spa->spa_errlog_last, zb);
+	spa_add_healed_error(spa, spa->spa_errlog_scrub, zb);
+}
+
 /*
  * Return the number of errors currently in the error log. This is actually the
  * sum of both the last log and the current log, since we don't know the union
@@ -301,7 +399,7 @@ static void
 sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx)
 {
 	spa_error_entry_t *se;
-	char buf[64];
+	char buf[NAME_MAX_LEN];
 	void *cookie;
 
 	if (avl_numnodes(t) != 0) {
@@ -352,6 +450,7 @@ spa_errlog_sync(spa_t *spa, uint64_t txg)
 	 */
 	if (avl_numnodes(&spa->spa_errlist_scrub) == 0 &&
 	    avl_numnodes(&spa->spa_errlist_last) == 0 &&
+	    avl_numnodes(&spa->spa_errlist_healed) == 0 &&
 	    !spa->spa_scrub_finished) {
 		mutex_exit(&spa->spa_errlist_lock);
 		return;
@@ -366,6 +465,11 @@ spa_errlog_sync(spa_t *spa, uint64_t txg)
 
 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 
+	/*
+	 * Remove healed errors from errors.
+	 */
+	spa_remove_healed_errors(spa, &last, &scrub, tx);
+
 	/*
 	 * Sync out the current list of errors.
*/ diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 25863c8e434c..d10fef240bbb 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -4701,9 +4701,9 @@ static boolean_t zfs_ioc_recv_inject_err; static int zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, nvlist_t *localprops, nvlist_t *hidden_args, boolean_t force, - boolean_t resumable, int input_fd, dmu_replay_record_t *begin_record, - int cleanup_fd, uint64_t *read_bytes, uint64_t *errflags, - uint64_t *action_handle, nvlist_t **errors) + boolean_t heal, boolean_t resumable, int input_fd, + dmu_replay_record_t *begin_record, int cleanup_fd, uint64_t *read_bytes, + uint64_t *errflags, uint64_t *action_handle, nvlist_t **errors) { dmu_recv_cookie_t drc; int error = 0; @@ -4726,7 +4726,7 @@ zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, return (SET_ERROR(EBADF)); off = input_fp->f_offset; - error = dmu_recv_begin(tofs, tosnap, begin_record, force, + error = dmu_recv_begin(tofs, tosnap, begin_record, force, heal, resumable, localprops, hidden_args, origin, &drc, input_fp->f_vnode, &off); if (error != 0) @@ -5075,7 +5075,7 @@ zfs_ioc_recv(zfs_cmd_t *zc) begin_record.drr_u.drr_begin = zc->zc_begin_record; error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvdprops, localprops, - NULL, zc->zc_guid, B_FALSE, zc->zc_cookie, &begin_record, + NULL, zc->zc_guid, B_FALSE, B_FALSE, zc->zc_cookie, &begin_record, zc->zc_cleanup_fd, &zc->zc_cookie, &zc->zc_obj, &zc->zc_action_handle, &errors); nvlist_free(recvdprops); @@ -5110,6 +5110,7 @@ zfs_ioc_recv(zfs_cmd_t *zc) * "input_fd" -> file descriptor to read stream from (int32) * (optional) "force" -> force flag (value ignored) * (optional) "resumable" -> resumable flag (value ignored) + * (optional) "heal" -> use send stream to heal data corruption * (optional) "cleanup_fd" -> cleanup-on-exit file descriptor * (optional) "action_handle" -> handle for this guid/ds mapping * (optional) 
"hidden_args" -> { "wkeydata" -> value } @@ -5130,6 +5131,7 @@ static const zfs_ioc_key_t zfs_keys_recv_new[] = { {"begin_record", DATA_TYPE_BYTE_ARRAY, 0}, {"input_fd", DATA_TYPE_INT32, 0}, {"force", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"heal", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"resumable", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, {"action_handle", DATA_TYPE_UINT64, ZK_OPTIONAL}, @@ -5150,6 +5152,7 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; boolean_t force; + boolean_t heal; boolean_t resumable; uint64_t action_handle = 0; uint64_t read_bytes = 0; @@ -5181,6 +5184,7 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) input_fd = fnvlist_lookup_int32(innvl, "input_fd"); force = nvlist_exists(innvl, "force"); + heal = nvlist_exists(innvl, "heal"); resumable = nvlist_exists(innvl, "resumable"); error = nvlist_lookup_int32(innvl, "cleanup_fd", &cleanup_fd); @@ -5205,8 +5209,8 @@ zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) return (error); error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvprops, localprops, - hidden_args, force, resumable, input_fd, begin_record, cleanup_fd, - &read_bytes, &errflags, &action_handle, &errors); + hidden_args, force, heal, resumable, input_fd, begin_record, + cleanup_fd, &read_bytes, &errflags, &action_handle, &errors); fnvlist_add_uint64(outnvl, "read_bytes", read_bytes); fnvlist_add_uint64(outnvl, "error_flags", errflags); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 092262590c81..ad5f70900f74 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -851,7 +851,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, return (zio); } -static void +void zio_destroy(zio_t *zio) { metaslab_trace_fini(&zio->io_alloc_list); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index d6d3874f684f..2b6b59464847 100644 --- 
a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -198,7 +198,7 @@ tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos', 'zfs_receive_010_pos', 'zfs_receive_011_pos', 'zfs_receive_012_pos', 'zfs_receive_013_pos', 'zfs_receive_014_pos', 'zfs_receive_015_pos', 'receive-o-x_props_override', 'zfs_receive_from_encrypted', - 'zfs_receive_to_encrypted', 'zfs_receive_raw', + 'zfs_receive_to_encrypted', 'zfs_receive_raw', 'zfs_receive_corrective', 'zfs_receive_raw_incremental', 'zfs_receive_-e', 'zfs_receive_raw_-d'] tags = ['functional', 'cli_root', 'zfs_receive'] diff --git a/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c b/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c index f3392dc17f58..0d230ca6acda 100644 --- a/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c +++ b/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c @@ -540,6 +540,7 @@ test_recv_new(const char *dataset, int fd) fnvlist_add_string(props, "org.openzfs:launch", "September 17th, 2013"); fnvlist_add_nvlist(optional, "localprops", props); fnvlist_add_boolean(optional, "force"); + fnvlist_add_boolean(optional, "heal"); fnvlist_add_int32(optional, "cleanup_fd", cleanup_fd); /* diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile.am index 7b2037b9f2ef..f4a224ca74c5 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile.am +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile.am @@ -23,4 +23,5 @@ dist_pkgdata_SCRIPTS = \ zfs_receive_raw.ksh \ zfs_receive_raw_incremental.ksh \ zfs_receive_raw_-d.ksh \ - zfs_receive_-e.ksh + zfs_receive_-e.ksh \ + zfs_receive_corrective.ksh diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_corrective.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_corrective.ksh new file mode 100755 index 000000000000..79d0c7e0628c --- 
/dev/null
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_corrective.ksh
@@ -0,0 +1,163 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 Datto, Inc. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# ZFS should be able to heal using corrective recv
+#
+# STRATEGY:
+# 1. Create a dataset
+# 2. Snapshot the dataset
+# 3. Create a file and get its checksum
+# 4. Snapshot the dataset
+# 5. Recv dataset into a filesystem with different compression
+# 6. Recv dataset into an encrypted filesystem
+# 7. Corrupt the file
+# 8. Heal the corruption using a corrective recv and a full send file
+# 9. Corrupt the file again
+# 10. Heal the corruption using a corrective recv and an incremental send file
+# 11. Corrupt the differently compressed file
+# 12. Heal the corruption when the target snapshot and the send file have
+# different compression algorithms
+# 13. Corrupt the encrypted file
+# 14. Heal the corruption when the target snapshot (on) and the send file (off)
+# have different encryption settings and the keys are loaded.
+# 15. Unload keys from encrypted dataset
+# 16. Corrupt the encrypted file
+# 17. Heal the encrypted dataset corruption using raw send file
+#
+
+verify_runnable "both"
+
+DISK=${DISKS%% *}
+
+backup=$TEST_BASE_DIR/backup
+raw_backup=$TEST_BASE_DIR/raw_backup
+ibackup=$TEST_BASE_DIR/ibackup.$$
+
+# Remove the test datasets and all send stream files, then recreate the pool
+function cleanup
+{
+	datasetexists $TESTPOOL/$TESTFS1 && \
+		log_must zfs destroy -r $TESTPOOL/$TESTFS1
+	datasetexists $TESTPOOL/$TESTFS2 && \
+		log_must zfs destroy -r $TESTPOOL/$TESTFS2
+	datasetexists $TESTPOOL/testfs3 && \
+		log_must zfs destroy -r $TESTPOOL/testfs3
+
+	for f in $ibackup $backup $raw_backup; do
+		[[ -f $f ]] && log_must rm -f $f
+	done
+
+	log_must zpool destroy $TESTPOOL
+	log_must zpool create -f $TESTPOOL $DISK
+}
+
+#
+# Scrub to surface the injected corruption, heal it with "zfs recv -c" and
+# verify that the pool reports no errors and the file checksum is intact.
+# $1 snapshot to heal, $2 send stream file to heal from
+#
+function test_corrective_recv
+{
+	log_must zpool scrub $TESTPOOL
+	log_must zpool wait -t scrub $TESTPOOL
+	log_must eval "zpool status -v $TESTPOOL | \
+	    grep \"Permanent errors have been detected\""
+
+	# make sure we will read the corruption from disk by flushing the ARC
+	log_must zinject -a
+
+	log_must eval "zfs recv -c $1 < $2"
+
+	log_must zpool sync $TESTPOOL
+	log_mustnot eval "zpool status -v $TESTPOOL | \
+	    grep \"Permanent errors have been detected\""
+	typeset cksum=$(md5digest $file)
+	[[ "$cksum" == "$checksum" ]] || \
+		log_fail "Checksums differ ($cksum != $checksum)"
+}
+
+log_onexit cleanup
+
+log_assert "ZFS corrective receive should be able to heal corruption"
+
+typeset passphrase="password"
+typeset snap1="$TESTPOOL/$TESTFS1@snap1"
+typeset snap2="$TESTPOOL/$TESTFS1@snap2"
+typeset file="/$TESTPOOL/$TESTFS1/$TESTFILE0"
+
+log_must zpool destroy $TESTPOOL
+log_must zpool create -f -o feature@embedded_data=disabled $TESTPOOL $DISK
+
+log_must eval "echo $passphrase > /$TESTPOOL/pwd"
+
+log_must zfs create -o primarycache=none \
+    -o compression=lz4 $TESTPOOL/$TESTFS1
+
+log_must zfs snapshot $snap1
+
+log_must dd if=/dev/urandom of=$file bs=1024 count=1024 oflag=sync
+typeset checksum=$(md5digest $file)
+
+log_must zfs snapshot $snap2
+
+# create full send file
+log_must eval "zfs send $snap2 > $backup"
+corrupt_blocks_at_level $file 0
+# test healing recv from a full send file
+test_corrective_recv $snap2 $backup
+
+# create incremental send file
+log_must eval "zfs send -i $snap1 $snap2 > $ibackup"
+corrupt_blocks_at_level $file 0
+# test healing recv from an incremental send file
+test_corrective_recv $snap2 $ibackup
+
+# create new compressed dataset using our send file
+log_must eval "zfs recv -o compression=gzip -o primarycache=none \
+    $TESTPOOL/$TESTFS2 < $backup"
+typeset compr=$(get_prop compression $TESTPOOL/$TESTFS2)
+[[ "$compr" == "gzip" ]] || \
+	log_fail "Unexpected compression $compr in recved dataset"
+corrupt_blocks_at_level "/$TESTPOOL/$TESTFS2/$TESTFILE0" 0
+# test healing recv when compression doesn't match between send file and on-disk
+test_corrective_recv "$TESTPOOL/$TESTFS2@snap2" $backup
+
+# create new encrypted dataset using our send file
+log_must eval "zfs recv -o encryption=aes-256-ccm -o keyformat=passphrase \
+    -o keylocation=file:///$TESTPOOL/pwd -o primarycache=none \
+    $TESTPOOL/testfs3 < $backup"
+typeset encr=$(get_prop encryption $TESTPOOL/testfs3)
+[[ "$encr" == "aes-256-ccm" ]] || \
+	log_fail "Unexpected encryption $encr in recved dataset"
+log_must eval "zfs send --raw $TESTPOOL/testfs3@snap2 > $raw_backup"
+corrupt_blocks_at_level "/$TESTPOOL/testfs3/$TESTFILE0" 0
+# test healing recv when encryption doesn't match between send file and on-disk
+test_corrective_recv "$TESTPOOL/testfs3@snap2" $backup
+corrupt_blocks_at_level "/$TESTPOOL/testfs3/$TESTFILE0" 0
+log_must zfs unmount $TESTPOOL/testfs3
+log_must zfs unload-key $TESTPOOL/testfs3
+# test healing recv using a raw send file
+test_corrective_recv "$TESTPOOL/testfs3@snap2" $raw_backup
+
+log_pass "ZFS corrective recv works for healing"