diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 00258799bb04..763a086ac468 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -3707,8 +3707,12 @@ dump_l2arc_header(int fd) (u_longlong_t)l2dhdr.dh_evict); (void) printf(" lb_asize_refcount: %llu\n", (u_longlong_t)l2dhdr.dh_lb_asize); - (void) printf(" lb_count_refcount: %llu\n\n", + (void) printf(" lb_count_refcount: %llu\n", (u_longlong_t)l2dhdr.dh_lb_count); + (void) printf(" trim_action_time: %llu\n", + (u_longlong_t)l2dhdr.dh_trim_action_time); + (void) printf(" trim_state: %llu\n\n", + (u_longlong_t)l2dhdr.dh_trim_state); } dump_l2arc_log_blocks(fd, l2dhdr, &rebuild); diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index e8c944ce8369..5724db0a5dc0 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -240,7 +240,14 @@ typedef struct l2arc_dev_hdr_phys { */ uint64_t dh_lb_asize; /* mirror of l2ad_lb_asize */ uint64_t dh_lb_count; /* mirror of l2ad_lb_count */ - const uint64_t dh_pad[32]; /* pad to 512 bytes */ + /* + * Mirrors of vdev_trim_action_time and vdev_trim_state, used to + * display when the cache device was fully trimmed for the last + * time. + */ + uint64_t dh_trim_action_time; + uint64_t dh_trim_state; + const uint64_t dh_pad[30]; /* pad to 512 bytes */ zio_eck_t dh_tail; } l2arc_dev_hdr_phys_t; CTASSERT_GLOBAL(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE); @@ -399,6 +406,7 @@ typedef struct l2arc_dev { * Number of log blocks present on the device. 
*/ zfs_refcount_t l2ad_lb_count; + boolean_t l2ad_trim_all; /* TRIM whole device */ } l2arc_dev_t; /* @@ -902,6 +910,10 @@ extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS); boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp); +/* used in vdev_trim.c */ +void l2arc_dev_hdr_update(l2arc_dev_t *dev); +l2arc_dev_t *l2arc_vdev_get(vdev_t *vd); + #ifdef __cplusplus } #endif diff --git a/include/sys/spa.h b/include/sys/spa.h index 6e844f5ee8ce..d309b0d79e5a 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -745,6 +745,7 @@ typedef enum { typedef enum trim_type { TRIM_TYPE_MANUAL = 0, TRIM_TYPE_AUTO = 1, + TRIM_TYPE_SIMPLE = 2 } trim_type_t; /* state manipulation functions */ @@ -788,6 +789,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_TRIM_RESTART 0x200 #define SPA_ASYNC_AUTOTRIM_RESTART 0x400 #define SPA_ASYNC_L2CACHE_REBUILD 0x800 +#define SPA_ASYNC_L2CACHE_TRIM 0x1000 /* * Controls the behavior of spa_vdev_remove(). 
@@ -940,6 +942,12 @@ typedef struct spa_iostats { kstat_named_t autotrim_bytes_skipped; kstat_named_t autotrim_extents_failed; kstat_named_t autotrim_bytes_failed; + kstat_named_t simple_trim_extents_written; + kstat_named_t simple_trim_bytes_written; + kstat_named_t simple_trim_extents_skipped; + kstat_named_t simple_trim_bytes_skipped; + kstat_named_t simple_trim_extents_failed; + kstat_named_t simple_trim_bytes_failed; } spa_iostats_t; extern void spa_stats_init(spa_t *spa); diff --git a/include/sys/vdev.h b/include/sys/vdev.h index c4ef479b5faf..e3f12d031d60 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -192,6 +192,7 @@ typedef enum { } vdev_labeltype_t; extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason); +void vdev_trim_l2arc(spa_t *spa); #ifdef __cplusplus } diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 96546ac35078..56407a1914bc 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -301,7 +301,7 @@ struct vdev { uint64_t vdev_initialize_inflight; kmutex_t vdev_trim_io_lock; kcondvar_t vdev_trim_io_cv; - uint64_t vdev_trim_inflight[2]; + uint64_t vdev_trim_inflight[3]; /* * Values stored in the config for an indirect or removing vdev. diff --git a/include/sys/vdev_trim.h b/include/sys/vdev_trim.h index 1e54017665b3..1fc7d0c2e77b 100644 --- a/include/sys/vdev_trim.h +++ b/include/sys/vdev_trim.h @@ -44,6 +44,8 @@ extern void vdev_autotrim(spa_t *spa); extern void vdev_autotrim_stop_all(spa_t *spa); extern void vdev_autotrim_stop_wait(vdev_t *vd); extern void vdev_autotrim_restart(spa_t *spa); +extern int vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size, + trim_type_t type); #ifdef __cplusplus } diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index 40666c8f3242..67ae4dcaf242 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -194,11 +194,29 @@ Default value: \fB2\fR. 
.ad .RS 12n Scales \fBl2arc_headroom\fR by this percentage when L2ARC contents are being -successfully compressed before writing. A value of 100 disables this feature. +successfully compressed before writing. A value of \fB100\fR disables this +feature. .sp Default value: \fB200\fR%. .RE +.sp +.ne 2 +.na +\fBl2arc_trim_ahead\fR (ulong) +.ad +.RS 12n +Trims ahead of the current evict hand on L2ARC devices by this percentage of +write size if we have filled the device. It has a minimum value of 64MB. If +set to \fB100\fR we TRIM twice the space required to accommodate upcoming +writes. It also enables TRIM of the whole L2ARC device upon creation or +addition to an existing pool or if the header of the device is invalid upon +importing a pool or onlining a cache device. A value of \fB0\fR disables TRIM +on L2ARC altogether. +.sp +Default value: \fB0\fR%. +.RE + .sp .ne 2 .na diff --git a/man/man8/zpoolprops.8 b/man/man8/zpoolprops.8 index f0522ef785bc..d85b6d436245 100644 --- a/man/man8/zpoolprops.8 +++ b/man/man8/zpoolprops.8 @@ -238,6 +238,8 @@ this property is Automatic TRIM does not immediately reclaim blocks after a free. Instead, it will optimistically delay allowing smaller ranges to be aggregated in to a few larger ones. These can then be issued more efficiently to the storage. +TRIM on L2ARC devices is enabled by setting +.Sy l2arc_trim_ahead > 0 . .Pp Be aware that automatic trimming of recently freed data blocks can put significant stress on the underlying storage devices.
This will vary diff --git a/module/os/linux/zfs/spa_stats.c b/module/os/linux/zfs/spa_stats.c index 0d7f540d1f17..49177b708423 100644 --- a/module/os/linux/zfs/spa_stats.c +++ b/module/os/linux/zfs/spa_stats.c @@ -903,6 +903,12 @@ static spa_iostats_t spa_iostats_template = { { "autotrim_bytes_skipped", KSTAT_DATA_UINT64 }, { "autotrim_extents_failed", KSTAT_DATA_UINT64 }, { "autotrim_bytes_failed", KSTAT_DATA_UINT64 }, + { "simple_trim_extents_written", KSTAT_DATA_UINT64 }, + { "simple_trim_bytes_written", KSTAT_DATA_UINT64 }, + { "simple_trim_extents_skipped", KSTAT_DATA_UINT64 }, + { "simple_trim_bytes_skipped", KSTAT_DATA_UINT64 }, + { "simple_trim_extents_failed", KSTAT_DATA_UINT64 }, + { "simple_trim_bytes_failed", KSTAT_DATA_UINT64 }, }; #define SPA_IOSTATS_ADD(stat, val) \ @@ -929,13 +935,20 @@ spa_iostats_trim_add(spa_t *spa, trim_type_t type, SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped); SPA_IOSTATS_ADD(trim_extents_failed, extents_failed); SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed); - } else { + } else if (type == TRIM_TYPE_AUTO) { SPA_IOSTATS_ADD(autotrim_extents_written, extents_written); SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written); SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped); SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped); SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed); SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed); + } else { + SPA_IOSTATS_ADD(simple_trim_extents_written, extents_written); + SPA_IOSTATS_ADD(simple_trim_bytes_written, bytes_written); + SPA_IOSTATS_ADD(simple_trim_extents_skipped, extents_skipped); + SPA_IOSTATS_ADD(simple_trim_bytes_skipped, bytes_skipped); + SPA_IOSTATS_ADD(simple_trim_extents_failed, extents_failed); + SPA_IOSTATS_ADD(simple_trim_bytes_failed, bytes_failed); } } diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 4f2207ae7831..3d113c67b31a 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -301,6 +301,7 @@ #include #include #include +#include 
#ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ @@ -854,7 +855,6 @@ static void arc_hdr_alloc_abd(arc_buf_hdr_t *, boolean_t); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(void); static void arc_buf_watch(arc_buf_t *); -static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); @@ -864,6 +864,16 @@ static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); +/* + * L2ARC TRIM + * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of + * the current write size we should TRIM. It is defined as a + * percentage of the write size. The minimum TRIM size is 64MB. + * If set to 100 we TRIM twice the space required to accommodate + * upcoming writes. The default is 0, TRIM is disabled for L2ARC. + */ +unsigned long l2arc_trim_ahead = 0; + /* * Performance tuning of L2ARC persistence: * @@ -902,7 +912,6 @@ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev); /* L2ARC persistence write I/O routines. */ -static void l2arc_dev_hdr_update(l2arc_dev_t *dev); static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb); @@ -7709,7 +7718,7 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) static uint64_t l2arc_write_size(l2arc_dev_t *dev) { - uint64_t size, dev_size; + uint64_t size, dev_size, tsize; /* * Make sure our globals have meaningful values in case the user @@ -7732,7 +7741,12 @@ l2arc_write_size(l2arc_dev_t *dev) * iteration can occur.
*/ dev_size = dev->l2ad_end - dev->l2ad_start; - if ((size + l2arc_log_blk_overhead(size, dev)) >= dev_size) { + tsize = size + l2arc_log_blk_overhead(size, dev); + if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) + tsize += MAX(64 * 1024 * 1024, + (tsize * l2arc_trim_ahead) / 100); + + if (tsize >= dev_size) { cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " "plus the overhead of log blocks (persistent L2ARC, " "%llu bytes) exceeds the size of the cache device " @@ -7810,10 +7824,12 @@ l2arc_dev_get_next(void) else if (next == first) break; - } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild); + } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || + next->l2ad_trim_all); /* if we were unable to find any usable vdevs, return NULL */ - if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild) + if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || + next->l2ad_trim_all) next = NULL; l2arc_dev_last = next; @@ -8336,8 +8352,9 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; - boolean_t rerun; l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev; + vdev_t *vd = dev->l2ad_vdev; + boolean_t rerun; buflist = &dev->l2ad_buflist; @@ -8345,6 +8362,14 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) * We need to add in the worst case scenario of log block overhead. */ distance += l2arc_log_blk_overhead(distance, dev); + if (vd->vdev_has_trim && l2arc_trim_ahead > 0) { + /* + * Trim ahead of the evict hand 64MB or (l2arc_trim_ahead/100) + * times the write size, whichever is greater. 
+ */ + distance += MAX(64 * 1024 * 1024, + (distance * l2arc_trim_ahead) / 100); + } top: rerun = B_FALSE; @@ -8365,25 +8390,51 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, uint64_t, taddr, boolean_t, all); - /* - * This check has to be placed after deciding whether to iterate - * (rerun). - */ - if (!all && dev->l2ad_first) { + if (!all) { /* - * This is the first sweep through the device. There is - * nothing to evict. + * This check has to be placed after deciding whether to + * iterate (rerun). */ - goto out; - } + if (dev->l2ad_first) { + /* + * This is the first sweep through the device. There is + * nothing to evict. We have already trimmed the + * whole device. + */ + goto out; + } else { + /* + * Trim the space to be evicted. + */ + if (vd->vdev_has_trim && dev->l2ad_evict < taddr && + l2arc_trim_ahead > 0) { + /* + * We have to drop the spa_config lock because + * vdev_trim_range() will acquire it. + * l2ad_evict already accounts for the label + * size. To prevent vdev_trim_ranges() from + * adding it again, we subtract it from + * l2ad_evict. + */ + spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev); + vdev_trim_simple(vd, + dev->l2ad_evict - VDEV_LABEL_START_SIZE, + taddr - dev->l2ad_evict, TRIM_TYPE_SIMPLE); + spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev, + RW_READER); + } - /* - * When rebuilding L2ARC we retrieve the evict hand from the header of - * the device. Of note, l2arc_evict() does not actually delete buffers - * from the cache device, but keeping track of the evict hand will be - * useful when TRIM is implemented. - */ - dev->l2ad_evict = MAX(dev->l2ad_evict, taddr); + /* + * When rebuilding L2ARC we retrieve the evict hand + * from the header of the device. Of note, l2arc_evict() + * does not actually delete buffers from the cache + * device, but trimming may do so depending on the + * hardware implementation.
Thus keeping track of the + * evict hand is useful. + */ + dev->l2ad_evict = MAX(dev->l2ad_evict, taddr); + } + } retry: mutex_enter(&dev->l2ad_mtx); @@ -8410,7 +8461,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) { break; } else { - vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); + vdev_space_update(vd, -asize, 0, 0); ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, @@ -9015,7 +9066,7 @@ l2arc_vdev_present(vdev_t *vd) * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if * the vdev_t isn't an L2ARC device. */ -static l2arc_dev_t * +l2arc_dev_t * l2arc_vdev_get(vdev_t *vd) { l2arc_dev_t *dev; @@ -9059,6 +9110,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd) adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; + adddev->l2ad_trim_all = B_FALSE; list_link_init(&adddev->l2ad_node); adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP); @@ -9164,11 +9216,21 @@ l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) dev->l2ad_rebuild = B_TRUE; } else if (spa_writeable(spa)) { /* - * In this case create a new header. We zero out the memory - * holding the header to reset dh_start_lbps. + * In this case TRIM the whole device if l2arc_trim_ahead > 0, + * otherwise create a new header. We zero out the memory holding + * the header to reset dh_start_lbps. If we TRIM the whole + * device the new header will be written by vdev_trim_simple() + * at the end of the TRIM to update the trim_state in the + * header too. When reading the header, if trim_state is not + * VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0 we opt to TRIM + * the whole device again. 
*/ - bzero(l2dhdr, l2dhdr_asize); - l2arc_dev_hdr_update(dev); + if (l2arc_trim_ahead > 0) { + dev->l2ad_trim_all = B_TRUE; + } else { + bzero(l2dhdr, l2dhdr_asize); + l2arc_dev_hdr_update(dev); + } } } @@ -9385,6 +9447,9 @@ l2arc_rebuild(l2arc_dev_t *dev) dev->l2ad_start); dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); + vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time; + vd->vdev_trim_state = l2dhdr->dh_trim_state; + /* * In case the zfs module parameter l2arc_rebuild_enabled is false * we do not start the rebuild process. @@ -9594,7 +9659,9 @@ l2arc_dev_hdr_read(l2arc_dev_t *dev) l2dhdr->dh_log_entries != dev->l2ad_log_entries || l2dhdr->dh_end != dev->l2ad_end || !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end, - l2dhdr->dh_evict)) { + l2dhdr->dh_evict) || + (l2dhdr->dh_trim_state != VDEV_TRIM_COMPLETE && + l2arc_trim_ahead > 0)) { /* * Attempt to rebuild a device containing no actual dev hdr * or containing a header from some other pool or from another @@ -9903,7 +9970,7 @@ l2arc_log_blk_fetch_abort(zio_t *zio) * Creates a zio to update the device header on an l2arc device. The zio is * initiated as a child of `pio'. 
*/ -static void +void l2arc_dev_hdr_update(l2arc_dev_t *dev) { l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; @@ -9924,6 +9991,8 @@ l2arc_dev_hdr_update(l2arc_dev_t *dev) l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize); l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count); l2dhdr->dh_flags = 0; + l2dhdr->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time; + l2dhdr->dh_trim_state = dev->l2ad_vdev->vdev_trim_state; if (dev->l2ad_first) l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST; @@ -10260,6 +10329,9 @@ ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, ULONG, ZMOD_RW, ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, ULONG, ZMOD_RW, "Compressed l2arc_headroom multiplier"); +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, ULONG, ZMOD_RW, + "TRIM ahead L2ARC write size multiplier"); + ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, ULONG, ZMOD_RW, "Seconds between L2ARC writing"); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 73d63f849ee0..989d6cd5d75c 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1896,6 +1896,15 @@ spa_load_l2cache(spa_t *spa) if (!vdev_is_dead(vd)) l2arc_add_vdev(spa, vd); + + /* + * Upon cache device addition to a pool or pool + * creation with a cache device or if the header + * of the device is invalid we issue an async + * TRIM command for the whole device which will + * execute if l2arc_trim_ahead > 0. + */ + spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); } } @@ -7993,6 +8002,17 @@ spa_async_thread(void *arg) mutex_exit(&spa_namespace_lock); } + /* + * Kick off L2 cache whole device TRIM. + */ + if (tasks & SPA_ASYNC_L2CACHE_TRIM) { + mutex_enter(&spa_namespace_lock); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_trim_l2arc(spa); + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_exit(&spa_namespace_lock); + } + /* * Kick off L2 cache rebuilding. 
*/ diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 923bf2e336a6..83c39d119439 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -2281,9 +2281,6 @@ vdev_reopen(vdev_t *vd) if (vdev_readable(vd) && vdev_writeable(vd) && vd->vdev_aux == &spa->spa_l2cache) { /* - * When reopening we can assume the device label has - * already the attribute l2cache_persistent, since we've - * opened the device in the past and updated the label. * In case the vdev is present we should evict all ARC * buffers and pointers to log blocks and reclaim their * space before restoring its contents to L2ARC. @@ -2294,6 +2291,7 @@ vdev_reopen(vdev_t *vd) l2arc_add_vdev(spa, vd); } spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); + spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); } } else { (void) vdev_validate(vd); @@ -3542,9 +3540,14 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) } mutex_exit(&vd->vdev_initialize_lock); - /* Restart trimming if necessary */ + /* + * Restart trimming if necessary. We do not restart trimming for cache + * devices here. This is triggered by l2arc_rebuild_vdev() + * asynchronously for the whole device or in l2arc_evict() as it evicts + * space for upcoming writes. + */ mutex_enter(&vd->vdev_trim_lock); - if (vdev_writeable(vd) && + if (vdev_writeable(vd) && !vd->vdev_isl2cache && vd->vdev_trim_thread == NULL && vd->vdev_trim_state == VDEV_TRIM_ACTIVE) { (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial, diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c index 3f4f9091f43d..56e420871f61 100644 --- a/module/zfs/vdev_removal.c +++ b/module/zfs/vdev_removal.c @@ -2224,6 +2224,20 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) * Cache devices can always be removed. */ vd = spa_lookup_by_guid(spa, guid, B_TRUE); + + /* + * Stop trimming the cache device. 
We need to release the + * config lock to allow the syncing of TRIM transactions + * without releasing the spa_namespace_lock. The same + * strategy is employed in spa_vdev_remove_top(). + */ + spa_vdev_config_exit(spa, NULL, + txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); + mutex_enter(&vd->vdev_trim_lock); + vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL); + mutex_exit(&vd->vdev_trim_lock); + txg = spa_vdev_config_enter(spa); + ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); spa_vdev_remove_aux(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index b0cd40f68765..9f353320bd17 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -34,6 +34,7 @@ #include #include #include +#include /* * TRIM is a feature which is used to notify a SSD that some previously @@ -422,6 +423,35 @@ vdev_autotrim_cb(zio_t *zio) spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); } +/* + * The zio_done_func_t done callback for each TRIM issued via + * vdev_trim_simple(). It is responsible for updating the TRIM stats and + * limiting the number of in flight TRIM I/Os. Simple TRIM I/Os are best + * effort and are never reissued on failure. + */ +static void +vdev_trim_simple_cb(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + + mutex_enter(&vd->vdev_trim_io_lock); + + if (zio->io_error != 0) { + vd->vdev_stat.vs_trim_errors++; + spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE, + 0, 0, 0, 0, 1, zio->io_orig_size); + } else { + spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE, + 1, zio->io_orig_size, 0, 0, 0, 0); + } + + ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE], >, 0); + vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE]--; + cv_broadcast(&vd->vdev_trim_io_cv); + mutex_exit(&vd->vdev_trim_io_lock); + + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); +} /* * Returns the average trim rate in bytes/sec for the ta->trim_vdev. 
*/ @@ -441,6 +471,7 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size) { vdev_t *vd = ta->trim_vdev; spa_t *spa = vd->vdev_spa; + void *cb; mutex_enter(&vd->vdev_trim_io_lock); @@ -459,8 +490,8 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size) ta->trim_bytes_done += size; /* Limit in flight trimming I/Os */ - while (vd->vdev_trim_inflight[0] + vd->vdev_trim_inflight[1] >= - zfs_trim_queue_limit) { + while (vd->vdev_trim_inflight[0] + vd->vdev_trim_inflight[1] + + vd->vdev_trim_inflight[2] >= zfs_trim_queue_limit) { cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); } vd->vdev_trim_inflight[ta->trim_type]++; @@ -505,10 +536,17 @@ vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size) if (ta->trim_type == TRIM_TYPE_MANUAL) vd->vdev_trim_offset[txg & TXG_MASK] = start + size; + if (ta->trim_type == TRIM_TYPE_MANUAL) { + cb = vdev_trim_cb; + } else if (ta->trim_type == TRIM_TYPE_AUTO) { + cb = vdev_autotrim_cb; + } else { + cb = vdev_trim_simple_cb; + } + zio_nowait(zio_trim(spa->spa_txg_zio[txg & TXG_MASK], vd, - start, size, ta->trim_type == TRIM_TYPE_MANUAL ? - vdev_trim_cb : vdev_autotrim_cb, NULL, - ZIO_PRIORITY_TRIM, ZIO_FLAG_CANFAIL, ta->trim_flags)); + start, size, cb, NULL, ZIO_PRIORITY_TRIM, ZIO_FLAG_CANFAIL, + ta->trim_flags)); /* vdev_trim_cb and vdev_autotrim_cb release SCL_STATE_ALL */ dmu_tx_commit(tx); @@ -1016,6 +1054,7 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) { spa_t *spa = vd->vdev_spa; list_t vd_list; + vdev_t *vd_l2cache; ASSERT(MUTEX_HELD(&spa_namespace_lock)); @@ -1023,6 +1062,17 @@ vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) offsetof(vdev_t, vdev_trim_node)); vdev_trim_stop_all_impl(vd, tgt_state, &vd_list); + + /* + * Iterate over cache devices and request stop trimming the + * whole device in case we export the pool or remove the cache + * device prematurely. 
+ */ + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + vd_l2cache = spa->spa_l2cache.sav_vdevs[i]; + vdev_trim_stop_all_impl(vd_l2cache, tgt_state, &vd_list); + } + vdev_trim_stop_wait(spa, &vd_list); if (vd->vdev_spa->spa_sync_on) { @@ -1437,6 +1487,181 @@ vdev_autotrim_restart(spa_t *spa) vdev_autotrim(spa); } +static void +vdev_trim_l2arc_thread(void *arg) +{ + vdev_t *vd = arg; + spa_t *spa = vd->vdev_spa; + l2arc_dev_t *dev = l2arc_vdev_get(vd); + trim_args_t ta; + range_seg64_t physical_rs; + + ASSERT(vdev_is_concrete(vd)); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + vd->vdev_trim_last_offset = 0; + vd->vdev_trim_rate = 0; + vd->vdev_trim_partial = 0; + vd->vdev_trim_secure = 0; + + bzero(&ta, sizeof (ta)); + ta.trim_vdev = vd; + ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + ta.trim_type = TRIM_TYPE_MANUAL; + ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; + ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE; + ta.trim_flags = 0; + + physical_rs.rs_start = vd->vdev_trim_bytes_done = 0; + physical_rs.rs_end = vd->vdev_trim_bytes_est = + vdev_get_min_asize(vd); + + range_tree_add(ta.trim_tree, physical_rs.rs_start, + physical_rs.rs_end - physical_rs.rs_start); + + mutex_enter(&vd->vdev_trim_lock); + vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0); + mutex_exit(&vd->vdev_trim_lock); + + (void) vdev_trim_ranges(&ta); + + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_enter(&vd->vdev_trim_io_lock); + while (vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] > 0) { + cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); + } + mutex_exit(&vd->vdev_trim_io_lock); + + range_tree_vacate(ta.trim_tree, NULL, NULL); + range_tree_destroy(ta.trim_tree); + + mutex_enter(&vd->vdev_trim_lock); + if (!vd->vdev_trim_exit_wanted && vdev_writeable(vd)) { + vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE, + vd->vdev_trim_rate, vd->vdev_trim_partial, + vd->vdev_trim_secure); + } + ASSERT(vd->vdev_trim_thread != NULL || + 
vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] == 0); + + /* + * Drop the vdev_trim_lock while we sync out the txg since it's + * possible that a device might be trying to come online and + * must check to see if it needs to restart a trim. That thread + * will be holding the spa_config_lock which would prevent the + * txg_wait_synced from completing. Same strategy as in + * vdev_trim_thread(). + */ + mutex_exit(&vd->vdev_trim_lock); + txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0); + mutex_enter(&vd->vdev_trim_lock); + + /* + * Update the header of the cache device here, before + * broadcasting vdev_trim_cv which may lead to the removal + * of the device. + */ + spa_config_enter(vd->vdev_spa, SCL_L2ARC, vd, + RW_READER); + bzero(dev->l2ad_dev_hdr, dev->l2ad_dev_hdr_asize); + l2arc_dev_hdr_update(dev); + spa_config_exit(vd->vdev_spa, SCL_L2ARC, vd); + + vd->vdev_trim_thread = NULL; + cv_broadcast(&vd->vdev_trim_cv); + mutex_exit(&vd->vdev_trim_lock); + + dev->l2ad_trim_all = B_FALSE; + + thread_exit(); +} + +/* + * Punches out TRIM threads for the L2ARC devices in a spa and assigns them + * to vd->vdev_trim_thread variable. This facilitates the management of + * trimming the whole cache device using TRIM_TYPE_MANUAL upon addition + * to a pool or pool creation or when the header of the device is invalid. + */ +void +vdev_trim_l2arc(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + /* + * Locate the spa's l2arc devices and kick off TRIM threads. 
+ */ + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + vdev_t *vd = spa->spa_l2cache.sav_vdevs[i]; + l2arc_dev_t *dev = l2arc_vdev_get(vd); + + if (dev == NULL || !dev->l2ad_trim_all) { + /* Don't attempt TRIM if the vdev is UNAVAIL */ + continue; + } + + mutex_enter(&vd->vdev_trim_lock); + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(vdev_is_concrete(vd)); + ASSERT3P(vd->vdev_trim_thread, ==, NULL); + ASSERT(!vd->vdev_detached); + ASSERT(!vd->vdev_trim_exit_wanted); + ASSERT(!vd->vdev_top->vdev_removing); + vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0); + vd->vdev_trim_thread = thread_create(NULL, 0, + vdev_trim_l2arc_thread, vd, 0, &p0, TS_RUN, maxclsyspri); + mutex_exit(&vd->vdev_trim_lock); + } +} + +/* + * A wrapper which calls vdev_trim_ranges(). It is intended to be called + * on leaf vdevs. + */ +int +vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size, trim_type_t type) +{ + trim_args_t ta; + range_seg64_t physical_rs; + int error; + physical_rs.rs_start = start; + physical_rs.rs_end = start + size; + + ASSERT(vdev_is_concrete(vd)); + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(!vd->vdev_detached); + ASSERT(!vd->vdev_top->vdev_removing); + + bzero(&ta, sizeof (ta)); + ta.trim_vdev = vd; + ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + ta.trim_type = type; + ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; + ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE; + ta.trim_flags = 0; + + ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); + + if (physical_rs.rs_end > physical_rs.rs_start) { + range_tree_add(ta.trim_tree, physical_rs.rs_start, + physical_rs.rs_end - physical_rs.rs_start); + } else { + ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); + } + + error = vdev_trim_ranges(&ta); + + mutex_enter(&vd->vdev_trim_io_lock); + while (vd->vdev_trim_inflight[ta.trim_type] > 0) { + cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); + } + mutex_exit(&vd->vdev_trim_io_lock); + + 
range_tree_vacate(ta.trim_tree, NULL, NULL); + range_tree_destroy(ta.trim_tree); + + return (error); +} + EXPORT_SYMBOL(vdev_trim); EXPORT_SYMBOL(vdev_trim_stop); EXPORT_SYMBOL(vdev_trim_stop_all); @@ -1446,6 +1671,8 @@ EXPORT_SYMBOL(vdev_autotrim); EXPORT_SYMBOL(vdev_autotrim_stop_all); EXPORT_SYMBOL(vdev_autotrim_stop_wait); EXPORT_SYMBOL(vdev_autotrim_restart); +EXPORT_SYMBOL(vdev_trim_l2arc); +EXPORT_SYMBOL(vdev_trim_simple); /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_max, UINT, ZMOD_RW, diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 01bab0870b05..cbad90ad1467 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -832,7 +832,7 @@ tags = ['functional', 'threadsappend'] [tests/functional/trim] tests = ['autotrim_integrity', 'autotrim_config', 'autotrim_trim_integrity', - 'trim_integrity', 'trim_config'] + 'trim_integrity', 'trim_config', 'trim_l2arc'] tags = ['functional', 'trim'] [tests/functional/truncate] diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index efbcc09e7eb4..c450764db4fc 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -38,6 +38,7 @@ KEEP_LOG_SPACEMAPS_AT_EXPORT keep_log_spacemaps_at_export zfs_keep_log_spacemaps L2ARC_NOPREFETCH l2arc.noprefetch l2arc_noprefetch L2ARC_REBUILD_BLOCKS_MIN_L2SIZE l2arc.rebuild_blocks_min_l2size l2arc_rebuild_blocks_min_l2size L2ARC_REBUILD_ENABLED l2arc.rebuild_enabled l2arc_rebuild_enabled +L2ARC_TRIM_AHEAD UNSUPPORTED l2arc_trim_ahead L2ARC_WRITE_BOOST l2arc.write_boost l2arc_write_boost L2ARC_WRITE_MAX l2arc.write_max l2arc_write_max LIVELIST_CONDENSE_NEW_ALLOC livelist.condense.new_alloc zfs_livelist_condense_new_alloc diff --git a/tests/zfs-tests/tests/functional/trim/Makefile.am b/tests/zfs-tests/tests/functional/trim/Makefile.am index 4f260a8e47eb..8917ed726e90 100644 --- a/tests/zfs-tests/tests/functional/trim/Makefile.am +++ 
b/tests/zfs-tests/tests/functional/trim/Makefile.am @@ -8,4 +8,5 @@ dist_pkgdata_SCRIPTS = \ autotrim_config.ksh \ autotrim_trim_integrity.ksh \ trim_integrity.ksh \ - trim_config.ksh + trim_config.ksh \ + trim_l2arc.ksh diff --git a/tests/zfs-tests/tests/functional/trim/trim.kshlib b/tests/zfs-tests/tests/functional/trim/trim.kshlib index 7f1bcdacf735..bede946a09c5 100644 --- a/tests/zfs-tests/tests/functional/trim/trim.kshlib +++ b/tests/zfs-tests/tests/functional/trim/trim.kshlib @@ -33,17 +33,18 @@ function get_trim_io { typeset pool="${1-:$TESTPOOL}" typeset type="${2-:ind}" + typeset vdev="${3}" typeset rval # Sum the ind or agg columns of the trim request size histogram. case "$type" in "ind") - rval=$(zpool iostat -pr $pool | awk \ + rval=$(zpool iostat -pr $pool $vdev | awk \ '$1 ~ /[0-9].*/ { sum += $12 } END { print sum }') echo -n "$rval" ;; "agg") - rval=$(zpool iostat -pr $pool | awk \ + rval=$(zpool iostat -pr $pool $vdev | awk \ '$1 ~ /[0-9].*/ { sum += $13 } END { print sum }') echo -n "$rval" ;; @@ -61,9 +62,10 @@ function verify_trim_io typeset pool="${1:-$TESTPOOL}" typeset type="${2:-ind}" typeset min_trim_ios=${3:-100} + typeset vdev="${4}" typeset ios - ios=$(get_trim_io $pool $type) + ios=$(get_trim_io $pool $type $vdev) if [[ $ios -ge $min_trim_ios ]]; then log_note "Issued $ios $type trim IOs for pool $pool" else diff --git a/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh b/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh new file mode 100755 index 000000000000..4fefc37d9748 --- /dev/null +++ b/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh @@ -0,0 +1,106 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. 
A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/trim/trim.kshlib +. $STF_SUITE/tests/functional/trim/trim.cfg + +# +# DESCRIPTION: +# Verify trimming of L2ARC +# +# STRATEGY: +# 1. Set 'l2arc_trim_ahead = 1' and 'l2arc_write_max = 64MB'. +# 2. Create a pool on file vdevs to trim. +# 3. Verify the cache device was trimmed. +# 4. Fill the pool with a file larger than the L2ARC vdev. +# 5. Randomly read the previously written file, then keep reading +# until the l2_size arcstat shrinks across a read pass. +# 6. Verify at least 5 'ind' trim IOs were issued to the cache device. +# 7. Verify the allocated space on the cache device is less than +# its size. +# + +verify_runnable "global" + +log_assert "Auto trim of L2ARC succeeds." + +function cleanup +{ + if poolexists $TESTPOOL; then + destroy_pool $TESTPOOL + fi + + log_must rm -f $VDEVS + log_must set_tunable32 L2ARC_TRIM_AHEAD $l2arc_trimahead + log_must set_tunable32 L2ARC_WRITE_MAX $l2arc_writemax +} +log_onexit cleanup + +# The cache device $TRIM_VDEV2 has to be small enough, so that +# dev->l2ad_hand loops around and dev->l2ad_first=0. Otherwise +# l2arc_evict() exits before evicting/trimming.
+typeset l2arc_trimahead=$(get_tunable L2ARC_TRIM_AHEAD) +typeset l2arc_writemax=$(get_tunable L2ARC_WRITE_MAX) +log_must set_tunable32 L2ARC_TRIM_AHEAD 1 +log_must set_tunable32 L2ARC_WRITE_MAX $(( 64 * 1024 * 1024)) +VDEVS="$TRIM_VDEV1 $TRIM_VDEV2" +log_must truncate -s $((MINVDEVSIZE)) $TRIM_VDEV2 +log_must truncate -s $((4 * MINVDEVSIZE)) $TRIM_VDEV1 +typeset VDEV_MIN_MB=$((MINVDEVSIZE * 0.30 / 1024 / 1024)) + +log_must zpool create -f $TESTPOOL $TRIM_VDEV1 cache $TRIM_VDEV2 +verify_vdevs "-le" "$VDEV_MIN_MB" $TRIM_VDEV2 + +typeset fill_mb=$(( floor(2 * MINVDEVSIZE) )) +export DIRECTORY=/$TESTPOOL +export NUMJOBS=1 +export FILE_SIZE=${fill_mb} +export PERF_RANDSEED=1234 +export PERF_COMPPERCENT=66 +export PERF_COMPCHUNK=0 +export RUNTIME=30 +export BLOCKSIZE=128K +export SYNC_TYPE=0 +export DIRECT=1 + +# Write a file of 2 * MINVDEVSIZE (twice the cache device size) to the pool. +log_must fio $FIO_SCRIPTS/mkfiles.fio + +# Read randomly from the pool to fill L2ARC. +export RUNTIME=30 +log_must fio $FIO_SCRIPTS/random_reads.fio + +export RUNTIME=1 +typeset do_once=true +while $do_once || [[ $l2_size1 -le $l2_size2 ]]; do + typeset l2_size1=$(get_arcstat l2_size) + log_must fio $FIO_SCRIPTS/random_reads.fio + typeset l2_size2=$(get_arcstat l2_size) + do_once=false +done + +verify_trim_io $TESTPOOL "ind" 5 $TRIM_VDEV2 + +typeset cache_size=$(zpool list -vp | grep $TRIM_VDEV2 | awk '{print $2}') +typeset cache_alloc=$(zpool list -vp | grep $TRIM_VDEV2 | awk '{print $3}') + +log_must test $cache_alloc -lt $cache_size + +log_must zpool destroy $TESTPOOL +log_must rm -f $VDEVS + +log_pass "Auto trim of L2ARC succeeds."