diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index ea1a3030bc42..39fbfd993adf 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -1620,7 +1620,6 @@ ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
 	itx->itx_private = zd;
 	itx->itx_wr_state = write_state;
 	itx->itx_sync = (ztest_random(8) == 0);
-	itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0);
 
 	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
 	    sizeof (*lr) - sizeof (lr_t));
diff --git a/include/sys/trace_zil.h b/include/sys/trace_zil.h
index 5b43480aea21..eaa391270d76 100644
--- a/include/sys/trace_zil.h
+++ b/include/sys/trace_zil.h
@@ -63,7 +63,6 @@ DECLARE_EVENT_CLASS(zfs_zil_class,
 	    __field(uint64_t,	zl_parse_lr_count)
 	    __field(uint64_t,	zl_next_batch)
 	    __field(uint64_t,	zl_com_batch)
-	    __field(uint64_t,	zl_itx_list_sz)
 	    __field(uint64_t,	zl_cur_used)
 	    __field(clock_t,	zl_replay_time)
 	    __field(uint64_t,	zl_replay_blks)
@@ -88,7 +87,6 @@ DECLARE_EVENT_CLASS(zfs_zil_class,
 	    __entry->zl_parse_lr_count	= zilog->zl_parse_lr_count;
 	    __entry->zl_next_batch	= zilog->zl_next_batch;
 	    __entry->zl_com_batch	= zilog->zl_com_batch;
-	    __entry->zl_itx_list_sz	= zilog->zl_itx_list_sz;
 	    __entry->zl_cur_used	= zilog->zl_cur_used;
 	    __entry->zl_replay_time	= zilog->zl_replay_time;
 	    __entry->zl_replay_blks	= zilog->zl_replay_blks;
@@ -98,8 +96,7 @@ DECLARE_EVENT_CLASS(zfs_zil_class,
 	    "replay %u stop_sync %u writer %u logbias %u sync %u "
 	    "parse_error %u parse_blk_seq %llu parse_lr_seq %llu "
 	    "parse_blk_count %llu parse_lr_count %llu next_batch %llu "
-	    "com_batch %llu itx_list_sz %llu cur_used %llu replay_time %lu "
-	    "replay_blks %llu }",
+	    "com_batch %llu cur_used %llu replay_time %lu replay_blks %llu }",
 	    __entry->zl_lr_seq, __entry->zl_commit_lr_seq,
 	    __entry->zl_destroy_txg, __entry->zl_replaying_seq,
 	    __entry->zl_suspend, __entry->zl_suspending, __entry->zl_keep_first,
@@ -107,8 +104,7 @@ DECLARE_EVENT_CLASS(zfs_zil_class,
 	    __entry->zl_logbias, __entry->zl_sync, __entry->zl_parse_error,
 	    __entry->zl_parse_blk_seq, __entry->zl_parse_lr_seq,
 	    __entry->zl_parse_blk_count, __entry->zl_parse_lr_count,
-	    __entry->zl_next_batch, __entry->zl_com_batch,
-	    __entry->zl_itx_list_sz, __entry->zl_cur_used,
+	    __entry->zl_next_batch, __entry->zl_com_batch, __entry->zl_cur_used,
 	    __entry->zl_replay_time, __entry->zl_replay_blks)
 );
 /* END CSTYLED */
diff --git a/include/sys/zil.h b/include/sys/zil.h
index 62572f894a17..95fd324b4abf 100644
--- a/include/sys/zil.h
+++ b/include/sys/zil.h
@@ -394,7 +394,6 @@ typedef struct itx {
 	uint8_t		itx_sync;	/* synchronous transaction */
 	zil_callback_t	itx_callback;	/* Called when the itx is persistent */
 	void		*itx_callback_data; /* User data for the callback */
-	uint64_t	itx_sod;	/* record size on disk */
 	uint64_t	itx_oid;	/* object id */
 	lr_t		itx_lr;		/* common part of log record */
 	/* followed by type-specific part of lr_xx_t and its immediate data */
diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h
index 0c426a15dd06..13ecca3c8b04 100644
--- a/include/sys/zil_impl.h
+++ b/include/sys/zil_impl.h
@@ -42,6 +42,7 @@ typedef struct lwb {
 	zilog_t		*lwb_zilog;	/* back pointer to log struct */
 	blkptr_t	lwb_blk;	/* on disk address of this log blk */
 	boolean_t	lwb_fastwrite;	/* is blk marked for fastwrite? */
+	boolean_t	lwb_slog;	/* lwb_blk is on SLOG device */
 	int		lwb_nused;	/* # used bytes in buffer */
 	int		lwb_sz;		/* size of block and buffer */
 	char		*lwb_buf;	/* log write buffer */
@@ -62,7 +63,6 @@ typedef struct itxs {
 typedef struct itxg {
 	kmutex_t	itxg_lock;	/* lock for this structure */
 	uint64_t	itxg_txg;	/* txg for this chain */
-	uint64_t	itxg_sod;	/* total size on disk for this txg */
 	itxs_t		*itxg_itxs;	/* sync and async itxs */
 } itxg_t;
 
@@ -120,7 +120,6 @@ struct zilog {
 	kcondvar_t	zl_cv_batch[2];	/* batch condition variables */
 	itxg_t		zl_itxg[TXG_SIZE]; /* intent log txg chains */
 	list_t		zl_itx_commit_list; /* itx list to be committed */
-	uint64_t	zl_itx_list_sz;	/* total size of records on list */
 	uint64_t	zl_cur_used;	/* current commit log size used */
 	list_t		zl_lwb_list;	/* in-flight log write list */
 	kmutex_t	zl_vdev_lock;	/* protects zl_vdev_tree */
@@ -140,9 +139,26 @@ typedef struct zil_bp_node {
 	avl_node_t	zn_node;
 } zil_bp_node_t;
 
+/*
+ * Maximum amount of write data that can be put into a single log block.
+ */
 #define	ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
     sizeof (lr_write_t))
 
+/*
+ * Maximum amount of log space we agree to waste in order to reduce the
+ * number of WR_NEED_COPY chunks and thus zl_get_data() overhead (~12%).
+ */
+#define	ZIL_MAX_WASTE_SPACE (ZIL_MAX_LOG_DATA / 8)
+
+/*
+ * Maximum amount of write data for WR_COPIED.  Fall back to WR_NEED_COPY,
+ * which is more space efficient, if we can't fit at least two log records
+ * into a maximum-sized log block.
+ */
+#define	ZIL_MAX_COPIED_DATA ((SPA_OLD_MAXBLOCKSIZE - \
+    sizeof (zil_chain_t)) / 2 - sizeof (lr_write_t))
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 23c918af18ce..4eaabc38c86f 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -515,7 +515,7 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
     const blkptr_t *bp, enum zio_flag flags);
 
 extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
-    uint64_t size, boolean_t use_slog);
+    uint64_t size, boolean_t *slog);
 extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
 extern void zio_flush(zio_t *zio, vdev_t *vd);
 extern void zio_shrink(zio_t *zio, uint64_t size);
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 51f1cd93f1cf..d4bab8327c42 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -1935,12 +1935,14 @@ Use \fB1\fR for yes and \fB0\fR for no (default).
 .sp
 .ne 2
 .na
-\fBzil_slog_limit\fR (ulong)
+\fBzil_slog_bulk\fR (ulong)
 .ad
 .RS 12n
-Max commit bytes to separate log device
+Limit SLOG write size per commit executed with synchronous priority.
+Any writes above that will be executed with lower (asynchronous) priority
+to limit potential SLOG device abuse by a single active ZIL writer.
 .sp
-Default value: \fB1,048,576\fR.
+Default value: \fB786,432\fR.
 .RE
 .sp
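Note: all three limits above derive from SPA_OLD_MAXBLOCKSIZE. As a rough standalone illustration of the resulting values (the sizeof figures below are assumptions for this sketch, not values taken from the patch; in-tree they come from the zil_chain_t and lr_write_t definitions in sys/zil.h):

/*
 * Hypothetical userspace check of the ZIL size limits, assuming a
 * 128 KiB SPA_OLD_MAXBLOCKSIZE and placeholder struct sizes.
 */
#include <stdio.h>

int main(void)
{
	const unsigned long max_blk = 128 * 1024; /* SPA_OLD_MAXBLOCKSIZE */
	const unsigned long chain = 184;   /* assumed sizeof (zil_chain_t) */
	const unsigned long lrw = 192;     /* assumed sizeof (lr_write_t) */
	const unsigned long max_log_data = max_blk - chain - lrw;

	printf("ZIL_MAX_LOG_DATA    ~%lu\n", max_log_data);
	printf("ZIL_MAX_WASTE_SPACE ~%lu\n", max_log_data / 8);
	printf("ZIL_MAX_COPIED_DATA ~%lu\n", (max_blk - chain) / 2 - lrw);
	return (0);
}

With those assumptions ZIL_MAX_COPIED_DATA lands just under 64 KiB, which is what guarantees at least two WR_COPIED records fit in a maximum-sized log block.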
diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c
index ca369905246a..8887f037aa34 100644
--- a/module/zfs/zfs_log.c
+++ b/module/zfs/zfs_log.c
@@ -487,10 +487,9 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 	znode_t *zp, offset_t off, ssize_t resid, int ioflag,
 	zil_callback_t callback, void *callback_data)
 {
+	uint32_t blocksize = zp->z_blksz;
 	itx_wr_state_t write_state;
-	boolean_t slogging;
 	uintptr_t fsync_cnt;
-	ssize_t immediate_write_sz;
 
 	if (zil_replaying(zilog, tx) || zp->z_unlinked ||
 	    zfs_xattr_owner_unlinked(zp)) {
@@ -499,12 +498,10 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 		return;
 	}
 
-	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
-	    ? 0 : (ssize_t)zfs_immediate_write_sz;
-
-	slogging = spa_has_slogs(zilog->zl_spa) &&
-	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
-	if (resid > immediate_write_sz && !slogging && resid <= zp->z_blksz)
+	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+		write_state = WR_INDIRECT;
+	else if (!spa_has_slogs(zilog->zl_spa) &&
+	    resid >= zfs_immediate_write_sz)
 		write_state = WR_INDIRECT;
 	else if (ioflag & (FSYNC | FDSYNC))
 		write_state = WR_COPIED;
@@ -518,30 +515,26 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
 	while (resid) {
 		itx_t *itx;
 		lr_write_t *lr;
-		ssize_t len;
+		itx_wr_state_t wr_state = write_state;
+		ssize_t len = resid;
 
-		/*
-		 * If the write would overflow the largest block then split it.
-		 */
-		if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA)
-			len = SPA_OLD_MAXBLOCKSIZE >> 1;
-		else
-			len = resid;
+		if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
+			wr_state = WR_NEED_COPY;
+		else if (wr_state == WR_INDIRECT)
+			len = MIN(blocksize - P2PHASE(off, blocksize), resid);
 
 		itx = zil_itx_create(txtype, sizeof (*lr) +
-		    (write_state == WR_COPIED ? len : 0));
+		    (wr_state == WR_COPIED ? len : 0));
 		lr = (lr_write_t *)&itx->itx_lr;
-		if (write_state == WR_COPIED && dmu_read(ZTOZSB(zp)->z_os,
+		if (wr_state == WR_COPIED && dmu_read(ZTOZSB(zp)->z_os,
 		    zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
 			zil_itx_destroy(itx);
 			itx = zil_itx_create(txtype, sizeof (*lr));
 			lr = (lr_write_t *)&itx->itx_lr;
-			write_state = WR_NEED_COPY;
+			wr_state = WR_NEED_COPY;
 		}
 
-		itx->itx_wr_state = write_state;
-		if (write_state == WR_NEED_COPY)
-			itx->itx_sod += len;
+		itx->itx_wr_state = wr_state;
 		lr->lr_foid = zp->z_id;
 		lr->lr_offset = off;
 		lr->lr_length = len;
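The rewritten preamble above replaces the old slogging/immediate_write_sz interplay with a single up-front policy decision per call. Sketched as a standalone helper (hypothetical name; assumes the in-tree types and tunables — the patch itself keeps this logic inline):

/*
 * Sketch of the record-type policy zfs_log_write() now applies up front.
 * WR_INDIRECT leaves the data in its final on-disk location and logs only
 * a pointer; WR_COPIED embeds the data in the itx now; WR_NEED_COPY defers
 * the copy until the itx is committed to a log write buffer (lwb).
 */
static itx_wr_state_t
choose_write_state(zilog_t *zilog, ssize_t resid, int ioflag)
{
	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
		return (WR_INDIRECT);
	if (!spa_has_slogs(zilog->zl_spa) && resid >= zfs_immediate_write_sz)
		return (WR_INDIRECT);
	if (ioflag & (FSYNC | FDSYNC))
		return (WR_COPIED);
	return (WR_NEED_COPY);
}

Per chunk, the loop may still demote WR_COPIED to WR_NEED_COPY, either because the write exceeds ZIL_MAX_COPIED_DATA or because the dmu_read() copy fails.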
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 12a034d5ba89..6a1f190f5e6c 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -102,6 +102,13 @@ int zil_replay_disable = 0;
  */
 int zfs_nocacheflush = 0;
 
+/*
+ * Limit SLOG write size per commit executed with synchronous priority.
+ * Any writes above that will be executed with lower (asynchronous) priority
+ * to limit potential SLOG device abuse by a single active ZIL writer.
+ */
+unsigned long zil_slog_bulk = 768 * 1024;
+
 static kmem_cache_t *zil_lwb_cache;
 
 static void zil_async_to_sync(zilog_t *zilog, uint64_t foid);
@@ -449,7 +456,8 @@ zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg)
 }
 
 static lwb_t *
-zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg, boolean_t fastwrite)
+zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg,
+    boolean_t fastwrite)
 {
 	lwb_t *lwb;
 
@@ -457,6 +465,7 @@ zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg, boolean_t fastwrite)
 	lwb->lwb_zilog = zilog;
 	lwb->lwb_blk = *bp;
 	lwb->lwb_fastwrite = fastwrite;
+	lwb->lwb_slog = slog;
 	lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
 	lwb->lwb_max_txg = txg;
 	lwb->lwb_zio = NULL;
@@ -542,6 +551,7 @@ zil_create(zilog_t *zilog)
 	blkptr_t blk;
 	int error = 0;
 	boolean_t fastwrite = FALSE;
+	boolean_t slog = FALSE;
 
 	/*
 	 * Wait for any previous destroy to complete.
@@ -570,7 +580,7 @@ zil_create(zilog_t *zilog)
 		}
 
 		error = zio_alloc_zil(zilog->zl_spa, txg, &blk,
-		    ZIL_MIN_BLKSZ, B_TRUE);
+		    ZIL_MIN_BLKSZ, &slog);
 		fastwrite = TRUE;
 
 		if (error == 0)
@@ -581,7 +591,7 @@ zil_create(zilog_t *zilog)
 	 * Allocate a log write buffer (lwb) for the first log block.
 	 */
 	if (error == 0)
-		lwb = zil_alloc_lwb(zilog, &blk, txg, fastwrite);
+		lwb = zil_alloc_lwb(zilog, &blk, slog, txg, fastwrite);
 
 	/*
 	 * If we just allocated the first log block, commit our transaction
@@ -915,6 +925,7 @@ static void
 zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
 {
 	zbookmark_phys_t zb;
+	zio_priority_t prio;
 
 	SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
@@ -934,9 +945,13 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
 			metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk);
 			lwb->lwb_fastwrite = 1;
 		}
+		if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
+			prio = ZIO_PRIORITY_SYNC_WRITE;
+		else
+			prio = ZIO_PRIORITY_ASYNC_WRITE;
 		lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
 		    0, &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk),
-		    zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE,
+		    zil_lwb_write_done, lwb, prio,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
 		    ZIO_FLAG_FASTWRITE, &zb);
 	}
@@ -957,15 +972,6 @@ uint64_t zil_block_buckets[] = {
     UINT64_MAX
 };
 
-/*
- * Use the slog as long as the current commit size is less than the
- * limit or the total list size is less than 2X the limit.  Limit
- * checking is disabled by setting zil_slog_limit to UINT64_MAX.
- */
-unsigned long zil_slog_limit = 1024 * 1024;
-#define	USE_SLOG(zilog) (((zilog)->zl_cur_used < zil_slog_limit) || \
-	((zilog)->zl_itx_list_sz < (zil_slog_limit << 1)))
-
 /*
  * Start a log block write and advance to the next log block.
  * Calls are serialized.
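zil_slog_bulk only changes I/O priority, not placement: the commit path keeps allocating from the log class, but once a single commit has pushed more than zil_slog_bulk bytes (tracked in zl_cur_used), further lwb writes to a SLOG are queued at asynchronous priority so one fsync-heavy writer cannot starve other sync I/O sharing the SLOG. Restated as a hypothetical helper for readability (the patch keeps this inline in zil_lwb_write_init()):

/*
 * Illustrative restatement of the new priority rule: main-pool lwbs are
 * always sync; SLOG lwbs are sync only until the current commit has
 * written zil_slog_bulk bytes.
 */
static zio_priority_t
zil_lwb_prio(const zilog_t *zilog, const lwb_t *lwb)
{
	if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk)
		return (ZIO_PRIORITY_SYNC_WRITE);
	return (ZIO_PRIORITY_ASYNC_WRITE);
}

On Linux builds the module_param() further below makes this tunable at runtime, e.g. through /sys/module/zfs/parameters/zil_slog_bulk.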
@@ -981,7 +987,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
 	uint64_t txg;
 	uint64_t zil_blksz, wsz;
 	int i, error;
-	boolean_t use_slog;
+	boolean_t slog;
 
 	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
 		zilc = (zil_chain_t *)lwb->lwb_buf;
@@ -1037,10 +1043,8 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
 	zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);
 
 	BP_ZERO(bp);
-	use_slog = USE_SLOG(zilog);
-	error = zio_alloc_zil(spa, txg, bp, zil_blksz,
-	    USE_SLOG(zilog));
-	if (use_slog) {
+	error = zio_alloc_zil(spa, txg, bp, zil_blksz, &slog);
+	if (slog) {
 		ZIL_STAT_BUMP(zil_itx_metaslab_slog_count);
 		ZIL_STAT_INCR(zil_itx_metaslab_slog_bytes, lwb->lwb_nused);
 	} else {
@@ -1055,7 +1059,7 @@
 		/*
 		 * Allocate a new log write buffer (lwb).
 		 */
-		nlwb = zil_alloc_lwb(zilog, bp, txg, TRUE);
+		nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE);
 
 		/* Record the block for later vdev flushing */
 		zil_add_block(zilog, &lwb->lwb_blk);
@@ -1092,45 +1096,53 @@ static lwb_t *
 zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 {
-	lr_t *lrc = &itx->itx_lr; /* common log record */
-	lr_write_t *lrw = (lr_write_t *)lrc;
+	lr_t *lrcb, *lrc;
+	lr_write_t *lrwb, *lrw;
 	char *lr_buf;
-	uint64_t txg = lrc->lrc_txg;
-	uint64_t reclen = lrc->lrc_reclen;
-	uint64_t dlen = 0;
+	uint64_t dlen, dnow, lwb_sp, reclen, txg;
 
 	if (lwb == NULL)
 		return (NULL);
 
 	ASSERT(lwb->lwb_buf != NULL);
 
-	if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
+	lrc = &itx->itx_lr;		/* Common log record inside itx. */
+	lrw = (lr_write_t *)lrc;	/* Write log record inside itx. */
+	if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) {
 		dlen = P2ROUNDUP_TYPED(
 		    lrw->lr_length, sizeof (uint64_t), uint64_t);
-
+	} else {
+		dlen = 0;
+	}
+	reclen = lrc->lrc_reclen;
 	zilog->zl_cur_used += (reclen + dlen);
+	txg = lrc->lrc_txg;
 
 	zil_lwb_write_init(zilog, lwb);
 
+cont:
 	/*
 	 * If this record won't fit in the current log block, start a new one.
+	 * For WR_NEED_COPY, optimize layout for a minimal number of chunks.
 	 */
-	if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
+	lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+	if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
+	    lwb_sp < ZIL_MAX_WASTE_SPACE && (dlen % ZIL_MAX_LOG_DATA == 0 ||
+	    lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) {
 		lwb = zil_lwb_write_start(zilog, lwb);
 		if (lwb == NULL)
 			return (NULL);
 		zil_lwb_write_init(zilog, lwb);
 		ASSERT(LWB_EMPTY(lwb));
-		if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
-			txg_wait_synced(zilog->zl_dmu_pool, txg);
-			return (lwb);
-		}
+		lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+		ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
 	}
 
+	dnow = MIN(dlen, lwb_sp - reclen);
 	lr_buf = lwb->lwb_buf + lwb->lwb_nused;
 	bcopy(lrc, lr_buf, reclen);
-	lrc = (lr_t *)lr_buf;
-	lrw = (lr_write_t *)lrc;
+	lrcb = (lr_t *)lr_buf;		/* Like lrc, but inside lwb. */
+	lrwb = (lr_write_t *)lrcb;	/* Like lrw, but inside lwb. */
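The reworked overflow test above is the subtle core of the change. Restated as a standalone predicate (hypothetical name; compiles only against the in-tree headers that define boolean_t and the ZIL_MAX_* macros):

/*
 * Equivalent form of the new split decision in zil_lwb_commit(): fill the
 * tail of the current lwb with part of a WR_NEED_COPY record only when
 * that saves a chunk; otherwise waste the (small) tail and start fresh.
 */
static boolean_t
start_new_lwb(uint64_t reclen, uint64_t dlen, uint64_t lwb_sp)
{
	if (reclen > lwb_sp)		/* header alone won't fit */
		return (B_TRUE);
	if (reclen + dlen <= lwb_sp)	/* whole record fits */
		return (B_FALSE);
	/* A split is unavoidable; skip the tail only if that is cheap. */
	return (lwb_sp < ZIL_MAX_WASTE_SPACE &&
	    (dlen % ZIL_MAX_LOG_DATA == 0 ||
	    lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA));
}

For example, when dlen is an exact multiple of ZIL_MAX_LOG_DATA, partially filling the tail cannot reduce the number of chunks, so up to ZIL_MAX_WASTE_SPACE bytes of tail are deliberately left unused — the "waste" the new zil_impl.h comment budgets for.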
 
 	ZIL_STAT_BUMP(zil_itx_count);
 
@@ -1147,10 +1159,13 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 		char *dbuf;
 		int error;
 
-		if (dlen) {
-			ASSERT(itx->itx_wr_state == WR_NEED_COPY);
+		if (itx->itx_wr_state == WR_NEED_COPY) {
 			dbuf = lr_buf + reclen;
-			lrw->lr_common.lrc_reclen += dlen;
+			lrcb->lrc_reclen += dnow;
+			if (lrwb->lr_length > dnow)
+				lrwb->lr_length = dnow;
+			lrw->lr_offset += dnow;
+			lrw->lr_length -= dnow;
 			ZIL_STAT_BUMP(zil_itx_needcopy_count);
 			ZIL_STAT_INCR(zil_itx_needcopy_bytes,
 			    lrw->lr_length);
@@ -1162,7 +1177,7 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 			    lrw->lr_length);
 		}
 		error = zilog->zl_get_data(
-		    itx->itx_private, lrw, dbuf, lwb->lwb_zio);
+		    itx->itx_private, lrwb, dbuf, lwb->lwb_zio);
 		if (error == EIO) {
 			txg_wait_synced(zilog->zl_dmu_pool, txg);
 			return (lwb);
@@ -1181,12 +1196,18 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 	 * equal to the itx sequence number because not all transactions
 	 * are synchronous, and sometimes spa_sync() gets there first.
 	 */
-	lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
-	lwb->lwb_nused += reclen + dlen;
+	lrcb->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
+	lwb->lwb_nused += reclen + dnow;
 	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
 	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
 	ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
 
+	dlen -= dnow;
+	if (dlen > 0) {
+		zilog->zl_cur_used += reclen;
+		goto cont;
+	}
+
 	return (lwb);
 }
 
@@ -1200,7 +1221,6 @@ zil_itx_create(uint64_t txtype, size_t lrsize)
 	itx = zio_data_buf_alloc(offsetof(itx_t, itx_lr) + lrsize);
 	itx->itx_lr.lrc_txtype = txtype;
 	itx->itx_lr.lrc_reclen = lrsize;
-	itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
 	itx->itx_lr.lrc_seq = 0;	/* defensive */
 	itx->itx_sync = B_TRUE;		/* default is synchronous */
 	itx->itx_callback = NULL;
@@ -1351,11 +1371,8 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
 		 */
 		zfs_dbgmsg("zil_itx_assign: missed itx cleanup for "
 		    "txg %llu", itxg->itxg_txg);
-		atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
-		itxg->itxg_sod = 0;
 		clean = itxg->itxg_itxs;
 	}
-	ASSERT(itxg->itxg_sod == 0);
 	itxg->itxg_txg = txg;
 	itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);
@@ -1368,8 +1385,6 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
 	}
 
 	if (itx->itx_sync) {
 		list_insert_tail(&itxs->i_sync_list, itx);
-		atomic_add_64(&zilog->zl_itx_list_sz, itx->itx_sod);
-		itxg->itxg_sod += itx->itx_sod;
 	} else {
 		avl_tree_t *t = &itxs->i_async_tree;
 		uint64_t foid =
@@ -1419,8 +1434,6 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg)
 	ASSERT3U(itxg->itxg_txg, <=, synced_txg);
 	ASSERT(itxg->itxg_txg != 0);
 	ASSERT(zilog->zl_clean_taskq != NULL);
-	atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
-	itxg->itxg_sod = 0;
 	clean_me = itxg->itxg_itxs;
 	itxg->itxg_itxs = NULL;
 	itxg->itxg_txg = 0;
@@ -1444,7 +1457,6 @@ zil_get_commit_list(zilog_t *zilog)
 {
 	uint64_t otxg, txg;
 	list_t *commit_list = &zilog->zl_itx_commit_list;
-	uint64_t push_sod = 0;
 
 	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
 		otxg = ZILTEST_TXG;
@@ -1476,12 +1488,9 @@ zil_get_commit_list(zilog_t *zilog)
 		ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
 		    spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
 		list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
-		push_sod += itxg->itxg_sod;
-		itxg->itxg_sod = 0;
 
 		mutex_exit(&itxg->itxg_lock);
 	}
-	atomic_add_64(&zilog->zl_itx_list_sz, -push_sod);
 }
 
 /*
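The dlen/dnow bookkeeping and the goto cont loop above replace the old behavior of giving up and calling txg_wait_synced() when a record could not fit. A toy userspace model of the carving (illustrative sizes, not patch code):

/*
 * Toy model of how one large WR_NEED_COPY record is carved across log
 * blocks: each pass writes what fits (dnow), re-charges one record header
 * for the continuation, and loops, mirroring "goto cont" above.
 */
#include <stdio.h>

int main(void)
{
	const unsigned long reclen = 192;	/* assumed header size */
	const unsigned long lwb_cap = 130560;	/* assumed usable lwb bytes */
	unsigned long dlen = 300 * 1024;	/* data left to log */

	while (dlen > 0) {
		unsigned long dnow =
		    dlen < lwb_cap - reclen ? dlen : lwb_cap - reclen;
		printf("chunk of %lu bytes\n", dnow);
		dlen -= dnow;	/* lr_offset += dnow; lr_length -= dnow */
	}
	return (0);
}

Each continuation also bumps lrw->lr_offset and shrinks lrw->lr_length in the original itx, so the next pass (and zl_get_data()) sees only the still-unwritten range.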
@@ -2304,13 +2313,14 @@
 EXPORT_SYMBOL(zil_set_sync);
 EXPORT_SYMBOL(zil_set_logbias);
 
+/* BEGIN CSTYLED */
 module_param(zil_replay_disable, int, 0644);
 MODULE_PARM_DESC(zil_replay_disable, "Disable intent logging replay");
 
 module_param(zfs_nocacheflush, int, 0644);
 MODULE_PARM_DESC(zfs_nocacheflush, "Disable cache flushes");
 
-/* CSTYLED */
-module_param(zil_slog_limit, ulong, 0644);
-MODULE_PARM_DESC(zil_slog_limit, "Max commit bytes to separate log device");
+module_param(zil_slog_bulk, ulong, 0644);
+MODULE_PARM_DESC(zil_slog_bulk, "Limit in bytes of SLOG sync writes per commit");
+/* END CSTYLED */
 #endif
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 61eb575ef817..acfc49eb5f47 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -3098,7 +3098,7 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
  */
 int
 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size,
-    boolean_t use_slog)
+    boolean_t *slog)
 {
 	int error = 1;
 	zio_alloc_list_t io_alloc_list;
@@ -3106,17 +3106,16 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size,
 	ASSERT(txg > spa_syncing_txg(spa));
 
 	metaslab_trace_init(&io_alloc_list);
-
-	if (use_slog) {
-		error = metaslab_alloc(spa, spa_log_class(spa), size,
-		    new_bp, 1, txg, NULL, METASLAB_FASTWRITE,
-		    &io_alloc_list, NULL);
-	}
-
-	if (error) {
+	error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
+	    txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL);
+	if (error == 0) {
+		*slog = TRUE;
+	} else {
 		error = metaslab_alloc(spa, spa_normal_class(spa), size,
 		    new_bp, 1, txg, NULL, METASLAB_FASTWRITE,
 		    &io_alloc_list, NULL);
+		if (error == 0)
+			*slog = FALSE;
 	}
 	metaslab_trace_fini(&io_alloc_list);
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
index 1a86cd3cdc0c..bf9f48adb2f3 100644
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -611,53 +611,44 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset,
 {
 	uint32_t blocksize = zv->zv_volblocksize;
 	zilog_t *zilog = zv->zv_zilog;
-	boolean_t slogging;
-	ssize_t immediate_write_sz;
+	itx_wr_state_t write_state;
 
 	if (zil_replaying(zilog, tx))
 		return;
 
-	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
-	    ? 0 : zvol_immediate_write_sz;
-
-	slogging = spa_has_slogs(zilog->zl_spa) &&
-	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
+	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+		write_state = WR_INDIRECT;
+	else if (!spa_has_slogs(zilog->zl_spa) &&
+	    size >= blocksize && blocksize > zvol_immediate_write_sz)
+		write_state = WR_INDIRECT;
+	else if (sync)
+		write_state = WR_COPIED;
+	else
+		write_state = WR_NEED_COPY;
 
 	while (size) {
 		itx_t *itx;
 		lr_write_t *lr;
-		ssize_t len;
-		itx_wr_state_t write_state;
+		itx_wr_state_t wr_state = write_state;
+		ssize_t len = size;
 
-		/*
-		 * Unlike zfs_log_write() we can be called with
-		 * up to DMU_MAX_ACCESS/2 (5MB) writes.
-		 */
-		if (blocksize > immediate_write_sz && !slogging &&
-		    size >= blocksize && offset % blocksize == 0) {
-			write_state = WR_INDIRECT; /* uses dmu_sync */
-			len = blocksize;
-		} else if (sync) {
-			write_state = WR_COPIED;
-			len = MIN(ZIL_MAX_LOG_DATA, size);
-		} else {
-			write_state = WR_NEED_COPY;
-			len = MIN(ZIL_MAX_LOG_DATA, size);
-		}
+		if (wr_state == WR_COPIED && size > ZIL_MAX_COPIED_DATA)
+			wr_state = WR_NEED_COPY;
+		else if (wr_state == WR_INDIRECT)
+			len = MIN(blocksize - P2PHASE(offset, blocksize), size);
 
 		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
-		    (write_state == WR_COPIED ? len : 0));
+		    (wr_state == WR_COPIED ? len : 0));
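zio_alloc_zil() now always tries the log (SLOG) class first and reports through *slog where the block actually landed, so callers no longer pre-decide placement. The resulting caller pattern, paraphrasing zil_lwb_write_start() above (fragment for illustration; assumes the surrounding in-tree declarations of spa, txg, bp, and zil_blksz):

/* Allocate the next log block and remember whether it hit the SLOG. */
boolean_t slog;

error = zio_alloc_zil(spa, txg, bp, zil_blksz, &slog);
if (error == 0)
	nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE);
/* slog feeds both the kstat accounting and the later priority choice. */

This is what lets the zil_itx_metaslab_slog_* versus _normal_* accounting reflect reality even when the log class is full and allocation silently falls back to the normal class.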
 		lr = (lr_write_t *)&itx->itx_lr;
-		if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
+		if (wr_state == WR_COPIED && dmu_read(zv->zv_objset,
 		    ZVOL_OBJ, offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) {
 			zil_itx_destroy(itx);
 			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
 			lr = (lr_write_t *)&itx->itx_lr;
-			write_state = WR_NEED_COPY;
+			wr_state = WR_NEED_COPY;
 		}
 
-		itx->itx_wr_state = write_state;
-		if (write_state == WR_NEED_COPY)
-			itx->itx_sod += len;
+		itx->itx_wr_state = wr_state;
 		lr->lr_foid = ZVOL_OBJ;
 		lr->lr_offset = offset;
 		lr->lr_length = len;
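Both zfs_log_write() and zvol_log_write() now share the same WR_INDIRECT chunking: each record covers at most one block and never crosses a block boundary, since dmu_sync() operates on single blocks. A runnable illustration (sizes are examples only):

/*
 * Demonstration of block-aligned WR_INDIRECT chunking:
 * len = MIN(blocksize - P2PHASE(offset, blocksize), size).
 */
#include <stdio.h>

#define	P2PHASE(x, align)	((x) & ((align) - 1))
#define	MIN(a, b)		((a) < (b) ? (a) : (b))

int main(void)
{
	const unsigned long blocksize = 8192;	/* e.g. zv_volblocksize */
	unsigned long offset = 6144, size = 20480;

	while (size > 0) {
		unsigned long len =
		    MIN(blocksize - P2PHASE(offset, blocksize), size);
		printf("record: offset %lu len %lu\n", offset, len);
		offset += len;
		size -= len;
	}
	return (0);
}

For an 8 KiB volblocksize, a 20 KiB write at offset 6 KiB becomes records of 2 KiB, 8 KiB, 8 KiB, and 2 KiB, each covering at most one block for dmu_sync().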