Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

L2ARC: Relax locking during write #16040

Merged
merged 1 commit into from
Apr 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion include/sys/multilist.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,12 +82,15 @@ int multilist_is_empty(multilist_t *);
unsigned int multilist_get_num_sublists(multilist_t *);
unsigned int multilist_get_random_index(multilist_t *);

multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int);
void multilist_sublist_lock(multilist_sublist_t *);
multilist_sublist_t *multilist_sublist_lock_idx(multilist_t *, unsigned int);
multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *);
void multilist_sublist_unlock(multilist_sublist_t *);

void multilist_sublist_insert_head(multilist_sublist_t *, void *);
void multilist_sublist_insert_tail(multilist_sublist_t *, void *);
void multilist_sublist_insert_after(multilist_sublist_t *, void *, void *);
void multilist_sublist_insert_before(multilist_sublist_t *, void *, void *);
void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj);
void multilist_sublist_remove(multilist_sublist_t *, void *);
int multilist_sublist_is_empty(multilist_sublist_t *);
Expand Down
179 changes: 93 additions & 86 deletions module/zfs/arc.c
Original file line number Diff line number Diff line change
Expand Up @@ -3872,7 +3872,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,

ASSERT3P(marker, !=, NULL);

mls = multilist_sublist_lock(ml, idx);
mls = multilist_sublist_lock_idx(ml, idx);

for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL);
hdr = multilist_sublist_prev(mls, marker)) {
Expand Down Expand Up @@ -3984,6 +3984,26 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
return (bytes_evicted);
}

static arc_buf_hdr_t *
arc_state_alloc_marker(void)
{
arc_buf_hdr_t *marker = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);

/*
* A b_spa of 0 is used to indicate that this header is
* a marker. This fact is used in arc_evict_state_impl().
*/
marker->b_spa = 0;

return (marker);
}

static void
arc_state_free_marker(arc_buf_hdr_t *marker)
{
kmem_cache_free(hdr_full_cache, marker);
}

/*
* Allocate an array of buffer headers used as placeholders during arc state
* eviction.
Expand All @@ -3994,24 +4014,16 @@ arc_state_alloc_markers(int count)
arc_buf_hdr_t **markers;

markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP);
for (int i = 0; i < count; i++) {
markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);

/*
* A b_spa of 0 is used to indicate that this header is
* a marker. This fact is used in arc_evict_state_impl().
*/
markers[i]->b_spa = 0;

}
for (int i = 0; i < count; i++)
markers[i] = arc_state_alloc_marker();
return (markers);
}

static void
arc_state_free_markers(arc_buf_hdr_t **markers, int count)
{
for (int i = 0; i < count; i++)
kmem_cache_free(hdr_full_cache, markers[i]);
arc_state_free_marker(markers[i]);
kmem_free(markers, sizeof (*markers) * count);
}

Expand Down Expand Up @@ -4055,7 +4067,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
for (int i = 0; i < num_sublists; i++) {
multilist_sublist_t *mls;

mls = multilist_sublist_lock(ml, i);
mls = multilist_sublist_lock_idx(ml, i);
multilist_sublist_insert_tail(mls, markers[i]);
multilist_sublist_unlock(mls);
}
Expand Down Expand Up @@ -4120,7 +4132,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
}

for (int i = 0; i < num_sublists; i++) {
multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
multilist_sublist_remove(mls, markers[i]);
multilist_sublist_unlock(mls);
}
Expand Down Expand Up @@ -8633,7 +8645,7 @@ l2arc_sublist_lock(int list_num)
* sublists being selected.
*/
idx = multilist_get_random_index(ml);
return (multilist_sublist_lock(ml, idx));
return (multilist_sublist_lock_idx(ml, idx));
}

/*
Expand Down Expand Up @@ -9046,9 +9058,9 @@ l2arc_blk_fetch_done(zio_t *zio)
static uint64_t
l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
{
arc_buf_hdr_t *hdr, *hdr_prev, *head;
uint64_t write_asize, write_psize, write_lsize, headroom;
boolean_t full;
arc_buf_hdr_t *hdr, *head, *marker;
uint64_t write_asize, write_psize, headroom;
boolean_t full, from_head = !arc_warm;
l2arc_write_callback_t *cb = NULL;
zio_t *pio, *wzio;
uint64_t guid = spa_load_guid(spa);
Expand All @@ -9057,10 +9069,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
ASSERT3P(dev->l2ad_vdev, !=, NULL);

pio = NULL;
write_lsize = write_asize = write_psize = 0;
write_asize = write_psize = 0;
full = B_FALSE;
head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
marker = arc_state_alloc_marker();

/*
* Copy buffers for L2ARC writing.
Expand All @@ -9075,40 +9088,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
continue;
}

multilist_sublist_t *mls = l2arc_sublist_lock(pass);
uint64_t passed_sz = 0;

VERIFY3P(mls, !=, NULL);
headroom = target_sz * l2arc_headroom;
if (zfs_compressed_arc_enabled)
headroom = (headroom * l2arc_headroom_boost) / 100;

/*
* L2ARC fast warmup.
*
* Until the ARC is warm and starts to evict, read from the
* head of the ARC lists rather than the tail.
*/
if (arc_warm == B_FALSE)
multilist_sublist_t *mls = l2arc_sublist_lock(pass);
ASSERT3P(mls, !=, NULL);
if (from_head)
hdr = multilist_sublist_head(mls);
else
hdr = multilist_sublist_tail(mls);

headroom = target_sz * l2arc_headroom;
if (zfs_compressed_arc_enabled)
headroom = (headroom * l2arc_headroom_boost) / 100;

for (; hdr; hdr = hdr_prev) {
while (hdr != NULL) {
kmutex_t *hash_lock;
abd_t *to_write = NULL;

if (arc_warm == B_FALSE)
hdr_prev = multilist_sublist_next(mls, hdr);
else
hdr_prev = multilist_sublist_prev(mls, hdr);

hash_lock = HDR_LOCK(hdr);
if (!mutex_tryenter(hash_lock)) {
/*
* Skip this buffer rather than waiting.
*/
skip:
/* Skip this buffer rather than waiting. */
if (from_head)
hdr = multilist_sublist_next(mls, hdr);
else
hdr = multilist_sublist_prev(mls, hdr);
continue;
}

Expand All @@ -9123,11 +9130,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)

if (!l2arc_write_eligible(guid, hdr)) {
mutex_exit(hash_lock);
continue;
goto skip;
}

ASSERT(HDR_HAS_L1HDR(hdr));

ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
ASSERT3U(arc_hdr_size(hdr), >, 0);
ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
Expand All @@ -9149,12 +9155,18 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
}

/*
* We rely on the L1 portion of the header below, so
* it's invalid for this header to have been evicted out
* of the ghost cache, prior to being written out. The
* ARC_FLAG_L2_WRITING bit ensures this won't happen.
* We should not sleep with sublist lock held or it
* may block ARC eviction. Insert a marker to save
* the position and drop the lock.
*/
arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING);
if (from_head) {
multilist_sublist_insert_after(mls, hdr,
marker);
} else {
multilist_sublist_insert_before(mls, hdr,
marker);
}
multilist_sublist_unlock(mls);

/*
* If this header has b_rabd, we can use this since it
Expand Down Expand Up @@ -9185,87 +9197,80 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
&to_write);
if (ret != 0) {
arc_hdr_clear_flags(hdr,
ARC_FLAG_L2_WRITING);
ARC_FLAG_L2CACHE);
mutex_exit(hash_lock);
continue;
goto next;
}

l2arc_free_abd_on_write(to_write, asize, type);
}

hdr->b_l2hdr.b_dev = dev;
hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
hdr->b_l2hdr.b_hits = 0;
hdr->b_l2hdr.b_arcs_state =
hdr->b_l1hdr.b_state->arcs_state;
mutex_enter(&dev->l2ad_mtx);
if (pio == NULL) {
/*
* Insert a dummy header on the buflist so
* l2arc_write_done() can find where the
* write buffers begin without searching.
*/
mutex_enter(&dev->l2ad_mtx);
list_insert_head(&dev->l2ad_buflist, head);
mutex_exit(&dev->l2ad_mtx);
}
list_insert_head(&dev->l2ad_buflist, hdr);
mutex_exit(&dev->l2ad_mtx);
arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR |
ARC_FLAG_L2_WRITING);

(void) zfs_refcount_add_many(&dev->l2ad_alloc,
arc_hdr_size(hdr), hdr);
l2arc_hdr_arcstats_increment(hdr);

boolean_t commit = l2arc_log_blk_insert(dev, hdr);
mutex_exit(hash_lock);

if (pio == NULL) {
cb = kmem_alloc(
sizeof (l2arc_write_callback_t), KM_SLEEP);
cb->l2wcb_dev = dev;
cb->l2wcb_head = head;
/*
* Create a list to save allocated abd buffers
* for l2arc_log_blk_commit().
*/
list_create(&cb->l2wcb_abd_list,
sizeof (l2arc_lb_abd_buf_t),
offsetof(l2arc_lb_abd_buf_t, node));
pio = zio_root(spa, l2arc_write_done, cb,
ZIO_FLAG_CANFAIL);
}

hdr->b_l2hdr.b_dev = dev;
hdr->b_l2hdr.b_hits = 0;

hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
hdr->b_l2hdr.b_arcs_state =
hdr->b_l1hdr.b_state->arcs_state;
arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR);

mutex_enter(&dev->l2ad_mtx);
list_insert_head(&dev->l2ad_buflist, hdr);
mutex_exit(&dev->l2ad_mtx);

(void) zfs_refcount_add_many(&dev->l2ad_alloc,
arc_hdr_size(hdr), hdr);

wzio = zio_write_phys(pio, dev->l2ad_vdev,
hdr->b_l2hdr.b_daddr, asize, to_write,
dev->l2ad_hand, asize, to_write,
ZIO_CHECKSUM_OFF, NULL, hdr,
ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_CANFAIL, B_FALSE);

write_lsize += HDR_GET_LSIZE(hdr);
DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
zio_t *, wzio);
zio_nowait(wzio);

write_psize += psize;
write_asize += asize;
dev->l2ad_hand += asize;
l2arc_hdr_arcstats_increment(hdr);
vdev_space_update(dev->l2ad_vdev, asize, 0, 0);

mutex_exit(hash_lock);

/*
* Append buf info to current log and commit if full.
* arcstat_l2_{size,asize} kstats are updated
* internally.
*/
if (l2arc_log_blk_insert(dev, hdr)) {
/*
* l2ad_hand will be adjusted in
* l2arc_log_blk_commit().
*/
if (commit) {
/* l2ad_hand will be adjusted inside. */
write_asize +=
l2arc_log_blk_commit(dev, pio, cb);
}

zio_nowait(wzio);
next:
multilist_sublist_lock(mls);
if (from_head)
hdr = multilist_sublist_next(mls, marker);
else
hdr = multilist_sublist_prev(mls, marker);
multilist_sublist_remove(mls, marker);
}

multilist_sublist_unlock(mls);
Expand All @@ -9274,9 +9279,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
break;
}

arc_state_free_marker(marker);

/* No buffers selected for writing? */
if (pio == NULL) {
ASSERT0(write_lsize);
ASSERT0(write_psize);
ASSERT(!HDR_HAS_L1HDR(head));
kmem_cache_free(hdr_l2only_cache, head);

Expand Down Expand Up @@ -10604,7 +10611,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
L2BLK_SET_TYPE((le)->le_prop, hdr->b_type);
L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr)));
L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr)));
L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state);
L2BLK_SET_STATE((le)->le_prop, hdr->b_l2hdr.b_arcs_state);

dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
HDR_GET_PSIZE(hdr));
Expand Down
2 changes: 1 addition & 1 deletion module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -769,7 +769,7 @@ static void
dbuf_evict_one(void)
{
int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
multilist_sublist_t *mls = multilist_sublist_lock(
multilist_sublist_t *mls = multilist_sublist_lock_idx(
&dbuf_caches[DB_DBUF_CACHE].cache, idx);

ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
Expand Down
Loading
Loading