From 77c1ceb7dd1e4e5c77b6f98aa653d2e251de349d Mon Sep 17 00:00:00 2001 From: Pavel Snajdr Date: Thu, 3 Sep 2020 11:30:39 +0200 Subject: [PATCH] Fix zfs_fsync deadlock There are three locks at play, when handling ZIL records. First is a zl_issuer_lock, that is supposed to be held, whenever we're issuing a new record to ZIL. Then there is zl_lock to actually protect the in the zilog struct. And then there's a last lock, zcw_lock, which is used to protect the lists of ZIL call-back waiters. The order of locking is supposed to be: zl_issuer_lock -> zcw_lock -> zl_lock; as implied their usage in zil_alloc_lwb and zil_commit_waiter_timeout functions. Function zil_commit_waiter_link_lwb goes against this, it is expecting to be entered with zl_lock already held and only then it is taking the zcw_lock; This patch straightens the locking in zil_commit_waiter_link_lwb to take the zl_lock on its own, correcting the locking order. Following is an attached messsage from the Linux lockdep mechanism describing the potential deadlock - in our production, it was a matter of a few hours to hit this one... (redacted to fit 72 cols) systemd-journal/5561 is trying to acquire lock: (&zilog->zl_lock){+.+.}-{4:4}, at: zil_alloc_lwb+0x1df/0x3e0 but task is already holding lock: (&zcw->zcw_lock){+.+.}-{4:4}, at: zil_commit_impl+0x52b/0x1850 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&zcw->zcw_lock){+.+.}-{4:4}: __mutex_lock+0xac/0x9e0 mutex_lock_nested+0x1b/0x20 zil_commit_waiter_link_lwb+0x51/0x1a0 [zfs] zil_commit_impl+0x12b0/0x1850 [zfs] zil_commit+0x43/0x60 [zfs] zpl_writepages+0xf8/0x1a0 [zfs] do_writepages+0x43/0xf0 __filemap_fdatawrite_range+0xd5/0x110 filemap_write_and_wait_range+0x4b/0xb0 zpl_fsync+0x4d/0xb0 [zfs] vfs_fsync_range+0x49/0x80 do_fsync+0x3d/0x70 __x64_sys_fsync+0x14/0x20 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&zilog->zl_lock){+.+.}-{4:4}: __lock_acquire+0x12b6/0x2480 lock_acquire+0xab/0x380 __mutex_lock+0xac/0x9e0 mutex_lock_nested+0x1b/0x20 zil_alloc_lwb+0x1df/0x3e0 [zfs] zil_lwb_write_issue+0x265/0x3f0 [zfs] zil_commit_impl+0x577/0x1850 [zfs] zil_commit+0x43/0x60 [zfs] zpl_writepages+0xf8/0x1a0 [zfs] do_writepages+0x43/0xf0 __filemap_fdatawrite_range+0xd5/0x110 filemap_write_and_wait_range+0x4b/0xb0 zpl_fsync+0x4d/0xb0 [zfs] vfs_fsync_range+0x49/0x80 do_fsync+0x3d/0x70 __x64_sys_fsync+0x14/0x20 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&zcw->zcw_lock); lock(&zilog->zl_lock); lock(&zcw->zcw_lock); lock(&zilog->zl_lock); *** DEADLOCK *** 2 locks held by systemd-journal/5561: #0: (&zilog->zl_issuer_lock){+.+.}-{4:4}, at: zil_commit_impl+0x4dc... #1: (&zcw->zcw_lock){+.+.}-{4:4}, at: zil_commit_impl+0x52b/0x1850 Fixes #7038 Fixes #10440 Signed-off-by: Pavel Snajdr --- module/zfs/zil.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 9dc20ba14f37..41c80bb7eca1 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -998,11 +998,8 @@ zil_commit_waiter_skip(zil_commit_waiter_t *zcw) static void zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) { - /* - * The lwb_waiters field of the lwb is protected by the zilog's - * zl_lock, thus it must be held when calling this function. - */ - ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); + ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_issuer_lock)); + ASSERT(!MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); mutex_enter(&zcw->zcw_lock); ASSERT(!list_link_active(&zcw->zcw_node)); @@ -1012,8 +1009,10 @@ zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) lwb->lwb_state == LWB_STATE_ISSUED || lwb->lwb_state == LWB_STATE_WRITE_DONE); + mutex_enter(&lwb->lwb_zilog->zl_lock); list_insert_tail(&lwb->lwb_waiters, zcw); zcw->zcw_lwb = lwb; + mutex_exit(&lwb->lwb_zilog->zl_lock); mutex_exit(&zcw->zcw_lock); } @@ -1635,10 +1634,8 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) * For more details, see the comment above zil_commit(). */ if (lrc->lrc_txtype == TX_COMMIT) { - mutex_enter(&zilog->zl_lock); zil_commit_waiter_link_lwb(itx->itx_private, lwb); itx->itx_private = NULL; - mutex_exit(&zilog->zl_lock); return (lwb); }