Skip to content

Commit

Permalink
xfs: fix deadlock between shrinker and fs freeze
Browse files Browse the repository at this point in the history
Orabug: 30657780

The shrinker holds the sb->s_umount lock and invokes .destroy_inode to
reclaim inodes. If the fs is frozen, the shrinker hangs on the freeze lock,
but the unfreeze can never happen because it in turn hangs on sb->s_umount.

The background inode inactivation feature could fix this, but it has not
been merged into mainline yet; according to Darrick, even once merged, it
would be nearly impossible to backport to 4.14. The effort here is to make a
one-off OFF-MAINLINE fix for uek; if a future uek has that feature merged,
this patch should be dropped.

To avoid the deadlock, add inodes needing inactivation to a list
and destroy them asynchronously.

 crash7latest> set 132
     PID: 132
 COMMAND: "kswapd0:0"
    TASK: ffff9cdc9dfb5f00  [THREAD_INFO: ffff9cdc9dfb5f00]
     CPU: 6
   STATE: TASK_UNINTERRUPTIBLE
 crash7latest> bt
 PID: 132    TASK: ffff9cdc9dfb5f00  CPU: 6   COMMAND: "kswapd0:0"
  #0 [ffffaa5d075bf900] __schedule at ffffffff8186487c
  #1 [ffffaa5d075bf998] schedule at ffffffff81864e96
  #2 [ffffaa5d075bf9b0] rwsem_down_read_failed at ffffffff818689ee
  #3 [ffffaa5d075bfa40] call_rwsem_down_read_failed at ffffffff81859308
  #4 [ffffaa5d075bfa90] __percpu_down_read at ffffffff810ebd38
  #5 [ffffaa5d075bfab0] __sb_start_write at ffffffff812859ef
  #6 [ffffaa5d075bfad0] xfs_trans_alloc at ffffffffc07ebe9c [xfs]
  #7 [ffffaa5d075bfb18] xfs_free_eofblocks at ffffffffc07c39d1 [xfs]
  #8 [ffffaa5d075bfb80] xfs_inactive at ffffffffc07de878 [xfs]
  #9 [ffffaa5d075bfba0] __dta_xfs_fs_destroy_inode_3543 at ffffffffc07e885e [xfs]
 #10 [ffffaa5d075bfbd0] destroy_inode at ffffffff812a25de
 #11 [ffffaa5d075bfbe8] evict at ffffffff812a2b73
 #12 [ffffaa5d075bfc10] dispose_list at ffffffff812a2c1d
 #13 [ffffaa5d075bfc38] prune_icache_sb at ffffffff812a421a
 #14 [ffffaa5d075bfc70] super_cache_scan at ffffffff812870a1
 #15 [ffffaa5d075bfcc8] shrink_slab at ffffffff811eebb3
 #16 [ffffaa5d075bfdb0] shrink_node at ffffffff811f4788
 #17 [ffffaa5d075bfe38] kswapd at ffffffff811f58c3
 #18 [ffffaa5d075bff08] kthread at ffffffff810b75d5
 #19 [ffffaa5d075bff50] ret_from_fork at ffffffff81a0035e
 crash7latest> set 31060
     PID: 31060
 COMMAND: "safefreeze"
    TASK: ffff9cd292868000  [THREAD_INFO: ffff9cd292868000]
     CPU: 2
   STATE: TASK_UNINTERRUPTIBLE
 crash7latest> bt
 PID: 31060  TASK: ffff9cd292868000  CPU: 2   COMMAND: "safefreeze"
  #0 [ffffaa5d10047c90] __schedule at ffffffff8186487c
  #1 [ffffaa5d10047d28] schedule at ffffffff81864e96
  #2 [ffffaa5d10047d40] rwsem_down_write_failed at ffffffff81868f18
  #3 [ffffaa5d10047dd8] call_rwsem_down_write_failed at ffffffff81859367
  #4 [ffffaa5d10047e20] down_write at ffffffff81867cfd
  #5 [ffffaa5d10047e38] thaw_super at ffffffff81285d2d
  #6 [ffffaa5d10047e60] do_vfs_ioctl at ffffffff81299566
  #7 [ffffaa5d10047ee8] sys_ioctl at ffffffff81299709
  #8 [ffffaa5d10047f28] do_syscall_64 at ffffffff81003949
  #9 [ffffaa5d10047f50] entry_SYSCALL_64_after_hwframe at ffffffff81a001ad
     RIP: 0000000000453d67  RSP: 00007ffff9c1ce78  RFLAGS: 00000206
     RAX: ffffffffffffffda  RBX: 0000000001cbe92c  RCX: 0000000000453d67
     RDX: 0000000000000000  RSI: 00000000c0045878  RDI: 0000000000000014
     RBP: 00007ffff9c1cf80   R8: 0000000000000000   R9: 0000000000000012
     R10: 0000000000000008  R11: 0000000000000206  R12: 0000000000401fb0
     R13: 0000000000402040  R14: 0000000000000000  R15: 0000000000000000
     ORIG_RAX: 0000000000000010  CS: 0033  SS: 002b

Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Somasundaram Krishnasamy <somasundaram.krishnasamy@oracle.com>
  • Loading branch information
biger410 authored and Somasundaram Krishnasamy committed Feb 22, 2020
1 parent 0f9d74c commit dee04fc
Show file tree
Hide file tree
Showing 8 changed files with 186 additions and 22 deletions.
35 changes: 22 additions & 13 deletions fs/super.c
Original file line number Diff line number Diff line change
Expand Up @@ -1509,21 +1509,12 @@ int freeze_super(struct super_block *sb)
}
EXPORT_SYMBOL(freeze_super);

/**
* thaw_super -- unlock filesystem
* @sb: the super to thaw
*
* Unlocks the filesystem and marks it writeable again after freeze_super().
*/
int thaw_super(struct super_block *sb)
int __thaw_super(struct super_block *sb)
{
int error;

down_write(&sb->s_umount);
if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) {
up_write(&sb->s_umount);
if (sb->s_writers.frozen != SB_FREEZE_COMPLETE)
return -EINVAL;
}

if (sb_rdonly(sb)) {
sb->s_writers.frozen = SB_UNFROZEN;
Expand All @@ -1538,7 +1529,6 @@ int thaw_super(struct super_block *sb)
printk(KERN_ERR
"VFS:Filesystem thaw failed\n");
lockdep_sb_freeze_release(sb);
up_write(&sb->s_umount);
return error;
}
}
Expand All @@ -1547,7 +1537,26 @@ int thaw_super(struct super_block *sb)
sb_freeze_unlock(sb);
out:
wake_up(&sb->s_writers.wait_unfrozen);
deactivate_locked_super(sb);
return 0;
}
EXPORT_SYMBOL(__thaw_super);

/**
 * thaw_super -- unlock filesystem
 * @sb: the super to thaw
 *
 * Locking wrapper around __thaw_super(): takes @sb->s_umount for writing,
 * runs the thaw, and then either drops the lock itself (failure) or passes
 * the still-locked superblock to deactivate_locked_super() (success).
 *
 * Returns 0 on success, otherwise the error reported by __thaw_super()
 * (for example -EINVAL when the superblock is not in SB_FREEZE_COMPLETE).
 */
int thaw_super(struct super_block *sb)
{
	int ret;

	down_write(&sb->s_umount);
	ret = __thaw_super(sb);
	if (!ret) {
		/* Success: the callee is handed the locked superblock. */
		deactivate_locked_super(sb);
		return 0;
	}

	/* Thaw failed: we still own s_umount, so release it here. */
	up_write(&sb->s_umount);
	return ret;
}
EXPORT_SYMBOL(thaw_super);
1 change: 1 addition & 0 deletions fs/xfs/xfs_icache.c
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ xfs_inode_alloc(
ip->i_flags = 0;
ip->i_delayed_blks = 0;
memset(&ip->i_d, 0, sizeof(ip->i_d));
INIT_LIST_HEAD(&ip->i_inact_list);

return ip;
}
Expand Down
2 changes: 2 additions & 0 deletions fs/xfs/xfs_inode.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ typedef struct xfs_inode {

/* VFS inode */
struct inode i_vnode; /* embedded VFS inode */

struct list_head i_inact_list;
} xfs_inode_t;

/* Convert from vfs inode to xfs inode */
Expand Down
5 changes: 5 additions & 0 deletions fs/xfs/xfs_mount.c
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,10 @@ xfs_initialize_perag(
if (radix_tree_preload(GFP_NOFS))
goto out_hash_destroy;

INIT_WORK(&pag->pag_inact_work, xfs_fs_inact_worker);
INIT_LIST_HEAD(&pag->pag_inact_list);
spin_lock_init(&pag->pag_inact_lock);

spin_lock(&mp->m_perag_lock);
if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
BUG();
Expand Down Expand Up @@ -1078,6 +1082,7 @@ xfs_unmountfs(

cancel_delayed_work_sync(&mp->m_eofblocks_work);
cancel_delayed_work_sync(&mp->m_cowblocks_work);
flush_workqueue(mp->m_inact_workqueue);

xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
Expand Down
6 changes: 6 additions & 0 deletions fs/xfs/xfs_mount.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@ typedef struct xfs_mount {
struct workqueue_struct *m_log_workqueue;
struct workqueue_struct *m_eofblocks_workqueue;
struct workqueue_struct *m_sync_workqueue;
struct workqueue_struct *m_inact_workqueue;

/*
* Generation of the filesysyem layout. This is incremented by each
Expand Down Expand Up @@ -397,6 +398,11 @@ typedef struct xfs_perag {

/* reference count */
uint8_t pagf_refcount_level;

/* For inode inactivation */
struct work_struct pag_inact_work;
struct list_head pag_inact_list;
spinlock_t pag_inact_lock;
} xfs_perag_t;

static inline struct xfs_ag_resv *
Expand Down
156 changes: 147 additions & 9 deletions fs/xfs/xfs_super.c
Original file line number Diff line number Diff line change
Expand Up @@ -865,6 +865,24 @@ xfs_setup_devices(
return 0;
}

/*
 * Allocate the per-mount workqueue ("xfs-inact/<fsname>") on which deferred
 * inode inactivation work is run. The max_active argument comes from
 * xfs_guess_metadata_threads(mp).
 *
 * Returns 0 on success or -ENOMEM if the workqueue could not be allocated.
 *
 * NOTE(review): work is queued here from the inode-destroy path, which the
 * shrinker can drive; confirm whether WQ_MEM_RECLAIM should be set as well.
 */
STATIC int
xfs_init_inact_workqueue(
	struct xfs_mount	*mp)
{
	mp->m_inact_workqueue = alloc_workqueue("xfs-inact/%s", WQ_FREEZABLE,
			xfs_guess_metadata_threads(mp), mp->m_fsname);

	return mp->m_inact_workqueue ? 0 : -ENOMEM;
}

/*
 * Tear down the inode inactivation workqueue created by
 * xfs_init_inact_workqueue().
 */
STATIC void
xfs_destroy_inact_workqueue(
	struct xfs_mount	*mp)
{
	struct workqueue_struct	*inact_wq = mp->m_inact_workqueue;

	destroy_workqueue(inact_wq);
}

STATIC int
xfs_init_mount_workqueues(
struct xfs_mount *mp)
Expand Down Expand Up @@ -971,12 +989,8 @@ xfs_fs_alloc_inode(
return NULL;
}

/*
* Now that the generic code is guaranteed not to be accessing
* the linux inode, we can inactivate and reclaim the inode.
*/
STATIC void
xfs_fs_destroy_inode(
_xfs_fs_destroy_inode(
struct inode *inode)
{
struct xfs_inode *ip = XFS_I(inode);
Expand Down Expand Up @@ -1017,6 +1031,87 @@ xfs_fs_destroy_inode(
xfs_inode_set_reclaim_tag(ip);
}

/*
 * Now that the generic code is guaranteed not to be accessing
 * the linux inode, we can inactivate and reclaim the inode.
 *
 * Inodes that do not need inactivation are torn down synchronously via
 * _xfs_fs_destroy_inode(). Inodes that do need it are appended to the
 * inactivation list of their AG and handed to the per-mount inactivation
 * workqueue, so the caller never runs the inactivation itself.
 */
STATIC void
xfs_fs_destroy_inode(
	struct inode		*inode)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;

	/* Nothing to inactivate: destroy the inode right away. */
	if (!xfs_inode_needs_inactivation(ip)) {
		_xfs_fs_destroy_inode(inode);
		return;
	}

	/* Defer: park the inode on its AG's list and kick the worker. */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));

	spin_lock(&pag->pag_inact_lock);
	list_add_tail(&ip->i_inact_list, &pag->pag_inact_list);
	spin_unlock(&pag->pag_inact_lock);

	queue_work(mp->m_inact_workqueue, &pag->pag_inact_work);
	xfs_perag_put(pag);
}

void
xfs_fs_inact_worker(
struct work_struct *work)
{
struct xfs_perag *pag = container_of(work,
struct xfs_perag, pag_inact_work);
struct list_head list;
struct xfs_inode *ip;
struct xfs_inode *next_ip;
struct xfs_mount *mp;

mp = pag->pag_mount;
while (1) {
/* fs freezed, return to avoid hung task, requeue at thaw. */
if (!sb_start_write_trylock(mp->m_super))
return;

spin_lock(&pag->pag_inact_lock);
if (list_empty(&pag->pag_inact_list)) {
spin_unlock(&pag->pag_inact_lock);
sb_end_write(mp->m_super);
return;
}
list_replace_init(&pag->pag_inact_list, &list);
spin_unlock(&pag->pag_inact_lock);

list_for_each_entry_safe(ip, next_ip, &list, i_inact_list) {
list_del_init(&ip->i_inact_list);
_xfs_fs_destroy_inode(&ip->i_vnode);
cond_resched();
}
sb_end_write(mp->m_super);
}
}

/*
 * Walk every AG of the mount and queue its inactivation work item if the
 * AG still has inodes sitting on its inactivation list. AGs with an empty
 * list are left alone.
 */
STATIC void
xfs_fs_requeue_inact_work(
	struct xfs_mount	*mp)
{
	xfs_agnumber_t		agno;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		struct xfs_perag	*pag = xfs_perag_get(mp, agno);
		int			pending;

		/* Sample the list state under the lock, act on it afterwards. */
		spin_lock(&pag->pag_inact_lock);
		pending = !list_empty(&pag->pag_inact_list);
		spin_unlock(&pag->pag_inact_lock);

		if (pending)
			queue_work(mp->m_inact_workqueue,
				   &pag->pag_inact_work);

		xfs_perag_put(pag);
	}
}

static void
xfs_fs_dirty_inode(
struct inode *inode,
Expand Down Expand Up @@ -1445,6 +1540,7 @@ xfs_fs_remount(
* final log force+buftarg wait and deadlock the remount.
*/
cancel_delayed_work_sync(&mp->m_eofblocks_work);
flush_workqueue(mp->m_inact_workqueue);

xfs_quiesce_attr(mp);
mp->m_flags |= XFS_MOUNT_RDONLY;
Expand All @@ -1453,6 +1549,19 @@ xfs_fs_remount(
return 0;
}

STATIC int
xfs_fs_freeze_super(struct super_block *sb)
{
struct xfs_mount *mp = XFS_M(sb);

/*
* clean up inactive inodes before freezing to minimize
* the amount of recovery work if we crash while frozen.
*/
flush_workqueue(mp->m_inact_workqueue);
return freeze_super(sb);
}

/*
* Second stage of a freeze. The data is already frozen so we only
* need to take care of the metadata. Once that's done sync the superblock
Expand All @@ -1470,6 +1579,25 @@ xfs_fs_freeze(
return xfs_sync_sb(mp, true);
}

/*
 * ->thaw_super implementation. Takes sb->s_umount for writing and runs
 * __thaw_super(). On failure the lock is dropped and the error returned.
 * On success, inactivation work is requeued for every AG that still has
 * pending inodes, and the locked superblock is passed to
 * deactivate_locked_super().
 */
STATIC int
xfs_fs_thaw_super(
	struct super_block	*sb)
{
	struct xfs_mount	*mp = XFS_M(sb);
	int			ret;

	down_write(&sb->s_umount);
	ret = __thaw_super(sb);
	if (ret) {
		up_write(&sb->s_umount);
		return ret;
	}

	/* Inactivation work was skipped while the fs was frozen; requeue it. */
	xfs_fs_requeue_inact_work(mp);
	deactivate_locked_super(sb);
	return 0;
}

STATIC int
xfs_fs_unfreeze(
struct super_block *sb)
Expand Down Expand Up @@ -1666,17 +1794,22 @@ xfs_fs_fill_super(
if (error)
goto out_free_stats;

error = xfs_finish_flags(mp);
/* worker thread number depends on agcount. */
error = xfs_init_inact_workqueue(mp);
if (error)
goto out_free_sb;

error = xfs_finish_flags(mp);
if (error)
goto out_destroy_inact_workqueue;

error = xfs_setup_devices(mp);
if (error)
goto out_free_sb;
goto out_destroy_inact_workqueue;

error = xfs_filestream_mount(mp);
if (error)
goto out_free_sb;
goto out_destroy_inact_workqueue;

/*
* we must configure the block size in the superblock before we run the
Expand Down Expand Up @@ -1765,6 +1898,8 @@ xfs_fs_fill_super(

out_filestream_unmount:
xfs_filestream_unmount(mp);
out_destroy_inact_workqueue:
xfs_destroy_inact_workqueue(mp);
out_free_sb:
xfs_freesb(mp);
out_free_stats:
Expand All @@ -1784,7 +1919,7 @@ xfs_fs_fill_super(
out_unmount:
xfs_filestream_unmount(mp);
xfs_unmountfs(mp);
goto out_free_sb;
goto out_destroy_inact_workqueue;
}

STATIC void
Expand All @@ -1801,6 +1936,7 @@ xfs_fs_put_super(
free_percpu(mp->m_stats.xs_stats);
xfs_destroy_percpu_counters(mp);
xfs_destroy_mount_workqueues(mp);
xfs_destroy_inact_workqueue(mp);
xfs_close_devices(mp);
xfs_free_fsname(mp);
kfree(mp);
Expand Down Expand Up @@ -1839,7 +1975,9 @@ static const struct super_operations xfs_super_operations = {
.drop_inode = xfs_fs_drop_inode,
.put_super = xfs_fs_put_super,
.sync_fs = xfs_fs_sync_fs,
.freeze_super = xfs_fs_freeze_super,
.freeze_fs = xfs_fs_freeze,
.thaw_super = xfs_fs_thaw_super,
.unfreeze_fs = xfs_fs_unfreeze,
.statfs = xfs_fs_statfs,
.remount_fs = xfs_fs_remount,
Expand Down
2 changes: 2 additions & 0 deletions fs/xfs/xfs_super.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,14 @@ struct xfs_inode;
struct xfs_mount;
struct xfs_buftarg;
struct block_device;
struct work_struct;

extern void xfs_quiesce_attr(struct xfs_mount *mp);
extern void xfs_flush_inodes(struct xfs_mount *mp);
extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
extern xfs_agnumber_t xfs_set_inode_alloc(struct xfs_mount *,
xfs_agnumber_t agcount);
extern void xfs_fs_inact_worker(struct work_struct *work);

extern const struct export_operations xfs_export_operations;
extern const struct xattr_handler *xfs_xattr_handlers[];
Expand Down
1 change: 1 addition & 0 deletions include/linux/fs.h
Original file line number Diff line number Diff line change
Expand Up @@ -2166,6 +2166,7 @@ extern int user_statfs(const char __user *, struct kstatfs *);
extern int fd_statfs(int, struct kstatfs *);
extern int vfs_ustat(dev_t, struct kstatfs *);
extern int freeze_super(struct super_block *super);
extern int __thaw_super(struct super_block *super);
extern int thaw_super(struct super_block *super);
extern bool our_mnt(struct vfsmount *mnt);
extern __printf(2, 3)
Expand Down

0 comments on commit dee04fc

Please sign in to comment.