Skip to content

Commit

Permalink
md: fix duplicate filename for rdev
Browse files Browse the repository at this point in the history
Commit 5792a28 ("[PATCH] md: avoid a deadlock when removing a device
from an md array via sysfs") delays the deletion of rdev, however, this
introduces a window that rdev can be added again while the deletion is
not done yet, and sysfs will complain about duplicate filename.

Follow up patches try to fix this problem by flushing workqueue, however,
flush_rdev_wq() is just dead code, the progress in
md_kick_rdev_from_array():

1) list_del_rcu(&rdev->same_set);
2) synchronize_rcu();
3) queue_work(md_rdev_misc_wq, &rdev->del_work);

So in flush_rdev_wq(), if rdev is found in the list, work_pending() can
never pass, in the meantime, if work is queued, then rdev can never be
found in the list.

flush_rdev_wq() can be replaced by flush_workqueue() directly, however,
this approach is not good:
- the workqueue is global, this synchronization for all raid disks is
  not necessary.
- flush_workqueue can't be called under 'reconfig_mutex', there is still
  a small window between flush_workqueue() and mddev_lock() that other
  contexts can queue new work, hence the problem is not solved completely.

sysfs already has apis to support delete itself through writer, and
these apis, specifically sysfs_break/unbreak_active_protection(), is used
to support deleting rdev synchronously. Therefore, the above commit can be
reverted, and sysfs duplicate filename can be avoided.

A new mdadm regression test is proposed as well([1]).

[1] https://lore.kernel.org/linux-raid/20230428062845.1975462-1-yukuai1@huaweicloud.com/

Fixes: 5792a28 ("[PATCH] md: avoid a deadlock when removing a device from an md array via sysfs")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230523012727.3042247-1-yukuai1@huaweicloud.com
  • Loading branch information
Yu Kuai authored and liu-song-6 committed Jun 13, 2023
1 parent f8b20a4 commit 3ce94ce
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 44 deletions.
86 changes: 44 additions & 42 deletions drivers/md/md.c
Original file line number Diff line number Diff line change
Expand Up @@ -87,11 +87,11 @@ static struct module *md_cluster_mod;
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;
static struct workqueue_struct *md_rdev_misc_wq;

static int remove_and_add_spares(struct mddev *mddev,
struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);

/*
* Default number of read corrections we'll attempt on an rdev
Expand Down Expand Up @@ -643,9 +643,11 @@ void mddev_init(struct mddev *mddev)
{
mutex_init(&mddev->open_mutex);
mutex_init(&mddev->reconfig_mutex);
mutex_init(&mddev->delete_mutex);
mutex_init(&mddev->bitmap_info.mutex);
INIT_LIST_HEAD(&mddev->disks);
INIT_LIST_HEAD(&mddev->all_mddevs);
INIT_LIST_HEAD(&mddev->deleting);
timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
atomic_set(&mddev->active, 1);
atomic_set(&mddev->openers, 0);
Expand Down Expand Up @@ -747,6 +749,24 @@ static void mddev_free(struct mddev *mddev)

static const struct attribute_group md_redundancy_group;

static void md_free_rdev(struct mddev *mddev)
{
struct md_rdev *rdev;
struct md_rdev *tmp;

mutex_lock(&mddev->delete_mutex);
if (list_empty(&mddev->deleting))
goto out;

list_for_each_entry_safe(rdev, tmp, &mddev->deleting, same_set) {
list_del_init(&rdev->same_set);
kobject_del(&rdev->kobj);
export_rdev(rdev, mddev);
}
out:
mutex_unlock(&mddev->delete_mutex);
}

void mddev_unlock(struct mddev *mddev)
{
if (mddev->to_remove) {
Expand Down Expand Up @@ -788,6 +808,8 @@ void mddev_unlock(struct mddev *mddev)
} else
mutex_unlock(&mddev->reconfig_mutex);

md_free_rdev(mddev);

/* As we've dropped the mutex we need a spinlock to
* make sure the thread doesn't disappear
*/
Expand Down Expand Up @@ -2428,13 +2450,6 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
return err;
}

static void rdev_delayed_delete(struct work_struct *ws)
{
struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
kobject_del(&rdev->kobj);
kobject_put(&rdev->kobj);
}

void md_autodetect_dev(dev_t dev);

/* just for claiming the bdev */
Expand All @@ -2455,6 +2470,8 @@ static void export_rdev(struct md_rdev *rdev, struct mddev *mddev)

static void md_kick_rdev_from_array(struct md_rdev *rdev)
{
struct mddev *mddev = rdev->mddev;

bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
list_del_rcu(&rdev->same_set);
pr_debug("md: unbind<%pg>\n", rdev->bdev);
Expand All @@ -2468,15 +2485,17 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev)
rdev->sysfs_unack_badblocks = NULL;
rdev->sysfs_badblocks = NULL;
rdev->badblocks.count = 0;
/* We need to delay this, otherwise we can deadlock when
* writing to 'remove' to "dev/state". We also need
* to delay it due to rcu usage.
*/

synchronize_rcu();
INIT_WORK(&rdev->del_work, rdev_delayed_delete);
kobject_get(&rdev->kobj);
queue_work(md_rdev_misc_wq, &rdev->del_work);
export_rdev(rdev, rdev->mddev);

/*
* kobject_del() will wait for all in progress writers to be done, where
* reconfig_mutex is held, hence it can't be called under
* reconfig_mutex and it's delayed to mddev_unlock().
*/
mutex_lock(&mddev->delete_mutex);
list_add(&rdev->same_set, &mddev->deleting);
mutex_unlock(&mddev->delete_mutex);
}

static void export_array(struct mddev *mddev)
Expand Down Expand Up @@ -3544,13 +3563,18 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
{
struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
struct kernfs_node *kn = NULL;
ssize_t rv;
struct mddev *mddev = rdev->mddev;

if (!entry->store)
return -EIO;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;

if (entry->store == state_store && cmd_match(page, "remove"))
kn = sysfs_break_active_protection(kobj, attr);

rv = mddev ? mddev_lock(mddev) : -ENODEV;
if (!rv) {
if (rdev->mddev == NULL)
Expand All @@ -3559,6 +3583,10 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
rv = entry->store(rdev, page, length);
mddev_unlock(mddev);
}

if (kn)
sysfs_unbreak_active_protection(kn);

return rv;
}

Expand Down Expand Up @@ -4484,20 +4512,6 @@ null_show(struct mddev *mddev, char *page)
return -EINVAL;
}

/* need to ensure rdev_delayed_delete() has completed */
static void flush_rdev_wq(struct mddev *mddev)
{
struct md_rdev *rdev;

rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
if (work_pending(&rdev->del_work)) {
flush_workqueue(md_rdev_misc_wq);
break;
}
rcu_read_unlock();
}

static ssize_t
new_dev_store(struct mddev *mddev, const char *buf, size_t len)
{
Expand Down Expand Up @@ -4525,7 +4539,6 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
minor != MINOR(dev))
return -EOVERFLOW;

flush_rdev_wq(mddev);
err = mddev_lock(mddev);
if (err)
return err;
Expand Down Expand Up @@ -5595,7 +5608,6 @@ struct mddev *md_alloc(dev_t dev, char *name)
* removed (mddev_delayed_delete).
*/
flush_workqueue(md_misc_wq);
flush_workqueue(md_rdev_misc_wq);

mutex_lock(&disks_mutex);
mddev = mddev_alloc(dev);
Expand Down Expand Up @@ -7558,9 +7570,6 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,

}

if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK)
flush_rdev_wq(mddev);

if (cmd == HOT_REMOVE_DISK)
/* need to ensure recovery thread has run */
wait_event_interruptible_timeout(mddev->sb_wait,
Expand Down Expand Up @@ -9623,10 +9632,6 @@ static int __init md_init(void)
if (!md_misc_wq)
goto err_misc_wq;

md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0);
if (!md_rdev_misc_wq)
goto err_rdev_misc_wq;

ret = __register_blkdev(MD_MAJOR, "md", md_probe);
if (ret < 0)
goto err_md;
Expand All @@ -9645,8 +9650,6 @@ static int __init md_init(void)
err_mdp:
unregister_blkdev(MD_MAJOR, "md");
err_md:
destroy_workqueue(md_rdev_misc_wq);
err_rdev_misc_wq:
destroy_workqueue(md_misc_wq);
err_misc_wq:
destroy_workqueue(md_wq);
Expand Down Expand Up @@ -9942,7 +9945,6 @@ static __exit void md_exit(void)
}
spin_unlock(&all_mddevs_lock);

destroy_workqueue(md_rdev_misc_wq);
destroy_workqueue(md_misc_wq);
destroy_workqueue(md_wq);
}
Expand Down
10 changes: 8 additions & 2 deletions drivers/md/md.h
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,6 @@ struct md_rdev {

struct serial_in_rdev *serial; /* used for raid1 io serialization */

struct work_struct del_work; /* used for delayed sysfs removal */

struct kernfs_node *sysfs_state; /* handle for 'state'
* sysfs entry */
/* handle for 'unacknowledged_bad_blocks' sysfs dentry */
Expand Down Expand Up @@ -531,6 +529,14 @@ struct mddev {
unsigned int good_device_nr; /* good device num within cluster raid */
unsigned int noio_flag; /* for memalloc scope API */

/*
* Temporarily store rdev that will be finally removed when
* reconfig_mutex is unlocked.
*/
struct list_head deleting;
/* Protect the deleting list */
struct mutex delete_mutex;

bool has_superblocks:1;
bool fail_last_dev:1;
bool serialize_policy:1;
Expand Down

0 comments on commit 3ce94ce

Please sign in to comment.