From 526dd6d7877b80b1f56d87156b65b8227c69d59f Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 15 Nov 2023 13:17:10 +0100 Subject: [PATCH 01/14] devlink: Move private netlink flags to C file The flags are not used outside of the C file so move them there. Suggested-by: Jiri Pirko Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: Petr Machata Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/devlink/devl_internal.h | 3 --- net/devlink/netlink.c | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 183dbe3807ab3..2a9b263300a4b 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -111,9 +111,6 @@ int devlink_rel_devlink_handle_put(struct sk_buff *msg, struct devlink *devlink, bool *msg_updated); /* Netlink */ -#define DEVLINK_NL_FLAG_NEED_PORT BIT(0) -#define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1) - enum devlink_multicast_groups { DEVLINK_MCGRP_CONFIG, }; diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c index d0b90ebc8b152..7350138c8bb44 100644 --- a/net/devlink/netlink.c +++ b/net/devlink/netlink.c @@ -9,6 +9,9 @@ #include "devl_internal.h" +#define DEVLINK_NL_FLAG_NEED_PORT BIT(0) +#define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1) + static const struct genl_multicast_group devlink_nl_mcgrps[] = { [DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME }, }; From e21c52d7814f5768f05224e773644629fe124af2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 15 Nov 2023 13:17:11 +0100 Subject: [PATCH 02/14] devlink: Acquire device lock during netns dismantle Device drivers register with devlink from their probe routines (under the device lock) by acquiring the devlink instance lock and calling devl_register(). Drivers that support a devlink reload usually implement the reload_{down, up}() operations in a similar fashion to their remove and probe routines, respectively. However, while the remove and probe routines are invoked with the device lock held, the reload operations are only invoked with the devlink instance lock held. It is therefore impossible for drivers to acquire the device lock from their reload operations, as this would result in lock inversion. The motivating use case for invoking the reload operations with the device lock held is in mlxsw which needs to trigger a PCI reset as part of the reload. The driver cannot call pci_reset_function() as this function acquires the device lock. Instead, it needs to call __pci_reset_function_locked which expects the device lock to be held. To that end, adjust devlink to always acquire the device lock before the devlink instance lock when performing a reload. For now, only do that when reload is triggered as part of netns dismantle. Subsequent patches will handle the case where reload is explicitly triggered by user space. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: Petr Machata Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/devlink/core.c | 4 ++-- net/devlink/devl_internal.h | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/net/devlink/core.c b/net/devlink/core.c index 6984877e9f10d..4275a2bc6d8e0 100644 --- a/net/devlink/core.c +++ b/net/devlink/core.c @@ -503,14 +503,14 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net) * all devlink instances from this namespace into init_net. */ devlinks_xa_for_each_registered_get(net, index, devlink) { - devl_lock(devlink); + devl_dev_lock(devlink, true); err = 0; if (devl_is_registered(devlink)) err = devlink_reload(devlink, &init_net, DEVLINK_RELOAD_ACTION_DRIVER_REINIT, DEVLINK_RELOAD_LIMIT_UNSPEC, &actions_performed, NULL); - devl_unlock(devlink); + devl_dev_unlock(devlink, true); devlink_put(devlink); if (err && err != -EOPNOTSUPP) pr_warn("Failed to reload devlink instance into init_net\n"); diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 2a9b263300a4b..178abaf74a107 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -3,6 +3,7 @@ * Copyright (c) 2016 Jiri Pirko */ +#include #include #include #include @@ -96,6 +97,20 @@ static inline bool devl_is_registered(struct devlink *devlink) return xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); } +static inline void devl_dev_lock(struct devlink *devlink, bool dev_lock) +{ + if (dev_lock) + device_lock(devlink->dev); + devl_lock(devlink); +} + +static inline void devl_dev_unlock(struct devlink *devlink, bool dev_lock) +{ + devl_unlock(devlink); + if (dev_lock) + device_unlock(devlink->dev); +} + typedef void devlink_rel_notify_cb_t(struct devlink *devlink, u32 obj_index); typedef void devlink_rel_cleanup_cb_t(struct devlink *devlink, u32 obj_index, u32 rel_index); From c8d0a7d6152bec970552786b77626f4b4c562f4d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 15 Nov 2023 13:17:12 +0100 Subject: [PATCH 03/14] devlink: Enable the use of private flags in post_doit operations Currently, private flags (e.g., 'DEVLINK_NL_FLAG_NEED_PORT') are only used in pre_doit operations, but a subsequent patch will need to conditionally lock and unlock the device lock in pre and post doit operations, respectively. As a preparation, enable the use of private flags in post_doit operations in a similar fashion to how it is done for pre_doit operations. No functional changes intended. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: Petr Machata Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/devlink/netlink.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c index 7350138c8bb44..5bb6624f3288e 100644 --- a/net/devlink/netlink.c +++ b/net/devlink/netlink.c @@ -141,8 +141,8 @@ int devlink_nl_pre_doit_port_optional(const struct genl_split_ops *ops, return __devlink_nl_pre_doit(skb, info, DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT); } -void devlink_nl_post_doit(const struct genl_split_ops *ops, - struct sk_buff *skb, struct genl_info *info) +static void __devlink_nl_post_doit(struct sk_buff *skb, struct genl_info *info, + u8 flags) { struct devlink *devlink; @@ -151,6 +151,12 @@ void devlink_nl_post_doit(const struct genl_split_ops *ops, devlink_put(devlink); } +void devlink_nl_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + __devlink_nl_post_doit(skb, info, 0); +} + static int devlink_nl_inst_single_dumpit(struct sk_buff *msg, struct netlink_callback *cb, int flags, devlink_nl_dump_one_func_t *dump_one, From d32c38256db30a2d55b849e2c77342bc70d58c6e Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 15 Nov 2023 13:17:13 +0100 Subject: [PATCH 04/14] devlink: Allow taking device lock in pre_doit operations Introduce a new private flag ('DEVLINK_NL_FLAG_NEED_DEV_LOCK') to allow netlink commands to specify that they need to acquire the device lock in their pre_doit operation and release it in their post_doit operation. The reload command will use this flag in the subsequent patch. No functional changes intended. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: Petr Machata Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/devlink/devl_internal.h | 3 ++- net/devlink/health.c | 3 ++- net/devlink/netlink.c | 19 ++++++++++++------- net/devlink/region.c | 3 ++- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 178abaf74a107..5ea2e2012e930 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -152,7 +152,8 @@ typedef int devlink_nl_dump_one_func_t(struct sk_buff *msg, int flags); struct devlink * -devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs); +devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs, + bool dev_lock); int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb, devlink_nl_dump_one_func_t *dump_one); diff --git a/net/devlink/health.c b/net/devlink/health.c index 695df61f8ac2a..71ae121dc739d 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -1151,7 +1151,8 @@ devlink_health_reporter_get_from_cb_lock(struct netlink_callback *cb) struct nlattr **attrs = info->attrs; struct devlink *devlink; - devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs); + devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs, + false); if (IS_ERR(devlink)) return NULL; diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c index 5bb6624f3288e..86f12531bf998 100644 --- a/net/devlink/netlink.c +++ b/net/devlink/netlink.c @@ -11,6 +11,7 @@ #define DEVLINK_NL_FLAG_NEED_PORT BIT(0) #define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1) +#define DEVLINK_NL_FLAG_NEED_DEV_LOCK BIT(2) static const struct genl_multicast_group devlink_nl_mcgrps[] = { [DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME }, @@ -64,7 +65,8 @@ int devlink_nl_msg_reply_and_new(struct sk_buff **msg, struct genl_info *info) } struct devlink * -devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs) +devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs, + bool dev_lock) { struct devlink *devlink; unsigned long index; @@ -78,12 +80,12 @@ devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs) devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]); devlinks_xa_for_each_registered_get(net, index, devlink) { - devl_lock(devlink); + devl_dev_lock(devlink, dev_lock); if (devl_is_registered(devlink) && strcmp(devlink->dev->bus->name, busname) == 0 && strcmp(dev_name(devlink->dev), devname) == 0) return devlink; - devl_unlock(devlink); + devl_dev_unlock(devlink, dev_lock); devlink_put(devlink); } @@ -93,11 +95,13 @@ devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs) static int __devlink_nl_pre_doit(struct sk_buff *skb, struct genl_info *info, u8 flags) { + bool dev_lock = flags & DEVLINK_NL_FLAG_NEED_DEV_LOCK; struct devlink_port *devlink_port; struct devlink *devlink; int err; - devlink = devlink_get_from_attrs_lock(genl_info_net(info), info->attrs); + devlink = devlink_get_from_attrs_lock(genl_info_net(info), info->attrs, + dev_lock); if (IS_ERR(devlink)) return PTR_ERR(devlink); @@ -117,7 +121,7 @@ static int __devlink_nl_pre_doit(struct sk_buff *skb, struct genl_info *info, return 0; unlock: - devl_unlock(devlink); + devl_dev_unlock(devlink, dev_lock); devlink_put(devlink); return err; } @@ -144,10 +148,11 @@ int devlink_nl_pre_doit_port_optional(const struct genl_split_ops *ops, static void __devlink_nl_post_doit(struct sk_buff *skb, struct genl_info *info, u8 flags) { + bool dev_lock = flags & DEVLINK_NL_FLAG_NEED_DEV_LOCK; struct devlink *devlink; devlink = info->user_ptr[0]; - devl_unlock(devlink); + devl_dev_unlock(devlink, dev_lock); devlink_put(devlink); } @@ -165,7 +170,7 @@ static int devlink_nl_inst_single_dumpit(struct sk_buff *msg, struct devlink *devlink; int err; - devlink = devlink_get_from_attrs_lock(sock_net(msg->sk), attrs); + devlink = devlink_get_from_attrs_lock(sock_net(msg->sk), attrs, false); if (IS_ERR(devlink)) return PTR_ERR(devlink); err = dump_one(msg, devlink, cb, flags | NLM_F_DUMP_FILTERED); diff --git a/net/devlink/region.c b/net/devlink/region.c index 0aab7b82d6780..e3bab458db940 100644 --- a/net/devlink/region.c +++ b/net/devlink/region.c @@ -883,7 +883,8 @@ int devlink_nl_region_read_dumpit(struct sk_buff *skb, start_offset = state->start_offset; - devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs); + devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs, + false); if (IS_ERR(devlink)) return PTR_ERR(devlink); From bf6b200bc80d18480f8d0fb61e185bb0587e633c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 15 Nov 2023 13:17:14 +0100 Subject: [PATCH 05/14] devlink: Acquire device lock during reload command Device drivers register with devlink from their probe routines (under the device lock) by acquiring the devlink instance lock and calling devl_register(). Drivers that support a devlink reload usually implement the reload_{down, up}() operations in a similar fashion to their remove and probe routines, respectively. However, while the remove and probe routines are invoked with the device lock held, the reload operations are only invoked with the devlink instance lock held. It is therefore impossible for drivers to acquire the device lock from their reload operations, as this would result in lock inversion. The motivating use case for invoking the reload operations with the device lock held is in mlxsw which needs to trigger a PCI reset as part of the reload. The driver cannot call pci_reset_function() as this function acquires the device lock. Instead, it needs to call __pci_reset_function_locked which expects the device lock to be held. To that end, adjust devlink to always acquire the device lock before the devlink instance lock when performing a reload. Do that when reload is explicitly triggered by user space by specifying the 'DEVLINK_NL_FLAG_NEED_DEV_LOCK' flag in the pre_doit and post_doit operations of the reload command. A previous patch already handled the case where reload is invoked as part of netns dismantle. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: Petr Machata Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- Documentation/netlink/specs/devlink.yaml | 4 ++-- net/devlink/netlink.c | 13 +++++++++++++ net/devlink/netlink_gen.c | 4 ++-- net/devlink/netlink_gen.h | 5 +++++ 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml index 572d83a414d0d..43067e1f63aa9 100644 --- a/Documentation/netlink/specs/devlink.yaml +++ b/Documentation/netlink/specs/devlink.yaml @@ -1484,8 +1484,8 @@ operations: dont-validate: [ strict ] flags: [ admin-perm ] do: - pre: devlink-nl-pre-doit - post: devlink-nl-post-doit + pre: devlink-nl-pre-doit-dev-lock + post: devlink-nl-post-doit-dev-lock request: attributes: - bus-name diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c index 86f12531bf998..fa9afe3e6d9b2 100644 --- a/net/devlink/netlink.c +++ b/net/devlink/netlink.c @@ -138,6 +138,12 @@ int devlink_nl_pre_doit_port(const struct genl_split_ops *ops, return __devlink_nl_pre_doit(skb, info, DEVLINK_NL_FLAG_NEED_PORT); } +int devlink_nl_pre_doit_dev_lock(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + return __devlink_nl_pre_doit(skb, info, DEVLINK_NL_FLAG_NEED_DEV_LOCK); +} + int devlink_nl_pre_doit_port_optional(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) @@ -162,6 +168,13 @@ void devlink_nl_post_doit(const struct genl_split_ops *ops, __devlink_nl_post_doit(skb, info, 0); } +void +devlink_nl_post_doit_dev_lock(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + __devlink_nl_post_doit(skb, info, DEVLINK_NL_FLAG_NEED_DEV_LOCK); +} + static int devlink_nl_inst_single_dumpit(struct sk_buff *msg, struct netlink_callback *cb, int flags, devlink_nl_dump_one_func_t *dump_one, diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c index 788dfdc498a95..95f9b4350ab71 100644 --- a/net/devlink/netlink_gen.c +++ b/net/devlink/netlink_gen.c @@ -846,9 +846,9 @@ const struct genl_split_ops devlink_nl_ops[73] = { { .cmd = DEVLINK_CMD_RELOAD, .validate = GENL_DONT_VALIDATE_STRICT, - .pre_doit = devlink_nl_pre_doit, + .pre_doit = devlink_nl_pre_doit_dev_lock, .doit = devlink_nl_reload_doit, - .post_doit = devlink_nl_post_doit, + .post_doit = devlink_nl_post_doit_dev_lock, .policy = devlink_reload_nl_policy, .maxattr = DEVLINK_ATTR_RELOAD_LIMITS, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h index 0e9e89c31c314..02f3c0bfae0ee 100644 --- a/net/devlink/netlink_gen.h +++ b/net/devlink/netlink_gen.h @@ -22,12 +22,17 @@ int devlink_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); int devlink_nl_pre_doit_port(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); +int devlink_nl_pre_doit_dev_lock(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); int devlink_nl_pre_doit_port_optional(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); void devlink_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); +void +devlink_nl_post_doit_dev_lock(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); int devlink_nl_get_doit(struct sk_buff *skb, struct genl_info *info); int devlink_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); From 527a07e176eab0f61b1beec9e29b99c9a5ec219f Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 15 Nov 2023 13:17:15 +0100 Subject: [PATCH 06/14] devlink: Add device lock assert in reload operation Add an assert to verify that the device lock is always held throughout reload operations. Tested the following flows with netdevsim and mlxsw while lockdep is enabled: netdevsim: # echo "10 1" > /sys/bus/netdevsim/new_device # devlink dev reload netdevsim/netdevsim10 # ip netns add bla # devlink dev reload netdevsim/netdevsim10 netns bla # ip netns del bla # echo 10 > /sys/bus/netdevsim/del_device mlxsw: # devlink dev reload pci/0000:01:00.0 # ip netns add bla # devlink dev reload pci/0000:01:00.0 netns bla # ip netns del bla # echo 1 > /sys/bus/pci/devices/0000\:01\:00.0/remove # echo 1 > /sys/bus/pci/rescan Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: Petr Machata Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/devlink/dev.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/devlink/dev.c b/net/devlink/dev.c index 4fc7adb326631..ea6a92f2e6a2e 100644 --- a/net/devlink/dev.c +++ b/net/devlink/dev.c @@ -4,6 +4,7 @@ * Copyright (c) 2016 Jiri Pirko */ +#include #include #include #include "devl_internal.h" @@ -433,6 +434,13 @@ int devlink_reload(struct devlink *devlink, struct net *dest_net, struct net *curr_net; int err; + /* Make sure the reload operations are invoked with the device lock + * held to allow drivers to trigger functionality that expects it + * (e.g., PCI reset) and to close possible races between these + * operations and probe/remove. + */ + device_lock_assert(devlink->dev); + memcpy(remote_reload_stats, devlink->stats.remote_reload_stats, sizeof(remote_reload_stats)); From 3ed48c80b28d8dcd584d6ddaf00c75b7673e1a05 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 15 Nov 2023 13:17:16 +0100 Subject: [PATCH 07/14] PCI: Add no PM reset quirk for NVIDIA Spectrum devices Spectrum-{1,2,3,4} devices report that a D3hot->D0 transition causes a reset (i.e., they advertise NoSoftRst-). However, this transition does not have any effect on the device: It continues to be operational and network ports remain up. Advertising this support makes it seem as if a PM reset is viable for these devices. Mark it as unavailable to skip it when testing reset methods. Before: # cat /sys/bus/pci/devices/0000\:03\:00.0/reset_method pm bus After: # cat /sys/bus/pci/devices/0000\:03\:00.0/reset_method bus Signed-off-by: Ido Schimmel Acked-by: Bjorn Helgaas Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- drivers/pci/quirks.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index ea476252280ab..d208047d1b8f2 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3786,6 +3786,19 @@ static void quirk_no_pm_reset(struct pci_dev *dev) DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_VENDOR_ID_ATI, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, 8, quirk_no_pm_reset); +/* + * Spectrum-{1,2,3,4} devices report that a D3hot->D0 transition causes a reset + * (i.e., they advertise NoSoftRst-). However, this transition does not have + * any effect on the device: It continues to be operational and network ports + * remain up. Advertising this support makes it seem as if a PM reset is viable + * for these devices. Mark it as unavailable to skip it when testing reset + * methods. + */ +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MELLANOX, 0xcb84, quirk_no_pm_reset); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MELLANOX, 0xcf6c, quirk_no_pm_reset); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MELLANOX, 0xcf70, quirk_no_pm_reset); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MELLANOX, 0xcf80, quirk_no_pm_reset); + /* * Thunderbolt controllers with broken MSI hotplug signaling: * Entire 1st generation (Light Ridge, Eagle Ridge, Light Peak) and part From 0a5ef95923e01aa93210d22e0d62d66b601238d7 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 15 Nov 2023 13:17:17 +0100 Subject: [PATCH 08/14] PCI: Add debug print for device ready delay Currently, the time it took a PCI device to become ready after reset is only printed if it was longer than 1000ms ('PCI_RESET_WAIT'). However, for debugging purposes it is useful to know this time even if it was shorter. For example, with the device I am working on, hardware engineers asked to verify that it becomes ready on the first try (no delay). To that end, add a debug level print that can be enabled using dynamic debug. Example: # echo 1 > /sys/bus/pci/devices/0000\:01\:00.0/reset # dmesg -c | grep ready # echo "file drivers/pci/pci.c +p" > /sys/kernel/debug/dynamic_debug/control # echo 1 > /sys/bus/pci/devices/0000\:01\:00.0/reset # dmesg -c | grep ready [ 396.060335] mlxsw_spectrum4 0000:01:00.0: ready 0ms after bus reset # echo "file drivers/pci/pci.c -p" > /sys/kernel/debug/dynamic_debug/control # echo 1 > /sys/bus/pci/devices/0000\:01\:00.0/reset # dmesg -c | grep ready Signed-off-by: Ido Schimmel Acked-by: Bjorn Helgaas Signed-off-by: Petr Machata Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/pci/pci.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 55bc3576a9856..69d20d585f889 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1219,6 +1219,9 @@ static int pci_dev_wait(struct pci_dev *dev, char *reset_type, int timeout) if (delay > PCI_RESET_WAIT) pci_info(dev, "ready %dms after %s\n", delay - 1, reset_type); + else + pci_dbg(dev, "ready %dms after %s\n", delay - 1, + reset_type); return 0; } From e6dbab40fa096ed5e882e25cab54c3bdad57762c Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Wed, 15 Nov 2023 13:17:18 +0100 Subject: [PATCH 09/14] mlxsw: Extend MRSR pack() function to support new commands Currently mlxsw_reg_mrsr_pack() always sets 'command=1'. As preparation for support of new reset flow, pass the command as an argument to the function and add an enum for this field. For now, always pass 'command=1' to the pack() function. Signed-off-by: Amit Cohen Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: Petr Machata Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/pci.c | 2 +- drivers/net/ethernet/mellanox/mlxsw/reg.h | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c index e4b25e187467a..7af37f78ed1a4 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci.c +++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c @@ -1491,7 +1491,7 @@ static int mlxsw_pci_sw_reset(struct mlxsw_pci *mlxsw_pci, return err; } - mlxsw_reg_mrsr_pack(mrsr_pl); + mlxsw_reg_mrsr_pack(mrsr_pl, MLXSW_REG_MRSR_COMMAND_SOFTWARE_RESET); err = mlxsw_reg_write(mlxsw_pci->core, MLXSW_REG(mrsr), mrsr_pl); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index 25b294fdeb3df..13c0ff9945378 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -10122,6 +10122,15 @@ mlxsw_reg_mgir_unpack(char *payload, u32 *hw_rev, char *fw_info_psid, MLXSW_REG_DEFINE(mrsr, MLXSW_REG_MRSR_ID, MLXSW_REG_MRSR_LEN); +enum mlxsw_reg_mrsr_command { + /* Switch soft reset, does not reset PCI firmware. */ + MLXSW_REG_MRSR_COMMAND_SOFTWARE_RESET = 1, + /* Reset will be done when PCI link will be disabled. + * This command will reset PCI firmware also. + */ + MLXSW_REG_MRSR_COMMAND_RESET_AT_PCI_DISABLE = 6, +}; + /* reg_mrsr_command * Reset/shutdown command * 0 - do nothing @@ -10130,10 +10139,11 @@ MLXSW_REG_DEFINE(mrsr, MLXSW_REG_MRSR_ID, MLXSW_REG_MRSR_LEN); */ MLXSW_ITEM32(reg, mrsr, command, 0x00, 0, 4); -static inline void mlxsw_reg_mrsr_pack(char *payload) +static inline void mlxsw_reg_mrsr_pack(char *payload, + enum mlxsw_reg_mrsr_command command) { MLXSW_REG_ZERO(mrsr, payload); - mlxsw_reg_mrsr_command_set(payload, 1); + mlxsw_reg_mrsr_command_set(payload, command); } /* MLCR - Management LED Control Register From bdf85f3a695ff8b9709658e576299fa5a3f6326d Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Wed, 15 Nov 2023 13:17:19 +0100 Subject: [PATCH 10/14] mlxsw: pci: Rename mlxsw_pci_sw_reset() In the next patches, mlxsw_pci_sw_reset() will be extended to support more reset types and will not necessarily issue a software reset. Rename the function to reflect that. Signed-off-by: Amit Cohen Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: Petr Machata Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/pci.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c index 7af37f78ed1a4..fe0b8a38d44e3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci.c +++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c @@ -1476,8 +1476,8 @@ static int mlxsw_pci_sys_ready_wait(struct mlxsw_pci *mlxsw_pci, return -EBUSY; } -static int mlxsw_pci_sw_reset(struct mlxsw_pci *mlxsw_pci, - const struct pci_device_id *id) +static int +mlxsw_pci_reset(struct mlxsw_pci *mlxsw_pci, const struct pci_device_id *id) { struct pci_dev *pdev = mlxsw_pci->pdev; char mrsr_pl[MLXSW_REG_MRSR_LEN]; @@ -1537,9 +1537,9 @@ static int mlxsw_pci_init(void *bus_priv, struct mlxsw_core *mlxsw_core, if (!mbox) return -ENOMEM; - err = mlxsw_pci_sw_reset(mlxsw_pci, mlxsw_pci->id); + err = mlxsw_pci_reset(mlxsw_pci, mlxsw_pci->id); if (err) - goto err_sw_reset; + goto err_reset; err = mlxsw_pci_alloc_irq_vectors(mlxsw_pci); if (err < 0) { @@ -1672,7 +1672,7 @@ static int mlxsw_pci_init(void *bus_priv, struct mlxsw_core *mlxsw_core, err_query_fw: mlxsw_pci_free_irq_vectors(mlxsw_pci); err_alloc_irq: -err_sw_reset: +err_reset: mbox_put: mlxsw_cmd_mbox_free(mbox); return err; From 8d9da4672f94b4914d3b9318d911bfd1fcbfc9a1 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Wed, 15 Nov 2023 13:17:20 +0100 Subject: [PATCH 11/14] mlxsw: pci: Move software reset code to a separate function In general, the existing flow of software reset in the driver is: 1. Wait for system ready status. 2. Send MRSR command, to start the reset. 3. Wait for system ready status. This flow will be extended once a new reset command is supported. As a preparation, move step #2 to a separate function. Signed-off-by: Amit Cohen Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: Petr Machata Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/pci.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c index fe0b8a38d44e3..080881c94c5a4 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci.c +++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c @@ -1476,11 +1476,18 @@ static int mlxsw_pci_sys_ready_wait(struct mlxsw_pci *mlxsw_pci, return -EBUSY; } +static int mlxsw_pci_reset_sw(struct mlxsw_pci *mlxsw_pci) +{ + char mrsr_pl[MLXSW_REG_MRSR_LEN]; + + mlxsw_reg_mrsr_pack(mrsr_pl, MLXSW_REG_MRSR_COMMAND_SOFTWARE_RESET); + return mlxsw_reg_write(mlxsw_pci->core, MLXSW_REG(mrsr), mrsr_pl); +} + static int mlxsw_pci_reset(struct mlxsw_pci *mlxsw_pci, const struct pci_device_id *id) { struct pci_dev *pdev = mlxsw_pci->pdev; - char mrsr_pl[MLXSW_REG_MRSR_LEN]; u32 sys_status; int err; @@ -1491,8 +1498,7 @@ mlxsw_pci_reset(struct mlxsw_pci *mlxsw_pci, const struct pci_device_id *id) return err; } - mlxsw_reg_mrsr_pack(mrsr_pl, MLXSW_REG_MRSR_COMMAND_SOFTWARE_RESET); - err = mlxsw_reg_write(mlxsw_pci->core, MLXSW_REG(mrsr), mrsr_pl); + err = mlxsw_pci_reset_sw(mlxsw_pci); if (err) return err; From f257c73e53561f6c223c4bce883138e9f18b5f50 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 15 Nov 2023 13:17:21 +0100 Subject: [PATCH 12/14] mlxsw: pci: Add support for new reset flow The driver resets the device during probe and during a devlink reload. The current reset method reloads the current firmware version or a pending one, if one was previously flashed using devlink. However, the current reset method does not result in a PCI hot reset, preventing the PCI firmware from being upgraded, unless the system is rebooted. To solve this problem, a new reset command (6) was implemented in the firmware. Unlike the current command (1), after issuing the new command the device will not start the reset immediately, but only after a PCI hot reset. Implement the new reset method by first verifying that it is supported by the current firmware version by querying the Management Capabilities Mask (MCAM) register. If supported, issue the new reset command (6) via MRSR register followed by a PCI reset by calling __pci_reset_function_locked(). Once the PCI firmware is operational, go back to the regular reset flow and wait for the entire device to become ready. That is, repeatedly read the "system_status" register from the BAR until a value of "FW_READY" (0x5E) appears. Tested: # for i in $(seq 1 10); do devlink dev reload pci/0000:01:00.0; done Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Petr Machata Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/pci.c | 44 ++++++++++++++++++++++- drivers/net/ethernet/mellanox/mlxsw/reg.h | 2 ++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c index 080881c94c5a4..726ed707e27b9 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci.c +++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c @@ -1476,6 +1476,33 @@ static int mlxsw_pci_sys_ready_wait(struct mlxsw_pci *mlxsw_pci, return -EBUSY; } +static int mlxsw_pci_reset_at_pci_disable(struct mlxsw_pci *mlxsw_pci) +{ + struct pci_dev *pdev = mlxsw_pci->pdev; + char mrsr_pl[MLXSW_REG_MRSR_LEN]; + int err; + + mlxsw_reg_mrsr_pack(mrsr_pl, + MLXSW_REG_MRSR_COMMAND_RESET_AT_PCI_DISABLE); + err = mlxsw_reg_write(mlxsw_pci->core, MLXSW_REG(mrsr), mrsr_pl); + if (err) + return err; + + device_lock_assert(&pdev->dev); + + pci_cfg_access_lock(pdev); + pci_save_state(pdev); + + err = __pci_reset_function_locked(pdev); + if (err) + pci_err(pdev, "PCI function reset failed with %d\n", err); + + pci_restore_state(pdev); + pci_cfg_access_unlock(pdev); + + return err; +} + static int mlxsw_pci_reset_sw(struct mlxsw_pci *mlxsw_pci) { char mrsr_pl[MLXSW_REG_MRSR_LEN]; @@ -1488,6 +1515,8 @@ static int mlxsw_pci_reset(struct mlxsw_pci *mlxsw_pci, const struct pci_device_id *id) { struct pci_dev *pdev = mlxsw_pci->pdev; + char mcam_pl[MLXSW_REG_MCAM_LEN]; + bool pci_reset_supported; u32 sys_status; int err; @@ -1498,10 +1527,23 @@ mlxsw_pci_reset(struct mlxsw_pci *mlxsw_pci, const struct pci_device_id *id) return err; } - err = mlxsw_pci_reset_sw(mlxsw_pci); + mlxsw_reg_mcam_pack(mcam_pl, + MLXSW_REG_MCAM_FEATURE_GROUP_ENHANCED_FEATURES); + err = mlxsw_reg_query(mlxsw_pci->core, MLXSW_REG(mcam), mcam_pl); if (err) return err; + mlxsw_reg_mcam_unpack(mcam_pl, MLXSW_REG_MCAM_PCI_RESET, + &pci_reset_supported); + + if (pci_reset_supported) { + pci_dbg(pdev, "Starting PCI reset flow\n"); + err = mlxsw_pci_reset_at_pci_disable(mlxsw_pci); + } else { + pci_dbg(pdev, "Starting software reset flow\n"); + err = mlxsw_pci_reset_sw(mlxsw_pci); + } + err = mlxsw_pci_sys_ready_wait(mlxsw_pci, id, &sys_status); if (err) { dev_err(&pdev->dev, "Failed to reach system ready status after reset. Status is 0x%x\n", diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index 13c0ff9945378..e26e9d06bd721 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -10594,6 +10594,8 @@ MLXSW_ITEM32(reg, mcam, feature_group, 0x00, 16, 8); enum mlxsw_reg_mcam_mng_feature_cap_mask_bits { /* If set, MCIA supports 128 bytes payloads. Otherwise, 48 bytes. */ MLXSW_REG_MCAM_MCIA_128B = 34, + /* If set, MRSR.command=6 is supported. */ + MLXSW_REG_MCAM_PCI_RESET = 48, }; #define MLXSW_REG_BYTES_PER_DWORD 0x4 From 5e12d0898583026581e4f0506ff7c0b15a73a1c5 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 15 Nov 2023 13:17:22 +0100 Subject: [PATCH 13/14] mlxsw: pci: Implement PCI reset handlers Implement reset_prepare() and reset_done() handlers that are invoked by the PCI core before and after issuing a PCI reset, respectively. Specifically, implement reset_prepare() by calling mlxsw_core_bus_device_unregister() and reset_done() by calling mlxsw_core_bus_device_register(). This is the same implementation as the reload_{down,up}() devlink operations with the following differences: 1. The devlink instance is unregistered and then registered again after the reset. 2. A reset via the device's command interface (using MRSR register) is not issued during reset_done() as PCI core already issued a PCI reset. Tested: # for i in $(seq 1 10); do echo 1 > /sys/bus/pci/devices/0000\:01\:00.0/reset; done Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Signed-off-by: Petr Machata Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/pci.c | 28 +++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c index 726ed707e27b9..5b1f2483a3ccf 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/pci.c +++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c @@ -130,6 +130,7 @@ struct mlxsw_pci { const struct pci_device_id *id; enum mlxsw_pci_cqe_v max_cqe_ver; /* Maximal supported CQE version */ u8 num_sdq_cqs; /* Number of CQs used for SDQs */ + bool skip_reset; }; static void mlxsw_pci_queue_tasklet_schedule(struct mlxsw_pci_queue *q) @@ -1527,6 +1528,10 @@ mlxsw_pci_reset(struct mlxsw_pci *mlxsw_pci, const struct pci_device_id *id) return err; } + /* PCI core already issued a PCI reset, do not issue another reset. */ + if (mlxsw_pci->skip_reset) + return 0; + mlxsw_reg_mcam_pack(mcam_pl, MLXSW_REG_MCAM_FEATURE_GROUP_ENHANCED_FEATURES); err = mlxsw_reg_query(mlxsw_pci->core, MLXSW_REG(mcam), mcam_pl); @@ -2107,11 +2112,34 @@ static void mlxsw_pci_remove(struct pci_dev *pdev) kfree(mlxsw_pci); } +static void mlxsw_pci_reset_prepare(struct pci_dev *pdev) +{ + struct mlxsw_pci *mlxsw_pci = pci_get_drvdata(pdev); + + mlxsw_core_bus_device_unregister(mlxsw_pci->core, false); +} + +static void mlxsw_pci_reset_done(struct pci_dev *pdev) +{ + struct mlxsw_pci *mlxsw_pci = pci_get_drvdata(pdev); + + mlxsw_pci->skip_reset = true; + mlxsw_core_bus_device_register(&mlxsw_pci->bus_info, &mlxsw_pci_bus, + mlxsw_pci, false, NULL, NULL); + mlxsw_pci->skip_reset = false; +} + +static const struct pci_error_handlers mlxsw_pci_err_handler = { + .reset_prepare = mlxsw_pci_reset_prepare, + .reset_done = mlxsw_pci_reset_done, +}; + int mlxsw_pci_driver_register(struct pci_driver *pci_driver) { pci_driver->probe = mlxsw_pci_probe; pci_driver->remove = mlxsw_pci_remove; pci_driver->shutdown = mlxsw_pci_remove; + pci_driver->err_handler = &mlxsw_pci_err_handler; return pci_register_driver(pci_driver); } EXPORT_SYMBOL(mlxsw_pci_driver_register); From af51d6bd0b130b06fc58b35fee8f93b0a5c77f21 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 15 Nov 2023 13:17:23 +0100 Subject: [PATCH 14/14] selftests: mlxsw: Add PCI reset test Test that PCI reset works correctly by verifying that only the expected reset methods are supported and that after issuing the reset the ifindex of the port changes. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Petr Machata Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- .../selftests/drivers/net/mlxsw/pci_reset.sh | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100755 tools/testing/selftests/drivers/net/mlxsw/pci_reset.sh diff --git a/tools/testing/selftests/drivers/net/mlxsw/pci_reset.sh b/tools/testing/selftests/drivers/net/mlxsw/pci_reset.sh new file mode 100755 index 0000000000000..fe0343b95e6c0 --- /dev/null +++ b/tools/testing/selftests/drivers/net/mlxsw/pci_reset.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Test that PCI reset works correctly by verifying that only the expected reset +# methods are supported and that after issuing the reset the ifindex of the +# port changes. + +lib_dir=$(dirname $0)/../../../net/forwarding + +ALL_TESTS=" + pci_reset_test +" +NUM_NETIFS=1 +source $lib_dir/lib.sh +source $lib_dir/devlink_lib.sh + +pci_reset_test() +{ + RET=0 + + local bus=$(echo $DEVLINK_DEV | cut -d '/' -f 1) + local bdf=$(echo $DEVLINK_DEV | cut -d '/' -f 2) + + if [ $bus != "pci" ]; then + check_err 1 "devlink device is not a PCI device" + log_test "pci reset" + return + fi + + if [ ! -f /sys/bus/pci/devices/$bdf/reset_method ]; then + check_err 1 "reset is not supported" + log_test "pci reset" + return + fi + + [[ $(cat /sys/bus/pci/devices/$bdf/reset_method) == "bus" ]] + check_err $? "only \"bus\" reset method should be supported" + + local ifindex_pre=$(ip -j link show dev $swp1 | jq '.[]["ifindex"]') + + echo 1 > /sys/bus/pci/devices/$bdf/reset + check_err $? "reset failed" + + # Wait for udev to rename newly created netdev. + udevadm settle + + local ifindex_post=$(ip -j link show dev $swp1 | jq '.[]["ifindex"]') + + [[ $ifindex_pre != $ifindex_post ]] + check_err $? "reset not performed" + + log_test "pci reset" +} + +swp1=${NETIFS[p1]} +tests_run + +exit $EXIT_STATUS