From d98995b4bf981519dde4af0a081c393d62474039 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Mon, 3 Jun 2024 13:26:37 +0300 Subject: [PATCH 1/4] net/mlx5: Reimplement write combining test The write combining test was previously added to the mlx5_ib driver. It opens a UD QP, posts NOP WQEs, and uses the BlueFlame doorbell. When BlueFlame is used, WQEs get written directly to a PCI BAR of the device (in addition to memory) so that the device handles them without having to access memory. In this test, the WQEs written to memory are different from the ones written to the BlueFlame register, which request a CQE update. By checking the completion reports posted on the CQ, we can tell whether BlueFlame succeeded or not. Write combining must be supported if BlueFlame succeeds, as its register is written using write combining. This patch reimplements the test in the same way, but using only a pair of SQ and CQ. It is moved to mlx5_core as a general feature used by both mlx5_core and mlx5_ib. In addition, save the write combining test result of the PCI function, so that its thousands of child functions, such as SFs, can query it without each paying the time and resource penalty of running the test itself. The test function is called only after failing to get the cached result. With this enhancement, the thousands of SFs of a PF attached to the same driver no longer need to perform the WC check explicitly, as it has already been done in the system. This saves several commands per SF, thereby speeding up SF creation, and also saves completion EQ creation. 
Signed-off-by: Jianbo Liu Reviewed-by: Tariq Toukan Link: https://lore.kernel.org/r/4ff5a8cc4c5b5b0d98397baa45a5019bcdbf096e.1717409369.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 19 +- drivers/infiniband/hw/mlx5/mem.c | 198 -------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 3 - drivers/infiniband/hw/mlx5/qp.c | 16 - .../net/ethernet/mellanox/mlx5/core/Makefile | 2 +- .../net/ethernet/mellanox/mlx5/core/main.c | 2 + drivers/net/ethernet/mellanox/mlx5/core/wc.c | 434 ++++++++++++++++++ include/linux/mlx5/driver.h | 11 + 8 files changed, 451 insertions(+), 234 deletions(-) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/wc.c diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 2366c46eebc87..00e2924228019 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1810,7 +1810,7 @@ static int set_ucontext_resp(struct ib_ucontext *uctx, } resp->qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp); - if (dev->wc_support) + if (mlx5_wc_support_get(dev->mdev)) resp->bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size); resp->cache_line_size = cache_line_size(); @@ -2337,7 +2337,7 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm switch (command) { case MLX5_IB_MMAP_WC_PAGE: case MLX5_IB_MMAP_ALLOC_WC: - if (!dev->wc_support) + if (!mlx5_wc_support_get(dev->mdev)) return -EPERM; fallthrough; case MLX5_IB_MMAP_NC_PAGE: @@ -3612,7 +3612,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_UAR_OBJ_ALLOC)( alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC) return -EOPNOTSUPP; - if (!to_mdev(c->ibucontext.device)->wc_support && + if (!mlx5_wc_support_get(to_mdev(c->ibucontext.device)->mdev) && alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF) return -EOPNOTSUPP; @@ -3766,18 +3766,6 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) return err; } -static int mlx5_ib_enable_driver(struct ib_device *dev) -{ - struct 
mlx5_ib_dev *mdev = to_mdev(dev); - int ret; - - ret = mlx5_ib_test_wc(mdev); - mlx5_ib_dbg(mdev, "Write-Combining %s", - mdev->wc_support ? "supported" : "not supported"); - - return ret; -} - static const struct ib_device_ops mlx5_ib_dev_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_MLX5, @@ -3808,7 +3796,6 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .drain_rq = mlx5_ib_drain_rq, .drain_sq = mlx5_ib_drain_sq, .device_group = &mlx5_attr_group, - .enable_driver = mlx5_ib_enable_driver, .get_dev_fw_str = get_dev_fw_str, .get_dma_mr = mlx5_ib_get_dma_mr, .get_link_layer = mlx5_ib_port_link_layer, diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index 5a22be14d958f..af321f6ef7f54 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -30,10 +30,8 @@ * SOFTWARE. */ -#include #include #include "mlx5_ib.h" -#include /* * Fill in a physical address list. ib_umem_num_dma_blocks() entries will be @@ -95,199 +93,3 @@ unsigned long __mlx5_umem_find_best_quantized_pgoff( return 0; return page_size; } - -#define WR_ID_BF 0xBF -#define WR_ID_END 0xBAD -#define TEST_WC_NUM_WQES 255 -#define TEST_WC_POLLING_MAX_TIME_JIFFIES msecs_to_jiffies(100) -static int post_send_nop(struct mlx5_ib_dev *dev, struct ib_qp *ibqp, u64 wr_id, - bool signaled) -{ - struct mlx5_ib_qp *qp = to_mqp(ibqp); - struct mlx5_wqe_ctrl_seg *ctrl; - struct mlx5_bf *bf = &qp->bf; - __be32 mmio_wqe[16] = {}; - unsigned long flags; - unsigned int idx; - - if (unlikely(dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)) - return -EIO; - - spin_lock_irqsave(&qp->sq.lock, flags); - - idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1); - ctrl = mlx5_frag_buf_get_wqe(&qp->sq.fbc, idx); - - memset(ctrl, 0, sizeof(struct mlx5_wqe_ctrl_seg)); - ctrl->fm_ce_se = signaled ? 
MLX5_WQE_CTRL_CQ_UPDATE : 0; - ctrl->opmod_idx_opcode = - cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | MLX5_OPCODE_NOP); - ctrl->qpn_ds = cpu_to_be32((sizeof(struct mlx5_wqe_ctrl_seg) / 16) | - (qp->trans_qp.base.mqp.qpn << 8)); - - qp->sq.wrid[idx] = wr_id; - qp->sq.w_list[idx].opcode = MLX5_OPCODE_NOP; - qp->sq.wqe_head[idx] = qp->sq.head + 1; - qp->sq.cur_post += DIV_ROUND_UP(sizeof(struct mlx5_wqe_ctrl_seg), - MLX5_SEND_WQE_BB); - qp->sq.w_list[idx].next = qp->sq.cur_post; - qp->sq.head++; - - memcpy(mmio_wqe, ctrl, sizeof(*ctrl)); - ((struct mlx5_wqe_ctrl_seg *)&mmio_wqe)->fm_ce_se |= - MLX5_WQE_CTRL_CQ_UPDATE; - - /* Make sure that descriptors are written before - * updating doorbell record and ringing the doorbell - */ - wmb(); - - qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post); - - /* Make sure doorbell record is visible to the HCA before - * we hit doorbell - */ - wmb(); - __iowrite64_copy(bf->bfreg->map + bf->offset, mmio_wqe, - sizeof(mmio_wqe) / 8); - - bf->offset ^= bf->buf_size; - - spin_unlock_irqrestore(&qp->sq.lock, flags); - - return 0; -} - -static int test_wc_poll_cq_result(struct mlx5_ib_dev *dev, struct ib_cq *cq) -{ - int ret; - struct ib_wc wc = {}; - unsigned long end = jiffies + TEST_WC_POLLING_MAX_TIME_JIFFIES; - - do { - ret = ib_poll_cq(cq, 1, &wc); - if (ret < 0 || wc.status) - return ret < 0 ? 
ret : -EINVAL; - if (ret) - break; - } while (!time_after(jiffies, end)); - - if (!ret) - return -ETIMEDOUT; - - if (wc.wr_id != WR_ID_BF) - ret = 0; - - return ret; -} - -static int test_wc_do_send(struct mlx5_ib_dev *dev, struct ib_qp *qp) -{ - int err, i; - - for (i = 0; i < TEST_WC_NUM_WQES; i++) { - err = post_send_nop(dev, qp, WR_ID_BF, false); - if (err) - return err; - } - - return post_send_nop(dev, qp, WR_ID_END, true); -} - -int mlx5_ib_test_wc(struct mlx5_ib_dev *dev) -{ - struct ib_cq_init_attr cq_attr = { .cqe = TEST_WC_NUM_WQES + 1 }; - int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type); - struct ib_qp_init_attr qp_init_attr = { - .cap = { .max_send_wr = TEST_WC_NUM_WQES }, - .qp_type = IB_QPT_UD, - .sq_sig_type = IB_SIGNAL_REQ_WR, - .create_flags = MLX5_IB_QP_CREATE_WC_TEST, - }; - struct ib_qp_attr qp_attr = { .port_num = 1 }; - struct ib_device *ibdev = &dev->ib_dev; - struct ib_qp *qp; - struct ib_cq *cq; - struct ib_pd *pd; - int ret; - - if (!MLX5_CAP_GEN(dev->mdev, bf)) - return 0; - - if (!dev->mdev->roce.roce_en && - port_type_cap == MLX5_CAP_PORT_TYPE_ETH) { - if (mlx5_core_is_pf(dev->mdev)) - dev->wc_support = arch_can_pci_mmap_wc(); - return 0; - } - - ret = mlx5_alloc_bfreg(dev->mdev, &dev->wc_bfreg, true, false); - if (ret) - goto print_err; - - if (!dev->wc_bfreg.wc) - goto out1; - - pd = ib_alloc_pd(ibdev, 0); - if (IS_ERR(pd)) { - ret = PTR_ERR(pd); - goto out1; - } - - cq = ib_create_cq(ibdev, NULL, NULL, NULL, &cq_attr); - if (IS_ERR(cq)) { - ret = PTR_ERR(cq); - goto out2; - } - - qp_init_attr.recv_cq = cq; - qp_init_attr.send_cq = cq; - qp = ib_create_qp(pd, &qp_init_attr); - if (IS_ERR(qp)) { - ret = PTR_ERR(qp); - goto out3; - } - - qp_attr.qp_state = IB_QPS_INIT; - ret = ib_modify_qp(qp, &qp_attr, - IB_QP_STATE | IB_QP_PORT | IB_QP_PKEY_INDEX | - IB_QP_QKEY); - if (ret) - goto out4; - - qp_attr.qp_state = IB_QPS_RTR; - ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE); - if (ret) - goto out4; - - qp_attr.qp_state = IB_QPS_RTS; 
- ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN); - if (ret) - goto out4; - - ret = test_wc_do_send(dev, qp); - if (ret < 0) - goto out4; - - ret = test_wc_poll_cq_result(dev, cq); - if (ret > 0) { - dev->wc_support = true; - ret = 0; - } - -out4: - ib_destroy_qp(qp); -out3: - ib_destroy_cq(cq); -out2: - ib_dealloc_pd(pd); -out1: - mlx5_free_bfreg(dev->mdev, &dev->wc_bfreg); -print_err: - if (ret) - mlx5_ib_err( - dev, - "Error %d while trying to test write-combining support\n", - ret); - return ret; -} diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index f255a12e26a02..b68779e9d86cf 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -341,7 +341,6 @@ struct mlx5_ib_flow_db { * rely on the range reserved for that use in the ib_qp_create_flags enum. */ #define MLX5_IB_QP_CREATE_SQPN_QP1 IB_QP_CREATE_RESERVED_START -#define MLX5_IB_QP_CREATE_WC_TEST (IB_QP_CREATE_RESERVED_START << 1) struct wr_list { u16 opcode; @@ -1123,7 +1122,6 @@ struct mlx5_ib_dev { u8 ib_active:1; u8 is_rep:1; u8 lag_active:1; - u8 wc_support:1; u8 fill_delay; struct umr_common umrc; /* sync used page count stats @@ -1149,7 +1147,6 @@ struct mlx5_ib_dev { /* Array with num_ports elements */ struct mlx5_ib_port *port; struct mlx5_sq_bfreg bfreg; - struct mlx5_sq_bfreg wc_bfreg; struct mlx5_sq_bfreg fp_bfreg; struct mlx5_ib_delay_drop delay_drop; const struct mlx5_ib_profile *profile; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index e2164f813607a..e8c0fead40623 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1107,8 +1107,6 @@ static int _create_kernel_qp(struct mlx5_ib_dev *dev, if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR) qp->bf.bfreg = &dev->fp_bfreg; - else if (qp->flags & MLX5_IB_QP_CREATE_WC_TEST) - qp->bf.bfreg = &dev->wc_bfreg; else qp->bf.bfreg = &dev->bfreg; @@ -2959,14 +2957,6 @@ static void 
process_create_flag(struct mlx5_ib_dev *dev, int *flags, int flag, return; } - if (flag == MLX5_IB_QP_CREATE_WC_TEST) { - /* - * Special case, if condition didn't meet, it won't be error, - * just different in-kernel flow. - */ - *flags &= ~MLX5_IB_QP_CREATE_WC_TEST; - return; - } mlx5_ib_dbg(dev, "Verbs create QP flag 0x%X is not supported\n", flag); } @@ -3027,8 +3017,6 @@ static int process_create_flags(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, IB_QP_CREATE_PCI_WRITE_END_PADDING, MLX5_CAP_GEN(mdev, end_pad), qp); - process_create_flag(dev, &create_flags, MLX5_IB_QP_CREATE_WC_TEST, - qp_type != MLX5_IB_QPT_REG_UMR, qp); process_create_flag(dev, &create_flags, MLX5_IB_QP_CREATE_SQPN_QP1, true, qp); @@ -4609,10 +4597,6 @@ static bool mlx5_ib_modify_qp_allowed(struct mlx5_ib_dev *dev, if (qp->type == IB_QPT_RAW_PACKET || qp->type == MLX5_IB_QPT_REG_UMR) return true; - /* Internal QP used for wc testing, with NOPs in wq */ - if (qp->flags & MLX5_IB_QP_CREATE_WC_TEST) - return true; - return false; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 76dc5a9b9648d..1289475e7be7a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -17,7 +17,7 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ fs_counters.o fs_ft_pool.o rl.o lag/debugfs.o lag/lag.o dev.o events.o wq.o lib/gid.o \ lib/devcom.o lib/pci_vsc.o lib/dm.o lib/fs_ttc.o diag/fs_tracepoint.o \ diag/fw_tracer.o diag/crdump.o devlink.o diag/rsc_dump.o diag/reporter_vnic.o \ - fw_reset.o qos.o lib/tout.o lib/aso.o + fw_reset.o qos.o lib/tout.o lib/aso.o wc.o # # Netdev basic diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 459a836a5d9c1..527da58c79535 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1819,6 +1819,7 @@ int 
mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx) mutex_init(&dev->intf_state_mutex); lockdep_set_class(&dev->intf_state_mutex, &dev->lock_key); mutex_init(&dev->mlx5e_res.uplink_netdev_lock); + mutex_init(&dev->wc_state_lock); mutex_init(&priv->bfregs.reg_head.lock); mutex_init(&priv->bfregs.wc_head.lock); @@ -1916,6 +1917,7 @@ void mlx5_mdev_uninit(struct mlx5_core_dev *dev) mutex_destroy(&priv->alloc_mutex); mutex_destroy(&priv->bfregs.wc_head.lock); mutex_destroy(&priv->bfregs.reg_head.lock); + mutex_destroy(&dev->wc_state_lock); mutex_destroy(&dev->mlx5e_res.uplink_netdev_lock); mutex_destroy(&dev->intf_state_mutex); lockdep_unregister_key(&dev->lock_key); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/wc.c b/drivers/net/ethernet/mellanox/mlx5/core/wc.c new file mode 100644 index 0000000000000..1bed75eca97db --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/wc.c @@ -0,0 +1,434 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +// Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ +#include +#include +#include "lib/clock.h" +#include "mlx5_core.h" +#include "wq.h" + +#define TEST_WC_NUM_WQES 255 +#define TEST_WC_LOG_CQ_SZ (order_base_2(TEST_WC_NUM_WQES)) +#define TEST_WC_SQ_LOG_WQ_SZ TEST_WC_LOG_CQ_SZ +#define TEST_WC_POLLING_MAX_TIME_JIFFIES msecs_to_jiffies(100) + +struct mlx5_wc_cq { + /* data path - accessed per cqe */ + struct mlx5_cqwq wq; + + /* data path - accessed per napi poll */ + struct mlx5_core_cq mcq; + + /* control */ + struct mlx5_core_dev *mdev; + struct mlx5_wq_ctrl wq_ctrl; +}; + +struct mlx5_wc_sq { + /* data path */ + u16 cc; + u16 pc; + + /* read only */ + struct mlx5_wq_cyc wq; + u32 sqn; + + /* control path */ + struct mlx5_wq_ctrl wq_ctrl; + + struct mlx5_wc_cq cq; + struct mlx5_sq_bfreg bfreg; +}; + +static int mlx5_wc_create_cqwq(struct mlx5_core_dev *mdev, void *cqc, + struct mlx5_wc_cq *cq) +{ + struct mlx5_core_cq *mcq = &cq->mcq; + struct mlx5_wq_param param = {}; + int err; + u32 i; + + err = mlx5_cqwq_create(mdev, ¶m, cqc, &cq->wq, &cq->wq_ctrl); + if (err) + return err; + + mcq->cqe_sz = 64; + mcq->set_ci_db = cq->wq_ctrl.db.db; + mcq->arm_db = cq->wq_ctrl.db.db + 1; + + for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) { + struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, i); + + cqe->op_own = 0xf1; + } + + cq->mdev = mdev; + + return 0; +} + +static int create_wc_cq(struct mlx5_wc_cq *cq, void *cqc_data) +{ + u32 out[MLX5_ST_SZ_DW(create_cq_out)]; + struct mlx5_core_dev *mdev = cq->mdev; + struct mlx5_core_cq *mcq = &cq->mcq; + int err, inlen, eqn; + void *in, *cqc; + + err = mlx5_comp_eqn_get(mdev, 0, &eqn); + if (err) + return err; + + inlen = MLX5_ST_SZ_BYTES(create_cq_in) + + sizeof(u64) * cq->wq_ctrl.buf.npages; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context); + + memcpy(cqc, cqc_data, MLX5_ST_SZ_BYTES(cqc)); + + mlx5_fill_page_frag_array(&cq->wq_ctrl.buf, + (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas)); + + MLX5_SET(cqc, cqc, 
cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); + MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift - + MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET64(cqc, cqc, dbr_addr, cq->wq_ctrl.db.dma); + + err = mlx5_core_create_cq(mdev, mcq, in, inlen, out, sizeof(out)); + + kvfree(in); + + return err; +} + +static int mlx5_wc_create_cq(struct mlx5_core_dev *mdev, struct mlx5_wc_cq *cq) +{ + void *cqc; + int err; + + cqc = kvzalloc(MLX5_ST_SZ_BYTES(cqc), GFP_KERNEL); + if (!cqc) + return -ENOMEM; + + MLX5_SET(cqc, cqc, log_cq_size, TEST_WC_LOG_CQ_SZ); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + if (MLX5_CAP_GEN(mdev, cqe_128_always) && cache_line_size() >= 128) + MLX5_SET(cqc, cqc, cqe_sz, CQE_STRIDE_128_PAD); + + err = mlx5_wc_create_cqwq(mdev, cqc, cq); + if (err) { + mlx5_core_err(mdev, "Failed to create wc cq wq, err=%d\n", err); + goto err_create_cqwq; + } + + err = create_wc_cq(cq, cqc); + if (err) { + mlx5_core_err(mdev, "Failed to create wc cq, err=%d\n", err); + goto err_create_cq; + } + + kvfree(cqc); + return 0; + +err_create_cq: + mlx5_wq_destroy(&cq->wq_ctrl); +err_create_cqwq: + kvfree(cqc); + return err; +} + +static void mlx5_wc_destroy_cq(struct mlx5_wc_cq *cq) +{ + mlx5_core_destroy_cq(cq->mdev, &cq->mcq); + mlx5_wq_destroy(&cq->wq_ctrl); +} + +static int create_wc_sq(struct mlx5_core_dev *mdev, void *sqc_data, + struct mlx5_wc_sq *sq) +{ + void *in, *sqc, *wq; + int inlen, err; + u8 ts_format; + + inlen = MLX5_ST_SZ_BYTES(create_sq_in) + + sizeof(u64) * sq->wq_ctrl.buf.npages; + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); + wq = MLX5_ADDR_OF(sqc, sqc, wq); + + memcpy(sqc, sqc_data, MLX5_ST_SZ_BYTES(sqc)); + MLX5_SET(sqc, sqc, cqn, sq->cq.mcq.cqn); + + MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); + MLX5_SET(sqc, sqc, flush_in_error_en, 1); + + ts_format = 
mlx5_is_real_time_sq(mdev) ? + MLX5_TIMESTAMP_FORMAT_REAL_TIME : + MLX5_TIMESTAMP_FORMAT_FREE_RUNNING; + MLX5_SET(sqc, sqc, ts_format, ts_format); + + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + MLX5_SET(wq, wq, uar_page, sq->bfreg.index); + MLX5_SET(wq, wq, log_wq_pg_sz, sq->wq_ctrl.buf.page_shift - + MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET64(wq, wq, dbr_addr, sq->wq_ctrl.db.dma); + + mlx5_fill_page_frag_array(&sq->wq_ctrl.buf, + (__be64 *)MLX5_ADDR_OF(wq, wq, pas)); + + err = mlx5_core_create_sq(mdev, in, inlen, &sq->sqn); + if (err) { + mlx5_core_err(mdev, "Failed to create wc sq, err=%d\n", err); + goto err_create_sq; + } + + memset(in, 0, MLX5_ST_SZ_BYTES(modify_sq_in)); + MLX5_SET(modify_sq_in, in, sq_state, MLX5_SQC_STATE_RST); + sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); + MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RDY); + + err = mlx5_core_modify_sq(mdev, sq->sqn, in); + if (err) { + mlx5_core_err(mdev, "Failed to set wc sq(sqn=0x%x) ready, err=%d\n", + sq->sqn, err); + goto err_modify_sq; + } + + kvfree(in); + return 0; + +err_modify_sq: + mlx5_core_destroy_sq(mdev, sq->sqn); +err_create_sq: + kvfree(in); + return err; +} + +static int mlx5_wc_create_sq(struct mlx5_core_dev *mdev, struct mlx5_wc_sq *sq) +{ + struct mlx5_wq_param param = {}; + void *sqc_data, *wq; + int err; + + sqc_data = kvzalloc(MLX5_ST_SZ_BYTES(sqc), GFP_KERNEL); + if (!sqc_data) + return -ENOMEM; + + wq = MLX5_ADDR_OF(sqc, sqc_data, wq); + MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB)); + MLX5_SET(wq, wq, pd, mdev->mlx5e_res.hw_objs.pdn); + MLX5_SET(wq, wq, log_wq_sz, TEST_WC_SQ_LOG_WQ_SZ); + + err = mlx5_wq_cyc_create(mdev, ¶m, wq, &sq->wq, &sq->wq_ctrl); + if (err) { + mlx5_core_err(mdev, "Failed to create wc sq wq, err=%d\n", err); + goto err_create_wq_cyc; + } + + err = create_wc_sq(mdev, sqc_data, sq); + if (err) + goto err_create_sq; + + mlx5_core_dbg(mdev, "wc sq->sqn = 0x%x created\n", sq->sqn); + + kvfree(sqc_data); + return 0; + +err_create_sq: + 
mlx5_wq_destroy(&sq->wq_ctrl); +err_create_wq_cyc: + kvfree(sqc_data); + return err; +} + +static void mlx5_wc_destroy_sq(struct mlx5_wc_sq *sq) +{ + mlx5_core_destroy_sq(sq->cq.mdev, sq->sqn); + mlx5_wq_destroy(&sq->wq_ctrl); +} + +static void mlx5_wc_post_nop(struct mlx5_wc_sq *sq, bool signaled) +{ + int buf_size = (1 << MLX5_CAP_GEN(sq->cq.mdev, log_bf_reg_size)) / 2; + struct mlx5_wqe_ctrl_seg *ctrl; + __be32 mmio_wqe[16] = {}; + u16 pi; + + pi = mlx5_wq_cyc_ctr2ix(&sq->wq, sq->pc); + ctrl = mlx5_wq_cyc_get_wqe(&sq->wq, pi); + memset(ctrl, 0, sizeof(*ctrl)); + ctrl->opmod_idx_opcode = + cpu_to_be32((sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) | MLX5_OPCODE_NOP); + ctrl->qpn_ds = + cpu_to_be32((sq->sqn << MLX5_WQE_CTRL_QPN_SHIFT) | + DIV_ROUND_UP(sizeof(struct mlx5_wqe_ctrl_seg), MLX5_SEND_WQE_DS)); + if (signaled) + ctrl->fm_ce_se |= MLX5_WQE_CTRL_CQ_UPDATE; + + memcpy(mmio_wqe, ctrl, sizeof(*ctrl)); + ((struct mlx5_wqe_ctrl_seg *)&mmio_wqe)->fm_ce_se |= + MLX5_WQE_CTRL_CQ_UPDATE; + + /* ensure wqe is visible to device before updating doorbell record */ + dma_wmb(); + + sq->pc++; + sq->wq.db[MLX5_SND_DBR] = cpu_to_be32(sq->pc); + + /* ensure doorbell record is visible to device before ringing the + * doorbell + */ + wmb(); + + __iowrite64_copy(sq->bfreg.map + sq->bfreg.offset, mmio_wqe, + sizeof(mmio_wqe) / 8); + + sq->bfreg.offset ^= buf_size; +} + +static int mlx5_wc_poll_cq(struct mlx5_wc_sq *sq) +{ + struct mlx5_wc_cq *cq = &sq->cq; + struct mlx5_cqe64 *cqe; + + cqe = mlx5_cqwq_get_cqe(&cq->wq); + if (!cqe) + return -ETIMEDOUT; + + /* sq->cc must be updated only after mlx5_cqwq_update_db_record(), + * otherwise a cq overrun may occur + */ + mlx5_cqwq_pop(&cq->wq); + + if (get_cqe_opcode(cqe) == MLX5_CQE_REQ) { + int wqe_counter = be16_to_cpu(cqe->wqe_counter); + struct mlx5_core_dev *mdev = cq->mdev; + + if (wqe_counter == TEST_WC_NUM_WQES - 1) + mdev->wc_state = MLX5_WC_STATE_UNSUPPORTED; + else + mdev->wc_state = MLX5_WC_STATE_SUPPORTED; + + 
mlx5_core_dbg(mdev, "wc wqe_counter = 0x%x\n", wqe_counter); + } + + mlx5_cqwq_update_db_record(&cq->wq); + + /* ensure cq space is freed before enabling more cqes */ + wmb(); + + sq->cc++; + + return 0; +} + +static void mlx5_core_test_wc(struct mlx5_core_dev *mdev) +{ + unsigned long expires; + struct mlx5_wc_sq *sq; + int i, err; + + if (mdev->wc_state != MLX5_WC_STATE_UNINITIALIZED) + return; + + sq = kzalloc(sizeof(*sq), GFP_KERNEL); + if (!sq) + return; + + err = mlx5_alloc_bfreg(mdev, &sq->bfreg, true, false); + if (err) { + mlx5_core_err(mdev, "Failed to alloc bfreg for wc, err=%d\n", err); + goto err_alloc_bfreg; + } + + err = mlx5_wc_create_cq(mdev, &sq->cq); + if (err) + goto err_create_cq; + + err = mlx5_wc_create_sq(mdev, sq); + if (err) + goto err_create_sq; + + for (i = 0; i < TEST_WC_NUM_WQES - 1; i++) + mlx5_wc_post_nop(sq, false); + + mlx5_wc_post_nop(sq, true); + + expires = jiffies + TEST_WC_POLLING_MAX_TIME_JIFFIES; + do { + err = mlx5_wc_poll_cq(sq); + if (err) + usleep_range(2, 10); + } while (mdev->wc_state == MLX5_WC_STATE_UNINITIALIZED && + time_is_after_jiffies(expires)); + + mlx5_wc_destroy_sq(sq); + +err_create_sq: + mlx5_wc_destroy_cq(&sq->cq); +err_create_cq: + mlx5_free_bfreg(mdev, &sq->bfreg); +err_alloc_bfreg: + kfree(sq); +} + +bool mlx5_wc_support_get(struct mlx5_core_dev *mdev) +{ + struct mlx5_core_dev *parent = NULL; + + if (!MLX5_CAP_GEN(mdev, bf)) { + mlx5_core_dbg(mdev, "BlueFlame not supported\n"); + goto out; + } + + if (!MLX5_CAP_GEN(mdev, log_max_sq)) { + mlx5_core_dbg(mdev, "SQ not supported\n"); + goto out; + } + + if (mdev->wc_state != MLX5_WC_STATE_UNINITIALIZED) + /* No need to lock anything as we perform WC test only + * once for whole device and was already done. 
+ */ + goto out; + + mutex_lock(&mdev->wc_state_lock); + + if (mdev->wc_state != MLX5_WC_STATE_UNINITIALIZED) + goto unlock; + +#ifdef CONFIG_MLX5_SF + if (mlx5_core_is_sf(mdev)) + parent = mdev->priv.parent_mdev; +#endif + + if (parent) { + mutex_lock(&parent->wc_state_lock); + + mlx5_core_test_wc(parent); + + mlx5_core_dbg(mdev, "parent set wc_state=%d\n", + parent->wc_state); + mdev->wc_state = parent->wc_state; + + mutex_unlock(&parent->wc_state_lock); + } + + mlx5_core_test_wc(mdev); + +unlock: + mutex_unlock(&mdev->wc_state_lock); +out: + mlx5_core_dbg(mdev, "wc_state=%d\n", mdev->wc_state); + + return mdev->wc_state == MLX5_WC_STATE_SUPPORTED; +} +EXPORT_SYMBOL(mlx5_wc_support_get); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 779cfdf2e9d65..0d31f77396fcc 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -766,6 +766,12 @@ struct mlx5_hca_cap { u32 max[MLX5_UN_SZ_DW(hca_cap_union)]; }; +enum mlx5_wc_state { + MLX5_WC_STATE_UNINITIALIZED, + MLX5_WC_STATE_UNSUPPORTED, + MLX5_WC_STATE_SUPPORTED, +}; + struct mlx5_core_dev { struct device *device; enum mlx5_coredev_type coredev_type; @@ -824,6 +830,9 @@ struct mlx5_core_dev { #endif u64 num_ipsec_offloads; struct mlx5_sd *sd; + enum mlx5_wc_state wc_state; + /* sync write combining state */ + struct mutex wc_state_lock; }; struct mlx5_db { @@ -1375,4 +1384,6 @@ static inline bool mlx5_is_macsec_roce_supported(struct mlx5_core_dev *mdev) enum { MLX5_OCTWORD = 16, }; + +bool mlx5_wc_support_get(struct mlx5_core_dev *mdev); #endif /* MLX5_DRIVER_H */ From b339e0a39dc37726712b9f0485d78fe4306d1667 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Thu, 13 Jun 2024 21:00:04 +0300 Subject: [PATCH 2/4] RDMA/mlx5: Add Qcounters req_transport_retries_exceeded/req_rnr_retries_exceeded The req_transport_retries_exceeded counter shows the number of times requester detected transport retries exceed error. 
The req_rnr_retries_exceeded counter shows the number of times the requester detected an RNR NAK retries exceeded error. Signed-off-by: Patrisious Haddad Link: https://lore.kernel.org/r/250466af94f4989d638fab168e246035530e912f.1718301543.git.leon@kernel.org Reviewed-by: Simon Horman Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/counters.c | 4 ++++ include/linux/mlx5/mlx5_ifc.h | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/counters.c b/drivers/infiniband/hw/mlx5/counters.c index 8300ce6228350..4f6c1968a2ee3 100644 --- a/drivers/infiniband/hw/mlx5/counters.c +++ b/drivers/infiniband/hw/mlx5/counters.c @@ -83,6 +83,8 @@ static const struct mlx5_ib_counter extended_err_cnts[] = { INIT_Q_COUNTER(resp_remote_access_errors), INIT_Q_COUNTER(resp_cqe_flush_error), INIT_Q_COUNTER(req_cqe_flush_error), + INIT_Q_COUNTER(req_transport_retries_exceeded), + INIT_Q_COUNTER(req_rnr_retries_exceeded), }; static const struct mlx5_ib_counter roce_accl_cnts[] = { @@ -102,6 +104,8 @@ static const struct mlx5_ib_counter vport_extended_err_cnts[] = { INIT_VPORT_Q_COUNTER(resp_remote_access_errors), INIT_VPORT_Q_COUNTER(resp_cqe_flush_error), INIT_VPORT_Q_COUNTER(req_cqe_flush_error), + INIT_VPORT_Q_COUNTER(req_transport_retries_exceeded), + INIT_VPORT_Q_COUNTER(req_rnr_retries_exceeded), }; static const struct mlx5_ib_counter vport_roce_accl_cnts[] = { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5df52e15f7d6c..09d9d87d62c6c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -5629,7 +5629,11 @@ struct mlx5_ifc_query_q_counter_out_bits { u8 local_ack_timeout_err[0x20]; - u8 reserved_at_320[0xa0]; + u8 reserved_at_320[0x60]; + + u8 req_rnr_retries_exceeded[0x20]; + + u8 reserved_at_3a0[0x20]; u8 resp_local_length_error[0x20]; From a808878308a8041ae10a151d69e2d22f94cae9f4 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Sun, 14 Apr 2024 11:05:25 +0300 Subject: 
[PATCH 3/4] driver core: auxiliary bus: show auxiliary device IRQs PCI subfunctions (SF) are anchored on the auxiliary bus. PCI physical and virtual functions are anchored on the PCI bus. The irq information of each such function is visible to users via the sysfs directory "msi_irqs", which contains a file for each irq entry. However, for PCI SFs such information is unavailable. As a result, users have no visibility into the IRQs used by the SFs. Secondly, an SF can be a multi-function device supporting RDMA, netdevice and more. Without irq information at the bus level, the user is unable to view or use the affinity of the SF IRQs. Hence, to match the equivalent PCI PFs and VFs, add an "irqs" directory, for supporting auxiliary devices, containing a file for each irq entry. For example: $ ls /sys/bus/auxiliary/devices/mlx5_core.sf.1/irqs/ 50 51 52 53 54 55 56 57 58 Cc: Simon Horman Reviewed-by: Przemek Kitszel Reviewed-by: Parav Pandit Reviewed-by: Greg Kroah-Hartman Signed-off-by: Shay Drory Signed-off-by: Saeed Mahameed --- v9-v10: - remove Przemek RB - add name field to auxiliary_irq_info (Greg and Przemek) - handle bogus IRQ in auxiliary_device_sysfs_irq_remove (Greg) v8-v9: - add Przemek RB - use guard() in auxiliary_irq_dir_prepare (Paolo) v7-v8: - use cleanup.h for info and name fields (Greg) - correct error flow in auxiliary_irq_dir_prepare (Przemek) - add documentation for new fields of auxiliary_device (Simon) v6-v7: - dynamically creating irqs directory when first irq file created (Greg) - removed irqs flag and simplified the dev_add() API (Greg) - move sysfs related new code to a new auxiliary_sysfs.c file (Greg) v5-v6: - removed concept of shared and exclusive and hence global xarray (Greg) v4-v5: - restore global mutex and replace refcount_t with simple integer (Greg) v3->4: - remove global mutex (Przemek) v2->v3: - fix function declaration in case SYSFS isn't defined v1->v2: - move #ifdefs from drivers/base/auxiliary.c to include/linux/auxiliary_bus.h (Greg) - use 
EXPORT_SYMBOL_GPL instead of EXPORT_SYMBOL (Greg) - Fix kzalloc(ref) to kzalloc(*ref) (Simon) - Add return description in auxiliary_device_sysfs_irq_add() kdoc (Simon) - Fix auxiliary_irq_mode_show doc (kernel test boot) --- Documentation/ABI/testing/sysfs-bus-auxiliary | 9 ++ drivers/base/Makefile | 1 + drivers/base/auxiliary.c | 1 + drivers/base/auxiliary_sysfs.c | 113 ++++++++++++++++++ include/linux/auxiliary_bus.h | 24 ++++ 5 files changed, 148 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-bus-auxiliary create mode 100644 drivers/base/auxiliary_sysfs.c diff --git a/Documentation/ABI/testing/sysfs-bus-auxiliary b/Documentation/ABI/testing/sysfs-bus-auxiliary new file mode 100644 index 0000000000000..cc856079690f6 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-auxiliary @@ -0,0 +1,9 @@ +What: /sys/bus/auxiliary/devices/.../irqs/ +Date: April, 2024 +Contact: Shay Drory +Description: + The /sys/devices/.../irqs directory contains a variable set of + files, with each file is named as irq number similar to PCI PF + or VF's irq number located in msi_irqs directory. + These irq files are added and removed dynamically when an IRQ + is requested and freed respectively for the PCI SF. 
diff --git a/drivers/base/Makefile b/drivers/base/Makefile index 3079bfe53d04d..7fb21768ca36d 100644 --- a/drivers/base/Makefile +++ b/drivers/base/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_NUMA) += node.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory.o ifeq ($(CONFIG_SYSFS),y) obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_AUXILIARY_BUS) += auxiliary_sysfs.o endif obj-$(CONFIG_SYS_HYPERVISOR) += hypervisor.o obj-$(CONFIG_REGMAP) += regmap/ diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c index d3a2c40c2f127..3f01f4ec69e58 100644 --- a/drivers/base/auxiliary.c +++ b/drivers/base/auxiliary.c @@ -287,6 +287,7 @@ int auxiliary_device_init(struct auxiliary_device *auxdev) dev->bus = &auxiliary_bus_type; device_initialize(&auxdev->dev); + mutex_init(&auxdev->sysfs.lock); return 0; } EXPORT_SYMBOL_GPL(auxiliary_device_init); diff --git a/drivers/base/auxiliary_sysfs.c b/drivers/base/auxiliary_sysfs.c new file mode 100644 index 0000000000000..754f21730afde --- /dev/null +++ b/drivers/base/auxiliary_sysfs.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ + +#include +#include + +#define AUXILIARY_MAX_IRQ_NAME 11 + +struct auxiliary_irq_info { + struct device_attribute sysfs_attr; + char name[AUXILIARY_MAX_IRQ_NAME]; +}; + +static struct attribute *auxiliary_irq_attrs[] = { + NULL +}; + +static const struct attribute_group auxiliary_irqs_group = { + .name = "irqs", + .attrs = auxiliary_irq_attrs, +}; + +static int auxiliary_irq_dir_prepare(struct auxiliary_device *auxdev) +{ + int ret = 0; + + guard(mutex)(&auxdev->sysfs.lock); + if (auxdev->sysfs.irq_dir_exists) + return 0; + + ret = devm_device_add_group(&auxdev->dev, &auxiliary_irqs_group); + if (ret) + return ret; + + auxdev->sysfs.irq_dir_exists = true; + xa_init(&auxdev->sysfs.irqs); + return 0; +} + +/** + * auxiliary_device_sysfs_irq_add - add a sysfs entry for the given IRQ + * @auxdev: auxiliary bus device to add the sysfs entry. 
+ * @irq: The associated interrupt number. + * + * This function should be called after the auxiliary device has successfully + * received the irq. + * The driver is responsible to add a unique irq for the auxiliary device. The + * driver can invoke this function from multiple thread context safely for + * unique irqs of the auxiliary devices. The driver must not invoke this API + * multiple times if the irq is already added previously. + * + * Return: zero on success or an error code on failure. + */ +int auxiliary_device_sysfs_irq_add(struct auxiliary_device *auxdev, int irq) +{ + struct auxiliary_irq_info *info __free(kfree) = NULL; + struct device *dev = &auxdev->dev; + int ret; + + ret = auxiliary_irq_dir_prepare(auxdev); + if (ret) + return ret; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + sysfs_attr_init(&info->sysfs_attr.attr); + snprintf(info->name, AUXILIARY_MAX_IRQ_NAME, "%d", irq); + + ret = xa_insert(&auxdev->sysfs.irqs, irq, info, GFP_KERNEL); + if (ret) + return ret; + + info->sysfs_attr.attr.name = info->name; + ret = sysfs_add_file_to_group(&dev->kobj, &info->sysfs_attr.attr, + auxiliary_irqs_group.name); + if (ret) + goto sysfs_add_err; + + xa_store(&auxdev->sysfs.irqs, irq, no_free_ptr(info), GFP_KERNEL); + return 0; + +sysfs_add_err: + xa_erase(&auxdev->sysfs.irqs, irq); + return ret; +} +EXPORT_SYMBOL_GPL(auxiliary_device_sysfs_irq_add); + +/** + * auxiliary_device_sysfs_irq_remove - remove a sysfs entry for the given IRQ + * @auxdev: auxiliary bus device to remove the sysfs entry from. + * @irq: the IRQ to remove. + * + * This function should be called to remove an IRQ sysfs entry. + * The driver must invoke this API when IRQ is released by the device.
+ */ +void auxiliary_device_sysfs_irq_remove(struct auxiliary_device *auxdev, int irq) +{ + struct auxiliary_irq_info *info __free(kfree) = xa_load(&auxdev->sysfs.irqs, irq); + struct device *dev = &auxdev->dev; + + if (!info) { + dev_err(&auxdev->dev, "IRQ %d doesn't exist\n", irq); + return; + } + sysfs_remove_file_from_group(&dev->kobj, &info->sysfs_attr.attr, + auxiliary_irqs_group.name); + xa_erase(&auxdev->sysfs.irqs, irq); +} +EXPORT_SYMBOL_GPL(auxiliary_device_sysfs_irq_remove); diff --git a/include/linux/auxiliary_bus.h b/include/linux/auxiliary_bus.h index de21d9d24a95f..3ba4487c9cd93 100644 --- a/include/linux/auxiliary_bus.h +++ b/include/linux/auxiliary_bus.h @@ -58,6 +58,9 @@ * in * @name: Match name found by the auxiliary device driver, * @id: unique identitier if multiple devices of the same name are exported, + * @irqs: irqs xarray contains irq indices which are used by the device, + * @lock: Synchronize irq sysfs creation, + * @irq_dir_exists: whether "irqs" directory exists, * * An auxiliary_device represents a part of its parent device's functionality. 
* It is given a name that, combined with the registering drivers @@ -139,6 +142,11 @@ struct auxiliary_device { struct device dev; const char *name; u32 id; + struct { + struct xarray irqs; + struct mutex lock; /* Synchronize irq sysfs creation */ + bool irq_dir_exists; + } sysfs; }; /** @@ -212,8 +220,24 @@ int auxiliary_device_init(struct auxiliary_device *auxdev); int __auxiliary_device_add(struct auxiliary_device *auxdev, const char *modname); #define auxiliary_device_add(auxdev) __auxiliary_device_add(auxdev, KBUILD_MODNAME) +#ifdef CONFIG_SYSFS +int auxiliary_device_sysfs_irq_add(struct auxiliary_device *auxdev, int irq); +void auxiliary_device_sysfs_irq_remove(struct auxiliary_device *auxdev, + int irq); +#else /* CONFIG_SYSFS */ +static inline int +auxiliary_device_sysfs_irq_add(struct auxiliary_device *auxdev, int irq) +{ + return 0; +} + +static inline void +auxiliary_device_sysfs_irq_remove(struct auxiliary_device *auxdev, int irq) {} +#endif + static inline void auxiliary_device_uninit(struct auxiliary_device *auxdev) { + mutex_destroy(&auxdev->sysfs.lock); put_device(&auxdev->dev); } From 0477d5168bbb8767275822830b47112519a8446d Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Sun, 14 Apr 2024 11:26:07 +0300 Subject: [PATCH 4/4] net/mlx5: Expose SFs IRQs Expose the sysfs files for the IRQs that the mlx5 PCI SFs are using. These entries are similar to PCI PFs and VFs in 'msi_irqs' directory. 
Reviewed-by: Parav Pandit Signed-off-by: Shay Drory Signed-off-by: Saeed Mahameed --- v8-v9: - add Przemek RB v6->v7: - remove not needed changes to mlx5 sfnum SF sysfs v5->v6: - fail IRQ creation in case auxiliary_device_sysfs_irq_add() failed (Parav and Przemek) v2->v3: - fix mlx5 sfnum SF sysfs --- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 6 +++--- .../ethernet/mellanox/mlx5/core/irq_affinity.c | 18 +++++++++++++++++- .../ethernet/mellanox/mlx5/core/mlx5_core.h | 6 ++++++ .../net/ethernet/mellanox/mlx5/core/mlx5_irq.h | 12 ++++++++---- .../net/ethernet/mellanox/mlx5/core/pci_irq.c | 12 +++++++++--- 5 files changed, 43 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 5693986ae6562..5661f047702e0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -714,7 +714,7 @@ static int create_async_eqs(struct mlx5_core_dev *dev) err1: mlx5_cmd_allowed_opcode(dev, CMD_ALLOWED_OPCODE_ALL); mlx5_eq_notifier_unregister(dev, &table->cq_err_nb); - mlx5_ctrl_irq_release(table->ctrl_irq); + mlx5_ctrl_irq_release(dev, table->ctrl_irq); return err; } @@ -730,7 +730,7 @@ static void destroy_async_eqs(struct mlx5_core_dev *dev) cleanup_async_eq(dev, &table->cmd_eq, "cmd"); mlx5_cmd_allowed_opcode(dev, CMD_ALLOWED_OPCODE_ALL); mlx5_eq_notifier_unregister(dev, &table->cq_err_nb); - mlx5_ctrl_irq_release(table->ctrl_irq); + mlx5_ctrl_irq_release(dev, table->ctrl_irq); } struct mlx5_eq *mlx5_get_async_eq(struct mlx5_core_dev *dev) @@ -918,7 +918,7 @@ static int comp_irq_request_sf(struct mlx5_core_dev *dev, u16 vecidx) af_desc.is_managed = 1; cpumask_copy(&af_desc.mask, cpu_online_mask); cpumask_andnot(&af_desc.mask, &af_desc.mask, &table->used_cpus); - irq = mlx5_irq_affinity_request(pool, &af_desc); + irq = mlx5_irq_affinity_request(dev, pool, &af_desc); if (IS_ERR(irq)) return PTR_ERR(irq); diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c b/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c index 612e666ec2635..f7b01b3f0cba3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/irq_affinity.c @@ -112,15 +112,18 @@ irq_pool_find_least_loaded(struct mlx5_irq_pool *pool, const struct cpumask *req /** * mlx5_irq_affinity_request - request an IRQ according to the given mask. + * @dev: mlx5 core device which is requesting the IRQ. * @pool: IRQ pool to request from. * @af_desc: affinity descriptor for this IRQ. * * This function returns a pointer to IRQ, or ERR_PTR in case of error. */ struct mlx5_irq * -mlx5_irq_affinity_request(struct mlx5_irq_pool *pool, struct irq_affinity_desc *af_desc) +mlx5_irq_affinity_request(struct mlx5_core_dev *dev, struct mlx5_irq_pool *pool, + struct irq_affinity_desc *af_desc) { struct mlx5_irq *least_loaded_irq, *new_irq; + int ret; mutex_lock(&pool->lock); least_loaded_irq = irq_pool_find_least_loaded(pool, &af_desc->mask); @@ -153,6 +156,16 @@ mlx5_irq_affinity_request(struct mlx5_irq_pool *pool, struct irq_affinity_desc * mlx5_irq_read_locked(least_loaded_irq) / MLX5_EQ_REFS_PER_IRQ); unlock: mutex_unlock(&pool->lock); + if (mlx5_irq_pool_is_sf_pool(pool)) { + ret = auxiliary_device_sysfs_irq_add(mlx5_sf_coredev_to_adev(dev), + mlx5_irq_get_irq(least_loaded_irq)); + if (ret) { + mlx5_core_err(dev, "Failed to create sysfs entry for irq %d, ret = %d\n", + mlx5_irq_get_irq(least_loaded_irq), ret); + mlx5_irq_put(least_loaded_irq); + least_loaded_irq = ERR_PTR(ret); + } + } return least_loaded_irq; } @@ -164,6 +177,9 @@ void mlx5_irq_affinity_irq_release(struct mlx5_core_dev *dev, struct mlx5_irq *i cpu = cpumask_first(mlx5_irq_get_affinity_mask(irq)); synchronize_irq(pci_irq_vector(pool->dev->pdev, mlx5_irq_get_index(irq))); + if (mlx5_irq_pool_is_sf_pool(pool)) + auxiliary_device_sysfs_irq_remove(mlx5_sf_coredev_to_adev(dev), + mlx5_irq_get_irq(irq)); if 
(mlx5_irq_put(irq)) if (pool->irqs_per_cpu) cpu_put(pool, cpu); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index c38342b9f3208..e764b720d9b20 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -320,6 +320,12 @@ static inline bool mlx5_core_is_sf(const struct mlx5_core_dev *dev) return dev->coredev_type == MLX5_COREDEV_SF; } +static inline struct auxiliary_device * +mlx5_sf_coredev_to_adev(struct mlx5_core_dev *mdev) +{ + return container_of(mdev->device, struct auxiliary_device, dev); +} + int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx); void mlx5_mdev_uninit(struct mlx5_core_dev *dev); int mlx5_init_one(struct mlx5_core_dev *dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h index 1088114e905d1..0881e961d8b17 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h @@ -25,7 +25,7 @@ int mlx5_set_msix_vec_count(struct mlx5_core_dev *dev, int devfn, int mlx5_get_default_msix_vec_count(struct mlx5_core_dev *dev, int num_vfs); struct mlx5_irq *mlx5_ctrl_irq_request(struct mlx5_core_dev *dev); -void mlx5_ctrl_irq_release(struct mlx5_irq *ctrl_irq); +void mlx5_ctrl_irq_release(struct mlx5_core_dev *dev, struct mlx5_irq *ctrl_irq); struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, u16 vecidx, struct irq_affinity_desc *af_desc, struct cpu_rmap **rmap); @@ -36,13 +36,15 @@ int mlx5_irq_attach_nb(struct mlx5_irq *irq, struct notifier_block *nb); int mlx5_irq_detach_nb(struct mlx5_irq *irq, struct notifier_block *nb); struct cpumask *mlx5_irq_get_affinity_mask(struct mlx5_irq *irq); int mlx5_irq_get_index(struct mlx5_irq *irq); +int mlx5_irq_get_irq(const struct mlx5_irq *irq); struct mlx5_irq_pool; #ifdef CONFIG_MLX5_SF struct mlx5_irq 
*mlx5_irq_affinity_irq_request_auto(struct mlx5_core_dev *dev, struct cpumask *used_cpus, u16 vecidx); -struct mlx5_irq *mlx5_irq_affinity_request(struct mlx5_irq_pool *pool, - struct irq_affinity_desc *af_desc); +struct mlx5_irq * +mlx5_irq_affinity_request(struct mlx5_core_dev *dev, struct mlx5_irq_pool *pool, + struct irq_affinity_desc *af_desc); void mlx5_irq_affinity_irq_release(struct mlx5_core_dev *dev, struct mlx5_irq *irq); #else static inline @@ -53,7 +55,8 @@ struct mlx5_irq *mlx5_irq_affinity_irq_request_auto(struct mlx5_core_dev *dev, } static inline struct mlx5_irq * -mlx5_irq_affinity_request(struct mlx5_irq_pool *pool, struct irq_affinity_desc *af_desc) +mlx5_irq_affinity_request(struct mlx5_core_dev *dev, struct mlx5_irq_pool *pool, + struct irq_affinity_desc *af_desc) { return ERR_PTR(-EOPNOTSUPP); } @@ -61,6 +64,7 @@ mlx5_irq_affinity_request(struct mlx5_irq_pool *pool, struct irq_affinity_desc * static inline void mlx5_irq_affinity_irq_release(struct mlx5_core_dev *dev, struct mlx5_irq *irq) { + mlx5_irq_release_vector(irq); } #endif #endif /* __MLX5_IRQ_H__ */ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index fb8787e30d3fa..ac7c3a76b4cf8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -367,6 +367,11 @@ struct cpumask *mlx5_irq_get_affinity_mask(struct mlx5_irq *irq) return irq->mask; } +int mlx5_irq_get_irq(const struct mlx5_irq *irq) +{ + return irq->map.virq; +} + int mlx5_irq_get_index(struct mlx5_irq *irq) { return irq->map.index; @@ -440,11 +445,12 @@ static void _mlx5_irq_release(struct mlx5_irq *irq) /** * mlx5_ctrl_irq_release - release a ctrl IRQ back to the system. + * @dev: mlx5 device that releasing the IRQ. * @ctrl_irq: ctrl IRQ to be released. 
*/ -void mlx5_ctrl_irq_release(struct mlx5_irq *ctrl_irq) +void mlx5_ctrl_irq_release(struct mlx5_core_dev *dev, struct mlx5_irq *ctrl_irq) { - _mlx5_irq_release(ctrl_irq); + mlx5_irq_affinity_irq_release(dev, ctrl_irq); } /** @@ -473,7 +479,7 @@ struct mlx5_irq *mlx5_ctrl_irq_request(struct mlx5_core_dev *dev) /* Allocate the IRQ in index 0. The vector was already allocated */ irq = irq_pool_request_vector(pool, 0, &af_desc, NULL); } else { - irq = mlx5_irq_affinity_request(pool, &af_desc); + irq = mlx5_irq_affinity_request(dev, pool, &af_desc); } return irq;