From 1f53d88cbb0dcc7df235bf6611ae632b254fccd8 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 3 Dec 2024 15:44:25 +0200 Subject: [PATCH 01/63] RDMA/mlx4: Avoid false error about access to uninitialized gids array Smatch generates the following false error report: drivers/infiniband/hw/mlx4/main.c:393 mlx4_ib_del_gid() error: uninitialized symbol 'gids'. Traditionally, we are not changing kernel code and asking people to fix the tools. However in this case, the fix can be done by simply rearranging the code to be more clear. Fixes: e26be1bfef81 ("IB/mlx4: Implement ib_device callbacks") Link: https://patch.msgid.link/6a3a1577463da16962463fcf62883a87506e9b62.1733233426.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx4/main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 529db874d67c..b1bbdcff631d 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -351,7 +351,7 @@ static int mlx4_ib_del_gid(const struct ib_gid_attr *attr, void **context) struct mlx4_port_gid_table *port_gid_table; int ret = 0; int hw_update = 0; - struct gid_entry *gids; + struct gid_entry *gids = NULL; if (!rdma_cap_roce_gid_table(attr->device, attr->port_num)) return -EINVAL; @@ -389,10 +389,10 @@ static int mlx4_ib_del_gid(const struct ib_gid_attr *attr, void **context) } spin_unlock_bh(&iboe->lock); - if (!ret && hw_update) { + if (gids) ret = mlx4_ib_update_gids(gids, ibdev, attr->port_num); - kfree(gids); - } + + kfree(gids); return ret; } From d31ba16c4331d1f8261c757647b141b3ed9b6e52 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 3 Dec 2024 15:42:25 +0200 Subject: [PATCH 02/63] RDMA/mlx4: Use ib_umem_find_best_pgsz() to calculate MTT size Convert mlx4 to use ib_umem_find_best_pgsz() instead of open-coded variant to calculate MTT size. 
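With ib_umem_find_best_pgsz(), the new inline helper returns order_base_2() of the
chosen page size, or a negative errno when no supported size fits, and reports the
number of DMA blocks through its output argument. Every converted call site in the
diff below follows the same pattern; a minimal sketch (the example_init_mtt() wrapper
is illustrative only, the functions it calls are the real ones from the diff):

/* Illustrative wrapper; mirrors the converted call sites in cq.c, mr.c and qp.c. */
static int example_init_mtt(struct mlx4_ib_dev *dev, struct ib_umem *umem,
			    u64 start_va, struct mlx4_mtt *mtt)
{
	int shift, n;

	shift = mlx4_ib_umem_calc_optimal_mtt_size(umem, start_va, &n);
	if (shift < 0)
		return shift;	/* e.g. -EOPNOTSUPP: no supported page size fits */

	/* n now holds ib_umem_num_dma_blocks(umem, 1UL << shift). */
	return mlx4_mtt_init(dev->dev, n, shift, mtt);
}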
Link: https://patch.msgid.link/c39ec6f5d4664c439a72f2961728ebb5895a9f07.1733233299.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx4/cq.c | 6 +- drivers/infiniband/hw/mlx4/mlx4_ib.h | 18 ++- drivers/infiniband/hw/mlx4/mr.c | 167 +-------------------------- drivers/infiniband/hw/mlx4/qp.c | 12 +- 4 files changed, 35 insertions(+), 168 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index aa9ea6ba26e5..c592374f4a58 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -150,8 +150,12 @@ static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, return PTR_ERR(*umem); shift = mlx4_ib_umem_calc_optimal_mtt_size(*umem, 0, &n); - err = mlx4_mtt_init(dev->dev, n, shift, &buf->mtt); + if (shift < 0) { + err = shift; + goto err_buf; + } + err = mlx4_mtt_init(dev->dev, n, shift, &buf->mtt); if (err) goto err_buf; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index b52bceff7d97..f53b1846594c 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -667,6 +667,9 @@ struct mlx4_uverbs_ex_query_device { __u32 reserved; }; +/* 4k - 4G */ +#define MLX4_PAGE_SIZE_SUPPORTED ((unsigned long)GENMASK_ULL(31, 12)) + static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev) { return container_of(ibdev, struct mlx4_ib_dev, ib_dev); @@ -936,8 +939,19 @@ mlx4_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table) { return 0; } -int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va, - int *num_of_mtts); +static inline int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, + u64 start, + int *num_of_mtts) +{ + unsigned long pg_sz; + + pg_sz = ib_umem_find_best_pgsz(umem, MLX4_PAGE_SIZE_SUPPORTED, start); + if (!pg_sz) + return -EOPNOTSUPP; + + *num_of_mtts = ib_umem_num_dma_blocks(umem, pg_sz); + return order_base_2(pg_sz); +} int mlx4_ib_cm_init(void); void mlx4_ib_cm_destroy(void); diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index a40bf58bcdd3..819c98562e6a 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -87,10 +87,6 @@ err_free: return ERR_PTR(err); } -enum { - MLX4_MAX_MTT_SHIFT = 31 -}; - static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, u64 mtt_size, u64 mtt_shift, u64 len, @@ -144,41 +140,6 @@ static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev, return 0; } -static inline u64 alignment_of(u64 ptr) -{ - return ilog2(ptr & (~(ptr - 1))); -} - -static int mlx4_ib_umem_calc_block_mtt(u64 next_block_start, - u64 current_block_end, - u64 block_shift) -{ - /* Check whether the alignment of the new block is aligned as well as - * the previous block. - * Block address must start with zeros till size of entity_size. - */ - if ((next_block_start & ((1ULL << block_shift) - 1ULL)) != 0) - /* - * It is not as well aligned as the previous block-reduce the - * mtt size accordingly. Here we take the last right bit which - * is 1. - */ - block_shift = alignment_of(next_block_start); - - /* - * Check whether the alignment of the end of previous block - is it - * aligned as well as the start of the block - */ - if (((current_block_end) & ((1ULL << block_shift) - 1ULL)) != 0) - /* - * It is not as well aligned as the start of the block - - * reduce the mtt size accordingly. 
- */ - block_shift = alignment_of(current_block_end); - - return block_shift; -} - int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, struct ib_umem *umem) { @@ -245,130 +206,6 @@ out: return err; } -/* - * Calculate optimal mtt size based on contiguous pages. - * Function will return also the number of pages that are not aligned to the - * calculated mtt_size to be added to total number of pages. For that we should - * check the first chunk length & last chunk length and if not aligned to - * mtt_size we should increment the non_aligned_pages number. All chunks in the - * middle already handled as part of mtt shift calculation for both their start - * & end addresses. - */ -int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va, - int *num_of_mtts) -{ - u64 block_shift = MLX4_MAX_MTT_SHIFT; - u64 min_shift = PAGE_SHIFT; - u64 last_block_aligned_end = 0; - u64 current_block_start = 0; - u64 first_block_start = 0; - u64 current_block_len = 0; - u64 last_block_end = 0; - struct scatterlist *sg; - u64 current_block_end; - u64 misalignment_bits; - u64 next_block_start; - u64 total_len = 0; - int i; - - *num_of_mtts = ib_umem_num_dma_blocks(umem, PAGE_SIZE); - - for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) { - /* - * Initialization - save the first chunk start as the - * current_block_start - block means contiguous pages. - */ - if (current_block_len == 0 && current_block_start == 0) { - current_block_start = sg_dma_address(sg); - first_block_start = current_block_start; - /* - * Find the bits that are different between the physical - * address and the virtual address for the start of the - * MR. - * umem_get aligned the start_va to a page boundary. - * Therefore, we need to align the start va to the same - * boundary. - * misalignment_bits is needed to handle the case of a - * single memory region. In this case, the rest of the - * logic will not reduce the block size. If we use a - * block size which is bigger than the alignment of the - * misalignment bits, we might use the virtual page - * number instead of the physical page number, resulting - * in access to the wrong data. - */ - misalignment_bits = - (start_va & (~(((u64)(PAGE_SIZE)) - 1ULL))) ^ - current_block_start; - block_shift = min(alignment_of(misalignment_bits), - block_shift); - } - - /* - * Go over the scatter entries and check if they continue the - * previous scatter entry. - */ - next_block_start = sg_dma_address(sg); - current_block_end = current_block_start + current_block_len; - /* If we have a split (non-contig.) between two blocks */ - if (current_block_end != next_block_start) { - block_shift = mlx4_ib_umem_calc_block_mtt - (next_block_start, - current_block_end, - block_shift); - - /* - * If we reached the minimum shift for 4k page we stop - * the loop. - */ - if (block_shift <= min_shift) - goto end; - - /* - * If not saved yet we are in first block - we save the - * length of first block to calculate the - * non_aligned_pages number at the end. - */ - total_len += current_block_len; - - /* Start a new block */ - current_block_start = next_block_start; - current_block_len = sg_dma_len(sg); - continue; - } - /* The scatter entry is another part of the current block, - * increase the block size. - * An entry in the scatter can be larger than 4k (page) as of - * dma mapping which merge some blocks together. 
- */ - current_block_len += sg_dma_len(sg); - } - - /* Account for the last block in the total len */ - total_len += current_block_len; - /* Add to the first block the misalignment that it suffers from. */ - total_len += (first_block_start & ((1ULL << block_shift) - 1ULL)); - last_block_end = current_block_start + current_block_len; - last_block_aligned_end = round_up(last_block_end, 1ULL << block_shift); - total_len += (last_block_aligned_end - last_block_end); - - if (total_len & ((1ULL << block_shift) - 1ULL)) - pr_warn("misaligned total length detected (%llu, %llu)!", - total_len, block_shift); - - *num_of_mtts = total_len >> block_shift; -end: - if (block_shift < min_shift) { - /* - * If shift is less than the min we set a warning and return the - * min shift. - */ - pr_warn("umem_calc_optimal_mtt_size - unexpected shift %lld\n", block_shift); - - block_shift = min_shift; - } - return block_shift; -} - static struct ib_umem *mlx4_get_umem_mr(struct ib_device *device, u64 start, u64 length, int access_flags) { @@ -424,6 +261,10 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start, &n); + if (shift < 0) { + err = shift; + goto err_umem; + } err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length, convert_access(access_flags), n, shift, &mr->mmr); diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 9d08aa99f3cb..50fd407103c7 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -925,8 +925,12 @@ static int create_rq(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, } shift = mlx4_ib_umem_calc_optimal_mtt_size(qp->umem, 0, &n); - err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt); + if (shift < 0) { + err = shift; + goto err_buf; + } + err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt); if (err) goto err_buf; @@ -1108,8 +1112,12 @@ static int create_qp_common(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, } shift = mlx4_ib_umem_calc_optimal_mtt_size(qp->umem, 0, &n); - err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt); + if (shift < 0) { + err = shift; + goto err_buf; + } + err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt); if (err) goto err_buf; From f5afe060b1031a5838bc3b0cb91f4c66a5cbf151 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 3 Dec 2024 15:42:26 +0200 Subject: [PATCH 03/63] RDMA/mlx4: Use DMA iterator to write MTT Replace an open coding of rdma_umem_for_each_dma_block() with the proper function. 
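rdma_umem_for_each_dma_block() walks the umem's scatterlist in DMA-contiguous blocks
of the requested size and hands back each block's DMA address, so the whole MTT write
collapses to the loop below (a condensed restatement of the rewritten
mlx4_ib_umem_write_mtt() in the diff, with declarations shown for context):

	struct ib_block_iter biter;
	u64 addr;
	int err, i = 0;

	/* One MTT entry per aligned block of size 1 << mtt->page_shift. */
	rdma_umem_for_each_dma_block(umem, &biter, BIT(mtt->page_shift)) {
		addr = rdma_block_iter_dma_address(&biter);
		err = mlx4_write_mtt(dev->dev, mtt, i++, 1, &addr);
		if (err)
			return err;
	}
	return 0;

Note that the removed helper batched up to PAGE_SIZE / sizeof(u64) addresses per
mlx4_write_mtt() call, whereas the rewrite writes one entry at a time, trading that
batching for simplicity.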
Link: https://patch.msgid.link/0bf595962c964fb8918743405acf9103a5a85983.1733233299.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx4/mr.c | 119 +++----------------------------- 1 file changed, 8 insertions(+), 111 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 819c98562e6a..e77645a673fb 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -87,123 +87,20 @@ err_free: return ERR_PTR(err); } -static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev, - struct mlx4_mtt *mtt, - u64 mtt_size, u64 mtt_shift, u64 len, - u64 cur_start_addr, u64 *pages, - int *start_index, int *npages) -{ - u64 cur_end_addr = cur_start_addr + len; - u64 cur_end_addr_aligned = 0; - u64 mtt_entries; - int err = 0; - int k; - - len += (cur_start_addr & (mtt_size - 1ULL)); - cur_end_addr_aligned = round_up(cur_end_addr, mtt_size); - len += (cur_end_addr_aligned - cur_end_addr); - if (len & (mtt_size - 1ULL)) { - pr_warn("write_block: len %llx is not aligned to mtt_size %llx\n", - len, mtt_size); - return -EINVAL; - } - - mtt_entries = (len >> mtt_shift); - - /* - * Align the MTT start address to the mtt_size. - * Required to handle cases when the MR starts in the middle of an MTT - * record. Was not required in old code since the physical addresses - * provided by the dma subsystem were page aligned, which was also the - * MTT size. - */ - cur_start_addr = round_down(cur_start_addr, mtt_size); - /* A new block is started ... */ - for (k = 0; k < mtt_entries; ++k) { - pages[*npages] = cur_start_addr + (mtt_size * k); - (*npages)++; - /* - * Be friendly to mlx4_write_mtt() and pass it chunks of - * appropriate size. - */ - if (*npages == PAGE_SIZE / sizeof(u64)) { - err = mlx4_write_mtt(dev->dev, mtt, *start_index, - *npages, pages); - if (err) - return err; - - (*start_index) += *npages; - *npages = 0; - } - } - - return 0; -} - int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, struct ib_umem *umem) { - u64 *pages; - u64 len = 0; - int err = 0; - u64 mtt_size; - u64 cur_start_addr = 0; - u64 mtt_shift; - int start_index = 0; - int npages = 0; - struct scatterlist *sg; - int i; + struct ib_block_iter biter; + int err, i = 0; + u64 addr; - pages = (u64 *) __get_free_page(GFP_KERNEL); - if (!pages) - return -ENOMEM; - - mtt_shift = mtt->page_shift; - mtt_size = 1ULL << mtt_shift; - - for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) { - if (cur_start_addr + len == sg_dma_address(sg)) { - /* still the same block */ - len += sg_dma_len(sg); - continue; - } - /* - * A new block is started ... 
- * If len is malaligned, write an extra mtt entry to cover the - * misaligned area (round up the division) - */ - err = mlx4_ib_umem_write_mtt_block(dev, mtt, mtt_size, - mtt_shift, len, - cur_start_addr, - pages, &start_index, - &npages); + rdma_umem_for_each_dma_block(umem, &biter, BIT(mtt->page_shift)) { + addr = rdma_block_iter_dma_address(&biter); + err = mlx4_write_mtt(dev->dev, mtt, i++, 1, &addr); if (err) - goto out; - - cur_start_addr = sg_dma_address(sg); - len = sg_dma_len(sg); + return err; } - - /* Handle the last block */ - if (len > 0) { - /* - * If len is malaligned, write an extra mtt entry to cover - * the misaligned area (round up the division) - */ - err = mlx4_ib_umem_write_mtt_block(dev, mtt, mtt_size, - mtt_shift, len, - cur_start_addr, pages, - &start_index, &npages); - if (err) - goto out; - } - - if (npages) - err = mlx4_write_mtt(dev->dev, mtt, start_index, npages, pages); - -out: - free_page((unsigned long) pages); - return err; + return 0; } static struct ib_umem *mlx4_get_umem_mr(struct ib_device *device, u64 start, From fbef60de6c753253e1337ea60cf818d079108974 Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Tue, 3 Dec 2024 15:57:11 +0200 Subject: [PATCH 04/63] RDMA/mlx5: Extend ODP statistics with operation count The current ODP counters represent the total number of pages handled, but it is not enough to understand the effectiveness of these operations. Extend the ODP counters to include the number of times page fault and invalidation events were handled. Example for a single page fault handling 512 pages: - page_fault: incremented by 512 (total pages) - page_fault_handled: incremented by 1 (operation count) The same example is applicable for page invalidation too. Previous output: $ rdma stat mr dev rocep8s0f0 mrn 8 page_faults 27 page_invalidations 0 page_prefetch 29 New output: $ rdma stat mr dev rocep8s0f0 mrn 21 page_faults 512 page_faults_handled 1 page_invalidations 0 page_invalidations_handled 0 page_prefetch 51200 Signed-off-by: Chiara Meiohas Reviewed-by: Michael Guralnik Link: https://patch.msgid.link/b18f29ed1392996ade66e9e6c45f018925253f6a.1733234165.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 6 ++++++ drivers/infiniband/hw/mlx5/odp.c | 6 +++--- drivers/infiniband/hw/mlx5/restrack.c | 9 +++++++++ include/rdma/ib_verbs.h | 2 ++ 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index a01b592aa716..974a45c92fbb 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -669,6 +669,12 @@ struct mlx5_ib_mkey { #define mlx5_update_odp_stats(mr, counter_name, value) \ atomic64_add(value, &((mr)->odp_stats.counter_name)) +#define mlx5_update_odp_stats_with_handled(mr, counter_name, value) \ + do { \ + mlx5_update_odp_stats(mr, counter_name, value); \ + atomic64_add(1, &((mr)->odp_stats.counter_name##_handled)); \ + } while (0) + struct mlx5_ib_mr { struct ib_mr ibmr; struct mlx5_ib_mkey mmkey; diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 4b37446758fd..4eb03fc0d302 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -313,7 +313,7 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni, MLX5_IB_UPD_XLT_ZAP | MLX5_IB_UPD_XLT_ATOMIC); - mlx5_update_odp_stats(mr, invalidations, invalidations); + mlx5_update_odp_stats_with_handled(mr, invalidations, invalidations); /* * We are now 
sure that the device will not access the @@ -997,7 +997,7 @@ next_mr: if (ret < 0) goto end; - mlx5_update_odp_stats(mr, faults, ret); + mlx5_update_odp_stats_with_handled(mr, faults, ret); npages += ret; ret = 0; @@ -1529,7 +1529,7 @@ static void mlx5_ib_mr_memory_pfault_handler(struct mlx5_ib_dev *dev, goto err; } - mlx5_update_odp_stats(mr, faults, ret); + mlx5_update_odp_stats_with_handled(mr, faults, ret); mlx5r_deref_odp_mkey(mmkey); if (pfault->memory.flags & MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST) diff --git a/drivers/infiniband/hw/mlx5/restrack.c b/drivers/infiniband/hw/mlx5/restrack.c index affcf8fe943c..67841922c7b8 100644 --- a/drivers/infiniband/hw/mlx5/restrack.c +++ b/drivers/infiniband/hw/mlx5/restrack.c @@ -95,10 +95,19 @@ static int fill_stat_mr_entry(struct sk_buff *msg, struct ib_mr *ibmr) if (rdma_nl_stat_hwcounter_entry(msg, "page_faults", atomic64_read(&mr->odp_stats.faults))) goto err_table; + if (rdma_nl_stat_hwcounter_entry( + msg, "page_faults_handled", + atomic64_read(&mr->odp_stats.faults_handled))) + goto err_table; if (rdma_nl_stat_hwcounter_entry( msg, "page_invalidations", atomic64_read(&mr->odp_stats.invalidations))) goto err_table; + if (rdma_nl_stat_hwcounter_entry( + msg, "page_invalidations_handled", + atomic64_read(&mr->odp_stats.invalidations_handled))) + goto err_table; + if (rdma_nl_stat_hwcounter_entry(msg, "page_prefetch", atomic64_read(&mr->odp_stats.prefetch))) goto err_table; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 3417636da960..6ddd5e3bb884 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2256,7 +2256,9 @@ struct rdma_netdev_alloc_params { struct ib_odp_counters { atomic64_t faults; + atomic64_t faults_handled; atomic64_t invalidations; + atomic64_t invalidations_handled; atomic64_t prefetch; }; From bd96a3935e89486304461a21752f824fc25e0f0b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 30 Nov 2024 13:01:37 +0300 Subject: [PATCH 05/63] rdma/cxgb4: Prevent potential integer overflow on 32bit The "gl->tot_len" variable is controlled by the user. It comes from process_responses(). On 32bit systems, the "gl->tot_len + sizeof(struct cpl_pass_accept_req) + sizeof(struct rss_header)" addition could have an integer wrapping bug. Use size_add() to prevent this. Fixes: 1cab775c3e75 ("RDMA/cxgb4: Fix LE hash collision bug for passive open connection") Link: https://patch.msgid.link/r/86b404e1-4a75-4a35-a34e-e3054fa554c7@stanley.mountain Signed-off-by: Dan Carpenter Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/device.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index 80970a1738f8..034b85c42255 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -1114,8 +1114,10 @@ static inline struct sk_buff *copy_gl_to_skb_pkt(const struct pkt_gl *gl, * The math here assumes sizeof cpl_pass_accept_req >= sizeof * cpl_rx_pkt. 
*/ - skb = alloc_skb(gl->tot_len + sizeof(struct cpl_pass_accept_req) + - sizeof(struct rss_header) - pktshift, GFP_ATOMIC); + skb = alloc_skb(size_add(gl->tot_len, + sizeof(struct cpl_pass_accept_req) + + sizeof(struct rss_header)) - pktshift, + GFP_ATOMIC); if (unlikely(!skb)) return NULL; From a883e71345a0101ff33be23a8d13f112ec4defb5 Mon Sep 17 00:00:00 2001 From: Boshi Yu Date: Wed, 11 Dec 2024 10:09:01 +0800 Subject: [PATCH 06/63] RDMA/erdma: Probe the erdma RoCEv2 device Currently, the erdma driver supports both the iWARP and RoCEv2 protocols. The erdma driver reads the ERDMA_REGS_DEV_PROTO_REG register to identify the protocol used by the erdma device. Since each protocol requires different ib_device_ops, we introduce the erdma_device_ops_iwarp and erdma_device_ops_rocev2 for iWARP and RoCEv2 protocols, respectively. Signed-off-by: Boshi Yu Link: https://patch.msgid.link/20241211020930.68833-2-boshiyu@linux.alibaba.com Reviewed-by: Cheng Xu Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/Kconfig | 2 +- drivers/infiniband/hw/erdma/erdma.h | 3 +- drivers/infiniband/hw/erdma/erdma_hw.h | 7 +++++ drivers/infiniband/hw/erdma/erdma_main.c | 34 +++++++++++++++++------ drivers/infiniband/hw/erdma/erdma_verbs.c | 16 ++++++++++- drivers/infiniband/hw/erdma/erdma_verbs.h | 12 ++++++++ 6 files changed, 62 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/erdma/Kconfig b/drivers/infiniband/hw/erdma/Kconfig index 169038e3ceb1..267fc1f3c42a 100644 --- a/drivers/infiniband/hw/erdma/Kconfig +++ b/drivers/infiniband/hw/erdma/Kconfig @@ -5,7 +5,7 @@ config INFINIBAND_ERDMA depends on INFINIBAND_ADDR_TRANS depends on INFINIBAND_USER_ACCESS help - This is a RDMA/iWarp driver for Alibaba Elastic RDMA Adapter(ERDMA), + This is a RDMA driver for Alibaba Elastic RDMA Adapter(ERDMA), which supports RDMA features in Alibaba cloud environment. To compile this driver as module, choose M here. The module will be diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h index 3c166359448d..ad4dc1a4bdc7 100644 --- a/drivers/infiniband/hw/erdma/erdma.h +++ b/drivers/infiniband/hw/erdma/erdma.h @@ -16,7 +16,7 @@ #include "erdma_hw.h" #define DRV_MODULE_NAME "erdma" -#define ERDMA_NODE_DESC "Elastic RDMA(iWARP) stack" +#define ERDMA_NODE_DESC "Elastic RDMA Adapter stack" struct erdma_eq { void *qbuf; @@ -215,6 +215,7 @@ struct erdma_dev { struct dma_pool *db_pool; struct dma_pool *resp_pool; + enum erdma_proto_type proto; }; static inline void *get_queue_entry(void *qbuf, u32 idx, u32 depth, u32 shift) diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index 05978f3b1475..970b392d4fb4 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -21,8 +21,15 @@ #define ERDMA_NUM_MSIX_VEC 32U #define ERDMA_MSIX_VECTOR_CMDQ 0 +/* erdma device protocol type */ +enum erdma_proto_type { + ERDMA_PROTO_IWARP = 0, + ERDMA_PROTO_ROCEV2 = 1, +}; + /* PCIe Bar0 Registers. 
*/ #define ERDMA_REGS_VERSION_REG 0x0 +#define ERDMA_REGS_DEV_PROTO_REG 0xC #define ERDMA_REGS_DEV_CTRL_REG 0x10 #define ERDMA_REGS_DEV_ST_REG 0x14 #define ERDMA_REGS_NETDEV_MAC_L_REG 0x18 diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index 62f497a71004..cf97bb79e595 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -172,6 +172,8 @@ static int erdma_device_init(struct erdma_dev *dev, struct pci_dev *pdev) { int ret; + dev->proto = erdma_reg_read32(dev, ERDMA_REGS_DEV_PROTO_REG); + dev->resp_pool = dma_pool_create("erdma_resp_pool", &pdev->dev, ERDMA_HW_RESP_SIZE, ERDMA_HW_RESP_SIZE, 0); @@ -474,6 +476,21 @@ static void erdma_res_cb_free(struct erdma_dev *dev) bitmap_free(dev->res_cb[i].bitmap); } +static const struct ib_device_ops erdma_device_ops_rocev2 = { + .get_link_layer = erdma_get_link_layer, +}; + +static const struct ib_device_ops erdma_device_ops_iwarp = { + .iw_accept = erdma_accept, + .iw_add_ref = erdma_qp_get_ref, + .iw_connect = erdma_connect, + .iw_create_listen = erdma_create_listen, + .iw_destroy_listen = erdma_destroy_listen, + .iw_get_qp = erdma_get_ibqp, + .iw_reject = erdma_reject, + .iw_rem_ref = erdma_qp_put_ref, +}; + static const struct ib_device_ops erdma_device_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_ERDMA, @@ -494,14 +511,6 @@ static const struct ib_device_ops erdma_device_ops = { .get_dma_mr = erdma_get_dma_mr, .get_hw_stats = erdma_get_hw_stats, .get_port_immutable = erdma_get_port_immutable, - .iw_accept = erdma_accept, - .iw_add_ref = erdma_qp_get_ref, - .iw_connect = erdma_connect, - .iw_create_listen = erdma_create_listen, - .iw_destroy_listen = erdma_destroy_listen, - .iw_get_qp = erdma_get_ibqp, - .iw_reject = erdma_reject, - .iw_rem_ref = erdma_qp_put_ref, .map_mr_sg = erdma_map_mr_sg, .mmap = erdma_mmap, .mmap_free = erdma_mmap_free, @@ -537,7 +546,14 @@ static int erdma_ib_device_add(struct pci_dev *pdev) if (ret) return ret; - ibdev->node_type = RDMA_NODE_RNIC; + if (erdma_device_iwarp(dev)) { + ibdev->node_type = RDMA_NODE_RNIC; + ib_set_device_ops(ibdev, &erdma_device_ops_iwarp); + } else { + ibdev->node_type = RDMA_NODE_IB_CA; + ib_set_device_ops(ibdev, &erdma_device_ops_rocev2); + } + memcpy(ibdev->node_desc, ERDMA_NODE_DESC, sizeof(ERDMA_NODE_DESC)); /* diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index 51d619edb6c5..3b7e55515cfd 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -395,8 +395,17 @@ out: int erdma_get_port_immutable(struct ib_device *ibdev, u32 port, struct ib_port_immutable *port_immutable) { + struct erdma_dev *dev = to_edev(ibdev); + + if (erdma_device_iwarp(dev)) { + port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; + } else { + port_immutable->core_cap_flags = + RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + port_immutable->max_mad_size = IB_MGMT_MAD_SIZE; + } + port_immutable->gid_tbl_len = 1; - port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; return 0; } @@ -1839,3 +1848,8 @@ int erdma_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, return stats->num_counters; } + +enum rdma_link_layer erdma_get_link_layer(struct ib_device *ibdev, u32 port_num) +{ + return IB_LINK_LAYER_ETHERNET; +} diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index c998acd39a78..90e2b35a0973 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ 
b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -291,6 +291,16 @@ int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, void erdma_qp_llp_close(struct erdma_qp *qp); void erdma_qp_cm_drop(struct erdma_qp *qp); +static inline bool erdma_device_iwarp(struct erdma_dev *dev) +{ + return dev->proto == ERDMA_PROTO_IWARP; +} + +static inline bool erdma_device_rocev2(struct erdma_dev *dev) +{ + return dev->proto == ERDMA_PROTO_ROCEV2; +} + static inline struct erdma_ucontext *to_ectx(struct ib_ucontext *ibctx) { return container_of(ibctx, struct erdma_ucontext, ibucontext); @@ -370,5 +380,7 @@ struct rdma_hw_stats *erdma_alloc_hw_port_stats(struct ib_device *device, u32 port_num); int erdma_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, u32 port, int index); +enum rdma_link_layer erdma_get_link_layer(struct ib_device *ibdev, + u32 port_num); #endif From 6edc15abc256f676ae44ac2ddc914567859bd1a7 Mon Sep 17 00:00:00 2001 From: Boshi Yu Date: Wed, 11 Dec 2024 10:09:02 +0800 Subject: [PATCH 07/63] RDMA/erdma: Add GID table management interfaces The erdma_add_gid() interface inserts a GID entry at the specified index. The erdma_del_gid() interface deletes the GID entry at the specified index. Additionally, programs can invoke the erdma_query_port() and erdma_get_port_immutable() interfaces to query the GID table length. Signed-off-by: Boshi Yu Link: https://patch.msgid.link/20241211020930.68833-3-boshiyu@linux.alibaba.com Reviewed-by: Cheng Xu Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma.h | 1 + drivers/infiniband/hw/erdma/erdma_hw.h | 28 +++++++++++- drivers/infiniband/hw/erdma/erdma_main.c | 3 ++ drivers/infiniband/hw/erdma/erdma_verbs.c | 56 +++++++++++++++++++++-- drivers/infiniband/hw/erdma/erdma_verbs.h | 12 +++++ 5 files changed, 96 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h index ad4dc1a4bdc7..42dabf674f5d 100644 --- a/drivers/infiniband/hw/erdma/erdma.h +++ b/drivers/infiniband/hw/erdma/erdma.h @@ -148,6 +148,7 @@ struct erdma_devattr { u32 max_mr; u32 max_pd; u32 max_mw; + u32 max_gid; u32 local_dma_key; }; diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index 970b392d4fb4..7e03c5f97501 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -21,6 +21,9 @@ #define ERDMA_NUM_MSIX_VEC 32U #define ERDMA_MSIX_VECTOR_CMDQ 0 +/* RoCEv2 related */ +#define ERDMA_ROCEV2_GID_SIZE 16 + /* erdma device protocol type */ enum erdma_proto_type { ERDMA_PROTO_IWARP = 0, @@ -143,7 +146,8 @@ enum CMDQ_RDMA_OPCODE { CMDQ_OPCODE_DESTROY_CQ = 5, CMDQ_OPCODE_REFLUSH = 6, CMDQ_OPCODE_REG_MR = 8, - CMDQ_OPCODE_DEREG_MR = 9 + CMDQ_OPCODE_DEREG_MR = 9, + CMDQ_OPCODE_SET_GID = 14, }; enum CMDQ_COMMON_OPCODE { @@ -401,7 +405,29 @@ struct erdma_cmdq_query_stats_resp { u64 rx_pps_meter_drop_packets_cnt; }; +enum erdma_network_type { + ERDMA_NETWORK_TYPE_IPV4 = 0, + ERDMA_NETWORK_TYPE_IPV6 = 1, +}; + +enum erdma_set_gid_op { + ERDMA_SET_GID_OP_ADD = 0, + ERDMA_SET_GID_OP_DEL = 1, +}; + +/* set gid cfg */ +#define ERDMA_CMD_SET_GID_SGID_IDX_MASK GENMASK(15, 0) +#define ERDMA_CMD_SET_GID_NTYPE_MASK BIT(16) +#define ERDMA_CMD_SET_GID_OP_MASK BIT(31) + +struct erdma_cmdq_set_gid_req { + u64 hdr; + u32 cfg; + u8 gid[ERDMA_ROCEV2_GID_SIZE]; +}; + /* cap qword 0 definition */ +#define ERDMA_CMD_DEV_CAP_MAX_GID_MASK GENMASK_ULL(51, 48) #define ERDMA_CMD_DEV_CAP_MAX_CQE_MASK GENMASK_ULL(47, 40) #define 
ERDMA_CMD_DEV_CAP_FLAGS_MASK GENMASK_ULL(31, 24) #define ERDMA_CMD_DEV_CAP_MAX_RECV_WR_MASK GENMASK_ULL(23, 16) diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index cf97bb79e595..77440324b7e7 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -400,6 +400,7 @@ static int erdma_dev_attrs_init(struct erdma_dev *dev) dev->attrs.max_mr_size = 1ULL << ERDMA_GET_CAP(MAX_MR_SIZE, cap0); dev->attrs.max_mw = 1 << ERDMA_GET_CAP(MAX_MW, cap1); dev->attrs.max_recv_wr = 1 << ERDMA_GET_CAP(MAX_RECV_WR, cap0); + dev->attrs.max_gid = 1 << ERDMA_GET_CAP(MAX_GID, cap0); dev->attrs.local_dma_key = ERDMA_GET_CAP(DMA_LOCAL_KEY, cap1); dev->attrs.cc = ERDMA_GET_CAP(DEFAULT_CC, cap1); dev->attrs.max_qp = ERDMA_NQP_PER_QBLOCK * ERDMA_GET_CAP(QBLOCK, cap1); @@ -478,6 +479,8 @@ static void erdma_res_cb_free(struct erdma_dev *dev) static const struct ib_device_ops erdma_device_ops_rocev2 = { .get_link_layer = erdma_get_link_layer, + .add_gid = erdma_add_gid, + .del_gid = erdma_del_gid, }; static const struct ib_device_ops erdma_device_ops_iwarp = { diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index 3b7e55515cfd..9944eed584ec 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -367,7 +367,13 @@ int erdma_query_port(struct ib_device *ibdev, u32 port, memset(attr, 0, sizeof(*attr)); - attr->gid_tbl_len = 1; + if (erdma_device_iwarp(dev)) { + attr->gid_tbl_len = 1; + } else { + attr->gid_tbl_len = dev->attrs.max_gid; + attr->ip_gids = true; + } + attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP; attr->max_msg_sz = -1; @@ -399,14 +405,14 @@ int erdma_get_port_immutable(struct ib_device *ibdev, u32 port, if (erdma_device_iwarp(dev)) { port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; + port_immutable->gid_tbl_len = 1; } else { port_immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; port_immutable->max_mad_size = IB_MGMT_MAD_SIZE; + port_immutable->gid_tbl_len = dev->attrs.max_gid; } - port_immutable->gid_tbl_len = 1; - return 0; } @@ -1853,3 +1859,47 @@ enum rdma_link_layer erdma_get_link_layer(struct ib_device *ibdev, u32 port_num) { return IB_LINK_LAYER_ETHERNET; } + +static int erdma_set_gid(struct erdma_dev *dev, u8 op, u32 idx, + const union ib_gid *gid) +{ + struct erdma_cmdq_set_gid_req req; + u8 ntype; + + req.cfg = FIELD_PREP(ERDMA_CMD_SET_GID_SGID_IDX_MASK, idx) | + FIELD_PREP(ERDMA_CMD_SET_GID_OP_MASK, op); + + if (op == ERDMA_SET_GID_OP_ADD) { + if (ipv6_addr_v4mapped((struct in6_addr *)gid)) + ntype = ERDMA_NETWORK_TYPE_IPV4; + else + ntype = ERDMA_NETWORK_TYPE_IPV6; + + req.cfg |= FIELD_PREP(ERDMA_CMD_SET_GID_NTYPE_MASK, ntype); + + memcpy(&req.gid, gid, ERDMA_ROCEV2_GID_SIZE); + } + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_SET_GID); + return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); +} + +int erdma_add_gid(const struct ib_gid_attr *attr, void **context) +{ + struct erdma_dev *dev = to_edev(attr->device); + int ret; + + ret = erdma_check_gid_attr(attr); + if (ret) + return ret; + + return erdma_set_gid(dev, ERDMA_SET_GID_OP_ADD, attr->index, + &attr->gid); +} + +int erdma_del_gid(const struct ib_gid_attr *attr, void **context) +{ + return erdma_set_gid(to_edev(attr->device), ERDMA_SET_GID_OP_DEL, + attr->index, NULL); +} diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h 
index 90e2b35a0973..23cfeaf79eaa 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -326,6 +326,16 @@ static inline struct erdma_cq *to_ecq(struct ib_cq *ibcq) return container_of(ibcq, struct erdma_cq, ibcq); } +static inline int erdma_check_gid_attr(const struct ib_gid_attr *attr) +{ + u8 ntype = rdma_gid_attr_network_type(attr); + + if (ntype != RDMA_NETWORK_IPV4 && ntype != RDMA_NETWORK_IPV6) + return -EINVAL; + + return 0; +} + static inline struct erdma_user_mmap_entry * to_emmap(struct rdma_user_mmap_entry *ibmmap) { @@ -382,5 +392,7 @@ int erdma_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, u32 port, int index); enum rdma_link_layer erdma_get_link_layer(struct ib_device *ibdev, u32 port_num); +int erdma_add_gid(const struct ib_gid_attr *attr, void **context); +int erdma_del_gid(const struct ib_gid_attr *attr, void **context); #endif From 14bcf7354a0ed2004da85b4b03afe44effd5e445 Mon Sep 17 00:00:00 2001 From: Boshi Yu Date: Wed, 11 Dec 2024 10:09:03 +0800 Subject: [PATCH 08/63] RDMA/erdma: Add the erdma_query_pkey() interface The erdma_query_pkey() interface queries the PKey at the specified index. Currently, erdma supports only one partition and returns the default PKey for each query. Besides, the correct length of the PKey table can be obtained by calling the erdma_query_port() and erdma_get_port_immutable() interfaces. Signed-off-by: Boshi Yu Link: https://patch.msgid.link/20241211020930.68833-4-boshiyu@linux.alibaba.com Reviewed-by: Cheng Xu Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma_hw.h | 2 ++ drivers/infiniband/hw/erdma/erdma_main.c | 1 + drivers/infiniband/hw/erdma/erdma_verbs.c | 14 ++++++++++++++ drivers/infiniband/hw/erdma/erdma_verbs.h | 1 + 4 files changed, 18 insertions(+) diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index 7e03c5f97501..f7f9dcac3ab0 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -23,6 +23,8 @@ /* RoCEv2 related */ #define ERDMA_ROCEV2_GID_SIZE 16 +#define ERDMA_MAX_PKEYS 1 +#define ERDMA_DEFAULT_PKEY 0xFFFF /* erdma device protocol type */ enum erdma_proto_type { diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index 77440324b7e7..b9d0ad77436a 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -481,6 +481,7 @@ static const struct ib_device_ops erdma_device_ops_rocev2 = { .get_link_layer = erdma_get_link_layer, .add_gid = erdma_add_gid, .del_gid = erdma_del_gid, + .query_pkey = erdma_query_pkey, }; static const struct ib_device_ops erdma_device_ops_iwarp = { diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index 9944eed584ec..03ea52bb233e 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -336,6 +336,9 @@ int erdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, attr->max_fast_reg_page_list_len = ERDMA_MAX_FRMR_PA; attr->page_size_cap = ERDMA_PAGE_SIZE_SUPPORT; + if (erdma_device_rocev2(dev)) + attr->max_pkeys = ERDMA_MAX_PKEYS; + if (dev->attrs.cap_flags & ERDMA_DEV_CAP_FLAGS_ATOMIC) attr->atomic_cap = IB_ATOMIC_GLOB; @@ -372,6 +375,7 @@ int erdma_query_port(struct ib_device *ibdev, u32 port, } else { attr->gid_tbl_len = dev->attrs.max_gid; attr->ip_gids = true; + attr->pkey_tbl_len = ERDMA_MAX_PKEYS; } attr->port_cap_flags = 
IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP; @@ -411,6 +415,7 @@ int erdma_get_port_immutable(struct ib_device *ibdev, u32 port, RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; port_immutable->max_mad_size = IB_MGMT_MAD_SIZE; port_immutable->gid_tbl_len = dev->attrs.max_gid; + port_immutable->pkey_tbl_len = ERDMA_MAX_PKEYS; } return 0; @@ -1903,3 +1908,12 @@ int erdma_del_gid(const struct ib_gid_attr *attr, void **context) return erdma_set_gid(to_edev(attr->device), ERDMA_SET_GID_OP_DEL, attr->index, NULL); } + +int erdma_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey) +{ + if (index >= ERDMA_MAX_PKEYS) + return -EINVAL; + + *pkey = ERDMA_DEFAULT_PKEY; + return 0; +} diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index 23cfeaf79eaa..1ae6ba56f597 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -394,5 +394,6 @@ enum rdma_link_layer erdma_get_link_layer(struct ib_device *ibdev, u32 port_num); int erdma_add_gid(const struct ib_gid_attr *attr, void **context); int erdma_del_gid(const struct ib_gid_attr *attr, void **context); +int erdma_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey); #endif From 41dcaf48ff9e31d1441b6a74ed488360aad096d4 Mon Sep 17 00:00:00 2001 From: Boshi Yu Date: Wed, 11 Dec 2024 10:09:04 +0800 Subject: [PATCH 09/63] RDMA/erdma: Add address handle implementation The address handle contains the necessary information to transmit messages to a remote peer in the RoCEv2 protocol. This commit implements the erdma_create_ah(), erdma_destroy_ah(), and erdma_query_ah() interfaces, which are used to create, destroy, and query an address handle, respectively. Signed-off-by: Boshi Yu Link: https://patch.msgid.link/20241211020930.68833-5-boshiyu@linux.alibaba.com Reviewed-by: Cheng Xu Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma.h | 4 +- drivers/infiniband/hw/erdma/erdma_hw.h | 34 +++++++ drivers/infiniband/hw/erdma/erdma_main.c | 4 + drivers/infiniband/hw/erdma/erdma_verbs.c | 114 +++++++++++++++++++++- drivers/infiniband/hw/erdma/erdma_verbs.h | 28 ++++++ 5 files changed, 182 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h index 42dabf674f5d..4f840d8e3beb 100644 --- a/drivers/infiniband/hw/erdma/erdma.h +++ b/drivers/infiniband/hw/erdma/erdma.h @@ -149,6 +149,7 @@ struct erdma_devattr { u32 max_pd; u32 max_mw; u32 max_gid; + u32 max_ah; u32 local_dma_key; }; @@ -178,7 +179,8 @@ struct erdma_resource_cb { enum { ERDMA_RES_TYPE_PD = 0, ERDMA_RES_TYPE_STAG_IDX = 1, - ERDMA_RES_CNT = 2, + ERDMA_RES_TYPE_AH = 2, + ERDMA_RES_CNT = 3, }; struct erdma_dev { diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index f7f9dcac3ab0..64d856494359 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -9,6 +9,7 @@ #include #include +#include /* PCIe device related definition. 
*/ #define ERDMA_PCI_WIDTH 64 @@ -150,6 +151,8 @@ enum CMDQ_RDMA_OPCODE { CMDQ_OPCODE_REG_MR = 8, CMDQ_OPCODE_DEREG_MR = 9, CMDQ_OPCODE_SET_GID = 14, + CMDQ_OPCODE_CREATE_AH = 15, + CMDQ_OPCODE_DESTROY_AH = 16, }; enum CMDQ_COMMON_OPCODE { @@ -297,6 +300,36 @@ struct erdma_cmdq_dereg_mr_req { u32 cfg; }; +/* create_av cfg0 */ +#define ERDMA_CMD_CREATE_AV_FL_MASK GENMASK(19, 0) +#define ERDMA_CMD_CREATE_AV_NTYPE_MASK BIT(20) + +struct erdma_av_cfg { + u32 cfg0; + u8 traffic_class; + u8 hop_limit; + u8 sl; + u8 rsvd; + u16 udp_sport; + u16 sgid_index; + u8 dmac[ETH_ALEN]; + u8 padding[2]; + u8 dgid[ERDMA_ROCEV2_GID_SIZE]; +}; + +struct erdma_cmdq_create_ah_req { + u64 hdr; + u32 pdn; + u32 ahn; + struct erdma_av_cfg av_cfg; +}; + +struct erdma_cmdq_destroy_ah_req { + u64 hdr; + u32 pdn; + u32 ahn; +}; + /* modify qp cfg */ #define ERDMA_CMD_MODIFY_QP_STATE_MASK GENMASK(31, 24) #define ERDMA_CMD_MODIFY_QP_CC_MASK GENMASK(23, 20) @@ -433,6 +466,7 @@ struct erdma_cmdq_set_gid_req { #define ERDMA_CMD_DEV_CAP_MAX_CQE_MASK GENMASK_ULL(47, 40) #define ERDMA_CMD_DEV_CAP_FLAGS_MASK GENMASK_ULL(31, 24) #define ERDMA_CMD_DEV_CAP_MAX_RECV_WR_MASK GENMASK_ULL(23, 16) +#define ERDMA_CMD_DEV_CAP_MAX_AH_MASK GENMASK_ULL(15, 8) #define ERDMA_CMD_DEV_CAP_MAX_MR_SIZE_MASK GENMASK_ULL(7, 0) /* cap qword 1 definition */ diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index b9d0ad77436a..d632c09c9acd 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -401,6 +401,7 @@ static int erdma_dev_attrs_init(struct erdma_dev *dev) dev->attrs.max_mw = 1 << ERDMA_GET_CAP(MAX_MW, cap1); dev->attrs.max_recv_wr = 1 << ERDMA_GET_CAP(MAX_RECV_WR, cap0); dev->attrs.max_gid = 1 << ERDMA_GET_CAP(MAX_GID, cap0); + dev->attrs.max_ah = 1 << ERDMA_GET_CAP(MAX_AH, cap0); dev->attrs.local_dma_key = ERDMA_GET_CAP(DMA_LOCAL_KEY, cap1); dev->attrs.cc = ERDMA_GET_CAP(DEFAULT_CC, cap1); dev->attrs.max_qp = ERDMA_NQP_PER_QBLOCK * ERDMA_GET_CAP(QBLOCK, cap1); @@ -418,6 +419,7 @@ static int erdma_dev_attrs_init(struct erdma_dev *dev) dev->res_cb[ERDMA_RES_TYPE_PD].max_cap = ERDMA_MAX_PD; dev->res_cb[ERDMA_RES_TYPE_STAG_IDX].max_cap = dev->attrs.max_mr; + dev->res_cb[ERDMA_RES_TYPE_AH].max_cap = dev->attrs.max_ah; erdma_cmdq_build_reqhdr(&req_hdr, CMDQ_SUBMOD_COMMON, CMDQ_OPCODE_QUERY_FW_INFO); @@ -482,6 +484,8 @@ static const struct ib_device_ops erdma_device_ops_rocev2 = { .add_gid = erdma_add_gid, .del_gid = erdma_del_gid, .query_pkey = erdma_query_pkey, + .create_ah = erdma_create_ah, + .destroy_ah = erdma_destroy_ah, }; static const struct ib_device_ops erdma_device_ops_iwarp = { diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index 03ea52bb233e..19483667c989 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -336,8 +336,10 @@ int erdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, attr->max_fast_reg_page_list_len = ERDMA_MAX_FRMR_PA; attr->page_size_cap = ERDMA_PAGE_SIZE_SUPPORT; - if (erdma_device_rocev2(dev)) + if (erdma_device_rocev2(dev)) { attr->max_pkeys = ERDMA_MAX_PKEYS; + attr->max_ah = dev->attrs.max_ah; + } if (dev->attrs.cap_flags & ERDMA_DEV_CAP_FLAGS_ATOMIC) attr->atomic_cap = IB_ATOMIC_GLOB; @@ -1917,3 +1919,113 @@ int erdma_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey) *pkey = ERDMA_DEFAULT_PKEY; return 0; } + +int erdma_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr 
*init_attr, + struct ib_udata *udata) +{ + const struct ib_global_route *grh = + rdma_ah_read_grh(init_attr->ah_attr); + struct erdma_dev *dev = to_edev(ibah->device); + struct erdma_pd *pd = to_epd(ibah->pd); + struct erdma_ah *ah = to_eah(ibah); + struct erdma_cmdq_create_ah_req req; + u32 udp_sport; + int ret; + + ret = erdma_check_gid_attr(grh->sgid_attr); + if (ret) + return ret; + + ret = erdma_alloc_idx(&dev->res_cb[ERDMA_RES_TYPE_AH]); + if (ret < 0) + return ret; + + ah->ahn = ret; + + if (grh->flow_label) + udp_sport = rdma_flow_label_to_udp_sport(grh->flow_label); + else + udp_sport = + IB_ROCE_UDP_ENCAP_VALID_PORT_MIN + (ah->ahn & 0x3FFF); + + ah->av.port = rdma_ah_get_port_num(init_attr->ah_attr); + ah->av.sgid_index = grh->sgid_index; + ah->av.hop_limit = grh->hop_limit; + ah->av.traffic_class = grh->traffic_class; + ah->av.sl = rdma_ah_get_sl(init_attr->ah_attr); + ah->av.flow_label = grh->flow_label; + ah->av.udp_sport = udp_sport; + + ether_addr_copy(ah->av.dmac, init_attr->ah_attr->roce.dmac); + memcpy(ah->av.dgid, grh->dgid.raw, ERDMA_ROCEV2_GID_SIZE); + + if (ipv6_addr_v4mapped((struct in6_addr *)&grh->dgid)) + ah->av.ntype = ERDMA_NETWORK_TYPE_IPV4; + else + ah->av.ntype = ERDMA_NETWORK_TYPE_IPV6; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_CREATE_AH); + + req.pdn = pd->pdn; + req.ahn = ah->ahn; + + req.av_cfg.cfg0 = + FIELD_PREP(ERDMA_CMD_CREATE_AV_FL_MASK, ah->av.flow_label) | + FIELD_PREP(ERDMA_CMD_CREATE_AV_NTYPE_MASK, ah->av.ntype); + req.av_cfg.traffic_class = ah->av.traffic_class; + req.av_cfg.hop_limit = ah->av.hop_limit; + req.av_cfg.sl = ah->av.sl; + req.av_cfg.udp_sport = ah->av.udp_sport; + req.av_cfg.sgid_index = ah->av.sgid_index; + ether_addr_copy(req.av_cfg.dmac, ah->av.dmac); + memcpy(req.av_cfg.dgid, ah->av.dgid, ERDMA_ROCEV2_GID_SIZE); + + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + if (ret) { + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_AH], ah->ahn); + return ret; + } + + return 0; +} + +int erdma_destroy_ah(struct ib_ah *ibah, u32 flags) +{ + struct erdma_dev *dev = to_edev(ibah->device); + struct erdma_pd *pd = to_epd(ibah->pd); + struct erdma_ah *ah = to_eah(ibah); + struct erdma_cmdq_destroy_ah_req req; + int ret; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_DESTROY_AH); + + req.pdn = pd->pdn; + req.ahn = ah->ahn; + + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + if (ret) + return ret; + + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_AH], ah->ahn); + + return 0; +} + +int erdma_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr) +{ + struct erdma_ah *ah = to_eah(ibah); + + memset(ah_attr, 0, sizeof(*ah_attr)); + + ah_attr->type = RDMA_AH_ATTR_TYPE_ROCE; + rdma_ah_set_sl(ah_attr, ah->av.sl); + rdma_ah_set_port_num(ah_attr, ah->av.port); + rdma_ah_set_ah_flags(ah_attr, IB_AH_GRH); + rdma_ah_set_grh(ah_attr, NULL, ah->av.flow_label, ah->av.sgid_index, + ah->av.hop_limit, ah->av.traffic_class); + rdma_ah_set_dgid_raw(ah_attr, ah->av.dgid); + + return 0; +} diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index 1ae6ba56f597..78a6c35cf1a5 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -136,6 +136,25 @@ struct erdma_user_dbrecords_page { int refcnt; }; +struct erdma_av { + u8 port; + u8 hop_limit; + u8 traffic_class; + u8 sl; + u8 sgid_index; + u16 udp_sport; + u32 flow_label; + u8 dmac[ETH_ALEN]; + u8 dgid[ERDMA_ROCEV2_GID_SIZE]; + 
enum erdma_network_type ntype; +}; + +struct erdma_ah { + struct ib_ah ibah; + struct erdma_av av; + u32 ahn; +}; + struct erdma_uqp { struct erdma_mem sq_mem; struct erdma_mem rq_mem; @@ -326,6 +345,11 @@ static inline struct erdma_cq *to_ecq(struct ib_cq *ibcq) return container_of(ibcq, struct erdma_cq, ibcq); } +static inline struct erdma_ah *to_eah(struct ib_ah *ibah) +{ + return container_of(ibah, struct erdma_ah, ibah); +} + static inline int erdma_check_gid_attr(const struct ib_gid_attr *attr) { u8 ntype = rdma_gid_attr_network_type(attr); @@ -395,5 +419,9 @@ enum rdma_link_layer erdma_get_link_layer(struct ib_device *ibdev, int erdma_add_gid(const struct ib_gid_attr *attr, void **context); int erdma_del_gid(const struct ib_gid_attr *attr, void **context); int erdma_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey); +int erdma_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, + struct ib_udata *udata); +int erdma_destroy_ah(struct ib_ah *ibah, u32 flags); +int erdma_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr); #endif From 9566cf6a7742f2e2bb8044b4624a7a1e66b6a549 Mon Sep 17 00:00:00 2001 From: Boshi Yu Date: Wed, 11 Dec 2024 10:09:05 +0800 Subject: [PATCH 10/63] RDMA/erdma: Add erdma_modify_qp_rocev2() interface The QP state machines in the RoCEv2 and iWARP protocols are different. To handle these differences for the erdma RoCEv2 device, we provide the erdma_modify_qp_rocev2() interface, which transitions the QP state and modifies QP attributes accordingly. Signed-off-by: Boshi Yu Link: https://patch.msgid.link/20241211020930.68833-6-boshiyu@linux.alibaba.com Reviewed-by: Cheng Xu Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma_cq.c | 45 +++++ drivers/infiniband/hw/erdma/erdma_hw.h | 14 ++ drivers/infiniband/hw/erdma/erdma_main.c | 3 +- drivers/infiniband/hw/erdma/erdma_qp.c | 92 +++++++++ drivers/infiniband/hw/erdma/erdma_verbs.c | 221 +++++++++++++++++++--- drivers/infiniband/hw/erdma/erdma_verbs.h | 44 +++++ 6 files changed, 388 insertions(+), 31 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma_cq.c b/drivers/infiniband/hw/erdma/erdma_cq.c index 70f89f0162aa..eada882472a3 100644 --- a/drivers/infiniband/hw/erdma/erdma_cq.c +++ b/drivers/infiniband/hw/erdma/erdma_cq.c @@ -201,3 +201,48 @@ int erdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) return npolled; } + +void erdma_remove_cqes_of_qp(struct ib_cq *ibcq, u32 qpn) +{ + struct erdma_cq *cq = to_ecq(ibcq); + struct erdma_cqe *cqe, *dst_cqe; + u32 prev_cq_ci, cur_cq_ci; + u32 ncqe = 0, nqp_cqe = 0; + unsigned long flags; + u8 owner; + + spin_lock_irqsave(&cq->kern_cq.lock, flags); + + prev_cq_ci = cq->kern_cq.ci; + + while (ncqe < cq->depth && (cqe = get_next_valid_cqe(cq)) != NULL) { + ++cq->kern_cq.ci; + ++ncqe; + } + + while (ncqe > 0) { + cur_cq_ci = prev_cq_ci + ncqe - 1; + cqe = get_queue_entry(cq->kern_cq.qbuf, cur_cq_ci, cq->depth, + CQE_SHIFT); + + if (be32_to_cpu(cqe->qpn) == qpn) { + ++nqp_cqe; + } else if (nqp_cqe) { + dst_cqe = get_queue_entry(cq->kern_cq.qbuf, + cur_cq_ci + nqp_cqe, + cq->depth, CQE_SHIFT); + owner = FIELD_GET(ERDMA_CQE_HDR_OWNER_MASK, + be32_to_cpu(dst_cqe->hdr)); + cqe->hdr = cpu_to_be32( + (be32_to_cpu(cqe->hdr) & + ~ERDMA_CQE_HDR_OWNER_MASK) | + FIELD_PREP(ERDMA_CQE_HDR_OWNER_MASK, owner)); + memcpy(dst_cqe, cqe, sizeof(*cqe)); + } + + --ncqe; + } + + cq->kern_cq.ci = prev_cq_ci + nqp_cqe; + spin_unlock_irqrestore(&cq->kern_cq.lock, flags); +} diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h 
b/drivers/infiniband/hw/erdma/erdma_hw.h index 64d856494359..b5c1aca71144 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -347,6 +347,20 @@ struct erdma_cmdq_modify_qp_req { u32 recv_nxt; }; +/* modify qp cfg1 for roce device */ +#define ERDMA_CMD_MODIFY_QP_DQPN_MASK GENMASK(19, 0) + +struct erdma_cmdq_mod_qp_req_rocev2 { + u64 hdr; + u32 cfg0; + u32 cfg1; + u32 attr_mask; + u32 qkey; + u32 rq_psn; + u32 sq_psn; + struct erdma_av_cfg av_cfg; +}; + /* create qp cfg0 */ #define ERDMA_CMD_CREATE_QP_SQ_DEPTH_MASK GENMASK(31, 20) #define ERDMA_CMD_CREATE_QP_QPN_MASK GENMASK(19, 0) diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index d632c09c9acd..2fca163b1744 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -486,6 +486,7 @@ static const struct ib_device_ops erdma_device_ops_rocev2 = { .query_pkey = erdma_query_pkey, .create_ah = erdma_create_ah, .destroy_ah = erdma_destroy_ah, + .modify_qp = erdma_modify_qp_rocev2, }; static const struct ib_device_ops erdma_device_ops_iwarp = { @@ -497,6 +498,7 @@ static const struct ib_device_ops erdma_device_ops_iwarp = { .iw_get_qp = erdma_get_ibqp, .iw_reject = erdma_reject, .iw_rem_ref = erdma_qp_put_ref, + .modify_qp = erdma_modify_qp, }; static const struct ib_device_ops erdma_device_ops = { @@ -522,7 +524,6 @@ static const struct ib_device_ops erdma_device_ops = { .map_mr_sg = erdma_map_mr_sg, .mmap = erdma_mmap, .mmap_free = erdma_mmap_free, - .modify_qp = erdma_modify_qp, .post_recv = erdma_post_recv, .post_send = erdma_post_send, .poll_cq = erdma_poll_cq, diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c index 4d1f9114cd97..13977f4e9463 100644 --- a/drivers/infiniband/hw/erdma/erdma_qp.c +++ b/drivers/infiniband/hw/erdma/erdma_qp.c @@ -186,6 +186,98 @@ int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, return ret; } +static int modify_qp_cmd_rocev2(struct erdma_qp *qp, + struct erdma_mod_qp_params_rocev2 *params, + enum erdma_qpa_mask_rocev2 attr_mask) +{ + struct erdma_cmdq_mod_qp_req_rocev2 req; + + memset(&req, 0, sizeof(req)); + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_MODIFY_QP); + + req.cfg0 = FIELD_PREP(ERDMA_CMD_MODIFY_QP_QPN_MASK, QP_ID(qp)); + + if (attr_mask & ERDMA_QPA_ROCEV2_STATE) + req.cfg0 |= FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, + params->state); + + if (attr_mask & ERDMA_QPA_ROCEV2_DST_QPN) + req.cfg1 = FIELD_PREP(ERDMA_CMD_MODIFY_QP_DQPN_MASK, + params->dst_qpn); + + if (attr_mask & ERDMA_QPA_ROCEV2_QKEY) + req.qkey = params->qkey; + + if (attr_mask & ERDMA_QPA_ROCEV2_AV) + erdma_set_av_cfg(&req.av_cfg, ¶ms->av); + + if (attr_mask & ERDMA_QPA_ROCEV2_SQ_PSN) + req.sq_psn = params->sq_psn; + + if (attr_mask & ERDMA_QPA_ROCEV2_RQ_PSN) + req.rq_psn = params->rq_psn; + + req.attr_mask = attr_mask; + + return erdma_post_cmd_wait(&qp->dev->cmdq, &req, sizeof(req), NULL, + NULL); +} + +static void erdma_reset_qp(struct erdma_qp *qp) +{ + qp->kern_qp.sq_pi = 0; + qp->kern_qp.sq_ci = 0; + qp->kern_qp.rq_pi = 0; + qp->kern_qp.rq_ci = 0; + memset(qp->kern_qp.swr_tbl, 0, qp->attrs.sq_size * sizeof(u64)); + memset(qp->kern_qp.rwr_tbl, 0, qp->attrs.rq_size * sizeof(u64)); + memset(qp->kern_qp.sq_buf, 0, qp->attrs.sq_size << SQEBB_SHIFT); + memset(qp->kern_qp.rq_buf, 0, qp->attrs.rq_size << RQE_SHIFT); + erdma_remove_cqes_of_qp(&qp->scq->ibcq, QP_ID(qp)); + if (qp->rcq != qp->scq) + 
erdma_remove_cqes_of_qp(&qp->rcq->ibcq, QP_ID(qp)); +} + +int erdma_modify_qp_state_rocev2(struct erdma_qp *qp, + struct erdma_mod_qp_params_rocev2 *params, + int attr_mask) +{ + struct erdma_dev *dev = to_edev(qp->ibqp.device); + int ret; + + ret = modify_qp_cmd_rocev2(qp, params, attr_mask); + if (ret) + return ret; + + if (attr_mask & ERDMA_QPA_ROCEV2_STATE) + qp->attrs.rocev2.state = params->state; + + if (attr_mask & ERDMA_QPA_ROCEV2_QKEY) + qp->attrs.rocev2.qkey = params->qkey; + + if (attr_mask & ERDMA_QPA_ROCEV2_DST_QPN) + qp->attrs.rocev2.dst_qpn = params->dst_qpn; + + if (attr_mask & ERDMA_QPA_ROCEV2_AV) + memcpy(&qp->attrs.rocev2.av, ¶ms->av, + sizeof(struct erdma_av)); + + if (rdma_is_kernel_res(&qp->ibqp.res) && + params->state == ERDMA_QPS_ROCEV2_RESET) + erdma_reset_qp(qp); + + if (rdma_is_kernel_res(&qp->ibqp.res) && + params->state == ERDMA_QPS_ROCEV2_ERROR) { + qp->flags |= ERDMA_QP_IN_FLUSHING; + mod_delayed_work(dev->reflush_wq, &qp->reflush_dwork, + usecs_to_jiffies(100)); + } + + return 0; +} + static void erdma_qp_safe_free(struct kref *ref) { struct erdma_qp *qp = container_of(ref, struct erdma_qp, ref); diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index 19483667c989..79693fb40aec 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -121,7 +121,7 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp) err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &resp0, &resp1); - if (!err) + if (!err && erdma_device_iwarp(dev)) qp->attrs.cookie = FIELD_GET(ERDMA_CMDQ_CREATE_QP_RESP_COOKIE_MASK, resp0); @@ -1017,7 +1017,12 @@ int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs, qp->attrs.max_send_sge = attrs->cap.max_send_sge; qp->attrs.max_recv_sge = attrs->cap.max_recv_sge; - qp->attrs.state = ERDMA_QP_STATE_IDLE; + + if (erdma_device_iwarp(qp->dev)) + qp->attrs.state = ERDMA_QP_STATE_IDLE; + else + qp->attrs.rocev2.state = ERDMA_QPS_ROCEV2_RESET; + INIT_DELAYED_WORK(&qp->reflush_dwork, erdma_flush_worker); ret = create_qp_cmd(uctx, qp); @@ -1291,13 +1296,20 @@ int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) struct erdma_dev *dev = to_edev(ibqp->device); struct erdma_ucontext *ctx = rdma_udata_to_drv_context( udata, struct erdma_ucontext, ibucontext); + struct erdma_mod_qp_params_rocev2 rocev2_params; struct erdma_qp_attrs qp_attrs; int err; struct erdma_cmdq_destroy_qp_req req; down_write(&qp->state_lock); - qp_attrs.state = ERDMA_QP_STATE_ERROR; - erdma_modify_qp_internal(qp, &qp_attrs, ERDMA_QP_ATTR_STATE); + if (erdma_device_iwarp(dev)) { + qp_attrs.state = ERDMA_QP_STATE_ERROR; + erdma_modify_qp_internal(qp, &qp_attrs, ERDMA_QP_ATTR_STATE); + } else { + rocev2_params.state = ERDMA_QPS_ROCEV2_ERROR; + erdma_modify_qp_state_rocev2(qp, &rocev2_params, + ERDMA_QPA_ROCEV2_STATE); + } up_write(&qp->state_lock); cancel_delayed_work_sync(&qp->reflush_dwork); @@ -1528,6 +1540,29 @@ void erdma_dealloc_ucontext(struct ib_ucontext *ibctx) atomic_dec(&dev->num_ctx); } +static void erdma_attr_to_av(const struct rdma_ah_attr *ah_attr, + struct erdma_av *av, u16 sport) +{ + const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); + + av->port = rdma_ah_get_port_num(ah_attr); + av->sgid_index = grh->sgid_index; + av->hop_limit = grh->hop_limit; + av->traffic_class = grh->traffic_class; + av->sl = rdma_ah_get_sl(ah_attr); + + av->flow_label = grh->flow_label; + av->udp_sport = sport; + + ether_addr_copy(av->dmac, 
ah_attr->roce.dmac); + memcpy(av->dgid, grh->dgid.raw, ERDMA_ROCEV2_GID_SIZE); + + if (ipv6_addr_v4mapped((struct in6_addr *)&grh->dgid)) + av->ntype = ERDMA_NETWORK_TYPE_IPV4; + else + av->ntype = ERDMA_NETWORK_TYPE_IPV6; +} + static int ib_qp_state_to_erdma_qp_state[IB_QPS_ERR + 1] = { [IB_QPS_RESET] = ERDMA_QP_STATE_IDLE, [IB_QPS_INIT] = ERDMA_QP_STATE_IDLE, @@ -1538,6 +1573,140 @@ static int ib_qp_state_to_erdma_qp_state[IB_QPS_ERR + 1] = { [IB_QPS_ERR] = ERDMA_QP_STATE_ERROR }; +static int ib_qps_to_erdma_qps_rocev2[IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = ERDMA_QPS_ROCEV2_RESET, + [IB_QPS_INIT] = ERDMA_QPS_ROCEV2_INIT, + [IB_QPS_RTR] = ERDMA_QPS_ROCEV2_RTR, + [IB_QPS_RTS] = ERDMA_QPS_ROCEV2_RTS, + [IB_QPS_SQD] = ERDMA_QPS_ROCEV2_SQD, + [IB_QPS_SQE] = ERDMA_QPS_ROCEV2_SQE, + [IB_QPS_ERR] = ERDMA_QPS_ROCEV2_ERROR, +}; + +static int erdma_qps_to_ib_qps_rocev2[ERDMA_QPS_ROCEV2_COUNT] = { + [ERDMA_QPS_ROCEV2_RESET] = IB_QPS_RESET, + [ERDMA_QPS_ROCEV2_INIT] = IB_QPS_INIT, + [ERDMA_QPS_ROCEV2_RTR] = IB_QPS_RTR, + [ERDMA_QPS_ROCEV2_RTS] = IB_QPS_RTS, + [ERDMA_QPS_ROCEV2_SQD] = IB_QPS_SQD, + [ERDMA_QPS_ROCEV2_SQE] = IB_QPS_SQE, + [ERDMA_QPS_ROCEV2_ERROR] = IB_QPS_ERR, +}; + +static int erdma_check_qp_attr_rocev2(struct erdma_qp *qp, + struct ib_qp_attr *attr, int attr_mask) +{ + enum ib_qp_state cur_state, nxt_state; + struct erdma_dev *dev = qp->dev; + int ret = -EINVAL; + + if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS) { + ret = -EOPNOTSUPP; + goto out; + } + + if ((attr_mask & IB_QP_PKEY_INDEX) && + attr->pkey_index >= ERDMA_MAX_PKEYS) + goto out; + + if ((attr_mask & IB_QP_PORT) && + !rdma_is_port_valid(&dev->ibdev, attr->port_num)) + goto out; + + cur_state = (attr_mask & IB_QP_CUR_STATE) ? + attr->cur_qp_state : + erdma_qps_to_ib_qps_rocev2[qp->attrs.rocev2.state]; + + nxt_state = (attr_mask & IB_QP_STATE) ? 
attr->qp_state : cur_state; + + if (!ib_modify_qp_is_ok(cur_state, nxt_state, qp->ibqp.qp_type, + attr_mask)) + goto out; + + if ((attr_mask & IB_QP_AV) && + erdma_check_gid_attr(rdma_ah_read_grh(&attr->ah_attr)->sgid_attr)) + goto out; + + return 0; + +out: + return ret; +} + +static void erdma_init_mod_qp_params_rocev2( + struct erdma_qp *qp, struct erdma_mod_qp_params_rocev2 *params, + int *erdma_attr_mask, struct ib_qp_attr *attr, int ib_attr_mask) +{ + enum erdma_qpa_mask_rocev2 to_modify_attrs = 0; + enum erdma_qps_rocev2 cur_state, nxt_state; + u16 udp_sport; + + if (ib_attr_mask & IB_QP_CUR_STATE) + cur_state = ib_qps_to_erdma_qps_rocev2[attr->cur_qp_state]; + else + cur_state = qp->attrs.rocev2.state; + + if (ib_attr_mask & IB_QP_STATE) + nxt_state = ib_qps_to_erdma_qps_rocev2[attr->qp_state]; + else + nxt_state = cur_state; + + to_modify_attrs |= ERDMA_QPA_ROCEV2_STATE; + params->state = nxt_state; + + if (ib_attr_mask & IB_QP_QKEY) { + to_modify_attrs |= ERDMA_QPA_ROCEV2_QKEY; + params->qkey = attr->qkey; + } + + if (ib_attr_mask & IB_QP_SQ_PSN) { + to_modify_attrs |= ERDMA_QPA_ROCEV2_SQ_PSN; + params->sq_psn = attr->sq_psn; + } + + if (ib_attr_mask & IB_QP_RQ_PSN) { + to_modify_attrs |= ERDMA_QPA_ROCEV2_RQ_PSN; + params->rq_psn = attr->rq_psn; + } + + if (ib_attr_mask & IB_QP_DEST_QPN) { + to_modify_attrs |= ERDMA_QPA_ROCEV2_DST_QPN; + params->dst_qpn = attr->dest_qp_num; + } + + if (ib_attr_mask & IB_QP_AV) { + to_modify_attrs |= ERDMA_QPA_ROCEV2_AV; + udp_sport = rdma_get_udp_sport(attr->ah_attr.grh.flow_label, + QP_ID(qp), params->dst_qpn); + erdma_attr_to_av(&attr->ah_attr, ¶ms->av, udp_sport); + } + + *erdma_attr_mask = to_modify_attrs; +} + +int erdma_modify_qp_rocev2(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct erdma_mod_qp_params_rocev2 params; + struct erdma_qp *qp = to_eqp(ibqp); + int ret = 0, erdma_attr_mask = 0; + + down_write(&qp->state_lock); + + ret = erdma_check_qp_attr_rocev2(qp, attr, attr_mask); + if (ret) + goto out; + + erdma_init_mod_qp_params_rocev2(qp, ¶ms, &erdma_attr_mask, attr, + attr_mask); + + ret = erdma_modify_qp_state_rocev2(qp, ¶ms, erdma_attr_mask); + +out: + up_write(&qp->state_lock); + return ret; +} + int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { @@ -1920,6 +2089,22 @@ int erdma_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey) return 0; } +void erdma_set_av_cfg(struct erdma_av_cfg *av_cfg, struct erdma_av *av) +{ + av_cfg->cfg0 = FIELD_PREP(ERDMA_CMD_CREATE_AV_FL_MASK, av->flow_label) | + FIELD_PREP(ERDMA_CMD_CREATE_AV_NTYPE_MASK, av->ntype); + + av_cfg->traffic_class = av->traffic_class; + av_cfg->hop_limit = av->hop_limit; + av_cfg->sl = av->sl; + + av_cfg->udp_sport = av->udp_sport; + av_cfg->sgid_index = av->sgid_index; + + ether_addr_copy(av_cfg->dmac, av->dmac); + memcpy(av_cfg->dgid, av->dgid, ERDMA_ROCEV2_GID_SIZE); +} + int erdma_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, struct ib_udata *udata) { @@ -1948,38 +2133,14 @@ int erdma_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, udp_sport = IB_ROCE_UDP_ENCAP_VALID_PORT_MIN + (ah->ahn & 0x3FFF); - ah->av.port = rdma_ah_get_port_num(init_attr->ah_attr); - ah->av.sgid_index = grh->sgid_index; - ah->av.hop_limit = grh->hop_limit; - ah->av.traffic_class = grh->traffic_class; - ah->av.sl = rdma_ah_get_sl(init_attr->ah_attr); - ah->av.flow_label = grh->flow_label; - ah->av.udp_sport = udp_sport; - - 
ether_addr_copy(ah->av.dmac, init_attr->ah_attr->roce.dmac); - memcpy(ah->av.dgid, grh->dgid.raw, ERDMA_ROCEV2_GID_SIZE); - - if (ipv6_addr_v4mapped((struct in6_addr *)&grh->dgid)) - ah->av.ntype = ERDMA_NETWORK_TYPE_IPV4; - else - ah->av.ntype = ERDMA_NETWORK_TYPE_IPV6; + erdma_attr_to_av(init_attr->ah_attr, &ah->av, udp_sport); erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, CMDQ_OPCODE_CREATE_AH); req.pdn = pd->pdn; req.ahn = ah->ahn; - - req.av_cfg.cfg0 = - FIELD_PREP(ERDMA_CMD_CREATE_AV_FL_MASK, ah->av.flow_label) | - FIELD_PREP(ERDMA_CMD_CREATE_AV_NTYPE_MASK, ah->av.ntype); - req.av_cfg.traffic_class = ah->av.traffic_class; - req.av_cfg.hop_limit = ah->av.hop_limit; - req.av_cfg.sl = ah->av.sl; - req.av_cfg.udp_sport = ah->av.udp_sport; - req.av_cfg.sgid_index = ah->av.sgid_index; - ether_addr_copy(req.av_cfg.dmac, ah->av.dmac); - memcpy(req.av_cfg.dgid, ah->av.dgid, ERDMA_ROCEV2_GID_SIZE); + erdma_set_av_cfg(&req.av_cfg, &ah->av); ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); if (ret) { diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index 78a6c35cf1a5..fad3e475d8f1 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -216,10 +216,46 @@ enum erdma_qp_attr_mask { ERDMA_QP_ATTR_MPA = (1 << 7) }; +enum erdma_qps_rocev2 { + ERDMA_QPS_ROCEV2_RESET = 0, + ERDMA_QPS_ROCEV2_INIT = 1, + ERDMA_QPS_ROCEV2_RTR = 2, + ERDMA_QPS_ROCEV2_RTS = 3, + ERDMA_QPS_ROCEV2_SQD = 4, + ERDMA_QPS_ROCEV2_SQE = 5, + ERDMA_QPS_ROCEV2_ERROR = 6, + ERDMA_QPS_ROCEV2_COUNT = 7, +}; + +enum erdma_qpa_mask_rocev2 { + ERDMA_QPA_ROCEV2_STATE = (1 << 0), + ERDMA_QPA_ROCEV2_QKEY = (1 << 1), + ERDMA_QPA_ROCEV2_AV = (1 << 2), + ERDMA_QPA_ROCEV2_SQ_PSN = (1 << 3), + ERDMA_QPA_ROCEV2_RQ_PSN = (1 << 4), + ERDMA_QPA_ROCEV2_DST_QPN = (1 << 5), +}; + enum erdma_qp_flags { ERDMA_QP_IN_FLUSHING = (1 << 0), }; +struct erdma_mod_qp_params_rocev2 { + enum erdma_qps_rocev2 state; + u32 qkey; + u32 sq_psn; + u32 rq_psn; + u32 dst_qpn; + struct erdma_av av; +}; + +struct erdma_qp_attrs_rocev2 { + enum erdma_qps_rocev2 state; + u32 qkey; + u32 dst_qpn; + struct erdma_av av; +}; + struct erdma_qp_attrs { enum erdma_qp_state state; enum erdma_cc_alg cc; /* Congestion control algorithm */ @@ -234,6 +270,7 @@ struct erdma_qp_attrs { #define ERDMA_QP_PASSIVE 1 u8 qp_type; u8 pd_len; + struct erdma_qp_attrs_rocev2 rocev2; }; struct erdma_qp { @@ -307,6 +344,9 @@ void erdma_qp_get(struct erdma_qp *qp); void erdma_qp_put(struct erdma_qp *qp); int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, enum erdma_qp_attr_mask mask); +int erdma_modify_qp_state_rocev2(struct erdma_qp *qp, + struct erdma_mod_qp_params_rocev2 *params, + int attr_mask); void erdma_qp_llp_close(struct erdma_qp *qp); void erdma_qp_cm_drop(struct erdma_qp *qp); @@ -386,6 +426,8 @@ int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, struct ib_qp_init_attr *init_attr); int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, struct ib_udata *data); +int erdma_modify_qp_rocev2(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int mask, struct ib_udata *udata); int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); int erdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); void erdma_disassociate_ucontext(struct ib_ucontext *ibcontext); @@ -404,6 +446,7 @@ int erdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *send_wr, int erdma_post_recv(struct ib_qp *ibqp, 
const struct ib_recv_wr *recv_wr, const struct ib_recv_wr **bad_recv_wr); int erdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +void erdma_remove_cqes_of_qp(struct ib_cq *ibcq, u32 qpn); struct ib_mr *erdma_ib_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, u32 max_num_sg); int erdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, @@ -419,6 +462,7 @@ enum rdma_link_layer erdma_get_link_layer(struct ib_device *ibdev, int erdma_add_gid(const struct ib_gid_attr *attr, void **context); int erdma_del_gid(const struct ib_gid_attr *attr, void **context); int erdma_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey); +void erdma_set_av_cfg(struct erdma_av_cfg *av_cfg, struct erdma_av *av); int erdma_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, struct ib_udata *udata); int erdma_destroy_ah(struct ib_ah *ibah, u32 flags); From de5b8008aa4da7e1e750cc780c086d45242ba51e Mon Sep 17 00:00:00 2001 From: Boshi Yu Date: Wed, 11 Dec 2024 10:09:06 +0800 Subject: [PATCH 11/63] RDMA/erdma: Refactor the code of the modify_qp interface The procedure for modifying QP is similar for both the iWARP and RoCEv2 protocols. Therefore, we unify the code and provide the erdma_modify_qp() interface for both protocols. Signed-off-by: Boshi Yu Link: https://patch.msgid.link/20241211020930.68833-7-boshiyu@linux.alibaba.com Reviewed-by: Cheng Xu Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma_cm.c | 71 ++++---- drivers/infiniband/hw/erdma/erdma_hw.h | 1 + drivers/infiniband/hw/erdma/erdma_main.c | 3 +- drivers/infiniband/hw/erdma/erdma_qp.c | 136 ++++++++------ drivers/infiniband/hw/erdma/erdma_verbs.c | 212 +++++++++++----------- drivers/infiniband/hw/erdma/erdma_verbs.h | 75 +++++--- 6 files changed, 277 insertions(+), 221 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma_cm.c b/drivers/infiniband/hw/erdma/erdma_cm.c index 771059a8eb7d..1b23c698ec25 100644 --- a/drivers/infiniband/hw/erdma/erdma_cm.c +++ b/drivers/infiniband/hw/erdma/erdma_cm.c @@ -567,7 +567,8 @@ reject_conn: static int erdma_proc_mpareply(struct erdma_cep *cep) { - struct erdma_qp_attrs qp_attrs; + enum erdma_qpa_mask_iwarp to_modify_attrs = 0; + struct erdma_mod_qp_params_iwarp params; struct erdma_qp *qp = cep->qp; struct mpa_rr *rep; int ret; @@ -597,26 +598,29 @@ static int erdma_proc_mpareply(struct erdma_cep *cep) return -EINVAL; } - memset(&qp_attrs, 0, sizeof(qp_attrs)); - qp_attrs.irq_size = cep->ird; - qp_attrs.orq_size = cep->ord; - qp_attrs.state = ERDMA_QP_STATE_RTS; + memset(¶ms, 0, sizeof(params)); + params.state = ERDMA_QPS_IWARP_RTS; + params.irq_size = cep->ird; + params.orq_size = cep->ord; down_write(&qp->state_lock); - if (qp->attrs.state > ERDMA_QP_STATE_RTR) { + if (qp->attrs.iwarp.state > ERDMA_QPS_IWARP_RTR) { ret = -EINVAL; up_write(&qp->state_lock); goto out_err; } - qp->attrs.qp_type = ERDMA_QP_ACTIVE; - if (__mpa_ext_cc(cep->mpa.ext_data.bits) != qp->attrs.cc) - qp->attrs.cc = COMPROMISE_CC; + to_modify_attrs = ERDMA_QPA_IWARP_STATE | ERDMA_QPA_IWARP_LLP_HANDLE | + ERDMA_QPA_IWARP_MPA | ERDMA_QPA_IWARP_IRD | + ERDMA_QPA_IWARP_ORD; - ret = erdma_modify_qp_internal(qp, &qp_attrs, - ERDMA_QP_ATTR_STATE | - ERDMA_QP_ATTR_LLP_HANDLE | - ERDMA_QP_ATTR_MPA); + params.qp_type = ERDMA_QP_ACTIVE; + if (__mpa_ext_cc(cep->mpa.ext_data.bits) != qp->attrs.cc) { + to_modify_attrs |= ERDMA_QPA_IWARP_CC; + params.cc = COMPROMISE_CC; + } + + ret = erdma_modify_qp_state_iwarp(qp, ¶ms, to_modify_attrs); up_write(&qp->state_lock); @@ 
-722,7 +726,7 @@ static int erdma_newconn_connected(struct erdma_cep *cep) __mpa_rr_set_revision(&cep->mpa.hdr.params.bits, MPA_REVISION_EXT_1); memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, MPA_KEY_SIZE); - cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.cookie); + cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.iwarp.cookie); __mpa_ext_set_cc(&cep->mpa.ext_data.bits, cep->qp->attrs.cc); ret = erdma_send_mpareqrep(cep, cep->private_data, cep->pd_len); @@ -1126,10 +1130,11 @@ error_put_qp: int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) { - struct erdma_dev *dev = to_edev(id->device); struct erdma_cep *cep = (struct erdma_cep *)id->provider_data; + struct erdma_mod_qp_params_iwarp mod_qp_params; + enum erdma_qpa_mask_iwarp to_modify_attrs = 0; + struct erdma_dev *dev = to_edev(id->device); struct erdma_qp *qp; - struct erdma_qp_attrs qp_attrs; int ret; erdma_cep_set_inuse(cep); @@ -1156,7 +1161,7 @@ int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) erdma_qp_get(qp); down_write(&qp->state_lock); - if (qp->attrs.state > ERDMA_QP_STATE_RTR) { + if (qp->attrs.iwarp.state > ERDMA_QPS_IWARP_RTR) { ret = -EINVAL; up_write(&qp->state_lock); goto error; @@ -1181,11 +1186,11 @@ int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) cep->cm_id = id; id->add_ref(id); - memset(&qp_attrs, 0, sizeof(qp_attrs)); - qp_attrs.orq_size = params->ord; - qp_attrs.irq_size = params->ird; + memset(&mod_qp_params, 0, sizeof(mod_qp_params)); - qp_attrs.state = ERDMA_QP_STATE_RTS; + mod_qp_params.irq_size = params->ird; + mod_qp_params.orq_size = params->ord; + mod_qp_params.state = ERDMA_QPS_IWARP_RTS; /* Associate QP with CEP */ erdma_cep_get(cep); @@ -1194,19 +1199,21 @@ int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) cep->state = ERDMA_EPSTATE_RDMA_MODE; - qp->attrs.qp_type = ERDMA_QP_PASSIVE; - qp->attrs.pd_len = params->private_data_len; + mod_qp_params.qp_type = ERDMA_QP_PASSIVE; + mod_qp_params.pd_len = params->private_data_len; - if (qp->attrs.cc != __mpa_ext_cc(cep->mpa.ext_data.bits)) - qp->attrs.cc = COMPROMISE_CC; + to_modify_attrs = ERDMA_QPA_IWARP_STATE | ERDMA_QPA_IWARP_ORD | + ERDMA_QPA_IWARP_LLP_HANDLE | ERDMA_QPA_IWARP_IRD | + ERDMA_QPA_IWARP_MPA; + + if (qp->attrs.cc != __mpa_ext_cc(cep->mpa.ext_data.bits)) { + to_modify_attrs |= ERDMA_QPA_IWARP_CC; + mod_qp_params.cc = COMPROMISE_CC; + } /* move to rts */ - ret = erdma_modify_qp_internal(qp, &qp_attrs, - ERDMA_QP_ATTR_STATE | - ERDMA_QP_ATTR_ORD | - ERDMA_QP_ATTR_LLP_HANDLE | - ERDMA_QP_ATTR_IRD | - ERDMA_QP_ATTR_MPA); + ret = erdma_modify_qp_state_iwarp(qp, &mod_qp_params, to_modify_attrs); + up_write(&qp->state_lock); if (ret) @@ -1214,7 +1221,7 @@ int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) cep->mpa.ext_data.bits = 0; __mpa_ext_set_cc(&cep->mpa.ext_data.bits, qp->attrs.cc); - cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.cookie); + cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.iwarp.cookie); ret = erdma_send_mpareqrep(cep, params->private_data, params->private_data_len); diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index b5c1aca71144..3b0f7fc4ff31 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -31,6 +31,7 @@ enum erdma_proto_type { ERDMA_PROTO_IWARP = 0, ERDMA_PROTO_ROCEV2 = 1, + ERDMA_PROTO_COUNT = 2, }; /* PCIe Bar0 Registers. 
*/ diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index 2fca163b1744..51cc8b17b9e9 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -486,7 +486,6 @@ static const struct ib_device_ops erdma_device_ops_rocev2 = { .query_pkey = erdma_query_pkey, .create_ah = erdma_create_ah, .destroy_ah = erdma_destroy_ah, - .modify_qp = erdma_modify_qp_rocev2, }; static const struct ib_device_ops erdma_device_ops_iwarp = { @@ -498,7 +497,6 @@ static const struct ib_device_ops erdma_device_ops_iwarp = { .iw_get_qp = erdma_get_ibqp, .iw_reject = erdma_reject, .iw_rem_ref = erdma_qp_put_ref, - .modify_qp = erdma_modify_qp, }; static const struct ib_device_ops erdma_device_ops = { @@ -533,6 +531,7 @@ static const struct ib_device_ops erdma_device_ops = { .query_qp = erdma_query_qp, .req_notify_cq = erdma_req_notify_cq, .reg_user_mr = erdma_reg_user_mr, + .modify_qp = erdma_modify_qp, INIT_RDMA_OBJ_SIZE(ib_cq, erdma_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, erdma_pd, ibpd), diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c index 13977f4e9463..03d93f026fca 100644 --- a/drivers/infiniband/hw/erdma/erdma_qp.c +++ b/drivers/infiniband/hw/erdma/erdma_qp.c @@ -11,20 +11,20 @@ void erdma_qp_llp_close(struct erdma_qp *qp) { - struct erdma_qp_attrs qp_attrs; + struct erdma_mod_qp_params_iwarp params; down_write(&qp->state_lock); - switch (qp->attrs.state) { - case ERDMA_QP_STATE_RTS: - case ERDMA_QP_STATE_RTR: - case ERDMA_QP_STATE_IDLE: - case ERDMA_QP_STATE_TERMINATE: - qp_attrs.state = ERDMA_QP_STATE_CLOSING; - erdma_modify_qp_internal(qp, &qp_attrs, ERDMA_QP_ATTR_STATE); + switch (qp->attrs.iwarp.state) { + case ERDMA_QPS_IWARP_RTS: + case ERDMA_QPS_IWARP_RTR: + case ERDMA_QPS_IWARP_IDLE: + case ERDMA_QPS_IWARP_TERMINATE: + params.state = ERDMA_QPS_IWARP_CLOSING; + erdma_modify_qp_state_iwarp(qp, ¶ms, ERDMA_QPA_IWARP_STATE); break; - case ERDMA_QP_STATE_CLOSING: - qp->attrs.state = ERDMA_QP_STATE_IDLE; + case ERDMA_QPS_IWARP_CLOSING: + qp->attrs.iwarp.state = ERDMA_QPS_IWARP_IDLE; break; default: break; @@ -48,9 +48,10 @@ struct ib_qp *erdma_get_ibqp(struct ib_device *ibdev, int id) return NULL; } -static int erdma_modify_qp_state_to_rts(struct erdma_qp *qp, - struct erdma_qp_attrs *attrs, - enum erdma_qp_attr_mask mask) +static int +erdma_modify_qp_state_to_rts(struct erdma_qp *qp, + struct erdma_mod_qp_params_iwarp *params, + enum erdma_qpa_mask_iwarp mask) { int ret; struct erdma_dev *dev = qp->dev; @@ -59,12 +60,15 @@ static int erdma_modify_qp_state_to_rts(struct erdma_qp *qp, struct erdma_cep *cep = qp->cep; struct sockaddr_storage local_addr, remote_addr; - if (!(mask & ERDMA_QP_ATTR_LLP_HANDLE)) + if (!(mask & ERDMA_QPA_IWARP_LLP_HANDLE)) return -EINVAL; - if (!(mask & ERDMA_QP_ATTR_MPA)) + if (!(mask & ERDMA_QPA_IWARP_MPA)) return -EINVAL; + if (!(mask & ERDMA_QPA_IWARP_CC)) + params->cc = qp->attrs.cc; + ret = getname_local(cep->sock, &local_addr); if (ret < 0) return ret; @@ -73,18 +77,16 @@ static int erdma_modify_qp_state_to_rts(struct erdma_qp *qp, if (ret < 0) return ret; - qp->attrs.state = ERDMA_QP_STATE_RTS; - tp = tcp_sk(qp->cep->sock->sk); erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, CMDQ_OPCODE_MODIFY_QP); - req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, qp->attrs.state) | - FIELD_PREP(ERDMA_CMD_MODIFY_QP_CC_MASK, qp->attrs.cc) | + req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, params->state) | + FIELD_PREP(ERDMA_CMD_MODIFY_QP_CC_MASK, 
params->cc) | FIELD_PREP(ERDMA_CMD_MODIFY_QP_QPN_MASK, QP_ID(qp)); - req.cookie = be32_to_cpu(qp->cep->mpa.ext_data.cookie); + req.cookie = be32_to_cpu(cep->mpa.ext_data.cookie); req.dip = to_sockaddr_in(remote_addr).sin_addr.s_addr; req.sip = to_sockaddr_in(local_addr).sin_addr.s_addr; req.dport = to_sockaddr_in(remote_addr).sin_port; @@ -92,33 +94,55 @@ static int erdma_modify_qp_state_to_rts(struct erdma_qp *qp, req.send_nxt = tp->snd_nxt; /* rsvd tcp seq for mpa-rsp in server. */ - if (qp->attrs.qp_type == ERDMA_QP_PASSIVE) - req.send_nxt += MPA_DEFAULT_HDR_LEN + qp->attrs.pd_len; + if (params->qp_type == ERDMA_QP_PASSIVE) + req.send_nxt += MPA_DEFAULT_HDR_LEN + params->pd_len; req.recv_nxt = tp->rcv_nxt; - return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + if (ret) + return ret; + + if (mask & ERDMA_QPA_IWARP_IRD) + qp->attrs.irq_size = params->irq_size; + + if (mask & ERDMA_QPA_IWARP_ORD) + qp->attrs.orq_size = params->orq_size; + + if (mask & ERDMA_QPA_IWARP_CC) + qp->attrs.cc = params->cc; + + qp->attrs.iwarp.state = ERDMA_QPS_IWARP_RTS; + + return 0; } -static int erdma_modify_qp_state_to_stop(struct erdma_qp *qp, - struct erdma_qp_attrs *attrs, - enum erdma_qp_attr_mask mask) +static int +erdma_modify_qp_state_to_stop(struct erdma_qp *qp, + struct erdma_mod_qp_params_iwarp *params, + enum erdma_qpa_mask_iwarp mask) { struct erdma_dev *dev = qp->dev; struct erdma_cmdq_modify_qp_req req; - - qp->attrs.state = attrs->state; + int ret; erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, CMDQ_OPCODE_MODIFY_QP); - req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, attrs->state) | + req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, params->state) | FIELD_PREP(ERDMA_CMD_MODIFY_QP_QPN_MASK, QP_ID(qp)); - return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + if (ret) + return ret; + + qp->attrs.iwarp.state = params->state; + + return 0; } -int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, - enum erdma_qp_attr_mask mask) +int erdma_modify_qp_state_iwarp(struct erdma_qp *qp, + struct erdma_mod_qp_params_iwarp *params, + int mask) { bool need_reflush = false; int drop_conn, ret = 0; @@ -126,31 +150,31 @@ int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, if (!mask) return 0; - if (!(mask & ERDMA_QP_ATTR_STATE)) + if (!(mask & ERDMA_QPA_IWARP_STATE)) return 0; - switch (qp->attrs.state) { - case ERDMA_QP_STATE_IDLE: - case ERDMA_QP_STATE_RTR: - if (attrs->state == ERDMA_QP_STATE_RTS) { - ret = erdma_modify_qp_state_to_rts(qp, attrs, mask); - } else if (attrs->state == ERDMA_QP_STATE_ERROR) { - qp->attrs.state = ERDMA_QP_STATE_ERROR; + switch (qp->attrs.iwarp.state) { + case ERDMA_QPS_IWARP_IDLE: + case ERDMA_QPS_IWARP_RTR: + if (params->state == ERDMA_QPS_IWARP_RTS) { + ret = erdma_modify_qp_state_to_rts(qp, params, mask); + } else if (params->state == ERDMA_QPS_IWARP_ERROR) { + qp->attrs.iwarp.state = ERDMA_QPS_IWARP_ERROR; need_reflush = true; if (qp->cep) { erdma_cep_put(qp->cep); qp->cep = NULL; } - ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); + ret = erdma_modify_qp_state_to_stop(qp, params, mask); } break; - case ERDMA_QP_STATE_RTS: + case ERDMA_QPS_IWARP_RTS: drop_conn = 0; - if (attrs->state == ERDMA_QP_STATE_CLOSING || - attrs->state == ERDMA_QP_STATE_TERMINATE || - attrs->state == ERDMA_QP_STATE_ERROR) { - ret = 
erdma_modify_qp_state_to_stop(qp, attrs, mask); + if (params->state == ERDMA_QPS_IWARP_CLOSING || + params->state == ERDMA_QPS_IWARP_TERMINATE || + params->state == ERDMA_QPS_IWARP_ERROR) { + ret = erdma_modify_qp_state_to_stop(qp, params, mask); drop_conn = 1; need_reflush = true; } @@ -159,17 +183,17 @@ int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, erdma_qp_cm_drop(qp); break; - case ERDMA_QP_STATE_TERMINATE: - if (attrs->state == ERDMA_QP_STATE_ERROR) - qp->attrs.state = ERDMA_QP_STATE_ERROR; + case ERDMA_QPS_IWARP_TERMINATE: + if (params->state == ERDMA_QPS_IWARP_ERROR) + qp->attrs.iwarp.state = ERDMA_QPS_IWARP_ERROR; break; - case ERDMA_QP_STATE_CLOSING: - if (attrs->state == ERDMA_QP_STATE_IDLE) { - qp->attrs.state = ERDMA_QP_STATE_IDLE; - } else if (attrs->state == ERDMA_QP_STATE_ERROR) { - ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); - qp->attrs.state = ERDMA_QP_STATE_ERROR; - } else if (attrs->state != ERDMA_QP_STATE_CLOSING) { + case ERDMA_QPS_IWARP_CLOSING: + if (params->state == ERDMA_QPS_IWARP_IDLE) { + qp->attrs.iwarp.state = ERDMA_QPS_IWARP_IDLE; + } else if (params->state == ERDMA_QPS_IWARP_ERROR) { + ret = erdma_modify_qp_state_to_stop(qp, params, mask); + qp->attrs.iwarp.state = ERDMA_QPS_IWARP_ERROR; + } else if (params->state != ERDMA_QPS_IWARP_CLOSING) { return -ECONNABORTED; } break; diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index 79693fb40aec..0543ff972247 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -122,7 +122,7 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp) err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &resp0, &resp1); if (!err && erdma_device_iwarp(dev)) - qp->attrs.cookie = + qp->attrs.iwarp.cookie = FIELD_GET(ERDMA_CMDQ_CREATE_QP_RESP_COOKIE_MASK, resp0); return err; @@ -1019,7 +1019,7 @@ int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs, qp->attrs.max_recv_sge = attrs->cap.max_recv_sge; if (erdma_device_iwarp(qp->dev)) - qp->attrs.state = ERDMA_QP_STATE_IDLE; + qp->attrs.iwarp.state = ERDMA_QPS_IWARP_IDLE; else qp->attrs.rocev2.state = ERDMA_QPS_ROCEV2_RESET; @@ -1296,18 +1296,18 @@ int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) struct erdma_dev *dev = to_edev(ibqp->device); struct erdma_ucontext *ctx = rdma_udata_to_drv_context( udata, struct erdma_ucontext, ibucontext); - struct erdma_mod_qp_params_rocev2 rocev2_params; - struct erdma_qp_attrs qp_attrs; - int err; struct erdma_cmdq_destroy_qp_req req; + union erdma_mod_qp_params params; + int err; down_write(&qp->state_lock); if (erdma_device_iwarp(dev)) { - qp_attrs.state = ERDMA_QP_STATE_ERROR; - erdma_modify_qp_internal(qp, &qp_attrs, ERDMA_QP_ATTR_STATE); + params.iwarp.state = ERDMA_QPS_IWARP_ERROR; + erdma_modify_qp_state_iwarp(qp, ¶ms.iwarp, + ERDMA_QPA_IWARP_STATE); } else { - rocev2_params.state = ERDMA_QPS_ROCEV2_ERROR; - erdma_modify_qp_state_rocev2(qp, &rocev2_params, + params.rocev2.state = ERDMA_QPS_ROCEV2_ERROR; + erdma_modify_qp_state_rocev2(qp, ¶ms.rocev2, ERDMA_QPA_ROCEV2_STATE); } up_write(&qp->state_lock); @@ -1563,38 +1563,69 @@ static void erdma_attr_to_av(const struct rdma_ah_attr *ah_attr, av->ntype = ERDMA_NETWORK_TYPE_IPV6; } -static int ib_qp_state_to_erdma_qp_state[IB_QPS_ERR + 1] = { - [IB_QPS_RESET] = ERDMA_QP_STATE_IDLE, - [IB_QPS_INIT] = ERDMA_QP_STATE_IDLE, - [IB_QPS_RTR] = ERDMA_QP_STATE_RTR, - [IB_QPS_RTS] = ERDMA_QP_STATE_RTS, - 
[IB_QPS_SQD] = ERDMA_QP_STATE_CLOSING, - [IB_QPS_SQE] = ERDMA_QP_STATE_TERMINATE, - [IB_QPS_ERR] = ERDMA_QP_STATE_ERROR +static int ib_qps_to_erdma_qps[ERDMA_PROTO_COUNT][IB_QPS_ERR + 1] = { + [ERDMA_PROTO_IWARP] = { + [IB_QPS_RESET] = ERDMA_QPS_IWARP_IDLE, + [IB_QPS_INIT] = ERDMA_QPS_IWARP_IDLE, + [IB_QPS_RTR] = ERDMA_QPS_IWARP_RTR, + [IB_QPS_RTS] = ERDMA_QPS_IWARP_RTS, + [IB_QPS_SQD] = ERDMA_QPS_IWARP_CLOSING, + [IB_QPS_SQE] = ERDMA_QPS_IWARP_TERMINATE, + [IB_QPS_ERR] = ERDMA_QPS_IWARP_ERROR, + }, + [ERDMA_PROTO_ROCEV2] = { + [IB_QPS_RESET] = ERDMA_QPS_ROCEV2_RESET, + [IB_QPS_INIT] = ERDMA_QPS_ROCEV2_INIT, + [IB_QPS_RTR] = ERDMA_QPS_ROCEV2_RTR, + [IB_QPS_RTS] = ERDMA_QPS_ROCEV2_RTS, + [IB_QPS_SQD] = ERDMA_QPS_ROCEV2_SQD, + [IB_QPS_SQE] = ERDMA_QPS_ROCEV2_SQE, + [IB_QPS_ERR] = ERDMA_QPS_ROCEV2_ERROR, + }, }; -static int ib_qps_to_erdma_qps_rocev2[IB_QPS_ERR + 1] = { - [IB_QPS_RESET] = ERDMA_QPS_ROCEV2_RESET, - [IB_QPS_INIT] = ERDMA_QPS_ROCEV2_INIT, - [IB_QPS_RTR] = ERDMA_QPS_ROCEV2_RTR, - [IB_QPS_RTS] = ERDMA_QPS_ROCEV2_RTS, - [IB_QPS_SQD] = ERDMA_QPS_ROCEV2_SQD, - [IB_QPS_SQE] = ERDMA_QPS_ROCEV2_SQE, - [IB_QPS_ERR] = ERDMA_QPS_ROCEV2_ERROR, +static int erdma_qps_to_ib_qps[ERDMA_PROTO_COUNT][ERDMA_QPS_ROCEV2_COUNT] = { + [ERDMA_PROTO_IWARP] = { + [ERDMA_QPS_IWARP_IDLE] = IB_QPS_INIT, + [ERDMA_QPS_IWARP_RTR] = IB_QPS_RTR, + [ERDMA_QPS_IWARP_RTS] = IB_QPS_RTS, + [ERDMA_QPS_IWARP_CLOSING] = IB_QPS_ERR, + [ERDMA_QPS_IWARP_TERMINATE] = IB_QPS_ERR, + [ERDMA_QPS_IWARP_ERROR] = IB_QPS_ERR, + }, + [ERDMA_PROTO_ROCEV2] = { + [ERDMA_QPS_ROCEV2_RESET] = IB_QPS_RESET, + [ERDMA_QPS_ROCEV2_INIT] = IB_QPS_INIT, + [ERDMA_QPS_ROCEV2_RTR] = IB_QPS_RTR, + [ERDMA_QPS_ROCEV2_RTS] = IB_QPS_RTS, + [ERDMA_QPS_ROCEV2_SQD] = IB_QPS_SQD, + [ERDMA_QPS_ROCEV2_SQE] = IB_QPS_SQE, + [ERDMA_QPS_ROCEV2_ERROR] = IB_QPS_ERR, + }, }; -static int erdma_qps_to_ib_qps_rocev2[ERDMA_QPS_ROCEV2_COUNT] = { - [ERDMA_QPS_ROCEV2_RESET] = IB_QPS_RESET, - [ERDMA_QPS_ROCEV2_INIT] = IB_QPS_INIT, - [ERDMA_QPS_ROCEV2_RTR] = IB_QPS_RTR, - [ERDMA_QPS_ROCEV2_RTS] = IB_QPS_RTS, - [ERDMA_QPS_ROCEV2_SQD] = IB_QPS_SQD, - [ERDMA_QPS_ROCEV2_SQE] = IB_QPS_SQE, - [ERDMA_QPS_ROCEV2_ERROR] = IB_QPS_ERR, -}; +static inline enum erdma_qps_iwarp ib_to_iwarp_qps(enum ib_qp_state state) +{ + return ib_qps_to_erdma_qps[ERDMA_PROTO_IWARP][state]; +} -static int erdma_check_qp_attr_rocev2(struct erdma_qp *qp, - struct ib_qp_attr *attr, int attr_mask) +static inline enum erdma_qps_rocev2 ib_to_rocev2_qps(enum ib_qp_state state) +{ + return ib_qps_to_erdma_qps[ERDMA_PROTO_ROCEV2][state]; +} + +static inline enum ib_qp_state iwarp_to_ib_qps(enum erdma_qps_iwarp state) +{ + return erdma_qps_to_ib_qps[ERDMA_PROTO_IWARP][state]; +} + +static inline enum ib_qp_state rocev2_to_ib_qps(enum erdma_qps_rocev2 state) +{ + return erdma_qps_to_ib_qps[ERDMA_PROTO_ROCEV2][state]; +} + +static int erdma_check_qp_attrs(struct erdma_qp *qp, struct ib_qp_attr *attr, + int attr_mask) { enum ib_qp_state cur_state, nxt_state; struct erdma_dev *dev = qp->dev; @@ -1605,27 +1636,31 @@ static int erdma_check_qp_attr_rocev2(struct erdma_qp *qp, goto out; } - if ((attr_mask & IB_QP_PKEY_INDEX) && - attr->pkey_index >= ERDMA_MAX_PKEYS) - goto out; - if ((attr_mask & IB_QP_PORT) && !rdma_is_port_valid(&dev->ibdev, attr->port_num)) goto out; - cur_state = (attr_mask & IB_QP_CUR_STATE) ? - attr->cur_qp_state : - erdma_qps_to_ib_qps_rocev2[qp->attrs.rocev2.state]; + if (erdma_device_rocev2(dev)) { + cur_state = (attr_mask & IB_QP_CUR_STATE) ? 
+ attr->cur_qp_state : + rocev2_to_ib_qps(qp->attrs.rocev2.state); - nxt_state = (attr_mask & IB_QP_STATE) ? attr->qp_state : cur_state; + nxt_state = (attr_mask & IB_QP_STATE) ? attr->qp_state : + cur_state; - if (!ib_modify_qp_is_ok(cur_state, nxt_state, qp->ibqp.qp_type, - attr_mask)) - goto out; + if (!ib_modify_qp_is_ok(cur_state, nxt_state, qp->ibqp.qp_type, + attr_mask)) + goto out; - if ((attr_mask & IB_QP_AV) && - erdma_check_gid_attr(rdma_ah_read_grh(&attr->ah_attr)->sgid_attr)) - goto out; + if ((attr_mask & IB_QP_AV) && + erdma_check_gid_attr( + rdma_ah_read_grh(&attr->ah_attr)->sgid_attr)) + goto out; + + if ((attr_mask & IB_QP_PKEY_INDEX) && + attr->pkey_index >= ERDMA_MAX_PKEYS) + goto out; + } return 0; @@ -1642,12 +1677,12 @@ static void erdma_init_mod_qp_params_rocev2( u16 udp_sport; if (ib_attr_mask & IB_QP_CUR_STATE) - cur_state = ib_qps_to_erdma_qps_rocev2[attr->cur_qp_state]; + cur_state = ib_to_rocev2_qps(attr->cur_qp_state); else cur_state = qp->attrs.rocev2.state; if (ib_attr_mask & IB_QP_STATE) - nxt_state = ib_qps_to_erdma_qps_rocev2[attr->qp_state]; + nxt_state = ib_to_rocev2_qps(attr->qp_state); else nxt_state = cur_state; @@ -1684,75 +1719,46 @@ static void erdma_init_mod_qp_params_rocev2( *erdma_attr_mask = to_modify_attrs; } -int erdma_modify_qp_rocev2(struct ib_qp *ibqp, struct ib_qp_attr *attr, - int attr_mask, struct ib_udata *udata) +int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, + struct ib_udata *udata) { - struct erdma_mod_qp_params_rocev2 params; struct erdma_qp *qp = to_eqp(ibqp); + union erdma_mod_qp_params params; int ret = 0, erdma_attr_mask = 0; down_write(&qp->state_lock); - ret = erdma_check_qp_attr_rocev2(qp, attr, attr_mask); + ret = erdma_check_qp_attrs(qp, attr, attr_mask); if (ret) goto out; - erdma_init_mod_qp_params_rocev2(qp, ¶ms, &erdma_attr_mask, attr, - attr_mask); + if (erdma_device_iwarp(qp->dev)) { + if (attr_mask & IB_QP_STATE) { + erdma_attr_mask |= ERDMA_QPA_IWARP_STATE; + params.iwarp.state = ib_to_iwarp_qps(attr->qp_state); + } - ret = erdma_modify_qp_state_rocev2(qp, ¶ms, erdma_attr_mask); + ret = erdma_modify_qp_state_iwarp(qp, ¶ms.iwarp, + erdma_attr_mask); + } else { + erdma_init_mod_qp_params_rocev2( + qp, ¶ms.rocev2, &erdma_attr_mask, attr, attr_mask); + + ret = erdma_modify_qp_state_rocev2(qp, ¶ms.rocev2, + erdma_attr_mask); + } out: up_write(&qp->state_lock); return ret; } -int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, - struct ib_udata *udata) -{ - struct erdma_qp_attrs new_attrs; - enum erdma_qp_attr_mask erdma_attr_mask = 0; - struct erdma_qp *qp = to_eqp(ibqp); - int ret = 0; - - if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS) - return -EOPNOTSUPP; - - memset(&new_attrs, 0, sizeof(new_attrs)); - - if (attr_mask & IB_QP_STATE) { - new_attrs.state = ib_qp_state_to_erdma_qp_state[attr->qp_state]; - - erdma_attr_mask |= ERDMA_QP_ATTR_STATE; - } - - down_write(&qp->state_lock); - - ret = erdma_modify_qp_internal(qp, &new_attrs, erdma_attr_mask); - - up_write(&qp->state_lock); - - return ret; -} - static enum ib_qp_state query_qp_state(struct erdma_qp *qp) { - switch (qp->attrs.state) { - case ERDMA_QP_STATE_IDLE: - return IB_QPS_INIT; - case ERDMA_QP_STATE_RTR: - return IB_QPS_RTR; - case ERDMA_QP_STATE_RTS: - return IB_QPS_RTS; - case ERDMA_QP_STATE_CLOSING: - return IB_QPS_ERR; - case ERDMA_QP_STATE_TERMINATE: - return IB_QPS_ERR; - case ERDMA_QP_STATE_ERROR: - return IB_QPS_ERR; - default: - return IB_QPS_ERR; - } + if (erdma_device_iwarp(qp->dev)) + 
return iwarp_to_ib_qps(qp->attrs.iwarp.state); + else + return rocev2_to_ib_qps(qp->attrs.rocev2.state); } int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index fad3e475d8f1..f9408ccc8bad 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -195,25 +195,26 @@ struct erdma_kqp { u8 sig_all; }; -enum erdma_qp_state { - ERDMA_QP_STATE_IDLE = 0, - ERDMA_QP_STATE_RTR = 1, - ERDMA_QP_STATE_RTS = 2, - ERDMA_QP_STATE_CLOSING = 3, - ERDMA_QP_STATE_TERMINATE = 4, - ERDMA_QP_STATE_ERROR = 5, - ERDMA_QP_STATE_UNDEF = 7, - ERDMA_QP_STATE_COUNT = 8 +enum erdma_qps_iwarp { + ERDMA_QPS_IWARP_IDLE = 0, + ERDMA_QPS_IWARP_RTR = 1, + ERDMA_QPS_IWARP_RTS = 2, + ERDMA_QPS_IWARP_CLOSING = 3, + ERDMA_QPS_IWARP_TERMINATE = 4, + ERDMA_QPS_IWARP_ERROR = 5, + ERDMA_QPS_IWARP_UNDEF = 6, + ERDMA_QPS_IWARP_COUNT = 7, }; -enum erdma_qp_attr_mask { - ERDMA_QP_ATTR_STATE = (1 << 0), - ERDMA_QP_ATTR_LLP_HANDLE = (1 << 2), - ERDMA_QP_ATTR_ORD = (1 << 3), - ERDMA_QP_ATTR_IRD = (1 << 4), - ERDMA_QP_ATTR_SQ_SIZE = (1 << 5), - ERDMA_QP_ATTR_RQ_SIZE = (1 << 6), - ERDMA_QP_ATTR_MPA = (1 << 7) +enum erdma_qpa_mask_iwarp { + ERDMA_QPA_IWARP_STATE = (1 << 0), + ERDMA_QPA_IWARP_LLP_HANDLE = (1 << 2), + ERDMA_QPA_IWARP_ORD = (1 << 3), + ERDMA_QPA_IWARP_IRD = (1 << 4), + ERDMA_QPA_IWARP_SQ_SIZE = (1 << 5), + ERDMA_QPA_IWARP_RQ_SIZE = (1 << 6), + ERDMA_QPA_IWARP_MPA = (1 << 7), + ERDMA_QPA_IWARP_CC = (1 << 8), }; enum erdma_qps_rocev2 { @@ -240,6 +241,23 @@ enum erdma_qp_flags { ERDMA_QP_IN_FLUSHING = (1 << 0), }; +#define ERDMA_QP_ACTIVE 0 +#define ERDMA_QP_PASSIVE 1 + +struct erdma_mod_qp_params_iwarp { + enum erdma_qps_iwarp state; + enum erdma_cc_alg cc; + u8 qp_type; + u8 pd_len; + u32 irq_size; + u32 orq_size; +}; + +struct erdma_qp_attrs_iwarp { + enum erdma_qps_iwarp state; + u32 cookie; +}; + struct erdma_mod_qp_params_rocev2 { enum erdma_qps_rocev2 state; u32 qkey; @@ -249,6 +267,11 @@ struct erdma_mod_qp_params_rocev2 { struct erdma_av av; }; +union erdma_mod_qp_params { + struct erdma_mod_qp_params_iwarp iwarp; + struct erdma_mod_qp_params_rocev2 rocev2; +}; + struct erdma_qp_attrs_rocev2 { enum erdma_qps_rocev2 state; u32 qkey; @@ -257,7 +280,6 @@ struct erdma_qp_attrs_rocev2 { }; struct erdma_qp_attrs { - enum erdma_qp_state state; enum erdma_cc_alg cc; /* Congestion control algorithm */ u32 sq_size; u32 rq_size; @@ -265,12 +287,10 @@ struct erdma_qp_attrs { u32 irq_size; u32 max_send_sge; u32 max_recv_sge; - u32 cookie; -#define ERDMA_QP_ACTIVE 0 -#define ERDMA_QP_PASSIVE 1 - u8 qp_type; - u8 pd_len; - struct erdma_qp_attrs_rocev2 rocev2; + union { + struct erdma_qp_attrs_iwarp iwarp; + struct erdma_qp_attrs_rocev2 rocev2; + }; }; struct erdma_qp { @@ -342,8 +362,9 @@ static inline struct erdma_cq *find_cq_by_cqn(struct erdma_dev *dev, int id) void erdma_qp_get(struct erdma_qp *qp); void erdma_qp_put(struct erdma_qp *qp); -int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, - enum erdma_qp_attr_mask mask); +int erdma_modify_qp_state_iwarp(struct erdma_qp *qp, + struct erdma_mod_qp_params_iwarp *params, + int mask); int erdma_modify_qp_state_rocev2(struct erdma_qp *qp, struct erdma_mod_qp_params_rocev2 *params, int attr_mask); @@ -426,8 +447,6 @@ int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, struct ib_qp_init_attr *init_attr); int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, 
struct ib_udata *data); -int erdma_modify_qp_rocev2(struct ib_qp *ibqp, struct ib_qp_attr *attr, - int mask, struct ib_udata *udata); int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); int erdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); void erdma_disassociate_ucontext(struct ib_ucontext *ibcontext); From 1cccbd3eec3d6370ddf9a003c1a1baeb8c155b3b Mon Sep 17 00:00:00 2001 From: Boshi Yu Date: Wed, 11 Dec 2024 10:09:07 +0800 Subject: [PATCH 12/63] RDMA/erdma: Add the query_qp command to the cmdq Certain QP attributes, such as sq_draining, can only be obtained by querying the hardware on the erdma RoCEv2 device. To address this, we add the query_qp command to the cmdq and parse the response to retrieve the corresponding QP attributes. Signed-off-by: Boshi Yu Link: https://patch.msgid.link/20241211020930.68833-8-boshiyu@linux.alibaba.com Reviewed-by: Cheng Xu Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma_hw.h | 12 +++++ drivers/infiniband/hw/erdma/erdma_verbs.c | 58 +++++++++++++++++++---- 2 files changed, 60 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index 3b0f7fc4ff31..809e77dde271 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -154,6 +154,7 @@ enum CMDQ_RDMA_OPCODE { CMDQ_OPCODE_SET_GID = 14, CMDQ_OPCODE_CREATE_AH = 15, CMDQ_OPCODE_DESTROY_AH = 16, + CMDQ_OPCODE_QUERY_QP = 17, }; enum CMDQ_COMMON_OPCODE { @@ -362,6 +363,17 @@ struct erdma_cmdq_mod_qp_req_rocev2 { struct erdma_av_cfg av_cfg; }; +/* query qp response mask */ +#define ERDMA_CMD_QUERY_QP_RESP_SQ_PSN_MASK GENMASK_ULL(23, 0) +#define ERDMA_CMD_QUERY_QP_RESP_RQ_PSN_MASK GENMASK_ULL(47, 24) +#define ERDMA_CMD_QUERY_QP_RESP_QP_STATE_MASK GENMASK_ULL(55, 48) +#define ERDMA_CMD_QUERY_QP_RESP_SQ_DRAINING_MASK GENMASK_ULL(56, 56) + +struct erdma_cmdq_query_qp_req_rocev2 { + u64 hdr; + u32 qpn; +}; + /* create qp cfg0 */ #define ERDMA_CMD_CREATE_QP_SQ_DEPTH_MASK GENMASK(31, 20) #define ERDMA_CMD_CREATE_QP_QPN_MASK GENMASK(19, 0) diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index 0543ff972247..e7fd3b948688 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -1563,6 +1563,19 @@ static void erdma_attr_to_av(const struct rdma_ah_attr *ah_attr, av->ntype = ERDMA_NETWORK_TYPE_IPV6; } +static void erdma_av_to_attr(struct erdma_av *av, struct rdma_ah_attr *ah_attr) +{ + ah_attr->type = RDMA_AH_ATTR_TYPE_ROCE; + + rdma_ah_set_sl(ah_attr, av->sl); + rdma_ah_set_port_num(ah_attr, av->port); + rdma_ah_set_ah_flags(ah_attr, IB_AH_GRH); + + rdma_ah_set_grh(ah_attr, NULL, av->flow_label, av->sgid_index, + av->hop_limit, av->traffic_class); + rdma_ah_set_dgid_raw(ah_attr, av->dgid); +} + static int ib_qps_to_erdma_qps[ERDMA_PROTO_COUNT][IB_QPS_ERR + 1] = { [ERDMA_PROTO_IWARP] = { [IB_QPS_RESET] = ERDMA_QPS_IWARP_IDLE, @@ -1764,8 +1777,11 @@ static enum ib_qp_state query_qp_state(struct erdma_qp *qp) int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) { + struct erdma_cmdq_query_qp_req_rocev2 req; struct erdma_dev *dev; struct erdma_qp *qp; + u64 resp; + int ret; if (ibqp && qp_attr && qp_init_attr) { qp = to_eqp(ibqp); @@ -1792,8 +1808,37 @@ int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, qp_init_attr->cap = qp_attr->cap; - qp_attr->qp_state = query_qp_state(qp); -
qp_attr->cur_qp_state = query_qp_state(qp); + if (erdma_device_rocev2(dev)) { + /* Query hardware to get some attributes */ + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_QUERY_QP); + req.qpn = QP_ID(qp); + + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &resp, + NULL); + if (ret) + return ret; + + qp_attr->sq_psn = + FIELD_GET(ERDMA_CMD_QUERY_QP_RESP_SQ_PSN_MASK, resp); + qp_attr->rq_psn = + FIELD_GET(ERDMA_CMD_QUERY_QP_RESP_RQ_PSN_MASK, resp); + qp_attr->qp_state = rocev2_to_ib_qps( + FIELD_GET(ERDMA_CMD_QUERY_QP_RESP_QP_STATE_MASK, resp)); + qp_attr->cur_qp_state = qp_attr->qp_state; + qp_attr->sq_draining = FIELD_GET( + ERDMA_CMD_QUERY_QP_RESP_SQ_DRAINING_MASK, resp); + + qp_attr->pkey_index = 0; + qp_attr->dest_qp_num = qp->attrs.rocev2.dst_qpn; + + if (qp->ibqp.qp_type == IB_QPT_RC) + erdma_av_to_attr(&qp->attrs.rocev2.av, + &qp_attr->ah_attr); + } else { + qp_attr->qp_state = query_qp_state(qp); + qp_attr->cur_qp_state = qp_attr->qp_state; + } return 0; } @@ -2185,14 +2230,7 @@ int erdma_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr) struct erdma_ah *ah = to_eah(ibah); memset(ah_attr, 0, sizeof(*ah_attr)); - - ah_attr->type = RDMA_AH_ATTR_TYPE_ROCE; - rdma_ah_set_sl(ah_attr, ah->av.sl); - rdma_ah_set_port_num(ah_attr, ah->av.port); - rdma_ah_set_ah_flags(ah_attr, IB_AH_GRH); - rdma_ah_set_grh(ah_attr, NULL, ah->av.flow_label, ah->av.sgid_index, - ah->av.hop_limit, ah->av.traffic_class); - rdma_ah_set_dgid_raw(ah_attr, ah->av.dgid); + erdma_av_to_attr(&ah->av, ah_attr); return 0; } From 999a0a2e9b87c451786df32c57518a2600311035 Mon Sep 17 00:00:00 2001 From: Boshi Yu Date: Wed, 11 Dec 2024 10:09:08 +0800 Subject: [PATCH 13/63] RDMA/erdma: Support UD QPs and UD WRs The iWARP protocol supports only RC QPs previously. Now we add UD QPs and UD WRs support for the RoCEv2 protocol. 
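For context, the following is a minimal consumer-side sketch, not taken from this patch, of the kind of UD send WR that the new init_send_sqe_ud() path builds an SQE for; dma_addr, len, mr, ah, dest_qpn and qkey are placeholder names assumed to be set up elsewhere.

    /* Illustrative sketch only, not driver code: post one signaled UD send. */
    struct ib_sge sge = {
            .addr   = dma_addr,             /* assumed DMA-mapped buffer */
            .length = len,
            .lkey   = mr->lkey,
    };
    struct ib_ud_wr wr = {
            .wr = {
                    .opcode     = IB_WR_SEND,
                    .send_flags = IB_SEND_SIGNALED,
                    .sg_list    = &sge,
                    .num_sge    = 1,
            },
            .ah          = ah,              /* from rdma_create_ah() */
            .remote_qpn  = dest_qpn,
            .remote_qkey = qkey,
    };
    const struct ib_send_wr *bad_wr;
    int ret = ib_post_send(qp, &wr.wr, &bad_wr);

A remote_qkey value with the high bit set is replaced with the QP's own qkey by the driver, as the init_send_sqe_ud() hunk below shows.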
Signed-off-by: Boshi Yu Link: https://patch.msgid.link/20241211020930.68833-9-boshiyu@linux.alibaba.com Reviewed-by: Cheng Xu Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma_cq.c | 20 +++++++ drivers/infiniband/hw/erdma/erdma_hw.h | 37 +++++++++++- drivers/infiniband/hw/erdma/erdma_qp.c | 71 ++++++++++++++++++----- drivers/infiniband/hw/erdma/erdma_verbs.c | 29 +++++++-- 4 files changed, 136 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma_cq.c b/drivers/infiniband/hw/erdma/erdma_cq.c index eada882472a3..1f456327e63c 100644 --- a/drivers/infiniband/hw/erdma/erdma_cq.c +++ b/drivers/infiniband/hw/erdma/erdma_cq.c @@ -105,6 +105,22 @@ static const struct { { ERDMA_WC_RETRY_EXC_ERR, IB_WC_RETRY_EXC_ERR, ERDMA_WC_VENDOR_NO_ERR }, }; +static void erdma_process_ud_cqe(struct erdma_cqe *cqe, struct ib_wc *wc) +{ + u32 ud_info; + + wc->wc_flags |= (IB_WC_GRH | IB_WC_WITH_NETWORK_HDR_TYPE); + ud_info = be32_to_cpu(cqe->ud.info); + wc->network_hdr_type = FIELD_GET(ERDMA_CQE_NTYPE_MASK, ud_info); + if (wc->network_hdr_type == ERDMA_NETWORK_TYPE_IPV4) + wc->network_hdr_type = RDMA_NETWORK_IPV4; + else + wc->network_hdr_type = RDMA_NETWORK_IPV6; + wc->src_qp = FIELD_GET(ERDMA_CQE_SQPN_MASK, ud_info); + wc->sl = FIELD_GET(ERDMA_CQE_SL_MASK, ud_info); + wc->pkey_index = 0; +} + #define ERDMA_POLLCQ_NO_QP 1 static int erdma_poll_one_cqe(struct erdma_cq *cq, struct ib_wc *wc) @@ -168,6 +184,10 @@ static int erdma_poll_one_cqe(struct erdma_cq *cq, struct ib_wc *wc) wc->wc_flags |= IB_WC_WITH_INVALIDATE; } + if (erdma_device_rocev2(dev) && + (qp->ibqp.qp_type == IB_QPT_UD || qp->ibqp.qp_type == IB_QPT_GSI)) + erdma_process_ud_cqe(cqe, wc); + if (syndrome >= ERDMA_NUM_WC_STATUS) syndrome = ERDMA_WC_GENERAL_ERR; diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h index 809e77dde271..ea4db53901a4 100644 --- a/drivers/infiniband/hw/erdma/erdma_hw.h +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -374,6 +374,11 @@ struct erdma_cmdq_query_qp_req_rocev2 { u32 qpn; }; +enum erdma_qp_type { + ERDMA_QPT_RC = 0, + ERDMA_QPT_UD = 1, +}; + /* create qp cfg0 */ #define ERDMA_CMD_CREATE_QP_SQ_DEPTH_MASK GENMASK(31, 20) #define ERDMA_CMD_CREATE_QP_QPN_MASK GENMASK(19, 0) @@ -382,6 +387,9 @@ struct erdma_cmdq_query_qp_req_rocev2 { #define ERDMA_CMD_CREATE_QP_RQ_DEPTH_MASK GENMASK(31, 20) #define ERDMA_CMD_CREATE_QP_PD_MASK GENMASK(19, 0) +/* create qp cfg2 */ +#define ERDMA_CMD_CREATE_QP_TYPE_MASK GENMASK(3, 0) + /* create qp cqn_mtt_cfg */ #define ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK GENMASK(31, 28) #define ERDMA_CMD_CREATE_QP_DB_CFG_MASK BIT(25) @@ -415,6 +423,7 @@ struct erdma_cmdq_create_qp_req { u64 rq_mtt_entry[3]; u32 db_cfg; + u32 cfg2; }; struct erdma_cmdq_destroy_qp_req { @@ -522,6 +531,10 @@ enum { #define ERDMA_CQE_QTYPE_RQ 1 #define ERDMA_CQE_QTYPE_CMDQ 2 +#define ERDMA_CQE_NTYPE_MASK BIT(31) +#define ERDMA_CQE_SL_MASK GENMASK(27, 20) +#define ERDMA_CQE_SQPN_MASK GENMASK(19, 0) + struct erdma_cqe { __be32 hdr; __be32 qe_idx; @@ -531,7 +544,16 @@ struct erdma_cqe { __be32 inv_rkey; }; __be32 size; - __be32 rsvd[3]; + union { + struct { + __be32 rsvd[3]; + } rc; + + struct { + __be32 rsvd[2]; + __be32 info; + } ud; + }; }; struct erdma_sge { @@ -583,7 +605,7 @@ struct erdma_write_sqe { struct erdma_sge sgl[]; }; -struct erdma_send_sqe { +struct erdma_send_sqe_rc { __le64 hdr; union { __be32 imm_data; @@ -594,6 +616,17 @@ struct erdma_send_sqe { struct erdma_sge sgl[]; }; +struct erdma_send_sqe_ud { + __le64 hdr; + __be32 
imm_data; + __le32 length; + __le32 qkey; + __le32 dst_qpn; + __le32 ahn; + __le32 rsvd; + struct erdma_sge sgl[]; +}; + struct erdma_readreq_sqe { __le64 hdr; __le32 invalid_stag; diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c index 03d93f026fca..4dfb4272ad86 100644 --- a/drivers/infiniband/hw/erdma/erdma_qp.c +++ b/drivers/infiniband/hw/erdma/erdma_qp.c @@ -398,17 +398,57 @@ static int fill_sgl(struct erdma_qp *qp, const struct ib_send_wr *send_wr, return 0; } +static void init_send_sqe_rc(struct erdma_qp *qp, struct erdma_send_sqe_rc *sqe, + const struct ib_send_wr *wr, u32 *hw_op) +{ + u32 op = ERDMA_OP_SEND; + + if (wr->opcode == IB_WR_SEND_WITH_IMM) { + op = ERDMA_OP_SEND_WITH_IMM; + sqe->imm_data = wr->ex.imm_data; + } else if (op == IB_WR_SEND_WITH_INV) { + op = ERDMA_OP_SEND_WITH_INV; + sqe->invalid_stag = cpu_to_le32(wr->ex.invalidate_rkey); + } + + *hw_op = op; +} + +static void init_send_sqe_ud(struct erdma_qp *qp, struct erdma_send_sqe_ud *sqe, + const struct ib_send_wr *wr, u32 *hw_op) +{ + const struct ib_ud_wr *uwr = ud_wr(wr); + struct erdma_ah *ah = to_eah(uwr->ah); + u32 op = ERDMA_OP_SEND; + + if (wr->opcode == IB_WR_SEND_WITH_IMM) { + op = ERDMA_OP_SEND_WITH_IMM; + sqe->imm_data = wr->ex.imm_data; + } + + *hw_op = op; + + sqe->ahn = cpu_to_le32(ah->ahn); + sqe->dst_qpn = cpu_to_le32(uwr->remote_qpn); + /* Not allowed to send control qkey */ + if (uwr->remote_qkey & 0x80000000) + sqe->qkey = cpu_to_le32(qp->attrs.rocev2.qkey); + else + sqe->qkey = cpu_to_le32(uwr->remote_qkey); +} + static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, const struct ib_send_wr *send_wr) { u32 wqe_size, wqebb_cnt, hw_op, flags, sgl_offset; u32 idx = *pi & (qp->attrs.sq_size - 1); enum ib_wr_opcode op = send_wr->opcode; + struct erdma_send_sqe_rc *rc_send_sqe; + struct erdma_send_sqe_ud *ud_send_sqe; struct erdma_atomic_sqe *atomic_sqe; struct erdma_readreq_sqe *read_sqe; struct erdma_reg_mr_sqe *regmr_sge; struct erdma_write_sqe *write_sqe; - struct erdma_send_sqe *send_sqe; struct ib_rdma_wr *rdma_wr; struct erdma_sge *sge; __le32 *length_field; @@ -417,6 +457,10 @@ static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, u32 attrs; int ret; + if (qp->ibqp.qp_type != IB_QPT_RC && send_wr->opcode != IB_WR_SEND && + send_wr->opcode != IB_WR_SEND_WITH_IMM) + return -EINVAL; + entry = get_queue_entry(qp->kern_qp.sq_buf, idx, qp->attrs.sq_size, SQEBB_SHIFT); @@ -490,21 +534,20 @@ static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, case IB_WR_SEND: case IB_WR_SEND_WITH_IMM: case IB_WR_SEND_WITH_INV: - send_sqe = (struct erdma_send_sqe *)entry; - hw_op = ERDMA_OP_SEND; - if (op == IB_WR_SEND_WITH_IMM) { - hw_op = ERDMA_OP_SEND_WITH_IMM; - send_sqe->imm_data = send_wr->ex.imm_data; - } else if (op == IB_WR_SEND_WITH_INV) { - hw_op = ERDMA_OP_SEND_WITH_INV; - send_sqe->invalid_stag = - cpu_to_le32(send_wr->ex.invalidate_rkey); + if (qp->ibqp.qp_type == IB_QPT_RC) { + rc_send_sqe = (struct erdma_send_sqe_rc *)entry; + init_send_sqe_rc(qp, rc_send_sqe, send_wr, &hw_op); + length_field = &rc_send_sqe->length; + wqe_size = sizeof(struct erdma_send_sqe_rc); + } else { + ud_send_sqe = (struct erdma_send_sqe_ud *)entry; + init_send_sqe_ud(qp, ud_send_sqe, send_wr, &hw_op); + length_field = &ud_send_sqe->length; + wqe_size = sizeof(struct erdma_send_sqe_ud); } - wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op); - length_field = &send_sqe->length; - wqe_size = sizeof(struct erdma_send_sqe); - sgl_offset = wqe_size; + sgl_offset = 
wqe_size; + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op); break; case IB_WR_REG_MR: wqe_hdr |= diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index e7fd3b948688..e7967193ac82 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -55,6 +55,13 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp) ilog2(qp->attrs.rq_size)) | FIELD_PREP(ERDMA_CMD_CREATE_QP_PD_MASK, pd->pdn); + if (qp->ibqp.qp_type == IB_QPT_RC) + req.cfg2 = FIELD_PREP(ERDMA_CMD_CREATE_QP_TYPE_MASK, + ERDMA_QPT_RC); + else + req.cfg2 = FIELD_PREP(ERDMA_CMD_CREATE_QP_TYPE_MASK, + ERDMA_QPT_UD); + if (rdma_is_kernel_res(&qp->ibqp.res)) { u32 pgsz_range = ilog2(SZ_1M) - ERDMA_HW_PAGE_SHIFT; @@ -481,7 +488,11 @@ static int erdma_qp_validate_cap(struct erdma_dev *dev, static int erdma_qp_validate_attr(struct erdma_dev *dev, struct ib_qp_init_attr *attrs) { - if (attrs->qp_type != IB_QPT_RC) + if (erdma_device_iwarp(dev) && attrs->qp_type != IB_QPT_RC) + return -EOPNOTSUPP; + + if (erdma_device_rocev2(dev) && attrs->qp_type != IB_QPT_RC && + attrs->qp_type != IB_QPT_UD && attrs->qp_type != IB_QPT_GSI) return -EOPNOTSUPP; if (attrs->srq) @@ -959,7 +970,8 @@ int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs, udata, struct erdma_ucontext, ibucontext); struct erdma_ureq_create_qp ureq; struct erdma_uresp_create_qp uresp; - int ret; + void *old_entry; + int ret = 0; ret = erdma_qp_validate_cap(dev, attrs); if (ret) @@ -978,9 +990,16 @@ int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs, kref_init(&qp->ref); init_completion(&qp->safe_free); - ret = xa_alloc_cyclic(&dev->qp_xa, &qp->ibqp.qp_num, qp, - XA_LIMIT(1, dev->attrs.max_qp - 1), - &dev->next_alloc_qpn, GFP_KERNEL); + if (qp->ibqp.qp_type == IB_QPT_GSI) { + old_entry = xa_store(&dev->qp_xa, 1, qp, GFP_KERNEL); + if (xa_is_err(old_entry)) + ret = xa_err(old_entry); + } else { + ret = xa_alloc_cyclic(&dev->qp_xa, &qp->ibqp.qp_num, qp, + XA_LIMIT(1, dev->attrs.max_qp - 1), + &dev->next_alloc_qpn, GFP_KERNEL); + } + if (ret < 0) { ret = -ENOMEM; goto err_out; From c7f2cfe81e059a6efdf7be95e5efa9044a2f4b67 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Tue, 10 Dec 2024 19:45:41 -0800 Subject: [PATCH 14/63] RDMA/bnxt_re: Remove extra new line in bnxt_re_netdev_event This is a purely cosmetic change. Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1733888745-30939-2-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index b7af0d5ff3b6..735bd781dcfc 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -2247,7 +2247,6 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier, if (!rdev) return NOTIFY_DONE; - switch (event) { case NETDEV_UP: case NETDEV_DOWN: From ae51cb98213268464a7fba9273cb979037516e9a Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Tue, 10 Dec 2024 19:45:42 -0800 Subject: [PATCH 15/63] RDMA/bnxt_re: Remove unnecessary goto in bnxt_re_netdev_event Return directly in case of error without a goto label as there are no cleanup actions performed.
Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1733888745-30939-3-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 735bd781dcfc..ae5025b09282 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -2241,7 +2241,7 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier, real_dev = netdev; if (real_dev != netdev) - goto exit; + return NOTIFY_DONE; rdev = bnxt_re_from_netdev(real_dev); if (!rdev) @@ -2260,7 +2260,7 @@ static int bnxt_re_netdev_event(struct notifier_block *notifier, break; } ib_device_put(&rdev->ibdev); -exit: + return NOTIFY_DONE; } From 55992c386263f3899552118b2e3d142cd223eba4 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Tue, 10 Dec 2024 19:45:43 -0800 Subject: [PATCH 16/63] RDMA/bnxt_re: Optimize error handling in bnxt_re_probe Optimize error handling path in bnxt_re_probe by removing some duplicate code. Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1733888745-30939-4-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index ae5025b09282..75e1611fc3ed 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -2321,13 +2321,9 @@ static int bnxt_re_probe(struct auxiliary_device *adev, rc = bnxt_re_add_device(adev, BNXT_RE_COMPLETE_INIT); if (rc) - goto err; - mutex_unlock(&bnxt_re_mutex); - return 0; + kfree(en_info); -err: mutex_unlock(&bnxt_re_mutex); - kfree(en_info); return rc; } From 8aa3dd3e7659128c73fe9e41381e364db8c2e56c Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Tue, 10 Dec 2024 19:45:44 -0800 Subject: [PATCH 17/63] RDMA/bnxt_re: Eliminate need for some forward declarations Move the function definition of bnxt_re_shutdown() to avoid forward declaration of bnxt_re_dev_uninit(). Move the function definition of bnxt_re_setup_cc() before bnxt_re_add_device() to avoid its forward declaration. Also, forward declarations of bnxt_re_stop_irq() and bnxt_re_dev_stop() are unnecessary.
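As a generic illustration only, not code from this driver, placing the callee's definition ahead of its first caller is what removes the need for the separate declaration:

    /* Generic sketch: if run() appeared first, a "static void do_work(void);"
     * forward declaration would be required above it. Defining do_work()
     * first makes that declaration unnecessary. */
    static void do_work(void)
    {
            /* ... */
    }

    static void run(void)
    {
            do_work();
    }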
Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1733888745-30939-5-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 73 +++++++++++++--------------- 1 file changed, 34 insertions(+), 39 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 75e1611fc3ed..5e10b540329a 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -79,17 +79,12 @@ MODULE_LICENSE("Dual BSD/GPL"); /* globals */ static DEFINE_MUTEX(bnxt_re_mutex); -static void bnxt_re_stop_irq(void *handle); -static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev); static int bnxt_re_netdev_event(struct notifier_block *notifier, unsigned long event, void *ptr); -static struct bnxt_re_dev *bnxt_re_from_netdev(struct net_device *netdev); -static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type); static int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev); static int bnxt_re_hwrm_qcfg(struct bnxt_re_dev *rdev, u32 *db_len, u32 *offset); -static void bnxt_re_setup_cc(struct bnxt_re_dev *rdev, bool enable); static void bnxt_re_set_db_offset(struct bnxt_re_dev *rdev) { struct bnxt_qplib_chip_ctx *cctx; @@ -302,16 +297,6 @@ static void bnxt_re_vf_res_config(struct bnxt_re_dev *rdev) &rdev->qplib_ctx); } -static void bnxt_re_shutdown(struct auxiliary_device *adev) -{ - struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev); - struct bnxt_re_dev *rdev; - - rdev = en_info->rdev; - ib_unregister_device(&rdev->ibdev); - bnxt_re_dev_uninit(rdev, BNXT_RE_COMPLETE_REMOVE); -} - static void bnxt_re_stop_irq(void *handle) { struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(handle); @@ -2123,6 +2108,30 @@ fail: return rc; } +static void bnxt_re_setup_cc(struct bnxt_re_dev *rdev, bool enable) +{ + struct bnxt_qplib_cc_param cc_param = {}; + + /* Do not enable congestion control on VFs */ + if (rdev->is_virtfn) + return; + + /* Currently enabling only for GenP5 adapters */ + if (!bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) + return; + + if (enable) { + cc_param.enable = 1; + cc_param.tos_ecn = 1; + } + + cc_param.mask = (CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ENABLE_CC | + CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TOS_ECN); + + if (bnxt_qplib_modify_cc(&rdev->qplib_res, &cc_param)) + ibdev_err(&rdev->ibdev, "Failed to setup CC enable = %d\n", enable); +} + static void bnxt_re_update_en_info_rdev(struct bnxt_re_dev *rdev, struct bnxt_re_en_dev_info *en_info, struct auxiliary_device *adev) @@ -2192,30 +2201,6 @@ exit: return rc; } -static void bnxt_re_setup_cc(struct bnxt_re_dev *rdev, bool enable) -{ - struct bnxt_qplib_cc_param cc_param = {}; - - /* Do not enable congestion control on VFs */ - if (rdev->is_virtfn) - return; - - /* Currently enabling only for GenP5 adapters */ - if (!bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) - return; - - if (enable) { - cc_param.enable = 1; - cc_param.tos_ecn = 1; - } - - cc_param.mask = (CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_ENABLE_CC | - CMDQ_MODIFY_ROCE_CC_MODIFY_MASK_TOS_ECN); - - if (bnxt_qplib_modify_cc(&rdev->qplib_res, &cc_param)) - ibdev_err(&rdev->ibdev, "Failed to setup CC enable = %d\n", enable); -} - /* * "Notifier chain callback can be invoked for the same chain from * different CPUs at the same time". 
@@ -2376,6 +2361,16 @@ static int bnxt_re_resume(struct auxiliary_device *adev) return 0; } +static void bnxt_re_shutdown(struct auxiliary_device *adev) +{ + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev); + struct bnxt_re_dev *rdev; + + rdev = en_info->rdev; + ib_unregister_device(&rdev->ibdev); + bnxt_re_dev_uninit(rdev, BNXT_RE_COMPLETE_REMOVE); +} + static const struct auxiliary_device_id bnxt_re_id_table[] = { { .name = BNXT_ADEV_NAME ".rdma", }, {}, From 1950af31dc66487ac21287cea5edc92738e7c8c8 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Tue, 10 Dec 2024 19:45:45 -0800 Subject: [PATCH 18/63] RDMA/bnxt_re: Remove unnecessary header file inclusion There is no need to include bnxt_ulp.h in ib_verbs.c. Remove it. Also, fixed hw_counters.c to remove unwanted header file inclusions. Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1733888745-30939-6-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/hw_counters.c | 9 --------- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 2 -- 2 files changed, 11 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.c b/drivers/infiniband/hw/bnxt_re/hw_counters.c index 1e63f8091748..77ec2eda7268 100644 --- a/drivers/infiniband/hw/bnxt_re/hw_counters.c +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.c @@ -37,18 +37,9 @@ * */ -#include #include -#include -#include -#include #include -#include -#include -#include - -#include "bnxt_ulp.h" #include "roce_hsi.h" #include "qplib_res.h" #include "qplib_sp.h" diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 82023394e330..129178bdc581 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -52,8 +52,6 @@ #include #include -#include "bnxt_ulp.h" - #include "roce_hsi.h" #include "qplib_res.h" #include "qplib_sp.h" From 2dab32d1c79c4fc1542075ab86de1c8948317375 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 16 Dec 2024 21:19:14 +0000 Subject: [PATCH 19/63] IB/hfi1: Remove unused hfi1_format_hwerrors hfi1_format_hwerrors() was added in 2015 by commit 7724105686e7 ("IB/hfi1: add driver files") but never used. Remove it. Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20241216211914.745111-1-linux@treblig.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hfi1/hfi.h | 14 -------------- drivers/infiniband/hw/hfi1/intr.c | 31 ------------------------------- 2 files changed, 45 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index eb38f81aeeb1..cb630551cf1a 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -2339,20 +2339,6 @@ static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd) dev_err(&(dd)->pcidev->dev, "%s: port %u: " fmt, \ rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), (port), ##__VA_ARGS__) -/* - * this is used for formatting hw error messages... - */ -struct hfi1_hwerror_msgs { - u64 mask; - const char *msg; - size_t sz; -}; - -/* in intr.c... 
*/ -void hfi1_format_hwerrors(u64 hwerrs, - const struct hfi1_hwerror_msgs *hwerrmsgs, - size_t nhwerrmsgs, char *msg, size_t lmsg); - #define USER_OPCODE_CHECK_VAL 0xC0 #define USER_OPCODE_CHECK_MASK 0xC0 #define OPCODE_CHECK_VAL_DISABLED 0x0 diff --git a/drivers/infiniband/hw/hfi1/intr.c b/drivers/infiniband/hw/hfi1/intr.c index 3737f632d62a..d8dd1a599631 100644 --- a/drivers/infiniband/hw/hfi1/intr.c +++ b/drivers/infiniband/hw/hfi1/intr.c @@ -47,37 +47,6 @@ static void add_full_mgmt_pkey(struct hfi1_pportdata *ppd) hfi1_event_pkey_change(ppd->dd, ppd->port); } -/** - * format_hwmsg - format a single hwerror message - * @msg: message buffer - * @msgl: length of message buffer - * @hwmsg: message to add to message buffer - */ -static void format_hwmsg(char *msg, size_t msgl, const char *hwmsg) -{ - strlcat(msg, "[", msgl); - strlcat(msg, hwmsg, msgl); - strlcat(msg, "]", msgl); -} - -/** - * hfi1_format_hwerrors - format hardware error messages for display - * @hwerrs: hardware errors bit vector - * @hwerrmsgs: hardware error descriptions - * @nhwerrmsgs: number of hwerrmsgs - * @msg: message buffer - * @msgl: message buffer length - */ -void hfi1_format_hwerrors(u64 hwerrs, const struct hfi1_hwerror_msgs *hwerrmsgs, - size_t nhwerrmsgs, char *msg, size_t msgl) -{ - int i; - - for (i = 0; i < nhwerrmsgs; i++) - if (hwerrs & hwerrmsgs[i].mask) - format_hwmsg(msg, msgl, hwerrmsgs[i].msg); -} - static void signal_ib_event(struct hfi1_pportdata *ppd, enum ib_event_type ev) { struct ib_event event; From db03b70969aab4ef111a3369cfd90ea4da3a6aa0 Mon Sep 17 00:00:00 2001 From: zhenwei pi Date: Mon, 16 Dec 2024 20:19:53 +0800 Subject: [PATCH 20/63] RDMA/rxe: Fix mismatched max_msg_sz User mode queries max_msg_sz as 0x800000 by command 'ibv_devinfo -v', however ibv_post_send/ibv_post_recv has a limit of 2^31. Fix this mismatched information. 
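For readers coming at this from user space, the mismatch is easiest to see against the attribute libibverbs exposes: max_msg_sz is reported per port, and after this change the advertised value and the limit enforced at post time agree. The snippet below is an illustrative sketch only and is not part of this patch; ibv_query_port() and the struct ibv_port_attr fields are standard libibverbs, while the helper name and surrounding logic are invented.

#include <stdint.h>
#include <stdio.h>
#include <infiniband/verbs.h>

/* Hypothetical helper: refuse to post a send whose total length exceeds
 * the limit the provider advertises for this port.
 */
static int msg_len_ok(struct ibv_context *ctx, uint8_t port_num,
                      const struct ibv_sge *sgl, int num_sge)
{
        struct ibv_port_attr pattr;
        uint64_t len = 0;
        int i;

        if (ibv_query_port(ctx, port_num, &pattr))
                return 0;

        /* Sum the scatter/gather lengths, as the provider does on post. */
        for (i = 0; i < num_sge; i++)
                len += sgl[i].length;

        if (len > pattr.max_msg_sz) {
                fprintf(stderr, "msg len %llu > max_msg_sz %u\n",
                        (unsigned long long)len, pattr.max_msg_sz);
                return 0;
        }
        return 1;
}

With the old advertised value of 0x800000, a check like this would reject transfers the provider actually accepted; after this patch the advertised and enforced limits are the same 2^31 value.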
Signed-off-by: zhenwei pi Fixes: f605f26ea196 ("RDMA/rxe: Protect QP state with qp->state_lock") Fixes: 5bf944f24129 ("RDMA/rxe: Add error messages") Link: https://patch.msgid.link/20241216121953.765331-1-pizhenwei@bytedance.com Review-by: Zhu Yanjun Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_param.h | 2 +- drivers/infiniband/sw/rxe/rxe_verbs.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h index d2f57ead78ad..003f681e5dc0 100644 --- a/drivers/infiniband/sw/rxe/rxe_param.h +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -129,7 +129,7 @@ enum rxe_device_param { enum rxe_port_param { RXE_PORT_GID_TBL_LEN = 1024, RXE_PORT_PORT_CAP_FLAGS = IB_PORT_CM_SUP, - RXE_PORT_MAX_MSG_SZ = 0x800000, + RXE_PORT_MAX_MSG_SZ = (1UL << 31), RXE_PORT_BAD_PKEY_CNTR = 0, RXE_PORT_QKEY_VIOL_CNTR = 0, RXE_PORT_LID = 0, diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 5c18f7e342f2..ffd5b07ad3e6 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -688,7 +688,7 @@ static int validate_send_wr(struct rxe_qp *qp, const struct ib_send_wr *ibwr, for (i = 0; i < ibwr->num_sge; i++) length += ibwr->sg_list[i].length; - if (length > (1UL << 31)) { + if (length > RXE_PORT_MAX_MSG_SZ) { rxe_err_qp(qp, "message length too long\n"); break; } @@ -972,8 +972,7 @@ static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr) for (i = 0; i < num_sge; i++) length += ibwr->sg_list[i].length; - /* IBA max message size is 2^31 */ - if (length >= (1UL<<31)) { + if (length > RXE_PORT_MAX_MSG_SZ) { err = -EINVAL; rxe_dbg("message length too long\n"); goto err_out; From a3cbf68c69611188cd304229e346bffdabfd4277 Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Tue, 17 Dec 2024 15:55:38 +0800 Subject: [PATCH 21/63] RDMA/srp: Fix error handling in srp_add_port As comment of device_add() says, if device_add() succeeds, you should call device_del() when you want to get rid of it. If device_add() has not succeeded, use only put_device() to drop the reference count. Add a put_device() call before returning from the function to decrement reference count for cleanup. Found by code review. Fixes: c8e4c2397655 ("RDMA/srp: Rework the srp_add_port() error path") Signed-off-by: Ma Ke Link: https://patch.msgid.link/20241217075538.2909996-1-make_ruc2021@163.com Signed-off-by: Bart Van Assche Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/srp/ib_srp.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 2916e77f589b..7289ae0b83ac 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -3978,7 +3978,6 @@ static struct srp_host *srp_add_port(struct srp_device *device, u32 port) return host; put_host: - device_del(&host->dev); put_device(&host->dev); return NULL; } From c57c76498a895a705b624ed65dfe5a3037b78d83 Mon Sep 17 00:00:00 2001 From: Advait Dhamorikar Date: Thu, 19 Dec 2024 10:09:39 +0530 Subject: [PATCH 22/63] RDMA/erdma: Fix opcode conditional check Fix conditional if else check by checking with wr->opcode. The indicated dead code may have performed some action; that action will never occur as op is pre-assigned a different value. 
Fixes: 999a0a2e9b87 ("RDMA/erdma: Support UD QPs and UD WRs") Signed-off-by: Advait Dhamorikar Link: https://patch.msgid.link/20241219043939.10344-1-advaitdhamorikar@gmail.com Reviewed-by: Cheng Xu Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma_qp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c index 4dfb4272ad86..5c266918fb36 100644 --- a/drivers/infiniband/hw/erdma/erdma_qp.c +++ b/drivers/infiniband/hw/erdma/erdma_qp.c @@ -406,7 +406,7 @@ static void init_send_sqe_rc(struct erdma_qp *qp, struct erdma_send_sqe_rc *sqe, if (wr->opcode == IB_WR_SEND_WITH_IMM) { op = ERDMA_OP_SEND_WITH_IMM; sqe->imm_data = wr->ex.imm_data; - } else if (op == IB_WR_SEND_WITH_INV) { + } else if (wr->opcode == IB_WR_SEND_WITH_INV) { op = ERDMA_OP_SEND_WITH_INV; sqe->invalid_stag = cpu_to_le32(wr->ex.invalidate_rkey); } From 695df3e833c04aa48e57c28a25311dfdde241ce3 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 23 Dec 2024 00:16:13 +0000 Subject: [PATCH 23/63] RDMA/irdma: Remove unused irdma_cqp_*_fpm_val_cmd functions irdma_cqp_commit_fpm_val_cmd() and irdma_cqp_query_fpm_val_cmd() were added in 2021 by commit 915cc7ac0f8e ("RDMA/irdma: Add miscellaneous utility definitions") but haven't been used. Remove them. Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20241223001613.307138-1-linux@treblig.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/osdep.h | 4 -- drivers/infiniband/hw/irdma/protos.h | 4 -- drivers/infiniband/hw/irdma/utils.c | 68 ---------------------------- 3 files changed, 76 deletions(-) diff --git a/drivers/infiniband/hw/irdma/osdep.h b/drivers/infiniband/hw/irdma/osdep.h index e1e3d3ae72b7..ddf02a462efa 100644 --- a/drivers/infiniband/hw/irdma/osdep.h +++ b/drivers/infiniband/hw/irdma/osdep.h @@ -59,10 +59,6 @@ int irdma_cqp_sds_cmd(struct irdma_sc_dev *dev, int irdma_cqp_manage_hmc_fcn_cmd(struct irdma_sc_dev *dev, struct irdma_hmc_fcn_info *hmcfcninfo, u16 *pmf_idx); -int irdma_cqp_query_fpm_val_cmd(struct irdma_sc_dev *dev, - struct irdma_dma_mem *val_mem, u8 hmc_fn_id); -int irdma_cqp_commit_fpm_val_cmd(struct irdma_sc_dev *dev, - struct irdma_dma_mem *val_mem, u8 hmc_fn_id); int irdma_alloc_query_fpm_buf(struct irdma_sc_dev *dev, struct irdma_dma_mem *mem); void *irdma_remove_cqp_head(struct irdma_sc_dev *dev); diff --git a/drivers/infiniband/hw/irdma/protos.h b/drivers/infiniband/hw/irdma/protos.h index d7c8ea948bcd..c0c9441885d3 100644 --- a/drivers/infiniband/hw/irdma/protos.h +++ b/drivers/infiniband/hw/irdma/protos.h @@ -85,10 +85,6 @@ int irdma_process_cqp_cmd(struct irdma_sc_dev *dev, int irdma_process_bh(struct irdma_sc_dev *dev); int irdma_cqp_sds_cmd(struct irdma_sc_dev *dev, struct irdma_update_sds_info *info); -int irdma_cqp_query_fpm_val_cmd(struct irdma_sc_dev *dev, - struct irdma_dma_mem *val_mem, u8 hmc_fn_id); -int irdma_cqp_commit_fpm_val_cmd(struct irdma_sc_dev *dev, - struct irdma_dma_mem *val_mem, u8 hmc_fn_id); int irdma_alloc_query_fpm_buf(struct irdma_sc_dev *dev, struct irdma_dma_mem *mem); int irdma_cqp_manage_hmc_fcn_cmd(struct irdma_sc_dev *dev, diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index 0422787592d8..1ea29994ace3 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -971,74 +971,6 @@ void irdma_terminate_del_timer(struct irdma_sc_qp *qp) irdma_qp_rem_ref(&iwqp->ibqp); } -/** - * 
irdma_cqp_query_fpm_val_cmd - send cqp command for fpm - * @dev: function device struct - * @val_mem: buffer for fpm - * @hmc_fn_id: function id for fpm - */ -int irdma_cqp_query_fpm_val_cmd(struct irdma_sc_dev *dev, - struct irdma_dma_mem *val_mem, u8 hmc_fn_id) -{ - struct irdma_cqp_request *cqp_request; - struct cqp_cmds_info *cqp_info; - struct irdma_pci_f *rf = dev_to_rf(dev); - int status; - - cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, true); - if (!cqp_request) - return -ENOMEM; - - cqp_info = &cqp_request->info; - cqp_request->param = NULL; - cqp_info->in.u.query_fpm_val.cqp = dev->cqp; - cqp_info->in.u.query_fpm_val.fpm_val_pa = val_mem->pa; - cqp_info->in.u.query_fpm_val.fpm_val_va = val_mem->va; - cqp_info->in.u.query_fpm_val.hmc_fn_id = hmc_fn_id; - cqp_info->cqp_cmd = IRDMA_OP_QUERY_FPM_VAL; - cqp_info->post_sq = 1; - cqp_info->in.u.query_fpm_val.scratch = (uintptr_t)cqp_request; - - status = irdma_handle_cqp_op(rf, cqp_request); - irdma_put_cqp_request(&rf->cqp, cqp_request); - - return status; -} - -/** - * irdma_cqp_commit_fpm_val_cmd - commit fpm values in hw - * @dev: hardware control device structure - * @val_mem: buffer with fpm values - * @hmc_fn_id: function id for fpm - */ -int irdma_cqp_commit_fpm_val_cmd(struct irdma_sc_dev *dev, - struct irdma_dma_mem *val_mem, u8 hmc_fn_id) -{ - struct irdma_cqp_request *cqp_request; - struct cqp_cmds_info *cqp_info; - struct irdma_pci_f *rf = dev_to_rf(dev); - int status; - - cqp_request = irdma_alloc_and_get_cqp_request(&rf->cqp, true); - if (!cqp_request) - return -ENOMEM; - - cqp_info = &cqp_request->info; - cqp_request->param = NULL; - cqp_info->in.u.commit_fpm_val.cqp = dev->cqp; - cqp_info->in.u.commit_fpm_val.fpm_val_pa = val_mem->pa; - cqp_info->in.u.commit_fpm_val.fpm_val_va = val_mem->va; - cqp_info->in.u.commit_fpm_val.hmc_fn_id = hmc_fn_id; - cqp_info->cqp_cmd = IRDMA_OP_COMMIT_FPM_VAL; - cqp_info->post_sq = 1; - cqp_info->in.u.commit_fpm_val.scratch = (uintptr_t)cqp_request; - - status = irdma_handle_cqp_op(rf, cqp_request); - irdma_put_cqp_request(&rf->cqp, cqp_request); - - return status; -} - /** * irdma_cqp_cq_create_cmd - create a cq for the cqp * @dev: device pointer From 30dd62fa3954cb7697dbae9c33b2a5c50d8b5c6a Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 21 Dec 2024 01:40:18 +0000 Subject: [PATCH 24/63] RDMA/core: Remove unused ib_ud_header_unpack ib_ud_header_unpack() is unused, and I can't see any sign of it ever having been used in git. The only reference I can find is from December 2004 BKrev: 41d30034XNbBUl0XnyC6ig9V61Nf-A when it looks like it was added. Remove it. Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20241221014021.343979-2-linux@treblig.org Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/ud_header.c | 83 ----------------------------- include/rdma/ib_pack.h | 3 -- 2 files changed, 86 deletions(-) diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c index 64d9c492de64..8d3dfef9ebaa 100644 --- a/drivers/infiniband/core/ud_header.c +++ b/drivers/infiniband/core/ud_header.c @@ -462,86 +462,3 @@ int ib_ud_header_pack(struct ib_ud_header *header, return len; } EXPORT_SYMBOL(ib_ud_header_pack); - -/** - * ib_ud_header_unpack - Unpack UD header struct from wire format - * @header:UD header struct - * @buf:Buffer to pack into - * - * ib_ud_header_pack() unpacks the UD header structure @header from wire - * format in the buffer @buf. 
- */ -int ib_ud_header_unpack(void *buf, - struct ib_ud_header *header) -{ - ib_unpack(lrh_table, ARRAY_SIZE(lrh_table), - buf, &header->lrh); - buf += IB_LRH_BYTES; - - if (header->lrh.link_version != 0) { - pr_warn("Invalid LRH.link_version %u\n", - header->lrh.link_version); - return -EINVAL; - } - - switch (header->lrh.link_next_header) { - case IB_LNH_IBA_LOCAL: - header->grh_present = 0; - break; - - case IB_LNH_IBA_GLOBAL: - header->grh_present = 1; - ib_unpack(grh_table, ARRAY_SIZE(grh_table), - buf, &header->grh); - buf += IB_GRH_BYTES; - - if (header->grh.ip_version != 6) { - pr_warn("Invalid GRH.ip_version %u\n", - header->grh.ip_version); - return -EINVAL; - } - if (header->grh.next_header != 0x1b) { - pr_warn("Invalid GRH.next_header 0x%02x\n", - header->grh.next_header); - return -EINVAL; - } - break; - - default: - pr_warn("Invalid LRH.link_next_header %u\n", - header->lrh.link_next_header); - return -EINVAL; - } - - ib_unpack(bth_table, ARRAY_SIZE(bth_table), - buf, &header->bth); - buf += IB_BTH_BYTES; - - switch (header->bth.opcode) { - case IB_OPCODE_UD_SEND_ONLY: - header->immediate_present = 0; - break; - case IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE: - header->immediate_present = 1; - break; - default: - pr_warn("Invalid BTH.opcode 0x%02x\n", header->bth.opcode); - return -EINVAL; - } - - if (header->bth.transport_header_version != 0) { - pr_warn("Invalid BTH.transport_header_version %u\n", - header->bth.transport_header_version); - return -EINVAL; - } - - ib_unpack(deth_table, ARRAY_SIZE(deth_table), - buf, &header->deth); - buf += IB_DETH_BYTES; - - if (header->immediate_present) - memcpy(&header->immediate_data, buf, sizeof header->immediate_data); - - return 0; -} -EXPORT_SYMBOL(ib_ud_header_unpack); diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h index b8c56d7dc35d..8266fab826a7 100644 --- a/include/rdma/ib_pack.h +++ b/include/rdma/ib_pack.h @@ -283,7 +283,4 @@ int ib_ud_header_init(int payload_bytes, int ib_ud_header_pack(struct ib_ud_header *header, void *buf); -int ib_ud_header_unpack(void *buf, - struct ib_ud_header *header); - #endif /* IB_PACK_H */ From ddc8fab40b9ae309052d37179a705430fc15db97 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 21 Dec 2024 01:40:19 +0000 Subject: [PATCH 25/63] RDMA/core: Remove unused ib_find_exact_cached_pkey The last use of ib_find_exact_cached_pkey() was removed in 2012 by commit 2c75d2ccb6e5 ("IB/mlx4: Fix QP1 P_Key processing in the Primary Physical Function (PPF)") Remove it. Signed-off-by: Dr. 
David Alan Gilbert Link: https://patch.msgid.link/20241221014021.343979-3-linux@treblig.org Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/cache.c | 35 --------------------------------- include/rdma/ib_cache.h | 16 --------------- 2 files changed, 51 deletions(-) diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index b7c078b7f7cf..f8413f8a9f26 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -1127,41 +1127,6 @@ err: } EXPORT_SYMBOL(ib_find_cached_pkey); -int ib_find_exact_cached_pkey(struct ib_device *device, u32 port_num, - u16 pkey, u16 *index) -{ - struct ib_pkey_cache *cache; - unsigned long flags; - int i; - int ret = -ENOENT; - - if (!rdma_is_port_valid(device, port_num)) - return -EINVAL; - - read_lock_irqsave(&device->cache_lock, flags); - - cache = device->port_data[port_num].cache.pkey; - if (!cache) { - ret = -EINVAL; - goto err; - } - - *index = -1; - - for (i = 0; i < cache->table_len; ++i) - if (cache->table[i] == pkey) { - *index = i; - ret = 0; - break; - } - -err: - read_unlock_irqrestore(&device->cache_lock, flags); - - return ret; -} -EXPORT_SYMBOL(ib_find_exact_cached_pkey); - int ib_get_cached_lmc(struct ib_device *device, u32 port_num, u8 *lmc) { unsigned long flags; diff --git a/include/rdma/ib_cache.h b/include/rdma/ib_cache.h index 226ae3702d8a..2bf09b594d10 100644 --- a/include/rdma/ib_cache.h +++ b/include/rdma/ib_cache.h @@ -63,22 +63,6 @@ int ib_find_cached_pkey(struct ib_device *device, u16 pkey, u16 *index); -/** - * ib_find_exact_cached_pkey - Returns the PKey table index where a specified - * PKey value occurs. Comparison uses the FULL 16 bits (incl membership bit) - * @device: The device to query. - * @port_num: The port number of the device to search for the PKey. - * @pkey: The PKey value to search for. - * @index: The index into the cached PKey table where the PKey was found. - * - * ib_find_exact_cached_pkey() searches the specified PKey table in - * the local software cache. - */ -int ib_find_exact_cached_pkey(struct ib_device *device, - u32 port_num, - u16 pkey, - u16 *index); - /** * ib_get_cached_lmc - Returns a cached lmc table entry * @device: The device to query. From 750efbb9c307f7d9ff43d38f58d3fca087dc041f Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 21 Dec 2024 01:40:20 +0000 Subject: [PATCH 26/63] RDMA/core: Remove unused ibdev_printk The last use of ibdev_printk() was removed in 2019 by commit b2299e83815c ("RDMA: Delete DEBUG code") Remove it. Note: The __ibdev_printk() is still used in the idev_err etc functions so leave that. Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20241221014021.343979-4-linux@treblig.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 17 ----------------- include/rdma/ib_verbs.h | 3 --- 2 files changed, 20 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index ca9b956c034d..a74e192b5588 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -209,23 +209,6 @@ static void __ibdev_printk(const char *level, const struct ib_device *ibdev, printk("%s(NULL ib_device): %pV", level, vaf); } -void ibdev_printk(const char *level, const struct ib_device *ibdev, - const char *format, ...) 
-{ - struct va_format vaf; - va_list args; - - va_start(args, format); - - vaf.fmt = format; - vaf.va = &args; - - __ibdev_printk(level, ibdev, &vaf); - - va_end(args); -} -EXPORT_SYMBOL(ibdev_printk); - #define define_ibdev_printk_level(func, level) \ void func(const struct ib_device *ibdev, const char *fmt, ...) \ { \ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 6ddd5e3bb884..77472e19ec0c 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -59,9 +59,6 @@ extern struct workqueue_struct *ib_comp_unbound_wq; struct ib_ucq_object; -__printf(3, 4) __cold -void ibdev_printk(const char *level, const struct ib_device *ibdev, - const char *format, ...); __printf(2, 3) __cold void ibdev_emerg(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold From 2028c2958775c4861756ace010b433cc1c81f516 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 21 Dec 2024 01:40:21 +0000 Subject: [PATCH 27/63] RDMA/core: Remove unused ib_copy_path_rec_from_user ib_copy_path_rec_from_user() has been unused since 2019's commit a1a8e4a85cf7 ("rdma: Delete the ib_ucm module") Remove it. Signed-off-by: Dr. David Alan Gilbert Link: https://patch.msgid.link/20241221014021.343979-5-linux@treblig.org Reviewed-by: Kalesh AP Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/uverbs_marshall.c | 42 ----------------------- include/rdma/ib_marshall.h | 3 -- 2 files changed, 45 deletions(-) diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c index 11a080646916..e803f609ec87 100644 --- a/drivers/infiniband/core/uverbs_marshall.c +++ b/drivers/infiniband/core/uverbs_marshall.c @@ -171,45 +171,3 @@ void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, __ib_copy_path_rec_to_user(dst, src); } EXPORT_SYMBOL(ib_copy_path_rec_to_user); - -void ib_copy_path_rec_from_user(struct sa_path_rec *dst, - struct ib_user_path_rec *src) -{ - u32 slid, dlid; - - memset(dst, 0, sizeof(*dst)); - if ((ib_is_opa_gid((union ib_gid *)src->sgid)) || - (ib_is_opa_gid((union ib_gid *)src->dgid))) { - dst->rec_type = SA_PATH_REC_TYPE_OPA; - slid = opa_get_lid_from_gid((union ib_gid *)src->sgid); - dlid = opa_get_lid_from_gid((union ib_gid *)src->dgid); - } else { - dst->rec_type = SA_PATH_REC_TYPE_IB; - slid = ntohs(src->slid); - dlid = ntohs(src->dlid); - } - memcpy(dst->dgid.raw, src->dgid, sizeof dst->dgid); - memcpy(dst->sgid.raw, src->sgid, sizeof dst->sgid); - - sa_path_set_dlid(dst, dlid); - sa_path_set_slid(dst, slid); - sa_path_set_raw_traffic(dst, src->raw_traffic); - dst->flow_label = src->flow_label; - dst->hop_limit = src->hop_limit; - dst->traffic_class = src->traffic_class; - dst->reversible = src->reversible; - dst->numb_path = src->numb_path; - dst->pkey = src->pkey; - dst->sl = src->sl; - dst->mtu_selector = src->mtu_selector; - dst->mtu = src->mtu; - dst->rate_selector = src->rate_selector; - dst->rate = src->rate; - dst->packet_life_time = src->packet_life_time; - dst->preference = src->preference; - dst->packet_life_time_selector = src->packet_life_time_selector; - - /* TODO: No need to set this */ - sa_path_set_dmac_zero(dst); -} -EXPORT_SYMBOL(ib_copy_path_rec_from_user); diff --git a/include/rdma/ib_marshall.h b/include/rdma/ib_marshall.h index 1838869aad28..b179e464e3d1 100644 --- a/include/rdma/ib_marshall.h +++ b/include/rdma/ib_marshall.h @@ -22,7 +22,4 @@ void ib_copy_ah_attr_to_user(struct ib_device *device, void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, struct sa_path_rec 
*src); -void ib_copy_path_rec_from_user(struct sa_path_rec *dst, - struct ib_user_path_rec *src); - #endif /* IB_USER_MARSHALL_H */ From 0c039a57b68dfb1dd49dfc16240791086d8e57ad Mon Sep 17 00:00:00 2001 From: Yuyu Li Date: Fri, 22 Nov 2024 18:52:57 +0800 Subject: [PATCH 28/63] RDMA/core: Add ib_query_netdev_port() to query netdev port by IB device. Query the port number of a netdev associated with an ibdev. Signed-off-by: Yuyu Li Signed-off-by: Junxian Huang Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 39 ++++++++++++++++++++++++++------ include/rdma/ib_verbs.h | 2 ++ 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index a74e192b5588..c2f048336c91 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2278,6 +2278,33 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, } EXPORT_SYMBOL(ib_device_get_netdev); +/** + * ib_query_netdev_port - Query the port number of a net_device + * associated with an ibdev + * @ibdev: IB device + * @ndev: Network device + * @port: IB port the net_device is connected to + */ +int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev, + u32 *port) +{ + struct net_device *ib_ndev; + u32 port_num; + + rdma_for_each_port(ibdev, port_num) { + ib_ndev = ib_device_get_netdev(ibdev, port_num); + if (ndev == ib_ndev) { + *port = port_num; + dev_put(ib_ndev); + return 0; + } + dev_put(ib_ndev); + } + + return -ENOENT; +} +EXPORT_SYMBOL(ib_query_netdev_port); + /** * ib_device_get_by_netdev - Find an IB device associated with a netdev * @ndev: netdev to locate @@ -2841,7 +2868,6 @@ static int ib_netdevice_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *ndev = netdev_notifier_info_to_dev(ptr); - struct net_device *ib_ndev; struct ib_device *ibdev; u32 port; @@ -2851,13 +2877,12 @@ static int ib_netdevice_event(struct notifier_block *this, if (!ibdev) return NOTIFY_DONE; - rdma_for_each_port(ibdev, port) { - ib_ndev = ib_device_get_netdev(ibdev, port); - if (ndev == ib_ndev) - rdma_nl_notify_event(ibdev, port, - RDMA_NETDEV_RENAME_EVENT); - dev_put(ib_ndev); + if (ib_query_netdev_port(ibdev, ndev, &port)) { + ib_device_put(ibdev); + break; } + + rdma_nl_notify_event(ibdev, port, RDMA_NETDEV_RENAME_EVENT); ib_device_put(ibdev); break; default: diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 77472e19ec0c..c539a1706f66 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4468,6 +4468,8 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, unsigned int port); struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, u32 port); +int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev, + u32 *port); struct ib_wq *ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr); int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata); From 1fb0644c3899b2f857b11037b19ed362b67bfe91 Mon Sep 17 00:00:00 2001 From: Yuyu Li Date: Fri, 22 Nov 2024 18:52:58 +0800 Subject: [PATCH 29/63] RDMA/core: Support link status events dispatching Currently the dispatching of link status events is implemented by each RDMA driver independently, and most of them have very similar patterns. Add support for this in ib_core so that we can get rid of duplicate codes in each driver. A new last_port_state is added in ib_port_cache to cache the port state of the last link status events dispatching. 
The original port_state in ib_port_cache is not used here because it will be updated when ib_dispatch_event() is called, which means it may be changed between two link status events, and may lead to a loss of event dispatching. Some drivers currently have some private stuff in their link status events handler in addition to event dispatching, and cannot be perfectly integrated into the ib_core handling process. For these drivers, add a new ops report_port_event() so that they can keep their current processing. Finally, events of LAG devices are not supported yet in this patch as currently there is no way to obtain ibdev from upper netdev in ib_core. This can be a TODO work after the core have more support for LAG. Signed-off-by: Yuyu Li Signed-off-by: Junxian Huang Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 60 ++++++++++++++++++++++++++++++++ include/rdma/ib_verbs.h | 17 +++++++++ 2 files changed, 77 insertions(+) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index c2f048336c91..0ded91f056f3 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2771,6 +2771,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, set_vf_guid); SET_DEVICE_OP(dev_ops, set_vf_link_state); SET_DEVICE_OP(dev_ops, ufile_hw_cleanup); + SET_DEVICE_OP(dev_ops, report_port_event); SET_OBJ_SIZE(dev_ops, ib_ah); SET_OBJ_SIZE(dev_ops, ib_counters); @@ -2864,6 +2865,58 @@ static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { }, }; +void ib_dispatch_port_state_event(struct ib_device *ibdev, struct net_device *ndev) +{ + enum ib_port_state curr_state; + struct ib_event ibevent = {}; + u32 port; + + if (ib_query_netdev_port(ibdev, ndev, &port)) + return; + + curr_state = ib_get_curr_port_state(ndev); + + write_lock_irq(&ibdev->cache_lock); + if (ibdev->port_data[port].cache.last_port_state == curr_state) { + write_unlock_irq(&ibdev->cache_lock); + return; + } + ibdev->port_data[port].cache.last_port_state = curr_state; + write_unlock_irq(&ibdev->cache_lock); + + ibevent.event = (curr_state == IB_PORT_DOWN) ? + IB_EVENT_PORT_ERR : IB_EVENT_PORT_ACTIVE; + ibevent.device = ibdev; + ibevent.element.port_num = port; + ib_dispatch_event(&ibevent); +} +EXPORT_SYMBOL(ib_dispatch_port_state_event); + +static void handle_port_event(struct net_device *ndev, unsigned long event) +{ + struct ib_device *ibdev; + + /* Currently, link events in bonding scenarios are still + * reported by drivers that support bonding. 
+ */ + if (netif_is_lag_master(ndev) || netif_is_lag_port(ndev)) + return; + + ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN); + if (!ibdev) + return; + + if (ibdev->ops.report_port_event) { + ibdev->ops.report_port_event(ibdev, ndev, event); + goto put_ibdev; + } + + ib_dispatch_port_state_event(ibdev, ndev); + +put_ibdev: + ib_device_put(ibdev); +}; + static int ib_netdevice_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -2885,6 +2938,13 @@ static int ib_netdevice_event(struct notifier_block *this, rdma_nl_notify_event(ibdev, port, RDMA_NETDEV_RENAME_EVENT); ib_device_put(ibdev); break; + + case NETDEV_UP: + case NETDEV_CHANGE: + case NETDEV_DOWN: + handle_port_event(ndev, event); + break; + default: break; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index c539a1706f66..0ad104dae253 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2174,6 +2174,7 @@ struct ib_port_cache { struct ib_gid_table *gid; u8 lmc; enum ib_port_state port_state; + enum ib_port_state last_port_state; }; struct ib_port_immutable { @@ -2680,6 +2681,13 @@ struct ib_device_ops { */ void (*ufile_hw_cleanup)(struct ib_uverbs_file *ufile); + /** + * report_port_event - Drivers need to implement this if they have + * some private stuff to handle when link status changes. + */ + void (*report_port_event)(struct ib_device *ibdev, + struct net_device *ndev, unsigned long event); + DECLARE_RDMA_OBJ_SIZE(ib_ah); DECLARE_RDMA_OBJ_SIZE(ib_counters); DECLARE_RDMA_OBJ_SIZE(ib_cq); @@ -4470,6 +4478,15 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, u32 port); int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev, u32 *port); + +static inline enum ib_port_state ib_get_curr_port_state(struct net_device *net_dev) +{ + return (netif_running(net_dev) && netif_carrier_ok(net_dev)) ? + IB_PORT_ACTIVE : IB_PORT_DOWN; +} + +void ib_dispatch_port_state_event(struct ib_device *ibdev, + struct net_device *ndev); struct ib_wq *ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *init_attr); int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata); From d3b15fcc4201f886c75fc55bb6bdd1056c7ea433 Mon Sep 17 00:00:00 2001 From: Yuyu Li Date: Fri, 22 Nov 2024 18:52:59 +0800 Subject: [PATCH 30/63] RDMA/bnxt_re: Remove deliver net device event Since the netdev events of link status is now handled in ib_core, remove the related code in drivers. 
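As a point of reference, consumers are unaffected by where the dispatching happens: a ULP keeps receiving IB_EVENT_PORT_ACTIVE and IB_EVENT_PORT_ERR through the normal ib_event path whether the driver or ib_core generated them. The sketch below is not part of this series and uses invented names (myulp_*); it only shows the consumer side of the events that ib_core now dispatches on the driver's behalf.

#include <rdma/ib_verbs.h>

/* Hypothetical ULP: log port state changes delivered via ib_dispatch_event(). */
static void myulp_event(struct ib_event_handler *handler,
                        struct ib_event *event)
{
        if (event->event == IB_EVENT_PORT_ACTIVE ||
            event->event == IB_EVENT_PORT_ERR)
                pr_info("%s port %u is %s\n",
                        dev_name(&event->device->dev),
                        event->element.port_num,
                        event->event == IB_EVENT_PORT_ACTIVE ? "up" : "down");
}

static struct ib_event_handler myulp_event_handler;

static void myulp_watch_device(struct ib_device *ibdev)
{
        INIT_IB_EVENT_HANDLER(&myulp_event_handler, ibdev, myulp_event);
        ib_register_event_handler(&myulp_event_handler);
}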
Signed-off-by: Yuyu Li Signed-off-by: Junxian Huang Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 71 ---------------------------- 1 file changed, 71 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 5e10b540329a..6d1800e285ef 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -79,8 +79,6 @@ MODULE_LICENSE("Dual BSD/GPL"); /* globals */ static DEFINE_MUTEX(bnxt_re_mutex); -static int bnxt_re_netdev_event(struct notifier_block *notifier, - unsigned long event, void *ptr); static int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev); static int bnxt_re_hwrm_qcfg(struct bnxt_re_dev *rdev, u32 *db_len, @@ -824,17 +822,6 @@ static void bnxt_re_disassociate_ucontext(struct ib_ucontext *ibcontext) } /* Device */ - -static struct bnxt_re_dev *bnxt_re_from_netdev(struct net_device *netdev) -{ - struct ib_device *ibdev = - ib_device_get_by_netdev(netdev, RDMA_DRIVER_BNXT_RE); - if (!ibdev) - return NULL; - - return container_of(ibdev, struct bnxt_re_dev, ibdev); -} - static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, char *buf) { @@ -2178,20 +2165,10 @@ static int bnxt_re_add_device(struct auxiliary_device *adev, u8 op_type) goto re_dev_uninit; } - rdev->nb.notifier_call = bnxt_re_netdev_event; - rc = register_netdevice_notifier(&rdev->nb); - if (rc) { - rdev->nb.notifier_call = NULL; - pr_err("%s: Cannot register to netdevice_notifier", - ROCE_DRV_MODULE_NAME); - goto re_dev_unreg; - } bnxt_re_setup_cc(rdev, true); return 0; -re_dev_unreg: - ib_unregister_device(&rdev->ibdev); re_dev_uninit: bnxt_re_update_en_info_rdev(NULL, en_info, adev); bnxt_re_dev_uninit(rdev, BNXT_RE_COMPLETE_REMOVE); @@ -2201,54 +2178,6 @@ exit: return rc; } -/* - * "Notifier chain callback can be invoked for the same chain from - * different CPUs at the same time". - * - * For cases when the netdev is already present, our call to the - * register_netdevice_notifier() will actually get the rtnl_lock() - * before sending NETDEV_REGISTER and (if up) NETDEV_UP - * events. - * - * But for cases when the netdev is not already present, the notifier - * chain is subjected to be invoked from different CPUs simultaneously. - * - * This is protected by the netdev_mutex. - */ -static int bnxt_re_netdev_event(struct notifier_block *notifier, - unsigned long event, void *ptr) -{ - struct net_device *real_dev, *netdev = netdev_notifier_info_to_dev(ptr); - struct bnxt_re_dev *rdev; - - real_dev = rdma_vlan_dev_real_dev(netdev); - if (!real_dev) - real_dev = netdev; - - if (real_dev != netdev) - return NOTIFY_DONE; - - rdev = bnxt_re_from_netdev(real_dev); - if (!rdev) - return NOTIFY_DONE; - - switch (event) { - case NETDEV_UP: - case NETDEV_DOWN: - case NETDEV_CHANGE: - bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, - netif_carrier_ok(real_dev) ? - IB_EVENT_PORT_ACTIVE : - IB_EVENT_PORT_ERR); - break; - default: - break; - } - ib_device_put(&rdev->ibdev); - - return NOTIFY_DONE; -} - #define BNXT_ADEV_NAME "bnxt_en" static void bnxt_re_remove_device(struct bnxt_re_dev *rdev, u8 op_type, From 18eb2bf3df190f03f62e8ed45730229002cc341f Mon Sep 17 00:00:00 2001 From: Yuyu Li Date: Fri, 22 Nov 2024 18:53:00 +0800 Subject: [PATCH 31/63] RDMA/erdma: Remove deliver net device event Since the netdev events of link status is now handled in ib_core, remove the related code in drivers. 
In addition, remove dev->state as it is only used in erdma_query_port(), and it can be replaced by ib_get_curr_port_state(). Signed-off-by: Yuyu Li Signed-off-by: Junxian Huang Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma.h | 2 -- drivers/infiniband/hw/erdma/erdma_main.c | 8 -------- drivers/infiniband/hw/erdma/erdma_verbs.c | 8 ++------ 3 files changed, 2 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h index 4f840d8e3beb..2418ac687404 100644 --- a/drivers/infiniband/hw/erdma/erdma.h +++ b/drivers/infiniband/hw/erdma/erdma.h @@ -195,8 +195,6 @@ struct erdma_dev { u8 __iomem *func_bar; struct erdma_devattr attrs; - /* physical port state (only one port per device) */ - enum ib_port_state state; u32 mtu; /* cmdq and aeq use the same msix vector */ diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index 51cc8b17b9e9..eabf435c77a3 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -26,14 +26,6 @@ static int erdma_netdev_event(struct notifier_block *nb, unsigned long event, goto done; switch (event) { - case NETDEV_UP: - dev->state = IB_PORT_ACTIVE; - erdma_port_event(dev, IB_EVENT_PORT_ACTIVE); - break; - case NETDEV_DOWN: - dev->state = IB_PORT_DOWN; - erdma_port_event(dev, IB_EVENT_PORT_ERR); - break; case NETDEV_CHANGEMTU: if (dev->mtu != netdev->mtu) { erdma_set_mtu(dev, netdev->mtu); diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index e7967193ac82..45b377ac9e49 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -396,14 +396,10 @@ int erdma_query_port(struct ib_device *ibdev, u32 port, ib_get_eth_speed(ibdev, port, &attr->active_speed, &attr->active_width); attr->max_mtu = ib_mtu_int_to_enum(ndev->mtu); attr->active_mtu = ib_mtu_int_to_enum(ndev->mtu); - if (netif_running(ndev) && netif_carrier_ok(ndev)) - dev->state = IB_PORT_ACTIVE; - else - dev->state = IB_PORT_DOWN; - attr->state = dev->state; + attr->state = ib_get_curr_port_state(ndev); out: - if (dev->state == IB_PORT_ACTIVE) + if (attr->state == IB_PORT_ACTIVE) attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; else attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; From 4c354c02d5e38c5cd0edc7780645d0ab0b3bc455 Mon Sep 17 00:00:00 2001 From: Yuyu Li Date: Fri, 22 Nov 2024 18:53:01 +0800 Subject: [PATCH 32/63] RDMA/irdma: Remove deliver net device event Since the netdev events of link status is now handled in ib_core, remove the related code in drivers. Signed-off-by: Yuyu Li Signed-off-by: Junxian Huang Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/irdma/utils.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index 1ea29994ace3..0e594122baa7 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -320,9 +320,6 @@ int irdma_netdevice_event(struct notifier_block *notifier, unsigned long event, case NETDEV_DOWN: iwdev->iw_status = 0; fallthrough; - case NETDEV_UP: - irdma_port_ibevent(iwdev); - break; default: break; } From 958152336cfafdec1f42d3d44253805282dc1321 Mon Sep 17 00:00:00 2001 From: Yuyu Li Date: Fri, 22 Nov 2024 18:53:02 +0800 Subject: [PATCH 33/63] RDMA/rxe: Remove deliver net device event Since the netdev events of link status is now handled in ib_core, remove the related code in drivers. 
In addition, remove the setting of port->attr.state in rxe_port_up() and rxe_port_down(), as it is only used in rxe_query_port(), and it can be replaced by ib_get_curr_port_state(). Signed-off-by: Yuyu Li Signed-off-by: Junxian Huang Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_net.c | 22 ++++------------------ drivers/infiniband/sw/rxe/rxe_verbs.c | 1 + 2 files changed, 5 insertions(+), 18 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 75d1407db52d..d400aaab0e70 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -563,11 +563,6 @@ static void rxe_port_event(struct rxe_dev *rxe, /* Caller must hold net_info_lock */ void rxe_port_up(struct rxe_dev *rxe) { - struct rxe_port *port; - - port = &rxe->port; - port->attr.state = IB_PORT_ACTIVE; - rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE); dev_info(&rxe->ib_dev.dev, "set active\n"); } @@ -575,11 +570,6 @@ void rxe_port_up(struct rxe_dev *rxe) /* Caller must hold net_info_lock */ void rxe_port_down(struct rxe_dev *rxe) { - struct rxe_port *port; - - port = &rxe->port; - port->attr.state = IB_PORT_DOWN; - rxe_port_event(rxe, IB_EVENT_PORT_ERR); rxe_counter_inc(rxe, RXE_CNT_LINK_DOWNED); dev_info(&rxe->ib_dev.dev, "set down\n"); @@ -587,7 +577,7 @@ void rxe_port_down(struct rxe_dev *rxe) void rxe_set_port_state(struct rxe_dev *rxe) { - if (netif_running(rxe->ndev) && netif_carrier_ok(rxe->ndev)) + if (ib_get_curr_port_state(rxe->ndev) == IB_PORT_ACTIVE) rxe_port_up(rxe); else rxe_port_down(rxe); @@ -607,18 +597,14 @@ static int rxe_notify(struct notifier_block *not_blk, case NETDEV_UNREGISTER: ib_unregister_device_queued(&rxe->ib_dev); break; - case NETDEV_UP: - rxe_port_up(rxe); - break; - case NETDEV_DOWN: - rxe_port_down(rxe); - break; case NETDEV_CHANGEMTU: rxe_dbg_dev(rxe, "%s changed mtu to %d\n", ndev->name, ndev->mtu); rxe_set_mtu(rxe, ndev->mtu); break; + case NETDEV_DOWN: case NETDEV_CHANGE: - rxe_set_port_state(rxe); + if (ib_get_curr_port_state(ndev) == IB_PORT_DOWN) + rxe_counter_inc(rxe, RXE_CNT_LINK_DOWNED); break; case NETDEV_REBOOT: case NETDEV_GOING_DOWN: diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index ffd5b07ad3e6..dad3cacb9048 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -55,6 +55,7 @@ static int rxe_query_port(struct ib_device *ibdev, ret = ib_get_eth_speed(ibdev, port_num, &attr->active_speed, &attr->active_width); + attr->state = ib_get_curr_port_state(rxe->ndev); if (attr->state == IB_PORT_ACTIVE) attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; else if (dev_get_flags(rxe->ndev) & IFF_UP) From 62f5d59d1a3d17918ee7e9da9cb4ccb9443e0480 Mon Sep 17 00:00:00 2001 From: Yuyu Li Date: Fri, 22 Nov 2024 18:53:03 +0800 Subject: [PATCH 34/63] RDMA/siw: Remove deliver net device event Since the netdev events of link status is now handled in ib_core, remove the related code in drivers. In addition, remove sdev->state as it is only used in siw_query_port(), and it can be replaced by ib_get_curr_port_state(). 
Signed-off-by: Yuyu Li Signed-off-by: Junxian Huang Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/siw/siw.h | 3 --- drivers/infiniband/sw/siw/siw_main.c | 16 ---------------- drivers/infiniband/sw/siw/siw_verbs.c | 4 ++-- 3 files changed, 2 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h index 86d4d6a2170e..f5dc4b3e0e60 100644 --- a/drivers/infiniband/sw/siw/siw.h +++ b/drivers/infiniband/sw/siw/siw.h @@ -76,9 +76,6 @@ struct siw_device { int numa_node; char raw_gid[ETH_ALEN]; - /* physical port state (only one port per device) */ - enum ib_port_state state; - spinlock_t lock; struct xarray qp_xa; diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c index 17abef48abcd..a9dc20f241ec 100644 --- a/drivers/infiniband/sw/siw/siw_main.c +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -380,16 +380,6 @@ static int siw_netdev_event(struct notifier_block *nb, unsigned long event, sdev = to_siw_dev(base_dev); switch (event) { - case NETDEV_UP: - sdev->state = IB_PORT_ACTIVE; - siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE); - break; - - case NETDEV_DOWN: - sdev->state = IB_PORT_DOWN; - siw_port_event(sdev, 1, IB_EVENT_PORT_ERR); - break; - case NETDEV_REGISTER: /* * Device registration now handled only by @@ -410,7 +400,6 @@ static int siw_netdev_event(struct notifier_block *nb, unsigned long event, * Todo: Below netdev events are currently not handled. */ case NETDEV_CHANGEMTU: - case NETDEV_CHANGE: break; default: @@ -443,11 +432,6 @@ static int siw_newlink(const char *basedev_name, struct net_device *netdev) if (sdev) { dev_dbg(&netdev->dev, "siw: new device\n"); - if (netif_running(netdev) && netif_carrier_ok(netdev)) - sdev->state = IB_PORT_ACTIVE; - else - sdev->state = IB_PORT_DOWN; - ib_mark_name_assigned_by_user(&sdev->base_dev); rv = siw_device_register(sdev, basedev_name); if (rv) diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index 986666c19378..592a015cc4c6 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -182,10 +182,10 @@ int siw_query_port(struct ib_device *base_dev, u32 port, attr->max_msg_sz = -1; attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); attr->active_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); - attr->phys_state = sdev->state == IB_PORT_ACTIVE ? + attr->state = ib_get_curr_port_state(sdev->netdev); + attr->phys_state = attr->state == IB_PORT_ACTIVE ? IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED; attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP; - attr->state = sdev->state; /* * All zero * From 2298c30c6885a017b0a979abb5376e98e5e4c352 Mon Sep 17 00:00:00 2001 From: Yuyu Li Date: Fri, 22 Nov 2024 18:53:04 +0800 Subject: [PATCH 35/63] RDMA/usnic: Support report_port_event() ops In addition to dispatching event, some private stuffs need to be done in this driver's link status event handler. Implement the new report_port_event() ops with the link status event codes. 
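Drivers that still need per-event bookkeeping all follow the same pattern: do the private work in report_port_event() and reuse the common dispatch path where that is all that remains. The following is a minimal sketch with invented names (foo_dev, foo_sync_link_state), not code from this series; the actual usnic implementation below dispatches its events directly because it also updates its forwarding state.

#include <rdma/ib_verbs.h>
#include <linux/netdevice.h>

struct foo_dev {
        struct ib_device ibdev;
        bool link_up;
};

/* Hypothetical driver-private bookkeeping done on link transitions. */
static void foo_sync_link_state(struct foo_dev *fdev, struct net_device *ndev)
{
        fdev->link_up = netif_carrier_ok(ndev);
}

static void foo_report_port_event(struct ib_device *ibdev,
                                  struct net_device *ndev,
                                  unsigned long event)
{
        struct foo_dev *fdev = container_of(ibdev, struct foo_dev, ibdev);

        if (event == NETDEV_UP || event == NETDEV_DOWN ||
            event == NETDEV_CHANGE)
                foo_sync_link_state(fdev, ndev);

        /* Reuse the common dispatch helper added earlier in this series. */
        ib_dispatch_port_state_event(ibdev, ndev);
}

static const struct ib_device_ops foo_dev_ops = {
        /* ... other ops ... */
        .report_port_event = foo_report_port_event,
};

A driver with nothing private to do simply leaves this op unset, as the earlier patches in this series do for bnxt_re, erdma, irdma, rxe and siw.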
Signed-off-by: Yuyu Li Signed-off-by: Junxian Huang Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/usnic/usnic_ib_main.c | 73 +++++++++++++-------- 1 file changed, 45 insertions(+), 28 deletions(-) diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index 13b654ddd3cc..4ddcd5860e0f 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -151,34 +151,6 @@ static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev, ib_event.element.port_num = 1; ib_dispatch_event(&ib_event); break; - case NETDEV_UP: - case NETDEV_DOWN: - case NETDEV_CHANGE: - if (!us_ibdev->ufdev->link_up && - netif_carrier_ok(netdev)) { - usnic_fwd_carrier_up(us_ibdev->ufdev); - usnic_info("Link UP on %s\n", - dev_name(&us_ibdev->ib_dev.dev)); - ib_event.event = IB_EVENT_PORT_ACTIVE; - ib_event.device = &us_ibdev->ib_dev; - ib_event.element.port_num = 1; - ib_dispatch_event(&ib_event); - } else if (us_ibdev->ufdev->link_up && - !netif_carrier_ok(netdev)) { - usnic_fwd_carrier_down(us_ibdev->ufdev); - usnic_info("Link DOWN on %s\n", - dev_name(&us_ibdev->ib_dev.dev)); - usnic_ib_qp_grp_modify_active_to_err(us_ibdev); - ib_event.event = IB_EVENT_PORT_ERR; - ib_event.device = &us_ibdev->ib_dev; - ib_event.element.port_num = 1; - ib_dispatch_event(&ib_event); - } else { - usnic_dbg("Ignoring %s on %s\n", - netdev_cmd_to_name(event), - dev_name(&us_ibdev->ib_dev.dev)); - } - break; case NETDEV_CHANGEADDR: if (!memcmp(us_ibdev->ufdev->mac, netdev->dev_addr, sizeof(us_ibdev->ufdev->mac))) { @@ -218,6 +190,50 @@ static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev, mutex_unlock(&us_ibdev->usdev_lock); } +static void usnic_ib_handle_port_event(struct ib_device *ibdev, + struct net_device *netdev, + unsigned long event) +{ + struct usnic_ib_dev *us_ibdev = + container_of(ibdev, struct usnic_ib_dev, ib_dev); + struct ib_event ib_event; + + mutex_lock(&us_ibdev->usdev_lock); + switch (event) { + case NETDEV_UP: + case NETDEV_DOWN: + case NETDEV_CHANGE: + if (!us_ibdev->ufdev->link_up && + netif_carrier_ok(netdev)) { + usnic_fwd_carrier_up(us_ibdev->ufdev); + usnic_info("Link UP on %s\n", + dev_name(&us_ibdev->ib_dev.dev)); + ib_event.event = IB_EVENT_PORT_ACTIVE; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + } else if (us_ibdev->ufdev->link_up && + !netif_carrier_ok(netdev)) { + usnic_fwd_carrier_down(us_ibdev->ufdev); + usnic_info("Link DOWN on %s\n", + dev_name(&us_ibdev->ib_dev.dev)); + usnic_ib_qp_grp_modify_active_to_err(us_ibdev); + ib_event.event = IB_EVENT_PORT_ERR; + ib_event.device = &us_ibdev->ib_dev; + ib_event.element.port_num = 1; + ib_dispatch_event(&ib_event); + } else { + usnic_dbg("Ignoring %s on %s\n", + netdev_cmd_to_name(event), + dev_name(&us_ibdev->ib_dev.dev)); + } + break; + default: + break; + } + mutex_unlock(&us_ibdev->usdev_lock); +} + static int usnic_ib_netdevice_event(struct notifier_block *notifier, unsigned long event, void *ptr) { @@ -358,6 +374,7 @@ static const struct ib_device_ops usnic_dev_ops = { .query_port = usnic_ib_query_port, .query_qp = usnic_ib_query_qp, .reg_user_mr = usnic_ib_reg_mr, + .report_port_event = usnic_ib_handle_port_event, INIT_RDMA_OBJ_SIZE(ib_pd, usnic_ib_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_cq, usnic_ib_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_qp, usnic_ib_qp_grp, ibqp), From e4b4ef34578844d235b2fdc50210de80b5a83789 Mon Sep 17 00:00:00 2001 From: Yuyu Li Date: Fri, 22 Nov 2024 18:53:05 
+0800 Subject: [PATCH 36/63] RDMA/mlx4: Support report_port_event() ops In addition to dispatching event, some private stuffs need to be done in this driver's link status event handler. Implement the new report_port_event() ops with the link status event codes. Signed-off-by: Yuyu Li Signed-off-by: Junxian Huang Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx4/main.c | 58 ++++++++++++++++--------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index b1bbdcff631d..dd35e03402ab 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2341,39 +2341,40 @@ static void mlx4_ib_scan_netdev(struct mlx4_ib_dev *ibdev, iboe->netdevs[dev->dev_port] = event != NETDEV_UNREGISTER ? dev : NULL; - if (event == NETDEV_UP || event == NETDEV_DOWN) { - enum ib_port_state port_state; - struct ib_event ibev = { }; - - if (ib_get_cached_port_state(&ibdev->ib_dev, dev->dev_port + 1, - &port_state)) - goto iboe_out; - - if (event == NETDEV_UP && - (port_state != IB_PORT_ACTIVE || - iboe->last_port_state[dev->dev_port] != IB_PORT_DOWN)) - goto iboe_out; - if (event == NETDEV_DOWN && - (port_state != IB_PORT_DOWN || - iboe->last_port_state[dev->dev_port] != IB_PORT_ACTIVE)) - goto iboe_out; - iboe->last_port_state[dev->dev_port] = port_state; - - ibev.device = &ibdev->ib_dev; - ibev.element.port_num = dev->dev_port + 1; - ibev.event = event == NETDEV_UP ? IB_EVENT_PORT_ACTIVE : - IB_EVENT_PORT_ERR; - ib_dispatch_event(&ibev); - } - -iboe_out: spin_unlock_bh(&iboe->lock); - if (event == NETDEV_CHANGEADDR || event == NETDEV_REGISTER || - event == NETDEV_UP || event == NETDEV_CHANGE) + if (event == NETDEV_CHANGEADDR || event == NETDEV_REGISTER) mlx4_ib_update_qps(ibdev, dev, dev->dev_port + 1); } +static void mlx4_ib_port_event(struct ib_device *ibdev, struct net_device *ndev, + unsigned long event) +{ + struct mlx4_ib_dev *mlx4_ibdev = + container_of(ibdev, struct mlx4_ib_dev, ib_dev); + struct mlx4_ib_iboe *iboe = &mlx4_ibdev->iboe; + + if (!net_eq(dev_net(ndev), &init_net)) + return; + + ASSERT_RTNL(); + + if (ndev->dev.parent != mlx4_ibdev->ib_dev.dev.parent) + return; + + spin_lock_bh(&iboe->lock); + + iboe->netdevs[ndev->dev_port] = event != NETDEV_UNREGISTER ? ndev : NULL; + + if (event == NETDEV_UP || event == NETDEV_DOWN) + ib_dispatch_port_state_event(&mlx4_ibdev->ib_dev, ndev); + + spin_unlock_bh(&iboe->lock); + + if (event == NETDEV_UP || event == NETDEV_CHANGE) + mlx4_ib_update_qps(mlx4_ibdev, ndev, ndev->dev_port + 1); +} + static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -2569,6 +2570,7 @@ static const struct ib_device_ops mlx4_ib_dev_ops = { .req_notify_cq = mlx4_ib_arm_cq, .rereg_user_mr = mlx4_ib_rereg_user_mr, .resize_cq = mlx4_ib_resize_cq, + .report_port_event = mlx4_ib_port_event, INIT_RDMA_OBJ_SIZE(ib_ah, mlx4_ib_ah, ibah), INIT_RDMA_OBJ_SIZE(ib_cq, mlx4_ib_cq, ibcq), From e89fd16278fda644fc7e3a6870d814d294be05b9 Mon Sep 17 00:00:00 2001 From: Yuyu Li Date: Fri, 22 Nov 2024 18:53:06 +0800 Subject: [PATCH 37/63] RDMA/pvrdma: Support report_port_event() ops In addition to dispatching event, some private stuffs need to be done in this driver's link status event handler. Implement the new report_port_event() ops with the link status event codes. 
Signed-off-by: Yuyu Li Signed-off-by: Junxian Huang Signed-off-by: Leon Romanovsky --- .../infiniband/hw/vmw_pvrdma/pvrdma_main.c | 66 ++++++++++++------- 1 file changed, 41 insertions(+), 25 deletions(-) diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index 768aad364c89..1664d1d7d969 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -143,6 +143,46 @@ static int pvrdma_port_immutable(struct ib_device *ibdev, u32 port_num, return 0; } +static void pvrdma_dispatch_event(struct pvrdma_dev *dev, int port, + enum ib_event_type event) +{ + struct ib_event ib_event; + + memset(&ib_event, 0, sizeof(ib_event)); + ib_event.device = &dev->ib_dev; + ib_event.element.port_num = port; + ib_event.event = event; + ib_dispatch_event(&ib_event); +} + +static void pvrdma_report_event_handle(struct ib_device *ibdev, + struct net_device *ndev, + unsigned long event) +{ + struct pvrdma_dev *dev = container_of(ibdev, struct pvrdma_dev, ib_dev); + + switch (event) { + case NETDEV_DOWN: + pvrdma_dispatch_event(dev, 1, IB_EVENT_PORT_ERR); + break; + case NETDEV_UP: + pvrdma_write_reg(dev, PVRDMA_REG_CTL, + PVRDMA_DEVICE_CTL_UNQUIESCE); + + mb(); + + if (pvrdma_read_reg(dev, PVRDMA_REG_ERR)) + dev_err(&dev->pdev->dev, + "failed to activate device during link up\n"); + else + pvrdma_dispatch_event(dev, 1, IB_EVENT_PORT_ACTIVE); + break; + + default: + break; + } +} + static const struct ib_device_ops pvrdma_dev_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_VMW_PVRDMA, @@ -181,6 +221,7 @@ static const struct ib_device_ops pvrdma_dev_ops = { .query_qp = pvrdma_query_qp, .reg_user_mr = pvrdma_reg_user_mr, .req_notify_cq = pvrdma_req_notify_cq, + .report_port_event = pvrdma_report_event_handle, INIT_RDMA_OBJ_SIZE(ib_ah, pvrdma_ah, ibah), INIT_RDMA_OBJ_SIZE(ib_cq, pvrdma_cq, ibcq), @@ -362,18 +403,6 @@ static void pvrdma_srq_event(struct pvrdma_dev *dev, u32 srqn, int type) } } -static void pvrdma_dispatch_event(struct pvrdma_dev *dev, int port, - enum ib_event_type event) -{ - struct ib_event ib_event; - - memset(&ib_event, 0, sizeof(ib_event)); - ib_event.device = &dev->ib_dev; - ib_event.element.port_num = port; - ib_event.event = event; - ib_dispatch_event(&ib_event); -} - static void pvrdma_dev_event(struct pvrdma_dev *dev, u8 port, int type) { if (port < 1 || port > dev->dsr->caps.phys_port_cnt) { @@ -666,21 +695,8 @@ static void pvrdma_netdevice_event_handle(struct pvrdma_dev *dev, switch (event) { case NETDEV_REBOOT: - case NETDEV_DOWN: pvrdma_dispatch_event(dev, 1, IB_EVENT_PORT_ERR); break; - case NETDEV_UP: - pvrdma_write_reg(dev, PVRDMA_REG_CTL, - PVRDMA_DEVICE_CTL_UNQUIESCE); - - mb(); - - if (pvrdma_read_reg(dev, PVRDMA_REG_ERR)) - dev_err(&dev->pdev->dev, - "failed to activate device during link up\n"); - else - pvrdma_dispatch_event(dev, 1, IB_EVENT_PORT_ACTIVE); - break; case NETDEV_UNREGISTER: ib_device_set_netdev(&dev->ib_dev, NULL, 1); dev_put(dev->netdev); From 379013776222e296645f39a689c57edc5f13b5be Mon Sep 17 00:00:00 2001 From: Yuyu Li Date: Fri, 22 Nov 2024 18:53:07 +0800 Subject: [PATCH 38/63] RDMA/mlx5: Handle link status event only for LAG device The link status events of non-LAG devices are now handled in ib_core, so only LAG device events need to be handled in driver. 
Signed-off-by: Yuyu Li Signed-off-by: Junxian Huang Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index bc7930d0c564..e4010f871865 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -242,6 +242,9 @@ static int mlx5_netdev_event(struct notifier_block *this, case NETDEV_DOWN: { struct net_device *upper = NULL; + if (!netif_is_lag_master(ndev) && !netif_is_lag_port(ndev)) + return NOTIFY_DONE; + if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) { struct net_device *lag_ndev; From 20b6d8a7b9bdced0c5f9a4887dbf123dd8e334c0 Mon Sep 17 00:00:00 2001 From: Yuyu Li Date: Fri, 22 Nov 2024 18:53:08 +0800 Subject: [PATCH 39/63] RDMA/hns: Support fast path for link-down events dispatching hns3 NIC driver can directly notify the RoCE driver about link status events bypassing the netdev notifier. This can provide more timely event dispatching for ULPs. Signed-off-by: Yuyu Li Signed-off-by: Junxian Huang Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 697b17cca02e..5c911d1def03 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -7178,9 +7178,22 @@ static int hns_roce_hw_v2_reset_notify(struct hnae3_handle *handle, return ret; } +static void hns_roce_hw_v2_link_status_change(struct hnae3_handle *handle, + bool linkup) +{ + struct hns_roce_dev *hr_dev = (struct hns_roce_dev *)handle->priv; + struct net_device *netdev = handle->rinfo.netdev; + + if (linkup || !hr_dev) + return; + + ib_dispatch_port_state_event(&hr_dev->ib_dev, netdev); +} + static const struct hnae3_client_ops hns_roce_hw_v2_ops = { .init_instance = hns_roce_hw_v2_init_instance, .uninit_instance = hns_roce_hw_v2_uninit_instance, + .link_status_change = hns_roce_hw_v2_link_status_change, .reset_notify = hns_roce_hw_v2_reset_notify, }; From 123c13f10ed3627ba112172d8bd122a72cae226d Mon Sep 17 00:00:00 2001 From: Michael Margolin Date: Wed, 25 Dec 2024 13:15:48 +0000 Subject: [PATCH 40/63] RDMA/efa: Reset device on probe failure Make sure the device is being reset on driver exit whatever the reason is, to keep the device aligned and allow it to close shared resources (e.g. admin queue). 
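Condensed from the hunks below, the reset now lives in the shared efa_remove_device() path and the reason code records why the driver is exiting:

        /* probe error path */
        efa_remove_device(pdev, EFA_REGS_RESET_INIT_ERR);

        /* regular driver removal (efa_remove) */
        efa_remove_device(pdev, EFA_REGS_RESET_NORMAL);

Either way the device is reset before the admin queue is destroyed, so firmware can release the shared resources mentioned above on both paths.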
Reviewed-by: Firas Jahjah Reviewed-by: Yonatan Nachum Signed-off-by: Michael Margolin Link: https://patch.msgid.link/20241225131548.15155-1-mrgolin@amazon.com Reviewed-by: Gal Pressman Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/efa/efa_main.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c index ad225823e6f2..45a4564c670c 100644 --- a/drivers/infiniband/hw/efa/efa_main.c +++ b/drivers/infiniband/hw/efa/efa_main.c @@ -470,7 +470,6 @@ static void efa_ib_device_remove(struct efa_dev *dev) ibdev_info(&dev->ibdev, "Unregister ib device\n"); ib_unregister_device(&dev->ibdev); efa_destroy_eqs(dev); - efa_com_dev_reset(&dev->edev, EFA_REGS_RESET_NORMAL); efa_release_doorbell_bar(dev); } @@ -643,12 +642,14 @@ err_disable_device: return ERR_PTR(err); } -static void efa_remove_device(struct pci_dev *pdev) +static void efa_remove_device(struct pci_dev *pdev, + enum efa_regs_reset_reason_types reset_reason) { struct efa_dev *dev = pci_get_drvdata(pdev); struct efa_com_dev *edev; edev = &dev->edev; + efa_com_dev_reset(edev, reset_reason); efa_com_admin_destroy(edev); efa_free_irq(dev, &dev->admin_irq); efa_disable_msix(dev); @@ -676,7 +677,7 @@ static int efa_probe(struct pci_dev *pdev, const struct pci_device_id *ent) return 0; err_remove_device: - efa_remove_device(pdev); + efa_remove_device(pdev, EFA_REGS_RESET_INIT_ERR); return err; } @@ -685,7 +686,7 @@ static void efa_remove(struct pci_dev *pdev) struct efa_dev *dev = pci_get_drvdata(pdev); efa_ib_device_remove(dev); - efa_remove_device(pdev); + efa_remove_device(pdev, EFA_REGS_RESET_NORMAL); } static void efa_shutdown(struct pci_dev *pdev) From 67831baff0d7a7ae12bba80c721fffacfab82e89 Mon Sep 17 00:00:00 2001 From: Boshi Yu Date: Thu, 26 Dec 2024 16:41:08 +0800 Subject: [PATCH 41/63] RDMA/erdma: Add missing fields to the erdma_device_ops_rocev2 Set the query_ah field to the erdma_create_ah() function and set the size_ib_ah field to the size of struct erdma_ah. Reviewed-by: Cheng Xu Signed-off-by: Boshi Yu Link: https://patch.msgid.link/20241226084141.74823-2-boshiyu@linux.alibaba.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index eabf435c77a3..9f512b642884 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -478,6 +478,9 @@ static const struct ib_device_ops erdma_device_ops_rocev2 = { .query_pkey = erdma_query_pkey, .create_ah = erdma_create_ah, .destroy_ah = erdma_destroy_ah, + .query_ah = erdma_query_ah, + + INIT_RDMA_OBJ_SIZE(ib_ah, erdma_ah, ibah), }; static const struct ib_device_ops erdma_device_ops_iwarp = { From 3761e0ad79c137d61baf5f1518e4795f35fe8159 Mon Sep 17 00:00:00 2001 From: Boshi Yu Date: Thu, 26 Dec 2024 16:41:09 +0800 Subject: [PATCH 42/63] RDMA/erdma: Fix incorrect response returned from query_qp The erdma_post_cmd_wait() function returns the cmdq response only when both resp0 and resp1 are not NULL. 
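Since response data is only copied back when both output pointers are provided, the old call that passed a single &resp with NULL for the second pointer left the decoded fields unreliable. The corrected call pattern, matching the hunk below:

        u64 resp0, resp1;

        /* Both pointers must be non-NULL for the cmdq layer to copy the
         * response back; resp0 carries the fields decoded by query_qp. */
        ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &resp0, &resp1);
        if (ret)
                return ret;

        qp_attr->sq_psn = FIELD_GET(ERDMA_CMD_QUERY_QP_RESP_SQ_PSN_MASK, resp0);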
Reviewed-by: Cheng Xu Signed-off-by: Boshi Yu Link: https://patch.msgid.link/20241226084141.74823-3-boshiyu@linux.alibaba.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma_verbs.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index 45b377ac9e49..199c7a26cd9d 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -1795,7 +1795,7 @@ int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, struct erdma_cmdq_query_qp_req_rocev2 req; struct erdma_dev *dev; struct erdma_qp *qp; - u64 resp; + u64 resp0, resp1; int ret; if (ibqp && qp_attr && qp_init_attr) { @@ -1829,20 +1829,20 @@ int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, CMDQ_OPCODE_QUERY_QP); req.qpn = QP_ID(qp); - ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &resp, - NULL); + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &resp0, + &resp1); if (ret) return ret; qp_attr->sq_psn = - FIELD_GET(ERDMA_CMD_QUERY_QP_RESP_SQ_PSN_MASK, resp); + FIELD_GET(ERDMA_CMD_QUERY_QP_RESP_SQ_PSN_MASK, resp0); qp_attr->rq_psn = - FIELD_GET(ERDMA_CMD_QUERY_QP_RESP_RQ_PSN_MASK, resp); - qp_attr->qp_state = rocev2_to_ib_qps( - FIELD_GET(ERDMA_CMD_QUERY_QP_RESP_QP_STATE_MASK, resp)); + FIELD_GET(ERDMA_CMD_QUERY_QP_RESP_RQ_PSN_MASK, resp0); + qp_attr->qp_state = rocev2_to_ib_qps(FIELD_GET( + ERDMA_CMD_QUERY_QP_RESP_QP_STATE_MASK, resp0)); qp_attr->cur_qp_state = qp_attr->qp_state; qp_attr->sq_draining = FIELD_GET( - ERDMA_CMD_QUERY_QP_RESP_SQ_DRAINING_MASK, resp); + ERDMA_CMD_QUERY_QP_RESP_SQ_DRAINING_MASK, resp0); qp_attr->pkey_index = 0; qp_attr->dest_qp_num = qp->attrs.rocev2.dst_qpn; From 26981e688ca896e9310e1918d104a79cb140ce85 Mon Sep 17 00:00:00 2001 From: Boshi Yu Date: Thu, 26 Dec 2024 16:41:10 +0800 Subject: [PATCH 43/63] RDMA/erdma: Support non-sleeping erdma_post_cmd_wait() Several scenarios require posting commands to the cmdq in a non-sleepable context. For example, the cm_alloc_msg() might call erdma_create_ah() while still holding a spinlock. So we add support for non-sleeping erdma_post_cmd_wait(). 
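The effect on callers is one extra argument selecting how the completion is awaited: with sleepable == true the caller sleeps on the completion as before, while with sleepable == false the cmdq credit is taken with down_trylock() in a loop and the completion is busy-polled (udelay() instead of msleep()). Roughly:

        /* process context (most verbs): sleep until the cmdq completion arrives */
        ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, true);

        /* atomic context (e.g. AH creation reached from the CM layer under a
         * spinlock): poll for the completion instead of sleeping */
        ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, false);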
Reviewed-by: Cheng Xu Signed-off-by: Boshi Yu Link: https://patch.msgid.link/20241226084141.74823-4-boshiyu@linux.alibaba.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma.h | 4 +-- drivers/infiniband/hw/erdma/erdma_cmdq.c | 26 +++++++------- drivers/infiniband/hw/erdma/erdma_eq.c | 6 ++-- drivers/infiniband/hw/erdma/erdma_main.c | 7 ++-- drivers/infiniband/hw/erdma/erdma_qp.c | 8 +++-- drivers/infiniband/hw/erdma/erdma_verbs.c | 44 ++++++++++++++--------- 6 files changed, 54 insertions(+), 41 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h index 2418ac687404..2a023b99f992 100644 --- a/drivers/infiniband/hw/erdma/erdma.h +++ b/drivers/infiniband/hw/erdma/erdma.h @@ -101,8 +101,6 @@ struct erdma_cmdq { struct erdma_comp_wait *wait_pool; spinlock_t lock; - bool use_event; - struct erdma_cmdq_sq sq; struct erdma_cmdq_cq cq; struct erdma_eq eq; @@ -267,7 +265,7 @@ void erdma_cmdq_destroy(struct erdma_dev *dev); void erdma_cmdq_build_reqhdr(u64 *hdr, u32 mod, u32 op); int erdma_post_cmd_wait(struct erdma_cmdq *cmdq, void *req, u32 req_size, - u64 *resp0, u64 *resp1); + u64 *resp0, u64 *resp1, bool sleepable); void erdma_cmdq_completion_handler(struct erdma_cmdq *cmdq); int erdma_ceqs_init(struct erdma_dev *dev); diff --git a/drivers/infiniband/hw/erdma/erdma_cmdq.c b/drivers/infiniband/hw/erdma/erdma_cmdq.c index a3d8922d1ad1..b867aefe83b2 100644 --- a/drivers/infiniband/hw/erdma/erdma_cmdq.c +++ b/drivers/infiniband/hw/erdma/erdma_cmdq.c @@ -182,7 +182,6 @@ int erdma_cmdq_init(struct erdma_dev *dev) int err; cmdq->max_outstandings = ERDMA_CMDQ_MAX_OUTSTANDING; - cmdq->use_event = false; sema_init(&cmdq->credits, cmdq->max_outstandings); @@ -223,8 +222,6 @@ err_destroy_sq: void erdma_finish_cmdq_init(struct erdma_dev *dev) { - /* after device init successfully, change cmdq to event mode. 
*/ - dev->cmdq.use_event = true; arm_cmdq_cq(&dev->cmdq); } @@ -312,8 +309,7 @@ static int erdma_poll_single_cmd_completion(struct erdma_cmdq *cmdq) /* Copy 16B comp data after cqe hdr to outer */ be32_to_cpu_array(comp_wait->comp_data, cqe + 2, 4); - if (cmdq->use_event) - complete(&comp_wait->wait_event); + complete(&comp_wait->wait_event); return 0; } @@ -332,9 +328,6 @@ static void erdma_polling_cmd_completions(struct erdma_cmdq *cmdq) if (erdma_poll_single_cmd_completion(cmdq)) break; - if (comp_num && cmdq->use_event) - arm_cmdq_cq(cmdq); - spin_unlock_irqrestore(&cmdq->cq.lock, flags); } @@ -342,8 +335,7 @@ void erdma_cmdq_completion_handler(struct erdma_cmdq *cmdq) { int got_event = 0; - if (!test_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state) || - !cmdq->use_event) + if (!test_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state)) return; while (get_next_valid_eqe(&cmdq->eq)) { @@ -354,6 +346,7 @@ void erdma_cmdq_completion_handler(struct erdma_cmdq *cmdq) if (got_event) { cmdq->cq.cmdsn++; erdma_polling_cmd_completions(cmdq); + arm_cmdq_cq(cmdq); } notify_eq(&cmdq->eq); @@ -372,7 +365,7 @@ static int erdma_poll_cmd_completion(struct erdma_comp_wait *comp_ctx, if (time_is_before_jiffies(comp_timeout)) return -ETIME; - msleep(20); + udelay(20); } return 0; @@ -403,7 +396,7 @@ void erdma_cmdq_build_reqhdr(u64 *hdr, u32 mod, u32 op) } int erdma_post_cmd_wait(struct erdma_cmdq *cmdq, void *req, u32 req_size, - u64 *resp0, u64 *resp1) + u64 *resp0, u64 *resp1, bool sleepable) { struct erdma_comp_wait *comp_wait; int ret; @@ -411,7 +404,12 @@ int erdma_post_cmd_wait(struct erdma_cmdq *cmdq, void *req, u32 req_size, if (!test_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state)) return -ENODEV; - down(&cmdq->credits); + if (!sleepable) { + while (down_trylock(&cmdq->credits)) + ; + } else { + down(&cmdq->credits); + } comp_wait = get_comp_wait(cmdq); if (IS_ERR(comp_wait)) { @@ -425,7 +423,7 @@ int erdma_post_cmd_wait(struct erdma_cmdq *cmdq, void *req, u32 req_size, push_cmdq_sqe(cmdq, req, req_size, comp_wait); spin_unlock(&cmdq->sq.lock); - if (cmdq->use_event) + if (sleepable) ret = erdma_wait_cmd_completion(comp_wait, cmdq, ERDMA_CMDQ_TIMEOUT_MS); else diff --git a/drivers/infiniband/hw/erdma/erdma_eq.c b/drivers/infiniband/hw/erdma/erdma_eq.c index 9a72fec6d5cc..6486234a2360 100644 --- a/drivers/infiniband/hw/erdma/erdma_eq.c +++ b/drivers/infiniband/hw/erdma/erdma_eq.c @@ -236,7 +236,8 @@ static int create_eq_cmd(struct erdma_dev *dev, u32 eqn, struct erdma_eq *eq) req.db_dma_addr_l = lower_32_bits(eq->dbrec_dma); req.db_dma_addr_h = upper_32_bits(eq->dbrec_dma); - return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + false); } static int erdma_ceq_init_one(struct erdma_dev *dev, u16 ceqn) @@ -278,7 +279,8 @@ static void erdma_ceq_uninit_one(struct erdma_dev *dev, u16 ceqn) req.qtype = ERDMA_EQ_TYPE_CEQ; req.vector_idx = ceqn + 1; - err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + false); if (err) return; diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index 9f512b642884..f35b30235018 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -384,7 +384,7 @@ static int erdma_dev_attrs_init(struct erdma_dev *dev) CMDQ_OPCODE_QUERY_DEVICE); err = erdma_post_cmd_wait(&dev->cmdq, &req_hdr, sizeof(req_hdr), &cap0, - &cap1); + &cap1, 
true); if (err) return err; @@ -417,7 +417,7 @@ static int erdma_dev_attrs_init(struct erdma_dev *dev) CMDQ_OPCODE_QUERY_FW_INFO); err = erdma_post_cmd_wait(&dev->cmdq, &req_hdr, sizeof(req_hdr), &cap0, - &cap1); + &cap1, true); if (!err) dev->attrs.fw_version = FIELD_GET(ERDMA_CMD_INFO0_FW_VER_MASK, cap0); @@ -438,7 +438,8 @@ static int erdma_device_config(struct erdma_dev *dev) req.cfg = FIELD_PREP(ERDMA_CMD_CONFIG_DEVICE_PGSHIFT_MASK, PAGE_SHIFT) | FIELD_PREP(ERDMA_CMD_CONFIG_DEVICE_PS_EN_MASK, 1); - return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); } static int erdma_res_cb_init(struct erdma_dev *dev) diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c index 5c266918fb36..25f6c49aec77 100644 --- a/drivers/infiniband/hw/erdma/erdma_qp.c +++ b/drivers/infiniband/hw/erdma/erdma_qp.c @@ -98,7 +98,8 @@ erdma_modify_qp_state_to_rts(struct erdma_qp *qp, req.send_nxt += MPA_DEFAULT_HDR_LEN + params->pd_len; req.recv_nxt = tp->rcv_nxt; - ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); if (ret) return ret; @@ -131,7 +132,8 @@ erdma_modify_qp_state_to_stop(struct erdma_qp *qp, req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, params->state) | FIELD_PREP(ERDMA_CMD_MODIFY_QP_QPN_MASK, QP_ID(qp)); - ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); if (ret) return ret; @@ -246,7 +248,7 @@ static int modify_qp_cmd_rocev2(struct erdma_qp *qp, req.attr_mask = attr_mask; return erdma_post_cmd_wait(&qp->dev->cmdq, &req, sizeof(req), NULL, - NULL); + NULL, true); } static void erdma_reset_qp(struct erdma_qp *qp) diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index 199c7a26cd9d..0e8a13577fdc 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -126,8 +126,8 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp) } } - err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &resp0, - &resp1); + err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &resp0, &resp1, + true); if (!err && erdma_device_iwarp(dev)) qp->attrs.iwarp.cookie = FIELD_GET(ERDMA_CMDQ_CREATE_QP_RESP_COOKIE_MASK, resp0); @@ -185,7 +185,8 @@ static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr) } post_cmd: - return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); } static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq) @@ -247,7 +248,8 @@ static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq) } } - return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); } static int erdma_alloc_idx(struct erdma_resource_cb *res_cb) @@ -463,7 +465,8 @@ static void erdma_flush_worker(struct work_struct *work) req.qpn = QP_ID(qp); req.sq_pi = qp->kern_qp.sq_pi; req.rq_pi = qp->kern_qp.rq_pi; - erdma_post_cmd_wait(&qp->dev->cmdq, &req, sizeof(req), NULL, NULL); + erdma_post_cmd_wait(&qp->dev->cmdq, &req, sizeof(req), NULL, NULL, + true); } static int erdma_qp_validate_cap(struct erdma_dev *dev, @@ -1261,7 +1264,8 @@ int erdma_dereg_mr(struct ib_mr 
*ibmr, struct ib_udata *udata) req.cfg = FIELD_PREP(ERDMA_CMD_MR_MPT_IDX_MASK, ibmr->lkey >> 8) | FIELD_PREP(ERDMA_CMD_MR_KEY_MASK, ibmr->lkey & 0xFF); - ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); if (ret) return ret; @@ -1286,7 +1290,8 @@ int erdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) CMDQ_OPCODE_DESTROY_CQ); req.cqn = cq->cqn; - err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); if (err) return err; @@ -1333,7 +1338,8 @@ int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) CMDQ_OPCODE_DESTROY_QP); req.qpn = QP_ID(qp); - err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); if (err) return err; @@ -1431,7 +1437,8 @@ static int alloc_db_resources(struct erdma_dev *dev, struct erdma_ucontext *ctx, FIELD_PREP(ERDMA_CMD_EXT_DB_RQ_EN_MASK, 1) | FIELD_PREP(ERDMA_CMD_EXT_DB_SQ_EN_MASK, 1); - ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &val0, &val1); + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &val0, &val1, + true); if (ret) return ret; @@ -1466,7 +1473,8 @@ static void free_db_resources(struct erdma_dev *dev, struct erdma_ucontext *ctx) req.rdb_off = ctx->ext_db.rdb_off; req.cdb_off = ctx->ext_db.cdb_off; - ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); if (ret) ibdev_err_ratelimited(&dev->ibdev, "free db resources failed %d", ret); @@ -1830,7 +1838,7 @@ int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, req.qpn = QP_ID(qp); ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), &resp0, - &resp1); + &resp1, true); if (ret) return ret; @@ -1993,7 +2001,7 @@ void erdma_set_mtu(struct erdma_dev *dev, u32 mtu) CMDQ_OPCODE_CONF_MTU); req.mtu = mtu; - erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, true); } void erdma_port_event(struct erdma_dev *dev, enum ib_event_type reason) @@ -2063,7 +2071,8 @@ static int erdma_query_hw_stats(struct erdma_dev *dev, req.target_addr = dma_addr; req.target_length = ERDMA_HW_RESP_SIZE; - err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + err = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); if (err) goto out; @@ -2124,7 +2133,8 @@ static int erdma_set_gid(struct erdma_dev *dev, u8 op, u32 idx, erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, CMDQ_OPCODE_SET_GID); - return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); } int erdma_add_gid(const struct ib_gid_attr *attr, void **context) @@ -2208,7 +2218,8 @@ int erdma_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, req.ahn = ah->ahn; erdma_set_av_cfg(&req.av_cfg, &ah->av); - ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); if (ret) { erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_AH], ah->ahn); return ret; @@ -2231,7 +2242,8 @@ int erdma_destroy_ah(struct ib_ah *ibah, u32 flags) req.pdn = pd->pdn; req.ahn = ah->ahn; - ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL); + ret = 
erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, + true); if (ret) return ret; From a6c346760a52afaf7d75991c16ee4d70d6270d06 Mon Sep 17 00:00:00 2001 From: Boshi Yu Date: Thu, 26 Dec 2024 16:41:11 +0800 Subject: [PATCH 44/63] RDMA/erdma: Support create_ah/destroy_ah in non-sleepable contexts The RDMA CM module might invoke erdma_create_ah() or erdma_destroy_ah() in a non-sleepable context. Both of these functions will call the erdma_post_cmd_wait(), which can potentially sleep and occasionally lead to a hard lockup. Therefore, post the create_ah and destroy_ah commands in polling mode if the RDMA_CREATE_AH_SLEEPABLE and RDMA_DESTROY_AH_SLEEPABLE flags are not set, respectively. Reviewed-by: Cheng Xu Signed-off-by: Boshi Yu Link: https://patch.msgid.link/20241226084141.74823-5-boshiyu@linux.alibaba.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/erdma/erdma_verbs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index 0e8a13577fdc..af36a8d2df22 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -2219,7 +2219,7 @@ int erdma_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, erdma_set_av_cfg(&req.av_cfg, &ah->av); ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, - true); + init_attr->flags & RDMA_CREATE_AH_SLEEPABLE); if (ret) { erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_AH], ah->ahn); return ret; @@ -2243,7 +2243,7 @@ int erdma_destroy_ah(struct ib_ah *ibah, u32 flags) req.ahn = ah->ahn; ret = erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL, - true); + flags & RDMA_DESTROY_AH_SLEEPABLE); if (ret) return ret; From 220043b06fded9909bdf62e3355396eff0bb8a52 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Mon, 30 Dec 2024 16:14:04 +0200 Subject: [PATCH 45/63] RDMA/mlx5: Fix link status down event for MPV The commit below prevented MPV from unloading correctly due to blocking the netdev down event, allow sending the event for MPV mode to maintain proper unload flow. Fixes: 379013776222 ("RDMA/mlx5: Handle link status event only for LAG device") Signed-off-by: Patrisious Haddad Reviewed-by: Maor Gottlieb Link: https://patch.msgid.link/d7731478e456f61255af798a7fd4e64b006ddebb.1735567976.git.leonro@nvidia.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index e4010f871865..89597a88c75d 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -242,7 +242,8 @@ static int mlx5_netdev_event(struct notifier_block *this, case NETDEV_DOWN: { struct net_device *upper = NULL; - if (!netif_is_lag_master(ndev) && !netif_is_lag_port(ndev)) + if (!netif_is_lag_master(ndev) && !netif_is_lag_port(ndev) && + !mlx5_core_mp_enabled(mdev)) return NOTIFY_DONE; if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) { From c84f0f4f49d81645f49c3269fdcc3b84ce61e795 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Sat, 4 Jan 2025 11:45:19 +0530 Subject: [PATCH 46/63] RDMA/bnxt_re: Fix to drop reference to the mmap entry in case of error In the error handling path of bnxt_re_mmap(), driver should invoke rdma_user_mmap_entry_put() to free the reference of mmap entry in case the error happens after rdma_user_mmap_entry_get was called. 
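The rule being restored is that every rdma_user_mmap_entry_get() in the mmap handler must be balanced by rdma_user_mmap_entry_put() on every exit path, so an error detected after the get has to fall through to the common put instead of returning directly. Schematically (simplified, not the full bnxt_re_mmap() body; entry and page stand in for the driver's own variables):

        entry = rdma_user_mmap_entry_get(ib_uctx, vma);
        if (!entry)
                return -EINVAL;

        if (vma->vm_flags & VM_WRITE)
                ret = -EFAULT;          /* do not return here */
        else
                ret = vm_insert_page(vma, vma->vm_start, page);

        rdma_user_mmap_entry_put(entry);        /* reference dropped on all paths */
        return ret;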
Fixes: ea2224857882 ("RDMA/bnxt_re: Update alloc_page uapi for pacing") Reviewed-by: Saravanan Vajravel Reviewed-by: Kashyap Desai Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20250104061519.2540178-1-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 129178bdc581..27efaaf1c82a 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -4465,9 +4465,10 @@ int bnxt_re_mmap(struct ib_ucontext *ib_uctx, struct vm_area_struct *vma) case BNXT_RE_MMAP_TOGGLE_PAGE: /* Driver doesn't expect write access for user space */ if (vma->vm_flags & VM_WRITE) - return -EFAULT; - ret = vm_insert_page(vma, vma->vm_start, - virt_to_page((void *)bnxt_entry->mem_offset)); + ret = -EFAULT; + else + ret = vm_insert_page(vma, vma->vm_start, + virt_to_page((void *)bnxt_entry->mem_offset)); break; default: ret = -EINVAL; From 802a9f8792c4b4f81f36e90302067bf55ababed7 Mon Sep 17 00:00:00 2001 From: Yonatan Nachum Date: Sun, 5 Jan 2025 13:14:21 +0000 Subject: [PATCH 47/63] RDMA/efa: Align interrupt related fields to same type There is a lot of implicit casting of interrupt related fields. Use u32 as common type since this is what the device use as type for max supported EQs and what IB core expects in num_comp_vectors field. Reviewed-by: Daniel Kranzdorf Reviewed-by: Michael Margolin Signed-off-by: Yonatan Nachum Link: https://patch.msgid.link/20250105131421.29030-1-ynachum@amazon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/efa/efa.h | 8 ++++---- drivers/infiniband/hw/efa/efa_com.h | 6 +++--- drivers/infiniband/hw/efa/efa_main.c | 19 +++++++------------ 3 files changed, 14 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h index d7fc9d5eeefd..838182d0409c 100644 --- a/drivers/infiniband/hw/efa/efa.h +++ b/drivers/infiniband/hw/efa/efa.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2025 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef _EFA_H_ @@ -57,15 +57,15 @@ struct efa_dev { u64 db_bar_addr; u64 db_bar_len; - unsigned int num_irq_vectors; - int admin_msix_vector_idx; + u32 num_irq_vectors; + u32 admin_msix_vector_idx; struct efa_irq admin_irq; struct efa_stats stats; /* Array of completion EQs */ struct efa_eq *eqs; - unsigned int neqs; + u32 neqs; /* Only stores CQs with interrupts enabled */ struct xarray cqs_xa; diff --git a/drivers/infiniband/hw/efa/efa_com.h b/drivers/infiniband/hw/efa/efa_com.h index 77282234ce68..4d9ca97e4296 100644 --- a/drivers/infiniband/hw/efa/efa_com.h +++ b/drivers/infiniband/hw/efa/efa_com.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2025 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #ifndef _EFA_COM_H_ @@ -65,7 +65,7 @@ struct efa_com_admin_queue { u16 depth; struct efa_com_admin_cq cq; struct efa_com_admin_sq sq; - u16 msix_vector_idx; + u32 msix_vector_idx; unsigned long state; @@ -89,7 +89,7 @@ struct efa_com_aenq { struct efa_aenq_handlers *aenq_handlers; dma_addr_t dma_addr; u32 cc; /* consumer counter */ - u16 msix_vector_idx; + u32 msix_vector_idx; u16 depth; u8 phase; }; diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c index 45a4564c670c..4f03c0ec819f 100644 --- a/drivers/infiniband/hw/efa/efa_main.c +++ b/drivers/infiniband/hw/efa/efa_main.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause /* - * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2025 Amazon.com, Inc. or its affiliates. All rights reserved. */ #include @@ -141,8 +141,7 @@ static int efa_request_irq(struct efa_dev *dev, struct efa_irq *irq) return 0; } -static void efa_setup_comp_irq(struct efa_dev *dev, struct efa_eq *eq, - int vector) +static void efa_setup_comp_irq(struct efa_dev *dev, struct efa_eq *eq, u32 vector) { u32 cpu; @@ -305,7 +304,7 @@ static void efa_destroy_eq(struct efa_dev *dev, struct efa_eq *eq) efa_free_irq(dev, &eq->irq); } -static int efa_create_eq(struct efa_dev *dev, struct efa_eq *eq, u8 msix_vec) +static int efa_create_eq(struct efa_dev *dev, struct efa_eq *eq, u32 msix_vec) { int err; @@ -328,21 +327,17 @@ err_free_comp_irq: static int efa_create_eqs(struct efa_dev *dev) { - unsigned int neqs = dev->dev_attr.max_eq; - int err; - int i; - - neqs = min_t(unsigned int, neqs, - dev->num_irq_vectors - EFA_COMP_EQS_VEC_BASE); + u32 neqs = dev->dev_attr.max_eq; + int err, i; + neqs = min_t(u32, neqs, dev->num_irq_vectors - EFA_COMP_EQS_VEC_BASE); dev->neqs = neqs; dev->eqs = kcalloc(neqs, sizeof(*dev->eqs), GFP_KERNEL); if (!dev->eqs) return -ENOMEM; for (i = 0; i < neqs; i++) { - err = efa_create_eq(dev, &dev->eqs[i], - i + EFA_COMP_EQS_VEC_BASE); + err = efa_create_eq(dev, &dev->eqs[i], i + EFA_COMP_EQS_VEC_BASE); if (err) goto err_destroy_eqs; } From 81468c4058a62e84e475433b83b3edc613294f5e Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Mon, 6 Jan 2025 08:45:16 +0800 Subject: [PATCH 48/63] RDMA/rtrs: Add missing deinit() call A warning is triggered when repeatedly connecting and disconnecting the rnbd: list_add corruption. prev->next should be next (ffff88800b13e480), but was ffff88801ecd1338. (prev=ffff88801ecd1340). WARNING: CPU: 1 PID: 36562 at lib/list_debug.c:32 __list_add_valid_or_report+0x7f/0xa0 Workqueue: ib_cm cm_work_handler [ib_cm] RIP: 0010:__list_add_valid_or_report+0x7f/0xa0 ? __list_add_valid_or_report+0x7f/0xa0 ib_register_event_handler+0x65/0x93 [ib_core] rtrs_srv_ib_dev_init+0x29/0x30 [rtrs_server] rtrs_ib_dev_find_or_add+0x124/0x1d0 [rtrs_core] __alloc_path+0x46c/0x680 [rtrs_server] ? rtrs_rdma_connect+0xa6/0x2d0 [rtrs_server] ? rcu_is_watching+0xd/0x40 ? __mutex_lock+0x312/0xcf0 ? get_or_create_srv+0xad/0x310 [rtrs_server] ? rtrs_rdma_connect+0xa6/0x2d0 [rtrs_server] rtrs_rdma_connect+0x23c/0x2d0 [rtrs_server] ? __lock_release+0x1b1/0x2d0 cma_cm_event_handler+0x4a/0x1a0 [rdma_cm] cma_ib_req_handler+0x3a0/0x7e0 [rdma_cm] cm_process_work+0x28/0x1a0 [ib_cm] ? _raw_spin_unlock_irq+0x2f/0x50 cm_req_handler+0x618/0xa60 [ib_cm] cm_work_handler+0x71/0x520 [ib_cm] Commit 667db86bcbe8 ("RDMA/rtrs: Register ib event handler") introduced a new element .deinit but never used it at all. 
Fix it by invoking the `deinit()` to appropriately unregister the IB event handler. Cc: Jinpu Wang Fixes: 667db86bcbe8 ("RDMA/rtrs: Register ib event handler") Signed-off-by: Li Zhijian Link: https://patch.msgid.link/20250106004516.16611-1-lizhijian@fujitsu.com Acked-by: Jack Wang Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/rtrs/rtrs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/ulp/rtrs/rtrs.c b/drivers/infiniband/ulp/rtrs/rtrs.c index 4e17d546d4cc..bf38ac6f87c4 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs.c +++ b/drivers/infiniband/ulp/rtrs/rtrs.c @@ -584,6 +584,9 @@ static void dev_free(struct kref *ref) list_del(&dev->entry); mutex_unlock(&pool->mutex); + if (pool->ops && pool->ops->deinit) + pool->ops->deinit(dev); + ib_dealloc_pd(dev->ib_pd); kfree(dev); } From 8977b561216c7e693d61c6442657e33f134bfeb5 Mon Sep 17 00:00:00 2001 From: Junxian Huang Date: Mon, 6 Jan 2025 19:12:11 +0800 Subject: [PATCH 49/63] RDMA/hns: Clean up the legacy CONFIG_INFINIBAND_HNS hns driver used to support hip06 and hip08 devices with CONFIG_INFINIBAND_HNS_HIP06 and CONFIG_INFINIBAND_HNS_HIP08 respectively, which both depended on CONFIG_INFINIBAND_HNS. But we no longer provide support for hip06 and only support hip08 and higher since the commit in fixes line, so there is no need to have CONFIG_INFINIBAND_HNS any more. Remove it and only keep CONFIG_INFINIBAND_HNS_HIP08. Fixes: 38d220882426 ("RDMA/hns: Remove support for HIP06") Signed-off-by: Junxian Huang Link: https://patch.msgid.link/20250106111211.3945051-1-huangjunxian6@hisilicon.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/Makefile | 2 +- drivers/infiniband/hw/hns/Kconfig | 20 +++++--------------- drivers/infiniband/hw/hns/Makefile | 9 +++------ 3 files changed, 9 insertions(+), 22 deletions(-) diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile index 1211f4317a9f..aba96ca9bce5 100644 --- a/drivers/infiniband/hw/Makefile +++ b/drivers/infiniband/hw/Makefile @@ -11,7 +11,7 @@ obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma/ obj-$(CONFIG_INFINIBAND_VMWARE_PVRDMA) += vmw_pvrdma/ obj-$(CONFIG_INFINIBAND_USNIC) += usnic/ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1/ -obj-$(CONFIG_INFINIBAND_HNS) += hns/ +obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns/ obj-$(CONFIG_INFINIBAND_QEDR) += qedr/ obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re/ obj-$(CONFIG_INFINIBAND_ERDMA) += erdma/ diff --git a/drivers/infiniband/hw/hns/Kconfig b/drivers/infiniband/hw/hns/Kconfig index ab3fbba70789..44cdb706fe27 100644 --- a/drivers/infiniband/hw/hns/Kconfig +++ b/drivers/infiniband/hw/hns/Kconfig @@ -1,21 +1,11 @@ # SPDX-License-Identifier: GPL-2.0-only -config INFINIBAND_HNS - tristate "HNS RoCE Driver" - depends on NET_VENDOR_HISILICON - depends on ARM64 || (COMPILE_TEST && 64BIT) - depends on (HNS_DSAF && HNS_ENET) || HNS3 - help - This is a RoCE/RDMA driver for the Hisilicon RoCE engine. - - To compile HIP08 driver as module, choose M here. - config INFINIBAND_HNS_HIP08 - bool "Hisilicon Hip08 Family RoCE support" - depends on INFINIBAND_HNS && PCI && HNS3 - depends on INFINIBAND_HNS=m || HNS3=y + tristate "Hisilicon Hip08 Family RoCE support" + depends on ARM64 || (COMPILE_TEST && 64BIT) + depends on PCI && HNS3 help RoCE driver support for Hisilicon RoCE engine in Hisilicon Hip08 SoC. The RoCE engine is a PCI device. - To compile this driver, choose Y here: if INFINIBAND_HNS is m, this - module will be called hns-roce-hw-v2. + To compile this driver, choose M here. This module will be called + hns-roce-hw-v2. 
diff --git a/drivers/infiniband/hw/hns/Makefile b/drivers/infiniband/hw/hns/Makefile index be1e1cdbcfa8..7917af8e6380 100644 --- a/drivers/infiniband/hw/hns/Makefile +++ b/drivers/infiniband/hw/hns/Makefile @@ -5,12 +5,9 @@ ccflags-y := -I $(srctree)/drivers/net/ethernet/hisilicon/hns3 -hns-roce-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \ +hns-roce-hw-v2-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \ hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \ hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o hns_roce_srq.o hns_roce_restrack.o \ - hns_roce_debugfs.o + hns_roce_debugfs.o hns_roce_hw_v2.o -ifdef CONFIG_INFINIBAND_HNS_HIP08 -hns-roce-hw-v2-objs := hns_roce_hw_v2.o $(hns-roce-objs) -obj-$(CONFIG_INFINIBAND_HNS) += hns-roce-hw-v2.o -endif +obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns-roce-hw-v2.o From 76b26917e4ff9545b321730cd3b64fdd2e043769 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 7 Jan 2025 01:01:59 -0800 Subject: [PATCH 50/63] MAINTAINERS: Update the bnxt_re maintainers Adding Kalesh to the bnxt_re maintainers list Reviewed-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1736240519-2491-1-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 1e930c7a58b1..f5302f7e8ca0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4766,6 +4766,7 @@ F: drivers/scsi/mpi3mr/ BROADCOM NETXTREME-E ROCE DRIVER M: Selvin Xavier +M: Kalesh AP L: linux-rdma@vger.kernel.org S: Supported W: http://www.broadcom.com From 235f238402194a78ac5fb882a46717eac817e5d1 Mon Sep 17 00:00:00 2001 From: Michael Guralnik Date: Mon, 6 Jan 2025 20:27:10 +0200 Subject: [PATCH 51/63] RDMA/mlx5: Fix indirect mkey ODP page count Restrict the check for the number of pages handled during an ODP page fault to direct mkeys. Perform the check right after handling the page fault and don't propagate the number of handled pages to callers. Indirect mkeys and their associated direct mkeys can have different start addresses. As a result, the calculation of the number of pages to handle for an indirect mkey may not match the actual page fault handling done on the direct mkey. For example: A 4K sized page fault on a KSM mkey that has a start address that is not aligned to a page will result a calculation that assumes the number of pages required to handle are 2. While the underlying MTT might be aligned will require fetching only a single page. Thus, do the calculation and compare number of pages handled only per direct mkey. Fixes: db570d7deafb ("IB/mlx5: Add ODP support to MW") Signed-off-by: Michael Guralnik Reviewed-by: Artemy Kovalyov Link: https://patch.msgid.link/86c483d9e75ce8fe14e9ff85b62df72b779f8ab1.1736187990.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/odp.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 4eb03fc0d302..f2eb940bddc8 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -944,8 +944,7 @@ out: /* * Handle a single data segment in a page-fault WQE or RDMA region. * - * Returns number of OS pages retrieved on success. The caller may continue to - * the next data segment. + * Returns zero on success. The caller may continue to the next data segment. * Can return the following error codes: * -EAGAIN to designate a temporary error. 
The caller will abort handling the * page fault and resolve it. @@ -958,7 +957,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, u32 *bytes_committed, u32 *bytes_mapped) { - int npages = 0, ret, i, outlen, cur_outlen = 0, depth = 0; + int ret, i, outlen, cur_outlen = 0, depth = 0, pages_in_range; struct pf_frame *head = NULL, *frame; struct mlx5_ib_mkey *mmkey; struct mlx5_ib_mr *mr; @@ -993,13 +992,20 @@ next_mr: case MLX5_MKEY_MR: mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + pages_in_range = (ALIGN(io_virt + bcnt, PAGE_SIZE) - + (io_virt & PAGE_MASK)) >> + PAGE_SHIFT; ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0, false); if (ret < 0) goto end; mlx5_update_odp_stats_with_handled(mr, faults, ret); - npages += ret; + if (ret < pages_in_range) { + ret = -EFAULT; + goto end; + } + ret = 0; break; @@ -1090,7 +1096,7 @@ end: kfree(out); *bytes_committed = 0; - return ret ? ret : npages; + return ret; } /* @@ -1109,8 +1115,7 @@ end: * the committed bytes). * @receive_queue: receive WQE end of sg list * - * Returns the number of pages loaded if positive, zero for an empty WQE, or a - * negative error code. + * Returns zero for success or a negative error code. */ static int pagefault_data_segments(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault, @@ -1118,7 +1123,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev, void *wqe_end, u32 *bytes_mapped, u32 *total_wqe_bytes, bool receive_queue) { - int ret = 0, npages = 0; + int ret = 0; u64 io_virt; __be32 key; u32 byte_count; @@ -1175,10 +1180,9 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev, bytes_mapped); if (ret < 0) break; - npages += ret; } - return ret < 0 ? ret : npages; + return ret; } /* @@ -1414,12 +1418,6 @@ resolve_page_fault: free_page((unsigned long)wqe_start); } -static int pages_in_range(u64 address, u32 length) -{ - return (ALIGN(address + length, PAGE_SIZE) - - (address & PAGE_MASK)) >> PAGE_SHIFT; -} - static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault) { @@ -1458,7 +1456,7 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, if (ret == -EAGAIN) { /* We're racing with an invalidation, don't prefetch */ prefetch_activated = 0; - } else if (ret < 0 || pages_in_range(address, length) > ret) { + } else if (ret < 0) { mlx5_ib_page_fault_resume(dev, pfault, 1); if (ret != -ENOENT) mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%llx, type: 0x%x\n", From 184fe6f2382babdc63f07315c8accea258476070 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Tue, 7 Jan 2025 08:15:49 +0530 Subject: [PATCH 52/63] bnxt_en: Add ULP call to notify async events When the driver receives an async event notification from the Firmware, we make the new ulp_async_notifier() call to inform the RDMA driver that a firmware async event has been received. RDMA driver can then take necessary actions based on the event type. In the next patch, we will implement the ulp_async_notifier() callbacks in the RDMA driver. 
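The hand-off between bnxt_register_async_events() and the new bnxt_ulp_async_events() relies on a small publish/consume ordering, so that once the consumer sees an updated max_async_event_id it is also guaranteed to see the bitmap published before it. Condensed from both functions:

        /* registration (publisher): make the bitmap visible before the max id */
        ulp->async_events_bmap = events_bmap;
        smp_wmb();
        ulp->max_async_event_id = max_id;

        /* notification (consumer, BH context under rcu_read_lock()) */
        if (ulp->async_events_bmap && event_id <= ulp->max_async_event_id) {
                smp_rmb();              /* pairs with the smp_wmb() above */
                if (test_bit(event_id, ulp->async_events_bmap))
                        ops->ulp_async_notifier(ulp->handle, cmpl);
        }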
Signed-off-by: Michael Chan Signed-off-by: Selvin Xavier Signed-off-by: Kalesh AP Link: https://patch.msgid.link/20250107024553.2926983-2-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 1 + drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 28 +++++++++++++++++++ drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h | 2 ++ 3 files changed, 31 insertions(+) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 4ec4934a4edd..25850730071b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -2857,6 +2857,7 @@ static int bnxt_async_event_process(struct bnxt *bp, } __bnxt_queue_sp_work(bp); async_event_process_exit: + bnxt_ulp_async_events(bp, cmpl); return 0; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index b771c84cdd89..59c280634bc5 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -345,6 +345,34 @@ void bnxt_ulp_irq_restart(struct bnxt *bp, int err) } } +void bnxt_ulp_async_events(struct bnxt *bp, struct hwrm_async_event_cmpl *cmpl) +{ + u16 event_id = le16_to_cpu(cmpl->event_id); + struct bnxt_en_dev *edev = bp->edev; + struct bnxt_ulp_ops *ops; + struct bnxt_ulp *ulp; + + if (!bnxt_ulp_registered(edev)) + return; + ulp = edev->ulp_tbl; + + rcu_read_lock(); + + ops = rcu_dereference(ulp->ulp_ops); + if (!ops || !ops->ulp_async_notifier) + goto exit_unlock_rcu; + if (!ulp->async_events_bmap || event_id > ulp->max_async_event_id) + goto exit_unlock_rcu; + + /* Read max_async_event_id first before testing the bitmap. */ + smp_rmb(); + + if (test_bit(event_id, ulp->async_events_bmap)) + ops->ulp_async_notifier(ulp->handle, cmpl); +exit_unlock_rcu: + rcu_read_unlock(); +} + int bnxt_register_async_events(struct bnxt_en_dev *edev, unsigned long *events_bmap, u16 max_id) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h index 5d6aac60f236..a21294cf197b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h @@ -30,6 +30,8 @@ struct bnxt_msix_entry { }; struct bnxt_ulp_ops { + /* async_notifier() cannot sleep (in BH context) */ + void (*ulp_async_notifier)(void *, struct hwrm_async_event_cmpl *); void (*ulp_irq_stop)(void *); void (*ulp_irq_restart)(void *, struct bnxt_msix_entry *); }; From 7fea327840683ebec5632cf2c942ed1940ef63bf Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Tue, 7 Jan 2025 08:15:50 +0530 Subject: [PATCH 53/63] RDMA/bnxt_re: Add Async event handling support Using the option provided by Ethernet driver, register for FW Async event. During probe, while registeriung with Ethernet driver, provide the ulp hook 'ulp_async_notifier' for receiving the firmware events. 
Signed-off-by: Selvin Xavier Signed-off-by: Kalesh AP Link: https://patch.msgid.link/20250107024553.2926983-3-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 1 + drivers/infiniband/hw/bnxt_re/main.c | 39 +++++++++++++++++++ drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 6 +-- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h | 4 +- 4 files changed, 44 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 2975b11b79bf..018386295bcd 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -229,6 +229,7 @@ struct bnxt_re_dev { DECLARE_HASHTABLE(srq_hash, MAX_SRQ_HASH_BITS); struct dentry *dbg_root; struct dentry *qp_debugfs; + unsigned long event_bitmap; }; #define to_bnxt_re_dev(ptr, member) \ diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 6d1800e285ef..54dee0f5dd3f 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -295,6 +295,20 @@ static void bnxt_re_vf_res_config(struct bnxt_re_dev *rdev) &rdev->qplib_ctx); } +static void bnxt_re_async_notifier(void *handle, struct hwrm_async_event_cmpl *cmpl) +{ + struct bnxt_re_dev *rdev = (struct bnxt_re_dev *)handle; + u32 data1, data2; + u16 event_id; + + event_id = le16_to_cpu(cmpl->event_id); + data1 = le32_to_cpu(cmpl->event_data1); + data2 = le32_to_cpu(cmpl->event_data2); + + ibdev_dbg(&rdev->ibdev, "Async event_id = %d data1 = %d data2 = %d", + event_id, data1, data2); +} + static void bnxt_re_stop_irq(void *handle) { struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(handle); @@ -361,6 +375,7 @@ static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent) } static struct bnxt_ulp_ops bnxt_re_ulp_ops = { + .ulp_async_notifier = bnxt_re_async_notifier, .ulp_irq_stop = bnxt_re_stop_irq, .ulp_irq_restart = bnxt_re_start_irq }; @@ -1785,6 +1800,26 @@ static int bnxt_re_setup_qos(struct bnxt_re_dev *rdev) return 0; } +static void bnxt_re_net_unregister_async_event(struct bnxt_re_dev *rdev) +{ + if (rdev->is_virtfn) + return; + + memset(&rdev->event_bitmap, 0, sizeof(rdev->event_bitmap)); + bnxt_register_async_events(rdev->en_dev, &rdev->event_bitmap, + ASYNC_EVENT_CMPL_EVENT_ID_DCB_CONFIG_CHANGE); +} + +static void bnxt_re_net_register_async_event(struct bnxt_re_dev *rdev) +{ + if (rdev->is_virtfn) + return; + + rdev->event_bitmap |= (1 << ASYNC_EVENT_CMPL_EVENT_ID_DCB_CONFIG_CHANGE); + bnxt_register_async_events(rdev->en_dev, &rdev->event_bitmap, + ASYNC_EVENT_CMPL_EVENT_ID_DCB_CONFIG_CHANGE); +} + static void bnxt_re_query_hwrm_intf_version(struct bnxt_re_dev *rdev) { struct bnxt_en_dev *en_dev = rdev->en_dev; @@ -1864,6 +1899,8 @@ static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type) bnxt_re_debugfs_rem_pdev(rdev); + bnxt_re_net_unregister_async_event(rdev); + if (test_and_clear_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags)) cancel_delayed_work_sync(&rdev->worker); @@ -2077,6 +2114,8 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) bnxt_re_debugfs_add_pdev(rdev); + bnxt_re_net_register_async_event(rdev); + return 0; free_sctx: bnxt_re_net_stats_ctx_free(rdev, rdev->qplib_ctx.stats.fw_id); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index 59c280634bc5..3e17db0a453e 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ 
b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -373,9 +373,8 @@ exit_unlock_rcu: rcu_read_unlock(); } -int bnxt_register_async_events(struct bnxt_en_dev *edev, - unsigned long *events_bmap, - u16 max_id) +void bnxt_register_async_events(struct bnxt_en_dev *edev, + unsigned long *events_bmap, u16 max_id) { struct net_device *dev = edev->net; struct bnxt *bp = netdev_priv(dev); @@ -387,7 +386,6 @@ int bnxt_register_async_events(struct bnxt_en_dev *edev, smp_wmb(); ulp->max_async_event_id = max_id; bnxt_hwrm_func_drv_rgtr(bp, events_bmap, max_id + 1, true); - return 0; } EXPORT_SYMBOL(bnxt_register_async_events); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h index a21294cf197b..ee6a5b8562c3 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h @@ -126,6 +126,6 @@ int bnxt_register_dev(struct bnxt_en_dev *edev, struct bnxt_ulp_ops *ulp_ops, void *handle); void bnxt_unregister_dev(struct bnxt_en_dev *edev); int bnxt_send_msg(struct bnxt_en_dev *edev, struct bnxt_fw_msg *fw_msg); -int bnxt_register_async_events(struct bnxt_en_dev *edev, - unsigned long *events_bmap, u16 max_id); +void bnxt_register_async_events(struct bnxt_en_dev *edev, + unsigned long *events_bmap, u16 max_id); #endif From c0ad30eddc2858b97024527ffff1704306ac8fae Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Tue, 7 Jan 2025 08:15:51 +0530 Subject: [PATCH 54/63] RDMA/bnxt_re: Query firmware defaults of CC params during probe Added function to query firmware default values of CC parameters during driver init. These values will be stored in driver local structure and used in subsequent patch. Signed-off-by: Selvin Xavier Signed-off-by: Kalesh AP Link: https://patch.msgid.link/20250107024553.2926983-4-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 1 + drivers/infiniband/hw/bnxt_re/main.c | 5 + drivers/infiniband/hw/bnxt_re/qplib_sp.c | 113 +++++++++++++++++++++++ drivers/infiniband/hw/bnxt_re/qplib_sp.h | 2 + 4 files changed, 121 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 018386295bcd..f40aca550328 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -230,6 +230,7 @@ struct bnxt_re_dev { struct dentry *dbg_root; struct dentry *qp_debugfs; unsigned long event_bitmap; + struct bnxt_qplib_cc_param cc_param; }; #define to_bnxt_re_dev(ptr, member) \ diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 54dee0f5dd3f..87ff6d874015 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -2096,6 +2096,11 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) set_bit(BNXT_RE_FLAG_RESOURCES_INITIALIZED, &rdev->flags); if (!rdev->is_virtfn) { + /* Query f/w defaults of CC params */ + rc = bnxt_qplib_query_cc_param(&rdev->qplib_res, &rdev->cc_param); + if (rc) + ibdev_warn(&rdev->ibdev, "Failed to query CC defaults\n"); + rc = bnxt_re_setup_qos(rdev); if (rc) ibdev_info(&rdev->ibdev, diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 7e20ae3d2c4f..d56cc3330d1b 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -1016,3 +1016,116 @@ free_mem: dma_free_coherent(&rcfw->pdev->dev, sbuf.size, sbuf.sb, sbuf.dma_addr); return rc; } + +static void 
bnxt_qplib_read_cc_gen1(struct bnxt_qplib_cc_param_ext *cc_ext, + struct creq_query_roce_cc_gen1_resp_sb_tlv *sb) +{ + cc_ext->inact_th_hi = le16_to_cpu(sb->inactivity_th_hi); + cc_ext->min_delta_cnp = le16_to_cpu(sb->min_time_between_cnps); + cc_ext->init_cp = le16_to_cpu(sb->init_cp); + cc_ext->tr_update_mode = sb->tr_update_mode; + cc_ext->tr_update_cyls = sb->tr_update_cycles; + cc_ext->fr_rtt = sb->fr_num_rtts; + cc_ext->ai_rate_incr = sb->ai_rate_increase; + cc_ext->rr_rtt_th = le16_to_cpu(sb->reduction_relax_rtts_th); + cc_ext->ar_cr_th = le16_to_cpu(sb->additional_relax_cr_th); + cc_ext->cr_min_th = le16_to_cpu(sb->cr_min_th); + cc_ext->bw_avg_weight = sb->bw_avg_weight; + cc_ext->cr_factor = sb->actual_cr_factor; + cc_ext->cr_th_max_cp = le16_to_cpu(sb->max_cp_cr_th); + cc_ext->cp_bias_en = sb->cp_bias_en; + cc_ext->cp_bias = sb->cp_bias; + cc_ext->cnp_ecn = sb->cnp_ecn; + cc_ext->rtt_jitter_en = sb->rtt_jitter_en; + cc_ext->bytes_per_usec = le16_to_cpu(sb->link_bytes_per_usec); + cc_ext->cc_cr_reset_th = le16_to_cpu(sb->reset_cc_cr_th); + cc_ext->cr_width = sb->cr_width; + cc_ext->min_quota = sb->quota_period_min; + cc_ext->max_quota = sb->quota_period_max; + cc_ext->abs_max_quota = sb->quota_period_abs_max; + cc_ext->tr_lb = le16_to_cpu(sb->tr_lower_bound); + cc_ext->cr_prob_fac = sb->cr_prob_factor; + cc_ext->tr_prob_fac = sb->tr_prob_factor; + cc_ext->fair_cr_th = le16_to_cpu(sb->fairness_cr_th); + cc_ext->red_div = sb->red_div; + cc_ext->cnp_ratio_th = sb->cnp_ratio_th; + cc_ext->ai_ext_rtt = le16_to_cpu(sb->exp_ai_rtts); + cc_ext->exp_crcp_ratio = sb->exp_ai_cr_cp_ratio; + cc_ext->low_rate_en = sb->use_rate_table; + cc_ext->cpcr_update_th = le16_to_cpu(sb->cp_exp_update_th); + cc_ext->ai_rtt_th1 = le16_to_cpu(sb->high_exp_ai_rtts_th1); + cc_ext->ai_rtt_th2 = le16_to_cpu(sb->high_exp_ai_rtts_th2); + cc_ext->cf_rtt_th = le16_to_cpu(sb->actual_cr_cong_free_rtts_th); + cc_ext->sc_cr_th1 = le16_to_cpu(sb->severe_cong_cr_th1); + cc_ext->sc_cr_th2 = le16_to_cpu(sb->severe_cong_cr_th2); + cc_ext->l64B_per_rtt = le32_to_cpu(sb->link64B_per_rtt); + cc_ext->cc_ack_bytes = sb->cc_ack_bytes; + cc_ext->reduce_cf_rtt_th = le16_to_cpu(sb->reduce_init_cong_free_rtts_th); +} + +int bnxt_qplib_query_cc_param(struct bnxt_qplib_res *res, + struct bnxt_qplib_cc_param *cc_param) +{ + struct bnxt_qplib_tlv_query_rcc_sb *ext_sb; + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct creq_query_roce_cc_resp resp = {}; + struct creq_query_roce_cc_resp_sb *sb; + struct bnxt_qplib_cmdqmsg msg = {}; + struct cmdq_query_roce_cc req = {}; + struct bnxt_qplib_rcfw_sbuf sbuf; + size_t resp_size; + int rc; + + /* Query the parameters from chip */ + bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, CMDQ_BASE_OPCODE_QUERY_ROCE_CC, + sizeof(req)); + if (bnxt_qplib_is_chip_gen_p5_p7(res->cctx)) + resp_size = sizeof(*ext_sb); + else + resp_size = sizeof(*sb); + + sbuf.size = ALIGN(resp_size, BNXT_QPLIB_CMDQE_UNITS); + sbuf.sb = dma_alloc_coherent(&rcfw->pdev->dev, sbuf.size, + &sbuf.dma_addr, GFP_KERNEL); + if (!sbuf.sb) + return -ENOMEM; + + req.resp_size = sbuf.size / BNXT_QPLIB_CMDQE_UNITS; + bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, &sbuf, sizeof(req), + sizeof(resp), 0); + rc = bnxt_qplib_rcfw_send_message(res->rcfw, &msg); + if (rc) + goto out; + + ext_sb = sbuf.sb; + sb = bnxt_qplib_is_chip_gen_p5_p7(res->cctx) ? 
&ext_sb->base_sb : + (struct creq_query_roce_cc_resp_sb *)ext_sb; + + cc_param->enable = sb->enable_cc & CREQ_QUERY_ROCE_CC_RESP_SB_ENABLE_CC; + cc_param->tos_ecn = (sb->tos_dscp_tos_ecn & + CREQ_QUERY_ROCE_CC_RESP_SB_TOS_ECN_MASK) >> + CREQ_QUERY_ROCE_CC_RESP_SB_TOS_ECN_SFT; + cc_param->tos_dscp = (sb->tos_dscp_tos_ecn & + CREQ_QUERY_ROCE_CC_RESP_SB_TOS_DSCP_MASK) >> + CREQ_QUERY_ROCE_CC_RESP_SB_TOS_DSCP_SFT; + cc_param->alt_tos_dscp = sb->alt_tos_dscp; + cc_param->alt_vlan_pcp = sb->alt_vlan_pcp; + + cc_param->g = sb->g; + cc_param->nph_per_state = sb->num_phases_per_state; + cc_param->init_cr = le16_to_cpu(sb->init_cr); + cc_param->init_tr = le16_to_cpu(sb->init_tr); + cc_param->cc_mode = sb->cc_mode; + cc_param->inact_th = le16_to_cpu(sb->inactivity_th); + cc_param->rtt = le16_to_cpu(sb->rtt); + cc_param->tcp_cp = le16_to_cpu(sb->tcp_cp); + cc_param->time_pph = sb->time_per_phase; + cc_param->pkts_pph = sb->pkts_per_phase; + if (bnxt_qplib_is_chip_gen_p5_p7(res->cctx)) { + bnxt_qplib_read_cc_gen1(&cc_param->cc_ext, &ext_sb->gen1_sb); + cc_param->inact_th |= (cc_param->cc_ext.inact_th_hi & 0x3F) << 16; + } +out: + dma_free_coherent(&rcfw->pdev->dev, sbuf.size, sbuf.sb, sbuf.dma_addr); + return rc; +} diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index e6beeb514b7d..debb26080143 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -355,6 +355,8 @@ int bnxt_qplib_modify_cc(struct bnxt_qplib_res *res, struct bnxt_qplib_cc_param *cc_param); int bnxt_qplib_read_context(struct bnxt_qplib_rcfw *rcfw, u8 type, u32 xid, u32 resp_size, void *resp_va); +int bnxt_qplib_query_cc_param(struct bnxt_qplib_res *res, + struct bnxt_qplib_cc_param *cc_param); #define BNXT_VAR_MAX_WQE 4352 #define BNXT_VAR_MAX_SLOT_ALIGN 256 From 51dc5312dcd929efea7647c0c0e75afa461531b5 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Tue, 7 Jan 2025 08:15:52 +0530 Subject: [PATCH 55/63] RDMA/bnxt_re: Add support to handle DCB_CONFIG_CHANGE event QP1 context in HW needs to be updated when there is a change in the default DSCP values used for RoCE traffic. Handle the event from FW and modify the dscp value used by QP1. 
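The notifier itself runs in BH context and must not sleep (see the bnxt_ulp_ops comment earlier in this series), while re-reading the CC parameters and modifying QP1 both issue sleeping firmware commands, so the handler only queues work and the update runs from the dedicated dcb_wq. Condensed from the hunks below:

        /* in bnxt_re_async_notifier(), atomic context */
        case ASYNC_EVENT_CMPL_EVENT_ID_DCB_CONFIG_CHANGE:
                dcb_work = kzalloc(sizeof(*dcb_work), GFP_ATOMIC);
                if (!dcb_work)
                        break;
                dcb_work->rdev = rdev;
                INIT_WORK(&dcb_work->work, bnxt_re_dcb_wq_task);
                queue_work(rdev->dcb_wq, &dcb_work->work);
                break;

bnxt_re_dcb_wq_task() then re-queries the CC parameters in process context and, only if the default tos_dscp changed, issues a modify-QP with CMDQ_MODIFY_QP_MODIFY_MASK_TOS_DSCP on the GSI QP.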
Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/20250107024553.2926983-5-kalesh-anakkur.purayil@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 1 + drivers/infiniband/hw/bnxt_re/main.c | 104 +++++++++++++++++++++++ drivers/infiniband/hw/bnxt_re/qplib_fp.h | 1 + drivers/infiniband/hw/bnxt_re/qplib_sp.h | 1 + 4 files changed, 107 insertions(+) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index f40aca550328..dc2b193af7e8 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -231,6 +231,7 @@ struct bnxt_re_dev { struct dentry *qp_debugfs; unsigned long event_bitmap; struct bnxt_qplib_cc_param cc_param; + struct workqueue_struct *dcb_wq; }; #define to_bnxt_re_dev(ptr, member) \ diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 87ff6d874015..93c4ba868ec3 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -295,9 +295,96 @@ static void bnxt_re_vf_res_config(struct bnxt_re_dev *rdev) &rdev->qplib_ctx); } +struct bnxt_re_dcb_work { + struct work_struct work; + struct bnxt_re_dev *rdev; + struct hwrm_async_event_cmpl cmpl; +}; + +static bool bnxt_re_is_qp1_qp(struct bnxt_re_qp *qp) +{ + return qp->ib_qp.qp_type == IB_QPT_GSI; +} + +static struct bnxt_re_qp *bnxt_re_get_qp1_qp(struct bnxt_re_dev *rdev) +{ + struct bnxt_re_qp *qp; + + mutex_lock(&rdev->qp_lock); + list_for_each_entry(qp, &rdev->qp_list, list) { + if (bnxt_re_is_qp1_qp(qp)) { + mutex_unlock(&rdev->qp_lock); + return qp; + } + } + mutex_unlock(&rdev->qp_lock); + return NULL; +} + +static int bnxt_re_update_qp1_tos_dscp(struct bnxt_re_dev *rdev) +{ + struct bnxt_re_qp *qp; + + if (!bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) + return 0; + + qp = bnxt_re_get_qp1_qp(rdev); + if (!qp) + return 0; + + qp->qplib_qp.modify_flags = CMDQ_MODIFY_QP_MODIFY_MASK_TOS_DSCP; + qp->qplib_qp.tos_dscp = rdev->cc_param.qp1_tos_dscp; + + return bnxt_qplib_modify_qp(&rdev->qplib_res, &qp->qplib_qp); +} + +static void bnxt_re_init_dcb_wq(struct bnxt_re_dev *rdev) +{ + rdev->dcb_wq = create_singlethread_workqueue("bnxt_re_dcb_wq"); +} + +static void bnxt_re_uninit_dcb_wq(struct bnxt_re_dev *rdev) +{ + if (!rdev->dcb_wq) + return; + destroy_workqueue(rdev->dcb_wq); +} + +static void bnxt_re_dcb_wq_task(struct work_struct *work) +{ + struct bnxt_re_dcb_work *dcb_work = + container_of(work, struct bnxt_re_dcb_work, work); + struct bnxt_re_dev *rdev = dcb_work->rdev; + struct bnxt_qplib_cc_param *cc_param; + int rc; + + if (!rdev) + goto free_dcb; + + cc_param = &rdev->cc_param; + rc = bnxt_qplib_query_cc_param(&rdev->qplib_res, cc_param); + if (rc) { + ibdev_dbg(&rdev->ibdev, "Failed to query ccparam rc:%d", rc); + goto free_dcb; + } + if (cc_param->qp1_tos_dscp != cc_param->tos_dscp) { + cc_param->qp1_tos_dscp = cc_param->tos_dscp; + rc = bnxt_re_update_qp1_tos_dscp(rdev); + if (rc) { + ibdev_dbg(&rdev->ibdev, "%s: Failed to modify QP1 rc:%d", + __func__, rc); + goto free_dcb; + } + } + +free_dcb: + kfree(dcb_work); +} + static void bnxt_re_async_notifier(void *handle, struct hwrm_async_event_cmpl *cmpl) { struct bnxt_re_dev *rdev = (struct bnxt_re_dev *)handle; + struct bnxt_re_dcb_work *dcb_work; u32 data1, data2; u16 event_id; @@ -307,6 +394,21 @@ static void bnxt_re_async_notifier(void *handle, struct hwrm_async_event_cmpl *c ibdev_dbg(&rdev->ibdev, "Async event_id = %d data1 = %d data2 = %d", 
event_id, data1, data2); + + switch (event_id) { + case ASYNC_EVENT_CMPL_EVENT_ID_DCB_CONFIG_CHANGE: + dcb_work = kzalloc(sizeof(*dcb_work), GFP_ATOMIC); + if (!dcb_work) + break; + + dcb_work->rdev = rdev; + memcpy(&dcb_work->cmpl, cmpl, sizeof(*cmpl)); + INIT_WORK(&dcb_work->work, bnxt_re_dcb_wq_task); + queue_work(rdev->dcb_wq, &dcb_work->work); + break; + default: + break; + } } static void bnxt_re_stop_irq(void *handle) @@ -1900,6 +2002,7 @@ static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type) bnxt_re_debugfs_rem_pdev(rdev); bnxt_re_net_unregister_async_event(rdev); + bnxt_re_uninit_dcb_wq(rdev); if (test_and_clear_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags)) cancel_delayed_work_sync(&rdev->worker); @@ -2119,6 +2222,7 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) bnxt_re_debugfs_add_pdev(rdev); + bnxt_re_init_dcb_wq(rdev); bnxt_re_net_register_async_event(rdev); return 0; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index ef3424c81345..264cf0c2c1ac 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -343,6 +343,7 @@ struct bnxt_qplib_qp { u32 msn; u32 msn_tbl_sz; bool is_host_msn_tbl; + u8 tos_dscp; }; #define BNXT_QPLIB_MAX_CQE_ENTRY_SIZE sizeof(struct cq_base) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index debb26080143..eafa0c1bc732 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -296,6 +296,7 @@ struct bnxt_qplib_cc_param_ext { struct bnxt_qplib_cc_param { u8 alt_vlan_pcp; + u8 qp1_tos_dscp; u16 alt_tos_dscp; u8 cc_mode; u8 enable; From 57e6464c221c7ffaeae784bcaa8171ffac179d3e Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Thu, 9 Jan 2025 10:18:12 -0800 Subject: [PATCH 56/63] RDMA/bnxt_re: Pass the context for ulp_irq_stop ulp_irq_stop() can be invoked from a context where FW is healthy or when FW is in a reset state. In the latter case, ULP must stop all interactions with HW/FW and also with application and stack. Added a new parameter to the ulp_irq_stop() function to achieve that. 
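To make the new contract concrete, here is a minimal, self-contained userspace sketch of the idea (illustration only, not the driver change itself; fake_ulp, fake_ulp_ops and FAKE_ERR_DEVICE_DETACHED are invented names):

  #include <stdbool.h>
  #include <stdio.h>

  #define FAKE_ERR_DEVICE_DETACHED (1u << 0)

  struct fake_ulp {
          unsigned int flags;     /* stands in for rdev->flags */
  };

  struct fake_ulp_ops {
          /* new signature: the caller reports whether a FW reset is in progress */
          void (*ulp_irq_stop)(void *handle, bool reset);
  };

  static void fake_irq_stop(void *handle, bool reset)
  {
          struct fake_ulp *ulp = handle;

          if (reset) {
                  /* FW reset: mark the device detached so pending commands fail fast */
                  ulp->flags |= FAKE_ERR_DEVICE_DETACHED;
          }
          /* common path: tear down IRQs in either case */
          printf("IRQs stopped, reset=%d, flags=%#x\n", reset, ulp->flags);
  }

  static const struct fake_ulp_ops ops = { .ulp_irq_stop = fake_irq_stop };

  int main(void)
  {
          struct fake_ulp ulp = { .flags = 0 };

          ops.ulp_irq_stop(&ulp, false);  /* healthy FW: plain IRQ teardown */
          ops.ulp_irq_stop(&ulp, true);   /* FW reset: also quiesce the ULP */
          return 0;
  }

The real driver additionally wakes the command-queue waiters and dispatches IB_EVENT_DEVICE_FATAL on the reset path; the sketch only shows where the context flag drives the decision.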
Reviewed-by: Vikas Gupta Reviewed-by: Michael Chan Reviewed-by: Chandramohan Akula Reviewed-by: Pavan Chebbi Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1736446693-6692-2-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/main.c | 12 +++++++++++- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c | 5 ++++- drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h | 2 +- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 93c4ba868ec3..bfccf34c08f9 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -83,6 +83,8 @@ static int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev); static int bnxt_re_hwrm_qcfg(struct bnxt_re_dev *rdev, u32 *db_len, u32 *offset); +static void bnxt_re_dispatch_event(struct ib_device *ibdev, struct ib_qp *qp, + u8 port_num, enum ib_event_type event); static void bnxt_re_set_db_offset(struct bnxt_re_dev *rdev) { struct bnxt_qplib_chip_ctx *cctx; @@ -411,7 +413,7 @@ static void bnxt_re_async_notifier(void *handle, struct hwrm_async_event_cmpl *c } } -static void bnxt_re_stop_irq(void *handle) +static void bnxt_re_stop_irq(void *handle, bool reset) { struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(handle); struct bnxt_qplib_rcfw *rcfw; @@ -422,6 +424,14 @@ static void bnxt_re_stop_irq(void *handle) rdev = en_info->rdev; rcfw = &rdev->rcfw; + if (reset) { + set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags); + set_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags); + wake_up_all(&rdev->rcfw.cmdq.waitq); + bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, + IB_EVENT_DEVICE_FATAL); + } + for (indx = BNXT_RE_NQ_IDX; indx < rdev->nqr->num_msix; indx++) { nq = &rdev->nqr->nq[indx - 1]; bnxt_qplib_nq_stop_irq(nq, false); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c index 3e17db0a453e..a5fa9a254e01 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.c @@ -297,6 +297,7 @@ void bnxt_ulp_irq_stop(struct bnxt *bp) { struct bnxt_en_dev *edev = bp->edev; struct bnxt_ulp_ops *ops; + bool reset = false; if (!edev || !(edev->flags & BNXT_EN_FLAG_MSIX_REQUESTED)) return; @@ -310,7 +311,9 @@ void bnxt_ulp_irq_stop(struct bnxt *bp) ops = rtnl_dereference(ulp->ulp_ops); if (!ops || !ops->ulp_irq_stop) return; - ops->ulp_irq_stop(ulp->handle); + if (test_bit(BNXT_STATE_FW_RESET_DET, &bp->state)) + reset = true; + ops->ulp_irq_stop(ulp->handle, reset); } } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h index ee6a5b8562c3..65ea2a546033 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ulp.h @@ -32,7 +32,7 @@ struct bnxt_msix_entry { struct bnxt_ulp_ops { /* async_notifier() cannot sleep (in BH context) */ void (*ulp_async_notifier)(void *, struct hwrm_async_event_cmpl *); - void (*ulp_irq_stop)(void *); + void (*ulp_irq_stop)(void *, bool); void (*ulp_irq_restart)(void *, struct bnxt_msix_entry *); }; From 9264cd6aa8f194753507cb6e1f444141e7c79f48 Mon Sep 17 00:00:00 2001 From: Kalesh AP Date: Thu, 9 Jan 2025 10:18:13 -0800 Subject: [PATCH 57/63] RDMA/bnxt_re: Allocate dev_attr information dynamically In order to optimize the size of driver private structure, the memory for dev_attr is allocated dynamically during the chip context initialization. 
In order to make certain runtime decisions, store dev_attr in the qplib_res structure. Signed-off-by: Kalesh AP Signed-off-by: Selvin Xavier Link: https://patch.msgid.link/1736446693-6692-3-git-send-email-selvin.xavier@broadcom.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 2 +- drivers/infiniband/hw/bnxt_re/hw_counters.c | 2 +- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 38 ++++++++++----------- drivers/infiniband/hw/bnxt_re/main.c | 36 ++++++++++++------- drivers/infiniband/hw/bnxt_re/qplib_res.c | 7 ++-- drivers/infiniband/hw/bnxt_re/qplib_res.h | 4 +-- drivers/infiniband/hw/bnxt_re/qplib_sp.c | 4 +-- drivers/infiniband/hw/bnxt_re/qplib_sp.h | 3 +- 8 files changed, 51 insertions(+), 45 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index dc2b193af7e8..b91a85a491d0 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -204,7 +204,7 @@ struct bnxt_re_dev { struct bnxt_re_nq_record *nqr; /* Device Resources */ - struct bnxt_qplib_dev_attr dev_attr; + struct bnxt_qplib_dev_attr *dev_attr; struct bnxt_qplib_ctx qplib_ctx; struct bnxt_qplib_res qplib_res; struct bnxt_qplib_dpi dpi_privileged; diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.c b/drivers/infiniband/hw/bnxt_re/hw_counters.c index 77ec2eda7268..3ac47f4e6122 100644 --- a/drivers/infiniband/hw/bnxt_re/hw_counters.c +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.c @@ -348,7 +348,7 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, goto done; } bnxt_re_copy_err_stats(rdev, stats, err_s); - if (_is_ext_stats_supported(rdev->dev_attr.dev_cap_flags) && + if (_is_ext_stats_supported(rdev->dev_attr->dev_cap_flags) && !rdev->is_virtfn) { rc = bnxt_re_get_ext_stat(rdev, stats); if (rc) { diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 27efaaf1c82a..8b5435ee72a5 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -159,7 +159,7 @@ static int __qp_access_flags_to_ib(struct bnxt_qplib_chip_ctx *cctx, u8 qflags) static void bnxt_re_check_and_set_relaxed_ordering(struct bnxt_re_dev *rdev, struct bnxt_qplib_mrw *qplib_mr) { - if (_is_relaxed_ordering_supported(rdev->dev_attr.dev_cap_flags2) && + if (_is_relaxed_ordering_supported(rdev->dev_attr->dev_cap_flags2) && pcie_relaxed_ordering_enabled(rdev->en_dev->pdev)) qplib_mr->flags |= CMDQ_REGISTER_MR_FLAGS_ENABLE_RO; } @@ -184,7 +184,7 @@ int bnxt_re_query_device(struct ib_device *ibdev, struct ib_udata *udata) { struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); - struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_qplib_dev_attr *dev_attr = rdev->dev_attr; memset(ib_attr, 0, sizeof(*ib_attr)); memcpy(&ib_attr->fw_ver, dev_attr->fw_ver, @@ -273,7 +273,7 @@ int bnxt_re_query_port(struct ib_device *ibdev, u32 port_num, struct ib_port_attr *port_attr) { struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); - struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_qplib_dev_attr *dev_attr = rdev->dev_attr; int rc; memset(port_attr, 0, sizeof(*port_attr)); @@ -331,8 +331,8 @@ void bnxt_re_query_fw_str(struct ib_device *ibdev, char *str) struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d.%d", - rdev->dev_attr.fw_ver[0], rdev->dev_attr.fw_ver[1], - rdev->dev_attr.fw_ver[2], rdev->dev_attr.fw_ver[3]); + rdev->dev_attr->fw_ver[0], rdev->dev_attr->fw_ver[1], 
+ rdev->dev_attr->fw_ver[2], rdev->dev_attr->fw_ver[3]); } int bnxt_re_query_pkey(struct ib_device *ibdev, u32 port_num, @@ -583,7 +583,7 @@ static int bnxt_re_create_fence_mr(struct bnxt_re_pd *pd) mr->qplib_mr.pd = &pd->qplib_pd; mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR; mr->qplib_mr.access_flags = __from_ib_access_flags(mr_access_flags); - if (!_is_alloc_mr_unified(rdev->dev_attr.dev_cap_flags)) { + if (!_is_alloc_mr_unified(rdev->dev_attr->dev_cap_flags)) { rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); if (rc) { ibdev_err(&rdev->ibdev, "Failed to alloc fence-HW-MR\n"); @@ -1060,7 +1060,7 @@ static int bnxt_re_setup_swqe_size(struct bnxt_re_qp *qp, rdev = qp->rdev; qplqp = &qp->qplib_qp; sq = &qplqp->sq; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; align = sizeof(struct sq_send_hdr); ilsize = ALIGN(init_attr->cap.max_inline_data, align); @@ -1280,7 +1280,7 @@ static int bnxt_re_init_rq_attr(struct bnxt_re_qp *qp, rdev = qp->rdev; qplqp = &qp->qplib_qp; rq = &qplqp->rq; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; if (init_attr->srq) { struct bnxt_re_srq *srq; @@ -1317,7 +1317,7 @@ static void bnxt_re_adjust_gsi_rq_attr(struct bnxt_re_qp *qp) rdev = qp->rdev; qplqp = &qp->qplib_qp; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; if (!bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) { qplqp->rq.max_sge = dev_attr->max_qp_sges; @@ -1343,7 +1343,7 @@ static int bnxt_re_init_sq_attr(struct bnxt_re_qp *qp, rdev = qp->rdev; qplqp = &qp->qplib_qp; sq = &qplqp->sq; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; sq->max_sge = init_attr->cap.max_send_sge; entries = init_attr->cap.max_send_wr; @@ -1396,7 +1396,7 @@ static void bnxt_re_adjust_gsi_sq_attr(struct bnxt_re_qp *qp, rdev = qp->rdev; qplqp = &qp->qplib_qp; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; if (!bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) { entries = bnxt_re_init_depth(init_attr->cap.max_send_wr + 1, uctx); @@ -1445,7 +1445,7 @@ static int bnxt_re_init_qp_attr(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd, rdev = qp->rdev; qplqp = &qp->qplib_qp; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; /* Setup misc params */ ether_addr_copy(qplqp->smac, rdev->netdev->dev_addr); @@ -1615,7 +1615,7 @@ int bnxt_re_create_qp(struct ib_qp *ib_qp, struct ib_qp_init_attr *qp_init_attr, ib_pd = ib_qp->pd; pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); rdev = pd->rdev; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); uctx = rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext, ib_uctx); @@ -1843,7 +1843,7 @@ int bnxt_re_create_srq(struct ib_srq *ib_srq, ib_pd = ib_srq->pd; pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); rdev = pd->rdev; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; srq = container_of(ib_srq, struct bnxt_re_srq, ib_srq); if (srq_init_attr->attr.max_wr >= dev_attr->max_srq_wqes) { @@ -2047,7 +2047,7 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, { struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); struct bnxt_re_dev *rdev = qp->rdev; - struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_qplib_dev_attr *dev_attr = rdev->dev_attr; enum ib_qp_state curr_qp_state, new_qp_state; int rc, entries; unsigned int flags; @@ -3089,7 +3089,7 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct ib_udata *udata = &attrs->driver_udata; struct bnxt_re_ucontext *uctx = 
rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext, ib_uctx); - struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_qplib_dev_attr *dev_attr = rdev->dev_attr; struct bnxt_qplib_chip_ctx *cctx; int cqe = attr->cqe; int rc, entries; @@ -3224,7 +3224,7 @@ int bnxt_re_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) cq = container_of(ibcq, struct bnxt_re_cq, ib_cq); rdev = cq->rdev; - dev_attr = &rdev->dev_attr; + dev_attr = rdev->dev_attr; if (!ibcq->uobject) { ibdev_err(&rdev->ibdev, "Kernel CQ Resize not supported"); return -EOPNOTSUPP; @@ -4197,7 +4197,7 @@ static struct ib_mr *__bnxt_re_user_reg_mr(struct ib_pd *ib_pd, u64 length, u64 mr->qplib_mr.access_flags = __from_ib_access_flags(mr_access_flags); mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_MR; - if (!_is_alloc_mr_unified(rdev->dev_attr.dev_cap_flags)) { + if (!_is_alloc_mr_unified(rdev->dev_attr->dev_cap_flags)) { rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); if (rc) { ibdev_err(&rdev->ibdev, "Failed to allocate MR rc = %d", rc); @@ -4289,7 +4289,7 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) struct bnxt_re_ucontext *uctx = container_of(ctx, struct bnxt_re_ucontext, ib_uctx); struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); - struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_qplib_dev_attr *dev_attr = rdev->dev_attr; struct bnxt_re_user_mmap_entry *entry; struct bnxt_re_uctx_resp resp = {}; struct bnxt_re_uctx_req ureq = {}; diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index bfccf34c08f9..bc02a8608fe7 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -148,6 +148,10 @@ static void bnxt_re_destroy_chip_ctx(struct bnxt_re_dev *rdev) if (!rdev->chip_ctx) return; + + kfree(rdev->dev_attr); + rdev->dev_attr = NULL; + chip_ctx = rdev->chip_ctx; rdev->chip_ctx = NULL; rdev->rcfw.res = NULL; @@ -161,7 +165,7 @@ static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev) { struct bnxt_qplib_chip_ctx *chip_ctx; struct bnxt_en_dev *en_dev; - int rc; + int rc = -ENOMEM; en_dev = rdev->en_dev; @@ -177,7 +181,10 @@ static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev) rdev->qplib_res.cctx = rdev->chip_ctx; rdev->rcfw.res = &rdev->qplib_res; - rdev->qplib_res.dattr = &rdev->dev_attr; + rdev->dev_attr = kzalloc(sizeof(*rdev->dev_attr), GFP_KERNEL); + if (!rdev->dev_attr) + goto free_chip_ctx; + rdev->qplib_res.dattr = rdev->dev_attr; rdev->qplib_res.is_vf = BNXT_EN_VF(en_dev); rdev->qplib_res.en_dev = en_dev; @@ -185,16 +192,20 @@ static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev) bnxt_re_set_db_offset(rdev); rc = bnxt_qplib_map_db_bar(&rdev->qplib_res); - if (rc) { - kfree(rdev->chip_ctx); - rdev->chip_ctx = NULL; - return rc; - } + if (rc) + goto free_dev_attr; if (bnxt_qplib_determine_atomics(en_dev->pdev)) ibdev_info(&rdev->ibdev, "platform doesn't support global atomics."); return 0; +free_dev_attr: + kfree(rdev->dev_attr); + rdev->dev_attr = NULL; +free_chip_ctx: + kfree(rdev->chip_ctx); + rdev->chip_ctx = NULL; + return rc; } /* SR-IOV helper functions */ @@ -216,7 +227,7 @@ static void bnxt_re_limit_pf_res(struct bnxt_re_dev *rdev) struct bnxt_qplib_ctx *ctx; int i; - attr = &rdev->dev_attr; + attr = rdev->dev_attr; ctx = &rdev->qplib_ctx; ctx->qpc_count = min_t(u32, BNXT_RE_MAX_QPC_COUNT, @@ -230,7 +241,7 @@ static void bnxt_re_limit_pf_res(struct bnxt_re_dev *rdev) if (!bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx)) for 
(i = 0; i < MAX_TQM_ALLOC_REQ; i++) rdev->qplib_ctx.tqm_ctx.qcount[i] = - rdev->dev_attr.tqm_alloc_reqs[i]; + rdev->dev_attr->tqm_alloc_reqs[i]; } static void bnxt_re_limit_vf_res(struct bnxt_qplib_ctx *qplib_ctx, u32 num_vf) @@ -1726,12 +1737,11 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) /* Configure and allocate resources for qplib */ rdev->qplib_res.rcfw = &rdev->rcfw; - rc = bnxt_qplib_get_dev_attr(&rdev->rcfw, &rdev->dev_attr); + rc = bnxt_qplib_get_dev_attr(&rdev->rcfw); if (rc) goto fail; - rc = bnxt_qplib_alloc_res(&rdev->qplib_res, rdev->en_dev->pdev, - rdev->netdev, &rdev->dev_attr); + rc = bnxt_qplib_alloc_res(&rdev->qplib_res, rdev->netdev); if (rc) goto fail; @@ -2160,7 +2170,7 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) rdev->pacing.dbr_pacing = false; } } - rc = bnxt_qplib_get_dev_attr(&rdev->rcfw, &rdev->dev_attr); + rc = bnxt_qplib_get_dev_attr(&rdev->rcfw); if (rc) goto disable_rcfw; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index 96ceec1e8199..02922a0987ad 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -876,14 +876,13 @@ void bnxt_qplib_free_res(struct bnxt_qplib_res *res) bnxt_qplib_free_dpi_tbl(res, &res->dpi_tbl); } -int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct pci_dev *pdev, - struct net_device *netdev, - struct bnxt_qplib_dev_attr *dev_attr) +int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct net_device *netdev) { + struct bnxt_qplib_dev_attr *dev_attr; int rc; - res->pdev = pdev; res->netdev = netdev; + dev_attr = res->dattr; rc = bnxt_qplib_alloc_sgid_tbl(res, &res->sgid_tbl, dev_attr->max_sgid); if (rc) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index 21fb148713a6..f5a48e8fb335 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -424,9 +424,7 @@ int bnxt_qplib_dealloc_dpi(struct bnxt_qplib_res *res, void bnxt_qplib_cleanup_res(struct bnxt_qplib_res *res); int bnxt_qplib_init_res(struct bnxt_qplib_res *res); void bnxt_qplib_free_res(struct bnxt_qplib_res *res); -int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct pci_dev *pdev, - struct net_device *netdev, - struct bnxt_qplib_dev_attr *dev_attr); +int bnxt_qplib_alloc_res(struct bnxt_qplib_res *res, struct net_device *netdev); void bnxt_qplib_free_ctx(struct bnxt_qplib_res *res, struct bnxt_qplib_ctx *ctx); int bnxt_qplib_alloc_ctx(struct bnxt_qplib_res *res, diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index d56cc3330d1b..47ed455b52f0 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -88,9 +88,9 @@ static void bnxt_qplib_query_version(struct bnxt_qplib_rcfw *rcfw, fw_ver[3] = resp.fw_rsvd; } -int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, - struct bnxt_qplib_dev_attr *attr) +int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw) { + struct bnxt_qplib_dev_attr *attr = rcfw->res->dattr; struct creq_query_func_resp resp = {}; struct bnxt_qplib_cmdqmsg msg = {}; struct creq_query_func_resp_sb *sb; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index eafa0c1bc732..e626b05038a1 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -326,8 +326,7 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, 
int bnxt_qplib_update_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, struct bnxt_qplib_gid *gid, u16 gid_idx, const u8 *smac); -int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, - struct bnxt_qplib_dev_attr *attr); +int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw); int bnxt_qplib_set_func_resources(struct bnxt_qplib_res *res, struct bnxt_qplib_rcfw *rcfw, struct bnxt_qplib_ctx *ctx); From 42e6ddda4c17fa0d5120e3723d522649f8fc62fa Mon Sep 17 00:00:00 2001 From: Anumula Murali Mohan Reddy Date: Tue, 7 Jan 2025 15:20:53 +0530 Subject: [PATCH 58/63] RDMA/cxgb4: Notify rdma stack for IB_EVENT_QP_LAST_WQE_REACHED event This patch sends IB_EVENT_QP_LAST_WQE_REACHED event on a QP that is in error state and associated with an SRQ. This behaviour is incorporated in flush_qp() which is called when QP transitions to error state. Supports SRQ drain functionality added by commit 844bc12e6da3 ("IB/core: add support for draining Shared receive queues") Fixes: 844bc12e6da3 ("IB/core: add support for draining Shared receive queues") Signed-off-by: Anumula Murali Mohan Reddy Signed-off-by: Potnuri Bharat Teja Link: https://patch.msgid.link/20250107095053.81007-1-anumula@chelsio.com Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/cxgb4/qp.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 7b5c4522b426..955f061a55e9 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -1599,6 +1599,7 @@ static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp, int count; int rq_flushed = 0, sq_flushed; unsigned long flag; + struct ib_event ev; pr_debug("qhp %p rchp %p schp %p\n", qhp, rchp, schp); @@ -1607,6 +1608,13 @@ static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp, if (schp != rchp) spin_lock(&schp->lock); spin_lock(&qhp->lock); + if (qhp->srq && qhp->attr.state == C4IW_QP_STATE_ERROR && + qhp->ibqp.event_handler) { + ev.device = qhp->ibqp.device; + ev.element.qp = &qhp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + qhp->ibqp.event_handler(&ev, qhp->ibqp.qp_context); + } if (qhp->wq.flushed) { spin_unlock(&qhp->lock); From edc4ef0e0154096d6c0cf5e06af6fc330dbad9d1 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Fri, 10 Jan 2025 17:09:27 +0100 Subject: [PATCH 59/63] RDMA/rxe: Fix the warning "__rxe_cleanup+0x12c/0x170 [rdma_rxe]" The Call Trace is as below: " ? show_regs.cold+0x1a/0x1f ? __rxe_cleanup+0x12c/0x170 [rdma_rxe] ? __warn+0x84/0xd0 ? __rxe_cleanup+0x12c/0x170 [rdma_rxe] ? report_bug+0x105/0x180 ? handle_bug+0x46/0x80 ? exc_invalid_op+0x19/0x70 ? asm_exc_invalid_op+0x1b/0x20 ? __rxe_cleanup+0x12c/0x170 [rdma_rxe] ? __rxe_cleanup+0x124/0x170 [rdma_rxe] rxe_destroy_qp.cold+0x24/0x29 [rdma_rxe] ib_destroy_qp_user+0x118/0x190 [ib_core] rdma_destroy_qp.cold+0x43/0x5e [rdma_cm] rtrs_cq_qp_destroy.cold+0x1d/0x2b [rtrs_core] rtrs_srv_close_work.cold+0x1b/0x31 [rtrs_server] process_one_work+0x21d/0x3f0 worker_thread+0x4a/0x3c0 ? process_one_work+0x3f0/0x3f0 kthread+0xf0/0x120 ? kthread_complete_and_exit+0x20/0x20 ret_from_fork+0x22/0x30 " When too many rdma resources are allocated, rxe needs more time to handle these rdma resources. Sometimes with the current timeout, rxe can not release the rdma resources correctly. Compared with other rdma drivers, a bigger timeout is used. 
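As a rough userspace model of the wait this patch relaxes (pthreads stand in for the kernel completion API; fake_completion and WAIT_MS are invented names, and 50000 ms mirrors the larger timeout used below):

  #include <errno.h>
  #include <pthread.h>
  #include <stdio.h>
  #include <time.h>

  #define WAIT_MS 50000L  /* generous upper bound before giving up */

  struct fake_completion {
          pthread_mutex_t lock;
          pthread_cond_t cond;
          int done;
  };

  static int wait_for_done(struct fake_completion *c, long ms)
  {
          struct timespec ts;
          int err = 0, ret;

          clock_gettime(CLOCK_REALTIME, &ts);
          ts.tv_sec += ms / 1000;
          ts.tv_nsec += (ms % 1000) * 1000000L;
          if (ts.tv_nsec >= 1000000000L) {
                  ts.tv_sec++;
                  ts.tv_nsec -= 1000000000L;
          }

          pthread_mutex_lock(&c->lock);
          while (!c->done && err != ETIMEDOUT)
                  err = pthread_cond_timedwait(&c->cond, &c->lock, &ts);
          /* report a timeout, not a generic error, if references never dropped */
          ret = c->done ? 0 : -ETIMEDOUT;
          pthread_mutex_unlock(&c->lock);
          return ret;
  }

  static void *drop_last_reference(void *arg)
  {
          struct fake_completion *c = arg;

          pthread_mutex_lock(&c->lock);
          c->done = 1;    /* models the last object reference going away */
          pthread_cond_signal(&c->cond);
          pthread_mutex_unlock(&c->lock);
          return NULL;
  }

  int main(void)
  {
          struct fake_completion c = {
                  PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
          };
          pthread_t t;

          pthread_create(&t, NULL, drop_last_reference, &c);
          printf("cleanup wait returned %d\n", wait_for_done(&c, WAIT_MS));
          pthread_join(&t, NULL);
          return 0;
  }

The kernel change expresses the same thing with wait_for_completion_timeout() on the sleepable path and returns -ETIMEDOUT instead of -EINVAL when the wait expires.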
Fixes: 215d0a755e1b ("RDMA/rxe: Stop lookup of partially built objects") Signed-off-by: Zhu Yanjun Link: https://patch.msgid.link/20250110160927.55014-1-yanjun.zhu@linux.dev Tested-by: Joe Klein Signed-off-by: Leon Romanovsky --- drivers/infiniband/sw/rxe/rxe_pool.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index 67567d62195e..d9cb682fd71f 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -178,7 +178,6 @@ int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable) { struct rxe_pool *pool = elem->pool; struct xarray *xa = &pool->xa; - static int timeout = RXE_POOL_TIMEOUT; int ret, err = 0; void *xa_ret; @@ -202,19 +201,19 @@ int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable) * return to rdma-core */ if (sleepable) { - if (!completion_done(&elem->complete) && timeout) { + if (!completion_done(&elem->complete)) { ret = wait_for_completion_timeout(&elem->complete, - timeout); + msecs_to_jiffies(50000)); /* Shouldn't happen. There are still references to * the object but, rather than deadlock, free the * object or pass back to rdma-core. */ if (WARN_ON(!ret)) - err = -EINVAL; + err = -ETIMEDOUT; } } else { - unsigned long until = jiffies + timeout; + unsigned long until = jiffies + RXE_POOL_TIMEOUT; /* AH objects are unique in that the destroy_ah verb * can be called in atomic context. This delay @@ -226,7 +225,7 @@ int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable) mdelay(1); if (WARN_ON(!completion_done(&elem->complete))) - err = -EINVAL; + err = -ETIMEDOUT; } if (pool->cleanup) From 39d772f6654a17ae57656602e801d265e82a2e36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 14 Jan 2025 22:32:13 +0100 Subject: [PATCH 60/63] RDMA/hfi1: Constify 'struct bin_attribute' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sysfs core now allows instances of 'struct bin_attribute' to be moved into read-only memory. Make use of that to protect them against accidental or malicious modifications. Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20250114-sysfs-const-bin_attr-infiniband-v1-1-397aaa94d453@weissschuh.net Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/hfi1/sysfs.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c index d62ba5fdd80c..d94216c7d576 100644 --- a/drivers/infiniband/hw/hfi1/sysfs.c +++ b/drivers/infiniband/hw/hfi1/sysfs.c @@ -27,8 +27,8 @@ static struct hfi1_pportdata *hfi1_get_pportdata_kobj(struct kobject *kobj) * Congestion control table size followed by table entries */ static ssize_t cc_table_bin_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, char *buf, - loff_t pos, size_t count) + const struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) { int ret; struct hfi1_pportdata *ppd = hfi1_get_pportdata_kobj(kobj); @@ -57,7 +57,7 @@ static ssize_t cc_table_bin_read(struct file *filp, struct kobject *kobj, return count; } -static BIN_ATTR_RO(cc_table_bin, PAGE_SIZE); +static const BIN_ATTR_RO(cc_table_bin, PAGE_SIZE); /* * Congestion settings: port control, control map and an array of 16 @@ -65,7 +65,7 @@ static BIN_ATTR_RO(cc_table_bin, PAGE_SIZE); * trigger threshold and the minimum injection rate delay. 
*/ static ssize_t cc_setting_bin_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t count) { struct hfi1_pportdata *ppd = hfi1_get_pportdata_kobj(kobj); @@ -93,9 +93,9 @@ static ssize_t cc_setting_bin_read(struct file *filp, struct kobject *kobj, return count; } -static BIN_ATTR_RO(cc_setting_bin, PAGE_SIZE); +static const BIN_ATTR_RO(cc_setting_bin, PAGE_SIZE); -static struct bin_attribute *port_cc_bin_attributes[] = { +static const struct bin_attribute *const port_cc_bin_attributes[] = { &bin_attr_cc_setting_bin, &bin_attr_cc_table_bin, NULL @@ -134,7 +134,7 @@ static struct attribute *port_cc_attributes[] = { static const struct attribute_group port_cc_group = { .name = "CCMgtA", .attrs = port_cc_attributes, - .bin_attrs = port_cc_bin_attributes, + .bin_attrs_new = port_cc_bin_attributes, }; /* Start sc2vl */ From f5f01c5c409e697c6ae7091ca578bfbd3825e87c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 14 Jan 2025 22:32:14 +0100 Subject: [PATCH 61/63] RDMA/qib: Constify 'struct bin_attribute' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sysfs core now allows instances of 'struct bin_attribute' to be moved into read-only memory. Make use of that to protect them against accidental or malicious modifications. Signed-off-by: Thomas Weißschuh Link: https://patch.msgid.link/20250114-sysfs-const-bin_attr-infiniband-v1-2-397aaa94d453@weissschuh.net Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/qib/qib_sysfs.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_sysfs.c b/drivers/infiniband/hw/qib/qib_sysfs.c index ba2cd68b53e6..805e37dc7621 100644 --- a/drivers/infiniband/hw/qib/qib_sysfs.c +++ b/drivers/infiniband/hw/qib/qib_sysfs.c @@ -214,8 +214,8 @@ static const struct attribute_group port_linkcontrol_group = { * Congestion control table size followed by table entries */ static ssize_t cc_table_bin_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, char *buf, - loff_t pos, size_t count) + const struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) { struct qib_pportdata *ppd = qib_get_pportdata_kobj(kobj); int ret; @@ -241,7 +241,7 @@ static ssize_t cc_table_bin_read(struct file *filp, struct kobject *kobj, return count; } -static BIN_ATTR_RO(cc_table_bin, PAGE_SIZE); +static const BIN_ATTR_RO(cc_table_bin, PAGE_SIZE); /* * Congestion settings: port control, control map and an array of 16 @@ -249,8 +249,8 @@ static BIN_ATTR_RO(cc_table_bin, PAGE_SIZE); * trigger threshold and the minimum injection rate delay. 
*/ static ssize_t cc_setting_bin_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, char *buf, - loff_t pos, size_t count) + const struct bin_attribute *bin_attr, + char *buf, loff_t pos, size_t count) { struct qib_pportdata *ppd = qib_get_pportdata_kobj(kobj); int ret; @@ -274,9 +274,9 @@ static ssize_t cc_setting_bin_read(struct file *filp, struct kobject *kobj, return count; } -static BIN_ATTR_RO(cc_setting_bin, PAGE_SIZE); +static const BIN_ATTR_RO(cc_setting_bin, PAGE_SIZE); -static struct bin_attribute *port_ccmgta_attributes[] = { +static const struct bin_attribute *const port_ccmgta_attributes[] = { &bin_attr_cc_setting_bin, &bin_attr_cc_table_bin, NULL, @@ -295,7 +295,7 @@ static umode_t qib_ccmgta_is_bin_visible(struct kobject *kobj, static const struct attribute_group port_ccmgta_attribute_group = { .name = "CCMgtA", .is_bin_visible = qib_ccmgta_is_bin_visible, - .bin_attrs = port_ccmgta_attributes, + .bin_attrs_new = port_ccmgta_attributes, }; /* Start sl2vl */ From abb604a1a9c87255c7a6f3b784410a9707baf467 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 19 Jan 2025 14:38:25 +0200 Subject: [PATCH 62/63] RDMA/mlx5: Fix a race for an ODP MR which leads to CQE with error This patch addresses a race condition for an ODP MR that can result in a CQE with an error on the UMR QP. During the __mlx5_ib_dereg_mr() flow, the following sequence of calls occurs: mlx5_revoke_mr() mlx5r_umr_revoke_mr() mlx5r_umr_post_send_wait() At this point, the lkey is freed from the hardware's perspective. However, concurrently, mlx5_ib_invalidate_range() might be triggered by another task attempting to invalidate a range for the same freed lkey. This task will: - Acquire the umem_odp->umem_mutex lock. - Call mlx5r_umr_update_xlt() on the UMR QP. - Since the lkey has already been freed, this can lead to a CQE error, causing the UMR QP to enter an error state [1]. To resolve this race condition, the umem_odp->umem_mutex lock is now also acquired as part of the mlx5_revoke_mr() scope. Upon successful revoke, we set umem_odp->private which points to that MR to NULL, preventing any further invalidation attempts on its lkey. [1] From dmesg: infiniband rocep8s0f0: dump_cqe:277:(pid 0): WC error: 6, Message: memory bind operation error cqe_dump: 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 cqe_dump: 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 cqe_dump: 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 cqe_dump: 00000030: 00 00 00 00 08 00 78 06 25 00 11 b9 00 0e dd d2 WARNING: CPU: 15 PID: 1506 at drivers/infiniband/hw/mlx5/umr.c:394 mlx5r_umr_post_send_wait+0x15a/0x2b0 [mlx5_ib] Modules linked in: ip6table_mangle ip6table_natip6table_filter ip6_tables iptable_mangle xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi rdma_cm iw_cm ib_umad ib_ipoib ib_cm mlx5_ib ib_uverbs ib_core fuse mlx5_core CPU: 15 UID: 0 PID: 1506 Comm: ibv_rc_pingpong Not tainted 6.12.0-rc7+ #1626 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:mlx5r_umr_post_send_wait+0x15a/0x2b0 [mlx5_ib] [..] 
Call Trace: mlx5r_umr_update_xlt+0x23c/0x3e0 [mlx5_ib] mlx5_ib_invalidate_range+0x2e1/0x330 [mlx5_ib] __mmu_notifier_invalidate_range_start+0x1e1/0x240 zap_page_range_single+0xf1/0x1a0 madvise_vma_behavior+0x677/0x6e0 do_madvise+0x1a2/0x4b0 __x64_sys_madvise+0x25/0x30 do_syscall_64+0x6b/0x140 entry_SYSCALL_64_after_hwframe+0x76/0x7e Fixes: e6fb246ccafb ("RDMA/mlx5: Consolidate MR destruction to mlx5_ib_dereg_mr()") Cc: stable@vger.kernel.org Link: https://patch.msgid.link/r/68a1e007c25b2b8fe5d625f238cc3b63e5341f77.1737290229.git.leon@kernel.org Signed-off-by: Yishai Hadas Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mr.c | 17 +++++++++++++++-- drivers/infiniband/hw/mlx5/odp.c | 2 ++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 45d9dc9c6c8f..bb02b6adbf2c 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -2021,6 +2021,11 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) { struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); struct mlx5_cache_ent *ent = mr->mmkey.cache_ent; + bool is_odp = is_odp_mr(mr); + int ret = 0; + + if (is_odp) + mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex); if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) { ent = mr->mmkey.cache_ent; @@ -2032,7 +2037,7 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) ent->tmp_cleanup_scheduled = true; } spin_unlock_irq(&ent->mkeys_queue.lock); - return 0; + goto out; } if (ent) { @@ -2041,7 +2046,15 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) mr->mmkey.cache_ent = NULL; spin_unlock_irq(&ent->mkeys_queue.lock); } - return destroy_mkey(dev, mr); + ret = destroy_mkey(dev, mr); +out: + if (is_odp) { + if (!ret) + to_ib_umem_odp(mr->umem)->private = NULL; + mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex); + } + + return ret; } static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr) diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index f2eb940bddc8..f655859eec00 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -268,6 +268,8 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni, if (!umem_odp->npages) goto out; mr = umem_odp->private; + if (!mr) + goto out; start = max_t(u64, ib_umem_start(umem_odp), range->start); end = min_t(u64, ib_umem_end(umem_odp), range->end); From d3d930411ce390e532470194296658a960887773 Mon Sep 17 00:00:00 2001 From: Patrisious Haddad Date: Sun, 19 Jan 2025 10:21:41 +0200 Subject: [PATCH 63/63] RDMA/mlx5: Fix implicit ODP use after free Prevent double queueing of implicit ODP mr destroy work by using __xa_cmpxchg() to make sure this is the only time we are destroying this specific mr. Without this change, we could try to invalidate this mr twice, which in turn could result in queuing a MR work destroy twice, and eventually the second work could execute after the MR was freed due to the first work, causing a use-after-free and the trace below. refcount_t: underflow; use-after-free.
WARNING: CPU: 2 PID: 12178 at lib/refcount.c:28 refcount_warn_saturate+0x12b/0x130 Modules linked in: bonding ib_ipoib vfio_pci ip_gre geneve nf_tables ip6_gre gre ip6_tunnel tunnel6 ipip tunnel4 ib_umad rdma_ucm mlx5_vfio_pci vfio_pci_core vfio_iommu_type1 mlx5_ib vfio ib_uverbs mlx5_core iptable_raw openvswitch nsh rpcrdma ib_iser libiscsi scsi_transport_iscsi rdma_cm iw_cm ib_cm ib_core xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay zram zsmalloc fuse [last unloaded: ib_uverbs] CPU: 2 PID: 12178 Comm: kworker/u20:5 Not tainted 6.5.0-rc1_net_next_mlx5_58c644e #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Workqueue: events_unbound free_implicit_child_mr_work [mlx5_ib] RIP: 0010:refcount_warn_saturate+0x12b/0x130 Code: 48 c7 c7 38 95 2a 82 c6 05 bc c6 fe 00 01 e8 0c 66 aa ff 0f 0b 5b c3 48 c7 c7 e0 94 2a 82 c6 05 a7 c6 fe 00 01 e8 f5 65 aa ff <0f> 0b 5b c3 90 8b 07 3d 00 00 00 c0 74 12 83 f8 01 74 13 8d 50 ff RSP: 0018:ffff8881008e3e40 EFLAGS: 00010286 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000027 RDX: ffff88852c91b5c8 RSI: 0000000000000001 RDI: ffff88852c91b5c0 RBP: ffff8881dacd4e00 R08: 00000000ffffffff R09: 0000000000000019 R10: 000000000000072e R11: 0000000063666572 R12: ffff88812bfd9e00 R13: ffff8881c792d200 R14: ffff88810011c005 R15: ffff8881002099c0 FS: 0000000000000000(0000) GS:ffff88852c900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f5694b5e000 CR3: 00000001153f6003 CR4: 0000000000370ea0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? refcount_warn_saturate+0x12b/0x130 free_implicit_child_mr_work+0x180/0x1b0 [mlx5_ib] process_one_work+0x1cc/0x3c0 worker_thread+0x218/0x3c0 kthread+0xc6/0xf0 ret_from_fork+0x1f/0x30 Fixes: 5256edcb98a1 ("RDMA/mlx5: Rework implicit ODP destroy") Cc: stable@vger.kernel.org Link: https://patch.msgid.link/r/c96b8645a81085abff739e6b06e286a350d1283d.1737274283.git.leon@kernel.org Signed-off-by: Patrisious Haddad Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/odp.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index f655859eec00..f1e23583e6c0 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -228,13 +228,27 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr) unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT; struct mlx5_ib_mr *imr = mr->parent; + /* + * If userspace is racing freeing the parent implicit ODP MR then we can + * loose the race with parent destruction. In this case + * mlx5_ib_free_odp_mr() will free everything in the implicit_children + * xarray so NOP is fine. This child MR cannot be destroyed here because + * we are under its umem_mutex. 
+ */ if (!refcount_inc_not_zero(&imr->mmkey.usecount)) return; - xa_erase(&imr->implicit_children, idx); + xa_lock(&imr->implicit_children); + if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_KERNEL) != + mr) { + xa_unlock(&imr->implicit_children); + return; + } + if (MLX5_CAP_ODP(mr_to_mdev(mr)->mdev, mem_page_fault)) - xa_erase(&mr_to_mdev(mr)->odp_mkeys, - mlx5_base_mkey(mr->mmkey.key)); + __xa_erase(&mr_to_mdev(mr)->odp_mkeys, + mlx5_base_mkey(mr->mmkey.key)); + xa_unlock(&imr->implicit_children); /* Freeing a MR is a sleeping operation, so bounce to a work queue */ INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work); @@ -502,18 +516,18 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, refcount_inc(&ret->mmkey.usecount); goto out_lock; } - xa_unlock(&imr->implicit_children); if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) { - ret = xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key), - &mr->mmkey, GFP_KERNEL); + ret = __xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key), + &mr->mmkey, GFP_KERNEL); if (xa_is_err(ret)) { ret = ERR_PTR(xa_err(ret)); - xa_erase(&imr->implicit_children, idx); - goto out_mr; + __xa_erase(&imr->implicit_children, idx); + goto out_lock; } mr->mmkey.type = MLX5_MKEY_IMPLICIT_CHILD; } + xa_unlock(&imr->implicit_children); mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr); return mr;