From 3cafad43e2dfb4b9cdabe3bb42bbffccf1e45538 Mon Sep 17 00:00:00 2001 From: Don Hiatt Date: Tue, 14 Nov 2017 04:34:16 -0800 Subject: IB/hfi1: Mask the path bits with the LMC for 16B RC Acks 16B packets require that the path bits are masked with the LMC. This mask is done correctly in all 16B header creation but was left out for the RC Acknowledge. Reviewed-by: Ira Weiny Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/rc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index fd01a760259f..e6fa9365de49 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -843,11 +843,11 @@ static inline void hfi1_make_rc_ack_16B(struct rvt_qp *qp, /* Convert dwords to flits */ len = (*hwords + *nwords) >> 1; - hfi1_make_16b_hdr(hdr, - ppd->lid | rdma_ah_get_path_bits(&qp->remote_ah_attr), + hfi1_make_16b_hdr(hdr, ppd->lid | + (rdma_ah_get_path_bits(&qp->remote_ah_attr) & + ((1 << ppd->lmc) - 1)), opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), - 16B), - len, pkey, becn, 0, l4, sc5); + 16B), len, pkey, becn, 0, l4, sc5); bth0 = pkey | (OP(ACKNOWLEDGE) << 24); bth0 |= extra_bytes << 20; -- cgit v1.2.3 From c5c4e40e90b5c714248bcf4e8a63b1a1cd07c11e Mon Sep 17 00:00:00 2001 From: Don Hiatt Date: Tue, 14 Nov 2017 04:34:38 -0800 Subject: IB/CM: Change sgid to IB GID when handling CM request ULPs do not understand OPA GIDs and will reject CM requests if the sgid does not match the local_gid. In order to fix this behavior we convert the OPA GID back to an IB GID. Reviewed-by: Ira Weiny Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index f6b159d79977..1cafa8350c52 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1560,6 +1560,37 @@ static u16 cm_get_bth_pkey(struct cm_work *work) return pkey; } +/** + * Convert OPA SGID to IB SGID + * ULPs (such as IPoIB) do not understand OPA GIDs and will + * reject them as the local_gid will not match the sgid. Therefore, + * change the pathrec's SGID to an IB SGID. 
+ * + * @work: Work completion + * @path: Path record + */ +static void cm_opa_to_ib_sgid(struct cm_work *work, + struct sa_path_rec *path) +{ + struct ib_device *dev = work->port->cm_dev->ib_device; + struct ib_gid_attr gid_attr; + u8 port_num = work->port->port_num; + + if (rdma_cap_opa_ah(dev, port_num) && + (ib_is_opa_gid(&path->sgid))) { + union ib_gid sgid; + + if (ib_get_cached_gid(dev, port_num, 0, + &sgid, &gid_attr)) { + dev_warn(&dev->dev, + "Error updating sgid in CM request\n"); + return; + } + + path->sgid = sgid; + } +} + static void cm_format_req_event(struct cm_work *work, struct cm_id_private *cm_id_priv, struct ib_cm_id *listen_id) @@ -1573,10 +1604,13 @@ static void cm_format_req_event(struct cm_work *work, param->bth_pkey = cm_get_bth_pkey(work); param->port = cm_id_priv->av.port->port_num; param->primary_path = &work->path[0]; - if (cm_req_has_alt_path(req_msg)) + cm_opa_to_ib_sgid(work, param->primary_path); + if (cm_req_has_alt_path(req_msg)) { param->alternate_path = &work->path[1]; - else + cm_opa_to_ib_sgid(work, param->alternate_path); + } else { param->alternate_path = NULL; + } param->remote_ca_guid = req_msg->local_ca_guid; param->remote_qkey = be32_to_cpu(req_msg->local_qkey); param->remote_qpn = be32_to_cpu(cm_req_get_local_qpn(req_msg)); -- cgit v1.2.3 From 69a3ffaa0a43d1c5a34b6f52aa08a5fc3d8d87d4 Mon Sep 17 00:00:00 2001 From: Jan Sokolowski Date: Tue, 14 Nov 2017 04:34:45 -0800 Subject: IB/hfi1: Use 4096 for default active MTU in query_qp Currently, if a port is queried that has an invalid Maximum Transmission Unit, the driver reports a default MTU of 2048. This is incorrect. Use a default value of 4096 instead. Reviewed-by: Dennis Dalessandro Signed-off-by: Jan Sokolowski Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index a38785e224cc..6d27c8594b34 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1486,7 +1486,7 @@ static int query_port(struct rvt_dev_info *rdi, u8 port_num, props->max_mtu = mtu_to_enum((!valid_ib_mtu(hfi1_max_mtu) ? 4096 : hfi1_max_mtu), IB_MTU_4096); props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu : - mtu_to_enum(ppd->ibmtu, IB_MTU_2048); + mtu_to_enum(ppd->ibmtu, IB_MTU_4096); /* * sm_lid of 0xFFFF needs special handling so that it can -- cgit v1.2.3 From 87b3524cb5058fdc7c2afdb92bdb2e079661ddc4 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Tue, 14 Nov 2017 04:34:52 -0800 Subject: IB/qib: Fix comparison error with qperf compare/swap test This failure exists with qib: ver_rc_compare_swap: mismatch, sequence 2, expected 123456789abcdef, got 0 The request builder was using the incorrect inlines to build the request header, resulting in incorrect data in the atomic header. Fix by using the appropriate inlines to create the request. Cc: # 4.9.x+ Fixes: 261a4351844b ("IB/qib,IB/hfi: Use core common header file") Reviewed-by: Michael J. 
Ruhl Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qib/qib_rc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index 8f5754fb8579..e4a9ba1dd9ba 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -434,13 +434,13 @@ no_flow_control: qp->s_state = OP(COMPARE_SWAP); put_ib_ateth_swap(wqe->atomic_wr.swap, &ohdr->u.atomic_eth); - put_ib_ateth_swap(wqe->atomic_wr.compare_add, - &ohdr->u.atomic_eth); + put_ib_ateth_compare(wqe->atomic_wr.compare_add, + &ohdr->u.atomic_eth); } else { qp->s_state = OP(FETCH_ADD); put_ib_ateth_swap(wqe->atomic_wr.compare_add, &ohdr->u.atomic_eth); - put_ib_ateth_swap(0, &ohdr->u.atomic_eth); + put_ib_ateth_compare(0, &ohdr->u.atomic_eth); } put_ib_ateth_vaddr(wqe->atomic_wr.remote_addr, &ohdr->u.atomic_eth); -- cgit v1.2.3 From 3365776ebf6a21d7c4036125fd59b01efc1c060b Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 14 Nov 2017 16:29:36 -0500 Subject: IB/qib: remove redundant setting of any in for-loop The variable any is being set but is never read after this, hence it can be removed from the for-loop initialization. Cleans up clang warning: drivers/infiniband/hw/qib/qib_file_ops.c:640:7: warning: Value stored to 'any' is never read Tested-by: Mike Marciniszyn Signed-off-by: Colin Ian King Signed-off-by: Mike Marciniszyn Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qib/qib_file_ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 2d6a191afec0..b5c2e4223ee7 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -637,7 +637,7 @@ static int qib_set_part_key(struct qib_ctxtdata *rcd, u16 key) ret = -EBUSY; goto bail; } - for (any = i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { if (!ppd->pkeys[i] && atomic_inc_return(&ppd->pkeyrefs[i]) == 1) { rcd->pkeys[pidx] = key; -- cgit v1.2.3 From e3649293b9e0ec0c83e6271f0a5cbe7bf7019cb9 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 14 Nov 2017 16:29:41 -0500 Subject: IB/qib: Cleanup qib_set_part_key() with direct returns Perhaps the function is better written without the empty bail: label and without setting ret and just using return. Combining the int/bool conversion of any and the direct returns makes the resulting code clearer. 
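To illustrate the shape of the refactor, a minimal standalone sketch with made-up names (not the driver code):

    #include <errno.h>

    /* before: every error path funnels through a bail label */
    static int set_key_old(unsigned short lkey)
    {
            int ret;

            if (!lkey) {
                    ret = -EINVAL;
                    goto bail;
            }
            ret = 0;
    bail:
            return ret;
    }

    /* after: direct returns; no ret variable and no bail label */
    static int set_key_new(unsigned short lkey)
    {
            return lkey ? 0 : -EINVAL;
    }
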
Tested-by: Mike Marciniszyn Signed-off-by: Joe Perches Signed-off-by: Mike Marciniszyn Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qib/qib_file_ops.c | 66 ++++++++++++-------------------- 1 file changed, 24 insertions(+), 42 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index b5c2e4223ee7..a9a48b393323 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -568,20 +568,16 @@ done: static int qib_set_part_key(struct qib_ctxtdata *rcd, u16 key) { struct qib_pportdata *ppd = rcd->ppd; - int i, any = 0, pidx = -1; + int i, pidx = -1; + bool any = false; u16 lkey = key & 0x7FFF; - int ret; - if (lkey == (QIB_DEFAULT_P_KEY & 0x7FFF)) { + if (lkey == (QIB_DEFAULT_P_KEY & 0x7FFF)) /* nothing to do; this key always valid */ - ret = 0; - goto bail; - } + return 0; - if (!lkey) { - ret = -EINVAL; - goto bail; - } + if (!lkey) + return -EINVAL; /* * Set the full membership bit, because it has to be @@ -594,18 +590,14 @@ static int qib_set_part_key(struct qib_ctxtdata *rcd, u16 key) for (i = 0; i < ARRAY_SIZE(rcd->pkeys); i++) { if (!rcd->pkeys[i] && pidx == -1) pidx = i; - if (rcd->pkeys[i] == key) { - ret = -EEXIST; - goto bail; - } + if (rcd->pkeys[i] == key) + return -EEXIST; } - if (pidx == -1) { - ret = -EBUSY; - goto bail; - } - for (any = i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { + if (pidx == -1) + return -EBUSY; + for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { if (!ppd->pkeys[i]) { - any++; + any = true; continue; } if (ppd->pkeys[i] == key) { @@ -613,44 +605,34 @@ static int qib_set_part_key(struct qib_ctxtdata *rcd, u16 key) if (atomic_inc_return(pkrefs) > 1) { rcd->pkeys[pidx] = key; - ret = 0; - goto bail; - } else { - /* - * lost race, decrement count, catch below - */ - atomic_dec(pkrefs); - any++; + return 0; } + /* + * lost race, decrement count, catch below + */ + atomic_dec(pkrefs); + any = true; } - if ((ppd->pkeys[i] & 0x7FFF) == lkey) { + if ((ppd->pkeys[i] & 0x7FFF) == lkey) /* * It makes no sense to have both the limited and * full membership PKEY set at the same time since * the unlimited one will disable the limited one. */ - ret = -EEXIST; - goto bail; - } - } + return -EEXIST; } - if (!any) { - ret = -EBUSY; - goto bail; + if (!any) + return -EBUSY; for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) { if (!ppd->pkeys[i] && atomic_inc_return(&ppd->pkeyrefs[i]) == 1) { rcd->pkeys[pidx] = key; ppd->pkeys[i] = key; (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PKEYS, 0); - ret = 0; - goto bail; + return 0; } } - ret = -EBUSY; - -bail: - return ret; + return -EBUSY; } /** -- cgit v1.2.3 From 1c3081a8ad365cc617b2857794adc91a3ade5195 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 19 Nov 2017 19:59:21 +0100 Subject: i40iw: Remove garbage at end of INFINIBAND_I40IW Kconfig section Remove leftover garbage (containing Kconfig dependencies for another symbol?) 
Signed-off-by: Geert Uytterhoeven Acked-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/hw/i40iw/Kconfig b/drivers/infiniband/hw/i40iw/Kconfig index f6d20ba88c03..2962979c06e9 100644 --- a/drivers/infiniband/hw/i40iw/Kconfig +++ b/drivers/infiniband/hw/i40iw/Kconfig @@ -5,4 +5,3 @@ config INFINIBAND_I40IW select GENERIC_ALLOCATOR ---help--- Intel(R) Ethernet X722 iWARP Driver - INET && I40IW && INFINIBAND && I40E -- cgit v1.2.3 From 5b8a3b08b343155a153151a77c59feb7419f1d38 Mon Sep 17 00:00:00 2001 From: Yuval Shaia Date: Sun, 26 Nov 2017 13:51:35 +0200 Subject: RDMA/vmw_pvrdma: Do not re-calculate npages There is no need to re-calculate the number of pages since it is already done in ib_umem_get. Signed-off-by: Yuval Shaia Acked-by: Adit Ranadive Tested-by: Adit Ranadive Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c index 8519f3212e52..fa96fa4fb829 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c @@ -119,10 +119,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, union pvrdma_cmd_resp rsp; struct pvrdma_cmd_create_mr *cmd = &req.create_mr; struct pvrdma_cmd_create_mr_resp *resp = &rsp.create_mr_resp; - int nchunks; int ret; - int entry; - struct scatterlist *sg; if (length == 0 || length > dev->dsr->caps.max_mr_size) { dev_warn(&dev->pdev->dev, "invalid mem region length\n"); @@ -137,13 +134,9 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, return ERR_CAST(umem); } - nchunks = 0; - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) - nchunks += sg_dma_len(sg) >> PAGE_SHIFT; - - if (nchunks < 0 || nchunks > PVRDMA_PAGE_DIR_MAX_PAGES) { + if (umem->npages < 0 || umem->npages > PVRDMA_PAGE_DIR_MAX_PAGES) { dev_warn(&dev->pdev->dev, "overflow %d pages in mem region\n", - nchunks); + umem->npages); ret = -EINVAL; goto err_umem; } @@ -158,7 +151,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mr->mmr.size = length; mr->umem = umem; - ret = pvrdma_page_dir_init(dev, &mr->pdir, nchunks, false); + ret = pvrdma_page_dir_init(dev, &mr->pdir, umem->npages, false); if (ret) { dev_warn(&dev->pdev->dev, "could not allocate page directory\n"); @@ -175,7 +168,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, cmd->length = length; cmd->pd_handle = to_vpd(pd)->pd_handle; cmd->access_flags = access_flags; - cmd->nchunks = nchunks; + cmd->nchunks = umem->npages; cmd->pdir_dma = mr->pdir.dir_dma; ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_MR_RESP); -- cgit v1.2.3 From 40dc8c4df0be05d4b96dc4475c6248125769a577 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 27 Nov 2017 12:38:09 +0100 Subject: i40iw: remove unused 'timeval' struct member There is a stale entry in i40iw_cm_tcp_context that apparently was copied from the 'nes' driver but never used in i40iw. I'm trying to kill off all users of timeval as part of the y2038-safety work, so let's just remove this one. 
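For background, an illustrative sketch of the y2038 concern (not the driver code): struct timeval stores seconds in a long, which overflows in January 2038 on 32-bit targets, while ktime_t is a signed 64-bit nanosecond count:

    /* y2038-unsafe: on 32-bit, tv_sec is a 32-bit long */
    struct timeval tv;
    do_gettimeofday(&tv);           /* deprecated wall-clock interface */

    /* y2038-safe: ktime_t is an s64 count of nanoseconds */
    ktime_t now = ktime_get();      /* monotonic clock */
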
Signed-off-by: Arnd Bergmann Acked-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_cm.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.h b/drivers/infiniband/hw/i40iw/i40iw_cm.h index 0d5840d2c4fc..1577608fc51a 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.h +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.h @@ -276,8 +276,6 @@ struct i40iw_cm_tcp_context { u32 mss; u8 snd_wscale; u8 rcv_wscale; - - struct timeval sent_ts; }; enum i40iw_cm_listener_state { -- cgit v1.2.3 From 261593c1e1fea6cee26ecd548dc682d624c99473 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 27 Nov 2017 12:38:10 +0100 Subject: nes: remove unused 'timeval' struct member There is a stale entry in nes_cm_tcp_context that has apparently never been used in mainline Linux. I'm trying to kill off all users of timeval as part of the y2038-safety work, so let's just remove this one. Signed-off-by: Arnd Bergmann Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/nes/nes_cm.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index d827d03e3941..2cf2641867d8 100644 --- a/drivers/infiniband/hw/nes/nes_cm.h +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -279,7 +279,6 @@ struct nes_cm_tcp_context { u8 rcv_wscale; struct nes_cm_tsa_context tsa_cntxt; - struct timeval sent_ts; }; -- cgit v1.2.3 From f4cd9d588e77c86362ba3c072385cde11c86b611 Mon Sep 17 00:00:00 2001 From: "Gomonovych, Vasyl" Date: Tue, 28 Nov 2017 16:18:07 +0100 Subject: IB/core: Use PTR_ERR_OR_ZERO() Fix ptr_ret.cocci warnings: drivers/infiniband/core/uverbs_cmd.c:1156:1-3: WARNING: PTR_ERR_OR_ZERO can be used Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR Generated by: scripts/coccinelle/api/ptr_ret.cocci Signed-off-by: Vasyl Gomonovych Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 16d55710b116..adc6a2379c76 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1145,10 +1145,7 @@ int ib_uverbs_ex_create_cq(struct ib_uverbs_file *file, min(ucore->inlen, sizeof(cmd)), ib_uverbs_ex_create_cq_cb, NULL); - if (IS_ERR(obj)) - return PTR_ERR(obj); - - return 0; + return PTR_ERR_OR_ZERO(obj); } ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, -- cgit v1.2.3 From c55359a23c23d0ea9af6c0d5d443810551d3b99b Mon Sep 17 00:00:00 2001 From: Yuval Shaia Date: Wed, 29 Nov 2017 11:24:29 +0200 Subject: IB/ipoib: Replace printk with pr_warn pr_* is the preferred way to print messages; replace all printk(KERN_WARNING ...) calls with pr_warn(). 
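For reference, pr_warn() is shorthand for printk(KERN_WARNING ...) and also picks up a per-file pr_fmt() prefix when one is defined; a minimal sketch (the prefix string here is hypothetical):

    #define pr_fmt(fmt) "ipoib: " fmt       /* must precede the include */
    #include <linux/printk.h>

    static void example_warn(void)
    {
            /* old style: level and prefix spelled out by hand */
            printk(KERN_WARNING "ipoib: failed to create CM ID\n");
            /* new style: expands to printk(KERN_WARNING pr_fmt(fmt), ...) */
            pr_warn("failed to create CM ID\n");
    }
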
Signed-off-by: Yuval Shaia Reviewed-by: Zhu Yanjun Reviewed-by: Leon Romanovsky Reviewed-by: Santosh Shilimkar Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 8 ++++---- drivers/infiniband/ulp/ipoib/ipoib_main.c | 23 +++++++++++------------ drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 6 +++--- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 87f4bd99cdf7..c0a596e4c258 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -876,7 +876,7 @@ int ipoib_cm_dev_open(struct net_device *dev) priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev); if (IS_ERR(priv->cm.id)) { - printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name); + pr_warn("%s: failed to create CM ID\n", priv->ca->name); ret = PTR_ERR(priv->cm.id); goto err_cm; } @@ -884,8 +884,8 @@ int ipoib_cm_dev_open(struct net_device *dev) ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num), 0); if (ret) { - printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name, - IPOIB_CM_IETF_ID | priv->qp->qp_num); + pr_warn("%s: failed to listen on ID 0x%llx\n", priv->ca->name, + IPOIB_CM_IETF_ID | priv->qp->qp_num); goto err_listen; } @@ -1562,7 +1562,7 @@ static void ipoib_cm_create_srq(struct net_device *dev, int max_sge) priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr); if (IS_ERR(priv->cm.srq)) { if (PTR_ERR(priv->cm.srq) != -ENOSYS) - printk(KERN_WARNING "%s: failed to allocate SRQ, error %ld\n", + pr_warn("%s: failed to allocate SRQ, error %ld\n", priv->ca->name, PTR_ERR(priv->cm.srq)); priv->cm.srq = NULL; return; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 12b7f911f0e5..6930ee0d63cf 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1663,8 +1663,8 @@ static int ipoib_dev_init_default(struct net_device *dev) priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring); if (!priv->tx_ring) { - printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n", - priv->ca->name, ipoib_sendq_size); + pr_warn("%s: failed to allocate TX ring (%d entries)\n", + priv->ca->name, ipoib_sendq_size); goto out_rx_ring_cleanup; } @@ -2204,8 +2204,7 @@ static struct net_device *ipoib_add_port(const char *format, result = ib_query_port(hca, port, &attr); if (result) { - printk(KERN_WARNING "%s: ib_query_port %d failed\n", - hca->name, port); + pr_warn("%s: ib_query_port %d failed\n", hca->name, port); goto device_init_failed; } @@ -2220,8 +2219,8 @@ static struct net_device *ipoib_add_port(const char *format, result = ib_query_pkey(hca, port, 0, &priv->pkey); if (result) { - printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n", - hca->name, port, result); + pr_warn("%s: ib_query_pkey port %d failed (ret = %d)\n", + hca->name, port, result); goto device_init_failed; } @@ -2238,8 +2237,8 @@ static struct net_device *ipoib_add_port(const char *format, result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL); if (result) { - printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n", - hca->name, port, result); + pr_warn("%s: ib_query_gid port %d failed (ret = %d)\n", + hca->name, port, result); goto device_init_failed; } @@ -2249,8 +2248,8 @@ static struct net_device *ipoib_add_port(const char *format, result = ipoib_dev_init(priv->dev, hca, port); if (result) { - 
printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n", - hca->name, port, result); + pr_warn("%s: failed to initialize port %d (ret = %d)\n", + hca->name, port, result); goto device_init_failed; } @@ -2260,8 +2259,8 @@ static struct net_device *ipoib_add_port(const char *format, result = register_netdev(priv->dev); if (result) { - printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n", - hca->name, port, result); + pr_warn("%s: couldn't register ipoib port %d; error %d\n", + hca->name, port, result); goto register_failed; } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index a1ed25422b72..984a88096f39 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -178,7 +178,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_rx_completion, NULL, priv, &cq_attr); if (IS_ERR(priv->recv_cq)) { - printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name); + pr_warn("%s: failed to create receive CQ\n", ca->name); goto out_cm_dev_cleanup; } @@ -187,7 +187,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) priv->send_cq = ib_create_cq(priv->ca, ipoib_ib_tx_completion, NULL, priv, &cq_attr); if (IS_ERR(priv->send_cq)) { - printk(KERN_WARNING "%s: failed to create send CQ\n", ca->name); + pr_warn("%s: failed to create send CQ\n", ca->name); goto out_free_recv_cq; } @@ -208,7 +208,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca) priv->qp = ib_create_qp(priv->pd, &init_attr); if (IS_ERR(priv->qp)) { - printk(KERN_WARNING "%s: failed to create QP\n", ca->name); + pr_warn("%s: failed to create QP\n", ca->name); goto out_free_send_cq; } -- cgit v1.2.3 From 807e005ab8b3e1be5739e8df7d8c79913ed0ac83 Mon Sep 17 00:00:00 2001 From: Chien Tin Tung Date: Fri, 8 Dec 2017 14:34:28 -0600 Subject: i40iw: Use sqsize to initialize cqp_requests elements Use sqsize instead of I40IW_CQP_SW_SQSIZE_2048 to initialize cqp_requests elements in the for-loop as sqsize is used to allocate memory for cqp_requests. Signed-off-by: Chien Tin Tung Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c index e824296713e2..1f9d9c77d8ce 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_main.c +++ b/drivers/infiniband/hw/i40iw/i40iw_main.c @@ -607,7 +607,7 @@ static enum i40iw_status_code i40iw_create_cqp(struct i40iw_device *iwdev) INIT_LIST_HEAD(&cqp->cqp_avail_reqs); INIT_LIST_HEAD(&cqp->cqp_pending_reqs); /* init the waitq of the cqp_requests and add them to the list */ - for (i = 0; i < I40IW_CQP_SW_SQSIZE_2048; i++) { + for (i = 0; i < sqsize; i++) { init_waitqueue_head(&cqp->cqp_requests[i].waitq); list_add_tail(&cqp->cqp_requests[i].list, &cqp->cqp_avail_reqs); } -- cgit v1.2.3 From 24b9abab15896c30f7e4a8f752245cfd45ce7fbf Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Fri, 8 Dec 2017 14:34:29 -0600 Subject: i40iw: Reinitialize add_sd_cnt add_sd_cnt in info structure passed to i40iw_create_hmc_obj_type must be 0 and since it is modified during the call, it must be reset in the loop. This avoids unnecessarily reprogramming the SDs multiple times with the same values. 
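The underlying pitfall, as a generic sketch (hypothetical names, not the driver code): when one request structure is reused across loop iterations and the callee updates a field in it, that field must be re-initialized on every pass:

    struct obj_info {
            int type;
            int add_cnt;            /* may be bumped by create_obj() */
    };

    int create_obj(struct obj_info *info);  /* updates info->add_cnt */

    void create_all(struct obj_info *info, const int *types, int ntypes)
    {
            int i;

            for (i = 0; i < ntypes; i++) {
                    info->type = types[i];
                    info->add_cnt = 0;      /* reset what the last call changed */
                    create_obj(info);
            }
    }
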
Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c index 1f9d9c77d8ce..3aecf4cbb128 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_main.c +++ b/drivers/infiniband/hw/i40iw/i40iw_main.c @@ -483,6 +483,7 @@ static enum i40iw_status_code i40iw_create_hmc_objs(struct i40iw_device *iwdev, for (i = 0; i < IW_HMC_OBJ_TYPE_NUM; i++) { info.rsrc_type = iw_hmc_obj_types[i]; info.count = dev->hmc_info->hmc_obj[info.rsrc_type].cnt; + info.add_sd_cnt = 0; status = i40iw_create_hmc_obj_type(dev, &info); if (status) { i40iw_pr_err("create obj type %d status = %d\n", -- cgit v1.2.3 From 1b19b95169cd52fe82cd442fec0b279fe25cc838 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 11 Dec 2017 12:45:44 +0100 Subject: IB/mlx5: revisit -Wmaybe-uninitialized warning A warning that I thought I had fixed before occasionally comes back in rare randconfig builds (I found 7 instances in the last 100000 builds, originally it was much more frequent): drivers/infiniband/hw/mlx5/mr.c: In function 'mlx5_ib_reg_user_mr': drivers/infiniband/hw/mlx5/mr.c:1229:5: error: 'order' may be used uninitialized in this function [-Werror=maybe-uninitialized] if (order <= mr_cache_max_order(dev)) { ^ drivers/infiniband/hw/mlx5/mr.c:1247:8: error: 'ncont' may be used uninitialized in this function [-Werror=maybe-uninitialized] drivers/infiniband/hw/mlx5/mr.c:1247:8: error: 'page_shift' may be used uninitialized in this function [-Werror=maybe-uninitialized] drivers/infiniband/hw/mlx5/mr.c:1260:2: error: 'npages' may be used uninitialized in this function [-Werror=maybe-uninitialized] I've looked at all those findings again and noticed that they are all with CONFIG_INFINIBAND_USER_MEM=n, which means ib_umem_get() returns an error unconditionally and we never initialize or use those variables. This triggers a condition in gcc iff mr_umem_get() is partially but not entirely inlined, which in turn depends on the exact combination of optimization settings. This is a known problem with gcc, with no easy solution in the compiler, so this adds another workaround that should be more reliable than my previous attempt. Returning an error from mlx5_ib_reg_user_mr() earlier means that we can completely bypass the logic that caused the warning, the compiler can now see that the variable is never accessed. Fixes: 14ab8896f5d9 ("IB/mlx5: avoid bogus -Wmaybe-uninitialized warning") Signed-off-by: Arnd Bergmann Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index ee0ee1f9994b..ad37d8441fa2 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1206,6 +1206,9 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, int err; bool use_umr = true; + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) + return ERR_PTR(-EINVAL); + mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", start, virt_addr, length, access_flags); -- cgit v1.2.3 From e711f968c49c224527e75203cf121c7a69309030 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 11 Dec 2017 15:05:05 +0200 Subject: IB/srp: replace custom implementation of hex2bin() There is no need to have a duplication of the generic library, i.e. hex2bin(). 
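For reference, hex2bin() parses 2*count ASCII hex characters into count bytes, returning 0 on success and a negative value on bad input; a minimal sketch of the calling pattern:

    u8 raw[16];
    /* 32 hex characters in, 16 bytes out */
    int ret = hex2bin(raw, "fe800000000000000002c90300ef5ee2", 16);
    if (ret < 0)
            return ret;
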
Replace the open coded variant. Signed-off-by: Andy Shevchenko Acked-by: Bart Van Assche Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/srp/ib_srp.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 972d4b3c5223..62d88212c1b0 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -3110,7 +3110,6 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) { char *options, *sep_opt; char *p; - char dgid[3]; substring_t args[MAX_OPT_ARGS]; int opt_mask = 0; int token; @@ -3162,16 +3161,10 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) goto out; } - for (i = 0; i < 16; ++i) { - strlcpy(dgid, p + i * 2, sizeof(dgid)); - if (sscanf(dgid, "%hhx", - &target->orig_dgid.raw[i]) < 1) { - ret = -EINVAL; - kfree(p); - goto out; - } - } + ret = hex2bin(target->orig_dgid.raw, p, 16); kfree(p); + if (ret < 0) + goto out; break; case SRP_OPT_PKEY: -- cgit v1.2.3 From 302784729e7fb29d5888686fe83b42bb18f81ab8 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Tue, 14 Nov 2017 18:20:56 +0530 Subject: RDMA/bnxt_re: Remove redundant bnxt_qplib_disable_nq() call The bnxt_qplib_disable_nq() call is redundant as it occurs after 'goto fail' and hence it is called twice. Remove it. Signed-off-by: Arvind Yadav Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 61764f7aa79b..eb7195c20b88 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -410,7 +410,6 @@ int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq, if (rc) { dev_err(&nq->pdev->dev, "Failed to request IRQ for NQ: %#x", rc); - bnxt_qplib_disable_nq(nq); goto fail; } -- cgit v1.2.3 From ac6dbf7fa4707c75a247b540cc0b5c881f3d0ba8 Mon Sep 17 00:00:00 2001 From: Yuval Shaia Date: Wed, 29 Nov 2017 08:34:02 +0200 Subject: IB/ipoib: Warn when one port fails to initialize If one port fails to initialize, an error message should indicate the reason, and the driver should continue serving the working port(s) and other HCA(s). Fixes: e4b2d06892c7 ("IB/ipoib: Remove device when one port fails to init"). 
Signed-off-by: Yuval Shaia Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 6930ee0d63cf..567dbd45817c 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -2196,8 +2196,10 @@ static struct net_device *ipoib_add_port(const char *format, int result = -ENOMEM; priv = ipoib_intf_alloc(hca, port, format); - if (!priv) + if (!priv) { + pr_warn("%s, %d: ipoib_intf_alloc failed\n", hca->name, port); goto alloc_mem_failed; + } SET_NETDEV_DEV(priv->dev, hca->dev.parent); priv->dev->dev_id = port - 1; @@ -2325,8 +2327,7 @@ static void ipoib_add_one(struct ib_device *device) } if (!count) { - pr_err("Failed to init port, removing it\n"); - ipoib_remove_one(device, dev_list); + kfree(dev_list); return; } -- cgit v1.2.3 From 302d6424e4a293a5761997e6c9fc3dfb1e4c355f Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 29 Nov 2017 09:47:33 +0100 Subject: RDMA/iwpm: Fix uninitialized error code in iwpm_send_mapinfo() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With gcc-4.1.2: drivers/infiniband/core/iwpm_util.c: In function ‘iwpm_send_mapinfo’: drivers/infiniband/core/iwpm_util.c:647: warning: ‘ret’ may be used uninitialized in this function Indeed, if nl_client is not found in any of the scanned hash buckets, ret will be used uninitialized. Preinitialize ret to -EINVAL to fix this. Fixes: 30dc5e63d6a5ad24 ("RDMA/core: Add support for iWARP Port Mapper user space service") Signed-off-by: Geert Uytterhoeven Reviewed-by: Tatyana Nikolova Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/iwpm_util.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index 3c4faadb8cdd..81528f64061a 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -654,6 +654,7 @@ int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid) } skb_num++; spin_lock_irqsave(&iwpm_mapinfo_lock, flags); + ret = -EINVAL; for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) { hlist_for_each_entry(map_info, &iwpm_hash_bucket[i], hlist_node) { -- cgit v1.2.3 From f8109d9e7de550a28d620144b0fe31e661e02bcb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 27 Nov 2017 12:44:53 +0100 Subject: infiniband: cxgb4: use ktime_get for timestamps The debugfs file prints the difference between host timestamps as a seconds/nanoseconds tuple, along with a 64-bit nanoseconds hardware timestamp. The host time is read using getnstimeofday() which is deprecated because of the y2038 overflow, and it suffers from time jumps during settimeofday() and leap seconds. Converting to ktime_get_ts64() would solve those two, but I'm going a little further here by changing to ktime_get() and printing 64-bit nanoseconds on both host and hw timestamps. This simplifies the code further and makes the output easier to understand. The format of the debugfs file obviously changes here, but this should only be read by humans and not scripts, so I assume it's fine. 
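A sketch of the simplification (illustrative only): subtracting two struct timespec values leaves two fields to print, whereas ktime_t subtraction collapses to a single s64 nanosecond value:

    #include <linux/ktime.h>
    #include <linux/seq_file.h>

    /* one signed 64-bit delta replaces the timespec_sub() sec/nsec pair */
    static void show_delta(struct seq_file *seq, ktime_t k0, ktime_t k1)
    {
            seq_printf(seq, "nsec %lld\n", ktime_to_ns(ktime_sub(k1, k0)));
    }
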
Signed-off-by: Arnd Bergmann Acked-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/device.c | 34 +++++++++++++++------------------- drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 4 ++-- drivers/infiniband/hw/cxgb4/qp.c | 6 +++--- drivers/infiniband/hw/cxgb4/t4.h | 4 ++-- 4 files changed, 22 insertions(+), 26 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index af77d128d242..cbdf35ea0cbe 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -108,19 +108,19 @@ void c4iw_log_wr_stats(struct t4_wq *wq, struct t4_cqe *cqe) idx = (atomic_inc_return(&wq->rdev->wr_log_idx) - 1) & (wq->rdev->wr_log_size - 1); le.poll_sge_ts = cxgb4_read_sge_timestamp(wq->rdev->lldi.ports[0]); - getnstimeofday(&le.poll_host_ts); + le.poll_host_time = ktime_get(); le.valid = 1; le.cqe_sge_ts = CQE_TS(cqe); if (SQ_TYPE(cqe)) { le.qid = wq->sq.qid; le.opcode = CQE_OPCODE(cqe); - le.post_host_ts = wq->sq.sw_sq[wq->sq.cidx].host_ts; + le.post_host_time = wq->sq.sw_sq[wq->sq.cidx].host_time; le.post_sge_ts = wq->sq.sw_sq[wq->sq.cidx].sge_ts; le.wr_id = CQE_WRID_SQ_IDX(cqe); } else { le.qid = wq->rq.qid; le.opcode = FW_RI_RECEIVE; - le.post_host_ts = wq->rq.sw_rq[wq->rq.cidx].host_ts; + le.post_host_time = wq->rq.sw_rq[wq->rq.cidx].host_time; le.post_sge_ts = wq->rq.sw_rq[wq->rq.cidx].sge_ts; le.wr_id = CQE_WRID_MSN(cqe); } @@ -130,9 +130,9 @@ void c4iw_log_wr_stats(struct t4_wq *wq, struct t4_cqe *cqe) static int wr_log_show(struct seq_file *seq, void *v) { struct c4iw_dev *dev = seq->private; - struct timespec prev_ts = {0, 0}; + ktime_t prev_time; struct wr_log_entry *lep; - int prev_ts_set = 0; + int prev_time_set = 0; int idx, end; #define ts2ns(ts) div64_u64((ts) * dev->rdev.lldi.cclk_ps, 1000) @@ -145,33 +145,29 @@ static int wr_log_show(struct seq_file *seq, void *v) lep = &dev->rdev.wr_log[idx]; while (idx != end) { if (lep->valid) { - if (!prev_ts_set) { - prev_ts_set = 1; - prev_ts = lep->poll_host_ts; + if (!prev_time_set) { + prev_time_set = 1; + prev_time = lep->poll_host_time; } - seq_printf(seq, "%04u: sec %lu nsec %lu qid %u opcode " - "%u %s 0x%x host_wr_delta sec %lu nsec %lu " + seq_printf(seq, "%04u: nsec %llu qid %u opcode " + "%u %s 0x%x host_wr_delta nsec %llu " "post_sge_ts 0x%llx cqe_sge_ts 0x%llx " "poll_sge_ts 0x%llx post_poll_delta_ns %llu " "cqe_poll_delta_ns %llu\n", idx, - timespec_sub(lep->poll_host_ts, - prev_ts).tv_sec, - timespec_sub(lep->poll_host_ts, - prev_ts).tv_nsec, + ktime_to_ns(ktime_sub(lep->poll_host_time, + prev_time)), lep->qid, lep->opcode, lep->opcode == FW_RI_RECEIVE ? 
"msn" : "wrid", lep->wr_id, - timespec_sub(lep->poll_host_ts, - lep->post_host_ts).tv_sec, - timespec_sub(lep->poll_host_ts, - lep->post_host_ts).tv_nsec, + ktime_to_ns(ktime_sub(lep->poll_host_time, + lep->post_host_time)), lep->post_sge_ts, lep->cqe_sge_ts, lep->poll_sge_ts, ts2ns(lep->poll_sge_ts - lep->post_sge_ts), ts2ns(lep->poll_sge_ts - lep->cqe_sge_ts)); - prev_ts = lep->poll_host_ts; + prev_time = lep->poll_host_time; } idx++; if (idx > (dev->rdev.wr_log_size - 1)) diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 470f97a79ebb..37ff2ad8a3d4 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -153,8 +153,8 @@ struct c4iw_hw_queue { }; struct wr_log_entry { - struct timespec post_host_ts; - struct timespec poll_host_ts; + ktime_t post_host_time; + ktime_t poll_host_time; u64 post_sge_ts; u64 cqe_sge_ts; u64 poll_sge_ts; diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 5ee7fe433136..8018533ab705 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -975,7 +975,7 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, if (c4iw_wr_log) { swsqe->sge_ts = cxgb4_read_sge_timestamp( qhp->rhp->rdev.lldi.ports[0]); - getnstimeofday(&swsqe->host_ts); + swsqe->host_time = ktime_get(); } init_wr_hdr(wqe, qhp->wq.sq.pidx, fw_opcode, fw_flags, len16); @@ -1045,8 +1045,8 @@ int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].sge_ts = cxgb4_read_sge_timestamp( qhp->rhp->rdev.lldi.ports[0]); - getnstimeofday( - &qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].host_ts); + qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].host_time = + ktime_get(); } wqe->recv.opcode = FW_RI_RECV_WR; diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h index e9ea94268d51..e12377cc024c 100644 --- a/drivers/infiniband/hw/cxgb4/t4.h +++ b/drivers/infiniband/hw/cxgb4/t4.h @@ -271,7 +271,7 @@ struct t4_swsqe { int signaled; u16 idx; int flushed; - struct timespec host_ts; + ktime_t host_time; u64 sge_ts; }; @@ -312,7 +312,7 @@ struct t4_sq { struct t4_swrqe { u64 wr_id; - struct timespec host_ts; + ktime_t host_time; u64 sge_ts; }; -- cgit v1.2.3 From 0cb65d421a2ad714d336e673d448184ea51bb66a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 30 Nov 2017 13:30:06 +0000 Subject: iw_cxgb4: make pointer reg_workq static The pointer reg_workq is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warning: drivers/infiniband/hw/cxgb4/device.c:69:25: warning: symbol 'reg_workq' was not declared. Should it be static? 
Fixes: 1c8f1da5d851 ("iw_cxgb4: Fix possible circular dependency locking warning") Signed-off-by: Colin Ian King Reviewed-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index cbdf35ea0cbe..7a9d0de89d6a 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -66,7 +66,7 @@ MODULE_PARM_DESC(c4iw_wr_log_size_order, static LIST_HEAD(uld_ctx_list); static DEFINE_MUTEX(dev_mutex); -struct workqueue_struct *reg_workq; +static struct workqueue_struct *reg_workq; #define DB_FC_RESUME_SIZE 64 #define DB_FC_RESUME_DELAY 1 -- cgit v1.2.3 From ccc04cdd55d8cfa7376ba96a037ba0f1b3df33c7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 5 Dec 2017 17:36:54 +0300 Subject: RDMA/cxgb4: Add a sanity check in process_work() The story is that Smatch marks skb->data as untrusted so it generates a warning message here: drivers/infiniband/hw/cxgb4/cm.c:4100 process_work() error: buffer overflow 'work_handlers' 241 <= 255 In other places which handle this such as t4_uld_rx_handler() there is some checking to make sure that the function pointer is not NULL. I have added bounds checking and a check for NULL here as well. Signed-off-by: Dan Carpenter Acked-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/cm.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 21db3b48a617..844c9e78df8b 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -4097,9 +4097,15 @@ static void process_work(struct work_struct *work) dev = *((struct c4iw_dev **) (skb->cb + sizeof(void *))); opcode = rpl->ot.opcode; - ret = work_handlers[opcode](dev, skb); - if (!ret) + if (opcode >= ARRAY_SIZE(work_handlers) || + !work_handlers[opcode]) { + pr_err("No handler for opcode 0x%x.\n", opcode); kfree_skb(skb); + } else { + ret = work_handlers[opcode](dev, skb); + if (!ret) + kfree_skb(skb); + } process_timedout_eps(); } } -- cgit v1.2.3 From 54a6d63f14bdb4e899bbb4128d32717074d13862 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 5 Dec 2017 17:39:23 +0300 Subject: IB/mlx4: Potential buffer overflow in _mlx4_set_path() Smatch complains about this code: drivers/infiniband/hw/mlx4/qp.c:1827 _mlx4_set_path() error: buffer overflow 'dev->dev->caps.gid_table_len' 3 <= 255 The mlx4_ib_gid_index_to_real_index() does check that "port" is within bounds, but we don't check the return value for errors. It seems simple enough to add a check for that. Signed-off-by: Dan Carpenter Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/qp.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 013049bcdb53..817257f105aa 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -1824,6 +1824,8 @@ static int _mlx4_set_path(struct mlx4_ib_dev *dev, mlx4_ib_gid_index_to_real_index(dev, port, grh->sgid_index); + if (real_sgid_index < 0) + return real_sgid_index; if (real_sgid_index >= dev->dev->caps.gid_table_len[port]) { pr_err("sgid_index (%u) too large. 
max is %d\n", real_sgid_index, dev->dev->caps.gid_table_len[port] - 1); -- cgit v1.2.3 From 8b0c05dc9380ea3e5d42e0df0a79aeb4971220c5 Mon Sep 17 00:00:00 2001 From: "Andrew F. Davis" Date: Tue, 5 Dec 2017 13:15:53 -0600 Subject: IB/ocrdma: Remove unneeded conversions to bool Found with scripts/coccinelle/misc/boolconv.cocci. Signed-off-by: Andrew F. Davis Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 7866fd8051f6..b5f1dd33b73e 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -461,7 +461,7 @@ retry: static inline int is_ucontext_pd(struct ocrdma_ucontext *uctx, struct ocrdma_pd *pd) { - return (uctx->cntxt_pd == pd ? true : false); + return (uctx->cntxt_pd == pd); } static int _ocrdma_dealloc_pd(struct ocrdma_dev *dev, -- cgit v1.2.3 From 1060f86534147c2830db4bbc9dd849d1892a611b Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 14 Nov 2017 14:51:49 +0200 Subject: IB/{core/cm}: Fix generating a return AH for RoCEE When computing a UD reverse path (return AH) from a WC, the code was not doing a route lookup anchored in a specific netdevice. This caused several bugs, including broken IPv6 link-local address support in RoCEv2. [1] This fixes the lookup by determining the GID table entry that the HW matched to the SGID for the WC and then using the netdevice from that entry to perform the route and ND lookup for the 'DGID' to build a return AH. RoCE GID table management ensures that the right upper netdevices of the physical netdevices are added. Therefore init_ah_from_wc doesn't need to perform such a check. Now that the route lookup is done based on the netdevice of the GID entry, the code is simplified to not carry ifindex and vlan pointers. As part of that, the netdevice is passed as an input parameter. This is already discussed at [2]. Finally, ib_init_ah_from_wc() resolves the dmac for a unicast GID in a similar way to what ib_resolve_eth_dmac() does. So ib_resolve_eth_dmac() is refactored to handle unicast and non-unicast GIDs separately, so that it can be reused by ib_init_ah_from_wc. While refactoring ib_resolve_eth_dmac(), it is further simplified: (a) hoplimit is no longer an optional parameter, as the only caller always queries it; (b) an empty line is cleaned up; (c) zero initialization of ret is avoided; (d) the symbol is no longer exported, as only the ib core uses it. For IPv6, this is tested using a simple rping test as below. 
rping -sv -a ::0 rping -c -a fe80::268a:7ff:fe55:4661%ens2f1 -C 1 -v -d [1] https://www.spinics.net/lists/linux-rdma/msg45690.html [2] https://www.spinics.net/lists/linux-rdma/msg45710.html Signed-off-by: Parav Pandit Reviewed-by: Matan Barak Reviewed-by: Mark Bloch Reported-by: Roland Dreier Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 24 ++----- drivers/infiniband/core/verbs.c | 136 +++++++++++++++++----------------------- include/rdma/ib_addr.h | 2 +- 3 files changed, 64 insertions(+), 98 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index f4e8185bccd3..da4469c38eac 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -761,27 +761,23 @@ static void resolve_cb(int status, struct sockaddr *src_addr, int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, - u8 *dmac, u16 *vlan_id, int *if_index, + u8 *dmac, const struct net_device *ndev, int *hoplimit) { - int ret = 0; struct rdma_dev_addr dev_addr; struct resolve_cb_context ctx; - struct net_device *dev; - union { struct sockaddr _sockaddr; struct sockaddr_in _sockaddr_in; struct sockaddr_in6 _sockaddr_in6; } sgid_addr, dgid_addr; - + int ret; rdma_gid2ip(&sgid_addr._sockaddr, sgid); rdma_gid2ip(&dgid_addr._sockaddr, dgid); memset(&dev_addr, 0, sizeof(dev_addr)); - if (if_index) - dev_addr.bound_dev_if = *if_index; + dev_addr.bound_dev_if = ndev->ifindex; dev_addr.net = &init_net; ctx.addr = &dev_addr; @@ -798,19 +794,9 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, return ret; memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN); - dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if); - if (!dev) - return -ENODEV; - if (if_index) - *if_index = dev_addr.bound_dev_if; - if (vlan_id) - *vlan_id = rdma_vlan_dev_vlan_id(dev); - if (hoplimit) - *hoplimit = dev_addr.hoplimit; - dev_put(dev); - return ret; + *hoplimit = dev_addr.hoplimit; + return 0; } -EXPORT_SYMBOL(rdma_addr_find_l2_eth_by_grh); int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id) { diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 3fb8fb6cc824..54b56c4fcc38 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -481,6 +481,40 @@ int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr, } EXPORT_SYMBOL(ib_get_gids_from_rdma_hdr); +/* Resolve destination mac address and hop limit for unicast destination + * GID entry, considering the source GID entry as well. + * ah_attribute must have valid port_num, sgid_index. + */ +static int ib_resolve_unicast_gid_dmac(struct ib_device *device, + struct rdma_ah_attr *ah_attr) +{ + struct ib_gid_attr sgid_attr; + struct ib_global_route *grh; + int hop_limit = 0xff; + union ib_gid sgid; + int ret; + + grh = rdma_ah_retrieve_grh(ah_attr); + + ret = ib_query_gid(device, + rdma_ah_get_port_num(ah_attr), + grh->sgid_index, + &sgid, &sgid_attr); + if (ret || !sgid_attr.ndev) { + if (!ret) + ret = -ENXIO; + return ret; + } + + ret = rdma_addr_find_l2_eth_by_grh(&sgid, &grh->dgid, + ah_attr->roce.dmac, + sgid_attr.ndev, &hop_limit); + dev_put(sgid_attr.ndev); + + grh->hop_limit = hop_limit; + return ret; +} + /* * This function creates ah from the incoming packet. 
* Incoming packet has dgid of the receiver node on which this code is @@ -490,9 +524,6 @@ EXPORT_SYMBOL(ib_get_gids_from_rdma_hdr); * as sgid and, sgid is used as dgid because sgid contains destinations * GID whom to respond to. * - * This is why when calling rdma_addr_find_l2_eth_by_grh() function, the - * position of arguments dgid and sgid do not match the order of the - * parameters. */ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, const struct ib_wc *wc, const struct ib_grh *grh, @@ -523,57 +554,33 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, if (ret) return ret; + rdma_ah_set_sl(ah_attr, wc->sl); + rdma_ah_set_port_num(ah_attr, port_num); + if (rdma_protocol_roce(device, port_num)) { - int if_index = 0; u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ? wc->vlan_id : 0xffff; - struct net_device *idev; - struct net_device *resolved_dev; if (!(wc->wc_flags & IB_WC_GRH)) return -EPROTOTYPE; - if (!device->get_netdev) - return -EOPNOTSUPP; - - idev = device->get_netdev(device, port_num); - if (!idev) - return -ENODEV; - - ret = rdma_addr_find_l2_eth_by_grh(&dgid, &sgid, - ah_attr->roce.dmac, - wc->wc_flags & IB_WC_WITH_VLAN ? - NULL : &vlan_id, - &if_index, &hoplimit); - if (ret) { - dev_put(idev); - return ret; - } - - resolved_dev = dev_get_by_index(&init_net, if_index); - rcu_read_lock(); - if (resolved_dev != idev && !rdma_is_upper_dev_rcu(idev, - resolved_dev)) - ret = -EHOSTUNREACH; - rcu_read_unlock(); - dev_put(idev); - dev_put(resolved_dev); - if (ret) - return ret; - - ret = get_sgid_index_from_eth(device, port_num, vlan_id, - &dgid, gid_type, &gid_index); + ret = get_sgid_index_from_eth(device, port_num, + vlan_id, &dgid, + gid_type, &gid_index); if (ret) return ret; - } - rdma_ah_set_dlid(ah_attr, wc->slid); - rdma_ah_set_sl(ah_attr, wc->sl); - rdma_ah_set_path_bits(ah_attr, wc->dlid_path_bits); - rdma_ah_set_port_num(ah_attr, port_num); + flow_class = be32_to_cpu(grh->version_tclass_flow); + rdma_ah_set_grh(ah_attr, &sgid, + flow_class & 0xFFFFF, + (u8)gid_index, hoplimit, + (flow_class >> 20) & 0xFF); + return ib_resolve_unicast_gid_dmac(device, ah_attr); + } else { + rdma_ah_set_dlid(ah_attr, wc->slid); + rdma_ah_set_path_bits(ah_attr, wc->dlid_path_bits); - if (wc->wc_flags & IB_WC_GRH) { - if (!rdma_cap_eth_ah(device, port_num)) { + if (wc->wc_flags & IB_WC_GRH) { if (dgid.global.interface_id != cpu_to_be64(IB_SA_WELL_KNOWN_GUID)) { ret = ib_find_cached_gid_by_port(device, &dgid, IB_GID_TYPE_IB, @@ -584,16 +591,15 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, } else { gid_index = 0; } - } - - flow_class = be32_to_cpu(grh->version_tclass_flow); - rdma_ah_set_grh(ah_attr, &sgid, - flow_class & 0xFFFFF, - (u8)gid_index, hoplimit, - (flow_class >> 20) & 0xFF); + flow_class = be32_to_cpu(grh->version_tclass_flow); + rdma_ah_set_grh(ah_attr, &sgid, + flow_class & 0xFFFFF, + (u8)gid_index, hoplimit, + (flow_class >> 20) & 0xFF); + } + return 0; } - return 0; } EXPORT_SYMBOL(ib_init_ah_from_wc); @@ -1290,34 +1296,8 @@ static int ib_resolve_eth_dmac(struct ib_device *device, (char *)ah_attr->roce.dmac); } } else { - union ib_gid sgid; - struct ib_gid_attr sgid_attr; - int ifindex; - int hop_limit; - - ret = ib_query_gid(device, - rdma_ah_get_port_num(ah_attr), - grh->sgid_index, - &sgid, &sgid_attr); - - if (ret || !sgid_attr.ndev) { - if (!ret) - ret = -ENXIO; - goto out; - } - - ifindex = sgid_attr.ndev->ifindex; - - ret = - rdma_addr_find_l2_eth_by_grh(&sgid, &grh->dgid, - ah_attr->roce.dmac, - NULL, &ifindex, &hop_limit); - - 
dev_put(sgid_attr.ndev); - - grh->hop_limit = hop_limit; + ret = ib_resolve_unicast_gid_dmac(device, ah_attr); } -out: return ret; } diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index 18c564f60e93..d5c3bbb84608 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -134,7 +134,7 @@ int rdma_addr_size(struct sockaddr *addr); int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id); int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, - u8 *smac, u16 *vlan_id, int *if_index, + u8 *dmac, const struct net_device *ndev, int *hoplimit); static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr) -- cgit v1.2.3 From 56d0a7d9a0f045ee27a001762deac28c7d28e2e4 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 14 Nov 2017 14:51:50 +0200 Subject: IB/core: Depend on IPv6 stack to resolve link local address for RoCEv2 RoCEv1 does not use the IPv6 stack to resolve the link local DGID since it uses the GID address. It forms the DMAC directly from the DGID. The code became confused and also tried to use this bypass for RoCEv2 packets; however, RoCEv2 always uses an IP address in the GID and must always use ARP or neighbor discovery to get the DMAC address. Now that rdma_addr_find_l2_eth_by_grh() supports resolving a link local address to find the destination mac address, let's make use of it. This aligns it to how the rest of the IPv6 stack resolves link local destination IPv6 addresses. Signed-off-by: Parav Pandit Reviewed-by: Eli Cohen Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/verbs.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 54b56c4fcc38..75ebd74f8bbd 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -506,9 +506,20 @@ static int ib_resolve_unicast_gid_dmac(struct ib_device *device, return ret; } + /* If destination is link local and source GID is RoCEv1, + * IP stack is not used. + */ + if (rdma_link_local_addr((struct in6_addr *)grh->dgid.raw) && + sgid_attr.gid_type == IB_GID_TYPE_ROCE) { + rdma_get_ll_mac((struct in6_addr *)grh->dgid.raw, + ah_attr->roce.dmac); + goto done; + } + ret = rdma_addr_find_l2_eth_by_grh(&sgid, &grh->dgid, ah_attr->roce.dmac, sgid_attr.ndev, &hop_limit); +done: dev_put(sgid_attr.ndev); grh->hop_limit = hop_limit; @@ -1280,11 +1291,6 @@ static int ib_resolve_eth_dmac(struct ib_device *device, grh = rdma_ah_retrieve_grh(ah_attr); - if (rdma_link_local_addr((struct in6_addr *)grh->dgid.raw)) { - rdma_get_ll_mac((struct in6_addr *)grh->dgid.raw, - ah_attr->roce.dmac); - return 0; - } if (rdma_is_multicast_addr((struct in6_addr *)ah_attr->grh.dgid.raw)) { if (ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw)) { __be32 addr = 0; -- cgit v1.2.3 From df8441c668f70573f4706bb59a18fc67dc5d20dc Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 14 Nov 2017 14:51:51 +0200 Subject: IB/core: Avoid exporting module internal function ib_security_modify_qp and ib_security_pkey_access are core internal functions, so avoid exporting them. ib_security_pkey_access is used only when security hooks are enabled, so avoid defining it otherwise. 
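For context, a generic sketch of the linkage change (ib_helper is a made-up name, and the two definitions are a before/after pair, not one file):

    /* before: EXPORT_SYMBOL makes the function callable from any module */
    int ib_helper(int x)
    {
            return x + 1;
    }
    EXPORT_SYMBOL(ib_helper);

    /* after: static confines it to this file, i.e. to ib_core itself */
    static int ib_helper(int x)
    {
            return x + 1;
    }
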
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/core_priv.h | 13 ------------- drivers/infiniband/core/security.c | 10 ++++------ 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index a1d687a664f8..6c4541af54bb 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -213,11 +213,6 @@ int ib_get_cached_subnet_prefix(struct ib_device *device, u64 *sn_pfx); #ifdef CONFIG_SECURITY_INFINIBAND -int ib_security_pkey_access(struct ib_device *dev, - u8 port_num, - u16 pkey_index, - void *sec); - void ib_security_destroy_port_pkey_list(struct ib_device *device); void ib_security_cache_change(struct ib_device *device, @@ -240,14 +235,6 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent, void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent); int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index); #else -static inline int ib_security_pkey_access(struct ib_device *dev, - u8 port_num, - u16 pkey_index, - void *sec) -{ - return 0; -} - static inline void ib_security_destroy_port_pkey_list(struct ib_device *device) { } diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index 23278ed5be45..209d0574fc1f 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -620,12 +620,11 @@ int ib_security_modify_qp(struct ib_qp *qp, } return ret; } -EXPORT_SYMBOL(ib_security_modify_qp); -int ib_security_pkey_access(struct ib_device *dev, - u8 port_num, - u16 pkey_index, - void *sec) +static int ib_security_pkey_access(struct ib_device *dev, + u8 port_num, + u16 pkey_index, + void *sec) { u64 subnet_prefix; u16 pkey; @@ -642,7 +641,6 @@ int ib_security_pkey_access(struct ib_device *dev, return security_ib_pkey_access(sec, subnet_prefix, pkey); } -EXPORT_SYMBOL(ib_security_pkey_access); static int ib_mad_agent_security_change(struct notifier_block *nb, unsigned long event, -- cgit v1.2.3 From d7c0557a10eeff5177deb3c1733383b139e1fcae Mon Sep 17 00:00:00 2001 From: Erez Alfasi Date: Tue, 14 Nov 2017 14:51:52 +0200 Subject: IB/mlx4: Remove unused ibpd parameter Remove unused ibpd parameter from create_qp_rss() function. 
Signed-off-by: Erez Alfasi Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/qp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 817257f105aa..16dea0e3d50f 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -725,7 +725,7 @@ static int set_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_rss *rss_ctx, return 0; } -static int create_qp_rss(struct mlx4_ib_dev *dev, struct ib_pd *ibpd, +static int create_qp_rss(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *init_attr, struct mlx4_ib_create_qp_rss *ucmd, struct mlx4_ib_qp *qp) @@ -848,7 +848,7 @@ static struct ib_qp *_mlx4_ib_create_qp_rss(struct ib_pd *pd, qp->pri.vid = 0xFFFF; qp->alt.vid = 0xFFFF; - err = create_qp_rss(to_mdev(pd->device), pd, init_attr, &ucmd, qp); + err = create_qp_rss(to_mdev(pd->device), init_attr, &ucmd, qp); if (err) { kfree(qp); return ERR_PTR(err); -- cgit v1.2.3 From 439000892ee17a9c92f1e4297818790ef8bb4ced Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Tue, 14 Nov 2017 14:51:53 +0200 Subject: IB/ipoib: Avoid memory leak if the SA returns a different DGID The ipoib path database is organized around DGIDs from the LLADDR, but the SA is free to return a different GID when asked for a path. This causes a bug because the SA's modified DGID is copied into the database key, even though it is no longer the correct lookup key, causing a memory leak and other malfunctions. Ensure the database key does not change after the SA query completes. A demonstration of the bug is as follows: ipoib wants to send to GID fe80:0000:0000:0000:0002:c903:00ef:5ee2, so it creates a new record in the DB with that GID as the key and issues a new request to the SM. Now, the SM for some reason returns a path record with a different SGID (for example, 2001:0000:0000:0000:0002:c903:00ef:5ee2, which contains the local subnet prefix). ipoib overwrites the current entry with the new one, and if a new request to the original GID arrives, ipoib will not find it in the DB (it was overwritten) and will create a new record that in its turn will also be overwritten by the response from the SM, and so on, until the driver eats all the device memory. Signed-off-by: Erez Shitrit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 567dbd45817c..7301a79c4058 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -775,6 +775,22 @@ static void path_rec_completion(int status, spin_lock_irqsave(&priv->lock, flags); if (!IS_ERR_OR_NULL(ah)) { + /* + * pathrec.dgid is used as the database key from the LLADDR, + * it must remain unchanged even if the SA returns a different + * GID to use in the AH.
+ */ + if (memcmp(pathrec->dgid.raw, path->pathrec.dgid.raw, + sizeof(union ib_gid))) { + ipoib_dbg( + priv, + "%s got PathRec for gid %pI6 while asked for %pI6\n", + dev->name, pathrec->dgid.raw, + path->pathrec.dgid.raw); + memcpy(pathrec->dgid.raw, path->pathrec.dgid.raw, + sizeof(union ib_gid)); + } + path->pathrec = *pathrec; old_ah = path->ah; -- cgit v1.2.3 From 98aebc550b88c6d0928e01652ffe76806a2b782e Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Tue, 14 Nov 2017 14:51:54 +0200 Subject: IB/ipoib: Update pathrec field if not valid record In case that the PathRecord is not valid (SM changed its network prefix) ipoib will continue issue PathQuery requests with the same parameters that are in its database, which are no longer valid anymore. Now the driver in that case will re-initialize the record from a valid place (the priv structure keeps the updated values), and a valid request will be issued. Signed-off-by: Erez Shitrit Reviewed-by: Alex Vesker Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 49 +++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 7301a79c4058..c182604076e1 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -856,6 +856,23 @@ static void path_rec_completion(int status, } } +static void init_path_rec(struct ipoib_dev_priv *priv, struct ipoib_path *path, + void *gid) +{ + path->dev = priv->dev; + + if (rdma_cap_opa_ah(priv->ca, priv->port)) + path->pathrec.rec_type = SA_PATH_REC_TYPE_OPA; + else + path->pathrec.rec_type = SA_PATH_REC_TYPE_IB; + + memcpy(path->pathrec.dgid.raw, gid, sizeof(union ib_gid)); + path->pathrec.sgid = priv->local_gid; + path->pathrec.pkey = cpu_to_be16(priv->pkey); + path->pathrec.numb_path = 1; + path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class; +} + static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid) { struct ipoib_dev_priv *priv = ipoib_priv(dev); @@ -868,21 +885,11 @@ static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid) if (!path) return NULL; - path->dev = dev; - skb_queue_head_init(&path->queue); INIT_LIST_HEAD(&path->neigh_list); - if (rdma_cap_opa_ah(priv->ca, priv->port)) - path->pathrec.rec_type = SA_PATH_REC_TYPE_OPA; - else - path->pathrec.rec_type = SA_PATH_REC_TYPE_IB; - memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid)); - path->pathrec.sgid = priv->local_gid; - path->pathrec.pkey = cpu_to_be16(priv->pkey); - path->pathrec.numb_path = 1; - path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class; + init_path_rec(priv, path, gid); return path; } @@ -1011,6 +1018,10 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, spin_lock_irqsave(&priv->lock, flags); + /* no broadcast means that all paths are (going to be) not valid */ + if (!priv->broadcast) + goto drop_and_unlock; + path = __path_find(dev, phdr->hwaddr + 4); if (!path || !path->valid) { int new_path = 0; @@ -1020,6 +1031,10 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, new_path = 1; } if (path) { + if (!new_path) + /* make sure there is no changes in the existing path record */ + init_path_rec(priv, path, phdr->hwaddr + 4); + if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { push_pseudo_header(skb, phdr->hwaddr); __skb_queue_tail(&path->queue, skb); @@ -1036,8 
+1051,7 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, } else __path_add(dev, path); } else { - ++dev->stats.tx_dropped; - dev_kfree_skb_any(skb); + goto drop_and_unlock; } spin_unlock_irqrestore(&priv->lock, flags); @@ -1057,11 +1071,16 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, push_pseudo_header(skb, phdr->hwaddr); __skb_queue_tail(&path->queue, skb); } else { - ++dev->stats.tx_dropped; - dev_kfree_skb_any(skb); + goto drop_and_unlock; } spin_unlock_irqrestore(&priv->lock, flags); + return; + +drop_and_unlock: + ++dev->stats.tx_dropped; + dev_kfree_skb_any(skb); + spin_unlock_irqrestore(&priv->lock, flags); } static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) -- cgit v1.2.3 From 7baaa49af3716fb31877c61f59b74d029ce15b75 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 14 Nov 2017 14:51:55 +0200 Subject: RDMA/cma: Use correct size when writing netlink stats The code was using the src size when formatting the dst. They are almost certainly the same value but it reads wrong. Fixes: ce117ffac2e9 ("RDMA/cma: Export AF_IB statistics") Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 1fdb473b5df7..9841e7ca5c06 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4431,7 +4431,7 @@ static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb) RDMA_NL_RDMA_CM_ATTR_SRC_ADDR)) goto out; if (ibnl_put_attr(skb, nlh, - rdma_addr_size(cma_src_addr(id_priv)), + rdma_addr_size(cma_dst_addr(id_priv)), cma_dst_addr(id_priv), RDMA_NL_RDMA_CM_ATTR_DST_ADDR)) goto out; -- cgit v1.2.3 From 8b00914654ef56ff5473f4fe1f1168254dbb8a17 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Tue, 14 Nov 2017 14:51:57 +0200 Subject: IB/core: Fix memory leak in cm_req_handler error flows In cm_req_handler error flows, sometimes cm_id_priv->timewait_info isn't free'd. Signed-off-by: Matan Barak Signed-off-by: Mukesh Kacker Signed-off-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 1cafa8350c52..dbbebbb5c315 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1876,8 +1876,7 @@ static int cm_req_handler(struct cm_work *work) listen_cm_id_priv = cm_match_req(work, cm_id_priv); if (!listen_cm_id_priv) { ret = -EINVAL; - kfree(cm_id_priv->timewait_info); - goto destroy; + goto free_timeinfo; } cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler; @@ -1979,6 +1978,8 @@ static int cm_req_handler(struct cm_work *work) rejected: atomic_dec(&cm_id_priv->refcount); cm_deref_id(listen_cm_id_priv); +free_timeinfo: + kfree(cm_id_priv->timewait_info); destroy: ib_destroy_cm_id(cm_id); return ret; -- cgit v1.2.3 From 119bf81793ea952fcd46e44378ff3c75562d16f8 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Tue, 14 Nov 2017 14:51:58 +0200 Subject: IB/cm: Add debug prints to ib_cm Add debug prints to the error paths in the connection manager control flows, to help debug connection management problems. 
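The prints follow the usual dynamic-debug convention; a condensed sketch of the style (illustrative, not an exact hunk from the patch):

/* Error-path instrumentation in the style added by this patch. */
static int example_check_established(struct ib_cm_id *cm_id)
{
	if (cm_id->state != IB_CM_ESTABLISHED) {
		pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__,
			 be32_to_cpu(cm_id->local_id), cm_id->state);
		return -EINVAL;
	}
	return 0;
}

With CONFIG_DYNAMIC_DEBUG the prints compile in but stay silent until enabled at runtime, e.g. echo 'file cm.c +p' > /sys/kernel/debug/dynamic_debug/control.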
Signed-off-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 51 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index dbbebbb5c315..d3aab237f8c6 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1875,6 +1875,8 @@ static int cm_req_handler(struct cm_work *work) listen_cm_id_priv = cm_match_req(work, cm_id_priv); if (!listen_cm_id_priv) { + pr_debug("%s: local_id %d, no listen_cm_id_priv\n", __func__, + be32_to_cpu(cm_id->local_id)); ret = -EINVAL; goto free_timeinfo; } @@ -2032,6 +2034,8 @@ int ib_send_cm_rep(struct ib_cm_id *cm_id, spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REQ_RCVD && cm_id->state != IB_CM_MRA_REQ_SENT) { + pr_debug("%s: local_comm_id %d, cm_id->state: %d\n", __func__, + be32_to_cpu(cm_id_priv->id.local_id), cm_id->state); ret = -EINVAL; goto out; } @@ -2098,6 +2102,8 @@ int ib_send_cm_rtu(struct ib_cm_id *cm_id, spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_REP_RCVD && cm_id->state != IB_CM_MRA_REP_SENT) { + pr_debug("%s: local_id %d, cm_id->state %d\n", __func__, + be32_to_cpu(cm_id->local_id), cm_id->state); ret = -EINVAL; goto error; } @@ -2205,6 +2211,8 @@ static int cm_rep_handler(struct cm_work *work) cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, 0); if (!cm_id_priv) { cm_dup_rep_handler(work); + pr_debug("%s: remote_comm_id %d, no cm_id_priv\n", __func__, + be32_to_cpu(rep_msg->remote_comm_id)); return -EINVAL; } @@ -2218,6 +2226,10 @@ static int cm_rep_handler(struct cm_work *work) default: spin_unlock_irq(&cm_id_priv->lock); ret = -EINVAL; + pr_debug("%s: cm_id_priv->id.state: %d, local_comm_id %d, remote_comm_id %d\n", + __func__, cm_id_priv->id.state, + be32_to_cpu(rep_msg->local_comm_id), + be32_to_cpu(rep_msg->remote_comm_id)); goto error; } @@ -2231,6 +2243,8 @@ static int cm_rep_handler(struct cm_work *work) spin_unlock(&cm.lock); spin_unlock_irq(&cm_id_priv->lock); ret = -EINVAL; + pr_debug("%s: Failed to insert remote id %d\n", __func__, + be32_to_cpu(rep_msg->remote_comm_id)); goto error; } /* Check for a stale connection. */ @@ -2248,6 +2262,10 @@ static int cm_rep_handler(struct cm_work *work) IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP, NULL, 0); ret = -EINVAL; + pr_debug("%s: Stale connection. local_comm_id %d, remote_comm_id %d\n", + __func__, be32_to_cpu(rep_msg->local_comm_id), + be32_to_cpu(rep_msg->remote_comm_id)); + if (cur_cm_id_priv) { cm_id = &cur_cm_id_priv->id; ib_send_cm_dreq(cm_id, NULL, 0); @@ -2394,6 +2412,8 @@ int ib_send_cm_dreq(struct ib_cm_id *cm_id, cm_id_priv = container_of(cm_id, struct cm_id_private, id); spin_lock_irqsave(&cm_id_priv->lock, flags); if (cm_id->state != IB_CM_ESTABLISHED) { + pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, + be32_to_cpu(cm_id->local_id), cm_id->state); ret = -EINVAL; goto out; } @@ -2463,6 +2483,8 @@ int ib_send_cm_drep(struct ib_cm_id *cm_id, if (cm_id->state != IB_CM_DREQ_RCVD) { spin_unlock_irqrestore(&cm_id_priv->lock, flags); kfree(data); + pr_debug("%s: local_id %d, cm_idcm_id->state(%d) != IB_CM_DREQ_RCVD\n", + __func__, be32_to_cpu(cm_id->local_id), cm_id->state); return -EINVAL; } @@ -2528,6 +2550,9 @@ static int cm_dreq_handler(struct cm_work *work) atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES]. 
counter[CM_DREQ_COUNTER]); cm_issue_drep(work->port, work->mad_recv_wc); + pr_debug("%s: no cm_id_priv, local_comm_id %d, remote_comm_id %d\n", + __func__, be32_to_cpu(dreq_msg->local_comm_id), + be32_to_cpu(dreq_msg->remote_comm_id)); return -EINVAL; } @@ -2570,6 +2595,9 @@ static int cm_dreq_handler(struct cm_work *work) counter[CM_DREQ_COUNTER]); goto unlock; default: + pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); goto unlock; } cm_id_priv->id.state = IB_CM_DREQ_RCVD; @@ -2673,6 +2701,8 @@ int ib_send_cm_rej(struct ib_cm_id *cm_id, cm_enter_timewait(cm_id_priv); break; default: + pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, + be32_to_cpu(cm_id_priv->id.local_id), cm_id->state); ret = -EINVAL; goto out; } @@ -2783,6 +2813,9 @@ static int cm_rej_handler(struct cm_work *work) /* fall through */ default: spin_unlock_irq(&cm_id_priv->lock); + pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); ret = -EINVAL; goto out; } @@ -2846,6 +2879,9 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id, } /* fall through */ default: + pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); ret = -EINVAL; goto error1; } @@ -2947,6 +2983,9 @@ static int cm_mra_handler(struct cm_work *work) counter[CM_MRA_COUNTER]); /* fall through */ default: + pr_debug("%s local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); goto out; } @@ -3727,6 +3766,7 @@ static void cm_work_handler(struct work_struct *_work) ret = cm_timewait_handler(work); break; default: + pr_debug("cm_event.event: 0x%x\n", work->cm_event.event); ret = -EINVAL; break; } @@ -3762,6 +3802,8 @@ static int cm_establish(struct ib_cm_id *cm_id) ret = -EISCONN; break; default: + pr_debug("%s: local_id %d, cm_id->state: %d\n", __func__, + be32_to_cpu(cm_id->local_id), cm_id->state); ret = -EINVAL; break; } @@ -3959,6 +4001,9 @@ static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv, ret = 0; break; default: + pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); ret = -EINVAL; break; } @@ -4006,6 +4051,9 @@ static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv, ret = 0; break; default: + pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); ret = -EINVAL; break; } @@ -4065,6 +4113,9 @@ static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv, ret = 0; break; default: + pr_debug("%s: local_id %d, cm_id_priv->id.state: %d\n", + __func__, be32_to_cpu(cm_id_priv->id.local_id), + cm_id_priv->id.state); ret = -EINVAL; break; } -- cgit v1.2.3 From edf1a84fe37c51290e2c88154ecaf48dadff3d27 Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Tue, 14 Nov 2017 14:51:59 +0200 Subject: IB/umem: Fix use of npages/nmap fields In ib_umem structure npages holds original number of sg entries, while nmap is number of DMA blocks returned by dma_map_sg. 
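To make the distinction concrete, a hedged sketch (the example_ helper is not from the patch): dma_map_sg() may coalesce scatterlist entries, so a CPU-side walk such as sg_pcopy_to_buffer() must be given the original entry count, while only device-facing DMA setup should use the mapped block count.

/* CPU copy walks the original sg entries: use npages, not nmap. */
static size_t example_copy_from_umem(void *dst, struct ib_umem *umem,
				     size_t offset, size_t length)
{
	return sg_pcopy_to_buffer(umem->sg_head.sgl, umem->npages, dst,
				  length, offset + ib_umem_offset(umem));
}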
Fixes: c5d76f130b28 ('IB/core: Add umem function to read data from user-space') Signed-off-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 130606c3b07c..9a4e899d94b3 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -352,7 +352,7 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset, return -EINVAL; } - ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->nmap, dst, length, + ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->npages, dst, length, offset + ib_umem_offset(umem)); if (ret < 0) -- cgit v1.2.3 From 5c181bda77f409d89ad513528eccac5f3a416474 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 14 Nov 2017 14:52:00 +0200 Subject: RDMA/cma: Set default GID type as RoCE when resolving RoCE route As the function name suggests cma_resolve_iboe_route() resolves RoCE route. However, its default GID type is IB_GID_TYPE_IB and not IB_GID_TYPE_ROCE, even though both are mapped to the same enum value. Change default GID type to IB_GID_TYPE_ROCE. cma_iboe_set_mgid() is updated to reflect the RoCEv2 GID check. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Reviewed-by: Noa Osherovich Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 9841e7ca5c06..e63a51b68ed5 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2528,7 +2528,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) struct cma_work *work; int ret; struct net_device *ndev = NULL; - enum ib_gid_type gid_type = IB_GID_TYPE_IB; + enum ib_gid_type gid_type = IB_GID_TYPE_ROCE; u8 default_roce_tos = id_priv->cma_dev->default_roce_tos[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; u8 tos = id_priv->tos_set ? id_priv->tos : default_roce_tos; @@ -4009,8 +4009,10 @@ static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid, } else if (addr->sa_family == AF_INET6) { memcpy(mgid, &sin6->sin6_addr, sizeof *mgid); } else { - mgid->raw[0] = (gid_type == IB_GID_TYPE_IB) ? 0xff : 0; - mgid->raw[1] = (gid_type == IB_GID_TYPE_IB) ? 0x0e : 0; + mgid->raw[0] = + (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0xff; + mgid->raw[1] = + (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? 0 : 0x0e; mgid->raw[2] = 0; mgid->raw[3] = 0; mgid->raw[4] = 0; -- cgit v1.2.3 From 4367ec7fe2dd6f049f8bb558c86d54b438186729 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 14 Nov 2017 14:52:01 +0200 Subject: RDMA/cma: Simplify netdev check Current code checks for NULL ndev twice where 2nd check is always invalid given the fact that during route resolving stage, device address must be bound to netdevice interface. This patch simplifies such check. 
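The shape of the simplification, as a generic sketch (struct example_id and its field are placeholders, not the cma code): validate the precondition once, up front, so the later NULL test on ndev is the only one needed.

struct example_id {
	int bound_dev_if;
};

static int example_resolve_route(struct example_id *id)
{
	struct net_device *ndev;

	if (!id->bound_dev_if)	/* the address must be bound by now */
		return -ENODEV;

	ndev = dev_get_by_index(&init_net, id->bound_dev_if);
	if (!ndev)
		return -ENODEV;

	/* ... fill in the path record from ndev ... */
	dev_put(ndev);
	return 0;
}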
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index e63a51b68ed5..168f1230756b 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2527,8 +2527,10 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) struct rdma_addr *addr = &route->addr; struct cma_work *work; int ret; - struct net_device *ndev = NULL; + struct net_device *ndev; enum ib_gid_type gid_type = IB_GID_TYPE_ROCE; + unsigned long supported_gids; + u8 default_roce_tos = id_priv->cma_dev->default_roce_tos[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; u8 tos = id_priv->tos_set ? id_priv->tos : default_roce_tos; @@ -2549,30 +2551,25 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) route->num_paths = 1; - if (addr->dev_addr.bound_dev_if) { - unsigned long supported_gids; - - ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if); - if (!ndev) { - ret = -ENODEV; - goto err2; - } - - supported_gids = roce_gid_type_mask_support(id_priv->id.device, - id_priv->id.port_num); - gid_type = cma_route_gid_type(addr->dev_addr.network, - supported_gids, - id_priv->gid_type); - route->path_rec->rec_type = - sa_conv_gid_to_pathrec_type(gid_type); - sa_path_set_ndev(route->path_rec, &init_net); - sa_path_set_ifindex(route->path_rec, ndev->ifindex); + if (!addr->dev_addr.bound_dev_if) { + ret = -ENODEV; + goto err2; } + + ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if); if (!ndev) { ret = -ENODEV; goto err2; } + supported_gids = roce_gid_type_mask_support(id_priv->id.device, + id_priv->id.port_num); + gid_type = cma_route_gid_type(addr->dev_addr.network, + supported_gids, + id_priv->gid_type); + route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type); + sa_path_set_ndev(route->path_rec, &init_net); + sa_path_set_ifindex(route->path_rec, ndev->ifindex); sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, -- cgit v1.2.3 From c4238805375847071e1ca9e2a37e1f26ce1a4809 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 14 Nov 2017 14:52:02 +0200 Subject: RDMA/cma: Avoid setting path record type twice Avoid setting path record type twice for RoCE. 
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 168f1230756b..f986597201fa 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2567,7 +2567,11 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) gid_type = cma_route_gid_type(addr->dev_addr.network, supported_gids, id_priv->gid_type); + /* Use the hint from IP Stack to select GID Type */ + if (gid_type < ib_network_to_gid_type(addr->dev_addr.network)) + gid_type = ib_network_to_gid_type(addr->dev_addr.network); route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type); + sa_path_set_ndev(route->path_rec, &init_net); sa_path_set_ifindex(route->path_rec, ndev->ifindex); sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr); @@ -2577,11 +2581,6 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, &route->path_rec->dgid); - /* Use the hint from IP Stack to select GID Type */ - if (gid_type < ib_network_to_gid_type(addr->dev_addr.network)) - gid_type = ib_network_to_gid_type(addr->dev_addr.network); - route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type); - if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB) /* TODO: get the hoplimit from the inet/inet6 device */ route->path_rec->hop_limit = addr->dev_addr.hoplimit; -- cgit v1.2.3 From 981b5a2384b7c9b521fb8d54ffb89259f754846e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 14 Nov 2017 14:52:03 +0200 Subject: RDMA/cma: Introduce and use helper functions to init work Introduce and use helper functions to initialize work for the address resolved and route resolved events, avoiding code duplication in a few places.
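For reference, a simplified sketch of the extraction (parameterized here for brevity; the actual patch adds two fixed-purpose helpers, as the diff below shows):

static void example_init_work(struct cma_work *work,
			      struct rdma_id_private *id_priv,
			      enum rdma_cm_state old_state,
			      enum rdma_cm_state new_state,
			      enum rdma_cm_event_type event)
{
	work->id = id_priv;
	INIT_WORK(&work->work, cma_work_handler);
	work->old_state = old_state;
	work->new_state = new_state;
	work->event.event = event;
}

Each formerly open-coded block then collapses to a single helper call before queue_work().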
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 53 ++++++++++++++++++++----------------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index f986597201fa..48f593bafe33 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2413,6 +2413,26 @@ out: kfree(work); } +static void cma_init_resolve_route_work(struct cma_work *work, + struct rdma_id_private *id_priv) +{ + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = RDMA_CM_ROUTE_QUERY; + work->new_state = RDMA_CM_ROUTE_RESOLVED; + work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; +} + +static void cma_init_resolve_addr_work(struct cma_work *work, + struct rdma_id_private *id_priv) +{ + work->id = id_priv; + INIT_WORK(&work->work, cma_work_handler); + work->old_state = RDMA_CM_ADDR_QUERY; + work->new_state = RDMA_CM_ADDR_RESOLVED; + work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; +} + static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) { struct rdma_route *route = &id_priv->id.route; @@ -2423,11 +2443,7 @@ static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) if (!work) return -ENOMEM; - work->id = id_priv; - INIT_WORK(&work->work, cma_work_handler); - work->old_state = RDMA_CM_ROUTE_QUERY; - work->new_state = RDMA_CM_ROUTE_RESOLVED; - work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; + cma_init_resolve_route_work(work, id_priv); route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { @@ -2482,11 +2498,7 @@ static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) if (!work) return -ENOMEM; - work->id = id_priv; - INIT_WORK(&work->work, cma_work_handler); - work->old_state = RDMA_CM_ROUTE_QUERY; - work->new_state = RDMA_CM_ROUTE_RESOLVED; - work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; + cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; } @@ -2540,9 +2552,6 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) if (!work) return -ENOMEM; - work->id = id_priv; - INIT_WORK(&work->work, cma_work_handler); - route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL); if (!route->path_rec) { ret = -ENOMEM; @@ -2602,11 +2611,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) goto err2; } - work->old_state = RDMA_CM_ROUTE_QUERY; - work->new_state = RDMA_CM_ROUTE_RESOLVED; - work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; - work->event.status = 0; - + cma_init_resolve_route_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; @@ -2786,11 +2791,7 @@ static int cma_resolve_loopback(struct rdma_id_private *id_priv) rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid); rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid); - work->id = id_priv; - INIT_WORK(&work->work, cma_work_handler); - work->old_state = RDMA_CM_ADDR_QUERY; - work->new_state = RDMA_CM_ADDR_RESOLVED; - work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; + cma_init_resolve_addr_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; err: @@ -2816,11 +2817,7 @@ static int cma_resolve_ib_addr(struct rdma_id_private *id_priv) rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *) &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr)); - work->id = id_priv; - INIT_WORK(&work->work, 
cma_work_handler); - work->old_state = RDMA_CM_ADDR_QUERY; - work->new_state = RDMA_CM_ADDR_RESOLVED; - work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; + cma_init_resolve_addr_work(work, id_priv); queue_work(cma_wq, &work->work); return 0; err: -- cgit v1.2.3 From b0dd0d335364a2a748a63413ca4812c9398bf2ae Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 14 Nov 2017 14:52:04 +0200 Subject: IB/core: Avoid unnecessary type cast Type cast from void to struct find_gid_index_context is not needed. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/verbs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 75ebd74f8bbd..2d2e0aabccc7 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -421,8 +421,7 @@ static bool find_gid_index(const union ib_gid *gid, const struct ib_gid_attr *gid_attr, void *context) { - struct find_gid_index_context *ctx = - (struct find_gid_index_context *)context; + struct find_gid_index_context *ctx = context; if (ctx->gid_type != gid_attr->gid_type) return false; -- cgit v1.2.3 From 151ed9d7009769207cd71065f64f58e7d73add65 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 14 Nov 2017 14:52:05 +0200 Subject: IB/core: Refactor to avoid unnecessary check on GID lookup miss Currently on every gid entry comparison miss found variable is checked; which is not needed as those two comparison fail already indicate that GID is not found yet. So refactor to avoid such check and copy the GID index when found. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cache.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 77515638c55c..43300b8e6341 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -573,27 +573,24 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, struct ib_gid_attr attr; if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID) - goto next; + continue; if (memcmp(gid, &table->data_vec[i].gid, sizeof(*gid))) - goto next; + continue; memcpy(&attr, &table->data_vec[i].attr, sizeof(attr)); - if (filter(gid, &attr, context)) + if (filter(gid, &attr, context)) { found = true; - -next: - if (found) + if (index) + *index = i; break; + } } read_unlock_irqrestore(&table->rwlock, flags); if (!found) return -ENOENT; - - if (index) - *index = i; return 0; } -- cgit v1.2.3 From fe78acf95f807bd5f6ddbde841e255aa3fdedd49 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 14 Nov 2017 14:52:06 +0200 Subject: IB/rxe: Avoid passing unused index pointer which is optional While searching for GID, returned index is not used, so avoid passing pointer during invocation. 
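The convention at work, in a generic sketch (example_find_gid and its table are placeholders): an out-parameter documented as optional is written only when non-NULL, so callers that do not need the value simply pass NULL.

static int example_find_gid(const union ib_gid *tbl, unsigned int len,
			    const union ib_gid *gid, u16 *index)
{
	unsigned int i;

	for (i = 0; i < len; i++) {
		if (!memcmp(gid, &tbl[i], sizeof(*gid))) {
			if (index)	/* optional out-parameter */
				*index = i;
			return 0;
		}
	}
	return -ENOENT;
}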
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_recv.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index fb8c83e055e1..4c3f899241d4 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -336,7 +336,6 @@ static int rxe_match_dgid(struct rxe_dev *rxe, struct sk_buff *skb) { union ib_gid dgid; union ib_gid *pdgid; - u16 index; if (skb->protocol == htons(ETH_P_IP)) { ipv6_addr_set_v4mapped(ip_hdr(skb)->daddr, @@ -348,7 +347,7 @@ static int rxe_match_dgid(struct rxe_dev *rxe, struct sk_buff *skb) return ib_find_cached_gid_by_port(&rxe->ib_dev, pdgid, IB_GID_TYPE_ROCE_UDP_ENCAP, - 1, rxe->ndev, &index); + 1, rxe->ndev, NULL); } /* rxe_rcv is called from the interface driver */ -- cgit v1.2.3 From 1c43d5d308f0858e763f46bc2bd01cd32b67c63e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 14 Nov 2017 14:52:07 +0200 Subject: IB/core: Avoid exporting module internal ib_find_gid_by_filter() ib_find_gid_by_filter() is used only by ib_core, therefore avoid exporting it. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cache.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 43300b8e6341..7babdbceb6d0 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -880,7 +880,6 @@ int ib_find_gid_by_filter(struct ib_device *device, port_num, filter, context, index); } -EXPORT_SYMBOL(ib_find_gid_by_filter); int ib_get_cached_pkey(struct ib_device *device, u8 port_num, -- cgit v1.2.3 From 86937fcd6ed28a2c253119998e0eebfb73dd289e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 14 Nov 2017 14:52:08 +0200 Subject: RDMA/core: Avoid redundant memcpy in rdma_addr_find_l2_eth_by_grh rdma_resolve_ip already copies 'addr' to its dev_addr argument. Remove the duplicate memcpy and since it was the only user, remove the 'addr' member from resolve_cb_context. 
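After the cleanup, the context reduces to the classic wait-for-callback pattern, roughly as follows (illustrative sketch with example_ names, not the exact post-patch code):

struct example_resolve_ctx {
	struct completion comp;
	int status;
};

static void example_resolve_cb(int status, struct sockaddr *src_addr,
			       struct rdma_dev_addr *addr, void *context)
{
	struct example_resolve_ctx *ctx = context;

	/* rdma_resolve_ip() has already written the result into the
	 * caller's dev_addr, so only the status is recorded here.
	 */
	ctx->status = status;
	complete(&ctx->comp);
}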
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index da4469c38eac..6679dd43fe08 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -744,7 +744,6 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) EXPORT_SYMBOL(rdma_addr_cancel); struct resolve_cb_context { - struct rdma_dev_addr *addr; struct completion comp; int status; }; @@ -752,9 +751,6 @@ struct resolve_cb_context { static void resolve_cb(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context) { - if (!status) - memcpy(((struct resolve_cb_context *)context)->addr, - addr, sizeof(struct rdma_dev_addr)); ((struct resolve_cb_context *)context)->status = status; complete(&((struct resolve_cb_context *)context)->comp); } @@ -780,7 +776,6 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, dev_addr.bound_dev_if = ndev->ifindex; dev_addr.net = &init_net; - ctx.addr = &dev_addr; init_completion(&ctx.comp); ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr, &dev_addr, 1000, resolve_cb, &ctx); -- cgit v1.2.3 From 699a83f1eba0f2aa6850e1080a9f4466d8092b7b Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 14 Nov 2017 14:52:09 +0200 Subject: IB/core: Removed unused function rdma_addr_find_smac_by_sgid() is an exported symbol that is not used by any kernel module, so remove it. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 23 ----------------------- include/rdma/ib_addr.h | 1 - 2 files changed, 24 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 6679dd43fe08..e0086901d64c 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -793,29 +793,6 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, return 0; } -int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id) -{ - int ret = 0; - struct rdma_dev_addr dev_addr; - union { - struct sockaddr _sockaddr; - struct sockaddr_in _sockaddr_in; - struct sockaddr_in6 _sockaddr_in6; - } gid_addr; - - rdma_gid2ip(&gid_addr._sockaddr, sgid); - - memset(&dev_addr, 0, sizeof(dev_addr)); - dev_addr.net = &init_net; - ret = rdma_translate_ip(&gid_addr._sockaddr, &dev_addr, vlan_id); - if (ret) - return ret; - - memcpy(smac, dev_addr.src_dev_addr, ETH_ALEN); - return ret; -} -EXPORT_SYMBOL(rdma_addr_find_smac_by_sgid); - static int netevent_callback(struct notifier_block *self, unsigned long event, void *ctx) { diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index d5c3bbb84608..3d17073ea338 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -131,7 +131,6 @@ void rdma_copy_addr(struct rdma_dev_addr *dev_addr, int rdma_addr_size(struct sockaddr *addr); -int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id); int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, u8 *dmac, const struct net_device *ndev, -- cgit v1.2.3 From 575c7e583e7ea5724f6ee500268c7980e31e45b7 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 14 Nov 2017 14:52:10 +0200 Subject: RDMA/{core, cma}: Simplify rdma_translate_ip Since no caller needs the vlan ID, rdma_translate_ip() is simplified to drop the vlan pointer.
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 11 +++-------- drivers/infiniband/core/cma.c | 4 ++-- include/rdma/ib_addr.h | 2 +- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index e0086901d64c..0ae18608761c 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -243,8 +243,7 @@ void rdma_copy_addr(struct rdma_dev_addr *dev_addr, EXPORT_SYMBOL(rdma_copy_addr); int rdma_translate_ip(const struct sockaddr *addr, - struct rdma_dev_addr *dev_addr, - u16 *vlan_id) + struct rdma_dev_addr *dev_addr) { struct net_device *dev; @@ -267,8 +266,6 @@ int rdma_translate_ip(const struct sockaddr *addr, rdma_copy_addr(dev_addr, dev, NULL); dev_addr->bound_dev_if = dev->ifindex; - if (vlan_id) - *vlan_id = rdma_vlan_dev_vlan_id(dev); dev_put(dev); break; #if IS_ENABLED(CONFIG_IPV6) @@ -280,8 +277,6 @@ int rdma_translate_ip(const struct sockaddr *addr, dev, 1)) { rdma_copy_addr(dev_addr, dev, NULL); dev_addr->bound_dev_if = dev->ifindex; - if (vlan_id) - *vlan_id = rdma_vlan_dev_vlan_id(dev); break; } } @@ -481,7 +476,7 @@ static int addr_resolve_neigh(struct dst_entry *dst, if (dst->dev->flags & IFF_LOOPBACK) { int ret; - ret = rdma_translate_ip(dst_in, addr, NULL); + ret = rdma_translate_ip(dst_in, addr); if (!ret) memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); @@ -558,7 +553,7 @@ static int addr_resolve(struct sockaddr *src_in, } if (ndev->flags & IFF_LOOPBACK) { - ret = rdma_translate_ip(dst_in, addr, NULL); + ret = rdma_translate_ip(dst_in, addr); /* * Put the loopback device and get the translated * device instead. diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 48f593bafe33..19769ea4d71f 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -601,7 +601,7 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a int ret; if (addr->sa_family != AF_IB) { - ret = rdma_translate_ip(addr, dev_addr, NULL); + ret = rdma_translate_ip(addr, dev_addr); } else { cma_translate_ib((struct sockaddr_ib *) addr, dev_addr); ret = 0; @@ -2131,7 +2131,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); conn_id->state = RDMA_CM_CONNECT; - ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr, NULL); + ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index 3d17073ea338..dc0b642e0175 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -94,7 +94,7 @@ struct rdma_dev_addr { * The dev_addr->net field must be initialized. */ int rdma_translate_ip(const struct sockaddr *addr, - struct rdma_dev_addr *dev_addr, u16 *vlan_id); + struct rdma_dev_addr *dev_addr); /** * rdma_resolve_ip - Resolve source and destination IP addresses to -- cgit v1.2.3 From 5092d17a39a454d8b045a8cdce514bf4b07d22dc Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 14 Nov 2017 14:52:11 +0200 Subject: RDMA/core: Avoid copying ifindex twice rdma_copy_addr copies the ifindex to bound_dev_if. Therefore avoid copying it again after the rdma_copy_addr call completes.
Signed-off-by: Parav Pandit Reviewed-by: Moni Shoua Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 0ae18608761c..a5b4cf030c11 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -265,7 +265,6 @@ int rdma_translate_ip(const struct sockaddr *addr, return -EADDRNOTAVAIL; rdma_copy_addr(dev_addr, dev, NULL); - dev_addr->bound_dev_if = dev->ifindex; dev_put(dev); break; #if IS_ENABLED(CONFIG_IPV6) @@ -276,7 +275,6 @@ int rdma_translate_ip(const struct sockaddr *addr, &((const struct sockaddr_in6 *)addr)->sin6_addr, dev, 1)) { rdma_copy_addr(dev_addr, dev, NULL); - dev_addr->bound_dev_if = dev->ifindex; break; } } -- cgit v1.2.3 From dbb12562f7c2377c210ed6b2e79eda5bfe23c30c Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 14 Nov 2017 14:52:12 +0200 Subject: IB/{core, ipoib}: Simplify ib_find_gid to search only for IB link layer Currently there are no users of ib_find_gid for RoCE transport. It is only used by IPoIB. Therefore its simplified to ignore RoCE ports and GID type check which was previously done for every port. Signed-off-by: Parav Pandit Reviewed-by: Eli Cohen Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 16 +++------------- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 3 +-- include/rdma/ib_verbs.h | 3 +-- 3 files changed, 5 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 84fc32a2c8b3..bb2686d56d3c 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1017,32 +1017,22 @@ EXPORT_SYMBOL(ib_modify_port); /** * ib_find_gid - Returns the port number and GID table index where - * a specified GID value occurs. + * a specified GID value occurs. Its searches only for IB link layer. * @device: The device to query. * @gid: The GID value to search for. - * @gid_type: Type of GID. * @ndev: The ndev related to the GID to search for. * @port_num: The port number of the device where the GID value was found. * @index: The index into the GID table where the GID was found. This * parameter may be NULL. 
*/ int ib_find_gid(struct ib_device *device, union ib_gid *gid, - enum ib_gid_type gid_type, struct net_device *ndev, - u8 *port_num, u16 *index) + struct net_device *ndev, u8 *port_num, u16 *index) { union ib_gid tmp_gid; int ret, port, i; for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) { - if (rdma_cap_roce_gid_table(device, port)) { - if (!ib_find_cached_gid_by_port(device, gid, gid_type, port, - ndev, index)) { - *port_num = port; - return 0; - } - } - - if (gid_type != IB_GID_TYPE_IB) + if (rdma_cap_roce_gid_table(device, port)) continue; for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) { diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 3b96cdaf9a83..b9fa21790da9 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -1085,8 +1085,7 @@ static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv) netif_addr_unlock_bh(priv->dev); - err = ib_find_gid(priv->ca, &search_gid, IB_GID_TYPE_IB, - priv->dev, &port, &index); + err = ib_find_gid(priv->ca, &search_gid, priv->dev, &port, &index); netif_addr_lock_bh(priv->dev); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index fd84cda5ed7c..cfd837049a32 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2836,8 +2836,7 @@ int ib_modify_port(struct ib_device *device, struct ib_port_modify *port_modify); int ib_find_gid(struct ib_device *device, union ib_gid *gid, - enum ib_gid_type gid_type, struct net_device *ndev, - u8 *port_num, u16 *index); + struct net_device *ndev, u8 *port_num, u16 *index); int ib_find_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index); -- cgit v1.2.3 From 0c4386ec77cfcd0ccbdbe8c2e67dd3a49b2a4c7f Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 14 Nov 2017 14:52:13 +0200 Subject: IB/{cm, umad}: Handle av init error cm_init_av_for_response depends on ib_init_ah_from_wc() whose return status is ignored. ib_init_ah_from_wc() can fail and its return status should be handled as done in this patch. 
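The fix follows the usual void-to-int conversion pattern; a condensed sketch of the helper side (matching the diff below in spirit, with a simplified name):

static int example_init_av_for_response(struct cm_port *port,
					struct ib_wc *wc, struct ib_grh *grh,
					struct cm_av *av)
{
	av->port = port;
	av->pkey_index = wc->pkey_index;
	/* Propagate failure instead of silently ignoring it. */
	return ib_init_ah_from_wc(port->cm_dev->ib_device, port->port_num,
				  wc, grh, &av->ah_attr);
}

Each caller then unwinds (goto destroy/unlock/out) when a non-zero status comes back.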
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 34 +++++++++++++++++++++------------- drivers/infiniband/core/user_mad.c | 10 +++++++--- 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index d3aab237f8c6..f9bb24c2679e 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -452,13 +452,13 @@ static void cm_set_private_data(struct cm_id_private *cm_id_priv, cm_id_priv->private_data_len = private_data_len; } -static void cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, - struct ib_grh *grh, struct cm_av *av) +static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, + struct ib_grh *grh, struct cm_av *av) { av->port = port; av->pkey_index = wc->pkey_index; - ib_init_ah_from_wc(port->cm_dev->ib_device, port->port_num, wc, - grh, &av->ah_attr); + return ib_init_ah_from_wc(port->cm_dev->ib_device, port->port_num, wc, + grh, &av->ah_attr); } static int cm_init_av_by_path(struct sa_path_rec *path, struct cm_av *av, @@ -1860,9 +1860,11 @@ static int cm_req_handler(struct cm_work *work) cm_id_priv = container_of(cm_id, struct cm_id_private, id); cm_id_priv->id.remote_id = req_msg->local_comm_id; - cm_init_av_for_response(work->port, work->mad_recv_wc->wc, - work->mad_recv_wc->recv_buf.grh, - &cm_id_priv->av); + ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + &cm_id_priv->av); + if (ret) + goto destroy; cm_id_priv->timewait_info = cm_create_timewait_info(cm_id_priv-> id.local_id); if (IS_ERR(cm_id_priv->timewait_info)) { @@ -3205,9 +3207,11 @@ static int cm_lap_handler(struct cm_work *work) cm_id_priv->id.lap_state = IB_CM_LAP_RCVD; cm_id_priv->tid = lap_msg->hdr.tid; - cm_init_av_for_response(work->port, work->mad_recv_wc->wc, - work->mad_recv_wc->recv_buf.grh, - &cm_id_priv->av); + ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + &cm_id_priv->av); + if (ret) + goto unlock; cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av, cm_id_priv); ret = atomic_inc_and_test(&cm_id_priv->work_count); @@ -3460,6 +3464,7 @@ static int cm_sidr_req_handler(struct cm_work *work) struct cm_id_private *cm_id_priv, *cur_cm_id_priv; struct cm_sidr_req_msg *sidr_req_msg; struct ib_wc *wc; + int ret; cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL); if (IS_ERR(cm_id)) @@ -3472,9 +3477,12 @@ static int cm_sidr_req_handler(struct cm_work *work) wc = work->mad_recv_wc->wc; cm_id_priv->av.dgid.global.subnet_prefix = cpu_to_be64(wc->slid); cm_id_priv->av.dgid.global.interface_id = 0; - cm_init_av_for_response(work->port, work->mad_recv_wc->wc, - work->mad_recv_wc->recv_buf.grh, - &cm_id_priv->av); + ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + &cm_id_priv->av); + if (ret) + goto out; + cm_id_priv->id.remote_id = sidr_req_msg->request_id; cm_id_priv->tid = sidr_req_msg->hdr.tid; atomic_inc(&cm_id_priv->work_count); diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 4b64dd02e090..3179fed4b408 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -246,10 +246,14 @@ static void recv_handler(struct ib_mad_agent *agent, if (packet->mad.hdr.grh_present) { struct rdma_ah_attr ah_attr; const struct ib_global_route *grh; + int ret; - 
ib_init_ah_from_wc(agent->device, agent->port_num, - mad_recv_wc->wc, mad_recv_wc->recv_buf.grh, - &ah_attr); + ret = ib_init_ah_from_wc(agent->device, agent->port_num, + mad_recv_wc->wc, + mad_recv_wc->recv_buf.grh, + &ah_attr); + if (ret) + goto err2; grh = rdma_ah_read_grh(&ah_attr); packet->mad.hdr.gid_index = grh->sgid_index; -- cgit v1.2.3 From 5cf3968afcc752fa2db6910e9694120b3373c73b Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 14 Nov 2017 14:52:14 +0200 Subject: IB/cm: Handle address handle attribute init error cm_init_av_by_path() depends on ib_init_ah_from_path() to initialize the ah attribute; ib_init_ah_from_path() can fail, and such an error should not be ignored. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index f9bb24c2679e..c6a816678675 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -494,8 +494,11 @@ static int cm_init_av_by_path(struct sa_path_rec *path, struct cm_av *av, return ret; av->port = port; - ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path, - &av->ah_attr); + ret = ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path, + &av->ah_attr); + if (ret) + return ret; + av->timeout = path->packet_life_time + 1; spin_lock_irqsave(&cm.lock, flags); -- cgit v1.2.3 From 33f93e1ebcf5acfaef06cda2d3e373730519e33e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 14 Nov 2017 14:52:15 +0200 Subject: IB/cm: Fix sleeping while spin lock is held When LAP is used for RoCE, the flow below can sleep while a spin lock is held: cm_lap_handler ->spin_lock -> <..switch_case..> -> cm_init_av_for_response -> ib_init_ah_from_wc -> rdma_addr_find_l2_eth_by_grh wait_for_completion() Therefore, ah attribute initialization for incoming LAP requests is done outside of the lock context.
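The general rule being applied, as a generic sketch (example_ names are placeholders): anything that can end up in wait_for_completion() must run before the spinlock is taken, never under it.

struct example_obj {
	spinlock_t lock;
};

static int example_init_av(struct example_obj *obj);	/* may sleep */

static int example_lap_handler(struct example_obj *obj)
{
	int ret;

	ret = example_init_av(obj);	/* address resolution: unlocked */
	if (ret)
		return ret;

	spin_lock_irq(&obj->lock);
	/* state-machine updates only; nothing here may sleep */
	spin_unlock_irq(&obj->lock);
	return 0;
}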
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index c6a816678675..c8fc92fdff76 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3164,6 +3164,12 @@ static int cm_lap_handler(struct cm_work *work) if (!cm_id_priv) return -EINVAL; + ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, + work->mad_recv_wc->recv_buf.grh, + &cm_id_priv->av); + if (ret) + goto deref; + param = &work->cm_event.param.lap_rcvd; memset(&work->path[0], 0, sizeof(work->path[1])); cm_path_set_rec_type(work->port->cm_dev->ib_device, @@ -3210,11 +3216,6 @@ static int cm_lap_handler(struct cm_work *work) cm_id_priv->id.lap_state = IB_CM_LAP_RCVD; cm_id_priv->tid = lap_msg->hdr.tid; - ret = cm_init_av_for_response(work->port, work->mad_recv_wc->wc, - work->mad_recv_wc->recv_buf.grh, - &cm_id_priv->av); - if (ret) - goto unlock; cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av, cm_id_priv); ret = atomic_inc_and_test(&cm_id_priv->work_count); -- cgit v1.2.3 From 4ad6a0245ec818bc0e03910ea3f503824f25b0fc Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 14 Nov 2017 14:52:16 +0200 Subject: IB/{core, cm, cma, ipoib}: Rename ib_init_ah_from_path to ib_init_ah_attr_from_path Since ib_init_ah_from_path initializes the address handle attribute, it is renamed to reflect so. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 4 ++-- drivers/infiniband/core/cma.c | 7 ++++--- drivers/infiniband/core/sa_query.c | 8 ++++---- drivers/infiniband/ulp/ipoib/ipoib_main.c | 3 ++- include/rdma/ib_sa.h | 10 +++++----- 5 files changed, 17 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index c8fc92fdff76..5af1c1c168b7 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -494,8 +494,8 @@ static int cm_init_av_by_path(struct sa_path_rec *path, struct cm_av *av, return ret; av->port = port; - ret = ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path, - &av->ah_attr); + ret = ib_init_ah_attr_from_path(cm_dev->ib_device, port->port_num, path, + &av->ah_attr); if (ret) return ret; diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 19769ea4d71f..5ab646a24089 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -3396,9 +3396,10 @@ static int cma_sidr_rep_handler(struct ib_cm_id *cm_id, event.status = ret; break; } - ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num, - id_priv->id.route.path_rec, - &event.param.ud.ah_attr); + ib_init_ah_attr_from_path(id_priv->id.device, + id_priv->id.port_num, + id_priv->id.route.path_rec, + &event.param.ud.ah_attr); event.param.ud.qp_num = rep->qpn; event.param.ud.qkey = rep->qkey; event.event = RDMA_CM_EVENT_ESTABLISHED; diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index ab5e1024fea9..7cd37e77debb 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1227,9 +1227,9 @@ static u8 get_src_path_mask(struct ib_device *device, u8 port_num) return src_path_mask; } -int ib_init_ah_from_path(struct ib_device *device, u8 port_num, - struct sa_path_rec *rec, - struct rdma_ah_attr *ah_attr) +int 
ib_init_ah_attr_from_path(struct ib_device *device, u8 port_num, + struct sa_path_rec *rec, + struct rdma_ah_attr *ah_attr) { int ret; u16 gid_index; @@ -1341,7 +1341,7 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, return 0; } -EXPORT_SYMBOL(ib_init_ah_from_path); +EXPORT_SYMBOL(ib_init_ah_attr_from_path); static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask) { diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index c182604076e1..cb79be1a1474 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -768,7 +768,8 @@ static void path_rec_completion(int status, if (!status) { struct rdma_ah_attr av; - if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av)) + if (!ib_init_ah_attr_from_path(priv->ca, priv->port, + pathrec, &av)) ah = ipoib_create_ah(dev, priv->pd, &av); } diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index 1f7f604db5aa..811cfcfcbe3d 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -549,12 +549,12 @@ int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, struct rdma_ah_attr *ah_attr); /** - * ib_init_ah_from_path - Initialize address handle attributes based on an SA - * path record. + * ib_init_ah_attr_from_path - Initialize address handle attributes based on + * an SA path record. */ -int ib_init_ah_from_path(struct ib_device *device, u8 port_num, - struct sa_path_rec *rec, - struct rdma_ah_attr *ah_attr); +int ib_init_ah_attr_from_path(struct ib_device *device, u8 port_num, + struct sa_path_rec *rec, + struct rdma_ah_attr *ah_attr); /** * ib_sa_pack_path - Conert a path record from struct ib_sa_path_rec -- cgit v1.2.3 From f6bdb14267ba799296a49a3de9c2811636b768f9 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 14 Nov 2017 14:52:17 +0200 Subject: IB/{core, umad, cm}: Rename ib_init_ah_from_wc to ib_init_ah_attr_from_wc Currently ib_init_ah_from_wc initializes address handle attributes and not the address handle object itself. To avoid confusion between ah_attr and ah, ib_init_ah_from_wc is renamed to ib_init_ah_attr_from_wc to reflect that it initializes ah_attr.
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 5 +++-- drivers/infiniband/core/user_mad.c | 8 ++++---- drivers/infiniband/core/verbs.c | 12 ++++++------ include/rdma/ib_verbs.h | 8 ++++---- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 5af1c1c168b7..cbbaaf1a493a 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -457,8 +457,9 @@ static int cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc, { av->port = port; av->pkey_index = wc->pkey_index; - return ib_init_ah_from_wc(port->cm_dev->ib_device, port->port_num, wc, - grh, &av->ah_attr); + return ib_init_ah_attr_from_wc(port->cm_dev->ib_device, + port->port_num, wc, + grh, &av->ah_attr); } static int cm_init_av_by_path(struct sa_path_rec *path, struct cm_av *av, diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 3179fed4b408..3ef8ee5d4aa2 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -248,10 +248,10 @@ static void recv_handler(struct ib_mad_agent *agent, const struct ib_global_route *grh; int ret; - ret = ib_init_ah_from_wc(agent->device, agent->port_num, - mad_recv_wc->wc, - mad_recv_wc->recv_buf.grh, - &ah_attr); + ret = ib_init_ah_attr_from_wc(agent->device, agent->port_num, + mad_recv_wc->wc, + mad_recv_wc->recv_buf.grh, + &ah_attr); if (ret) goto err2; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 2d2e0aabccc7..43e025435418 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -526,7 +526,7 @@ done: } /* - * This function creates ah from the incoming packet. + * This function initializes address handle attributes from the incoming packet. * Incoming packet has dgid of the receiver node on which this code is * getting executed and, sgid contains the GID of the sender. * @@ -535,9 +535,9 @@ done: * GID whom to respond to. * */ -int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, - const struct ib_wc *wc, const struct ib_grh *grh, - struct rdma_ah_attr *ah_attr) +int ib_init_ah_attr_from_wc(struct ib_device *device, u8 port_num, + const struct ib_wc *wc, const struct ib_grh *grh, + struct rdma_ah_attr *ah_attr) { u32 flow_class; u16 gid_index; @@ -611,7 +611,7 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, return 0; } } -EXPORT_SYMBOL(ib_init_ah_from_wc); +EXPORT_SYMBOL(ib_init_ah_attr_from_wc); struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, const struct ib_grh *grh, u8 port_num) @@ -619,7 +619,7 @@ struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, struct rdma_ah_attr ah_attr; int ret; - ret = ib_init_ah_from_wc(pd->device, port_num, wc, grh, &ah_attr); + ret = ib_init_ah_attr_from_wc(pd->device, port_num, wc, grh, &ah_attr); if (ret) return ERR_PTR(ret); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index cfd837049a32..60c3268c8c04 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2904,7 +2904,7 @@ int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr, int ib_get_rdma_header_version(const union rdma_network_hdr *hdr); /** - * ib_init_ah_from_wc - Initializes address handle attributes from a + * ib_init_ah_attr_from_wc - Initializes address handle attributes from a * work completion. 
* @device: Device on which the received message arrived. * @port_num: Port on which the received message arrived. @@ -2914,9 +2914,9 @@ int ib_get_rdma_header_version(const union rdma_network_hdr *hdr); * @ah_attr: Returned attributes that can be used when creating an address * handle for replying to the message. */ -int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, - const struct ib_wc *wc, const struct ib_grh *grh, - struct rdma_ah_attr *ah_attr); +int ib_init_ah_attr_from_wc(struct ib_device *device, u8 port_num, + const struct ib_wc *wc, const struct ib_grh *grh, + struct rdma_ah_attr *ah_attr); /** * ib_create_ah_from_wc - Creates an address handle associated with the -- cgit v1.2.3 From 16c72e402867e956fd558d55cbe2f1a40c742e73 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 14 Nov 2017 14:52:18 +0200 Subject: IB/cm: Refactor to avoid setting path record software only fields When path ah_attr initialization from a path record fails, ib_cm_send_rej() uses the av.ah_attr fields to send out the reject message. In such cases initialization of the path record's software-only fields is not needed, so the code is simplified accordingly. Additionally, in the current cm_req_handler code, when ib_get_cached_gid() fails for the sgid_index taken from the GRH of the incoming CM MAD, error code 12 is sent. That error code refers to the primary GID in the incoming CM REQ, not to the GID in the MAD packet. Therefore the code is refactored to send code 5 (unsupported request) for such errors. Signed-off-by: Parav Pandit Reviewed-by: Hal Rosenstock Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 92 +++++++++++++++++++++----------------------- 1 file changed, 44 insertions(+), 48 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index cbbaaf1a493a..b8f8d3128a53 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1902,56 +1902,52 @@ static int cm_req_handler(struct cm_work *work) work->port->port_num, grh->sgid_index, &gid, &gid_attr); - if (!ret) { - if (gid_attr.ndev) { - work->path[0].rec_type = - sa_conv_gid_to_pathrec_type(gid_attr.gid_type); - sa_path_set_ifindex(&work->path[0], - gid_attr.ndev->ifindex); - sa_path_set_ndev(&work->path[0], - dev_net(gid_attr.ndev)); + if (ret) { + if (gid_attr.ndev) dev_put(gid_attr.ndev); - } else { - cm_path_set_rec_type(work->port->cm_dev->ib_device, - work->port->port_num, - &work->path[0], - &req_msg->primary_local_gid); - } - if (cm_req_has_alt_path(req_msg)) - work->path[1].rec_type = work->path[0].rec_type; - cm_format_paths_from_req(req_msg, &work->path[0], - &work->path[1]); - if (cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) - sa_path_set_dmac(&work->path[0], - cm_id_priv->av.ah_attr.roce.dmac); - work->path[0].hop_limit = grh->hop_limit; - ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av, - cm_id_priv); + ib_send_cm_rej(cm_id, IB_CM_REJ_UNSUPPORTED, NULL, 0, NULL, 0); + goto rejected; } + + if (gid_attr.ndev) { + work->path[0].rec_type = + sa_conv_gid_to_pathrec_type(gid_attr.gid_type); + sa_path_set_ifindex(&work->path[0], + gid_attr.ndev->ifindex); + sa_path_set_ndev(&work->path[0], + dev_net(gid_attr.ndev)); + dev_put(gid_attr.ndev); + } else { + cm_path_set_rec_type(work->port->cm_dev->ib_device, + work->port->port_num, + &work->path[0], + &req_msg->primary_local_gid); + } + if (cm_req_has_alt_path(req_msg)) + work->path[1].rec_type = work->path[0].rec_type; + cm_format_paths_from_req(req_msg, &work->path[0], + &work->path[1]); + if 
(cm_id_priv->av.ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) + sa_path_set_dmac(&work->path[0], + cm_id_priv->av.ah_attr.roce.dmac); + work->path[0].hop_limit = grh->hop_limit; + ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av, + cm_id_priv); if (ret) { - int err = ib_get_cached_gid(work->port->cm_dev->ib_device, - work->port->port_num, 0, - &work->path[0].sgid, - &gid_attr); - if (!err && gid_attr.ndev) { - work->path[0].rec_type = - sa_conv_gid_to_pathrec_type(gid_attr.gid_type); - sa_path_set_ifindex(&work->path[0], - gid_attr.ndev->ifindex); - sa_path_set_ndev(&work->path[0], - dev_net(gid_attr.ndev)); - dev_put(gid_attr.ndev); - } else { - cm_path_set_rec_type(work->port->cm_dev->ib_device, - work->port->port_num, - &work->path[0], - &req_msg->primary_local_gid); - } - if (cm_req_has_alt_path(req_msg)) - work->path[1].rec_type = work->path[0].rec_type; - ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, - &work->path[0].sgid, sizeof work->path[0].sgid, - NULL, 0); + int err; + + err = ib_get_cached_gid(work->port->cm_dev->ib_device, + work->port->port_num, 0, + &work->path[0].sgid, + NULL); + if (err) + ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, + NULL, 0, NULL, 0); + else + ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, + &work->path[0].sgid, + sizeof(work->path[0].sgid), + NULL, 0); goto rejected; } if (cm_req_has_alt_path(req_msg)) { @@ -1960,7 +1956,7 @@ static int cm_req_handler(struct cm_work *work) if (ret) { ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID, &work->path[0].sgid, - sizeof work->path[0].sgid, NULL, 0); + sizeof(work->path[0].sgid), NULL, 0); goto rejected; } } -- cgit v1.2.3 From 66f53e6f5400578bae58db0c06d85a8820831f40 Mon Sep 17 00:00:00 2001 From: Bharat Potnuri Date: Tue, 28 Nov 2017 23:58:07 +0530 Subject: iser-target: avoid reinitializing rdma contexts for isert commands isert commands that failed during isert_rdma_rw_ctx_post() are queued to the Queue-Full (QF) queue and are scheduled to be reposted during queue-full queue processing. During this reposting, the rdma contexts are initialized again in isert_rdma_rw_ctx_post(), which leaks significant memory. unreferenced object 0xffff8830201d9640 (size 64): comm "kworker/0:2", pid 195, jiffies 4295374851 (age 4528.436s) hex dump (first 32 bytes): 00 60 8b cb 2e 00 00 00 00 10 00 00 00 00 00 00 .`.............. 00 90 e3 cb 2e 00 00 00 00 10 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x4e/0xb0 [] __kmalloc+0x125/0x2b0 [] rdma_rw_ctx_init+0x15f/0x6f0 [ib_core] [] isert_rdma_rw_ctx_post+0xc4/0x3c0 [ib_isert] [] isert_put_datain+0x112/0x1c0 [ib_isert] [] lio_queue_data_in+0x2e/0x30 [iscsi_target_mod] [] target_qf_do_work+0x2b2/0x4b0 [target_core_mod] [] process_one_work+0x1db/0x5d0 [] worker_thread+0x4d/0x3e0 [] kthread+0x117/0x150 [] ret_from_fork+0x27/0x40 [] 0xffffffffffffffff This patch reuses the existing rdma contexts when reposting the isert commands instead of reinitializing them. 
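The fix is an instance of the init-once guard pattern. The following stand-alone sketch (hypothetical types and helpers; only the ctx_init_done flag mirrors the new isert_cmd field added in the diff below) shows why a repost after a queue-full condition no longer allocates a second context:

#include <stdbool.h>

struct example_cmd {
	bool ctx_init_done;	/* mirrors the new isert_cmd field */
};

static int example_ctx_init(struct example_cmd *cmd);	/* hypothetical: allocates a context */
static int example_ctx_post(struct example_cmd *cmd);	/* hypothetical: posts the context */

static int example_rw_ctx_post(struct example_cmd *cmd)
{
	int ret;

	if (cmd->ctx_init_done)
		goto post;	/* requeued command: context already built */

	ret = example_ctx_init(cmd);	/* must run exactly once per command */
	if (ret < 0)
		return ret;
	cmd->ctx_init_done = true;

post:
	return example_ctx_post(cmd);	/* safe to repeat on every repost */
}

Reusing the already-initialized context on the repost path is what plugs the rdma_rw_ctx_init() leak seen in the backtrace above.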
Signed-off-by: Potnuri Bharat Teja Reviewed-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/isert/ib_isert.c | 7 +++++++ drivers/infiniband/ulp/isert/ib_isert.h | 1 + 2 files changed, 8 insertions(+) diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 720dfb3a1ac2..fd55163801a3 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -2123,6 +2123,9 @@ isert_rdma_rw_ctx_post(struct isert_cmd *cmd, struct isert_conn *conn, u32 rkey, offset; int ret; + if (cmd->ctx_init_done) + goto rdma_ctx_post; + if (dir == DMA_FROM_DEVICE) { addr = cmd->write_va; rkey = cmd->write_stag; @@ -2150,11 +2153,15 @@ isert_rdma_rw_ctx_post(struct isert_cmd *cmd, struct isert_conn *conn, se_cmd->t_data_sg, se_cmd->t_data_nents, offset, addr, rkey, dir); } + if (ret < 0) { isert_err("Cmd: %p failed to prepare RDMA res\n", cmd); return ret; } + cmd->ctx_init_done = true; + +rdma_ctx_post: ret = rdma_rw_ctx_post(&cmd->rw, conn->qp, port_num, cqe, chain_wr); if (ret < 0) isert_err("Cmd: %p failed to post RDMA res\n", cmd); diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h index d6fd248320ae..3b296bac4f60 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.h +++ b/drivers/infiniband/ulp/isert/ib_isert.h @@ -126,6 +126,7 @@ struct isert_cmd { struct rdma_rw_ctx rw; struct work_struct comp_work; struct scatterlist sg; + bool ctx_init_done; }; static inline struct isert_cmd *tx_desc_to_cmd(struct iser_tx_desc *desc) -- cgit v1.2.3 From 744820869166c8c78be891240cf5f66e8a333694 Mon Sep 17 00:00:00 2001 From: Anton Vasilyev Date: Tue, 8 Aug 2017 18:56:37 +0300 Subject: RDMA/ocrdma: Fix permissions for OCRDMA_RESET_STATS The debugfs file reset_stats is created with S_IRUSR permissions, but ocrdma_dbgfs_ops_read() doesn't support OCRDMA_RESET_STATS, whereas ocrdma_dbgfs_ops_write() supports only OCRDMA_RESET_STATS. The patch fixes the mistyped permissions. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Anton Vasilyev Acked-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/ocrdma/ocrdma_stats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c index e528d7acb7f6..fb78b16ce671 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c @@ -834,7 +834,7 @@ void ocrdma_add_port_stats(struct ocrdma_dev *dev) dev->reset_stats.type = OCRDMA_RESET_STATS; dev->reset_stats.dev = dev; - if (!debugfs_create_file("reset_stats", S_IRUSR, dev->dir, + if (!debugfs_create_file("reset_stats", 0200, dev->dir, &dev->reset_stats, &ocrdma_dbg_ops)) goto err; -- cgit v1.2.3 From b16f8188472efac75f5afc9a8226d635a9075672 Mon Sep 17 00:00:00 2001 From: Yixian Liu Date: Tue, 14 Nov 2017 17:26:16 +0800 Subject: RDMA/hns: Refactor eq code for hip06 In preparation for supporting hip08's eq processing, and to allow for possible data structure changes, this patch refactors the hip06 eq code structure. We move all the hip06 eq processing code from hns_roce_eq.c into hns_roce_hw_v1.c, and do the same for hns_roce_eq.h. With these changes, it will be convenient to add eq support for later hardware versions. 
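The refactor leans on the driver's per-generation ops table. Here is a simplified, self-contained sketch of that dispatch pattern (hypothetical names throughout; only the .init_eq/.cleanup_eq hooks correspond to the fields added to struct hns_roce_hw in the diff below):

struct example_hw_ops {
	int  (*init_eq)(void *hr_dev);		/* per-generation eq setup */
	void (*cleanup_eq)(void *hr_dev);	/* per-generation eq teardown */
};

/* The hip06 code supplies its implementation... */
static int v1_init_eq(void *hr_dev) { return 0; }
static void v1_cleanup_eq(void *hr_dev) { }

static const struct example_hw_ops example_hw_v1 = {
	.init_eq	= v1_init_eq,
	.cleanup_eq	= v1_cleanup_eq,
};

/*
 * ...while the common core stays generation-agnostic, so a future
 * hip08 ops table can be plugged in without touching this function.
 */
static int example_core_init(const struct example_hw_ops *hw, void *hr_dev)
{
	return hw->init_eq(hr_dev);
}

A caller would pass &example_hw_v1 (or, later, a hip08 table) to example_core_init(), which is the same shape as the hr_dev->hw dispatch in the driver.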
Signed-off-by: Yixian Liu Reviewed-by: Lijun Ou Reviewed-by: Wei Hu (Xavier) Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/Makefile | 2 +- drivers/infiniband/hw/hns/hns_roce_cmd.c | 1 + drivers/infiniband/hw/hns/hns_roce_cq.c | 19 +- drivers/infiniband/hw/hns/hns_roce_device.h | 57 ++- drivers/infiniband/hw/hns/hns_roce_eq.c | 759 ---------------------------- drivers/infiniband/hw/hns/hns_roce_eq.h | 134 ----- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 740 ++++++++++++++++++++++++++- drivers/infiniband/hw/hns/hns_roce_hw_v1.h | 44 +- drivers/infiniband/hw/hns/hns_roce_main.c | 16 +- drivers/infiniband/hw/hns/hns_roce_qp.c | 1 + 10 files changed, 843 insertions(+), 930 deletions(-) delete mode 100644 drivers/infiniband/hw/hns/hns_roce_eq.c delete mode 100644 drivers/infiniband/hw/hns/hns_roce_eq.h diff --git a/drivers/infiniband/hw/hns/Makefile b/drivers/infiniband/hw/hns/Makefile index ff426a625e13..97bf2cd1cacb 100644 --- a/drivers/infiniband/hw/hns/Makefile +++ b/drivers/infiniband/hw/hns/Makefile @@ -5,7 +5,7 @@ ccflags-y := -Idrivers/net/ethernet/hisilicon/hns3 obj-$(CONFIG_INFINIBAND_HNS) += hns-roce.o -hns-roce-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_eq.o hns_roce_pd.o \ +hns-roce-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \ hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \ hns_roce_cq.o hns_roce_alloc.o obj-$(CONFIG_INFINIBAND_HNS_HIP06) += hns-roce-hw-v1.o diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.c b/drivers/infiniband/hw/hns/hns_roce_cmd.c index 1085cb249bc1..9ebe839d8b24 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cmd.c +++ b/drivers/infiniband/hw/hns/hns_roce_cmd.c @@ -103,6 +103,7 @@ void hns_roce_cmd_event(struct hns_roce_dev *hr_dev, u16 token, u8 status, context->out_param = out_param; complete(&context->done); } +EXPORT_SYMBOL_GPL(hns_roce_cmd_event); /* this should be called with "use_events" */ static int __hns_roce_cmd_mbox_wait(struct hns_roce_dev *hr_dev, u64 in_param, diff --git a/drivers/infiniband/hw/hns/hns_roce_cq.c b/drivers/infiniband/hw/hns/hns_roce_cq.c index 2111b57a3489..bccc9b54c9ce 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cq.c +++ b/drivers/infiniband/hw/hns/hns_roce_cq.c @@ -196,15 +196,14 @@ void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) if (ret) dev_err(dev, "HW2SW_CQ failed (%d) for CQN %06lx\n", ret, hr_cq->cqn); - if (hr_dev->eq_table.eq) { - /* Waiting interrupt process procedure carried out */ - synchronize_irq(hr_dev->eq_table.eq[hr_cq->vector].irq); - - /* wait for all interrupt processed */ - if (atomic_dec_and_test(&hr_cq->refcount)) - complete(&hr_cq->free); - wait_for_completion(&hr_cq->free); - } + + /* Waiting interrupt process procedure carried out */ + synchronize_irq(hr_dev->eq_table.eq[hr_cq->vector].irq); + + /* wait for all interrupt processed */ + if (atomic_dec_and_test(&hr_cq->refcount)) + complete(&hr_cq->free); + wait_for_completion(&hr_cq->free); spin_lock_irq(&cq_table->lock); radix_tree_delete(&cq_table->tree, hr_cq->cqn); @@ -460,6 +459,7 @@ void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn) ++cq->arm_sn; cq->comp(cq); } +EXPORT_SYMBOL_GPL(hns_roce_cq_completion); void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type) { @@ -482,6 +482,7 @@ void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type) if (atomic_dec_and_test(&cq->refcount)) complete(&cq->free); } +EXPORT_SYMBOL_GPL(hns_roce_cq_event); int hns_roce_init_cq_table(struct hns_roce_dev *hr_dev) { diff --git 
a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 01d3d695cbba..9aa9e94ef39a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -62,12 +62,16 @@ #define HNS_ROCE_CQE_WCMD_EMPTY_BIT 0x2 #define HNS_ROCE_MIN_CQE_CNT 16 -#define HNS_ROCE_MAX_IRQ_NUM 34 +#define HNS_ROCE_MAX_IRQ_NUM 128 -#define HNS_ROCE_COMP_VEC_NUM 32 +#define EQ_ENABLE 1 +#define EQ_DISABLE 0 -#define HNS_ROCE_AEQE_VEC_NUM 1 -#define HNS_ROCE_AEQE_OF_VEC_NUM 1 +#define HNS_ROCE_CEQ 0 +#define HNS_ROCE_AEQ 1 + +#define HNS_ROCE_CEQ_ENTRY_SIZE 0x4 +#define HNS_ROCE_AEQ_ENTRY_SIZE 0x10 /* 4G/4K = 1M */ #define HNS_ROCE_SL_SHIFT 28 @@ -485,6 +489,45 @@ struct hns_roce_ib_iboe { u8 phy_port[HNS_ROCE_MAX_PORTS]; }; +enum { + HNS_ROCE_EQ_STAT_INVALID = 0, + HNS_ROCE_EQ_STAT_VALID = 2, +}; + +struct hns_roce_ceqe { + u32 comp; +}; + +struct hns_roce_aeqe { + u32 asyn; + union { + struct { + u32 qp; + u32 rsv0; + u32 rsv1; + } qp_event; + + struct { + u32 cq; + u32 rsv0; + u32 rsv1; + } cq_event; + + struct { + u32 ceqe; + u32 rsv0; + u32 rsv1; + } ce_event; + + struct { + __le64 out_param; + __le16 token; + u8 status; + u8 rsv0; + } __packed cmd; + } event; +}; + struct hns_roce_eq { struct hns_roce_dev *hr_dev; void __iomem *doorbell; @@ -502,7 +545,7 @@ struct hns_roce_eq { struct hns_roce_eq_table { struct hns_roce_eq *eq; - void __iomem **eqc_base; + void __iomem **eqc_base; /* only for hw v1 */ }; struct hns_roce_caps { @@ -550,7 +593,7 @@ struct hns_roce_caps { u32 pbl_buf_pg_sz; u32 pbl_hop_num; int aeqe_depth; - int ceqe_depth[HNS_ROCE_COMP_VEC_NUM]; + int ceqe_depth; enum ib_mtu max_mtu; u32 qpc_bt_num; u32 srqc_bt_num; @@ -623,6 +666,8 @@ struct hns_roce_hw { int (*dereg_mr)(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr); int (*destroy_cq)(struct ib_cq *ibcq); int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period); + int (*init_eq)(struct hns_roce_dev *hr_dev); + void (*cleanup_eq)(struct hns_roce_dev *hr_dev); }; struct hns_roce_dev { diff --git a/drivers/infiniband/hw/hns/hns_roce_eq.c b/drivers/infiniband/hw/hns/hns_roce_eq.c deleted file mode 100644 index d184431e2bf5..000000000000 --- a/drivers/infiniband/hw/hns/hns_roce_eq.c +++ /dev/null @@ -1,759 +0,0 @@ -/* - * Copyright (c) 2016 Hisilicon Limited. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include -#include -#include "hns_roce_common.h" -#include "hns_roce_device.h" -#include "hns_roce_eq.h" - -static void eq_set_cons_index(struct hns_roce_eq *eq, int req_not) -{ - roce_raw_write((eq->cons_index & CONS_INDEX_MASK) | - (req_not << eq->log_entries), eq->doorbell); - /* Memory barrier */ - mb(); -} - -static struct hns_roce_aeqe *get_aeqe(struct hns_roce_eq *eq, u32 entry) -{ - unsigned long off = (entry & (eq->entries - 1)) * - HNS_ROCE_AEQ_ENTRY_SIZE; - - return (struct hns_roce_aeqe *)((u8 *) - (eq->buf_list[off / HNS_ROCE_BA_SIZE].buf) + - off % HNS_ROCE_BA_SIZE); -} - -static struct hns_roce_aeqe *next_aeqe_sw(struct hns_roce_eq *eq) -{ - struct hns_roce_aeqe *aeqe = get_aeqe(eq, eq->cons_index); - - return (roce_get_bit(aeqe->asyn, HNS_ROCE_AEQE_U32_4_OWNER_S) ^ - !!(eq->cons_index & eq->entries)) ? aeqe : NULL; -} - -static void hns_roce_wq_catas_err_handle(struct hns_roce_dev *hr_dev, - struct hns_roce_aeqe *aeqe, int qpn) -{ - struct device *dev = &hr_dev->pdev->dev; - - dev_warn(dev, "Local Work Queue Catastrophic Error.\n"); - switch (roce_get_field(aeqe->asyn, HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M, - HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S)) { - case HNS_ROCE_LWQCE_QPC_ERROR: - dev_warn(dev, "QP %d, QPC error.\n", qpn); - break; - case HNS_ROCE_LWQCE_MTU_ERROR: - dev_warn(dev, "QP %d, MTU error.\n", qpn); - break; - case HNS_ROCE_LWQCE_WQE_BA_ADDR_ERROR: - dev_warn(dev, "QP %d, WQE BA addr error.\n", qpn); - break; - case HNS_ROCE_LWQCE_WQE_ADDR_ERROR: - dev_warn(dev, "QP %d, WQE addr error.\n", qpn); - break; - case HNS_ROCE_LWQCE_SQ_WQE_SHIFT_ERROR: - dev_warn(dev, "QP %d, WQE shift error\n", qpn); - break; - case HNS_ROCE_LWQCE_SL_ERROR: - dev_warn(dev, "QP %d, SL error.\n", qpn); - break; - case HNS_ROCE_LWQCE_PORT_ERROR: - dev_warn(dev, "QP %d, port error.\n", qpn); - break; - default: - break; - } -} - -static void hns_roce_local_wq_access_err_handle(struct hns_roce_dev *hr_dev, - struct hns_roce_aeqe *aeqe, - int qpn) -{ - struct device *dev = &hr_dev->pdev->dev; - - dev_warn(dev, "Local Access Violation Work Queue Error.\n"); - switch (roce_get_field(aeqe->asyn, HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M, - HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S)) { - case HNS_ROCE_LAVWQE_R_KEY_VIOLATION: - dev_warn(dev, "QP %d, R_key violation.\n", qpn); - break; - case HNS_ROCE_LAVWQE_LENGTH_ERROR: - dev_warn(dev, "QP %d, length error.\n", qpn); - break; - case HNS_ROCE_LAVWQE_VA_ERROR: - dev_warn(dev, "QP %d, VA error.\n", qpn); - break; - case HNS_ROCE_LAVWQE_PD_ERROR: - dev_err(dev, "QP %d, PD error.\n", qpn); - break; - case HNS_ROCE_LAVWQE_RW_ACC_ERROR: - dev_warn(dev, "QP %d, rw acc error.\n", qpn); - break; - case HNS_ROCE_LAVWQE_KEY_STATE_ERROR: - dev_warn(dev, "QP %d, key state error.\n", qpn); - break; - case HNS_ROCE_LAVWQE_MR_OPERATION_ERROR: - dev_warn(dev, "QP %d, MR operation error.\n", qpn); - break; - default: - break; - } -} - -static void hns_roce_qp_err_handle(struct hns_roce_dev *hr_dev, - struct hns_roce_aeqe *aeqe, - int event_type) -{ - struct device *dev = &hr_dev->pdev->dev; - int phy_port; - int qpn; - - qpn = roce_get_field(aeqe->event.qp_event.qp, - HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_M, - HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_S); - phy_port = 
roce_get_field(aeqe->event.qp_event.qp, - HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_M, - HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_S); - if (qpn <= 1) - qpn = HNS_ROCE_MAX_PORTS * qpn + phy_port; - - switch (event_type) { - case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: - dev_warn(dev, "Invalid Req Local Work Queue Error.\n" - "QP %d, phy_port %d.\n", qpn, phy_port); - break; - case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: - hns_roce_wq_catas_err_handle(hr_dev, aeqe, qpn); - break; - case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: - hns_roce_local_wq_access_err_handle(hr_dev, aeqe, qpn); - break; - default: - break; - } - - hns_roce_qp_event(hr_dev, qpn, event_type); -} - -static void hns_roce_cq_err_handle(struct hns_roce_dev *hr_dev, - struct hns_roce_aeqe *aeqe, - int event_type) -{ - struct device *dev = &hr_dev->pdev->dev; - u32 cqn; - - cqn = le32_to_cpu(roce_get_field(aeqe->event.cq_event.cq, - HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_M, - HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_S)); - - switch (event_type) { - case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: - dev_warn(dev, "CQ 0x%x access err.\n", cqn); - break; - case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: - dev_warn(dev, "CQ 0x%x overflow\n", cqn); - break; - case HNS_ROCE_EVENT_TYPE_CQ_ID_INVALID: - dev_warn(dev, "CQ 0x%x ID invalid.\n", cqn); - break; - default: - break; - } - - hns_roce_cq_event(hr_dev, cqn, event_type); -} - -static void hns_roce_db_overflow_handle(struct hns_roce_dev *hr_dev, - struct hns_roce_aeqe *aeqe) -{ - struct device *dev = &hr_dev->pdev->dev; - - switch (roce_get_field(aeqe->asyn, HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M, - HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S)) { - case HNS_ROCE_DB_SUBTYPE_SDB_OVF: - dev_warn(dev, "SDB overflow.\n"); - break; - case HNS_ROCE_DB_SUBTYPE_SDB_ALM_OVF: - dev_warn(dev, "SDB almost overflow.\n"); - break; - case HNS_ROCE_DB_SUBTYPE_SDB_ALM_EMP: - dev_warn(dev, "SDB almost empty.\n"); - break; - case HNS_ROCE_DB_SUBTYPE_ODB_OVF: - dev_warn(dev, "ODB overflow.\n"); - break; - case HNS_ROCE_DB_SUBTYPE_ODB_ALM_OVF: - dev_warn(dev, "ODB almost overflow.\n"); - break; - case HNS_ROCE_DB_SUBTYPE_ODB_ALM_EMP: - dev_warn(dev, "SDB almost empty.\n"); - break; - default: - break; - } -} - -static int hns_roce_aeq_int(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) -{ - struct device *dev = &hr_dev->pdev->dev; - struct hns_roce_aeqe *aeqe; - int aeqes_found = 0; - int event_type; - - while ((aeqe = next_aeqe_sw(eq))) { - dev_dbg(dev, "aeqe = %p, aeqe->asyn.event_type = 0x%lx\n", aeqe, - roce_get_field(aeqe->asyn, - HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M, - HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S)); - /* Memory barrier */ - rmb(); - - event_type = roce_get_field(aeqe->asyn, - HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M, - HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S); - switch (event_type) { - case HNS_ROCE_EVENT_TYPE_PATH_MIG: - dev_warn(dev, "PATH MIG not supported\n"); - break; - case HNS_ROCE_EVENT_TYPE_COMM_EST: - dev_warn(dev, "COMMUNICATION established\n"); - break; - case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: - dev_warn(dev, "SQ DRAINED not supported\n"); - break; - case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED: - dev_warn(dev, "PATH MIG failed\n"); - break; - case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: - case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: - case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: - hns_roce_qp_err_handle(hr_dev, aeqe, event_type); - break; - case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH: - case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR: - case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH: - dev_warn(dev, "SRQ not support!\n"); - break; - case 
HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: - case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: - case HNS_ROCE_EVENT_TYPE_CQ_ID_INVALID: - hns_roce_cq_err_handle(hr_dev, aeqe, event_type); - break; - case HNS_ROCE_EVENT_TYPE_PORT_CHANGE: - dev_warn(dev, "port change.\n"); - break; - case HNS_ROCE_EVENT_TYPE_MB: - hns_roce_cmd_event(hr_dev, - le16_to_cpu(aeqe->event.cmd.token), - aeqe->event.cmd.status, - le64_to_cpu(aeqe->event.cmd.out_param - )); - break; - case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW: - hns_roce_db_overflow_handle(hr_dev, aeqe); - break; - case HNS_ROCE_EVENT_TYPE_CEQ_OVERFLOW: - dev_warn(dev, "CEQ 0x%lx overflow.\n", - roce_get_field(aeqe->event.ce_event.ceqe, - HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_M, - HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_S)); - break; - default: - dev_warn(dev, "Unhandled event %d on EQ %d at index %u\n", - event_type, eq->eqn, eq->cons_index); - break; - } - - eq->cons_index++; - aeqes_found = 1; - - if (eq->cons_index > 2 * hr_dev->caps.aeqe_depth - 1) { - dev_warn(dev, "cons_index overflow, set back to zero\n" - ); - eq->cons_index = 0; - } - } - - eq_set_cons_index(eq, 0); - - return aeqes_found; -} - -static struct hns_roce_ceqe *get_ceqe(struct hns_roce_eq *eq, u32 entry) -{ - unsigned long off = (entry & (eq->entries - 1)) * - HNS_ROCE_CEQ_ENTRY_SIZE; - - return (struct hns_roce_ceqe *)((u8 *) - (eq->buf_list[off / HNS_ROCE_BA_SIZE].buf) + - off % HNS_ROCE_BA_SIZE); -} - -static struct hns_roce_ceqe *next_ceqe_sw(struct hns_roce_eq *eq) -{ - struct hns_roce_ceqe *ceqe = get_ceqe(eq, eq->cons_index); - - return (!!(roce_get_bit(ceqe->ceqe.comp, - HNS_ROCE_CEQE_CEQE_COMP_OWNER_S))) ^ - (!!(eq->cons_index & eq->entries)) ? ceqe : NULL; -} - -static int hns_roce_ceq_int(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) -{ - struct hns_roce_ceqe *ceqe; - int ceqes_found = 0; - u32 cqn; - - while ((ceqe = next_ceqe_sw(eq))) { - /* Memory barrier */ - rmb(); - cqn = roce_get_field(ceqe->ceqe.comp, - HNS_ROCE_CEQE_CEQE_COMP_CQN_M, - HNS_ROCE_CEQE_CEQE_COMP_CQN_S); - hns_roce_cq_completion(hr_dev, cqn); - - ++eq->cons_index; - ceqes_found = 1; - - if (eq->cons_index > 2 * hr_dev->caps.ceqe_depth[eq->eqn] - 1) { - dev_warn(&eq->hr_dev->pdev->dev, - "cons_index overflow, set back to zero\n"); - eq->cons_index = 0; - } - } - - eq_set_cons_index(eq, 0); - - return ceqes_found; -} - -static int hns_roce_aeq_ovf_int(struct hns_roce_dev *hr_dev, - struct hns_roce_eq *eq) -{ - struct device *dev = &eq->hr_dev->pdev->dev; - int eqovf_found = 0; - u32 caepaemask_val; - u32 cealmovf_val; - u32 caepaest_val; - u32 aeshift_val; - u32 ceshift_val; - u32 cemask_val; - int i = 0; - - /** - * AEQ overflow ECC mult bit err CEQ overflow alarm - * must clear interrupt, mask irq, clear irq, cancel mask operation - */ - aeshift_val = roce_read(hr_dev, ROCEE_CAEP_AEQC_AEQE_SHIFT_REG); - - if (roce_get_bit(aeshift_val, - ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQ_ALM_OVF_INT_ST_S) == 1) { - dev_warn(dev, "AEQ overflow!\n"); - - /* Set mask */ - caepaemask_val = roce_read(hr_dev, ROCEE_CAEP_AE_MASK_REG); - roce_set_bit(caepaemask_val, - ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S, - HNS_ROCE_INT_MASK_ENABLE); - roce_write(hr_dev, ROCEE_CAEP_AE_MASK_REG, caepaemask_val); - - /* Clear int state(INT_WC : write 1 clear) */ - caepaest_val = roce_read(hr_dev, ROCEE_CAEP_AE_ST_REG); - roce_set_bit(caepaest_val, - ROCEE_CAEP_AE_ST_CAEP_AEQ_ALM_OVF_S, 1); - roce_write(hr_dev, ROCEE_CAEP_AE_ST_REG, caepaest_val); - - /* Clear mask */ - caepaemask_val = roce_read(hr_dev, ROCEE_CAEP_AE_MASK_REG); - 
roce_set_bit(caepaemask_val, - ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S, - HNS_ROCE_INT_MASK_DISABLE); - roce_write(hr_dev, ROCEE_CAEP_AE_MASK_REG, caepaemask_val); - } - - /* CEQ almost overflow */ - for (i = 0; i < hr_dev->caps.num_comp_vectors; i++) { - ceshift_val = roce_read(hr_dev, ROCEE_CAEP_CEQC_SHIFT_0_REG + - i * CEQ_REG_OFFSET); - - if (roce_get_bit(ceshift_val, - ROCEE_CAEP_CEQC_SHIFT_CAEP_CEQ_ALM_OVF_INT_ST_S) == 1) { - dev_warn(dev, "CEQ[%d] almost overflow!\n", i); - eqovf_found++; - - /* Set mask */ - cemask_val = roce_read(hr_dev, - ROCEE_CAEP_CE_IRQ_MASK_0_REG + - i * CEQ_REG_OFFSET); - roce_set_bit(cemask_val, - ROCEE_CAEP_CE_IRQ_MASK_CAEP_CEQ_ALM_OVF_MASK_S, - HNS_ROCE_INT_MASK_ENABLE); - roce_write(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG + - i * CEQ_REG_OFFSET, cemask_val); - - /* Clear int state(INT_WC : write 1 clear) */ - cealmovf_val = roce_read(hr_dev, - ROCEE_CAEP_CEQ_ALM_OVF_0_REG + - i * CEQ_REG_OFFSET); - roce_set_bit(cealmovf_val, - ROCEE_CAEP_CEQ_ALM_OVF_CAEP_CEQ_ALM_OVF_S, - 1); - roce_write(hr_dev, ROCEE_CAEP_CEQ_ALM_OVF_0_REG + - i * CEQ_REG_OFFSET, cealmovf_val); - - /* Clear mask */ - cemask_val = roce_read(hr_dev, - ROCEE_CAEP_CE_IRQ_MASK_0_REG + - i * CEQ_REG_OFFSET); - roce_set_bit(cemask_val, - ROCEE_CAEP_CE_IRQ_MASK_CAEP_CEQ_ALM_OVF_MASK_S, - HNS_ROCE_INT_MASK_DISABLE); - roce_write(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG + - i * CEQ_REG_OFFSET, cemask_val); - } - } - - /* ECC multi-bit error alarm */ - dev_warn(dev, "ECC UCERR ALARM: 0x%x, 0x%x, 0x%x\n", - roce_read(hr_dev, ROCEE_ECC_UCERR_ALM0_REG), - roce_read(hr_dev, ROCEE_ECC_UCERR_ALM1_REG), - roce_read(hr_dev, ROCEE_ECC_UCERR_ALM2_REG)); - - dev_warn(dev, "ECC CERR ALARM: 0x%x, 0x%x, 0x%x\n", - roce_read(hr_dev, ROCEE_ECC_CERR_ALM0_REG), - roce_read(hr_dev, ROCEE_ECC_CERR_ALM1_REG), - roce_read(hr_dev, ROCEE_ECC_CERR_ALM2_REG)); - - return eqovf_found; -} - -static int hns_roce_eq_int(struct hns_roce_dev *hr_dev, struct hns_roce_eq *eq) -{ - int eqes_found = 0; - - if (likely(eq->type_flag == HNS_ROCE_CEQ)) - /* CEQ irq routine, CEQ is pulse irq, not clear */ - eqes_found = hns_roce_ceq_int(hr_dev, eq); - else if (likely(eq->type_flag == HNS_ROCE_AEQ)) - /* AEQ irq routine, AEQ is pulse irq, not clear */ - eqes_found = hns_roce_aeq_int(hr_dev, eq); - else - /* AEQ queue overflow irq */ - eqes_found = hns_roce_aeq_ovf_int(hr_dev, eq); - - return eqes_found; -} - -static irqreturn_t hns_roce_msi_x_interrupt(int irq, void *eq_ptr) -{ - int int_work = 0; - struct hns_roce_eq *eq = eq_ptr; - struct hns_roce_dev *hr_dev = eq->hr_dev; - - int_work = hns_roce_eq_int(hr_dev, eq); - - return IRQ_RETVAL(int_work); -} - -static void hns_roce_enable_eq(struct hns_roce_dev *hr_dev, int eq_num, - int enable_flag) -{ - void __iomem *eqc = hr_dev->eq_table.eqc_base[eq_num]; - u32 val; - - val = readl(eqc); - - if (enable_flag) - roce_set_field(val, - ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M, - ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S, - HNS_ROCE_EQ_STAT_VALID); - else - roce_set_field(val, - ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M, - ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S, - HNS_ROCE_EQ_STAT_INVALID); - writel(val, eqc); -} - -static int hns_roce_create_eq(struct hns_roce_dev *hr_dev, - struct hns_roce_eq *eq) -{ - void __iomem *eqc = hr_dev->eq_table.eqc_base[eq->eqn]; - struct device *dev = &hr_dev->pdev->dev; - dma_addr_t tmp_dma_addr; - u32 eqconsindx_val = 0; - u32 eqcuridx_val = 0; - u32 eqshift_val = 0; - int num_bas = 0; - int ret; - int i; - - num_bas = (PAGE_ALIGN(eq->entries * 
eq->eqe_size) + - HNS_ROCE_BA_SIZE - 1) / HNS_ROCE_BA_SIZE; - - if ((eq->entries * eq->eqe_size) > HNS_ROCE_BA_SIZE) { - dev_err(dev, "[error]eq buf %d gt ba size(%d) need bas=%d\n", - (eq->entries * eq->eqe_size), HNS_ROCE_BA_SIZE, - num_bas); - return -EINVAL; - } - - eq->buf_list = kcalloc(num_bas, sizeof(*eq->buf_list), GFP_KERNEL); - if (!eq->buf_list) - return -ENOMEM; - - for (i = 0; i < num_bas; ++i) { - eq->buf_list[i].buf = dma_alloc_coherent(dev, HNS_ROCE_BA_SIZE, - &tmp_dma_addr, - GFP_KERNEL); - if (!eq->buf_list[i].buf) { - ret = -ENOMEM; - goto err_out_free_pages; - } - - eq->buf_list[i].map = tmp_dma_addr; - memset(eq->buf_list[i].buf, 0, HNS_ROCE_BA_SIZE); - } - eq->cons_index = 0; - roce_set_field(eqshift_val, - ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M, - ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S, - HNS_ROCE_EQ_STAT_INVALID); - roce_set_field(eqshift_val, - ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_AEQE_SHIFT_M, - ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_AEQE_SHIFT_S, - eq->log_entries); - writel(eqshift_val, eqc); - - /* Configure eq extended address 12~44bit */ - writel((u32)(eq->buf_list[0].map >> 12), eqc + 4); - - /* - * Configure eq extended address 45~49 bit. - * 44 = 32 + 12, When evaluating addr to hardware, shift 12 because of - * using 4K page, and shift more 32 because of - * caculating the high 32 bit value evaluated to hardware. - */ - roce_set_field(eqcuridx_val, ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_M, - ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_S, - eq->buf_list[0].map >> 44); - roce_set_field(eqcuridx_val, - ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQE_CUR_IDX_M, - ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQE_CUR_IDX_S, 0); - writel(eqcuridx_val, eqc + 8); - - /* Configure eq consumer index */ - roce_set_field(eqconsindx_val, - ROCEE_CAEP_AEQE_CONS_IDX_CAEP_AEQE_CONS_IDX_M, - ROCEE_CAEP_AEQE_CONS_IDX_CAEP_AEQE_CONS_IDX_S, 0); - writel(eqconsindx_val, eqc + 0xc); - - return 0; - -err_out_free_pages: - for (i = i - 1; i >= 0; i--) - dma_free_coherent(dev, HNS_ROCE_BA_SIZE, eq->buf_list[i].buf, - eq->buf_list[i].map); - - kfree(eq->buf_list); - return ret; -} - -static void hns_roce_free_eq(struct hns_roce_dev *hr_dev, - struct hns_roce_eq *eq) -{ - int i = 0; - int npages = (PAGE_ALIGN(eq->eqe_size * eq->entries) + - HNS_ROCE_BA_SIZE - 1) / HNS_ROCE_BA_SIZE; - - if (!eq->buf_list) - return; - - for (i = 0; i < npages; ++i) - dma_free_coherent(&hr_dev->pdev->dev, HNS_ROCE_BA_SIZE, - eq->buf_list[i].buf, eq->buf_list[i].map); - - kfree(eq->buf_list); -} - -static void hns_roce_int_mask_en(struct hns_roce_dev *hr_dev) -{ - int i = 0; - u32 aemask_val; - int masken = 0; - - /* AEQ INT */ - aemask_val = roce_read(hr_dev, ROCEE_CAEP_AE_MASK_REG); - roce_set_bit(aemask_val, ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S, - masken); - roce_set_bit(aemask_val, ROCEE_CAEP_AE_MASK_CAEP_AE_IRQ_MASK_S, masken); - roce_write(hr_dev, ROCEE_CAEP_AE_MASK_REG, aemask_val); - - /* CEQ INT */ - for (i = 0; i < hr_dev->caps.num_comp_vectors; i++) { - /* IRQ mask */ - roce_write(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG + - i * CEQ_REG_OFFSET, masken); - } -} - -static void hns_roce_ce_int_default_cfg(struct hns_roce_dev *hr_dev) -{ - /* Configure ce int interval */ - roce_write(hr_dev, ROCEE_CAEP_CE_INTERVAL_CFG_REG, - HNS_ROCE_CEQ_DEFAULT_INTERVAL); - - /* Configure ce int burst num */ - roce_write(hr_dev, ROCEE_CAEP_CE_BURST_NUM_CFG_REG, - HNS_ROCE_CEQ_DEFAULT_BURST_NUM); -} - -int hns_roce_init_eq_table(struct hns_roce_dev *hr_dev) -{ - struct hns_roce_eq_table *eq_table = &hr_dev->eq_table; - struct device 
*dev = &hr_dev->pdev->dev; - struct hns_roce_eq *eq = NULL; - int eq_num = 0; - int ret = 0; - int i = 0; - int j = 0; - - eq_num = hr_dev->caps.num_comp_vectors + hr_dev->caps.num_aeq_vectors; - eq_table->eq = kcalloc(eq_num, sizeof(*eq_table->eq), GFP_KERNEL); - if (!eq_table->eq) - return -ENOMEM; - - eq_table->eqc_base = kcalloc(eq_num, sizeof(*eq_table->eqc_base), - GFP_KERNEL); - if (!eq_table->eqc_base) { - ret = -ENOMEM; - goto err_eqc_base_alloc_fail; - } - - for (i = 0; i < eq_num; i++) { - eq = &eq_table->eq[i]; - eq->hr_dev = hr_dev; - eq->eqn = i; - eq->irq = hr_dev->irq[i]; - eq->log_page_size = PAGE_SHIFT; - - if (i < hr_dev->caps.num_comp_vectors) { - /* CEQ */ - eq_table->eqc_base[i] = hr_dev->reg_base + - ROCEE_CAEP_CEQC_SHIFT_0_REG + - HNS_ROCE_CEQC_REG_OFFSET * i; - eq->type_flag = HNS_ROCE_CEQ; - eq->doorbell = hr_dev->reg_base + - ROCEE_CAEP_CEQC_CONS_IDX_0_REG + - HNS_ROCE_CEQC_REG_OFFSET * i; - eq->entries = hr_dev->caps.ceqe_depth[i]; - eq->log_entries = ilog2(eq->entries); - eq->eqe_size = sizeof(struct hns_roce_ceqe); - } else { - /* AEQ */ - eq_table->eqc_base[i] = hr_dev->reg_base + - ROCEE_CAEP_AEQC_AEQE_SHIFT_REG; - eq->type_flag = HNS_ROCE_AEQ; - eq->doorbell = hr_dev->reg_base + - ROCEE_CAEP_AEQE_CONS_IDX_REG; - eq->entries = hr_dev->caps.aeqe_depth; - eq->log_entries = ilog2(eq->entries); - eq->eqe_size = sizeof(struct hns_roce_aeqe); - } - } - - /* Disable irq */ - hns_roce_int_mask_en(hr_dev); - - /* Configure CE irq interval and burst num */ - hns_roce_ce_int_default_cfg(hr_dev); - - for (i = 0; i < eq_num; i++) { - ret = hns_roce_create_eq(hr_dev, &eq_table->eq[i]); - if (ret) { - dev_err(dev, "eq create failed\n"); - goto err_create_eq_fail; - } - } - - for (j = 0; j < eq_num; j++) { - ret = request_irq(eq_table->eq[j].irq, hns_roce_msi_x_interrupt, - 0, hr_dev->irq_names[j], eq_table->eq + j); - if (ret) { - dev_err(dev, "request irq error!\n"); - goto err_request_irq_fail; - } - } - - for (i = 0; i < eq_num; i++) - hns_roce_enable_eq(hr_dev, i, EQ_ENABLE); - - return 0; - -err_request_irq_fail: - for (j = j - 1; j >= 0; j--) - free_irq(eq_table->eq[j].irq, eq_table->eq + j); - -err_create_eq_fail: - for (i = i - 1; i >= 0; i--) - hns_roce_free_eq(hr_dev, &eq_table->eq[i]); - - kfree(eq_table->eqc_base); - -err_eqc_base_alloc_fail: - kfree(eq_table->eq); - - return ret; -} - -void hns_roce_cleanup_eq_table(struct hns_roce_dev *hr_dev) -{ - int i; - int eq_num; - struct hns_roce_eq_table *eq_table = &hr_dev->eq_table; - - eq_num = hr_dev->caps.num_comp_vectors + hr_dev->caps.num_aeq_vectors; - for (i = 0; i < eq_num; i++) { - /* Disable EQ */ - hns_roce_enable_eq(hr_dev, i, EQ_DISABLE); - - free_irq(eq_table->eq[i].irq, eq_table->eq + i); - - hns_roce_free_eq(hr_dev, &eq_table->eq[i]); - } - - kfree(eq_table->eqc_base); - kfree(eq_table->eq); -} diff --git a/drivers/infiniband/hw/hns/hns_roce_eq.h b/drivers/infiniband/hw/hns/hns_roce_eq.h deleted file mode 100644 index c6d212d12e03..000000000000 --- a/drivers/infiniband/hw/hns/hns_roce_eq.h +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (c) 2016 Hisilicon Limited. - * - * This software is available to you under a choice of one of two - * licenses. 
You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef _HNS_ROCE_EQ_H -#define _HNS_ROCE_EQ_H - -#define HNS_ROCE_CEQ 1 -#define HNS_ROCE_AEQ 2 - -#define HNS_ROCE_CEQ_ENTRY_SIZE 0x4 -#define HNS_ROCE_AEQ_ENTRY_SIZE 0x10 -#define HNS_ROCE_CEQC_REG_OFFSET 0x18 - -#define HNS_ROCE_CEQ_DEFAULT_INTERVAL 0x10 -#define HNS_ROCE_CEQ_DEFAULT_BURST_NUM 0x10 - -#define HNS_ROCE_INT_MASK_DISABLE 0 -#define HNS_ROCE_INT_MASK_ENABLE 1 - -#define EQ_ENABLE 1 -#define EQ_DISABLE 0 -#define CONS_INDEX_MASK 0xffff - -#define CEQ_REG_OFFSET 0x18 - -enum { - HNS_ROCE_EQ_STAT_INVALID = 0, - HNS_ROCE_EQ_STAT_VALID = 2, -}; - -struct hns_roce_aeqe { - u32 asyn; - union { - struct { - u32 qp; - u32 rsv0; - u32 rsv1; - } qp_event; - - struct { - u32 cq; - u32 rsv0; - u32 rsv1; - } cq_event; - - struct { - u32 port; - u32 rsv0; - u32 rsv1; - } port_event; - - struct { - u32 ceqe; - u32 rsv0; - u32 rsv1; - } ce_event; - - struct { - __le64 out_param; - __le16 token; - u8 status; - u8 rsv0; - } __packed cmd; - } event; -}; - -#define HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S 16 -#define HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M \ - (((1UL << 8) - 1) << HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S) - -#define HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S 24 -#define HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M \ - (((1UL << 7) - 1) << HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S) - -#define HNS_ROCE_AEQE_U32_4_OWNER_S 31 - -#define HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_S 0 -#define HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_M \ - (((1UL << 24) - 1) << HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_S) - -#define HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_S 25 -#define HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_M \ - (((1UL << 3) - 1) << HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_S) - -#define HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_S 0 -#define HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_M \ - (((1UL << 16) - 1) << HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_S) - -#define HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_S 0 -#define HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_M \ - (((1UL << 5) - 1) << HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_S) - -struct hns_roce_ceqe { - union { - int comp; - } ceqe; -}; - -#define HNS_ROCE_CEQE_CEQE_COMP_OWNER_S 0 - -#define HNS_ROCE_CEQE_CEQE_COMP_CQN_S 16 -#define HNS_ROCE_CEQE_CEQE_COMP_CQN_M \ - (((1UL << 16) - 1) << HNS_ROCE_CEQE_CEQE_COMP_CQN_S) - -#endif /* _HNS_ROCE_EQ_H */ diff --git 
a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index af27168faf0f..6100ace9f4b6 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -1492,9 +1493,9 @@ static int hns_roce_v1_profile(struct hns_roce_dev *hr_dev) caps->max_sq_inline = HNS_ROCE_V1_INLINE_SIZE; caps->num_uars = HNS_ROCE_V1_UAR_NUM; caps->phy_num_uars = HNS_ROCE_V1_PHY_UAR_NUM; - caps->num_aeq_vectors = HNS_ROCE_AEQE_VEC_NUM; - caps->num_comp_vectors = HNS_ROCE_COMP_VEC_NUM; - caps->num_other_vectors = HNS_ROCE_AEQE_OF_VEC_NUM; + caps->num_aeq_vectors = HNS_ROCE_V1_AEQE_VEC_NUM; + caps->num_comp_vectors = HNS_ROCE_V1_COMP_VEC_NUM; + caps->num_other_vectors = HNS_ROCE_V1_ABNORMAL_VEC_NUM; caps->num_mtpts = HNS_ROCE_V1_MAX_MTPT_NUM; caps->num_mtt_segs = HNS_ROCE_V1_MAX_MTT_SEGS; caps->num_pds = HNS_ROCE_V1_MAX_PD_NUM; @@ -1529,10 +1530,8 @@ static int hns_roce_v1_profile(struct hns_roce_dev *hr_dev) caps->num_ports + 1; } - for (i = 0; i < caps->num_comp_vectors; i++) - caps->ceqe_depth[i] = HNS_ROCE_V1_NUM_COMP_EQE; - - caps->aeqe_depth = HNS_ROCE_V1_NUM_ASYNC_EQE; + caps->ceqe_depth = HNS_ROCE_V1_COMP_EQE_NUM; + caps->aeqe_depth = HNS_ROCE_V1_ASYNC_EQE_NUM; caps->local_ca_ack_delay = le32_to_cpu(roce_read(hr_dev, ROCEE_ACK_DELAY_REG)); caps->max_mtu = IB_MTU_2048; @@ -3960,6 +3959,727 @@ static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq) return ret; } +static void set_eq_cons_index_v1(struct hns_roce_eq *eq, int req_not) +{ + roce_raw_write((eq->cons_index & HNS_ROCE_V1_CONS_IDX_M) | + (req_not << eq->log_entries), eq->doorbell); + /* Memory barrier */ + mb(); +} + +static void hns_roce_v1_wq_catas_err_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe, int qpn) +{ + struct device *dev = &hr_dev->pdev->dev; + + dev_warn(dev, "Local Work Queue Catastrophic Error.\n"); + switch (roce_get_field(aeqe->asyn, HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M, + HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S)) { + case HNS_ROCE_LWQCE_QPC_ERROR: + dev_warn(dev, "QP %d, QPC error.\n", qpn); + break; + case HNS_ROCE_LWQCE_MTU_ERROR: + dev_warn(dev, "QP %d, MTU error.\n", qpn); + break; + case HNS_ROCE_LWQCE_WQE_BA_ADDR_ERROR: + dev_warn(dev, "QP %d, WQE BA addr error.\n", qpn); + break; + case HNS_ROCE_LWQCE_WQE_ADDR_ERROR: + dev_warn(dev, "QP %d, WQE addr error.\n", qpn); + break; + case HNS_ROCE_LWQCE_SQ_WQE_SHIFT_ERROR: + dev_warn(dev, "QP %d, WQE shift error\n", qpn); + break; + case HNS_ROCE_LWQCE_SL_ERROR: + dev_warn(dev, "QP %d, SL error.\n", qpn); + break; + case HNS_ROCE_LWQCE_PORT_ERROR: + dev_warn(dev, "QP %d, port error.\n", qpn); + break; + default: + break; + } +} + +static void hns_roce_v1_local_wq_access_err_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe, + int qpn) +{ + struct device *dev = &hr_dev->pdev->dev; + + dev_warn(dev, "Local Access Violation Work Queue Error.\n"); + switch (roce_get_field(aeqe->asyn, HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M, + HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S)) { + case HNS_ROCE_LAVWQE_R_KEY_VIOLATION: + dev_warn(dev, "QP %d, R_key violation.\n", qpn); + break; + case HNS_ROCE_LAVWQE_LENGTH_ERROR: + dev_warn(dev, "QP %d, length error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_VA_ERROR: + dev_warn(dev, "QP %d, VA error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_PD_ERROR: + dev_err(dev, "QP %d, PD error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_RW_ACC_ERROR: + dev_warn(dev, "QP %d, rw acc error.\n", qpn); 
+ break; + case HNS_ROCE_LAVWQE_KEY_STATE_ERROR: + dev_warn(dev, "QP %d, key state error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_MR_OPERATION_ERROR: + dev_warn(dev, "QP %d, MR operation error.\n", qpn); + break; + default: + break; + } +} + +static void hns_roce_v1_qp_err_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe, + int event_type) +{ + struct device *dev = &hr_dev->pdev->dev; + int phy_port; + int qpn; + + qpn = roce_get_field(aeqe->event.qp_event.qp, + HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_M, + HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_S); + phy_port = roce_get_field(aeqe->event.qp_event.qp, + HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_M, + HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_S); + if (qpn <= 1) + qpn = HNS_ROCE_MAX_PORTS * qpn + phy_port; + + switch (event_type) { + case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: + dev_warn(dev, "Invalid Req Local Work Queue Error.\n" + "QP %d, phy_port %d.\n", qpn, phy_port); + break; + case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: + hns_roce_v1_wq_catas_err_handle(hr_dev, aeqe, qpn); + break; + case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: + hns_roce_v1_local_wq_access_err_handle(hr_dev, aeqe, qpn); + break; + default: + break; + } + + hns_roce_qp_event(hr_dev, qpn, event_type); +} + +static void hns_roce_v1_cq_err_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe, + int event_type) +{ + struct device *dev = &hr_dev->pdev->dev; + u32 cqn; + + cqn = le32_to_cpu(roce_get_field(aeqe->event.cq_event.cq, + HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_M, + HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_S)); + + switch (event_type) { + case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: + dev_warn(dev, "CQ 0x%x access err.\n", cqn); + break; + case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: + dev_warn(dev, "CQ 0x%x overflow\n", cqn); + break; + case HNS_ROCE_EVENT_TYPE_CQ_ID_INVALID: + dev_warn(dev, "CQ 0x%x ID invalid.\n", cqn); + break; + default: + break; + } + + hns_roce_cq_event(hr_dev, cqn, event_type); +} + +static void hns_roce_v1_db_overflow_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe) +{ + struct device *dev = &hr_dev->pdev->dev; + + switch (roce_get_field(aeqe->asyn, HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M, + HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S)) { + case HNS_ROCE_DB_SUBTYPE_SDB_OVF: + dev_warn(dev, "SDB overflow.\n"); + break; + case HNS_ROCE_DB_SUBTYPE_SDB_ALM_OVF: + dev_warn(dev, "SDB almost overflow.\n"); + break; + case HNS_ROCE_DB_SUBTYPE_SDB_ALM_EMP: + dev_warn(dev, "SDB almost empty.\n"); + break; + case HNS_ROCE_DB_SUBTYPE_ODB_OVF: + dev_warn(dev, "ODB overflow.\n"); + break; + case HNS_ROCE_DB_SUBTYPE_ODB_ALM_OVF: + dev_warn(dev, "ODB almost overflow.\n"); + break; + case HNS_ROCE_DB_SUBTYPE_ODB_ALM_EMP: + dev_warn(dev, "SDB almost empty.\n"); + break; + default: + break; + } +} + +static struct hns_roce_aeqe *get_aeqe_v1(struct hns_roce_eq *eq, u32 entry) +{ + unsigned long off = (entry & (eq->entries - 1)) * + HNS_ROCE_AEQ_ENTRY_SIZE; + + return (struct hns_roce_aeqe *)((u8 *) + (eq->buf_list[off / HNS_ROCE_BA_SIZE].buf) + + off % HNS_ROCE_BA_SIZE); +} + +static struct hns_roce_aeqe *next_aeqe_sw_v1(struct hns_roce_eq *eq) +{ + struct hns_roce_aeqe *aeqe = get_aeqe_v1(eq, eq->cons_index); + + return (roce_get_bit(aeqe->asyn, HNS_ROCE_AEQE_U32_4_OWNER_S) ^ + !!(eq->cons_index & eq->entries)) ? 
aeqe : NULL; +} + +static int hns_roce_v1_aeq_int(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_aeqe *aeqe; + int aeqes_found = 0; + int event_type; + + while ((aeqe = next_aeqe_sw_v1(eq))) { + dev_dbg(dev, "aeqe = %p, aeqe->asyn.event_type = 0x%lx\n", aeqe, + roce_get_field(aeqe->asyn, + HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M, + HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S)); + /* Memory barrier */ + rmb(); + + event_type = roce_get_field(aeqe->asyn, + HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M, + HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S); + switch (event_type) { + case HNS_ROCE_EVENT_TYPE_PATH_MIG: + dev_warn(dev, "PATH MIG not supported\n"); + break; + case HNS_ROCE_EVENT_TYPE_COMM_EST: + dev_warn(dev, "COMMUNICATION established\n"); + break; + case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: + dev_warn(dev, "SQ DRAINED not supported\n"); + break; + case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED: + dev_warn(dev, "PATH MIG failed\n"); + break; + case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: + case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: + case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: + hns_roce_v1_qp_err_handle(hr_dev, aeqe, event_type); + break; + case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH: + case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR: + case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH: + dev_warn(dev, "SRQ not support!\n"); + break; + case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: + case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: + case HNS_ROCE_EVENT_TYPE_CQ_ID_INVALID: + hns_roce_v1_cq_err_handle(hr_dev, aeqe, event_type); + break; + case HNS_ROCE_EVENT_TYPE_PORT_CHANGE: + dev_warn(dev, "port change.\n"); + break; + case HNS_ROCE_EVENT_TYPE_MB: + hns_roce_cmd_event(hr_dev, + le16_to_cpu(aeqe->event.cmd.token), + aeqe->event.cmd.status, + le64_to_cpu(aeqe->event.cmd.out_param + )); + break; + case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW: + hns_roce_v1_db_overflow_handle(hr_dev, aeqe); + break; + case HNS_ROCE_EVENT_TYPE_CEQ_OVERFLOW: + dev_warn(dev, "CEQ 0x%lx overflow.\n", + roce_get_field(aeqe->event.ce_event.ceqe, + HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_M, + HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_S)); + break; + default: + dev_warn(dev, "Unhandled event %d on EQ %d at idx %u.\n", + event_type, eq->eqn, eq->cons_index); + break; + } + + eq->cons_index++; + aeqes_found = 1; + + if (eq->cons_index > 2 * hr_dev->caps.aeqe_depth - 1) { + dev_warn(dev, "cons_index overflow, set back to 0.\n"); + eq->cons_index = 0; + } + } + + set_eq_cons_index_v1(eq, 0); + + return aeqes_found; +} + +static struct hns_roce_ceqe *get_ceqe_v1(struct hns_roce_eq *eq, u32 entry) +{ + unsigned long off = (entry & (eq->entries - 1)) * + HNS_ROCE_CEQ_ENTRY_SIZE; + + return (struct hns_roce_ceqe *)((u8 *) + (eq->buf_list[off / HNS_ROCE_BA_SIZE].buf) + + off % HNS_ROCE_BA_SIZE); +} + +static struct hns_roce_ceqe *next_ceqe_sw_v1(struct hns_roce_eq *eq) +{ + struct hns_roce_ceqe *ceqe = get_ceqe_v1(eq, eq->cons_index); + + return (!!(roce_get_bit(ceqe->comp, + HNS_ROCE_CEQE_CEQE_COMP_OWNER_S))) ^ + (!!(eq->cons_index & eq->entries)) ? 
ceqe : NULL; +} + +static int hns_roce_v1_ceq_int(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + struct hns_roce_ceqe *ceqe; + int ceqes_found = 0; + u32 cqn; + + while ((ceqe = next_ceqe_sw_v1(eq))) { + /* Memory barrier */ + rmb(); + cqn = roce_get_field(ceqe->comp, + HNS_ROCE_CEQE_CEQE_COMP_CQN_M, + HNS_ROCE_CEQE_CEQE_COMP_CQN_S); + hns_roce_cq_completion(hr_dev, cqn); + + ++eq->cons_index; + ceqes_found = 1; + + if (eq->cons_index > 2 * hr_dev->caps.ceqe_depth - 1) { + dev_warn(&eq->hr_dev->pdev->dev, + "cons_index overflow, set back to 0.\n"); + eq->cons_index = 0; + } + } + + set_eq_cons_index_v1(eq, 0); + + return ceqes_found; +} + +static irqreturn_t hns_roce_v1_msix_interrupt_eq(int irq, void *eq_ptr) +{ + struct hns_roce_eq *eq = eq_ptr; + struct hns_roce_dev *hr_dev = eq->hr_dev; + int int_work = 0; + + if (eq->type_flag == HNS_ROCE_CEQ) + /* CEQ irq routine, CEQ is pulse irq, not clear */ + int_work = hns_roce_v1_ceq_int(hr_dev, eq); + else + /* AEQ irq routine, AEQ is pulse irq, not clear */ + int_work = hns_roce_v1_aeq_int(hr_dev, eq); + + return IRQ_RETVAL(int_work); +} + +static irqreturn_t hns_roce_v1_msix_interrupt_abn(int irq, void *dev_id) +{ + struct hns_roce_dev *hr_dev = dev_id; + struct device *dev = &hr_dev->pdev->dev; + int int_work = 0; + u32 caepaemask_val; + u32 cealmovf_val; + u32 caepaest_val; + u32 aeshift_val; + u32 ceshift_val; + u32 cemask_val; + int i; + + /* + * Abnormal interrupt: + * AEQ overflow, ECC multi-bit err, CEQ overflow must clear + * interrupt, mask irq, clear irq, cancel mask operation + */ + aeshift_val = roce_read(hr_dev, ROCEE_CAEP_AEQC_AEQE_SHIFT_REG); + + /* AEQE overflow */ + if (roce_get_bit(aeshift_val, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQ_ALM_OVF_INT_ST_S) == 1) { + dev_warn(dev, "AEQ overflow!\n"); + + /* Set mask */ + caepaemask_val = roce_read(hr_dev, ROCEE_CAEP_AE_MASK_REG); + roce_set_bit(caepaemask_val, + ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S, + HNS_ROCE_INT_MASK_ENABLE); + roce_write(hr_dev, ROCEE_CAEP_AE_MASK_REG, caepaemask_val); + + /* Clear int state(INT_WC : write 1 clear) */ + caepaest_val = roce_read(hr_dev, ROCEE_CAEP_AE_ST_REG); + roce_set_bit(caepaest_val, + ROCEE_CAEP_AE_ST_CAEP_AEQ_ALM_OVF_S, 1); + roce_write(hr_dev, ROCEE_CAEP_AE_ST_REG, caepaest_val); + + /* Clear mask */ + caepaemask_val = roce_read(hr_dev, ROCEE_CAEP_AE_MASK_REG); + roce_set_bit(caepaemask_val, + ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S, + HNS_ROCE_INT_MASK_DISABLE); + roce_write(hr_dev, ROCEE_CAEP_AE_MASK_REG, caepaemask_val); + } + + /* CEQ almost overflow */ + for (i = 0; i < hr_dev->caps.num_comp_vectors; i++) { + ceshift_val = roce_read(hr_dev, ROCEE_CAEP_CEQC_SHIFT_0_REG + + i * CEQ_REG_OFFSET); + + if (roce_get_bit(ceshift_val, + ROCEE_CAEP_CEQC_SHIFT_CAEP_CEQ_ALM_OVF_INT_ST_S) == 1) { + dev_warn(dev, "CEQ[%d] almost overflow!\n", i); + int_work++; + + /* Set mask */ + cemask_val = roce_read(hr_dev, + ROCEE_CAEP_CE_IRQ_MASK_0_REG + + i * CEQ_REG_OFFSET); + roce_set_bit(cemask_val, + ROCEE_CAEP_CE_IRQ_MASK_CAEP_CEQ_ALM_OVF_MASK_S, + HNS_ROCE_INT_MASK_ENABLE); + roce_write(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG + + i * CEQ_REG_OFFSET, cemask_val); + + /* Clear int state(INT_WC : write 1 clear) */ + cealmovf_val = roce_read(hr_dev, + ROCEE_CAEP_CEQ_ALM_OVF_0_REG + + i * CEQ_REG_OFFSET); + roce_set_bit(cealmovf_val, + ROCEE_CAEP_CEQ_ALM_OVF_CAEP_CEQ_ALM_OVF_S, + 1); + roce_write(hr_dev, ROCEE_CAEP_CEQ_ALM_OVF_0_REG + + i * CEQ_REG_OFFSET, cealmovf_val); + + /* Clear mask */ + cemask_val = roce_read(hr_dev, + 
ROCEE_CAEP_CE_IRQ_MASK_0_REG + + i * CEQ_REG_OFFSET); + roce_set_bit(cemask_val, + ROCEE_CAEP_CE_IRQ_MASK_CAEP_CEQ_ALM_OVF_MASK_S, + HNS_ROCE_INT_MASK_DISABLE); + roce_write(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG + + i * CEQ_REG_OFFSET, cemask_val); + } + } + + /* ECC multi-bit error alarm */ + dev_warn(dev, "ECC UCERR ALARM: 0x%x, 0x%x, 0x%x\n", + roce_read(hr_dev, ROCEE_ECC_UCERR_ALM0_REG), + roce_read(hr_dev, ROCEE_ECC_UCERR_ALM1_REG), + roce_read(hr_dev, ROCEE_ECC_UCERR_ALM2_REG)); + + dev_warn(dev, "ECC CERR ALARM: 0x%x, 0x%x, 0x%x\n", + roce_read(hr_dev, ROCEE_ECC_CERR_ALM0_REG), + roce_read(hr_dev, ROCEE_ECC_CERR_ALM1_REG), + roce_read(hr_dev, ROCEE_ECC_CERR_ALM2_REG)); + + return IRQ_RETVAL(int_work); +} + +static void hns_roce_v1_int_mask_enable(struct hns_roce_dev *hr_dev) +{ + u32 aemask_val; + int masken = 0; + int i; + + /* AEQ INT */ + aemask_val = roce_read(hr_dev, ROCEE_CAEP_AE_MASK_REG); + roce_set_bit(aemask_val, ROCEE_CAEP_AE_MASK_CAEP_AEQ_ALM_OVF_MASK_S, + masken); + roce_set_bit(aemask_val, ROCEE_CAEP_AE_MASK_CAEP_AE_IRQ_MASK_S, masken); + roce_write(hr_dev, ROCEE_CAEP_AE_MASK_REG, aemask_val); + + /* CEQ INT */ + for (i = 0; i < hr_dev->caps.num_comp_vectors; i++) { + /* IRQ mask */ + roce_write(hr_dev, ROCEE_CAEP_CE_IRQ_MASK_0_REG + + i * CEQ_REG_OFFSET, masken); + } +} + +static void hns_roce_v1_free_eq(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + int npages = (PAGE_ALIGN(eq->eqe_size * eq->entries) + + HNS_ROCE_BA_SIZE - 1) / HNS_ROCE_BA_SIZE; + int i; + + if (!eq->buf_list) + return; + + for (i = 0; i < npages; ++i) + dma_free_coherent(&hr_dev->pdev->dev, HNS_ROCE_BA_SIZE, + eq->buf_list[i].buf, eq->buf_list[i].map); + + kfree(eq->buf_list); +} + +static void hns_roce_v1_enable_eq(struct hns_roce_dev *hr_dev, int eq_num, + int enable_flag) +{ + void __iomem *eqc = hr_dev->eq_table.eqc_base[eq_num]; + u32 val; + + val = readl(eqc); + + if (enable_flag) + roce_set_field(val, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S, + HNS_ROCE_EQ_STAT_VALID); + else + roce_set_field(val, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S, + HNS_ROCE_EQ_STAT_INVALID); + writel(val, eqc); +} + +static int hns_roce_v1_create_eq(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + void __iomem *eqc = hr_dev->eq_table.eqc_base[eq->eqn]; + struct device *dev = &hr_dev->pdev->dev; + dma_addr_t tmp_dma_addr; + u32 eqconsindx_val = 0; + u32 eqcuridx_val = 0; + u32 eqshift_val = 0; + int num_bas; + int ret; + int i; + + num_bas = (PAGE_ALIGN(eq->entries * eq->eqe_size) + + HNS_ROCE_BA_SIZE - 1) / HNS_ROCE_BA_SIZE; + + if ((eq->entries * eq->eqe_size) > HNS_ROCE_BA_SIZE) { + dev_err(dev, "[error]eq buf %d gt ba size(%d) need bas=%d\n", + (eq->entries * eq->eqe_size), HNS_ROCE_BA_SIZE, + num_bas); + return -EINVAL; + } + + eq->buf_list = kcalloc(num_bas, sizeof(*eq->buf_list), GFP_KERNEL); + if (!eq->buf_list) + return -ENOMEM; + + for (i = 0; i < num_bas; ++i) { + eq->buf_list[i].buf = dma_alloc_coherent(dev, HNS_ROCE_BA_SIZE, + &tmp_dma_addr, + GFP_KERNEL); + if (!eq->buf_list[i].buf) { + ret = -ENOMEM; + goto err_out_free_pages; + } + + eq->buf_list[i].map = tmp_dma_addr; + memset(eq->buf_list[i].buf, 0, HNS_ROCE_BA_SIZE); + } + eq->cons_index = 0; + roce_set_field(eqshift_val, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S, + HNS_ROCE_EQ_STAT_INVALID); + roce_set_field(eqshift_val, + 
ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_AEQE_SHIFT_M, + ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_AEQE_SHIFT_S, + eq->log_entries); + writel(eqshift_val, eqc); + + /* Configure eq extended address 12~44bit */ + writel((u32)(eq->buf_list[0].map >> 12), eqc + 4); + + /* + * Configure eq extended address 45~49 bit. + * 44 = 32 + 12, When evaluating addr to hardware, shift 12 because of + * using 4K page, and shift more 32 because of + * caculating the high 32 bit value evaluated to hardware. + */ + roce_set_field(eqcuridx_val, ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_M, + ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_S, + eq->buf_list[0].map >> 44); + roce_set_field(eqcuridx_val, + ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQE_CUR_IDX_M, + ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQE_CUR_IDX_S, 0); + writel(eqcuridx_val, eqc + 8); + + /* Configure eq consumer index */ + roce_set_field(eqconsindx_val, + ROCEE_CAEP_AEQE_CONS_IDX_CAEP_AEQE_CONS_IDX_M, + ROCEE_CAEP_AEQE_CONS_IDX_CAEP_AEQE_CONS_IDX_S, 0); + writel(eqconsindx_val, eqc + 0xc); + + return 0; + +err_out_free_pages: + for (i -= 1; i >= 0; i--) + dma_free_coherent(dev, HNS_ROCE_BA_SIZE, eq->buf_list[i].buf, + eq->buf_list[i].map); + + kfree(eq->buf_list); + return ret; +} + +static int hns_roce_v1_init_eq_table(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_eq_table *eq_table = &hr_dev->eq_table; + struct device *dev = &hr_dev->pdev->dev; + struct hns_roce_eq *eq; + int irq_num; + int eq_num; + int ret; + int i, j; + + eq_num = hr_dev->caps.num_comp_vectors + hr_dev->caps.num_aeq_vectors; + irq_num = eq_num + hr_dev->caps.num_other_vectors; + + eq_table->eq = kcalloc(eq_num, sizeof(*eq_table->eq), GFP_KERNEL); + if (!eq_table->eq) + return -ENOMEM; + + eq_table->eqc_base = kcalloc(eq_num, sizeof(*eq_table->eqc_base), + GFP_KERNEL); + if (!eq_table->eqc_base) { + ret = -ENOMEM; + goto err_eqc_base_alloc_fail; + } + + for (i = 0; i < eq_num; i++) { + eq = &eq_table->eq[i]; + eq->hr_dev = hr_dev; + eq->eqn = i; + eq->irq = hr_dev->irq[i]; + eq->log_page_size = PAGE_SHIFT; + + if (i < hr_dev->caps.num_comp_vectors) { + /* CEQ */ + eq_table->eqc_base[i] = hr_dev->reg_base + + ROCEE_CAEP_CEQC_SHIFT_0_REG + + CEQ_REG_OFFSET * i; + eq->type_flag = HNS_ROCE_CEQ; + eq->doorbell = hr_dev->reg_base + + ROCEE_CAEP_CEQC_CONS_IDX_0_REG + + CEQ_REG_OFFSET * i; + eq->entries = hr_dev->caps.ceqe_depth; + eq->log_entries = ilog2(eq->entries); + eq->eqe_size = HNS_ROCE_CEQ_ENTRY_SIZE; + } else { + /* AEQ */ + eq_table->eqc_base[i] = hr_dev->reg_base + + ROCEE_CAEP_AEQC_AEQE_SHIFT_REG; + eq->type_flag = HNS_ROCE_AEQ; + eq->doorbell = hr_dev->reg_base + + ROCEE_CAEP_AEQE_CONS_IDX_REG; + eq->entries = hr_dev->caps.aeqe_depth; + eq->log_entries = ilog2(eq->entries); + eq->eqe_size = HNS_ROCE_AEQ_ENTRY_SIZE; + } + } + + /* Disable irq */ + hns_roce_v1_int_mask_enable(hr_dev); + + /* Configure ce int interval */ + roce_write(hr_dev, ROCEE_CAEP_CE_INTERVAL_CFG_REG, + HNS_ROCE_CEQ_DEFAULT_INTERVAL); + + /* Configure ce int burst num */ + roce_write(hr_dev, ROCEE_CAEP_CE_BURST_NUM_CFG_REG, + HNS_ROCE_CEQ_DEFAULT_BURST_NUM); + + for (i = 0; i < eq_num; i++) { + ret = hns_roce_v1_create_eq(hr_dev, &eq_table->eq[i]); + if (ret) { + dev_err(dev, "eq create failed\n"); + goto err_create_eq_fail; + } + } + + for (j = 0; j < irq_num; j++) { + if (j < eq_num) + ret = request_irq(hr_dev->irq[j], + hns_roce_v1_msix_interrupt_eq, 0, + hr_dev->irq_names[j], + &eq_table->eq[j]); + else + ret = request_irq(hr_dev->irq[j], + hns_roce_v1_msix_interrupt_abn, 0, + hr_dev->irq_names[j], hr_dev); + + if (ret) { + 
dev_err(dev, "request irq error!\n"); + goto err_request_irq_fail; + } + } + + for (i = 0; i < eq_num; i++) + hns_roce_v1_enable_eq(hr_dev, i, EQ_ENABLE); + + return 0; + +err_request_irq_fail: + for (j -= 1; j >= 0; j--) + free_irq(hr_dev->irq[j], &eq_table->eq[j]); + +err_create_eq_fail: + for (i -= 1; i >= 0; i--) + hns_roce_v1_free_eq(hr_dev, &eq_table->eq[i]); + + kfree(eq_table->eqc_base); + +err_eqc_base_alloc_fail: + kfree(eq_table->eq); + + return ret; +} + +static void hns_roce_v1_cleanup_eq_table(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_eq_table *eq_table = &hr_dev->eq_table; + int irq_num; + int eq_num; + int i; + + eq_num = hr_dev->caps.num_comp_vectors + hr_dev->caps.num_aeq_vectors; + irq_num = eq_num + hr_dev->caps.num_other_vectors; + for (i = 0; i < eq_num; i++) { + /* Disable EQ */ + hns_roce_v1_enable_eq(hr_dev, i, EQ_DISABLE); + + free_irq(hr_dev->irq[i], &eq_table->eq[i]); + + hns_roce_v1_free_eq(hr_dev, &eq_table->eq[i]); + } + for (i = eq_num; i < irq_num; i++) + free_irq(hr_dev->irq[i], hr_dev); + + kfree(eq_table->eqc_base); + kfree(eq_table->eq); +} + static const struct hns_roce_hw hns_roce_hw_v1 = { .reset = hns_roce_v1_reset, .hw_profile = hns_roce_v1_profile, @@ -3983,6 +4703,8 @@ static const struct hns_roce_hw hns_roce_hw_v1 = { .poll_cq = hns_roce_v1_poll_cq, .dereg_mr = hns_roce_v1_dereg_mr, .destroy_cq = hns_roce_v1_destroy_cq, + .init_eq = hns_roce_v1_init_eq_table, + .cleanup_eq = hns_roce_v1_cleanup_eq_table, }; static const struct of_device_id hns_roce_of_match[] = { @@ -4132,14 +4854,14 @@ static int hns_roce_get_cfg(struct hns_roce_dev *hr_dev) /* read the interrupt names from the DT or ACPI */ ret = device_property_read_string_array(dev, "interrupt-names", hr_dev->irq_names, - HNS_ROCE_MAX_IRQ_NUM); + HNS_ROCE_V1_MAX_IRQ_NUM); if (ret < 0) { dev_err(dev, "couldn't get interrupt names from DT or ACPI!\n"); return ret; } /* fetch the interrupt numbers */ - for (i = 0; i < HNS_ROCE_MAX_IRQ_NUM; i++) { + for (i = 0; i < HNS_ROCE_V1_MAX_IRQ_NUM; i++) { hr_dev->irq[i] = platform_get_irq(hr_dev->pdev, i); if (hr_dev->irq[i] <= 0) { dev_err(dev, "platform get of irq[=%d] failed!\n", i); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.h b/drivers/infiniband/hw/hns/hns_roce_hw_v1.h index 21a07ef0afc9..b44ddd239060 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.h @@ -60,8 +60,13 @@ #define HNS_ROCE_V1_GID_NUM 16 #define HNS_ROCE_V1_RESV_QP 8 -#define HNS_ROCE_V1_NUM_COMP_EQE 0x8000 -#define HNS_ROCE_V1_NUM_ASYNC_EQE 0x400 +#define HNS_ROCE_V1_MAX_IRQ_NUM 34 +#define HNS_ROCE_V1_COMP_VEC_NUM 32 +#define HNS_ROCE_V1_AEQE_VEC_NUM 1 +#define HNS_ROCE_V1_ABNORMAL_VEC_NUM 1 + +#define HNS_ROCE_V1_COMP_EQE_NUM 0x8000 +#define HNS_ROCE_V1_ASYNC_EQE_NUM 0x400 #define HNS_ROCE_V1_QPC_ENTRY_SIZE 256 #define HNS_ROCE_V1_IRRL_ENTRY_SIZE 8 @@ -159,6 +164,41 @@ #define SDB_INV_CNT_OFFSET 8 #define SDB_ST_CMP_VAL 8 +#define HNS_ROCE_CEQ_DEFAULT_INTERVAL 0x10 +#define HNS_ROCE_CEQ_DEFAULT_BURST_NUM 0x10 + +#define HNS_ROCE_INT_MASK_DISABLE 0 +#define HNS_ROCE_INT_MASK_ENABLE 1 + +#define CEQ_REG_OFFSET 0x18 + +#define HNS_ROCE_CEQE_CEQE_COMP_OWNER_S 0 + +#define HNS_ROCE_V1_CONS_IDX_M GENMASK(15, 0) + +#define HNS_ROCE_CEQE_CEQE_COMP_CQN_S 16 +#define HNS_ROCE_CEQE_CEQE_COMP_CQN_M GENMASK(31, 16) + +#define HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S 16 +#define HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M GENMASK(23, 16) + +#define HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_S 24 +#define HNS_ROCE_AEQE_U32_4_EVENT_SUB_TYPE_M 
GENMASK(30, 24) + +#define HNS_ROCE_AEQE_U32_4_OWNER_S 31 + +#define HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_S 0 +#define HNS_ROCE_AEQE_EVENT_QP_EVENT_QP_QPN_M GENMASK(23, 0) + +#define HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_S 25 +#define HNS_ROCE_AEQE_EVENT_QP_EVENT_PORT_NUM_M GENMASK(27, 25) + +#define HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_S 0 +#define HNS_ROCE_AEQE_EVENT_CQ_EVENT_CQ_CQN_M GENMASK(15, 0) + +#define HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_S 0 +#define HNS_ROCE_AEQE_EVENT_CE_EVENT_CEQE_CEQN_M GENMASK(4, 0) + struct hns_roce_cq_context { u32 cqc_byte_4; u32 cq_bt_l; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index cf02ac2d3596..aa0c242ddc50 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -748,12 +748,10 @@ int hns_roce_init(struct hns_roce_dev *hr_dev) goto error_failed_cmd_init; } - if (hr_dev->cmd_mod) { - ret = hns_roce_init_eq_table(hr_dev); - if (ret) { - dev_err(dev, "eq init failed!\n"); - goto error_failed_eq_table; - } + ret = hr_dev->hw->init_eq(hr_dev); + if (ret) { + dev_err(dev, "eq init failed!\n"); + goto error_failed_eq_table; } if (hr_dev->cmd_mod) { @@ -805,8 +803,7 @@ error_failed_init_hem: hns_roce_cmd_use_polling(hr_dev); error_failed_use_event: - if (hr_dev->cmd_mod) - hns_roce_cleanup_eq_table(hr_dev); + hr_dev->hw->cleanup_eq(hr_dev); error_failed_eq_table: hns_roce_cmd_cleanup(hr_dev); @@ -837,8 +834,7 @@ void hns_roce_exit(struct hns_roce_dev *hr_dev) if (hr_dev->cmd_mod) hns_roce_cmd_use_polling(hr_dev); - if (hr_dev->cmd_mod) - hns_roce_cleanup_eq_table(hr_dev); + hr_dev->hw->cleanup_eq(hr_dev); hns_roce_cmd_cleanup(hr_dev); if (hr_dev->hw->cmq_exit) hr_dev->hw->cmq_exit(hr_dev); diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 49586ec8126a..69e25847ff1b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -65,6 +65,7 @@ void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type) if (atomic_dec_and_test(&qp->refcount)) complete(&qp->free); } +EXPORT_SYMBOL_GPL(hns_roce_qp_event); static void hns_roce_ib_qp_event(struct hns_roce_qp *hr_qp, enum hns_roce_event type) -- cgit v1.2.3 From a5073d6054f75d7c94b3354206eec4b804d2fbd4 Mon Sep 17 00:00:00 2001 From: Yixian Liu Date: Tue, 14 Nov 2017 17:26:17 +0800 Subject: RDMA/hns: Add eq support of hip08 This patch adds eq support for hip08. The eq table can be multi-hop addressed. 
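Under multi-hop addressing the EQ buffer is not one contiguous region but a set of fixed-size chunks reachable through one or two levels of base-address tables, so locating an EQE reduces to chunk arithmetic once the per-chunk kernel virtual addresses are known. A minimal sketch of that lookup, assuming kernel integer types and a pre-built chunk table (the helper name is illustrative; the driver's real lookups are the mhop_get_aeqe() and mhop_get_ceqe() hunks below):

/*
 * Sketch only: find the EQE for a given consumer index once the
 * chunk table (eq->buf[] in the driver) holds one kernel virtual
 * address per buffer chunk.
 */
static void *eqe_lookup(void **chunks, u32 entries, u32 eqe_size,
			u32 buf_chk_sz, u32 cons_index)
{
	/* entries is a power of two, so masking wraps the ring index */
	unsigned long off = (cons_index & (entries - 1)) * eqe_size;

	/* select the chunk, then the offset inside that chunk */
	return (u8 *)chunks[off / buf_chk_sz] + (off % buf_chk_sz);
}

With hop_num == 0 the queue fits in a single allocation and only the intra-chunk offset term survives.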
Signed-off-by: Yixian Liu Reviewed-by: Lijun Ou Reviewed-by: Wei Hu (Xavier) Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_cmd.h | 10 + drivers/infiniband/hw/hns/hns_roce_common.h | 11 + drivers/infiniband/hw/hns/hns_roce_device.h | 26 +- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 1177 ++++++++++++++++++++++++++- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 192 ++++- 5 files changed, 1408 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.h b/drivers/infiniband/hw/hns/hns_roce_cmd.h index b1c94223c28b..9549ae51a0dd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_cmd.h +++ b/drivers/infiniband/hw/hns/hns_roce_cmd.h @@ -88,6 +88,16 @@ enum { HNS_ROCE_CMD_DESTROY_SRQC_BT0 = 0x38, HNS_ROCE_CMD_DESTROY_SRQC_BT1 = 0x39, HNS_ROCE_CMD_DESTROY_SRQC_BT2 = 0x3a, + + /* EQC commands */ + HNS_ROCE_CMD_CREATE_AEQC = 0x80, + HNS_ROCE_CMD_MODIFY_AEQC = 0x81, + HNS_ROCE_CMD_QUERY_AEQC = 0x82, + HNS_ROCE_CMD_DESTROY_AEQC = 0x83, + HNS_ROCE_CMD_CREATE_CEQC = 0x90, + HNS_ROCE_CMD_MODIFY_CEQC = 0x91, + HNS_ROCE_CMD_QUERY_CEQC = 0x92, + HNS_ROCE_CMD_DESTROY_CEQC = 0x93, }; enum { diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h b/drivers/infiniband/hw/hns/hns_roce_common.h index 7ecb7a4147a8..dd67fafd0c40 100644 --- a/drivers/infiniband/hw/hns/hns_roce_common.h +++ b/drivers/infiniband/hw/hns/hns_roce_common.h @@ -376,6 +376,12 @@ #define ROCEE_RX_CMQ_TAIL_REG 0x07024 #define ROCEE_RX_CMQ_HEAD_REG 0x07028 +#define ROCEE_VF_MB_CFG0_REG 0x40 +#define ROCEE_VF_MB_STATUS_REG 0x58 + +#define ROCEE_VF_EQ_DB_CFG0_REG 0x238 +#define ROCEE_VF_EQ_DB_CFG1_REG 0x23C + #define ROCEE_VF_SMAC_CFG0_REG 0x12000 #define ROCEE_VF_SMAC_CFG1_REG 0x12004 @@ -385,4 +391,9 @@ #define ROCEE_VF_SGID_CFG3_REG 0x1000c #define ROCEE_VF_SGID_CFG4_REG 0x10010 +#define ROCEE_VF_ABN_INT_CFG_REG 0x13000 +#define ROCEE_VF_ABN_INT_ST_REG 0x13004 +#define ROCEE_VF_ABN_INT_EN_REG 0x13008 +#define ROCEE_VF_EVENT_INT_EN_REG 0x1300c + #endif /* _HNS_ROCE_COMMON_H */ diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 9aa9e94ef39a..dde51787cbfd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -134,6 +134,7 @@ enum hns_roce_event { HNS_ROCE_EVENT_TYPE_DB_OVERFLOW = 0x12, HNS_ROCE_EVENT_TYPE_MB = 0x13, HNS_ROCE_EVENT_TYPE_CEQ_OVERFLOW = 0x14, + HNS_ROCE_EVENT_TYPE_FLR = 0x15, }; /* Local Work Queue Catastrophic Error,SUBTYPE 0x5 */ @@ -541,6 +542,26 @@ struct hns_roce_eq { int log_page_size; int cons_index; struct hns_roce_buf_list *buf_list; + int over_ignore; + int coalesce; + int arm_st; + u64 eqe_ba; + int eqe_ba_pg_sz; + int eqe_buf_pg_sz; + int hop_num; + u64 *bt_l0; /* Base address table for L0 */ + u64 **bt_l1; /* Base address table for L1 */ + u64 **buf; + dma_addr_t l0_dma; + dma_addr_t *l1_dma; + dma_addr_t *buf_dma; + u32 l0_last_num; /* L0 last chunk num */ + u32 l1_last_num; /* L1 last chunk num */ + int eq_max_cnt; + int eq_period; + int shift; + dma_addr_t cur_eqe_ba; + dma_addr_t nxt_eqe_ba; }; struct hns_roce_eq_table { @@ -571,7 +592,7 @@ struct hns_roce_caps { u32 min_wqes; int reserved_cqs; int num_aeq_vectors; /* 1 */ - int num_comp_vectors; /* 32 ceq */ + int num_comp_vectors; int num_other_vectors; int num_mtpts; u32 num_mtt_segs; @@ -617,6 +638,9 @@ struct hns_roce_caps { u32 cqe_ba_pg_sz; u32 cqe_buf_pg_sz; u32 cqe_hop_num; + u32 eqe_ba_pg_sz; + u32 eqe_buf_pg_sz; + u32 eqe_hop_num; u32 chunk_sz; /* chunk size in non multihop mode*/ u64 flags; }; 
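The hns_roce_hw_v2.c changes that follow poll these rings with an owner-bit handshake rather than a hardware-maintained head pointer: hardware toggles an owner bit in every EQE it writes, and software consumes an entry only while that bit disagrees with the parity of its own lap around the ring. Because cons_index runs from 0 to 2 * entries - 1 before wrapping, (cons_index & entries) flips exactly once per lap. A hedged sketch of the test (hypothetical helper; the driver writes it inline in next_aeqe_sw_v2() and next_ceqe_sw_v2()):

/*
 * Sketch: an EQE belongs to software when its owner bit differs
 * from the parity of the consumer's current lap. entries is a
 * power of two and cons_index wraps at 2 * entries.
 */
static bool eqe_ready(u32 owner_bit, u32 cons_index, u32 entries)
{
	return owner_bit != !!(cons_index & entries);
}

The driver expresses the same inequality as an XOR of the two one-bit values.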
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 8f719c00467b..04281d0c0b42 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -908,9 +908,9 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) caps->max_sq_inline = HNS_ROCE_V2_MAX_SQ_INLINE; caps->num_uars = HNS_ROCE_V2_UAR_NUM; caps->phy_num_uars = HNS_ROCE_V2_PHY_UAR_NUM; - caps->num_aeq_vectors = 1; - caps->num_comp_vectors = 63; - caps->num_other_vectors = 0; + caps->num_aeq_vectors = HNS_ROCE_V2_AEQE_VEC_NUM; + caps->num_comp_vectors = HNS_ROCE_V2_COMP_VEC_NUM; + caps->num_other_vectors = HNS_ROCE_V2_ABNORMAL_VEC_NUM; caps->num_mtpts = HNS_ROCE_V2_MAX_MTPT_NUM; caps->num_mtt_segs = HNS_ROCE_V2_MAX_MTT_SEGS; caps->num_cqe_segs = HNS_ROCE_V2_MAX_CQE_SEGS; @@ -955,12 +955,17 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) caps->cqe_ba_pg_sz = 0; caps->cqe_buf_pg_sz = 0; caps->cqe_hop_num = HNS_ROCE_CQE_HOP_NUM; + caps->eqe_ba_pg_sz = 0; + caps->eqe_buf_pg_sz = 0; + caps->eqe_hop_num = HNS_ROCE_EQE_HOP_NUM; caps->chunk_sz = HNS_ROCE_V2_TABLE_CHUNK_SIZE; caps->flags = HNS_ROCE_CAP_FLAG_REREG_MR | HNS_ROCE_CAP_FLAG_ROCE_V1_V2; caps->pkey_table_len[0] = 1; caps->gid_table_len[0] = HNS_ROCE_V2_GID_INDEX_NUM; + caps->ceqe_depth = HNS_ROCE_V2_COMP_EQE_NUM; + caps->aeqe_depth = HNS_ROCE_V2_ASYNC_EQE_NUM; caps->local_ca_ack_delay = 0; caps->max_mtu = IB_MTU_4096; @@ -1374,6 +1379,8 @@ static void hns_roce_v2_write_cqc(struct hns_roce_dev *hr_dev, roce_set_field(cq_context->byte_4_pg_ceqn, V2_CQC_BYTE_4_CQ_ST_M, V2_CQC_BYTE_4_CQ_ST_S, V2_CQ_STATE_VALID); + roce_set_field(cq_context->byte_4_pg_ceqn, V2_CQC_BYTE_4_ARM_ST_M, + V2_CQC_BYTE_4_ARM_ST_S, REG_NXT_CEQE); roce_set_field(cq_context->byte_4_pg_ceqn, V2_CQC_BYTE_4_SHIFT_M, V2_CQC_BYTE_4_SHIFT_S, ilog2((unsigned int)nent)); roce_set_field(cq_context->byte_4_pg_ceqn, V2_CQC_BYTE_4_CEQN_M, @@ -1414,6 +1421,15 @@ static void hns_roce_v2_write_cqc(struct hns_roce_dev *hr_dev, roce_set_field(cq_context->byte_40_cqe_ba, V2_CQC_BYTE_40_CQE_BA_M, V2_CQC_BYTE_40_CQE_BA_S, (dma_handle >> (32 + 3))); + + roce_set_field(cq_context->byte_56_cqe_period_maxcnt, + V2_CQC_BYTE_56_CQ_MAX_CNT_M, + V2_CQC_BYTE_56_CQ_MAX_CNT_S, + HNS_ROCE_V2_CQ_DEFAULT_BURST_NUM); + roce_set_field(cq_context->byte_56_cqe_period_maxcnt, + V2_CQC_BYTE_56_CQ_PERIOD_M, + V2_CQC_BYTE_56_CQ_PERIOD_S, + HNS_ROCE_V2_CQ_DEFAULT_INTERVAL); } static int hns_roce_v2_req_notify_cq(struct ib_cq *ibcq, @@ -3154,6 +3170,1152 @@ static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period) return ret; } +static void set_eq_cons_index_v2(struct hns_roce_eq *eq) +{ + u32 doorbell[2]; + + doorbell[0] = 0; + doorbell[1] = 0; + + if (eq->type_flag == HNS_ROCE_AEQ) { + roce_set_field(doorbell[0], HNS_ROCE_V2_EQ_DB_CMD_M, + HNS_ROCE_V2_EQ_DB_CMD_S, + eq->arm_st == HNS_ROCE_V2_EQ_ALWAYS_ARMED ? + HNS_ROCE_EQ_DB_CMD_AEQ : + HNS_ROCE_EQ_DB_CMD_AEQ_ARMED); + } else { + roce_set_field(doorbell[0], HNS_ROCE_V2_EQ_DB_TAG_M, + HNS_ROCE_V2_EQ_DB_TAG_S, eq->eqn); + + roce_set_field(doorbell[0], HNS_ROCE_V2_EQ_DB_CMD_M, + HNS_ROCE_V2_EQ_DB_CMD_S, + eq->arm_st == HNS_ROCE_V2_EQ_ALWAYS_ARMED ? 
+ HNS_ROCE_EQ_DB_CMD_CEQ : + HNS_ROCE_EQ_DB_CMD_CEQ_ARMED); + } + + roce_set_field(doorbell[1], HNS_ROCE_V2_EQ_DB_PARA_M, + HNS_ROCE_V2_EQ_DB_PARA_S, + (eq->cons_index & HNS_ROCE_V2_CONS_IDX_M)); + + hns_roce_write64_k(doorbell, eq->doorbell); + + /* Memory barrier */ + mb(); + +} + +static void hns_roce_v2_wq_catas_err_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe, + u32 qpn) +{ + struct device *dev = hr_dev->dev; + int sub_type; + + dev_warn(dev, "Local work queue catastrophic error.\n"); + sub_type = roce_get_field(aeqe->asyn, HNS_ROCE_V2_AEQE_SUB_TYPE_M, + HNS_ROCE_V2_AEQE_SUB_TYPE_S); + switch (sub_type) { + case HNS_ROCE_LWQCE_QPC_ERROR: + dev_warn(dev, "QP %d, QPC error.\n", qpn); + break; + case HNS_ROCE_LWQCE_MTU_ERROR: + dev_warn(dev, "QP %d, MTU error.\n", qpn); + break; + case HNS_ROCE_LWQCE_WQE_BA_ADDR_ERROR: + dev_warn(dev, "QP %d, WQE BA addr error.\n", qpn); + break; + case HNS_ROCE_LWQCE_WQE_ADDR_ERROR: + dev_warn(dev, "QP %d, WQE addr error.\n", qpn); + break; + case HNS_ROCE_LWQCE_SQ_WQE_SHIFT_ERROR: + dev_warn(dev, "QP %d, WQE shift error.\n", qpn); + break; + default: + dev_err(dev, "Unhandled sub_event type %d.\n", sub_type); + break; + } +} + +static void hns_roce_v2_local_wq_access_err_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe, u32 qpn) +{ + struct device *dev = hr_dev->dev; + int sub_type; + + dev_warn(dev, "Local access violation work queue error.\n"); + sub_type = roce_get_field(aeqe->asyn, HNS_ROCE_V2_AEQE_SUB_TYPE_M, + HNS_ROCE_V2_AEQE_SUB_TYPE_S); + switch (sub_type) { + case HNS_ROCE_LAVWQE_R_KEY_VIOLATION: + dev_warn(dev, "QP %d, R_key violation.\n", qpn); + break; + case HNS_ROCE_LAVWQE_LENGTH_ERROR: + dev_warn(dev, "QP %d, length error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_VA_ERROR: + dev_warn(dev, "QP %d, VA error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_PD_ERROR: + dev_err(dev, "QP %d, PD error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_RW_ACC_ERROR: + dev_warn(dev, "QP %d, rw acc error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_KEY_STATE_ERROR: + dev_warn(dev, "QP %d, key state error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_MR_OPERATION_ERROR: + dev_warn(dev, "QP %d, MR operation error.\n", qpn); + break; + default: + dev_err(dev, "Unhandled sub_event type %d.\n", sub_type); + break; + } +} + +static void hns_roce_v2_qp_err_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe, + int event_type) +{ + struct device *dev = hr_dev->dev; + u32 qpn; + + qpn = roce_get_field(aeqe->event.qp_event.qp, + HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M, + HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S); + + switch (event_type) { + case HNS_ROCE_EVENT_TYPE_COMM_EST: + dev_warn(dev, "Communication established.\n"); + break; + case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: + dev_warn(dev, "Send queue drained.\n"); + break; + case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: + hns_roce_v2_wq_catas_err_handle(hr_dev, aeqe, qpn); + break; + case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: + dev_warn(dev, "Invalid request local work queue error.\n"); + break; + case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: + hns_roce_v2_local_wq_access_err_handle(hr_dev, aeqe, qpn); + break; + default: + break; + } + + hns_roce_qp_event(hr_dev, qpn, event_type); +} + +static void hns_roce_v2_cq_err_handle(struct hns_roce_dev *hr_dev, + struct hns_roce_aeqe *aeqe, + int event_type) +{ + struct device *dev = hr_dev->dev; + u32 cqn; + + cqn = roce_get_field(aeqe->event.cq_event.cq, + HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M, + HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S); + + switch 
(event_type) { + case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: + dev_warn(dev, "CQ 0x%x access err.\n", cqn); + break; + case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: + dev_warn(dev, "CQ 0x%x overflow\n", cqn); + break; + default: + break; + } + + hns_roce_cq_event(hr_dev, cqn, event_type); +} + +static struct hns_roce_aeqe *get_aeqe_v2(struct hns_roce_eq *eq, u32 entry) +{ + u32 buf_chk_sz; + unsigned long off; + + buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT); + off = (entry & (eq->entries - 1)) * HNS_ROCE_AEQ_ENTRY_SIZE; + + return (struct hns_roce_aeqe *)((char *)(eq->buf_list->buf) + + off % buf_chk_sz); +} + +static struct hns_roce_aeqe *mhop_get_aeqe(struct hns_roce_eq *eq, u32 entry) +{ + u32 buf_chk_sz; + unsigned long off; + + buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT); + + off = (entry & (eq->entries - 1)) * HNS_ROCE_AEQ_ENTRY_SIZE; + + if (eq->hop_num == HNS_ROCE_HOP_NUM_0) + return (struct hns_roce_aeqe *)((u8 *)(eq->bt_l0) + + off % buf_chk_sz); + else + return (struct hns_roce_aeqe *)((u8 *) + (eq->buf[off / buf_chk_sz]) + off % buf_chk_sz); +} + +static struct hns_roce_aeqe *next_aeqe_sw_v2(struct hns_roce_eq *eq) +{ + struct hns_roce_aeqe *aeqe; + + if (!eq->hop_num) + aeqe = get_aeqe_v2(eq, eq->cons_index); + else + aeqe = mhop_get_aeqe(eq, eq->cons_index); + + return (roce_get_bit(aeqe->asyn, HNS_ROCE_V2_AEQ_AEQE_OWNER_S) ^ + !!(eq->cons_index & eq->entries)) ? aeqe : NULL; +} + +static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + struct device *dev = hr_dev->dev; + struct hns_roce_aeqe *aeqe; + int aeqe_found = 0; + int event_type; + + while ((aeqe = next_aeqe_sw_v2(eq))) { + /* Memory barrier */ + rmb(); + + event_type = roce_get_field(aeqe->asyn, + HNS_ROCE_V2_AEQE_EVENT_TYPE_M, + HNS_ROCE_V2_AEQE_EVENT_TYPE_S); + + switch (event_type) { + case HNS_ROCE_EVENT_TYPE_PATH_MIG: + dev_warn(dev, "Path migrated succeeded.\n"); + break; + case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED: + dev_warn(dev, "Path migration failed.\n"); + break; + case HNS_ROCE_EVENT_TYPE_COMM_EST: + case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: + case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: + case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: + case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: + hns_roce_v2_qp_err_handle(hr_dev, aeqe, event_type); + break; + case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH: + case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH: + case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR: + dev_warn(dev, "SRQ not support.\n"); + break; + case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: + case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: + hns_roce_v2_cq_err_handle(hr_dev, aeqe, event_type); + break; + case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW: + dev_warn(dev, "DB overflow.\n"); + break; + case HNS_ROCE_EVENT_TYPE_MB: + hns_roce_cmd_event(hr_dev, + le16_to_cpu(aeqe->event.cmd.token), + aeqe->event.cmd.status, + le64_to_cpu(aeqe->event.cmd.out_param)); + break; + case HNS_ROCE_EVENT_TYPE_CEQ_OVERFLOW: + dev_warn(dev, "CEQ overflow.\n"); + break; + case HNS_ROCE_EVENT_TYPE_FLR: + dev_warn(dev, "Function level reset.\n"); + break; + default: + dev_err(dev, "Unhandled event %d on EQ %d at idx %u.\n", + event_type, eq->eqn, eq->cons_index); + break; + }; + + ++eq->cons_index; + aeqe_found = 1; + + if (eq->cons_index > (2 * eq->entries - 1)) { + dev_warn(dev, "cons_index overflow, set back to 0.\n"); + eq->cons_index = 0; + } + } + + set_eq_cons_index_v2(eq); + return aeqe_found; +} + +static struct hns_roce_ceqe *get_ceqe_v2(struct hns_roce_eq *eq, u32 entry) +{ + u32 buf_chk_sz; + unsigned long off; + + 
buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT); + off = (entry & (eq->entries - 1)) * HNS_ROCE_CEQ_ENTRY_SIZE; + + return (struct hns_roce_ceqe *)((char *)(eq->buf_list->buf) + + off % buf_chk_sz); +} + +static struct hns_roce_ceqe *mhop_get_ceqe(struct hns_roce_eq *eq, u32 entry) +{ + u32 buf_chk_sz; + unsigned long off; + + buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT); + + off = (entry & (eq->entries - 1)) * HNS_ROCE_CEQ_ENTRY_SIZE; + + if (eq->hop_num == HNS_ROCE_HOP_NUM_0) + return (struct hns_roce_ceqe *)((u8 *)(eq->bt_l0) + + off % buf_chk_sz); + else + return (struct hns_roce_ceqe *)((u8 *)(eq->buf[off / + buf_chk_sz]) + off % buf_chk_sz); +} + +static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq) +{ + struct hns_roce_ceqe *ceqe; + + if (!eq->hop_num) + ceqe = get_ceqe_v2(eq, eq->cons_index); + else + ceqe = mhop_get_ceqe(eq, eq->cons_index); + + return (!!(roce_get_bit(ceqe->comp, HNS_ROCE_V2_CEQ_CEQE_OWNER_S))) ^ + (!!(eq->cons_index & eq->entries)) ? ceqe : NULL; +} + +static int hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + struct device *dev = hr_dev->dev; + struct hns_roce_ceqe *ceqe; + int ceqe_found = 0; + u32 cqn; + + while ((ceqe = next_ceqe_sw_v2(eq))) { + + /* Memory barrier */ + rmb(); + cqn = roce_get_field(ceqe->comp, + HNS_ROCE_V2_CEQE_COMP_CQN_M, + HNS_ROCE_V2_CEQE_COMP_CQN_S); + + hns_roce_cq_completion(hr_dev, cqn); + + ++eq->cons_index; + ceqe_found = 1; + + if (eq->cons_index > (2 * eq->entries - 1)) { + dev_warn(dev, "cons_index overflow, set back to 0.\n"); + eq->cons_index = 0; + } + } + + set_eq_cons_index_v2(eq); + + return ceqe_found; +} + +static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr) +{ + struct hns_roce_eq *eq = eq_ptr; + struct hns_roce_dev *hr_dev = eq->hr_dev; + int int_work = 0; + + if (eq->type_flag == HNS_ROCE_CEQ) + /* Completion event interrupt */ + int_work = hns_roce_v2_ceq_int(hr_dev, eq); + else + /* Asychronous event interrupt */ + int_work = hns_roce_v2_aeq_int(hr_dev, eq); + + return IRQ_RETVAL(int_work); +} + +static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id) +{ + struct hns_roce_dev *hr_dev = dev_id; + struct device *dev = hr_dev->dev; + int int_work = 0; + u32 int_st; + u32 int_en; + + /* Abnormal interrupt */ + int_st = roce_read(hr_dev, ROCEE_VF_ABN_INT_ST_REG); + int_en = roce_read(hr_dev, ROCEE_VF_ABN_INT_EN_REG); + + if (roce_get_bit(int_st, HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S)) { + dev_err(dev, "AEQ overflow!\n"); + + roce_set_bit(int_st, HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S, 1); + roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st); + + /* Memory barrier */ + mb(); + + roce_set_bit(int_en, HNS_ROCE_V2_VF_ABN_INT_EN_S, 1); + roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en); + + int_work = 1; + } else if (roce_get_bit(int_st, HNS_ROCE_V2_VF_INT_ST_BUS_ERR_S)) { + dev_err(dev, "BUS ERR!\n"); + + roce_set_bit(int_st, HNS_ROCE_V2_VF_INT_ST_BUS_ERR_S, 1); + roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st); + + /* Memory barrier */ + mb(); + + roce_set_bit(int_en, HNS_ROCE_V2_VF_ABN_INT_EN_S, 1); + roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en); + + int_work = 1; + } else if (roce_get_bit(int_st, HNS_ROCE_V2_VF_INT_ST_OTHER_ERR_S)) { + dev_err(dev, "OTHER ERR!\n"); + + roce_set_bit(int_st, HNS_ROCE_V2_VF_INT_ST_OTHER_ERR_S, 1); + roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st); + + /* Memory barrier */ + mb(); + roce_set_bit(int_en, HNS_ROCE_V2_VF_ABN_INT_EN_S, 1); + roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, 
int_en); + + int_work = 1; + } else + dev_err(dev, "There is no abnormal irq found!\n"); + + return IRQ_RETVAL(int_work); +} + +static void hns_roce_v2_int_mask_enable(struct hns_roce_dev *hr_dev, + int eq_num, int enable_flag) +{ + int i; + + if (enable_flag == EQ_ENABLE) { + for (i = 0; i < eq_num; i++) + roce_write(hr_dev, ROCEE_VF_EVENT_INT_EN_REG + + i * EQ_REG_OFFSET, + HNS_ROCE_V2_VF_EVENT_INT_EN_M); + + roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, + HNS_ROCE_V2_VF_ABN_INT_EN_M); + roce_write(hr_dev, ROCEE_VF_ABN_INT_CFG_REG, + HNS_ROCE_V2_VF_ABN_INT_CFG_M); + } else { + for (i = 0; i < eq_num; i++) + roce_write(hr_dev, ROCEE_VF_EVENT_INT_EN_REG + + i * EQ_REG_OFFSET, + HNS_ROCE_V2_VF_EVENT_INT_EN_M & 0x0); + + roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, + HNS_ROCE_V2_VF_ABN_INT_EN_M & 0x0); + roce_write(hr_dev, ROCEE_VF_ABN_INT_CFG_REG, + HNS_ROCE_V2_VF_ABN_INT_CFG_M & 0x0); + } +} + +static void hns_roce_v2_destroy_eqc(struct hns_roce_dev *hr_dev, int eqn) +{ + struct device *dev = hr_dev->dev; + int ret; + + if (eqn < hr_dev->caps.num_comp_vectors) + ret = hns_roce_cmd_mbox(hr_dev, 0, 0, eqn & HNS_ROCE_V2_EQN_M, + 0, HNS_ROCE_CMD_DESTROY_CEQC, + HNS_ROCE_CMD_TIMEOUT_MSECS); + else + ret = hns_roce_cmd_mbox(hr_dev, 0, 0, eqn & HNS_ROCE_V2_EQN_M, + 0, HNS_ROCE_CMD_DESTROY_AEQC, + HNS_ROCE_CMD_TIMEOUT_MSECS); + if (ret) + dev_err(dev, "[mailbox cmd] destroy eqc(%d) failed.\n", eqn); +} + +static void hns_roce_mhop_free_eq(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + struct device *dev = hr_dev->dev; + u64 idx; + u64 size; + u32 buf_chk_sz; + u32 bt_chk_sz; + u32 mhop_num; + int eqe_alloc; + int ba_num; + int i = 0; + int j = 0; + + mhop_num = hr_dev->caps.eqe_hop_num; + buf_chk_sz = 1 << (hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT); + bt_chk_sz = 1 << (hr_dev->caps.eqe_ba_pg_sz + PAGE_SHIFT); + ba_num = (PAGE_ALIGN(eq->entries * eq->eqe_size) + buf_chk_sz - 1) / + buf_chk_sz; + + /* hop_num = 0 */ + if (mhop_num == HNS_ROCE_HOP_NUM_0) { + dma_free_coherent(dev, (unsigned int)(eq->entries * + eq->eqe_size), eq->bt_l0, eq->l0_dma); + return; + } + + /* hop_num = 1 or hop = 2 */ + dma_free_coherent(dev, bt_chk_sz, eq->bt_l0, eq->l0_dma); + if (mhop_num == 1) { + for (i = 0; i < eq->l0_last_num; i++) { + if (i == eq->l0_last_num - 1) { + eqe_alloc = i * (buf_chk_sz / eq->eqe_size); + size = (eq->entries - eqe_alloc) * eq->eqe_size; + dma_free_coherent(dev, size, eq->buf[i], + eq->buf_dma[i]); + break; + } + dma_free_coherent(dev, buf_chk_sz, eq->buf[i], + eq->buf_dma[i]); + } + } else if (mhop_num == 2) { + for (i = 0; i < eq->l0_last_num; i++) { + dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i], + eq->l1_dma[i]); + + for (j = 0; j < bt_chk_sz / 8; j++) { + idx = i * (bt_chk_sz / 8) + j; + if ((i == eq->l0_last_num - 1) + && j == eq->l1_last_num - 1) { + eqe_alloc = (buf_chk_sz / eq->eqe_size) + * idx; + size = (eq->entries - eqe_alloc) + * eq->eqe_size; + dma_free_coherent(dev, size, + eq->buf[idx], + eq->buf_dma[idx]); + break; + } + dma_free_coherent(dev, buf_chk_sz, eq->buf[idx], + eq->buf_dma[idx]); + } + } + } + kfree(eq->buf_dma); + kfree(eq->buf); + kfree(eq->l1_dma); + kfree(eq->bt_l1); + eq->buf_dma = NULL; + eq->buf = NULL; + eq->l1_dma = NULL; + eq->bt_l1 = NULL; +} + +static void hns_roce_v2_free_eq(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + u32 buf_chk_sz; + + buf_chk_sz = 1 << (eq->eqe_buf_pg_sz + PAGE_SHIFT); + + if (hr_dev->caps.eqe_hop_num) { + hns_roce_mhop_free_eq(hr_dev, eq); + return; + } + + if (eq->buf_list) + 
dma_free_coherent(hr_dev->dev, buf_chk_sz, + eq->buf_list->buf, eq->buf_list->map); +} + +static void hns_roce_config_eqc(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq, + void *mb_buf) +{ + struct hns_roce_eq_context *eqc; + + eqc = mb_buf; + memset(eqc, 0, sizeof(struct hns_roce_eq_context)); + + /* init eqc */ + eq->doorbell = hr_dev->reg_base + ROCEE_VF_EQ_DB_CFG0_REG; + eq->hop_num = hr_dev->caps.eqe_hop_num; + eq->cons_index = 0; + eq->over_ignore = HNS_ROCE_V2_EQ_OVER_IGNORE_0; + eq->coalesce = HNS_ROCE_V2_EQ_COALESCE_0; + eq->arm_st = HNS_ROCE_V2_EQ_ALWAYS_ARMED; + eq->eqe_ba_pg_sz = hr_dev->caps.eqe_ba_pg_sz; + eq->eqe_buf_pg_sz = hr_dev->caps.eqe_buf_pg_sz; + eq->shift = ilog2((unsigned int)eq->entries); + + if (!eq->hop_num) + eq->eqe_ba = eq->buf_list->map; + else + eq->eqe_ba = eq->l0_dma; + + /* set eqc state */ + roce_set_field(eqc->byte_4, + HNS_ROCE_EQC_EQ_ST_M, + HNS_ROCE_EQC_EQ_ST_S, + HNS_ROCE_V2_EQ_STATE_VALID); + + /* set eqe hop num */ + roce_set_field(eqc->byte_4, + HNS_ROCE_EQC_HOP_NUM_M, + HNS_ROCE_EQC_HOP_NUM_S, eq->hop_num); + + /* set eqc over_ignore */ + roce_set_field(eqc->byte_4, + HNS_ROCE_EQC_OVER_IGNORE_M, + HNS_ROCE_EQC_OVER_IGNORE_S, eq->over_ignore); + + /* set eqc coalesce */ + roce_set_field(eqc->byte_4, + HNS_ROCE_EQC_COALESCE_M, + HNS_ROCE_EQC_COALESCE_S, eq->coalesce); + + /* set eqc arm_state */ + roce_set_field(eqc->byte_4, + HNS_ROCE_EQC_ARM_ST_M, + HNS_ROCE_EQC_ARM_ST_S, eq->arm_st); + + /* set eqn */ + roce_set_field(eqc->byte_4, + HNS_ROCE_EQC_EQN_M, + HNS_ROCE_EQC_EQN_S, eq->eqn); + + /* set eqe_cnt */ + roce_set_field(eqc->byte_4, + HNS_ROCE_EQC_EQE_CNT_M, + HNS_ROCE_EQC_EQE_CNT_S, + HNS_ROCE_EQ_INIT_EQE_CNT); + + /* set eqe_ba_pg_sz */ + roce_set_field(eqc->byte_8, + HNS_ROCE_EQC_BA_PG_SZ_M, + HNS_ROCE_EQC_BA_PG_SZ_S, eq->eqe_ba_pg_sz); + + /* set eqe_buf_pg_sz */ + roce_set_field(eqc->byte_8, + HNS_ROCE_EQC_BUF_PG_SZ_M, + HNS_ROCE_EQC_BUF_PG_SZ_S, eq->eqe_buf_pg_sz); + + /* set eq_producer_idx */ + roce_set_field(eqc->byte_8, + HNS_ROCE_EQC_PROD_INDX_M, + HNS_ROCE_EQC_PROD_INDX_S, + HNS_ROCE_EQ_INIT_PROD_IDX); + + /* set eq_max_cnt */ + roce_set_field(eqc->byte_12, + HNS_ROCE_EQC_MAX_CNT_M, + HNS_ROCE_EQC_MAX_CNT_S, eq->eq_max_cnt); + + /* set eq_period */ + roce_set_field(eqc->byte_12, + HNS_ROCE_EQC_PERIOD_M, + HNS_ROCE_EQC_PERIOD_S, eq->eq_period); + + /* set eqe_report_timer */ + roce_set_field(eqc->eqe_report_timer, + HNS_ROCE_EQC_REPORT_TIMER_M, + HNS_ROCE_EQC_REPORT_TIMER_S, + HNS_ROCE_EQ_INIT_REPORT_TIMER); + + /* set eqe_ba [34:3] */ + roce_set_field(eqc->eqe_ba0, + HNS_ROCE_EQC_EQE_BA_L_M, + HNS_ROCE_EQC_EQE_BA_L_S, eq->eqe_ba >> 3); + + /* set eqe_ba [64:35] */ + roce_set_field(eqc->eqe_ba1, + HNS_ROCE_EQC_EQE_BA_H_M, + HNS_ROCE_EQC_EQE_BA_H_S, eq->eqe_ba >> 35); + + /* set eq shift */ + roce_set_field(eqc->byte_28, + HNS_ROCE_EQC_SHIFT_M, + HNS_ROCE_EQC_SHIFT_S, eq->shift); + + /* set eq MSI_IDX */ + roce_set_field(eqc->byte_28, + HNS_ROCE_EQC_MSI_INDX_M, + HNS_ROCE_EQC_MSI_INDX_S, + HNS_ROCE_EQ_INIT_MSI_IDX); + + /* set cur_eqe_ba [27:12] */ + roce_set_field(eqc->byte_28, + HNS_ROCE_EQC_CUR_EQE_BA_L_M, + HNS_ROCE_EQC_CUR_EQE_BA_L_S, eq->cur_eqe_ba >> 12); + + /* set cur_eqe_ba [59:28] */ + roce_set_field(eqc->byte_32, + HNS_ROCE_EQC_CUR_EQE_BA_M_M, + HNS_ROCE_EQC_CUR_EQE_BA_M_S, eq->cur_eqe_ba >> 28); + + /* set cur_eqe_ba [63:60] */ + roce_set_field(eqc->byte_36, + HNS_ROCE_EQC_CUR_EQE_BA_H_M, + HNS_ROCE_EQC_CUR_EQE_BA_H_S, eq->cur_eqe_ba >> 60); + + /* set eq consumer idx */ + roce_set_field(eqc->byte_36, + 
HNS_ROCE_EQC_CONS_INDX_M, + HNS_ROCE_EQC_CONS_INDX_S, + HNS_ROCE_EQ_INIT_CONS_IDX); + + /* set nex_eqe_ba[43:12] */ + roce_set_field(eqc->nxt_eqe_ba0, + HNS_ROCE_EQC_NXT_EQE_BA_L_M, + HNS_ROCE_EQC_NXT_EQE_BA_L_S, eq->nxt_eqe_ba >> 12); + + /* set nex_eqe_ba[63:44] */ + roce_set_field(eqc->nxt_eqe_ba1, + HNS_ROCE_EQC_NXT_EQE_BA_H_M, + HNS_ROCE_EQC_NXT_EQE_BA_H_S, eq->nxt_eqe_ba >> 44); +} + +static int hns_roce_mhop_alloc_eq(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) +{ + struct device *dev = hr_dev->dev; + int eq_alloc_done = 0; + int eq_buf_cnt = 0; + int eqe_alloc; + u32 buf_chk_sz; + u32 bt_chk_sz; + u32 mhop_num; + u64 size; + u64 idx; + int ba_num; + int bt_num; + int record_i; + int record_j; + int i = 0; + int j = 0; + + mhop_num = hr_dev->caps.eqe_hop_num; + buf_chk_sz = 1 << (hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT); + bt_chk_sz = 1 << (hr_dev->caps.eqe_ba_pg_sz + PAGE_SHIFT); + + ba_num = (PAGE_ALIGN(eq->entries * eq->eqe_size) + buf_chk_sz - 1) + / buf_chk_sz; + bt_num = (ba_num + bt_chk_sz / 8 - 1) / (bt_chk_sz / 8); + + /* hop_num = 0 */ + if (mhop_num == HNS_ROCE_HOP_NUM_0) { + if (eq->entries > buf_chk_sz / eq->eqe_size) { + dev_err(dev, "eq entries %d is larger than buf_pg_sz!", + eq->entries); + return -EINVAL; + } + eq->bt_l0 = dma_alloc_coherent(dev, eq->entries * eq->eqe_size, + &(eq->l0_dma), GFP_KERNEL); + if (!eq->bt_l0) + return -ENOMEM; + + eq->cur_eqe_ba = eq->l0_dma; + eq->nxt_eqe_ba = 0; + + memset(eq->bt_l0, 0, eq->entries * eq->eqe_size); + + return 0; + } + + eq->buf_dma = kcalloc(ba_num, sizeof(*eq->buf_dma), GFP_KERNEL); + if (!eq->buf_dma) + return -ENOMEM; + eq->buf = kcalloc(ba_num, sizeof(*eq->buf), GFP_KERNEL); + if (!eq->buf) + goto err_kcalloc_buf; + + if (mhop_num == 2) { + eq->l1_dma = kcalloc(bt_num, sizeof(*eq->l1_dma), GFP_KERNEL); + if (!eq->l1_dma) + goto err_kcalloc_l1_dma; + + eq->bt_l1 = kcalloc(bt_num, sizeof(*eq->bt_l1), GFP_KERNEL); + if (!eq->bt_l1) + goto err_kcalloc_bt_l1; + } + + /* alloc L0 BT */ + eq->bt_l0 = dma_alloc_coherent(dev, bt_chk_sz, &eq->l0_dma, GFP_KERNEL); + if (!eq->bt_l0) + goto err_dma_alloc_l0; + + if (mhop_num == 1) { + if (ba_num > (bt_chk_sz / 8)) + dev_err(dev, "ba_num %d is too large for 1 hop\n", + ba_num); + + /* alloc buf */ + for (i = 0; i < bt_chk_sz / 8; i++) { + if (eq_buf_cnt + 1 < ba_num) { + size = buf_chk_sz; + } else { + eqe_alloc = i * (buf_chk_sz / eq->eqe_size); + size = (eq->entries - eqe_alloc) * eq->eqe_size; + } + eq->buf[i] = dma_alloc_coherent(dev, size, + &(eq->buf_dma[i]), + GFP_KERNEL); + if (!eq->buf[i]) + goto err_dma_alloc_buf; + + memset(eq->buf[i], 0, size); + *(eq->bt_l0 + i) = eq->buf_dma[i]; + + eq_buf_cnt++; + if (eq_buf_cnt >= ba_num) + break; + } + eq->cur_eqe_ba = eq->buf_dma[0]; + eq->nxt_eqe_ba = eq->buf_dma[1]; + + } else if (mhop_num == 2) { + /* alloc L1 BT and buf */ + for (i = 0; i < bt_chk_sz / 8; i++) { + eq->bt_l1[i] = dma_alloc_coherent(dev, bt_chk_sz, + &(eq->l1_dma[i]), + GFP_KERNEL); + if (!eq->bt_l1[i]) + goto err_dma_alloc_l1; + *(eq->bt_l0 + i) = eq->l1_dma[i]; + + for (j = 0; j < bt_chk_sz / 8; j++) { + idx = i * bt_chk_sz / 8 + j; + if (eq_buf_cnt + 1 < ba_num) { + size = buf_chk_sz; + } else { + eqe_alloc = (buf_chk_sz / eq->eqe_size) + * idx; + size = (eq->entries - eqe_alloc) + * eq->eqe_size; + } + eq->buf[idx] = dma_alloc_coherent(dev, size, + &(eq->buf_dma[idx]), + GFP_KERNEL); + if (!eq->buf[idx]) + goto err_dma_alloc_buf; + + memset(eq->buf[idx], 0, size); + *(eq->bt_l1[i] + j) = eq->buf_dma[idx]; + + eq_buf_cnt++; + if (eq_buf_cnt >= 
ba_num) { + eq_alloc_done = 1; + break; + } + } + + if (eq_alloc_done) + break; + } + eq->cur_eqe_ba = eq->buf_dma[0]; + eq->nxt_eqe_ba = eq->buf_dma[1]; + } + + eq->l0_last_num = i + 1; + if (mhop_num == 2) + eq->l1_last_num = j + 1; + + return 0; + +err_dma_alloc_l1: + dma_free_coherent(dev, bt_chk_sz, eq->bt_l0, eq->l0_dma); + eq->bt_l0 = NULL; + eq->l0_dma = 0; + for (i -= 1; i >= 0; i--) { + dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i], + eq->l1_dma[i]); + + for (j = 0; j < bt_chk_sz / 8; j++) { + idx = i * bt_chk_sz / 8 + j; + dma_free_coherent(dev, buf_chk_sz, eq->buf[idx], + eq->buf_dma[idx]); + } + } + goto err_dma_alloc_l0; + +err_dma_alloc_buf: + dma_free_coherent(dev, bt_chk_sz, eq->bt_l0, eq->l0_dma); + eq->bt_l0 = NULL; + eq->l0_dma = 0; + + if (mhop_num == 1) + for (i -= i; i >= 0; i--) + dma_free_coherent(dev, buf_chk_sz, eq->buf[i], + eq->buf_dma[i]); + else if (mhop_num == 2) { + record_i = i; + record_j = j; + for (; i >= 0; i--) { + dma_free_coherent(dev, bt_chk_sz, eq->bt_l1[i], + eq->l1_dma[i]); + + for (j = 0; j < bt_chk_sz / 8; j++) { + if (i == record_i && j >= record_j) + break; + + idx = i * bt_chk_sz / 8 + j; + dma_free_coherent(dev, buf_chk_sz, + eq->buf[idx], + eq->buf_dma[idx]); + } + } + } + +err_dma_alloc_l0: + kfree(eq->bt_l1); + eq->bt_l1 = NULL; + +err_kcalloc_bt_l1: + kfree(eq->l1_dma); + eq->l1_dma = NULL; + +err_kcalloc_l1_dma: + kfree(eq->buf); + eq->buf = NULL; + +err_kcalloc_buf: + kfree(eq->buf_dma); + eq->buf_dma = NULL; + + return -ENOMEM; +} + +static int hns_roce_v2_create_eq(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq, + unsigned int eq_cmd) +{ + struct device *dev = hr_dev->dev; + struct hns_roce_cmd_mailbox *mailbox; + u32 buf_chk_sz = 0; + int ret; + + /* Allocate mailbox memory */ + mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + if (!hr_dev->caps.eqe_hop_num) { + buf_chk_sz = 1 << (hr_dev->caps.eqe_buf_pg_sz + PAGE_SHIFT); + + eq->buf_list = kzalloc(sizeof(struct hns_roce_buf_list), + GFP_KERNEL); + if (!eq->buf_list) { + ret = -ENOMEM; + goto free_cmd_mbox; + } + + eq->buf_list->buf = dma_alloc_coherent(dev, buf_chk_sz, + &(eq->buf_list->map), + GFP_KERNEL); + if (!eq->buf_list->buf) { + ret = -ENOMEM; + goto err_alloc_buf; + } + + memset(eq->buf_list->buf, 0, buf_chk_sz); + } else { + ret = hns_roce_mhop_alloc_eq(hr_dev, eq); + if (ret) { + ret = -ENOMEM; + goto free_cmd_mbox; + } + } + + hns_roce_config_eqc(hr_dev, eq, mailbox->buf); + + ret = hns_roce_cmd_mbox(hr_dev, mailbox->dma, 0, eq->eqn, 0, + eq_cmd, HNS_ROCE_CMD_TIMEOUT_MSECS); + if (ret) { + dev_err(dev, "[mailbox cmd] creat eqc failed.\n"); + goto err_cmd_mbox; + } + + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + + return 0; + +err_cmd_mbox: + if (!hr_dev->caps.eqe_hop_num) + dma_free_coherent(dev, buf_chk_sz, eq->buf_list->buf, + eq->buf_list->map); + else { + hns_roce_mhop_free_eq(hr_dev, eq); + goto free_cmd_mbox; + } + +err_alloc_buf: + kfree(eq->buf_list); + +free_cmd_mbox: + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + + return ret; +} + +static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_eq_table *eq_table = &hr_dev->eq_table; + struct device *dev = hr_dev->dev; + struct hns_roce_eq *eq; + unsigned int eq_cmd; + int irq_num; + int eq_num; + int other_num; + int comp_num; + int aeq_num; + int i, j, k; + int ret; + + other_num = hr_dev->caps.num_other_vectors; + comp_num = hr_dev->caps.num_comp_vectors; + aeq_num = hr_dev->caps.num_aeq_vectors; + + eq_num = comp_num + 
aeq_num; + irq_num = eq_num + other_num; + + eq_table->eq = kcalloc(eq_num, sizeof(*eq_table->eq), GFP_KERNEL); + if (!eq_table->eq) + return -ENOMEM; + + for (i = 0; i < irq_num; i++) { + hr_dev->irq_names[i] = kzalloc(HNS_ROCE_INT_NAME_LEN, + GFP_KERNEL); + if (!hr_dev->irq_names[i]) { + ret = -ENOMEM; + goto err_failed_kzalloc; + } + } + + /* create eq */ + for (j = 0; j < eq_num; j++) { + eq = &eq_table->eq[j]; + eq->hr_dev = hr_dev; + eq->eqn = j; + if (j < comp_num) { + /* CEQ */ + eq_cmd = HNS_ROCE_CMD_CREATE_CEQC; + eq->type_flag = HNS_ROCE_CEQ; + eq->entries = hr_dev->caps.ceqe_depth; + eq->eqe_size = HNS_ROCE_CEQ_ENTRY_SIZE; + eq->irq = hr_dev->irq[j + other_num + aeq_num]; + eq->eq_max_cnt = HNS_ROCE_CEQ_DEFAULT_BURST_NUM; + eq->eq_period = HNS_ROCE_CEQ_DEFAULT_INTERVAL; + } else { + /* AEQ */ + eq_cmd = HNS_ROCE_CMD_CREATE_AEQC; + eq->type_flag = HNS_ROCE_AEQ; + eq->entries = hr_dev->caps.aeqe_depth; + eq->eqe_size = HNS_ROCE_AEQ_ENTRY_SIZE; + eq->irq = hr_dev->irq[j - comp_num + other_num]; + eq->eq_max_cnt = HNS_ROCE_AEQ_DEFAULT_BURST_NUM; + eq->eq_period = HNS_ROCE_AEQ_DEFAULT_INTERVAL; + } + + ret = hns_roce_v2_create_eq(hr_dev, eq, eq_cmd); + if (ret) { + dev_err(dev, "eq create failed.\n"); + goto err_create_eq_fail; + } + } + + /* enable irq */ + hns_roce_v2_int_mask_enable(hr_dev, eq_num, EQ_ENABLE); + + /* irq contains: abnormal + AEQ + CEQ*/ + for (k = 0; k < irq_num; k++) + if (k < other_num) + snprintf((char *)hr_dev->irq_names[k], + HNS_ROCE_INT_NAME_LEN, "hns-abn-%d", k); + else if (k < (other_num + aeq_num)) + snprintf((char *)hr_dev->irq_names[k], + HNS_ROCE_INT_NAME_LEN, "hns-aeq-%d", + k - other_num); + else + snprintf((char *)hr_dev->irq_names[k], + HNS_ROCE_INT_NAME_LEN, "hns-ceq-%d", + k - other_num - aeq_num); + + for (k = 0; k < irq_num; k++) { + if (k < other_num) + ret = request_irq(hr_dev->irq[k], + hns_roce_v2_msix_interrupt_abn, + 0, hr_dev->irq_names[k], hr_dev); + + else if (k < (other_num + comp_num)) + ret = request_irq(eq_table->eq[k - other_num].irq, + hns_roce_v2_msix_interrupt_eq, + 0, hr_dev->irq_names[k + aeq_num], + &eq_table->eq[k - other_num]); + else + ret = request_irq(eq_table->eq[k - other_num].irq, + hns_roce_v2_msix_interrupt_eq, + 0, hr_dev->irq_names[k - comp_num], + &eq_table->eq[k - other_num]); + if (ret) { + dev_err(dev, "Request irq error!\n"); + goto err_request_irq_fail; + } + } + + return 0; + +err_request_irq_fail: + for (k -= 1; k >= 0; k--) + if (k < other_num) + free_irq(hr_dev->irq[k], hr_dev); + else + free_irq(eq_table->eq[k - other_num].irq, + &eq_table->eq[k - other_num]); + +err_create_eq_fail: + for (j -= 1; j >= 0; j--) + hns_roce_v2_free_eq(hr_dev, &eq_table->eq[j]); + +err_failed_kzalloc: + for (i -= 1; i >= 0; i--) + kfree(hr_dev->irq_names[i]); + kfree(eq_table->eq); + + return ret; +} + +static void hns_roce_v2_cleanup_eq_table(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_eq_table *eq_table = &hr_dev->eq_table; + int irq_num; + int eq_num; + int i; + + eq_num = hr_dev->caps.num_comp_vectors + hr_dev->caps.num_aeq_vectors; + irq_num = eq_num + hr_dev->caps.num_other_vectors; + + /* Disable irq */ + hns_roce_v2_int_mask_enable(hr_dev, eq_num, EQ_DISABLE); + + for (i = 0; i < hr_dev->caps.num_other_vectors; i++) + free_irq(hr_dev->irq[i], hr_dev); + + for (i = 0; i < eq_num; i++) { + hns_roce_v2_destroy_eqc(hr_dev, i); + + free_irq(eq_table->eq[i].irq, &eq_table->eq[i]); + + hns_roce_v2_free_eq(hr_dev, &eq_table->eq[i]); + } + + for (i = 0; i < irq_num; i++) + kfree(hr_dev->irq_names[i]); + + 
kfree(eq_table->eq); +} + static const struct hns_roce_hw hns_roce_hw_v2 = { .cmq_init = hns_roce_v2_cmq_init, .cmq_exit = hns_roce_v2_cmq_exit, @@ -3175,6 +4337,8 @@ static const struct hns_roce_hw hns_roce_hw_v2 = { .post_recv = hns_roce_v2_post_recv, .req_notify_cq = hns_roce_v2_req_notify_cq, .poll_cq = hns_roce_v2_poll_cq, + .init_eq = hns_roce_v2_init_eq_table, + .cleanup_eq = hns_roce_v2_cleanup_eq_table, }; static const struct pci_device_id hns_roce_hw_v2_pci_tbl[] = { @@ -3189,6 +4353,7 @@ static int hns_roce_hw_v2_get_cfg(struct hns_roce_dev *hr_dev, struct hnae3_handle *handle) { const struct pci_device_id *id; + int i; id = pci_match_id(hns_roce_hw_v2_pci_tbl, hr_dev->pci_dev); if (!id) { @@ -3206,8 +4371,12 @@ static int hns_roce_hw_v2_get_cfg(struct hns_roce_dev *hr_dev, hr_dev->iboe.netdevs[0] = handle->rinfo.netdev; hr_dev->iboe.phy_port[0] = 0; + for (i = 0; i < HNS_ROCE_V2_MAX_IRQ_NUM; i++) + hr_dev->irq[i] = pci_irq_vector(handle->pdev, + i + handle->rinfo.base_vector); + /* cmd issue mode: 0 is poll, 1 is event */ - hr_dev->cmd_mod = 0; + hr_dev->cmd_mod = 1; hr_dev->loop_idc = 0; return 0; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 04b7a51b8efb..463edab9b719 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -53,6 +53,10 @@ #define HNS_ROCE_V2_MAX_SQ_INLINE 0x20 #define HNS_ROCE_V2_UAR_NUM 256 #define HNS_ROCE_V2_PHY_UAR_NUM 1 +#define HNS_ROCE_V2_MAX_IRQ_NUM 65 +#define HNS_ROCE_V2_COMP_VEC_NUM 63 +#define HNS_ROCE_V2_AEQE_VEC_NUM 1 +#define HNS_ROCE_V2_ABNORMAL_VEC_NUM 1 #define HNS_ROCE_V2_MAX_MTPT_NUM 0x8000 #define HNS_ROCE_V2_MAX_MTT_SEGS 0x1000000 #define HNS_ROCE_V2_MAX_CQE_SEGS 0x1000000 @@ -78,6 +82,8 @@ #define HNS_ROCE_MTT_HOP_NUM 1 #define HNS_ROCE_CQE_HOP_NUM 1 #define HNS_ROCE_PBL_HOP_NUM 2 +#define HNS_ROCE_EQE_HOP_NUM 2 + #define HNS_ROCE_V2_GID_INDEX_NUM 256 #define HNS_ROCE_V2_TABLE_CHUNK_SIZE (1 << 18) @@ -105,6 +111,12 @@ (step_idx == 1 && hop_num == 1) || \ (step_idx == 2 && hop_num == 2)) +enum { + NO_ARMED = 0x0, + REG_NXT_CEQE = 0x2, + REG_NXT_SE_CEQE = 0x3 +}; + #define V2_CQ_DB_REQ_NOT_SOL 0 #define V2_CQ_DB_REQ_NOT 1 @@ -229,6 +241,9 @@ struct hns_roce_v2_cq_context { u32 cqe_report_timer; u32 byte_64_se_cqe_idx; }; +#define HNS_ROCE_V2_CQ_DEFAULT_BURST_NUM 0x0 +#define HNS_ROCE_V2_CQ_DEFAULT_INTERVAL 0x0 + #define V2_CQC_BYTE_4_CQ_ST_S 0 #define V2_CQC_BYTE_4_CQ_ST_M GENMASK(1, 0) @@ -1129,9 +1144,6 @@ struct hns_roce_cmq_desc { u32 data[6]; }; -#define ROCEE_VF_MB_CFG0_REG 0x40 -#define ROCEE_VF_MB_STATUS_REG 0x58 - #define HNS_ROCE_V2_GO_BIT_TIMEOUT_MSECS 10000 #define HNS_ROCE_HW_RUN_BIT_SHIFT 31 @@ -1174,4 +1186,178 @@ struct hns_roce_v2_priv { struct hns_roce_v2_cmq cmq; }; +struct hns_roce_eq_context { + u32 byte_4; + u32 byte_8; + u32 byte_12; + u32 eqe_report_timer; + u32 eqe_ba0; + u32 eqe_ba1; + u32 byte_28; + u32 byte_32; + u32 byte_36; + u32 nxt_eqe_ba0; + u32 nxt_eqe_ba1; + u32 rsv[5]; +}; + +#define HNS_ROCE_AEQ_DEFAULT_BURST_NUM 0x0 +#define HNS_ROCE_AEQ_DEFAULT_INTERVAL 0x0 +#define HNS_ROCE_CEQ_DEFAULT_BURST_NUM 0x0 +#define HNS_ROCE_CEQ_DEFAULT_INTERVAL 0x0 + +#define HNS_ROCE_V2_EQ_STATE_INVALID 0 +#define HNS_ROCE_V2_EQ_STATE_VALID 1 +#define HNS_ROCE_V2_EQ_STATE_OVERFLOW 2 +#define HNS_ROCE_V2_EQ_STATE_FAILURE 3 + +#define HNS_ROCE_V2_EQ_OVER_IGNORE_0 0 +#define HNS_ROCE_V2_EQ_OVER_IGNORE_1 1 + +#define HNS_ROCE_V2_EQ_COALESCE_0 0 +#define HNS_ROCE_V2_EQ_COALESCE_1 1 + +#define HNS_ROCE_V2_EQ_FIRED 0 
+#define HNS_ROCE_V2_EQ_ARMED 1 +#define HNS_ROCE_V2_EQ_ALWAYS_ARMED 3 + +#define HNS_ROCE_EQ_INIT_EQE_CNT 0 +#define HNS_ROCE_EQ_INIT_PROD_IDX 0 +#define HNS_ROCE_EQ_INIT_REPORT_TIMER 0 +#define HNS_ROCE_EQ_INIT_MSI_IDX 0 +#define HNS_ROCE_EQ_INIT_CONS_IDX 0 +#define HNS_ROCE_EQ_INIT_NXT_EQE_BA 0 + +#define HNS_ROCE_V2_CEQ_CEQE_OWNER_S 31 +#define HNS_ROCE_V2_AEQ_AEQE_OWNER_S 31 + +#define HNS_ROCE_V2_COMP_EQE_NUM 0x1000 +#define HNS_ROCE_V2_ASYNC_EQE_NUM 0x1000 + +#define HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S 0 +#define HNS_ROCE_V2_VF_INT_ST_BUS_ERR_S 1 +#define HNS_ROCE_V2_VF_INT_ST_OTHER_ERR_S 2 + +#define HNS_ROCE_EQ_DB_CMD_AEQ 0x0 +#define HNS_ROCE_EQ_DB_CMD_AEQ_ARMED 0x1 +#define HNS_ROCE_EQ_DB_CMD_CEQ 0x2 +#define HNS_ROCE_EQ_DB_CMD_CEQ_ARMED 0x3 + +#define EQ_ENABLE 1 +#define EQ_DISABLE 0 + +#define EQ_REG_OFFSET 0x4 + +#define HNS_ROCE_INT_NAME_LEN 32 +#define HNS_ROCE_V2_EQN_M GENMASK(23, 0) + +#define HNS_ROCE_V2_CONS_IDX_M GENMASK(23, 0) + +#define HNS_ROCE_V2_VF_ABN_INT_EN_S 0 +#define HNS_ROCE_V2_VF_ABN_INT_EN_M GENMASK(0, 0) +#define HNS_ROCE_V2_VF_ABN_INT_ST_M GENMASK(2, 0) +#define HNS_ROCE_V2_VF_ABN_INT_CFG_M GENMASK(2, 0) +#define HNS_ROCE_V2_VF_EVENT_INT_EN_M GENMASK(0, 0) + +/* WORD0 */ +#define HNS_ROCE_EQC_EQ_ST_S 0 +#define HNS_ROCE_EQC_EQ_ST_M GENMASK(1, 0) + +#define HNS_ROCE_EQC_HOP_NUM_S 2 +#define HNS_ROCE_EQC_HOP_NUM_M GENMASK(3, 2) + +#define HNS_ROCE_EQC_OVER_IGNORE_S 4 +#define HNS_ROCE_EQC_OVER_IGNORE_M GENMASK(4, 4) + +#define HNS_ROCE_EQC_COALESCE_S 5 +#define HNS_ROCE_EQC_COALESCE_M GENMASK(5, 5) + +#define HNS_ROCE_EQC_ARM_ST_S 6 +#define HNS_ROCE_EQC_ARM_ST_M GENMASK(7, 6) + +#define HNS_ROCE_EQC_EQN_S 8 +#define HNS_ROCE_EQC_EQN_M GENMASK(15, 8) + +#define HNS_ROCE_EQC_EQE_CNT_S 16 +#define HNS_ROCE_EQC_EQE_CNT_M GENMASK(31, 16) + +/* WORD1 */ +#define HNS_ROCE_EQC_BA_PG_SZ_S 0 +#define HNS_ROCE_EQC_BA_PG_SZ_M GENMASK(3, 0) + +#define HNS_ROCE_EQC_BUF_PG_SZ_S 4 +#define HNS_ROCE_EQC_BUF_PG_SZ_M GENMASK(7, 4) + +#define HNS_ROCE_EQC_PROD_INDX_S 8 +#define HNS_ROCE_EQC_PROD_INDX_M GENMASK(31, 8) + +/* WORD2 */ +#define HNS_ROCE_EQC_MAX_CNT_S 0 +#define HNS_ROCE_EQC_MAX_CNT_M GENMASK(15, 0) + +#define HNS_ROCE_EQC_PERIOD_S 16 +#define HNS_ROCE_EQC_PERIOD_M GENMASK(31, 16) + +/* WORD3 */ +#define HNS_ROCE_EQC_REPORT_TIMER_S 0 +#define HNS_ROCE_EQC_REPORT_TIMER_M GENMASK(31, 0) + +/* WORD4 */ +#define HNS_ROCE_EQC_EQE_BA_L_S 0 +#define HNS_ROCE_EQC_EQE_BA_L_M GENMASK(31, 0) + +/* WORD5 */ +#define HNS_ROCE_EQC_EQE_BA_H_S 0 +#define HNS_ROCE_EQC_EQE_BA_H_M GENMASK(28, 0) + +/* WORD6 */ +#define HNS_ROCE_EQC_SHIFT_S 0 +#define HNS_ROCE_EQC_SHIFT_M GENMASK(7, 0) + +#define HNS_ROCE_EQC_MSI_INDX_S 8 +#define HNS_ROCE_EQC_MSI_INDX_M GENMASK(15, 8) + +#define HNS_ROCE_EQC_CUR_EQE_BA_L_S 16 +#define HNS_ROCE_EQC_CUR_EQE_BA_L_M GENMASK(31, 16) + +/* WORD7 */ +#define HNS_ROCE_EQC_CUR_EQE_BA_M_S 0 +#define HNS_ROCE_EQC_CUR_EQE_BA_M_M GENMASK(31, 0) + +/* WORD8 */ +#define HNS_ROCE_EQC_CUR_EQE_BA_H_S 0 +#define HNS_ROCE_EQC_CUR_EQE_BA_H_M GENMASK(3, 0) + +#define HNS_ROCE_EQC_CONS_INDX_S 8 +#define HNS_ROCE_EQC_CONS_INDX_M GENMASK(31, 8) + +/* WORD9 */ +#define HNS_ROCE_EQC_NXT_EQE_BA_L_S 0 +#define HNS_ROCE_EQC_NXT_EQE_BA_L_M GENMASK(31, 0) + +/* WORD10 */ +#define HNS_ROCE_EQC_NXT_EQE_BA_H_S 0 +#define HNS_ROCE_EQC_NXT_EQE_BA_H_M GENMASK(19, 0) + +#define HNS_ROCE_V2_CEQE_COMP_CQN_S 0 +#define HNS_ROCE_V2_CEQE_COMP_CQN_M GENMASK(23, 0) + +#define HNS_ROCE_V2_AEQE_EVENT_TYPE_S 0 +#define HNS_ROCE_V2_AEQE_EVENT_TYPE_M GENMASK(7, 0) + +#define 
HNS_ROCE_V2_AEQE_SUB_TYPE_S 8 +#define HNS_ROCE_V2_AEQE_SUB_TYPE_M GENMASK(15, 8) + +#define HNS_ROCE_V2_EQ_DB_CMD_S 16 +#define HNS_ROCE_V2_EQ_DB_CMD_M GENMASK(17, 16) + +#define HNS_ROCE_V2_EQ_DB_TAG_S 0 +#define HNS_ROCE_V2_EQ_DB_TAG_M GENMASK(7, 0) + +#define HNS_ROCE_V2_EQ_DB_PARA_S 0 +#define HNS_ROCE_V2_EQ_DB_PARA_M GENMASK(23, 0) + +#define HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S 0 +#define HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M GENMASK(23, 0) + #endif -- cgit v1.2.3 From 72c7fe90ee7a16a8e350183712b2a639b4054ace Mon Sep 17 00:00:00 2001 From: Pravin Shedge Date: Wed, 6 Dec 2017 22:19:39 +0530 Subject: drivers: infiniband: remove duplicate includes These duplicate includes have been found with scripts/checkincludes.pl but they have been removed manually to avoid removing false positives. Signed-off-by: Pravin Shedge Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/mad.c | 1 - drivers/infiniband/core/netlink.c | 2 -- drivers/infiniband/hw/mlx5/main.c | 4 +--- drivers/infiniband/hw/usnic/usnic_ib_sysfs.c | 1 - drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 1 - 5 files changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index cb91245e9163..c50596f7f98a 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -49,7 +49,6 @@ #include "smi.h" #include "opa_smi.h" #include "agent.h" -#include "core_priv.h" static int mad_sendq_size = IB_MAD_QP_SEND_SIZE; static int mad_recvq_size = IB_MAD_QP_RECV_SIZE; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 1fb72c356e36..0c7db94cd9b9 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -41,8 +41,6 @@ #include #include "core_priv.h" -#include "core_priv.h" - static DEFINE_MUTEX(rdma_nl_mutex); static struct sock *nls; static struct { diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 543d0a4c8bf3..fe435edf305b 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -50,16 +50,14 @@ #include #include #include +#include #include #include #include #include #include -#include -#include #include "mlx5_ib.h" #include "cmd.h" -#include #define DRIVER_NAME "mlx5_ib" #define DRIVER_VERSION "5.0-0" diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c index 685ef2293cb8..4210ca14014d 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c @@ -45,7 +45,6 @@ #include "usnic_ib_verbs.h" #include "usnic_ib_sysfs.h" #include "usnic_log.h" -#include "usnic_ib_sysfs.h" static ssize_t usnic_ib_show_board(struct device *device, struct device_attribute *attr, diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index aa2456a4f9bd..a688a5669168 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -47,7 +47,6 @@ #include "usnic_log.h" #include "usnic_uiom.h" #include "usnic_transport.h" -#include "usnic_ib_verbs.h" #define USNIC_DEFAULT_TRANSPORT USNIC_TRANSPORT_ROCE_CUSTOM -- cgit v1.2.3 From adb1c0d15e688f548faae6287cd8a63bceca1114 Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Sun, 17 Dec 2017 12:21:38 -0600 Subject: nes: Change accelerated flag to bool The accelerated flag only utilizes two values: 0 and 1. Modify accelerated flag in struct nes_cm_node to bool. 
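The conversion is mechanical, but it makes the two-state intent explicit in the type: a bool field cannot smuggle a third value through, since any truthy assignment is normalized to true. A minimal illustration (hypothetical struct, not driver code; kernel C gets bool from <linux/types.h>, user space from <stdbool.h>):

#include <stdbool.h>

struct conn_flags {
	bool accelerated;	/* was: int accelerated, only ever 0 or 1 */
};

static void mark_accelerated(struct conn_flags *f)
{
	f->accelerated = true;	/* any non-zero source collapses to true */
}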
Signed-off-by: Shiraz Saleem Reviewed-by: Yuval Shaia Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/nes/nes_cm.c | 2 +- drivers/infiniband/hw/nes/nes_cm.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index c56ca2a74df5..6cdfbf8c5674 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -1365,7 +1365,7 @@ static int mini_cm_del_listen(struct nes_cm_core *cm_core, static inline int mini_cm_accelerated(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node) { - cm_node->accelerated = 1; + cm_node->accelerated = true; if (cm_node->accept_pend) { BUG_ON(!cm_node->listener); diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index 2cf2641867d8..b9cc02b4e8d5 100644 --- a/drivers/infiniband/hw/nes/nes_cm.h +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -340,7 +340,7 @@ struct nes_cm_node { u16 mpa_frame_size; struct iw_cm_id *cm_id; struct list_head list; - int accelerated; + bool accelerated; struct nes_cm_listener *listener; enum nes_cm_conn_type conn_type; struct nes_vnic *nesvnic; -- cgit v1.2.3 From af808ece5ce9b4382f6844b8e25ef1f841708c06 Mon Sep 17 00:00:00 2001 From: Venkata Sandeep Dhanalakota Date: Mon, 18 Dec 2017 19:26:58 -0800 Subject: IB/SA: Check dlid before SA agent queries for ClassPortInfo The SA queries the SM for class port info when there is a LID_CHANGE event. When a base LID is configured before the FM is started, i.e. before an SM LID has been assigned, the SA handles the LID_CHANGE event and tries to query the SM with LID 0. This causes a hang. [ 1106.958820] INFO: task kworker/2:0:23 blocked for more than 120 seconds. [ 1106.965082] Tainted: G O 4.12.0+ #1 [ 1106.969602] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1106.977227] kworker/2:0 D 0 23 2 0x00000000 [ 1106.977250] Workqueue: infiniband update_ib_cpi [ib_core] [ 1106.977261] Call Trace: [ 1106.977273] __schedule+0x28e/0x860 [ 1106.977285] schedule+0x36/0x80 [ 1106.977298] schedule_timeout+0x1a3/0x2e0 [ 1106.977310] ? radix_tree_iter_tag_clear+0x1b/0x20 [ 1106.977322] ? idr_alloc+0x64/0x90 [ 1106.977334] wait_for_completion+0xe3/0x140 [ 1106.977347] ? wake_up_q+0x80/0x80 [ 1106.977369] update_ib_cpi+0x163/0x210 [ib_core] [ 1106.977381] process_one_work+0x147/0x370 [ 1106.977394] worker_thread+0x4a/0x390 [ 1106.977406] kthread+0x109/0x140 [ 1106.977418] ? process_one_work+0x370/0x370 [ 1106.977430] ? kthread_park+0x60/0x60 [ 1106.977443] ret_from_fork+0x22/0x30 Always ensure a proper SM LID is assigned before querying the SM for ClassPortInfo.
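Condensed, the shape of the fix is a guard in front of the blocking query: resolve the cached SM address handle into attributes, and bail out with -EAGAIN (so the work item is retried later) whenever the destination LID is still unassigned (0) or falls in the multicast range. A sketch under those assumptions (hypothetical helper name; the real guard is the alloc_mad() hunk below, built on the rdma_is_valid_unicast_lid() helper this patch adds):

/* Sketch: does this AH carry a plausible unicast SM LID yet? */
static int sm_dlid_sane(struct rdma_ah_attr *attr)
{
	u32 dlid = rdma_ah_get_dlid(attr);

	/* LID 0 means "not assigned"; the multicast base and above
	 * cannot address a unicast SM. */
	if (!dlid || dlid >= be32_to_cpu(IB_MULTICAST_LID_BASE))
		return -EAGAIN;	/* retry once an SM LID appears */
	return 0;
}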
Fixes: ee1c60b1bff ("IB/SA: Modify SA to implicitly cache Class Port info") Reviewed-by: Ira Weiny Signed-off-by: Venkata Sandeep Dhanalakota Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/sa_query.c | 10 ++++++++++ include/rdma/opa_addr.h | 16 ++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 7cd37e77debb..8cf15d4a8ac4 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1345,6 +1345,7 @@ EXPORT_SYMBOL(ib_init_ah_attr_from_path); static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask) { + struct rdma_ah_attr ah_attr; unsigned long flags; spin_lock_irqsave(&query->port->ah_lock, flags); @@ -1356,6 +1357,15 @@ static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask) query->sm_ah = query->port->sm_ah; spin_unlock_irqrestore(&query->port->ah_lock, flags); + /* + * Always check if sm_ah has valid dlid assigned, + * before querying for class port info + */ + if ((rdma_query_ah(query->sm_ah->ah, &ah_attr) < 0) || + !rdma_is_valid_unicast_lid(&ah_attr)) { + kref_put(&query->sm_ah->ref, free_sm_ah); + return -EAGAIN; + } query->mad_buf = ib_create_send_mad(query->port->agent, 1, query->sm_ah->pkey_index, 0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA, diff --git a/include/rdma/opa_addr.h b/include/rdma/opa_addr.h index f68fca296631..2bbb7a67e643 100644 --- a/include/rdma/opa_addr.h +++ b/include/rdma/opa_addr.h @@ -114,4 +114,20 @@ static inline u32 opa_get_mcast_base(u32 nr_top_bits) return (be32_to_cpu(OPA_LID_PERMISSIVE) << (32 - nr_top_bits)); } +/* Check for a valid unicast LID for non-SM traffic types */ +static inline bool rdma_is_valid_unicast_lid(struct rdma_ah_attr *attr) +{ + if (attr->type == RDMA_AH_ATTR_TYPE_IB) { + if (!rdma_ah_get_dlid(attr) || + rdma_ah_get_dlid(attr) >= + be32_to_cpu(IB_MULTICAST_LID_BASE)) + return false; + } else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) { + if (!rdma_ah_get_dlid(attr) || + rdma_ah_get_dlid(attr) >= + opa_get_mcast_base(OPA_MCAST_NR)) + return false; + } + return true; +} #endif /* OPA_ADDR_H */ -- cgit v1.2.3 From 896b1ec8474937d7bff89d12e4554ca0bdb22284 Mon Sep 17 00:00:00 2001 From: Dennis Dalessandro Date: Mon, 18 Dec 2017 19:57:13 -0800 Subject: rdma: Update maintainer contact for Intel RDMA drivers Ensure both Mike and I are listed as maintainer contacts for Intel's qib, hfi1, and rdmavt drivers. Reviewed-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- MAINTAINERS | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index d4fdcb12616c..018a916ad58e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11184,7 +11184,8 @@ S: Maintained F: drivers/firmware/qemu_fw_cfg.c QIB DRIVER -M: Mike Marciniszyn +M: Dennis Dalessandro +M: Mike Marciniszyn L: linux-rdma@vger.kernel.org S: Supported F: drivers/infiniband/hw/qib/ @@ -11482,6 +11483,7 @@ F: drivers/net/ethernet/rdc/r6040.c RDMAVT - RDMA verbs software M: Dennis Dalessandro +M: Mike Marciniszyn L: linux-rdma@vger.kernel.org S: Supported F: drivers/infiniband/sw/rdmavt -- cgit v1.2.3 From f32b766cf787bba1bea111f43edac3f32b45a4e9 Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Fri, 22 Dec 2017 09:46:54 -0600 Subject: i40iw: Set MAX_IRD_SIZE to 64 Increase I40IW_MAX_IRD_SIZE to 64 which is the device limit. 
Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_user.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_user.h b/drivers/infiniband/hw/i40iw/i40iw_user.h index e73efc59a0ab..5467c6fdad03 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_user.h +++ b/drivers/infiniband/hw/i40iw/i40iw_user.h @@ -72,7 +72,7 @@ enum i40iw_device_capabilities_const { I40IW_MAX_SQ_PAYLOAD_SIZE = 2145386496, I40IW_MAX_INLINE_DATA_SIZE = 48, I40IW_MAX_PUSHMODE_INLINE_DATA_SIZE = 48, - I40IW_MAX_IRD_SIZE = 63, + I40IW_MAX_IRD_SIZE = 64, I40IW_MAX_ORD_SIZE = 127, I40IW_MAX_WQ_ENTRIES = 2048, I40IW_Q2_BUFFER_SIZE = (248 + 100), -- cgit v1.2.3 From 8758768ad8aa9fc0d56417315dec65b610fc3a21 Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Fri, 22 Dec 2017 09:46:55 -0600 Subject: i40iw: Use utility function roundup_pow_of_two() Consolidate all power of 2 round calculations to use kernel utility function roundup_pow_of_two(). Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_cm.c | 7 +------ drivers/infiniband/hw/i40iw/i40iw_ctrl.c | 15 ++++----------- drivers/infiniband/hw/i40iw/i40iw_uk.c | 18 ++---------------- 3 files changed, 7 insertions(+), 33 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index 493d6ef3d2d5..03e6fc67d955 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -92,14 +92,9 @@ void i40iw_free_sqbuf(struct i40iw_sc_vsi *vsi, void *bufp) static u8 i40iw_derive_hw_ird_setting(u16 cm_ird) { u8 encoded_ird_size; - u8 pof2_cm_ird = 1; - - /* round-off to next powerof2 */ - while (pof2_cm_ird < cm_ird) - pof2_cm_ird *= 2; /* ird_size field is encoded in qp_ctx */ - switch (pof2_cm_ird) { + switch (cm_ird ? 
roundup_pow_of_two(cm_ird) : 0) { case I40IW_HW_IRD_SETTING_64: encoded_ird_size = 3; break; diff --git a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c index d88c6cf47cf2..90171643cf7f 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c +++ b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c @@ -3855,7 +3855,6 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_ struct i40iw_virt_mem virt_mem; u32 i, mem_size; u32 qpwantedoriginal, qpwanted, mrwanted, pblewanted; - u32 powerof2; u64 sd_needed; u32 loop_count = 0; @@ -3928,16 +3927,10 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_ if ((loop_count > 1000) || ((!(loop_count % 10)) && (qpwanted > qpwantedoriginal * 2 / 3))) { - if (qpwanted > FPM_MULTIPLIER) { - qpwanted -= FPM_MULTIPLIER; - powerof2 = 1; - while (powerof2 < qpwanted) - powerof2 *= 2; - powerof2 /= 2; - qpwanted = powerof2; - } else { - qpwanted /= 2; - } + if (qpwanted > FPM_MULTIPLIER) + qpwanted = roundup_pow_of_two(qpwanted - + FPM_MULTIPLIER); + qpwanted >>= 1; } if (mrwanted > FPM_MULTIPLIER * 10) mrwanted -= FPM_MULTIPLIER * 10; diff --git a/drivers/infiniband/hw/i40iw/i40iw_uk.c b/drivers/infiniband/hw/i40iw/i40iw_uk.c index 3ec5389a81a1..8afa5a67a86b 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_uk.c +++ b/drivers/infiniband/hw/i40iw/i40iw_uk.c @@ -893,20 +893,6 @@ exit: return ret_code; } -/** - * i40iw_qp_roundup - return round up QP WQ depth - * @wqdepth: WQ depth in quantas to round up - */ -static int i40iw_qp_round_up(u32 wqdepth) -{ - int scount = 1; - - for (wqdepth--; scount <= 16; scount *= 2) - wqdepth |= wqdepth >> scount; - - return ++wqdepth; -} - /** * i40iw_get_wqe_shift - get shift count for maximum wqe size * @sge: Maximum Scatter Gather Elements wqe @@ -934,7 +920,7 @@ void i40iw_get_wqe_shift(u32 sge, u32 inline_data, u8 *shift) */ enum i40iw_status_code i40iw_get_sqdepth(u32 sq_size, u8 shift, u32 *sqdepth) { - *sqdepth = i40iw_qp_round_up((sq_size << shift) + I40IW_SQ_RSVD); + *sqdepth = roundup_pow_of_two((sq_size << shift) + I40IW_SQ_RSVD); if (*sqdepth < (I40IW_QP_SW_MIN_WQSIZE << shift)) *sqdepth = I40IW_QP_SW_MIN_WQSIZE << shift; @@ -953,7 +939,7 @@ enum i40iw_status_code i40iw_get_sqdepth(u32 sq_size, u8 shift, u32 *sqdepth) */ enum i40iw_status_code i40iw_get_rqdepth(u32 rq_size, u8 shift, u32 *rqdepth) { - *rqdepth = i40iw_qp_round_up((rq_size << shift) + I40IW_RQ_RSVD); + *rqdepth = roundup_pow_of_two((rq_size << shift) + I40IW_RQ_RSVD); if (*rqdepth < (I40IW_QP_SW_MIN_WQSIZE << shift)) *rqdepth = I40IW_QP_SW_MIN_WQSIZE << shift; -- cgit v1.2.3 From fe99afd1febd74e0ef1fed7e3283f09effe1f4f0 Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Fri, 22 Dec 2017 09:46:56 -0600 Subject: i40iw: Correct Q1/XF object count equation Lower Inbound RDMA Read Queue (Q1) object count by a factor of 2 as it is incorrectly doubled. Also, round up Q1 and Transmit FIFO (XF) object count to power of 2 to satisfy hardware requirement. 
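For a feel of the corrected sizing, a standalone sketch with example values; the two constants stand in for I40IW_MAX_WQ_ENTRIES (2048) and I40IW_MAX_IRD_SIZE (64), and the helper mimics the kernel's roundup_pow_of_two():

  #include <stdint.h>
  #include <stdio.h>

  static uint32_t rup_pow2(uint32_t n) /* userspace stand-in for roundup_pow_of_two() */
  {
          uint32_t p = 1;

          while (p < n)
                  p <<= 1;
          return p;
  }

  int main(void)
  {
          uint32_t qpwanted = 100;
          uint32_t xf = rup_pow2(2048 * qpwanted);   /* XF, rounded up */
          uint32_t q1 = rup_pow2(2 * 64 * qpwanted); /* Q1: 2x, no longer 4x */

          printf("xf=%u q1=%u\n", xf, q1);
          return 0;
  }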
Fixes: 86dbcd0f12e9 ("i40iw: add file to handle cqp calls") Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_ctrl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c index 90171643cf7f..e857f287a0c0 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c +++ b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c @@ -3910,8 +3910,10 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_ hmc_info->hmc_obj[I40IW_HMC_IW_APBVT_ENTRY].cnt = 1; hmc_info->hmc_obj[I40IW_HMC_IW_MR].cnt = mrwanted; - hmc_info->hmc_obj[I40IW_HMC_IW_XF].cnt = I40IW_MAX_WQ_ENTRIES * qpwanted; - hmc_info->hmc_obj[I40IW_HMC_IW_Q1].cnt = 4 * I40IW_MAX_IRD_SIZE * qpwanted; + hmc_info->hmc_obj[I40IW_HMC_IW_XF].cnt = + roundup_pow_of_two(I40IW_MAX_WQ_ENTRIES * qpwanted); + hmc_info->hmc_obj[I40IW_HMC_IW_Q1].cnt = + roundup_pow_of_two(2 * I40IW_MAX_IRD_SIZE * qpwanted); hmc_info->hmc_obj[I40IW_HMC_IW_XFFL].cnt = hmc_info->hmc_obj[I40IW_HMC_IW_XF].cnt / hmc_fpm_misc->xf_block_size; hmc_info->hmc_obj[I40IW_HMC_IW_Q1FL].cnt = -- cgit v1.2.3 From 0c5d5155463cf08492a97b596c5c24636762c310 Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Fri, 22 Dec 2017 09:46:57 -0600 Subject: i40iw: Add notifier for network device events Register a netdevice notifier for netdev UP/DOWN notification events and report the appropriate ib event. Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw.h | 3 ++ drivers/infiniband/hw/i40iw/i40iw_main.c | 6 ++++ drivers/infiniband/hw/i40iw/i40iw_utils.c | 50 +++++++++++++++++++++++++++++-- 3 files changed, 56 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw.h b/drivers/infiniband/hw/i40iw/i40iw.h index 4ae9131b6350..bcddd7061fc0 100644 --- a/drivers/infiniband/hw/i40iw/i40iw.h +++ b/drivers/infiniband/hw/i40iw/i40iw.h @@ -587,5 +587,8 @@ int i40iw_inet6addr_event(struct notifier_block *notifier, int i40iw_net_event(struct notifier_block *notifier, unsigned long event, void *ptr); +int i40iw_netdevice_event(struct notifier_block *notifier, + unsigned long event, + void *ptr); #endif diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c index 3aecf4cbb128..a4a845825565 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_main.c +++ b/drivers/infiniband/hw/i40iw/i40iw_main.c @@ -99,6 +99,10 @@ static struct notifier_block i40iw_net_notifier = { .notifier_call = i40iw_net_event }; +static struct notifier_block i40iw_netdevice_notifier = { + .notifier_call = i40iw_netdevice_event +}; + /** * i40iw_find_i40e_handler - find a handler given a client info * @ldev: pointer to a client info @@ -1394,6 +1398,7 @@ static void i40iw_register_notifiers(void) register_inetaddr_notifier(&i40iw_inetaddr_notifier); register_inet6addr_notifier(&i40iw_inetaddr6_notifier); register_netevent_notifier(&i40iw_net_notifier); + register_netdevice_notifier(&i40iw_netdevice_notifier); } /** @@ -1405,6 +1410,7 @@ static void i40iw_unregister_notifiers(void) unregister_netevent_notifier(&i40iw_net_notifier); unregister_inetaddr_notifier(&i40iw_inetaddr_notifier); unregister_inet6addr_notifier(&i40iw_inetaddr6_notifier); + unregister_netdevice_notifier(&i40iw_netdevice_notifier); } /** diff --git a/drivers/infiniband/hw/i40iw/i40iw_utils.c b/drivers/infiniband/hw/i40iw/i40iw_utils.c index 8845dba7c438..ddc1056b0b4e 100644 --- 
a/drivers/infiniband/hw/i40iw/i40iw_utils.c +++ b/drivers/infiniband/hw/i40iw/i40iw_utils.c @@ -137,7 +137,7 @@ inline u32 i40iw_rd32(struct i40iw_hw *hw, u32 reg) } /** - * i40iw_inetaddr_event - system notifier for netdev events + * i40iw_inetaddr_event - system notifier for ipv4 addr events * @notfier: not used * @event: event for notifier * @ptr: if address @@ -200,7 +200,7 @@ int i40iw_inetaddr_event(struct notifier_block *notifier, } /** - * i40iw_inet6addr_event - system notifier for ipv6 netdev events + * i40iw_inet6addr_event - system notifier for ipv6 addr events * @notfier: not used * @event: event for notifier * @ptr: if address @@ -252,7 +252,7 @@ int i40iw_inet6addr_event(struct notifier_block *notifier, } /** - * i40iw_net_event - system notifier for net events + * i40iw_net_event - system notifier for netevents * @notfier: not used * @event: event for notifier * @ptr: neighbor @@ -296,6 +296,50 @@ int i40iw_net_event(struct notifier_block *notifier, unsigned long event, void * return NOTIFY_DONE; } +/** + * i40iw_netdevice_event - system notifier for netdev events + * @notfier: not used + * @event: event for notifier + * @ptr: netdev + */ +int i40iw_netdevice_event(struct notifier_block *notifier, + unsigned long event, + void *ptr) +{ + struct net_device *event_netdev; + struct net_device *netdev; + struct i40iw_device *iwdev; + struct i40iw_handler *hdl; + + event_netdev = netdev_notifier_info_to_dev(ptr); + + hdl = i40iw_find_netdev(event_netdev); + if (!hdl) + return NOTIFY_DONE; + + iwdev = &hdl->device; + if (iwdev->init_state < RDMA_DEV_REGISTERED || iwdev->closing) + return NOTIFY_DONE; + + netdev = iwdev->ldev->netdev; + if (netdev != event_netdev) + return NOTIFY_DONE; + + iwdev->iw_status = 1; + + switch (event) { + case NETDEV_DOWN: + iwdev->iw_status = 0; + /* Fall through */ + case NETDEV_UP: + i40iw_port_ibevent(iwdev); + break; + default: + break; + } + return NOTIFY_DONE; +} + /** * i40iw_get_cqp_request - get cqp struct * @cqp: device cqp ptr -- cgit v1.2.3 From 3020f252c3aa7bd59c5df38671f1ef13a0426e40 Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Fri, 22 Dec 2017 09:46:58 -0600 Subject: i40iw: Selectively teardown QPs on IP addr change event On an IP address change event, all connected QPs are torn down irrespective of whether the IP address is involved in a connection. Only tear down connections whose source or destination address matches the netdev interface IP address being changed, and only if they are on the same VLAN as the netdev.
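The selection rule the patch implements can be stated as one predicate. An illustrative fragment with simplified stand-in types, not the driver's structures:

  #include <stdbool.h>
  #include <stdint.h>
  #include <string.h>

  struct node { uint16_t vlan_id; uint32_t loc_addr[4]; uint32_t rem_addr[4]; };

  /* Tear a node down when disconnecting everything, or when its VLAN
   * matches and either endpoint carries the changed address. */
  static bool should_teardown(const struct node *n, const uint32_t *ipaddr,
                              uint16_t vlan_id, bool ipv4, bool disconnect_all)
  {
          size_t len = ipv4 ? 4 : 16;

          return disconnect_all ||
                 (vlan_id == n->vlan_id &&
                  (!memcmp(n->loc_addr, ipaddr, len) ||
                   !memcmp(n->rem_addr, ipaddr, len)));
  }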
Fixes: e5e74b61b165 ("i40iw: Add IP addr handling on netdev events") Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_cm.c | 28 ++++++++++++++++++++-------- drivers/infiniband/hw/i40iw/i40iw_cm.h | 4 +++- drivers/infiniband/hw/i40iw/i40iw_main.c | 2 +- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index 03e6fc67d955..0cc52e64da3e 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -4228,10 +4228,16 @@ set_qhash: } /** - * i40iw_cm_disconnect_all - disconnect all connected qp's + * i40iw_cm_teardown_connections - teardown QPs * @iwdev: device pointer + * @ipaddr: Pointer to IPv4 or IPv6 address + * @ipv4: flag indicating IPv4 when true + * @disconnect_all: flag indicating disconnect all QPs + * teardown QPs where source or destination addr matches ip addr */ -void i40iw_cm_disconnect_all(struct i40iw_device *iwdev) +void i40iw_cm_teardown_connections(struct i40iw_device *iwdev, u32 *ipaddr, + struct i40iw_cm_info *nfo, + bool disconnect_all) { struct i40iw_cm_core *cm_core = &iwdev->cm_core; struct list_head *list_core_temp; @@ -4245,8 +4251,13 @@ void i40iw_cm_disconnect_all(struct i40iw_device *iwdev) spin_lock_irqsave(&cm_core->ht_lock, flags); list_for_each_safe(list_node, list_core_temp, &cm_core->connected_nodes) { cm_node = container_of(list_node, struct i40iw_cm_node, list); - atomic_inc(&cm_node->ref_count); - list_add(&cm_node->connected_entry, &connected_list); + if (disconnect_all || + (nfo->vlan_id == cm_node->vlan_id && + (!memcmp(cm_node->loc_addr, ipaddr, nfo->ipv4 ? 4 : 16) || + !memcmp(cm_node->rem_addr, ipaddr, nfo->ipv4 ? 4 : 16)))) { + atomic_inc(&cm_node->ref_count); + list_add(&cm_node->connected_entry, &connected_list); + } } spin_unlock_irqrestore(&cm_core->ht_lock, flags); @@ -4280,6 +4291,9 @@ void i40iw_if_notify(struct i40iw_device *iwdev, struct net_device *netdev, enum i40iw_quad_hash_manage_type op = ifup ? 
I40IW_QHASH_MANAGE_TYPE_ADD : I40IW_QHASH_MANAGE_TYPE_DELETE; + nfo.vlan_id = vlan_id; + nfo.ipv4 = ipv4; + /* Disable or enable qhash for listeners */ spin_lock_irqsave(&cm_core->listen_list_lock, flags); list_for_each_entry(listen_node, &cm_core->listen_nodes, list) { @@ -4289,8 +4303,6 @@ void i40iw_if_notify(struct i40iw_device *iwdev, struct net_device *netdev, memcpy(nfo.loc_addr, listen_node->loc_addr, sizeof(nfo.loc_addr)); nfo.loc_port = listen_node->loc_port; - nfo.ipv4 = listen_node->ipv4; - nfo.vlan_id = listen_node->vlan_id; nfo.user_pri = listen_node->user_pri; if (!list_empty(&listen_node->child_listen_list)) { i40iw_qhash_ctrl(iwdev, @@ -4312,7 +4324,7 @@ void i40iw_if_notify(struct i40iw_device *iwdev, struct net_device *netdev, } spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); - /* disconnect any connected qp's on ifdown */ + /* teardown connected qp's on ifdown */ if (!ifup) - i40iw_cm_disconnect_all(iwdev); + i40iw_cm_teardown_connections(iwdev, ipaddr, &nfo, false); } diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.h b/drivers/infiniband/hw/i40iw/i40iw_cm.h index 1577608fc51a..47638a44e60b 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.h +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.h @@ -453,5 +453,7 @@ int i40iw_arp_table(struct i40iw_device *iwdev, void i40iw_if_notify(struct i40iw_device *iwdev, struct net_device *netdev, u32 *ipaddr, bool ipv4, bool ifup); -void i40iw_cm_disconnect_all(struct i40iw_device *iwdev); +void i40iw_cm_teardown_connections(struct i40iw_device *iwdev, u32 *ipaddr, + struct i40iw_cm_info *nfo, + bool disconnect_all); #endif /* I40IW_CM_H */ diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c index a4a845825565..8a9815e43282 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_main.c +++ b/drivers/infiniband/hw/i40iw/i40iw_main.c @@ -1800,7 +1800,7 @@ static void i40iw_close(struct i40e_info *ldev, struct i40e_client *client, bool if (reset) iwdev->reset = true; - i40iw_cm_disconnect_all(iwdev); + i40iw_cm_teardown_connections(iwdev, NULL, NULL, true); destroy_workqueue(iwdev->virtchnl_wq); i40iw_deinit_device(iwdev); } -- cgit v1.2.3 From df8b13a1b23356d01dfc4647a5629cdb0f4ce566 Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Fri, 22 Dec 2017 09:46:59 -0600 Subject: i40iw: Fix sequence number for the first partial FPDU Partial FPDU processing is broken as the sequence number for the first partial FPDU is wrong due to incorrect Q2 buffer offset. The offset should be 64 rather than 16. 
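The mix-up is byte indexing versus 32-bit offsets: q2_buf is a byte pointer, so q2_buf[16] reads a single byte at offset 16 instead of the 32-bit FPSN at byte offset 64. A small standalone illustration (memcpy is used here only to sidestep alignment concerns in the sketch):

  #include <inttypes.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  #define Q2_FPSN_OFFSET 64

  int main(void)
  {
          uint8_t q2_buf[128] = { 0 };
          uint32_t want = 0x12345678, got;

          memcpy(q2_buf + Q2_FPSN_OFFSET, &want, sizeof(want));

          got = q2_buf[16];                  /* old code: one byte at offset 16 */
          printf("wrong: 0x%" PRIx32 "\n", got);

          memcpy(&got, q2_buf + Q2_FPSN_OFFSET, sizeof(got)); /* fixed load */
          printf("right: 0x%" PRIx32 "\n", got);
          return 0;
  }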
Fixes: 786c6adb3a94 ("i40iw: add puda code") Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_d.h | 1 + drivers/infiniband/hw/i40iw/i40iw_puda.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_d.h b/drivers/infiniband/hw/i40iw/i40iw_d.h index 65ec39e3746b..b8565872d744 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_d.h +++ b/drivers/infiniband/hw/i40iw/i40iw_d.h @@ -97,6 +97,7 @@ #define RDMA_OPCODE_MASK 0x0f #define RDMA_READ_REQ_OPCODE 1 #define Q2_BAD_FRAME_OFFSET 72 +#define Q2_FPSN_OFFSET 64 #define CQE_MAJOR_DRV 0x8000 #define I40IW_TERM_SENT 0x01 diff --git a/drivers/infiniband/hw/i40iw/i40iw_puda.c b/drivers/infiniband/hw/i40iw/i40iw_puda.c index 796a815b53fd..f64b6700f43f 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_puda.c +++ b/drivers/infiniband/hw/i40iw/i40iw_puda.c @@ -1378,7 +1378,7 @@ static void i40iw_ieq_handle_exception(struct i40iw_puda_rsrc *ieq, u32 *hw_host_ctx = (u32 *)qp->hw_host_ctx; u32 rcv_wnd = hw_host_ctx[23]; /* first partial seq # in q2 */ - u32 fps = qp->q2_buf[16]; + u32 fps = *(u32 *)(qp->q2_buf + Q2_FPSN_OFFSET); struct list_head *rxlist = &pfpdu->rxlist; struct list_head *plist; -- cgit v1.2.3 From 756f77d216f8c62659d6b0c4b851314c97f794fd Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Fri, 22 Dec 2017 09:47:00 -0600 Subject: i40iw: Ignore LLP_DOUBT_REACHABILITY AE The LLP_DOUBT_REACHABILITY Asynchronous Event (AE) is an early warning of a connection issue. It is followed by LLP_TOO_MANY_RETRIES AE, if the retransmit threshold is reached and recovery is not possible for the connection. Currently we terminate the connection on receiving the LLP_DOUBT_REACHABILITY AE. Ignore this AE and terminate the connection only on LLP_TOO_MANY_RETRIES AE. This improves the user experience on cable disconnect/reconnect scenario while running iWARP traffic. On cable disconnect, the QP traffic is paused and the user has a larger and more reasonable timeout within which if the cable is reconnected, traffic can continue. Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_hw.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_hw.c b/drivers/infiniband/hw/i40iw/i40iw_hw.c index e96bdafbcbb3..61540e14e4b9 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_hw.c +++ b/drivers/infiniband/hw/i40iw/i40iw_hw.c @@ -385,6 +385,8 @@ void i40iw_process_aeq(struct i40iw_device *iwdev) iwcq->ibcq.event_handler(&ibevent, iwcq->ibcq.cq_context); } break; + case I40IW_AE_LLP_DOUBT_REACHABILITY: + break; case I40IW_AE_PRIV_OPERATION_DENIED: case I40IW_AE_STAG_ZERO_INVALID: case I40IW_AE_IB_RREQ_AND_Q1_FULL: @@ -403,7 +405,6 @@ void i40iw_process_aeq(struct i40iw_device *iwdev) case I40IW_AE_LLP_SEGMENT_TOO_SMALL: case I40IW_AE_LLP_SYN_RECEIVED: case I40IW_AE_LLP_TOO_MANY_RETRIES: - case I40IW_AE_LLP_DOUBT_REACHABILITY: case I40IW_AE_LCE_QP_CATASTROPHIC: case I40IW_AE_LCE_FUNCTION_CATASTROPHIC: case I40IW_AE_LCE_CQ_CATASTROPHIC: -- cgit v1.2.3 From ce9ce74145aa6814a370a9ff4f5a1d719baaced1 Mon Sep 17 00:00:00 2001 From: Tatyana Nikolova Date: Fri, 22 Dec 2017 09:47:01 -0600 Subject: i40iw: Validate correct IRD/ORD connection parameters Casting to u16 before validating IRD/ORD connection parameters could cause recording wrong IRD/ORD values in the cm_node. Validate the IRD/ORD parameters as they are passed by the application before recording them. 
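The failure mode is silent wrap-around. A minimal demonstration with an arbitrary out-of-range value (the constant mirrors the driver's I40IW_MAX_IRD_SIZE):

  #include <stdint.h>
  #include <stdio.h>

  #define MAX_IRD_SIZE 64 /* stand-in for I40IW_MAX_IRD_SIZE */

  int main(void)
  {
          uint32_t ird = 0x10020;        /* 65568: far above the limit */
          uint16_t cast = (uint16_t)ird; /* wraps to 32: looks in-range */

          printf("raw=%u cast=%u clamp_on_raw=%d clamp_on_cast=%d\n",
                 (unsigned)ird, (unsigned)cast,
                 ird > MAX_IRD_SIZE, cast > MAX_IRD_SIZE);
          return 0;
  }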
Fixes: f27b4746f378 ("i40iw: add connection management code") Signed-off-by: Tatyana Nikolova Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_cm.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index 0cc52e64da3e..341f1380c2a1 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -120,7 +120,8 @@ static u8 i40iw_derive_hw_ird_setting(u16 cm_ird) * @conn_ird: connection IRD * @conn_ord: connection ORD */ -static void i40iw_record_ird_ord(struct i40iw_cm_node *cm_node, u16 conn_ird, u16 conn_ord) +static void i40iw_record_ird_ord(struct i40iw_cm_node *cm_node, u32 conn_ird, + u32 conn_ord) { if (conn_ird > I40IW_MAX_IRD_SIZE) conn_ird = I40IW_MAX_IRD_SIZE; @@ -3842,7 +3843,7 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) } cm_node->apbvt_set = true; - i40iw_record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord); + i40iw_record_ird_ord(cm_node, conn_param->ird, conn_param->ord); if (cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO && !cm_node->ord_size) cm_node->ord_size = 1; -- cgit v1.2.3 From fefa06811cc7b1c25904579833b4f319cde3ce7f Mon Sep 17 00:00:00 2001 From: Tatyana Nikolova Date: Fri, 22 Dec 2017 09:47:02 -0600 Subject: i40iw: Fix the connection ORD value for loopback The accepting QP ORD value should be adjusted not to exceed the peer QP IRD value (RFC 6581). This is skipped for loopback. After the ORD is validated by i40iw_record_ird_ord(), adjust the ORD value of the loopback accepting QP to prevent overrunning the IRD space of the peer QP. Also move the ORD accounting for 0-byte RDMA read to i40iw_record_ird_ord(). 
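In isolation, the RFC 6581 adjustment is a clamp of the accepting side's ORD to the peer's IRD; sketched below with plain integers (illustrative only, not the driver code):

  /* Never issue more outstanding RDMA Reads (ORD) than the peer
   * advertises room for (IRD). */
  static inline unsigned int clamp_ord_to_peer_ird(unsigned int ord,
                                                   unsigned int peer_ird)
  {
          return ord > peer_ird ? peer_ird : ord;
  }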
Fixes: f27b4746f378 ("i40iw: add connection management code") Signed-off-by: Tatyana Nikolova Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_cm.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index 341f1380c2a1..97b620382faf 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -128,6 +128,8 @@ static void i40iw_record_ird_ord(struct i40iw_cm_node *cm_node, u32 conn_ird, if (conn_ord > I40IW_MAX_ORD_SIZE) conn_ord = I40IW_MAX_ORD_SIZE; + else if (!conn_ord && cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO) + conn_ord = 1; cm_node->ird_size = conn_ird; cm_node->ord_size = conn_ord; @@ -2874,15 +2876,13 @@ static struct i40iw_cm_listener *i40iw_make_listen_node( * i40iw_create_cm_node - make a connection node with params * @cm_core: cm's core * @iwdev: iwarp device structure - * @private_data_len: len to provate data for mpa request - * @private_data: pointer to private data for connection + * @conn_param: upper layer connection parameters * @cm_info: quad info for connection */ static struct i40iw_cm_node *i40iw_create_cm_node( struct i40iw_cm_core *cm_core, struct i40iw_device *iwdev, - u16 private_data_len, - void *private_data, + struct iw_cm_conn_param *conn_param, struct i40iw_cm_info *cm_info) { struct i40iw_cm_node *cm_node; @@ -2890,6 +2890,9 @@ static struct i40iw_cm_node *i40iw_create_cm_node( struct i40iw_cm_node *loopback_remotenode; struct i40iw_cm_info loopback_cm_info; + u16 private_data_len = conn_param->private_data_len; + const void *private_data = conn_param->private_data; + /* create a CM connection node */ cm_node = i40iw_make_cm_node(cm_core, iwdev, cm_info, NULL); if (!cm_node) @@ -2898,6 +2901,8 @@ static struct i40iw_cm_node *i40iw_create_cm_node( cm_node->tcp_cntxt.client = 1; cm_node->tcp_cntxt.rcv_wscale = I40IW_CM_DEFAULT_RCV_WND_SCALE; + i40iw_record_ird_ord(cm_node, conn_param->ird, conn_param->ord); + if (!memcmp(cm_info->loc_addr, cm_info->rem_addr, sizeof(cm_info->loc_addr))) { loopback_remotelistener = i40iw_find_listener( cm_core, @@ -2931,6 +2936,10 @@ static struct i40iw_cm_node *i40iw_create_cm_node( private_data_len); loopback_remotenode->pdata.size = private_data_len; + if (loopback_remotenode->ord_size > cm_node->ird_size) + loopback_remotenode->ord_size = + cm_node->ird_size; + cm_node->state = I40IW_CM_STATE_OFFLOADED; cm_node->tcp_cntxt.rcv_nxt = loopback_remotenode->tcp_cntxt.loc_seq_num; @@ -3809,9 +3818,7 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) __func__, cm_id->tos, cm_info.user_pri); cm_id->add_ref(cm_id); cm_node = i40iw_create_cm_node(&iwdev->cm_core, iwdev, - conn_param->private_data_len, - (void *)conn_param->private_data, - &cm_info); + conn_param, &cm_info); if (IS_ERR(cm_node)) { ret = PTR_ERR(cm_node); @@ -3843,11 +3850,6 @@ int i40iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) } cm_node->apbvt_set = true; - i40iw_record_ird_ord(cm_node, conn_param->ird, conn_param->ord); - if (cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO && - !cm_node->ord_size) - cm_node->ord_size = 1; - iwqp->cm_node = cm_node; cm_node->iwqp = iwqp; iwqp->cm_id = cm_id; -- cgit v1.2.3 From 2ef7f2e2706cee6ad7b5eb6ef8d85824008f0425 Mon Sep 17 00:00:00 2001 From: Don Hiatt Date: Fri, 22 Dec 2017 08:45:52 -0800 Subject: IB/core: Use rdma_cap_opa_mad to check for OPA Use 
rdma_cap_opa_mad() to check for OPA to promote code reuse. Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/user_mad.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 3ef8ee5d4aa2..d4de187b1064 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -233,8 +233,7 @@ static void recv_handler(struct ib_mad_agent *agent, * On OPA devices it is okay to lose the upper 16 bits of LID as this * information is obtained elsewhere. Mask off the upper 16 bits. */ - if (agent->device->port_immutable[agent->port_num].core_cap_flags & - RDMA_CORE_PORT_INTEL_OPA) + if (rdma_cap_opa_mad(agent->device, agent->port_num)) packet->mad.hdr.lid = ib_lid_be16(0xFFFF & mad_recv_wc->wc->slid); else -- cgit v1.2.3 From 2e903b611b3e5b6ef1bf875a747368ec6b30f667 Mon Sep 17 00:00:00 2001 From: Don Hiatt Date: Fri, 22 Dec 2017 08:46:00 -0800 Subject: IB/hfi1: Change slid arg in ingress_pkey_table_fail to 32bit Change the slid arg to ingress_pkey_table_fail() to a full 32Bits and do not convert to 16Bits in caller. This is so we can keep everything 32bit in the kernel and only change to 16bit at the uapi boundary. Signed-off-by: Don Hiatt Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/hfi.h | 2 +- drivers/infiniband/hw/hfi1/mad.c | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 4a9b4d7efe63..85b95520db5d 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1624,7 +1624,7 @@ static int ingress_pkey_table_search(struct hfi1_pportdata *ppd, u16 pkey) * the 'error info' for this failure. */ static void ingress_pkey_table_fail(struct hfi1_pportdata *ppd, u16 pkey, - u16 slid) + u32 slid) { struct hfi1_devdata *dd = ppd->dd; diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index cf8dba34fe30..34547a48a445 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -4348,11 +4348,7 @@ static int opa_local_smp_check(struct hfi1_ibport *ibp, */ if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY) return 0; - /* - * On OPA devices it is okay to lose the upper 16 bits of LID as this - * information is obtained elsewhere. Mask off the upper 16 bits. - */ - ingress_pkey_table_fail(ppd, pkey, ib_lid_cpu16(0xFFFF & in_wc->slid)); + ingress_pkey_table_fail(ppd, pkey, in_wc->slid); return 1; } -- cgit v1.2.3 From 727b7e9a65c01569f8e4f9809d6f2be8c331c42a Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Tue, 14 Nov 2017 14:51:56 +0200 Subject: IB/core: Make sure that PSN does not overflow The rq/sq->psn is 24 bits as defined in the IB spec, therefore we mask out the 8 most significant bits to avoid overflow in modify_qp. 
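The masking itself is simple; a standalone example with an arbitrary overflowing PSN:

  #include <inttypes.h>
  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          uint32_t psn = 0x1abcdef0; /* upper 8 bits set by mistake */

          if (psn & ~0xffffffu)
                  psn &= 0xffffffu; /* keep the 24 bits the spec defines */

          printf("0x%06" PRIx32 "\n", psn); /* 0xbcdef0 */
          return 0;
  }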
Signed-off-by: Majd Dibbiny Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/verbs.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 43e025435418..b53fb0e98751 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1320,6 +1320,7 @@ static int ib_resolve_eth_dmac(struct ib_device *device, int ib_modify_qp_with_udata(struct ib_qp *qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { + u8 port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; int ret; if (attr_mask & IB_QP_AV) { @@ -1327,6 +1328,21 @@ int ib_modify_qp_with_udata(struct ib_qp *qp, struct ib_qp_attr *attr, if (ret) return ret; } + + if (rdma_ib_or_roce(qp->device, port)) { + if (attr_mask & IB_QP_RQ_PSN && attr->rq_psn & ~0xffffff) { + pr_warn("%s: %s rq_psn overflow, masking to 24 bits\n", + __func__, qp->device->name); + attr->rq_psn &= 0xffffff; + } + + if (attr_mask & IB_QP_SQ_PSN && attr->sq_psn & ~0xffffff) { + pr_warn("%s: %s sq_psn overflow, masking to 24 bits\n", + __func__, qp->device->name); + attr->sq_psn &= 0xffffff; + } + } + ret = ib_security_modify_qp(qp, attr, attr_mask, udata); if (!ret && (attr_mask & IB_QP_PORT)) qp->port = attr->port_num; -- cgit v1.2.3 From efac5ac052d91629e994ddec6771a7c413f067a0 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 27 Dec 2017 16:33:57 -0800 Subject: infiniband: drop unknown function from core_priv.h Delete ibnl_chk_listeners() and its kernel-doc comments from the core_priv.h header file. There is no such function. Fixes: 233c1955835b ("RDMA/netlink: Reduce exposure of RDMA netlink functions") Signed-off-by: Randy Dunlap Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/core_priv.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 6c4541af54bb..0deff4c4911b 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -191,13 +191,6 @@ void ib_sa_cleanup(void); int rdma_nl_init(void); void rdma_nl_exit(void); -/** - * Check if there are any listeners to the netlink group - * @group: the netlink group ID - * Returns 0 on success or a negative for no listeners. - */ -int ibnl_chk_listeners(unsigned int group); - int ib_nl_handle_resolve_resp(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); -- cgit v1.2.3 From 66131e005e24ef62ca1c10bcbaa0508b63fa9473 Mon Sep 17 00:00:00 2001 From: Henry Orosco Date: Sun, 17 Dec 2017 12:21:39 -0600 Subject: i40iw: Change accelerated flag to bool The accelerated flag only utilizes two values: 0 and 1. Modify accelerated flag in struct i40iw_cm_node to bool. 
Signed-off-by: Henry Orosco Signed-off-by: Shiraz Saleem Reviewed-by: Yuval Shaia Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_cm.c | 4 ++-- drivers/infiniband/hw/i40iw/i40iw_cm.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index 77870f9e1736..0d4c2f430288 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -3691,7 +3691,7 @@ int i40iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) cm_node->qhash_set = false; i40iw_modify_qp(&iwqp->ibqp, &attr, IB_QP_STATE, NULL); - cm_node->accelerated = 1; + cm_node->accelerated = true; status = i40iw_send_cm_event(cm_node, cm_id, IW_CM_EVENT_ESTABLISHED, 0); if (status) @@ -4058,7 +4058,7 @@ static void i40iw_cm_event_connected(struct i40iw_cm_event *event) cm_node->qhash_set = false; i40iw_modify_qp(&iwqp->ibqp, &attr, IB_QP_STATE, NULL); - cm_node->accelerated = 1; + cm_node->accelerated = true; status = i40iw_send_cm_event(cm_node, cm_id, IW_CM_EVENT_CONNECT_REPLY, 0); if (status) diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.h b/drivers/infiniband/hw/i40iw/i40iw_cm.h index 0d5840d2c4fc..e09b1f5ecf19 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.h +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.h @@ -337,7 +337,7 @@ struct i40iw_cm_node { u16 mpav2_ird_ord; struct iw_cm_id *cm_id; struct list_head list; - int accelerated; + bool accelerated; struct i40iw_cm_listener *listener; int apbvt_set; int accept_pend; -- cgit v1.2.3 From 926aae273019cc137ed3711077b89a71319e5983 Mon Sep 17 00:00:00 2001 From: Bryan Tan Date: Wed, 20 Dec 2017 09:50:57 -0800 Subject: RDMA/vmw_pvrdma: Add UAR SRQ macros in ABI header file Support for SRQs were added in the vmw_pvrdma userlevel library before two necessary macros were added into the kernel ABI header file. Add the two UAR SRQ macros that are required by the userlevel library so that the library can rely on the kernel ABI header file for these SRQ macro definitions. Fixes: 8b10ba783c9d ("RDMA/vmw_pvrdma: Add shared receive queue support") Reviewed-by: Adit Ranadive Reviewed-by: Aditya Sarwade Reviewed-by: Jorgen Hansen Signed-off-by: Bryan Tan Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/vmw_pvrdma-abi.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/uapi/rdma/vmw_pvrdma-abi.h b/include/uapi/rdma/vmw_pvrdma-abi.h index aaa352f2f110..4007cacb1792 100644 --- a/include/uapi/rdma/vmw_pvrdma-abi.h +++ b/include/uapi/rdma/vmw_pvrdma-abi.h @@ -58,6 +58,8 @@ #define PVRDMA_UAR_CQ_ARM_SOL BIT(29) /* Arm solicited bit. */ #define PVRDMA_UAR_CQ_ARM BIT(30) /* Arm bit. */ #define PVRDMA_UAR_CQ_POLL BIT(31) /* Poll bit. */ +#define PVRDMA_UAR_SRQ_OFFSET 8 /* SRQ doorbell. */ +#define PVRDMA_UAR_SRQ_RECV BIT(30) /* Recv bit. */ enum pvrdma_wr_opcode { PVRDMA_WR_RDMA_WRITE, -- cgit v1.2.3 From 5aef7cf254879e7af41a6764608832450eefe909 Mon Sep 17 00:00:00 2001 From: Bryan Tan Date: Wed, 20 Dec 2017 11:24:33 -0800 Subject: RDMA/vmw_pvrdma: Clarify QP and CQ is_kernel logic Be more consistent in setting and checking is_kernel flag for QPs and CQs. 
Reviewed-by: Adit Ranadive Reviewed-by: Aditya Sarwade Reviewed-by: Jorgen Hansen Signed-off-by: Bryan Tan Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c | 9 ++++----- drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c | 7 +++---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c index e529622cefad..cc4616124983 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c @@ -132,8 +132,9 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, } cq->ibcq.cqe = entries; + cq->is_kernel = !context; - if (context) { + if (!cq->is_kernel) { if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { ret = -EFAULT; goto err_cq; @@ -148,8 +149,6 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, npages = ib_umem_page_count(cq->umem); } else { - cq->is_kernel = true; - /* One extra page for shared ring state */ npages = 1 + (entries * sizeof(struct pvrdma_cqe) + PAGE_SIZE - 1) / PAGE_SIZE; @@ -202,7 +201,7 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, dev->cq_tbl[cq->cq_handle % dev->dsr->caps.max_cq] = cq; spin_unlock_irqrestore(&dev->cq_tbl_lock, flags); - if (context) { + if (!cq->is_kernel) { cq->uar = &(to_vucontext(context)->uar); /* Copy udata back. */ @@ -219,7 +218,7 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, err_page_dir: pvrdma_page_dir_cleanup(dev, &cq->pdir); err_umem: - if (context) + if (!cq->is_kernel) ib_umem_release(cq->umem); err_cq: atomic_dec(&dev->num_cqs); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c index 4059308e1454..9dd556a7afe3 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c @@ -249,8 +249,9 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd, init_completion(&qp->free); qp->state = IB_QPS_RESET; + qp->is_kernel = !(pd->uobject && udata); - if (pd->uobject && udata) { + if (!qp->is_kernel) { dev_dbg(&dev->pdev->dev, "create queuepair from user space\n"); @@ -291,8 +292,6 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd, qp->npages_recv = 0; qp->npages = qp->npages_send + qp->npages_recv; } else { - qp->is_kernel = true; - ret = pvrdma_set_sq_size(to_vdev(pd->device), &init_attr->cap, qp); if (ret) @@ -394,7 +393,7 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd, err_pdir: pvrdma_page_dir_cleanup(dev, &qp->pdir); err_umem: - if (pd->uobject && udata) { + if (!qp->is_kernel) { if (qp->rumem) ib_umem_release(qp->rumem); if (qp->sumem) -- cgit v1.2.3 From 1a9ecf8d1a292f0f908e5605b58738a273d2a937 Mon Sep 17 00:00:00 2001 From: Bryan Tan Date: Wed, 20 Dec 2017 11:26:00 -0800 Subject: RDMA/vmw_pvrdma: Use more specific sizeof in kcalloc Convert the sizeof(void *) in two kcalloc calls to be more specific for the arrays that are being allocated. 
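A userspace analogue of the idiom, using calloc(); note that kernel code also commonly writes sizeof(*ptr) so the element size tracks the pointer's type automatically, an alternative to naming the struct explicitly as this patch does:

  #include <stdlib.h>

  struct cq; /* opaque element type, standing in for struct pvrdma_cq */

  int main(void)
  {
          unsigned int max_cq = 1024;
          /* sizeof(*tbl) stays correct even if tbl's type changes,
           * unlike a hard-coded sizeof(void *). */
          struct cq **tbl = calloc(max_cq, sizeof(*tbl));

          free(tbl);
          return !tbl;
  }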
Reviewed-by: Adit Ranadive Reviewed-by: Aditya Sarwade Reviewed-by: Jorgen Hansen Signed-off-by: Bryan Tan Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index e92681878c93..69835d1b3383 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -243,13 +243,13 @@ static int pvrdma_register_device(struct pvrdma_dev *dev) mutex_init(&dev->port_mutex); spin_lock_init(&dev->desc_lock); - dev->cq_tbl = kcalloc(dev->dsr->caps.max_cq, sizeof(void *), + dev->cq_tbl = kcalloc(dev->dsr->caps.max_cq, sizeof(struct pvrdma_cq *), GFP_KERNEL); if (!dev->cq_tbl) return ret; spin_lock_init(&dev->cq_tbl_lock); - dev->qp_tbl = kcalloc(dev->dsr->caps.max_qp, sizeof(void *), + dev->qp_tbl = kcalloc(dev->dsr->caps.max_qp, sizeof(struct pvrdma_qp *), GFP_KERNEL); if (!dev->qp_tbl) goto err_cq_free; -- cgit v1.2.3 From a61eb6136829173d51dd0c7f7248733025708eeb Mon Sep 17 00:00:00 2001 From: Bryan Tan Date: Wed, 20 Dec 2017 11:27:00 -0800 Subject: RDMA/vmw_pvrdma: Use refcount_t instead of atomic_t refcount_t is the preferred type for refcounts. Change the QP and CQ refcnt fields to use refcount_t. Reviewed-by: Adit Ranadive Reviewed-by: Aditya Sarwade Reviewed-by: Jorgen Hansen Signed-off-by: Bryan Tan Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/vmw_pvrdma/pvrdma.h | 4 ++-- drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c | 4 ++-- drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 12 ++++++------ drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h index 4f7bd3b6a315..44cb1cfba417 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma.h @@ -93,7 +93,7 @@ struct pvrdma_cq { struct pvrdma_page_dir pdir; u32 cq_handle; bool is_kernel; - atomic_t refcnt; + refcount_t refcnt; struct completion free; }; @@ -196,7 +196,7 @@ struct pvrdma_qp { u8 state; bool is_kernel; struct mutex mutex; /* QP state mutex. 
*/ - atomic_t refcnt; + refcount_t refcnt; struct completion free; }; diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c index cc4616124983..faa9478c14a6 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c @@ -177,7 +177,7 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev, else pvrdma_page_dir_insert_umem(&cq->pdir, cq->umem, 0); - atomic_set(&cq->refcnt, 1); + refcount_set(&cq->refcnt, 1); init_completion(&cq->free); spin_lock_init(&cq->cq_lock); @@ -229,7 +229,7 @@ err_cq: static void pvrdma_free_cq(struct pvrdma_dev *dev, struct pvrdma_cq *cq) { - if (atomic_dec_and_test(&cq->refcnt)) + if (refcount_dec_and_test(&cq->refcnt)) complete(&cq->free); wait_for_completion(&cq->free); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index 69835d1b3383..939ac2f7bf0f 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -333,7 +333,7 @@ static void pvrdma_qp_event(struct pvrdma_dev *dev, u32 qpn, int type) spin_lock_irqsave(&dev->qp_tbl_lock, flags); qp = dev->qp_tbl[qpn % dev->dsr->caps.max_qp]; if (qp) - atomic_inc(&qp->refcnt); + refcount_inc(&qp->refcnt); spin_unlock_irqrestore(&dev->qp_tbl_lock, flags); if (qp && qp->ibqp.event_handler) { @@ -346,7 +346,7 @@ static void pvrdma_qp_event(struct pvrdma_dev *dev, u32 qpn, int type) ibqp->event_handler(&e, ibqp->qp_context); } if (qp) { - if (atomic_dec_and_test(&qp->refcnt)) + if (refcount_dec_and_test(&qp->refcnt)) complete(&qp->free); } } @@ -359,7 +359,7 @@ static void pvrdma_cq_event(struct pvrdma_dev *dev, u32 cqn, int type) spin_lock_irqsave(&dev->cq_tbl_lock, flags); cq = dev->cq_tbl[cqn % dev->dsr->caps.max_cq]; if (cq) - atomic_inc(&cq->refcnt); + refcount_inc(&cq->refcnt); spin_unlock_irqrestore(&dev->cq_tbl_lock, flags); if (cq && cq->ibcq.event_handler) { @@ -372,7 +372,7 @@ static void pvrdma_cq_event(struct pvrdma_dev *dev, u32 cqn, int type) ibcq->event_handler(&e, ibcq->cq_context); } if (cq) { - if (atomic_dec_and_test(&cq->refcnt)) + if (refcount_dec_and_test(&cq->refcnt)) complete(&cq->free); } } @@ -531,13 +531,13 @@ static irqreturn_t pvrdma_intrx_handler(int irq, void *dev_id) spin_lock_irqsave(&dev->cq_tbl_lock, flags); cq = dev->cq_tbl[cqne->info % dev->dsr->caps.max_cq]; if (cq) - atomic_inc(&cq->refcnt); + refcount_inc(&cq->refcnt); spin_unlock_irqrestore(&dev->cq_tbl_lock, flags); if (cq && cq->ibcq.comp_handler) cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); if (cq) { - if (atomic_dec_and_test(&cq->refcnt)) + if (refcount_dec_and_test(&cq->refcnt)) complete(&cq->free); } pvrdma_idx_ring_inc(&ring->cons_head, ring_slots); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c index 9dd556a7afe3..7bf518bdbf21 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c @@ -245,7 +245,7 @@ struct ib_qp *pvrdma_create_qp(struct ib_pd *pd, spin_lock_init(&qp->sq.lock); spin_lock_init(&qp->rq.lock); mutex_init(&qp->mutex); - atomic_set(&qp->refcnt, 1); + refcount_set(&qp->refcnt, 1); init_completion(&qp->free); qp->state = IB_QPS_RESET; @@ -427,7 +427,7 @@ static void pvrdma_free_qp(struct pvrdma_qp *qp) pvrdma_unlock_cqs(scq, rcq, &scq_flags, &rcq_flags); - if (atomic_dec_and_test(&qp->refcnt)) + if (refcount_dec_and_test(&qp->refcnt)) complete(&qp->free); 
wait_for_completion(&qp->free); -- cgit v1.2.3 From d2acafea14e214421d08d9ae866c854feb47761d Mon Sep 17 00:00:00 2001 From: Bryan Tan Date: Wed, 20 Dec 2017 11:27:28 -0800 Subject: RDMA/vmw_pvrdma: Remove usage of BIT() from UAPI header BIT() should not be used in the UAPI header. Remove it. Signed-off-by: Bryan Tan Signed-off-by: Jason Gunthorpe --- include/uapi/rdma/vmw_pvrdma-abi.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/uapi/rdma/vmw_pvrdma-abi.h b/include/uapi/rdma/vmw_pvrdma-abi.h index 4007cacb1792..02ca0d0f1eb7 100644 --- a/include/uapi/rdma/vmw_pvrdma-abi.h +++ b/include/uapi/rdma/vmw_pvrdma-abi.h @@ -52,14 +52,14 @@ #define PVRDMA_UVERBS_ABI_VERSION 3 /* ABI Version. */ #define PVRDMA_UAR_HANDLE_MASK 0x00FFFFFF /* Bottom 24 bits. */ #define PVRDMA_UAR_QP_OFFSET 0 /* QP doorbell. */ -#define PVRDMA_UAR_QP_SEND BIT(30) /* Send bit. */ -#define PVRDMA_UAR_QP_RECV BIT(31) /* Recv bit. */ +#define PVRDMA_UAR_QP_SEND (1 << 30) /* Send bit. */ +#define PVRDMA_UAR_QP_RECV (1 << 31) /* Recv bit. */ #define PVRDMA_UAR_CQ_OFFSET 4 /* CQ doorbell. */ -#define PVRDMA_UAR_CQ_ARM_SOL BIT(29) /* Arm solicited bit. */ -#define PVRDMA_UAR_CQ_ARM BIT(30) /* Arm bit. */ -#define PVRDMA_UAR_CQ_POLL BIT(31) /* Poll bit. */ +#define PVRDMA_UAR_CQ_ARM_SOL (1 << 29) /* Arm solicited bit. */ +#define PVRDMA_UAR_CQ_ARM (1 << 30) /* Arm bit. */ +#define PVRDMA_UAR_CQ_POLL (1 << 31) /* Poll bit. */ #define PVRDMA_UAR_SRQ_OFFSET 8 /* SRQ doorbell. */ -#define PVRDMA_UAR_SRQ_RECV BIT(30) /* Recv bit. */ +#define PVRDMA_UAR_SRQ_RECV (1 << 30) /* Recv bit. */ enum pvrdma_wr_opcode { PVRDMA_WR_RDMA_WRITE, -- cgit v1.2.3 From 07d84f7b6adf3ea0d9f8c2bbee412a2a7aaa83e5 Mon Sep 17 00:00:00 2001 From: Guy Levi Date: Sun, 24 Dec 2017 14:51:24 +0200 Subject: IB/mlx4: Add support to RSS hash for inner headers Support RSS hash for inner headers according to a new flag, MLX4_IB_RX_HASH_INNER provided by the vendor channel. In case the flag is set, RSS hash will be done on the inner headers of VXLAN packets (which are encapsulated). Non-encapsulated packets will be hashed according to the outer headers. 
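The negotiation reduces to a small gate. An illustrative standalone check, where the flag value mirrors MLX4_IB_RX_HASH_INNER and the bool stands in for the tunnel-offload mode test:

  #include <stdbool.h>
  #include <stdint.h>

  #define RX_HASH_INNER (1ULL << 31) /* mirrors MLX4_IB_RX_HASH_INNER */

  /* Inner-header hashing may be granted only when the device runs in
   * VXLAN tunnel-offload mode; otherwise the request must be refused. */
  static bool rss_fields_ok(uint64_t rx_hash_fields_mask, bool vxlan_offload)
  {
          if (rx_hash_fields_mask & RX_HASH_INNER)
                  return vxlan_offload;
          return true; /* outer-only hashing is always acceptable */
  }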
Signed-off-by: Guy Levi Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/main.c | 6 ++++++ drivers/infiniband/hw/mlx4/qp.c | 14 ++++++++++++++ include/uapi/rdma/mlx4-abi.h | 7 ++++--- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 8c8a16791a3f..5695ce53fddb 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -589,6 +589,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, if (props->rss_caps.supported_qpts) { resp.rss_caps.rx_hash_function = MLX4_IB_RX_HASH_FUNC_TOEPLITZ; + resp.rss_caps.rx_hash_fields_mask = MLX4_IB_RX_HASH_SRC_IPV4 | MLX4_IB_RX_HASH_DST_IPV4 | @@ -598,6 +599,11 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, MLX4_IB_RX_HASH_DST_PORT_TCP | MLX4_IB_RX_HASH_SRC_PORT_UDP | MLX4_IB_RX_HASH_DST_PORT_UDP; + + if (dev->dev->caps.tunnel_offload_mode == + MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) + resp.rss_caps.rx_hash_fields_mask |= + MLX4_IB_RX_HASH_INNER; } } diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index f5f5c5960bba..f045491f2c14 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -734,6 +734,20 @@ static int set_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_rss *rss_ctx, return (-EOPNOTSUPP); } + if (ucmd->rx_hash_fields_mask & MLX4_IB_RX_HASH_INNER) { + if (dev->dev->caps.tunnel_offload_mode == + MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) { + /* + * Hash according to inner headers if exist, otherwise + * according to outer headers. + */ + rss_ctx->flags |= MLX4_RSS_BY_INNER_HEADERS_IPONLY; + } else { + pr_debug("RSS Hash for inner headers isn't supported\n"); + return (-EOPNOTSUPP); + } + } + return 0; } diff --git a/include/uapi/rdma/mlx4-abi.h b/include/uapi/rdma/mlx4-abi.h index 224b52b6279c..7f9c37346613 100644 --- a/include/uapi/rdma/mlx4-abi.h +++ b/include/uapi/rdma/mlx4-abi.h @@ -97,8 +97,8 @@ struct mlx4_ib_create_srq_resp { }; struct mlx4_ib_create_qp_rss { - __u64 rx_hash_fields_mask; - __u8 rx_hash_function; + __u64 rx_hash_fields_mask; /* Use enum mlx4_ib_rx_hash_fields */ + __u8 rx_hash_function; /* Use enum mlx4_ib_rx_hash_function_flags */ __u8 reserved[7]; __u8 rx_hash_key[40]; __u32 comp_mask; @@ -152,7 +152,8 @@ enum mlx4_ib_rx_hash_fields { MLX4_IB_RX_HASH_SRC_PORT_TCP = 1 << 4, MLX4_IB_RX_HASH_DST_PORT_TCP = 1 << 5, MLX4_IB_RX_HASH_SRC_PORT_UDP = 1 << 6, - MLX4_IB_RX_HASH_DST_PORT_UDP = 1 << 7 + MLX4_IB_RX_HASH_DST_PORT_UDP = 1 << 7, + MLX4_IB_RX_HASH_INNER = 1ULL << 31, }; #endif /* MLX4_ABI_USER_H */ -- cgit v1.2.3 From 4e2b53a5cb5a8243284dd7ec4980d2dc556e79f0 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Sun, 24 Dec 2017 14:51:25 +0200 Subject: IB/mlx5: Report inner RSS capability Add missing inner RSS support capability as part of the RSS supported fields. In addition change MLX5_RX_HASH_INNER to 1UL << 31 in order to define it as unsigned. 
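For background, shifting 1 into bit 31 of a signed int is undefined behaviour in C, which is why the unsigned suffix matters; a small demonstration (compiling the commented-out signed variant with -fsanitize=undefined flags it):

  #include <stdio.h>

  int main(void)
  {
          unsigned long ok = 1UL << 31; /* well-defined: 0x80000000 */

          /* int bad = 1 << 31; -- shifts into the sign bit: UB */
          printf("0x%lx\n", ok);
          return 0;
  }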
Fixes: 309fa3470fca ("IB/mlx5: Add support for RSS on the inner packet") Signed-off-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 3 ++- include/uapi/rdma/mlx5-abi.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 46c7367d3255..fb38da571f40 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -680,7 +680,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, MLX5_RX_HASH_SRC_PORT_TCP | MLX5_RX_HASH_DST_PORT_TCP | MLX5_RX_HASH_SRC_PORT_UDP | - MLX5_RX_HASH_DST_PORT_UDP; + MLX5_RX_HASH_DST_PORT_UDP | + MLX5_RX_HASH_INNER; resp.response_length += sizeof(resp.rss_caps); } } else { diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index a33e0517d3fd..062d14f07b61 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -307,7 +307,7 @@ enum mlx5_rx_hash_fields { MLX5_RX_HASH_SRC_PORT_UDP = 1 << 6, MLX5_RX_HASH_DST_PORT_UDP = 1 << 7, /* Save bits for future fields */ - MLX5_RX_HASH_INNER = 1 << 31 + MLX5_RX_HASH_INNER = (1UL << 31), }; struct mlx5_ib_create_qp_rss { -- cgit v1.2.3 From 31a78a5a7983141c17852d31eb3a1f70d8161225 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 24 Dec 2017 16:31:34 +0200 Subject: IB/mlx5: Extend UAR stuff to support dynamic allocation This patch extends the alloc context flow to be prepared for working with dynamic UAR allocations. Currently upon alloc context there is some fix size of UARs that are allocated (named 'static allocation') and there is no option to user application to ask for more or control which UAR will be used by which QP. In this patch the driver prepares its data structures to manage both the static and the dynamic allocations and let the user driver knows about the max value of dynamic blue-flame registers that are allowed. Downstream patches from this series will enable the dynamic allocation and the association as part of QP creation. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 37 +++++++++++++++++++++++++++--------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 6 +++--- drivers/infiniband/hw/mlx5/qp.c | 2 +- include/linux/mlx5/device.h | 2 ++ include/linux/mlx5/driver.h | 3 +++ include/uapi/rdma/mlx5-abi.h | 2 ++ 6 files changed, 39 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index fb38da571f40..b894bc5be384 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1240,9 +1240,18 @@ static void print_lib_caps(struct mlx5_ib_dev *dev, u64 caps) caps & MLX5_LIB_CAP_4K_UAR ? 
"y" : "n"); } +static u16 calc_dynamic_bfregs(int uars_per_sys_page) +{ + /* Large page with non 4k uar support might limit the dynamic size */ + if (uars_per_sys_page == 1 && PAGE_SIZE > 4096) + return MLX5_MIN_DYN_BFREGS; + + return MLX5_MAX_DYN_BFREGS; +} + static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k, struct mlx5_ib_alloc_ucontext_req_v2 *req, - u32 *num_sys_pages) + struct mlx5_bfreg_info *bfregi) { int uars_per_sys_page; int bfregs_per_sys_page; @@ -1259,16 +1268,21 @@ static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k, uars_per_sys_page = get_uars_per_sys_page(dev, lib_uar_4k); bfregs_per_sys_page = uars_per_sys_page * MLX5_NON_FP_BFREGS_PER_UAR; + /* This holds the required static allocation asked by the user */ req->total_num_bfregs = ALIGN(req->total_num_bfregs, bfregs_per_sys_page); - *num_sys_pages = req->total_num_bfregs / bfregs_per_sys_page; - if (req->num_low_latency_bfregs > req->total_num_bfregs - 1) return -EINVAL; - mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, allocated %d, using %d sys pages\n", + bfregi->num_static_sys_pages = req->total_num_bfregs / bfregs_per_sys_page; + bfregi->num_dyn_bfregs = ALIGN(calc_dynamic_bfregs(uars_per_sys_page), bfregs_per_sys_page); + bfregi->total_num_bfregs = req->total_num_bfregs + bfregi->num_dyn_bfregs; + bfregi->num_sys_pages = bfregi->total_num_bfregs / bfregs_per_sys_page; + + mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, allocated %d, total bfregs %d, using %d sys pages\n", MLX5_CAP_GEN(dev->mdev, uar_4k) ? "yes" : "no", lib_uar_4k ? "yes" : "no", ref_bfregs, - req->total_num_bfregs, *num_sys_pages); + req->total_num_bfregs, bfregi->total_num_bfregs, + bfregi->num_sys_pages); return 0; } @@ -1280,7 +1294,7 @@ static int allocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *conte int i; bfregi = &context->bfregi; - for (i = 0; i < bfregi->num_sys_pages; i++) { + for (i = 0; i < bfregi->num_static_sys_pages; i++) { err = mlx5_cmd_alloc_uar(dev->mdev, &bfregi->sys_pages[i]); if (err) goto error; @@ -1304,7 +1318,7 @@ static int deallocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *con int i; bfregi = &context->bfregi; - for (i = 0; i < bfregi->num_sys_pages; i++) { + for (i = 0; i < bfregi->num_static_sys_pages; i++) { err = mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]); if (err) { mlx5_ib_warn(dev, "failed to free uar %d\n", i); @@ -1419,13 +1433,13 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, bfregi = &context->bfregi; /* updates req->total_num_bfregs */ - err = calc_total_bfregs(dev, lib_uar_4k, &req, &bfregi->num_sys_pages); + err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi); if (err) goto out_ctx; mutex_init(&bfregi->lock); bfregi->lib_uar_4k = lib_uar_4k; - bfregi->count = kcalloc(req.total_num_bfregs, sizeof(*bfregi->count), + bfregi->count = kcalloc(bfregi->total_num_bfregs, sizeof(*bfregi->count), GFP_KERNEL); if (!bfregi->count) { err = -ENOMEM; @@ -1509,6 +1523,11 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, if (field_avail(typeof(resp), num_uars_per_page, udata->outlen)) resp.response_length += sizeof(resp.num_uars_per_page); + if (field_avail(typeof(resp), num_dyn_bfregs, udata->outlen)) { + resp.num_dyn_bfregs = bfregi->num_dyn_bfregs; + resp.response_length += sizeof(resp.num_dyn_bfregs); + } + err = ib_copy_to_udata(udata, &resp, resp.response_length); if (err) goto out_td; diff --git 
a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 2c5f3533bbc9..330c69cb87df 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1113,10 +1113,10 @@ static inline int get_uars_per_sys_page(struct mlx5_ib_dev *dev, bool lib_suppor MLX5_UARS_IN_PAGE : 1; } -static inline int get_num_uars(struct mlx5_ib_dev *dev, - struct mlx5_bfreg_info *bfregi) +static inline int get_num_static_uars(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) { - return get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * bfregi->num_sys_pages; + return get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * bfregi->num_static_sys_pages; } #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 31ad28853efa..1acc445a93cd 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -493,7 +493,7 @@ enum { static int max_bfregs(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi) { - return get_num_uars(dev, bfregi) * MLX5_NON_FP_BFREGS_PER_UAR; + return get_num_static_uars(dev, bfregi) * MLX5_NON_FP_BFREGS_PER_UAR; } static int num_med_bfreg(struct mlx5_ib_dev *dev, diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 409ffb14298a..52b8ea423dd2 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -244,6 +244,8 @@ enum { MLX5_NON_FP_BFREGS_PER_UAR, MLX5_UARS_IN_PAGE = PAGE_SIZE / MLX5_ADAPTER_PAGE_SIZE, MLX5_NON_FP_BFREGS_IN_PAGE = MLX5_NON_FP_BFREGS_PER_UAR * MLX5_UARS_IN_PAGE, + MLX5_MIN_DYN_BFREGS = 512, + MLX5_MAX_DYN_BFREGS = 1024, }; enum { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 8846919356ca..2fe263f69751 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -230,6 +230,9 @@ struct mlx5_bfreg_info { u32 ver; bool lib_uar_4k; u32 num_sys_pages; + u32 num_static_sys_pages; + u32 total_num_bfregs; + u32 num_dyn_bfregs; }; struct mlx5_cmd_first { diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 062d14f07b61..12c523a1dd18 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -125,6 +125,8 @@ struct mlx5_ib_alloc_ucontext_resp { __u64 hca_core_clock_offset; __u32 log_uar_size; __u32 num_uars_per_page; + __u32 num_dyn_bfregs; + __u32 reserved3; }; struct mlx5_ib_alloc_pd_resp { -- cgit v1.2.3 From 4ed131d0bb1597ce12fff22d9d7fc9720a6e8cf0 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 24 Dec 2017 16:31:35 +0200 Subject: IB/mlx5: Expose dynamic mmap allocation This patch exposes the option to dynamically allocate a UAR; this functionality will be used in a downstream patch in this series as part of QP creation. Specifically, the user space driver asks for a UAR allocation at a given page index; upon success, this UAR and its bfregs can be used as part of QP creation by the user space driver. To enable allocating more than 256 UARs, the page index is encoded in an extra byte just after the command byte.
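As a rough illustration of that encoding, here is a minimal user-space-style sketch. The field layout and the 8-bit command shift are assumptions inferred from the get_extended_index() decode in the diff below; the real driver receives the value as vma->vm_pgoff (the user's mmap offset divided by the page size), which this sketch leaves out:

#include <stdio.h>

/* Assumed vm_pgoff layout, inferred from get_extended_index():
 * bits  0..7   low byte of the UAR page index
 * bits  8..15  mmap command (MLX5_IB_MMAP_ALLOC_WC == 6 in this series)
 * bits 16..23  extra high byte of the page index
 */
#define CMD_SHIFT	8
#define MMAP_ALLOC_WC	6

static unsigned long encode_offset(int cmd, int page_idx)
{
	return ((unsigned long)((page_idx >> 8) & 0xff) << 16) |
	       ((unsigned long)cmd << CMD_SHIFT) |
	       ((unsigned long)page_idx & 0xff);
}

static int decode_extended_index(unsigned long offset)
{
	/* mirrors get_extended_index() in the diff below */
	return (offset & 0xff) | (((offset >> 16) & 0xff) << 8);
}

int main(void)
{
	unsigned long off = encode_offset(MMAP_ALLOC_WC, 300);

	printf("page index round-trips to %d\n", decode_extended_index(off));
	return 0;
}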
Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 108 +++++++++++++++++++++++++++++------ drivers/infiniband/hw/mlx5/mlx5_ib.h | 8 +++ drivers/infiniband/hw/mlx5/qp.c | 6 +- 3 files changed, 101 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index b894bc5be384..1422db847b3e 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1301,6 +1301,10 @@ static int allocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *conte mlx5_ib_dbg(dev, "allocated uar %d\n", bfregi->sys_pages[i]); } + + for (i = bfregi->num_static_sys_pages; i < bfregi->num_sys_pages; i++) + bfregi->sys_pages[i] = MLX5_IB_INVALID_UAR_INDEX; + return 0; error: @@ -1318,13 +1322,17 @@ static int deallocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *con int i; bfregi = &context->bfregi; - for (i = 0; i < bfregi->num_static_sys_pages; i++) { - err = mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]); - if (err) { - mlx5_ib_warn(dev, "failed to free uar %d\n", i); - return err; + for (i = 0; i < bfregi->num_sys_pages; i++) { + if (i < bfregi->num_static_sys_pages || + bfregi->sys_pages[i] != MLX5_IB_INVALID_UAR_INDEX) { + err = mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]); + if (err) { + mlx5_ib_warn(dev, "failed to free uar %d, err=%d\n", i, err); + return err; + } } } + return 0; } @@ -1582,15 +1590,13 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) } static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, - struct mlx5_bfreg_info *bfregi, - int idx) + int uar_idx) { int fw_uars_per_page; fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1; - return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + - bfregi->sys_pages[idx] / fw_uars_per_page; + return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + uar_idx / fw_uars_per_page; } static int get_command(unsigned long offset) @@ -1608,6 +1614,12 @@ static int get_index(unsigned long offset) return get_arg(offset); } +/* Index resides in an extra byte to enable larger values than 255 */ +static int get_extended_index(unsigned long offset) +{ + return get_arg(offset) | ((offset >> 16) & 0xff) << 8; +} + static void mlx5_ib_vma_open(struct vm_area_struct *area) { /* vma_open is called when a new VMA is created on top of our VMA. This @@ -1758,21 +1770,29 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, unsigned long idx; phys_addr_t pfn, pa; pgprot_t prot; - int uars_per_page; + u32 bfreg_dyn_idx = 0; + u32 uar_index; + int dyn_uar = (cmd == MLX5_IB_MMAP_ALLOC_WC); + int max_valid_idx = dyn_uar ? 
bfregi->num_sys_pages : + bfregi->num_static_sys_pages; if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; - uars_per_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k); - idx = get_index(vma->vm_pgoff); - if (idx % uars_per_page || - idx * uars_per_page >= bfregi->num_sys_pages) { - mlx5_ib_warn(dev, "invalid uar index %lu\n", idx); + if (dyn_uar) + idx = get_extended_index(vma->vm_pgoff) + bfregi->num_static_sys_pages; + else + idx = get_index(vma->vm_pgoff); + + if (idx >= max_valid_idx) { + mlx5_ib_warn(dev, "invalid uar index %lu, max=%d\n", + idx, max_valid_idx); return -EINVAL; } switch (cmd) { case MLX5_IB_MMAP_WC_PAGE: + case MLX5_IB_MMAP_ALLOC_WC: /* Some architectures don't support WC memory */ #if defined(CONFIG_X86) if (!pat_enabled()) @@ -1792,7 +1812,40 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, return -EINVAL; } - pfn = uar_index2pfn(dev, bfregi, idx); + if (dyn_uar) { + int uars_per_page; + + uars_per_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k); + bfreg_dyn_idx = idx * (uars_per_page * MLX5_NON_FP_BFREGS_PER_UAR); + if (bfreg_dyn_idx >= bfregi->total_num_bfregs) { + mlx5_ib_warn(dev, "invalid bfreg_dyn_idx %u, max=%u\n", + bfreg_dyn_idx, bfregi->total_num_bfregs); + return -EINVAL; + } + + mutex_lock(&bfregi->lock); + /* Fail if uar already allocated, first bfreg index of each + * page holds its count. + */ + if (bfregi->count[bfreg_dyn_idx]) { + mlx5_ib_warn(dev, "wrong offset, idx %lu is busy, bfregn=%u\n", idx, bfreg_dyn_idx); + mutex_unlock(&bfregi->lock); + return -EINVAL; + } + + bfregi->count[bfreg_dyn_idx]++; + mutex_unlock(&bfregi->lock); + + err = mlx5_cmd_alloc_uar(dev->mdev, &uar_index); + if (err) { + mlx5_ib_warn(dev, "UAR alloc failed\n"); + goto free_bfreg; + } + } else { + uar_index = bfregi->sys_pages[idx]; + } + + pfn = uar_index2pfn(dev, uar_index); mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn); vma->vm_page_prot = prot; @@ -1801,14 +1854,32 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, if (err) { mlx5_ib_err(dev, "io_remap_pfn_range failed with error=%d, vm_start=0x%lx, pfn=%pa, mmap_cmd=%s\n", err, vma->vm_start, &pfn, mmap_cmd2str(cmd)); - return -EAGAIN; + err = -EAGAIN; + goto err; } pa = pfn << PAGE_SHIFT; mlx5_ib_dbg(dev, "mapped %s at 0x%lx, PA %pa\n", mmap_cmd2str(cmd), vma->vm_start, &pa); - return mlx5_ib_set_vma_data(vma, context); + err = mlx5_ib_set_vma_data(vma, context); + if (err) + goto err; + + if (dyn_uar) + bfregi->sys_pages[idx] = uar_index; + return 0; + +err: + if (!dyn_uar) + return err; + + mlx5_cmd_free_uar(dev->mdev, idx); + +free_bfreg: + mlx5_ib_free_bfreg(dev, bfregi, bfreg_dyn_idx); + + return err; } static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) @@ -1823,6 +1894,7 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm case MLX5_IB_MMAP_WC_PAGE: case MLX5_IB_MMAP_NC_PAGE: case MLX5_IB_MMAP_REGULAR_PAGE: + case MLX5_IB_MMAP_ALLOC_WC: return uar_mmap(dev, command, vma, context); case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES: diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 330c69cb87df..6c0bdc9a92c4 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -77,6 +77,7 @@ enum mlx5_ib_mmap_cmd { MLX5_IB_MMAP_NC_PAGE = 3, /* 5 is chosen in order to be compatible with old versions of libmlx5 */ MLX5_IB_MMAP_CORE_CLOCK = 5, + MLX5_IB_MMAP_ALLOC_WC = 6, }; enum { @@ -112,6 +113,10 @@ enum { 
MLX5_TM_MAX_SGE = 1, }; +enum { + MLX5_IB_INVALID_UAR_INDEX = BIT(31), +}; + struct mlx5_ib_vma_private_data { struct list_head list; struct vm_area_struct *vma; @@ -1021,6 +1026,9 @@ void mlx5_ib_gsi_pkey_change(struct mlx5_ib_gsi_qp *gsi); int mlx5_ib_generate_wc(struct ib_cq *ibcq, struct ib_wc *wc); +void mlx5_ib_free_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, + int bfregn); + static inline void init_query_mad(struct ib_smp *mad) { mad->base_version = 1; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 1acc445a93cd..ddf52dc4a78a 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -581,7 +581,7 @@ static int alloc_bfreg(struct mlx5_ib_dev *dev, return bfregn; } -static void free_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, int bfregn) +void mlx5_ib_free_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, int bfregn) { mutex_lock(&bfregi->lock); bfregi->count[bfregn]--; @@ -874,7 +874,7 @@ err_umem: ib_umem_release(ubuffer->umem); err_bfreg: - free_bfreg(dev, &context->bfregi, bfregn); + mlx5_ib_free_bfreg(dev, &context->bfregi, bfregn); return err; } @@ -887,7 +887,7 @@ static void destroy_qp_user(struct mlx5_ib_dev *dev, struct ib_pd *pd, mlx5_ib_db_unmap_user(context, &qp->db); if (base->ubuffer.umem) ib_umem_release(base->ubuffer.umem); - free_bfreg(dev, &context->bfregi, qp->bfregn); + mlx5_ib_free_bfreg(dev, &context->bfregi, qp->bfregn); } static int create_kernel_qp(struct mlx5_ib_dev *dev, -- cgit v1.2.3 From 1ee47ab3e8d868185ec9a0bfe5da2a7f502c04ab Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 24 Dec 2017 16:31:36 +0200 Subject: IB/mlx5: Enable QP creation with a given blue flame index This patch enables QP creation with a given BF index; this allows the user space driver to share the same BF among a few QPs or, alternatively, to have a dedicated BF per QP.
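A minimal sketch of how a user-space driver might opt in to this flag; the struct is trimmed to the two mlx5_ib_create_qp fields this patch touches, and the helper name and values are illustrative only:

#include <stdint.h>
#include <stdio.h>

#define MLX5_QP_FLAG_BFREG_INDEX (1 << 3)

struct create_qp_cmd {		/* trimmed stand-in for mlx5_ib_create_qp */
	uint32_t flags;
	uint32_t bfreg_index;
};

/* Ask that the new QP use a specific dynamically allocated bfreg; passing
 * the same index for several QPs shares one blue flame register. */
static void request_bfreg(struct create_qp_cmd *cmd, uint32_t bfreg)
{
	cmd->flags |= MLX5_QP_FLAG_BFREG_INDEX;
	cmd->bfreg_index = bfreg;
}

int main(void)
{
	struct create_qp_cmd qp1 = {0}, qp2 = {0};

	request_bfreg(&qp1, 8);		/* qp1 and qp2 share bfreg 8 */
	request_bfreg(&qp2, 8);
	printf("flags=%#x bfreg=%u\n",
	       (unsigned)qp1.flags, (unsigned)qp1.bfreg_index);
	return 0;
}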
Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 + drivers/infiniband/hw/mlx5/qp.c | 51 ++++++++++++++++++++++++++++-------- include/uapi/rdma/mlx5-abi.h | 3 ++- 3 files changed, 43 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 6c0bdc9a92c4..5fabd5807db6 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -115,6 +115,7 @@ enum { enum { MLX5_IB_INVALID_UAR_INDEX = BIT(31), + MLX5_IB_INVALID_BFREG = BIT(31), }; struct mlx5_ib_vma_private_data { diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index ddf52dc4a78a..45b9aba599c8 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -627,7 +627,8 @@ static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq); static int bfregn_to_uar_index(struct mlx5_ib_dev *dev, - struct mlx5_bfreg_info *bfregi, int bfregn) + struct mlx5_bfreg_info *bfregi, int bfregn, + bool dyn_bfreg) { int bfregs_per_sys_page; int index_of_sys_page; @@ -637,8 +638,16 @@ static int bfregn_to_uar_index(struct mlx5_ib_dev *dev, MLX5_NON_FP_BFREGS_PER_UAR; index_of_sys_page = bfregn / bfregs_per_sys_page; - offset = bfregn % bfregs_per_sys_page / MLX5_NON_FP_BFREGS_PER_UAR; + if (dyn_bfreg) { + index_of_sys_page += bfregi->num_static_sys_pages; + if (bfregn > bfregi->num_dyn_bfregs || + bfregi->sys_pages[index_of_sys_page] == MLX5_IB_INVALID_UAR_INDEX) { + mlx5_ib_dbg(dev, "Invalid dynamic uar index\n"); + return -EINVAL; + } + } + offset = bfregn % bfregs_per_sys_page / MLX5_NON_FP_BFREGS_PER_UAR; return bfregi->sys_pages[index_of_sys_page] + offset; } @@ -764,7 +773,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct mlx5_ib_create_qp ucmd; struct mlx5_ib_ubuffer *ubuffer = &base->ubuffer; int page_shift = 0; - int uar_index; + int uar_index = 0; int npages; u32 offset = 0; int bfregn; @@ -780,12 +789,20 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, } context = to_mucontext(pd->uobject->context); - /* - * TBD: should come from the verbs when we have the API - */ - if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) + if (ucmd.flags & MLX5_QP_FLAG_BFREG_INDEX) { + uar_index = bfregn_to_uar_index(dev, &context->bfregi, + ucmd.bfreg_index, true); + if (uar_index < 0) + return uar_index; + + bfregn = MLX5_IB_INVALID_BFREG; + } else if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) { + /* + * TBD: should come from the verbs when we have the API + */ /* In CROSS_CHANNEL CQ and QP must use the same UAR */ bfregn = MLX5_CROSS_CHANNEL_BFREG; + } else { bfregn = alloc_bfreg(dev, &context->bfregi, MLX5_IB_LATENCY_CLASS_HIGH); if (bfregn < 0) { @@ -804,8 +821,10 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, } } - uar_index = bfregn_to_uar_index(dev, &context->bfregi, bfregn); mlx5_ib_dbg(dev, "bfregn 0x%x, uar_index 0x%x\n", bfregn, uar_index); + if (bfregn != MLX5_IB_INVALID_BFREG) + uar_index = bfregn_to_uar_index(dev, &context->bfregi, bfregn, + false); qp->rq.offset = 0; qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); @@ -845,7 +864,10 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, MLX5_SET(qpc, qpc, page_offset, offset); MLX5_SET(qpc, qpc, uar_page, uar_index); - resp->bfreg_index = adjust_bfregn(dev, &context->bfregi, bfregn); + if (bfregn != MLX5_IB_INVALID_BFREG) + resp->bfreg_index = adjust_bfregn(dev, 
&context->bfregi, bfregn); + else + resp->bfreg_index = MLX5_IB_INVALID_BFREG; qp->bfregn = bfregn; err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db); @@ -874,7 +896,8 @@ err_umem: ib_umem_release(ubuffer->umem); err_bfreg: - mlx5_ib_free_bfreg(dev, &context->bfregi, bfregn); + if (bfregn != MLX5_IB_INVALID_BFREG) + mlx5_ib_free_bfreg(dev, &context->bfregi, bfregn); return err; } @@ -887,7 +910,13 @@ static void destroy_qp_user(struct mlx5_ib_dev *dev, struct ib_pd *pd, mlx5_ib_db_unmap_user(context, &qp->db); if (base->ubuffer.umem) ib_umem_release(base->ubuffer.umem); - mlx5_ib_free_bfreg(dev, &context->bfregi, qp->bfregn); + + /* + * Free only the BFREGs which are handled by the kernel. + * BFREGs of UARs allocated dynamically are handled by user. + */ + if (qp->bfregn != MLX5_IB_INVALID_BFREG) + mlx5_ib_free_bfreg(dev, &context->bfregi, qp->bfregn); } static int create_kernel_qp(struct mlx5_ib_dev *dev, diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 12c523a1dd18..0f7e45680ce5 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -41,6 +41,7 @@ enum { MLX5_QP_FLAG_SIGNATURE = 1 << 0, MLX5_QP_FLAG_SCATTER_CQE = 1 << 1, MLX5_QP_FLAG_TUNNEL_OFFLOADS = 1 << 2, + MLX5_QP_FLAG_BFREG_INDEX = 1 << 3, }; enum { @@ -282,7 +283,7 @@ struct mlx5_ib_create_qp { __u32 rq_wqe_shift; __u32 flags; __u32 uidx; - __u32 reserved0; + __u32 bfreg_index; __u64 sq_buf_addr; }; -- cgit v1.2.3 From 4044a3f482a3373ea5379da47c04ebecb9a3f133 Mon Sep 17 00:00:00 2001 From: Yixian Liu Date: Fri, 29 Dec 2017 19:26:18 +0800 Subject: RDMA/hns: Add detailed comments for mb() call This patch adds more detailed comments when we call the memory barrier function, such as rmb, wmb and mb. Three mb() callers are deleted since they are unnecessary. 
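The pattern being documented, and corrected from rmb() to the cheaper dma_rmb(), is the usual ownership-bit check on a DMA-coherent event queue. A generic kernel-context sketch with a made-up entry layout, not the hns hardware format:

/* Hardware writes the owner bit last; the driver must check it first and
 * only then read the rest of the entry.  dma_rmb() orders the two reads
 * against each other on coherent DMA memory. */
struct eq_entry {
	u32 owner_and_type;	/* bit 0: ownership, written last by HW */
	u32 payload;
};

static bool poll_one(struct eq_entry *e, u32 owner, void (*handle)(u32))
{
	if ((READ_ONCE(e->owner_and_type) & 1) != owner)
		return false;		/* entry not published yet */

	dma_rmb();		/* owner check must precede payload read */

	handle(e->payload);	/* safe: entry contents are now stable */
	return true;
}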
Suggested-by: Jason Gunthorpe Signed-off-by: Yixian Liu Signed-off-by: Lijun Ou Signed-off-by: Wei Hu (Xavier) Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 19 ++++++++++++------- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 26 ++++++++++---------------- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index 6100ace9f4b6..73f187e784c7 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -3963,8 +3963,6 @@ static void set_eq_cons_index_v1(struct hns_roce_eq *eq, int req_not) { roce_raw_write((eq->cons_index & HNS_ROCE_V1_CONS_IDX_M) | (req_not << eq->log_entries), eq->doorbell); - /* Memory barrier */ - mb(); } static void hns_roce_v1_wq_catas_err_handle(struct hns_roce_dev *hr_dev, @@ -4156,13 +4154,16 @@ static int hns_roce_v1_aeq_int(struct hns_roce_dev *hr_dev, int event_type; while ((aeqe = next_aeqe_sw_v1(eq))) { + + /* Make sure we read the AEQ entry after we have checked the + * ownership bit + */ + dma_rmb(); + dev_dbg(dev, "aeqe = %p, aeqe->asyn.event_type = 0x%lx\n", aeqe, roce_get_field(aeqe->asyn, HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M, HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S)); - /* Memory barrier */ - rmb(); - event_type = roce_get_field(aeqe->asyn, HNS_ROCE_AEQE_U32_4_EVENT_TYPE_M, HNS_ROCE_AEQE_U32_4_EVENT_TYPE_S); @@ -4260,8 +4261,12 @@ static int hns_roce_v1_ceq_int(struct hns_roce_dev *hr_dev, u32 cqn; while ((ceqe = next_ceqe_sw_v1(eq))) { - /* Memory barrier */ - rmb(); + + /* Make sure we read CEQ entry after we have checked the + * ownership bit + */ + dma_rmb(); + cqn = roce_get_field(ceqe->comp, HNS_ROCE_CEQE_CEQE_COMP_CQN_M, HNS_ROCE_CEQE_CEQE_COMP_CQN_S); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 4d3e97622f39..5196b6b7b503 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -3207,10 +3207,6 @@ static void set_eq_cons_index_v2(struct hns_roce_eq *eq) (eq->cons_index & HNS_ROCE_V2_CONS_IDX_M)); hns_roce_write64_k(doorbell, eq->doorbell); - - /* Memory barrier */ - mb(); - } static void hns_roce_v2_wq_catas_err_handle(struct hns_roce_dev *hr_dev, @@ -3392,8 +3388,11 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, int event_type; while ((aeqe = next_aeqe_sw_v2(eq))) { - /* Memory barrier */ - rmb(); + + /* Make sure we read AEQ entry after we have checked the + * ownership bit + */ + dma_rmb(); event_type = roce_get_field(aeqe->asyn, HNS_ROCE_V2_AEQE_EVENT_TYPE_M, @@ -3508,8 +3507,11 @@ static int hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, while ((ceqe = next_ceqe_sw_v2(eq))) { - /* Memory barrier */ - rmb(); + /* Make sure we read CEQ entry after we have checked the + * ownership bit + */ + dma_rmb(); + cqn = roce_get_field(ceqe->comp, HNS_ROCE_V2_CEQE_COMP_CQN_M, HNS_ROCE_V2_CEQE_COMP_CQN_S); @@ -3564,9 +3566,6 @@ static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id) roce_set_bit(int_st, HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S, 1); roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st); - /* Memory barrier */ - mb(); - roce_set_bit(int_en, HNS_ROCE_V2_VF_ABN_INT_EN_S, 1); roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en); @@ -3577,9 +3576,6 @@ static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id) roce_set_bit(int_st, HNS_ROCE_V2_VF_INT_ST_BUS_ERR_S, 1); roce_write(hr_dev, 
ROCEE_VF_ABN_INT_ST_REG, int_st); - /* Memory barrier */ - mb(); - roce_set_bit(int_en, HNS_ROCE_V2_VF_ABN_INT_EN_S, 1); roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en); @@ -3590,8 +3586,6 @@ static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id) roce_set_bit(int_st, HNS_ROCE_V2_VF_INT_ST_OTHER_ERR_S, 1); roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st); - /* Memory barrier */ - mb(); roce_set_bit(int_en, HNS_ROCE_V2_VF_ABN_INT_EN_S, 1); roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en); -- cgit v1.2.3 From f48fca4d818f92966419fb74ce56bc5e198ecb5c Mon Sep 17 00:00:00 2001 From: Bharat Potnuri Date: Fri, 29 Dec 2017 13:11:14 +0530 Subject: iw_cxgb4: Change error/warn prints to pr_debug These prints not neccesarily mean error/warning, so changing them to debug. Signed-off-by: Potnuri Bharat Teja Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/cm.c | 17 ++++++++--------- drivers/infiniband/hw/cxgb4/ev.c | 2 +- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 844c9e78df8b..4cf17c650c36 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -257,8 +257,8 @@ static void set_emss(struct c4iw_ep *ep, u16 opt) if (ep->emss < 128) ep->emss = 128; if (ep->emss & 7) - pr_warn("Warning: misaligned mtu idx %u mss %u emss=%u\n", - TCPOPT_MSS_G(opt), ep->mss, ep->emss); + pr_debug("Warning: misaligned mtu idx %u mss %u emss=%u\n", + TCPOPT_MSS_G(opt), ep->mss, ep->emss); pr_debug("mss_idx %u mss %u emss=%u\n", TCPOPT_MSS_G(opt), ep->mss, ep->emss); } @@ -2733,9 +2733,8 @@ static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb) return 0; if (cxgb_is_neg_adv(req->status)) { - pr_warn("%s Negative advice on abort- tid %u status %d (%s)\n", - __func__, ep->hwtid, req->status, - neg_adv_str(req->status)); + pr_debug("Negative advice on abort- tid %u status %d (%s)\n", + ep->hwtid, req->status, neg_adv_str(req->status)); ep->stats.abort_neg_adv++; mutex_lock(&dev->rdev.stats.lock); dev->rdev.stats.neg_adv++; @@ -3567,8 +3566,8 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp) case MORIBUND: case ABORTING: case DEAD: - pr_info("%s ignoring disconnect ep %p state %u\n", - __func__, ep, ep->com.state); + pr_debug("ignoring disconnect ep %p state %u\n", + ep, ep->com.state); break; default: WARN_ONCE(1, "Bad endpoint state %u\n", ep->com.state); @@ -4207,8 +4206,8 @@ static int peer_abort_intr(struct c4iw_dev *dev, struct sk_buff *skb) return 0; } if (cxgb_is_neg_adv(req->status)) { - pr_warn("%s Negative advice on abort- tid %u status %d (%s)\n", - __func__, ep->hwtid, req->status, + pr_debug("Negative advice on abort- tid %u status %d (%s)\n", + ep->hwtid, req->status, neg_adv_str(req->status)); goto out; } diff --git a/drivers/infiniband/hw/cxgb4/ev.c b/drivers/infiniband/hw/cxgb4/ev.c index a252d5c40ae3..3e9d8b277ab9 100644 --- a/drivers/infiniband/hw/cxgb4/ev.c +++ b/drivers/infiniband/hw/cxgb4/ev.c @@ -236,7 +236,7 @@ int c4iw_ev_handler(struct c4iw_dev *dev, u32 qid) if (atomic_dec_and_test(&chp->refcnt)) wake_up(&chp->wait); } else { - pr_warn("%s unknown cqid 0x%x\n", __func__, qid); + pr_debug("unknown cqid 0x%x\n", qid); spin_unlock_irqrestore(&dev->lock, flag); } return 0; -- cgit v1.2.3 From d78756d842de49608bedc80666ae4c8cbf1e4889 Mon Sep 17 00:00:00 2001 From: Himanshu Jha Date: Sat, 30 Dec 2017 21:09:56 +0530 Subject: IB/ocrdma: Use zeroing memory allocator than allocator/memset Use dma_zalloc_coherent for 
allocating zeroed memory and remove unnecessary memset function. Done using Coccinelle. Generated-by: scripts/coccinelle/api/alloc/kzalloc-simple.cocci 0-day tested with no failures. Suggested-by: Luis R. Rodriguez Signed-off-by: Himanshu Jha Reviewed-by: Leon Romanovsky Acked-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 19 +++++++------------ drivers/infiniband/hw/ocrdma/ocrdma_stats.c | 6 ++---- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 8 +++----- 3 files changed, 12 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 0ba695a88b62..9904918589a4 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -380,11 +380,10 @@ static int ocrdma_alloc_q(struct ocrdma_dev *dev, q->len = len; q->entry_size = entry_size; q->size = len * entry_size; - q->va = dma_alloc_coherent(&dev->nic_info.pdev->dev, q->size, - &q->dma, GFP_KERNEL); + q->va = dma_zalloc_coherent(&dev->nic_info.pdev->dev, q->size, + &q->dma, GFP_KERNEL); if (!q->va) return -ENOMEM; - memset(q->va, 0, q->size); return 0; } @@ -1819,12 +1818,11 @@ int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq, return -ENOMEM; ocrdma_init_mch(&cmd->cmd.req, OCRDMA_CMD_CREATE_CQ, OCRDMA_SUBSYS_COMMON, sizeof(*cmd)); - cq->va = dma_alloc_coherent(&pdev->dev, cq->len, &cq->pa, GFP_KERNEL); + cq->va = dma_zalloc_coherent(&pdev->dev, cq->len, &cq->pa, GFP_KERNEL); if (!cq->va) { status = -ENOMEM; goto mem_err; } - memset(cq->va, 0, cq->len); page_size = cq->len / hw_pages; cmd->cmd.pgsz_pgcnt = (page_size / OCRDMA_MIN_Q_PAGE_SIZE) << OCRDMA_CREATE_CQ_PAGE_SIZE_SHIFT; @@ -2212,10 +2210,9 @@ static int ocrdma_set_create_qp_sq_cmd(struct ocrdma_create_qp_req *cmd, qp->sq.max_cnt = max_wqe_allocated; len = (hw_pages * hw_page_size); - qp->sq.va = dma_alloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL); + qp->sq.va = dma_zalloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL); if (!qp->sq.va) return -EINVAL; - memset(qp->sq.va, 0, len); qp->sq.len = len; qp->sq.pa = pa; qp->sq.entry_size = dev->attr.wqe_size; @@ -2263,10 +2260,9 @@ static int ocrdma_set_create_qp_rq_cmd(struct ocrdma_create_qp_req *cmd, qp->rq.max_cnt = max_rqe_allocated; len = (hw_pages * hw_page_size); - qp->rq.va = dma_alloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL); + qp->rq.va = dma_zalloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL); if (!qp->rq.va) return -ENOMEM; - memset(qp->rq.va, 0, len); qp->rq.pa = pa; qp->rq.len = len; qp->rq.entry_size = dev->attr.rqe_size; @@ -2320,11 +2316,10 @@ static int ocrdma_set_create_qp_ird_cmd(struct ocrdma_create_qp_req *cmd, if (dev->attr.ird == 0) return 0; - qp->ird_q_va = dma_alloc_coherent(&pdev->dev, ird_q_len, - &pa, GFP_KERNEL); + qp->ird_q_va = dma_zalloc_coherent(&pdev->dev, ird_q_len, &pa, + GFP_KERNEL); if (!qp->ird_q_va) return -ENOMEM; - memset(qp->ird_q_va, 0, ird_q_len); ocrdma_build_q_pages(&cmd->ird_addr[0], dev->attr.num_ird_pages, pa, ird_page_size); for (; i < ird_q_len / dev->attr.rqe_size; i++) { diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c index fb78b16ce671..24d20a4aa262 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c @@ -73,15 +73,13 @@ bool ocrdma_alloc_stats_resources(struct ocrdma_dev *dev) mem->size = max_t(u32, sizeof(struct ocrdma_rdma_stats_req), sizeof(struct ocrdma_rdma_stats_resp)); - mem->va = 
dma_alloc_coherent(&dev->nic_info.pdev->dev, mem->size, - &mem->pa, GFP_KERNEL); + mem->va = dma_zalloc_coherent(&dev->nic_info.pdev->dev, mem->size, + &mem->pa, GFP_KERNEL); if (!mem->va) { pr_err("%s: stats mbox allocation failed\n", __func__); return false; } - memset(mem->va, 0, mem->size); - /* Alloc debugfs mem */ mem->debugfs_mem = kzalloc(OCRDMA_MAX_DBGFS_MEM, GFP_KERNEL); if (!mem->debugfs_mem) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index b5f1dd33b73e..8009bdad4e5b 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -550,13 +550,12 @@ struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *ibdev, INIT_LIST_HEAD(&ctx->mm_head); mutex_init(&ctx->mm_list_lock); - ctx->ah_tbl.va = dma_alloc_coherent(&pdev->dev, map_len, - &ctx->ah_tbl.pa, GFP_KERNEL); + ctx->ah_tbl.va = dma_zalloc_coherent(&pdev->dev, map_len, + &ctx->ah_tbl.pa, GFP_KERNEL); if (!ctx->ah_tbl.va) { kfree(ctx); return ERR_PTR(-ENOMEM); } - memset(ctx->ah_tbl.va, 0, map_len); ctx->ah_tbl.len = map_len; memset(&resp, 0, sizeof(resp)); @@ -885,13 +884,12 @@ static int ocrdma_build_pbl_tbl(struct ocrdma_dev *dev, struct ocrdma_hw_mr *mr) return -ENOMEM; for (i = 0; i < mr->num_pbls; i++) { - va = dma_alloc_coherent(&pdev->dev, dma_len, &pa, GFP_KERNEL); + va = dma_zalloc_coherent(&pdev->dev, dma_len, &pa, GFP_KERNEL); if (!va) { ocrdma_free_mr_pbl_tbl(dev, mr); status = -ENOMEM; break; } - memset(va, 0, dma_len); mr->pbl_table[i].va = va; mr->pbl_table[i].pa = pa; } -- cgit v1.2.3 From 583556568c16c682a66d30f646d6a87a6d4a438e Mon Sep 17 00:00:00 2001 From: Himanshu Jha Date: Sun, 31 Dec 2017 17:59:00 +0530 Subject: RDMA/vmw_pvrdma: Use zeroing memory allocator than allocator/memset Use dma_zalloc_coherent for allocating zeroed memory and remove unnecessary memset function. Done using Coccinelle. Generated-by: scripts/coccinelle/api/alloc/kzalloc-simple.cocci 0-day tested with no failures. Suggested-by: Luis R. Rodriguez Signed-off-by: Himanshu Jha Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index 939ac2f7bf0f..d650a9fcde24 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -882,8 +882,8 @@ static int pvrdma_pci_probe(struct pci_dev *pdev, dev_info(&pdev->dev, "device version %d, driver version %d\n", dev->dsr_version, PVRDMA_VERSION); - dev->dsr = dma_alloc_coherent(&pdev->dev, sizeof(*dev->dsr), - &dev->dsrbase, GFP_KERNEL); + dev->dsr = dma_zalloc_coherent(&pdev->dev, sizeof(*dev->dsr), + &dev->dsrbase, GFP_KERNEL); if (!dev->dsr) { dev_err(&pdev->dev, "failed to allocate shared region\n"); ret = -ENOMEM; @@ -891,7 +891,6 @@ static int pvrdma_pci_probe(struct pci_dev *pdev, } /* Setup the shared region */ - memset(dev->dsr, 0, sizeof(*dev->dsr)); dev->dsr->driver_version = PVRDMA_VERSION; dev->dsr->gos_info.gos_bits = sizeof(void *) == 4 ? PVRDMA_GOS_BITS_32 : -- cgit v1.2.3 From 7bced914e80ac23978a3eb6ba00a3338e7fce087 Mon Sep 17 00:00:00 2001 From: Himanshu Jha Date: Sun, 31 Dec 2017 18:01:03 +0530 Subject: RDMA/qedr: Use zeroing memory allocator than allocator/memset Use dma_zalloc_coherent for allocating zeroed memory and remove unnecessary memset function. Done using Coccinelle. 
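The transformation is identical across this run of conversions; in schematic form, with illustrative variable names:

/* Before: allocate coherent DMA memory, then zero it by hand. */
va = dma_alloc_coherent(&pdev->dev, size, &pa, GFP_KERNEL);
if (!va)
	return -ENOMEM;
memset(va, 0, size);

/* After: dma_zalloc_coherent() hands back already-zeroed memory,
 * so the explicit memset can be dropped. */
va = dma_zalloc_coherent(&pdev->dev, size, &pa, GFP_KERNEL);
if (!va)
	return -ENOMEM;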
Generated-by: scripts/coccinelle/api/alloc/kzalloc-simple.cocci 0-day tested with no failures. Suggested-by: Luis R. Rodriguez Signed-off-by: Himanshu Jha Acked-by: Ram Amrani Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qedr/verbs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index b26aa88dab48..3b9c89848d66 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -604,12 +604,11 @@ static struct qedr_pbl *qedr_alloc_pbl_tbl(struct qedr_dev *dev, return ERR_PTR(-ENOMEM); for (i = 0; i < pbl_info->num_pbls; i++) { - va = dma_alloc_coherent(&pdev->dev, pbl_info->pbl_size, - &pa, flags); + va = dma_zalloc_coherent(&pdev->dev, pbl_info->pbl_size, + &pa, flags); if (!va) goto err; - memset(va, 0, pbl_info->pbl_size); pbl_table[i].va = va; pbl_table[i].pa = pa; } -- cgit v1.2.3 From 7d37ebbb92d96df87afae1c4b2c78a92e07c2ed6 Mon Sep 17 00:00:00 2001 From: Himanshu Jha Date: Sun, 31 Dec 2017 18:02:38 +0530 Subject: RDMA/bnxt_re: Use zeroing memory allocator than allocator/memset Use dma_zalloc_coherent for allocating zeroed memory and remove unnecessary memset function. Done using Coccinelle. Generated-by: scripts/coccinelle/api/alloc/kzalloc-simple.cocci 0-day tested with no failures. Suggested-by: Luis R. Rodriguez Signed-off-by: Himanshu Jha Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/qplib_res.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index 4e101704e801..ad37d54affcc 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -104,13 +104,12 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, if (!sghead) { for (i = 0; i < pages; i++) { - pbl->pg_arr[i] = dma_alloc_coherent(&pdev->dev, - pbl->pg_size, - &pbl->pg_map_arr[i], - GFP_KERNEL); + pbl->pg_arr[i] = dma_zalloc_coherent(&pdev->dev, + pbl->pg_size, + &pbl->pg_map_arr[i], + GFP_KERNEL); if (!pbl->pg_arr[i]) goto fail; - memset(pbl->pg_arr[i], 0, pbl->pg_size); pbl->pg_count++; } } else { -- cgit v1.2.3 From 27f382da31240243dd599b9557e3abb9ac01f7e3 Mon Sep 17 00:00:00 2001 From: Himanshu Jha Date: Sun, 31 Dec 2017 18:04:10 +0530 Subject: IB/mthca: Use zeroing memory allocator than allocator/memset Use dma_zalloc_coherent for allocating zeroed memory and remove unnecessary memset function. Done using Coccinelle. Generated-by: scripts/coccinelle/api/alloc/kzalloc-simple.cocci 0-day tested with no failures. Suggested-by: Luis R. 
Rodriguez Signed-off-by: Himanshu Jha Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mthca/mthca_memfree.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c index c6fe89d79248..2a41ed74add8 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.c +++ b/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -623,13 +623,12 @@ int mthca_alloc_db(struct mthca_dev *dev, enum mthca_db_type type, page = dev->db_tab->page + end; alloc: - page->db_rec = dma_alloc_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE, - &page->mapping, GFP_KERNEL); + page->db_rec = dma_zalloc_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE, + &page->mapping, GFP_KERNEL); if (!page->db_rec) { ret = -ENOMEM; goto out; } - memset(page->db_rec, 0, MTHCA_ICM_PAGE_SIZE); ret = mthca_MAP_ICM_page(dev, page->mapping, mthca_uarc_virt(dev, &dev->driver_uar, i)); -- cgit v1.2.3 From 9ef77bd76075408be3a0bb769d37f4b67953a098 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 1 Jan 2018 13:07:11 +0200 Subject: RDMA/rxe: Remove useless EXPORT_SYMBOL The RXE driver is standalone module and hence doesn't need to export symbols, nor does this one line function deserve to be not inlined. Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.c | 6 ------ drivers/infiniband/sw/rxe/rxe.h | 6 +++++- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 8c3d30b3092d..b7debb6f2eac 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -77,12 +77,6 @@ void rxe_release(struct kref *kref) ib_dealloc_device(&rxe->ib_dev); } -void rxe_dev_put(struct rxe_dev *rxe) -{ - kref_put(&rxe->ref_cnt, rxe_release); -} -EXPORT_SYMBOL_GPL(rxe_dev_put); - /* initialize rxe device parameters */ static int rxe_init_device_param(struct rxe_dev *rxe) { diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h index 6447d736d5a4..7d232611303f 100644 --- a/drivers/infiniband/sw/rxe/rxe.h +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -57,6 +57,7 @@ #include "rxe_hdr.h" #include "rxe_param.h" #include "rxe_verbs.h" +#include "rxe_loc.h" #define RXE_UVERBS_ABI_VERSION (1) @@ -95,7 +96,10 @@ void rxe_remove_all(void); int rxe_rcv(struct sk_buff *skb); -void rxe_dev_put(struct rxe_dev *rxe); +static inline void rxe_dev_put(struct rxe_dev *rxe) +{ + kref_put(&rxe->ref_cnt, rxe_release); +} struct rxe_dev *net_to_rxe(struct net_device *ndev); struct rxe_dev *get_rxe_by_name(const char *name); -- cgit v1.2.3 From b823369b6faa33cc71b65eef9bebcefe622913dc Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 1 Jan 2018 13:07:12 +0200 Subject: RDMA/netlink: Simplify code of autoload modules The request_module() call is internally wrapped by CONFIG_MODULE, so there is no need to check it in our RDMA code too. Refactor to simplify the code. 
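For reference, the reason the driver-side #ifdef was redundant: when CONFIG_MODULES is disabled, include/linux/kmod.h already stubs the call out. A paraphrased sketch of that header:

#ifdef CONFIG_MODULES
extern int __request_module(bool wait, const char *name, ...);
#define request_module(mod...) __request_module(true, mod)
#else
static inline int request_module(const char *name, ...)
{
	return -ENOSYS;		/* modules disabled: loading fails cleanly */
}
#endif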
Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/netlink.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 0c7db94cd9b9..3ccaae18ad75 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -81,15 +81,13 @@ static bool is_nl_valid(unsigned int type, unsigned int op) if (!is_nl_msg_valid(type, op)) return false; - cb_table = rdma_nl_types[type].cb_table; -#ifdef CONFIG_MODULES - if (!cb_table) { + if (!rdma_nl_types[type].cb_table) { mutex_unlock(&rdma_nl_mutex); request_module("rdma-netlink-subsys-%d", type); mutex_lock(&rdma_nl_mutex); - cb_table = rdma_nl_types[type].cb_table; } -#endif + + cb_table = rdma_nl_types[type].cb_table; if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit)) return false; -- cgit v1.2.3 From 924b8900a49df284464db5d97362801f8a2ac7b4 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 1 Jan 2018 13:07:13 +0200 Subject: RDMA/core: Replace open-coded variant of put_device There is an existing function to decrease reference counter of the device, let's use it. Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 21302e077be1..a8757b5552de 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -272,7 +272,7 @@ void ib_dealloc_device(struct ib_device *device) { WARN_ON(device->reg_state != IB_DEV_UNREGISTERED && device->reg_state != IB_DEV_UNINITIALIZED); - kobject_put(&device->dev.kobj); + put_device(&device->dev); } EXPORT_SYMBOL(ib_dealloc_device); -- cgit v1.2.3 From c2409810c0c9148bca4c3e13ac96dced1e4c11c5 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 1 Jan 2018 13:07:14 +0200 Subject: RDMA/nldev: Refactor setting the nldev handle to a common function The NLDEV commands are using IB device indexes and names as a handle for netlink communications. Put all relevant code into one function to remove code duplication in followup patches. 
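A sketch of the reuse this enables: any later fill function can open its message with the shared handle. fill_res_info() here is a hypothetical example, not part of this patch:

static int fill_res_info(struct sk_buff *msg, struct ib_device *device)
{
	if (fill_nldev_handle(msg, device))
		return -EMSGSIZE;

	/* ... append resource-specific netlink attributes here ... */
	return 0;
}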
Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 9a05245a1acf..2b631307349d 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -54,14 +54,23 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 }, }; -static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) +static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) { - char fw[IB_FW_VERSION_NAME_MAX]; - if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) return -EMSGSIZE; if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name)) return -EMSGSIZE; + + return 0; +} + +static int fill_dev_info(struct sk_buff *msg, struct ib_device *device) +{ + char fw[IB_FW_VERSION_NAME_MAX]; + + if (fill_nldev_handle(msg, device)) + return -EMSGSIZE; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, rdma_end_port(device))) return -EMSGSIZE; @@ -92,10 +101,9 @@ static int fill_port_info(struct sk_buff *msg, struct ib_port_attr attr; int ret; - if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) - return -EMSGSIZE; - if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name)) + if (fill_nldev_handle(msg, device)) return -EMSGSIZE; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port)) return -EMSGSIZE; -- cgit v1.2.3 From e48e5e198fb6ec77c91047a694022f0fefa45292 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 1 Jan 2018 13:07:17 +0200 Subject: RDMA/cma: Mark end of CMA ID messages Commit 1a1c116f3dcf ("RDMA/netlink: Simplify the put_msg and put_attr") removed the nlmsg_len calculation in ibnl_put_attr, causing netlink messages to miss the source and destination addresses. Fixes: 1a1c116f3dcf ("RDMA/netlink: Simplify the put_msg and put_attr") Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 7d38c2bff5ea..65c55f79444a 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4440,6 +4440,7 @@ static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb) id_stats->qp_type = id->qp_type; i_id++; + nlmsg_end(skb, nlh); } cb->args[1] = 0; -- cgit v1.2.3 From 809cb6955650d892c6ef95f1d55f28fceded0ce1 Mon Sep 17 00:00:00 2001 From: Alex Estrin Date: Wed, 3 Jan 2018 07:39:50 -0500 Subject: IB/ipoib: Fix for notify send CQ failure messages If the IB_CQ_REPORT_MISSED_EVENTS flag is passed to ib_req_notify_cq(), it may return a positive value indicating a non-empty CQ. If the return code is not verified, the log might be flooded with false warning messages "request notify on send CQ failed".
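The return-value contract the fix relies on, restated as a sketch consistent with the diff below: a negative value means arming the CQ failed, zero means the CQ was re-armed with nothing missed, and a positive value (possible only with IB_CQ_REPORT_MISSED_EVENTS) means completions may have been missed and the caller should poll again:

rc = ib_req_notify_cq(priv->send_cq,
		      IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
if (rc < 0)
	pr_warn("arming send CQ failed: %d\n", rc);	/* real error */
else if (rc)
	napi_schedule(&priv->send_napi);	/* missed events: poll again */
/* rc == 0: CQ armed, nothing more to do */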
Fixes: 8966e28d2e40 ("IB/ipoib: Use NAPI in UD/TX flows") Reviewed-by: Mike Marciniszyn Signed-off-by: Alex Estrin Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 10 ++++++---- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index d45b0f2aa987..dfbb8fdda5f6 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -766,12 +766,14 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_ skb_orphan(skb); skb_dst_drop(skb); - if (netif_queue_stopped(dev)) - if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP | - IB_CQ_REPORT_MISSED_EVENTS)) { + if (netif_queue_stopped(dev)) { + rc = ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS); + if (unlikely(rc < 0)) ipoib_warn(priv, "IPoIB/CM:request notify on send CQ failed\n"); + else if (rc) napi_schedule(&priv->send_napi); - } + } rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1), tx_req); if (unlikely(rc)) { diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index a8760d40dfa6..10384ea50bed 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -644,7 +644,7 @@ int ipoib_send(struct net_device *dev, struct sk_buff *skb, if (netif_queue_stopped(dev)) if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP | - IB_CQ_REPORT_MISSED_EVENTS)) + IB_CQ_REPORT_MISSED_EVENTS) < 0) ipoib_warn(priv, "request notify on send CQ failed\n"); rc = post_send(priv, priv->tx_head & (ipoib_sendq_size - 1), -- cgit v1.2.3 From 0009c2dbe8a47008a11abca04da2db57f9eea6a8 Mon Sep 17 00:00:00 2001 From: oulijun Date: Wed, 3 Jan 2018 10:44:03 +0800 Subject: RDMA/hns: Add rq inline data support for hip08 RoCE This patch mainly implement rq inline data feature for hip08 RoCE in kernel mode. 
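In outline, receive inline means the payload is delivered into the RQ WQE itself rather than DMAed to the posted buffers, so the driver scatters it out at poll time. A generic sketch of that copy, mirroring hns_roce_handle_recv_inl_wqe() in the diff below (names illustrative):

u8 *src = wqe_buf;			/* inline payload starts here */
u32 remaining = wc->byte_len;
u32 i;

for (i = 0; i < sge_cnt && remaining; i++) {
	u32 n = min(sge_list[i].len, remaining);

	memcpy(sge_list[i].addr, src, n);	/* scatter to posted buffers */
	src += n;
	remaining -= n;
}
if (remaining)				/* posted buffers too small */
	wc->status = IB_WC_LOC_LEN_ERR;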
Signed-off-by: Lijun Ou Signed-off-by: Yixian Liu Signed-off-by: Wei Hu (Xavier) Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 18 ++++++++ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 64 ++++++++++++++++++++++++++++- drivers/infiniband/hw/hns/hns_roce_qp.c | 53 ++++++++++++++++++++---- 3 files changed, 127 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index dcfd2099bac1..8d123d3ee1ff 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -178,6 +178,7 @@ enum { enum { HNS_ROCE_CAP_FLAG_REREG_MR = BIT(0), HNS_ROCE_CAP_FLAG_ROCE_V1_V2 = BIT(1), + HNS_ROCE_CAP_FLAG_RQ_INLINE = BIT(2) }; enum hns_roce_mtt_type { @@ -446,6 +447,21 @@ struct hns_roce_cmd_mailbox { struct hns_roce_dev; +struct hns_roce_rinl_sge { + void *addr; + u32 len; +}; + +struct hns_roce_rinl_wqe { + struct hns_roce_rinl_sge *sg_list; + u32 sge_cnt; +}; + +struct hns_roce_rinl_buf { + struct hns_roce_rinl_wqe *wqe_list; + u32 wqe_cnt; +}; + struct hns_roce_qp { struct ib_qp ibqp; struct hns_roce_buf hr_buf; @@ -477,6 +493,8 @@ struct hns_roce_qp { struct hns_roce_sge sge; u32 next_sge; + + struct hns_roce_rinl_buf rq_inl_buf; }; struct hns_roce_sqp { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 5196b6b7b503..ba05de426c29 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -299,6 +299,7 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); struct hns_roce_v2_wqe_data_seg *dseg; + struct hns_roce_rinl_sge *sge_list; struct device *dev = hr_dev->dev; struct hns_roce_v2_db rq_db; unsigned long flags; @@ -347,6 +348,14 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, dseg[i].addr = 0; } + /* rq support inline data */ + sge_list = hr_qp->rq_inl_buf.wqe_list[ind].sg_list; + hr_qp->rq_inl_buf.wqe_list[ind].sge_cnt = (u32)wr->num_sge; + for (i = 0; i < wr->num_sge; i++) { + sge_list[i].addr = (void *)(u64)wr->sg_list[i].addr; + sge_list[i].len = wr->sg_list[i].length; + } + hr_qp->rq.wrid[ind] = wr->wr_id; ind = (ind + 1) & (hr_qp->rq.wqe_cnt - 1); @@ -961,7 +970,8 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) caps->chunk_sz = HNS_ROCE_V2_TABLE_CHUNK_SIZE; caps->flags = HNS_ROCE_CAP_FLAG_REREG_MR | - HNS_ROCE_CAP_FLAG_ROCE_V1_V2; + HNS_ROCE_CAP_FLAG_ROCE_V1_V2 | + HNS_ROCE_CAP_FLAG_RQ_INLINE; caps->pkey_table_len[0] = 1; caps->gid_table_len[0] = HNS_ROCE_V2_GID_INDEX_NUM; caps->ceqe_depth = HNS_ROCE_V2_COMP_EQE_NUM; @@ -1473,6 +1483,40 @@ static int hns_roce_v2_req_notify_cq(struct ib_cq *ibcq, return 0; } +static int hns_roce_handle_recv_inl_wqe(struct hns_roce_v2_cqe *cqe, + struct hns_roce_qp **cur_qp, + struct ib_wc *wc) +{ + struct hns_roce_rinl_sge *sge_list; + u32 wr_num, wr_cnt, sge_num; + u32 sge_cnt, data_len, size; + void *wqe_buf; + + wr_num = roce_get_field(cqe->byte_4, V2_CQE_BYTE_4_WQE_INDX_M, + V2_CQE_BYTE_4_WQE_INDX_S) & 0xffff; + wr_cnt = wr_num & ((*cur_qp)->rq.wqe_cnt - 1); + + sge_list = (*cur_qp)->rq_inl_buf.wqe_list[wr_cnt].sg_list; + sge_num = (*cur_qp)->rq_inl_buf.wqe_list[wr_cnt].sge_cnt; + wqe_buf = get_recv_wqe(*cur_qp, wr_cnt); + data_len = wc->byte_len; + + for (sge_cnt = 0; (sge_cnt < sge_num) && (data_len); 
sge_cnt++) { + size = min(sge_list[sge_cnt].len, data_len); + memcpy((void *)sge_list[sge_cnt].addr, wqe_buf, size); + + data_len -= size; + wqe_buf += size; + } + + if (data_len) { + wc->status = IB_WC_LOC_LEN_ERR; + return -EAGAIN; + } + + return 0; +} + static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, struct hns_roce_qp **cur_qp, struct ib_wc *wc) { @@ -1485,6 +1529,7 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, u32 opcode; u32 status; int qpn; + int ret; /* Find cqe according to consumer index */ cqe = next_cqe_sw_v2(hr_cq); @@ -1673,6 +1718,17 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, break; } + if ((wc->qp->qp_type == IB_QPT_RC || + wc->qp->qp_type == IB_QPT_UC) && + (opcode == HNS_ROCE_V2_OPCODE_SEND || + opcode == HNS_ROCE_V2_OPCODE_SEND_WITH_IMM || + opcode == HNS_ROCE_V2_OPCODE_SEND_WITH_INV) && + (roce_get_bit(cqe->byte_4, V2_CQE_BYTE_4_RQ_INLINE_S))) { + ret = hns_roce_handle_recv_inl_wqe(cqe, cur_qp, wc); + if (ret) + return -EAGAIN; + } + /* Update tail pointer, record wr_id */ wq = &(*cur_qp)->rq; wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; @@ -1972,6 +2028,7 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, !!(attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)); roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S, 0); + roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RQIE_S, 1); roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RQIE_S, 0); roce_set_field(context->byte_80_rnr_rx_cqn, V2_QPC_BYTE_80_RX_CQN_M, @@ -3114,6 +3171,11 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev, hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf); } + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) { + kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list); + kfree(hr_qp->rq_inl_buf.wqe_list); + } + return 0; } diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 69e25847ff1b..351fa3119420 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -494,6 +494,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, int ret = 0; u32 page_shift; u32 npages; + int i; mutex_init(&hr_qp->mutex); spin_lock_init(&hr_qp->sq.lock); @@ -513,18 +514,48 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, goto err_out; } + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) { + /* allocate recv inline buf */ + hr_qp->rq_inl_buf.wqe_list = kcalloc(hr_qp->rq.wqe_cnt, + sizeof(struct hns_roce_rinl_wqe), + GFP_KERNEL); + if (!hr_qp->rq_inl_buf.wqe_list) { + ret = -ENOMEM; + goto err_out; + } + + hr_qp->rq_inl_buf.wqe_cnt = hr_qp->rq.wqe_cnt; + + /* Firstly, allocate a list of sge space buffer */ + hr_qp->rq_inl_buf.wqe_list[0].sg_list = + kcalloc(hr_qp->rq_inl_buf.wqe_cnt, + init_attr->cap.max_recv_sge * + sizeof(struct hns_roce_rinl_sge), + GFP_KERNEL); + if (!hr_qp->rq_inl_buf.wqe_list[0].sg_list) { + ret = -ENOMEM; + goto err_wqe_list; + } + + for (i = 1; i < hr_qp->rq_inl_buf.wqe_cnt; i++) + /* Secondly, reallocate the buffer */ + hr_qp->rq_inl_buf.wqe_list[i].sg_list = + &hr_qp->rq_inl_buf.wqe_list[0].sg_list[i * + init_attr->cap.max_recv_sge]; + } + if (ib_pd->uobject) { if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { dev_err(dev, "ib_copy_from_udata error for create qp\n"); ret = -EFAULT; - goto err_out; + goto err_rq_sge_list; } ret = hns_roce_set_user_sq_size(hr_dev, &init_attr->cap, hr_qp, &ucmd); if (ret) { dev_err(dev, "hns_roce_set_user_sq_size error for 
create qp\n"); - goto err_out; + goto err_rq_sge_list; } hr_qp->umem = ib_umem_get(ib_pd->uobject->context, @@ -533,7 +564,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, if (IS_ERR(hr_qp->umem)) { dev_err(dev, "ib_umem_get error for create qp\n"); ret = PTR_ERR(hr_qp->umem); - goto err_out; + goto err_rq_sge_list; } hr_qp->mtt.mtt_type = MTT_TYPE_WQE; @@ -567,13 +598,13 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { dev_err(dev, "init_attr->create_flags error!\n"); ret = -EINVAL; - goto err_out; + goto err_rq_sge_list; } if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO) { dev_err(dev, "init_attr->create_flags error!\n"); ret = -EINVAL; - goto err_out; + goto err_rq_sge_list; } /* Set SQ size */ @@ -581,7 +612,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, hr_qp); if (ret) { dev_err(dev, "hns_roce_set_kernel_sq_size error!\n"); - goto err_out; + goto err_rq_sge_list; } /* QP doorbell register address */ @@ -597,7 +628,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, &hr_qp->hr_buf, page_shift)) { dev_err(dev, "hns_roce_buf_alloc error!\n"); ret = -ENOMEM; - goto err_out; + goto err_rq_sge_list; } hr_qp->mtt.mtt_type = MTT_TYPE_WQE; @@ -679,6 +710,14 @@ err_buf: else hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf); +err_rq_sge_list: + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) + kfree(hr_qp->rq_inl_buf.wqe_list[0].sg_list); + +err_wqe_list: + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RQ_INLINE) + kfree(hr_qp->rq_inl_buf.wqe_list); + err_out: return ret; } -- cgit v1.2.3 From 4f3f7a704b3bff9e4eb322ab3c989b505f7562eb Mon Sep 17 00:00:00 2001 From: oulijun Date: Wed, 3 Jan 2018 10:44:04 +0800 Subject: RDMA/hns: Update the usage of sr_max and rr_max field This patch fixes the usage with sr_max filed and rr_max of qp context when modify qp. Its modifications include: 1. Adjust location of filling sr_max filed of qpc 2. Only assign the number of responder resource if IB_QP_MAX_DEST_RD_ATOMIC bit is set 3. Only assign the number of outstanding resource if IB_QP_MAX_QP_RD_ATOMIC 4. 
Fix the assignment algorithm for the sr_max and rr_max fields of the QP context Signed-off-by: Lijun Ou Signed-off-by: Yixian Liu Signed-off-by: Wei Hu (Xavier) Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index ba05de426c29..4685696ff937 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2536,11 +2536,14 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_LBI_S, 0); } - roce_set_field(context->byte_140_raq, V2_QPC_BYTE_140_RR_MAX_M, - V2_QPC_BYTE_140_RR_MAX_S, - ilog2((unsigned int)attr->max_dest_rd_atomic)); - roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RR_MAX_M, - V2_QPC_BYTE_140_RR_MAX_S, 0); + if ((attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) && + attr->max_dest_rd_atomic) { + roce_set_field(context->byte_140_raq, V2_QPC_BYTE_140_RR_MAX_M, + V2_QPC_BYTE_140_RR_MAX_S, + fls(attr->max_dest_rd_atomic - 1)); + roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RR_MAX_M, + V2_QPC_BYTE_140_RR_MAX_S, 0); + } roce_set_field(context->byte_56_dqpn_err, V2_QPC_BYTE_56_DQPN_M, V2_QPC_BYTE_56_DQPN_S, attr->dest_qp_num); @@ -2630,12 +2633,6 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, V2_QPC_BYTE_168_LP_SGEN_INI_M, V2_QPC_BYTE_168_LP_SGEN_INI_S, 0); - roce_set_field(context->byte_208_irrl, V2_QPC_BYTE_208_SR_MAX_M, - V2_QPC_BYTE_208_SR_MAX_S, - ilog2((unsigned int)attr->max_rd_atomic)); - roce_set_field(qpc_mask->byte_208_irrl, V2_QPC_BYTE_208_SR_MAX_M, - V2_QPC_BYTE_208_SR_MAX_S, 0); - roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_SL_M, V2_QPC_BYTE_28_SL_S, rdma_ah_get_sl(&attr->ah_attr)); roce_set_field(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_SL_M, @@ -2839,6 +2836,14 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, roce_set_field(qpc_mask->byte_196_sq_psn, V2_QPC_BYTE_196_SQ_MAX_PSN_M, V2_QPC_BYTE_196_SQ_MAX_PSN_S, 0); + if ((attr_mask & IB_QP_MAX_QP_RD_ATOMIC) && attr->max_rd_atomic) { + roce_set_field(context->byte_208_irrl, V2_QPC_BYTE_208_SR_MAX_M, + V2_QPC_BYTE_208_SR_MAX_S, + fls(attr->max_rd_atomic - 1)); + roce_set_field(qpc_mask->byte_208_irrl, + V2_QPC_BYTE_208_SR_MAX_M, + V2_QPC_BYTE_208_SR_MAX_S, 0); + } return 0; } -- cgit v1.2.3 From ace1c5416b37bc9d925f91ee163c47fa6aa16781 Mon Sep 17 00:00:00 2001 From: oulijun Date: Wed, 3 Jan 2018 10:44:05 +0800 Subject: RDMA/hns: Set access flags of hip08 RoCE This patch refactors the code that sets the access flags for RDMA operations, and also handles the case when attr->max_dest_rd_atomic is zero.
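The core rule the new set_access_flags() helper encodes, stated on its own: if the responder is left with no read/atomic resources, remote read and atomic permissions must be masked away, leaving at most remote write. A sketch of just that rule, separate from the hns register-writing code:

static u32 effective_access_flags(u32 access_flags, u8 dest_rd_atomic)
{
	if (!dest_rd_atomic)			/* no responder resources... */
		access_flags &= IB_ACCESS_REMOTE_WRITE;	/* ...drop RD/atomic */
	return access_flags;
}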
Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 1 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 48 +++++++++++++++++++++-------- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 8d123d3ee1ff..4afa070b20fd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -483,6 +483,7 @@ struct hns_roce_qp { u8 resp_depth; u8 state; u32 access_flags; + u32 atomic_rd_en; u32 pkey_index; void (*event)(struct hns_roce_qp *, enum hns_roce_event); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 4685696ff937..ea966ccb4dfb 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1931,6 +1931,36 @@ static int hns_roce_v2_qp_modify(struct hns_roce_dev *hr_dev, return ret; } +static void set_access_flags(struct hns_roce_qp *hr_qp, + struct hns_roce_v2_qp_context *context, + struct hns_roce_v2_qp_context *qpc_mask, + const struct ib_qp_attr *attr, int attr_mask) +{ + u8 dest_rd_atomic; + u32 access_flags; + + dest_rd_atomic = !!(attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) ? + attr->max_dest_rd_atomic : hr_qp->resp_depth; + + access_flags = !!(attr_mask & IB_QP_ACCESS_FLAGS) ? + attr->qp_access_flags : hr_qp->atomic_rd_en; + + if (!dest_rd_atomic) + access_flags &= IB_ACCESS_REMOTE_WRITE; + + roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RRE_S, + !!(access_flags & IB_ACCESS_REMOTE_READ)); + roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RRE_S, 0); + + roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RWE_S, + !!(access_flags & IB_ACCESS_REMOTE_WRITE)); + roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RWE_S, 0); + + roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S, + !!(access_flags & IB_ACCESS_REMOTE_ATOMIC)); + roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S, 0); +} + static void modify_qp_reset_to_init(struct ib_qp *ibqp, const struct ib_qp_attr *attr, struct hns_roce_v2_qp_context *context, @@ -2016,18 +2046,6 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_CNP_TX_FLAG_S, 0); roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_CE_FLAG_S, 0); - roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RRE_S, - !!(attr->qp_access_flags & IB_ACCESS_REMOTE_READ)); - roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RRE_S, 0); - - roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RWE_S, - !!(attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE)); - roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RWE_S, 0); - - roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S, - !!(attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)); - roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_ATE_S, 0); - roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RQIE_S, 1); roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RQIE_S, 0); @@ -2907,6 +2925,9 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, goto out; } + if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) + set_access_flags(hr_qp, context, qpc_mask, attr, attr_mask); + /* Every status migrate must change state */ roce_set_field(context->byte_60_qpst_mapid, V2_QPC_BYTE_60_QP_ST_M, V2_QPC_BYTE_60_QP_ST_S, new_state); @@ -2923,6 +2944,9 @@ static int 
hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 	hr_qp->state = new_state;
 
+	if (attr_mask & IB_QP_ACCESS_FLAGS)
+		hr_qp->atomic_rd_en = attr->qp_access_flags;
+
 	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
 		hr_qp->resp_depth = attr->max_dest_rd_atomic;
 
 	if (attr_mask & IB_QP_PORT) {
-- 
cgit v1.2.3


From 52e3b42a2f587912f0558cd4989da160d8b1304a Mon Sep 17 00:00:00 2001
From: oulijun
Date: Wed, 3 Jan 2018 10:44:06 +0800
Subject: RDMA/hns: Filter for zero length of sge in hip08 kernel mode

When the length of an sge is zero, the driver needs to filter it out.

Signed-off-by: Lijun Ou
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 29 ++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index ea966ccb4dfb..4c37f43d60a6 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -230,26 +230,37 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 			     V2_RC_SEND_WQE_BYTE_4_INLINE_S, 1);
 	} else {
 		if (wr->num_sge <= 2) {
-			for (i = 0; i < wr->num_sge; i++)
-				set_data_seg_v2(dseg + i,
-						wr->sg_list + i);
+			for (i = 0; i < wr->num_sge; i++) {
+				if (likely(wr->sg_list[i].length)) {
+					set_data_seg_v2(dseg,
+							wr->sg_list + i);
+					dseg++;
+				}
+			}
 		} else {
 			roce_set_field(rc_sq_wqe->byte_20,
 				       V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M,
 				       V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S,
 				       sge_ind & (qp->sge.sge_cnt - 1));
 
-			for (i = 0; i < 2; i++)
-				set_data_seg_v2(dseg + i,
-						wr->sg_list + i);
+			for (i = 0; i < 2; i++) {
+				if (likely(wr->sg_list[i].length)) {
+					set_data_seg_v2(dseg,
+							wr->sg_list + i);
+					dseg++;
+				}
+			}
 
 			dseg = get_send_extend_sge(qp,
 						   sge_ind & (qp->sge.sge_cnt - 1));
 
 			for (i = 0; i < wr->num_sge - 2; i++) {
-				set_data_seg_v2(dseg + i,
-						wr->sg_list + 2 + i);
-				sge_ind++;
+				if (likely(wr->sg_list[i + 2].length)) {
+					set_data_seg_v2(dseg,
+							wr->sg_list + 2 + i);
+					dseg++;
+					sge_ind++;
+				}
 			}
 		}
-- 
cgit v1.2.3


From 10bd2ade4b5fe0e8d0f3a4c4b20f008c4cb06bd2 Mon Sep 17 00:00:00 2001
From: Yixian Liu
Date: Wed, 3 Jan 2018 10:44:07 +0800
Subject: RDMA/hns: Fix QP state judgement before sending work requests

The QP can accept send work requests only when it is in a state that
allows them to be submitted. This patch updates the QP state check
according to the specification.

Signed-off-by: Yixian Liu
Signed-off-by: Shaobo Xu
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 4c37f43d60a6..2ca35e341d09 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -76,7 +76,8 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 		return -EOPNOTSUPP;
 	}
 
-	if (unlikely(qp->state != IB_QPS_RTS && qp->state != IB_QPS_SQD)) {
+	if (unlikely(qp->state == IB_QPS_RESET || qp->state == IB_QPS_INIT ||
+		     qp->state == IB_QPS_RTR)) {
 		dev_err(dev, "Post WQE fail, QP state %d err!\n", qp->state);
 		*bad_wr = wr;
 		return -EINVAL;
-- 
cgit v1.2.3


From 107013ce7b28c3d7395bc0299c0fe3ce12f15b6f Mon Sep 17 00:00:00 2001
From: oulijun
Date: Wed, 3 Jan 2018 10:44:08 +0800
Subject: RDMA/hns: Assign dest_qp when deregistering mr

Eight reserved QPs are created to work around a hip06 hardware bug.
When deregistering an MR, the driver issues an RDMA write on every
reserved QP. When modifying such a QP from INIT to RTR, the value of
dest_qp_num needs to be set. Otherwise, freeing the MR will fail.

Signed-off-by: Lijun Ou
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index 73f187e784c7..939355ede14a 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -775,7 +775,7 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
 			goto create_lp_qp_failed;
 		}
 
-		ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, &attr, attr_mask,
+		ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, &attr, IB_QP_DEST_QPN,
 					    IB_QPS_INIT, IB_QPS_RTR);
 		if (ret) {
 			dev_err(dev, "modify qp failed(%d)!\n", ret);
-- 
cgit v1.2.3


From d67d6114ca23263dfc17ef98e94b88846b796212 Mon Sep 17 00:00:00 2001
From: "Michael J. Ruhl"
Date: Fri, 22 Dec 2017 08:45:44 -0800
Subject: IB/hfi1: Add RQ/SRQ information to QP stats

When debugging issues with RC QPs, it is useful to know if a QP has an
associated RQ or SRQ, the size of the RQ, and any RNR timeout values.
Add the necessary information to the QP stats output.

Reviewed-by: Mike Marciniszyn
Signed-off-by: Michael J. Ruhl
Signed-off-by: Dennis Dalessandro
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/hfi1/qp.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
index 4b01ccd895b4..5507910e8b8a 100644
--- a/drivers/infiniband/hw/hfi1/qp.c
+++ b/drivers/infiniband/hw/hfi1/qp.c
@@ -556,6 +556,8 @@ void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter)
 	struct sdma_engine *sde;
 	struct send_context *send_context;
 	struct rvt_ack_entry *e = NULL;
+	struct rvt_srq *srq = qp->ibqp.srq ?
+		ibsrq_to_rvtsrq(qp->ibqp.srq) : NULL;
 
 	sde = qp_to_sdma_engine(qp, priv->s_sc);
 	wqe = rvt_get_swqe_ptr(qp, qp->s_last);
@@ -563,7 +565,7 @@ void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter)
 	if (qp->s_ack_queue)
 		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
 	seq_printf(s,
-		   "N %d %s QP %x R %u %s %u %u %u f=%x %u %u %u %u %u %u SPSN %x %x %x %x %x RPSN %x S(%u %u %u %u %u %u %u) R(%u %u %u) RQP %x LID %x SL %u MTU %u %u %u %u %u SDE %p,%u SC %p,%u SCQ %u %u PID %d OS %x %x E %x %x %x\n",
+		   "N %d %s QP %x R %u %s %u %u %u f=%x %u %u %u %u %u %u SPSN %x %x %x %x %x RPSN %x S(%u %u %u %u %u %u %u) R(%u %u %u) RQP %x LID %x SL %u MTU %u %u %u %u %u SDE %p,%u SC %p,%u SCQ %u %u PID %d OS %x %x E %x %x %x RNR %d %s %d\n",
 		   iter->n,
 		   qp_idle(qp) ? "I" : "B",
 		   qp->ibqp.qp_num,
@@ -610,7 +612,11 @@ void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter)
 		   /* ack queue information */
 		   e ? e->opcode : 0,
 		   e ? e->psn : 0,
-		   e ? e->lpsn : 0);
+		   e ? e->lpsn : 0,
+		   qp->r_min_rnr_timer,
+		   srq ? "SRQ" : "RQ",
+		   srq ? srq->rq.size : qp->r_rq.size
+		   );
 }
 
 void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp)
-- 
cgit v1.2.3


From 16c1975f1032330d54979fbc83c5f767afce67a9 Mon Sep 17 00:00:00 2001
From: Mark Bloch
Date: Mon, 1 Jan 2018 13:06:58 +0200
Subject: IB/mlx5: Create profile infrastructure to add and remove stages

Today we have a single function which is used when we add an IB
interface; break this function into multiple functions. Create stages
and a generic mechanism to execute each stage. This is in preparation
for RDMA/IB representors, which might not need all stages or will do
things differently in some of the stages.
This patch doesn't change any functionality. Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 344 +++++++++++++++++++++++++---------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 31 ++++ 2 files changed, 282 insertions(+), 93 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 1422db847b3e..97c2b6b25ac8 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4024,30 +4024,21 @@ mlx5_ib_get_vector_affinity(struct ib_device *ibdev, int comp_vector) return mlx5_get_vector_affinity(dev->mdev, comp_vector); } -static void *mlx5_ib_add(struct mlx5_core_dev *mdev) +static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) { - struct mlx5_ib_dev *dev; - enum rdma_link_layer ll; - int port_type_cap; + kfree(dev->port); +} + +static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; const char *name; int err; - int i; - - port_type_cap = MLX5_CAP_GEN(mdev, port_type); - ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); - - printk_once(KERN_INFO "%s", mlx5_version); - - dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev)); - if (!dev) - return NULL; - - dev->mdev = mdev; dev->port = kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port), GFP_KERNEL); if (!dev->port) - goto err_dealloc; + return -ENOMEM; rwlock_init(&dev->roce.netdev_lock); err = get_port_caps(dev); @@ -4072,6 +4063,24 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->mdev->priv.eq_table.num_comp_vectors; dev->ib_dev.dev.parent = &mdev->pdev->dev; + return 0; + +err_free_port: + kfree(dev->port); + + return -ENOMEM; +} + +static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + enum rdma_link_layer ll; + int port_type_cap; + int err; + + port_type_cap = MLX5_CAP_GEN(mdev, port_type); + ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); + dev->ib_dev.uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION; dev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | @@ -4215,139 +4224,288 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) } err = init_node_data(dev); if (err) - goto err_free_port; + return err; mutex_init(&dev->flow_db.lock); mutex_init(&dev->cap_mask_mutex); INIT_LIST_HEAD(&dev->qp_list); spin_lock_init(&dev->reset_flow_resource_lock); + return 0; +} + +static int mlx5_ib_stage_roce_init(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + enum rdma_link_layer ll; + int port_type_cap; + int err; + + port_type_cap = MLX5_CAP_GEN(mdev, port_type); + ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); + if (ll == IB_LINK_LAYER_ETHERNET) { err = mlx5_enable_eth(dev); if (err) - goto err_free_port; + return err; dev->roce.last_port_state = IB_PORT_DOWN; } - err = create_dev_resources(&dev->devr); - if (err) - goto err_disable_eth; + return 0; +} - err = mlx5_ib_odp_init_one(dev); - if (err) - goto err_rsrc; +static void mlx5_ib_stage_roce_cleanup(struct mlx5_ib_dev *dev) +{ + struct mlx5_core_dev *mdev = dev->mdev; + enum rdma_link_layer ll; + int port_type_cap; - if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) { - err = mlx5_ib_alloc_counters(dev); - if (err) - goto err_odp; + port_type_cap = MLX5_CAP_GEN(mdev, port_type); + ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); + + if (ll == IB_LINK_LAYER_ETHERNET) { + mlx5_disable_eth(dev); + mlx5_remove_netdev_notifier(dev); } +} - err = mlx5_ib_init_cong_debugfs(dev); - if (err) - 
goto err_cnt; +static int mlx5_ib_stage_dev_res_init(struct mlx5_ib_dev *dev) +{ + return create_dev_resources(&dev->devr); +} + +static void mlx5_ib_stage_dev_res_cleanup(struct mlx5_ib_dev *dev) +{ + destroy_dev_resources(&dev->devr); +} + +static int mlx5_ib_stage_odp_init(struct mlx5_ib_dev *dev) +{ + return mlx5_ib_odp_init_one(dev); +} +static void mlx5_ib_stage_odp_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_ib_odp_remove_one(dev); +} + +static int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev) +{ + if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) + return mlx5_ib_alloc_counters(dev); + + return 0; +} + +static void mlx5_ib_stage_counters_cleanup(struct mlx5_ib_dev *dev) +{ + if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) + mlx5_ib_dealloc_counters(dev); +} + +static int mlx5_ib_stage_cong_debugfs_init(struct mlx5_ib_dev *dev) +{ + return mlx5_ib_init_cong_debugfs(dev); +} + +static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_ib_cleanup_cong_debugfs(dev); +} + +static int mlx5_ib_stage_uar_init(struct mlx5_ib_dev *dev) +{ dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev); if (!dev->mdev->priv.uar) - goto err_cong; + return -ENOMEM; + return 0; +} + +static void mlx5_ib_stage_uar_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar); +} + +static int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev) +{ + int err; err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false); if (err) - goto err_uar_page; + return err; err = mlx5_alloc_bfreg(dev->mdev, &dev->fp_bfreg, false, true); if (err) - goto err_bfreg; + mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); - err = ib_register_device(&dev->ib_dev, NULL); - if (err) - goto err_fp_bfreg; + return err; +} - err = create_umr_res(dev); - if (err) - goto err_dev; +static void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev) +{ + mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); + mlx5_free_bfreg(dev->mdev, &dev->bfreg); +} +static int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev) +{ + return ib_register_device(&dev->ib_dev, NULL); +} + +static void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev) +{ + ib_unregister_device(&dev->ib_dev); +} + +static int mlx5_ib_stage_umr_res_init(struct mlx5_ib_dev *dev) +{ + return create_umr_res(dev); +} + +static void mlx5_ib_stage_umr_res_cleanup(struct mlx5_ib_dev *dev) +{ + destroy_umrc_res(dev); +} + +static int mlx5_ib_stage_delay_drop_init(struct mlx5_ib_dev *dev) +{ init_delay_drop(dev); + return 0; +} + +static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev) +{ + cancel_delay_drop(dev); +} + +static int mlx5_ib_stage_class_attr_init(struct mlx5_ib_dev *dev) +{ + int err; + int i; + for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { err = device_create_file(&dev->ib_dev.dev, mlx5_class_attributes[i]); if (err) - goto err_delay_drop; + return err; } - if ((MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && - MLX5_CAP_GEN(mdev, disable_local_lb)) + return 0; +} + +static int mlx5_ib_stage_loopback_init(struct mlx5_ib_dev *dev) +{ + if ((MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && + MLX5_CAP_GEN(dev->mdev, disable_local_lb)) mutex_init(&dev->lb_mutex); - dev->ib_active = true; + return 0; +} - return dev; +static void __mlx5_ib_remove(struct mlx5_ib_dev *dev, + const struct mlx5_ib_profile *profile, + int stage) +{ + /* Number of stages to cleanup */ + while (stage) { + stage--; + if (profile->stage[stage].cleanup) + profile->stage[stage].cleanup(dev); + } -err_delay_drop: - 
cancel_delay_drop(dev); - destroy_umrc_res(dev); + ib_dealloc_device((struct ib_device *)dev); +} -err_dev: - ib_unregister_device(&dev->ib_dev); +static void *__mlx5_ib_add(struct mlx5_core_dev *mdev, + const struct mlx5_ib_profile *profile) +{ + struct mlx5_ib_dev *dev; + int err; + int i; -err_fp_bfreg: - mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); + printk_once(KERN_INFO "%s", mlx5_version); -err_bfreg: - mlx5_free_bfreg(dev->mdev, &dev->bfreg); + dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev)); + if (!dev) + return NULL; -err_uar_page: - mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar); + dev->mdev = mdev; -err_cong: - mlx5_ib_cleanup_cong_debugfs(dev); -err_cnt: - if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) - mlx5_ib_dealloc_counters(dev); + for (i = 0; i < MLX5_IB_STAGE_MAX; i++) { + if (profile->stage[i].init) { + err = profile->stage[i].init(dev); + if (err) + goto err_out; + } + } -err_odp: - mlx5_ib_odp_remove_one(dev); + dev->profile = profile; + dev->ib_active = true; -err_rsrc: - destroy_dev_resources(&dev->devr); + return dev; -err_disable_eth: - if (ll == IB_LINK_LAYER_ETHERNET) { - mlx5_disable_eth(dev); - mlx5_remove_netdev_notifier(dev); - } +err_out: + __mlx5_ib_remove(dev, profile, i); -err_free_port: - kfree(dev->port); + return NULL; +} -err_dealloc: - ib_dealloc_device((struct ib_device *)dev); +static const struct mlx5_ib_profile pf_profile = { + STAGE_CREATE(MLX5_IB_STAGE_INIT, + mlx5_ib_stage_init_init, + mlx5_ib_stage_init_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_CAPS, + mlx5_ib_stage_caps_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_ROCE, + mlx5_ib_stage_roce_init, + mlx5_ib_stage_roce_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, + mlx5_ib_stage_dev_res_init, + mlx5_ib_stage_dev_res_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_ODP, + mlx5_ib_stage_odp_init, + mlx5_ib_stage_odp_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_COUNTERS, + mlx5_ib_stage_counters_init, + mlx5_ib_stage_counters_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS, + mlx5_ib_stage_cong_debugfs_init, + mlx5_ib_stage_cong_debugfs_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_UAR, + mlx5_ib_stage_uar_init, + mlx5_ib_stage_uar_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_BFREG, + mlx5_ib_stage_bfrag_init, + mlx5_ib_stage_bfrag_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_IB_REG, + mlx5_ib_stage_ib_reg_init, + mlx5_ib_stage_ib_reg_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_UMR_RESOURCES, + mlx5_ib_stage_umr_res_init, + mlx5_ib_stage_umr_res_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP, + mlx5_ib_stage_delay_drop_init, + mlx5_ib_stage_delay_drop_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_CLASS_ATTR, + mlx5_ib_stage_class_attr_init, + NULL), + STAGE_CREATE(MLX5_IB_STAGE_LOOPBACK, + mlx5_ib_stage_loopback_init, + NULL), +}; - return NULL; +static void *mlx5_ib_add(struct mlx5_core_dev *mdev) +{ + return __mlx5_ib_add(mdev, &pf_profile); } static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) { struct mlx5_ib_dev *dev = context; - enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1); - cancel_delay_drop(dev); - mlx5_remove_netdev_notifier(dev); - ib_unregister_device(&dev->ib_dev); - mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); - mlx5_free_bfreg(dev->mdev, &dev->bfreg); - mlx5_put_uars_page(dev->mdev, mdev->priv.uar); - mlx5_ib_cleanup_cong_debugfs(dev); - if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) - mlx5_ib_dealloc_counters(dev); - destroy_umrc_res(dev); - mlx5_ib_odp_remove_one(dev); - destroy_dev_resources(&dev->devr); - if (ll == IB_LINK_LAYER_ETHERNET) - mlx5_disable_eth(dev); - 
kfree(dev->port); - ib_dealloc_device(&dev->ib_dev); + __mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX); } static struct mlx5_interface mlx5_ib_interface = { diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 5fabd5807db6..0504ab3b9a06 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -715,6 +715,36 @@ struct mlx5_ib_delay_drop { struct mlx5_ib_dbg_delay_drop *dbg; }; +enum mlx5_ib_stages { + MLX5_IB_STAGE_INIT, + MLX5_IB_STAGE_CAPS, + MLX5_IB_STAGE_ROCE, + MLX5_IB_STAGE_DEVICE_RESOURCES, + MLX5_IB_STAGE_ODP, + MLX5_IB_STAGE_COUNTERS, + MLX5_IB_STAGE_CONG_DEBUGFS, + MLX5_IB_STAGE_UAR, + MLX5_IB_STAGE_BFREG, + MLX5_IB_STAGE_IB_REG, + MLX5_IB_STAGE_UMR_RESOURCES, + MLX5_IB_STAGE_DELAY_DROP, + MLX5_IB_STAGE_CLASS_ATTR, + MLX5_IB_STAGE_LOOPBACK, + MLX5_IB_STAGE_MAX, +}; + +struct mlx5_ib_stage { + int (*init)(struct mlx5_ib_dev *dev); + void (*cleanup)(struct mlx5_ib_dev *dev); +}; + +#define STAGE_CREATE(_stage, _init, _cleanup) \ + .stage[_stage] = {.init = _init, .cleanup = _cleanup} + +struct mlx5_ib_profile { + struct mlx5_ib_stage stage[MLX5_IB_STAGE_MAX]; +}; + struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; @@ -753,6 +783,7 @@ struct mlx5_ib_dev { struct mlx5_sq_bfreg fp_bfreg; struct mlx5_ib_delay_drop delay_drop; struct mlx5_ib_dbg_cc_params *dbg_cc_params; + const struct mlx5_ib_profile *profile; /* protect the user_td */ struct mutex lb_mutex; -- cgit v1.2.3 From c11a226a1e023508a663a07ff822f6ff33e7868c Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Mon, 1 Jan 2018 13:06:59 +0200 Subject: IB/mlx5: Move RoCE/ETH initialization to the corresponding stage Now that we have a stage just for RoCE/ETH, move all relevant initialization logic into one place. 
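The profile mechanism introduced by the patch above reduces to a table
of {init, cleanup} pairs that is walked forward on add and backward on
remove, so a failure at stage i unwinds exactly the stages that
completed. A condensed sketch of that control flow (placeholder names,
not the mlx5 symbols):

    struct demo_dev;

    struct demo_stage {
            int  (*init)(struct demo_dev *dev);
            void (*cleanup)(struct demo_dev *dev);
    };

    /* Unwind the stages that completed, last first. */
    static void demo_remove(struct demo_dev *dev,
                            const struct demo_stage *profile, int stage)
    {
            while (stage--)
                    if (profile[stage].cleanup)
                            profile[stage].cleanup(dev);
    }

    /* Run each stage in order; on failure, undo only what succeeded. */
    static int demo_add(struct demo_dev *dev,
                        const struct demo_stage *profile, int nstages)
    {
            int i, err;

            for (i = 0; i < nstages; i++) {
                    if (profile[i].init) {
                            err = profile[i].init(dev);
                            if (err) {
                                    demo_remove(dev, profile, i);
                                    return err;
                            }
                    }
            }
            return 0;
    }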
Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 97c2b6b25ac8..9a5de242b74d 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4074,13 +4074,8 @@ err_free_port: static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) { struct mlx5_core_dev *mdev = dev->mdev; - enum rdma_link_layer ll; - int port_type_cap; int err; - port_type_cap = MLX5_CAP_GEN(mdev, port_type); - ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); - dev->ib_dev.uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION; dev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | @@ -4119,8 +4114,6 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) dev->ib_dev.query_device = mlx5_ib_query_device; dev->ib_dev.query_port = mlx5_ib_query_port; dev->ib_dev.get_link_layer = mlx5_ib_port_link_layer; - if (ll == IB_LINK_LAYER_ETHERNET) - dev->ib_dev.get_netdev = mlx5_ib_get_netdev; dev->ib_dev.query_gid = mlx5_ib_query_gid; dev->ib_dev.add_gid = mlx5_ib_add_gid; dev->ib_dev.del_gid = mlx5_ib_del_gid; @@ -4208,20 +4201,6 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); - if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == - IB_LINK_LAYER_ETHERNET) { - dev->ib_dev.create_wq = mlx5_ib_create_wq; - dev->ib_dev.modify_wq = mlx5_ib_modify_wq; - dev->ib_dev.destroy_wq = mlx5_ib_destroy_wq; - dev->ib_dev.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table; - dev->ib_dev.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table; - dev->ib_dev.uverbs_ex_cmd_mask |= - (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) | - (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) | - (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) | - (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) | - (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL); - } err = init_node_data(dev); if (err) return err; @@ -4245,6 +4224,18 @@ static int mlx5_ib_stage_roce_init(struct mlx5_ib_dev *dev) ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); if (ll == IB_LINK_LAYER_ETHERNET) { + dev->ib_dev.get_netdev = mlx5_ib_get_netdev; + dev->ib_dev.create_wq = mlx5_ib_create_wq; + dev->ib_dev.modify_wq = mlx5_ib_modify_wq; + dev->ib_dev.destroy_wq = mlx5_ib_destroy_wq; + dev->ib_dev.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table; + dev->ib_dev.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table; + dev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) | + (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL); err = mlx5_enable_eth(dev); if (err) return err; -- cgit v1.2.3 From 07321b3c674bc2f4d1e47431e38365af523fe34f Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Mon, 1 Jan 2018 13:07:00 +0200 Subject: IB/mlx5: Move ODP initialization to the corresponding stage Now that we have a stage just for ODP, move all relevant initialization logic into one place. 
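This move and the RoCE/ETH move above share one shape: the stage's init
hook now owns the conditional wiring that used to sit in the monolithic
add function. Schematically, for the Ethernet case (illustrative
placeholder types and helpers, not the mlx5 API):

    enum demo_ll { DEMO_LINK_IB, DEMO_LINK_ETHERNET };

    struct demo_ops {
            int (*get_netdev)(void);
            int (*create_wq)(void);
    };

    struct demo_dev { enum demo_ll link_layer; struct demo_ops ops; };

    static int demo_get_netdev(void) { return 0; }
    static int demo_create_wq(void) { return 0; }
    static int demo_enable_eth(struct demo_dev *dev) { return 0; }

    /* The RoCE stage owns every Ethernet-only decision. */
    static int demo_roce_stage_init(struct demo_dev *dev)
    {
            if (dev->link_layer != DEMO_LINK_ETHERNET)
                    return 0;       /* nothing to do on IB links */

            dev->ops.get_netdev = demo_get_netdev;
            dev->ops.create_wq = demo_create_wq;

            return demo_enable_eth(dev);
    }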
Signed-off-by: Mark Bloch
Signed-off-by: Leon Romanovsky
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/mlx5/main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 9a5de242b74d..07331613f423 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -4170,8 +4170,6 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
 	dev->ib_dev.disassociate_ucontext = mlx5_ib_disassociate_ucontext;
 
-	mlx5_ib_internal_fill_odp_caps(dev);
-
 	dev->umr_fence = mlx5_get_umr_fence(MLX5_CAP_GEN(mdev, umr_fence));
 
 	if (MLX5_CAP_GEN(mdev, imaicl)) {
@@ -4272,6 +4270,8 @@ static void mlx5_ib_stage_dev_res_cleanup(struct mlx5_ib_dev *dev)
 
 static int mlx5_ib_stage_odp_init(struct mlx5_ib_dev *dev)
 {
+	mlx5_ib_internal_fill_odp_caps(dev);
+
 	return mlx5_ib_odp_init_one(dev);
 }
-- 
cgit v1.2.3


From 5e1e76125152c9e4a963188ff66e773086fb54d2 Mon Sep 17 00:00:00 2001
From: Mark Bloch
Date: Mon, 1 Jan 2018 13:07:01 +0200
Subject: IB/mlx5: Move hardware counters initialization to the corresponding stage

Now that we have a stage just for hardware counters, move all relevant
initialization logic into one place.

Signed-off-by: Mark Bloch
Signed-off-by: Leon Romanovsky
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/mlx5/main.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 07331613f423..1a409e7ade80 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -4180,11 +4180,6 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
 			(1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
 	}
 
-	if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) {
-		dev->ib_dev.get_hw_stats = mlx5_ib_get_hw_stats;
-		dev->ib_dev.alloc_hw_stats = mlx5_ib_alloc_hw_stats;
-	}
-
 	if (MLX5_CAP_GEN(mdev, xrc)) {
 		dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
 		dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
@@ -4282,8 +4277,12 @@ static void mlx5_ib_stage_odp_cleanup(struct mlx5_ib_dev *dev)
 
 static int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev)
 {
-	if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
-		return mlx5_ib_alloc_counters(dev);
+	if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) {
+		dev->ib_dev.get_hw_stats = mlx5_ib_get_hw_stats;
+		dev->ib_dev.alloc_hw_stats = mlx5_ib_alloc_hw_stats;
+
+		return mlx5_ib_alloc_counters(dev);
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From c8b8992446a945c103ac74ebd5e05672d9b3c48a Mon Sep 17 00:00:00 2001
From: Mark Bloch
Date: Mon, 1 Jan 2018 13:07:02 +0200
Subject: IB/mlx5: Move loopback initialization to the corresponding stage

The loopback stage only initializes a lock; move it into the CAPS
initialization phase and get rid of the loopback stage completely.
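The counters move above shows a related property of the design: a stage
may degenerate to a no-op when the hardware lacks the feature, keeping
the method wiring and the allocation behind a single capability check.
A sketch in the same placeholder style (assumed names, not the mlx5
symbols):

    struct demo_cnt_dev {
            int has_counters;       /* stands in for the capability bit */
            int (*get_hw_stats)(void);
            int (*alloc_hw_stats)(void);
    };

    static int demo_get_hw_stats(void) { return 0; }
    static int demo_alloc_hw_stats(void) { return 0; }
    static int demo_alloc_counters(struct demo_cnt_dev *dev) { return 0; }

    static int demo_counters_stage_init(struct demo_cnt_dev *dev)
    {
            if (!dev->has_counters)
                    return 0;       /* whole stage is a no-op */

            dev->get_hw_stats = demo_get_hw_stats;
            dev->alloc_hw_stats = demo_alloc_hw_stats;

            return demo_alloc_counters(dev);
    }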
Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 16 ++++------------ drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 - 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 1a409e7ade80..b8952fff1fc8 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4203,6 +4203,10 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) INIT_LIST_HEAD(&dev->qp_list); spin_lock_init(&dev->reset_flow_resource_lock); + if ((MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && + MLX5_CAP_GEN(dev->mdev, disable_local_lb)) + mutex_init(&dev->lb_mutex); + return 0; } @@ -4384,15 +4388,6 @@ static int mlx5_ib_stage_class_attr_init(struct mlx5_ib_dev *dev) return 0; } -static int mlx5_ib_stage_loopback_init(struct mlx5_ib_dev *dev) -{ - if ((MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && - MLX5_CAP_GEN(dev->mdev, disable_local_lb)) - mutex_init(&dev->lb_mutex); - - return 0; -} - static void __mlx5_ib_remove(struct mlx5_ib_dev *dev, const struct mlx5_ib_profile *profile, int stage) @@ -4481,9 +4476,6 @@ static const struct mlx5_ib_profile pf_profile = { STAGE_CREATE(MLX5_IB_STAGE_CLASS_ATTR, mlx5_ib_stage_class_attr_init, NULL), - STAGE_CREATE(MLX5_IB_STAGE_LOOPBACK, - mlx5_ib_stage_loopback_init, - NULL), }; static void *mlx5_ib_add(struct mlx5_core_dev *mdev) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 0504ab3b9a06..768fa7334100 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -729,7 +729,6 @@ enum mlx5_ib_stages { MLX5_IB_STAGE_UMR_RESOURCES, MLX5_IB_STAGE_DELAY_DROP, MLX5_IB_STAGE_CLASS_ATTR, - MLX5_IB_STAGE_LOOPBACK, MLX5_IB_STAGE_MAX, }; -- cgit v1.2.3 From 3cc297db970762024109f75ce289078f8479a2f8 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Mon, 1 Jan 2018 13:07:03 +0200 Subject: IB/mlx5: Move locks initialization to the corresponding stage Unconditional locks/list and ODP srcu initialization should be done in the INIT stage. Remove those from the CAPS stage and move them to the proper stage. 
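Put schematically, the rule this patch enforces is that state living for
the whole device lifetime is created in the first stage and released in
that stage's cleanup, so the reverse-order unwind tears it down last. A
sketch under those assumptions (the struct is a placeholder; the srcu
and mutex calls are the real kernel APIs):

    #include <linux/mutex.h>
    #include <linux/srcu.h>

    struct demo_srcu_dev {
            struct mutex cap_lock;
            struct srcu_struct mr_srcu;
    };

    /* First stage: unconditional, device-lifetime state only. */
    static int demo_init_stage_init(struct demo_srcu_dev *dev)
    {
            mutex_init(&dev->cap_lock);

            return init_srcu_struct(&dev->mr_srcu);
    }

    /* Runs last during unwind, mirroring the allocation order. */
    static void demo_init_stage_cleanup(struct demo_srcu_dev *dev)
    {
            cleanup_srcu_struct(&dev->mr_srcu);
    }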
Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 26 +++++++++++++++----------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 -- drivers/infiniband/hw/mlx5/odp.c | 9 --------- 3 files changed, 15 insertions(+), 22 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index b8952fff1fc8..b9e195d154b1 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4026,6 +4026,9 @@ mlx5_ib_get_vector_affinity(struct ib_device *ibdev, int comp_vector) static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) { +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + cleanup_srcu_struct(&dev->mr_srcu); +#endif kfree(dev->port); } @@ -4063,6 +4066,17 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) dev->mdev->priv.eq_table.num_comp_vectors; dev->ib_dev.dev.parent = &mdev->pdev->dev; + mutex_init(&dev->flow_db.lock); + mutex_init(&dev->cap_mask_mutex); + INIT_LIST_HEAD(&dev->qp_list); + spin_lock_init(&dev->reset_flow_resource_lock); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + err = init_srcu_struct(&dev->mr_srcu); + if (err) + goto err_free_port; +#endif + return 0; err_free_port: @@ -4198,11 +4212,6 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) if (err) return err; - mutex_init(&dev->flow_db.lock); - mutex_init(&dev->cap_mask_mutex); - INIT_LIST_HEAD(&dev->qp_list); - spin_lock_init(&dev->reset_flow_resource_lock); - if ((MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && MLX5_CAP_GEN(dev->mdev, disable_local_lb)) mutex_init(&dev->lb_mutex); @@ -4274,11 +4283,6 @@ static int mlx5_ib_stage_odp_init(struct mlx5_ib_dev *dev) return mlx5_ib_odp_init_one(dev); } -static void mlx5_ib_stage_odp_cleanup(struct mlx5_ib_dev *dev) -{ - mlx5_ib_odp_remove_one(dev); -} - static int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev) { if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) { @@ -4451,7 +4455,7 @@ static const struct mlx5_ib_profile pf_profile = { mlx5_ib_stage_dev_res_cleanup), STAGE_CREATE(MLX5_IB_STAGE_ODP, mlx5_ib_stage_odp_init, - mlx5_ib_stage_odp_cleanup), + NULL), STAGE_CREATE(MLX5_IB_STAGE_COUNTERS, mlx5_ib_stage_counters_init, mlx5_ib_stage_counters_cleanup), diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 768fa7334100..b3f2f5cae672 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -998,7 +998,6 @@ void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev); void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context, struct mlx5_pagefault *pfault); int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); -void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev); int __init mlx5_ib_odp_init(void); void mlx5_ib_odp_cleanup(void); void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, @@ -1013,7 +1012,6 @@ static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) } static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; } -static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {} static inline int mlx5_ib_odp_init(void) { return 0; } static inline void mlx5_ib_odp_cleanup(void) {} static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {} diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index e2197bdda89c..f1a87a690a4c 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ 
-1207,10 +1207,6 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) { int ret; - ret = init_srcu_struct(&dev->mr_srcu); - if (ret) - return ret; - if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) { ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey); if (ret) { @@ -1222,11 +1218,6 @@ int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) return 0; } -void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *dev) -{ - cleanup_srcu_struct(&dev->mr_srcu); -} - int mlx5_ib_odp_init(void) { mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) - -- cgit v1.2.3 From 02ee9da347873699603d9ce0112a80b5dd69dea1 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 3 Jan 2018 13:28:18 -0800 Subject: IB/core: Fix two kernel warnings triggered by rxe registration Eliminate the WARN_ONs that create following two warnings when registering an rxe device: WARNING: CPU: 2 PID: 1005 at drivers/infiniband/core/device.c:449 ib_register_device+0x591/0x640 [ib_core] CPU: 2 PID: 1005 Comm: run_tests Not tainted 4.15.0-rc4-dbg+ #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014 RIP: 0010:ib_register_device+0x591/0x640 [ib_core] Call Trace: rxe_register_device+0x3c6/0x470 [rdma_rxe] rxe_add+0x543/0x5e0 [rdma_rxe] rxe_net_add+0x37/0xb0 [rdma_rxe] rxe_param_set_add+0x5a/0x120 [rdma_rxe] param_attr_store+0x5e/0xc0 module_attr_store+0x19/0x30 sysfs_kf_write+0x3d/0x50 kernfs_fop_write+0x116/0x1a0 __vfs_write+0x23/0x120 vfs_write+0xbe/0x1b0 SyS_write+0x44/0xa0 entry_SYSCALL_64_fastpath+0x23/0x9a WARNING: CPU: 2 PID: 1005 at drivers/infiniband/core/sysfs.c:1279 ib_device_register_sysfs+0x11d/0x160 [ib_core] CPU: 2 PID: 1005 Comm: run_tests Tainted: G W 4.15.0-rc4-dbg+ #2 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014 RIP: 0010:ib_device_register_sysfs+0x11d/0x160 [ib_core] Call Trace: ib_register_device+0x3f7/0x640 [ib_core] rxe_register_device+0x3c6/0x470 [rdma_rxe] rxe_add+0x543/0x5e0 [rdma_rxe] rxe_net_add+0x37/0xb0 [rdma_rxe] rxe_param_set_add+0x5a/0x120 [rdma_rxe] param_attr_store+0x5e/0xc0 module_attr_store+0x19/0x30 sysfs_kf_write+0x3d/0x50 kernfs_fop_write+0x116/0x1a0 __vfs_write+0x23/0x120 vfs_write+0xbe/0x1b0 SyS_write+0x44/0xa0 entry_SYSCALL_64_fastpath+0x23/0x9a The code should accept either a parent pointer or a fully specified DMA specification without producing warnings. Fixes: 99db9494035f ("IB/core: Remove ib_device.dma_device") Signed-off-by: Bart Van Assche Cc: Leon Romanovsky Cc: stable@vger.kernel.org # v4.11 Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 20 ++++++++++++++------ drivers/infiniband/core/sysfs.c | 1 - 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index a8757b5552de..5c9934b1e598 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -446,7 +446,6 @@ int ib_register_device(struct ib_device *device, struct ib_udata uhw = {.outlen = 0, .inlen = 0}; struct device *parent = device->dev.parent; - WARN_ON_ONCE(!parent); WARN_ON_ONCE(device->dma_device); if (device->dev.dma_ops) { /* @@ -455,16 +454,25 @@ int ib_register_device(struct ib_device *device, * into device->dev. 
*/ device->dma_device = &device->dev; - if (!device->dev.dma_mask) - device->dev.dma_mask = parent->dma_mask; - if (!device->dev.coherent_dma_mask) - device->dev.coherent_dma_mask = - parent->coherent_dma_mask; + if (!device->dev.dma_mask) { + if (parent) + device->dev.dma_mask = parent->dma_mask; + else + WARN_ON_ONCE(true); + } + if (!device->dev.coherent_dma_mask) { + if (parent) + device->dev.coherent_dma_mask = + parent->coherent_dma_mask; + else + WARN_ON_ONCE(true); + } } else { /* * The caller did not provide custom DMA operations. Use the * DMA mapping operations of the parent device. */ + WARN_ON_ONCE(!parent); device->dma_device = parent; } diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index e30d86fa1855..8ae1308eecc7 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1276,7 +1276,6 @@ int ib_device_register_sysfs(struct ib_device *device, int ret; int i; - WARN_ON_ONCE(!device->dev.parent); ret = dev_set_name(class_dev, "%s", device->name); if (ret) return ret; -- cgit v1.2.3 From 437ff786e24934bb39493f9c381595e6fab6d338 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Mon, 18 Dec 2017 19:56:30 -0800 Subject: IB/rdmavt: No need to cancel RNRNAK retry timer when it is running When the rdmavt's RNRNAK timer is fired, it tries to cancel the timer by calling hrtimer_try_to_cancel(), which always returns -1 because the timer is currently running. This patch removes this useless call. Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/qp.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 9177df60742a..73705a19bd2e 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -2110,10 +2110,8 @@ static int rvt_stop_rnr_timer(struct rvt_qp *qp) lockdep_assert_held(&qp->s_lock); /* Remove QP from rnr timer */ - if (qp->s_flags & RVT_S_WAIT_RNR) { + if (qp->s_flags & RVT_S_WAIT_RNR) qp->s_flags &= ~RVT_S_WAIT_RNR; - rval = hrtimer_try_to_cancel(&qp->s_rnr_timer); - } return rval; } -- cgit v1.2.3 From 5084c8ff21f202db98a2228eb6a042f18b8f0fee Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Mon, 18 Dec 2017 19:56:37 -0800 Subject: IB/{rdmavt, hfi1, qib}: Self determine driver name Currently the HFI and QIB drivers allow the IB core to assign a unit number to the driver name string. If multiple devices exist in a system, there is a possibility that the device unit number and the IB core number will be mismatched. Fix by using the driver defined unit number to generate the device name. Reviewed-by: Mike Marciniszyn Signed-off-by: Michael J. 
Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/init.c | 2 ++ drivers/infiniband/hw/hfi1/verbs.c | 3 --- drivers/infiniband/hw/qib/qib_init.c | 2 ++ drivers/infiniband/hw/qib/qib_verbs.c | 1 - include/rdma/rdma_vt.h | 13 +++++++++++++ 5 files changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 8e3b3e7d829a..9b128268fb28 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1272,6 +1272,8 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) "Could not allocate unit ID: error %d\n", -ret); goto bail; } + rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s_%d", class_name(), dd->unit); + /* * Initialize all locks for the device. This needs to be as early as * possible so locks are usable. diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 6d27c8594b34..4e68e3f3ce9f 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1844,7 +1844,6 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) struct hfi1_ibport *ibp = &ppd->ibport_data; unsigned i; int ret; - size_t lcpysz = IB_DEVICE_NAME_MAX; for (i = 0; i < dd->num_pports; i++) init_ibport(ppd + i); @@ -1872,8 +1871,6 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) */ if (!ib_hfi1_sys_image_guid) ib_hfi1_sys_image_guid = ibdev->node_guid; - lcpysz = strlcpy(ibdev->name, class_name(), lcpysz); - strlcpy(ibdev->name + lcpysz, "_%d", IB_DEVICE_NAME_MAX - lcpysz); ibdev->owner = THIS_MODULE; ibdev->phys_port_cnt = dd->num_pports; ibdev->dev.parent = &dd->pcidev->dev; diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index 85dfbba427f6..3990f386aa32 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -1119,6 +1119,8 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra) "Could not allocate unit ID: error %d\n", -ret); goto bail; } + rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s%d", "qib", dd->unit); + dd->int_counter = alloc_percpu(u64); if (!dd->int_counter) { ret = -ENOMEM; diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index c55000501582..373b80b1d677 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1571,7 +1571,6 @@ int qib_register_ib_device(struct qib_devdata *dd) if (!ib_qib_sys_image_guid) ib_qib_sys_image_guid = ppd->guid; - strlcpy(ibdev->name, "qib%d", IB_DEVICE_NAME_MAX); ibdev->owner = THIS_MODULE; ibdev->node_guid = ppd->guid; ibdev->phys_port_cnt = dd->num_pports; diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 1ba84a78f1c5..b57784e30668 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -419,6 +419,19 @@ struct rvt_dev_info { }; +/** + * rvt_set_ibdev_name - Craft an IB device name from client info + * @rdi: pointer to the client rvt_dev_info structure + * @name: client specific name + * @unit: client specific unit number. + */ +static inline void rvt_set_ibdev_name(struct rvt_dev_info *rdi, + const char *fmt, const char *name, + const int unit) +{ + snprintf(rdi->ibdev.name, sizeof(rdi->ibdev.name), fmt, name, unit); +} + static inline struct rvt_pd *ibpd_to_rvtpd(struct ib_pd *ibpd) { return container_of(ibpd, struct rvt_pd, ibpd); -- cgit v1.2.3 From 06f2597f75c7ecd9006dd9711ec8c09b4f3d29e3 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Ruhl" Date: Mon, 18 Dec 2017 19:56:44 -0800 Subject: IB/{rdmavt, hfi1, qib}: Remove get_card_name() downcall rdmavt has a down call to client drivers to retrieve a crafted card name. This name should be the IB defined name. Rather than craft the name each time it is needed, simply retrieve the IB allocated name from the IB device. Update the function name to reflect its application. Clean up driver code to match this change. Reviewed-by: Mike Marciniszyn Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/driver.c | 8 -------- drivers/infiniband/hw/hfi1/hfi.h | 1 - drivers/infiniband/hw/hfi1/verbs.c | 1 - drivers/infiniband/hw/qib/qib.h | 1 - drivers/infiniband/hw/qib/qib_driver.c | 8 -------- drivers/infiniband/hw/qib/qib_verbs.c | 1 - drivers/infiniband/sw/rdmavt/trace.h | 4 ++-- drivers/infiniband/sw/rdmavt/vt.c | 1 - drivers/infiniband/sw/rdmavt/vt.h | 6 +++--- include/rdma/rdma_vt.h | 18 +++++++++++------- 10 files changed, 16 insertions(+), 33 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index 4f65ac671044..4561719096f2 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -167,14 +167,6 @@ const char *get_unit_name(int unit) return iname; } -const char *get_card_name(struct rvt_dev_info *rdi) -{ - struct hfi1_ibdev *ibdev = container_of(rdi, struct hfi1_ibdev, rdi); - struct hfi1_devdata *dd = container_of(ibdev, - struct hfi1_devdata, verbs_dev); - return get_unit_name(dd->unit); -} - struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi) { struct hfi1_ibdev *ibdev = container_of(rdi, struct hfi1_ibdev, rdi); diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 99921acd2ab4..0709aba7c964 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1972,7 +1972,6 @@ int get_platform_config_field(struct hfi1_devdata *dd, u32 *data, u32 len); const char *get_unit_name(int unit); -const char *get_card_name(struct rvt_dev_info *rdi); struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi); /* diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 4e68e3f3ce9f..b8776a362a91 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1890,7 +1890,6 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) * Fill in rvt info object. 
*/ dd->verbs_dev.rdi.driver_f.port_callback = hfi1_create_port_files; - dd->verbs_dev.rdi.driver_f.get_card_name = get_card_name; dd->verbs_dev.rdi.driver_f.get_pci_dev = get_pci_dev; dd->verbs_dev.rdi.driver_f.check_ah = hfi1_check_ah; dd->verbs_dev.rdi.driver_f.notify_new_ah = hfi1_notify_new_ah; diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index 092ed8103842..34c5254b803d 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -1429,7 +1429,6 @@ u64 qib_sps_ints(void); dma_addr_t qib_map_page(struct pci_dev *, struct page *, unsigned long, size_t, int); const char *qib_get_unit_name(int unit); -const char *qib_get_card_name(struct rvt_dev_info *rdi); struct pci_dev *qib_get_pci_dev(struct rvt_dev_info *rdi); /* diff --git a/drivers/infiniband/hw/qib/qib_driver.c b/drivers/infiniband/hw/qib/qib_driver.c index 33d3335385e8..9b13680836be 100644 --- a/drivers/infiniband/hw/qib/qib_driver.c +++ b/drivers/infiniband/hw/qib/qib_driver.c @@ -89,14 +89,6 @@ const char *qib_get_unit_name(int unit) return iname; } -const char *qib_get_card_name(struct rvt_dev_info *rdi) -{ - struct qib_ibdev *ibdev = container_of(rdi, struct qib_ibdev, rdi); - struct qib_devdata *dd = container_of(ibdev, - struct qib_devdata, verbs_dev); - return qib_get_unit_name(dd->unit); -} - struct pci_dev *qib_get_pci_dev(struct rvt_dev_info *rdi) { struct qib_ibdev *ibdev = container_of(rdi, struct qib_ibdev, rdi); diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index 373b80b1d677..fabee760407e 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1585,7 +1585,6 @@ int qib_register_ib_device(struct qib_devdata *dd) * Fill in rvt info object. */ dd->verbs_dev.rdi.driver_f.port_callback = qib_create_port_files; - dd->verbs_dev.rdi.driver_f.get_card_name = qib_get_card_name; dd->verbs_dev.rdi.driver_f.get_pci_dev = qib_get_pci_dev; dd->verbs_dev.rdi.driver_f.check_ah = qib_check_ah; dd->verbs_dev.rdi.driver_f.check_send_wqe = qib_check_send_wqe; diff --git a/drivers/infiniband/sw/rdmavt/trace.h b/drivers/infiniband/sw/rdmavt/trace.h index bb4b1e710f22..36ddbd291ee0 100644 --- a/drivers/infiniband/sw/rdmavt/trace.h +++ b/drivers/infiniband/sw/rdmavt/trace.h @@ -45,8 +45,8 @@ * */ -#define RDI_DEV_ENTRY(rdi) __string(dev, rdi->driver_f.get_card_name(rdi)) -#define RDI_DEV_ASSIGN(rdi) __assign_str(dev, rdi->driver_f.get_card_name(rdi)) +#define RDI_DEV_ENTRY(rdi) __string(dev, rvt_get_ibdev_name(rdi)) +#define RDI_DEV_ASSIGN(rdi) __assign_str(dev, rvt_get_ibdev_name(rdi)) #include "trace_rvt.h" #include "trace_qp.h" diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 64bdd442078a..088fb2d6d919 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -413,7 +413,6 @@ static noinline int check_support(struct rvt_dev_info *rdi, int verb) * required for rdmavt to function. */ if ((!rdi->driver_f.port_callback) || - (!rdi->driver_f.get_card_name) || (!rdi->driver_f.get_pci_dev)) return -EINVAL; break; diff --git a/drivers/infiniband/sw/rdmavt/vt.h b/drivers/infiniband/sw/rdmavt/vt.h index f363505312be..8823b2e7aac6 100644 --- a/drivers/infiniband/sw/rdmavt/vt.h +++ b/drivers/infiniband/sw/rdmavt/vt.h @@ -63,19 +63,19 @@ #define rvt_pr_info(rdi, fmt, ...) 
\ __rvt_pr_info(rdi->driver_f.get_pci_dev(rdi), \ - rdi->driver_f.get_card_name(rdi), \ + rvt_get_ibdev_name(rdi), \ fmt, \ ##__VA_ARGS__) #define rvt_pr_warn(rdi, fmt, ...) \ __rvt_pr_warn(rdi->driver_f.get_pci_dev(rdi), \ - rdi->driver_f.get_card_name(rdi), \ + rvt_get_ibdev_name(rdi), \ fmt, \ ##__VA_ARGS__) #define rvt_pr_err(rdi, fmt, ...) \ __rvt_pr_err(rdi->driver_f.get_pci_dev(rdi), \ - rdi->driver_f.get_card_name(rdi), \ + rvt_get_ibdev_name(rdi), \ fmt, \ ##__VA_ARGS__) diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index b57784e30668..4118324a0310 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -227,13 +227,6 @@ struct rvt_driver_provided { /* Passed to ib core registration. Callback to create syfs files */ int (*port_callback)(struct ib_device *, u8, struct kobject *); - /* - * Returns a string to represent the device for which is being - * registered. This is primarily used for error and debug messages on - * the console. - */ - const char * (*get_card_name)(struct rvt_dev_info *rdi); - /* * Returns a pointer to the undelying hardware's PCI device. This is * used to display information as to what hardware is being referenced @@ -432,6 +425,17 @@ static inline void rvt_set_ibdev_name(struct rvt_dev_info *rdi, snprintf(rdi->ibdev.name, sizeof(rdi->ibdev.name), fmt, name, unit); } +/** + * rvt_get_ibdev_name - return the IB name + * @rdi: rdmavt device + * + * Return the registered name of the device. + */ +static inline const char *rvt_get_ibdev_name(const struct rvt_dev_info *rdi) +{ + return rdi->ibdev.name; +} + static inline struct rvt_pd *ibpd_to_rvtpd(struct ib_pd *ibpd) { return container_of(ibpd, struct rvt_pd, ibpd); -- cgit v1.2.3 From f5b53b043470b28b60a9c50317e3f777a872b77c Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 18 Dec 2017 19:56:52 -0800 Subject: IB/rdmavt: Use correct numa node for SRQ allocation Normal receive queue allocation ensures that kernel receive queues are allocated on the local numa node. Shared receive queues do not behave the same way. Ensure that kernel shared receive queues are allocated on the device local node. Reviewed-by: Sebastian Sanchez Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/srq.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/srq.c b/drivers/infiniband/sw/rdmavt/srq.c index f7c48e9023de..3707952b4364 100644 --- a/drivers/infiniband/sw/rdmavt/srq.c +++ b/drivers/infiniband/sw/rdmavt/srq.c @@ -90,7 +90,7 @@ struct ib_srq *rvt_create_srq(struct ib_pd *ibpd, srq_init_attr->attr.max_wr > dev->dparms.props.max_srq_wr) return ERR_PTR(-EINVAL); - srq = kmalloc(sizeof(*srq), GFP_KERNEL); + srq = kzalloc_node(sizeof(*srq), GFP_KERNEL, dev->dparms.node); if (!srq) return ERR_PTR(-ENOMEM); @@ -101,7 +101,10 @@ struct ib_srq *rvt_create_srq(struct ib_pd *ibpd, srq->rq.max_sge = srq_init_attr->attr.max_sge; sz = sizeof(struct ib_sge) * srq->rq.max_sge + sizeof(struct rvt_rwqe); - srq->rq.wq = vmalloc_user(sizeof(struct rvt_rwq) + srq->rq.size * sz); + srq->rq.wq = udata ? + vmalloc_user(sizeof(struct rvt_rwq) + srq->rq.size * sz) : + vzalloc_node(sizeof(struct rvt_rwq) + srq->rq.size * sz, + dev->dparms.node); if (!srq->rq.wq) { ret = ERR_PTR(-ENOMEM); goto bail_srq; @@ -129,16 +132,12 @@ struct ib_srq *rvt_create_srq(struct ib_pd *ibpd, ret = ERR_PTR(err); goto bail_ip; } - } else { - srq->ip = NULL; } /* * ib_create_srq() will initialize srq->ibsrq. 
 */
 	spin_lock_init(&srq->rq.lock);
-	srq->rq.wq->head = 0;
-	srq->rq.wq->tail = 0;
 	srq->limit = srq_init_attr->attr.srq_limit;
 
 	spin_lock(&dev->n_srqs_lock);
@@ -200,7 +199,10 @@ int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 		sz = sizeof(struct rvt_rwqe) +
 			srq->rq.max_sge * sizeof(struct ib_sge);
 		size = attr->max_wr + 1;
-		wq = vmalloc_user(sizeof(struct rvt_rwq) + size * sz);
+		wq = udata ?
+			vmalloc_user(sizeof(struct rvt_rwq) + size * sz) :
+			vzalloc_node(sizeof(struct rvt_rwq) + size * sz,
+				     dev->dparms.node);
 		if (!wq)
 			return -ENOMEM;
-- 
cgit v1.2.3


From 9996b049f64125dbf05916e80a06546e92185f9f Mon Sep 17 00:00:00 2001
From: Sebastian Sanchez
Date: Mon, 18 Dec 2017 19:56:59 -0800
Subject: IB/hfi1: Fix infinite loop in 8051 command error path

When an 8051 command times out, the entire DC block is restarted. During
the restart, the host interface version bit is set, which calls
do_8051_command() recursively. The host version bit needs to be set
before the link moves into polling, so the host version bit can be set
in set_local_link_attributes() instead. Thus, the 8051 command functions
can be simplified, as the non-locking versions (under dd->dc8051_lock)
of those functions are no longer needed.

Fixes: 9be6a5d788b0 ("IB/hfi1: Prevent LNI out of sync by resetting host interface version")
Reviewed-by: Michael J. Ruhl
Signed-off-by: Sebastian Sanchez
Signed-off-by: Dennis Dalessandro
Signed-off-by: Doug Ledford
---
 drivers/infiniband/hw/hfi1/chip.c     | 82 +++++++++++++----------------
 drivers/infiniband/hw/hfi1/chip.h     |  2 +-
 drivers/infiniband/hw/hfi1/firmware.c | 64 +++++++--------------
 3 files changed, 46 insertions(+), 102 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index 4f057e8ffe50..ef0939c632c7 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -6518,11 +6518,12 @@ static void _dc_start(struct hfi1_devdata *dd)
 	if (!dd->dc_shutdown)
 		return;
 
-	/*
-	 * Take the 8051 out of reset, wait until 8051 is ready, and set host
-	 * version bit.
-	 */
-	release_and_wait_ready_8051_firmware(dd);
+	/* Take the 8051 out of reset */
+	write_csr(dd, DC_DC8051_CFG_RST, 0ull);
+	/* Wait until 8051 is ready */
+	if (wait_fm_ready(dd, TIMEOUT_8051_START))
+		dd_dev_err(dd, "%s: timeout starting 8051 firmware\n",
+			   __func__);
 
 	/* Take away reset for LCB and RX FPE (set in lcb_shutdown). */
 	write_csr(dd, DCC_CFG_RESET, 0x10);
@@ -8564,23 +8565,27 @@ int write_lcb_csr(struct hfi1_devdata *dd, u32 addr, u64 data)
 }
 
 /*
- * If the 8051 is in reset mode (dd->dc_shutdown == 1), this function
- * will still continue executing.
- *
  * Returns:
  *	< 0 = Linux error, not able to get access
  *	> 0 = 8051 command RETURN_CODE
  */
-static int _do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data,
-			    u64 *out_data)
+static int do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data,
+			   u64 *out_data)
 {
 	u64 reg, completed;
 	int return_code;
 	unsigned long timeout;
 
-	lockdep_assert_held(&dd->dc8051_lock);
 	hfi1_cdbg(DC8051, "type %d, data 0x%012llx", type, in_data);
 
+	mutex_lock(&dd->dc8051_lock);
+
+	/* We can't send any commands to the 8051 if it's in reset */
+	if (dd->dc_shutdown) {
+		return_code = -ENODEV;
+		goto fail;
+	}
+
 	/*
 	 * If an 8051 host command timed out previously, then the 8051 is
 	 * stuck.
@@ -8680,29 +8685,6 @@ static int _do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data, */ write_csr(dd, DC_DC8051_CFG_HOST_CMD_0, 0); -fail: - return return_code; -} - -/* - * Returns: - * < 0 = Linux error, not able to get access - * > 0 = 8051 command RETURN_CODE - */ -static int do_8051_command(struct hfi1_devdata *dd, u32 type, u64 in_data, - u64 *out_data) -{ - int return_code; - - mutex_lock(&dd->dc8051_lock); - /* We can't send any commands to the 8051 if it's in reset */ - if (dd->dc_shutdown) { - return_code = -ENODEV; - goto fail; - } - - return_code = _do_8051_command(dd, type, in_data, out_data); - fail: mutex_unlock(&dd->dc8051_lock); return return_code; @@ -8713,17 +8695,16 @@ static int set_physical_link_state(struct hfi1_devdata *dd, u64 state) return do_8051_command(dd, HCMD_CHANGE_PHY_STATE, state, NULL); } -static int _load_8051_config(struct hfi1_devdata *dd, u8 field_id, - u8 lane_id, u32 config_data) +int load_8051_config(struct hfi1_devdata *dd, u8 field_id, + u8 lane_id, u32 config_data) { u64 data; int ret; - lockdep_assert_held(&dd->dc8051_lock); data = (u64)field_id << LOAD_DATA_FIELD_ID_SHIFT | (u64)lane_id << LOAD_DATA_LANE_ID_SHIFT | (u64)config_data << LOAD_DATA_DATA_SHIFT; - ret = _do_8051_command(dd, HCMD_LOAD_CONFIG_DATA, data, NULL); + ret = do_8051_command(dd, HCMD_LOAD_CONFIG_DATA, data, NULL); if (ret != HCMD_SUCCESS) { dd_dev_err(dd, "load 8051 config: field id %d, lane %d, err %d\n", @@ -8732,18 +8713,6 @@ static int _load_8051_config(struct hfi1_devdata *dd, u8 field_id, return ret; } -int load_8051_config(struct hfi1_devdata *dd, u8 field_id, - u8 lane_id, u32 config_data) -{ - int return_code; - - mutex_lock(&dd->dc8051_lock); - return_code = _load_8051_config(dd, field_id, lane_id, config_data); - mutex_unlock(&dd->dc8051_lock); - - return return_code; -} - /* * Read the 8051 firmware "registers". Use the RAM directly. Always * set the result, even on error. @@ -8859,14 +8828,13 @@ int write_host_interface_version(struct hfi1_devdata *dd, u8 version) u32 frame; u32 mask; - lockdep_assert_held(&dd->dc8051_lock); mask = (HOST_INTERFACE_VERSION_MASK << HOST_INTERFACE_VERSION_SHIFT); read_8051_config(dd, RESERVED_REGISTERS, GENERAL_CONFIG, &frame); /* Clear, then set field */ frame &= ~mask; frame |= ((u32)version << HOST_INTERFACE_VERSION_SHIFT); - return _load_8051_config(dd, RESERVED_REGISTERS, GENERAL_CONFIG, - frame); + return load_8051_config(dd, RESERVED_REGISTERS, GENERAL_CONFIG, + frame); } void read_misc_status(struct hfi1_devdata *dd, u8 *ver_major, u8 *ver_minor, @@ -9270,6 +9238,14 @@ static int set_local_link_attributes(struct hfi1_pportdata *ppd) if (ret != HCMD_SUCCESS) goto set_local_link_attributes_fail; + ret = write_host_interface_version(dd, HOST_INTERFACE_VERSION); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to set host interface version, return 0x%x\n", + ret); + goto set_local_link_attributes_fail; + } + /* * DC supports continuous updates. 
*/ diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index 133e313feca4..21fca8ec5076 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -508,6 +508,7 @@ #define DOWN_REMOTE_REASON_SHIFT 16 #define DOWN_REMOTE_REASON_MASK 0xff +#define HOST_INTERFACE_VERSION 1 #define HOST_INTERFACE_VERSION_SHIFT 16 #define HOST_INTERFACE_VERSION_MASK 0xff @@ -713,7 +714,6 @@ void read_misc_status(struct hfi1_devdata *dd, u8 *ver_major, u8 *ver_minor, u8 *ver_patch); int write_host_interface_version(struct hfi1_devdata *dd, u8 version); void read_guid(struct hfi1_devdata *dd); -int release_and_wait_ready_8051_firmware(struct hfi1_devdata *dd); int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout); void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason, u8 neigh_reason, u8 rem_reason); diff --git a/drivers/infiniband/hw/hfi1/firmware.c b/drivers/infiniband/hw/hfi1/firmware.c index 98868df78a7e..2b57ba70ddd6 100644 --- a/drivers/infiniband/hw/hfi1/firmware.c +++ b/drivers/infiniband/hw/hfi1/firmware.c @@ -68,7 +68,6 @@ #define ALT_FW_FABRIC_NAME "hfi1_fabric_d.fw" #define ALT_FW_SBUS_NAME "hfi1_sbus_d.fw" #define ALT_FW_PCIE_NAME "hfi1_pcie_d.fw" -#define HOST_INTERFACE_VERSION 1 MODULE_FIRMWARE(DEFAULT_FW_8051_NAME_ASIC); MODULE_FIRMWARE(DEFAULT_FW_FABRIC_NAME); @@ -975,46 +974,6 @@ int wait_fm_ready(struct hfi1_devdata *dd, u32 mstimeout) } } -/* - * Clear all reset bits, releasing the 8051. - * Wait for firmware to be ready to accept host requests. - * Then, set host version bit. - * - * This function executes even if the 8051 is in reset mode when - * dd->dc_shutdown == 1. - * - * Expects dd->dc8051_lock to be held. - */ -int release_and_wait_ready_8051_firmware(struct hfi1_devdata *dd) -{ - int ret; - - lockdep_assert_held(&dd->dc8051_lock); - /* clear all reset bits, releasing the 8051 */ - write_csr(dd, DC_DC8051_CFG_RST, 0ull); - - /* - * Wait for firmware to be ready to accept host - * requests. - */ - ret = wait_fm_ready(dd, TIMEOUT_8051_START); - if (ret) { - dd_dev_err(dd, "8051 start timeout, current FW state 0x%x\n", - get_firmware_state(dd)); - return ret; - } - - ret = write_host_interface_version(dd, HOST_INTERFACE_VERSION); - if (ret != HCMD_SUCCESS) { - dd_dev_err(dd, - "Failed to set host interface version, return 0x%x\n", - ret); - return -EIO; - } - - return 0; -} - /* * Load the 8051 firmware. */ @@ -1080,22 +1039,31 @@ static int load_8051_firmware(struct hfi1_devdata *dd, if (ret) return ret; + /* clear all reset bits, releasing the 8051 */ + write_csr(dd, DC_DC8051_CFG_RST, 0ull); + /* - * Clear all reset bits, releasing the 8051. * DC reset step 5. Wait for firmware to be ready to accept host * requests. - * Then, set host version bit. 
*/ - mutex_lock(&dd->dc8051_lock); - ret = release_and_wait_ready_8051_firmware(dd); - mutex_unlock(&dd->dc8051_lock); - if (ret) - return ret; + ret = wait_fm_ready(dd, TIMEOUT_8051_START); + if (ret) { /* timed out */ + dd_dev_err(dd, "8051 start timeout, current state 0x%x\n", + get_firmware_state(dd)); + return -ETIMEDOUT; + } read_misc_status(dd, &ver_major, &ver_minor, &ver_patch); dd_dev_info(dd, "8051 firmware version %d.%d.%d\n", (int)ver_major, (int)ver_minor, (int)ver_patch); dd->dc8051_ver = dc8051_ver(ver_major, ver_minor, ver_patch); + ret = write_host_interface_version(dd, HOST_INTERFACE_VERSION); + if (ret != HCMD_SUCCESS) { + dd_dev_err(dd, + "Failed to set host interface version, return 0x%x\n", + ret); + return -EIO; + } return 0; } -- cgit v1.2.3 From db9a2c6f9b6196b889b98e961cb9a37617b11ccf Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 18 Dec 2017 19:57:06 -0800 Subject: IB/rdmavt: Allocate CQ memory on the correct node CQ allocation does not ensure that completion queue entries and the completion queue structure are allocated on the correct numa node. Fix by allocating the rvt_cq and kernel CQ entries on the device node, leaving the user CQ entries on the default local node. Also ensure CQ resizes use the correct allocator when extending a CQ. Reviewed-by: Sebastian Sanchez Signed-off-by: Mike Marciniszyn Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/cq.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index 97d71e49c092..88fa4d44ab5f 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -198,7 +198,7 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, return ERR_PTR(-EINVAL); /* Allocate the completion queue structure. */ - cq = kzalloc(sizeof(*cq), GFP_KERNEL); + cq = kzalloc_node(sizeof(*cq), GFP_KERNEL, rdi->dparms.node); if (!cq) return ERR_PTR(-ENOMEM); @@ -214,7 +214,9 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev, sz += sizeof(struct ib_uverbs_wc) * (entries + 1); else sz += sizeof(struct ib_wc) * (entries + 1); - wc = vmalloc_user(sz); + wc = udata ? + vmalloc_user(sz) : + vzalloc_node(sz, rdi->dparms.node); if (!wc) { ret = ERR_PTR(-ENOMEM); goto bail_cq; @@ -369,7 +371,9 @@ int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) sz += sizeof(struct ib_uverbs_wc) * (cqe + 1); else sz += sizeof(struct ib_wc) * (cqe + 1); - wc = vmalloc_user(sz); + wc = udata ? + vmalloc_user(sz) : + vzalloc_node(sz, rdi->dparms.node); if (!wc) return -ENOMEM; -- cgit v1.2.3 From 11f0e89710ebea7d1086cbe9ab1270c96677f1dc Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Mon, 18 Dec 2017 19:57:21 -0800 Subject: IB/{hfi1, qib}: Fix a concurrency issue with device name in logging The get_unit_name() function crafts a string based on the device name and the device unit number. It then stores this in a static variable. This has concurrency issues as can be seen with this log: hfi1 0000:02:00.0: hfi1_1: read_idle_message: read idle message 0x203 hfi1 0000:01:00.0: hfi1_1: read_idle_message: read idle message 0x203 The PCI device ID (0000:02:00.0 vs. 0000:01:00.0) is correct for the message, but the device string hfi1_1 is incorrect (it should be hfi1_0 for the second log message). Remove get_unit_name() function. Instead, use the rvt accessor rvt_get_ibdev_name() to get the IB name string. Clean up any hfi1_early_xx calls that can now use the new path. 
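For illustration (this interleaving is not part of the patch), the shared
static buffer permits the following hazard on a multi-device system:

    /*
     * Hazard sketch (illustrative only):
     *
     *   CPU0: get_unit_name(0)  -> writes "hfi1_0" into the static iname
     *   CPU1: get_unit_name(1)  -> overwrites iname with "hfi1_1"
     *   CPU0: dev_err(...)      -> prints "hfi1_1" for unit 0's device
     */

rvt_get_ibdev_name() returns the name kept in the per-device rvt
structure, so no shared scratch buffer is involved.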
QIB has the same (qib_get_unit_name()) issue. Updating as necessary. Remove qib_get_unit_name() function. Update log message that has redundant device name. Reviewed-by: Mike Marciniszyn Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 5 ++--- drivers/infiniband/hw/hfi1/driver.c | 8 -------- drivers/infiniband/hw/hfi1/hfi.h | 22 ++++++++++++---------- drivers/infiniband/hw/qib/qib.h | 7 +++---- drivers/infiniband/hw/qib/qib_driver.c | 8 -------- drivers/infiniband/hw/qib/qib_eeprom.c | 3 +-- 6 files changed, 18 insertions(+), 35 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index ef0939c632c7..6660f920f42e 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -14920,9 +14920,8 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, if (num_vls < HFI1_MIN_VLS_SUPPORTED || num_vls > HFI1_MAX_VLS_SUPPORTED) { - hfi1_early_err(&pdev->dev, - "Invalid num_vls %u, using %u VLs\n", - num_vls, HFI1_MAX_VLS_SUPPORTED); + dd_dev_err(dd, "Invalid num_vls %u, using %u VLs\n", + num_vls, HFI1_MAX_VLS_SUPPORTED); num_vls = HFI1_MAX_VLS_SUPPORTED; } ppd->vls_supported = num_vls; diff --git a/drivers/infiniband/hw/hfi1/driver.c b/drivers/infiniband/hw/hfi1/driver.c index 4561719096f2..067b29f35f21 100644 --- a/drivers/infiniband/hw/hfi1/driver.c +++ b/drivers/infiniband/hw/hfi1/driver.c @@ -159,14 +159,6 @@ static int hfi1_caps_get(char *buffer, const struct kernel_param *kp) return scnprintf(buffer, PAGE_SIZE, "0x%lx", cap_mask); } -const char *get_unit_name(int unit) -{ - static char iname[16]; - - snprintf(iname, sizeof(iname), DRIVER_NAME "_%u", unit); - return iname; -} - struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi) { struct hfi1_ibdev *ibdev = container_of(rdi, struct hfi1_ibdev, rdi); diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 0709aba7c964..b42c22292597 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1971,7 +1971,6 @@ int get_platform_config_field(struct hfi1_devdata *dd, table_type, int table_index, int field_index, u32 *data, u32 len); -const char *get_unit_name(int unit); struct pci_dev *get_pci_dev(struct rvt_dev_info *rdi); /* @@ -2121,39 +2120,42 @@ static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd) #define dd_dev_emerg(dd, fmt, ...) \ dev_emerg(&(dd)->pcidev->dev, "%s: " fmt, \ - get_unit_name((dd)->unit), ##__VA_ARGS__) + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) #define dd_dev_err(dd, fmt, ...) \ dev_err(&(dd)->pcidev->dev, "%s: " fmt, \ - get_unit_name((dd)->unit), ##__VA_ARGS__) + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) #define dd_dev_err_ratelimited(dd, fmt, ...) \ dev_err_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \ - get_unit_name((dd)->unit), ##__VA_ARGS__) + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), \ + ##__VA_ARGS__) #define dd_dev_warn(dd, fmt, ...) \ dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \ - get_unit_name((dd)->unit), ##__VA_ARGS__) + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) #define dd_dev_warn_ratelimited(dd, fmt, ...) \ dev_warn_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \ - get_unit_name((dd)->unit), ##__VA_ARGS__) + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), \ + ##__VA_ARGS__) #define dd_dev_info(dd, fmt, ...) 
\ dev_info(&(dd)->pcidev->dev, "%s: " fmt, \ - get_unit_name((dd)->unit), ##__VA_ARGS__) + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) #define dd_dev_info_ratelimited(dd, fmt, ...) \ dev_info_ratelimited(&(dd)->pcidev->dev, "%s: " fmt, \ - get_unit_name((dd)->unit), ##__VA_ARGS__) + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), \ + ##__VA_ARGS__) #define dd_dev_dbg(dd, fmt, ...) \ dev_dbg(&(dd)->pcidev->dev, "%s: " fmt, \ - get_unit_name((dd)->unit), ##__VA_ARGS__) + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) #define hfi1_dev_porterr(dd, port, fmt, ...) \ dev_err(&(dd)->pcidev->dev, "%s: port %u: " fmt, \ - get_unit_name((dd)->unit), (port), ##__VA_ARGS__) + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), (port), ##__VA_ARGS__) /* * this is used for formatting hw error messages... diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index 34c5254b803d..0235f76bbc72 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -1428,7 +1428,6 @@ u64 qib_sps_ints(void); */ dma_addr_t qib_map_page(struct pci_dev *, struct page *, unsigned long, size_t, int); -const char *qib_get_unit_name(int unit); struct pci_dev *qib_get_pci_dev(struct rvt_dev_info *rdi); /* @@ -1487,15 +1486,15 @@ extern struct mutex qib_mutex; #define qib_dev_err(dd, fmt, ...) \ dev_err(&(dd)->pcidev->dev, "%s: " fmt, \ - qib_get_unit_name((dd)->unit), ##__VA_ARGS__) + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) #define qib_dev_warn(dd, fmt, ...) \ dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \ - qib_get_unit_name((dd)->unit), ##__VA_ARGS__) + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) #define qib_dev_porterr(dd, port, fmt, ...) \ dev_err(&(dd)->pcidev->dev, "%s: IB%u:%u " fmt, \ - qib_get_unit_name((dd)->unit), (dd)->unit, (port), \ + rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), (dd)->unit, (port), \ ##__VA_ARGS__) #define qib_devinfo(pcidev, fmt, ...) \ diff --git a/drivers/infiniband/hw/qib/qib_driver.c b/drivers/infiniband/hw/qib/qib_driver.c index 9b13680836be..3117cc5f2a9a 100644 --- a/drivers/infiniband/hw/qib/qib_driver.c +++ b/drivers/infiniband/hw/qib/qib_driver.c @@ -81,14 +81,6 @@ MODULE_DESCRIPTION("Intel IB driver"); struct qlogic_ib_stats qib_stats; -const char *qib_get_unit_name(int unit) -{ - static char iname[16]; - - snprintf(iname, sizeof(iname), "infinipath%u", unit); - return iname; -} - struct pci_dev *qib_get_pci_dev(struct rvt_dev_info *rdi) { struct qib_ibdev *ibdev = container_of(rdi, struct qib_ibdev, rdi); diff --git a/drivers/infiniband/hw/qib/qib_eeprom.c b/drivers/infiniband/hw/qib/qib_eeprom.c index 33a2e74c8495..5838b3bf34b9 100644 --- a/drivers/infiniband/hw/qib/qib_eeprom.c +++ b/drivers/infiniband/hw/qib/qib_eeprom.c @@ -163,8 +163,7 @@ void qib_get_eeprom_info(struct qib_devdata *dd) if (bguid[6] == 0xff) { if (bguid[5] == 0xff) { qib_dev_err(dd, - "Can't set %s GUID from base, wraps to OUI!\n", - qib_get_unit_name(t)); + "Can't set GUID from base, wraps to OUI!\n"); dd->base_guid = 0; goto bail; } -- cgit v1.2.3 From 57f6b6639afab6cda27851e485bdc244462ba135 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Mon, 18 Dec 2017 19:57:28 -0800 Subject: IB/rdmavt: Add trace for RNRNAK timer This patch adds static trace for RNRNAK timer. Currently the output from hrtimer static trace only shows the addresses of hrtimers in the system and there is no easy way to correlate an RNRNAK timer with its entries in the hrtimer trace. 
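For reference, an entry emitted by the tracepoints below (format per this
patch's TP_printk; the values shown are purely illustrative) would look
like:

    [hfi1_0] qpn 0x10 hrtimer 0xffff9a4bd34b1d18 s_flags 0x400 timeout 491 us

The hrtimer address can then be matched against the hrtimer_start and
hrtimer_expire_entry events recorded for the same timer.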
This patch adds the correlation among a QP, its RNRNAK timer, and its entries in the hrtimer trace. This correlation will be enormously helpful when debugging RNRNAK related issues. In addition, this patch cleans up rvt_stop_rnr_timer() to be void while here. Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rdmavt/qp.c | 11 +++++---- drivers/infiniband/sw/rdmavt/trace_qp.h | 42 +++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 73705a19bd2e..0bbf20597056 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -2075,6 +2075,7 @@ void rvt_add_rnr_timer(struct rvt_qp *qp, u32 aeth) lockdep_assert_held(&qp->s_lock); qp->s_flags |= RVT_S_WAIT_RNR; to = rvt_aeth_to_usec(aeth); + trace_rvt_rnrnak_add(qp, to); hrtimer_start(&qp->s_rnr_timer, ns_to_ktime(1000 * to), HRTIMER_MODE_REL); } @@ -2104,15 +2105,14 @@ EXPORT_SYMBOL(rvt_stop_rc_timers); * stop an rnr timer and return if the timer * had been pending. */ -static int rvt_stop_rnr_timer(struct rvt_qp *qp) +static void rvt_stop_rnr_timer(struct rvt_qp *qp) { - int rval = 0; - lockdep_assert_held(&qp->s_lock); /* Remove QP from rnr timer */ - if (qp->s_flags & RVT_S_WAIT_RNR) + if (qp->s_flags & RVT_S_WAIT_RNR) { qp->s_flags &= ~RVT_S_WAIT_RNR; - return rval; + trace_rvt_rnrnak_stop(qp, 0); + } } /** @@ -2165,6 +2165,7 @@ enum hrtimer_restart rvt_rc_rnr_retry(struct hrtimer *t) spin_lock_irqsave(&qp->s_lock, flags); rvt_stop_rnr_timer(qp); + trace_rvt_rnrnak_timeout(qp, 0); rdi->driver_f.schedule_send(qp); spin_unlock_irqrestore(&qp->s_lock, flags); return HRTIMER_NORESTART; diff --git a/drivers/infiniband/sw/rdmavt/trace_qp.h b/drivers/infiniband/sw/rdmavt/trace_qp.h index 4c77a3119bda..efc9d814b032 100644 --- a/drivers/infiniband/sw/rdmavt/trace_qp.h +++ b/drivers/infiniband/sw/rdmavt/trace_qp.h @@ -85,6 +85,48 @@ DEFINE_EVENT(rvt_qphash_template, rvt_qpremove, TP_PROTO(struct rvt_qp *qp, u32 bucket), TP_ARGS(qp, bucket)); +DECLARE_EVENT_CLASS( + rvt_rnrnak_template, + TP_PROTO(struct rvt_qp *qp, u32 to), + TP_ARGS(qp, to), + TP_STRUCT__entry( + RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device)) + __field(u32, qpn) + __field(void *, hrtimer) + __field(u32, s_flags) + __field(u32, to) + ), + TP_fast_assign( + RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)) + __entry->qpn = qp->ibqp.qp_num; + __entry->hrtimer = &qp->s_rnr_timer; + __entry->s_flags = qp->s_flags; + __entry->to = to; + ), + TP_printk( + "[%s] qpn 0x%x hrtimer 0x%p s_flags 0x%x timeout %u us", + __get_str(dev), + __entry->qpn, + __entry->hrtimer, + __entry->s_flags, + __entry->to + ) +); + +DEFINE_EVENT( + rvt_rnrnak_template, rvt_rnrnak_add, + TP_PROTO(struct rvt_qp *qp, u32 to), + TP_ARGS(qp, to)); + +DEFINE_EVENT( + rvt_rnrnak_template, rvt_rnrnak_timeout, + TP_PROTO(struct rvt_qp *qp, u32 to), + TP_ARGS(qp, to)); + +DEFINE_EVENT( + rvt_rnrnak_template, rvt_rnrnak_stop, + TP_PROTO(struct rvt_qp *qp, u32 to), + TP_ARGS(qp, to)); #endif /* __RVT_TRACE_QP_H */ -- cgit v1.2.3 From 106b8863069e0fb31468ddd6ffcd162cc8430280 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Sun, 24 Dec 2017 18:39:40 +0800 Subject: i40iw: Replace mdelay with msleep in i40iw_wait_pe_ready i40iw_wait_pe_ready is not called in an interrupt handler nor holding a spinlock. The function mdelay in it can be replaced with msleep, to reduce busy wait. 
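The rule of thumb behind the change (sketch, not part of the patch):

    /*
     * mdelay(ms) - busy-spins for ms milliseconds; needed in atomic
     *              context (interrupt handlers, under spinlocks) where
     *              sleeping is forbidden, but it wastes the CPU.
     * msleep(ms) - puts the task to sleep so the scheduler can run
     *              other work; legal only in process context, which
     *              i40iw_wait_pe_ready is.
     */
    msleep(1000);    /* one-second poll interval without burning the CPU */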
Signed-off-by: Jia-Ju Bai Reviewed-by: Leon Romanovsky Acked-by: Shiraz Saleem Signed-off-by: Doug Ledford --- drivers/infiniband/hw/i40iw/i40iw_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_main.c b/drivers/infiniband/hw/i40iw/i40iw_main.c index 8a9815e43282..b08862978de8 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_main.c +++ b/drivers/infiniband/hw/i40iw/i40iw_main.c @@ -1290,7 +1290,7 @@ static void i40iw_wait_pe_ready(struct i40iw_hw *hw) __LINE__, statuscpu2); if ((statuscpu0 == 0x80) && (statuscpu1 == 0x80) && (statuscpu2 == 0x80)) break; /* SUCCESS */ - mdelay(1000); + msleep(1000); retrycount++; } while (retrycount < 14); i40iw_wr32(hw, 0xb4040, 0x4C104C5); -- cgit v1.2.3 From e2dda36855bf21b4ccca42970f7216280ddc16a7 Mon Sep 17 00:00:00 2001 From: Hans Westgaard Ry Date: Tue, 2 Jan 2018 14:50:40 +0100 Subject: RDMA/core: Add encode/decode FDR/EDR rates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cases for FDR/EDR signalling speed, were missing in ib_rate_to_mult and mult_to_ib_rate giving wrong return values when drivers convert static rate to/from inter-packet-delay. Signed-off-by: Hans Westgaard Ry Reviewed-by: Håkon Bugge Signed-off-by: Doug Ledford --- drivers/infiniband/core/verbs.c | 56 ++++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index fe72fb303b01..a1bcdacb7f40 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -124,16 +124,24 @@ EXPORT_SYMBOL(ib_wc_status_msg); __attribute_const__ int ib_rate_to_mult(enum ib_rate rate) { switch (rate) { - case IB_RATE_2_5_GBPS: return 1; - case IB_RATE_5_GBPS: return 2; - case IB_RATE_10_GBPS: return 4; - case IB_RATE_20_GBPS: return 8; - case IB_RATE_30_GBPS: return 12; - case IB_RATE_40_GBPS: return 16; - case IB_RATE_60_GBPS: return 24; - case IB_RATE_80_GBPS: return 32; - case IB_RATE_120_GBPS: return 48; - default: return -1; + case IB_RATE_2_5_GBPS: return 1; + case IB_RATE_5_GBPS: return 2; + case IB_RATE_10_GBPS: return 4; + case IB_RATE_20_GBPS: return 8; + case IB_RATE_30_GBPS: return 12; + case IB_RATE_40_GBPS: return 16; + case IB_RATE_60_GBPS: return 24; + case IB_RATE_80_GBPS: return 32; + case IB_RATE_120_GBPS: return 48; + case IB_RATE_14_GBPS: return 6; + case IB_RATE_56_GBPS: return 22; + case IB_RATE_112_GBPS: return 45; + case IB_RATE_168_GBPS: return 67; + case IB_RATE_25_GBPS: return 10; + case IB_RATE_100_GBPS: return 40; + case IB_RATE_200_GBPS: return 80; + case IB_RATE_300_GBPS: return 120; + default: return -1; } } EXPORT_SYMBOL(ib_rate_to_mult); @@ -141,16 +149,24 @@ EXPORT_SYMBOL(ib_rate_to_mult); __attribute_const__ enum ib_rate mult_to_ib_rate(int mult) { switch (mult) { - case 1: return IB_RATE_2_5_GBPS; - case 2: return IB_RATE_5_GBPS; - case 4: return IB_RATE_10_GBPS; - case 8: return IB_RATE_20_GBPS; - case 12: return IB_RATE_30_GBPS; - case 16: return IB_RATE_40_GBPS; - case 24: return IB_RATE_60_GBPS; - case 32: return IB_RATE_80_GBPS; - case 48: return IB_RATE_120_GBPS; - default: return IB_RATE_PORT_CURRENT; + case 1: return IB_RATE_2_5_GBPS; + case 2: return IB_RATE_5_GBPS; + case 4: return IB_RATE_10_GBPS; + case 8: return IB_RATE_20_GBPS; + case 12: return IB_RATE_30_GBPS; + case 16: return IB_RATE_40_GBPS; + case 24: return IB_RATE_60_GBPS; + case 32: return IB_RATE_80_GBPS; + case 48: return IB_RATE_120_GBPS; + case 6: return 
IB_RATE_14_GBPS; + case 22: return IB_RATE_56_GBPS; + case 45: return IB_RATE_112_GBPS; + case 67: return IB_RATE_168_GBPS; + case 10: return IB_RATE_25_GBPS; + case 40: return IB_RATE_100_GBPS; + case 80: return IB_RATE_200_GBPS; + case 120: return IB_RATE_300_GBPS; + default: return IB_RATE_PORT_CURRENT; } } EXPORT_SYMBOL(mult_to_ib_rate); -- cgit v1.2.3 From 8fc12d94ee997625dbae102e597ac0f4a21f2142 Mon Sep 17 00:00:00 2001 From: Jonathan Toppins Date: Thu, 4 Jan 2018 16:04:13 -0500 Subject: bnxt_re: report RoCE device support at info level Reporting that a device doesn't support RoCE seems like a valuable piece of information to have when trying to determine why a driver is not binding to a device. Better to report this at info log level instead of requiring a user to enable all debug messages in the driver. Signed-off-by: Jonathan Toppins Acked-By: Devesh Sharma Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index aafc19aa5de1..bf268bf1f496 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -417,7 +417,7 @@ static struct bnxt_en_dev *bnxt_re_dev_probe(struct net_device *netdev) return ERR_PTR(-EINVAL); if (!(en_dev->flags & BNXT_EN_FLAG_ROCE_CAP)) { - dev_dbg(&pdev->dev, + dev_info(&pdev->dev, "%s: probe error: RoCE is not supported on this device", ROCE_DRV_MODULE_NAME); return ERR_PTR(-ENODEV); -- cgit v1.2.3 From 57cda166bbe045151d46b2d1133fdf4afccb90ed Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 2 Jan 2018 16:19:28 +0200 Subject: net/mlx5: Add DCT command interface Add a missing command interface to work with a DCT. It includes: creating, destroying and get events for. 
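A minimal caller sketch of the new interface (function signatures as
introduced below; context setup and error handling elided):

    u32 in[MLX5_ST_SZ_DW(create_dct_in)] = {0};
    struct mlx5_core_dct dct = {};
    int err;

    /* ... fill the DCT context fields in 'in' ... */
    err = mlx5_core_create_dct(mdev, &dct, in, sizeof(in));
    if (err)
        return err;

    /* teardown drains the DCT and waits for the DCT_DRAINED event */
    err = mlx5_core_destroy_dct(mdev, &dct);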
Signed-off-by: Moni Shoua Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 9 +- drivers/net/ethernet/mellanox/mlx5/core/qp.c | 125 ++++++++++++++++++++++++--- include/linux/mlx5/device.h | 9 ++ include/linux/mlx5/driver.h | 8 ++ include/linux/mlx5/qp.h | 12 +++ 5 files changed, 150 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 60771865c99c..7d3d503fa675 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -417,7 +417,11 @@ static irqreturn_t mlx5_eq_int(int irq, void *eq_ptr) cqn = be32_to_cpu(eqe->data.comp.cqn) & 0xffffff; mlx5_cq_completion(dev, cqn); break; - + case MLX5_EVENT_TYPE_DCT_DRAINED: + rsn = be32_to_cpu(eqe->data.dct.dctn) & 0xffffff; + rsn |= (MLX5_RES_DCT << MLX5_USER_INDEX_LEN); + mlx5_rsc_event(dev, rsn, eqe->type); + break; case MLX5_EVENT_TYPE_PATH_MIG: case MLX5_EVENT_TYPE_COMM_EST: case MLX5_EVENT_TYPE_SQ_DRAINED: @@ -715,6 +719,9 @@ int mlx5_start_eqs(struct mlx5_core_dev *dev) if (MLX5_CAP_GEN(dev, fpga)) async_event_mask |= (1ull << MLX5_EVENT_TYPE_FPGA_ERROR); + if (MLX5_CAP_GEN_MAX(dev, dct)) + async_event_mask |= (1ull << MLX5_EVENT_TYPE_DCT_DRAINED); + err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD, MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index db9e665ab104..0f5ddd22927d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -98,6 +98,11 @@ static u64 sq_allowed_event_types(void) return BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR); } +static u64 dct_allowed_event_types(void) +{ + return BIT(MLX5_EVENT_TYPE_DCT_DRAINED); +} + static bool is_event_type_allowed(int rsc_type, int event_type) { switch (rsc_type) { @@ -107,6 +112,8 @@ static bool is_event_type_allowed(int rsc_type, int event_type) return BIT(event_type) & rq_allowed_event_types(); case MLX5_EVENT_QUEUE_TYPE_SQ: return BIT(event_type) & sq_allowed_event_types(); + case MLX5_EVENT_QUEUE_TYPE_DCT: + return BIT(event_type) & dct_allowed_event_types(); default: WARN(1, "Event arrived for unknown resource type"); return false; @@ -116,6 +123,7 @@ static bool is_event_type_allowed(int rsc_type, int event_type) void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type) { struct mlx5_core_rsc_common *common = mlx5_get_rsc(dev, rsn); + struct mlx5_core_dct *dct; struct mlx5_core_qp *qp; if (!common) @@ -134,7 +142,11 @@ void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type) qp = (struct mlx5_core_qp *)common; qp->event(qp, event_type); break; - + case MLX5_RES_DCT: + dct = (struct mlx5_core_dct *)common; + if (event_type == MLX5_EVENT_TYPE_DCT_DRAINED) + complete(&dct->drained); + break; default: mlx5_core_warn(dev, "invalid resource type for 0x%x\n", rsn); } @@ -142,9 +154,9 @@ void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type) mlx5_core_put_rsc(common); } -static int create_qprqsq_common(struct mlx5_core_dev *dev, - struct mlx5_core_qp *qp, - int rsc_type) +static int create_resource_common(struct mlx5_core_dev *dev, + struct mlx5_core_qp *qp, + int rsc_type) { struct mlx5_qp_table *table = &dev->priv.qp_table; int err; @@ -165,8 +177,8 @@ static int create_qprqsq_common(struct mlx5_core_dev *dev, return 0; } -static void 
destroy_qprqsq_common(struct mlx5_core_dev *dev, - struct mlx5_core_qp *qp) +static void destroy_resource_common(struct mlx5_core_dev *dev, + struct mlx5_core_qp *qp) { struct mlx5_qp_table *table = &dev->priv.qp_table; unsigned long flags; @@ -179,6 +191,40 @@ static void destroy_qprqsq_common(struct mlx5_core_dev *dev, wait_for_completion(&qp->common.free); } +int mlx5_core_create_dct(struct mlx5_core_dev *dev, + struct mlx5_core_dct *dct, + u32 *in, int inlen) +{ + u32 out[MLX5_ST_SZ_DW(create_dct_out)] = {0}; + u32 din[MLX5_ST_SZ_DW(destroy_dct_in)] = {0}; + u32 dout[MLX5_ST_SZ_DW(destroy_dct_out)] = {0}; + struct mlx5_core_qp *qp = &dct->mqp; + int err; + + init_completion(&dct->drained); + MLX5_SET(create_dct_in, in, opcode, MLX5_CMD_OP_CREATE_DCT); + + err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out)); + if (err) { + mlx5_core_warn(dev, "create DCT failed, ret %d\n", err); + return err; + } + + qp->qpn = MLX5_GET(create_dct_out, out, dctn); + err = create_resource_common(dev, qp, MLX5_RES_DCT); + if (err) + goto err_cmd; + + return 0; +err_cmd: + MLX5_SET(destroy_dct_in, din, opcode, MLX5_CMD_OP_DESTROY_DCT); + MLX5_SET(destroy_dct_in, din, dctn, qp->qpn); + mlx5_cmd_exec(dev, (void *)&in, sizeof(din), + (void *)&out, sizeof(dout)); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_create_dct); + int mlx5_core_create_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, u32 *in, int inlen) @@ -197,7 +243,7 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev, qp->qpn = MLX5_GET(create_qp_out, out, qpn); mlx5_core_dbg(dev, "qpn = 0x%x\n", qp->qpn); - err = create_qprqsq_common(dev, qp, MLX5_RES_QP); + err = create_resource_common(dev, qp, MLX5_RES_QP); if (err) goto err_cmd; @@ -220,6 +266,47 @@ err_cmd: } EXPORT_SYMBOL_GPL(mlx5_core_create_qp); +static int mlx5_core_drain_dct(struct mlx5_core_dev *dev, + struct mlx5_core_dct *dct) +{ + u32 out[MLX5_ST_SZ_DW(drain_dct_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(drain_dct_in)] = {0}; + struct mlx5_core_qp *qp = &dct->mqp; + + MLX5_SET(drain_dct_in, in, opcode, MLX5_CMD_OP_DRAIN_DCT); + MLX5_SET(drain_dct_in, in, dctn, qp->qpn); + return mlx5_cmd_exec(dev, (void *)&in, sizeof(in), + (void *)&out, sizeof(out)); +} + +int mlx5_core_destroy_dct(struct mlx5_core_dev *dev, + struct mlx5_core_dct *dct) +{ + u32 out[MLX5_ST_SZ_DW(destroy_dct_out)] = {0}; + u32 in[MLX5_ST_SZ_DW(destroy_dct_in)] = {0}; + struct mlx5_core_qp *qp = &dct->mqp; + int err; + + err = mlx5_core_drain_dct(dev, dct); + if (err) { + if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) { + goto destroy; + } else { + mlx5_core_warn(dev, "failed drain DCT 0x%x with error 0x%x\n", qp->qpn, err); + return err; + } + } + wait_for_completion(&dct->drained); +destroy: + destroy_resource_common(dev, &dct->mqp); + MLX5_SET(destroy_dct_in, in, opcode, MLX5_CMD_OP_DESTROY_DCT); + MLX5_SET(destroy_dct_in, in, dctn, qp->qpn); + err = mlx5_cmd_exec(dev, (void *)&in, sizeof(in), + (void *)&out, sizeof(out)); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_core_destroy_dct); + int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp) { @@ -229,7 +316,7 @@ int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, mlx5_debug_qp_remove(dev, qp); - destroy_qprqsq_common(dev, qp); + destroy_resource_common(dev, qp); MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP); MLX5_SET(destroy_qp_in, in, qpn, qp->qpn); @@ -405,6 +492,20 @@ int mlx5_core_qp_query(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, } EXPORT_SYMBOL_GPL(mlx5_core_qp_query); +int mlx5_core_dct_query(struct 
mlx5_core_dev *dev, struct mlx5_core_dct *dct, + u32 *out, int outlen) +{ + u32 in[MLX5_ST_SZ_DW(query_dct_in)] = {0}; + struct mlx5_core_qp *qp = &dct->mqp; + + MLX5_SET(query_dct_in, in, opcode, MLX5_CMD_OP_QUERY_DCT); + MLX5_SET(query_dct_in, in, dctn, qp->qpn); + + return mlx5_cmd_exec(dev, (void *)&in, sizeof(in), + (void *)out, outlen); +} +EXPORT_SYMBOL_GPL(mlx5_core_dct_query); + int mlx5_core_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn) { u32 out[MLX5_ST_SZ_DW(alloc_xrcd_out)] = {0}; @@ -441,7 +542,7 @@ int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, return err; rq->qpn = rqn; - err = create_qprqsq_common(dev, rq, MLX5_RES_RQ); + err = create_resource_common(dev, rq, MLX5_RES_RQ); if (err) goto err_destroy_rq; @@ -457,7 +558,7 @@ EXPORT_SYMBOL(mlx5_core_create_rq_tracked); void mlx5_core_destroy_rq_tracked(struct mlx5_core_dev *dev, struct mlx5_core_qp *rq) { - destroy_qprqsq_common(dev, rq); + destroy_resource_common(dev, rq); mlx5_core_destroy_rq(dev, rq->qpn); } EXPORT_SYMBOL(mlx5_core_destroy_rq_tracked); @@ -473,7 +574,7 @@ int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, return err; sq->qpn = sqn; - err = create_qprqsq_common(dev, sq, MLX5_RES_SQ); + err = create_resource_common(dev, sq, MLX5_RES_SQ); if (err) goto err_destroy_sq; @@ -489,7 +590,7 @@ EXPORT_SYMBOL(mlx5_core_create_sq_tracked); void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev, struct mlx5_core_qp *sq) { - destroy_qprqsq_common(dev, sq); + destroy_resource_common(dev, sq); mlx5_core_destroy_sq(dev, sq->qpn); } EXPORT_SYMBOL(mlx5_core_destroy_sq_tracked); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 52b8ea423dd2..9aee835b7393 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -286,6 +286,7 @@ enum { MLX5_EVENT_QUEUE_TYPE_QP = 0, MLX5_EVENT_QUEUE_TYPE_RQ = 1, MLX5_EVENT_QUEUE_TYPE_SQ = 2, + MLX5_EVENT_QUEUE_TYPE_DCT = 6, }; enum mlx5_event { @@ -321,6 +322,8 @@ enum mlx5_event { MLX5_EVENT_TYPE_PAGE_FAULT = 0xc, MLX5_EVENT_TYPE_NIC_VPORT_CHANGE = 0xd, + MLX5_EVENT_TYPE_DCT_DRAINED = 0x1c, + MLX5_EVENT_TYPE_FPGA_ERROR = 0x20, }; @@ -613,6 +616,11 @@ struct mlx5_eqe_pps { u8 rsvd2[12]; } __packed; +struct mlx5_eqe_dct { + __be32 reserved[6]; + __be32 dctn; +}; + union ev_data { __be32 raw[7]; struct mlx5_eqe_cmd cmd; @@ -628,6 +636,7 @@ union ev_data { struct mlx5_eqe_vport_change vport_change; struct mlx5_eqe_port_module port_module; struct mlx5_eqe_pps pps; + struct mlx5_eqe_dct dct; } __packed; struct mlx5_eqe { diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 2fe263f69751..0776554f18dc 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -154,6 +154,13 @@ enum mlx5_dcbx_oper_mode { MLX5E_DCBX_PARAM_VER_OPER_AUTO = 0x3, }; +enum mlx5_dct_atomic_mode { + MLX5_ATOMIC_MODE_DCT_OFF = 20, + MLX5_ATOMIC_MODE_DCT_NONE = 0 << MLX5_ATOMIC_MODE_DCT_OFF, + MLX5_ATOMIC_MODE_DCT_IB_COMP = 1 << MLX5_ATOMIC_MODE_DCT_OFF, + MLX5_ATOMIC_MODE_DCT_CX = 2 << MLX5_ATOMIC_MODE_DCT_OFF, +}; + enum { MLX5_ATOMIC_OPS_CMP_SWAP = 1 << 0, MLX5_ATOMIC_OPS_FETCH_ADD = 1 << 1, @@ -432,6 +439,7 @@ enum mlx5_res_type { MLX5_RES_SRQ = 3, MLX5_RES_XSRQ = 4, MLX5_RES_XRQ = 5, + MLX5_RES_DCT = MLX5_EVENT_QUEUE_TYPE_DCT, }; struct mlx5_core_rsc_common { diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 62af7512dabb..4778d41085d4 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -473,6 +473,11 @@ struct mlx5_core_qp { int 
pid; }; +struct mlx5_core_dct { + struct mlx5_core_qp mqp; + struct completion drained; +}; + struct mlx5_qp_path { u8 fl_free_ar; u8 rsvd3; @@ -549,6 +554,9 @@ static inline struct mlx5_core_mkey *__mlx5_mr_lookup(struct mlx5_core_dev *dev, return radix_tree_lookup(&dev->priv.mkey_table.tree, key); } +int mlx5_core_create_dct(struct mlx5_core_dev *dev, + struct mlx5_core_dct *qp, + u32 *in, int inlen); int mlx5_core_create_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, u32 *in, @@ -558,8 +566,12 @@ int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 opcode, struct mlx5_core_qp *qp); int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); +int mlx5_core_destroy_dct(struct mlx5_core_dev *dev, + struct mlx5_core_dct *dct); int mlx5_core_qp_query(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, u32 *out, int outlen); +int mlx5_core_dct_query(struct mlx5_core_dev *dev, struct mlx5_core_dct *dct, + u32 *out, int outlen); int mlx5_core_set_delay_drop(struct mlx5_core_dev *dev, u32 timeout_usec); -- cgit v1.2.3 From dd44572aebee9260fa1d90569d20d8bab28f90ae Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 2 Jan 2018 16:19:29 +0200 Subject: net/mlx5: Enable DC transport Enable DC transport in the firmware to provide its functionality. Signed-off-by: Moni Shoua Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 5f323442cc5a..1292aecb09f2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -552,6 +552,9 @@ static int handle_hca_cap(struct mlx5_core_dev *dev) cache_line_128byte, cache_line_size() == 128 ? 1 : 0); + if (MLX5_CAP_GEN_MAX(dev, dct)) + MLX5_SET(cmd_hca_cap, set_hca_cap, dct, 1); + err = set_caps(dev, set_ctx, set_sz, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE); -- cgit v1.2.3 From 8011c1e33626ea7b04f74f648aad7bb2e48f8a81 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 2 Jan 2018 16:19:30 +0200 Subject: IB/core: Introduce driver QP type Vendors can implement type of QPs that are not described in the InfiniBand specification. To still be able to use the IB/core layer services (e.g. user object management) without tainting this layer with driver proprietary logic, a new QP type is added - IB_QPT_DRIVER. This will be a general QP type that the core layer doesn't know about its true nature. When a command like create_qp() is passed to a hardware driver the extra data that is required is taken from the driver channel. Downstream patches from this series will use that QP type in the mlx5 driver. Signed-off-by: Moni Shoua Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 60c3268c8c04..e44a8adac677 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1082,6 +1082,7 @@ enum ib_qp_type { IB_QPT_XRC_INI = 9, IB_QPT_XRC_TGT, IB_QPT_MAX, + IB_QPT_DRIVER = 0xFF, /* Reserve a range for qp types internal to the low level driver. 
* These qp types will not be visible at the IB core layer, so the * IB_QPT_MAX usages should not be affected in the core layer -- cgit v1.2.3 From b4aaa1f0b415cf8aa79742cbed56a2d75cfc5102 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 2 Jan 2018 16:19:31 +0200 Subject: IB/mlx5: Handle type IB_QPT_DRIVER when creating a QP The QP type IB_QPT_DRIVER doesn't describe the transport or the service that the QP provides but those are known only to the hardware driver. The actual type of the QP is stored in the hardware driver context (i.e. mlx5_qp) under the field qp_sub_type. Take the real QP type and any extra data that is required to create the QP from the driver channel and modify the QP initial attributes before continuing with create_qp(). Downstream patches from this series will add support for both DCI and DCT driver QPs. Signed-off-by: Moni Shoua Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 10 ++++ drivers/infiniband/hw/mlx5/qp.c | 103 ++++++++++++++++++++++++++++++++++- include/uapi/rdma/mlx5-abi.h | 7 ++- 3 files changed, 118 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index b3f2f5cae672..6286992e1d39 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -206,6 +206,8 @@ struct mlx5_ib_flow_db { * creates the actual hardware QP. */ #define MLX5_IB_QPT_HW_GSI IB_QPT_RESERVED2 +#define MLX5_IB_QPT_DCI IB_QPT_RESERVED3 +#define MLX5_IB_QPT_DCT IB_QPT_RESERVED4 #define MLX5_IB_WR_UMR IB_WR_RESERVED1 #define MLX5_IB_UMR_OCTOWORD 16 @@ -366,12 +368,18 @@ struct mlx5_bf { struct mlx5_sq_bfreg *bfreg; }; +struct mlx5_ib_dct { + struct mlx5_core_dct mdct; + u32 *in; +}; + struct mlx5_ib_qp { struct ib_qp ibqp; union { struct mlx5_ib_qp_trans trans_qp; struct mlx5_ib_raw_packet_qp raw_packet_qp; struct mlx5_ib_rss_qp rss_qp; + struct mlx5_ib_dct dct; }; struct mlx5_buf buf; @@ -410,6 +418,8 @@ struct mlx5_ib_qp { u32 rate_limit; u32 underlay_qpn; bool tunnel_offload_en; + /* storage for qp sub type when core qp type is IB_QPT_DRIVER */ + enum ib_qp_type qp_sub_type; }; struct mlx5_ib_cq_buf { diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 45b9aba599c8..f7f6fe618620 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2115,20 +2115,108 @@ static const char *ib_qp_type_str(enum ib_qp_type type) return "IB_QPT_RAW_PACKET"; case MLX5_IB_QPT_REG_UMR: return "MLX5_IB_QPT_REG_UMR"; + case IB_QPT_DRIVER: + return "IB_QPT_DRIVER"; case IB_QPT_MAX: default: return "Invalid QP type"; } } +static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd, + struct ib_qp_init_attr *attr, + struct mlx5_ib_create_qp *ucmd) +{ + struct mlx5_ib_dev *dev; + struct mlx5_ib_qp *qp; + int err = 0; + u32 uidx = MLX5_IB_DEFAULT_UIDX; + void *dctc; + + if (!attr->srq || !attr->recv_cq) + return ERR_PTR(-EINVAL); + + dev = to_mdev(pd->device); + + err = get_qp_user_index(to_mucontext(pd->uobject->context), + ucmd, sizeof(*ucmd), &uidx); + if (err) + return ERR_PTR(err); + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->dct.in = kzalloc(MLX5_ST_SZ_BYTES(create_dct_in), GFP_KERNEL); + if (!qp->dct.in) { + err = -ENOMEM; + goto err_free; + } + + dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry); + qp->driver_qp_type = MLX5_IB_QPT_DCT; + MLX5_SET(dctc, dctc, pd, to_mpd(pd)->pdn); + MLX5_SET(dctc, dctc, 
srqn_xrqn, to_msrq(attr->srq)->msrq.srqn); + MLX5_SET(dctc, dctc, cqn, to_mcq(attr->recv_cq)->mcq.cqn); + MLX5_SET64(dctc, dctc, dc_access_key, ucmd->access_key); + MLX5_SET(dctc, dctc, user_index, uidx); + + qp->state = IB_QPS_RESET; + + return &qp->ibqp; +err_free: + kfree(qp); + return ERR_PTR(err); +} + +static int set_mlx_qp_type(struct mlx5_ib_dev *dev, + struct ib_qp_init_attr *init_attr, + struct mlx5_ib_create_qp *ucmd, + struct ib_udata *udata) +{ + enum { MLX_QP_FLAGS = MLX5_QP_FLAG_TYPE_DCT | MLX5_QP_FLAG_TYPE_DCI }; + int err; + + if (!udata) + return -EINVAL; + + if (udata->inlen < sizeof(*ucmd)) { + mlx5_ib_dbg(dev, "create_qp user command is smaller than expected\n"); + return -EINVAL; + } + err = ib_copy_from_udata(ucmd, udata, sizeof(*ucmd)); + if (err) + return err; + + if ((ucmd->flags & MLX_QP_FLAGS) == MLX5_QP_FLAG_TYPE_DCI) { + init_attr->qp_type = MLX5_IB_QPT_DCI; + } else { + if ((ucmd->flags & MLX_QP_FLAGS) == MLX5_QP_FLAG_TYPE_DCT) { + init_attr->qp_type = MLX5_IB_QPT_DCT; + } else { + mlx5_ib_dbg(dev, "Invalid QP flags\n"); + return -EINVAL; + } + } + + if (!MLX5_CAP_GEN(dev->mdev, dct)) { + mlx5_ib_dbg(dev, "DC transport is not supported\n"); + return -EOPNOTSUPP; + } + + return 0; +} + struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, - struct ib_qp_init_attr *init_attr, + struct ib_qp_init_attr *verbs_init_attr, struct ib_udata *udata) { struct mlx5_ib_dev *dev; struct mlx5_ib_qp *qp; u16 xrcdn = 0; int err; + struct ib_qp_init_attr mlx_init_attr; + struct ib_qp_init_attr *init_attr = verbs_init_attr; if (pd) { dev = to_mdev(pd->device); @@ -2153,6 +2241,16 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device); } + if (init_attr->qp_type == IB_QPT_DRIVER) { + struct mlx5_ib_create_qp ucmd; + + init_attr = &mlx_init_attr; + memcpy(init_attr, verbs_init_attr, sizeof(*verbs_init_attr)); + err = set_mlx_qp_type(dev, init_attr, &ucmd, udata); + if (err) + return ERR_PTR(err); + } + switch (init_attr->qp_type) { case IB_QPT_XRC_TGT: case IB_QPT_XRC_INI: @@ -2214,6 +2312,9 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, return ERR_PTR(-EINVAL); } + if (verbs_init_attr->qp_type == IB_QPT_DRIVER) + qp->qp_sub_type = init_attr->qp_type; + return &qp->ibqp; } diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 0f7e45680ce5..83bde975d3f9 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -42,6 +42,8 @@ enum { MLX5_QP_FLAG_SCATTER_CQE = 1 << 1, MLX5_QP_FLAG_TUNNEL_OFFLOADS = 1 << 2, MLX5_QP_FLAG_BFREG_INDEX = 1 << 3, + MLX5_QP_FLAG_TYPE_DCT = 1 << 4, + MLX5_QP_FLAG_TYPE_DCI = 1 << 5, }; enum { @@ -284,7 +286,10 @@ struct mlx5_ib_create_qp { __u32 flags; __u32 uidx; __u32 bfreg_index; - __u64 sq_buf_addr; + union { + __u64 sq_buf_addr; + __u64 access_key; + }; }; /* RX Hash function flags */ -- cgit v1.2.3 From c32a4f296e1dc20ffb8da7580b0a190c2b4f3dd3 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 2 Jan 2018 16:19:32 +0200 Subject: IB/mlx5: Add support for DC Initiator QP DC Initiator (DCI) QP is represented like any other QP in the hardware. However, like any other transport QP there are attributes and settings that are special to DCI QP and needs specific attention and care. Make necessary changes to configure DCI QP. 
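A sketch of how a DCI might be requested through the driver QP type added
earlier in this series (illustrative; the capacity values are arbitrary):

    struct mlx5_ib_create_qp ucmd = {
        .flags = MLX5_QP_FLAG_TYPE_DCI,   /* decoded by set_mlx_qp_type() */
    };
    struct ib_qp_init_attr attr = {
        .qp_type = IB_QPT_DRIVER,
        .send_cq = send_cq,
        .recv_cq = recv_cq,
        /* a DCI has no receive queue: max_recv_wr/max_recv_sge stay 0 */
        .cap     = { .max_send_wr = 64, .max_send_sge = 1 },
    };

The ucmd reaches the driver via ib_udata, after which create_qp()
continues with the resolved MLX5_IB_QPT_DCI sub-type.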
Signed-off-by: Moni Shoua Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 77 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 72 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index f7f6fe618620..be721f3a36c3 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -613,6 +613,7 @@ static int to_mlx5_st(enum ib_qp_type type) case IB_QPT_XRC_TGT: return MLX5_QP_ST_XRC; case IB_QPT_SMI: return MLX5_QP_ST_QP0; case MLX5_IB_QPT_HW_GSI: return MLX5_QP_ST_QP1; + case MLX5_IB_QPT_DCI: return MLX5_QP_ST_DCI; case IB_QPT_RAW_IPV6: return MLX5_QP_ST_RAW_IPV6; case IB_QPT_RAW_PACKET: case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE; @@ -1044,6 +1045,7 @@ static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) static u32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) { if (attr->srq || (attr->qp_type == IB_QPT_XRC_TGT) || + (attr->qp_type == MLX5_IB_QPT_DCI) || (attr->qp_type == IB_QPT_XRC_INI)) return MLX5_SRQ_RQ; else if (!qp->has_rq) @@ -2249,6 +2251,14 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, err = set_mlx_qp_type(dev, init_attr, &ucmd, udata); if (err) return ERR_PTR(err); + + if (init_attr->qp_type == MLX5_IB_QPT_DCI) { + if (init_attr->cap.max_recv_wr || + init_attr->cap.max_recv_sge) { + mlx5_ib_dbg(dev, "DCI QP requires zero size receive queue\n"); + return ERR_PTR(-EINVAL); + } + } } switch (init_attr->qp_type) { @@ -2272,6 +2282,7 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, case IB_QPT_SMI: case MLX5_IB_QPT_HW_GSI: case MLX5_IB_QPT_REG_UMR: + case MLX5_IB_QPT_DCI: qp = kzalloc(sizeof(*qp), GFP_KERNEL); if (!qp) return ERR_PTR(-ENOMEM); @@ -2893,7 +2904,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (!context) return -ENOMEM; - err = to_mlx5_st(ibqp->qp_type); + err = to_mlx5_st(ibqp->qp_type == IB_QPT_DRIVER ? + qp->qp_sub_type : ibqp->qp_type); if (err < 0) { mlx5_ib_dbg(dev, "unsupported qp type %d\n", ibqp->qp_type); goto out; @@ -3052,7 +3064,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, mlx5_cur = to_mlx5_state(cur_state); mlx5_new = to_mlx5_state(new_state); - mlx5_st = to_mlx5_st(ibqp->qp_type); + mlx5_st = to_mlx5_st(ibqp->qp_type == IB_QPT_DRIVER ? 
+ qp->qp_sub_type : ibqp->qp_type); if (mlx5_st < 0) goto out; @@ -3124,6 +3137,50 @@ out: return err; } +static inline bool is_valid_mask(int mask, int req, int opt) +{ + if ((mask & req) != req) + return false; + + if (mask & ~(req | opt)) + return false; + + return true; +} + +/* check valid transition for driver QP types + * for now the only QP type that this function supports is DCI + */ +static bool modify_dci_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state new_state, + enum ib_qp_attr_mask attr_mask) +{ + int req = IB_QP_STATE; + int opt = 0; + + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + req |= IB_QP_PKEY_INDEX | IB_QP_PORT; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_INIT) { + opt = IB_QP_PKEY_INDEX | IB_QP_PORT; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + req |= IB_QP_PATH_MTU; + opt = IB_QP_PKEY_INDEX; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state == IB_QPS_RTR && new_state == IB_QPS_RTS) { + req |= IB_QP_TIMEOUT | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | + IB_QP_MAX_QP_RD_ATOMIC | IB_QP_SQ_PSN; + opt = IB_QP_MIN_RNR_TIMER; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state == IB_QPS_RTS && new_state == IB_QPS_RTS) { + opt = IB_QP_MIN_RNR_TIMER; + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state != IB_QPS_RESET && new_state == IB_QPS_ERR) { + return is_valid_mask(attr_mask, req, opt); + } + return false; +} + int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { @@ -3141,8 +3198,12 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (unlikely(ibqp->qp_type == IB_QPT_GSI)) return mlx5_ib_gsi_modify_qp(ibqp, attr, attr_mask); - qp_type = (unlikely(ibqp->qp_type == MLX5_IB_QPT_HW_GSI)) ? - IB_QPT_GSI : ibqp->qp_type; + if (ibqp->qp_type == IB_QPT_DRIVER) + qp_type = qp->qp_sub_type; + else + qp_type = (unlikely(ibqp->qp_type == MLX5_IB_QPT_HW_GSI)) ? + IB_QPT_GSI : ibqp->qp_type; + mutex_lock(&qp->mutex); @@ -3161,10 +3222,16 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto out; } } else if (qp_type != MLX5_IB_QPT_REG_UMR && - !ib_modify_qp_is_ok(cur_state, new_state, qp_type, attr_mask, ll)) { + qp_type != MLX5_IB_QPT_DCI && + !ib_modify_qp_is_ok(cur_state, new_state, qp_type, attr_mask, ll)) { mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n", cur_state, new_state, ibqp->qp_type, attr_mask); goto out; + } else if (qp_type == MLX5_IB_QPT_DCI && + !modify_dci_qp_is_ok(cur_state, new_state, attr_mask)) { + mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n", + cur_state, new_state, qp_type, attr_mask); + goto out; } if ((attr_mask & IB_QP_PORT) && -- cgit v1.2.3 From 776a3906b692963586ee9952e64ed87fb4b401c6 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 2 Jan 2018 16:19:33 +0200 Subject: IB/mlx5: Add support for DC target QP A DC Target (DCT) QP is represented in the hardware as a unique object. This object is created by CREATE_DCT command and destroyed by DESTROY_DCT command. However, in the driver we describe it as a QP. The hardware command that creates a DCT needs parameters that the verb create_qp() does not provide. Those remaining parameters are provided with the call to the verb modify_qp(). 
Therefore we delay the actual creation of a DCT in the hardware until the stage of modify_qp() to RTR. A support for query_qp() was added as well. It uses QUERY_DCT command to retrieve the applicable fields. Signed-off-by: Moni Shoua Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 27 ++++- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 + drivers/infiniband/hw/mlx5/qp.c | 186 ++++++++++++++++++++++++++++++++++- include/uapi/rdma/mlx5-abi.h | 5 + 4 files changed, 217 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index b9e195d154b1..675144a20f95 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -436,11 +436,11 @@ static int mlx5_get_vport_access_method(struct ib_device *ibdev) } static void get_atomic_caps(struct mlx5_ib_dev *dev, + u8 atomic_size_qp, struct ib_device_attr *props) { u8 tmp; u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); - u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); u8 atomic_req_8B_endianness_mode = MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianness_mode); @@ -457,6 +457,29 @@ static void get_atomic_caps(struct mlx5_ib_dev *dev, } } +static void get_atomic_caps_qp(struct mlx5_ib_dev *dev, + struct ib_device_attr *props) +{ + u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); + + get_atomic_caps(dev, atomic_size_qp, props); +} + +static void get_atomic_caps_dc(struct mlx5_ib_dev *dev, + struct ib_device_attr *props) +{ + u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc); + + get_atomic_caps(dev, atomic_size_qp, props); +} + +bool mlx5_ib_dc_atomic_is_supported(struct mlx5_ib_dev *dev) +{ + struct ib_device_attr props = {}; + + get_atomic_caps_dc(dev, &props); + return (props.atomic_cap == IB_ATOMIC_HCA) ? 
true : false; +} static int mlx5_query_system_image_guid(struct ib_device *ibdev, __be64 *sys_image_guid) { @@ -745,7 +768,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->max_srq_sge = max_rq_sg - 1; props->max_fast_reg_page_list_len = 1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size); - get_atomic_caps(dev, props); + get_atomic_caps_qp(dev, props); props->masked_atomic_cap = IB_ATOMIC_NONE; props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg); props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 6286992e1d39..57405cf8a5c5 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1002,6 +1002,8 @@ struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, struct ib_rwq_ind_table_init_attr *init_attr, struct ib_udata *udata); int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); +bool mlx5_ib_dc_atomic_is_supported(struct mlx5_ib_dev *dev); + #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev); diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index be721f3a36c3..f59de13b657e 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2156,7 +2156,7 @@ static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd, } dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry); - qp->driver_qp_type = MLX5_IB_QPT_DCT; + qp->qp_sub_type = MLX5_IB_QPT_DCT; MLX5_SET(dctc, dctc, pd, to_mpd(pd)->pdn); MLX5_SET(dctc, dctc, srqn_xrqn, to_msrq(attr->srq)->msrq.srqn); MLX5_SET(dctc, dctc, cqn, to_mcq(attr->recv_cq)->mcq.cqn); @@ -2258,6 +2258,8 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, mlx5_ib_dbg(dev, "DCI QP requires zero size receive queue\n"); return ERR_PTR(-EINVAL); } + } else { + return mlx5_ib_create_dct(pd, init_attr, &ucmd); } } @@ -2329,6 +2331,25 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, return &qp->ibqp; } +static int mlx5_ib_destroy_dct(struct mlx5_ib_qp *mqp) +{ + struct mlx5_ib_dev *dev = to_mdev(mqp->ibqp.device); + + if (mqp->state == IB_QPS_RTR) { + int err; + + err = mlx5_core_destroy_dct(dev->mdev, &mqp->dct.mdct); + if (err) { + mlx5_ib_warn(dev, "failed to destroy DCT %d\n", err); + return err; + } + } + + kfree(mqp->dct.in); + kfree(mqp); + return 0; +} + int mlx5_ib_destroy_qp(struct ib_qp *qp) { struct mlx5_ib_dev *dev = to_mdev(qp->device); @@ -2337,6 +2358,9 @@ int mlx5_ib_destroy_qp(struct ib_qp *qp) if (unlikely(qp->qp_type == IB_QPT_GSI)) return mlx5_ib_gsi_destroy_qp(qp); + if (mqp->qp_sub_type == MLX5_IB_QPT_DCT) + return mlx5_ib_destroy_dct(mqp); + destroy_qp_common(dev, mqp); kfree(mqp); @@ -3181,6 +3205,95 @@ static bool modify_dci_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state new return false; } +/* mlx5_ib_modify_dct: modify a DCT QP + * valid transitions are: + * RESET to INIT: must set access_flags, pkey_index and port + * INIT to RTR : must set min_rnr_timer, tclass, flow_label, + * mtu, gid_index and hop_limit + * Other transitions and attributes are illegal + */ +static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + enum ib_qp_state cur_state, new_state; + int err = 0; + int required = IB_QP_STATE; + void *dctc; + + if (!(attr_mask & IB_QP_STATE)) + return -EINVAL; 
+ + cur_state = qp->state; + new_state = attr->qp_state; + + dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry); + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + required |= IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT; + if (!is_valid_mask(attr_mask, required, 0)) + return -EINVAL; + + if (attr->port_num == 0 || + attr->port_num > MLX5_CAP_GEN(dev->mdev, num_ports)) { + mlx5_ib_dbg(dev, "invalid port number %d. number of ports is %d\n", + attr->port_num, dev->num_ports); + return -EINVAL; + } + if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) + MLX5_SET(dctc, dctc, rre, 1); + if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) + MLX5_SET(dctc, dctc, rwe, 1); + if (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) { + if (!mlx5_ib_dc_atomic_is_supported(dev)) + return -EOPNOTSUPP; + MLX5_SET(dctc, dctc, rae, 1); + MLX5_SET(dctc, dctc, atomic_mode, MLX5_ATOMIC_MODE_DCT_CX); + } + MLX5_SET(dctc, dctc, pkey_index, attr->pkey_index); + MLX5_SET(dctc, dctc, port, attr->port_num); + MLX5_SET(dctc, dctc, counter_set_id, dev->port[attr->port_num - 1].cnts.set_id); + + } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + struct mlx5_ib_modify_qp_resp resp = {}; + u32 min_resp_len = offsetof(typeof(resp), dctn) + + sizeof(resp.dctn); + + if (udata->outlen < min_resp_len) + return -EINVAL; + resp.response_length = min_resp_len; + + required |= IB_QP_MIN_RNR_TIMER | IB_QP_AV | IB_QP_PATH_MTU; + if (!is_valid_mask(attr_mask, required, 0)) + return -EINVAL; + MLX5_SET(dctc, dctc, min_rnr_nak, attr->min_rnr_timer); + MLX5_SET(dctc, dctc, tclass, attr->ah_attr.grh.traffic_class); + MLX5_SET(dctc, dctc, flow_label, attr->ah_attr.grh.flow_label); + MLX5_SET(dctc, dctc, mtu, attr->path_mtu); + MLX5_SET(dctc, dctc, my_addr_index, attr->ah_attr.grh.sgid_index); + MLX5_SET(dctc, dctc, hop_limit, attr->ah_attr.grh.hop_limit); + + err = mlx5_core_create_dct(dev->mdev, &qp->dct.mdct, qp->dct.in, + MLX5_ST_SZ_BYTES(create_dct_in)); + if (err) + return err; + resp.dctn = qp->dct.mdct.mqp.qpn; + err = ib_copy_to_udata(udata, &resp, resp.response_length); + if (err) { + mlx5_core_destroy_dct(dev->mdev, &qp->dct.mdct); + return err; + } + } else { + mlx5_ib_warn(dev, "Modify DCT: Invalid transition from %d to %d\n", cur_state, new_state); + return -EINVAL; + } + if (err) + qp->state = IB_QPS_ERR; + else + qp->state = new_state; + return err; +} + int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { @@ -3204,6 +3317,8 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, qp_type = (unlikely(ibqp->qp_type == MLX5_IB_QPT_HW_GSI)) ? 
IB_QPT_GSI : ibqp->qp_type; + if (qp_type == MLX5_IB_QPT_DCT) + return mlx5_ib_modify_dct(ibqp, attr, attr_mask, udata); mutex_lock(&qp->mutex); @@ -4775,6 +4890,71 @@ out: return err; } +static int mlx5_ib_dct_query_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *mqp, + struct ib_qp_attr *qp_attr, int qp_attr_mask, + struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx5_core_dct *dct = &mqp->dct.mdct; + u32 *out; + u32 access_flags = 0; + int outlen = MLX5_ST_SZ_BYTES(query_dct_out); + void *dctc; + int err; + int supported_mask = IB_QP_STATE | + IB_QP_ACCESS_FLAGS | + IB_QP_PORT | + IB_QP_MIN_RNR_TIMER | + IB_QP_AV | + IB_QP_PATH_MTU | + IB_QP_PKEY_INDEX; + + if (qp_attr_mask & ~supported_mask) + return -EINVAL; + if (mqp->state != IB_QPS_RTR) + return -EINVAL; + + out = kzalloc(outlen, GFP_KERNEL); + if (!out) + return -ENOMEM; + + err = mlx5_core_dct_query(dev->mdev, dct, out, outlen); + if (err) + goto out; + + dctc = MLX5_ADDR_OF(query_dct_out, out, dct_context_entry); + + if (qp_attr_mask & IB_QP_STATE) + qp_attr->qp_state = IB_QPS_RTR; + + if (qp_attr_mask & IB_QP_ACCESS_FLAGS) { + if (MLX5_GET(dctc, dctc, rre)) + access_flags |= IB_ACCESS_REMOTE_READ; + if (MLX5_GET(dctc, dctc, rwe)) + access_flags |= IB_ACCESS_REMOTE_WRITE; + if (MLX5_GET(dctc, dctc, rae)) + access_flags |= IB_ACCESS_REMOTE_ATOMIC; + qp_attr->qp_access_flags = access_flags; + } + + if (qp_attr_mask & IB_QP_PORT) + qp_attr->port_num = MLX5_GET(dctc, dctc, port); + if (qp_attr_mask & IB_QP_MIN_RNR_TIMER) + qp_attr->min_rnr_timer = MLX5_GET(dctc, dctc, min_rnr_nak); + if (qp_attr_mask & IB_QP_AV) { + qp_attr->ah_attr.grh.traffic_class = MLX5_GET(dctc, dctc, tclass); + qp_attr->ah_attr.grh.flow_label = MLX5_GET(dctc, dctc, flow_label); + qp_attr->ah_attr.grh.sgid_index = MLX5_GET(dctc, dctc, my_addr_index); + qp_attr->ah_attr.grh.hop_limit = MLX5_GET(dctc, dctc, hop_limit); + } + if (qp_attr_mask & IB_QP_PATH_MTU) + qp_attr->path_mtu = MLX5_GET(dctc, dctc, mtu); + if (qp_attr_mask & IB_QP_PKEY_INDEX) + qp_attr->pkey_index = MLX5_GET(dctc, dctc, pkey_index); +out: + kfree(out); + return err; +} + int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) { @@ -4794,6 +4974,10 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, memset(qp_init_attr, 0, sizeof(*qp_init_attr)); memset(qp_attr, 0, sizeof(*qp_attr)); + if (unlikely(qp->qp_sub_type == MLX5_IB_QPT_DCT)) + return mlx5_ib_dct_query_qp(dev, qp, qp_attr, + qp_attr_mask, qp_init_attr); + mutex_lock(&qp->mutex); if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET || diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 83bde975d3f9..f6d319dfc7bf 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -362,6 +362,11 @@ struct mlx5_ib_create_ah_resp { __u8 reserved[6]; }; +struct mlx5_ib_modify_qp_resp { + __u32 response_length; + __u32 dctn; +}; + struct mlx5_ib_create_wq_resp { __u32 response_length; __u32 reserved; -- cgit v1.2.3 From 734dc065fc41f6143ff88225aa5d335cb1e0f6aa Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Thu, 4 Jan 2018 17:25:31 +0200 Subject: net/mlx5: Fix race for multiple RoCE enable There are two potential problems with the existing implementation. 1. Enable and disable can race after the atomic operations. 2. If a command fails the refcount is left in an inconsistent state. Introduce a lock and perform error checking. 
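One possible interleaving with the old atomic counter (illustration, not
part of the patch):

    /*
     *   roce_en == 0 initially
     *   CPU0 (enable):  atomic_inc_return() == 1 -> will enable RoCE
     *   CPU1 (disable): atomic_dec_return() == 0 -> will disable RoCE
     *   CPU1: mlx5_nic_vport_update_roce_state(DISABLED)
     *   CPU0: mlx5_nic_vport_update_roce_state(ENABLED)
     *
     * Result: RoCE is enabled while the refcount says zero users.  The
     * mutex below serializes the check with the state change, and the
     * count is only adjusted when the firmware command succeeds.
     */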
Fixes: a6f7d2aff623 ("net/mlx5: Add support for multiple RoCE enable") Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 33 ++++++++++++++++++++----- include/linux/mlx5/driver.h | 2 +- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index d653b0025b13..916523103f16 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -36,6 +36,9 @@ #include #include "mlx5_core.h" +/* Mutex to hold while enabling or disabling RoCE */ +static DEFINE_MUTEX(mlx5_roce_en_lock); + static int _mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u32 *out, int outlen) { @@ -988,17 +991,35 @@ static int mlx5_nic_vport_update_roce_state(struct mlx5_core_dev *mdev, int mlx5_nic_vport_enable_roce(struct mlx5_core_dev *mdev) { - if (atomic_inc_return(&mdev->roce.roce_en) != 1) - return 0; - return mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_ENABLED); + int err = 0; + + mutex_lock(&mlx5_roce_en_lock); + if (!mdev->roce.roce_en) + err = mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_ENABLED); + + if (!err) + mdev->roce.roce_en++; + mutex_unlock(&mlx5_roce_en_lock); + + return err; } EXPORT_SYMBOL_GPL(mlx5_nic_vport_enable_roce); int mlx5_nic_vport_disable_roce(struct mlx5_core_dev *mdev) { - if (atomic_dec_return(&mdev->roce.roce_en) != 0) - return 0; - return mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_DISABLED); + int err = 0; + + mutex_lock(&mlx5_roce_en_lock); + if (mdev->roce.roce_en) { + mdev->roce.roce_en--; + if (mdev->roce.roce_en == 0) + err = mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_DISABLED); + + if (err) + mdev->roce.roce_en++; + } + mutex_unlock(&mlx5_roce_en_lock); + return err; } EXPORT_SYMBOL_GPL(mlx5_nic_vport_disable_roce); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 0776554f18dc..5b0443c9d337 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -835,7 +835,7 @@ struct mlx5_core_dev { struct mlx5e_resources mlx5e_res; struct { struct mlx5_rsvd_gids reserved_gids; - atomic_t roce_en; + u32 roce_en; } roce; #ifdef CONFIG_MLX5_FPGA struct mlx5_fpga_device *fpga; -- cgit v1.2.3 From 8737f818ca3b8ef7c9945525af7df128e1be4575 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Thu, 4 Jan 2018 17:25:32 +0200 Subject: net/mlx5: Set software owner ID during init HCA Generate a unique 128-bit identifier for each host and pass that value to firmware in the INIT_HCA command if it reports the sw_owner_id capability. Each device bound to the mlx5_core driver will have the same software owner ID. In subsequent patches mlx5_core devices will be bound via a new VPort command so that they can operate together under a single InfiniBand device. Only devices that have the same software owner ID can be bound, to prevent traffic intended for one host arriving at another. The INIT_HCA command length was expanded by 128 bits. The command length is provided as an input to FW commands. Older FW does not have a problem receiving this command in the new longer form.
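A sketch of the generation side (assuming only <linux/random.h> and standard module init; the real hook-up appears in the diff below): the ID lives at module scope and is filled in once, so every device probed by this driver instance reports the same value.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/random.h>

static u32 sw_owner_id[4];	/* 128 bits; one value per host/module load */

static int __init owner_id_init(void)
{
	/* every device probed after this shares the same owner ID */
	get_random_bytes(&sw_owner_id, sizeof(sw_owner_id));
	return 0;
}
module_init(owner_id_init);

MODULE_LICENSE("GPL");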
Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/net/ethernet/mellanox/mlx5/core/fw.c | 10 +++++++++- drivers/net/ethernet/mellanox/mlx5/core/main.c | 6 +++++- drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h | 2 +- include/linux/mlx5/device.h | 5 +++++ include/linux/mlx5/mlx5_ifc.h | 5 ++++- 5 files changed, 24 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c index 5ef1b56b6a96..9d11e92fb541 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fw.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c @@ -195,12 +195,20 @@ int mlx5_query_hca_caps(struct mlx5_core_dev *dev) return 0; } -int mlx5_cmd_init_hca(struct mlx5_core_dev *dev) +int mlx5_cmd_init_hca(struct mlx5_core_dev *dev, uint32_t *sw_owner_id) { u32 out[MLX5_ST_SZ_DW(init_hca_out)] = {0}; u32 in[MLX5_ST_SZ_DW(init_hca_in)] = {0}; + int i; MLX5_SET(init_hca_in, in, opcode, MLX5_CMD_OP_INIT_HCA); + + if (MLX5_CAP_GEN(dev, sw_owner_id)) { + for (i = 0; i < 4; i++) + MLX5_ARRAY_SET(init_hca_in, in, sw_owner_id, i, + sw_owner_id[i]); + } + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 1292aecb09f2..e382a3ca759e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -75,6 +75,8 @@ static unsigned int prof_sel = MLX5_DEFAULT_PROF; module_param_named(prof_sel, prof_sel, uint, 0444); MODULE_PARM_DESC(prof_sel, "profile selector. Valid range 0 - 2"); +static u32 sw_owner_id[4]; + enum { MLX5_ATOMIC_REQ_MODE_BE = 0x0, MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS = 0x1, @@ -1055,7 +1057,7 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv, goto reclaim_boot_pages; } - err = mlx5_cmd_init_hca(dev); + err = mlx5_cmd_init_hca(dev, sw_owner_id); if (err) { dev_err(&pdev->dev, "init hca failed\n"); goto err_pagealloc_stop; @@ -1577,6 +1579,8 @@ static int __init init(void) { int err; + get_random_bytes(&sw_owner_id, sizeof(sw_owner_id)); + mlx5_core_verify_params(); mlx5_register_debugfs(); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index ff4a0b889a6f..b05868728da7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -86,7 +86,7 @@ enum { int mlx5_query_hca_caps(struct mlx5_core_dev *dev); int mlx5_query_board_id(struct mlx5_core_dev *dev); -int mlx5_cmd_init_hca(struct mlx5_core_dev *dev); +int mlx5_cmd_init_hca(struct mlx5_core_dev *dev, uint32_t *sw_owner_id); int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev); int mlx5_cmd_force_teardown_hca(struct mlx5_core_dev *dev); void mlx5_core_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event, diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 9aee835b7393..e5258ee4e38b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -79,6 +79,11 @@ << __mlx5_dw_bit_off(typ, fld))); \ } while (0) +#define MLX5_ARRAY_SET(typ, p, fld, idx, v) do { \ + BUILD_BUG_ON(__mlx5_bit_off(typ, fld) % 32); \ + MLX5_SET(typ, p, fld[idx], v); \ +} while (0) + #define MLX5_SET_TO_ONES(typ, p, fld) do { \ BUILD_BUG_ON(__mlx5_st_sz_bits(typ) % 32); \ *((__be32 *)(p) + __mlx5_dw_off(typ, fld)) = \ diff --git a/include/linux/mlx5/mlx5_ifc.h 
b/include/linux/mlx5/mlx5_ifc.h index 38a7577a9ce7..b1c81d7a86cb 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1066,7 +1066,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_5f8[0x3]; u8 log_max_xrq[0x5]; - u8 reserved_at_600[0x200]; + u8 reserved_at_600[0x1e]; + u8 sw_owner_id[0x1]; + u8 reserved_at_61f[0x1e1]; }; enum mlx5_flow_destination_type { @@ -5531,6 +5533,7 @@ struct mlx5_ifc_init_hca_in_bits { u8 op_mod[0x10]; u8 reserved_at_40[0x40]; + u8 sw_owner_id[4][0x20]; }; struct mlx5_ifc_init2rtr_qp_out_bits { -- cgit v1.2.3 From 908d6460b3d81dd2dca9e9e8fc7fa19b5a1e4d99 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Thu, 4 Jan 2018 17:25:33 +0200 Subject: IB/core: Change roce_rescan_device to return void It always returns 0. Change return type to void. Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cache.c | 7 +------ drivers/infiniband/core/core_priv.h | 2 +- drivers/infiniband/core/roce_gid_mgmt.c | 4 +--- 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 7babdbceb6d0..fc4022884dbb 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -821,12 +821,7 @@ static int gid_table_setup_one(struct ib_device *ib_dev) if (err) return err; - err = roce_rescan_device(ib_dev); - - if (err) { - gid_table_cleanup_one(ib_dev); - gid_table_release_one(ib_dev); - } + roce_rescan_device(ib_dev); return err; } diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 0deff4c4911b..39e3c1d02613 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -137,7 +137,7 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, int roce_gid_mgmt_init(void); void roce_gid_mgmt_cleanup(void); -int roce_rescan_device(struct ib_device *ib_dev); +void roce_rescan_device(struct ib_device *ib_dev); unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port); int ib_cache_setup_one(struct ib_device *device); diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 90e3889b7fbe..ebfe45739ca7 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -412,12 +412,10 @@ static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, /* This function will rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. */ -int roce_rescan_device(struct ib_device *ib_dev) +void roce_rescan_device(struct ib_device *ib_dev) { ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL, enum_all_gids_of_dev_cb, NULL); - - return 0; } static void callback_for_addr_gid_device_scan(struct ib_device *device, -- cgit v1.2.3 From 508562d6f708888b10127c2892513f76ea8b22ba Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Thu, 4 Jan 2018 17:25:34 +0200 Subject: IB/mlx5: Reduce the use of num_port capability Remove use of the num_ports general capability throughout. The number of ports will be variable in the future, and reported in a different way.
Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mad.c | 2 +- drivers/infiniband/hw/mlx5/main.c | 16 ++++++++-------- drivers/infiniband/hw/mlx5/qp.c | 5 ++--- 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index 1003b0133a49..0559e0a9e398 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -519,7 +519,7 @@ int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u8 port, int ext_active_speed; int err = -ENOMEM; - if (port < 1 || port > MLX5_CAP_GEN(mdev, num_ports)) { + if (port < 1 || port > dev->num_ports) { mlx5_ib_warn(dev, "invalid port number %d\n", port); return -EINVAL; } diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 675144a20f95..0c24cd42b411 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1512,7 +1512,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, mutex_init(&context->db_page_mutex); resp.tot_bfregs = req.total_num_bfregs; - resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports); + resp.num_ports = dev->num_ports; if (field_avail(typeof(resp), cqe_version, udata->outlen)) resp.response_length += sizeof(resp.cqe_version); @@ -2774,7 +2774,7 @@ static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp, return ERR_PTR(-ENOMEM); if (domain != IB_FLOW_DOMAIN_USER || - flow_attr->port > MLX5_CAP_GEN(dev->mdev, num_ports) || + flow_attr->port > dev->num_ports || (flow_attr->flags & ~IB_FLOW_ATTR_FLAGS_DONT_TRAP)) return ERR_PTR(-EINVAL); @@ -3122,7 +3122,7 @@ static int set_has_smi_cap(struct mlx5_ib_dev *dev) int err; int port; - for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) { + for (port = 1; port <= dev->num_ports; port++) { dev->mdev->port_caps[port - 1].has_smi = false; if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB) { @@ -3149,7 +3149,7 @@ static void get_ext_port_caps(struct mlx5_ib_dev *dev) { int port; - for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) + for (port = 1; port <= dev->num_ports; port++) mlx5_query_ext_port_caps(dev, port); } @@ -3179,7 +3179,7 @@ static int get_port_caps(struct mlx5_ib_dev *dev) goto out; } - for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) { + for (port = 1; port <= dev->num_ports; port++) { memset(pprops, 0, sizeof(*pprops)); err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); if (err) { @@ -4061,7 +4061,7 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) const char *name; int err; - dev->port = kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port), + dev->port = kcalloc(dev->num_ports, sizeof(*dev->port), GFP_KERNEL); if (!dev->port) return -ENOMEM; @@ -4083,8 +4083,7 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.node_type = RDMA_NODE_IB_CA; dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; - dev->num_ports = MLX5_CAP_GEN(mdev, num_ports); - dev->ib_dev.phys_port_cnt = dev->num_ports; + dev->ib_dev.phys_port_cnt = dev->num_ports; dev->ib_dev.num_comp_vectors = dev->mdev->priv.eq_table.num_comp_vectors; dev->ib_dev.dev.parent = &mdev->pdev->dev; @@ -4443,6 +4442,7 @@ static void *__mlx5_ib_add(struct mlx5_core_dev *mdev, return NULL; dev->mdev = mdev; + dev->num_ports = MLX5_CAP_GEN(mdev, num_ports); for (i = 0; i < MLX5_IB_STAGE_MAX; i++) { if 
(profile->stage[i].init) { diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index f59de13b657e..4b08472cf0ba 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3351,7 +3351,7 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if ((attr_mask & IB_QP_PORT) && (attr->port_num == 0 || - attr->port_num > MLX5_CAP_GEN(dev->mdev, num_ports))) { + attr->port_num > dev->num_ports)) { mlx5_ib_dbg(dev, "invalid port number %d. number of ports is %d\n", attr->port_num, dev->num_ports); goto out; @@ -4670,14 +4670,13 @@ static void to_rdma_ah_attr(struct mlx5_ib_dev *ibdev, struct rdma_ah_attr *ah_attr, struct mlx5_qp_path *path) { - struct mlx5_core_dev *dev = ibdev->mdev; memset(ah_attr, 0, sizeof(*ah_attr)); ah_attr->type = rdma_ah_find_type(&ibdev->ib_dev, path->port); rdma_ah_set_port_num(ah_attr, path->port); if (rdma_ah_get_port_num(ah_attr) == 0 || - rdma_ah_get_port_num(ah_attr) > MLX5_CAP_GEN(dev, num_ports)) + rdma_ah_get_port_num(ah_attr) > ibdev->num_ports) return; rdma_ah_set_port_num(ah_attr, path->port); -- cgit v1.2.3 From 7fd8aefb7ce202dd9d97f752bf249be6215f1004 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Thu, 4 Jan 2018 17:25:35 +0200 Subject: IB/mlx5: Make netdev notifications multiport capable When multiple RoCE ports are supported registration for events on multiple netdevs is required. Refactor the event registration and handling to support multiple ports. Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 85 +++++++++++++++++++++--------------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 4 +- drivers/infiniband/hw/mlx5/qp.c | 3 +- include/linux/mlx5/driver.h | 5 +++ 4 files changed, 60 insertions(+), 37 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 0c24cd42b411..5fcb2ed94c11 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -113,24 +113,30 @@ static int get_port_state(struct ib_device *ibdev, static int mlx5_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { + struct mlx5_roce *roce = container_of(this, struct mlx5_roce, nb); struct net_device *ndev = netdev_notifier_info_to_dev(ptr); - struct mlx5_ib_dev *ibdev = container_of(this, struct mlx5_ib_dev, - roce.nb); + u8 port_num = roce->native_port_num; + struct mlx5_core_dev *mdev; + struct mlx5_ib_dev *ibdev; + + ibdev = roce->dev; + mdev = ibdev->mdev; switch (event) { case NETDEV_REGISTER: case NETDEV_UNREGISTER: - write_lock(&ibdev->roce.netdev_lock); - if (ndev->dev.parent == &ibdev->mdev->pdev->dev) - ibdev->roce.netdev = (event == NETDEV_UNREGISTER) ? - NULL : ndev; - write_unlock(&ibdev->roce.netdev_lock); + write_lock(&roce->netdev_lock); + + if (ndev->dev.parent == &mdev->pdev->dev) + roce->netdev = (event == NETDEV_UNREGISTER) ? 
+ NULL : ndev; + write_unlock(&roce->netdev_lock); break; case NETDEV_CHANGE: case NETDEV_UP: case NETDEV_DOWN: { - struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(ibdev->mdev); + struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(mdev); struct net_device *upper = NULL; if (lag_ndev) { @@ -138,27 +144,28 @@ static int mlx5_netdev_event(struct notifier_block *this, dev_put(lag_ndev); } - if ((upper == ndev || (!upper && ndev == ibdev->roce.netdev)) + if ((upper == ndev || (!upper && ndev == roce->netdev)) && ibdev->ib_active) { struct ib_event ibev = { }; enum ib_port_state port_state; - if (get_port_state(&ibdev->ib_dev, 1, &port_state)) - return NOTIFY_DONE; + if (get_port_state(&ibdev->ib_dev, port_num, + &port_state)) + goto done; - if (ibdev->roce.last_port_state == port_state) - return NOTIFY_DONE; + if (roce->last_port_state == port_state) + goto done; - ibdev->roce.last_port_state = port_state; + roce->last_port_state = port_state; ibev.device = &ibdev->ib_dev; if (port_state == IB_PORT_DOWN) ibev.event = IB_EVENT_PORT_ERR; else if (port_state == IB_PORT_ACTIVE) ibev.event = IB_EVENT_PORT_ACTIVE; else - return NOTIFY_DONE; + goto done; - ibev.element.port_num = 1; + ibev.element.port_num = port_num; ib_dispatch_event(&ibev); } break; @@ -167,7 +174,7 @@ static int mlx5_netdev_event(struct notifier_block *this, default: break; } - +done: return NOTIFY_DONE; } @@ -183,11 +190,11 @@ static struct net_device *mlx5_ib_get_netdev(struct ib_device *device, /* Ensure ndev does not disappear before we invoke dev_hold() */ - read_lock(&ibdev->roce.netdev_lock); - ndev = ibdev->roce.netdev; + read_lock(&ibdev->roce[port_num - 1].netdev_lock); + ndev = ibdev->roce[port_num - 1].netdev; if (ndev) dev_hold(ndev); - read_unlock(&ibdev->roce.netdev_lock); + read_unlock(&ibdev->roce[port_num - 1].netdev_lock); return ndev; } @@ -3579,33 +3586,33 @@ static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev) } } -static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev) +static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num) { int err; - dev->roce.nb.notifier_call = mlx5_netdev_event; - err = register_netdevice_notifier(&dev->roce.nb); + dev->roce[port_num].nb.notifier_call = mlx5_netdev_event; + err = register_netdevice_notifier(&dev->roce[port_num].nb); if (err) { - dev->roce.nb.notifier_call = NULL; + dev->roce[port_num].nb.notifier_call = NULL; return err; } return 0; } -static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev) +static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num) { - if (dev->roce.nb.notifier_call) { - unregister_netdevice_notifier(&dev->roce.nb); - dev->roce.nb.notifier_call = NULL; + if (dev->roce[port_num].nb.notifier_call) { + unregister_netdevice_notifier(&dev->roce[port_num].nb); + dev->roce[port_num].nb.notifier_call = NULL; } } -static int mlx5_enable_eth(struct mlx5_ib_dev *dev) +static int mlx5_enable_eth(struct mlx5_ib_dev *dev, u8 port_num) { int err; - err = mlx5_add_netdev_notifier(dev); + err = mlx5_add_netdev_notifier(dev, port_num); if (err) return err; @@ -3626,7 +3633,7 @@ err_disable_roce: mlx5_nic_vport_disable_roce(dev->mdev); err_unregister_netdevice_notifier: - mlx5_remove_netdev_notifier(dev); + mlx5_remove_netdev_notifier(dev, port_num); return err; } @@ -4066,7 +4073,6 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) if (!dev->port) return -ENOMEM; - rwlock_init(&dev->roce.netdev_lock); err = get_port_caps(dev); if (err) goto err_free_port; @@ -4246,12 +4252,21 @@ static int 
mlx5_ib_stage_roce_init(struct mlx5_ib_dev *dev) struct mlx5_core_dev *mdev = dev->mdev; enum rdma_link_layer ll; int port_type_cap; + u8 port_num = 0; int err; + int i; port_type_cap = MLX5_CAP_GEN(mdev, port_type); ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); if (ll == IB_LINK_LAYER_ETHERNET) { + for (i = 0; i < dev->num_ports; i++) { + rwlock_init(&dev->roce[i].netdev_lock); + dev->roce[i].dev = dev; + dev->roce[i].native_port_num = i + 1; + dev->roce[i].last_port_state = IB_PORT_DOWN; + } + dev->ib_dev.get_netdev = mlx5_ib_get_netdev; dev->ib_dev.create_wq = mlx5_ib_create_wq; dev->ib_dev.modify_wq = mlx5_ib_modify_wq; @@ -4264,10 +4279,9 @@ static int mlx5_ib_stage_roce_init(struct mlx5_ib_dev *dev) (1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) | (1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) | (1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL); - err = mlx5_enable_eth(dev); + err = mlx5_enable_eth(dev, port_num); if (err) return err; - dev->roce.last_port_state = IB_PORT_DOWN; } return 0; @@ -4278,13 +4292,14 @@ static void mlx5_ib_stage_roce_cleanup(struct mlx5_ib_dev *dev) struct mlx5_core_dev *mdev = dev->mdev; enum rdma_link_layer ll; int port_type_cap; + u8 port_num = 0; port_type_cap = MLX5_CAP_GEN(mdev, port_type); ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); if (ll == IB_LINK_LAYER_ETHERNET) { mlx5_disable_eth(dev); - mlx5_remove_netdev_notifier(dev); + mlx5_remove_netdev_notifier(dev, port_num); } } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 57405cf8a5c5..6106dde35144 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -667,6 +667,8 @@ struct mlx5_roce { struct notifier_block nb; atomic_t next_port; enum ib_port_state last_port_state; + struct mlx5_ib_dev *dev; + u8 native_port_num; }; struct mlx5_ib_dbg_param { @@ -757,7 +759,7 @@ struct mlx5_ib_profile { struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; - struct mlx5_roce roce; + struct mlx5_roce roce[MLX5_MAX_PORTS]; int num_ports; /* serialize update of capability mask */ diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 4b08472cf0ba..ae36db3d0deb 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2962,8 +2962,9 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, (ibqp->qp_type == IB_QPT_XRC_INI) || (ibqp->qp_type == IB_QPT_XRC_TGT)) { if (mlx5_lag_is_active(dev->mdev)) { + u8 p = mlx5_core_native_port_num(dev->mdev); tx_affinity = (unsigned int)atomic_add_return(1, - &dev->roce.next_port) % + &dev->roce[p].next_port) % MLX5_MAX_PORTS + 1; context->flags |= cpu_to_be32(tx_affinity << 24); } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5b0443c9d337..28733529f6ff 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1234,6 +1234,11 @@ static inline bool mlx5_rl_is_supported(struct mlx5_core_dev *dev) return !!(dev->priv.rl_table.max_size); } +static inline int mlx5_core_native_port_num(struct mlx5_core_dev *dev) +{ + return 1; +} + enum { MLX5_TRIGGERED_CMD_COMP = (u64)1 << 32, }; -- cgit v1.2.3 From 32f69e4be269739c3850cd20f1a3322e95c1145f Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Thu, 4 Jan 2018 17:25:36 +0200 Subject: {net, IB}/mlx5: Manage port association for multiport RoCE When mlx5_ib_add is called determine if the mlx5 core device being added is capable of dual port RoCE operation. 
If it is, determine whether it is a master device or a slave device using the num_vhca_ports and affiliate_nic_vport_criteria capabilities. If the device is a slave, attempt to find a master device to affiliate it with. Devices that can be affiliated will share a system image GUID. If none are found, place it on a list of unaffiliated ports. If a master is found, bind the port to it by configuring the port affiliation in the NIC vport context. Similarly, when mlx5_ib_remove is called, determine the port type. If it's a slave port, unaffiliate it from the master device, otherwise just remove it from the unaffiliated port list. The IB device is registered as a multiport device, even if a 2nd port is not available for affiliation. When the 2nd port is affiliated later the GID cache must be refreshed in order to get the default GIDs for the 2nd port in the cache. Export roce_rescan_device to provide a mechanism to refresh the cache after a new port is bound. In a multiport configuration all IB object (QP, MR, PD, etc) related commands should flow through the master mlx5_core_dev, other commands must be sent to the slave port mlx5_core_mdev, an interface is provided to get the correct mdev for non IB object commands. Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cache.c | 2 +- drivers/infiniband/core/core_priv.h | 1 - drivers/infiniband/core/roce_gid_mgmt.c | 11 +- drivers/infiniband/hw/mlx5/main.c | 421 +++++++++++++++++++-- drivers/infiniband/hw/mlx5/mlx5_ib.h | 28 ++ .../net/ethernet/mellanox/mlx5/core/fpga/conn.c | 4 +- .../net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/vport.c | 58 +++ include/linux/mlx5/driver.h | 22 +- include/linux/mlx5/mlx5_ifc.h | 31 +- include/linux/mlx5/vport.h | 4 + include/rdma/ib_verbs.h | 8 + 12 files changed, 550 insertions(+), 42 deletions(-) diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index fc4022884dbb..e9a409d7f4e2 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -821,7 +821,7 @@ static int gid_table_setup_one(struct ib_device *ib_dev) if (err) return err; - roce_rescan_device(ib_dev); + rdma_roce_rescan_device(ib_dev); return err; } diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 39e3c1d02613..39e4acdb025e 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -137,7 +137,6 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, int roce_gid_mgmt_init(void); void roce_gid_mgmt_cleanup(void); -void roce_rescan_device(struct ib_device *ib_dev); unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port); int ib_cache_setup_one(struct ib_device *device); diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index ebfe45739ca7..5a52ec77940a 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -410,13 +410,18 @@ static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev, rtnl_unlock(); } -/* This function will rescan all of the network devices in the system - * and add their gids, as needed, to the relevant RoCE devices. */ -void roce_rescan_device(struct ib_device *ib_dev) +/** + * rdma_roce_rescan_device - Rescan all of the network devices in the system + * and add their gids, as needed, to the relevant RoCE devices.
+ * + * @device: the rdma device + */ +void rdma_roce_rescan_device(struct ib_device *ib_dev) { ib_enum_roce_netdev(ib_dev, pass_all_filter, NULL, enum_all_gids_of_dev_cb, NULL); } +EXPORT_SYMBOL(rdma_roce_rescan_device); static void callback_for_addr_gid_device_scan(struct ib_device *device, u8 port, diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 5fcb2ed94c11..4fbbe4c7a99b 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -74,6 +74,23 @@ enum { MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3, }; +static LIST_HEAD(mlx5_ib_unaffiliated_port_list); +static LIST_HEAD(mlx5_ib_dev_list); +/* + * This mutex should be held when accessing either of the above lists + */ +static DEFINE_MUTEX(mlx5_ib_multiport_mutex); + +struct mlx5_ib_dev *mlx5_ib_get_ibdev_from_mpi(struct mlx5_ib_multiport_info *mpi) +{ + struct mlx5_ib_dev *dev; + + mutex_lock(&mlx5_ib_multiport_mutex); + dev = mpi->ibdev; + mutex_unlock(&mlx5_ib_multiport_mutex); + return dev; +} + static enum rdma_link_layer mlx5_port_type_cap_to_rdma_ll(int port_type_cap) { @@ -120,7 +137,9 @@ static int mlx5_netdev_event(struct notifier_block *this, struct mlx5_ib_dev *ibdev; ibdev = roce->dev; - mdev = ibdev->mdev; + mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL); + if (!mdev) + return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: @@ -175,6 +194,7 @@ static int mlx5_netdev_event(struct notifier_block *this, break; } done: + mlx5_ib_put_native_port_mdev(ibdev, port_num); return NOTIFY_DONE; } @@ -183,10 +203,15 @@ static struct net_device *mlx5_ib_get_netdev(struct ib_device *device, { struct mlx5_ib_dev *ibdev = to_mdev(device); struct net_device *ndev; + struct mlx5_core_dev *mdev; + + mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL); + if (!mdev) + return NULL; - ndev = mlx5_lag_get_roce_netdev(ibdev->mdev); + ndev = mlx5_lag_get_roce_netdev(mdev); if (ndev) - return ndev; + goto out; /* Ensure ndev does not disappear before we invoke dev_hold() */ @@ -196,9 +221,70 @@ static struct net_device *mlx5_ib_get_netdev(struct ib_device *device, dev_hold(ndev); read_unlock(&ibdev->roce[port_num - 1].netdev_lock); +out: + mlx5_ib_put_native_port_mdev(ibdev, port_num); return ndev; } +struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev, + u8 ib_port_num, + u8 *native_port_num) +{ + enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev, + ib_port_num); + struct mlx5_core_dev *mdev = NULL; + struct mlx5_ib_multiport_info *mpi; + struct mlx5_ib_port *port; + + if (native_port_num) + *native_port_num = 1; + + if (!mlx5_core_mp_enabled(ibdev->mdev) || ll != IB_LINK_LAYER_ETHERNET) + return ibdev->mdev; + + port = &ibdev->port[ib_port_num - 1]; + if (!port) + return NULL; + + spin_lock(&port->mp.mpi_lock); + mpi = ibdev->port[ib_port_num - 1].mp.mpi; + if (mpi && !mpi->unaffiliate) { + mdev = mpi->mdev; + /* If it's the master no need to refcount, it'll exist + * as long as the ib_dev exists. 
+ */ + if (!mpi->is_master) + mpi->mdev_refcnt++; + } + spin_unlock(&port->mp.mpi_lock); + + return mdev; +} + +void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *ibdev, u8 port_num) +{ + enum rdma_link_layer ll = mlx5_ib_port_link_layer(&ibdev->ib_dev, + port_num); + struct mlx5_ib_multiport_info *mpi; + struct mlx5_ib_port *port; + + if (!mlx5_core_mp_enabled(ibdev->mdev) || ll != IB_LINK_LAYER_ETHERNET) + return; + + port = &ibdev->port[port_num - 1]; + + spin_lock(&port->mp.mpi_lock); + mpi = ibdev->port[port_num - 1].mp.mpi; + if (mpi->is_master) + goto out; + + mpi->mdev_refcnt--; + if (mpi->unaffiliate) + complete(&mpi->unref_comp); +out: + spin_unlock(&port->mp.mpi_lock); +} + static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed, u8 *active_width) { @@ -3160,12 +3246,11 @@ static void get_ext_port_caps(struct mlx5_ib_dev *dev) mlx5_query_ext_port_caps(dev, port); } -static int get_port_caps(struct mlx5_ib_dev *dev) +static int get_port_caps(struct mlx5_ib_dev *dev, u8 port) { struct ib_device_attr *dprops = NULL; struct ib_port_attr *pprops = NULL; int err = -ENOMEM; - int port; struct ib_udata uhw = {.inlen = 0, .outlen = 0}; pprops = kmalloc(sizeof(*pprops), GFP_KERNEL); @@ -3186,22 +3271,21 @@ static int get_port_caps(struct mlx5_ib_dev *dev) goto out; } - for (port = 1; port <= dev->num_ports; port++) { - memset(pprops, 0, sizeof(*pprops)); - err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); - if (err) { - mlx5_ib_warn(dev, "query_port %d failed %d\n", - port, err); - break; - } - dev->mdev->port_caps[port - 1].pkey_table_len = - dprops->max_pkeys; - dev->mdev->port_caps[port - 1].gid_table_len = - pprops->gid_tbl_len; - mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n", - dprops->max_pkeys, pprops->gid_tbl_len); + memset(pprops, 0, sizeof(*pprops)); + err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); + if (err) { + mlx5_ib_warn(dev, "query_port %d failed %d\n", + port, err); + goto out; } + dev->mdev->port_caps[port - 1].pkey_table_len = + dprops->max_pkeys; + dev->mdev->port_caps[port - 1].gid_table_len = + pprops->gid_tbl_len; + mlx5_ib_dbg(dev, "port %d: pkey_table_len %d, gid_table_len %d\n", + port, dprops->max_pkeys, pprops->gid_tbl_len); + out: kfree(pprops); kfree(dprops); @@ -4054,8 +4138,203 @@ mlx5_ib_get_vector_affinity(struct ib_device *ibdev, int comp_vector) return mlx5_get_vector_affinity(dev->mdev, comp_vector); } +/* The mlx5_ib_multiport_mutex should be held when calling this function */ +static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev, + struct mlx5_ib_multiport_info *mpi) +{ + u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1; + struct mlx5_ib_port *port = &ibdev->port[port_num]; + int comps; + int err; + int i; + + spin_lock(&port->mp.mpi_lock); + if (!mpi->ibdev) { + spin_unlock(&port->mp.mpi_lock); + return; + } + mpi->ibdev = NULL; + + spin_unlock(&port->mp.mpi_lock); + mlx5_remove_netdev_notifier(ibdev, port_num); + spin_lock(&port->mp.mpi_lock); + + comps = mpi->mdev_refcnt; + if (comps) { + mpi->unaffiliate = true; + init_completion(&mpi->unref_comp); + spin_unlock(&port->mp.mpi_lock); + + for (i = 0; i < comps; i++) + wait_for_completion(&mpi->unref_comp); + + spin_lock(&port->mp.mpi_lock); + mpi->unaffiliate = false; + } + + port->mp.mpi = NULL; + + list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list); + + spin_unlock(&port->mp.mpi_lock); + + err = mlx5_nic_vport_unaffiliate_multiport(mpi->mdev); + + mlx5_ib_dbg(ibdev, "unaffiliated port %d\n", port_num + 1); + /* Log an error, still 
needed to cleanup the pointers and add + * it back to the list. + */ + if (err) + mlx5_ib_err(ibdev, "Failed to unaffiliate port %u\n", + port_num + 1); + + ibdev->roce[port_num].last_port_state = IB_PORT_DOWN; +} + +/* The mlx5_ib_multiport_mutex should be held when calling this function */ +static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev, + struct mlx5_ib_multiport_info *mpi) +{ + u8 port_num = mlx5_core_native_port_num(mpi->mdev) - 1; + int err; + + spin_lock(&ibdev->port[port_num].mp.mpi_lock); + if (ibdev->port[port_num].mp.mpi) { + mlx5_ib_warn(ibdev, "port %d already affiliated.\n", + port_num + 1); + spin_unlock(&ibdev->port[port_num].mp.mpi_lock); + return false; + } + + ibdev->port[port_num].mp.mpi = mpi; + mpi->ibdev = ibdev; + spin_unlock(&ibdev->port[port_num].mp.mpi_lock); + + err = mlx5_nic_vport_affiliate_multiport(ibdev->mdev, mpi->mdev); + if (err) + goto unbind; + + err = get_port_caps(ibdev, mlx5_core_native_port_num(mpi->mdev)); + if (err) + goto unbind; + + err = mlx5_add_netdev_notifier(ibdev, port_num); + if (err) { + mlx5_ib_err(ibdev, "failed adding netdev notifier for port %u\n", + port_num + 1); + goto unbind; + } + + return true; + +unbind: + mlx5_ib_unbind_slave_port(ibdev, mpi); + return false; +} + +static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev) +{ + int port_num = mlx5_core_native_port_num(dev->mdev) - 1; + enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, + port_num + 1); + struct mlx5_ib_multiport_info *mpi; + int err; + int i; + + if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET) + return 0; + + err = mlx5_query_nic_vport_system_image_guid(dev->mdev, + &dev->sys_image_guid); + if (err) + return err; + + err = mlx5_nic_vport_enable_roce(dev->mdev); + if (err) + return err; + + mutex_lock(&mlx5_ib_multiport_mutex); + for (i = 0; i < dev->num_ports; i++) { + bool bound = false; + + /* build a stub multiport info struct for the native port. 
*/ + if (i == port_num) { + mpi = kzalloc(sizeof(*mpi), GFP_KERNEL); + if (!mpi) { + mutex_unlock(&mlx5_ib_multiport_mutex); + mlx5_nic_vport_disable_roce(dev->mdev); + return -ENOMEM; + } + + mpi->is_master = true; + mpi->mdev = dev->mdev; + mpi->sys_image_guid = dev->sys_image_guid; + dev->port[i].mp.mpi = mpi; + mpi->ibdev = dev; + mpi = NULL; + continue; + } + + list_for_each_entry(mpi, &mlx5_ib_unaffiliated_port_list, + list) { + if (dev->sys_image_guid == mpi->sys_image_guid && + (mlx5_core_native_port_num(mpi->mdev) - 1) == i) { + bound = mlx5_ib_bind_slave_port(dev, mpi); + } + + if (bound) { + dev_dbg(&mpi->mdev->pdev->dev, "removing port from unaffiliated list.\n"); + mlx5_ib_dbg(dev, "port %d bound\n", i + 1); + list_del(&mpi->list); + break; + } + } + if (!bound) { + get_port_caps(dev, i + 1); + mlx5_ib_dbg(dev, "no free port found for port %d\n", + i + 1); + } + } + + list_add_tail(&dev->ib_dev_list, &mlx5_ib_dev_list); + mutex_unlock(&mlx5_ib_multiport_mutex); + return err; +} + +static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev) +{ + int port_num = mlx5_core_native_port_num(dev->mdev) - 1; + enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, + port_num + 1); + int i; + + if (!mlx5_core_is_mp_master(dev->mdev) || ll != IB_LINK_LAYER_ETHERNET) + return; + + mutex_lock(&mlx5_ib_multiport_mutex); + for (i = 0; i < dev->num_ports; i++) { + if (dev->port[i].mp.mpi) { + /* Destroy the native port stub */ + if (i == port_num) { + kfree(dev->port[i].mp.mpi); + dev->port[i].mp.mpi = NULL; + } else { + mlx5_ib_dbg(dev, "unbinding port_num: %d\n", i + 1); + mlx5_ib_unbind_slave_port(dev, dev->port[i].mp.mpi); + } + } + } + + mlx5_ib_dbg(dev, "removing from devlist\n"); + list_del(&dev->ib_dev_list); + mutex_unlock(&mlx5_ib_multiport_mutex); + + mlx5_nic_vport_disable_roce(dev->mdev); +} + static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) { + mlx5_ib_cleanup_multiport_master(dev); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING cleanup_srcu_struct(&dev->mr_srcu); #endif @@ -4067,16 +4346,36 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) struct mlx5_core_dev *mdev = dev->mdev; const char *name; int err; + int i; dev->port = kcalloc(dev->num_ports, sizeof(*dev->port), GFP_KERNEL); if (!dev->port) return -ENOMEM; - err = get_port_caps(dev); + for (i = 0; i < dev->num_ports; i++) { + spin_lock_init(&dev->port[i].mp.mpi_lock); + rwlock_init(&dev->roce[i].netdev_lock); + } + + err = mlx5_ib_init_multiport_master(dev); if (err) goto err_free_port; + if (!mlx5_core_mp_enabled(mdev)) { + int i; + + for (i = 1; i <= dev->num_ports; i++) { + err = get_port_caps(dev, i); + if (err) + break; + } + } else { + err = get_port_caps(dev, mlx5_core_native_port_num(mdev)); + } + if (err) + goto err_mp; + if (mlx5_use_mad_ifc(dev)) get_ext_port_caps(dev); @@ -4106,6 +4405,8 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) #endif return 0; +err_mp: + mlx5_ib_cleanup_multiport_master(dev); err_free_port: kfree(dev->port); @@ -4252,16 +4553,16 @@ static int mlx5_ib_stage_roce_init(struct mlx5_ib_dev *dev) struct mlx5_core_dev *mdev = dev->mdev; enum rdma_link_layer ll; int port_type_cap; - u8 port_num = 0; + u8 port_num; int err; int i; + port_num = mlx5_core_native_port_num(dev->mdev) - 1; port_type_cap = MLX5_CAP_GEN(mdev, port_type); ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); if (ll == IB_LINK_LAYER_ETHERNET) { for (i = 0; i < dev->num_ports; i++) { - rwlock_init(&dev->roce[i].netdev_lock); dev->roce[i].dev = dev; 
dev->roce[i].native_port_num = i + 1; dev->roce[i].last_port_state = IB_PORT_DOWN; @@ -4292,8 +4593,9 @@ static void mlx5_ib_stage_roce_cleanup(struct mlx5_ib_dev *dev) struct mlx5_core_dev *mdev = dev->mdev; enum rdma_link_layer ll; int port_type_cap; - u8 port_num = 0; + u8 port_num; + port_num = mlx5_core_native_port_num(dev->mdev) - 1; port_type_cap = MLX5_CAP_GEN(mdev, port_type); ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); @@ -4443,6 +4745,8 @@ static void __mlx5_ib_remove(struct mlx5_ib_dev *dev, ib_dealloc_device((struct ib_device *)dev); } +static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev, u8 port_num); + static void *__mlx5_ib_add(struct mlx5_core_dev *mdev, const struct mlx5_ib_profile *profile) { @@ -4457,7 +4761,8 @@ static void *__mlx5_ib_add(struct mlx5_core_dev *mdev, return NULL; dev->mdev = mdev; - dev->num_ports = MLX5_CAP_GEN(mdev, num_ports); + dev->num_ports = max(MLX5_CAP_GEN(mdev, num_ports), + MLX5_CAP_GEN(mdev, num_vhca_ports)); for (i = 0; i < MLX5_IB_STAGE_MAX; i++) { if (profile->stage[i].init) { @@ -4520,15 +4825,81 @@ static const struct mlx5_ib_profile pf_profile = { NULL), }; +static void *mlx5_ib_add_slave_port(struct mlx5_core_dev *mdev, u8 port_num) +{ + struct mlx5_ib_multiport_info *mpi; + struct mlx5_ib_dev *dev; + bool bound = false; + int err; + + mpi = kzalloc(sizeof(*mpi), GFP_KERNEL); + if (!mpi) + return NULL; + + mpi->mdev = mdev; + + err = mlx5_query_nic_vport_system_image_guid(mdev, + &mpi->sys_image_guid); + if (err) { + kfree(mpi); + return NULL; + } + + mutex_lock(&mlx5_ib_multiport_mutex); + list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) { + if (dev->sys_image_guid == mpi->sys_image_guid) + bound = mlx5_ib_bind_slave_port(dev, mpi); + + if (bound) { + rdma_roce_rescan_device(&dev->ib_dev); + break; + } + } + + if (!bound) { + list_add_tail(&mpi->list, &mlx5_ib_unaffiliated_port_list); + dev_dbg(&mdev->pdev->dev, "no suitable IB device found to bind to, added to unaffiliated list.\n"); + } else { + mlx5_ib_dbg(dev, "bound port %u\n", port_num + 1); + } + mutex_unlock(&mlx5_ib_multiport_mutex); + + return mpi; +} + static void *mlx5_ib_add(struct mlx5_core_dev *mdev) { + enum rdma_link_layer ll; + int port_type_cap; + + port_type_cap = MLX5_CAP_GEN(mdev, port_type); + ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); + + if (mlx5_core_is_mp_slave(mdev) && ll == IB_LINK_LAYER_ETHERNET) { + u8 port_num = mlx5_core_native_port_num(mdev) - 1; + + return mlx5_ib_add_slave_port(mdev, port_num); + } + return __mlx5_ib_add(mdev, &pf_profile); } static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) { - struct mlx5_ib_dev *dev = context; + struct mlx5_ib_multiport_info *mpi; + struct mlx5_ib_dev *dev; + + if (mlx5_core_is_mp_slave(mdev)) { + mpi = context; + mutex_lock(&mlx5_ib_multiport_mutex); + if (mpi->ibdev) + mlx5_ib_unbind_slave_port(mpi->ibdev, mpi); + list_del(&mpi->list); + mutex_unlock(&mlx5_ib_multiport_mutex); + return; + } + dev = context; __mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX); } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 6106dde35144..a70a4c02e396 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -654,8 +654,17 @@ struct mlx5_ib_counters { u16 set_id; }; +struct mlx5_ib_multiport_info; + +struct mlx5_ib_multiport { + struct mlx5_ib_multiport_info *mpi; + /* To be held when accessing the multiport info */ + spinlock_t mpi_lock; +}; + struct mlx5_ib_port { struct 
mlx5_ib_counters cnts; + struct mlx5_ib_multiport mp; }; struct mlx5_roce { @@ -756,6 +765,17 @@ struct mlx5_ib_profile { struct mlx5_ib_stage stage[MLX5_IB_STAGE_MAX]; }; +struct mlx5_ib_multiport_info { + struct list_head list; + struct mlx5_ib_dev *ibdev; + struct mlx5_core_dev *mdev; + struct completion unref_comp; + u64 sys_image_guid; + u32 mdev_refcnt; + bool is_master; + bool unaffiliate; +}; + struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; @@ -800,6 +820,8 @@ struct mlx5_ib_dev { struct mutex lb_mutex; u32 user_td; u8 umr_fence; + struct list_head ib_dev_list; + u64 sys_image_guid; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) @@ -1071,6 +1093,12 @@ int mlx5_ib_generate_wc(struct ib_cq *ibcq, struct ib_wc *wc); void mlx5_ib_free_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, int bfregn); +struct mlx5_ib_dev *mlx5_ib_get_ibdev_from_mpi(struct mlx5_ib_multiport_info *mpi); +struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *dev, + u8 ib_port_num, + u8 *native_port_num); +void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *dev, + u8 port_num); static inline void init_query_mad(struct ib_smp *mad) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c index c4392f741c5f..c841b03c3e48 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c @@ -688,7 +688,7 @@ static inline int mlx5_fpga_conn_init_qp(struct mlx5_fpga_conn *conn) MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC); MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED); MLX5_SET(qpc, qpc, primary_address_path.pkey_index, MLX5_FPGA_PKEY_INDEX); - MLX5_SET(qpc, qpc, primary_address_path.port, MLX5_FPGA_PORT_NUM); + MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, MLX5_FPGA_PORT_NUM); MLX5_SET(qpc, qpc, pd, conn->fdev->conn_res.pdn); MLX5_SET(qpc, qpc, cqn_snd, conn->cq.mcq.cqn); MLX5_SET(qpc, qpc, cqn_rcv, conn->cq.mcq.cqn); @@ -727,7 +727,7 @@ static inline int mlx5_fpga_conn_rtr_qp(struct mlx5_fpga_conn *conn) MLX5_SET(qpc, qpc, next_rcv_psn, MLX5_GET(fpga_qpc, conn->fpga_qpc, next_send_psn)); MLX5_SET(qpc, qpc, primary_address_path.pkey_index, MLX5_FPGA_PKEY_INDEX); - MLX5_SET(qpc, qpc, primary_address_path.port, MLX5_FPGA_PORT_NUM); + MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, MLX5_FPGA_PORT_NUM); ether_addr_copy(MLX5_ADDR_OF(qpc, qpc, primary_address_path.rmac_47_32), MLX5_ADDR_OF(fpga_qpc, conn->fpga_qpc, fpga_mac_47_32)); MLX5_SET(qpc, qpc, primary_address_path.udp_sport, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c index d2a66dc4adc6..261b95d014a0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c @@ -187,7 +187,7 @@ int mlx5i_create_underlay_qp(struct mlx5_core_dev *mdev, struct mlx5_core_qp *qp MLX5_QP_ENHANCED_ULP_STATELESS_MODE); addr_path = MLX5_ADDR_OF(qpc, qpc, primary_address_path); - MLX5_SET(ads, addr_path, port, 1); + MLX5_SET(ads, addr_path, vhca_port_num, 1); MLX5_SET(ads, addr_path, grh, 1); ret = mlx5_core_create_qp(mdev, qp, in, inlen); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 916523103f16..9cb939b6a859 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -1121,3 +1121,61 @@ ex: return err; } 
EXPORT_SYMBOL_GPL(mlx5_core_modify_hca_vport_context); + +int mlx5_nic_vport_affiliate_multiport(struct mlx5_core_dev *master_mdev, + struct mlx5_core_dev *port_mdev) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + void *in; + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + err = mlx5_nic_vport_enable_roce(port_mdev); + if (err) + goto free; + + MLX5_SET(modify_nic_vport_context_in, in, field_select.affiliation, 1); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.affiliated_vhca_id, + MLX5_CAP_GEN(master_mdev, vhca_id)); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.affiliation_criteria, + MLX5_CAP_GEN(port_mdev, affiliate_nic_vport_criteria)); + + err = mlx5_modify_nic_vport_context(port_mdev, in, inlen); + if (err) + mlx5_nic_vport_disable_roce(port_mdev); + +free: + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_affiliate_multiport); + +int mlx5_nic_vport_unaffiliate_multiport(struct mlx5_core_dev *port_mdev) +{ + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + void *in; + int err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_nic_vport_context_in, in, field_select.affiliation, 1); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.affiliated_vhca_id, 0); + MLX5_SET(modify_nic_vport_context_in, in, + nic_vport_context.affiliation_criteria, 0); + + err = mlx5_modify_nic_vport_context(port_mdev, in, inlen); + if (!err) + mlx5_nic_vport_disable_roce(port_mdev); + + kvfree(in); + return err; +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_unaffiliate_multiport); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 28733529f6ff..d5c787519e06 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1234,9 +1234,29 @@ static inline bool mlx5_rl_is_supported(struct mlx5_core_dev *dev) return !!(dev->priv.rl_table.max_size); } +static inline int mlx5_core_is_mp_slave(struct mlx5_core_dev *dev) +{ + return MLX5_CAP_GEN(dev, affiliate_nic_vport_criteria) && + MLX5_CAP_GEN(dev, num_vhca_ports) <= 1; +} + +static inline int mlx5_core_is_mp_master(struct mlx5_core_dev *dev) +{ + return MLX5_CAP_GEN(dev, num_vhca_ports) > 1; +} + +static inline int mlx5_core_mp_enabled(struct mlx5_core_dev *dev) +{ + return mlx5_core_is_mp_slave(dev) || + mlx5_core_is_mp_master(dev); +} + static inline int mlx5_core_native_port_num(struct mlx5_core_dev *dev) { - return 1; + if (!mlx5_core_mp_enabled(dev)) + return 1; + + return MLX5_CAP_GEN(dev, native_port_num); } enum { diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index b1c81d7a86cb..7e88c8e7f374 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -502,7 +502,7 @@ struct mlx5_ifc_ads_bits { u8 dei_cfi[0x1]; u8 eth_prio[0x3]; u8 sl[0x4]; - u8 port[0x8]; + u8 vhca_port_num[0x8]; u8 rmac_47_32[0x10]; u8 rmac_31_0[0x20]; @@ -794,7 +794,10 @@ enum { }; struct mlx5_ifc_cmd_hca_cap_bits { - u8 reserved_at_0[0x80]; + u8 reserved_at_0[0x30]; + u8 vhca_id[0x10]; + + u8 reserved_at_40[0x40]; u8 log_max_srq_sz[0x8]; u8 log_max_qp_sz[0x8]; @@ -1066,8 +1069,11 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_5f8[0x3]; u8 log_max_xrq[0x5]; - u8 reserved_at_600[0x1e]; - u8 sw_owner_id[0x1]; + u8 affiliate_nic_vport_criteria[0x8]; + u8 native_port_num[0x8]; + u8 num_vhca_ports[0x8]; + u8 reserved_at_618[0x6]; + u8 sw_owner_id[0x1]; u8 reserved_at_61f[0x1e1]; }; @@ -2617,7 +2623,12 @@ struct mlx5_ifc_nic_vport_context_bits { u8
event_on_mc_address_change[0x1]; u8 event_on_uc_address_change[0x1]; - u8 reserved_at_40[0xf0]; + u8 reserved_at_40[0xc]; + + u8 affiliation_criteria[0x4]; + u8 affiliated_vhca_id[0x10]; + + u8 reserved_at_60[0xd0]; u8 mtu[0x10]; @@ -3260,7 +3271,8 @@ struct mlx5_ifc_set_roce_address_in_bits { u8 op_mod[0x10]; u8 roce_address_index[0x10]; - u8 reserved_at_50[0x10]; + u8 reserved_at_50[0xc]; + u8 vhca_port_num[0x4]; u8 reserved_at_60[0x20]; @@ -3880,7 +3892,8 @@ struct mlx5_ifc_query_roce_address_in_bits { u8 op_mod[0x10]; u8 roce_address_index[0x10]; - u8 reserved_at_50[0x10]; + u8 reserved_at_50[0xc]; + u8 vhca_port_num[0x4]; u8 reserved_at_60[0x20]; }; @@ -5312,7 +5325,9 @@ struct mlx5_ifc_modify_nic_vport_context_out_bits { }; struct mlx5_ifc_modify_nic_vport_field_select_bits { - u8 reserved_at_0[0x14]; + u8 reserved_at_0[0x12]; + u8 affiliation[0x1]; + u8 reserved_at_13[0x1]; u8 disable_uc_local_lb[0x1]; u8 disable_mc_local_lb[0x1]; u8 node_guid[0x1]; diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index aaa0bb9e7655..64e193e87394 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -116,4 +116,8 @@ int mlx5_core_modify_hca_vport_context(struct mlx5_core_dev *dev, struct mlx5_hca_vport_context *req); int mlx5_nic_vport_update_local_lb(struct mlx5_core_dev *mdev, bool enable); int mlx5_nic_vport_query_local_lb(struct mlx5_core_dev *mdev, bool *status); + +int mlx5_nic_vport_affiliate_multiport(struct mlx5_core_dev *master_mdev, + struct mlx5_core_dev *port_mdev); +int mlx5_nic_vport_unaffiliate_multiport(struct mlx5_core_dev *port_mdev); #endif /* __MLX5_VPORT_H__ */ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e44a8adac677..f25c03687ee9 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -3850,4 +3850,12 @@ ib_get_vector_affinity(struct ib_device *device, int comp_vector) } +/** + * rdma_roce_rescan_device - Rescan all of the network devices in the system + * and add their gids, as needed, to the relevant RoCE devices. + * + * @device: the rdma device + */ +void rdma_roce_rescan_device(struct ib_device *ibdev); + #endif /* IB_VERBS_H */ -- cgit v1.2.3 From d69a24e036596877ce08c25ab8e63a202412fd9e Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Thu, 4 Jan 2018 17:25:37 +0200 Subject: IB/mlx5: Move IB event processing onto a workqueue Because mlx5_ib_event can be called from atomic context, and a mutex must be taken to get the IB device for slave ports, move event processing onto a workqueue. When an IB event is received, check whether the mlx5_core_dev is a slave port; if so, attempt to get the IB device it's affiliated with. If found, process the event for that device, otherwise return.
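The deferral pattern, reduced to its essentials (generic names; the driver's version in the diff below carries the mlx5-specific event fields): the callback, which may run in atomic context, only allocates with GFP_ATOMIC and queues; anything that may sleep, such as taking the multiport mutex to resolve the IB device, happens in the work handler.

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct event_work {
	struct work_struct work;
	unsigned long param;			/* event payload */
};

static struct workqueue_struct *event_wq;	/* alloc_ordered_workqueue() at module init */

static void event_handler(struct work_struct *_work)
{
	struct event_work *w = container_of(_work, struct event_work, work);

	/* process w->param; taking mutexes and sleeping is safe here */
	kfree(w);
}

static void event_cb(unsigned long param)	/* may be called in atomic context */
{
	struct event_work *w = kmalloc(sizeof(*w), GFP_ATOMIC);

	if (!w)
		return;		/* event dropped; the real code logs a warning */

	INIT_WORK(&w->work, event_handler);
	w->param = param;
	queue_work(event_wq, &w->work);
}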
Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 69 +++++++++++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 4fbbe4c7a99b..0ff6da1b885f 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -70,10 +70,19 @@ static char mlx5_version[] = DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v" DRIVER_VERSION "\n"; +struct mlx5_ib_event_work { + struct work_struct work; + struct mlx5_core_dev *dev; + void *context; + enum mlx5_dev_event event; + unsigned long param; +}; + enum { MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3, }; +static struct workqueue_struct *mlx5_ib_event_wq; static LIST_HEAD(mlx5_ib_unaffiliated_port_list); static LIST_HEAD(mlx5_ib_dev_list); /* @@ -3132,15 +3141,24 @@ static void delay_drop_handler(struct work_struct *work) mutex_unlock(&delay_drop->lock); } -static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, - enum mlx5_dev_event event, unsigned long param) +static void mlx5_ib_handle_event(struct work_struct *_work) { - struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context; + struct mlx5_ib_event_work *work = + container_of(_work, struct mlx5_ib_event_work, work); + struct mlx5_ib_dev *ibdev; struct ib_event ibev; bool fatal = false; u8 port = 0; - switch (event) { + if (mlx5_core_is_mp_slave(work->dev)) { + ibdev = mlx5_ib_get_ibdev_from_mpi(work->context); + if (!ibdev) + goto out; + } else { + ibdev = work->context; + } + + switch (work->event) { case MLX5_DEV_EVENT_SYS_ERROR: ibev.event = IB_EVENT_DEVICE_FATAL; mlx5_ib_handle_internal_error(ibdev); @@ -3150,39 +3168,39 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, case MLX5_DEV_EVENT_PORT_UP: case MLX5_DEV_EVENT_PORT_DOWN: case MLX5_DEV_EVENT_PORT_INITIALIZED: - port = (u8)param; + port = (u8)work->param; /* In RoCE, port up/down events are handled in * mlx5_netdev_event(). */ if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) == IB_LINK_LAYER_ETHERNET) - return; + goto out; - ibev.event = (event == MLX5_DEV_EVENT_PORT_UP) ? + ibev.event = (work->event == MLX5_DEV_EVENT_PORT_UP) ? 
IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; break; case MLX5_DEV_EVENT_LID_CHANGE: ibev.event = IB_EVENT_LID_CHANGE; - port = (u8)param; + port = (u8)work->param; break; case MLX5_DEV_EVENT_PKEY_CHANGE: ibev.event = IB_EVENT_PKEY_CHANGE; - port = (u8)param; + port = (u8)work->param; schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work); break; case MLX5_DEV_EVENT_GUID_CHANGE: ibev.event = IB_EVENT_GID_CHANGE; - port = (u8)param; + port = (u8)work->param; break; case MLX5_DEV_EVENT_CLIENT_REREG: ibev.event = IB_EVENT_CLIENT_REREGISTER; - port = (u8)param; + port = (u8)work->param; break; case MLX5_DEV_EVENT_DELAY_DROP_TIMEOUT: schedule_work(&ibdev->delay_drop.delay_drop_work); @@ -3204,9 +3222,29 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, if (fatal) ibdev->ib_active = false; - out: - return; + kfree(work); +} + +static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, + enum mlx5_dev_event event, unsigned long param) +{ + struct mlx5_ib_event_work *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + INIT_WORK(&work->work, mlx5_ib_handle_event); + work->dev = dev; + work->param = param; + work->context = context; + work->event = event; + + queue_work(mlx5_ib_event_wq, &work->work); + return; + } + + dev_warn(&dev->pdev->dev, "%s: mlx5_dev_event: %d, with param: %lu dropped, couldn't allocate memory.\n", + __func__, event, param); } static int set_has_smi_cap(struct mlx5_ib_dev *dev) @@ -4917,6 +4955,10 @@ static int __init mlx5_ib_init(void) { int err; + mlx5_ib_event_wq = alloc_ordered_workqueue("mlx5_ib_event_wq", 0); + if (!mlx5_ib_event_wq) + return -ENOMEM; + mlx5_ib_odp_init(); err = mlx5_register_interface(&mlx5_ib_interface); @@ -4927,6 +4969,7 @@ static int __init mlx5_ib_init(void) static void __exit mlx5_ib_cleanup(void) { mlx5_unregister_interface(&mlx5_ib_interface); + destroy_workqueue(mlx5_ib_event_wq); } module_init(mlx5_ib_init); -- cgit v1.2.3 From b3cbd6f080c90f64fa425589d20fe4506d96fb19 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Thu, 4 Jan 2018 17:25:38 +0200 Subject: IB/mlx5: Implement dual port functionality in query routines Port operations must be routed to their native mlx5_core_dev. A multiport RoCE device registers itself as having 2 ports even before a 2nd port is affiliated. If an unaffiliated port is queried, use capability information from the master port; these values are the same. Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 102 +++++++++++++++++++++++++++++++------- 1 file changed, 85 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 0ff6da1b885f..59e26901d935 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -359,16 +359,30 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num, struct mlx5_core_dev *mdev = dev->mdev; struct net_device *ndev, *upper; enum ib_mtu ndev_ib_mtu; + bool put_mdev = true; u16 qkey_viol_cntr; u32 eth_prot_oper; + u8 mdev_port_num; int err; + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num); + if (!mdev) { + /* This means the port isn't affiliated yet. Get the + * info for the master port instead. + */ + put_mdev = false; + mdev = dev->mdev; + mdev_port_num = 1; + port_num = 1; + } + /* Possible bad flows are checked before filling out props so in case * of an error it will still be zeroed out.
*/ - err = mlx5_query_port_eth_proto_oper(mdev, &eth_prot_oper, port_num); + err = mlx5_query_port_eth_proto_oper(mdev, &eth_prot_oper, + mdev_port_num); if (err) - return err; + goto out; translate_eth_proto_oper(eth_prot_oper, &props->active_speed, &props->active_width); @@ -384,12 +398,16 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num, props->state = IB_PORT_DOWN; props->phys_state = 3; - mlx5_query_nic_vport_qkey_viol_cntr(dev->mdev, &qkey_viol_cntr); + mlx5_query_nic_vport_qkey_viol_cntr(mdev, &qkey_viol_cntr); props->qkey_viol_cntr = qkey_viol_cntr; + /* If this is a stub query for an unaffiliated port stop here */ + if (!put_mdev) + goto out; + ndev = mlx5_ib_get_netdev(device, port_num); if (!ndev) - return 0; + goto out; if (mlx5_lag_is_active(dev->mdev)) { rcu_read_lock(); @@ -412,7 +430,10 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num, dev_put(ndev); props->active_mtu = min(props->max_mtu, ndev_ib_mtu); - return 0; +out: + if (put_mdev) + mlx5_ib_put_native_port_mdev(dev, port_num); + return err; } static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num, @@ -1221,7 +1242,22 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, } if (!ret && props) { - count = mlx5_core_reserved_gids_count(to_mdev(ibdev)->mdev); + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_core_dev *mdev; + bool put_mdev = true; + + mdev = mlx5_ib_get_native_port_mdev(dev, port, NULL); + if (!mdev) { + /* If the port isn't affiliated yet query the master. + * The master and slave will have the same values. + */ + mdev = dev->mdev; + port = 1; + put_mdev = false; + } + count = mlx5_core_reserved_gids_count(mdev); + if (put_mdev) + mlx5_ib_put_native_port_mdev(dev, port); + props->gid_tbl_len -= count; } return ret; @@ -1246,20 +1282,43 @@ static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index, } -static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, - u16 *pkey) +static int mlx5_query_hca_nic_pkey(struct ib_device *ibdev, u8 port, + u16 index, u16 *pkey) { struct mlx5_ib_dev *dev = to_mdev(ibdev); - struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_core_dev *mdev; + bool put_mdev = true; + u8 mdev_port_num; + int err; + mdev = mlx5_ib_get_native_port_mdev(dev, port, &mdev_port_num); + if (!mdev) { + /* The port isn't affiliated yet, get the PKey from the master + * port. For RoCE the PKey tables will be the same. 
+ */ + put_mdev = false; + mdev = dev->mdev; + mdev_port_num = 1; + } + + err = mlx5_query_hca_vport_pkey(mdev, 0, mdev_port_num, 0, + index, pkey); + if (put_mdev) + mlx5_ib_put_native_port_mdev(dev, port); + + return err; +} + +static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, + u16 *pkey) +{ switch (mlx5_get_vport_access_method(ibdev)) { case MLX5_VPORT_ACCESS_METHOD_MAD: return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey); case MLX5_VPORT_ACCESS_METHOD_HCA: case MLX5_VPORT_ACCESS_METHOD_NIC: - return mlx5_query_hca_vport_pkey(mdev, 0, port, 0, index, - pkey); + return mlx5_query_hca_nic_pkey(ibdev, port, index, pkey); default: return -EINVAL; } @@ -1298,23 +1357,32 @@ static int set_port_caps_atomic(struct mlx5_ib_dev *dev, u8 port_num, u32 mask, u32 value) { struct mlx5_hca_vport_context ctx = {}; + struct mlx5_core_dev *mdev; + u8 mdev_port_num; int err; - err = mlx5_query_hca_vport_context(dev->mdev, 0, - port_num, 0, &ctx); + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num); + if (!mdev) + return -ENODEV; + + err = mlx5_query_hca_vport_context(mdev, 0, mdev_port_num, 0, &ctx); if (err) - return err; + goto out; if (~ctx.cap_mask1_perm & mask) { mlx5_ib_warn(dev, "trying to change bitmask 0x%X but change supported 0x%X\n", mask, ctx.cap_mask1_perm); - return -EINVAL; + err = -EINVAL; + goto out; } ctx.cap_mask1 = value; ctx.cap_mask1_perm = mask; - err = mlx5_core_modify_hca_vport_context(dev->mdev, 0, - port_num, 0, &ctx); + err = mlx5_core_modify_hca_vport_context(mdev, 0, mdev_port_num, + 0, &ctx); + +out: + mlx5_ib_put_native_port_mdev(dev, port_num); return err; } -- cgit v1.2.3 From a9e546e73ace1ebfb80dc9b55b46ace306f684cd Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 4 Jan 2018 17:25:39 +0200 Subject: IB/mlx5: Change debugfs to have per port contents When there are multiple ports for a single IB (RoCE) device, make debugfs entries available for each port. 
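Roughly, the init path now creates one "cc_params" directory per port against the port's native device; a minimal sketch of that shape, with error handling trimmed and helper names as in the diff below:

	static int example_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num)
	{
		struct mlx5_ib_dbg_cc_params *cc;
		struct mlx5_core_dev *mdev;

		/* Helper takes a 1-based port number; NULL means the port is
		 * not affiliated yet, so there is nothing to expose. */
		mdev = mlx5_ib_get_native_port_mdev(dev, port_num + 1, NULL);
		if (!mdev)
			return 0;

		cc = kzalloc(sizeof(*cc), GFP_KERNEL);
		if (cc) {
			cc->root = debugfs_create_dir("cc_params",
						      mdev->priv.dbg_root);
			dev->port[port_num].dbg_cc_params = cc;
		}
		mlx5_ib_put_native_port_mdev(dev, port_num + 1);
		return 0;
	}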
Signed-off-by: Parav Pandit Signed-off-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/cong.c | 83 +++++++++++++++++++++++++----------- drivers/infiniband/hw/mlx5/main.c | 12 +++++- drivers/infiniband/hw/mlx5/mlx5_ib.h | 7 +-- 3 files changed, 73 insertions(+), 29 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cong.c b/drivers/infiniband/hw/mlx5/cong.c index 2d32b519bb61..985fa2637390 100644 --- a/drivers/infiniband/hw/mlx5/cong.c +++ b/drivers/infiniband/hw/mlx5/cong.c @@ -247,21 +247,30 @@ static void mlx5_ib_set_cc_param_mask_val(void *field, int offset, } } -static int mlx5_ib_get_cc_params(struct mlx5_ib_dev *dev, int offset, u32 *var) +static int mlx5_ib_get_cc_params(struct mlx5_ib_dev *dev, u8 port_num, + int offset, u32 *var) { int outlen = MLX5_ST_SZ_BYTES(query_cong_params_out); void *out; void *field; int err; enum mlx5_ib_cong_node_type node; + struct mlx5_core_dev *mdev; + + /* Takes a 1-based port number */ + mdev = mlx5_ib_get_native_port_mdev(dev, port_num + 1, NULL); + if (!mdev) + return -ENODEV; out = kvzalloc(outlen, GFP_KERNEL); - if (!out) - return -ENOMEM; + if (!out) { + err = -ENOMEM; + goto alloc_err; + } node = mlx5_ib_param_to_node(offset); - err = mlx5_cmd_query_cong_params(dev->mdev, node, out, outlen); + err = mlx5_cmd_query_cong_params(mdev, node, out, outlen); if (err) goto free; @@ -270,21 +279,32 @@ static int mlx5_ib_get_cc_params(struct mlx5_ib_dev *dev, int offset, u32 *var) free: kvfree(out); +alloc_err: + mlx5_ib_put_native_port_mdev(dev, port_num + 1); return err; } -static int mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, int offset, u32 var) +static int mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, u8 port_num, + int offset, u32 var) { int inlen = MLX5_ST_SZ_BYTES(modify_cong_params_in); void *in; void *field; enum mlx5_ib_cong_node_type node; + struct mlx5_core_dev *mdev; u32 attr_mask = 0; int err; + /* Takes a 1-based port number */ + mdev = mlx5_ib_get_native_port_mdev(dev, port_num + 1, NULL); + if (!mdev) + return -ENODEV; + in = kvzalloc(inlen, GFP_KERNEL); - if (!in) - return -ENOMEM; + if (!in) { + err = -ENOMEM; + goto alloc_err; + } MLX5_SET(modify_cong_params_in, in, opcode, MLX5_CMD_OP_MODIFY_CONG_PARAMS); @@ -299,8 +319,10 @@ static int mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, int offset, u32 var) MLX5_SET(field_select_r_roce_rp, field, field_select_r_roce_rp, attr_mask); - err = mlx5_cmd_modify_cong_params(dev->mdev, in, inlen); + err = mlx5_cmd_modify_cong_params(mdev, in, inlen); kvfree(in); +alloc_err: + mlx5_ib_put_native_port_mdev(dev, port_num + 1); return err; } @@ -324,7 +346,7 @@ static ssize_t set_param(struct file *filp, const char __user *buf, if (kstrtou32(lbuf, 0, &var)) return -EINVAL; - ret = mlx5_ib_set_cc_params(param->dev, offset, var); + ret = mlx5_ib_set_cc_params(param->dev, param->port_num, offset, var); return ret ? 
ret : count; } @@ -340,7 +362,7 @@ static ssize_t get_param(struct file *filp, char __user *buf, size_t count, if (*pos) return 0; - ret = mlx5_ib_get_cc_params(param->dev, offset, &var); + ret = mlx5_ib_get_cc_params(param->dev, param->port_num, offset, &var); if (ret) return ret; @@ -362,44 +384,51 @@ static const struct file_operations dbg_cc_fops = { .read = get_param, }; -void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev) +void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num) { if (!mlx5_debugfs_root || - !dev->dbg_cc_params || - !dev->dbg_cc_params->root) + !dev->port[port_num].dbg_cc_params || + !dev->port[port_num].dbg_cc_params->root) return; - debugfs_remove_recursive(dev->dbg_cc_params->root); - kfree(dev->dbg_cc_params); - dev->dbg_cc_params = NULL; + debugfs_remove_recursive(dev->port[port_num].dbg_cc_params->root); + kfree(dev->port[port_num].dbg_cc_params); + dev->port[port_num].dbg_cc_params = NULL; } -int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev) +int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num) { struct mlx5_ib_dbg_cc_params *dbg_cc_params; + struct mlx5_core_dev *mdev; int i; if (!mlx5_debugfs_root) goto out; - if (!MLX5_CAP_GEN(dev->mdev, cc_query_allowed) || - !MLX5_CAP_GEN(dev->mdev, cc_modify_allowed)) + /* Takes a 1-based port number */ + mdev = mlx5_ib_get_native_port_mdev(dev, port_num + 1, NULL); + if (!mdev) goto out; + if (!MLX5_CAP_GEN(mdev, cc_query_allowed) || + !MLX5_CAP_GEN(mdev, cc_modify_allowed)) + goto put_mdev; + dbg_cc_params = kzalloc(sizeof(*dbg_cc_params), GFP_KERNEL); if (!dbg_cc_params) - goto out; + goto err; - dev->dbg_cc_params = dbg_cc_params; + dev->port[port_num].dbg_cc_params = dbg_cc_params; dbg_cc_params->root = debugfs_create_dir("cc_params", - dev->mdev->priv.dbg_root); + mdev->priv.dbg_root); if (!dbg_cc_params->root) goto err; for (i = 0; i < MLX5_IB_DBG_CC_MAX; i++) { dbg_cc_params->params[i].offset = i; dbg_cc_params->params[i].dev = dev; + dbg_cc_params->params[i].port_num = port_num; dbg_cc_params->params[i].dentry = debugfs_create_file(mlx5_ib_dbg_cc_name[i], 0600, dbg_cc_params->root, @@ -408,11 +437,17 @@ int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev) if (!dbg_cc_params->params[i].dentry) goto err; } -out: return 0; + +put_mdev: + mlx5_ib_put_native_port_mdev(dev, port_num + 1); +out: + return 0; err: mlx5_ib_warn(dev, "cong debugfs failure\n"); - mlx5_ib_cleanup_cong_debugfs(dev); + mlx5_ib_cleanup_cong_debugfs(dev, port_num); + mlx5_ib_put_native_port_mdev(dev, port_num + 1); + /* * We don't want to fail driver if debugfs failed to initialize, * so we are not forwarding error to the user. 
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 59e26901d935..2ced365e8247 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4254,6 +4254,8 @@ static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev, int err; int i; + mlx5_ib_cleanup_cong_debugfs(ibdev, port_num); + spin_lock(&port->mp.mpi_lock); if (!mpi->ibdev) { spin_unlock(&port->mp.mpi_lock); @@ -4331,6 +4333,10 @@ static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev, goto unbind; } + err = mlx5_ib_init_cong_debugfs(ibdev, port_num); + if (err) + goto unbind; + return true; unbind: @@ -4748,12 +4754,14 @@ static void mlx5_ib_stage_counters_cleanup(struct mlx5_ib_dev *dev) static int mlx5_ib_stage_cong_debugfs_init(struct mlx5_ib_dev *dev) { - return mlx5_ib_init_cong_debugfs(dev); + return mlx5_ib_init_cong_debugfs(dev, + mlx5_core_native_port_num(dev->mdev) - 1); } static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev) { - mlx5_ib_cleanup_cong_debugfs(dev); + mlx5_ib_cleanup_cong_debugfs(dev, + mlx5_core_native_port_num(dev->mdev) - 1); } static int mlx5_ib_stage_uar_init(struct mlx5_ib_dev *dev) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index a70a4c02e396..92faba9a47af 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -665,6 +665,7 @@ struct mlx5_ib_multiport { struct mlx5_ib_port { struct mlx5_ib_counters cnts; struct mlx5_ib_multiport mp; + struct mlx5_ib_dbg_cc_params *dbg_cc_params; }; struct mlx5_roce { @@ -684,6 +685,7 @@ struct mlx5_ib_dbg_param { int offset; struct mlx5_ib_dev *dev; struct dentry *dentry; + u8 port_num; }; enum mlx5_ib_dbg_cc_types { @@ -813,7 +815,6 @@ struct mlx5_ib_dev { struct mlx5_sq_bfreg bfreg; struct mlx5_sq_bfreg fp_bfreg; struct mlx5_ib_delay_drop delay_drop; - struct mlx5_ib_dbg_cc_params *dbg_cc_params; const struct mlx5_ib_profile *profile; /* protect the user_td */ @@ -1071,8 +1072,8 @@ __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num, int index, enum ib_gid_type *gid_type); -void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev); -int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev); +void mlx5_ib_cleanup_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num); +int mlx5_ib_init_cong_debugfs(struct mlx5_ib_dev *dev, u8 port_num); /* GSI QP helper functions */ struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, -- cgit v1.2.3 From aac4492ef23a176b6f1a41aadb99177eceb1fc06 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Thu, 4 Jan 2018 17:25:40 +0200 Subject: IB/mlx5: Update counter implementation for dual port RoCE Update the counter interface for multiple ports. Some counter sets always come from the primary device. Port-specific counters should be accessed per mlx5_core_dev, not always through the IB master mdev. 
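The idiom used throughout the counter changes (and in the query and MAD patches around them) is the same: resolve the mlx5_core_dev that natively owns the port, fall back to the master for ports that are not affiliated yet, and drop the reference when done. A condensed sketch, with do_query() standing in for whichever firmware query the caller actually issues:

	static int query_via_native_port(struct mlx5_ib_dev *dev, u8 port_num)
	{
		struct mlx5_core_dev *mdev;
		bool put_mdev = true;
		u8 mdev_port_num;
		int err;

		mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
		if (!mdev) {
			/* Unaffiliated port: the master holds identical values. */
			put_mdev = false;
			mdev = dev->mdev;
			mdev_port_num = 1;
		}

		err = do_query(mdev, mdev_port_num);	/* hypothetical query */

		if (put_mdev)
			mlx5_ib_put_native_port_mdev(dev, port_num);
		return err;
	}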
Signed-off-by: Daniel Jurgens Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 69 +++++++++++++++++++++--------------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 + 2 files changed, 42 insertions(+), 28 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 2ced365e8247..4791d747cc57 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3885,11 +3885,12 @@ static const struct mlx5_ib_counter extended_err_cnts[] = { static void mlx5_ib_dealloc_counters(struct mlx5_ib_dev *dev) { - unsigned int i; + int i; for (i = 0; i < dev->num_ports; i++) { - mlx5_core_dealloc_q_counter(dev->mdev, - dev->port[i].cnts.set_id); + if (dev->port[i].cnts.set_id) + mlx5_core_dealloc_q_counter(dev->mdev, + dev->port[i].cnts.set_id); kfree(dev->port[i].cnts.names); kfree(dev->port[i].cnts.offsets); } @@ -3931,6 +3932,7 @@ static int __mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev, err_names: kfree(cnts->names); + cnts->names = NULL; return -ENOMEM; } @@ -3977,37 +3979,33 @@ static void mlx5_ib_fill_counters(struct mlx5_ib_dev *dev, static int mlx5_ib_alloc_counters(struct mlx5_ib_dev *dev) { + int err = 0; int i; - int ret; for (i = 0; i < dev->num_ports; i++) { - struct mlx5_ib_port *port = &dev->port[i]; + err = __mlx5_ib_alloc_counters(dev, &dev->port[i].cnts); + if (err) + goto err_alloc; + + mlx5_ib_fill_counters(dev, dev->port[i].cnts.names, + dev->port[i].cnts.offsets); - ret = mlx5_core_alloc_q_counter(dev->mdev, - &port->cnts.set_id); - if (ret) { + err = mlx5_core_alloc_q_counter(dev->mdev, + &dev->port[i].cnts.set_id); + if (err) { mlx5_ib_warn(dev, "couldn't allocate queue counter for port %d, err %d\n", - i + 1, ret); - goto dealloc_counters; + i + 1, err); + goto err_alloc; } - - ret = __mlx5_ib_alloc_counters(dev, &port->cnts); - if (ret) - goto dealloc_counters; - - mlx5_ib_fill_counters(dev, port->cnts.names, - port->cnts.offsets); + dev->port[i].cnts.set_id_valid = true; } return 0; -dealloc_counters: - while (--i >= 0) - mlx5_core_dealloc_q_counter(dev->mdev, - dev->port[i].cnts.set_id); - - return ret; +err_alloc: + mlx5_ib_dealloc_counters(dev); + return err; } static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev, @@ -4026,7 +4024,7 @@ static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev, RDMA_HW_STATS_DEFAULT_LIFESPAN); } -static int mlx5_ib_query_q_counters(struct mlx5_ib_dev *dev, +static int mlx5_ib_query_q_counters(struct mlx5_core_dev *mdev, struct mlx5_ib_port *port, struct rdma_hw_stats *stats) { @@ -4039,7 +4037,7 @@ static int mlx5_ib_query_q_counters(struct mlx5_ib_dev *dev, if (!out) return -ENOMEM; - ret = mlx5_core_query_q_counter(dev->mdev, + ret = mlx5_core_query_q_counter(mdev, port->cnts.set_id, 0, out, outlen); if (ret) @@ -4061,28 +4059,43 @@ static int mlx5_ib_get_hw_stats(struct ib_device *ibdev, { struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_port *port = &dev->port[port_num - 1]; + struct mlx5_core_dev *mdev; int ret, num_counters; + u8 mdev_port_num; if (!stats) return -EINVAL; - ret = mlx5_ib_query_q_counters(dev, port, stats); + num_counters = port->cnts.num_q_counters + port->cnts.num_cong_counters; + + /* q_counters are per IB device, query the master mdev */ + ret = mlx5_ib_query_q_counters(dev->mdev, port, stats); if (ret) return ret; - num_counters = port->cnts.num_q_counters; if (MLX5_CAP_GEN(dev->mdev, cc_query_allowed)) { + mdev = 
mlx5_ib_get_native_port_mdev(dev, port_num, + &mdev_port_num); + if (!mdev) { + /* If port is not affiliated yet, its in down state + * which doesn't have any counters yet, so it would be + * zero. So no need to read from the HCA. + */ + goto done; + } ret = mlx5_lag_query_cong_counters(dev->mdev, stats->value + port->cnts.num_q_counters, port->cnts.num_cong_counters, port->cnts.offsets + port->cnts.num_q_counters); + + mlx5_ib_put_native_port_mdev(dev, port_num); if (ret) return ret; - num_counters += port->cnts.num_cong_counters; } +done: return num_counters; } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 92faba9a47af..51228dfcfbe7 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -652,6 +652,7 @@ struct mlx5_ib_counters { u32 num_q_counters; u32 num_cong_counters; u16 set_id; + bool set_id_valid; }; struct mlx5_ib_multiport_info; -- cgit v1.2.3 From cfe4e37fdcacbc33176cfc2430df96355ee14489 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Thu, 4 Jan 2018 17:25:41 +0200 Subject: {net, IB}/mlx5: Change set_roce_gid to take a port number When in dual port mode, setting a RoCE GID for any port flows through the master port's mlx5_core_dev. Provide an interface to set the port when sending this command. Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c | 7 ++++--- drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c | 5 ++++- include/linux/mlx5/driver.h | 2 +- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 4791d747cc57..653b56377e69 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -475,7 +475,7 @@ static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num, return mlx5_core_roce_gid_set(dev->mdev, index, roce_version, roce_l3_type, gid->raw, mac, vlan, - vlan_id); + vlan_id, port_num); } static int mlx5_ib_add_gid(struct ib_device *device, u8 port_num, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c index c841b03c3e48..e6175f8ac0e4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c @@ -888,7 +888,8 @@ struct mlx5_fpga_conn *mlx5_fpga_conn_create(struct mlx5_fpga_device *fdev, err = mlx5_core_roce_gid_set(fdev->mdev, conn->qp.sgid_index, MLX5_ROCE_VERSION_2, MLX5_ROCE_L3_TYPE_IPV6, - remote_ip, remote_mac, true, 0); + remote_ip, remote_mac, true, 0, + MLX5_FPGA_PORT_NUM); if (err) { mlx5_fpga_err(fdev, "Failed to set SGID: %d\n", err); ret = ERR_PTR(err); @@ -954,7 +955,7 @@ err_cq: mlx5_fpga_conn_destroy_cq(conn); err_gid: mlx5_core_roce_gid_set(fdev->mdev, conn->qp.sgid_index, 0, 0, NULL, - NULL, false, 0); + NULL, false, 0, MLX5_FPGA_PORT_NUM); err_rsvd_gid: mlx5_core_reserved_gid_free(fdev->mdev, conn->qp.sgid_index); err: @@ -982,7 +983,7 @@ void mlx5_fpga_conn_destroy(struct mlx5_fpga_conn *conn) mlx5_fpga_conn_destroy_cq(conn); mlx5_core_roce_gid_set(conn->fdev->mdev, conn->qp.sgid_index, 0, 0, - NULL, NULL, false, 0); + NULL, NULL, false, 0, MLX5_FPGA_PORT_NUM); mlx5_core_reserved_gid_free(conn->fdev->mdev, conn->qp.sgid_index); kfree(conn); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c 
index 573f59f46d41..7722a3f9bb68 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/gid.c @@ -121,7 +121,7 @@ EXPORT_SYMBOL_GPL(mlx5_core_reserved_gids_count); int mlx5_core_roce_gid_set(struct mlx5_core_dev *dev, unsigned int index, u8 roce_version, u8 roce_l3_type, const u8 *gid, - const u8 *mac, bool vlan, u16 vlan_id) + const u8 *mac, bool vlan, u16 vlan_id, u8 port_num) { #define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v) u32 in[MLX5_ST_SZ_DW(set_roce_address_in)] = {0}; @@ -148,6 +148,9 @@ int mlx5_core_roce_gid_set(struct mlx5_core_dev *dev, unsigned int index, memcpy(addr_l3_addr, gid, gidsz); } + if (MLX5_CAP_GEN(dev, num_vhca_ports) > 0) + MLX5_SET(set_roce_address_in, in, vhca_port_num, port_num); + MLX5_SET(set_roce_address_in, in, roce_address_index, index); MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS); return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index d5c787519e06..9136e35f2f7e 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1112,7 +1112,7 @@ void mlx5_free_bfreg(struct mlx5_core_dev *mdev, struct mlx5_sq_bfreg *bfreg); unsigned int mlx5_core_reserved_gids_count(struct mlx5_core_dev *dev); int mlx5_core_roce_gid_set(struct mlx5_core_dev *dev, unsigned int index, u8 roce_version, u8 roce_l3_type, const u8 *gid, - const u8 *mac, bool vlan, u16 vlan_id); + const u8 *mac, bool vlan, u16 vlan_id, u8 port_num); static inline int fw_initializing(struct mlx5_core_dev *dev) { -- cgit v1.2.3 From 212f2a87b74f1efd645297893c7f3657abd55dcd Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Thu, 4 Jan 2018 17:25:42 +0200 Subject: IB/mlx5: Route MADs for dual port RoCE Route performance query MADs to the correct mlx5_core_dev when using dual port RoCE mode. 
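Condensed, the new flow is: fail the MAD outright if the port's native device cannot be resolved, issue performance-management queries against that device with the translated port number, and release the reference afterwards. A sketch of the shape of the change, not the full function (is_perf_mgmt_get() is a hypothetical predicate standing in for the management-class checks):

	mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num);
	if (!mdev)
		return IB_MAD_RESULT_FAILURE;

	if (is_perf_mgmt_get(in_mad))	/* hypothetical predicate */
		ret = process_pma_cmd(mdev, mdev_port_num, in_mad, out_mad);
	else
		ret = process_mad(ibdev, mad_flags, port_num, in_wc, in_grh,
				  in_mad, out_mad);

	mlx5_ib_put_native_port_mdev(dev, port_num);
	return ret;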
Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mad.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index 0559e0a9e398..32a9e9228b13 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -197,10 +197,9 @@ static void pma_cnt_assign(struct ib_pma_portcounters *pma_cnt, vl_15_dropped); } -static int process_pma_cmd(struct ib_device *ibdev, u8 port_num, +static int process_pma_cmd(struct mlx5_core_dev *mdev, u8 port_num, const struct ib_mad *in_mad, struct ib_mad *out_mad) { - struct mlx5_ib_dev *dev = to_mdev(ibdev); int err; void *out_cnt; @@ -222,7 +221,7 @@ static int process_pma_cmd(struct ib_device *ibdev, u8 port_num, if (!out_cnt) return IB_MAD_RESULT_FAILURE; - err = mlx5_core_query_vport_counter(dev->mdev, 0, 0, + err = mlx5_core_query_vport_counter(mdev, 0, 0, port_num, out_cnt, sz); if (!err) pma_cnt_ext_assign(pma_cnt_ext, out_cnt); @@ -235,7 +234,7 @@ static int process_pma_cmd(struct ib_device *ibdev, u8 port_num, if (!out_cnt) return IB_MAD_RESULT_FAILURE; - err = mlx5_core_query_ib_ppcnt(dev->mdev, port_num, + err = mlx5_core_query_ib_ppcnt(mdev, port_num, out_cnt, sz); if (!err) pma_cnt_assign(pma_cnt, out_cnt); @@ -255,9 +254,11 @@ int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, u16 *out_mad_pkey_index) { struct mlx5_ib_dev *dev = to_mdev(ibdev); - struct mlx5_core_dev *mdev = dev->mdev; const struct ib_mad *in_mad = (const struct ib_mad *)in; struct ib_mad *out_mad = (struct ib_mad *)out; + struct mlx5_core_dev *mdev; + u8 mdev_port_num; + int ret; if (WARN_ON_ONCE(in_mad_size != sizeof(*in_mad) || *out_mad_size != sizeof(*out_mad))) @@ -265,14 +266,20 @@ int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, memset(out_mad->data, 0, sizeof(out_mad->data)); + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, &mdev_port_num); + if (!mdev) + return IB_MAD_RESULT_FAILURE; + if (MLX5_CAP_GEN(mdev, vport_counters) && in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT && in_mad->mad_hdr.method == IB_MGMT_METHOD_GET) { - return process_pma_cmd(ibdev, port_num, in_mad, out_mad); + ret = process_pma_cmd(mdev, mdev_port_num, in_mad, out_mad); } else { - ret = process_mad(ibdev, mad_flags, port_num, in_wc, in_grh, + ret = process_mad(ibdev, mad_flags, port_num, in_wc, in_grh, in_mad, out_mad); } + mlx5_ib_put_native_port_mdev(dev, port_num); + return ret; } int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port) -- cgit v1.2.3 From 85c7c01499a9ebbf049422668e05c58999b5ae45 Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Thu, 4 Jan 2018 17:25:43 +0200 Subject: IB/mlx5: Don't advertise RAW QP support in dual port mode When operating in dual port RoCE mode, FW doesn't support steering for raw QPs on the slave port. They still work on the master port, but the user has no way of knowing which port is the master. The capability is reported per device, not per port. 
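The guard is a single device-wide flag, computed once per query and ANDed into every raw-packet capability check; roughly:

	/* Sketch: raw packet features are only advertised when the device
	 * is not in multi-port (dual port RoCE) mode. */
	bool raw_support = !mlx5_core_mp_enabled(mdev);

	if (MLX5_CAP_GEN(mdev, eth_net_offloads) && raw_support)
		props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;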
Signed-off-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 653b56377e69..5d6fba986fa5 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -731,6 +731,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, int max_rq_sg; int max_sq_sg; u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz); + bool raw_support = !mlx5_core_mp_enabled(mdev); struct mlx5_ib_query_device_resp resp = {}; size_t resp_len; u64 max_tso; @@ -794,7 +795,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (MLX5_CAP_GEN(mdev, block_lb_mc)) props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; - if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads)) { + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && raw_support) { if (MLX5_CAP_ETH(mdev, csum_cap)) { /* Legacy bit to support old userspace libraries */ props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM; @@ -843,7 +844,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } if (MLX5_CAP_GEN(dev->mdev, rq_delay_drop) && - MLX5_CAP_GEN(dev->mdev, general_notification_event)) + MLX5_CAP_GEN(dev->mdev, general_notification_event) && + raw_support) props->raw_packet_caps |= IB_RAW_PACKET_CAP_DELAY_DROP; if (MLX5_CAP_GEN(mdev, ipoib_enhanced_offloads) && @@ -851,7 +853,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && - MLX5_CAP_ETH(dev->mdev, scatter_fcs)) { + MLX5_CAP_ETH(dev->mdev, scatter_fcs) && + raw_support) { /* Legacy bit to support old userspace libraries */ props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS; props->raw_packet_caps |= IB_RAW_PACKET_CAP_SCATTER_FCS; @@ -915,7 +918,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION; if (mlx5_ib_port_link_layer(ibdev, 1) == - IB_LINK_LAYER_ETHERNET) { + IB_LINK_LAYER_ETHERNET && raw_support) { props->rss_caps.max_rwq_indirection_tables = 1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt); props->rss_caps.max_rwq_indirection_table_size = @@ -952,7 +955,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, resp.response_length += sizeof(resp.cqe_comp_caps); } - if (field_avail(typeof(resp), packet_pacing_caps, uhw->outlen)) { + if (field_avail(typeof(resp), packet_pacing_caps, uhw->outlen) && + raw_support) { if (MLX5_CAP_QOS(mdev, packet_pacing) && MLX5_CAP_GEN(mdev, qos)) { resp.packet_pacing_caps.qp_rate_limit_max = @@ -1011,7 +1015,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } } - if (field_avail(typeof(resp), striding_rq_caps, uhw->outlen)) { + if (field_avail(typeof(resp), striding_rq_caps, uhw->outlen) && + raw_support) { resp.response_length += sizeof(resp.striding_rq_caps); if (MLX5_CAP_GEN(mdev, striding_rq)) { resp.striding_rq_caps.min_single_stride_log_num_of_bytes = @@ -3681,12 +3686,14 @@ static u32 get_core_cap_flags(struct ib_device *ibdev) enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1); u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type); u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version); + bool raw_support = !mlx5_core_mp_enabled(dev->mdev); u32 ret = 0; if (ll == IB_LINK_LAYER_INFINIBAND) return RDMA_CORE_PORT_IBA_IB; - ret = RDMA_CORE_PORT_RAW_PACKET; + if (raw_support) + ret = 
RDMA_CORE_PORT_RAW_PACKET; if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP)) return ret; -- cgit v1.2.3 From c4b76d8d95600d143f4d75f1bd700eb5d378099f Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Thu, 4 Jan 2018 17:25:44 +0200 Subject: net/mlx5: Set num_vhca_ports capability Set the current capability to the max capability. Doing so enables dual port RoCE functionality if supported by the firmware. Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index e382a3ca759e..d4a471a76d82 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -557,6 +557,12 @@ static int handle_hca_cap(struct mlx5_core_dev *dev) if (MLX5_CAP_GEN_MAX(dev, dct)) MLX5_SET(cmd_hca_cap, set_hca_cap, dct, 1); + if (MLX5_CAP_GEN_MAX(dev, num_vhca_ports)) + MLX5_SET(cmd_hca_cap, + set_hca_cap, + num_vhca_ports, + MLX5_CAP_GEN_MAX(dev, num_vhca_ports)); + err = set_caps(dev, set_ctx, set_sz, MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE); -- cgit v1.2.3 From 1b0bb73f72d1b91c872e5c068ee5b46d943accad Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 8 Jan 2018 11:00:40 -0800 Subject: IB/srpt: Remove an unused structure member Fixes: commit a42d985bd5b2 ("ib_srpt: Initial SRP Target merge for v3.3-rc1") Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 1 - drivers/infiniband/ulp/srpt/ib_srpt.h | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index bfa576aa9f03..81e8085bc5bb 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -1123,7 +1123,6 @@ static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch) ioctx->state = SRPT_STATE_NEW; ioctx->n_rdma = 0; ioctx->n_rw_ctx = 0; - init_completion(&ioctx->tx_done); ioctx->queue_status_only = false; /* * transport_init_se_cmd() does not initialize all fields, so do it diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index 673387d365a3..7fff73a5d1e6 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -207,7 +207,6 @@ struct srpt_send_ioctx { spinlock_t spinlock; enum srpt_command_state state; struct se_cmd cmd; - struct completion tx_done; u8 n_rdma; u8 n_rw_ctx; bool queue_status_only; -- cgit v1.2.3 From 10eac19bb272415cad6f28ebe8c055b648f334b1 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 8 Jan 2018 11:00:41 -0800 Subject: IB/srpt: Fix kernel-doc warnings in ib_srpt.c Avoid that warnings about missing parameter descriptions are reported when building with W=1. Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 205 ++++++++++++++++++++++++---------- 1 file changed, 145 insertions(+), 60 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 81e8085bc5bb..1bcf8b3b6095 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -120,7 +120,9 @@ static bool srpt_set_ch_state(struct srpt_rdma_ch *ch, enum rdma_ch_state new) } /** - * srpt_event_handler() - Asynchronous IB event callback function. 
+ * srpt_event_handler - asynchronous IB event callback function + * @handler: IB event handler registered by ib_register_event_handler(). + * @event: Description of the event that occurred. * * Callback function called by the InfiniBand core when an asynchronous IB * event occurs. This callback may occur in interrupt context. See also @@ -169,7 +171,9 @@ static void srpt_event_handler(struct ib_event_handler *handler, } /** - * srpt_srq_event() - SRQ event callback function. + * srpt_srq_event - SRQ event callback function + * @event: Description of the event that occurred. + * @ctx: Context pointer specified at SRQ creation time. */ static void srpt_srq_event(struct ib_event *event, void *ctx) { @@ -194,7 +198,9 @@ static const char *get_ch_state_name(enum rdma_ch_state s) } /** - * srpt_qp_event() - QP event callback function. + * srpt_qp_event - QP event callback function + * @event: Description of the event that occurred. + * @ch: SRPT RDMA channel. */ static void srpt_qp_event(struct ib_event *event, struct srpt_rdma_ch *ch) { @@ -217,8 +223,8 @@ static void srpt_qp_event(struct ib_event *event, struct srpt_rdma_ch *ch) } /** - * srpt_set_ioc() - Helper function for initializing an IOUnitInfo structure. - * + * srpt_set_ioc - initialize a IOUnitInfo structure + * @c_list: controller list. * @slot: one-based slot number. * @value: four-bit value. * @@ -241,7 +247,8 @@ static void srpt_set_ioc(u8 *c_list, u32 slot, u8 value) } /** - * srpt_get_class_port_info() - Copy ClassPortInfo to a management datagram. + * srpt_get_class_port_info - copy ClassPortInfo to a management datagram + * @mad: Datagram that will be sent as response to DM_ATTR_CLASS_PORT_INFO. * * See also section 16.3.3.1 ClassPortInfo in the InfiniBand Architecture * Specification. @@ -260,7 +267,8 @@ static void srpt_get_class_port_info(struct ib_dm_mad *mad) } /** - * srpt_get_iou() - Write IOUnitInfo to a management datagram. + * srpt_get_iou - write IOUnitInfo to a management datagram + * @mad: Datagram that will be sent as response to DM_ATTR_IOU_INFO. * * See also section 16.3.3.3 IOUnitInfo in the InfiniBand Architecture * Specification. See also section B.7, table B.6 in the SRP r16a document. @@ -284,7 +292,10 @@ static void srpt_get_iou(struct ib_dm_mad *mad) } /** - * srpt_get_ioc() - Write IOControllerprofile to a management datagram. + * srpt_get_ioc - write IOControllerprofile to a management datagram + * @sport: HCA port through which the MAD has been received. + * @slot: Slot number specified in DM_ATTR_IOC_PROFILE query. + * @mad: Datagram that will be sent as response to DM_ATTR_IOC_PROFILE. * * See also section 16.3.3.4 IOControllerProfile in the InfiniBand * Architecture Specification. See also section B.7, table B.7 in the SRP @@ -342,7 +353,12 @@ static void srpt_get_ioc(struct srpt_port *sport, u32 slot, } /** - * srpt_get_svc_entries() - Write ServiceEntries to a management datagram. + * srpt_get_svc_entries - write ServiceEntries to a management datagram + * @ioc_guid: I/O controller GUID to use in reply. + * @slot: I/O controller number. + * @hi: End of the range of service entries to be specified in the reply. + * @lo: Start of the range of service entries to be specified in the reply.. + * @mad: Datagram that will be sent as response to DM_ATTR_SVC_ENTRIES. * * See also section 16.3.3.5 ServiceEntries in the InfiniBand Architecture * Specification. See also section B.7, table B.8 in the SRP r16a document. 
@@ -379,8 +395,8 @@ static void srpt_get_svc_entries(u64 ioc_guid, } /** - * srpt_mgmt_method_get() - Process a received management datagram. - * @sp: source port through which the MAD has been received. + * srpt_mgmt_method_get - process a received management datagram + * @sp: HCA port through which the MAD has been received. * @rq_mad: received MAD. * @rsp_mad: response MAD. */ @@ -419,7 +435,9 @@ static void srpt_mgmt_method_get(struct srpt_port *sp, struct ib_mad *rq_mad, } /** - * srpt_mad_send_handler() - Post MAD-send callback function. + * srpt_mad_send_handler - MAD send completion callback + * @mad_agent: Return value of ib_register_mad_agent(). + * @mad_wc: Work completion reporting that the MAD has been sent. */ static void srpt_mad_send_handler(struct ib_mad_agent *mad_agent, struct ib_mad_send_wc *mad_wc) @@ -429,7 +447,10 @@ static void srpt_mad_send_handler(struct ib_mad_agent *mad_agent, } /** - * srpt_mad_recv_handler() - MAD reception callback function. + * srpt_mad_recv_handler - MAD reception callback function + * @mad_agent: Return value of ib_register_mad_agent(). + * @send_buf: Not used. + * @mad_wc: Work completion reporting that a MAD has been received. */ static void srpt_mad_recv_handler(struct ib_mad_agent *mad_agent, struct ib_mad_send_buf *send_buf, @@ -494,7 +515,8 @@ err: } /** - * srpt_refresh_port() - Configure a HCA port. + * srpt_refresh_port - configure a HCA port + * @sport: SRPT HCA port. * * Enable InfiniBand management datagram processing, update the cached sm_lid, * lid and gid values, and register a callback function for processing MADs @@ -577,7 +599,8 @@ err_mod_port: } /** - * srpt_unregister_mad_agent() - Unregister MAD callback functions. + * srpt_unregister_mad_agent - unregister MAD callback functions + * @sdev: SRPT HCA pointer. * * Note: It is safe to call this function more than once for the same device. */ @@ -602,7 +625,11 @@ static void srpt_unregister_mad_agent(struct srpt_device *sdev) } /** - * srpt_alloc_ioctx() - Allocate an SRPT I/O context structure. + * srpt_alloc_ioctx - allocate a SRPT I/O context structure + * @sdev: SRPT HCA pointer. + * @ioctx_size: I/O context size. + * @dma_size: Size of I/O context DMA buffer. + * @dir: DMA data direction. */ static struct srpt_ioctx *srpt_alloc_ioctx(struct srpt_device *sdev, int ioctx_size, int dma_size, @@ -633,7 +660,11 @@ err: } /** - * srpt_free_ioctx() - Free an SRPT I/O context structure. + * srpt_free_ioctx - free a SRPT I/O context structure + * @sdev: SRPT HCA pointer. + * @ioctx: I/O context pointer. + * @dma_size: Size of I/O context DMA buffer. + * @dir: DMA data direction. */ static void srpt_free_ioctx(struct srpt_device *sdev, struct srpt_ioctx *ioctx, int dma_size, enum dma_data_direction dir) @@ -647,7 +678,7 @@ static void srpt_free_ioctx(struct srpt_device *sdev, struct srpt_ioctx *ioctx, } /** - * srpt_alloc_ioctx_ring() - Allocate a ring of SRPT I/O context structures. + * srpt_alloc_ioctx_ring - allocate a ring of SRPT I/O context structures * @sdev: Device to allocate the I/O context ring for. * @ring_size: Number of elements in the I/O context ring. * @ioctx_size: I/O context size. @@ -685,7 +716,12 @@ out: } /** - * srpt_free_ioctx_ring() - Free the ring of SRPT I/O context structures. + * srpt_free_ioctx_ring - free the ring of SRPT I/O context structures + * @ioctx_ring: I/O context ring to be freed. + * @sdev: SRPT HCA pointer. + * @ring_size: Number of ring elements. + * @dma_size: Size of I/O context DMA buffer. + * @dir: DMA data direction. 
*/ static void srpt_free_ioctx_ring(struct srpt_ioctx **ioctx_ring, struct srpt_device *sdev, int ring_size, @@ -702,7 +738,8 @@ static void srpt_free_ioctx_ring(struct srpt_ioctx **ioctx_ring, } /** - * srpt_get_cmd_state() - Get the state of a SCSI command. + * srpt_get_cmd_state - get the state of a SCSI command + * @ioctx: Send I/O context. */ static enum srpt_command_state srpt_get_cmd_state(struct srpt_send_ioctx *ioctx) { @@ -718,7 +755,9 @@ static enum srpt_command_state srpt_get_cmd_state(struct srpt_send_ioctx *ioctx) } /** - * srpt_set_cmd_state() - Set the state of a SCSI command. + * srpt_set_cmd_state - set the state of a SCSI command + * @ioctx: Send I/O context. + * @new: New I/O context state. * * Does not modify the state of aborted commands. Returns the previous command * state. @@ -741,7 +780,10 @@ static enum srpt_command_state srpt_set_cmd_state(struct srpt_send_ioctx *ioctx, } /** - * srpt_test_and_set_cmd_state() - Test and set the state of a command. + * srpt_test_and_set_cmd_state - test and set the state of a command + * @ioctx: Send I/O context. + * @old: Current I/O context state. + * @new: New I/O context state. * * Returns true if and only if the previous command state was equal to 'old'. */ @@ -765,7 +807,10 @@ static bool srpt_test_and_set_cmd_state(struct srpt_send_ioctx *ioctx, } /** - * srpt_post_recv() - Post an IB receive request. + * srpt_post_recv - post an IB receive request + * @sdev: SRPT HCA pointer. + * @ch: SRPT RDMA channel. + * @ioctx: Receive I/O context pointer. */ static int srpt_post_recv(struct srpt_device *sdev, struct srpt_rdma_ch *ch, struct srpt_recv_ioctx *ioctx) @@ -791,7 +836,8 @@ static int srpt_post_recv(struct srpt_device *sdev, struct srpt_rdma_ch *ch, } /** - * srpt_zerolength_write() - Perform a zero-length RDMA write. + * srpt_zerolength_write - perform a zero-length RDMA write + * @ch: SRPT RDMA channel. * * A quote from the InfiniBand specification: C9-88: For an HCA responder * using Reliable Connection service, for each zero-length RDMA READ or WRITE @@ -928,11 +974,13 @@ static inline void *srpt_get_desc_buf(struct srp_cmd *srp_cmd) } /** - * srpt_get_desc_tbl() - Parse the data descriptors of an SRP_CMD request. + * srpt_get_desc_tbl - parse the data descriptors of a SRP_CMD request * @ioctx: Pointer to the I/O context associated with the request. * @srp_cmd: Pointer to the SRP_CMD request data. * @dir: Pointer to the variable to which the transfer direction will be * written. + * @sg: [out] scatterlist allocated for the parsed SRP_CMD. + * @sg_cnt: [out] length of @sg. * @data_len: Pointer to the variable to which the total data length of all * descriptors in the SRP_CMD request will be written. * @@ -998,7 +1046,9 @@ static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx, } /** - * srpt_init_ch_qp() - Initialize queue pair attributes. + * srpt_init_ch_qp - initialize queue pair attributes + * @ch: SRPT RDMA channel. + * @qp: Queue pair pointer. * * Initialized the attributes of queue pair 'qp' by allowing local write, * remote read and remote write. Also transitions 'qp' to state IB_QPS_INIT. @@ -1026,7 +1076,7 @@ static int srpt_init_ch_qp(struct srpt_rdma_ch *ch, struct ib_qp *qp) } /** - * srpt_ch_qp_rtr() - Change the state of a channel to 'ready to receive' (RTR). + * srpt_ch_qp_rtr - change the state of a channel to 'ready to receive' (RTR) * @ch: channel of the queue pair. * @qp: queue pair to change the state of. 
* @@ -1056,7 +1106,7 @@ out: } /** - * srpt_ch_qp_rts() - Change the state of a channel to 'ready to send' (RTS). + * srpt_ch_qp_rts - change the state of a channel to 'ready to send' (RTS) * @ch: channel of the queue pair. * @qp: queue pair to change the state of. * @@ -1086,7 +1136,8 @@ out: } /** - * srpt_ch_qp_err() - Set the channel queue pair state to 'error'. + * srpt_ch_qp_err - set the channel queue pair state to 'error' + * @ch: SRPT RDMA channel. */ static int srpt_ch_qp_err(struct srpt_rdma_ch *ch) { @@ -1097,7 +1148,8 @@ static int srpt_ch_qp_err(struct srpt_rdma_ch *ch) } /** - * srpt_get_send_ioctx() - Obtain an I/O context for sending to the initiator. + * srpt_get_send_ioctx - obtain an I/O context for sending to the initiator + * @ch: SRPT RDMA channel. */ static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch) { @@ -1135,9 +1187,8 @@ static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch) } /** - * srpt_abort_cmd() - Abort a SCSI command. + * srpt_abort_cmd - abort a SCSI command * @ioctx: I/O context associated with the SCSI command. - * @context: Preferred execution context. */ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) { @@ -1205,6 +1256,10 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) } /** + * srpt_rdma_read_done - RDMA read completion callback + * @cq: Completion queue. + * @wc: Work completion. + * * XXX: what is now target_execute_cmd used to be asynchronous, and unmapping * the data that has been transferred via IB RDMA had to be postponed until the * check_stop_free() callback. None of this is necessary anymore and needs to @@ -1236,7 +1291,7 @@ static void srpt_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc) } /** - * srpt_build_cmd_rsp() - Build an SRP_RSP response. + * srpt_build_cmd_rsp - build a SRP_RSP response * @ch: RDMA channel through which the request has been received. * @ioctx: I/O context associated with the SRP_CMD request. The response will * be built in the buffer ioctx->buf points at and hence this function will @@ -1296,7 +1351,7 @@ static int srpt_build_cmd_rsp(struct srpt_rdma_ch *ch, } /** - * srpt_build_tskmgmt_rsp() - Build a task management response. + * srpt_build_tskmgmt_rsp - build a task management response * @ch: RDMA channel through which the request has been received. * @ioctx: I/O context in which the SRP_RSP response will be built. * @rsp_code: RSP_CODE that will be stored in the response. @@ -1344,7 +1399,10 @@ static int srpt_check_stop_free(struct se_cmd *cmd) } /** - * srpt_handle_cmd() - Process SRP_CMD. + * srpt_handle_cmd - process a SRP_CMD information unit + * @ch: SRPT RDMA channel. + * @recv_ioctx: Receive I/O context. + * @send_ioctx: Send I/O context. */ static void srpt_handle_cmd(struct srpt_rdma_ch *ch, struct srpt_recv_ioctx *recv_ioctx, @@ -1426,7 +1484,10 @@ static int srp_tmr_to_tcm(int fn) } /** - * srpt_handle_tsk_mgmt() - Process an SRP_TSK_MGMT information unit. + * srpt_handle_tsk_mgmt - process a SRP_TSK_MGMT information unit + * @ch: SRPT RDMA channel. + * @recv_ioctx: Receive I/O context. + * @send_ioctx: Send I/O context. * * Returns 0 if and only if the request will be processed by the target core. * @@ -1469,9 +1530,10 @@ fail: } /** - * srpt_handle_new_iu() - Process a newly received information unit. + * srpt_handle_new_iu - process a newly received information unit * @ch: RDMA channel through which the information unit has been received. - * @ioctx: SRPT I/O context associated with the information unit. 
+ * @recv_ioctx: Receive I/O context associated with the information unit. + * @send_ioctx: Send I/O context. */ static void srpt_handle_new_iu(struct srpt_rdma_ch *ch, struct srpt_recv_ioctx *recv_ioctx, @@ -1577,6 +1639,10 @@ static void srpt_process_wait_list(struct srpt_rdma_ch *ch) } /** + * srpt_send_done - send completion callback + * @cq: Completion queue. + * @wc: Work completion. + * * Note: Although this has not yet been observed during tests, at least in * theory it is possible that the srpt_get_send_ioctx() call invoked by * srpt_handle_new_iu() fails. This is possible because the req_lim_delta @@ -1618,7 +1684,8 @@ static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc) } /** - * srpt_create_ch_ib() - Create receive and send completion queues. + * srpt_create_ch_ib - create receive and send completion queues + * @ch: SRPT RDMA channel. */ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch) { @@ -1717,7 +1784,8 @@ static void srpt_destroy_ch_ib(struct srpt_rdma_ch *ch) } /** - * srpt_close_ch() - Close an RDMA channel. + * srpt_close_ch - close a RDMA channel + * @ch: SRPT RDMA channel. * * Make sure all resources associated with the channel will be deallocated at * an appropriate time. @@ -1900,7 +1968,10 @@ static void srpt_release_channel_work(struct work_struct *w) } /** - * srpt_cm_req_recv() - Process the event IB_CM_REQ_RECEIVED. + * srpt_cm_req_recv - process the event IB_CM_REQ_RECEIVED + * @cm_id: IB/CM connection identifier. + * @param: IB/CM REQ parameters. + * @private_data: IB/CM REQ private data. * * Ownership of the cm_id is transferred to the target session if this * functions returns zero. Otherwise the caller remains the owner of cm_id. @@ -2205,7 +2276,8 @@ static void srpt_cm_rej_recv(struct srpt_rdma_ch *ch, } /** - * srpt_cm_rtu_recv() - Process an IB_CM_RTU_RECEIVED or USER_ESTABLISHED event. + * srpt_cm_rtu_recv - process an IB_CM_RTU_RECEIVED or USER_ESTABLISHED event + * @ch: SRPT RDMA channel. * * An IB_CM_RTU_RECEIVED message indicates that the connection is established * and that the recipient may begin transmitting (RTU = ready to use). @@ -2228,7 +2300,9 @@ static void srpt_cm_rtu_recv(struct srpt_rdma_ch *ch) } /** - * srpt_cm_handler() - IB connection manager callback function. + * srpt_cm_handler - IB connection manager callback function + * @cm_id: IB/CM connection identifier. + * @event: IB/CM event. * * A non-zero return value will cause the caller destroy the CM ID. * @@ -2297,7 +2371,7 @@ static int srpt_write_pending_status(struct se_cmd *se_cmd) } /* - * srpt_write_pending() - Start data transfer from initiator to target (write). + * srpt_write_pending - Start data transfer from initiator to target (write). */ static int srpt_write_pending(struct se_cmd *se_cmd) { @@ -2354,7 +2428,8 @@ static u8 tcm_to_srp_tsk_mgmt_status(const int tcm_mgmt_status) } /** - * srpt_queue_response() - Transmits the response to a SCSI command. + * srpt_queue_response - transmit the response to a SCSI command + * @cmd: SCSI target command. * * Callback function called by the TCM core. Must not block since it can be * invoked on the context of the IB completion handler. @@ -2494,7 +2569,8 @@ static void srpt_refresh_port_work(struct work_struct *work) } /** - * srpt_release_sdev() - Free the channel resources associated with a target. + * srpt_release_sdev - disable login and wait for associated channels + * @sdev: SRPT HCA pointer. 
*/ static int srpt_release_sdev(struct srpt_device *sdev) { @@ -2622,7 +2698,8 @@ static int srpt_use_srq(struct srpt_device *sdev, bool use_srq) } /** - * srpt_add_one() - Infiniband device addition callback function. + * srpt_add_one - InfiniBand device addition callback function + * @device: Describes a HCA. */ static void srpt_add_one(struct ib_device *device) { @@ -2720,7 +2797,9 @@ err: } /** - * srpt_remove_one() - InfiniBand device removal callback function. + * srpt_remove_one - InfiniBand device removal callback function + * @device: Describes a HCA. + * @client_data: The value passed as the third argument to ib_set_client_data(). */ static void srpt_remove_one(struct ib_device *device, void *client_data) { @@ -2826,7 +2905,8 @@ static void srpt_release_cmd(struct se_cmd *se_cmd) } /** - * srpt_close_session() - Forcibly close a session. + * srpt_close_session - forcibly close a session + * @se_sess: SCSI target session. * * Callback function invoked by the TCM core to clean up sessions associated * with a node ACL when the user invokes @@ -2843,7 +2923,8 @@ static void srpt_close_session(struct se_session *se_sess) } /** - * srpt_sess_get_index() - Return the value of scsiAttIntrPortIndex (SCSI-MIB). + * srpt_sess_get_index - return the value of scsiAttIntrPortIndex (SCSI-MIB) + * @se_sess: SCSI target session. * * A quote from RFC 4455 (SCSI-MIB) about this MIB object: * This object represents an arbitrary integer used to uniquely identify a @@ -2882,7 +2963,7 @@ out: } /** - * srpt_parse_i_port_id() - Parse an initiator port ID. + * srpt_parse_i_port_id - parse an initiator port ID * @name: ASCII representation of a 128-bit initiator port ID. * @i_port_id: Binary 128-bit port ID. */ @@ -3133,8 +3214,10 @@ static struct configfs_attribute *srpt_tpg_attrs[] = { }; /** - * configfs callback invoked for - * mkdir /sys/kernel/config/target/$driver/$port/$tpg + * srpt_make_tpg - configfs callback invoked for mkdir /sys/kernel/config/target/$driver/$port/$tpg + * @wwn: Corresponds to $driver/$port. + * @group: Not used. + * @name: $tpg. */ static struct se_portal_group *srpt_make_tpg(struct se_wwn *wwn, struct config_group *group, @@ -3156,8 +3239,8 @@ static struct se_portal_group *srpt_make_tpg(struct se_wwn *wwn, } /** - * configfs callback invoked for - * rmdir /sys/kernel/config/target/$driver/$port/$tpg + * srpt_drop_tpg - configfs callback invoked for rmdir /sys/kernel/config/target/$driver/$port/$tpg + * @tpg: Target portal group to deregister. */ static void srpt_drop_tpg(struct se_portal_group *tpg) { @@ -3168,8 +3251,10 @@ static void srpt_drop_tpg(struct se_portal_group *tpg) } /** - * configfs callback invoked for - * mkdir /sys/kernel/config/target/$driver/$port + * srpt_make_tport - configfs callback invoked for mkdir /sys/kernel/config/target/$driver/$port + * @tf: Not used. + * @group: Not used. + * @name: $port. */ static struct se_wwn *srpt_make_tport(struct target_fabric_configfs *tf, struct config_group *group, @@ -3179,8 +3264,8 @@ static struct se_wwn *srpt_make_tport(struct target_fabric_configfs *tf, } /** - * configfs callback invoked for - * rmdir /sys/kernel/config/target/$driver/$port + * srpt_drop_tport - configfs callback invoked for rmdir /sys/kernel/config/target/$driver/$port + * @wwn: $port. */ static void srpt_drop_tport(struct se_wwn *wwn) { @@ -3238,7 +3323,7 @@ static const struct target_core_fabric_ops srpt_template = { }; /** - * srpt_init_module() - Kernel module initialization. 
+ * srpt_init_module - kernel module initialization * * Note: Since ib_register_client() registers callback functions, and since at * least one of these callback functions (srpt_add_one()) calls target core -- cgit v1.2.3 From f8781a53001b11ba8277da6566f352a61229b667 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 8 Jan 2018 11:00:42 -0800 Subject: IB/srpt: Document all structure members in ib_srpt.h This patch avoids that the following command reports any warnings: scripts/kernel-doc -none drivers/infiniband/ulp/srpt/ib_srpt.h Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.h | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index 7fff73a5d1e6..0c94cd6a7e9d 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -134,7 +134,7 @@ enum { }; /** - * enum srpt_command_state - SCSI command state managed by SRPT. + * enum srpt_command_state - SCSI command state managed by SRPT * @SRPT_STATE_NEW: New command arrived and is being processed. * @SRPT_STATE_NEED_DATA: Processing a write or bidir command and waiting * for data arrival. @@ -158,7 +158,8 @@ enum srpt_command_state { }; /** - * struct srpt_ioctx - Shared SRPT I/O context information. + * struct srpt_ioctx - shared SRPT I/O context information + * @cqe: Completion queue element. * @buf: Pointer to the buffer. * @dma: DMA address of the buffer. * @index: Index of the I/O context in its ioctx_ring array. @@ -171,7 +172,7 @@ struct srpt_ioctx { }; /** - * struct srpt_recv_ioctx - SRPT receive I/O context. + * struct srpt_recv_ioctx - SRPT receive I/O context * @ioctx: See above. * @wait_list: Node for insertion in srpt_rdma_ch.cmd_wait_list. */ @@ -187,13 +188,21 @@ struct srpt_rw_ctx { }; /** - * struct srpt_send_ioctx - SRPT send I/O context. + * struct srpt_send_ioctx - SRPT send I/O context * @ioctx: See above. * @ch: Channel pointer. + * @s_rw_ctx: @rw_ctxs points here if only a single rw_ctx is needed. + * @rw_ctxs: RDMA read/write contexts. + * @rdma_cqe: RDMA completion queue element. + * @free_list: Node in srpt_rdma_ch.free_list. * @spinlock: Protects 'state'. * @state: I/O context state. * @cmd: Target core command data structure. * @sense_data: SCSI sense data. + * @n_rdma: Number of work requests needed to transfer this ioctx. + * @n_rw_ctx: Size of rw_ctxs array. + * @queue_status_only: Send a SCSI status back to the initiator but no data. + * @sense_data: Sense data to be sent to the initiator. */ struct srpt_send_ioctx { struct srpt_ioctx ioctx; @@ -214,7 +223,7 @@ struct srpt_send_ioctx { }; /** - * enum rdma_ch_state - SRP channel state. + * enum rdma_ch_state - SRP channel state * @CH_CONNECTING: QP is in RTR state; waiting for RTU. * @CH_LIVE: QP is in RTS state. * @CH_DISCONNECTING: DREQ has been sent and waiting for DREP or DREQ has @@ -232,12 +241,14 @@ enum rdma_ch_state { }; /** - * struct srpt_rdma_ch - RDMA channel. + * struct srpt_rdma_ch - RDMA channel * @cm_id: IB CM ID associated with the channel. * @qp: IB queue pair used for communicating over this channel. * @cq: IB completion queue for this channel. + * @zw_cqe: Zero-length write CQE. + * @kref: kref for this channel. * @rq_size: IB receive queue size. - * @rsp_size IB response message size in bytes. + * @rsp_size: IB response message size in bytes. * @sq_wr_avail: number of work requests available in the send queue. 
* @sport: pointer to the information of the HCA port used by this * channel. @@ -292,7 +303,7 @@ struct srpt_rdma_ch { }; /** - * struct srpt_port_attib - Attributes for SRPT port + * struct srpt_port_attib - attributes for SRPT port * @srp_max_rdma_size: Maximum size of SRP RDMA transfers for new connections. * @srp_max_rsp_size: Maximum size of SRP response messages in bytes. * @srp_sq_size: Shared receive queue (SRQ) size. @@ -306,7 +317,7 @@ struct srpt_port_attrib { }; /** - * struct srpt_port - Information associated by SRPT with a single IB port. + * struct srpt_port - information associated by SRPT with a single IB port * @sdev: backpointer to the HCA information. * @mad_agent: per-port management datagram processing information. * @enabled: Whether or not this target port is enabled. @@ -322,7 +333,7 @@ struct srpt_port_attrib { * @port_guid_wwn: WWN associated with target port GUID. * @port_gid_tpg: TPG associated with target port GID. * @port_gid_wwn: WWN associated with target port GID. - * @port_acl_list: Head of the list with all node ACLs for this port. + * @port_attrib: Port attributes that can be accessed through configfs. */ struct srpt_port { struct srpt_device *sdev; @@ -343,7 +354,7 @@ struct srpt_port { }; /** - * struct srpt_device - Information associated by SRPT with a single HCA. + * struct srpt_device - information associated by SRPT with a single HCA * @device: Backpointer to the struct ib_device managed by the IB core. * @pd: IB protection domain. * @lkey: L_Key (local key) with write access to all local memory. -- cgit v1.2.3 From ed262287e2b46927905a41e86100a63dc2327dac Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 8 Jan 2018 11:00:43 -0800 Subject: IB/srpt: Rename a local variable, a member variable and a constant Rename rsp_size into max_rsp_size and SRPT_RQ_SIZE into MAX_SRPT_RQ_SIZE. The new names better reflect the role of this member variable and constant. Since the prefix "srp_" is superfluous in the context of the function that creates an RDMA channel, rename srp_sq_size into sq_size. 
Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 26 +++++++++++++------------- drivers/infiniband/ulp/srpt/ib_srpt.h | 6 +++--- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 1bcf8b3b6095..4fb884c070af 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -325,7 +325,7 @@ static void srpt_get_ioc(struct srpt_port *sport, u32 slot, if (sdev->use_srq) send_queue_depth = sdev->srq_size; else - send_queue_depth = min(SRPT_RQ_SIZE, + send_queue_depth = min(MAX_SRPT_RQ_SIZE, sdev->device->attrs.max_qp_wr); memset(iocp, 0, sizeof(*iocp)); @@ -1693,7 +1693,7 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch) struct srpt_port *sport = ch->sport; struct srpt_device *sdev = sport->sdev; const struct ib_device_attr *attrs = &sdev->device->attrs; - u32 srp_sq_size = sport->port_attrib.srp_sq_size; + int sq_size = sport->port_attrib.srp_sq_size; int i, ret; WARN_ON(ch->rq_size < 1); @@ -1704,12 +1704,12 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch) goto out; retry: - ch->cq = ib_alloc_cq(sdev->device, ch, ch->rq_size + srp_sq_size, + ch->cq = ib_alloc_cq(sdev->device, ch, ch->rq_size + sq_size, 0 /* XXX: spread CQs */, IB_POLL_WORKQUEUE); if (IS_ERR(ch->cq)) { ret = PTR_ERR(ch->cq); pr_err("failed to create CQ cqe= %d ret= %d\n", - ch->rq_size + srp_sq_size, ret); + ch->rq_size + sq_size, ret); goto out; } @@ -1727,8 +1727,8 @@ retry: * both both, as RDMA contexts will also post completions for the * RDMA READ case. */ - qp_init->cap.max_send_wr = min(srp_sq_size / 2, attrs->max_qp_wr + 0U); - qp_init->cap.max_rdma_ctxs = srp_sq_size / 2; + qp_init->cap.max_send_wr = min(sq_size / 2, attrs->max_qp_wr); + qp_init->cap.max_rdma_ctxs = sq_size / 2; qp_init->cap.max_send_sge = min(attrs->max_sge, SRPT_MAX_SG_PER_WQE); qp_init->port_num = ch->sport->port; if (sdev->use_srq) { @@ -1742,8 +1742,8 @@ retry: if (IS_ERR(ch->qp)) { ret = PTR_ERR(ch->qp); if (ret == -ENOMEM) { - srp_sq_size /= 2; - if (srp_sq_size >= MIN_SRPT_SQ_SIZE) { + sq_size /= 2; + if (sq_size >= MIN_SRPT_SQ_SIZE) { ib_destroy_cq(ch->cq); goto retry; } @@ -1950,7 +1950,7 @@ static void srpt_release_channel_work(struct work_struct *w) srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring, ch->sport->sdev, ch->rq_size, - ch->rsp_size, DMA_TO_DEVICE); + ch->max_rsp_size, DMA_TO_DEVICE); srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_recv_ring, sdev, ch->rq_size, @@ -2098,16 +2098,16 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, * depth to avoid that the initiator driver has to report QUEUE_FULL * to the SCSI mid-layer. 
*/ - ch->rq_size = min(SRPT_RQ_SIZE, sdev->device->attrs.max_qp_wr); + ch->rq_size = min(MAX_SRPT_RQ_SIZE, sdev->device->attrs.max_qp_wr); spin_lock_init(&ch->spinlock); ch->state = CH_CONNECTING; INIT_LIST_HEAD(&ch->cmd_wait_list); - ch->rsp_size = ch->sport->port_attrib.srp_max_rsp_size; + ch->max_rsp_size = ch->sport->port_attrib.srp_max_rsp_size; ch->ioctx_ring = (struct srpt_send_ioctx **) srpt_alloc_ioctx_ring(ch->sport->sdev, ch->rq_size, sizeof(*ch->ioctx_ring[0]), - ch->rsp_size, DMA_TO_DEVICE); + ch->max_rsp_size, DMA_TO_DEVICE); if (!ch->ioctx_ring) goto free_ch; @@ -2235,7 +2235,7 @@ free_recv_ring: free_ring: srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring, ch->sport->sdev, ch->rq_size, - ch->rsp_size, DMA_TO_DEVICE); + ch->max_rsp_size, DMA_TO_DEVICE); free_ch: kfree(ch); diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index 0c94cd6a7e9d..8f90a7ca98e6 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -114,7 +114,7 @@ enum { MIN_SRPT_SQ_SIZE = 16, DEF_SRPT_SQ_SIZE = 4096, - SRPT_RQ_SIZE = 128, + MAX_SRPT_RQ_SIZE = 128, MIN_SRPT_SRQ_SIZE = 4, DEFAULT_SRPT_SRQ_SIZE = 4095, MAX_SRPT_SRQ_SIZE = 65535, @@ -248,7 +248,7 @@ enum rdma_ch_state { * @zw_cqe: Zero-length write CQE. * @kref: kref for this channel. * @rq_size: IB receive queue size. - * @rsp_size: IB response message size in bytes. + * @max_rsp_size: Maximum size of an RSP response message in bytes. * @sq_wr_avail: number of work requests available in the send queue. * @sport: pointer to the information of the HCA port used by this * channel. @@ -280,7 +280,7 @@ struct srpt_rdma_ch { struct ib_cqe zw_cqe; struct kref kref; int rq_size; - u32 rsp_size; + u32 max_rsp_size; atomic_t sq_wr_avail; struct srpt_port *sport; u8 i_port_id[16]; -- cgit v1.2.3 From d9f45ae69ff14e672c9b1d8cb42383d4cc2a6878 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 8 Jan 2018 11:00:44 -0800 Subject: IB/srpt: Reduce the severity level of a log message Since the SRQ event message is only useful for debugging purposes, reduce its severity from "informational" to "debug". Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 4fb884c070af..a71664fe939b 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -177,7 +177,7 @@ static void srpt_event_handler(struct ib_event_handler *handler, */ static void srpt_srq_event(struct ib_event *event, void *ctx) { - pr_info("SRQ event %d\n", event->event); + pr_debug("SRQ event %d\n", event->event); } static const char *get_ch_state_name(enum rdma_ch_state s) -- cgit v1.2.3 From 5f1141b165cc0c7dc1be9143461127dd856615da Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 8 Jan 2018 11:00:45 -0800 Subject: IB/srpt: Verify port numbers in srpt_event_handler() Verify whether port numbers are in the expected range before using these as an array index. Complain if a port number is out of range. 
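The underlying pattern in the diff below maps the 1-based IB port number to a 0-based array index and rejects anything outside the device's physical port count. A minimal sketch of that check as a hypothetical helper (the real checks are open-coded in srpt_event_handler()):

/* Hypothetical helper: validate a 1-based port number before indexing. */
static struct srpt_port *srpt_get_sport(struct srpt_device *sdev, u8 port_num)
{
	u8 idx = port_num - 1;	/* IB port numbers start at 1 */

	/* A port_num of 0 wraps to 255 and is rejected by the same test. */
	if (idx >= sdev->device->phys_port_cnt) {
		WARN(true, "port_num %d out of range 1..%d\n",
		     port_num, sdev->device->phys_port_cnt);
		return NULL;
	}
	return &sdev->port[idx];
}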
Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index a71664fe939b..c5efe0afa2c8 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -134,6 +134,7 @@ static void srpt_event_handler(struct ib_event_handler *handler, { struct srpt_device *sdev; struct srpt_port *sport; + u8 port_num; sdev = ib_get_client_data(event->device, &srpt_client); if (!sdev || sdev->device != event->device) @@ -144,10 +145,15 @@ static void srpt_event_handler(struct ib_event_handler *handler, switch (event->event) { case IB_EVENT_PORT_ERR: - if (event->element.port_num <= sdev->device->phys_port_cnt) { - sport = &sdev->port[event->element.port_num - 1]; + port_num = event->element.port_num - 1; + if (port_num < sdev->device->phys_port_cnt) { + sport = &sdev->port[port_num]; sport->lid = 0; sport->sm_lid = 0; + } else { + WARN(true, "event %d: port_num %d out of range 1..%d\n", + event->event, port_num + 1, + sdev->device->phys_port_cnt); } break; case IB_EVENT_PORT_ACTIVE: @@ -157,15 +163,19 @@ static void srpt_event_handler(struct ib_event_handler *handler, case IB_EVENT_CLIENT_REREGISTER: case IB_EVENT_GID_CHANGE: /* Refresh port data asynchronously. */ - if (event->element.port_num <= sdev->device->phys_port_cnt) { - sport = &sdev->port[event->element.port_num - 1]; + port_num = event->element.port_num - 1; + if (port_num < sdev->device->phys_port_cnt) { + sport = &sdev->port[port_num]; if (!sport->lid && !sport->sm_lid) schedule_work(&sport->work); + } else { + WARN(true, "event %d: port_num %d out of range 1..%d\n", + event->event, port_num + 1, + sdev->device->phys_port_cnt); } break; default: - pr_err("received unrecognized IB event %d\n", - event->event); + pr_err("received unrecognized IB event %d\n", event->event); break; } } -- cgit v1.2.3 From b185d3f895632f1091513405ce80a9455cb5bf30 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 8 Jan 2018 11:00:46 -0800 Subject: IB/srpt: Use the IPv6 format for GIDs in log messages Make the ib_srpt driver use the IPv6 format for GIDs in log messages to improve consistency of this driver with other RDMA kernel drivers. 
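For reference, %pI6 takes a pointer to 16 bytes and prints them as a colon-separated IPv6-style address, so a GID can be logged without the manual be64_to_cpu() unpacking removed in the diff below. A minimal, hypothetical usage sketch:

/* Hypothetical sketch: log a 16-byte GID with the IPv6 format specifier. */
static void srpt_log_gid(const char *msg, const union ib_gid *gid)
{
	pr_info("%s: GID %pI6\n", msg, gid->raw);
}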
Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index c5efe0afa2c8..25c5e188c740 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -2010,17 +2010,9 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, it_iu_len = be32_to_cpu(req->req_it_iu_len); - pr_info("Received SRP_LOGIN_REQ with i_port_id 0x%llx:0x%llx," - " t_port_id 0x%llx:0x%llx and it_iu_len %d on port %d" - " (guid=0x%llx:0x%llx)\n", - be64_to_cpu(*(__be64 *)&req->initiator_port_id[0]), - be64_to_cpu(*(__be64 *)&req->initiator_port_id[8]), - be64_to_cpu(*(__be64 *)&req->target_port_id[0]), - be64_to_cpu(*(__be64 *)&req->target_port_id[8]), - it_iu_len, - param->port, - be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[0]), - be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[8])); + pr_info("Received SRP_LOGIN_REQ with i_port_id %pI6, t_port_id %pI6 and it_iu_len %d on port %d (guid=%pI6)\n", + req->initiator_port_id, req->target_port_id, it_iu_len, + param->port, &sport->gid); rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); rej = kzalloc(sizeof(*rej), GFP_KERNEL); -- cgit v1.2.3 From ea51d2e12aac9205291b0ec88ffb7d33da7eafb5 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 8 Jan 2018 11:00:47 -0800 Subject: IB/srpt: Convert a warning into a debug message At least when running the ib_srpt driver on top of the rdma_rxe driver it is easy to trigger a zero-length write completion in the CH_DISCONNECTED state. Hence make the message that reports this less noisy. Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 25c5e188c740..31b6f108dffb 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -875,7 +875,8 @@ static void srpt_zerolength_write_done(struct ib_cq *cq, struct ib_wc *wc) if (srpt_set_ch_state(ch, CH_DISCONNECTED)) schedule_work(&ch->release_work); else - WARN_ONCE(1, "%s-%d\n", ch->sess_name, ch->qp->qp_num); + pr_debug("%s-%d: already disconnected.\n", + ch->sess_name, ch->qp->qp_num); } } -- cgit v1.2.3 From 3fa7f0e99446f1179cbb2742e5bc4db097ad78b5 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 8 Jan 2018 11:00:48 -0800 Subject: IB/srpt: Reduce frequency of receive failure messages Disabling an SRP target port causes the state of all QPs associated with a port to be changed into IB_QPS_ERR. Avoid having this generate one error message per I/O context.
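Since a disabled port flushes every outstanding receive at once, the diff below switches to the ratelimited printk variant, which caps the burst instead of silencing the message entirely. A hypothetical sketch of the pattern:

/* Hypothetical sketch: rate-limit a message that fires once per flushed WR.
 * By default this prints at most 10 messages per 5 second interval, then
 * logs how many callbacks were suppressed. */
static void srpt_report_recv_failure(void *ioctx, int status)
{
	pr_info_ratelimited("receiving failed for ioctx %p with status %d\n",
			    ioctx, status);
}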
Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 31b6f108dffb..6f5a4d66eacc 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -1622,8 +1622,8 @@ static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc) pr_err("req_lim = %d < 0\n", req_lim); srpt_handle_new_iu(ch, ioctx, NULL); } else { - pr_info("receiving failed for ioctx %p with status %d\n", - ioctx, wc->status); + pr_info_ratelimited("receiving failed for ioctx %p with status %d\n", + ioctx, wc->status); } } -- cgit v1.2.3 From b14cb74479a2606b0052cc909727938953a939ac Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 8 Jan 2018 11:00:49 -0800 Subject: IB/srpt: Introduce srpt_format_guid() Introduce a function for converting a GUID into an ASCII string. This patch does not change any functionality. Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 6f5a4d66eacc..e4c1446699a9 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -524,6 +524,15 @@ err: ib_free_recv_mad(mad_wc); } +static int srpt_format_guid(char *buf, unsigned int size, const __be64 *guid) +{ + const __be16 *g = (const __be16 *)guid; + + return snprintf(buf, size, "%04x:%04x:%04x:%04x", + be16_to_cpu(g[0]), be16_to_cpu(g[1]), + be16_to_cpu(g[2]), be16_to_cpu(g[3])); +} + /** * srpt_refresh_port - configure a HCA port * @sport: SRPT HCA port. 
@@ -539,7 +548,6 @@ static int srpt_refresh_port(struct srpt_port *sport) struct ib_mad_reg_req reg_req; struct ib_port_modify port_modify; struct ib_port_attr port_attr; - __be16 *guid; int ret; memset(&port_modify, 0, sizeof(port_modify)); @@ -563,11 +571,8 @@ static int srpt_refresh_port(struct srpt_port *sport) goto err_query_port; sport->port_guid_wwn.priv = sport; - guid = (__be16 *)&sport->gid.global.interface_id; - snprintf(sport->port_guid, sizeof(sport->port_guid), - "%04x:%04x:%04x:%04x", - be16_to_cpu(guid[0]), be16_to_cpu(guid[1]), - be16_to_cpu(guid[2]), be16_to_cpu(guid[3])); + srpt_format_guid(sport->port_guid, sizeof(sport->port_guid), + &sport->gid.global.interface_id); sport->port_gid_wwn.priv = sport; snprintf(sport->port_gid, sizeof(sport->port_gid), "0x%016llx%016llx", @@ -1998,7 +2003,6 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, struct srp_login_rej *rej; struct ib_cm_rep_param *rep_param; struct srpt_rdma_ch *ch, *tmp_ch; - __be16 *guid; u32 it_iu_len; int i, ret = 0; @@ -2150,10 +2154,8 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, goto destroy_ib; } - guid = (__be16 *)¶m->primary_path->dgid.global.interface_id; - snprintf(ch->ini_guid, sizeof(ch->ini_guid), "%04x:%04x:%04x:%04x", - be16_to_cpu(guid[0]), be16_to_cpu(guid[1]), - be16_to_cpu(guid[2]), be16_to_cpu(guid[3])); + srpt_format_guid(ch->ini_guid, sizeof(ch->ini_guid), + ¶m->primary_path->dgid.global.interface_id); snprintf(ch->sess_name, sizeof(ch->sess_name), "0x%016llx%016llx", be64_to_cpu(*(__be64 *)ch->i_port_id), be64_to_cpu(*(__be64 *)(ch->i_port_id + 8))); -- cgit v1.2.3 From dd3bec8655d6f760540afd2261ca7ae9b6888cb0 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 8 Jan 2018 11:00:50 -0800 Subject: IB/srpt: Inline srpt_get_cmd_state() It is not necessary to obtain ioctx->spinlock when reading the ioctx state. Since after removal of this locking only a single line remains, inline the srpt_get_cmd_state() function. Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index e4c1446699a9..d2835b0c15e8 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -752,23 +752,6 @@ static void srpt_free_ioctx_ring(struct srpt_ioctx **ioctx_ring, kfree(ioctx_ring); } -/** - * srpt_get_cmd_state - get the state of a SCSI command - * @ioctx: Send I/O context. - */ -static enum srpt_command_state srpt_get_cmd_state(struct srpt_send_ioctx *ioctx) -{ - enum srpt_command_state state; - unsigned long flags; - - BUG_ON(!ioctx); - - spin_lock_irqsave(&ioctx->spinlock, flags); - state = ioctx->state; - spin_unlock_irqrestore(&ioctx->spinlock, flags); - return state; -} - /** * srpt_set_cmd_state - set the state of a SCSI command * @ioctx: Send I/O context. 
@@ -1303,7 +1286,7 @@ static void srpt_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc) target_execute_cmd(&ioctx->cmd); else pr_err("%s[%d]: wrong state = %d\n", __func__, - __LINE__, srpt_get_cmd_state(ioctx)); + __LINE__, ioctx->state); } /** @@ -2372,7 +2355,7 @@ static int srpt_write_pending_status(struct se_cmd *se_cmd) struct srpt_send_ioctx *ioctx; ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd); - return srpt_get_cmd_state(ioctx) == SRPT_STATE_NEED_DATA; + return ioctx->state == SRPT_STATE_NEED_DATA; } /* @@ -2951,7 +2934,7 @@ static int srpt_get_tcm_cmd_state(struct se_cmd *se_cmd) struct srpt_send_ioctx *ioctx; ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd); - return srpt_get_cmd_state(ioctx); + return ioctx->state; } static int srpt_parse_guid(u64 *guid, const char *name) -- cgit v1.2.3 From 2d67017cc78f1607bac5347ce0c5258734796faf Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 8 Jan 2018 11:00:51 -0800 Subject: IB/srpt: Micro-optimize I/O context state manipulation Since all I/O context state changes are already serialized, it is not necessary to protect I/O context state changes with the I/O context spinlock. Hence remove that spinlock. Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 16 +--------------- drivers/infiniband/ulp/srpt/ib_srpt.h | 2 -- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index d2835b0c15e8..d78f60dcc2ba 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -764,15 +764,10 @@ static enum srpt_command_state srpt_set_cmd_state(struct srpt_send_ioctx *ioctx, enum srpt_command_state new) { enum srpt_command_state previous; - unsigned long flags; - - BUG_ON(!ioctx); - spin_lock_irqsave(&ioctx->spinlock, flags); previous = ioctx->state; if (previous != SRPT_STATE_DONE) ioctx->state = new; - spin_unlock_irqrestore(&ioctx->spinlock, flags); return previous; } @@ -790,17 +785,15 @@ static bool srpt_test_and_set_cmd_state(struct srpt_send_ioctx *ioctx, enum srpt_command_state new) { enum srpt_command_state previous; - unsigned long flags; WARN_ON(!ioctx); WARN_ON(old == SRPT_STATE_DONE); WARN_ON(new == SRPT_STATE_NEW); - spin_lock_irqsave(&ioctx->spinlock, flags); previous = ioctx->state; if (previous == old) ioctx->state = new; - spin_unlock_irqrestore(&ioctx->spinlock, flags); + return previous == old; } @@ -1170,7 +1163,6 @@ static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch) return ioctx; BUG_ON(ioctx->ch != ch); - spin_lock_init(&ioctx->spinlock); ioctx->state = SRPT_STATE_NEW; ioctx->n_rdma = 0; ioctx->n_rw_ctx = 0; @@ -1192,7 +1184,6 @@ static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch) static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) { enum srpt_command_state state; - unsigned long flags; BUG_ON(!ioctx); @@ -1201,7 +1192,6 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) * the ib_srpt driver, change the state to the next state. 
*/ - spin_lock_irqsave(&ioctx->spinlock, flags); state = ioctx->state; switch (state) { case SRPT_STATE_NEED_DATA: @@ -1216,7 +1206,6 @@ static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx) __func__, state); break; } - spin_unlock_irqrestore(&ioctx->spinlock, flags); pr_debug("Aborting cmd with state %d -> %d and tag %lld\n", state, ioctx->state, ioctx->cmd.tag); @@ -2431,13 +2420,11 @@ static void srpt_queue_response(struct se_cmd *cmd) struct ib_send_wr send_wr, *first_wr = &send_wr, *bad_wr; struct ib_sge sge; enum srpt_command_state state; - unsigned long flags; int resp_len, ret, i; u8 srp_tm_status; BUG_ON(!ch); - spin_lock_irqsave(&ioctx->spinlock, flags); state = ioctx->state; switch (state) { case SRPT_STATE_NEW: @@ -2452,7 +2439,6 @@ static void srpt_queue_response(struct se_cmd *cmd) ch, ioctx->ioctx.index, ioctx->state); break; } - spin_unlock_irqrestore(&ioctx->spinlock, flags); if (unlikely(WARN_ON_ONCE(state == SRPT_STATE_CMD_RSP_SENT))) return; diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index 8f90a7ca98e6..11ce8c94a051 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -195,7 +195,6 @@ struct srpt_rw_ctx { * @rw_ctxs: RDMA read/write contexts. * @rdma_cqe: RDMA completion queue element. * @free_list: Node in srpt_rdma_ch.free_list. - * @spinlock: Protects 'state'. * @state: I/O context state. * @cmd: Target core command data structure. * @sense_data: SCSI sense data. @@ -213,7 +212,6 @@ struct srpt_send_ioctx { struct ib_cqe rdma_cqe; struct list_head free_list; - spinlock_t spinlock; enum srpt_command_state state; struct se_cmd cmd; u8 n_rdma; -- cgit v1.2.3 From 91eab79653b89f92f266b411792f0a0d46857a3a Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Sun, 7 Jan 2018 07:08:48 -0500 Subject: IB/rxe: add the static type to the variable The variable recv_sockets is only used in the file rxe_net.c, so it is better to declare it static. CC: Srinivas Eeda CC: Joe Jin CC: Junxiao Bi Signed-off-by: Zhu Yanjun Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rxe/rxe_net.c | 2 +- drivers/infiniband/sw/rxe/rxe_net.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 59dee10bebcb..44d77807a69d 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -82,7 +82,7 @@ struct rxe_dev *get_rxe_by_name(const char *name) } -struct rxe_recv_sockets recv_sockets; +static struct rxe_recv_sockets recv_sockets; struct device *rxe_dma_device(struct rxe_dev *rxe) { diff --git a/drivers/infiniband/sw/rxe/rxe_net.h b/drivers/infiniband/sw/rxe/rxe_net.h index 1c06b3bfe1b6..728d8c71b36a 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.h +++ b/drivers/infiniband/sw/rxe/rxe_net.h @@ -43,7 +43,6 @@ struct rxe_recv_sockets { struct socket *sk6; }; -extern struct rxe_recv_sockets recv_sockets; extern struct notifier_block rxe_net_notifier; void rxe_release_udp_tunnel(struct socket *sk); -- cgit v1.2.3 From 5793b46521553774d66653993cfdc132846bb23d Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Mon, 8 Jan 2018 00:14:25 -0500 Subject: IB/rxe: remove unnecessary skb_clone in xmit In the xmit path there is a call to skb_clone, which copies the struct sk_buff. Some parameters are then changed on the new skb, and the new skb is sent while the old skb is freed.
With the skb_clone call removed, the parameter changes are made on the original skb, which is then sent directly. This works just as well. The following tests were run. server client --------- --------- |1.1.1.1|<----rxe-channel--->|1.1.1.2| --------- --------- On server: rping -s -a 1.1.1.1 -v -C 1000 -S 512 On client: rping -c -a 1.1.1.1 -v -C 1000 -S 512 The kernel config CONFIG_DEBUG_KMEMLEAK was enabled on both server and client. This test ran for several hours; there was no memory leak and the whole system worked well. On the same network, the following tests were also run. Server: ibv_rc_pingpong -d rxe0 -g 1 Client: ibv_rc_pingpong -d rxe0 -g 1 1.1.1.1 The results on the server: Before: 8192000 bytes in 0.88 seconds = 74.36 Mbit/sec 1000 iters in 0.88 seconds = 881.30 usec/iter After: 8192000 bytes in 0.81 seconds = 81.15 Mbit/sec 1000 iters in 0.81 seconds = 807.62 usec/iter Throughput is improved and latency is reduced. CC: Srinivas Eeda CC: Joe Jin CC: Junxiao Bi Signed-off-by: Zhu Yanjun Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rxe/rxe_net.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 44d77807a69d..159246b03867 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -452,31 +452,26 @@ static void rxe_skb_tx_dtor(struct sk_buff *skb) int rxe_send(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, struct sk_buff *skb) { - struct sk_buff *nskb; struct rxe_av *av; int err; av = rxe_get_av(pkt); - nskb = skb_clone(skb, GFP_ATOMIC); - if (!nskb) - return -ENOMEM; - - nskb->destructor = rxe_skb_tx_dtor; - nskb->sk = pkt->qp->sk->sk; + skb->destructor = rxe_skb_tx_dtor; + skb->sk = pkt->qp->sk->sk; rxe_add_ref(pkt->qp); atomic_inc(&pkt->qp->skb_out); if (av->network_type == RDMA_NETWORK_IPV4) { - err = ip_local_out(dev_net(skb_dst(skb)->dev), nskb->sk, nskb); + err = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); } else if (av->network_type == RDMA_NETWORK_IPV6) { - err = ip6_local_out(dev_net(skb_dst(skb)->dev), nskb->sk, nskb); + err = ip6_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); } else { pr_err("Unknown layer 3 protocol: %d\n", av->network_type); atomic_dec(&pkt->qp->skb_out); rxe_drop_ref(pkt->qp); - kfree_skb(nskb); + kfree_skb(skb); return -EINVAL; } @@ -485,7 +480,6 @@ int rxe_send(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, struct sk_buff *skb) return -EAGAIN; } - kfree_skb(skb); return 0; } -- cgit v1.2.3 From da005f9fab3af121a8a9ef17ff7e06ddbc8edf11 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 9 Jan 2018 15:55:43 +0000 Subject: IB/mlx5: remove redundant assignment of mdev The initial assignment to mdev is redundant as mdev is re-assigned later and the first assigned value is never read. Remove this redundant assignment.
Cleans up clang warning: drivers/infiniband/hw/mlx5/main.c:359:24: warning: Value stored to 'mdev' during its initialization is never read Signed-off-by: Colin Ian King Acked-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 5d6fba986fa5..91e6b42798e5 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -356,7 +356,7 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num, struct ib_port_attr *props) { struct mlx5_ib_dev *dev = to_mdev(device); - struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_core_dev *mdev; struct net_device *ndev, *upper; enum ib_mtu ndev_ib_mtu; bool put_mdev = true; -- cgit v1.2.3 From 6f301e06de4cf9ab7303f5acd43e64fcd4aa04be Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 9 Jan 2018 11:23:40 -0800 Subject: RDMA/rxe: Fix a race condition related to the QP error state The following sequence: * Change queue pair state into IB_QPS_ERR. * Post a work request on the queue pair. Triggers the following race condition in the rdma_rxe driver: * rxe_qp_error() triggers an asynchronous call of rxe_completer(), the function that examines the QP send queue. * rxe_post_send() posts a work request on the QP send queue. If rxe_completer() runs prior to rxe_post_send(), it will drain the send queue and the driver will assume no further action is necessary. However, once we post the send to the send queue, because the queue is in error, no send completion will ever happen and the send will get stuck. In order to process the send, we need to make sure that rxe_completer() gets run after a send is posted to a queue pair in an error state. This patch ensures that happens. Signed-off-by: Bart Van Assche Cc: Moni Shoua Cc: # v4.8 Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rxe/rxe_verbs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index d03002b9d84d..7210a784abb4 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -814,6 +814,8 @@ static int rxe_post_send_kernel(struct rxe_qp *qp, struct ib_send_wr *wr, (queue_count(qp->sq.queue) > 1); rxe_run_task(&qp->req.task, must_sched); + if (unlikely(qp->req.state == QP_STATE_ERROR)) + rxe_run_task(&qp->comp.task, 1); return err; } -- cgit v1.2.3 From 008b656f429824a1987f28150d66eaf8f26d58a1 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 8 Jan 2018 12:15:37 +0200 Subject: IB/core: Remove the locking for character device bitmaps Remove the locks that protect character device bitmaps of uverbs, umad and issm. The character device bitmaps are accessed in "client->add" and "client->remove" calls from ib_register_device and ib_unregister_device respectively. These calls are already protected by the "device_mutex" mutex. Thus, the spinlocks are not needed. 
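The reasoning here is that the IB core invokes every client's add/remove callback under its own device_mutex, so the bitmap accesses in the diff below never run concurrently. A hedged sketch of the resulting lock-free pattern (hypothetical names and bound, assuming the helpers are only called from those callbacks):

#define MAX_DEVICES 32	/* hypothetical bound */

static DECLARE_BITMAP(dev_map, MAX_DEVICES);

/* Called only from client->add, which the IB core already serializes
 * under device_mutex, so no additional lock is required here. */
static int claim_devnum(void)
{
	int devnum = find_first_zero_bit(dev_map, MAX_DEVICES);

	if (devnum >= MAX_DEVICES)
		return -ENOSPC;
	set_bit(devnum, dev_map);
	return devnum;
}

/* Called only from client->remove, under the same mutex. */
static void release_devnum(int devnum)
{
	clear_bit(devnum, dev_map);
}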
Signed-off-by: Huy Nguyen Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/user_mad.c | 5 ----- drivers/infiniband/core/uverbs_main.c | 5 ----- 2 files changed, 10 deletions(-) diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index d4de187b1064..608176e9cd5d 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -129,7 +129,6 @@ static struct class *umad_class; static const dev_t base_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE); -static DEFINE_SPINLOCK(port_lock); static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS); static void ib_umad_add_one(struct ib_device *device); @@ -1172,15 +1171,12 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, int devnum; dev_t base; - spin_lock(&port_lock); devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS); if (devnum >= IB_UMAD_MAX_PORTS) { - spin_unlock(&port_lock); devnum = find_overflow_devnum(device); if (devnum < 0) return -1; - spin_lock(&port_lock); port->dev_num = devnum + IB_UMAD_MAX_PORTS; base = devnum + overflow_maj; set_bit(devnum, overflow_map); @@ -1189,7 +1185,6 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, base = devnum + base_dev; set_bit(devnum, dev_map); } - spin_unlock(&port_lock); port->ib_dev = device; port->port_num = port_num; diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 381fd9c096ae..9e06f810bbeb 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -69,7 +69,6 @@ enum { static struct class *uverbs_class; -static DEFINE_SPINLOCK(map_lock); static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, @@ -1062,15 +1061,12 @@ static void ib_uverbs_add_one(struct ib_device *device) INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list); INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list); - spin_lock(&map_lock); devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); if (devnum >= IB_UVERBS_MAX_DEVICES) { - spin_unlock(&map_lock); devnum = find_overflow_devnum(); if (devnum < 0) goto err; - spin_lock(&map_lock); uverbs_dev->devnum = devnum + IB_UVERBS_MAX_DEVICES; base = devnum + overflow_maj; set_bit(devnum, overflow_map); @@ -1079,7 +1075,6 @@ static void ib_uverbs_add_one(struct ib_device *device) base = devnum + IB_UVERBS_BASE_DEV; set_bit(devnum, dev_map); } - spin_unlock(&map_lock); rcu_assign_pointer(uverbs_dev->ib_dev, device); uverbs_dev->num_comp_vectors = device->num_comp_vectors; -- cgit v1.2.3 From 8cf12d7780b72ee421d18e1f8a15daf43fa6d1e6 Mon Sep 17 00:00:00 2001 From: Huy Nguyen Date: Mon, 8 Jan 2018 12:15:38 +0200 Subject: IB/core: Increase number of char device minors There is a need to increase the number of possible char devices to support a large number of SR-IOV instances. The current limit is in the range of 64-128 devices/ports. Increase it to support up to 1024. The patch performs the following steps to refactor the code: 1. Removes the split bitmap for fixed and overflow dev numbers. 2. Pre-allocates the non-legacy major number range during driver initialization, chosen for simplicity. 3. Adds a new define (RDMA_MAX_PORTS) that is shared between all drivers. This is the maximum total number of ports on all struct ib_devices. 4. Sets RDMA_MAX_PORTS to 1024.
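The refactoring below replaces the overflow bitmap with a single device-number space: device numbers below the fixed-minor count keep their historical statically registered minors, and everything above maps into a region obtained from alloc_chrdev_region(). A simplified sketch of the mapping (constants hypothetical):

#define NUM_FIXED_MINOR	32	/* hypothetical legacy range */

static dev_t fixed_base;	/* from register_chrdev_region() */
static dev_t dynamic_base;	/* from alloc_chrdev_region() */

/* Hypothetical sketch: map a device number onto the split minor space. */
static dev_t devnum_to_devt(int devnum)
{
	if (devnum >= NUM_FIXED_MINOR)
		return dynamic_base + devnum - NUM_FIXED_MINOR;
	return fixed_base + devnum;
}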
Signed-off-by: Huy Nguyen Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/core_priv.h | 3 + drivers/infiniband/core/ucm.c | 73 +++++++++--------------- drivers/infiniband/core/user_mad.c | 103 ++++++++++++++++------------------ drivers/infiniband/core/uverbs_main.c | 90 +++++++++++------------------ 4 files changed, 113 insertions(+), 156 deletions(-) diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 74091bdca245..aef9aa0ac0e6 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -42,6 +42,9 @@ #include #include "mad_priv.h" +/* Total number of ports combined across all struct ib_devices's */ +#define RDMA_MAX_PORTS 1024 + struct pkey_index_qp_list { struct list_head pkey_index_list; u16 pkey_index; diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c index f2a7f62c2834..f3eb1c3b617d 100644 --- a/drivers/infiniband/core/ucm.c +++ b/drivers/infiniband/core/ucm.c @@ -53,6 +53,8 @@ #include #include +#include "core_priv.h" + MODULE_AUTHOR("Libor Michalek"); MODULE_DESCRIPTION("InfiniBand userspace Connection Manager access"); MODULE_LICENSE("Dual BSD/GPL"); @@ -104,10 +106,13 @@ struct ib_ucm_event { enum { IB_UCM_MAJOR = 231, IB_UCM_BASE_MINOR = 224, - IB_UCM_MAX_DEVICES = 32 + IB_UCM_MAX_DEVICES = RDMA_MAX_PORTS, + IB_UCM_NUM_FIXED_MINOR = 32, + IB_UCM_NUM_DYNAMIC_MINOR = IB_UCM_MAX_DEVICES - IB_UCM_NUM_FIXED_MINOR, }; #define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR) +static dev_t dynamic_ucm_dev; static void ib_ucm_add_one(struct ib_device *device); static void ib_ucm_remove_one(struct ib_device *device, void *client_data); @@ -1199,7 +1204,6 @@ static int ib_ucm_close(struct inode *inode, struct file *filp) return 0; } -static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES); static void ib_ucm_release_dev(struct device *dev) { struct ib_ucm_device *ucm_dev; @@ -1210,10 +1214,7 @@ static void ib_ucm_release_dev(struct device *dev) static void ib_ucm_free_dev(struct ib_ucm_device *ucm_dev) { - if (ucm_dev->devnum < IB_UCM_MAX_DEVICES) - clear_bit(ucm_dev->devnum, dev_map); - else - clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, overflow_map); + clear_bit(ucm_dev->devnum, dev_map); } static const struct file_operations ucm_fops = { @@ -1235,27 +1236,6 @@ static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); -static dev_t overflow_maj; -static int find_overflow_devnum(void) -{ - int ret; - - if (!overflow_maj) { - ret = alloc_chrdev_region(&overflow_maj, 0, IB_UCM_MAX_DEVICES, - "infiniband_cm"); - if (ret) { - pr_err("ucm: couldn't register dynamic device number\n"); - return ret; - } - } - - ret = find_first_zero_bit(overflow_map, IB_UCM_MAX_DEVICES); - if (ret >= IB_UCM_MAX_DEVICES) - return -1; - - return ret; -} - static void ib_ucm_add_one(struct ib_device *device) { int devnum; @@ -1274,19 +1254,14 @@ static void ib_ucm_add_one(struct ib_device *device) ucm_dev->dev.release = ib_ucm_release_dev; devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES); - if (devnum >= IB_UCM_MAX_DEVICES) { - devnum = find_overflow_devnum(); - if (devnum < 0) - goto err; - - ucm_dev->devnum = devnum + IB_UCM_MAX_DEVICES; - base = devnum + overflow_maj; - set_bit(devnum, overflow_map); - } else { - ucm_dev->devnum = devnum; - base = devnum + IB_UCM_BASE_DEV; - set_bit(devnum, dev_map); - } + if (devnum >= IB_UCM_MAX_DEVICES) + goto err; + ucm_dev->devnum = devnum; + 
set_bit(devnum, dev_map); + if (devnum >= IB_UCM_NUM_FIXED_MINOR) + base = dynamic_ucm_dev + devnum - IB_UCM_NUM_FIXED_MINOR; + else + base = IB_UCM_BASE_DEV + devnum; cdev_init(&ucm_dev->cdev, &ucm_fops); ucm_dev->cdev.owner = THIS_MODULE; @@ -1334,13 +1309,20 @@ static int __init ib_ucm_init(void) { int ret; - ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES, + ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR, "infiniband_cm"); if (ret) { pr_err("ucm: couldn't register device number\n"); goto error1; } + ret = alloc_chrdev_region(&dynamic_ucm_dev, 0, IB_UCM_NUM_DYNAMIC_MINOR, + "infiniband_cm"); + if (ret) { + pr_err("ucm: couldn't register dynamic device number\n"); + goto err_alloc; + } + ret = class_create_file(&cm_class, &class_attr_abi_version.attr); if (ret) { pr_err("ucm: couldn't create abi_version attribute\n"); @@ -1357,7 +1339,9 @@ static int __init ib_ucm_init(void) error3: class_remove_file(&cm_class, &class_attr_abi_version.attr); error2: - unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES); + unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR); +err_alloc: + unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR); error1: return ret; } @@ -1366,9 +1350,8 @@ static void __exit ib_ucm_cleanup(void) { ib_unregister_client(&ucm_client); class_remove_file(&cm_class, &class_attr_abi_version.attr); - unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES); - if (overflow_maj) - unregister_chrdev_region(overflow_maj, IB_UCM_MAX_DEVICES); + unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR); + unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR); idr_destroy(&ctx_id_table); } diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 608176e9cd5d..f0ed883492ec 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -55,16 +55,21 @@ #include #include +#include "core_priv.h" + MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand userspace MAD packet access"); MODULE_LICENSE("Dual BSD/GPL"); enum { - IB_UMAD_MAX_PORTS = 64, + IB_UMAD_MAX_PORTS = RDMA_MAX_PORTS, IB_UMAD_MAX_AGENTS = 32, IB_UMAD_MAJOR = 231, - IB_UMAD_MINOR_BASE = 0 + IB_UMAD_MINOR_BASE = 0, + IB_UMAD_NUM_FIXED_MINOR = 64, + IB_UMAD_NUM_DYNAMIC_MINOR = IB_UMAD_MAX_PORTS - IB_UMAD_NUM_FIXED_MINOR, + IB_ISSM_MINOR_BASE = IB_UMAD_NUM_FIXED_MINOR, }; /* @@ -127,7 +132,11 @@ struct ib_umad_packet { static struct class *umad_class; -static const dev_t base_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE); +static const dev_t base_umad_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE); +static const dev_t base_issm_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE) + + IB_UMAD_NUM_FIXED_MINOR; +static dev_t dynamic_umad_dev; +static dev_t dynamic_issm_dev; static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS); @@ -1141,49 +1150,25 @@ static DEVICE_ATTR(port, S_IRUGO, show_port, NULL); static CLASS_ATTR_STRING(abi_version, S_IRUGO, __stringify(IB_USER_MAD_ABI_VERSION)); -static dev_t overflow_maj; -static DECLARE_BITMAP(overflow_map, IB_UMAD_MAX_PORTS); -static int find_overflow_devnum(struct ib_device *device) -{ - int ret; - - if (!overflow_maj) { - ret = alloc_chrdev_region(&overflow_maj, 0, IB_UMAD_MAX_PORTS * 2, - "infiniband_mad"); - if (ret) { - dev_err(&device->dev, - "couldn't register dynamic device number\n"); - return ret; - } - } - - ret = find_first_zero_bit(overflow_map, IB_UMAD_MAX_PORTS); - if (ret >= IB_UMAD_MAX_PORTS) - return -1; - - 
return ret; -} - static int ib_umad_init_port(struct ib_device *device, int port_num, struct ib_umad_device *umad_dev, struct ib_umad_port *port) { int devnum; - dev_t base; + dev_t base_umad; + dev_t base_issm; devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS); - if (devnum >= IB_UMAD_MAX_PORTS) { - devnum = find_overflow_devnum(device); - if (devnum < 0) - return -1; - - port->dev_num = devnum + IB_UMAD_MAX_PORTS; - base = devnum + overflow_maj; - set_bit(devnum, overflow_map); + if (devnum >= IB_UMAD_MAX_PORTS) + return -1; + port->dev_num = devnum; + set_bit(devnum, dev_map); + if (devnum >= IB_UMAD_NUM_FIXED_MINOR) { + base_umad = dynamic_umad_dev + devnum - IB_UMAD_NUM_FIXED_MINOR; + base_issm = dynamic_issm_dev + devnum - IB_UMAD_NUM_FIXED_MINOR; } else { - port->dev_num = devnum; - base = devnum + base_dev; - set_bit(devnum, dev_map); + base_umad = devnum + base_umad_dev; + base_issm = devnum + base_issm_dev; } port->ib_dev = device; @@ -1196,7 +1181,7 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, port->cdev.owner = THIS_MODULE; cdev_set_parent(&port->cdev, &umad_dev->kobj); kobject_set_name(&port->cdev.kobj, "umad%d", port->dev_num); - if (cdev_add(&port->cdev, base, 1)) + if (cdev_add(&port->cdev, base_umad, 1)) goto err_cdev; port->dev = device_create(umad_class, device->dev.parent, @@ -1210,12 +1195,11 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, if (device_create_file(port->dev, &dev_attr_port)) goto err_dev; - base += IB_UMAD_MAX_PORTS; cdev_init(&port->sm_cdev, &umad_sm_fops); port->sm_cdev.owner = THIS_MODULE; cdev_set_parent(&port->sm_cdev, &umad_dev->kobj); kobject_set_name(&port->sm_cdev.kobj, "issm%d", port->dev_num); - if (cdev_add(&port->sm_cdev, base, 1)) + if (cdev_add(&port->sm_cdev, base_issm, 1)) goto err_sm_cdev; port->sm_dev = device_create(umad_class, device->dev.parent, @@ -1242,10 +1226,7 @@ err_dev: err_cdev: cdev_del(&port->cdev); - if (port->dev_num < IB_UMAD_MAX_PORTS) - clear_bit(devnum, dev_map); - else - clear_bit(devnum, overflow_map); + clear_bit(devnum, dev_map); return -1; } @@ -1279,11 +1260,7 @@ static void ib_umad_kill_port(struct ib_umad_port *port) } mutex_unlock(&port->file_mutex); - - if (port->dev_num < IB_UMAD_MAX_PORTS) - clear_bit(port->dev_num, dev_map); - else - clear_bit(port->dev_num - IB_UMAD_MAX_PORTS, overflow_map); + clear_bit(port->dev_num, dev_map); } static void ib_umad_add_one(struct ib_device *device) @@ -1359,13 +1336,23 @@ static int __init ib_umad_init(void) { int ret; - ret = register_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2, + ret = register_chrdev_region(base_umad_dev, + IB_UMAD_NUM_FIXED_MINOR * 2, "infiniband_mad"); if (ret) { pr_err("couldn't register device number\n"); goto out; } + ret = alloc_chrdev_region(&dynamic_umad_dev, 0, + IB_UMAD_NUM_DYNAMIC_MINOR * 2, + "infiniband_mad"); + if (ret) { + pr_err("couldn't register dynamic device number\n"); + goto out_alloc; + } + dynamic_issm_dev = dynamic_umad_dev + IB_UMAD_NUM_DYNAMIC_MINOR; + umad_class = class_create(THIS_MODULE, "infiniband_mad"); if (IS_ERR(umad_class)) { ret = PTR_ERR(umad_class); @@ -1393,7 +1380,12 @@ out_class: class_destroy(umad_class); out_chrdev: - unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2); + unregister_chrdev_region(dynamic_umad_dev, + IB_UMAD_NUM_DYNAMIC_MINOR * 2); + +out_alloc: + unregister_chrdev_region(base_umad_dev, + IB_UMAD_NUM_FIXED_MINOR * 2); out: return ret; @@ -1403,9 +1395,10 @@ static void __exit ib_umad_cleanup(void) { 
ib_unregister_client(&umad_client); class_destroy(umad_class); - unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2); - if (overflow_maj) - unregister_chrdev_region(overflow_maj, IB_UMAD_MAX_PORTS * 2); + unregister_chrdev_region(base_umad_dev, + IB_UMAD_NUM_FIXED_MINOR * 2); + unregister_chrdev_region(dynamic_umad_dev, + IB_UMAD_NUM_DYNAMIC_MINOR * 2); } module_init(ib_umad_init); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 9e06f810bbeb..899102ad6bb6 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -62,11 +62,14 @@ MODULE_LICENSE("Dual BSD/GPL"); enum { IB_UVERBS_MAJOR = 231, IB_UVERBS_BASE_MINOR = 192, - IB_UVERBS_MAX_DEVICES = 32 + IB_UVERBS_MAX_DEVICES = RDMA_MAX_PORTS, + IB_UVERBS_NUM_FIXED_MINOR = 32, + IB_UVERBS_NUM_DYNAMIC_MINOR = IB_UVERBS_MAX_DEVICES - IB_UVERBS_NUM_FIXED_MINOR, }; #define IB_UVERBS_BASE_DEV MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR) +static dev_t dynamic_uverbs_dev; static struct class *uverbs_class; static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); @@ -1004,34 +1007,6 @@ static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL); static CLASS_ATTR_STRING(abi_version, S_IRUGO, __stringify(IB_USER_VERBS_ABI_VERSION)); -static dev_t overflow_maj; -static DECLARE_BITMAP(overflow_map, IB_UVERBS_MAX_DEVICES); - -/* - * If we have more than IB_UVERBS_MAX_DEVICES, dynamically overflow by - * requesting a new major number and doubling the number of max devices we - * support. It's stupid, but simple. - */ -static int find_overflow_devnum(void) -{ - int ret; - - if (!overflow_maj) { - ret = alloc_chrdev_region(&overflow_maj, 0, IB_UVERBS_MAX_DEVICES, - "infiniband_verbs"); - if (ret) { - pr_err("user_verbs: couldn't register dynamic device number\n"); - return ret; - } - } - - ret = find_first_zero_bit(overflow_map, IB_UVERBS_MAX_DEVICES); - if (ret >= IB_UVERBS_MAX_DEVICES) - return -1; - - return ret; -} - static void ib_uverbs_add_one(struct ib_device *device) { int devnum; @@ -1062,19 +1037,14 @@ static void ib_uverbs_add_one(struct ib_device *device) INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list); devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); - if (devnum >= IB_UVERBS_MAX_DEVICES) { - devnum = find_overflow_devnum(); - if (devnum < 0) - goto err; - - uverbs_dev->devnum = devnum + IB_UVERBS_MAX_DEVICES; - base = devnum + overflow_maj; - set_bit(devnum, overflow_map); - } else { - uverbs_dev->devnum = devnum; - base = devnum + IB_UVERBS_BASE_DEV; - set_bit(devnum, dev_map); - } + if (devnum >= IB_UVERBS_MAX_DEVICES) + goto err; + uverbs_dev->devnum = devnum; + set_bit(devnum, dev_map); + if (devnum >= IB_UVERBS_NUM_FIXED_MINOR) + base = dynamic_uverbs_dev + devnum - IB_UVERBS_NUM_FIXED_MINOR; + else + base = IB_UVERBS_BASE_DEV + devnum; rcu_assign_pointer(uverbs_dev->ib_dev, device); uverbs_dev->num_comp_vectors = device->num_comp_vectors; @@ -1119,10 +1089,7 @@ err_class: err_cdev: cdev_del(&uverbs_dev->cdev); - if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES) - clear_bit(devnum, dev_map); - else - clear_bit(devnum, overflow_map); + clear_bit(devnum, dev_map); err: if (atomic_dec_and_test(&uverbs_dev->refcount)) @@ -1214,11 +1181,7 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) dev_set_drvdata(uverbs_dev->dev, NULL); device_destroy(uverbs_class, uverbs_dev->cdev.dev); cdev_del(&uverbs_dev->cdev); - - if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES) - clear_bit(uverbs_dev->devnum, dev_map); - 
else - clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map); + clear_bit(uverbs_dev->devnum, dev_map); if (device->disassociate_ucontext) { /* We disassociate HW resources and immediately return. @@ -1260,13 +1223,22 @@ static int __init ib_uverbs_init(void) { int ret; - ret = register_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES, + ret = register_chrdev_region(IB_UVERBS_BASE_DEV, + IB_UVERBS_NUM_FIXED_MINOR, "infiniband_verbs"); if (ret) { pr_err("user_verbs: couldn't register device number\n"); goto out; } + ret = alloc_chrdev_region(&dynamic_uverbs_dev, 0, + IB_UVERBS_NUM_DYNAMIC_MINOR, + "infiniband_verbs"); + if (ret) { + pr_err("couldn't register dynamic device number\n"); + goto out_alloc; + } + uverbs_class = class_create(THIS_MODULE, "infiniband_verbs"); if (IS_ERR(uverbs_class)) { ret = PTR_ERR(uverbs_class); @@ -1294,7 +1266,12 @@ out_class: class_destroy(uverbs_class); out_chrdev: - unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES); + unregister_chrdev_region(dynamic_uverbs_dev, + IB_UVERBS_NUM_DYNAMIC_MINOR); + +out_alloc: + unregister_chrdev_region(IB_UVERBS_BASE_DEV, + IB_UVERBS_NUM_FIXED_MINOR); out: return ret; @@ -1304,9 +1281,10 @@ static void __exit ib_uverbs_cleanup(void) { ib_unregister_client(&uverbs_client); class_destroy(uverbs_class); - unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES); - if (overflow_maj) - unregister_chrdev_region(overflow_maj, IB_UVERBS_MAX_DEVICES); + unregister_chrdev_region(IB_UVERBS_BASE_DEV, + IB_UVERBS_NUM_FIXED_MINOR); + unregister_chrdev_region(dynamic_uverbs_dev, + IB_UVERBS_NUM_DYNAMIC_MINOR); } module_init(ib_uverbs_init); -- cgit v1.2.3 From 608bc44634f3a1ce624d9e8586d05a6887bb3b3c Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 8 Jan 2018 17:04:43 +0200 Subject: RDMA/cma: Use the right net namespace for the rdma_cm_id The net namespace is set in addr during create_rdma_id(), so cma_resolve_iboe_route() should use that instead of the init namespace. The original code was added in commit fa20105e09e9 ("IB/cma: Add support for network namespaces"), but this path wasn't in use back then. This patch updates the code to use the right namespace, as preparation for improving namespace support.
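The one-line nature of the fix hides the key invariant: the rdma_cm_id's address structure captures the creating process's network namespace, and every later ifindex lookup must use it. A minimal hypothetical sketch:

/* Hypothetical sketch: resolve the bound netdev in the id's own netns. */
static struct net_device *cma_bound_ndev(struct rdma_dev_addr *dev_addr)
{
	if (!dev_addr->bound_dev_if)
		return NULL;
	/* dev_addr->net was recorded when the rdma_cm_id was created;
	 * using &init_net here breaks callers in other namespaces.
	 * The caller must dev_put() the returned device. */
	return dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
}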
Signed-off-by: Parav Pandit Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 65c55f79444a..7db2355541f6 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2566,7 +2566,8 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) goto err2; } - ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if); + ndev = dev_get_by_index(addr->dev_addr.net, + addr->dev_addr.bound_dev_if); if (!ndev) { ret = -ENODEV; goto err2; @@ -2582,7 +2583,7 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) gid_type = ib_network_to_gid_type(addr->dev_addr.network); route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type); - sa_path_set_ndev(route->path_rec, &init_net); + sa_path_set_ndev(route->path_rec, addr->dev_addr.net); sa_path_set_ifindex(route->path_rec, ndev->ifindex); sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr); -- cgit v1.2.3 From 9327c7afdce3780895728c8977144bf3b5f854c5 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 8 Jan 2018 17:04:44 +0200 Subject: RDMA/cma: Provide a function to set RoCE path record L2 parameters Introduce a helper function to set path record L2 fields for RoCE. This includes setting GID type, destination mac address and netdev ifindex. Signed-off-by: Parav Pandit Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 88 ++++++++++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 35 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 7db2355541f6..5bcbb5c5ea1c 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2465,6 +2465,58 @@ err1: return ret; } +static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type, + unsigned long supported_gids, + enum ib_gid_type default_gid) +{ + if ((network_type == RDMA_NETWORK_IPV4 || + network_type == RDMA_NETWORK_IPV6) && + test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids)) + return IB_GID_TYPE_ROCE_UDP_ENCAP; + + return default_gid; +} + +/* + * cma_iboe_set_path_rec_l2_fields() is helper function which sets + * path record type based on GID type. + * It also sets up other L2 fields which includes destination mac address + * netdev ifindex, of the path record. + * It returns the netdev of the bound interface for this path record entry. 
+ */ +static struct net_device * +cma_iboe_set_path_rec_l2_fields(struct rdma_id_private *id_priv) +{ + struct rdma_route *route = &id_priv->id.route; + enum ib_gid_type gid_type = IB_GID_TYPE_ROCE; + struct rdma_addr *addr = &route->addr; + unsigned long supported_gids; + struct net_device *ndev; + + if (!addr->dev_addr.bound_dev_if) + return NULL; + + ndev = dev_get_by_index(addr->dev_addr.net, + addr->dev_addr.bound_dev_if); + if (!ndev) + return NULL; + + supported_gids = roce_gid_type_mask_support(id_priv->id.device, + id_priv->id.port_num); + gid_type = cma_route_gid_type(addr->dev_addr.network, + supported_gids, + id_priv->gid_type); + /* Use the hint from IP Stack to select GID Type */ + if (gid_type < ib_network_to_gid_type(addr->dev_addr.network)) + gid_type = ib_network_to_gid_type(addr->dev_addr.network); + route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type); + + sa_path_set_ndev(route->path_rec, addr->dev_addr.net); + sa_path_set_ifindex(route->path_rec, ndev->ifindex); + sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr); + return ndev; +} + int rdma_set_ib_paths(struct rdma_cm_id *id, struct sa_path_rec *path_rec, int num_paths) { @@ -2522,18 +2574,6 @@ static int iboe_tos_to_sl(struct net_device *ndev, int tos) return 0; } -static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type, - unsigned long supported_gids, - enum ib_gid_type default_gid) -{ - if ((network_type == RDMA_NETWORK_IPV4 || - network_type == RDMA_NETWORK_IPV6) && - test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids)) - return IB_GID_TYPE_ROCE_UDP_ENCAP; - - return default_gid; -} - static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) { struct rdma_route *route = &id_priv->id.route; @@ -2541,8 +2581,6 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) struct cma_work *work; int ret; struct net_device *ndev; - enum ib_gid_type gid_type = IB_GID_TYPE_ROCE; - unsigned long supported_gids; u8 default_roce_tos = id_priv->cma_dev->default_roce_tos[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; @@ -2561,32 +2599,12 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) route->num_paths = 1; - if (!addr->dev_addr.bound_dev_if) { - ret = -ENODEV; - goto err2; - } - - ndev = dev_get_by_index(addr->dev_addr.net, - addr->dev_addr.bound_dev_if); + ndev = cma_iboe_set_path_rec_l2_fields(id_priv); if (!ndev) { ret = -ENODEV; goto err2; } - supported_gids = roce_gid_type_mask_support(id_priv->id.device, - id_priv->id.port_num); - gid_type = cma_route_gid_type(addr->dev_addr.network, - supported_gids, - id_priv->gid_type); - /* Use the hint from IP Stack to select GID Type */ - if (gid_type < ib_network_to_gid_type(addr->dev_addr.network)) - gid_type = ib_network_to_gid_type(addr->dev_addr.network); - route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type); - - sa_path_set_ndev(route->path_rec, addr->dev_addr.net); - sa_path_set_ifindex(route->path_rec, ndev->ifindex); - sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr); - rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &route->path_rec->sgid); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, -- cgit v1.2.3 From fe75889f27940c7a1c0e9c4c81464e0a8e936aa2 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 8 Jan 2018 17:04:45 +0200 Subject: RDMA/{cma, ucma}: Simplify and rename rdma_set_ib_paths Since 2006 there has been no user of rdmacm based application to make use of setting multiple path records 
using the rdma_set_ib_paths API. Therefore the code is simplified to allow setting a single path record entry. Now that it sets only a single path, the function is renamed to reflect this. Signed-off-by: Parav Pandit Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 10 +++++----- drivers/infiniband/core/ucma.c | 4 ++-- include/rdma/rdma_cm_ib.h | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 5bcbb5c5ea1c..5c158cda08e3 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2517,8 +2517,8 @@ cma_iboe_set_path_rec_l2_fields(struct rdma_id_private *id_priv) return ndev; } -int rdma_set_ib_paths(struct rdma_cm_id *id, - struct sa_path_rec *path_rec, int num_paths) +int rdma_set_ib_path(struct rdma_cm_id *id, + struct sa_path_rec *path_rec) { struct rdma_id_private *id_priv; int ret; @@ -2528,20 +2528,20 @@ int rdma_set_ib_paths(struct rdma_cm_id *id, RDMA_CM_ROUTE_RESOLVED)) return -EINVAL; - id->route.path_rec = kmemdup(path_rec, sizeof *path_rec * num_paths, + id->route.path_rec = kmemdup(path_rec, sizeof(*path_rec), GFP_KERNEL); if (!id->route.path_rec) { ret = -ENOMEM; goto err; } - id->route.num_paths = num_paths; + id->route.num_paths = 1; return 0; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED); return ret; } -EXPORT_SYMBOL(rdma_set_ib_paths); +EXPORT_SYMBOL(rdma_set_ib_path); static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) { diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index eb85b546e223..6d32af27bac6 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -1231,9 +1231,9 @@ static int ucma_set_ib_path(struct ucma_context *ctx, struct sa_path_rec opa; sa_convert_path_ib_to_opa(&opa, &sa_path); - ret = rdma_set_ib_paths(ctx->cm_id, &opa, 1); + ret = rdma_set_ib_path(ctx->cm_id, &opa); } else { - ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1); + ret = rdma_set_ib_path(ctx->cm_id, &sa_path); } if (ret) return ret; diff --git a/include/rdma/rdma_cm_ib.h b/include/rdma/rdma_cm_ib.h index 6947a6ba2557..6a69d71a21a5 100644 --- a/include/rdma/rdma_cm_ib.h +++ b/include/rdma/rdma_cm_ib.h @@ -36,17 +36,17 @@ #include /** - * rdma_set_ib_paths - Manually sets the path records used to establish a + * rdma_set_ib_path - Manually sets the path record used to establish a * connection. * @id: Connection identifier associated with the request. * @path_rec: Reference to the path record * * This call permits a user to specify routing information for rdma_cm_id's - * bound to Infiniband devices. It is called on the client side of a + * bound to InfiniBand devices. It is called on the client side of a * connection and replaces the call to rdma_resolve_route. */ -int rdma_set_ib_paths(struct rdma_cm_id *id, - struct sa_path_rec *path_rec, int num_paths); +int rdma_set_ib_path(struct rdma_cm_id *id, + struct sa_path_rec *path_rec); /* Global qkey for UDP QPs and multicast groups. */ #define RDMA_UDP_QKEY 0x01234567 -- cgit v1.2.3 From 8d20a1f0ecd5ff66ebf23829688c2a8d7133704e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 8 Jan 2018 17:04:47 +0200 Subject: RDMA/cma: Fix rdma_cm raw IB path setting for RoCE rdma_set_ib_path() missed setting path record fields for RoCE transport when RoCE support was added.
This results in errors such as an incorrect ndev, destination MAC address, and GID type when user space attempts to set a raw IB path using the RoCE IB path compatibility mapping. Fixes: 3c86aa70bf67 ("RDMA/cm: Add RDMA CM support for IBoE devices") Signed-off-by: Parav Pandit Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 5c158cda08e3..30d1c32a816f 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2521,6 +2521,7 @@ int rdma_set_ib_path(struct rdma_cm_id *id, struct sa_path_rec *path_rec) { struct rdma_id_private *id_priv; + struct net_device *ndev; int ret; id_priv = container_of(id, struct rdma_id_private, id); @@ -2535,8 +2536,21 @@ int rdma_set_ib_path(struct rdma_cm_id *id, goto err; } + if (rdma_protocol_roce(id->device, id->port_num)) { + ndev = cma_iboe_set_path_rec_l2_fields(id_priv); + if (!ndev) { + ret = -ENODEV; + goto err_free; + } + dev_put(ndev); + } + id->route.num_paths = 1; return 0; + +err_free: + kfree(id->route.path_rec); + id->route.path_rec = NULL; err: cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED); return ret; -- cgit v1.2.3 From 89838118a515847d3e5c904d2e022779a7173bec Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 8 Jan 2018 17:04:48 +0200 Subject: RDMA/cma: Fix rdma_cm path querying for RoCE The 'if' logic in ucma_query_path was broken when OPA support was introduced and started to treat RoCE paths as OPA paths. Invert the logic of the 'if' so only OPA paths are treated as OPA paths. Otherwise the path records returned to rdma_cma users are mangled when in RoCE mode. Fixes: 57520751445b ("IB/SA: Add OPA path record type") Signed-off-by: Parav Pandit Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/ucma.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 6d32af27bac6..2ec0f89035fb 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -904,13 +904,14 @@ static ssize_t ucma_query_path(struct ucma_context *ctx, resp->path_data[i].flags = IB_PATH_GMP | IB_PATH_PRIMARY | IB_PATH_BIDIRECTIONAL; - if (rec->rec_type == SA_PATH_REC_TYPE_IB) { - ib_sa_pack_path(rec, &resp->path_data[i].path_rec); - } else { + if (rec->rec_type == SA_PATH_REC_TYPE_OPA) { struct sa_path_rec ib; sa_convert_path_opa_to_ib(&ib, rec); ib_sa_pack_path(&ib, &resp->path_data[i].path_rec); + + } else { + ib_sa_pack_path(rec, &resp->path_data[i].path_rec); } } -- cgit v1.2.3 From 1f58621e40abc7663d1767cedfaf71f38d13bfa8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 5 Jan 2018 16:21:40 -0800 Subject: infiniband: fix core/verbs.c kernel-doc notation Change the function parameter name in kernel-doc notation and other comments to eliminate a kernel-doc warning.
../drivers/infiniband/core/verbs.c:1790: warning: Excess function parameter 'wq_init_attr' description in 'ib_create_wq' Signed-off-by: Randy Dunlap Cc: Doug Ledford Cc: Jason Gunthorpe Cc: linux-doc@vger.kernel.org Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/verbs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index a1bcdacb7f40..15dd26cab5d8 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1807,11 +1807,11 @@ EXPORT_SYMBOL(ib_dealloc_xrcd); * ib_create_wq - Creates a WQ associated with the specified protection * domain. * @pd: The protection domain associated with the WQ. - * @wq_init_attr: A list of initial attributes required to create the + * @wq_attr: A list of initial attributes required to create the * WQ. If WQ creation succeeds, then the attributes are updated to * the actual capabilities of the created WQ. * - * wq_init_attr->max_wr and wq_init_attr->max_sge determine + * wq_attr->max_wr and wq_attr->max_sge determine * the requested size of the WQ, and set to the actual values allocated * on return. * If ib_create_wq() succeeds, then max_wr and max_sge will always be -- cgit v1.2.3 From 92f617038e456cbef7d5b2cd2d8710a99a046030 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 5 Jan 2018 16:21:53 -0800 Subject: infiniband: fix core/fmr_pool.c kernel-doc notation Fix kernel-doc warning for ib_fmr_pool_map_phys() and also format it with function description and text spacing. ../drivers/infiniband/core/fmr_pool.c:404: warning: Excess function parameter 'pool' description in 'ib_fmr_pool_map_phys' Signed-off-by: Randy Dunlap Cc: Doug Ledford Cc: Jason Gunthorpe Cc: linux-doc@vger.kernel.org Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/fmr_pool.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c index 84d2615b5d4b..a0a9ed719031 100644 --- a/drivers/infiniband/core/fmr_pool.c +++ b/drivers/infiniband/core/fmr_pool.c @@ -388,13 +388,11 @@ int ib_flush_fmr_pool(struct ib_fmr_pool *pool) EXPORT_SYMBOL(ib_flush_fmr_pool); /** - * ib_fmr_pool_map_phys - - * @pool:FMR pool to allocate FMR from - * @page_list:List of pages to map - * @list_len:Number of pages in @page_list - * @io_virtual_address:I/O virtual address for new FMR - * - * Map an FMR from an FMR pool. + * ib_fmr_pool_map_phys - Map an FMR from an FMR pool. + * @pool_handle: FMR pool to allocate FMR from + * @page_list: List of pages to map + * @list_len: Number of pages in @page_list + * @io_virtual_address: I/O virtual address for new FMR */ struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle, u64 *page_list, -- cgit v1.2.3 From cb8d09426a8d1ac1f9175a0dd7f244ea55dcc0ea Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 5 Jan 2018 16:22:04 -0800 Subject: infiniband: fix ulp/opa_vnic/opa_vnic_vema.c kernel-doc notation Use correct parameter name and description in kernel-doc notation to eliminate a kernel-doc warning. 
../drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c:730: warning: Excess function parameter 'cport' description in 'opa_vnic_vema_send_trap' Signed-off-by: Randy Dunlap Cc: Doug Ledford Cc: Jason Gunthorpe Cc: linux-doc@vger.kernel.org Acked-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c index 4b615c1451e7..15711dcc6f58 100644 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c @@ -710,7 +710,7 @@ vema_get_port(struct opa_vnic_ctrl_port *cport, u8 port_num) /** * opa_vnic_vema_send_trap -- This function sends a trap to the EM - * @cport: pointer to vnic control port + * @adapter: pointer to vnic adapter * @data: pointer to trap data filled by calling function * @lid: issuers lid (encap_slid from vesw_port_info) * -- cgit v1.2.3 From 4f9a3018a2bb09cadcc27bcf2d58000eb5f7e037 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 5 Jan 2018 16:22:32 -0800 Subject: infiniband: fix sw/rdmavt/* kernel-doc notation Use correct parameter names and formatting in function kernel-doc notation to eliminate warnings from scripts/kernel-doc. ../drivers/infiniband/sw/rdmavt/mr.c:784: warning: Excess function parameter 'ibmfr' description in 'rvt_map_phys_fmr' ../drivers/infiniband/sw/rdmavt/vt.c:234: warning: Excess function parameter 'intex' description in 'rvt_query_pkey' ../drivers/infiniband/sw/rdmavt/vt.c:266: warning: Excess function parameter 'index' description in 'rvt_query_gid' ../drivers/infiniband/sw/rdmavt/vt.c:306: warning: Excess function parameter 'data' description in 'rvt_alloc_ucontext' ../drivers/infiniband/sw/rdmavt/cq.c:65: warning: Excess function parameter 'sig' description in 'rvt_cq_enter' ../drivers/infiniband/sw/rdmavt/qp.c:279: warning: Excess function parameter 'qpt' description in 'rvt_free_all_qps' ../drivers/infiniband/sw/rdmavt/mcast.c:282: warning: Excess function parameter 'igd' description in 'rvt_attach_mcast' ../drivers/infiniband/sw/rdmavt/mcast.c:345: warning: Excess function parameter 'igd' description in 'rvt_detach_mcast' Signed-off-by: Randy Dunlap Cc: Doug Ledford Cc: Jason Gunthorpe Cc: linux-doc@vger.kernel.org Acked-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rdmavt/cq.c | 2 +- drivers/infiniband/sw/rdmavt/mcast.c | 4 ++-- drivers/infiniband/sw/rdmavt/mr.c | 2 +- drivers/infiniband/sw/rdmavt/qp.c | 18 +++++++++--------- drivers/infiniband/sw/rdmavt/vt.c | 9 +++++---- 5 files changed, 18 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index 88fa4d44ab5f..61fca14c2598 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -56,7 +56,7 @@ * rvt_cq_enter - add a new entry to the completion queue * @cq: completion queue * @entry: work completion entry to add - * @sig: true if @entry is solicited + * @solicited: true if @entry is solicited * * This may be called with qp->s_lock held. 
*/ diff --git a/drivers/infiniband/sw/rdmavt/mcast.c b/drivers/infiniband/sw/rdmavt/mcast.c index b3a38c5e4cad..dd11c6fcd060 100644 --- a/drivers/infiniband/sw/rdmavt/mcast.c +++ b/drivers/infiniband/sw/rdmavt/mcast.c @@ -272,7 +272,7 @@ bail: /** * rvt_attach_mcast - attach a qp to a multicast group * @ibqp: Infiniband qp - * @igd: multicast guid + * @gid: multicast guid * @lid: multicast lid * * Return: 0 on success @@ -335,7 +335,7 @@ bail_mcast: /** * rvt_detach_mcast - remove a qp from a multicast group * @ibqp: Infiniband qp - * @igd: multicast guid + * @gid: multicast guid * @lid: multicast lid * * Return: 0 on success diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 42713511b53b..1b2e5362a3ff 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -768,7 +768,7 @@ bail: /** * rvt_map_phys_fmr - set up a fast memory region - * @ibmfr: the fast memory region to set up + * @ibfmr: the fast memory region to set up * @page_list: the list of pages to associate with the fast memory region * @list_len: the number of pages to associate with the fast memory region * @iova: the virtual address of the start of the fast memory region diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 0bbf20597056..4c548bf1f032 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -269,7 +269,7 @@ no_qp_table: /** * free_all_qps - check for QPs still in use - * @qpt: the QP table to empty + * @rdi: rvt device info structure * * There should not be any QPs still in use. * Free memory for table. @@ -335,9 +335,9 @@ static inline unsigned mk_qpn(struct rvt_qpn_table *qpt, /** * alloc_qpn - Allocate the next available qpn or zero/one for QP type * IB_QPT_SMI/IB_QPT_GSI - *@rdi: rvt device info structure - *@qpt: queue pair number table pointer - *@port_num: IB port number, 1 based, comes from core + * @rdi: rvt device info structure + * @qpt: queue pair number table pointer + * @port_num: IB port number, 1 based, comes from core * * Return: The queue pair number */ @@ -1650,9 +1650,9 @@ static inline int rvt_qp_valid_operation( /** * rvt_qp_is_avail - determine queue capacity - * @qp - the qp - * @rdi - the rdmavt device - * @reserved_op - is reserved operation + * @qp: the qp + * @rdi: the rdmavt device + * @reserved_op: is reserved operation * * This assumes the s_hlock is held but the s_last * qp variable is uncontrolled. @@ -2174,8 +2174,8 @@ EXPORT_SYMBOL(rvt_rc_rnr_retry); /** * rvt_qp_iter_init - initial for QP iteration - * @rdi - rvt devinfo - * @v - u64 value + * @rdi: rvt devinfo + * @v: u64 value * * This returns an iterator suitable for iterating QPs * in the system. 
diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 088fb2d6d919..a4553b2b3696 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -224,7 +224,8 @@ static int rvt_modify_port(struct ib_device *ibdev, u8 port_num, * rvt_query_pkey - Return a pkey from the table at a given index * @ibdev: Verbs IB dev * @port_num: Port number, 1 based from ib core - * @intex: Index into pkey table + * @index: Index into pkey table + * @pkey: returned pkey from the port pkey table * * Return: 0 on failure pkey otherwise */ @@ -255,7 +256,7 @@ static int rvt_query_pkey(struct ib_device *ibdev, u8 port_num, u16 index, * rvt_query_gid - Return a gid from the table * @ibdev: Verbs IB dev * @port_num: Port number, 1 based from ib core - * @index: = Index in table + * @guid_index: Index in table * @gid: Gid to return * * Return: 0 on success @@ -297,8 +298,8 @@ static inline struct rvt_ucontext *to_iucontext(struct ib_ucontext /** * rvt_alloc_ucontext - Allocate a user context - * @ibdev: Vers IB dev - * @data: User data allocated + * @ibdev: Verbs IB dev + * @udata: User data allocated */ static struct ib_ucontext *rvt_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) -- cgit v1.2.3 From b2bedfb39541a7e14798d066b6f8685d84c8fcf5 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 9 Jan 2018 15:24:50 +0200 Subject: IB/core: Perform modify QP on real one Currently qp->port stores the port number whenever IB_QP_PORT QP attribute mask is set (during QP state transition to INIT state). This port number should be stored for the real QP when XRC target QP is used. Follow the ib_modify_qp() implementation and hide the access to ->real_qp. Fixes: a512c2fbef9c ("IB/core: Introduce modify QP operation with udata") Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/verbs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 15dd26cab5d8..be18ed7c2326 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1324,7 +1324,7 @@ static int ib_resolve_eth_dmac(struct ib_device *device, /** * ib_modify_qp_with_udata - Modifies the attributes for the specified QP. - * @qp: The QP to modify. + * @ib_qp: The QP to modify. * @attr: On input, specifies the QP attributes to modify. On output, * the current values of selected QP attributes are returned. * @attr_mask: A bit-mask used to specify which attributes of the QP @@ -1333,9 +1333,10 @@ static int ib_resolve_eth_dmac(struct ib_device *device, * are being modified. * It returns 0 on success and returns appropriate error code on error. */ -int ib_modify_qp_with_udata(struct ib_qp *qp, struct ib_qp_attr *attr, +int ib_modify_qp_with_udata(struct ib_qp *ib_qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { + struct ib_qp *qp = ib_qp->real_qp; u8 port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; int ret; -- cgit v1.2.3 From b96ac05a87da602d501b05883385a049862c0476 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 9 Jan 2018 15:24:51 +0200 Subject: IB/core: Limit DMAC resolution to userspace QPs Currently ah_attr is initialized by the ib_cm layer for rdma_cm based applications. 
For the RoCE transport, ah_attr.roce.dmac is already initialized by ib_cm or rdma_cm, either from the wc, the path record, route resolution, or an explicit path record setting, depending on whether the QP is on the active or passive side. Therefore, avoid resolving the DMAC for QPs of kernel consumers. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/verbs.c | 48 +++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index be18ed7c2326..af3f8780f93f 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1323,29 +1323,14 @@ static int ib_resolve_eth_dmac(struct ib_device *device, } /** - * ib_modify_qp_with_udata - Modifies the attributes for the specified QP. - * @ib_qp: The QP to modify. - * @attr: On input, specifies the QP attributes to modify. On output, - * the current values of selected QP attributes are returned. - * @attr_mask: A bit-mask used to specify which attributes of the QP - * are being modified. - * @udata: pointer to user's input output buffer information - * are being modified. - * It returns 0 on success and returns appropriate error code on error. + * IB core internal function to perform QP attributes modification. */ -int ib_modify_qp_with_udata(struct ib_qp *ib_qp, struct ib_qp_attr *attr, - int attr_mask, struct ib_udata *udata) +static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) { - struct ib_qp *qp = ib_qp->real_qp; u8 port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; int ret; - if (attr_mask & IB_QP_AV) { - ret = ib_resolve_eth_dmac(qp->device, &attr->ah_attr); - if (ret) - return ret; - } - if (rdma_ib_or_roce(qp->device, port)) { if (attr_mask & IB_QP_RQ_PSN && attr->rq_psn & ~0xffffff) { pr_warn("%s: %s rq_psn overflow, masking to 24 bits\n", @@ -1366,6 +1351,31 @@ int ib_modify_qp_with_udata(struct ib_qp *ib_qp, struct ib_qp_attr *attr, return ret; } + +/** + * ib_modify_qp_with_udata - Modifies the attributes for the specified QP. + * @ib_qp: The QP to modify. + * @attr: On input, specifies the QP attributes to modify. On output, + * the current values of selected QP attributes are returned. + * @attr_mask: A bit-mask used to specify which attributes of the QP + * are being modified. + * @udata: pointer to user's input output buffer information + * are being modified. + * It returns 0 on success and returns appropriate error code on error.
+ */ +int ib_modify_qp_with_udata(struct ib_qp *ib_qp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct ib_qp *qp = ib_qp->real_qp; + int ret; + + if (attr_mask & IB_QP_AV) { + ret = ib_resolve_eth_dmac(qp->device, &attr->ah_attr); + if (ret) + return ret; + } + return _ib_modify_qp(qp, attr, attr_mask, udata); +} EXPORT_SYMBOL(ib_modify_qp_with_udata); int ib_get_eth_speed(struct ib_device *dev, u8 port_num, u8 *speed, u8 *width) @@ -1427,7 +1437,7 @@ int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask) { - return ib_modify_qp_with_udata(qp, qp_attr, qp_attr_mask, NULL); + return _ib_modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL); } EXPORT_SYMBOL(ib_modify_qp); -- cgit v1.2.3 From f2290d6d522dbcdb8c2bd36e98e57c05bf5ed523 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 9 Jan 2018 15:24:52 +0200 Subject: IB/core: Attempt DMAC resolution for only RoCE Instead of returning 0 (success) for RoCE scenarios where the DMAC should not be resolved, avoid such an attempt and make the code consistent with ib_create_user_ah(). Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/verbs.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index af3f8780f93f..b110db6dbef0 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1301,9 +1301,6 @@ static int ib_resolve_eth_dmac(struct ib_device *device, if (!rdma_is_port_valid(device, rdma_ah_get_port_num(ah_attr))) return -EINVAL; - if (ah_attr->type != RDMA_AH_ATTR_TYPE_ROCE) - return 0; - grh = rdma_ah_retrieve_grh(ah_attr); if (rdma_is_multicast_addr((struct in6_addr *)ah_attr->grh.dgid.raw)) { @@ -1369,7 +1366,8 @@ int ib_modify_qp_with_udata(struct ib_qp *ib_qp, struct ib_qp_attr *attr, struct ib_qp *qp = ib_qp->real_qp; int ret; - if (attr_mask & IB_QP_AV) { + if (attr_mask & IB_QP_AV && + attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) { ret = ib_resolve_eth_dmac(qp->device, &attr->ah_attr); if (ret) return ret; -- cgit v1.2.3 From a6753c4d6257feef2a82522d0d222166d4be4aa8 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 9 Jan 2018 15:24:53 +0200 Subject: IB/core: Limit DMAC resolution to RoCE Connected QPs Resolving the DMAC for RoCE is applicable only to Connected mode QPs. So resolve the DMAC only for Connected mode QPs. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/verbs.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index b110db6dbef0..49a2a764c0bf 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1349,6 +1349,14 @@ static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) return ret; } +static bool is_qp_type_connected(const struct ib_qp *qp) +{ + return (qp->qp_type == IB_QPT_UC || + qp->qp_type == IB_QPT_RC || + qp->qp_type == IB_QPT_XRC_INI || + qp->qp_type == IB_QPT_XRC_TGT); +} + /** * ib_modify_qp_with_udata - Modifies the attributes for the specified QP.
@@ -1367,7 +1375,8 @@ int ib_modify_qp_with_udata(struct ib_qp *ib_qp, struct ib_qp_attr *attr, int ret; if (attr_mask & IB_QP_AV && - attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) { + attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE && + is_qp_type_connected(qp)) { ret = ib_resolve_eth_dmac(qp->device, &attr->ah_attr); if (ret) return ret; -- cgit v1.2.3 From c966ea12c00515e70a976aec57d1a4334582c411 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 11 Jan 2018 14:43:05 -0700 Subject: RDMA: Mark imm_data as be32 in the verbs uapi header This matches what the userspace copy of this header has been doing for a while. imm_data is an opaque 4 byte array carried over the network, and invalidate_rkey is in CPU byte order. Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 2 +- drivers/infiniband/sw/rdmavt/cq.c | 3 +-- drivers/infiniband/sw/rxe/rxe_resp.c | 3 +-- include/uapi/rdma/ib_user_verbs.h | 4 ++-- 4 files changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index c216d98bb816..4ddd61d90507 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1196,7 +1196,7 @@ static int copy_wc_to_user(struct ib_device *ib_dev, void __user *dest, tmp.opcode = wc->opcode; tmp.vendor_err = wc->vendor_err; tmp.byte_len = wc->byte_len; - tmp.ex.imm_data = (__u32 __force) wc->ex.imm_data; + tmp.ex.imm_data = wc->ex.imm_data; tmp.qp_num = wc->qp->qp_num; tmp.src_qp = wc->src_qp; tmp.wc_flags = wc->wc_flags; diff --git a/drivers/infiniband/sw/rdmavt/cq.c b/drivers/infiniband/sw/rdmavt/cq.c index 61fca14c2598..fb52b669bfce 100644 --- a/drivers/infiniband/sw/rdmavt/cq.c +++ b/drivers/infiniband/sw/rdmavt/cq.c @@ -101,8 +101,7 @@ void rvt_cq_enter(struct rvt_cq *cq, struct ib_wc *entry, bool solicited) wc->uqueue[head].opcode = entry->opcode; wc->uqueue[head].vendor_err = entry->vendor_err; wc->uqueue[head].byte_len = entry->byte_len; - wc->uqueue[head].ex.imm_data = - (__u32 __force)entry->ex.imm_data; + wc->uqueue[head].ex.imm_data = entry->ex.imm_data; wc->uqueue[head].qp_num = entry->qp->qp_num; wc->uqueue[head].src_qp = entry->src_qp; wc->uqueue[head].wc_flags = entry->wc_flags; diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 4240866a5331..f07324f2cde2 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -863,8 +863,7 @@ static enum resp_states do_complete(struct rxe_qp *qp, if (pkt->mask & RXE_IMMDT_MASK) { uwc->wc_flags |= IB_WC_WITH_IMM; - uwc->ex.imm_data = - (__u32 __force)immdt_imm(pkt); + uwc->ex.imm_data = immdt_imm(pkt); } if (pkt->mask & RXE_IETH_MASK) { diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 7e11bb8651b6..fd035641cf41 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -449,7 +449,7 @@ struct ib_uverbs_wc { __u32 vendor_err; __u32 byte_len; union { - __u32 imm_data; + __be32 imm_data; __u32 invalidate_rkey; } ex; __u32 qp_num; @@ -765,7 +765,7 @@ struct ib_uverbs_send_wr { __u32 opcode; __u32 send_flags; union { - __u32 imm_data; + __be32 imm_data; __u32 invalidate_rkey; } ex; union { -- cgit v1.2.3 From ccb8a29e7db29f2b889300a80bd0684d646f796b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 11 Jan 2018 14:43:06 -0700 Subject: RDMA/hns: Fix endian problems around imm_data and rkey This matches the changes made recently to the userspace hns driver when it was made sparse 
clean. See rdma-core commit bffd380cfe56 ("libhns: Make the provider sparse clean") wc->imm_data is not used in the kernel so this change has no practical impact. Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 7 ++++--- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 6 +++--- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 5 ++++- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index 939355ede14a..833a305085ef 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -2311,15 +2311,16 @@ static int hns_roce_v1_poll_one(struct hns_roce_cq *hr_cq, case HNS_ROCE_OPCODE_RDMA_WITH_IMM_RECEIVE: wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; wc->wc_flags = IB_WC_WITH_IMM; - wc->ex.imm_data = le32_to_cpu(cqe->immediate_data); + wc->ex.imm_data = + cpu_to_be32(le32_to_cpu(cqe->immediate_data)); break; case HNS_ROCE_OPCODE_SEND_DATA_RECEIVE: if (roce_get_bit(cqe->cqe_byte_4, CQE_BYTE_4_IMM_INDICATOR_S)) { wc->opcode = IB_WC_RECV; wc->wc_flags = IB_WC_WITH_IMM; - wc->ex.imm_data = le32_to_cpu( - cqe->immediate_data); + wc->ex.imm_data = cpu_to_be32( + le32_to_cpu(cqe->immediate_data)); } else { wc->opcode = IB_WC_RECV; wc->wc_flags = 0; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 2ca35e341d09..34f13c3cb088 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1709,7 +1709,7 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, case HNS_ROCE_V2_OPCODE_RDMA_WRITE_IMM: wc->opcode = IB_WC_RECV_RDMA_WITH_IMM; wc->wc_flags = IB_WC_WITH_IMM; - wc->ex.imm_data = le32_to_cpu(cqe->rkey_immtdata); + wc->ex.imm_data = cqe->immtdata; break; case HNS_ROCE_V2_OPCODE_SEND: wc->opcode = IB_WC_RECV; @@ -1718,12 +1718,12 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, case HNS_ROCE_V2_OPCODE_SEND_WITH_IMM: wc->opcode = IB_WC_RECV; wc->wc_flags = IB_WC_WITH_IMM; - wc->ex.imm_data = le32_to_cpu(cqe->rkey_immtdata); + wc->ex.imm_data = cqe->immtdata; break; case HNS_ROCE_V2_OPCODE_SEND_WITH_INV: wc->opcode = IB_WC_RECV; wc->wc_flags = IB_WC_WITH_INVALIDATE; - wc->ex.invalidate_rkey = cqe->rkey_immtdata; + wc->ex.invalidate_rkey = le32_to_cpu(cqe->rkey); break; default: wc->status = IB_WC_GENERAL_ERR; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 463edab9b719..cbf61ad2539c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -762,7 +762,10 @@ struct hns_roce_v2_qp_context { struct hns_roce_v2_cqe { u32 byte_4; - u32 rkey_immtdata; + union { + __le32 rkey; + __be32 immtdata; + }; u32 byte_12; u32 byte_16; u32 byte_cnt; -- cgit v1.2.3 From 7bed7ebcb7c33eb789292f8ecc881d785b13a04c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 11 Jan 2018 14:43:07 -0700 Subject: RDMA/qedr: Fix endian problems around imm_data The double swap matches what user space rdma-core does to imm_data. wc->imm_data is not used in the kernel so this change has no practical impact. 
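As a standalone illustration (not part of the patch), the conversion chain reduces to two small helpers; a hedged sketch, where only the byte-order primitives are real kernel APIs and the wrapper names are hypothetical:

	#include <linux/types.h>
	#include <asm/byteorder.h>

	/*
	 * imm_data is an opaque big-endian value on the wire, while the
	 * hardware descriptors are little-endian.  Routing through CPU
	 * byte order means the net effect is always exactly one BE->LE
	 * swap, merely split differently between the two conversions
	 * depending on host endianness.
	 */
	static inline __le32 imm_to_hw(__be32 imm)
	{
		return cpu_to_le32(be32_to_cpu(imm));
	}

	static inline __be32 imm_from_hw(__le32 hw)
	{
		return cpu_to_be32(le32_to_cpu(hw));
	}

Written this way, sparse can also verify that no __be32 value is ever stored into an __le32 slot without an explicit conversion.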
Acked-by: Michal Kalderon Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qedr/verbs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 3b9c89848d66..5551120ac6ea 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -3039,7 +3039,7 @@ static int __qedr_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, swqe->wqe_size = 2; swqe2 = qed_chain_produce(&qp->sq.pbl); - swqe->inv_key_or_imm_data = cpu_to_le32(wr->ex.imm_data); + swqe->inv_key_or_imm_data = cpu_to_le32(be32_to_cpu(wr->ex.imm_data)); length = qedr_prepare_sq_send_data(dev, qp, swqe, swqe2, wr, bad_wr); swqe->length = cpu_to_le32(length); @@ -3590,7 +3590,7 @@ static inline int qedr_set_ok_cqe_resp_wc(struct rdma_cqe_responder *resp, wc->byte_len = le32_to_cpu(resp->length); if (resp->flags & QEDR_RESP_IMM) { - wc->ex.imm_data = le32_to_cpu(resp->imm_data_or_inv_r_Key); + wc->ex.imm_data = cpu_to_be32(le32_to_cpu(resp->imm_data_or_inv_r_Key)); wc->wc_flags |= IB_WC_WITH_IMM; if (resp->flags & QEDR_RESP_RDMA) -- cgit v1.2.3 From 852f6927594d0d3e8632c889b2ab38cbc46476ad Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Fri, 12 Jan 2018 07:58:40 +0200 Subject: IB/mlx4: Fix incorrectly releasing steerable UD QPs when have only ETH ports Allocating steerable UD QPs depends on having at least one IB port, while releasing those QPs does not. As a result, when there are only ETH ports, the IB (RoCE) driver requests releasing a qp range whose base qp is zero, with qp count zero. When SR-IOV is enabled, and the VF driver is running on a VM over a hypervisor which treats such qp release calls as errors (rather than NOPs), we see lines in the VM message log like: mlx4_core 0002:00:02.0: Failed to release qp range base:0 cnt:0 Fix this by adding a check for a zero count in mlx4_release_qp_range() (which thus treats releasing 0 qps as a nop), and eliminating the check for device managed flow steering when releasing steerable UD QPs. (Freeing ib_uc_qpns_bitmap unconditionally is also OK, since it remains NULL when steerable UD QPs are not allocated). 
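The nop-on-zero guard generalizes to any range-release helper whose callers may legitimately hand back an empty range; a minimal hedged sketch with hypothetical names, not the mlx4 code itself:

	struct qpn_pool;	/* opaque resource pool, hypothetical */
	void __qpn_do_release(struct qpn_pool *pool, int base, int cnt);

	/* Treat an empty range as a no-op rather than an error, so
	 * callers need not special-case "nothing was allocated".
	 */
	static void qpn_release_range(struct qpn_pool *pool, int base, int cnt)
	{
		if (!cnt)
			return;
		__qpn_do_release(pool, base, cnt);
	}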
Cc: Fixes: 4196670be786 ("IB/mlx4: Don't allocate range of steerable UD QPs for Ethernet-only device") Signed-off-by: Jack Morgenstein Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/main.c | 13 +++++-------- drivers/net/ethernet/mellanox/mlx4/qp.c | 3 +++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 5695ce53fddb..8d2ee9322f2e 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -3001,9 +3001,8 @@ err_steer_free_bitmap: kfree(ibdev->ib_uc_qpns_bitmap); err_steer_qp_release: - if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) - mlx4_qp_release_range(dev, ibdev->steer_qpn_base, - ibdev->steer_qpn_count); + mlx4_qp_release_range(dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_count); err_counter: for (i = 0; i < ibdev->num_ports; ++i) mlx4_ib_delete_counters_table(ibdev, &ibdev->counters_table[i]); @@ -3108,11 +3107,9 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) ibdev->iboe.nb.notifier_call = NULL; } - if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) { - mlx4_qp_release_range(dev, ibdev->steer_qpn_base, - ibdev->steer_qpn_count); - kfree(ibdev->ib_uc_qpns_bitmap); - } + mlx4_qp_release_range(dev, ibdev->steer_qpn_base, + ibdev->steer_qpn_count); + kfree(ibdev->ib_uc_qpns_bitmap); iounmap(ibdev->uar_map); for (p = 0; p < ibdev->num_ports; ++p) diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c index 769598f7b6c8..3aaf4bad6c5a 100644 --- a/drivers/net/ethernet/mellanox/mlx4/qp.c +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c @@ -287,6 +287,9 @@ void mlx4_qp_release_range(struct mlx4_dev *dev, int base_qpn, int cnt) u64 in_param = 0; int err; + if (!cnt) + return; + if (mlx4_is_mfunc(dev)) { set_param_l(&in_param, base_qpn); set_param_h(&in_param, cnt); -- cgit v1.2.3 From cd2a6e7d384b043d5d029e39663061cebc949385 Mon Sep 17 00:00:00 2001 From: Bodong Wang Date: Fri, 12 Jan 2018 07:58:41 +0200 Subject: IB/core: Fix ib_wc structure size to remain in 64 bytes boundary The change of slid from u16 to u32 resulted in sizeof(struct ib_wc) crossing the 64B boundary, which causes more cache misses. This patch rearranges the fields and keeps the size within 64B.
Pahole output before this change: struct ib_wc { union { u64 wr_id; /* 8 */ struct ib_cqe * wr_cqe; /* 8 */ }; /* 0 8 */ enum ib_wc_status status; /* 8 4 */ enum ib_wc_opcode opcode; /* 12 4 */ u32 vendor_err; /* 16 4 */ u32 byte_len; /* 20 4 */ struct ib_qp * qp; /* 24 8 */ union { __be32 imm_data; /* 4 */ u32 invalidate_rkey; /* 4 */ } ex; /* 32 4 */ u32 src_qp; /* 36 4 */ int wc_flags; /* 40 4 */ u16 pkey_index; /* 44 2 */ /* XXX 2 bytes hole, try to pack */ u32 slid; /* 48 4 */ u8 sl; /* 52 1 */ u8 dlid_path_bits; /* 53 1 */ u8 port_num; /* 54 1 */ u8 smac[6]; /* 55 6 */ /* XXX 1 byte hole, try to pack */ u16 vlan_id; /* 62 2 */ /* --- cacheline 1 boundary (64 bytes) --- */ u8 network_hdr_type; /* 64 1 */ /* size: 72, cachelines: 2, members: 17 */ /* sum members: 62, holes: 2, sum holes: 3 */ /* padding: 7 */ /* last cacheline: 8 bytes */ }; Pahole output after this change: struct ib_wc { union { u64 wr_id; /* 8 */ struct ib_cqe * wr_cqe; /* 8 */ }; /* 0 8 */ enum ib_wc_status status; /* 8 4 */ enum ib_wc_opcode opcode; /* 12 4 */ u32 vendor_err; /* 16 4 */ u32 byte_len; /* 20 4 */ struct ib_qp * qp; /* 24 8 */ union { __be32 imm_data; /* 4 */ u32 invalidate_rkey; /* 4 */ } ex; /* 32 4 */ u32 src_qp; /* 36 4 */ u32 slid; /* 40 4 */ int wc_flags; /* 44 4 */ u16 pkey_index; /* 48 2 */ u8 sl; /* 50 1 */ u8 dlid_path_bits; /* 51 1 */ u8 port_num; /* 52 1 */ u8 smac[6]; /* 53 6 */ /* XXX 1 byte hole, try to pack */ u16 vlan_id; /* 60 2 */ u8 network_hdr_type; /* 62 1 */ /* size: 64, cachelines: 1, members: 17 */ /* sum members: 62, holes: 1, sum holes: 1 */ /* padding: 1 */ }; Cc: # v4.13 Fixes: 7db20ecd1d97 ("IB/core: Change wc.slid from 16 to 32 bits") Signed-off-by: Bodong Wang Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index f25c03687ee9..f2b1e2244016 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -983,9 +983,9 @@ struct ib_wc { u32 invalidate_rkey; } ex; u32 src_qp; + u32 slid; int wc_flags; u16 pkey_index; - u32 slid; u8 sl; u8 dlid_path_bits; u8 port_num; /* valid only for DR SMPs on switches */ -- cgit v1.2.3 From a6532e7139660c103dda181aa5b2c734aa26ed6c Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 12 Jan 2018 07:58:42 +0200 Subject: RDMA/core: Clarify rdma_ah_find_type iWARP does not use rdma_ah_attr_type, and for this reason we do not have a RDMA_AH_ATTR_TYPE_IWARP. rdma_ah_find_type should not even be called on iwarp ports and for clarity it shouldn't have a special test for iWarp. This changes the result from RDMA_AH_ATTR_TYPE_ROCE to RDMA_AH_ATTR_TYPE_IB when wrongly called on an iWarp port. 
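Layout-sensitive fixes like the ib_wc rearrangement above are easy to regress silently; a hedged sketch of a compile-time guard (BUILD_BUG_ON is the kernel's real compile-time assertion, but adding such a check is only a suggestion, not something these patches do):

	#include <linux/build_bug.h>
	#include <rdma/ib_verbs.h>

	static inline void ib_wc_layout_assert(void)
	{
		/* Keep the work completion within one 64-byte cacheline,
		 * matching the pahole-verified layout shown above.
		 */
		BUILD_BUG_ON(sizeof(struct ib_wc) > 64);
	}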
Fixes: 44c58487d51a ("IB/core: Define 'ib' and 'roce' rdma_ah_attr types") Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index f2b1e2244016..5e32fe781ca3 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -3793,8 +3793,7 @@ static inline void rdma_ah_set_grh(struct rdma_ah_attr *attr, static inline enum rdma_ah_attr_type rdma_ah_find_type(struct ib_device *dev, u32 port_num) { - if ((rdma_protocol_roce(dev, port_num)) || - (rdma_protocol_iwarp(dev, port_num))) + if (rdma_protocol_roce(dev, port_num)) return RDMA_AH_ATTR_TYPE_ROCE; else if ((rdma_protocol_ib(dev, port_num)) && (rdma_cap_opa_ah(dev, port_num))) -- cgit v1.2.3 From 979a459c8347a797fd03717a3f12289c91617982 Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Fri, 12 Jan 2018 15:56:05 +0800 Subject: IB/cma: use strlcpy() instead of strncpy() gcc-8 reports drivers/infiniband/core/cma_configfs.c: In function 'make_cma_dev': ./include/linux/string.h:245:9: warning: '__builtin_strncpy' specified bound 64 equals destination size [-Wstringop-truncation] We need to use strlcpy() to make sure the string is nul-terminated. Signed-off-by: Xiongfeng Wang Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma_configfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c index 31dfee0c8295..eee38b40be99 100644 --- a/drivers/infiniband/core/cma_configfs.c +++ b/drivers/infiniband/core/cma_configfs.c @@ -295,7 +295,7 @@ static struct config_group *make_cma_dev(struct config_group *group, goto fail; } - strncpy(cma_dev_group->name, name, sizeof(cma_dev_group->name)); + strlcpy(cma_dev_group->name, name, sizeof(cma_dev_group->name)); config_group_init_type_name(&cma_dev_group->ports_group, "ports", &cma_ports_group_type); -- cgit v1.2.3 From 6d13b869ea1e929842e9d758867e4b7473759f9d Mon Sep 17 00:00:00 2001 From: oulijun Date: Wed, 10 Jan 2018 14:39:47 +0800 Subject: RDMA/hns: Assign the correct value for tx_cqn When modifying a QP from INIT to INIT, the CQN of the send CQ needs to be assigned to the TX CQN field of the QP context. Otherwise, a failure occurs when the send and recv CQ sizes are different. Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 34f13c3cb088..faf91eea9cf0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2342,7 +2342,7 @@ static void modify_qp_init_to_init(struct ib_qp *ibqp, V2_QPC_BYTE_80_RX_CQN_S, 0); roce_set_field(context->byte_252_err_txcqn, V2_QPC_BYTE_252_TX_CQN_M, - V2_QPC_BYTE_252_TX_CQN_S, to_hr_cq(ibqp->recv_cq)->cqn); + V2_QPC_BYTE_252_TX_CQN_S, to_hr_cq(ibqp->send_cq)->cqn); roce_set_field(qpc_mask->byte_252_err_txcqn, V2_QPC_BYTE_252_TX_CQN_M, V2_QPC_BYTE_252_TX_CQN_S, 0); -- cgit v1.2.3 From b66efc932067cb653da6bcbe03fe3f7bc53bf30d Mon Sep 17 00:00:00 2001 From: oulijun Date: Wed, 10 Jan 2018 14:39:48 +0800 Subject: RDMA/hns: Create gsi qp in hip08 The GSI QP and the RC QP use the same QP context structure and creation flow; they are differentiated only by QPN and QP type.
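Condensed from the hns_roce_create_qp() hunk below, the GSI QPN selection reduces to a single branch on the hardware generation; a hedged sketch (the helper name is hypothetical; max_sq_sg <= 2 is how this code distinguishes hip06 from hip08):

	/* Sketch: pick the GSI QP number per hardware generation. */
	static unsigned long hns_gsi_qpn(struct hns_roce_dev *hr_dev, u8 port)
	{
		if (hr_dev->caps.max_sq_sg <= 2)	/* hip06: per-port SQP number */
			return HNS_ROCE_MAX_PORTS +
			       hr_dev->iboe.phy_port[port];
		return 1;	/* hip08: the GSI QP is always QP1 */
	}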
Signed-off-by: Lijun Ou Signed-off-by: Yixian Liu Signed-off-by: Wei Hu (Xavier) Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_qp.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 351fa3119420..4414cea9ef56 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -455,6 +455,13 @@ static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev, hr_qp->sge.sge_shift = 4; } + /* ud sqwqe's sge use extend sge */ + if (hr_dev->caps.max_sq_sg > 2 && hr_qp->ibqp.qp_type == IB_QPT_GSI) { + hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt * + hr_qp->sq.max_gs); + hr_qp->sge.sge_shift = 4; + } + /* Get buf size, SQ and RQ are aligned to PAGE_SIZE */ page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT); hr_qp->sq.offset = 0; @@ -502,6 +509,8 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev, hr_qp->state = IB_QPS_RESET; + hr_qp->ibqp.qp_type = init_attr->qp_type; + if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) hr_qp->sq_signal_bits = IB_SIGNAL_ALL_WR; else @@ -764,8 +773,13 @@ struct ib_qp *hns_roce_create_qp(struct ib_pd *pd, hr_qp = &hr_sqp->hr_qp; hr_qp->port = init_attr->port_num - 1; hr_qp->phy_port = hr_dev->iboe.phy_port[hr_qp->port]; - hr_qp->ibqp.qp_num = HNS_ROCE_MAX_PORTS + - hr_dev->iboe.phy_port[hr_qp->port]; + + /* when hw version is v1, the sqpn is allocated */ + if (hr_dev->caps.max_sq_sg <= 2) + hr_qp->ibqp.qp_num = HNS_ROCE_MAX_PORTS + + hr_dev->iboe.phy_port[hr_qp->port]; + else + hr_qp->ibqp.qp_num = 1; ret = hns_roce_create_qp_common(hr_dev, pd, init_attr, udata, hr_qp->ibqp.qp_num, hr_qp); -- cgit v1.2.3 From 0fa95a9a71025a966434573e6b8d7d5c6b50116d Mon Sep 17 00:00:00 2001 From: oulijun Date: Wed, 10 Jan 2018 14:39:49 +0800 Subject: RDMA/hns: Add gsi qp support for modifying qp in hip08 Some fields in the QP context need to be assigned when the QP type is GSI in hip08.
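The context/qpc_mask pairing used throughout these hunks follows one convention: every field written into the context also has its bits cleared in the mask, and only fields whose mask bits are zero are applied by hardware. A hedged toy model of the idiom (the real driver uses roce_set_field() on packed bitfields):

	/* Sketch: write a QPC field and mark it valid in the mask. */
	static void qpc_set_field(u32 *ctx, u32 *mask, u32 field_mask,
				  unsigned int shift, u32 val)
	{
		*ctx |= (val << shift) & field_mask;	/* value to apply */
		*mask &= ~field_mask;			/* 0 = field is valid */
	}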
Signed-off-by: Lijun Ou Signed-off-by: Yixian Liu Signed-off-by: Wei Hu (Xavier) Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 1 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 64 +++++++++++++++++++++-------- 2 files changed, 49 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 4afa070b20fd..42c3b5a2d441 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -485,6 +485,7 @@ struct hns_roce_qp { u32 access_flags; u32 atomic_rd_en; u32 pkey_index; + u32 qkey; void (*event)(struct hns_roce_qp *, enum hns_roce_event); unsigned long qpn; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index faf91eea9cf0..c919ffc1b34b 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1975,6 +1975,7 @@ static void set_access_flags(struct hns_roce_qp *hr_qp, static void modify_qp_reset_to_init(struct ib_qp *ibqp, const struct ib_qp_attr *attr, + int attr_mask, struct hns_roce_v2_qp_context *context, struct hns_roce_v2_qp_context *qpc_mask) { @@ -1991,9 +1992,18 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_TST_M, V2_QPC_BYTE_4_TST_S, 0); - roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M, - V2_QPC_BYTE_4_SGE_SHIFT_S, hr_qp->sq.max_gs > 2 ? - ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0); + if (ibqp->qp_type == IB_QPT_GSI) + roce_set_field(context->byte_4_sqpn_tst, + V2_QPC_BYTE_4_SGE_SHIFT_M, + V2_QPC_BYTE_4_SGE_SHIFT_S, + ilog2((unsigned int)hr_qp->sge.sge_cnt)); + else + roce_set_field(context->byte_4_sqpn_tst, + V2_QPC_BYTE_4_SGE_SHIFT_M, + V2_QPC_BYTE_4_SGE_SHIFT_S, + hr_qp->sq.max_gs > 2 ? + ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0); + roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M, V2_QPC_BYTE_4_SGE_SHIFT_S, 0); @@ -2058,6 +2068,12 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_CNP_TX_FLAG_S, 0); roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_CE_FLAG_S, 0); + if (attr_mask & IB_QP_QKEY) { + context->qkey_xrcd = attr->qkey; + qpc_mask->qkey_xrcd = 0; + hr_qp->qkey = attr->qkey; + } + roce_set_bit(context->byte_76_srqn_op_en, V2_QPC_BYTE_76_RQIE_S, 1); roce_set_bit(qpc_mask->byte_76_srqn_op_en, V2_QPC_BYTE_76_RQIE_S, 0); @@ -2279,9 +2295,17 @@ static void modify_qp_init_to_init(struct ib_qp *ibqp, roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_TST_M, V2_QPC_BYTE_4_TST_S, 0); - roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M, - V2_QPC_BYTE_4_SGE_SHIFT_S, hr_qp->sq.max_gs > 2 ? - ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0); + if (ibqp->qp_type == IB_QPT_GSI) + roce_set_field(context->byte_4_sqpn_tst, + V2_QPC_BYTE_4_SGE_SHIFT_M, + V2_QPC_BYTE_4_SGE_SHIFT_S, + ilog2((unsigned int)hr_qp->sge.sge_cnt)); + else + roce_set_field(context->byte_4_sqpn_tst, + V2_QPC_BYTE_4_SGE_SHIFT_M, + V2_QPC_BYTE_4_SGE_SHIFT_S, hr_qp->sq.max_gs > 2 ? 
+ ilog2((unsigned int)hr_qp->sge.sge_cnt) : 0); + roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SGE_SHIFT_M, V2_QPC_BYTE_4_SGE_SHIFT_S, 0); @@ -2358,10 +2382,10 @@ static void modify_qp_init_to_init(struct ib_qp *ibqp, V2_QPC_BYTE_76_SRQN_M, V2_QPC_BYTE_76_SRQN_S, 0); } - if (attr_mask & IB_QP_PKEY_INDEX) - context->qkey_xrcd = attr->pkey_index; - else - context->qkey_xrcd = hr_qp->pkey_index; + if (attr_mask & IB_QP_QKEY) { + context->qkey_xrcd = attr->qkey; + qpc_mask->qkey_xrcd = 0; + } roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_SQPN_M, V2_QPC_BYTE_4_SQPN_S, hr_qp->qpn); @@ -2457,7 +2481,8 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, roce_set_field(context->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_SGE_HOP_NUM_M, V2_QPC_BYTE_20_SGE_HOP_NUM_S, - hr_qp->sq.max_gs > 2 ? hr_dev->caps.mtt_hop_num : 0); + ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > 2) ? + hr_dev->caps.mtt_hop_num : 0); roce_set_field(qpc_mask->byte_20_smac_sgid_idx, V2_QPC_BYTE_20_SGE_HOP_NUM_M, V2_QPC_BYTE_20_SGE_HOP_NUM_S, 0); @@ -2617,8 +2642,13 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp, roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M, V2_QPC_BYTE_24_TC_S, 0); - roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M, - V2_QPC_BYTE_24_MTU_S, attr->path_mtu); + if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_UD) + roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M, + V2_QPC_BYTE_24_MTU_S, IB_MTU_4096); + else + roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M, + V2_QPC_BYTE_24_MTU_S, attr->path_mtu); + roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_MTU_M, V2_QPC_BYTE_24_MTU_S, 0); @@ -2725,13 +2755,14 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_S, 0); page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT); - context->sq_cur_sge_blk_addr = hr_qp->sq.max_gs > 2 ? + context->sq_cur_sge_blk_addr = + ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > 2) ? ((u32)(mtts[hr_qp->sge.offset / page_size] >> PAGE_ADDR_SHIFT)) : 0; roce_set_field(context->byte_184_irrl_idx, V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_M, V2_QPC_BYTE_184_SQ_CUR_SGE_BLK_ADDR_S, - hr_qp->sq.max_gs > 2 ? + ((ibqp->qp_type == IB_QPT_GSI) || hr_qp->sq.max_gs > 2) ? (mtts[hr_qp->sge.offset / page_size] >> (32 + PAGE_ADDR_SHIFT)) : 0); qpc_mask->sq_cur_sge_blk_addr = 0; @@ -2902,7 +2933,8 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, */ memset(qpc_mask, 0xff, sizeof(*qpc_mask)); if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { - modify_qp_reset_to_init(ibqp, attr, context, qpc_mask); + modify_qp_reset_to_init(ibqp, attr, attr_mask, context, + qpc_mask); } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_INIT) { modify_qp_init_to_init(ibqp, attr, attr_mask, context, qpc_mask); -- cgit v1.2.3 From 7bdee4158b3778493d81bf09105568c91abce110 Mon Sep 17 00:00:00 2001 From: oulijun Date: Wed, 10 Jan 2018 14:39:50 +0800 Subject: RDMA/hns: Fill sq wqe context of ud type in hip08 This patch mainly configures the fields of the UD-type SQ WQE when posting a WR on a GSI QP.
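In outline, the reworked hns_roce_v2_post_send() below fans out per QP type before building the WQE; a hedged skeleton of that control flow (names abbreviated, WQE construction elided):

	/* Sketch: per-QP-type WQE construction in the post-send path. */
	static int build_wqe(struct ib_qp *ibqp, struct ib_send_wr *wr)
	{
		switch (ibqp->qp_type) {
		case IB_QPT_GSI:
			/* UD WQE: DMAC, loopback bit, QKEY, DQPN, SL,
			 * SGID index, DGID, payload via extended SGEs.
			 */
			return 0;
		case IB_QPT_RC:
			/* RC WQE: opcode, rkey/va for RDMA opcodes,
			 * inline data or an SGE list.
			 */
			return 0;
		default:
			return -EOPNOTSUPP;	/* unsupported QP type */
		}
	}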
Signed-off-by: Lijun Ou Signed-off-by: Yixian Liu Signed-off-by: Wei Hu (Xavier) Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 446 +++++++++++++++++++---------- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 84 ++++++ 2 files changed, 385 insertions(+), 145 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index c919ffc1b34b..908309c102a6 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -51,26 +51,99 @@ static void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg, dseg->len = cpu_to_le32(sg->length); } +static int set_rwqe_data_seg(struct ib_qp *ibqp, struct ib_send_wr *wr, + struct hns_roce_v2_rc_send_wqe *rc_sq_wqe, + void *wqe, unsigned int *sge_ind, + struct ib_send_wr **bad_wr) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_v2_wqe_data_seg *dseg = wqe; + struct hns_roce_qp *qp = to_hr_qp(ibqp); + int i; + + if (wr->send_flags & IB_SEND_INLINE && wr->num_sge) { + if (rc_sq_wqe->msg_len > hr_dev->caps.max_sq_inline) { + *bad_wr = wr; + dev_err(hr_dev->dev, "inline len(1-%d)=%d, illegal", + rc_sq_wqe->msg_len, hr_dev->caps.max_sq_inline); + return -EINVAL; + } + + for (i = 0; i < wr->num_sge; i++) { + memcpy(wqe, ((void *)wr->sg_list[i].addr), + wr->sg_list[i].length); + wqe += wr->sg_list[i].length; + } + + roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_INLINE_S, + 1); + } else { + if (wr->num_sge <= 2) { + for (i = 0; i < wr->num_sge; i++) { + if (likely(wr->sg_list[i].length)) { + set_data_seg_v2(dseg, wr->sg_list + i); + dseg++; + } + } + } else { + roce_set_field(rc_sq_wqe->byte_20, + V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M, + V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S, + (*sge_ind) & (qp->sge.sge_cnt - 1)); + + for (i = 0; i < 2; i++) { + if (likely(wr->sg_list[i].length)) { + set_data_seg_v2(dseg, wr->sg_list + i); + dseg++; + } + } + + dseg = get_send_extend_sge(qp, + (*sge_ind) & (qp->sge.sge_cnt - 1)); + + for (i = 0; i < wr->num_sge - 2; i++) { + if (likely(wr->sg_list[i + 2].length)) { + set_data_seg_v2(dseg, + wr->sg_list + 2 + i); + dseg++; + (*sge_ind)++; + } + } + } + + roce_set_field(rc_sq_wqe->byte_16, + V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M, + V2_RC_SEND_WQE_BYTE_16_SGE_NUM_S, wr->num_sge); + } + + return 0; +} + static int hns_roce_v2_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct ib_send_wr **bad_wr) { struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); + struct hns_roce_ah *ah = to_hr_ah(ud_wr(wr)->ah); + struct hns_roce_v2_ud_send_wqe *ud_sq_wqe; struct hns_roce_v2_rc_send_wqe *rc_sq_wqe; struct hns_roce_qp *qp = to_hr_qp(ibqp); struct hns_roce_v2_wqe_data_seg *dseg; struct device *dev = hr_dev->dev; struct hns_roce_v2_db sq_db; unsigned int sge_ind = 0; - unsigned int wqe_sz = 0; unsigned int owner_bit; unsigned long flags; unsigned int ind; void *wqe = NULL; + bool loopback; int ret = 0; + u8 *smac; int nreq; int i; - if (unlikely(ibqp->qp_type != IB_QPT_RC)) { + if (unlikely(ibqp->qp_type != IB_QPT_RC && + ibqp->qp_type != IB_QPT_GSI && + ibqp->qp_type != IB_QPT_UD)) { dev_err(dev, "Not supported QP(0x%x)type!\n", ibqp->qp_type); *bad_wr = NULL; return -EOPNOTSUPP; @@ -107,172 +180,255 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, wr->wr_id; owner_bit = ~(qp->sq.head >> ilog2(qp->sq.wqe_cnt)) & 0x1; - rc_sq_wqe = wqe; - memset(rc_sq_wqe, 0, sizeof(*rc_sq_wqe)); - for (i = 0; i < wr->num_sge; i++) - rc_sq_wqe->msg_len += 
wr->sg_list[i].length; - rc_sq_wqe->inv_key_immtdata = send_ieth(wr); + /* Corresponding to the QP type, wqe process separately */ + if (ibqp->qp_type == IB_QPT_GSI) { + ud_sq_wqe = wqe; + memset(ud_sq_wqe, 0, sizeof(*ud_sq_wqe)); + + roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_0_M, + V2_UD_SEND_WQE_DMAC_0_S, ah->av.mac[0]); + roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_1_M, + V2_UD_SEND_WQE_DMAC_1_S, ah->av.mac[1]); + roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_2_M, + V2_UD_SEND_WQE_DMAC_2_S, ah->av.mac[2]); + roce_set_field(ud_sq_wqe->dmac, V2_UD_SEND_WQE_DMAC_3_M, + V2_UD_SEND_WQE_DMAC_3_S, ah->av.mac[3]); + roce_set_field(ud_sq_wqe->byte_48, + V2_UD_SEND_WQE_BYTE_48_DMAC_4_M, + V2_UD_SEND_WQE_BYTE_48_DMAC_4_S, + ah->av.mac[4]); + roce_set_field(ud_sq_wqe->byte_48, + V2_UD_SEND_WQE_BYTE_48_DMAC_5_M, + V2_UD_SEND_WQE_BYTE_48_DMAC_5_S, + ah->av.mac[5]); + + /* MAC loopback */ + smac = (u8 *)hr_dev->dev_addr[qp->port]; + loopback = ether_addr_equal_unaligned(ah->av.mac, + smac) ? 1 : 0; + + roce_set_bit(ud_sq_wqe->byte_40, + V2_UD_SEND_WQE_BYTE_40_LBI_S, loopback); + + roce_set_field(ud_sq_wqe->byte_4, + V2_UD_SEND_WQE_BYTE_4_OPCODE_M, + V2_UD_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_SEND); - roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_FENCE_S, - (wr->send_flags & IB_SEND_FENCE) ? 1 : 0); + for (i = 0; i < wr->num_sge; i++) + ud_sq_wqe->msg_len += wr->sg_list[i].length; - roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_SE_S, - (wr->send_flags & IB_SEND_SOLICITED) ? 1 : 0); + ud_sq_wqe->immtdata = send_ieth(wr); - roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_CQE_S, - (wr->send_flags & IB_SEND_SIGNALED) ? 1 : 0); + /* Set sig attr */ + roce_set_bit(ud_sq_wqe->byte_4, + V2_UD_SEND_WQE_BYTE_4_CQE_S, + (wr->send_flags & IB_SEND_SIGNALED) ? 1 : 0); - roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_OWNER_S, - owner_bit); + /* Set se attr */ + roce_set_bit(ud_sq_wqe->byte_4, + V2_UD_SEND_WQE_BYTE_4_SE_S, + (wr->send_flags & IB_SEND_SOLICITED) ? 1 : 0); - switch (wr->opcode) { - case IB_WR_RDMA_READ: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_RDMA_READ); - rc_sq_wqe->rkey = cpu_to_le32(rdma_wr(wr)->rkey); - rc_sq_wqe->va = cpu_to_le64(rdma_wr(wr)->remote_addr); - break; - case IB_WR_RDMA_WRITE: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_RDMA_WRITE); - rc_sq_wqe->rkey = cpu_to_le32(rdma_wr(wr)->rkey); - rc_sq_wqe->va = cpu_to_le64(rdma_wr(wr)->remote_addr); - break; - case IB_WR_RDMA_WRITE_WITH_IMM: - roce_set_field(rc_sq_wqe->byte_4, + roce_set_bit(ud_sq_wqe->byte_4, + V2_UD_SEND_WQE_BYTE_4_OWNER_S, owner_bit); + + roce_set_field(ud_sq_wqe->byte_16, + V2_UD_SEND_WQE_BYTE_16_PD_M, + V2_UD_SEND_WQE_BYTE_16_PD_S, + to_hr_pd(ibqp->pd)->pdn); + + roce_set_field(ud_sq_wqe->byte_16, + V2_UD_SEND_WQE_BYTE_16_SGE_NUM_M, + V2_UD_SEND_WQE_BYTE_16_SGE_NUM_S, + wr->num_sge); + + roce_set_field(ud_sq_wqe->byte_20, + V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M, + V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S, + sge_ind & (qp->sge.sge_cnt - 1)); + + roce_set_field(ud_sq_wqe->byte_24, + V2_UD_SEND_WQE_BYTE_24_UDPSPN_M, + V2_UD_SEND_WQE_BYTE_24_UDPSPN_S, 0); + ud_sq_wqe->qkey = + cpu_to_be32(ud_wr(wr)->remote_qkey & 0x80000000) ? 
+ qp->qkey : ud_wr(wr)->remote_qkey; + roce_set_field(ud_sq_wqe->byte_32, + V2_UD_SEND_WQE_BYTE_32_DQPN_M, + V2_UD_SEND_WQE_BYTE_32_DQPN_S, + ud_wr(wr)->remote_qpn); + + roce_set_field(ud_sq_wqe->byte_36, + V2_UD_SEND_WQE_BYTE_36_VLAN_M, + V2_UD_SEND_WQE_BYTE_36_VLAN_S, + ah->av.vlan); + roce_set_field(ud_sq_wqe->byte_36, + V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_M, + V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_S, + ah->av.hop_limit); + roce_set_field(ud_sq_wqe->byte_36, + V2_UD_SEND_WQE_BYTE_36_TCLASS_M, + V2_UD_SEND_WQE_BYTE_36_TCLASS_S, + 0); + roce_set_field(ud_sq_wqe->byte_36, + V2_UD_SEND_WQE_BYTE_36_TCLASS_M, + V2_UD_SEND_WQE_BYTE_36_TCLASS_S, + 0); + roce_set_field(ud_sq_wqe->byte_40, + V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_M, + V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_S, 0); + roce_set_field(ud_sq_wqe->byte_40, + V2_UD_SEND_WQE_BYTE_40_SL_M, + V2_UD_SEND_WQE_BYTE_40_SL_S, + ah->av.sl_tclass_flowlabel >> + HNS_ROCE_SL_SHIFT); + roce_set_field(ud_sq_wqe->byte_40, + V2_UD_SEND_WQE_BYTE_40_PORTN_M, + V2_UD_SEND_WQE_BYTE_40_PORTN_S, + qp->port); + + roce_set_field(ud_sq_wqe->byte_48, + V2_UD_SEND_WQE_BYTE_48_SGID_INDX_M, + V2_UD_SEND_WQE_BYTE_48_SGID_INDX_S, + hns_get_gid_index(hr_dev, qp->phy_port, + ah->av.gid_index)); + + memcpy(&ud_sq_wqe->dgid[0], &ah->av.dgid[0], + GID_LEN_V2); + + dseg = get_send_extend_sge(qp, + sge_ind & (qp->sge.sge_cnt - 1)); + for (i = 0; i < wr->num_sge; i++) { + set_data_seg_v2(dseg + i, wr->sg_list + i); + sge_ind++; + } + + ind++; + } else if (ibqp->qp_type == IB_QPT_RC) { + rc_sq_wqe = wqe; + memset(rc_sq_wqe, 0, sizeof(*rc_sq_wqe)); + for (i = 0; i < wr->num_sge; i++) + rc_sq_wqe->msg_len += wr->sg_list[i].length; + + rc_sq_wqe->inv_key_immtdata = send_ieth(wr); + + roce_set_bit(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_FENCE_S, + (wr->send_flags & IB_SEND_FENCE) ? 1 : 0); + + roce_set_bit(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_SE_S, + (wr->send_flags & IB_SEND_SOLICITED) ? 1 : 0); + + roce_set_bit(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_CQE_S, + (wr->send_flags & IB_SEND_SIGNALED) ? 
1 : 0); + + roce_set_bit(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OWNER_S, owner_bit); + + switch (wr->opcode) { + case IB_WR_RDMA_READ: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_RDMA_READ); + rc_sq_wqe->rkey = + cpu_to_le32(rdma_wr(wr)->rkey); + rc_sq_wqe->va = + cpu_to_le64(rdma_wr(wr)->remote_addr); + break; + case IB_WR_RDMA_WRITE: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_RDMA_WRITE); + rc_sq_wqe->rkey = + cpu_to_le32(rdma_wr(wr)->rkey); + rc_sq_wqe->va = + cpu_to_le64(rdma_wr(wr)->remote_addr); + break; + case IB_WR_RDMA_WRITE_WITH_IMM: + roce_set_field(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_OPCODE_M, V2_RC_SEND_WQE_BYTE_4_OPCODE_S, HNS_ROCE_V2_WQE_OP_RDMA_WRITE_WITH_IMM); - rc_sq_wqe->rkey = cpu_to_le32(rdma_wr(wr)->rkey); - rc_sq_wqe->va = cpu_to_le64(rdma_wr(wr)->remote_addr); - break; - case IB_WR_SEND: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_SEND); - break; - case IB_WR_SEND_WITH_INV: - roce_set_field(rc_sq_wqe->byte_4, + rc_sq_wqe->rkey = + cpu_to_le32(rdma_wr(wr)->rkey); + rc_sq_wqe->va = + cpu_to_le64(rdma_wr(wr)->remote_addr); + break; + case IB_WR_SEND: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_SEND); + break; + case IB_WR_SEND_WITH_INV: + roce_set_field(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_OPCODE_M, V2_RC_SEND_WQE_BYTE_4_OPCODE_S, HNS_ROCE_V2_WQE_OP_SEND_WITH_INV); - break; - case IB_WR_SEND_WITH_IMM: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_SEND_WITH_IMM); - break; - case IB_WR_LOCAL_INV: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_LOCAL_INV); - break; - case IB_WR_ATOMIC_CMP_AND_SWP: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP); - break; - case IB_WR_ATOMIC_FETCH_AND_ADD: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD); - break; - case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: - roce_set_field(rc_sq_wqe->byte_4, + break; + case IB_WR_SEND_WITH_IMM: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_SEND_WITH_IMM); + break; + case IB_WR_LOCAL_INV: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_LOCAL_INV); + break; + case IB_WR_ATOMIC_CMP_AND_SWP: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP); + break; + case IB_WR_ATOMIC_FETCH_AND_ADD: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD); + break; + case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: + roce_set_field(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_OPCODE_M, V2_RC_SEND_WQE_BYTE_4_OPCODE_S, HNS_ROCE_V2_WQE_OP_ATOM_MSK_CMP_AND_SWAP); - break; - case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD: - roce_set_field(rc_sq_wqe->byte_4, + break; + case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD: + roce_set_field(rc_sq_wqe->byte_4, 
V2_RC_SEND_WQE_BYTE_4_OPCODE_M, V2_RC_SEND_WQE_BYTE_4_OPCODE_S, HNS_ROCE_V2_WQE_OP_ATOM_MSK_FETCH_AND_ADD); - break; - default: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_MASK); - break; - } - - wqe += sizeof(struct hns_roce_v2_rc_send_wqe); - dseg = wqe; - if (wr->send_flags & IB_SEND_INLINE && wr->num_sge) { - if (rc_sq_wqe->msg_len > - hr_dev->caps.max_sq_inline) { - ret = -EINVAL; - *bad_wr = wr; - dev_err(dev, "inline len(1-%d)=%d, illegal", - rc_sq_wqe->msg_len, - hr_dev->caps.max_sq_inline); - goto out; + break; + default: + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, + HNS_ROCE_V2_WQE_OP_MASK); + break; } - for (i = 0; i < wr->num_sge; i++) { - memcpy(wqe, ((void *)wr->sg_list[i].addr), - wr->sg_list[i].length); - wqe += wr->sg_list[i].length; - wqe_sz += wr->sg_list[i].length; - } + wqe += sizeof(struct hns_roce_v2_rc_send_wqe); + dseg = wqe; - roce_set_bit(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_INLINE_S, 1); + ret = set_rwqe_data_seg(ibqp, wr, rc_sq_wqe, wqe, + &sge_ind, bad_wr); + if (ret) + goto out; + ind++; } else { - if (wr->num_sge <= 2) { - for (i = 0; i < wr->num_sge; i++) { - if (likely(wr->sg_list[i].length)) { - set_data_seg_v2(dseg, - wr->sg_list + i); - dseg++; - } - } - } else { - roce_set_field(rc_sq_wqe->byte_20, - V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M, - V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S, - sge_ind & (qp->sge.sge_cnt - 1)); - - for (i = 0; i < 2; i++) { - if (likely(wr->sg_list[i].length)) { - set_data_seg_v2(dseg, - wr->sg_list + i); - dseg++; - } - } - - dseg = get_send_extend_sge(qp, - sge_ind & (qp->sge.sge_cnt - 1)); - - for (i = 0; i < wr->num_sge - 2; i++) { - if (likely(wr->sg_list[i + 2].length)) { - set_data_seg_v2(dseg, - wr->sg_list + 2 + i); - dseg++; - sge_ind++; - } - } - } - - roce_set_field(rc_sq_wqe->byte_16, - V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M, - V2_RC_SEND_WQE_BYTE_16_SGE_NUM_S, - wr->num_sge); - wqe_sz += wr->num_sge * - sizeof(struct hns_roce_v2_wqe_data_seg); + dev_err(dev, "Illegal qp_type(0x%x)\n", ibqp->qp_type); + spin_unlock_irqrestore(&qp->sq.lock, flags); + return -EOPNOTSUPP; } - ind++; } out: diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index cbf61ad2539c..26fa7005dbf1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -919,6 +919,90 @@ struct hns_roce_v2_cq_db { #define V2_CQ_DB_PARAMETER_NOTIFY_S 24 +struct hns_roce_v2_ud_send_wqe { + u32 byte_4; + u32 msg_len; + u32 immtdata; + u32 byte_16; + u32 byte_20; + u32 byte_24; + u32 qkey; + u32 byte_32; + u32 byte_36; + u32 byte_40; + u32 dmac; + u32 byte_48; + u8 dgid[GID_LEN_V2]; + +}; +#define V2_UD_SEND_WQE_BYTE_4_OPCODE_S 0 +#define V2_UD_SEND_WQE_BYTE_4_OPCODE_M GENMASK(4, 0) + +#define V2_UD_SEND_WQE_BYTE_4_OWNER_S 7 + +#define V2_UD_SEND_WQE_BYTE_4_CQE_S 8 + +#define V2_UD_SEND_WQE_BYTE_4_SE_S 11 + +#define V2_UD_SEND_WQE_BYTE_16_PD_S 0 +#define V2_UD_SEND_WQE_BYTE_16_PD_M GENMASK(23, 0) + +#define V2_UD_SEND_WQE_BYTE_16_SGE_NUM_S 24 +#define V2_UD_SEND_WQE_BYTE_16_SGE_NUM_M GENMASK(31, 24) + +#define V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S 0 +#define V2_UD_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M GENMASK(23, 0) + +#define V2_UD_SEND_WQE_BYTE_24_UDPSPN_S 16 +#define V2_UD_SEND_WQE_BYTE_24_UDPSPN_M GENMASK(31, 16) + +#define V2_UD_SEND_WQE_BYTE_32_DQPN_S 0 +#define V2_UD_SEND_WQE_BYTE_32_DQPN_M GENMASK(23, 0) + 
+#define V2_UD_SEND_WQE_BYTE_36_VLAN_S 0
+#define V2_UD_SEND_WQE_BYTE_36_VLAN_M GENMASK(15, 0)
+
+#define V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_S 16
+#define V2_UD_SEND_WQE_BYTE_36_HOPLIMIT_M GENMASK(23, 16)
+
+#define V2_UD_SEND_WQE_BYTE_36_TCLASS_S 24
+#define V2_UD_SEND_WQE_BYTE_36_TCLASS_M GENMASK(31, 24)
+
+#define V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_S 0
+#define V2_UD_SEND_WQE_BYTE_40_FLOW_LABEL_M GENMASK(19, 0)
+
+#define V2_UD_SEND_WQE_BYTE_40_SL_S 20
+#define V2_UD_SEND_WQE_BYTE_40_SL_M GENMASK(23, 20)
+
+#define V2_UD_SEND_WQE_BYTE_40_PORTN_S 24
+#define V2_UD_SEND_WQE_BYTE_40_PORTN_M GENMASK(26, 24)
+
+#define V2_UD_SEND_WQE_BYTE_40_LBI_S 31
+
+#define V2_UD_SEND_WQE_DMAC_0_S 0
+#define V2_UD_SEND_WQE_DMAC_0_M GENMASK(7, 0)
+
+#define V2_UD_SEND_WQE_DMAC_1_S 8
+#define V2_UD_SEND_WQE_DMAC_1_M GENMASK(15, 8)
+
+#define V2_UD_SEND_WQE_DMAC_2_S 16
+#define V2_UD_SEND_WQE_DMAC_2_M GENMASK(23, 16)
+
+#define V2_UD_SEND_WQE_DMAC_3_S 24
+#define V2_UD_SEND_WQE_DMAC_3_M GENMASK(31, 24)
+
+#define V2_UD_SEND_WQE_BYTE_48_DMAC_4_S 0
+#define V2_UD_SEND_WQE_BYTE_48_DMAC_4_M GENMASK(7, 0)
+
+#define V2_UD_SEND_WQE_BYTE_48_DMAC_5_S 8
+#define V2_UD_SEND_WQE_BYTE_48_DMAC_5_M GENMASK(15, 8)
+
+#define V2_UD_SEND_WQE_BYTE_48_SGID_INDX_S 16
+#define V2_UD_SEND_WQE_BYTE_48_SGID_INDX_M GENMASK(23, 16)
+
+#define V2_UD_SEND_WQE_BYTE_48_SMAC_INDX_S 24
+#define V2_UD_SEND_WQE_BYTE_48_SMAC_INDX_M GENMASK(31, 24)
+
 struct hns_roce_v2_rc_send_wqe {
 	u32 byte_4;
 	u32 msg_len;
-- 
cgit v1.2.3


From 6c1f08b347f64de38460d5b3d1eb4a30028869cb Mon Sep 17 00:00:00 2001
From: oulijun
Date: Wed, 10 Jan 2018 14:39:51 +0800
Subject: RDMA/hns: Assign zero for pkey_index of wc in hip08

Because the pkey is fixed for hip08 RoCE, the driver needs to assign
zero to the pkey_index of the wc; otherwise, an error occurs when a
connection is established through the communication management (CM)
mechanism.

Signed-off-by: Lijun Ou
Signed-off-by: Yixian Liu
Signed-off-by: Wei Hu (Xavier)
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 908309c102a6..321f5932bb36 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -1910,6 +1910,9 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 		wc->wc_flags |= (roce_get_bit(cqe->byte_32,
 					      V2_CQE_BYTE_32_GRH_S) ?
 				 IB_WC_GRH : 0);
+		wc->port_num = roce_get_field(cqe->byte_32,
+				V2_CQE_BYTE_32_PORTN_M, V2_CQE_BYTE_32_PORTN_S);
+		wc->pkey_index = 0;
 	}
 
 	return 0;
-- 
cgit v1.2.3


From 2eade675351b0ef9e054ccb62334efd716aa853c Mon Sep 17 00:00:00 2001
From: oulijun
Date: Wed, 10 Jan 2018 14:39:52 +0800
Subject: RDMA/hns: Update the verbs of polling for completion

If the port is a RoCEv2 port, the poll verb is updated so that the
remote port address and QP information returned for UD completions
are filled in as well.
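
As an illustration (a minimal consumer-side sketch, not part of this
patch), a ULP polling the CQ would observe the new fields roughly as
follows; the ib_wc members shown match the ones filled in here:

	struct ib_wc wc;

	if (ib_poll_cq(cq, 1, &wc) == 1 && (wc.opcode & IB_WC_RECV)) {
		/* RoCEv2 UD completions now carry the peer's L2 info */
		if (wc.wc_flags & IB_WC_WITH_SMAC)
			pr_info("peer smac %pM\n", wc.smac);
		if (wc.wc_flags & IB_WC_WITH_VLAN)
			pr_info("vlan id %u\n", wc.vlan_id);
	}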
Signed-off-by: Lijun Ou
Signed-off-by: Yixian Liu
Signed-off-by: Wei Hu (Xavier)
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 12 ++++++++++++
 drivers/infiniband/hw/hns/hns_roce_hw_v2.h |  2 +-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 321f5932bb36..11391c8ce6df 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -1913,6 +1913,18 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq,
 		wc->port_num = roce_get_field(cqe->byte_32,
 				V2_CQE_BYTE_32_PORTN_M, V2_CQE_BYTE_32_PORTN_S);
 		wc->pkey_index = 0;
+		memcpy(wc->smac, cqe->smac, 4);
+		wc->smac[4] = roce_get_field(cqe->byte_28,
+					     V2_CQE_BYTE_28_SMAC_4_M,
+					     V2_CQE_BYTE_28_SMAC_4_S);
+		wc->smac[5] = roce_get_field(cqe->byte_28,
+					     V2_CQE_BYTE_28_SMAC_5_M,
+					     V2_CQE_BYTE_28_SMAC_5_S);
+		wc->vlan_id = 0xffff;
+		wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC);
+		wc->network_hdr_type = roce_get_field(cqe->byte_28,
+						      V2_CQE_BYTE_28_PORT_TYPE_M,
+						      V2_CQE_BYTE_28_PORT_TYPE_S);
 	}
 
 	return 0;
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
index 26fa7005dbf1..960df095392a 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h
@@ -769,7 +769,7 @@ struct hns_roce_v2_cqe {
 	u32 byte_12;
 	u32 byte_16;
 	u32 byte_cnt;
-	u32 smac;
+	u8 smac[4];
 	u32 byte_28;
 	u32 byte_32;
 };
-- 
cgit v1.2.3


From d4994d2f1f7a7b24622f990d4bb437eacf69b816 Mon Sep 17 00:00:00 2001
From: oulijun
Date: Wed, 10 Jan 2018 14:39:53 +0800
Subject: RDMA/hns: Set the guid for hip08 RoCE device

This patch assigns a GUID (Globally Unique Identifier) value to the
hip08 device.

Signed-off-by: Lijun Ou
Signed-off-by: Yixian Liu
Signed-off-by: Wei Hu (Xavier)
Reviewed-by: Leon Romanovsky
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 11391c8ce6df..256fe110107a 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -34,6 +34,7 @@
 #include
 #include
 #include
+#include
 #include
 #include "hnae3.h"
@@ -4679,6 +4680,9 @@ static int hns_roce_hw_v2_get_cfg(struct hns_roce_dev *hr_dev,
 	hr_dev->iboe.netdevs[0] = handle->rinfo.netdev;
 	hr_dev->iboe.phy_port[0] = 0;
 
+	addrconf_addr_eui48((u8 *)&hr_dev->ib_dev.node_guid,
+			    hr_dev->iboe.netdevs[0]->dev_addr);
+
 	for (i = 0; i < HNS_ROCE_V2_MAX_IRQ_NUM; i++)
 		hr_dev->irq[i] = pci_irq_vector(handle->pdev,
 						i + handle->rinfo.base_vector);
-- 
cgit v1.2.3


From 23541b28e5a33a277f654dd6672484ded1050216 Mon Sep 17 00:00:00 2001
From: Shiraz Saleem
Date: Thu, 11 Jan 2018 18:10:50 -0600
Subject: i40iw: Remove extra call to i40iw_est_sd()

Remove the redundant estimate-SD function call. sd_needed should
already be updated at the end of the do/while resource-reduction loop.
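
In outline, the pattern is (a simplified sketch of the loop, not the
exact driver code):

	do {
		/* trim QP/CQ/MR/PBLE requests ... */
		sd_needed = i40iw_est_sd(dev, hmc_info);
	} while (sd_needed > hmc_fpm_misc->max_sds && loop_count < 2000);

	/* sd_needed already reflects the final iteration here, so an
	 * extra i40iw_est_sd() call after the loop adds no information.
	 */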
Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_ctrl.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c index caa958177631..c91dcfdf933f 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c +++ b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c @@ -3957,8 +3957,6 @@ enum i40iw_status_code i40iw_config_fpm_values(struct i40iw_sc_dev *dev, u32 qp_ pblewanted -= FPM_MULTIPLIER * 1000; } while (sd_needed > hmc_fpm_misc->max_sds && loop_count < 2000); - sd_needed = i40iw_est_sd(dev, hmc_info); - i40iw_debug(dev, I40IW_DEBUG_HMC, "loop_cnt=%d, sd_needed=%lld, qpcnt = %d, cqcnt=%d, mrcnt=%d, pblecnt=%d\n", loop_count, sd_needed, -- cgit v1.2.3 From 6376e926af1a8661dd1b2e6d0896e07f84a35844 Mon Sep 17 00:00:00 2001 From: Shiraz Saleem Date: Thu, 11 Jan 2018 18:10:51 -0600 Subject: i40iw: Zero-out consumer key on allocate stag for FMR If the application invalidates the MR before the FMR WR, HW parses the consumer key portion of the stag and returns an invalid stag key Asynchronous Event (AE) that tears down the QP. Fix this by zeroing-out the consumer key portion of the allocated stag returned to application for FMR. Fixes: ee855d3b93f3 ("RDMA/i40iw: Add base memory management extensions") Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 3c6f3ce88f89..9f18997b2e9e 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -1637,6 +1637,7 @@ static struct ib_mr *i40iw_alloc_mr(struct ib_pd *pd, err_code = -EOVERFLOW; goto err; } + stag &= ~I40IW_CQPSQ_STAG_KEY_MASK; iwmr->stag = stag; iwmr->ibmr.rkey = stag; iwmr->ibmr.lkey = stag; -- cgit v1.2.3 From 72b30e986d2527c023de455d55570e2d32494839 Mon Sep 17 00:00:00 2001 From: Sindhu Devale Date: Thu, 11 Jan 2018 18:10:52 -0600 Subject: i40iw: Remove limit on re-posting AEQ entries to HW Currently, if the number of processed Asynchronous Event Queue (AEQ) entries exceeds 255, they are not returned to HW for re-use. During scale-up, the unreturned AEQ entries can grow to the max AEQ size and cause the HW to report an AEQ overflow. Remove the check which limits the number of processed AEQ entries returned to HW. 
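
Conceptually, the AEQ processing loop now hands every consumed entry
back to the hardware (a simplified sketch, assuming the usual dev/aeq
handles; the real loop lives in the AEQ interrupt path):

	u32 processed = 0;
	struct i40iw_aeqe_info info;

	while (!i40iw_sc_get_next_aeqe(aeq, &info)) {
		/* dispatch the asynchronous event ... */
		processed++;
	}

	/* no 255-entry clamp: return the full count for re-use */
	i40iw_sc_repost_aeq_entries(dev, processed);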
Fixes: 86dbcd0f12e9 ("RDMA/i40iw: add file to handle cqp calls") Signed-off-by: Sindhu Devale Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_ctrl.c | 2 -- drivers/infiniband/hw/i40iw/i40iw_user.h | 1 - 2 files changed, 3 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c index c91dcfdf933f..c74fd3309b93 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c +++ b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c @@ -1893,8 +1893,6 @@ static enum i40iw_status_code i40iw_sc_get_next_aeqe(struct i40iw_sc_aeq *aeq, static enum i40iw_status_code i40iw_sc_repost_aeq_entries(struct i40iw_sc_dev *dev, u32 count) { - if (count > I40IW_MAX_AEQ_ALLOCATE_COUNT) - return I40IW_ERR_INVALID_SIZE; if (dev->is_pf) i40iw_wr32(dev->hw, I40E_PFPE_AEQALLOC, count); diff --git a/drivers/infiniband/hw/i40iw/i40iw_user.h b/drivers/infiniband/hw/i40iw/i40iw_user.h index 5467c6fdad03..b125925641e0 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_user.h +++ b/drivers/infiniband/hw/i40iw/i40iw_user.h @@ -59,7 +59,6 @@ enum i40iw_device_capabilities_const { I40IW_MAX_CEQ_ENTRIES = 131071, I40IW_MIN_CQ_SIZE = 1, I40IW_MAX_CQ_SIZE = 1048575, - I40IW_MAX_AEQ_ALLOCATE_COUNT = 255, I40IW_DB_ID_ZERO = 0, I40IW_MAX_WQ_FRAGMENT_COUNT = 3, I40IW_MAX_SGE_RD = 1, -- cgit v1.2.3 From ebb6c0c015321ba7f3af21e4ce059ca9e73b3896 Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Thu, 11 Jan 2018 18:10:53 -0600 Subject: i40iw: Remove setting of rem_addr.len Remove setting of rem_addr.len before calling iw_rdma_write, iw_inline_rdma_write and rdma_read. rem_addr.len is not used in those functions. Signed-off-by: Mustafa Ismail Signed-off-by: Shiraz Saleem Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 9f18997b2e9e..fdc1a38f21b5 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -2243,14 +2243,12 @@ static int i40iw_post_send(struct ib_qp *ibqp, info.op.inline_rdma_write.len = ib_wr->sg_list[0].length; info.op.inline_rdma_write.rem_addr.tag_off = rdma_wr(ib_wr)->remote_addr; info.op.inline_rdma_write.rem_addr.stag = rdma_wr(ib_wr)->rkey; - info.op.inline_rdma_write.rem_addr.len = ib_wr->sg_list->length; ret = ukqp->ops.iw_inline_rdma_write(ukqp, &info, false); } else { info.op.rdma_write.lo_sg_list = (void *)ib_wr->sg_list; info.op.rdma_write.num_lo_sges = ib_wr->num_sge; info.op.rdma_write.rem_addr.tag_off = rdma_wr(ib_wr)->remote_addr; info.op.rdma_write.rem_addr.stag = rdma_wr(ib_wr)->rkey; - info.op.rdma_write.rem_addr.len = ib_wr->sg_list->length; ret = ukqp->ops.iw_rdma_write(ukqp, &info, false); } @@ -2272,7 +2270,6 @@ static int i40iw_post_send(struct ib_qp *ibqp, info.op_type = I40IW_OP_TYPE_RDMA_READ; info.op.rdma_read.rem_addr.tag_off = rdma_wr(ib_wr)->remote_addr; info.op.rdma_read.rem_addr.stag = rdma_wr(ib_wr)->rkey; - info.op.rdma_read.rem_addr.len = ib_wr->sg_list->length; info.op.rdma_read.lo_addr.tag_off = ib_wr->sg_list->addr; info.op.rdma_read.lo_addr.stag = ib_wr->sg_list->lkey; info.op.rdma_read.lo_addr.len = ib_wr->sg_list->length; -- cgit v1.2.3 From f20d429511affab6a2a9129f46042f43e6ffe396 Mon Sep 17 00:00:00 2001 From: Mustafa Ismail Date: Thu, 11 Jan 2018 18:10:54 -0600 Subject: i40iw: Free IEQ resources The iWARP Exception Queue (IEQ) resources are not freed when a QP is destroyed. 
Fix this by freeing IEQ resources when freeing QP resources.

Fixes: d37498417947 ("i40iw: add files for iwarp interface")
Signed-off-by: Mustafa Ismail
Signed-off-by: Shiraz Saleem
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/i40iw/i40iw_puda.c  | 3 +--
 drivers/infiniband/hw/i40iw/i40iw_puda.h  | 1 +
 drivers/infiniband/hw/i40iw/i40iw_verbs.c | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/i40iw/i40iw_puda.c b/drivers/infiniband/hw/i40iw/i40iw_puda.c
index f64b6700f43f..4c21197830b3 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_puda.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_puda.c
@@ -48,7 +48,6 @@ static void i40iw_ieq_tx_compl(struct i40iw_sc_vsi *vsi, void *sqwrid);
 static void i40iw_ilq_putback_rcvbuf(struct i40iw_sc_qp *qp, u32 wqe_idx);
 static enum i40iw_status_code i40iw_puda_replenish_rq(struct i40iw_puda_rsrc
 						      *rsrc, bool initial);
-static void i40iw_ieq_cleanup_qp(struct i40iw_puda_rsrc *ieq, struct i40iw_sc_qp *qp);
 /**
  * i40iw_puda_get_listbuf - get buffer from puda list
  * @list: list to use for buffers (ILQ or IEQ)
@@ -1483,7 +1482,7 @@ static void i40iw_ieq_tx_compl(struct i40iw_sc_vsi *vsi, void *sqwrid)
  * @ieq: ieq resource
  * @qp: all pending fpdu buffers
  */
-static void i40iw_ieq_cleanup_qp(struct i40iw_puda_rsrc *ieq, struct i40iw_sc_qp *qp)
+void i40iw_ieq_cleanup_qp(struct i40iw_puda_rsrc *ieq, struct i40iw_sc_qp *qp)
 {
 	struct i40iw_puda_buf *buf;
 	struct i40iw_pfpdu *pfpdu = &qp->pfpdu;
diff --git a/drivers/infiniband/hw/i40iw/i40iw_puda.h b/drivers/infiniband/hw/i40iw/i40iw_puda.h
index 660aa3edae56..53a7d58c84b5 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_puda.h
+++ b/drivers/infiniband/hw/i40iw/i40iw_puda.h
@@ -184,4 +184,5 @@ enum i40iw_status_code i40iw_cqp_qp_create_cmd(struct i40iw_sc_dev *dev, struct
 enum i40iw_status_code i40iw_cqp_cq_create_cmd(struct i40iw_sc_dev *dev, struct i40iw_sc_cq *cq);
 void i40iw_cqp_qp_destroy_cmd(struct i40iw_sc_dev *dev, struct i40iw_sc_qp *qp);
 void i40iw_cqp_cq_destroy_cmd(struct i40iw_sc_dev *dev, struct i40iw_sc_cq *cq);
+void i40iw_ieq_cleanup_qp(struct i40iw_puda_rsrc *ieq, struct i40iw_sc_qp *qp);
 #endif
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
index fdc1a38f21b5..70024e8e2692 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -412,6 +412,7 @@ void i40iw_free_qp_resources(struct i40iw_device *iwdev,
 {
 	struct i40iw_pbl *iwpbl = &iwqp->iwpbl;
 
+	i40iw_ieq_cleanup_qp(iwdev->vsi.ieq, &iwqp->sc_qp);
 	i40iw_dealloc_push_page(iwdev, &iwqp->sc_qp);
 	if (qp_num)
 		i40iw_free_resource(iwdev, iwdev->allocated_qps, qp_num);
-- 
cgit v1.2.3


From ccd9d0d3dffcd783a1a626337146b79f4c4ee53d Mon Sep 17 00:00:00 2001
From: Selvin Xavier
Date: Thu, 11 Jan 2018 11:52:07 -0500
Subject: RDMA/bnxt_re: Enable RoCE on virtual functions

RoCE can be used by virtual functions (VFs) as well. Add code to
reserve and initialize resources and to make them available to the
RDMA applications running on those VFs. Currently, fifty percent of
the total available resources are reserved for the PF and the
remainder is divided equally among the active VFs.
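
For a given resource type the per-VF share therefore works out as
follows (a sketch of the arithmetic implemented in
bnxt_re_set_resource_limits() below; "total" stands in for the
per-resource count):

	u32 vf_pct = 100 - BNXT_RE_PCT_RSVD_FOR_PF;	/* 50 */
	u32 per_vf = (total * vf_pct) / (100 * num_vfs);

	/* e.g. 64K QPs, 50% kept for the PF, 4 active VFs:
	 * per_vf = (65536 * 50) / (100 * 4) = 8192 QPs per VF
	 */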
Signed-off-by: Selvin Xavier Signed-off-by: Devesh Sharma Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 15 ++++ drivers/infiniband/hw/bnxt_re/ib_verbs.c | 17 ++-- drivers/infiniband/hw/bnxt_re/main.c | 135 ++++++++++++++++++++++++------- drivers/infiniband/hw/bnxt_re/qplib_sp.c | 37 ++++++++- drivers/infiniband/hw/bnxt_re/qplib_sp.h | 5 +- drivers/infiniband/hw/bnxt_re/roce_hsi.h | 10 +++ 6 files changed, 182 insertions(+), 37 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index ecbac91b2e14..b604277ed0dd 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -57,6 +57,19 @@ #define BNXT_RE_MAX_MRW_COUNT (64 * 1024) #define BNXT_RE_MAX_SRQC_COUNT (64 * 1024) #define BNXT_RE_MAX_CQ_COUNT (64 * 1024) +#define BNXT_RE_MAX_MRW_COUNT_64K (64 * 1024) +#define BNXT_RE_MAX_MRW_COUNT_256K (256 * 1024) + +/* Number of MRs to reserve for PF, leaving remainder for VFs */ +#define BNXT_RE_RESVD_MR_FOR_PF (32 * 1024) +#define BNXT_RE_MAX_GID_PER_VF 128 + +/* + * Percentage of resources of each type reserved for PF. + * Remaining resources are divided equally among VFs. + * [0, 100] + */ +#define BNXT_RE_PCT_RSVD_FOR_PF 50 #define BNXT_RE_UD_QP_HW_STALL 0x400000 @@ -145,6 +158,8 @@ struct bnxt_re_dev { struct bnxt_re_ah *sqp_ah; struct bnxt_re_sqp_entries sqp_tbl[1024]; atomic_t nq_alloc_cnt; + u32 is_virtfn; + u32 num_vfs; }; #define to_bnxt_re_dev(ptr, member) \ diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 2032db7db766..6dd2fe1421d0 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -2295,10 +2295,14 @@ int bnxt_re_post_recv(struct ib_qp *ib_qp, struct ib_recv_wr *wr, /* Completion Queues */ int bnxt_re_destroy_cq(struct ib_cq *ib_cq) { - struct bnxt_re_cq *cq = container_of(ib_cq, struct bnxt_re_cq, ib_cq); - struct bnxt_re_dev *rdev = cq->rdev; int rc; - struct bnxt_qplib_nq *nq = cq->qplib_cq.nq; + struct bnxt_re_cq *cq; + struct bnxt_qplib_nq *nq; + struct bnxt_re_dev *rdev; + + cq = container_of(ib_cq, struct bnxt_re_cq, ib_cq); + rdev = cq->rdev; + nq = cq->qplib_cq.nq; rc = bnxt_qplib_destroy_cq(&rdev->qplib_res, &cq->qplib_cq); if (rc) { @@ -2308,12 +2312,11 @@ int bnxt_re_destroy_cq(struct ib_cq *ib_cq) if (!IS_ERR_OR_NULL(cq->umem)) ib_umem_release(cq->umem); - if (cq) { - kfree(cq->cql); - kfree(cq); - } atomic_dec(&rdev->cq_count); nq->budget--; + kfree(cq->cql); + kfree(cq); + return 0; } diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index bf268bf1f496..677263ef2859 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -80,6 +80,79 @@ static DEFINE_MUTEX(bnxt_re_dev_lock); static struct workqueue_struct *bnxt_re_wq; static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev, bool lock_wait); +/* SR-IOV helper functions */ + +static void bnxt_re_get_sriov_func_type(struct bnxt_re_dev *rdev) +{ + struct bnxt *bp; + + bp = netdev_priv(rdev->en_dev->net); + if (BNXT_VF(bp)) + rdev->is_virtfn = 1; +} + +/* Set the maximum number of each resource that the driver actually wants + * to allocate. This may be up to the maximum number the firmware has + * reserved for the function. The driver may choose to allocate fewer + * resources than the firmware maximum. 
+ */ +static void bnxt_re_set_resource_limits(struct bnxt_re_dev *rdev) +{ + u32 vf_qps = 0, vf_srqs = 0, vf_cqs = 0, vf_mrws = 0, vf_gids = 0; + u32 i; + u32 vf_pct; + u32 num_vfs; + struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + + rdev->qplib_ctx.qpc_count = min_t(u32, BNXT_RE_MAX_QPC_COUNT, + dev_attr->max_qp); + + rdev->qplib_ctx.mrw_count = BNXT_RE_MAX_MRW_COUNT_256K; + /* Use max_mr from fw since max_mrw does not get set */ + rdev->qplib_ctx.mrw_count = min_t(u32, rdev->qplib_ctx.mrw_count, + dev_attr->max_mr); + rdev->qplib_ctx.srqc_count = min_t(u32, BNXT_RE_MAX_SRQC_COUNT, + dev_attr->max_srq); + rdev->qplib_ctx.cq_count = min_t(u32, BNXT_RE_MAX_CQ_COUNT, + dev_attr->max_cq); + + for (i = 0; i < MAX_TQM_ALLOC_REQ; i++) + rdev->qplib_ctx.tqm_count[i] = + rdev->dev_attr.tqm_alloc_reqs[i]; + + if (rdev->num_vfs) { + /* + * Reserve a set of resources for the PF. Divide the remaining + * resources among the VFs + */ + vf_pct = 100 - BNXT_RE_PCT_RSVD_FOR_PF; + num_vfs = 100 * rdev->num_vfs; + vf_qps = (rdev->qplib_ctx.qpc_count * vf_pct) / num_vfs; + vf_srqs = (rdev->qplib_ctx.srqc_count * vf_pct) / num_vfs; + vf_cqs = (rdev->qplib_ctx.cq_count * vf_pct) / num_vfs; + /* + * The driver allows many more MRs than other resources. If the + * firmware does also, then reserve a fixed amount for the PF + * and divide the rest among VFs. VFs may use many MRs for NFS + * mounts, ISER, NVME applications, etc. If the firmware + * severely restricts the number of MRs, then let PF have + * half and divide the rest among VFs, as for the other + * resource types. + */ + if (rdev->qplib_ctx.mrw_count < BNXT_RE_MAX_MRW_COUNT_64K) + vf_mrws = rdev->qplib_ctx.mrw_count * vf_pct / num_vfs; + else + vf_mrws = (rdev->qplib_ctx.mrw_count - + BNXT_RE_RESVD_MR_FOR_PF) / rdev->num_vfs; + vf_gids = BNXT_RE_MAX_GID_PER_VF; + } + rdev->qplib_ctx.vf_res.max_mrw_per_vf = vf_mrws; + rdev->qplib_ctx.vf_res.max_gid_per_vf = vf_gids; + rdev->qplib_ctx.vf_res.max_qp_per_vf = vf_qps; + rdev->qplib_ctx.vf_res.max_srq_per_vf = vf_srqs; + rdev->qplib_ctx.vf_res.max_cq_per_vf = vf_cqs; +} + /* for handling bnxt_en callbacks later */ static void bnxt_re_stop(void *p) { @@ -91,6 +164,15 @@ static void bnxt_re_start(void *p) static void bnxt_re_sriov_config(void *p, int num_vfs) { + struct bnxt_re_dev *rdev = p; + + if (!rdev) + return; + + rdev->num_vfs = num_vfs; + bnxt_re_set_resource_limits(rdev); + bnxt_qplib_set_func_resources(&rdev->qplib_res, &rdev->rcfw, + &rdev->qplib_ctx); } static void bnxt_re_shutdown(void *p) @@ -734,7 +816,8 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) /* Configure and allocate resources for qplib */ rdev->qplib_res.rcfw = &rdev->rcfw; - rc = bnxt_qplib_get_dev_attr(&rdev->rcfw, &rdev->dev_attr); + rc = bnxt_qplib_get_dev_attr(&rdev->rcfw, &rdev->dev_attr, + rdev->is_virtfn); if (rc) goto fail; @@ -1035,19 +1118,6 @@ static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev, bool lock_wait) } } -static void bnxt_re_set_resource_limits(struct bnxt_re_dev *rdev) -{ - u32 i; - - rdev->qplib_ctx.qpc_count = BNXT_RE_MAX_QPC_COUNT; - rdev->qplib_ctx.mrw_count = BNXT_RE_MAX_MRW_COUNT; - rdev->qplib_ctx.srqc_count = BNXT_RE_MAX_SRQC_COUNT; - rdev->qplib_ctx.cq_count = BNXT_RE_MAX_CQ_COUNT; - for (i = 0; i < MAX_TQM_ALLOC_REQ; i++) - rdev->qplib_ctx.tqm_count[i] = - rdev->dev_attr.tqm_alloc_reqs[i]; -} - /* worker thread for polling periodic events. 
Now used for QoS programming*/ static void bnxt_re_worker(struct work_struct *work) { @@ -1070,6 +1140,9 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) } set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); + /* Check whether VF or PF */ + bnxt_re_get_sriov_func_type(rdev); + rc = bnxt_re_request_msix(rdev); if (rc) { pr_err("Failed to get MSI-X vectors: %#x\n", rc); @@ -1101,16 +1174,18 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) (rdev->en_dev->pdev, &rdev->rcfw, rdev->msix_entries[BNXT_RE_AEQ_IDX].vector, rdev->msix_entries[BNXT_RE_AEQ_IDX].db_offset, - 0, &bnxt_re_aeq_handler); + rdev->is_virtfn, &bnxt_re_aeq_handler); if (rc) { pr_err("Failed to enable RCFW channel: %#x\n", rc); goto free_ring; } - rc = bnxt_qplib_get_dev_attr(&rdev->rcfw, &rdev->dev_attr); + rc = bnxt_qplib_get_dev_attr(&rdev->rcfw, &rdev->dev_attr, + rdev->is_virtfn); if (rc) goto disable_rcfw; - bnxt_re_set_resource_limits(rdev); + if (!rdev->is_virtfn) + bnxt_re_set_resource_limits(rdev); rc = bnxt_qplib_alloc_ctx(rdev->en_dev->pdev, &rdev->qplib_ctx, 0); if (rc) { @@ -1125,7 +1200,8 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) goto free_ctx; } - rc = bnxt_qplib_init_rcfw(&rdev->rcfw, &rdev->qplib_ctx, 0); + rc = bnxt_qplib_init_rcfw(&rdev->rcfw, &rdev->qplib_ctx, + rdev->is_virtfn); if (rc) { pr_err("Failed to initialize RCFW: %#x\n", rc); goto free_sctx; @@ -1144,13 +1220,15 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) goto fail; } - rc = bnxt_re_setup_qos(rdev); - if (rc) - pr_info("RoCE priority not yet configured\n"); + if (!rdev->is_virtfn) { + rc = bnxt_re_setup_qos(rdev); + if (rc) + pr_info("RoCE priority not yet configured\n"); - INIT_DELAYED_WORK(&rdev->worker, bnxt_re_worker); - set_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags); - schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000)); + INIT_DELAYED_WORK(&rdev->worker, bnxt_re_worker); + set_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags); + schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000)); + } /* Register ib dev */ rc = bnxt_re_register_ib(rdev); @@ -1400,7 +1478,7 @@ err_netdev: static void __exit bnxt_re_mod_exit(void) { - struct bnxt_re_dev *rdev; + struct bnxt_re_dev *rdev, *next; LIST_HEAD(to_be_deleted); mutex_lock(&bnxt_re_dev_lock); @@ -1408,8 +1486,11 @@ static void __exit bnxt_re_mod_exit(void) if (!list_empty(&bnxt_re_dev_list)) list_splice_init(&bnxt_re_dev_list, &to_be_deleted); mutex_unlock(&bnxt_re_dev_lock); - - list_for_each_entry(rdev, &to_be_deleted, list) { + /* + * Cleanup the devices in reverse order so that the VF device + * cleanup is done before PF cleanup + */ + list_for_each_entry_safe_reverse(rdev, next, &to_be_deleted, list) { dev_info(rdev_to_dev(rdev), "Unregistering Device"); bnxt_re_dev_stop(rdev); bnxt_re_ib_unreg(rdev, true); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 9543ce51a28a..f5450b78a34d 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -65,7 +65,7 @@ static bool bnxt_qplib_is_atomic_cap(struct bnxt_qplib_rcfw *rcfw) } int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, - struct bnxt_qplib_dev_attr *attr) + struct bnxt_qplib_dev_attr *attr, bool vf) { struct cmdq_query_func req; struct creq_query_func_resp resp; @@ -95,7 +95,8 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, /* Extract the context from the side buffer */ attr->max_qp = le32_to_cpu(sb->max_qp); /* max_qp value reported by FW for PF doesn't include the QP1 for 
PF */
-	attr->max_qp += 1;
+	if (!vf)
+		attr->max_qp += 1;
 	attr->max_qp_rd_atom =
 		sb->max_qp_rd_atom > BNXT_QPLIB_MAX_OUT_RD_ATOM ?
 		BNXT_QPLIB_MAX_OUT_RD_ATOM : sb->max_qp_rd_atom;
@@ -150,6 +151,38 @@ bail:
 	return rc;
 }
 
+int bnxt_qplib_set_func_resources(struct bnxt_qplib_res *res,
+				  struct bnxt_qplib_rcfw *rcfw,
+				  struct bnxt_qplib_ctx *ctx)
+{
+	struct cmdq_set_func_resources req;
+	struct creq_set_func_resources_resp resp;
+	u16 cmd_flags = 0;
+	int rc = 0;
+
+	RCFW_CMD_PREP(req, SET_FUNC_RESOURCES, cmd_flags);
+
+	req.number_of_qp = cpu_to_le32(ctx->qpc_count);
+	req.number_of_mrw = cpu_to_le32(ctx->mrw_count);
+	req.number_of_srq = cpu_to_le32(ctx->srqc_count);
+	req.number_of_cq = cpu_to_le32(ctx->cq_count);
+
+	req.max_qp_per_vf = cpu_to_le32(ctx->vf_res.max_qp_per_vf);
+	req.max_mrw_per_vf = cpu_to_le32(ctx->vf_res.max_mrw_per_vf);
+	req.max_srq_per_vf = cpu_to_le32(ctx->vf_res.max_srq_per_vf);
+	req.max_cq_per_vf = cpu_to_le32(ctx->vf_res.max_cq_per_vf);
+	req.max_gid_per_vf = cpu_to_le32(ctx->vf_res.max_gid_per_vf);
+
+	rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+					  (void *)&resp,
+					  NULL, 0);
+	if (rc) {
+		dev_err(&res->pdev->dev,
+			"QPLIB: Failed to set function resources");
+	}
+	return rc;
+}
+
 /* SGID */
 int bnxt_qplib_get_sgid(struct bnxt_qplib_res *res,
 			struct bnxt_qplib_sgid_tbl *sgid_tbl, int index,
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h
index 11322582f5e4..51bb7842b867 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h
@@ -147,7 +147,10 @@ int bnxt_qplib_add_pkey(struct bnxt_qplib_res *res,
 			struct bnxt_qplib_pkey_tbl *pkey_tbl, u16 *pkey,
 			bool update);
 int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw,
-			    struct bnxt_qplib_dev_attr *attr);
+			    struct bnxt_qplib_dev_attr *attr, bool vf);
+int bnxt_qplib_set_func_resources(struct bnxt_qplib_res *res,
+				  struct bnxt_qplib_rcfw *rcfw,
+				  struct bnxt_qplib_ctx *ctx);
 int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah);
 int bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah);
 int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res,
diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h
index c3cba6063a03..f429fdb18e21 100644
--- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h
+++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h
@@ -1799,6 +1799,16 @@ struct cmdq_set_func_resources {
 	u8 resp_size;
 	u8 reserved8;
 	__le64 resp_addr;
+	__le32 number_of_qp;
+	__le32 number_of_mrw;
+	__le32 number_of_srq;
+	__le32 number_of_cq;
+	__le32 max_qp_per_vf;
+	__le32 max_mrw_per_vf;
+	__le32 max_srq_per_vf;
+	__le32 max_cq_per_vf;
+	__le32 max_gid_per_vf;
+	__le32 stat_ctx_id;
 };
 
 /* Read hardware resource context command (24 bytes) */
-- 
cgit v1.2.3


From 2fc68543f2ab8dbee1048bb2275b065eef90e19d Mon Sep 17 00:00:00 2001
From: Selvin Xavier
Date: Thu, 11 Jan 2018 11:52:08 -0500
Subject: RDMA/bnxt_re: Add support for query firmware version

The device now reports its firmware version, so the hard-coded FW
version string and the redundant fw_rev hook in sysfs are removed.
Add code to query the firmware version from the underlying device and
report it through the kernel verb that returns the firmware version
string.
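
The four version bytes returned by the QUERY_VERSION firmware command
are then rendered through the standard get_dev_fw_str hook, along the
lines of (matching the bnxt_re_query_fw_str() helper added below):

	snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d.%d",
		 fw_ver[0], fw_ver[1], fw_ver[2], fw_ver[3]);
	/* e.g. 20.6.28.0, the value that used to be hard-coded */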
Signed-off-by: Selvin Xavier Signed-off-by: Devesh Sharma Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 14 ++++++++++++-- drivers/infiniband/hw/bnxt_re/ib_verbs.h | 1 + drivers/infiniband/hw/bnxt_re/main.c | 11 +---------- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 3 ++- drivers/infiniband/hw/bnxt_re/qplib_sp.c | 22 +++++++++++++++++++++- drivers/infiniband/hw/bnxt_re/qplib_sp.h | 3 ++- 6 files changed, 39 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 6dd2fe1421d0..8a80e9548d06 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -141,8 +141,9 @@ int bnxt_re_query_device(struct ib_device *ibdev, struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; memset(ib_attr, 0, sizeof(*ib_attr)); - - ib_attr->fw_ver = (u64)(unsigned long)(dev_attr->fw_ver); + memcpy(&ib_attr->fw_ver, dev_attr->fw_ver, + min(sizeof(dev_attr->fw_ver), + sizeof(ib_attr->fw_ver))); bnxt_qplib_get_guid(rdev->netdev->dev_addr, (u8 *)&ib_attr->sys_image_guid); ib_attr->max_mr_size = BNXT_RE_MAX_MR_SIZE; @@ -281,6 +282,15 @@ int bnxt_re_get_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } +void bnxt_re_query_fw_str(struct ib_device *ibdev, char *str) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); + + snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d.%d", + rdev->dev_attr.fw_ver[0], rdev->dev_attr.fw_ver[1], + rdev->dev_attr.fw_ver[2], rdev->dev_attr.fw_ver[3]); +} + int bnxt_re_query_pkey(struct ib_device *ibdev, u8 port_num, u16 index, u16 *pkey) { diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index 1df11ed272ea..66dd8d2c4655 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -143,6 +143,7 @@ int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num, struct ib_port_attr *port_attr); int bnxt_re_get_port_immutable(struct ib_device *ibdev, u8 port_num, struct ib_port_immutable *immutable); +void bnxt_re_query_fw_str(struct ib_device *ibdev, char *str); int bnxt_re_query_pkey(struct ib_device *ibdev, u8 port_num, u16 index, u16 *pkey); int bnxt_re_del_gid(struct ib_device *ibdev, u8 port_num, diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 677263ef2859..ca1195024e9b 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -572,6 +572,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) ibdev->query_port = bnxt_re_query_port; ibdev->get_port_immutable = bnxt_re_get_port_immutable; + ibdev->get_dev_fw_str = bnxt_re_query_fw_str; ibdev->query_pkey = bnxt_re_query_pkey; ibdev->query_gid = bnxt_re_query_gid; ibdev->get_netdev = bnxt_re_get_netdev; @@ -623,14 +624,6 @@ static ssize_t show_rev(struct device *device, struct device_attribute *attr, return scnprintf(buf, PAGE_SIZE, "0x%x\n", rdev->en_dev->pdev->vendor); } -static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, - char *buf) -{ - struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev); - - return scnprintf(buf, PAGE_SIZE, "%s\n", rdev->dev_attr.fw_ver); -} - static ssize_t show_hca(struct device *device, struct device_attribute *attr, char *buf) { @@ -640,12 +633,10 @@ static ssize_t show_hca(struct device *device, struct device_attribute *attr, } static DEVICE_ATTR(hw_rev, 0444, show_rev, NULL); -static DEVICE_ATTR(fw_rev, 0444, show_fw_ver, 
NULL);
 static DEVICE_ATTR(hca_type, 0444, show_hca, NULL);
 
 static struct device_attribute *bnxt_re_attributes[] = {
 	&dev_attr_hw_rev,
-	&dev_attr_fw_rev,
 	&dev_attr_hca_type
 };
 
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
index bb5574adf195..6a3633af1d52 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
@@ -93,7 +93,8 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req,
 	opcode = req->opcode;
 	if (!test_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags) &&
 	    (opcode != CMDQ_BASE_OPCODE_QUERY_FUNC &&
-	     opcode != CMDQ_BASE_OPCODE_INITIALIZE_FW)) {
+	     opcode != CMDQ_BASE_OPCODE_INITIALIZE_FW &&
+	     opcode != CMDQ_BASE_OPCODE_QUERY_VERSION)) {
 		dev_err(&rcfw->pdev->dev,
 			"QPLIB: RCFW not initialized, reject opcode 0x%x",
 			opcode);
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
index f5450b78a34d..08df34adbb8f 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
@@ -64,6 +64,26 @@ static bool bnxt_qplib_is_atomic_cap(struct bnxt_qplib_rcfw *rcfw)
 	return !!(pcie_ctl2 & PCI_EXP_DEVCTL2_ATOMIC_REQ);
 }
 
+static void bnxt_qplib_query_version(struct bnxt_qplib_rcfw *rcfw,
+				     char *fw_ver)
+{
+	struct cmdq_query_version req;
+	struct creq_query_version_resp resp;
+	u16 cmd_flags = 0;
+	int rc = 0;
+
+	RCFW_CMD_PREP(req, QUERY_VERSION, cmd_flags);
+
+	rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
+					  (void *)&resp, NULL, 0);
+	if (rc)
+		return;
+	fw_ver[0] = resp.fw_maj;
+	fw_ver[1] = resp.fw_minor;
+	fw_ver[2] = resp.fw_bld;
+	fw_ver[3] = resp.fw_rsvd;
+}
+
 int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw,
 			    struct bnxt_qplib_dev_attr *attr, bool vf)
 {
@@ -134,7 +154,7 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw,
 	attr->l2_db_size = (sb->l2_db_space_size + 1) * PAGE_SIZE;
 	attr->max_sgid = le32_to_cpu(sb->max_gid);
 
-	strlcpy(attr->fw_ver, "20.6.28.0", sizeof(attr->fw_ver));
+	bnxt_qplib_query_version(rcfw, attr->fw_ver);
 
 	for (i = 0; i < MAX_TQM_ALLOC_REQ / 4; i++) {
 		temp = le32_to_cpu(sb->tqm_alloc_reqs[i]);
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h
index 51bb7842b867..0828bb1ea41b 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h
+++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h
@@ -45,7 +45,8 @@
 #define PCI_EXP_DEVCTL2_ATOMIC_REQ 0x0040
 
 struct bnxt_qplib_dev_attr {
-	char fw_ver[32];
+#define FW_VER_ARR_LEN 4
+	u8 fw_ver[FW_VER_ARR_LEN];
 	u16 max_sgid;
 	u16 max_mrw;
 	u32 max_qp;
-- 
cgit v1.2.3


From 872f3578241d7e648b3bfcf6451a55faf97ce2e9 Mon Sep 17 00:00:00 2001
From: Somnath Kotur
Date: Thu, 11 Jan 2018 11:52:09 -0500
Subject: RDMA/bnxt_re: Add support for MRs with Huge pages

Depending on the OS page-table configuration, applications may request
MRs whose page-size alignment is something other than 4K. The
underlying provider driver needs to adjust its PBL boundaries according
to the incoming page boundaries in the PA list. Add a capability to
register MRs with page sizes other than 4K (hugepages).
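
In outline, the PBL is now filled with one entry per registration page
of size (1 << page_shift) instead of one per 4K system page (a
simplified sketch of the address walk in fill_umem_pbl_tbl() below;
dma_addrs[]/npages stand in for the umem scatterlist):

	u64 page_mask = (1ULL << page_shift) - 1;
	int i;

	for (i = 0; i < npages; i++) {
		u64 paddr = dma_addrs[i];	/* 4K-granular addresses */

		if (pbl_tbl == pbl_tbl_orig)
			*pbl_tbl++ = paddr & ~page_mask; /* round first down */
		else if ((paddr & page_mask) == 0)
			*pbl_tbl++ = paddr; /* each later huge-page start */
	}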
Signed-off-by: Somnath Kotur Signed-off-by: Selvin Xavier Signed-off-by: Devesh Sharma Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 26 ++++--- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 129 ++++++++++++++++++++----------- drivers/infiniband/hw/bnxt_re/qplib_sp.c | 12 ++- drivers/infiniband/hw/bnxt_re/qplib_sp.h | 2 +- drivers/infiniband/hw/bnxt_re/roce_hsi.h | 28 ++++++- 5 files changed, 140 insertions(+), 57 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index b604277ed0dd..085ca000973e 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -43,15 +43,23 @@ #define ROCE_DRV_MODULE_VERSION "1.0.0" #define BNXT_RE_DESC "Broadcom NetXtreme-C/E RoCE Driver" - -#define BNXT_RE_PAGE_SIZE_4K BIT(12) -#define BNXT_RE_PAGE_SIZE_8K BIT(13) -#define BNXT_RE_PAGE_SIZE_64K BIT(16) -#define BNXT_RE_PAGE_SIZE_2M BIT(21) -#define BNXT_RE_PAGE_SIZE_8M BIT(23) -#define BNXT_RE_PAGE_SIZE_1G BIT(30) - -#define BNXT_RE_MAX_MR_SIZE BIT(30) +#define BNXT_RE_PAGE_SHIFT_4K (12) +#define BNXT_RE_PAGE_SHIFT_8K (13) +#define BNXT_RE_PAGE_SHIFT_64K (16) +#define BNXT_RE_PAGE_SHIFT_2M (21) +#define BNXT_RE_PAGE_SHIFT_8M (23) +#define BNXT_RE_PAGE_SHIFT_1G (30) + +#define BNXT_RE_PAGE_SIZE_4K BIT(BNXT_RE_PAGE_SHIFT_4K) +#define BNXT_RE_PAGE_SIZE_8K BIT(BNXT_RE_PAGE_SHIFT_8K) +#define BNXT_RE_PAGE_SIZE_64K BIT(BNXT_RE_PAGE_SHIFT_64K) +#define BNXT_RE_PAGE_SIZE_2M BIT(BNXT_RE_PAGE_SHIFT_2M) +#define BNXT_RE_PAGE_SIZE_8M BIT(BNXT_RE_PAGE_SHIFT_8M) +#define BNXT_RE_PAGE_SIZE_1G BIT(BNXT_RE_PAGE_SHIFT_1G) + +#define BNXT_RE_MAX_MR_SIZE_LOW BIT(BNXT_RE_PAGE_SHIFT_1G) +#define BNXT_RE_MAX_MR_SIZE_HIGH BIT(39) +#define BNXT_RE_MAX_MR_SIZE BNXT_RE_MAX_MR_SIZE_HIGH #define BNXT_RE_MAX_QPC_COUNT (64 * 1024) #define BNXT_RE_MAX_MRW_COUNT (64 * 1024) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 8a80e9548d06..c8c4a57397ee 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -147,7 +147,7 @@ int bnxt_re_query_device(struct ib_device *ibdev, bnxt_qplib_get_guid(rdev->netdev->dev_addr, (u8 *)&ib_attr->sys_image_guid); ib_attr->max_mr_size = BNXT_RE_MAX_MR_SIZE; - ib_attr->page_size_cap = BNXT_RE_PAGE_SIZE_4K; + ib_attr->page_size_cap = BNXT_RE_PAGE_SIZE_4K | BNXT_RE_PAGE_SIZE_2M; ib_attr->vendor_id = rdev->en_dev->pdev->vendor; ib_attr->vendor_part_id = rdev->en_dev->pdev->device; @@ -248,8 +248,7 @@ int bnxt_re_query_port(struct ib_device *ibdev, u8 port_num, IB_PORT_VENDOR_CLASS_SUP | IB_PORT_IP_BASED_GIDS; - /* Max MSG size set to 2G for now */ - port_attr->max_msg_sz = 0x80000000; + port_attr->max_msg_sz = (u32)BNXT_RE_MAX_MR_SIZE_LOW; port_attr->bad_pkey_cntr = 0; port_attr->qkey_viol_cntr = 0; port_attr->pkey_tbl_len = dev_attr->max_pkey; @@ -542,7 +541,7 @@ static int bnxt_re_create_fence_mr(struct bnxt_re_pd *pd) mr->qplib_mr.total_size = BNXT_RE_FENCE_BYTES; pbl_tbl = dma_addr; rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, &pbl_tbl, - BNXT_RE_FENCE_PBL_SIZE, false); + BNXT_RE_FENCE_PBL_SIZE, false, PAGE_SIZE); if (rc) { dev_err(rdev_to_dev(rdev), "Failed to register fence-MR\n"); goto fail; @@ -3091,7 +3090,8 @@ struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *ib_pd, int mr_access_flags) mr->qplib_mr.hwq.level = PBL_LVL_MAX; mr->qplib_mr.total_size = -1; /* Infinte length */ - rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, &pbl, 0, false); + rc = 
bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, &pbl, 0, false, + PAGE_SIZE); if (rc) goto fail_mr; @@ -3117,10 +3117,8 @@ int bnxt_re_dereg_mr(struct ib_mr *ib_mr) int rc; rc = bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr); - if (rc) { + if (rc) dev_err(rdev_to_dev(rdev), "Dereg MR failed: %#x\n", rc); - return rc; - } if (mr->pages) { rc = bnxt_qplib_free_fast_reg_page_list(&rdev->qplib_res, @@ -3183,7 +3181,7 @@ struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type type, rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); if (rc) - goto fail; + goto bail; mr->ib_mr.lkey = mr->qplib_mr.lkey; mr->ib_mr.rkey = mr->ib_mr.lkey; @@ -3205,9 +3203,10 @@ struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type type, return &mr->ib_mr; fail_mr: - bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr); -fail: kfree(mr->pages); +fail: + bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr); +bail: kfree(mr); return ERR_PTR(rc); } @@ -3261,6 +3260,46 @@ int bnxt_re_dealloc_mw(struct ib_mw *ib_mw) return rc; } +static int bnxt_re_page_size_ok(int page_shift) +{ + switch (page_shift) { + case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_4K: + case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_8K: + case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_64K: + case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_2M: + case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_256K: + case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_1M: + case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_4M: + case CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_1G: + return 1; + default: + return 0; + } +} + +static int fill_umem_pbl_tbl(struct ib_umem *umem, u64 *pbl_tbl_orig, + int page_shift) +{ + u64 *pbl_tbl = pbl_tbl_orig; + u64 paddr; + u64 page_mask = (1ULL << page_shift) - 1; + int i, pages; + struct scatterlist *sg; + int entry; + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { + pages = sg_dma_len(sg) >> PAGE_SHIFT; + for (i = 0; i < pages; i++) { + paddr = sg_dma_address(sg) + (i << PAGE_SHIFT); + if (pbl_tbl == pbl_tbl_orig) + *pbl_tbl++ = paddr & ~page_mask; + else if ((paddr & page_mask) == 0) + *pbl_tbl++ = paddr; + } + } + return pbl_tbl - pbl_tbl_orig; +} + /* uverbs */ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, @@ -3270,10 +3309,8 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, struct bnxt_re_dev *rdev = pd->rdev; struct bnxt_re_mr *mr; struct ib_umem *umem; - u64 *pbl_tbl, *pbl_tbl_orig; - int i, umem_pgs, pages, rc; - struct scatterlist *sg; - int entry; + u64 *pbl_tbl = NULL; + int umem_pgs, page_shift, rc; if (length > BNXT_RE_MAX_MR_SIZE) { dev_err(rdev_to_dev(rdev), "MR Size: %lld > Max supported:%ld\n", @@ -3290,64 +3327,68 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags); mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_MR; + rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); + if (rc) { + dev_err(rdev_to_dev(rdev), "Failed to allocate MR"); + goto free_mr; + } + /* The fixed portion of the rkey is the same as the lkey */ + mr->ib_mr.rkey = mr->qplib_mr.rkey; + umem = ib_umem_get(ib_pd->uobject->context, start, length, mr_access_flags, 0); if (IS_ERR(umem)) { dev_err(rdev_to_dev(rdev), "Failed to get umem"); rc = -EFAULT; - goto free_mr; + goto free_mrw; } mr->ib_umem = umem; - rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); - if (rc) { - dev_err(rdev_to_dev(rdev), "Failed to allocate MR"); - goto release_umem; - 
} - /* The fixed portion of the rkey is the same as the lkey */ - mr->ib_mr.rkey = mr->qplib_mr.rkey; - mr->qplib_mr.va = virt_addr; umem_pgs = ib_umem_page_count(umem); if (!umem_pgs) { dev_err(rdev_to_dev(rdev), "umem is invalid!"); rc = -EINVAL; - goto free_mrw; + goto free_umem; } mr->qplib_mr.total_size = length; pbl_tbl = kcalloc(umem_pgs, sizeof(u64 *), GFP_KERNEL); if (!pbl_tbl) { - rc = -EINVAL; - goto free_mrw; + rc = -ENOMEM; + goto free_umem; } - pbl_tbl_orig = pbl_tbl; - if (umem->hugetlb) { - dev_err(rdev_to_dev(rdev), "umem hugetlb not supported!"); + page_shift = umem->page_shift; + + if (!bnxt_re_page_size_ok(page_shift)) { + dev_err(rdev_to_dev(rdev), "umem page size unsupported!"); rc = -EFAULT; goto fail; } - if (umem->page_shift != PAGE_SHIFT) { - dev_err(rdev_to_dev(rdev), "umem page shift unsupported!"); - rc = -EFAULT; + if (!umem->hugetlb && length > BNXT_RE_MAX_MR_SIZE_LOW) { + dev_err(rdev_to_dev(rdev), "Requested MR Sz:%llu Max sup:%llu", + length, (u64)BNXT_RE_MAX_MR_SIZE_LOW); + rc = -EINVAL; goto fail; } - /* Map umem buf ptrs to the PBL */ - for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { - pages = sg_dma_len(sg) >> umem->page_shift; - for (i = 0; i < pages; i++, pbl_tbl++) - *pbl_tbl = sg_dma_address(sg) + (i << umem->page_shift); + if (umem->hugetlb && length > BNXT_RE_PAGE_SIZE_2M) { + page_shift = BNXT_RE_PAGE_SHIFT_2M; + dev_warn(rdev_to_dev(rdev), "umem hugetlb set page_size %x", + 1 << page_shift); } - rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, pbl_tbl_orig, - umem_pgs, false); + + /* Map umem buf ptrs to the PBL */ + umem_pgs = fill_umem_pbl_tbl(umem, pbl_tbl, page_shift); + rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, pbl_tbl, + umem_pgs, false, 1 << page_shift); if (rc) { dev_err(rdev_to_dev(rdev), "Failed to register user MR"); goto fail; } - kfree(pbl_tbl_orig); + kfree(pbl_tbl); mr->ib_mr.lkey = mr->qplib_mr.lkey; mr->ib_mr.rkey = mr->qplib_mr.lkey; @@ -3355,11 +3396,11 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, return &mr->ib_mr; fail: - kfree(pbl_tbl_orig); + kfree(pbl_tbl); +free_umem: + ib_umem_release(umem); free_mrw: bnxt_qplib_free_mrw(&rdev->qplib_res, &mr->qplib_mr); -release_umem: - ib_umem_release(umem); free_mr: kfree(mr); return ERR_PTR(rc); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 08df34adbb8f..e71bc57a1f3a 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -657,7 +657,7 @@ int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw, } int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, - u64 *pbl_tbl, int num_pbls, bool block) + u64 *pbl_tbl, int num_pbls, bool block, u32 buf_pg_size) { struct bnxt_qplib_rcfw *rcfw = res->rcfw; struct cmdq_register_mr req; @@ -668,6 +668,9 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, u32 pg_size; if (num_pbls) { + /* Allocate memory for the non-leaf pages to store buf ptrs. 
+ * Non-leaf pages always uses system PAGE_SIZE + */ pg_ptrs = roundup_pow_of_two(num_pbls); pages = pg_ptrs >> MAX_PBL_LVL_1_PGS_SHIFT; if (!pages) @@ -685,6 +688,7 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, bnxt_qplib_free_hwq(res->pdev, &mr->hwq); mr->hwq.max_elements = pages; + /* Use system PAGE_SIZE */ rc = bnxt_qplib_alloc_init_hwq(res->pdev, &mr->hwq, NULL, 0, &mr->hwq.max_elements, PAGE_SIZE, 0, PAGE_SIZE, @@ -705,18 +709,22 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, /* Configure the request */ if (mr->hwq.level == PBL_LVL_MAX) { + /* No PBL provided, just use system PAGE_SIZE */ level = 0; req.pbl = 0; pg_size = PAGE_SIZE; } else { level = mr->hwq.level + 1; req.pbl = cpu_to_le64(mr->hwq.pbl[PBL_LVL_0].pg_map_arr[0]); - pg_size = mr->hwq.pbl[PBL_LVL_0].pg_size; } + pg_size = buf_pg_size ? buf_pg_size : PAGE_SIZE; req.log2_pg_size_lvl = (level << CMDQ_REGISTER_MR_LVL_SFT) | ((ilog2(pg_size) << CMDQ_REGISTER_MR_LOG2_PG_SIZE_SFT) & CMDQ_REGISTER_MR_LOG2_PG_SIZE_MASK); + req.log2_pbl_pg_size = cpu_to_le16(((ilog2(PAGE_SIZE) << + CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_SFT) & + CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_MASK)); req.access = (mr->flags & 0xFFFF); req.va = cpu_to_le64(mr->va); req.key = cpu_to_le32(mr->lkey); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index 0828bb1ea41b..074e5e325349 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -159,7 +159,7 @@ int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res, int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw, bool block); int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, - u64 *pbl_tbl, int num_pbls, bool block); + u64 *pbl_tbl, int num_pbls, bool block, u32 buf_pg_size); int bnxt_qplib_free_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr); int bnxt_qplib_alloc_fast_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, int max); diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h index f429fdb18e21..5cd31de22330 100644 --- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -1383,8 +1383,20 @@ struct cmdq_register_mr { #define CMDQ_REGISTER_MR_LVL_LVL_0 0x0UL #define CMDQ_REGISTER_MR_LVL_LVL_1 0x1UL #define CMDQ_REGISTER_MR_LVL_LVL_2 0x2UL + #define CMDQ_REGISTER_MR_LVL_LAST CMDQ_REGISTER_MR_LVL_LVL_2 #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_MASK 0x7cUL #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_SFT 2 + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_4K (0xcUL << 2) + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_8K (0xdUL << 2) + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_64K (0x10UL << 2) + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_256K (0x12UL << 2) + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_1M (0x14UL << 2) + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_2M (0x15UL << 2) + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_4M (0x16UL << 2) + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_1G (0x1eUL << 2) + #define CMDQ_REGISTER_MR_LOG2_PG_SIZE_LAST \ + CMDQ_REGISTER_MR_LOG2_PG_SIZE_PG_1G + #define CMDQ_REGISTER_MR_UNUSED1 0x80UL u8 access; #define CMDQ_REGISTER_MR_ACCESS_LOCAL_WRITE 0x1UL #define CMDQ_REGISTER_MR_ACCESS_REMOTE_READ 0x2UL @@ -1392,7 +1404,21 @@ struct cmdq_register_mr { #define CMDQ_REGISTER_MR_ACCESS_REMOTE_ATOMIC 0x8UL #define CMDQ_REGISTER_MR_ACCESS_MW_BIND 0x10UL #define CMDQ_REGISTER_MR_ACCESS_ZERO_BASED 0x20UL - __le16 
unused_1; + __le16 log2_pbl_pg_size; + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_MASK 0x1fUL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_SFT 0 + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_4K 0xcUL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_8K 0xdUL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_64K 0x10UL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_256K 0x12UL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_1M 0x14UL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_2M 0x15UL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_4M 0x16UL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_1G 0x1eUL + #define CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_LAST \ + CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_PG_1G + #define CMDQ_REGISTER_MR_UNUSED11_MASK 0xffe0UL + #define CMDQ_REGISTER_MR_UNUSED11_SFT 5 __le32 key; __le64 pbl; __le64 va; -- cgit v1.2.3 From 89f81008baac887799e170188169cad3c8cd9a68 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Thu, 11 Jan 2018 11:52:10 -0500 Subject: RDMA/bnxt_re: expose detailed stats retrieved from HW Broadcom's adapter supports more granular statistics to allow better understanding about the state of the chip when data traffic is flowing. Exposing the detailed stats to the consumer through the standard hook available in the kverbs interface. In order to retrieve all the information, driver implements a firmware command. Signed-off-by: Selvin Xavier Signed-off-by: Devesh Sharma Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 2 + drivers/infiniband/hw/bnxt_re/hw_counters.c | 145 ++++++++++++++++++++++++++-- drivers/infiniband/hw/bnxt_re/hw_counters.h | 39 ++++++++ drivers/infiniband/hw/bnxt_re/main.c | 1 + drivers/infiniband/hw/bnxt_re/qplib_sp.c | 70 ++++++++++++++ drivers/infiniband/hw/bnxt_re/qplib_sp.h | 81 ++++++++++++++++ drivers/infiniband/hw/bnxt_re/roce_hsi.h | 89 +++++++++++++++++ 7 files changed, 417 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 085ca000973e..ca32057e886f 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -121,6 +121,7 @@ struct bnxt_re_dev { #define BNXT_RE_FLAG_RCFW_CHANNEL_EN 4 #define BNXT_RE_FLAG_QOS_WORK_REG 5 #define BNXT_RE_FLAG_TASK_IN_PROG 6 +#define BNXT_RE_FLAG_ISSUE_ROCE_STATS 29 struct net_device *netdev; unsigned int version, major, minor; struct bnxt_en_dev *en_dev; @@ -168,6 +169,7 @@ struct bnxt_re_dev { atomic_t nq_alloc_cnt; u32 is_virtfn; u32 num_vfs; + struct bnxt_qplib_roce_stats stats; }; #define to_bnxt_re_dev(ptr, member) \ diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.c b/drivers/infiniband/hw/bnxt_re/hw_counters.c index 7b28219eba46..77416bc61e6e 100644 --- a/drivers/infiniband/hw/bnxt_re/hw_counters.c +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.c @@ -58,16 +58,55 @@ #include "hw_counters.h" static const char * const bnxt_re_stat_name[] = { - [BNXT_RE_ACTIVE_QP] = "active_qps", - [BNXT_RE_ACTIVE_SRQ] = "active_srqs", - [BNXT_RE_ACTIVE_CQ] = "active_cqs", - [BNXT_RE_ACTIVE_MR] = "active_mrs", - [BNXT_RE_ACTIVE_MW] = "active_mws", - [BNXT_RE_RX_PKTS] = "rx_pkts", - [BNXT_RE_RX_BYTES] = "rx_bytes", - [BNXT_RE_TX_PKTS] = "tx_pkts", - [BNXT_RE_TX_BYTES] = "tx_bytes", - [BNXT_RE_RECOVERABLE_ERRORS] = "recoverable_errors" + [BNXT_RE_ACTIVE_QP] = "active_qps", + [BNXT_RE_ACTIVE_SRQ] = "active_srqs", + [BNXT_RE_ACTIVE_CQ] = "active_cqs", + [BNXT_RE_ACTIVE_MR] = "active_mrs", + [BNXT_RE_ACTIVE_MW] = "active_mws", + [BNXT_RE_RX_PKTS] = "rx_pkts", + 
[BNXT_RE_RX_BYTES] = "rx_bytes", + [BNXT_RE_TX_PKTS] = "tx_pkts", + [BNXT_RE_TX_BYTES] = "tx_bytes", + [BNXT_RE_RECOVERABLE_ERRORS] = "recoverable_errors", + [BNXT_RE_TO_RETRANSMITS] = "to_retransmits", + [BNXT_RE_SEQ_ERR_NAKS_RCVD] = "seq_err_naks_rcvd", + [BNXT_RE_MAX_RETRY_EXCEEDED] = "max_retry_exceeded", + [BNXT_RE_RNR_NAKS_RCVD] = "rnr_naks_rcvd", + [BNXT_RE_MISSING_RESP] = "missin_resp", + [BNXT_RE_UNRECOVERABLE_ERR] = "unrecoverable_err", + [BNXT_RE_BAD_RESP_ERR] = "bad_resp_err", + [BNXT_RE_LOCAL_QP_OP_ERR] = "local_qp_op_err", + [BNXT_RE_LOCAL_PROTECTION_ERR] = "local_protection_err", + [BNXT_RE_MEM_MGMT_OP_ERR] = "mem_mgmt_op_err", + [BNXT_RE_REMOTE_INVALID_REQ_ERR] = "remote_invalid_req_err", + [BNXT_RE_REMOTE_ACCESS_ERR] = "remote_access_err", + [BNXT_RE_REMOTE_OP_ERR] = "remote_op_err", + [BNXT_RE_DUP_REQ] = "dup_req", + [BNXT_RE_RES_EXCEED_MAX] = "res_exceed_max", + [BNXT_RE_RES_LENGTH_MISMATCH] = "res_length_mismatch", + [BNXT_RE_RES_EXCEEDS_WQE] = "res_exceeds_wqe", + [BNXT_RE_RES_OPCODE_ERR] = "res_opcode_err", + [BNXT_RE_RES_RX_INVALID_RKEY] = "res_rx_invalid_rkey", + [BNXT_RE_RES_RX_DOMAIN_ERR] = "res_rx_domain_err", + [BNXT_RE_RES_RX_NO_PERM] = "res_rx_no_perm", + [BNXT_RE_RES_RX_RANGE_ERR] = "res_rx_range_err", + [BNXT_RE_RES_TX_INVALID_RKEY] = "res_tx_invalid_rkey", + [BNXT_RE_RES_TX_DOMAIN_ERR] = "res_tx_domain_err", + [BNXT_RE_RES_TX_NO_PERM] = "res_tx_no_perm", + [BNXT_RE_RES_TX_RANGE_ERR] = "res_tx_range_err", + [BNXT_RE_RES_IRRQ_OFLOW] = "res_irrq_oflow", + [BNXT_RE_RES_UNSUP_OPCODE] = "res_unsup_opcode", + [BNXT_RE_RES_UNALIGNED_ATOMIC] = "res_unaligned_atomic", + [BNXT_RE_RES_REM_INV_ERR] = "res_rem_inv_err", + [BNXT_RE_RES_MEM_ERROR] = "res_mem_err", + [BNXT_RE_RES_SRQ_ERR] = "res_srq_err", + [BNXT_RE_RES_CMP_ERR] = "res_cmp_err", + [BNXT_RE_RES_INVALID_DUP_RKEY] = "res_invalid_dup_rkey", + [BNXT_RE_RES_WQE_FORMAT_ERR] = "res_wqe_format_err", + [BNXT_RE_RES_CQ_LOAD_ERR] = "res_cq_load_err", + [BNXT_RE_RES_SRQ_LOAD_ERR] = "res_srq_load_err", + [BNXT_RE_RES_TX_PCI_ERR] = "res_tx_pci_err", + [BNXT_RE_RES_RX_PCI_ERR] = "res_rx_pci_err" }; int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, @@ -76,6 +115,7 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, { struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); struct ctx_hw_stats *bnxt_re_stats = rdev->qplib_ctx.stats.dma; + int rc = 0; if (!port || !stats) return -EINVAL; @@ -97,6 +137,91 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, stats->value[BNXT_RE_TX_BYTES] = le64_to_cpu(bnxt_re_stats->tx_ucast_bytes); } + if (test_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, &rdev->flags)) { + rc = bnxt_qplib_get_roce_stats(&rdev->rcfw, &rdev->stats); + if (rc) + clear_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, + &rdev->flags); + stats->value[BNXT_RE_TO_RETRANSMITS] = + rdev->stats.to_retransmits; + stats->value[BNXT_RE_SEQ_ERR_NAKS_RCVD] = + rdev->stats.seq_err_naks_rcvd; + stats->value[BNXT_RE_MAX_RETRY_EXCEEDED] = + rdev->stats.max_retry_exceeded; + stats->value[BNXT_RE_RNR_NAKS_RCVD] = + rdev->stats.rnr_naks_rcvd; + stats->value[BNXT_RE_MISSING_RESP] = + rdev->stats.missing_resp; + stats->value[BNXT_RE_UNRECOVERABLE_ERR] = + rdev->stats.unrecoverable_err; + stats->value[BNXT_RE_BAD_RESP_ERR] = + rdev->stats.bad_resp_err; + stats->value[BNXT_RE_LOCAL_QP_OP_ERR] = + rdev->stats.local_qp_op_err; + stats->value[BNXT_RE_LOCAL_PROTECTION_ERR] = + rdev->stats.local_protection_err; + stats->value[BNXT_RE_MEM_MGMT_OP_ERR] = + rdev->stats.mem_mgmt_op_err; + stats->value[BNXT_RE_REMOTE_INVALID_REQ_ERR] = + 
rdev->stats.remote_invalid_req_err; + stats->value[BNXT_RE_REMOTE_ACCESS_ERR] = + rdev->stats.remote_access_err; + stats->value[BNXT_RE_REMOTE_OP_ERR] = + rdev->stats.remote_op_err; + stats->value[BNXT_RE_DUP_REQ] = + rdev->stats.dup_req; + stats->value[BNXT_RE_RES_EXCEED_MAX] = + rdev->stats.res_exceed_max; + stats->value[BNXT_RE_RES_LENGTH_MISMATCH] = + rdev->stats.res_length_mismatch; + stats->value[BNXT_RE_RES_EXCEEDS_WQE] = + rdev->stats.res_exceeds_wqe; + stats->value[BNXT_RE_RES_OPCODE_ERR] = + rdev->stats.res_opcode_err; + stats->value[BNXT_RE_RES_RX_INVALID_RKEY] = + rdev->stats.res_rx_invalid_rkey; + stats->value[BNXT_RE_RES_RX_DOMAIN_ERR] = + rdev->stats.res_rx_domain_err; + stats->value[BNXT_RE_RES_RX_NO_PERM] = + rdev->stats.res_rx_no_perm; + stats->value[BNXT_RE_RES_RX_RANGE_ERR] = + rdev->stats.res_rx_range_err; + stats->value[BNXT_RE_RES_TX_INVALID_RKEY] = + rdev->stats.res_tx_invalid_rkey; + stats->value[BNXT_RE_RES_TX_DOMAIN_ERR] = + rdev->stats.res_tx_domain_err; + stats->value[BNXT_RE_RES_TX_NO_PERM] = + rdev->stats.res_tx_no_perm; + stats->value[BNXT_RE_RES_TX_RANGE_ERR] = + rdev->stats.res_tx_range_err; + stats->value[BNXT_RE_RES_IRRQ_OFLOW] = + rdev->stats.res_irrq_oflow; + stats->value[BNXT_RE_RES_UNSUP_OPCODE] = + rdev->stats.res_unsup_opcode; + stats->value[BNXT_RE_RES_UNALIGNED_ATOMIC] = + rdev->stats.res_unaligned_atomic; + stats->value[BNXT_RE_RES_REM_INV_ERR] = + rdev->stats.res_rem_inv_err; + stats->value[BNXT_RE_RES_MEM_ERROR] = + rdev->stats.res_mem_error; + stats->value[BNXT_RE_RES_SRQ_ERR] = + rdev->stats.res_srq_err; + stats->value[BNXT_RE_RES_CMP_ERR] = + rdev->stats.res_cmp_err; + stats->value[BNXT_RE_RES_INVALID_DUP_RKEY] = + rdev->stats.res_invalid_dup_rkey; + stats->value[BNXT_RE_RES_WQE_FORMAT_ERR] = + rdev->stats.res_wqe_format_err; + stats->value[BNXT_RE_RES_CQ_LOAD_ERR] = + rdev->stats.res_cq_load_err; + stats->value[BNXT_RE_RES_SRQ_LOAD_ERR] = + rdev->stats.res_srq_load_err; + stats->value[BNXT_RE_RES_TX_PCI_ERR] = + rdev->stats.res_tx_pci_err; + stats->value[BNXT_RE_RES_RX_PCI_ERR] = + rdev->stats.res_rx_pci_err; + } + return ARRAY_SIZE(bnxt_re_stat_name); } diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.h b/drivers/infiniband/hw/bnxt_re/hw_counters.h index be0dc0093b58..a01a922717d5 100644 --- a/drivers/infiniband/hw/bnxt_re/hw_counters.h +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.h @@ -51,6 +51,45 @@ enum bnxt_re_hw_stats { BNXT_RE_TX_PKTS, BNXT_RE_TX_BYTES, BNXT_RE_RECOVERABLE_ERRORS, + BNXT_RE_TO_RETRANSMITS, + BNXT_RE_SEQ_ERR_NAKS_RCVD, + BNXT_RE_MAX_RETRY_EXCEEDED, + BNXT_RE_RNR_NAKS_RCVD, + BNXT_RE_MISSING_RESP, + BNXT_RE_UNRECOVERABLE_ERR, + BNXT_RE_BAD_RESP_ERR, + BNXT_RE_LOCAL_QP_OP_ERR, + BNXT_RE_LOCAL_PROTECTION_ERR, + BNXT_RE_MEM_MGMT_OP_ERR, + BNXT_RE_REMOTE_INVALID_REQ_ERR, + BNXT_RE_REMOTE_ACCESS_ERR, + BNXT_RE_REMOTE_OP_ERR, + BNXT_RE_DUP_REQ, + BNXT_RE_RES_EXCEED_MAX, + BNXT_RE_RES_LENGTH_MISMATCH, + BNXT_RE_RES_EXCEEDS_WQE, + BNXT_RE_RES_OPCODE_ERR, + BNXT_RE_RES_RX_INVALID_RKEY, + BNXT_RE_RES_RX_DOMAIN_ERR, + BNXT_RE_RES_RX_NO_PERM, + BNXT_RE_RES_RX_RANGE_ERR, + BNXT_RE_RES_TX_INVALID_RKEY, + BNXT_RE_RES_TX_DOMAIN_ERR, + BNXT_RE_RES_TX_NO_PERM, + BNXT_RE_RES_TX_RANGE_ERR, + BNXT_RE_RES_IRRQ_OFLOW, + BNXT_RE_RES_UNSUP_OPCODE, + BNXT_RE_RES_UNALIGNED_ATOMIC, + BNXT_RE_RES_REM_INV_ERR, + BNXT_RE_RES_MEM_ERROR, + BNXT_RE_RES_SRQ_ERR, + BNXT_RE_RES_CMP_ERR, + BNXT_RE_RES_INVALID_DUP_RKEY, + BNXT_RE_RES_WQE_FORMAT_ERR, + BNXT_RE_RES_CQ_LOAD_ERR, + BNXT_RE_RES_SRQ_LOAD_ERR, + BNXT_RE_RES_TX_PCI_ERR, + 
BNXT_RE_RES_RX_PCI_ERR, BNXT_RE_NUM_COUNTERS }; diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index ca1195024e9b..3caf70a103e6 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -1245,6 +1245,7 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) set_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags); ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed, &rdev->active_width); + set_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, &rdev->flags); bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, IB_EVENT_PORT_ACTIVE); bnxt_re_dispatch_event(&rdev->ibdev, NULL, 1, IB_EVENT_GID_CHANGE); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index e71bc57a1f3a..c015c1861351 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -790,3 +790,73 @@ int bnxt_qplib_map_tc2cos(struct bnxt_qplib_res *res, u16 *cids) 0); return 0; } + +int bnxt_qplib_get_roce_stats(struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_roce_stats *stats) +{ + struct cmdq_query_roce_stats req; + struct creq_query_roce_stats_resp resp; + struct bnxt_qplib_rcfw_sbuf *sbuf; + struct creq_query_roce_stats_resp_sb *sb; + u16 cmd_flags = 0; + int rc = 0; + + RCFW_CMD_PREP(req, QUERY_ROCE_STATS, cmd_flags); + + sbuf = bnxt_qplib_rcfw_alloc_sbuf(rcfw, sizeof(*sb)); + if (!sbuf) { + dev_err(&rcfw->pdev->dev, + "QPLIB: SP: QUERY_ROCE_STATS alloc side buffer failed"); + return -ENOMEM; + } + + sb = sbuf->sb; + req.resp_size = sizeof(*sb) / BNXT_QPLIB_CMDQE_UNITS; + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp, + (void *)sbuf, 0); + if (rc) + goto bail; + /* Extract the context from the side buffer */ + stats->to_retransmits = le64_to_cpu(sb->to_retransmits); + stats->seq_err_naks_rcvd = le64_to_cpu(sb->seq_err_naks_rcvd); + stats->max_retry_exceeded = le64_to_cpu(sb->max_retry_exceeded); + stats->rnr_naks_rcvd = le64_to_cpu(sb->rnr_naks_rcvd); + stats->missing_resp = le64_to_cpu(sb->missing_resp); + stats->unrecoverable_err = le64_to_cpu(sb->unrecoverable_err); + stats->bad_resp_err = le64_to_cpu(sb->bad_resp_err); + stats->local_qp_op_err = le64_to_cpu(sb->local_qp_op_err); + stats->local_protection_err = le64_to_cpu(sb->local_protection_err); + stats->mem_mgmt_op_err = le64_to_cpu(sb->mem_mgmt_op_err); + stats->remote_invalid_req_err = le64_to_cpu(sb->remote_invalid_req_err); + stats->remote_access_err = le64_to_cpu(sb->remote_access_err); + stats->remote_op_err = le64_to_cpu(sb->remote_op_err); + stats->dup_req = le64_to_cpu(sb->dup_req); + stats->res_exceed_max = le64_to_cpu(sb->res_exceed_max); + stats->res_length_mismatch = le64_to_cpu(sb->res_length_mismatch); + stats->res_exceeds_wqe = le64_to_cpu(sb->res_exceeds_wqe); + stats->res_opcode_err = le64_to_cpu(sb->res_opcode_err); + stats->res_rx_invalid_rkey = le64_to_cpu(sb->res_rx_invalid_rkey); + stats->res_rx_domain_err = le64_to_cpu(sb->res_rx_domain_err); + stats->res_rx_no_perm = le64_to_cpu(sb->res_rx_no_perm); + stats->res_rx_range_err = le64_to_cpu(sb->res_rx_range_err); + stats->res_tx_invalid_rkey = le64_to_cpu(sb->res_tx_invalid_rkey); + stats->res_tx_domain_err = le64_to_cpu(sb->res_tx_domain_err); + stats->res_tx_no_perm = le64_to_cpu(sb->res_tx_no_perm); + stats->res_tx_range_err = le64_to_cpu(sb->res_tx_range_err); + stats->res_irrq_oflow = le64_to_cpu(sb->res_irrq_oflow); + stats->res_unsup_opcode = le64_to_cpu(sb->res_unsup_opcode); + stats->res_unaligned_atomic = 
le64_to_cpu(sb->res_unaligned_atomic); + stats->res_rem_inv_err = le64_to_cpu(sb->res_rem_inv_err); + stats->res_mem_error = le64_to_cpu(sb->res_mem_error); + stats->res_srq_err = le64_to_cpu(sb->res_srq_err); + stats->res_cmp_err = le64_to_cpu(sb->res_cmp_err); + stats->res_invalid_dup_rkey = le64_to_cpu(sb->res_invalid_dup_rkey); + stats->res_wqe_format_err = le64_to_cpu(sb->res_wqe_format_err); + stats->res_cq_load_err = le64_to_cpu(sb->res_cq_load_err); + stats->res_srq_load_err = le64_to_cpu(sb->res_srq_load_err); + stats->res_tx_pci_err = le64_to_cpu(sb->res_tx_pci_err); + stats->res_rx_pci_err = le64_to_cpu(sb->res_rx_pci_err); +bail: + bnxt_qplib_rcfw_free_sbuf(rcfw, sbuf); + return rc; +} diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index 074e5e325349..9d3e8b994945 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -128,6 +128,85 @@ struct bnxt_qplib_frpl { #define BNXT_QPLIB_ACCESS_ZERO_BASED BIT(5) #define BNXT_QPLIB_ACCESS_ON_DEMAND BIT(6) +struct bnxt_qplib_roce_stats { + u64 to_retransmits; + u64 seq_err_naks_rcvd; + /* seq_err_naks_rcvd is 64 b */ + u64 max_retry_exceeded; + /* max_retry_exceeded is 64 b */ + u64 rnr_naks_rcvd; + /* rnr_naks_rcvd is 64 b */ + u64 missing_resp; + u64 unrecoverable_err; + /* unrecoverable_err is 64 b */ + u64 bad_resp_err; + /* bad_resp_err is 64 b */ + u64 local_qp_op_err; + /* local_qp_op_err is 64 b */ + u64 local_protection_err; + /* local_protection_err is 64 b */ + u64 mem_mgmt_op_err; + /* mem_mgmt_op_err is 64 b */ + u64 remote_invalid_req_err; + /* remote_invalid_req_err is 64 b */ + u64 remote_access_err; + /* remote_access_err is 64 b */ + u64 remote_op_err; + /* remote_op_err is 64 b */ + u64 dup_req; + /* dup_req is 64 b */ + u64 res_exceed_max; + /* res_exceed_max is 64 b */ + u64 res_length_mismatch; + /* res_length_mismatch is 64 b */ + u64 res_exceeds_wqe; + /* res_exceeds_wqe is 64 b */ + u64 res_opcode_err; + /* res_opcode_err is 64 b */ + u64 res_rx_invalid_rkey; + /* res_rx_invalid_rkey is 64 b */ + u64 res_rx_domain_err; + /* res_rx_domain_err is 64 b */ + u64 res_rx_no_perm; + /* res_rx_no_perm is 64 b */ + u64 res_rx_range_err; + /* res_rx_range_err is 64 b */ + u64 res_tx_invalid_rkey; + /* res_tx_invalid_rkey is 64 b */ + u64 res_tx_domain_err; + /* res_tx_domain_err is 64 b */ + u64 res_tx_no_perm; + /* res_tx_no_perm is 64 b */ + u64 res_tx_range_err; + /* res_tx_range_err is 64 b */ + u64 res_irrq_oflow; + /* res_irrq_oflow is 64 b */ + u64 res_unsup_opcode; + /* res_unsup_opcode is 64 b */ + u64 res_unaligned_atomic; + /* res_unaligned_atomic is 64 b */ + u64 res_rem_inv_err; + /* res_rem_inv_err is 64 b */ + u64 res_mem_error; + /* res_mem_error is 64 b */ + u64 res_srq_err; + /* res_srq_err is 64 b */ + u64 res_cmp_err; + /* res_cmp_err is 64 b */ + u64 res_invalid_dup_rkey; + /* res_invalid_dup_rkey is 64 b */ + u64 res_wqe_format_err; + /* res_wqe_format_err is 64 b */ + u64 res_cq_load_err; + /* res_cq_load_err is 64 b */ + u64 res_srq_load_err; + /* res_srq_load_err is 64 b */ + u64 res_tx_pci_err; + /* res_tx_pci_err is 64 b */ + u64 res_rx_pci_err; + /* res_rx_pci_err is 64 b */ +}; + int bnxt_qplib_get_sgid(struct bnxt_qplib_res *res, struct bnxt_qplib_sgid_tbl *sgid_tbl, int index, struct bnxt_qplib_gid *gid); @@ -168,4 +247,6 @@ int bnxt_qplib_alloc_fast_reg_page_list(struct bnxt_qplib_res *res, int bnxt_qplib_free_fast_reg_page_list(struct bnxt_qplib_res *res, struct bnxt_qplib_frpl *frpl); int 
bnxt_qplib_map_tc2cos(struct bnxt_qplib_res *res, u16 *cids); +int bnxt_qplib_get_roce_stats(struct bnxt_qplib_rcfw *rcfw, + struct bnxt_qplib_roce_stats *stats); #endif /* __BNXT_QPLIB_SP_H__*/ diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h index 5cd31de22330..2d7ea096a247 100644 --- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -954,6 +954,7 @@ struct cmdq_base { #define CMDQ_BASE_OPCODE_QUERY_VERSION 0x8bUL #define CMDQ_BASE_OPCODE_MODIFY_CC 0x8cUL #define CMDQ_BASE_OPCODE_QUERY_CC 0x8dUL + #define CMDQ_BASE_OPCODE_QUERY_ROCE_STATS 0x8eUL u8 cmd_size; __le16 flags; __le16 cookie; @@ -2049,6 +2050,20 @@ struct creq_modify_qp_resp { __le16 reserved48[3]; }; +/* cmdq_query_roce_stats (size:128b/16B) */ +struct cmdq_query_roce_stats { + u8 opcode; + #define CMDQ_QUERY_ROCE_STATS_OPCODE_QUERY_ROCE_STATS 0x8eUL + #define CMDQ_QUERY_ROCE_STATS_OPCODE_LAST \ + CMDQ_QUERY_ROCE_STATS_OPCODE_QUERY_ROCE_STATS + u8 cmd_size; + __le16 flags; + __le16 cookie; + u8 resp_size; + u8 reserved8; + __le64 resp_addr; +}; + /* Query QP command response (16 bytes) */ struct creq_query_qp_resp { u8 type; @@ -2819,6 +2834,80 @@ struct creq_query_cc_resp_sb { __le64 reserved64_1; }; +/* creq_query_roce_stats_resp (size:128b/16B) */ +struct creq_query_roce_stats_resp { + u8 type; + #define CREQ_QUERY_ROCE_STATS_RESP_TYPE_MASK 0x3fUL + #define CREQ_QUERY_ROCE_STATS_RESP_TYPE_SFT 0 + #define CREQ_QUERY_ROCE_STATS_RESP_TYPE_QP_EVENT 0x38UL + #define CREQ_QUERY_ROCE_STATS_RESP_TYPE_LAST \ + CREQ_QUERY_ROCE_STATS_RESP_TYPE_QP_EVENT + u8 status; + __le16 cookie; + __le32 size; + u8 v; + #define CREQ_QUERY_ROCE_STATS_RESP_V 0x1UL + u8 event; + #define CREQ_QUERY_ROCE_STATS_RESP_EVENT_QUERY_ROCE_STATS 0x8eUL + #define CREQ_QUERY_ROCE_STATS_RESP_EVENT_LAST \ + CREQ_QUERY_ROCE_STATS_RESP_EVENT_QUERY_ROCE_STATS + u8 reserved48[6]; +}; + +/* creq_query_roce_stats_resp_sb (size:2624b/328B) */ +struct creq_query_roce_stats_resp_sb { + u8 opcode; + #define CREQ_QUERY_ROCE_STATS_RESP_SB_OPCODE_QUERY_ROCE_STATS 0x8eUL + #define CREQ_QUERY_ROCE_STATS_RESP_SB_OPCODE_LAST \ + CREQ_QUERY_ROCE_STATS_RESP_SB_OPCODE_QUERY_ROCE_STATS + u8 status; + __le16 cookie; + __le16 flags; + u8 resp_size; + u8 rsvd; + __le32 num_counters; + __le32 rsvd1; + __le64 to_retransmits; + __le64 seq_err_naks_rcvd; + __le64 max_retry_exceeded; + __le64 rnr_naks_rcvd; + __le64 missing_resp; + __le64 unrecoverable_err; + __le64 bad_resp_err; + __le64 local_qp_op_err; + __le64 local_protection_err; + __le64 mem_mgmt_op_err; + __le64 remote_invalid_req_err; + __le64 remote_access_err; + __le64 remote_op_err; + __le64 dup_req; + __le64 res_exceed_max; + __le64 res_length_mismatch; + __le64 res_exceeds_wqe; + __le64 res_opcode_err; + __le64 res_rx_invalid_rkey; + __le64 res_rx_domain_err; + __le64 res_rx_no_perm; + __le64 res_rx_range_err; + __le64 res_tx_invalid_rkey; + __le64 res_tx_domain_err; + __le64 res_tx_no_perm; + __le64 res_tx_range_err; + __le64 res_irrq_oflow; + __le64 res_unsup_opcode; + __le64 res_unaligned_atomic; + __le64 res_rem_inv_err; + __le64 res_mem_error; + __le64 res_srq_err; + __le64 res_cmp_err; + __le64 res_invalid_dup_rkey; + __le64 res_wqe_format_err; + __le64 res_cq_load_err; + __le64 res_srq_load_err; + __le64 res_tx_pci_err; + __le64 res_rx_pci_err; +}; + /* QP error notification event (16 bytes) */ struct creq_qp_error_notification { u8 type; -- cgit v1.2.3 From 37cb11acf1f72a007a85894a6dd2ec93932bde46 Mon Sep 17 00:00:00 2001 From: 
Devesh Sharma Date: Thu, 11 Jan 2018 11:52:11 -0500 Subject: RDMA/bnxt_re: Add SRQ support for Broadcom adapters Shared receive queue (SRQ) is defined as a pool of receive buffers shared among multiple QPs which belong to same protection domain in a given process context. Use of SRQ reduces the memory foot print of IB applications. Broadcom adapters support SRQ, adding code-changes to enable shared receive queue. Signed-off-by: Devesh Sharma Signed-off-by: Doug Ledford --- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 244 ++++++++++++++- drivers/infiniband/hw/bnxt_re/ib_verbs.h | 19 ++ drivers/infiniband/hw/bnxt_re/main.c | 102 ++++++- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 462 +++++++++++++++++++++++++---- drivers/infiniband/hw/bnxt_re/qplib_fp.h | 78 +++-- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 2 +- drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 7 +- include/uapi/rdma/bnxt_re-abi.h | 9 + 8 files changed, 825 insertions(+), 98 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index c8c4a57397ee..9b8fa77b8831 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -1027,6 +1027,7 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd, struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; struct bnxt_re_qp *qp; struct bnxt_re_cq *cq; + struct bnxt_re_srq *srq; int rc, entries; if ((qp_init_attr->cap.max_send_wr > dev_attr->max_qp_wqes) || @@ -1082,9 +1083,15 @@ struct ib_qp *bnxt_re_create_qp(struct ib_pd *ib_pd, } if (qp_init_attr->srq) { - dev_err(rdev_to_dev(rdev), "SRQ not supported"); - rc = -ENOTSUPP; - goto fail; + srq = container_of(qp_init_attr->srq, struct bnxt_re_srq, + ib_srq); + if (!srq) { + dev_err(rdev_to_dev(rdev), "SRQ not found"); + rc = -EINVAL; + goto fail; + } + qp->qplib_qp.srq = &srq->qplib_srq; + qp->qplib_qp.rq.max_wqe = 0; } else { /* Allocate 1 more than what's provided so posting max doesn't * mean empty @@ -1289,6 +1296,237 @@ static enum ib_mtu __to_ib_mtu(u32 mtu) } } +/* Shared Receive Queues */ +int bnxt_re_destroy_srq(struct ib_srq *ib_srq) +{ + struct bnxt_re_srq *srq = container_of(ib_srq, struct bnxt_re_srq, + ib_srq); + struct bnxt_re_dev *rdev = srq->rdev; + struct bnxt_qplib_srq *qplib_srq = &srq->qplib_srq; + struct bnxt_qplib_nq *nq = NULL; + int rc; + + if (qplib_srq->cq) + nq = qplib_srq->cq->nq; + rc = bnxt_qplib_destroy_srq(&rdev->qplib_res, qplib_srq); + if (rc) { + dev_err(rdev_to_dev(rdev), "Destroy HW SRQ failed!"); + return rc; + } + + if (srq->umem && !IS_ERR(srq->umem)) + ib_umem_release(srq->umem); + kfree(srq); + atomic_dec(&rdev->srq_count); + if (nq) + nq->budget--; + return 0; +} + +static int bnxt_re_init_user_srq(struct bnxt_re_dev *rdev, + struct bnxt_re_pd *pd, + struct bnxt_re_srq *srq, + struct ib_udata *udata) +{ + struct bnxt_re_srq_req ureq; + struct bnxt_qplib_srq *qplib_srq = &srq->qplib_srq; + struct ib_umem *umem; + int bytes = 0; + struct ib_ucontext *context = pd->ib_pd.uobject->context; + struct bnxt_re_ucontext *cntx = container_of(context, + struct bnxt_re_ucontext, + ib_uctx); + if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) + return -EFAULT; + + bytes = (qplib_srq->max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE); + bytes = PAGE_ALIGN(bytes); + umem = ib_umem_get(context, ureq.srqva, bytes, + IB_ACCESS_LOCAL_WRITE, 1); + if (IS_ERR(umem)) + return PTR_ERR(umem); + + srq->umem = umem; + qplib_srq->nmap = umem->nmap; + qplib_srq->sglist = umem->sg_head.sgl; + qplib_srq->srq_handle = ureq.srq_handle; + 
qplib_srq->dpi = &cntx->dpi; + + return 0; +} + +struct ib_srq *bnxt_re_create_srq(struct ib_pd *ib_pd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata) +{ + struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + struct bnxt_re_dev *rdev = pd->rdev; + struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; + struct bnxt_re_srq *srq; + struct bnxt_qplib_nq *nq = NULL; + int rc, entries; + + if (srq_init_attr->attr.max_wr >= dev_attr->max_srq_wqes) { + dev_err(rdev_to_dev(rdev), "Create CQ failed - max exceeded"); + rc = -EINVAL; + goto exit; + } + + if (srq_init_attr->srq_type != IB_SRQT_BASIC) { + rc = -ENOTSUPP; + goto exit; + } + + srq = kzalloc(sizeof(*srq), GFP_KERNEL); + if (!srq) { + rc = -ENOMEM; + goto exit; + } + srq->rdev = rdev; + srq->qplib_srq.pd = &pd->qplib_pd; + srq->qplib_srq.dpi = &rdev->dpi_privileged; + /* Allocate 1 more than what's provided so posting max doesn't + * mean empty + */ + entries = roundup_pow_of_two(srq_init_attr->attr.max_wr + 1); + if (entries > dev_attr->max_srq_wqes + 1) + entries = dev_attr->max_srq_wqes + 1; + + srq->qplib_srq.max_wqe = entries; + srq->qplib_srq.max_sge = srq_init_attr->attr.max_sge; + srq->qplib_srq.threshold = srq_init_attr->attr.srq_limit; + srq->srq_limit = srq_init_attr->attr.srq_limit; + srq->qplib_srq.eventq_hw_ring_id = rdev->nq[0].ring_id; + nq = &rdev->nq[0]; + + if (udata) { + rc = bnxt_re_init_user_srq(rdev, pd, srq, udata); + if (rc) + goto fail; + } + + rc = bnxt_qplib_create_srq(&rdev->qplib_res, &srq->qplib_srq); + if (rc) { + dev_err(rdev_to_dev(rdev), "Create HW SRQ failed!"); + goto fail; + } + + if (udata) { + struct bnxt_re_srq_resp resp; + + resp.srqid = srq->qplib_srq.id; + rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (rc) { + dev_err(rdev_to_dev(rdev), "SRQ copy to udata failed!"); + bnxt_qplib_destroy_srq(&rdev->qplib_res, + &srq->qplib_srq); + goto exit; + } + } + if (nq) + nq->budget++; + atomic_inc(&rdev->srq_count); + + return &srq->ib_srq; + +fail: + if (udata && srq->umem && !IS_ERR(srq->umem)) { + ib_umem_release(srq->umem); + srq->umem = NULL; + } + + kfree(srq); +exit: + return ERR_PTR(rc); +} + +int bnxt_re_modify_srq(struct ib_srq *ib_srq, struct ib_srq_attr *srq_attr, + enum ib_srq_attr_mask srq_attr_mask, + struct ib_udata *udata) +{ + struct bnxt_re_srq *srq = container_of(ib_srq, struct bnxt_re_srq, + ib_srq); + struct bnxt_re_dev *rdev = srq->rdev; + int rc; + + switch (srq_attr_mask) { + case IB_SRQ_MAX_WR: + /* SRQ resize is not supported */ + break; + case IB_SRQ_LIMIT: + /* Change the SRQ threshold */ + if (srq_attr->srq_limit > srq->qplib_srq.max_wqe) + return -EINVAL; + + srq->qplib_srq.threshold = srq_attr->srq_limit; + rc = bnxt_qplib_modify_srq(&rdev->qplib_res, &srq->qplib_srq); + if (rc) { + dev_err(rdev_to_dev(rdev), "Modify HW SRQ failed!"); + return rc; + } + /* On success, update the shadow */ + srq->srq_limit = srq_attr->srq_limit; + /* No need to Build and send response back to udata */ + break; + default: + dev_err(rdev_to_dev(rdev), + "Unsupported srq_attr_mask 0x%x", srq_attr_mask); + return -EINVAL; + } + return 0; +} + +int bnxt_re_query_srq(struct ib_srq *ib_srq, struct ib_srq_attr *srq_attr) +{ + struct bnxt_re_srq *srq = container_of(ib_srq, struct bnxt_re_srq, + ib_srq); + struct bnxt_re_srq tsrq; + struct bnxt_re_dev *rdev = srq->rdev; + int rc; + + /* Get live SRQ attr */ + tsrq.qplib_srq.id = srq->qplib_srq.id; + rc = bnxt_qplib_query_srq(&rdev->qplib_res, &tsrq.qplib_srq); + if (rc) { + dev_err(rdev_to_dev(rdev), 
"Query HW SRQ failed!"); + return rc; + } + srq_attr->max_wr = srq->qplib_srq.max_wqe; + srq_attr->max_sge = srq->qplib_srq.max_sge; + srq_attr->srq_limit = tsrq.qplib_srq.threshold; + + return 0; +} + +int bnxt_re_post_srq_recv(struct ib_srq *ib_srq, struct ib_recv_wr *wr, + struct ib_recv_wr **bad_wr) +{ + struct bnxt_re_srq *srq = container_of(ib_srq, struct bnxt_re_srq, + ib_srq); + struct bnxt_qplib_swqe wqe; + unsigned long flags; + int rc = 0, payload_sz = 0; + + spin_lock_irqsave(&srq->lock, flags); + while (wr) { + /* Transcribe each ib_recv_wr to qplib_swqe */ + wqe.num_sge = wr->num_sge; + payload_sz = bnxt_re_build_sgl(wr->sg_list, wqe.sg_list, + wr->num_sge); + wqe.wr_id = wr->wr_id; + wqe.type = BNXT_QPLIB_SWQE_TYPE_RECV; + + rc = bnxt_qplib_post_srq_recv(&srq->qplib_srq, &wqe); + if (rc) { + *bad_wr = wr; + break; + } + wr = wr->next; + } + spin_unlock_irqrestore(&srq->lock, flags); + + return rc; +} static int bnxt_re_modify_shadow_qp(struct bnxt_re_dev *rdev, struct bnxt_re_qp *qp1_qp, int qp_attr_mask) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index 66dd8d2c4655..423ebe012f95 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -68,6 +68,15 @@ struct bnxt_re_ah { struct bnxt_qplib_ah qplib_ah; }; +struct bnxt_re_srq { + struct bnxt_re_dev *rdev; + u32 srq_limit; + struct ib_srq ib_srq; + struct bnxt_qplib_srq qplib_srq; + struct ib_umem *umem; + spinlock_t lock; /* protect srq */ +}; + struct bnxt_re_qp { struct list_head list; struct bnxt_re_dev *rdev; @@ -165,6 +174,16 @@ struct ib_ah *bnxt_re_create_ah(struct ib_pd *pd, int bnxt_re_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); int bnxt_re_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); int bnxt_re_destroy_ah(struct ib_ah *ah); +struct ib_srq *bnxt_re_create_srq(struct ib_pd *pd, + struct ib_srq_init_attr *srq_init_attr, + struct ib_udata *udata); +int bnxt_re_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr, + enum ib_srq_attr_mask srq_attr_mask, + struct ib_udata *udata); +int bnxt_re_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr); +int bnxt_re_destroy_srq(struct ib_srq *srq); +int bnxt_re_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *recv_wr, + struct ib_recv_wr **bad_recv_wr); struct ib_qp *bnxt_re_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 3caf70a103e6..508d00a5a106 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -588,6 +588,12 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) ibdev->query_ah = bnxt_re_query_ah; ibdev->destroy_ah = bnxt_re_destroy_ah; + ibdev->create_srq = bnxt_re_create_srq; + ibdev->modify_srq = bnxt_re_modify_srq; + ibdev->query_srq = bnxt_re_query_srq; + ibdev->destroy_srq = bnxt_re_destroy_srq; + ibdev->post_srq_recv = bnxt_re_post_srq_recv; + ibdev->create_qp = bnxt_re_create_qp; ibdev->modify_qp = bnxt_re_modify_qp; ibdev->query_qp = bnxt_re_query_qp; @@ -689,10 +695,10 @@ static struct bnxt_re_dev *bnxt_re_dev_add(struct net_device *netdev, return rdev; } -static int bnxt_re_aeq_handler(struct bnxt_qplib_rcfw *rcfw, - struct creq_func_event *aeqe) +static int bnxt_re_handle_unaffi_async_event(struct creq_func_event + *unaffi_async) { - switch (aeqe->event) { + switch (unaffi_async->event) { case CREQ_FUNC_EVENT_EVENT_TX_WQE_ERROR: 
break; case CREQ_FUNC_EVENT_EVENT_TX_DATA_ERROR: @@ -721,6 +727,93 @@ static int bnxt_re_aeq_handler(struct bnxt_qplib_rcfw *rcfw, return 0; } +static int bnxt_re_handle_qp_async_event(struct creq_qp_event *qp_event, + struct bnxt_re_qp *qp) +{ + struct ib_event event; + + memset(&event, 0, sizeof(event)); + if (qp->qplib_qp.srq) { + event.device = &qp->rdev->ibdev; + event.element.qp = &qp->ib_qp; + event.event = IB_EVENT_QP_LAST_WQE_REACHED; + } + + if (event.device && qp->ib_qp.event_handler) + qp->ib_qp.event_handler(&event, qp->ib_qp.qp_context); + + return 0; +} + +static int bnxt_re_handle_affi_async_event(struct creq_qp_event *affi_async, + void *obj) +{ + int rc = 0; + u8 event; + + if (!obj) + return rc; /* QP was already dead, still return success */ + + event = affi_async->event; + if (event == CREQ_QP_EVENT_EVENT_QP_ERROR_NOTIFICATION) { + struct bnxt_qplib_qp *lib_qp = obj; + struct bnxt_re_qp *qp = container_of(lib_qp, struct bnxt_re_qp, + qplib_qp); + rc = bnxt_re_handle_qp_async_event(affi_async, qp); + } + return rc; +} + +static int bnxt_re_aeq_handler(struct bnxt_qplib_rcfw *rcfw, + void *aeqe, void *obj) +{ + struct creq_qp_event *affi_async; + struct creq_func_event *unaffi_async; + u8 type; + int rc; + + type = ((struct creq_base *)aeqe)->type; + if (type == CREQ_BASE_TYPE_FUNC_EVENT) { + unaffi_async = aeqe; + rc = bnxt_re_handle_unaffi_async_event(unaffi_async); + } else { + affi_async = aeqe; + rc = bnxt_re_handle_affi_async_event(affi_async, obj); + } + + return rc; +} + +static int bnxt_re_srqn_handler(struct bnxt_qplib_nq *nq, + struct bnxt_qplib_srq *handle, u8 event) +{ + struct bnxt_re_srq *srq = container_of(handle, struct bnxt_re_srq, + qplib_srq); + struct ib_event ib_event; + int rc = 0; + + if (!srq) { + dev_err(NULL, "%s: SRQ is NULL, SRQN not handled", + ROCE_DRV_MODULE_NAME); + rc = -EINVAL; + goto done; + } + ib_event.device = &srq->rdev->ibdev; + ib_event.element.srq = &srq->ib_srq; + if (event == NQ_SRQ_EVENT_EVENT_SRQ_THRESHOLD_EVENT) + ib_event.event = IB_EVENT_SRQ_LIMIT_REACHED; + else + ib_event.event = IB_EVENT_SRQ_ERR; + + if (srq->ib_srq.event_handler) { + /* Lock event_handler? 
*/ + (*srq->ib_srq.event_handler)(&ib_event, + srq->ib_srq.srq_context); + } +done: + return rc; +} + static int bnxt_re_cqn_handler(struct bnxt_qplib_nq *nq, struct bnxt_qplib_cq *handle) { @@ -763,7 +856,8 @@ static int bnxt_re_init_res(struct bnxt_re_dev *rdev) rc = bnxt_qplib_enable_nq(rdev->en_dev->pdev, &rdev->nq[i - 1], i - 1, rdev->msix_entries[i].vector, rdev->msix_entries[i].db_offset, - &bnxt_re_cqn_handler, NULL); + &bnxt_re_cqn_handler, + &bnxt_re_srqn_handler); if (rc) { dev_err(rdev_to_dev(rdev), diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index eb7195c20b88..8b5f11ac0e42 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -52,6 +52,7 @@ static void bnxt_qplib_arm_cq_enable(struct bnxt_qplib_cq *cq); static void __clean_cq(struct bnxt_qplib_cq *cq, u64 qp); +static void bnxt_qplib_arm_srq(struct bnxt_qplib_srq *srq, u32 arm_type); static void bnxt_qplib_cancel_phantom_processing(struct bnxt_qplib_qp *qp) { @@ -278,6 +279,7 @@ static void bnxt_qplib_service_nq(unsigned long data) struct nq_base *nqe, **nq_ptr; struct bnxt_qplib_cq *cq; int num_cqne_processed = 0; + int num_srqne_processed = 0; u32 sw_cons, raw_cons; u16 type; int budget = nq->budget; @@ -320,6 +322,26 @@ static void bnxt_qplib_service_nq(unsigned long data) spin_unlock_bh(&cq->compl_lock); break; } + case NQ_BASE_TYPE_SRQ_EVENT: + { + struct nq_srq_event *nqsrqe = + (struct nq_srq_event *)nqe; + + q_handle = le32_to_cpu(nqsrqe->srq_handle_low); + q_handle |= (u64)le32_to_cpu(nqsrqe->srq_handle_high) + << 32; + bnxt_qplib_arm_srq((struct bnxt_qplib_srq *)q_handle, + DBR_DBR_TYPE_SRQ_ARMENA); + if (!nq->srqn_handler(nq, + (struct bnxt_qplib_srq *)q_handle, + nqsrqe->event)) + num_srqne_processed++; + else + dev_warn(&nq->pdev->dev, + "QPLIB: SRQ event 0x%x not handled", + nqsrqe->event); + break; + } case NQ_BASE_TYPE_DBQ_EVENT: break; default: @@ -384,17 +406,19 @@ int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq, int (*cqn_handler)(struct bnxt_qplib_nq *nq, struct bnxt_qplib_cq *), int (*srqn_handler)(struct bnxt_qplib_nq *nq, - void *, u8 event)) + struct bnxt_qplib_srq *, + u8 event)) { resource_size_t nq_base; int rc = -1; nq->pdev = pdev; nq->vector = msix_vector; + if (cqn_handler) + nq->cqn_handler = cqn_handler; - nq->cqn_handler = cqn_handler; - - nq->srqn_handler = srqn_handler; + if (srqn_handler) + nq->srqn_handler = srqn_handler; tasklet_init(&nq->worker, bnxt_qplib_service_nq, (unsigned long)nq); @@ -468,6 +492,238 @@ int bnxt_qplib_alloc_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq) return 0; } +/* SRQ */ +static void bnxt_qplib_arm_srq(struct bnxt_qplib_srq *srq, u32 arm_type) +{ + struct bnxt_qplib_hwq *srq_hwq = &srq->hwq; + struct dbr_dbr db_msg = { 0 }; + void __iomem *db; + u32 sw_prod = 0; + + /* Ring DB */ + sw_prod = (arm_type == DBR_DBR_TYPE_SRQ_ARM) ? srq->threshold : + HWQ_CMP(srq_hwq->prod, srq_hwq); + db_msg.index = cpu_to_le32((sw_prod << DBR_DBR_INDEX_SFT) & + DBR_DBR_INDEX_MASK); + db_msg.type_xid = cpu_to_le32(((srq->id << DBR_DBR_XID_SFT) & + DBR_DBR_XID_MASK) | arm_type); + db = (arm_type == DBR_DBR_TYPE_SRQ_ARMENA) ? 
+ srq->dbr_base : srq->dpi->dbr; + wmb(); /* barrier before db ring */ + __iowrite64_copy(db, &db_msg, sizeof(db_msg) / sizeof(u64)); +} + +int bnxt_qplib_destroy_srq(struct bnxt_qplib_res *res, + struct bnxt_qplib_srq *srq) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_destroy_srq req; + struct creq_destroy_srq_resp resp; + u16 cmd_flags = 0; + int rc; + + RCFW_CMD_PREP(req, DESTROY_SRQ, cmd_flags); + + /* Configure the request */ + req.srq_cid = cpu_to_le32(srq->id); + + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + (void *)&resp, NULL, 0); + if (rc) + return rc; + + bnxt_qplib_free_hwq(res->pdev, &srq->hwq); + kfree(srq->swq); + return 0; +} + +int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, + struct bnxt_qplib_srq *srq) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_create_srq req; + struct creq_create_srq_resp resp; + struct bnxt_qplib_pbl *pbl; + u16 cmd_flags = 0; + int rc, idx; + + srq->hwq.max_elements = srq->max_wqe; + rc = bnxt_qplib_alloc_init_hwq(res->pdev, &srq->hwq, srq->sglist, + srq->nmap, &srq->hwq.max_elements, + BNXT_QPLIB_MAX_RQE_ENTRY_SIZE, 0, + PAGE_SIZE, HWQ_TYPE_QUEUE); + if (rc) + goto exit; + + srq->swq = kcalloc(srq->hwq.max_elements, sizeof(*srq->swq), + GFP_KERNEL); + if (!srq->swq) + goto fail; + + RCFW_CMD_PREP(req, CREATE_SRQ, cmd_flags); + + /* Configure the request */ + req.dpi = cpu_to_le32(srq->dpi->dpi); + req.srq_handle = cpu_to_le64(srq); + + req.srq_size = cpu_to_le16((u16)srq->hwq.max_elements); + pbl = &srq->hwq.pbl[PBL_LVL_0]; + req.pg_size_lvl = cpu_to_le16((((u16)srq->hwq.level & + CMDQ_CREATE_SRQ_LVL_MASK) << + CMDQ_CREATE_SRQ_LVL_SFT) | + (pbl->pg_size == ROCE_PG_SIZE_4K ? + CMDQ_CREATE_SRQ_PG_SIZE_PG_4K : + pbl->pg_size == ROCE_PG_SIZE_8K ? + CMDQ_CREATE_SRQ_PG_SIZE_PG_8K : + pbl->pg_size == ROCE_PG_SIZE_64K ? + CMDQ_CREATE_SRQ_PG_SIZE_PG_64K : + pbl->pg_size == ROCE_PG_SIZE_2M ? + CMDQ_CREATE_SRQ_PG_SIZE_PG_2M : + pbl->pg_size == ROCE_PG_SIZE_8M ? + CMDQ_CREATE_SRQ_PG_SIZE_PG_8M : + pbl->pg_size == ROCE_PG_SIZE_1G ? + CMDQ_CREATE_SRQ_PG_SIZE_PG_1G : + CMDQ_CREATE_SRQ_PG_SIZE_PG_4K)); + req.pbl = cpu_to_le64(pbl->pg_map_arr[0]); + req.pd_id = cpu_to_le32(srq->pd->id); + req.eventq_id = cpu_to_le16(srq->eventq_hw_ring_id); + + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, + (void *)&resp, NULL, 0); + if (rc) + goto fail; + + spin_lock_init(&srq->lock); + srq->start_idx = 0; + srq->last_idx = srq->hwq.max_elements - 1; + for (idx = 0; idx < srq->hwq.max_elements; idx++) + srq->swq[idx].next_idx = idx + 1; + srq->swq[srq->last_idx].next_idx = -1; + + srq->id = le32_to_cpu(resp.xid); + srq->dbr_base = res->dpi_tbl.dbr_bar_reg_iomem; + if (srq->threshold) + bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ_ARMENA); + srq->arm_req = false; + + return 0; +fail: + bnxt_qplib_free_hwq(res->pdev, &srq->hwq); + kfree(srq->swq); +exit: + return rc; +} + +int bnxt_qplib_modify_srq(struct bnxt_qplib_res *res, + struct bnxt_qplib_srq *srq) +{ + struct bnxt_qplib_hwq *srq_hwq = &srq->hwq; + u32 sw_prod, sw_cons, count = 0; + + sw_prod = HWQ_CMP(srq_hwq->prod, srq_hwq); + sw_cons = HWQ_CMP(srq_hwq->cons, srq_hwq); + + count = sw_prod > sw_cons ? 
sw_prod - sw_cons : + srq_hwq->max_elements - sw_cons + sw_prod; + if (count > srq->threshold) { + srq->arm_req = false; + bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ_ARM); + } else { + /* Deferred arming */ + srq->arm_req = true; + } + + return 0; +} + +int bnxt_qplib_query_srq(struct bnxt_qplib_res *res, + struct bnxt_qplib_srq *srq) +{ + struct bnxt_qplib_rcfw *rcfw = res->rcfw; + struct cmdq_query_srq req; + struct creq_query_srq_resp resp; + struct bnxt_qplib_rcfw_sbuf *sbuf; + struct creq_query_srq_resp_sb *sb; + u16 cmd_flags = 0; + int rc = 0; + + RCFW_CMD_PREP(req, QUERY_SRQ, cmd_flags); + req.srq_cid = cpu_to_le32(srq->id); + + /* Configure the request */ + sbuf = bnxt_qplib_rcfw_alloc_sbuf(rcfw, sizeof(*sb)); + if (!sbuf) + return -ENOMEM; + sb = sbuf->sb; + rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp, + (void *)sbuf, 0); + srq->threshold = le16_to_cpu(sb->srq_limit); + bnxt_qplib_rcfw_free_sbuf(rcfw, sbuf); + + return rc; +} + +int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq, + struct bnxt_qplib_swqe *wqe) +{ + struct bnxt_qplib_hwq *srq_hwq = &srq->hwq; + struct rq_wqe *srqe, **srqe_ptr; + struct sq_sge *hw_sge; + u32 sw_prod, sw_cons, count = 0; + int i, rc = 0, next; + + spin_lock(&srq_hwq->lock); + if (srq->start_idx == srq->last_idx) { + dev_err(&srq_hwq->pdev->dev, "QPLIB: FP: SRQ (0x%x) is full!", + srq->id); + rc = -EINVAL; + spin_unlock(&srq_hwq->lock); + goto done; + } + next = srq->start_idx; + srq->start_idx = srq->swq[next].next_idx; + spin_unlock(&srq_hwq->lock); + + sw_prod = HWQ_CMP(srq_hwq->prod, srq_hwq); + srqe_ptr = (struct rq_wqe **)srq_hwq->pbl_ptr; + srqe = &srqe_ptr[RQE_PG(sw_prod)][RQE_IDX(sw_prod)]; + memset(srqe, 0, BNXT_QPLIB_MAX_RQE_ENTRY_SIZE); + /* Calculate wqe_size16 and data_len */ + for (i = 0, hw_sge = (struct sq_sge *)srqe->data; + i < wqe->num_sge; i++, hw_sge++) { + hw_sge->va_or_pa = cpu_to_le64(wqe->sg_list[i].addr); + hw_sge->l_key = cpu_to_le32(wqe->sg_list[i].lkey); + hw_sge->size = cpu_to_le32(wqe->sg_list[i].size); + } + srqe->wqe_type = wqe->type; + srqe->flags = wqe->flags; + srqe->wqe_size = wqe->num_sge + + ((offsetof(typeof(*srqe), data) + 15) >> 4); + srqe->wr_id[0] = cpu_to_le32((u32)next); + srq->swq[next].wr_id = wqe->wr_id; + + srq_hwq->prod++; + + spin_lock(&srq_hwq->lock); + sw_prod = HWQ_CMP(srq_hwq->prod, srq_hwq); + /* retaining srq_hwq->cons for this logic + * actually the lock is only required to + * read srq_hwq->cons. + */ + sw_cons = HWQ_CMP(srq_hwq->cons, srq_hwq); + count = sw_prod > sw_cons ? sw_prod - sw_cons : + srq_hwq->max_elements - sw_cons + sw_prod; + spin_unlock(&srq_hwq->lock); + /* Ring DB */ + bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ); + if (srq->arm_req == true && count > srq->threshold) { + srq->arm_req = false; + bnxt_qplib_arm_srq(srq, DBR_DBR_TYPE_SRQ_ARM); + } +done: + return rc; +} + /* QP */ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) { @@ -736,6 +992,12 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) pbl->pg_size == ROCE_PG_SIZE_1G ? 
CMDQ_CREATE_QP_RQ_PG_SIZE_PG_1G : CMDQ_CREATE_QP_RQ_PG_SIZE_PG_4K); + } else { + /* SRQ */ + if (qp->srq) { + qp_flags |= CMDQ_CREATE_QP_QP_FLAGS_SRQ_USED; + req.srq_cid = cpu_to_le32(qp->srq->id); + } } if (qp->rcq) @@ -2067,6 +2329,16 @@ done: return rc; } +static void bnxt_qplib_release_srqe(struct bnxt_qplib_srq *srq, u32 tag) +{ + spin_lock(&srq->hwq.lock); + srq->swq[srq->last_idx].next_idx = (int)tag; + srq->last_idx = (int)tag; + srq->swq[srq->last_idx].next_idx = -1; + srq->hwq.cons++; /* Support for SRQE counter */ + spin_unlock(&srq->hwq.lock); +} + static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, struct cq_res_rc *hwcqe, struct bnxt_qplib_cqe **pcqe, @@ -2074,6 +2346,7 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, { struct bnxt_qplib_qp *qp; struct bnxt_qplib_q *rq; + struct bnxt_qplib_srq *srq; struct bnxt_qplib_cqe *cqe; u32 wr_id_idx; int rc = 0; @@ -2101,27 +2374,46 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, wr_id_idx = le32_to_cpu(hwcqe->srq_or_rq_wr_id) & CQ_RES_RC_SRQ_OR_RQ_WR_ID_MASK; - rq = &qp->rq; - if (wr_id_idx > rq->hwq.max_elements) { - dev_err(&cq->hwq.pdev->dev, "QPLIB: FP: CQ Process RC "); - dev_err(&cq->hwq.pdev->dev, - "QPLIB: wr_id idx 0x%x exceeded RQ max 0x%x", - wr_id_idx, rq->hwq.max_elements); - return -EINVAL; - } - - cqe->wr_id = rq->swq[wr_id_idx].wr_id; - cqe++; - (*budget)--; - rq->hwq.cons++; - *pcqe = cqe; + if (cqe->flags & CQ_RES_RC_FLAGS_SRQ_SRQ) { + srq = qp->srq; + if (!srq) + return -EINVAL; + if (wr_id_idx > srq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process RC "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: wr_id idx 0x%x exceeded SRQ max 0x%x", + wr_id_idx, srq->hwq.max_elements); + return -EINVAL; + } + cqe->wr_id = srq->swq[wr_id_idx].wr_id; + bnxt_qplib_release_srqe(srq, wr_id_idx); + cqe++; + (*budget)--; + *pcqe = cqe; + } else { + rq = &qp->rq; + if (wr_id_idx > rq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process RC "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: wr_id idx 0x%x exceeded RQ max 0x%x", + wr_id_idx, rq->hwq.max_elements); + return -EINVAL; + } + cqe->wr_id = rq->swq[wr_id_idx].wr_id; + cqe++; + (*budget)--; + rq->hwq.cons++; + *pcqe = cqe; - if (hwcqe->status != CQ_RES_RC_STATUS_OK) { - qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR; - /* Add qp to flush list of the CQ */ - bnxt_qplib_lock_buddy_cq(qp, cq); - __bnxt_qplib_add_flush_qp(qp); - bnxt_qplib_unlock_buddy_cq(qp, cq); + if (hwcqe->status != CQ_RES_RC_STATUS_OK) { + qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR; + /* Add qp to flush list of the CQ */ + bnxt_qplib_lock_buddy_cq(qp, cq); + __bnxt_qplib_add_flush_qp(qp); + bnxt_qplib_unlock_buddy_cq(qp, cq); + } } done: @@ -2135,6 +2427,7 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, { struct bnxt_qplib_qp *qp; struct bnxt_qplib_q *rq; + struct bnxt_qplib_srq *srq; struct bnxt_qplib_cqe *cqe; u32 wr_id_idx; int rc = 0; @@ -2165,27 +2458,48 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, hwcqe->src_qp_high_srq_or_rq_wr_id) & CQ_RES_UD_SRC_QP_HIGH_MASK) >> 8); - rq = &qp->rq; - if (wr_id_idx > rq->hwq.max_elements) { - dev_err(&cq->hwq.pdev->dev, "QPLIB: FP: CQ Process UD "); - dev_err(&cq->hwq.pdev->dev, - "QPLIB: wr_id idx %#x exceeded RQ max %#x", - wr_id_idx, rq->hwq.max_elements); - return -EINVAL; - } + if (cqe->flags & CQ_RES_RC_FLAGS_SRQ_SRQ) { + srq = qp->srq; + if (!srq) + return -EINVAL; - cqe->wr_id = rq->swq[wr_id_idx].wr_id; - cqe++; - (*budget)--; - 
rq->hwq.cons++; - *pcqe = cqe; + if (wr_id_idx > srq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process UD "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: wr_id idx 0x%x exceeded SRQ max 0x%x", + wr_id_idx, srq->hwq.max_elements); + return -EINVAL; + } + cqe->wr_id = srq->swq[wr_id_idx].wr_id; + bnxt_qplib_release_srqe(srq, wr_id_idx); + cqe++; + (*budget)--; + *pcqe = cqe; + } else { + rq = &qp->rq; + if (wr_id_idx > rq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process UD "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: wr_id idx 0x%x exceeded RQ max 0x%x", + wr_id_idx, rq->hwq.max_elements); + return -EINVAL; + } - if (hwcqe->status != CQ_RES_RC_STATUS_OK) { - qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR; - /* Add qp to flush list of the CQ */ - bnxt_qplib_lock_buddy_cq(qp, cq); - __bnxt_qplib_add_flush_qp(qp); - bnxt_qplib_unlock_buddy_cq(qp, cq); + cqe->wr_id = rq->swq[wr_id_idx].wr_id; + cqe++; + (*budget)--; + rq->hwq.cons++; + *pcqe = cqe; + + if (hwcqe->status != CQ_RES_RC_STATUS_OK) { + qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR; + /* Add qp to flush list of the CQ */ + bnxt_qplib_lock_buddy_cq(qp, cq); + __bnxt_qplib_add_flush_qp(qp); + bnxt_qplib_unlock_buddy_cq(qp, cq); + } } done: return rc; @@ -2217,6 +2531,7 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq, { struct bnxt_qplib_qp *qp; struct bnxt_qplib_q *rq; + struct bnxt_qplib_srq *srq; struct bnxt_qplib_cqe *cqe; u32 wr_id_idx; int rc = 0; @@ -2255,26 +2570,49 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq, cqe->raweth_qp1_flags2 = le32_to_cpu(hwcqe->raweth_qp1_flags2); cqe->raweth_qp1_metadata = le32_to_cpu(hwcqe->raweth_qp1_metadata); - rq = &qp->rq; - if (wr_id_idx > rq->hwq.max_elements) { - dev_err(&cq->hwq.pdev->dev, "QPLIB: FP: CQ Process Raw/QP1 RQ wr_id "); - dev_err(&cq->hwq.pdev->dev, "QPLIB: ix 0x%x exceeded RQ max 0x%x", - wr_id_idx, rq->hwq.max_elements); - return -EINVAL; - } - - cqe->wr_id = rq->swq[wr_id_idx].wr_id; - cqe++; - (*budget)--; - rq->hwq.cons++; - *pcqe = cqe; + if (cqe->flags & CQ_RES_RAWETH_QP1_FLAGS_SRQ_SRQ) { + srq = qp->srq; + if (!srq) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: SRQ used but not defined??"); + return -EINVAL; + } + if (wr_id_idx > srq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process Raw/QP1 "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: wr_id idx 0x%x exceeded SRQ max 0x%x", + wr_id_idx, srq->hwq.max_elements); + return -EINVAL; + } + cqe->wr_id = srq->swq[wr_id_idx].wr_id; + bnxt_qplib_release_srqe(srq, wr_id_idx); + cqe++; + (*budget)--; + *pcqe = cqe; + } else { + rq = &qp->rq; + if (wr_id_idx > rq->hwq.max_elements) { + dev_err(&cq->hwq.pdev->dev, + "QPLIB: FP: CQ Process Raw/QP1 RQ wr_id "); + dev_err(&cq->hwq.pdev->dev, + "QPLIB: ix 0x%x exceeded RQ max 0x%x", + wr_id_idx, rq->hwq.max_elements); + return -EINVAL; + } + cqe->wr_id = rq->swq[wr_id_idx].wr_id; + cqe++; + (*budget)--; + rq->hwq.cons++; + *pcqe = cqe; - if (hwcqe->status != CQ_RES_RC_STATUS_OK) { - qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR; - /* Add qp to flush list of the CQ */ - bnxt_qplib_lock_buddy_cq(qp, cq); - __bnxt_qplib_add_flush_qp(qp); - bnxt_qplib_unlock_buddy_cq(qp, cq); + if (hwcqe->status != CQ_RES_RC_STATUS_OK) { + qp->state = CMDQ_MODIFY_QP_NEW_STATE_ERR; + /* Add qp to flush list of the CQ */ + bnxt_qplib_lock_buddy_cq(qp, cq); + __bnxt_qplib_add_flush_qp(qp); + bnxt_qplib_unlock_buddy_cq(qp, cq); + } } done: diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h 
b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index c582d4ec8173..211b27a8f9e2 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -39,6 +39,27 @@ #ifndef __BNXT_QPLIB_FP_H__ #define __BNXT_QPLIB_FP_H__ +struct bnxt_qplib_srq { + struct bnxt_qplib_pd *pd; + struct bnxt_qplib_dpi *dpi; + void __iomem *dbr_base; + u64 srq_handle; + u32 id; + u32 max_wqe; + u32 max_sge; + u32 threshold; + bool arm_req; + struct bnxt_qplib_cq *cq; + struct bnxt_qplib_hwq hwq; + struct bnxt_qplib_swq *swq; + struct scatterlist *sglist; + int start_idx; + int last_idx; + u32 nmap; + u16 eventq_hw_ring_id; + spinlock_t lock; /* protect SRQE link list */ +}; + struct bnxt_qplib_sge { u64 addr; u32 lkey; @@ -79,6 +100,7 @@ static inline u32 get_psne_idx(u32 val) struct bnxt_qplib_swq { u64 wr_id; + int next_idx; u8 type; u8 flags; u32 start_psn; @@ -404,29 +426,27 @@ struct bnxt_qplib_cq { writel(NQ_DB_CP_FLAGS | ((raw_cons) & ((cp_bit) - 1)), db) struct bnxt_qplib_nq { - struct pci_dev *pdev; - - int vector; - cpumask_t mask; - int budget; - bool requested; - struct tasklet_struct worker; - struct bnxt_qplib_hwq hwq; - - u16 bar_reg; - u16 bar_reg_off; - u16 ring_id; - void __iomem *bar_reg_iomem; - - int (*cqn_handler) - (struct bnxt_qplib_nq *nq, - struct bnxt_qplib_cq *cq); - int (*srqn_handler) - (struct bnxt_qplib_nq *nq, - void *srq, - u8 event); - struct workqueue_struct *cqn_wq; - char name[32]; + struct pci_dev *pdev; + + int vector; + cpumask_t mask; + int budget; + bool requested; + struct tasklet_struct worker; + struct bnxt_qplib_hwq hwq; + + u16 bar_reg; + u16 bar_reg_off; + u16 ring_id; + void __iomem *bar_reg_iomem; + + int (*cqn_handler)(struct bnxt_qplib_nq *nq, + struct bnxt_qplib_cq *cq); + int (*srqn_handler)(struct bnxt_qplib_nq *nq, + struct bnxt_qplib_srq *srq, + u8 event); + struct workqueue_struct *cqn_wq; + char name[32]; }; struct bnxt_qplib_nq_work { @@ -441,8 +461,18 @@ int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq, int (*cqn_handler)(struct bnxt_qplib_nq *nq, struct bnxt_qplib_cq *cq), int (*srqn_handler)(struct bnxt_qplib_nq *nq, - void *srq, + struct bnxt_qplib_srq *srq, u8 event)); +int bnxt_qplib_create_srq(struct bnxt_qplib_res *res, + struct bnxt_qplib_srq *srq); +int bnxt_qplib_modify_srq(struct bnxt_qplib_res *res, + struct bnxt_qplib_srq *srq); +int bnxt_qplib_query_srq(struct bnxt_qplib_res *res, + struct bnxt_qplib_srq *srq); +int bnxt_qplib_destroy_srq(struct bnxt_qplib_res *res, + struct bnxt_qplib_srq *srq); +int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq, + struct bnxt_qplib_swqe *wqe); int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp); int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp); int bnxt_qplib_modify_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 6a3633af1d52..8329ec6a7946 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -616,7 +616,7 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev, int msix_vector, int cp_bar_reg_off, int virt_fn, int (*aeq_handler)(struct bnxt_qplib_rcfw *, - struct creq_func_event *)) + void *, void *)) { resource_size_t res_base; struct cmdq_init init; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index 2946a7cfae82..6bee6e3636ea 100644 --- 
a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -167,7 +167,7 @@ struct bnxt_qplib_rcfw { #define FIRMWARE_TIMED_OUT 3 wait_queue_head_t waitq; int (*aeq_handler)(struct bnxt_qplib_rcfw *, - struct creq_func_event *); + void *, void *); u32 seq_num; /* Bar region info */ @@ -199,9 +199,8 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev, struct bnxt_qplib_rcfw *rcfw, int msix_vector, int cp_bar_reg_off, int virt_fn, - int (*aeq_handler) - (struct bnxt_qplib_rcfw *, - struct creq_func_event *)); + int (*aeq_handler)(struct bnxt_qplib_rcfw *, + void *aeqe, void *obj)); struct bnxt_qplib_rcfw_sbuf *bnxt_qplib_rcfw_alloc_sbuf( struct bnxt_qplib_rcfw *rcfw, diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h index 398a514ee446..db54115be044 100644 --- a/include/uapi/rdma/bnxt_re-abi.h +++ b/include/uapi/rdma/bnxt_re-abi.h @@ -82,6 +82,15 @@ struct bnxt_re_qp_resp { __u32 rsvd; }; +struct bnxt_re_srq_req { + __u64 srqva; + __u64 srq_handle; +}; + +struct bnxt_re_srq_resp { + __u32 srqid; +}; + enum bnxt_re_shpg_offt { BNXT_RE_BEG_RESV_OFFT = 0x00, BNXT_RE_AVID_OFFT = 0x10, -- cgit v1.2.3 From 65567e41219888feec72fee1de98ccf1efbbc16d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 12 Jan 2018 15:11:58 -0800 Subject: RDMA/rxe: Fix a race condition in rxe_requester() The rxe driver works as follows: * The send queue, receive queue and completion queues are implemented as circular buffers. * ib_post_send() and ib_post_recv() calls are serialized through a spinlock. * Removing elements from various queues happens from tasklet context. Tasklets are guaranteed to run on at most one CPU. This serializes access to these queues. See also rxe_completer(), rxe_requester() and rxe_responder(). * rxe_completer() processes the skbs queued onto qp->resp_pkts. * rxe_requester() handles the send queue (qp->sq.queue). * rxe_responder() processes the skbs queued onto qp->req_pkts. Since rxe_drain_req_pkts() processes qp->req_pkts, calling rxe_drain_req_pkts() from rxe_requester() is racy. Hence this patch. 
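For context, the single-consumer rule described above can be illustrated with a minimal sketch (illustrative names only, not code from this patch): an skb queue that is drained by exactly one tasklet needs no serialization beyond the queue's own internal lock, which is precisely the invariant that calling rxe_drain_req_pkts() from rxe_requester() violated.

#include <linux/skbuff.h>

/* Minimal sketch, assuming the queue is owned by a single consumer
 * tasklet; drain_owned_queue() is an illustrative name, not part of
 * the rxe driver.
 */
static void drain_owned_queue(struct sk_buff_head *q)
{
	struct sk_buff *skb;

	/* skb_dequeue() takes the queue's internal lock; with exactly
	 * one consumer, no further locking is required.
	 */
	while ((skb = skb_dequeue(q)) != NULL)
		kfree_skb(skb);
}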
Reported-by: Moni Shoua Signed-off-by: Bart Van Assche Cc: stable@vger.kernel.org Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rxe/rxe_loc.h | 1 - drivers/infiniband/sw/rxe/rxe_req.c | 9 +-------- drivers/infiniband/sw/rxe/rxe_resp.c | 2 +- 3 files changed, 2 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index d7472a442a2c..96c3a6c5c4b5 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -237,7 +237,6 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq, void rxe_release(struct kref *kref); -void rxe_drain_req_pkts(struct rxe_qp *qp, bool notify); int rxe_completer(void *arg); int rxe_requester(void *arg); int rxe_responder(void *arg); diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 26a7f923045b..7bdaf71b8221 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -594,15 +594,8 @@ int rxe_requester(void *arg) rxe_add_ref(qp); next_wqe: - if (unlikely(!qp->valid)) { - rxe_drain_req_pkts(qp, true); + if (unlikely(!qp->valid || qp->req.state == QP_STATE_ERROR)) goto exit; - } - - if (unlikely(qp->req.state == QP_STATE_ERROR)) { - rxe_drain_req_pkts(qp, true); - goto exit; - } if (unlikely(qp->req.state == QP_STATE_RESET)) { qp->req.wqe_index = consumer_index(qp->sq.queue); diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index f07324f2cde2..d37bb9b97569 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -1209,7 +1209,7 @@ static enum resp_states do_class_d1e_error(struct rxe_qp *qp) } } -void rxe_drain_req_pkts(struct rxe_qp *qp, bool notify) +static void rxe_drain_req_pkts(struct rxe_qp *qp, bool notify) { struct sk_buff *skb; -- cgit v1.2.3 From bb3ffb7ad48a21e98a5c64eb21103a74fd9f03f6 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 12 Jan 2018 15:11:59 -0800 Subject: RDMA/rxe: Fix rxe_qp_cleanup() rxe_qp_cleanup() can sleep so it must be run in thread context and not in atomic context. This patch avoids that the following bug is triggered: Kernel BUG at 00000000560033f3 [verbose debug info unavailable] BUG: sleeping function called from invalid context at net/core/sock.c:2761 in_atomic(): 1, irqs_disabled(): 0, pid: 7, name: ksoftirqd/0 INFO: lockdep is turned off. 
Preemption disabled at:
[<00000000b6e69628>] __do_softirq+0x4e/0x540
CPU: 0 PID: 7 Comm: ksoftirqd/0 Not tainted 4.15.0-rc7-dbg+ #4
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/01/2014
Call Trace:
 dump_stack+0x85/0xbf
 ___might_sleep+0x177/0x260
 lock_sock_nested+0x1d/0x90
 inet_shutdown+0x2e/0xd0
 rxe_qp_cleanup+0x107/0x140 [rdma_rxe]
 rxe_elem_release+0x18/0x80 [rdma_rxe]
 rxe_requester+0x1cf/0x11b0 [rdma_rxe]
 rxe_do_task+0x78/0xf0 [rdma_rxe]
 tasklet_action+0x99/0x270
 __do_softirq+0xc0/0x540
 run_ksoftirqd+0x1c/0x70
 smpboot_thread_fn+0x1be/0x270
 kthread+0x117/0x130
 ret_from_fork+0x24/0x30

Signed-off-by: Bart Van Assche
Cc: Moni Shoua
Cc: stable@vger.kernel.org
Signed-off-by: Doug Ledford
---
 drivers/infiniband/sw/rxe/rxe_qp.c    | 12 ++++++++++--
 drivers/infiniband/sw/rxe/rxe_verbs.h |  3 +++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
index 4469592b839d..137d6c0c49d4 100644
--- a/drivers/infiniband/sw/rxe/rxe_qp.c
+++ b/drivers/infiniband/sw/rxe/rxe_qp.c
@@ -824,9 +824,9 @@ void rxe_qp_destroy(struct rxe_qp *qp)
 }
 
 /* called when the last reference to the qp is dropped */
-void rxe_qp_cleanup(struct rxe_pool_entry *arg)
+static void rxe_qp_do_cleanup(struct work_struct *work)
 {
-	struct rxe_qp *qp = container_of(arg, typeof(*qp), pelem);
+	struct rxe_qp *qp = container_of(work, typeof(*qp), cleanup_work.work);
 
 	rxe_drop_all_mcast_groups(qp);
 
@@ -859,3 +859,11 @@ void rxe_qp_cleanup(struct rxe_pool_entry *arg)
 	kernel_sock_shutdown(qp->sk, SHUT_RDWR);
 	sock_release(qp->sk);
 }
+
+/* called when the last reference to the qp is dropped */
+void rxe_qp_cleanup(struct rxe_pool_entry *arg)
+{
+	struct rxe_qp *qp = container_of(arg, typeof(*qp), pelem);
+
+	execute_in_process_context(rxe_qp_do_cleanup, &qp->cleanup_work);
+}
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index 0c2dbe45c729..1019f5e7dbdd 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -35,6 +35,7 @@
 #define RXE_VERBS_H
 
 #include <linux/interrupt.h>
+#include <linux/workqueue.h>
 #include <rdma/rdma_user_rxe.h>
 #include "rxe_pool.h"
 #include "rxe_task.h"
@@ -281,6 +282,8 @@ struct rxe_qp {
 	struct timer_list rnr_nak_timer;
 
 	spinlock_t state_lock; /* guard requester and completer */
+
+	struct execute_work cleanup_work;
 };
 
 enum rxe_mem_state {
-- cgit v1.2.3


From aaebd377c0e9c75bc95217b924e4456a392c65ed Mon Sep 17 00:00:00 2001
From: Max Gurtovoy
Date: Sun, 14 Jan 2018 17:07:48 +0200
Subject: IB/core: postpone WR initialization during queue drain

No need to initialize completion and WR in case we fail during QP
modification.
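Seen from a ULP, the drain helpers this patch touches are typically the last call made before a QP is destroyed; a hedged caller-side sketch (ulp_teardown_qp() is an assumed name, not an API introduced here):

#include <rdma/ib_verbs.h>

/* Sketch of a typical ULP teardown path, assuming the QP's CQs use a
 * non-IB_POLL_DIRECT polling context; ulp_teardown_qp() is an
 * illustrative name.
 */
static void ulp_teardown_qp(struct ib_qp *qp)
{
	/* Moves the QP to the error state, posts one marker WR per
	 * queue and waits for the flush completions, so every earlier
	 * WR has completed before the QP goes away.
	 */
	ib_drain_qp(qp);
	ib_destroy_qp(qp);
}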
Signed-off-by: Max Gurtovoy Acked-by: Sagi Grimberg Signed-off-by: Doug Ledford --- drivers/infiniband/core/verbs.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 49a2a764c0bf..e9c3991a93ff 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -2191,16 +2191,16 @@ static void __ib_drain_sq(struct ib_qp *qp) struct ib_send_wr swr = {}, *bad_swr; int ret; - swr.wr_cqe = &sdrain.cqe; - sdrain.cqe.done = ib_drain_qp_done; - init_completion(&sdrain.done); - ret = ib_modify_qp(qp, &attr, IB_QP_STATE); if (ret) { WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); return; } + swr.wr_cqe = &sdrain.cqe; + sdrain.cqe.done = ib_drain_qp_done; + init_completion(&sdrain.done); + ret = ib_post_send(qp, &swr, &bad_swr); if (ret) { WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); @@ -2225,16 +2225,16 @@ static void __ib_drain_rq(struct ib_qp *qp) struct ib_recv_wr rwr = {}, *bad_rwr; int ret; - rwr.wr_cqe = &rdrain.cqe; - rdrain.cqe.done = ib_drain_qp_done; - init_completion(&rdrain.done); - ret = ib_modify_qp(qp, &attr, IB_QP_STATE); if (ret) { WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); return; } + rwr.wr_cqe = &rdrain.cqe; + rdrain.cqe.done = ib_drain_qp_done; + init_completion(&rdrain.done); + ret = ib_post_recv(qp, &rwr, &bad_rwr); if (ret) { WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); -- cgit v1.2.3 From 246d8b184c100e8eb6b4e8c88f232c2ed2a4e672 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 14 Jan 2018 17:07:50 +0200 Subject: IB/cq: Don't force IB_POLL_DIRECT poll context for ib_process_cq_direct polling the completion queue directly does not interfere with the existing polling logic, hence drop the requirement. Be aware that running ib_process_cq_direct with non IB_POLL_DIRECT CQ may trigger concurrent CQ processing. This can be used for polling mode ULPs. Cc: Bart Van Assche Reported-by: Steve Wise Signed-off-by: Sagi Grimberg [maxg: added wcs array argument to __ib_process_cq] Signed-off-by: Max Gurtovoy Signed-off-by: Doug Ledford --- drivers/infiniband/core/cq.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index f2ae75fa3128..c8c5a5a7f433 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -25,9 +25,10 @@ #define IB_POLL_FLAGS \ (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) -static int __ib_process_cq(struct ib_cq *cq, int budget) +static int __ib_process_cq(struct ib_cq *cq, int budget, struct ib_wc *poll_wc) { int i, n, completed = 0; + struct ib_wc *wcs = poll_wc ? : cq->wc; /* * budget might be (-1) if the caller does not @@ -35,9 +36,9 @@ static int __ib_process_cq(struct ib_cq *cq, int budget) * minimum here. */ while ((n = ib_poll_cq(cq, min_t(u32, IB_POLL_BATCH, - budget - completed), cq->wc)) > 0) { + budget - completed), wcs)) > 0) { for (i = 0; i < n; i++) { - struct ib_wc *wc = &cq->wc[i]; + struct ib_wc *wc = &wcs[i]; if (wc->wr_cqe) wc->wr_cqe->done(cq, wc); @@ -60,18 +61,20 @@ static int __ib_process_cq(struct ib_cq *cq, int budget) * @cq: CQ to process * @budget: number of CQEs to poll for * - * This function is used to process all outstanding CQ entries on a - * %IB_POLL_DIRECT CQ. It does not offload CQ processing to a different - * context and does not ask for completion interrupts from the HCA. 
+ * This function is used to process all outstanding CQ entries. + * It does not offload CQ processing to a different context and does + * not ask for completion interrupts from the HCA. + * Using direct processing on CQ with non IB_POLL_DIRECT type may trigger + * concurrent processing. * * Note: do not pass -1 as %budget unless it is guaranteed that the number * of completions that will be processed is small. */ int ib_process_cq_direct(struct ib_cq *cq, int budget) { - WARN_ON_ONCE(cq->poll_ctx != IB_POLL_DIRECT); + struct ib_wc wcs[IB_POLL_BATCH]; - return __ib_process_cq(cq, budget); + return __ib_process_cq(cq, budget, wcs); } EXPORT_SYMBOL(ib_process_cq_direct); @@ -85,7 +88,7 @@ static int ib_poll_handler(struct irq_poll *iop, int budget) struct ib_cq *cq = container_of(iop, struct ib_cq, iop); int completed; - completed = __ib_process_cq(cq, budget); + completed = __ib_process_cq(cq, budget, NULL); if (completed < budget) { irq_poll_complete(&cq->iop); if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) @@ -105,7 +108,7 @@ static void ib_cq_poll_work(struct work_struct *work) struct ib_cq *cq = container_of(work, struct ib_cq, work); int completed; - completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE); + completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE, NULL); if (completed >= IB_POLL_BUDGET_WORKQUEUE || ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) queue_work(ib_comp_wq, &cq->work); -- cgit v1.2.3 From 24d33d2c8e92abffe1f0653d42fc65b8f164a6d9 Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Tue, 16 Jan 2018 20:08:40 +0200 Subject: net/mlx5e: Add clock info page to mlx5 core devices Adds a new page to mlx5 core containing clock info data that allows user level applications to translate between cqe timestamp to nanoseconds. The information stored into this page is represented through mlx5_ib_clock_info. In order to synchronize between kernel and user space a sequence number is incremented at the beginning and end of each update. An odd number means the data is being updated while an even means the access was already done. 
To guarantee that the data structure was accessed atomically, the user will:

repeat:
        seq1 = read(sign)
        goto repeat while seq1 is odd
        read the clock info fields
        seq2 = read(sign)
        if seq1 != seq2 goto repeat

Reviewed-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Reviewed-by: Alex Vesker Signed-off-by: Feras Daoud Signed-off-by: Eitan Rabin Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- .../net/ethernet/mellanox/mlx5/core/lib/clock.c | 55 ++++++++++++++++++++++ include/linux/mlx5/driver.h | 3 ++ include/uapi/rdma/mlx5-abi.h | 16 +++++++ 3 files changed, 74 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c index fa8aed62b231..4b6cb9b38686 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c @@ -31,6 +31,8 @@ */ #include +#include +#include #include "en.h" enum { @@ -71,6 +73,28 @@ static u64 read_internal_timer(const struct cyclecounter *cc) return mlx5_read_internal_timer(mdev) & cc->mask; } +static void mlx5_update_clock_info_page(struct mlx5_core_dev *mdev) +{ + struct mlx5_ib_clock_info *clock_info = mdev->clock_info; + struct mlx5_clock *clock = &mdev->clock; + u32 sign; + + if (!clock_info) + return; + + sign = smp_load_acquire(&clock_info->sign); + smp_store_mb(clock_info->sign, + sign | MLX5_IB_CLOCK_INFO_KERNEL_UPDATING); + + clock_info->cycles = clock->tc.cycle_last; + clock_info->mult = clock->cycles.mult; + clock_info->nsec = clock->tc.nsec; + clock_info->frac = clock->tc.frac; + + smp_store_release(&clock_info->sign, + sign + MLX5_IB_CLOCK_INFO_KERNEL_UPDATING * 2); +} + static void mlx5_pps_out(struct work_struct *work) { struct mlx5_pps *pps_info = container_of(work, struct mlx5_pps, @@ -109,6 +133,7 @@ static void mlx5_timestamp_overflow(struct work_struct *work) write_lock_irqsave(&clock->lock, flags); timecounter_read(&clock->tc); + mlx5_update_clock_info_page(clock->mdev); write_unlock_irqrestore(&clock->lock, flags); schedule_delayed_work(&clock->overflow_work, clock->overflow_period); } @@ -123,6 +148,7 @@ static int mlx5_ptp_settime(struct ptp_clock_info *ptp, write_lock_irqsave(&clock->lock, flags); timecounter_init(&clock->tc, &clock->cycles, ns); + mlx5_update_clock_info_page(clock->mdev); write_unlock_irqrestore(&clock->lock, flags); return 0; @@ -152,6 +178,7 @@ static int mlx5_ptp_adjtime(struct ptp_clock_info *ptp, s64 delta) write_lock_irqsave(&clock->lock, flags); timecounter_adjtime(&clock->tc, delta); + mlx5_update_clock_info_page(clock->mdev); write_unlock_irqrestore(&clock->lock, flags); return 0; @@ -179,6 +206,7 @@ static int mlx5_ptp_adjfreq(struct ptp_clock_info *ptp, s32 delta) timecounter_read(&clock->tc); clock->cycles.mult = neg_adj ?
clock->nominal_c_mult - diff : clock->nominal_c_mult + diff; + mlx5_update_clock_info_page(clock->mdev); write_unlock_irqrestore(&clock->lock, flags); return 0; @@ -470,6 +498,7 @@ void mlx5_init_clock(struct mlx5_core_dev *mdev) clock->cycles.shift); clock->nominal_c_mult = clock->cycles.mult; clock->cycles.mask = CLOCKSOURCE_MASK(41); + clock->mdev = mdev; timecounter_init(&clock->tc, &clock->cycles, ktime_to_ns(ktime_get_real())); @@ -482,6 +511,25 @@ void mlx5_init_clock(struct mlx5_core_dev *mdev) do_div(ns, NSEC_PER_SEC / 2 / HZ); clock->overflow_period = ns; + mdev->clock_info_page = alloc_page(GFP_KERNEL); + if (mdev->clock_info_page) { + mdev->clock_info = kmap(mdev->clock_info_page); + if (!mdev->clock_info) { + __free_page(mdev->clock_info_page); + mlx5_core_warn(mdev, "failed to map clock page\n"); + } else { + mdev->clock_info->sign = 0; + mdev->clock_info->nsec = clock->tc.nsec; + mdev->clock_info->cycles = clock->tc.cycle_last; + mdev->clock_info->mask = clock->cycles.mask; + mdev->clock_info->mult = clock->nominal_c_mult; + mdev->clock_info->shift = clock->cycles.shift; + mdev->clock_info->frac = clock->tc.frac; + mdev->clock_info->overflow_period = + clock->overflow_period; + } + } + INIT_WORK(&clock->pps_info.out_work, mlx5_pps_out); INIT_DELAYED_WORK(&clock->overflow_work, mlx5_timestamp_overflow); if (clock->overflow_period) @@ -521,5 +569,12 @@ void mlx5_cleanup_clock(struct mlx5_core_dev *mdev) cancel_work_sync(&clock->pps_info.out_work); cancel_delayed_work_sync(&clock->overflow_work); + + if (mdev->clock_info) { + kunmap(mdev->clock_info_page); + __free_page(mdev->clock_info_page); + mdev->clock_info = NULL; + } + kfree(clock->ptp_info.pin_config); } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 9136e35f2f7e..c403151133e9 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -797,6 +797,7 @@ struct mlx5_clock { u32 nominal_c_mult; unsigned long overflow_period; struct delayed_work overflow_work; + struct mlx5_core_dev *mdev; struct ptp_clock *ptp; struct ptp_clock_info ptp_info; struct mlx5_pps pps_info; @@ -844,6 +845,8 @@ struct mlx5_core_dev { struct cpu_rmap *rmap; #endif struct mlx5_clock clock; + struct mlx5_ib_clock_info *clock_info; + struct page *clock_info_page; }; struct mlx5_db { diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index f6d319dfc7bf..0299deed71a2 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -381,4 +381,20 @@ struct mlx5_ib_modify_wq { __u32 comp_mask; __u32 reserved; }; + +struct mlx5_ib_clock_info { + __u32 sign; + __u32 resv; + __u64 nsec; + __u64 cycles; + __u64 frac; + __u32 mult; + __u32 shift; + __u64 mask; + __u64 overflow_period; +}; + +enum { + MLX5_IB_CLOCK_INFO_KERNEL_UPDATING = 1, +}; #endif /* MLX5_ABI_USER_H */ -- cgit v1.2.3 From 5c99eaecb1fce76e86cf74020624e36fbb63c3bf Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Tue, 16 Jan 2018 20:08:41 +0200 Subject: IB/mlx5: Mmap the HCA's clock info to user-space This patch maps the new page to user space applications to allow converting a user space completion timestamp to system wall time at the lowest possible latency cost. By using a versioning scheme we allow compatibility between current and future userspace libraries. The change moves mlx5_ib_mmap_cmd enum from mlx5_ib.h to the abi header file mlx5-abi.h. 
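Taken together with the previous patch, a user-space consumer maps this page and reads it with the retry protocol described in the clock-info commit. A hypothetical reader loop follows; the install path of the uapi header, the volatile access, and the __sync_synchronize() barriers are illustrative choices here, not something mandated by the ABI:

	#include <stdint.h>
	#include <rdma/mlx5-abi.h>	/* assumed install path of the uapi header */

	static void read_clock_snapshot(volatile struct mlx5_ib_clock_info *ci,
					uint64_t *cycles, uint64_t *nsec,
					uint64_t *frac)
	{
		uint32_t seq1, seq2;

		for (;;) {
			seq1 = ci->sign;
			if (seq1 & MLX5_IB_CLOCK_INFO_KERNEL_UPDATING)
				continue;	/* odd: kernel update in flight */
			__sync_synchronize();	/* order field reads after seq1 */
			*cycles = ci->cycles;
			*nsec   = ci->nsec;
			*frac   = ci->frac;
			__sync_synchronize();	/* order seq2 after field reads */
			seq2 = ci->sign;
			if (seq1 == seq2)
				return;		/* consistent snapshot */
		}
	}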
Reviewed-by: Alex Vesker Reviewed-by: Yishai Hadas Signed-off-by: Feras Daoud Signed-off-by: Eitan Rabin Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 44 ++++++++++++++++++++++++++++++++++-- drivers/infiniband/hw/mlx5/mlx5_ib.h | 10 -------- include/uapi/rdma/mlx5-abi.h | 18 ++++++++++++++- 3 files changed, 59 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 91e6b42798e5..f282e281eff2 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1579,6 +1579,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_alloc_ucontext_req_v2 req = {}; struct mlx5_ib_alloc_ucontext_resp resp = {}; + struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_ib_ucontext *context; struct mlx5_bfreg_info *bfregi; int ver; @@ -1706,6 +1707,12 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, resp.response_length += sizeof(resp.eth_min_inline); } + if (field_avail(typeof(resp), clock_info_versions, udata->outlen)) { + if (mdev->clock_info) + resp.clock_info_versions = BIT(MLX5_IB_CLOCK_INFO_V1); + resp.response_length += sizeof(resp.clock_info_versions); + } + /* * We don't want to expose information from the PCI bar that is located * after 4096 bytes, so if the arch only supports larger pages, let's @@ -1719,8 +1726,7 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, resp.hca_core_clock_offset = offsetof(struct mlx5_init_seg, internal_timer_h) % PAGE_SIZE; } - resp.response_length += sizeof(resp.hca_core_clock_offset) + - sizeof(resp.reserved2); + resp.response_length += sizeof(resp.hca_core_clock_offset); } if (field_avail(typeof(resp), log_uar_size, udata->outlen)) @@ -1959,6 +1965,38 @@ static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd) } } +static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev, + struct vm_area_struct *vma, + struct mlx5_ib_ucontext *context) +{ + phys_addr_t pfn; + int err; + + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + if (get_index(vma->vm_pgoff) != MLX5_IB_CLOCK_INFO_V1) + return -EOPNOTSUPP; + + if (vma->vm_flags & VM_WRITE) + return -EPERM; + + if (!dev->mdev->clock_info_page) + return -EOPNOTSUPP; + + pfn = page_to_pfn(dev->mdev->clock_info_page); + err = remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, + vma->vm_page_prot); + if (err) + return err; + + mlx5_ib_dbg(dev, "mapped clock info at 0x%lx, PA 0x%llx\n", + vma->vm_start, + (unsigned long long)pfn << PAGE_SHIFT); + + return mlx5_ib_set_vma_data(vma, context); +} + static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, struct vm_area_struct *vma, struct mlx5_ib_ucontext *context) @@ -2121,6 +2159,8 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm vma->vm_start, (unsigned long long)pfn << PAGE_SHIFT); break; + case MLX5_IB_MMAP_CLOCK_INFO: + return mlx5_ib_mmap_clock_info_page(dev, vma, context); default: return -EINVAL; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 51228dfcfbe7..69a80f7512f0 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -70,16 +70,6 @@ enum { MLX5_IB_MMAP_CMD_MASK = 0xff, }; -enum mlx5_ib_mmap_cmd { - MLX5_IB_MMAP_REGULAR_PAGE = 0, - MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1, - MLX5_IB_MMAP_WC_PAGE = 2, - MLX5_IB_MMAP_NC_PAGE = 3, - 
/* 5 is chosen in order to be compatible with old versions of libmlx5 */ - MLX5_IB_MMAP_CORE_CLOCK = 5, - MLX5_IB_MMAP_ALLOC_WC = 6, -}; - enum { MLX5_RES_SCAT_DATA32_CQE = 0x1, MLX5_RES_SCAT_DATA64_CQE = 0x2, diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index 0299deed71a2..1111aa4e7c1e 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -124,7 +124,7 @@ struct mlx5_ib_alloc_ucontext_resp { __u8 cqe_version; __u8 cmds_supp_uhw; __u8 eth_min_inline; - __u8 reserved2; + __u8 clock_info_versions; __u64 hca_core_clock_offset; __u32 log_uar_size; __u32 num_uars_per_page; @@ -394,7 +394,23 @@ struct mlx5_ib_clock_info { __u64 overflow_period; }; +enum mlx5_ib_mmap_cmd { + MLX5_IB_MMAP_REGULAR_PAGE = 0, + MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1, + MLX5_IB_MMAP_WC_PAGE = 2, + MLX5_IB_MMAP_NC_PAGE = 3, + /* 5 is chosen in order to be compatible with old versions of libmlx5 */ + MLX5_IB_MMAP_CORE_CLOCK = 5, + MLX5_IB_MMAP_ALLOC_WC = 6, + MLX5_IB_MMAP_CLOCK_INFO = 7, +}; + enum { MLX5_IB_CLOCK_INFO_KERNEL_UPDATING = 1, }; + +/* Bit indexes for the mlx5_alloc_ucontext_resp.clock_info_versions bitmap */ +enum { + MLX5_IB_CLOCK_INFO_V1 = 0, +}; #endif /* MLX5_ABI_USER_H */ -- cgit v1.2.3 From 0b5fe5c43ab67130d259d800fcc109c7340acb63 Mon Sep 17 00:00:00 2001 From: "weiyongjun (A)" Date: Wed, 17 Jan 2018 11:28:38 +0000 Subject: RDMA/hns: Remove unnecessary platform_get_resource() error check devm_ioremap_resource() already checks if the resource is NULL, so remove the unnecessary platform_get_resource() error check. Signed-off-by: Wei Yongjun Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index 833a305085ef..21ca9fa7c9d1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -4788,10 +4788,6 @@ static int hns_roce_get_cfg(struct hns_roce_dev *hr_dev) /* get the mapped register base address */ res = platform_get_resource(hr_dev->pdev, IORESOURCE_MEM, 0); - if (!res) { - dev_err(dev, "memory resource not found!\n"); - return -EINVAL; - } hr_dev->reg_base = devm_ioremap_resource(dev, res); if (IS_ERR(hr_dev->reg_base)) return PTR_ERR(hr_dev->reg_base); -- cgit v1.2.3 From 2a174df0c602dd2385c7e9efc6d0977c37fbd681 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 16 Jan 2018 10:39:42 -0800 Subject: IB/srp: Use kstrtoull() instead of simple_strtoull() Use kstrtoull() since simple_strtoull() is deprecated. This patch improves error checking but otherwise does not change any functionality. 
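The practical difference, illustrated with a made-up GUID string: kstrtoull() reports failure on malformed input, while simple_strtoull() silently parses a prefix and gives the caller no error indication:

	unsigned long long v;
	int ret;

	ret = kstrtoull("20030004fc3e4c0f", 16, &v); /* ret == 0, v holds the value   */
	ret = kstrtoull("20030004fc3e4c0g", 16, &v); /* ret == -EINVAL: trailing 'g'  */
	/*
	 * simple_strtoull("20030004fc3e4c0g", NULL, 16) would instead return
	 * the value parsed up to the 'g', leaving the typo undetected.
	 */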
Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srp/ib_srp.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 62d88212c1b0..39b3e43efbbe 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -3111,6 +3111,7 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) char *options, *sep_opt; char *p; substring_t args[MAX_OPT_ARGS]; + unsigned long long ull; int opt_mask = 0; int token; int ret = -EINVAL; @@ -3135,7 +3136,13 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) ret = -ENOMEM; goto out; } - target->id_ext = cpu_to_be64(simple_strtoull(p, NULL, 16)); + ret = kstrtoull(p, 16, &ull); + if (ret) { + pr_warn("invalid id_ext parameter '%s'\n", p); + kfree(p); + goto out; + } + target->id_ext = cpu_to_be64(ull); kfree(p); break; @@ -3145,7 +3152,13 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) ret = -ENOMEM; goto out; } - target->ioc_guid = cpu_to_be64(simple_strtoull(p, NULL, 16)); + ret = kstrtoull(p, 16, &ull); + if (ret) { + pr_warn("invalid ioc_guid parameter '%s'\n", p); + kfree(p); + goto out; + } + target->ioc_guid = cpu_to_be64(ull); kfree(p); break; @@ -3181,7 +3194,13 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) ret = -ENOMEM; goto out; } - target->service_id = cpu_to_be64(simple_strtoull(p, NULL, 16)); + ret = kstrtoull(p, 16, &ull); + if (ret) { + pr_warn("bad service_id parameter '%s'\n", p); + kfree(p); + goto out; + } + target->service_id = cpu_to_be64(ull); kfree(p); break; @@ -3235,7 +3254,13 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) ret = -ENOMEM; goto out; } - target->initiator_ext = cpu_to_be64(simple_strtoull(p, NULL, 16)); + ret = kstrtoull(p, 16, &ull); + if (ret) { + pr_warn("bad initiator_ext value '%s'\n", p); + kfree(p); + goto out; + } + target->initiator_ext = cpu_to_be64(ull); kfree(p); break; -- cgit v1.2.3 From 85769c6f3b850b2a9c6428be70ba55f339da6ae2 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 16 Jan 2018 10:39:43 -0800 Subject: IB/srp: Improve path record query error message Show all path record query parameters if a path record query fails. Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srp/ib_srp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 39b3e43efbbe..fec6e800adf4 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -702,7 +702,10 @@ static int srp_lookup_path(struct srp_rdma_ch *ch) ret = ch->status; if (ret < 0) shost_printk(KERN_WARNING, target->scsi_host, - PFX "Path record query failed\n"); + PFX "Path record query failed: sgid %pI6, dgid %pI6, pkey %#04x, service_id %#16llx\n", + ch->path.sgid.raw, ch->path.dgid.raw, + be16_to_cpu(target->pkey), + be64_to_cpu(target->service_id)); put: scsi_host_put(target->scsi_host); -- cgit v1.2.3 From 48900a28348f5fac37bd2afd9ef6810442353196 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 16 Jan 2018 10:39:44 -0800 Subject: IB/srp: Refactor srp_send_req() This patch does not change any functionality but prepares for the patch that adds RDMA_CM support by making the RDMA_CM patch much easier to read. 
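For orientation, the resulting 16-byte port-identifier layouts built by the two branches below; ipi and tpi are the local aliases this patch introduces:

	/*
	 * SRP rev. 10 I/O class:            published SRP (rev. 16a) class:
	 *   ipi[0..7]  = sgid interface id    ipi[0..7]  = initiator_ext
	 *   ipi[8..15] = initiator_ext        ipi[8..15] = sgid interface id
	 *   tpi[0..7]  = ioc_guid             tpi[0..7]  = id_ext
	 *   tpi[8..15] = id_ext               tpi[8..15] = ioc_guid
	 */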
Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srp/ib_srp.c | 65 +++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index fec6e800adf4..9472f5989d27 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -738,35 +738,21 @@ static int srp_send_req(struct srp_rdma_ch *ch, bool multich) struct ib_cm_req_param param; struct srp_login_req priv; } *req = NULL; + char *ipi, *tpi; int status; - u8 subnet_timeout; - - subnet_timeout = srp_get_subnet_timeout(target->srp_host); req = kzalloc(sizeof *req, GFP_KERNEL); if (!req) return -ENOMEM; - req->param.primary_path = &ch->path; - req->param.alternate_path = NULL; - req->param.service_id = target->service_id; - req->param.qp_num = ch->qp->qp_num; - req->param.qp_type = ch->qp->qp_type; - req->param.private_data = &req->priv; - req->param.private_data_len = sizeof req->priv; req->param.flow_control = 1; - - get_random_bytes(&req->param.starting_psn, 4); - req->param.starting_psn &= 0xffffff; + req->param.retry_count = target->tl_retry_count; /* * Pick some arbitrary defaults here; we could make these * module parameters if anyone cared about setting them. */ req->param.responder_resources = 4; - req->param.remote_cm_response_timeout = subnet_timeout + 2; - req->param.local_cm_response_timeout = subnet_timeout + 2; - req->param.retry_count = target->tl_retry_count; req->param.rnr_retry_count = 7; req->param.max_cm_retries = 15; @@ -777,6 +763,28 @@ static int srp_send_req(struct srp_rdma_ch *ch, bool multich) SRP_BUF_FORMAT_INDIRECT); req->priv.req_flags = (multich ? SRP_MULTICHAN_MULTI : SRP_MULTICHAN_SINGLE); + + { + u8 subnet_timeout; + + subnet_timeout = srp_get_subnet_timeout(target->srp_host); + + req->param.primary_path = &ch->path; + req->param.alternate_path = NULL; + req->param.service_id = target->service_id; + get_random_bytes(&req->param.starting_psn, 4); + req->param.starting_psn &= 0xffffff; + req->param.qp_num = ch->qp->qp_num; + req->param.qp_type = ch->qp->qp_type; + req->param.local_cm_response_timeout = subnet_timeout + 2; + req->param.remote_cm_response_timeout = subnet_timeout + 2; + req->param.private_data = &req->priv; + req->param.private_data_len = sizeof(req->priv); + + ipi = req->priv.initiator_port_id; + tpi = req->priv.target_port_id; + } + /* * In the published SRP specification (draft rev. 16a), the * port identifier format is 8 bytes of ID extension followed @@ -787,19 +795,15 @@ static int srp_send_req(struct srp_rdma_ch *ch, bool multich) * recognized by the I/O Class they report. 
*/ if (target->io_class == SRP_REV10_IB_IO_CLASS) { - memcpy(req->priv.initiator_port_id, - &target->sgid.global.interface_id, 8); - memcpy(req->priv.initiator_port_id + 8, - &target->initiator_ext, 8); - memcpy(req->priv.target_port_id, &target->ioc_guid, 8); - memcpy(req->priv.target_port_id + 8, &target->id_ext, 8); + memcpy(ipi, &target->sgid.global.interface_id, 8); + memcpy(ipi + 8, &target->initiator_ext, 8); + memcpy(tpi, &target->ioc_guid, 8); + memcpy(tpi + 8, &target->id_ext, 8); } else { - memcpy(req->priv.initiator_port_id, - &target->initiator_ext, 8); - memcpy(req->priv.initiator_port_id + 8, - &target->sgid.global.interface_id, 8); - memcpy(req->priv.target_port_id, &target->id_ext, 8); - memcpy(req->priv.target_port_id + 8, &target->ioc_guid, 8); + memcpy(ipi, &target->initiator_ext, 8); + memcpy(ipi + 8, &target->sgid.global.interface_id, 8); + memcpy(tpi, &target->id_ext, 8); + memcpy(tpi + 8, &target->ioc_guid, 8); } /* @@ -812,9 +816,8 @@ static int srp_send_req(struct srp_rdma_ch *ch, bool multich) PFX "Topspin/Cisco initiator port ID workaround " "activated for target GUID %016llx\n", be64_to_cpu(target->ioc_guid)); - memset(req->priv.initiator_port_id, 0, 8); - memcpy(req->priv.initiator_port_id + 8, - &target->srp_host->srp_dev->dev->node_guid, 8); + memset(ipi, 0, 8); + memcpy(ipi + 8, &target->srp_host->srp_dev->dev->node_guid, 8); } status = ib_send_cm_req(ch->cm_id, &req->param); -- cgit v1.2.3 From 795bc112cd5a40e7612313f54ee527198b1e734f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 16 Jan 2018 16:14:05 -0800 Subject: IB/srpt: Make it safe to use RCU for srpt_device.rch_list The next patch will iterate over rch_list from a context from which it is not allowed to block. Hence make rch_list RCU-safe. Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 13 ++++++++++--- drivers/infiniband/ulp/srpt/ib_srpt.h | 2 ++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index d78f60dcc2ba..4dd15378bc7c 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -1906,7 +1906,7 @@ static void srpt_free_ch(struct kref *kref) { struct srpt_rdma_ch *ch = container_of(kref, struct srpt_rdma_ch, kref); - kfree(ch); + kfree_rcu(ch, rcu); } static void srpt_release_channel_work(struct work_struct *w) @@ -1945,11 +1945,17 @@ static void srpt_release_channel_work(struct work_struct *w) srp_max_req_size, DMA_FROM_DEVICE); mutex_lock(&sdev->mutex); - list_del_init(&ch->list); + list_del_rcu(&ch->list); if (ch->release_done) complete(ch->release_done); mutex_unlock(&sdev->mutex); + synchronize_rcu(); + + mutex_lock(&sdev->mutex); + INIT_LIST_HEAD(&ch->list); + mutex_unlock(&sdev->mutex); + wake_up(&sdev->ch_releaseQ); kref_put(&ch->kref, srpt_free_ch); @@ -2064,6 +2070,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, goto reject; } + init_rcu_head(&ch->rcu); kref_init(&ch->kref); ch->zw_cqe.done = srpt_zerolength_write_done; INIT_WORK(&ch->release_work, srpt_release_channel_work); @@ -2190,7 +2197,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, } mutex_lock(&sdev->mutex); - list_add_tail(&ch->list, &sdev->rch_list); + list_add_tail_rcu(&ch->list, &sdev->rch_list); mutex_unlock(&sdev->mutex); goto out; diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index 11ce8c94a051..0ab59c60f2ef 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ 
b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -244,6 +244,7 @@ enum rdma_ch_state { * @qp: IB queue pair used for communicating over this channel. * @cq: IB completion queue for this channel. * @zw_cqe: Zero-length write CQE. + * @rcu: RCU head. * @kref: kref for this channel. * @rq_size: IB receive queue size. * @max_rsp_size: Maximum size of an RSP response message in bytes. @@ -276,6 +277,7 @@ struct srpt_rdma_ch { struct ib_qp *qp; struct ib_cq *cq; struct ib_cqe zw_cqe; + struct rcu_head rcu; struct kref kref; int rq_size; u32 max_rsp_size; -- cgit v1.2.3 From 4413834452a65dd322aeeb8da3b4da58b3daa73b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 16 Jan 2018 16:14:06 -0800 Subject: IB/srpt: Rework srpt_disconnect_ch_sync() This patch fixes a use-after-free issue for ch->release_done when running the SRP protocol on top of the rdma_rxe driver. Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 45 ++++++++++++++++++----------------- drivers/infiniband/ulp/srpt/ib_srpt.h | 2 -- 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 4dd15378bc7c..5386b993daf9 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -1841,6 +1841,23 @@ static int srpt_disconnect_ch(struct srpt_rdma_ch *ch) return ret; } +static bool srpt_ch_closed(struct srpt_device *sdev, struct srpt_rdma_ch *ch) +{ + struct srpt_rdma_ch *ch2; + bool res = true; + + rcu_read_lock(); + list_for_each_entry(ch2, &sdev->rch_list, list) { + if (ch2 == ch) { + res = false; + break; + } + } + rcu_read_unlock(); + + return res; +} + /* * Send DREQ and wait for DREP. Return true if and only if this function * changed the state of @ch. 
@@ -1848,31 +1865,24 @@ static int srpt_disconnect_ch(struct srpt_rdma_ch *ch) static bool srpt_disconnect_ch_sync(struct srpt_rdma_ch *ch) __must_hold(&sdev->mutex) { - DECLARE_COMPLETION_ONSTACK(release_done); struct srpt_device *sdev = ch->sport->sdev; - bool wait; + int ret; lockdep_assert_held(&sdev->mutex); pr_debug("ch %s-%d state %d\n", ch->sess_name, ch->qp->qp_num, ch->state); - WARN_ON(ch->release_done); - ch->release_done = &release_done; - wait = !list_empty(&ch->list); - srpt_disconnect_ch(ch); + ret = srpt_disconnect_ch(ch); mutex_unlock(&sdev->mutex); - if (!wait) - goto out; - - while (wait_for_completion_timeout(&release_done, 180 * HZ) == 0) + while (wait_event_timeout(sdev->ch_releaseQ, srpt_ch_closed(sdev, ch), + 5 * HZ) == 0) pr_info("%s(%s-%d state %d): still waiting ...\n", __func__, ch->sess_name, ch->qp->qp_num, ch->state); -out: mutex_lock(&sdev->mutex); - return wait; + return ret == 0; } static void srpt_set_enabled(struct srpt_port *sport, bool enabled) @@ -1916,8 +1926,7 @@ static void srpt_release_channel_work(struct work_struct *w) struct se_session *se_sess; ch = container_of(w, struct srpt_rdma_ch, release_work); - pr_debug("%s: %s-%d; release_done = %p\n", __func__, ch->sess_name, - ch->qp->qp_num, ch->release_done); + pr_debug("%s-%d\n", ch->sess_name, ch->qp->qp_num); sdev = ch->sport->sdev; BUG_ON(!sdev); @@ -1946,14 +1955,6 @@ static void srpt_release_channel_work(struct work_struct *w) mutex_lock(&sdev->mutex); list_del_rcu(&ch->list); - if (ch->release_done) - complete(ch->release_done); - mutex_unlock(&sdev->mutex); - - synchronize_rcu(); - - mutex_lock(&sdev->mutex); - INIT_LIST_HEAD(&ch->list); mutex_unlock(&sdev->mutex); wake_up(&sdev->ch_releaseQ); diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index 0ab59c60f2ef..67248338b4c9 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -270,7 +270,6 @@ enum rdma_ch_state { * @sess_name: Session name. * @ini_guid: Initiator port GUID. * @release_work: Allows scheduling of srpt_release_channel(). - * @release_done: Enables waiting for srpt_release_channel() completion. */ struct srpt_rdma_ch { struct ib_cm_id *cm_id; @@ -299,7 +298,6 @@ struct srpt_rdma_ch { u8 sess_name[36]; u8 ini_guid[24]; struct work_struct release_work; - struct completion *release_done; }; /** -- cgit v1.2.3 From c5efb62148a4bf2865d76c43c012a3329f5b5b8c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 16 Jan 2018 16:14:07 -0800 Subject: IB/srpt: Add P_Key support Process connection requests that use another P_Key than the default correctly. 
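In short, the patch records the wire P_Key from the CM request and resolves it to a port table index when the QP is initialized. A condensed sketch of the two hunks below:

	/* At connection setup (srpt_cm_req_recv()): */
	ch->pkey = be16_to_cpu(param->primary_path->pkey);

	/* When moving the QP to INIT (srpt_init_ch_qp()): */
	ret = ib_find_cached_pkey(ch->sport->sdev->device, ch->sport->port,
				  ch->pkey, &attr->pkey_index);
	if (ret < 0)
		pr_err("Translating pkey %#x failed (%d) - using index 0\n",
		       ch->pkey, ret);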
Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 14 +++++++++++--- drivers/infiniband/ulp/srpt/ib_srpt.h | 2 ++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 5386b993daf9..cafa73083ee8 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -1057,7 +1058,12 @@ static int srpt_init_ch_qp(struct srpt_rdma_ch *ch, struct ib_qp *qp) attr->qp_state = IB_QPS_INIT; attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE; attr->port_num = ch->sport->port; - attr->pkey_index = 0; + + ret = ib_find_cached_pkey(ch->sport->sdev->device, ch->sport->port, + ch->pkey, &attr->pkey_index); + if (ret < 0) + pr_err("Translating pkey %#x failed (%d) - using index 0\n", + ch->pkey, ret); ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PORT | @@ -1994,9 +2000,10 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, it_iu_len = be32_to_cpu(req->req_it_iu_len); - pr_info("Received SRP_LOGIN_REQ with i_port_id %pI6, t_port_id %pI6 and it_iu_len %d on port %d (guid=%pI6)\n", + pr_info("Received SRP_LOGIN_REQ with i_port_id %pI6, t_port_id %pI6 and it_iu_len %d on port %d (guid=%pI6); pkey %#04x\n", req->initiator_port_id, req->target_port_id, it_iu_len, - param->port, &sport->gid); + param->port, &sport->gid, + be16_to_cpu(param->primary_path->pkey)); rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); rej = kzalloc(sizeof(*rej), GFP_KERNEL); @@ -2073,6 +2080,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, init_rcu_head(&ch->rcu); kref_init(&ch->kref); + ch->pkey = be16_to_cpu(param->primary_path->pkey); ch->zw_cqe.done = srpt_zerolength_write_done; INIT_WORK(&ch->release_work, srpt_release_channel_work); memcpy(ch->i_port_id, req->initiator_port_id, 16); diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index 67248338b4c9..f830968e7fd4 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -266,6 +266,7 @@ enum rdma_ch_state { * @cmd_wait_list: List of SCSI commands that arrived before the RTU event. This * list contains struct srpt_ioctx elements and is protected * against concurrent modification by the cm_id spinlock. + * @pkey: P_Key of the IB partition for this SRP channel. * @sess: Session information associated with this SRP channel. * @sess_name: Session name. * @ini_guid: Initiator port GUID. @@ -294,6 +295,7 @@ struct srpt_rdma_ch { struct srpt_recv_ioctx **ioctx_recv_ring; struct list_head list; struct list_head cmd_wait_list; + uint16_t pkey; struct se_session *sess; u8 sess_name[36]; u8 ini_guid[24]; -- cgit v1.2.3 From ba60c84f82f6facfe66688073458f233e4e5ba51 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 16 Jan 2018 16:14:08 -0800 Subject: IB/srpt: One target per port In multipathing setups where a target system is equipped with dual-port HCAs it is useful to have one connection per target port instead of one connection per target HCA. Hence move the connection list (rch_list) from struct srpt_device into struct srpt_port. 
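The net effect on data-structure ownership, sketched from the hunks below:

	/*
	 * Before: srpt_device owned one rch_list/ch_releaseQ/mutex for the
	 * whole HCA. After: each port owns its own set, and srpt_device
	 * keeps only sdev_mutex for serializing use_srq changes:
	 *
	 *   srpt_device
	 *     port[0]: { rch_list, ch_releaseQ, mutex }
	 *     port[1]: { rch_list, ch_releaseQ, mutex }
	 */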
Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 98 +++++++++++++++++++---------------- drivers/infiniband/ulp/srpt/ib_srpt.h | 16 +++--- 2 files changed, 61 insertions(+), 53 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index cafa73083ee8..7893fc420794 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -1847,13 +1847,13 @@ static int srpt_disconnect_ch(struct srpt_rdma_ch *ch) return ret; } -static bool srpt_ch_closed(struct srpt_device *sdev, struct srpt_rdma_ch *ch) +static bool srpt_ch_closed(struct srpt_port *sport, struct srpt_rdma_ch *ch) { struct srpt_rdma_ch *ch2; bool res = true; rcu_read_lock(); - list_for_each_entry(ch2, &sdev->rch_list, list) { + list_for_each_entry(ch2, &sport->rch_list, list) { if (ch2 == ch) { res = false; break; @@ -1871,33 +1871,32 @@ static bool srpt_ch_closed(struct srpt_device *sdev, struct srpt_rdma_ch *ch) static bool srpt_disconnect_ch_sync(struct srpt_rdma_ch *ch) __must_hold(&sdev->mutex) { - struct srpt_device *sdev = ch->sport->sdev; + struct srpt_port *sport = ch->sport; int ret; - lockdep_assert_held(&sdev->mutex); + lockdep_assert_held(&sport->mutex); pr_debug("ch %s-%d state %d\n", ch->sess_name, ch->qp->qp_num, ch->state); ret = srpt_disconnect_ch(ch); - mutex_unlock(&sdev->mutex); + mutex_unlock(&sport->mutex); - while (wait_event_timeout(sdev->ch_releaseQ, srpt_ch_closed(sdev, ch), + while (wait_event_timeout(sport->ch_releaseQ, srpt_ch_closed(sport, ch), 5 * HZ) == 0) pr_info("%s(%s-%d state %d): still waiting ...\n", __func__, ch->sess_name, ch->qp->qp_num, ch->state); - mutex_lock(&sdev->mutex); + mutex_lock(&sport->mutex); return ret == 0; } static void srpt_set_enabled(struct srpt_port *sport, bool enabled) - __must_hold(&sdev->mutex) + __must_hold(&sport->mutex) { - struct srpt_device *sdev = sport->sdev; struct srpt_rdma_ch *ch; - lockdep_assert_held(&sdev->mutex); + lockdep_assert_held(&sport->mutex); if (sport->enabled == enabled) return; @@ -1906,10 +1905,10 @@ static void srpt_set_enabled(struct srpt_port *sport, bool enabled) return; again: - list_for_each_entry(ch, &sdev->rch_list, list) { + list_for_each_entry(ch, &sport->rch_list, list) { if (ch->sport == sport) { pr_info("%s: closing channel %s-%d\n", - sdev->device->name, ch->sess_name, + sport->sdev->device->name, ch->sess_name, ch->qp->qp_num); if (srpt_disconnect_ch_sync(ch)) goto again; @@ -1929,6 +1928,7 @@ static void srpt_release_channel_work(struct work_struct *w) { struct srpt_rdma_ch *ch; struct srpt_device *sdev; + struct srpt_port *sport; struct se_session *se_sess; ch = container_of(w, struct srpt_rdma_ch, release_work); @@ -1959,11 +1959,12 @@ static void srpt_release_channel_work(struct work_struct *w) sdev, ch->rq_size, srp_max_req_size, DMA_FROM_DEVICE); - mutex_lock(&sdev->mutex); + sport = ch->sport; + mutex_lock(&sport->mutex); list_del_rcu(&ch->list); - mutex_unlock(&sdev->mutex); + mutex_unlock(&sport->mutex); - wake_up(&sdev->ch_releaseQ); + wake_up(&sport->ch_releaseQ); kref_put(&ch->kref, srpt_free_ch); } @@ -2036,9 +2037,9 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, if ((req->req_flags & SRP_MTCH_ACTION) == SRP_MULTICHAN_SINGLE) { rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_NO_CHAN; - mutex_lock(&sdev->mutex); + mutex_lock(&sport->mutex); - list_for_each_entry_safe(ch, tmp_ch, &sdev->rch_list, list) { + list_for_each_entry_safe(ch, tmp_ch, &sport->rch_list, list) { if 
(!memcmp(ch->i_port_id, req->initiator_port_id, 16) && !memcmp(ch->t_port_id, req->target_port_id, 16) && param->port == ch->sport->port @@ -2053,7 +2054,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, } } - mutex_unlock(&sdev->mutex); + mutex_unlock(&sport->mutex); } else rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_MAINTAINED; @@ -2205,9 +2206,9 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, goto release_channel; } - mutex_lock(&sdev->mutex); - list_add_tail_rcu(&ch->list, &sdev->rch_list); - mutex_unlock(&sdev->mutex); + mutex_lock(&sport->mutex); + list_add_tail_rcu(&ch->list, &sport->rch_list); + mutex_unlock(&sport->mutex); goto out; @@ -2559,24 +2560,21 @@ static void srpt_refresh_port_work(struct work_struct *work) } /** - * srpt_release_sdev - disable login and wait for associated channels - * @sdev: SRPT HCA pointer. + * srpt_release_sport - disable login and wait for associated channels + * @sport: SRPT HCA port. */ -static int srpt_release_sdev(struct srpt_device *sdev) +static int srpt_release_sport(struct srpt_port *sport) { - int i, res; + int res; WARN_ON_ONCE(irqs_disabled()); - BUG_ON(!sdev); - - mutex_lock(&sdev->mutex); - for (i = 0; i < ARRAY_SIZE(sdev->port); i++) - srpt_set_enabled(&sdev->port[i], false); - mutex_unlock(&sdev->mutex); + mutex_lock(&sport->mutex); + srpt_set_enabled(sport, false); + mutex_unlock(&sport->mutex); - res = wait_event_interruptible(sdev->ch_releaseQ, - list_empty_careful(&sdev->rch_list)); + res = wait_event_interruptible(sport->ch_releaseQ, + list_empty_careful(&sport->rch_list)); if (res) pr_err("%s: interrupted.\n", __func__); @@ -2704,9 +2702,7 @@ static void srpt_add_one(struct ib_device *device) goto err; sdev->device = device; - INIT_LIST_HEAD(&sdev->rch_list); - init_waitqueue_head(&sdev->ch_releaseQ); - mutex_init(&sdev->mutex); + mutex_init(&sdev->sdev_mutex); sdev->pd = ib_alloc_pd(device, 0); if (IS_ERR(sdev->pd)) @@ -2747,6 +2743,9 @@ static void srpt_add_one(struct ib_device *device) for (i = 1; i <= sdev->device->phys_port_cnt; i++) { sport = &sdev->port[i - 1]; + INIT_LIST_HEAD(&sport->rch_list); + init_waitqueue_head(&sport->ch_releaseQ); + mutex_init(&sport->mutex); sport->sdev = sdev; sport->port = i; sport->port_attrib.srp_max_rdma_size = DEFAULT_MAX_RDMA_SIZE; @@ -2819,7 +2818,9 @@ static void srpt_remove_one(struct ib_device *device, void *client_data) spin_lock(&srpt_dev_lock); list_del(&sdev->list); spin_unlock(&srpt_dev_lock); - srpt_release_sdev(sdev); + + for (i = 0; i < sdev->device->phys_port_cnt; i++) + srpt_release_sport(&sdev->port[i]); srpt_free_srq(sdev); @@ -2905,11 +2906,11 @@ static void srpt_release_cmd(struct se_cmd *se_cmd) static void srpt_close_session(struct se_session *se_sess) { struct srpt_rdma_ch *ch = se_sess->fabric_sess_ptr; - struct srpt_device *sdev = ch->sport->sdev; + struct srpt_port *sport = ch->sport; - mutex_lock(&sdev->mutex); + mutex_lock(&sport->mutex); srpt_disconnect_ch_sync(ch); - mutex_unlock(&sdev->mutex); + mutex_unlock(&sport->mutex); } /** @@ -3134,18 +3135,24 @@ static ssize_t srpt_tpg_attrib_use_srq_store(struct config_item *item, if (val != !!val) return -EINVAL; - ret = mutex_lock_interruptible(&sdev->mutex); + ret = mutex_lock_interruptible(&sdev->sdev_mutex); if (ret < 0) return ret; + ret = mutex_lock_interruptible(&sport->mutex); + if (ret < 0) + goto unlock_sdev; enabled = sport->enabled; /* Log out all initiator systems before changing 'use_srq'. 
*/ srpt_set_enabled(sport, false); sport->port_attrib.use_srq = val; srpt_use_srq(sdev, sport->port_attrib.use_srq); srpt_set_enabled(sport, enabled); - mutex_unlock(&sdev->mutex); + ret = count; + mutex_unlock(&sport->mutex); +unlock_sdev: + mutex_unlock(&sdev->sdev_mutex); - return count; + return ret; } CONFIGFS_ATTR(srpt_tpg_attrib_, srp_max_rdma_size); @@ -3174,7 +3181,6 @@ static ssize_t srpt_tpg_enable_store(struct config_item *item, { struct se_portal_group *se_tpg = to_tpg(item); struct srpt_port *sport = srpt_tpg_to_sport(se_tpg); - struct srpt_device *sdev = sport->sdev; unsigned long tmp; int ret; @@ -3189,9 +3195,9 @@ static ssize_t srpt_tpg_enable_store(struct config_item *item, return -EINVAL; } - mutex_lock(&sdev->mutex); + mutex_lock(&sport->mutex); srpt_set_enabled(sport, tmp); - mutex_unlock(&sdev->mutex); + mutex_unlock(&sport->mutex); return count; } diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index f830968e7fd4..1434f0cd45f7 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -262,7 +262,7 @@ enum rdma_ch_state { * @state: channel state. See also enum rdma_ch_state. * @ioctx_ring: Send ring. * @ioctx_recv_ring: Receive I/O context ring. - * @list: Node for insertion in the srpt_device.rch_list list. + * @list: Node in srpt_port.rch_list. * @cmd_wait_list: List of SCSI commands that arrived before the RTU event. This * list contains struct srpt_ioctx elements and is protected * against concurrent modification by the cm_id spinlock. @@ -334,6 +334,9 @@ struct srpt_port_attrib { * @port_gid_tpg: TPG associated with target port GID. * @port_gid_wwn: WWN associated with target port GID. * @port_attrib: Port attributes that can be accessed through configfs. + * @ch_releaseQ: Enables waiting for removal from rch_list. + * @mutex: Protects rch_list. + * @rch_list: Channel list. See also srpt_rdma_ch.list. */ struct srpt_port { struct srpt_device *sdev; @@ -351,6 +354,9 @@ struct srpt_port { struct se_portal_group port_gid_tpg; struct se_wwn port_gid_wwn; struct srpt_port_attrib port_attrib; + wait_queue_head_t ch_releaseQ; + struct mutex mutex; + struct list_head rch_list; }; /** @@ -361,11 +367,9 @@ struct srpt_port { * @srq: Per-HCA SRQ (shared receive queue). * @cm_id: Connection identifier. * @srq_size: SRQ size. + * @sdev_mutex: Serializes use_srq changes. * @use_srq: Whether or not to use SRQ. * @ioctx_ring: Per-HCA SRQ. - * @rch_list: Per-device channel list -- see also srpt_rdma_ch.list. - * @ch_releaseQ: Enables waiting for removal from rch_list. - * @mutex: Protects rch_list. * @port: Information about the ports owned by this HCA. * @event_handler: Per-HCA asynchronous IB event handler. * @list: Node in srpt_dev_list. @@ -377,11 +381,9 @@ struct srpt_device { struct ib_srq *srq; struct ib_cm_id *cm_id; int srq_size; + struct mutex sdev_mutex; bool use_srq; struct srpt_recv_ioctx **ioctx_ring; - struct list_head rch_list; - wait_queue_head_t ch_releaseQ; - struct mutex mutex; struct srpt_port port[2]; struct ib_event_handler event_handler; struct list_head list; -- cgit v1.2.3 From 2dc98f09f9e68fea37ca67b5b4a406f5906c6d8b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 16 Jan 2018 16:14:09 -0800 Subject: IB/srpt: Use the source GID as session name Use the source GID as session name instead of the initiator port ID from the SRP login request. The only functional change in this patch is that it changes the session name shown in debug messages. 
Note: the fifth argument that is passed to target_alloc_session() is what the SCSI target core uses as key for lookups in the ACL (access control list) information. Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 11 ++++++----- drivers/infiniband/ulp/srpt/ib_srpt.h | 4 +--- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 7893fc420794..597fc0bc0734 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -1989,6 +1989,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, struct srp_login_rej *rej; struct ib_cm_rep_param *rep_param; struct srpt_rdma_ch *ch, *tmp_ch; + char i_port_id[36]; u32 it_iu_len; int i, ret = 0; @@ -2143,9 +2144,9 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, goto destroy_ib; } - srpt_format_guid(ch->ini_guid, sizeof(ch->ini_guid), + srpt_format_guid(ch->sess_name, sizeof(ch->sess_name), ¶m->primary_path->dgid.global.interface_id); - snprintf(ch->sess_name, sizeof(ch->sess_name), "0x%016llx%016llx", + snprintf(i_port_id, sizeof(i_port_id), "0x%016llx%016llx", be64_to_cpu(*(__be64 *)ch->i_port_id), be64_to_cpu(*(__be64 *)(ch->i_port_id + 8))); @@ -2154,16 +2155,16 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, if (sport->port_guid_tpg.se_tpg_wwn) ch->sess = target_alloc_session(&sport->port_guid_tpg, 0, 0, TARGET_PROT_NORMAL, - ch->ini_guid, ch, NULL); + ch->sess_name, ch, NULL); if (sport->port_gid_tpg.se_tpg_wwn && IS_ERR_OR_NULL(ch->sess)) ch->sess = target_alloc_session(&sport->port_gid_tpg, 0, 0, - TARGET_PROT_NORMAL, ch->sess_name, ch, + TARGET_PROT_NORMAL, i_port_id, ch, NULL); /* Retry without leading "0x" */ if (sport->port_gid_tpg.se_tpg_wwn && IS_ERR_OR_NULL(ch->sess)) ch->sess = target_alloc_session(&sport->port_gid_tpg, 0, 0, TARGET_PROT_NORMAL, - ch->sess_name + 2, ch, NULL); + i_port_id + 2, ch, NULL); if (IS_ERR_OR_NULL(ch->sess)) { pr_info("Rejected login because no ACL has been configured yet for initiator %s.\n", ch->sess_name); diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index 1434f0cd45f7..6c5a14ac7742 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -269,7 +269,6 @@ enum rdma_ch_state { * @pkey: P_Key of the IB partition for this SRP channel. * @sess: Session information associated with this SRP channel. * @sess_name: Session name. - * @ini_guid: Initiator port GUID. * @release_work: Allows scheduling of srpt_release_channel(). */ struct srpt_rdma_ch { @@ -297,8 +296,7 @@ struct srpt_rdma_ch { struct list_head cmd_wait_list; uint16_t pkey; struct se_session *sess; - u8 sess_name[36]; - u8 ini_guid[24]; + u8 sess_name[24]; struct work_struct release_work; }; -- cgit v1.2.3 From a11253142e6d317c25215ddb3029f9c754baefef Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 16 Jan 2018 16:14:10 -0800 Subject: IB/srpt: Rework multi-channel support Store initiator and target port ID's once per nexus instead of in each channel data structure. This change simplifies the duplicate connection check in srpt_cm_req_recv(). 
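The nexus table is filled in with an optimistic lookup-or-create loop so that the GFP_KERNEL allocation never happens under sport->mutex. Condensed from srpt_get_nexus() in the diff below; the search is abbreviated here to a hypothetical find_nexus_locked() helper:

	struct srpt_nexus *nexus = NULL, *tmp_nexus = NULL;

	for (;;) {
		mutex_lock(&sport->mutex);
		nexus = find_nexus_locked(sport, i_port_id, t_port_id);
		if (!nexus && tmp_nexus) {
			/* Nobody beat us to it: publish our candidate. */
			list_add_tail_rcu(&tmp_nexus->entry, &sport->nexus_list);
			swap(nexus, tmp_nexus);
		}
		mutex_unlock(&sport->mutex);
		if (nexus)
			break;
		tmp_nexus = kzalloc(sizeof(*tmp_nexus), GFP_KERNEL);
		if (!tmp_nexus)
			return ERR_PTR(-ENOMEM);
		/* the full function also initializes the RCU head, ch_list
		 * and the two port IDs of tmp_nexus before retrying */
	}
	kfree(tmp_nexus);	/* NULL, or freed because another thread won */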
Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 186 ++++++++++++++++++++++++---------- drivers/infiniband/ulp/srpt/ib_srpt.h | 34 +++++-- 2 files changed, 160 insertions(+), 60 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 597fc0bc0734..de3e77df146e 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -1849,16 +1849,20 @@ static int srpt_disconnect_ch(struct srpt_rdma_ch *ch) static bool srpt_ch_closed(struct srpt_port *sport, struct srpt_rdma_ch *ch) { + struct srpt_nexus *nexus; struct srpt_rdma_ch *ch2; bool res = true; rcu_read_lock(); - list_for_each_entry(ch2, &sport->rch_list, list) { - if (ch2 == ch) { - res = false; - break; + list_for_each_entry(nexus, &sport->nexus_list, entry) { + list_for_each_entry(ch2, &nexus->ch_list, list) { + if (ch2 == ch) { + res = false; + goto done; + } } } +done: rcu_read_unlock(); return res; @@ -1891,30 +1895,78 @@ static bool srpt_disconnect_ch_sync(struct srpt_rdma_ch *ch) return ret == 0; } -static void srpt_set_enabled(struct srpt_port *sport, bool enabled) - __must_hold(&sport->mutex) +static void __srpt_close_all_ch(struct srpt_port *sport) { + struct srpt_nexus *nexus; struct srpt_rdma_ch *ch; lockdep_assert_held(&sport->mutex); - if (sport->enabled == enabled) - return; - sport->enabled = enabled; - if (sport->enabled) - return; + list_for_each_entry(nexus, &sport->nexus_list, entry) { + list_for_each_entry(ch, &nexus->ch_list, list) { + if (srpt_disconnect_ch(ch) >= 0) + pr_info("Closing channel %s-%d because target %s_%d has been disabled\n", + ch->sess_name, ch->qp->qp_num, + sport->sdev->device->name, sport->port); + srpt_close_ch(ch); + } + } +} + +/* + * Look up (i_port_id, t_port_id) in sport->nexus_list. Create an entry if + * it does not yet exist. 
+ */ +static struct srpt_nexus *srpt_get_nexus(struct srpt_port *sport, + const u8 i_port_id[16], + const u8 t_port_id[16]) +{ + struct srpt_nexus *nexus = NULL, *tmp_nexus = NULL, *n; -again: - list_for_each_entry(ch, &sport->rch_list, list) { - if (ch->sport == sport) { - pr_info("%s: closing channel %s-%d\n", - sport->sdev->device->name, ch->sess_name, - ch->qp->qp_num); - if (srpt_disconnect_ch_sync(ch)) - goto again; + for (;;) { + mutex_lock(&sport->mutex); + list_for_each_entry(n, &sport->nexus_list, entry) { + if (memcmp(n->i_port_id, i_port_id, 16) == 0 && + memcmp(n->t_port_id, t_port_id, 16) == 0) { + nexus = n; + break; + } + } + if (!nexus && tmp_nexus) { + list_add_tail_rcu(&tmp_nexus->entry, + &sport->nexus_list); + swap(nexus, tmp_nexus); } + mutex_unlock(&sport->mutex); + + if (nexus) + break; + tmp_nexus = kzalloc(sizeof(*nexus), GFP_KERNEL); + if (!tmp_nexus) { + nexus = ERR_PTR(-ENOMEM); + break; + } + init_rcu_head(&tmp_nexus->rcu); + INIT_LIST_HEAD(&tmp_nexus->ch_list); + memcpy(tmp_nexus->i_port_id, i_port_id, 16); + memcpy(tmp_nexus->t_port_id, t_port_id, 16); } + kfree(tmp_nexus); + + return nexus; +} + +static void srpt_set_enabled(struct srpt_port *sport, bool enabled) + __must_hold(&sport->mutex) +{ + lockdep_assert_held(&sport->mutex); + + if (sport->enabled == enabled) + return; + sport->enabled = enabled; + if (!enabled) + __srpt_close_all_ch(sport); } static void srpt_free_ch(struct kref *kref) @@ -1984,11 +2036,12 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, { struct srpt_device *sdev = cm_id->context; struct srpt_port *sport = &sdev->port[param->port - 1]; + struct srpt_nexus *nexus; struct srp_login_req *req; - struct srp_login_rsp *rsp; - struct srp_login_rej *rej; - struct ib_cm_rep_param *rep_param; - struct srpt_rdma_ch *ch, *tmp_ch; + struct srp_login_rsp *rsp = NULL; + struct srp_login_rej *rej = NULL; + struct ib_cm_rep_param *rep_param = NULL; + struct srpt_rdma_ch *ch; char i_port_id[36]; u32 it_iu_len; int i, ret = 0; @@ -2007,6 +2060,13 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, param->port, &sport->gid, be16_to_cpu(param->primary_path->pkey)); + nexus = srpt_get_nexus(sport, req->initiator_port_id, + req->target_port_id); + if (IS_ERR(nexus)) { + ret = PTR_ERR(nexus); + goto out; + } + rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); rej = kzalloc(sizeof(*rej), GFP_KERNEL); rep_param = kzalloc(sizeof(*rep_param), GFP_KERNEL); @@ -2036,29 +2096,22 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, } if ((req->req_flags & SRP_MTCH_ACTION) == SRP_MULTICHAN_SINGLE) { + struct srpt_rdma_ch *ch2; + rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_NO_CHAN; mutex_lock(&sport->mutex); - - list_for_each_entry_safe(ch, tmp_ch, &sport->rch_list, list) { - if (!memcmp(ch->i_port_id, req->initiator_port_id, 16) - && !memcmp(ch->t_port_id, req->target_port_id, 16) - && param->port == ch->sport->port - && param->listen_id == ch->sport->sdev->cm_id - && ch->cm_id) { - if (srpt_disconnect_ch(ch) < 0) - continue; - pr_info("Relogin - closed existing channel %s\n", - ch->sess_name); - rsp->rsp_flags = - SRP_LOGIN_RSP_MULTICHAN_TERMINATED; - } + list_for_each_entry(ch2, &nexus->ch_list, list) { + if (srpt_disconnect_ch(ch2) < 0) + continue; + pr_info("Relogin - closed existing channel %s\n", + ch2->sess_name); + rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_TERMINATED; } - mutex_unlock(&sport->mutex); - - } else + } else { rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_MAINTAINED; + } if (*(__be64 *)req->target_port_id != cpu_to_be64(srpt_service_guid) || *(__be64 
*)(req->target_port_id + 8) != @@ -2083,10 +2136,9 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, init_rcu_head(&ch->rcu); kref_init(&ch->kref); ch->pkey = be16_to_cpu(param->primary_path->pkey); + ch->nexus = nexus; ch->zw_cqe.done = srpt_zerolength_write_done; INIT_WORK(&ch->release_work, srpt_release_channel_work); - memcpy(ch->i_port_id, req->initiator_port_id, 16); - memcpy(ch->t_port_id, req->target_port_id, 16); ch->sport = &sdev->port[param->port - 1]; ch->cm_id = cm_id; cm_id->context = ch; @@ -2147,8 +2199,8 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, srpt_format_guid(ch->sess_name, sizeof(ch->sess_name), ¶m->primary_path->dgid.global.interface_id); snprintf(i_port_id, sizeof(i_port_id), "0x%016llx%016llx", - be64_to_cpu(*(__be64 *)ch->i_port_id), - be64_to_cpu(*(__be64 *)(ch->i_port_id + 8))); + be64_to_cpu(*(__be64 *)nexus->i_port_id), + be64_to_cpu(*(__be64 *)(nexus->i_port_id + 8))); pr_debug("registering session %s\n", ch->sess_name); @@ -2208,7 +2260,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, } mutex_lock(&sport->mutex); - list_add_tail_rcu(&ch->list, &sport->rch_list); + list_add_tail_rcu(&ch->list, &nexus->ch_list); mutex_unlock(&sport->mutex); goto out; @@ -2560,13 +2612,28 @@ static void srpt_refresh_port_work(struct work_struct *work) srpt_refresh_port(sport); } +static bool srpt_ch_list_empty(struct srpt_port *sport) +{ + struct srpt_nexus *nexus; + bool res = true; + + rcu_read_lock(); + list_for_each_entry(nexus, &sport->nexus_list, entry) + if (!list_empty(&nexus->ch_list)) + res = false; + rcu_read_unlock(); + + return res; +} + /** * srpt_release_sport - disable login and wait for associated channels * @sport: SRPT HCA port. */ static int srpt_release_sport(struct srpt_port *sport) { - int res; + struct srpt_nexus *nexus, *next_n; + struct srpt_rdma_ch *ch; WARN_ON_ONCE(irqs_disabled()); @@ -2574,10 +2641,27 @@ static int srpt_release_sport(struct srpt_port *sport) srpt_set_enabled(sport, false); mutex_unlock(&sport->mutex); - res = wait_event_interruptible(sport->ch_releaseQ, - list_empty_careful(&sport->rch_list)); - if (res) - pr_err("%s: interrupted.\n", __func__); + while (wait_event_timeout(sport->ch_releaseQ, + srpt_ch_list_empty(sport), 5 * HZ) <= 0) { + pr_info("%s_%d: waiting for session unregistration ...\n", + sport->sdev->device->name, sport->port); + rcu_read_lock(); + list_for_each_entry(nexus, &sport->nexus_list, entry) { + list_for_each_entry(ch, &nexus->ch_list, list) { + pr_info("%s-%d: state %s\n", + ch->sess_name, ch->qp->qp_num, + get_ch_state_name(ch->state)); + } + } + rcu_read_unlock(); + } + + mutex_lock(&sport->mutex); + list_for_each_entry_safe(nexus, next_n, &sport->nexus_list, entry) { + list_del(&nexus->entry); + kfree_rcu(nexus, rcu); + } + mutex_unlock(&sport->mutex); return 0; } @@ -2744,7 +2828,7 @@ static void srpt_add_one(struct ib_device *device) for (i = 1; i <= sdev->device->phys_port_cnt; i++) { sport = &sdev->port[i - 1]; - INIT_LIST_HEAD(&sport->rch_list); + INIT_LIST_HEAD(&sport->nexus_list); init_waitqueue_head(&sport->ch_releaseQ); mutex_init(&sport->mutex); sport->sdev = sdev; diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index 6c5a14ac7742..59261d5de292 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -54,6 +54,8 @@ */ #define SRP_SERVICE_NAME_PREFIX "SRP.T10:" +struct srpt_nexus; + enum { /* * SRP IOControllerProfile attributes for SRP target ports that have @@ -240,6 +242,7 @@ enum 
rdma_ch_state { /** * struct srpt_rdma_ch - RDMA channel + * @nexus: I_T nexus this channel is associated with. * @cm_id: IB CM ID associated with the channel. * @qp: IB queue pair used for communicating over this channel. * @cq: IB completion queue for this channel. @@ -251,8 +254,6 @@ enum rdma_ch_state { * @sq_wr_avail: number of work requests available in the send queue. * @sport: pointer to the information of the HCA port used by this * channel. - * @i_port_id: 128-bit initiator port identifier copied from SRP_LOGIN_REQ. - * @t_port_id: 128-bit target port identifier copied from SRP_LOGIN_REQ. * @max_ti_iu_len: maximum target-to-initiator information unit length. * @req_lim: request limit: maximum number of requests that may be sent * by the initiator without having received a response. @@ -262,7 +263,7 @@ enum rdma_ch_state { * @state: channel state. See also enum rdma_ch_state. * @ioctx_ring: Send ring. * @ioctx_recv_ring: Receive I/O context ring. - * @list: Node in srpt_port.rch_list. + * @list: Node in srpt_nexus.ch_list. * @cmd_wait_list: List of SCSI commands that arrived before the RTU event. This * list contains struct srpt_ioctx elements and is protected * against concurrent modification by the cm_id spinlock. @@ -272,6 +273,7 @@ enum rdma_ch_state { * @release_work: Allows scheduling of srpt_release_channel(). */ struct srpt_rdma_ch { + struct srpt_nexus *nexus; struct ib_cm_id *cm_id; struct ib_qp *qp; struct ib_cq *cq; @@ -282,8 +284,6 @@ struct srpt_rdma_ch { u32 max_rsp_size; atomic_t sq_wr_avail; struct srpt_port *sport; - u8 i_port_id[16]; - u8 t_port_id[16]; int max_ti_iu_len; atomic_t req_lim; atomic_t req_lim_delta; @@ -300,6 +300,22 @@ struct srpt_rdma_ch { struct work_struct release_work; }; +/** + * struct srpt_nexus - I_T nexus + * @rcu: RCU head for this data structure. + * @entry: srpt_port.nexus_list list node. + * @ch_list: struct srpt_rdma_ch list. Protected by srpt_port.mutex. + * @i_port_id: 128-bit initiator port identifier copied from SRP_LOGIN_REQ. + * @t_port_id: 128-bit target port identifier copied from SRP_LOGIN_REQ. + */ +struct srpt_nexus { + struct rcu_head rcu; + struct list_head entry; + struct list_head ch_list; + u8 i_port_id[16]; + u8 t_port_id[16]; +}; + /** * struct srpt_port_attib - attributes for SRPT port * @srp_max_rdma_size: Maximum size of SRP RDMA transfers for new connections. @@ -332,9 +348,9 @@ struct srpt_port_attrib { * @port_gid_tpg: TPG associated with target port GID. * @port_gid_wwn: WWN associated with target port GID. * @port_attrib: Port attributes that can be accessed through configfs. - * @ch_releaseQ: Enables waiting for removal from rch_list. - * @mutex: Protects rch_list. - * @rch_list: Channel list. See also srpt_rdma_ch.list. + * @ch_releaseQ: Enables waiting for removal from nexus_list. + * @mutex: Protects nexus_list. + * @nexus_list: Nexus list. See also srpt_nexus.entry. */ struct srpt_port { struct srpt_device *sdev; @@ -354,7 +370,7 @@ struct srpt_port { struct srpt_port_attrib port_attrib; wait_queue_head_t ch_releaseQ; struct mutex mutex; - struct list_head rch_list; + struct list_head nexus_list; }; /** -- cgit v1.2.3 From 940874f8beae5b954350317920170e84ce5dcbee Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 16 Jan 2018 16:14:11 -0800 Subject: IB/srpt: Simplify srpt_close_session() Move a mutex lock and unlock statement from srpt_close_session() into srpt_disconnect_ch_sync(). 
Since the previous patch removed the last user of the return value of that function, change the return type of srpt_disconnect_ch_sync() to void. Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index de3e77df146e..b248515a4fe4 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -1868,22 +1868,16 @@ done: return res; } -/* - * Send DREQ and wait for DREP. Return true if and only if this function - * changed the state of @ch. - */ -static bool srpt_disconnect_ch_sync(struct srpt_rdma_ch *ch) - __must_hold(&sdev->mutex) +/* Send DREQ and wait for DREP. */ +static void srpt_disconnect_ch_sync(struct srpt_rdma_ch *ch) { struct srpt_port *sport = ch->sport; - int ret; - - lockdep_assert_held(&sport->mutex); pr_debug("ch %s-%d state %d\n", ch->sess_name, ch->qp->qp_num, ch->state); - ret = srpt_disconnect_ch(ch); + mutex_lock(&sport->mutex); + srpt_disconnect_ch(ch); mutex_unlock(&sport->mutex); while (wait_event_timeout(sport->ch_releaseQ, srpt_ch_closed(sport, ch), @@ -1891,8 +1885,6 @@ static bool srpt_disconnect_ch_sync(struct srpt_rdma_ch *ch) pr_info("%s(%s-%d state %d): still waiting ...\n", __func__, ch->sess_name, ch->qp->qp_num, ch->state); - mutex_lock(&sport->mutex); - return ret == 0; } static void __srpt_close_all_ch(struct srpt_port *sport) @@ -2991,11 +2983,8 @@ static void srpt_release_cmd(struct se_cmd *se_cmd) static void srpt_close_session(struct se_session *se_sess) { struct srpt_rdma_ch *ch = se_sess->fabric_sess_ptr; - struct srpt_port *sport = ch->sport; - mutex_lock(&sport->mutex); srpt_disconnect_ch_sync(ch); - mutex_unlock(&sport->mutex); } /** -- cgit v1.2.3 From 63d370a6a929b17859c140d58cb54ce36d0c4f04 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 16 Jan 2018 16:14:12 -0800 Subject: IB/srpt: Log all zero-length writes and completions The new pr_debug() statements are useful when debugging the ib_srpt driver.
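For reference, the zero-length write being instrumented here is an RDMA WRITE work request posted without any scatter/gather entries; its completion tells the driver whether the channel is still making progress. A minimal sketch of such a post, using the same verbs fields the driver uses (error handling elided):

        struct ib_send_wr wr = {}, *bad_wr;

        /* Zero-length write: no sg_list and num_sge == 0. */
        wr.opcode = IB_WR_RDMA_WRITE;
        /* Route the completion to srpt_zerolength_write_done(). */
        wr.wr_cqe = &ch->zw_cqe;
        /* Always generate a completion for this work request. */
        wr.send_flags = IB_SEND_SIGNALED;

        ib_post_send(ch->qp, &wr, &bad_wr);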
Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index b248515a4fe4..866ff4be553c 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -840,6 +840,9 @@ static int srpt_zerolength_write(struct srpt_rdma_ch *ch) { struct ib_send_wr wr, *bad_wr; + pr_debug("%s-%d: queued zerolength write\n", ch->sess_name, + ch->qp->qp_num); + memset(&wr, 0, sizeof(wr)); wr.opcode = IB_WR_RDMA_WRITE; wr.wr_cqe = &ch->zw_cqe; @@ -851,6 +854,9 @@ static void srpt_zerolength_write_done(struct ib_cq *cq, struct ib_wc *wc) { struct srpt_rdma_ch *ch = cq->cq_context; + pr_debug("%s-%d wc->status %d\n", ch->sess_name, ch->qp->qp_num, + wc->status); + if (wc->status == IB_WC_SUCCESS) { srpt_process_wait_list(ch); } else { @@ -1804,8 +1810,6 @@ static bool srpt_close_ch(struct srpt_rdma_ch *ch) pr_err("%s-%d: changing queue pair into error state failed: %d\n", ch->sess_name, ch->qp->qp_num, ret); - pr_debug("%s-%d: queued zerolength write\n", ch->sess_name, - ch->qp->qp_num); ret = srpt_zerolength_write(ch); if (ret < 0) { pr_err("%s-%d: queuing zero-length write failed: %d\n", -- cgit v1.2.3 From db7683d7deb25d6edc9c59ac45c56c6a48a45514 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 16 Jan 2018 16:14:13 -0800 Subject: IB/srpt: Fix login-related race conditions Make sure that sport->mutex is not released between the duplicate channel check, adding a channel to the channel list and performing the sport enabled check. Avoid that srpt_disconnect_ch() can be invoked concurrently with the ib_send_cm_rep() call by srpt_cm_req_recv(). Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 195 +++++++++++++++++++--------------- 1 file changed, 111 insertions(+), 84 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 866ff4be553c..6278c4448061 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -2040,7 +2040,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, struct srpt_rdma_ch *ch; char i_port_id[36]; u32 it_iu_len; - int i, ret = 0; + int i, ret; WARN_ON_ONCE(irqs_disabled()); @@ -2063,69 +2063,43 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, goto out; } + ret = -ENOMEM; rsp = kzalloc(sizeof(*rsp), GFP_KERNEL); rej = kzalloc(sizeof(*rej), GFP_KERNEL); rep_param = kzalloc(sizeof(*rep_param), GFP_KERNEL); - - if (!rsp || !rej || !rep_param) { - ret = -ENOMEM; + if (!rsp || !rej || !rep_param) goto out; - } + ret = -EINVAL; if (it_iu_len > srp_max_req_size || it_iu_len < 64) { rej->reason = cpu_to_be32( - SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE); - ret = -EINVAL; - pr_err("rejected SRP_LOGIN_REQ because its" - " length (%d bytes) is out of range (%d .. %d)\n", + SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE); + pr_err("rejected SRP_LOGIN_REQ because its length (%d bytes) is out of range (%d .. 
%d)\n", it_iu_len, 64, srp_max_req_size); goto reject; } if (!sport->enabled) { - rej->reason = cpu_to_be32( - SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); - ret = -EINVAL; - pr_err("rejected SRP_LOGIN_REQ because the target port" - " has not yet been enabled\n"); + rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + pr_info("rejected SRP_LOGIN_REQ because target port %s_%d has not yet been enabled\n", + sport->sdev->device->name, param->port); goto reject; } - if ((req->req_flags & SRP_MTCH_ACTION) == SRP_MULTICHAN_SINGLE) { - struct srpt_rdma_ch *ch2; - - rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_NO_CHAN; - - mutex_lock(&sport->mutex); - list_for_each_entry(ch2, &nexus->ch_list, list) { - if (srpt_disconnect_ch(ch2) < 0) - continue; - pr_info("Relogin - closed existing channel %s\n", - ch2->sess_name); - rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_TERMINATED; - } - mutex_unlock(&sport->mutex); - } else { - rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_MAINTAINED; - } - if (*(__be64 *)req->target_port_id != cpu_to_be64(srpt_service_guid) || *(__be64 *)(req->target_port_id + 8) != cpu_to_be64(srpt_service_guid)) { rej->reason = cpu_to_be32( - SRP_LOGIN_REJ_UNABLE_ASSOCIATE_CHANNEL); - ret = -ENOMEM; - pr_err("rejected SRP_LOGIN_REQ because it" - " has an invalid target port identifier.\n"); + SRP_LOGIN_REJ_UNABLE_ASSOCIATE_CHANNEL); + pr_err("rejected SRP_LOGIN_REQ because it has an invalid target port identifier.\n"); goto reject; } + ret = -ENOMEM; ch = kzalloc(sizeof(*ch), GFP_KERNEL); if (!ch) { - rej->reason = cpu_to_be32( - SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); - pr_err("rejected SRP_LOGIN_REQ because no memory.\n"); - ret = -ENOMEM; + rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + pr_err("rejected SRP_LOGIN_REQ because out of memory.\n"); goto reject; } @@ -2153,8 +2127,11 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, srpt_alloc_ioctx_ring(ch->sport->sdev, ch->rq_size, sizeof(*ch->ioctx_ring[0]), ch->max_rsp_size, DMA_TO_DEVICE); - if (!ch->ioctx_ring) + if (!ch->ioctx_ring) { + pr_err("rejected SRP_LOGIN_REQ because creating a new QP SQ ring failed.\n"); + rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); goto free_ch; + } INIT_LIST_HEAD(&ch->free_list); for (i = 0; i < ch->rq_size; i++) { @@ -2176,20 +2153,10 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, } ret = srpt_create_ch_ib(ch); - if (ret) { - rej->reason = cpu_to_be32( - SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); - pr_err("rejected SRP_LOGIN_REQ because creating" - " a new RDMA channel failed.\n"); - goto free_recv_ring; - } - - ret = srpt_ch_qp_rtr(ch, ch->qp); if (ret) { rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); - pr_err("rejected SRP_LOGIN_REQ because enabling" - " RTR failed (error code = %d)\n", ret); - goto destroy_ib; + pr_err("rejected SRP_LOGIN_REQ because creating a new RDMA channel failed.\n"); + goto free_recv_ring; } srpt_format_guid(ch->sess_name, sizeof(ch->sess_name), @@ -2214,11 +2181,51 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, TARGET_PROT_NORMAL, i_port_id + 2, ch, NULL); if (IS_ERR_OR_NULL(ch->sess)) { - pr_info("Rejected login because no ACL has been configured yet for initiator %s.\n", - ch->sess_name); - rej->reason = cpu_to_be32((PTR_ERR(ch->sess) == -ENOMEM) ? + ret = PTR_ERR(ch->sess); + pr_info("Rejected login for initiator %s: ret = %d.\n", + ch->sess_name, ret); + rej->reason = cpu_to_be32(ret == -ENOMEM ? 
SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES : SRP_LOGIN_REJ_CHANNEL_LIMIT_REACHED); + goto reject; + } + + mutex_lock(&sport->mutex); + + if ((req->req_flags & SRP_MTCH_ACTION) == SRP_MULTICHAN_SINGLE) { + struct srpt_rdma_ch *ch2; + + rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_NO_CHAN; + + list_for_each_entry(ch2, &nexus->ch_list, list) { + if (srpt_disconnect_ch(ch2) < 0) + continue; + pr_info("Relogin - closed existing channel %s\n", + ch2->sess_name); + rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_TERMINATED; + } + } else { + rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_MAINTAINED; + } + + list_add_tail_rcu(&ch->list, &nexus->ch_list); + + if (!sport->enabled) { + rej->reason = cpu_to_be32( + SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + pr_info("rejected SRP_LOGIN_REQ because target %s_%d is not enabled\n", + sdev->device->name, param->port); + mutex_unlock(&sport->mutex); + goto reject; + } + + mutex_unlock(&sport->mutex); + + ret = srpt_ch_qp_rtr(ch, ch->qp); + if (ret) { + rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + pr_err("rejected SRP_LOGIN_REQ because enabling RTR failed (error code = %d)\n", + ret); goto destroy_ib; } @@ -2231,8 +2238,8 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, rsp->max_it_iu_len = req->req_it_iu_len; rsp->max_ti_iu_len = req->req_it_iu_len; ch->max_ti_iu_len = it_iu_len; - rsp->buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT - | SRP_BUF_FORMAT_INDIRECT); + rsp->buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT | + SRP_BUF_FORMAT_INDIRECT); rsp->req_lim_delta = cpu_to_be32(ch->rq_size); atomic_set(&ch->req_lim, ch->rq_size); atomic_set(&ch->req_lim_delta, 0); @@ -2248,24 +2255,30 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, rep_param->responder_resources = 4; rep_param->initiator_depth = 4; - ret = ib_send_cm_rep(cm_id, rep_param); - if (ret) { - pr_err("sending SRP_LOGIN_REQ response failed" - " (error code = %d)\n", ret); - goto release_channel; - } - + /* + * Hold the sport mutex while accepting a connection to avoid that + * srpt_disconnect_ch() is invoked concurrently with this code. + */ mutex_lock(&sport->mutex); - list_add_tail_rcu(&ch->list, &nexus->ch_list); + if (sport->enabled && ch->state == CH_CONNECTING) + ret = ib_send_cm_rep(cm_id, rep_param); + else + ret = -EINVAL; mutex_unlock(&sport->mutex); - goto out; + switch (ret) { + case 0: + break; + case -EINVAL: + goto reject; + default: + rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); + pr_err("sending SRP_LOGIN_REQ response failed (error code = %d)\n", + ret); + goto reject; + } -release_channel: - srpt_disconnect_ch(ch); - transport_deregister_session_configfs(ch->sess); - transport_deregister_session(ch->sess); - ch->sess = NULL; + goto out; destroy_ib: srpt_destroy_ch_ib(ch); @@ -2280,13 +2293,18 @@ free_ring: ch->sport->sdev, ch->rq_size, ch->max_rsp_size, DMA_TO_DEVICE); free_ch: + cm_id->context = NULL; kfree(ch); + ch = NULL; + + WARN_ON_ONCE(ret == 0); reject: + pr_info("Rejecting login with reason %#x\n", be32_to_cpu(rej->reason)); rej->opcode = SRP_LOGIN_REJ; rej->tag = req->tag; - rej->buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT - | SRP_BUF_FORMAT_INDIRECT); + rej->buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT | + SRP_BUF_FORMAT_INDIRECT); ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0, (void *)rej, sizeof(*rej)); @@ -2329,17 +2347,26 @@ static void srpt_cm_rtu_recv(struct srpt_rdma_ch *ch) { int ret; - if (srpt_set_ch_state(ch, CH_LIVE)) { - ret = srpt_ch_qp_rts(ch, ch->qp); - - if (ret == 0) { - /* Trigger wait list processing. 
*/ - ret = srpt_zerolength_write(ch); - WARN_ONCE(ret < 0, "%d\n", ret); - } else { - srpt_close_ch(ch); - } + ret = srpt_ch_qp_rts(ch, ch->qp); + if (ret < 0) { + pr_err("%s-%d: QP transition to RTS failed\n", ch->sess_name, + ch->qp->qp_num); + srpt_close_ch(ch); + return; } + + /* Trigger wait list processing. */ + ret = srpt_zerolength_write(ch); + WARN_ONCE(ret < 0, "%d\n", ret); + + /* + * Note: calling srpt_close_ch() if the transition to the LIVE state + * fails is not necessary since that means that that function has + * already been invoked from another thread. + */ + if (!srpt_set_ch_state(ch, CH_LIVE)) + pr_err("%s-%d: channel transition to LIVE state failed\n", + ch->sess_name, ch->qp->qp_num); } /** -- cgit v1.2.3 From e28a547da6e9c6dd5ba64b978d361222db3592e7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 16 Jan 2018 16:14:14 -0800 Subject: IB/srpt: Fix a race condition related to wait list processing Wait list processing only occurs if the channel state >= CH_LIVE. Hence set the channel state to CH_LIVE before triggering wait list processing asynchronously. Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 6278c4448061..372f1eb2fa49 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -2355,18 +2355,20 @@ static void srpt_cm_rtu_recv(struct srpt_rdma_ch *ch) return; } - /* Trigger wait list processing. */ - ret = srpt_zerolength_write(ch); - WARN_ONCE(ret < 0, "%d\n", ret); - /* * Note: calling srpt_close_ch() if the transition to the LIVE state * fails is not necessary since that means that that function has * already been invoked from another thread. */ - if (!srpt_set_ch_state(ch, CH_LIVE)) + if (!srpt_set_ch_state(ch, CH_LIVE)) { pr_err("%s-%d: channel transition to LIVE state failed\n", ch->sess_name, ch->qp->qp_num); + return; + } + + /* Trigger wait list processing. */ + ret = srpt_zerolength_write(ch); + WARN_ONCE(ret < 0, "%d\n", ret); } /** -- cgit v1.2.3 From fcf589364f2a106544d79594dcf722d97528031b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 16 Jan 2018 16:14:15 -0800 Subject: IB/srpt: Don't allow reordering of commands on wait list If a receive I/O context is removed from the wait list and srpt_handle_new_iu() fails to allocate a send I/O context then re-adding the receive I/O context to the wait list can cause reordering. Avoid this by only removing a receive I/O context from the wait list after allocation of a send I/O context has succeeded. Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 82 ++++++++++++++++++++--------------- drivers/infiniband/ulp/srpt/ib_srpt.h | 2 + 2 files changed, 49 insertions(+), 35 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 372f1eb2fa49..5b2e74b1d808 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -1533,39 +1533,39 @@ fail: * srpt_handle_new_iu - process a newly received information unit * @ch: RDMA channel through which the information unit has been received. * @recv_ioctx: Receive I/O context associated with the information unit. - * @send_ioctx: Send I/O context.
*/ -static void srpt_handle_new_iu(struct srpt_rdma_ch *ch, - struct srpt_recv_ioctx *recv_ioctx, - struct srpt_send_ioctx *send_ioctx) +static bool +srpt_handle_new_iu(struct srpt_rdma_ch *ch, struct srpt_recv_ioctx *recv_ioctx) { + struct srpt_send_ioctx *send_ioctx = NULL; struct srp_cmd *srp_cmd; + bool res = false; + u8 opcode; BUG_ON(!ch); BUG_ON(!recv_ioctx); + if (unlikely(ch->state == CH_CONNECTING)) + goto push; + ib_dma_sync_single_for_cpu(ch->sport->sdev->device, recv_ioctx->ioctx.dma, srp_max_req_size, DMA_FROM_DEVICE); - if (unlikely(ch->state == CH_CONNECTING)) - goto out_wait; - - if (unlikely(ch->state != CH_LIVE)) - return; - srp_cmd = recv_ioctx->ioctx.buf; - if (srp_cmd->opcode == SRP_CMD || srp_cmd->opcode == SRP_TSK_MGMT) { - if (!send_ioctx) { - if (!list_empty(&ch->cmd_wait_list)) - goto out_wait; - send_ioctx = srpt_get_send_ioctx(ch); - } + opcode = srp_cmd->opcode; + if (opcode == SRP_CMD || opcode == SRP_TSK_MGMT) { + send_ioctx = srpt_get_send_ioctx(ch); if (unlikely(!send_ioctx)) - goto out_wait; + goto push; } - switch (srp_cmd->opcode) { + if (!list_empty(&recv_ioctx->wait_list)) { + WARN_ON_ONCE(!ch->processing_wait_list); + list_del_init(&recv_ioctx->wait_list); + } + + switch (opcode) { case SRP_CMD: srpt_handle_cmd(ch, recv_ioctx, send_ioctx); break; @@ -1585,16 +1585,22 @@ static void srpt_handle_new_iu(struct srpt_rdma_ch *ch, pr_err("Received SRP_RSP\n"); break; default: - pr_err("received IU with unknown opcode 0x%x\n", - srp_cmd->opcode); + pr_err("received IU with unknown opcode 0x%x\n", opcode); break; } srpt_post_recv(ch->sport->sdev, ch, recv_ioctx); - return; + res = true; -out_wait: - list_add_tail(&recv_ioctx->wait_list, &ch->cmd_wait_list); +out: + return res; + +push: + if (list_empty(&recv_ioctx->wait_list)) { + WARN_ON_ONCE(ch->processing_wait_list); + list_add_tail(&recv_ioctx->wait_list, &ch->cmd_wait_list); + } + goto out; } static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc) @@ -1609,7 +1615,7 @@ static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc) req_lim = atomic_dec_return(&ch->req_lim); if (unlikely(req_lim < 0)) pr_err("req_lim = %d < 0\n", req_lim); - srpt_handle_new_iu(ch, ioctx, NULL); + srpt_handle_new_iu(ch, ioctx); } else { pr_info_ratelimited("receiving failed for ioctx %p with status %d\n", ioctx, wc->status); @@ -1623,19 +1629,21 @@ static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc) */ static void srpt_process_wait_list(struct srpt_rdma_ch *ch) { - struct srpt_send_ioctx *ioctx; + struct srpt_recv_ioctx *recv_ioctx, *tmp; - while (!list_empty(&ch->cmd_wait_list) && - ch->state >= CH_LIVE && - (ioctx = srpt_get_send_ioctx(ch)) != NULL) { - struct srpt_recv_ioctx *recv_ioctx; + WARN_ON_ONCE(ch->state == CH_CONNECTING); - recv_ioctx = list_first_entry(&ch->cmd_wait_list, - struct srpt_recv_ioctx, - wait_list); - list_del(&recv_ioctx->wait_list); - srpt_handle_new_iu(ch, recv_ioctx, ioctx); + if (list_empty(&ch->cmd_wait_list)) + return; + + WARN_ON_ONCE(ch->processing_wait_list); + ch->processing_wait_list = true; + list_for_each_entry_safe(recv_ioctx, tmp, &ch->cmd_wait_list, + wait_list) { + if (!srpt_handle_new_iu(ch, recv_ioctx)) + break; } + ch->processing_wait_list = false; } /** @@ -2150,6 +2158,8 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); goto free_ring; } + for (i = 0; i < ch->rq_size; i++) + INIT_LIST_HEAD(&ch->ioctx_recv_ring[i]->wait_list); } ret = srpt_create_ch_ib(ch); @@ -2773,8 +2783,10 @@ static int 
srpt_alloc_srq(struct srpt_device *sdev) sdev->use_srq = true; sdev->srq = srq; - for (i = 0; i < sdev->srq_size; ++i) + for (i = 0; i < sdev->srq_size; ++i) { + INIT_LIST_HEAD(&sdev->ioctx_ring[i]->wait_list); srpt_post_recv(sdev, NULL, sdev->ioctx_ring[i]); + } return 0; } diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index 59261d5de292..10f9b2667ef2 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -261,6 +261,7 @@ enum rdma_ch_state { * @spinlock: Protects free_list and state. * @free_list: Head of list with free send I/O contexts. * @state: channel state. See also enum rdma_ch_state. + * @processing_wait_list: Whether or not cmd_wait_list is being processed. * @ioctx_ring: Send ring. * @ioctx_recv_ring: Receive I/O context ring. * @list: Node in srpt_nexus.ch_list. @@ -295,6 +296,7 @@ struct srpt_rdma_ch { struct list_head list; struct list_head cmd_wait_list; uint16_t pkey; + bool processing_wait_list; struct se_session *sess; u8 sess_name[24]; struct work_struct release_work; -- cgit v1.2.3 From 090fa24bf239ce1ec58f2b90c5ba54feca456cb0 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 16 Jan 2018 16:14:16 -0800 Subject: IB/srpt: Preparations for adding RDMA/CM support Introduce a union in struct srpt_rdma_ch for member variables that depend on the type of connection manager. Avoid that error messages report the IB/CM ID. Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 32 ++++++++++++++++---------------- drivers/infiniband/ulp/srpt/ib_srpt.h | 8 ++++++-- 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 5b2e74b1d808..f066fac19f13 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -215,12 +215,12 @@ static const char *get_ch_state_name(enum rdma_ch_state s) */ static void srpt_qp_event(struct ib_event *event, struct srpt_rdma_ch *ch) { - pr_debug("QP event %d on cm_id=%p sess_name=%s state=%d\n", - event->event, ch->cm_id, ch->sess_name, ch->state); + pr_debug("QP event %d on ch=%p sess_name=%s state=%d\n", + event->event, ch, ch->sess_name, ch->state); switch (event->event) { case IB_EVENT_COMM_EST: - ib_cm_notify(ch->cm_id, event->event); + ib_cm_notify(ch->ib_cm.cm_id, event->event); break; case IB_EVENT_QP_LAST_WQE_REACHED: pr_debug("%s-%d, state %s: received Last WQE event.\n", @@ -1097,7 +1097,7 @@ static int srpt_ch_qp_rtr(struct srpt_rdma_ch *ch, struct ib_qp *qp) int ret; qp_attr.qp_state = IB_QPS_RTR; - ret = ib_cm_init_qp_attr(ch->cm_id, &qp_attr, &attr_mask); + ret = ib_cm_init_qp_attr(ch->ib_cm.cm_id, &qp_attr, &attr_mask); if (ret) goto out; @@ -1127,7 +1127,7 @@ static int srpt_ch_qp_rts(struct srpt_rdma_ch *ch, struct ib_qp *qp) int ret; qp_attr.qp_state = IB_QPS_RTS; - ret = ib_cm_init_qp_attr(ch->cm_id, &qp_attr, &attr_mask); + ret = ib_cm_init_qp_attr(ch->ib_cm.cm_id, &qp_attr, &attr_mask); if (ret) goto out; @@ -1509,9 +1509,9 @@ static void srpt_handle_tsk_mgmt(struct srpt_rdma_ch *ch, srp_tsk = recv_ioctx->ioctx.buf; cmd = &send_ioctx->cmd; - pr_debug("recv tsk_mgmt fn %d for task_tag %lld and cmd tag %lld" - " cm_id %p sess %p\n", srp_tsk->tsk_mgmt_func, - srp_tsk->task_tag, srp_tsk->tag, ch->cm_id, ch->sess); + pr_debug("recv tsk_mgmt fn %d for task_tag %lld and cmd tag %lld ch %p sess %p\n", + srp_tsk->tsk_mgmt_func, srp_tsk->task_tag, srp_tsk->tag, ch, + 
ch->sess); srpt_set_cmd_state(send_ioctx, SRPT_STATE_MGMT); send_ioctx->cmd.tag = srp_tsk->tag; @@ -1762,9 +1762,9 @@ retry: atomic_set(&ch->sq_wr_avail, qp_init->cap.max_send_wr); - pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n", + pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d ch= %p\n", __func__, ch->cq->cqe, qp_init->cap.max_send_sge, - qp_init->cap.max_send_wr, ch->cm_id); + qp_init->cap.max_send_wr, ch); ret = srpt_init_ch_qp(ch, ch->qp); if (ret) @@ -1849,9 +1849,9 @@ static int srpt_disconnect_ch(struct srpt_rdma_ch *ch) if (!srpt_set_ch_state(ch, CH_DISCONNECTING)) return -ENOTCONN; - ret = ib_send_cm_dreq(ch->cm_id, NULL, 0); + ret = ib_send_cm_dreq(ch->ib_cm.cm_id, NULL, 0); if (ret < 0) - ret = ib_send_cm_drep(ch->cm_id, NULL, 0); + ret = ib_send_cm_drep(ch->ib_cm.cm_id, NULL, 0); if (ret < 0 && srpt_close_ch(ch)) ret = 0; @@ -2003,7 +2003,7 @@ static void srpt_release_channel_work(struct work_struct *w) transport_deregister_session(se_sess); ch->sess = NULL; - ib_destroy_cm_id(ch->cm_id); + ib_destroy_cm_id(ch->ib_cm.cm_id); srpt_destroy_ch_ib(ch); @@ -2118,7 +2118,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, ch->zw_cqe.done = srpt_zerolength_write_done; INIT_WORK(&ch->release_work, srpt_release_channel_work); ch->sport = &sdev->port[param->port - 1]; - ch->cm_id = cm_id; + ch->ib_cm.cm_id = cm_id; cm_id->context = ch; /* * ch->rq_size should be at least as large as the initiator queue @@ -2239,8 +2239,8 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, goto destroy_ib; } - pr_debug("Establish connection sess=%p name=%s cm_id=%p\n", ch->sess, - ch->sess_name, ch->cm_id); + pr_debug("Establish connection sess=%p name=%s ch=%p\n", ch->sess, + ch->sess_name, ch); /* create srp_login_response */ rsp->opcode = SRP_LOGIN_RSP; diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index 10f9b2667ef2..4d9199fd00dc 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -243,8 +243,8 @@ enum rdma_ch_state { /** * struct srpt_rdma_ch - RDMA channel * @nexus: I_T nexus this channel is associated with. - * @cm_id: IB CM ID associated with the channel. * @qp: IB queue pair used for communicating over this channel. + * @cm_id: IB CM ID associated with the channel. * @cq: IB completion queue for this channel. * @zw_cqe: Zero-length write CQE. * @rcu: RCU head. @@ -275,8 +275,12 @@ enum rdma_ch_state { */ struct srpt_rdma_ch { struct srpt_nexus *nexus; - struct ib_cm_id *cm_id; struct ib_qp *qp; + union { + struct { + struct ib_cm_id *cm_id; + } ib_cm; + }; struct ib_cq *cq; struct ib_cqe zw_cqe; struct rcu_head rcu; -- cgit v1.2.3 From 2ffcf04263832557dc55d5a7339d8f29428b679a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 16 Jan 2018 16:14:17 -0800 Subject: IB/srpt: Move the code for parsing struct ib_cm_req_event_param This patch does not change any functionality but makes srpt_cm_req_recv() independent of the IB/CM and hence simplifies the patch that introduces RDMA/CM support. 
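The essence of the refactoring is visible in the signature of srpt_cm_req_recv(); shown side by side here for clarity (both prototypes are taken from the diff below):

        /* Before: tied to the IB/CM event representation. */
        static int srpt_cm_req_recv(struct ib_cm_id *cm_id,
                                    struct ib_cm_req_event_param *param,
                                    void *private_data);

        /* After: any connection manager can call this with pre-parsed,
         * transport-neutral values. */
        static int srpt_cm_req_recv(struct ib_cm_id *cm_id,
                                    u8 port_num, __be16 pkey,
                                    const struct srp_login_req *req,
                                    const char *src_addr);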
Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 49 +++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index f066fac19f13..bf37816a1b12 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -2028,20 +2028,22 @@ static void srpt_release_channel_work(struct work_struct *w) /** * srpt_cm_req_recv - process the event IB_CM_REQ_RECEIVED * @cm_id: IB/CM connection identifier. - * @param: IB/CM REQ parameters. - * @private_data: IB/CM REQ private data. + * @port_num: Port through which the IB/CM REQ message was received. + * @pkey: P_Key of the incoming connection. + * @req: SRP login request. + * @src_addr: GID of the port that submitted the login request. * * Ownership of the cm_id is transferred to the target session if this * functions returns zero. Otherwise the caller remains the owner of cm_id. */ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, - struct ib_cm_req_event_param *param, - void *private_data) + u8 port_num, __be16 pkey, + const struct srp_login_req *req, + const char *src_addr) { struct srpt_device *sdev = cm_id->context; - struct srpt_port *sport = &sdev->port[param->port - 1]; + struct srpt_port *sport = &sdev->port[port_num - 1]; struct srpt_nexus *nexus; - struct srp_login_req *req; struct srp_login_rsp *rsp = NULL; struct srp_login_rej *rej = NULL; struct ib_cm_rep_param *rep_param = NULL; @@ -2052,17 +2054,14 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, WARN_ON_ONCE(irqs_disabled()); - if (WARN_ON(!sdev || !private_data)) + if (WARN_ON(!sdev || !req)) return -EINVAL; - req = (struct srp_login_req *)private_data; - it_iu_len = be32_to_cpu(req->req_it_iu_len); pr_info("Received SRP_LOGIN_REQ with i_port_id %pI6, t_port_id %pI6 and it_iu_len %d on port %d (guid=%pI6); pkey %#04x\n", req->initiator_port_id, req->target_port_id, it_iu_len, - param->port, &sport->gid, - be16_to_cpu(param->primary_path->pkey)); + port_num, &sport->gid, be16_to_cpu(pkey)); nexus = srpt_get_nexus(sport, req->initiator_port_id, req->target_port_id); @@ -2090,7 +2089,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, if (!sport->enabled) { rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); pr_info("rejected SRP_LOGIN_REQ because target port %s_%d has not yet been enabled\n", - sport->sdev->device->name, param->port); + sport->sdev->device->name, port_num); goto reject; } @@ -2113,11 +2112,11 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, init_rcu_head(&ch->rcu); kref_init(&ch->kref); - ch->pkey = be16_to_cpu(param->primary_path->pkey); + ch->pkey = be16_to_cpu(pkey); ch->nexus = nexus; ch->zw_cqe.done = srpt_zerolength_write_done; INIT_WORK(&ch->release_work, srpt_release_channel_work); - ch->sport = &sdev->port[param->port - 1]; + ch->sport = sport; ch->ib_cm.cm_id = cm_id; cm_id->context = ch; /* @@ -2169,8 +2168,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, goto free_recv_ring; } - srpt_format_guid(ch->sess_name, sizeof(ch->sess_name), - &param->primary_path->dgid.global.interface_id); + strlcpy(ch->sess_name, src_addr, sizeof(ch->sess_name)); snprintf(i_port_id, sizeof(i_port_id), "0x%016llx%016llx", be64_to_cpu(*(__be64 *)nexus->i_port_id), be64_to_cpu(*(__be64 *)(nexus->i_port_id + 8))); @@ -2224,7 +2222,7 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, rej->reason = cpu_to_be32( SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES);
pr_info("rejected SRP_LOGIN_REQ because target %s_%d is not enabled\n", - sdev->device->name, param->port); + sdev->device->name, port_num); mutex_unlock(&sport->mutex); goto reject; } @@ -2327,6 +2325,19 @@ out: return ret; } +static int srpt_ib_cm_req_recv(struct ib_cm_id *cm_id, + struct ib_cm_req_event_param *param, + void *private_data) +{ + char sguid[40]; + + srpt_format_guid(sguid, sizeof(sguid), + ¶m->primary_path->dgid.global.interface_id); + + return srpt_cm_req_recv(cm_id, param->port, param->primary_path->pkey, + private_data, sguid); +} + static void srpt_cm_rej_recv(struct srpt_rdma_ch *ch, enum ib_cm_rej_reason reason, const u8 *private_data, @@ -2401,8 +2412,8 @@ static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) ret = 0; switch (event->event) { case IB_CM_REQ_RECEIVED: - ret = srpt_cm_req_recv(cm_id, &event->param.req_rcvd, - event->private_data); + ret = srpt_ib_cm_req_recv(cm_id, &event->param.req_rcvd, + event->private_data); break; case IB_CM_REJ_RECEIVED: srpt_cm_rej_recv(ch, event->param.rej_rcvd.reason, -- cgit v1.2.3 From 411460ac50b0832eeb631d167a355dfe9a228c89 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 18 Jan 2018 10:11:16 +0200 Subject: RDMA/cma: Introduce API to read GIDs for multiple transports This patch introduces an API that allows legacy applications to query GIDs for a rdma_cm_id which is used during connection establishment. GIDs are stored and created differently for iWarp, IB and RoCE transports. Therefore rdma_read_gids() returns GID for all the transports hiding such internal details to caller. It is usable for client side and server side connections. In general continued use of GID based addressing outside of IB is discouraged, so rdma_read_gids() should not be used by any new ULPs. 
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 27 +++++++++++++++++++++++++++ include/rdma/rdma_cm.h | 19 +++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 30d1c32a816f..66590a4c612d 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2036,6 +2036,33 @@ __be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr) } EXPORT_SYMBOL(rdma_get_service_id); +void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid, + union ib_gid *dgid) +{ + struct rdma_addr *addr = &cm_id->route.addr; + + if (!cm_id->device) { + if (sgid) + memset(sgid, 0, sizeof(*sgid)); + if (dgid) + memset(dgid, 0, sizeof(*dgid)); + return; + } + + if (rdma_protocol_roce(cm_id->device, cm_id->port_num)) { + if (sgid) + rdma_ip2gid((struct sockaddr *)&addr->src_addr, sgid); + if (dgid) + rdma_ip2gid((struct sockaddr *)&addr->dst_addr, dgid); + } else { + if (sgid) + rdma_addr_get_sgid(&addr->dev_addr, sgid); + if (dgid) + rdma_addr_get_dgid(&addr->dev_addr, dgid); + } +} +EXPORT_SYMBOL(rdma_read_gids); + static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event) { struct rdma_id_private *id_priv = iw_id->context; diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 3d2eed3c4e75..6538a5cc27b6 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -413,4 +413,23 @@ bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason); const void *rdma_consumer_reject_data(struct rdma_cm_id *id, struct rdma_cm_event *ev, u8 *data_len); +/** + * rdma_read_gids - Return the SGID and DGID used for establishing + * connection. This can be used after rdma_resolve_addr() + * on client side. This can be use on new connection + * on server side. This is applicable to IB, RoCE, iWarp. + * If cm_id is not bound yet to the RDMA device, it doesn't + * copy and SGID or DGID to the given pointers. + * @id: Communication identifier whose GIDs are queried. + * @sgid: Pointer to SGID where SGID will be returned. It is optional. + * @dgid: Pointer to DGID where DGID will be returned. It is optional. + * Note: This API should not be used by any new ULPs or new code. + * Instead, users interested in querying GIDs should refer to path record + * of the rdma_cm_id to query the GIDs. + * This API is provided for compatibility for existing users. + */ + +void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid, + union ib_gid *dgid); + #endif /* RDMA_CM_H */ -- cgit v1.2.3 From 7a2f64ee4a953c64474a46b10b3b89d55a92c55e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 18 Jan 2018 10:11:17 +0200 Subject: RDMA/ucma: Use rdma cm API to query GID Make use of rdma_read_gids() API to read SGID and DGID which returns correct GIDs for RoCE and other transports. rdma_addr_get_dgid() for RoCE for client side connections returns MAC address, instead of DGID. rdma_addr_get_sgid() for RoCE doesn't return correct SGID for IPv6 and when more than one IP address is assigned to the netdevice. Therefore use transport agnostic rdma_read_gids() API provided by rdma_cm module. 
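The conversion in the diff below is mechanical; stripped of context, the pattern is:

        /* Before: wrong for RoCE, where the value read this way is
         * derived from the destination MAC address rather than a DGID. */
        rdma_addr_get_dgid(&ctx->cm_id->route.addr.dev_addr,
                           (union ib_gid *)&addr->sib_addr);

        /* After: transport-aware lookup through the cm_id. */
        rdma_read_gids(ctx->cm_id, NULL, (union ib_gid *)&addr->sib_addr);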
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/ucma.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 2ec0f89035fb..d67219db99be 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -944,8 +944,8 @@ static ssize_t ucma_query_gid(struct ucma_context *ctx, } else { addr->sib_family = AF_IB; addr->sib_pkey = (__force __be16) resp.pkey; - rdma_addr_get_sgid(&ctx->cm_id->route.addr.dev_addr, - (union ib_gid *) &addr->sib_addr); + rdma_read_gids(ctx->cm_id, (union ib_gid *)&addr->sib_addr, + NULL); addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) &ctx->cm_id->route.addr.src_addr); } @@ -957,8 +957,8 @@ static ssize_t ucma_query_gid(struct ucma_context *ctx, } else { addr->sib_family = AF_IB; addr->sib_pkey = (__force __be16) resp.pkey; - rdma_addr_get_dgid(&ctx->cm_id->route.addr.dev_addr, - (union ib_gid *) &addr->sib_addr); + rdma_read_gids(ctx->cm_id, NULL, + (union ib_gid *)&addr->sib_addr); addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr); } -- cgit v1.2.3 From a2e812ea26043b115b310f3aa0cefb9be0fc83d1 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 18 Jan 2018 10:11:18 +0200 Subject: net/rds: Use rdma_read_gids to read connection GIDs Use the newly introduced rdma_read_gids() to read the SGID and DGID for the connection; it returns the GIDs correctly for the RoCE transport as well. rdma_addr_get_dgid() for RoCE for client side connections returns MAC address, instead of DGID. rdma_addr_get_sgid() for RoCE doesn't return correct SGID for IPv6 and when more than one IP address is assigned to the netdevice. Therefore use transport agnostic rdma_read_gids() API provided by rdma_cm module. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Acked-by: Santosh Shilimkar Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- net/rds/ib.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/rds/ib.c b/net/rds/ib.c index 36dd2099048a..b2a5067b4afe 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -301,13 +301,11 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); if (rds_conn_state(conn) == RDS_CONN_UP) { struct rds_ib_device *rds_ibdev; - struct rdma_dev_addr *dev_addr; ic = conn->c_transport_data; - dev_addr = &ic->i_cm_id->route.addr.dev_addr; - rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); - rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); + rdma_read_gids(ic->i_cm_id, (union ib_gid *)&iinfo->src_gid, + (union ib_gid *)&iinfo->dst_gid); rds_ibdev = ic->rds_ibdev; iinfo->max_send_wr = ic->i_send_ring.w_nr; -- cgit v1.2.3 From 15cbc5197693e138b16138b46050499e1cdebd1a Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 18 Jan 2018 10:11:19 +0200 Subject: RDMA/core: Simplify rdma_addr_get_sgid() to not support RoCE Now that all callers who care about RoCE addresses have been converted to use rdma_read_gids(), simplify rdma_addr_get_sgid() to only support real GID addresses. Callers should only use it for OPA and IB transports. The now deleted implementation for RoCE has several bugs related to IPv6 support and incorrect/inconsistent 'GID' addresses compared to the CM paths.
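After this cleanup the division of labor is: the low-level helper is a plain copy of real GID bytes, valid for IB and OPA only, while transport-agnostic callers go through the cm_id. A short sketch of the intended usage (assuming a bound cm_id):

        union ib_gid sgid;

        /* IB/OPA only: copies the raw GID bytes out of the device address. */
        rdma_addr_get_sgid(&cm_id->route.addr.dev_addr, &sgid);

        /* RoCE (and transport-agnostic) callers use the cm_id instead: */
        rdma_read_gids(cm_id, &sgid, NULL);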
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_addr.h | 33 +++++++-------------------------- 1 file changed, 7 insertions(+), 26 deletions(-) diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index dc0b642e0175..d656809f1217 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -197,34 +197,15 @@ static inline void rdma_gid2ip(struct sockaddr *out, const union ib_gid *gid) } } -static inline void iboe_addr_get_sgid(struct rdma_dev_addr *dev_addr, - union ib_gid *gid) -{ - struct net_device *dev; - struct in_device *ip4; - - dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); - if (dev) { - ip4 = in_dev_get(dev); - if (ip4 && ip4->ifa_list && ip4->ifa_list->ifa_address) - ipv6_addr_set_v4mapped(ip4->ifa_list->ifa_address, - (struct in6_addr *)gid); - - if (ip4) - in_dev_put(ip4); - - dev_put(dev); - } -} - +/* + * rdma_get/set_sgid/dgid() APIs are applicable to IB, and iWarp. + * They are not applicable to RoCE. + * RoCE GIDs are derived from the IP addresses. + */ static inline void rdma_addr_get_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) { - if (dev_addr->transport == RDMA_TRANSPORT_IB && - dev_addr->dev_type != ARPHRD_INFINIBAND) - iboe_addr_get_sgid(dev_addr, gid); - else - memcpy(gid, dev_addr->src_dev_addr + - rdma_addr_gid_offset(dev_addr), sizeof *gid); + memcpy(gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), + sizeof(*gid)); } static inline void rdma_addr_set_sgid(struct rdma_dev_addr *dev_addr, union ib_gid *gid) -- cgit v1.2.3 From 10bea9c8730477ab97ec2e1f590aff52cf8e4e0e Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 19 Jan 2018 13:07:11 +0200 Subject: RDMA/mlx5: Remove redundant allocation warning print A kmalloc() failure already generates enough information and doesn't need to be accompanied by another driver print. Fixes: d69a24e03659 ("IB/mlx5: Move IB event processing onto a workqueue") Signed-off-by: Leon Romanovsky Reviewed-by: Yuval Shaia Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index f282e281eff2..5007280321b6 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3345,19 +3345,16 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, struct mlx5_ib_event_work *work; work = kmalloc(sizeof(*work), GFP_ATOMIC); - if (work) { - INIT_WORK(&work->work, mlx5_ib_handle_event); - work->dev = dev; - work->param = param; - work->context = context; - work->event = event; - - queue_work(mlx5_ib_event_wq, &work->work); + if (!work) return; - } - dev_warn(&dev->pdev->dev, "%s: mlx5_dev_event: %d, with param: %lu dropped, couldn't allocate memory.\n", - __func__, event, param); + INIT_WORK(&work->work, mlx5_ib_handle_event); + work->dev = dev; + work->param = param; + work->context = context; + work->event = event; + + queue_work(mlx5_ib_event_wq, &work->work); } static int set_has_smi_cap(struct mlx5_ib_dev *dev) -- cgit v1.2.3 From 00db63c128dd3daf38f481371976c24d32678142 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 9 Jan 2018 15:58:54 +0200 Subject: RDMA/cma: Check existence of netdevice during port validation If a valid netdevice is not found for RoCE, the GID table should not be searched with a NULL netdevice.
Doing so causes the search routines to ignore the netdev argument and may match the wrong GID table entry if the netdev is deleted. Fixes: abae1b71dd37 ("IB/cma: cma_validate_port should verify the port and netdevice") Signed-off-by: Parav Pandit Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 66590a4c612d..61a2f3ead99b 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -624,11 +624,13 @@ static inline int cma_validate_port(struct ib_device *device, u8 port, if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) return ret; - if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) + if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { ndev = dev_get_by_index(&init_net, bound_if_index); - else + if (!ndev) + return ret; + } else { gid_type = IB_GID_TYPE_IB; - + } ret = ib_find_cached_gid_by_port(device, gid, gid_type, port, ndev, NULL); -- cgit v1.2.3 From 2493a57bc10f4f67d241711b058e42c0b0f1b31a Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 9 Jan 2018 15:58:55 +0200 Subject: RDMA/cma: Refactor to access multiple fields of rdma_dev_addr Pass the rdma_cm_id so that multiple fields of the rdma_dev_addr structure can be accessed, instead of passing each field individually. This is needed to access some additional fields in followup patches. Signed-off-by: Parav Pandit Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 61a2f3ead99b..575ee4676252 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -612,11 +612,14 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a static inline int cma_validate_port(struct ib_device *device, u8 port, enum ib_gid_type gid_type, - union ib_gid *gid, int dev_type, - int bound_if_index) + union ib_gid *gid, + struct rdma_id_private *id_priv) { - int ret = -ENODEV; + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + int bound_if_index = dev_addr->bound_dev_if; + int dev_type = dev_addr->dev_type; struct net_device *ndev = NULL; + int ret = -ENODEV; if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port)) return ret; @@ -671,8 +674,7 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv, rdma_protocol_ib(cma_dev->device, port) ? IB_GID_TYPE_IB : listen_id_priv->gid_type, gidp, - dev_addr->dev_type, - dev_addr->bound_dev_if); + id_priv); if (!ret) { id_priv->id.port_num = port; goto out; @@ -693,8 +695,7 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv, rdma_protocol_ib(cma_dev->device, port) ? IB_GID_TYPE_IB : cma_dev->default_gid_type[port - 1], - gidp, dev_addr->dev_type, - dev_addr->bound_dev_if); + gidp, id_priv); if (!ret) { id_priv->id.port_num = port; goto out; -- cgit v1.2.3 From 66c74d746db127152fadb33f75b2a8ffa4a02b27 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 9 Jan 2018 15:58:56 +0200 Subject: RDMA/cma: Update cma_validate_port to honor net namespace cma_validate_port uses rdma_dev_addr to validate the port of the cm_id. It needs to honor the net namespace, which is set up during cm_id creation, when finding the netdevice.
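For context: the namespace honored here is captured when the rdma_cm_id is created and travels with its rdma_dev_addr, so later interface lookups must use it instead of init_net. A sketch (the event handler name is hypothetical):

        struct rdma_cm_id *id;
        struct net_device *ndev;

        /* The caller's network namespace is recorded at creation time... */
        id = rdma_create_id(current->nsproxy->net_ns, my_event_handler,
                            NULL, RDMA_PS_TCP, IB_QPT_RC);

        /* ...so device lookups must search that namespace, not init_net: */
        ndev = dev_get_by_index(id->route.addr.dev_addr.net,
                                id->route.addr.dev_addr.bound_dev_if);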
Signed-off-by: Parav Pandit Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 575ee4676252..2784512f597f 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -628,7 +628,7 @@ static inline int cma_validate_port(struct ib_device *device, u8 port, return ret; if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { - ndev = dev_get_by_index(&init_net, bound_if_index); + ndev = dev_get_by_index(dev_addr->net, bound_if_index); if (!ndev) return ret; } else { -- cgit v1.2.3 From 052eac6eeb5655c52a490a49f09c55500f868558 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 9 Jan 2018 15:58:57 +0200 Subject: RDMA/cma: Update RoCE multicast routines to use net namespace rdma_dev_addr contains the net namespace pointer; when dereferencing bound_dev_if of the rdma_dev_addr, refer to the net namespace of the rdma_cm_id that is stored in the rdma_dev_addr. Signed-off-by: Parav Pandit Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 2784512f597f..84812b9dea9b 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -3930,7 +3930,7 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct net_device *ndev = - dev_get_by_index(&init_net, dev_addr->bound_dev_if); + dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); enum ib_gid_type gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - rdma_start_port(id_priv->cma_dev->device)]; @@ -4120,7 +4120,7 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY); if (dev_addr->bound_dev_if) - ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if); + ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (!ndev) { err = -ENODEV; goto out2; @@ -4238,7 +4238,7 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) struct net_device *ndev = NULL; if (dev_addr->bound_dev_if) - ndev = dev_get_by_index(&init_net, + ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (ndev) { cma_igmp_send(ndev, -- cgit v1.2.3 From 172856eac7cfaaeabad6a282d3c79d687ad69fc0 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 22 Jan 2018 14:27:11 -0800 Subject: kobject: Export kobj_ns_grab_current() and kobj_ns_drop() Make it possible to call these two functions from a kernel module. Note: despite their name, these two functions can be used meaningfully independent of kobjects. A later patch will add calls to these functions from the SRP driver because this patch series modifies the SRP driver such that it can hold a reference to a namespace that can last longer than the lifetime of the process through which the namespace reference was obtained.
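The intended pairing is straightforward: grab a long-lived reference to the current network namespace when the object is created, and drop it when the object is destroyed, which may happen long after the creating process has exited. A minimal sketch matching the SRP usage later in this series:

        /* At target creation time, in process context: */
        target->net = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);

        /* At target removal time: */
        kobj_ns_drop(KOBJ_NS_TYPE_NET, target->net);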
Signed-off-by: Bart Van Assche Acked-by: Greg Kroah-Hartman Signed-off-by: Doug Ledford --- lib/kobject.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/kobject.c b/lib/kobject.c index 763d70a18941..06b849eee0ca 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -1039,6 +1039,7 @@ void *kobj_ns_grab_current(enum kobj_ns_type type) return ns; } +EXPORT_SYMBOL_GPL(kobj_ns_grab_current); const void *kobj_ns_netlink(enum kobj_ns_type type, struct sock *sk) { @@ -1074,3 +1075,4 @@ void kobj_ns_drop(enum kobj_ns_type type, void *ns) kobj_ns_ops_tbl[type]->drop_ns(ns); spin_unlock(&kobj_ns_type_lock); } +EXPORT_SYMBOL_GPL(kobj_ns_drop); -- cgit v1.2.3 From 19f313438c7754e6cc2bceddeebeaa5132e2e0a1 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 22 Jan 2018 14:27:12 -0800 Subject: IB/srp: Add RDMA/CM support Since the SRP_LOGIN_REQ defined in the SRP standard is larger than what fits in the RDMA/CM login request private data, introduce a new login request format for the RDMA/CM. Note: since srp_daemon and ibsrpdm rely on the subnet manager and since there is no equivalent of the IB subnet manager in non-IB networks, login has to be performed manually for non-IB networks. Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srp/ib_srp.c | 701 ++++++++++++++++++++++++++++-------- drivers/infiniband/ulp/srp/ib_srp.h | 42 ++- include/scsi/srp.h | 17 + 3 files changed, 606 insertions(+), 154 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 9472f5989d27..6109f2e8f0f8 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -144,7 +145,9 @@ static void srp_remove_one(struct ib_device *device, void *client_data); static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc); static void srp_handle_qp_err(struct ib_cq *cq, struct ib_wc *wc, const char *opname); -static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event); +static int srp_ib_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event); +static int srp_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event); static struct scsi_transport_template *ib_srp_transport_template; static struct workqueue_struct *srp_remove_wq; @@ -265,8 +268,8 @@ static void srp_qp_event(struct ib_event *event, void *context) ib_event_msg(event->event), event->event); } -static int srp_init_qp(struct srp_target_port *target, - struct ib_qp *qp) +static int srp_init_ib_qp(struct srp_target_port *target, + struct ib_qp *qp) { struct ib_qp_attr *attr; int ret; @@ -277,7 +280,7 @@ static int srp_init_qp(struct srp_target_port *target, ret = ib_find_cached_pkey(target->srp_host->srp_dev->dev, target->srp_host->port, - be16_to_cpu(target->pkey), + be16_to_cpu(target->ib_cm.pkey), &attr->pkey_index); if (ret) goto out; @@ -298,32 +301,110 @@ out: return ret; } -static int srp_new_cm_id(struct srp_rdma_ch *ch) +static int srp_new_ib_cm_id(struct srp_rdma_ch *ch) { struct srp_target_port *target = ch->target; struct ib_cm_id *new_cm_id; new_cm_id = ib_create_cm_id(target->srp_host->srp_dev->dev, - srp_cm_handler, ch); + srp_ib_cm_handler, ch); if (IS_ERR(new_cm_id)) return PTR_ERR(new_cm_id); - if (ch->cm_id) - ib_destroy_cm_id(ch->cm_id); - ch->cm_id = new_cm_id; + if (ch->ib_cm.cm_id) + ib_destroy_cm_id(ch->ib_cm.cm_id); + ch->ib_cm.cm_id = new_cm_id; if (rdma_cap_opa_ah(target->srp_host->srp_dev->dev, 
target->srp_host->port)) - ch->path.rec_type = SA_PATH_REC_TYPE_OPA; + ch->ib_cm.path.rec_type = SA_PATH_REC_TYPE_OPA; else - ch->path.rec_type = SA_PATH_REC_TYPE_IB; - ch->path.sgid = target->sgid; - ch->path.dgid = target->orig_dgid; - ch->path.pkey = target->pkey; - ch->path.service_id = target->service_id; + ch->ib_cm.path.rec_type = SA_PATH_REC_TYPE_IB; + ch->ib_cm.path.sgid = target->sgid; + ch->ib_cm.path.dgid = target->ib_cm.orig_dgid; + ch->ib_cm.path.pkey = target->ib_cm.pkey; + ch->ib_cm.path.service_id = target->ib_cm.service_id; return 0; } +static const char *inet_ntop(const void *sa, char *dst, unsigned int size) +{ + switch (((struct sockaddr *)sa)->sa_family) { + case AF_INET: + snprintf(dst, size, "%pI4", + &((struct sockaddr_in *)sa)->sin_addr); + break; + case AF_INET6: + snprintf(dst, size, "%pI6", + &((struct sockaddr_in6 *)sa)->sin6_addr); + break; + default: + snprintf(dst, size, "???"); + break; + } + return dst; +} + +static int srp_new_rdma_cm_id(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + struct rdma_cm_id *new_cm_id; + char src_addr[64], dst_addr[64]; + int ret; + + new_cm_id = rdma_create_id(target->net, srp_rdma_cm_handler, ch, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(new_cm_id)) { + ret = PTR_ERR(new_cm_id); + new_cm_id = NULL; + goto out; + } + + init_completion(&ch->done); + ret = rdma_resolve_addr(new_cm_id, target->rdma_cm.src_specified ? + (struct sockaddr *)&target->rdma_cm.src : NULL, + (struct sockaddr *)&target->rdma_cm.dst, + SRP_PATH_REC_TIMEOUT_MS); + if (ret) { + pr_err("No route available from %s to %s (%d)\n", + target->rdma_cm.src_specified ? + inet_ntop(&target->rdma_cm.src, src_addr, + sizeof(src_addr)) : "(any)", + inet_ntop(&target->rdma_cm.dst, dst_addr, + sizeof(dst_addr)), + ret); + goto out; + } + ret = wait_for_completion_interruptible(&ch->done); + if (ret < 0) + goto out; + + ret = ch->status; + if (ret) { + pr_err("Resolving address %s failed (%d)\n", + inet_ntop(&target->rdma_cm.dst, dst_addr, + sizeof(dst_addr)), + ret); + goto out; + } + + swap(ch->rdma_cm.cm_id, new_cm_id); + +out: + if (new_cm_id) + rdma_destroy_id(new_cm_id); + + return ret; +} + +static int srp_new_cm_id(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + + return target->using_rdma_cm ? 
srp_new_rdma_cm_id(ch) : + srp_new_ib_cm_id(ch); +} + static struct ib_fmr_pool *srp_alloc_fmr_pool(struct srp_target_port *target) { struct srp_device *dev = target->srp_host->srp_dev; @@ -521,16 +602,25 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) init_attr->send_cq = send_cq; init_attr->recv_cq = recv_cq; - qp = ib_create_qp(dev->pd, init_attr); - if (IS_ERR(qp)) { - ret = PTR_ERR(qp); + if (target->using_rdma_cm) { + ret = rdma_create_qp(ch->rdma_cm.cm_id, dev->pd, init_attr); + qp = ch->rdma_cm.cm_id->qp; + } else { + qp = ib_create_qp(dev->pd, init_attr); + if (!IS_ERR(qp)) { + ret = srp_init_ib_qp(target, qp); + if (ret) + ib_destroy_qp(qp); + } else { + ret = PTR_ERR(qp); + } + } + if (ret) { + pr_err("QP creation failed for dev %s: %d\n", + dev_name(&dev->dev->dev), ret); goto err_send_cq; } - ret = srp_init_qp(target, qp); - if (ret) - goto err_qp; - if (dev->use_fast_reg) { fr_pool = srp_alloc_fr_pool(target); if (IS_ERR(fr_pool)) { @@ -574,7 +664,10 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) return 0; err_qp: - ib_destroy_qp(qp); + if (target->using_rdma_cm) + rdma_destroy_qp(ch->rdma_cm.cm_id); + else + ib_destroy_qp(qp); err_send_cq: ib_free_cq(send_cq); @@ -600,9 +693,16 @@ static void srp_free_ch_ib(struct srp_target_port *target, if (!ch->target) return; - if (ch->cm_id) { - ib_destroy_cm_id(ch->cm_id); - ch->cm_id = NULL; + if (target->using_rdma_cm) { + if (ch->rdma_cm.cm_id) { + rdma_destroy_id(ch->rdma_cm.cm_id); + ch->rdma_cm.cm_id = NULL; + } + } else { + if (ch->ib_cm.cm_id) { + ib_destroy_cm_id(ch->ib_cm.cm_id); + ch->ib_cm.cm_id = NULL; + } } /* If srp_new_cm_id() succeeded but srp_create_ch_ib() not, return. */ @@ -658,16 +758,16 @@ static void srp_path_rec_completion(int status, shost_printk(KERN_ERR, target->scsi_host, PFX "Got failed path rec status %d\n", status); else - ch->path = *pathrec; + ch->ib_cm.path = *pathrec; complete(&ch->done); } -static int srp_lookup_path(struct srp_rdma_ch *ch) +static int srp_ib_lookup_path(struct srp_rdma_ch *ch) { struct srp_target_port *target = ch->target; int ret = -ENODEV; - ch->path.numb_path = 1; + ch->ib_cm.path.numb_path = 1; init_completion(&ch->done); @@ -678,10 +778,10 @@ static int srp_lookup_path(struct srp_rdma_ch *ch) if (!scsi_host_get(target->scsi_host)) goto out; - ch->path_query_id = ib_sa_path_rec_get(&srp_sa_client, + ch->ib_cm.path_query_id = ib_sa_path_rec_get(&srp_sa_client, target->srp_host->srp_dev->dev, target->srp_host->port, - &ch->path, + &ch->ib_cm.path, IB_SA_PATH_REC_SERVICE_ID | IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | @@ -690,8 +790,8 @@ static int srp_lookup_path(struct srp_rdma_ch *ch) SRP_PATH_REC_TIMEOUT_MS, GFP_KERNEL, srp_path_rec_completion, - ch, &ch->path_query); - ret = ch->path_query_id; + ch, &ch->ib_cm.path_query); + ret = ch->ib_cm.path_query_id; if (ret < 0) goto put; @@ -703,9 +803,9 @@ static int srp_lookup_path(struct srp_rdma_ch *ch) if (ret < 0) shost_printk(KERN_WARNING, target->scsi_host, PFX "Path record query failed: sgid %pI6, dgid %pI6, pkey %#04x, service_id %#16llx\n", - ch->path.sgid.raw, ch->path.dgid.raw, - be16_to_cpu(target->pkey), - be64_to_cpu(target->service_id)); + ch->ib_cm.path.sgid.raw, ch->ib_cm.path.dgid.raw, + be16_to_cpu(target->ib_cm.pkey), + be64_to_cpu(target->ib_cm.service_id)); put: scsi_host_put(target->scsi_host); @@ -714,6 +814,34 @@ out: return ret; } +static int srp_rdma_lookup_path(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + int ret; + + init_completion(&ch->done); + + ret = 
rdma_resolve_route(ch->rdma_cm.cm_id, SRP_PATH_REC_TIMEOUT_MS); + if (ret) + return ret; + + wait_for_completion_interruptible(&ch->done); + + if (ch->status != 0) + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Path resolution failed\n"); + + return ch->status; +} + +static int srp_lookup_path(struct srp_rdma_ch *ch) +{ + struct srp_target_port *target = ch->target; + + return target->using_rdma_cm ? srp_rdma_lookup_path(ch) : + srp_ib_lookup_path(ch); +} + static u8 srp_get_subnet_timeout(struct srp_host *host) { struct ib_port_attr attr; @@ -735,8 +863,10 @@ static int srp_send_req(struct srp_rdma_ch *ch, bool multich) { struct srp_target_port *target = ch->target; struct { - struct ib_cm_req_param param; - struct srp_login_req priv; + struct rdma_conn_param rdma_param; + struct srp_login_req_rdma rdma_req; + struct ib_cm_req_param ib_param; + struct srp_login_req ib_req; } *req = NULL; char *ipi, *tpi; int status; @@ -745,44 +875,62 @@ static int srp_send_req(struct srp_rdma_ch *ch, bool multich) if (!req) return -ENOMEM; - req->param.flow_control = 1; - req->param.retry_count = target->tl_retry_count; + req->ib_param.flow_control = 1; + req->ib_param.retry_count = target->tl_retry_count; /* * Pick some arbitrary defaults here; we could make these * module parameters if anyone cared about setting them. */ - req->param.responder_resources = 4; - req->param.rnr_retry_count = 7; - req->param.max_cm_retries = 15; - - req->priv.opcode = SRP_LOGIN_REQ; - req->priv.tag = 0; - req->priv.req_it_iu_len = cpu_to_be32(target->max_iu_len); - req->priv.req_buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT | + req->ib_param.responder_resources = 4; + req->ib_param.rnr_retry_count = 7; + req->ib_param.max_cm_retries = 15; + + req->ib_req.opcode = SRP_LOGIN_REQ; + req->ib_req.tag = 0; + req->ib_req.req_it_iu_len = cpu_to_be32(target->max_iu_len); + req->ib_req.req_buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT | SRP_BUF_FORMAT_INDIRECT); - req->priv.req_flags = (multich ? SRP_MULTICHAN_MULTI : - SRP_MULTICHAN_SINGLE); - - { + req->ib_req.req_flags = (multich ? 
SRP_MULTICHAN_MULTI : + SRP_MULTICHAN_SINGLE); + + if (target->using_rdma_cm) { + req->rdma_param.flow_control = req->ib_param.flow_control; + req->rdma_param.responder_resources = + req->ib_param.responder_resources; + req->rdma_param.initiator_depth = req->ib_param.initiator_depth; + req->rdma_param.retry_count = req->ib_param.retry_count; + req->rdma_param.rnr_retry_count = req->ib_param.rnr_retry_count; + req->rdma_param.private_data = &req->rdma_req; + req->rdma_param.private_data_len = sizeof(req->rdma_req); + + req->rdma_req.opcode = req->ib_req.opcode; + req->rdma_req.tag = req->ib_req.tag; + req->rdma_req.req_it_iu_len = req->ib_req.req_it_iu_len; + req->rdma_req.req_buf_fmt = req->ib_req.req_buf_fmt; + req->rdma_req.req_flags = req->ib_req.req_flags; + + ipi = req->rdma_req.initiator_port_id; + tpi = req->rdma_req.target_port_id; + } else { u8 subnet_timeout; subnet_timeout = srp_get_subnet_timeout(target->srp_host); - req->param.primary_path = &ch->path; - req->param.alternate_path = NULL; - req->param.service_id = target->service_id; - get_random_bytes(&req->param.starting_psn, 4); - req->param.starting_psn &= 0xffffff; - req->param.qp_num = ch->qp->qp_num; - req->param.qp_type = ch->qp->qp_type; - req->param.local_cm_response_timeout = subnet_timeout + 2; - req->param.remote_cm_response_timeout = subnet_timeout + 2; - req->param.private_data = &req->priv; - req->param.private_data_len = sizeof(req->priv); - - ipi = req->priv.initiator_port_id; - tpi = req->priv.target_port_id; + req->ib_param.primary_path = &ch->ib_cm.path; + req->ib_param.alternate_path = NULL; + req->ib_param.service_id = target->ib_cm.service_id; + get_random_bytes(&req->ib_param.starting_psn, 4); + req->ib_param.starting_psn &= 0xffffff; + req->ib_param.qp_num = ch->qp->qp_num; + req->ib_param.qp_type = ch->qp->qp_type; + req->ib_param.local_cm_response_timeout = subnet_timeout + 2; + req->ib_param.remote_cm_response_timeout = subnet_timeout + 2; + req->ib_param.private_data = &req->ib_req; + req->ib_param.private_data_len = sizeof(req->ib_req); + + ipi = req->ib_req.initiator_port_id; + tpi = req->ib_req.target_port_id; } /* @@ -820,7 +968,10 @@ static int srp_send_req(struct srp_rdma_ch *ch, bool multich) memcpy(ipi + 8, &target->srp_host->srp_dev->dev->node_guid, 8); } - status = ib_send_cm_req(ch->cm_id, &req->param); + if (target->using_rdma_cm) + status = rdma_connect(ch->rdma_cm.cm_id, &req->rdma_param); + else + status = ib_send_cm_req(ch->ib_cm.cm_id, &req->ib_param); kfree(req); @@ -847,14 +998,23 @@ static bool srp_queue_remove_work(struct srp_target_port *target) static void srp_disconnect_target(struct srp_target_port *target) { struct srp_rdma_ch *ch; - int i; + int i, ret; /* XXX should send SRP_I_LOGOUT request */ for (i = 0; i < target->ch_count; i++) { ch = &target->ch[i]; ch->connected = false; - if (ch->cm_id && ib_send_cm_dreq(ch->cm_id, NULL, 0)) { + ret = 0; + if (target->using_rdma_cm) { + if (ch->rdma_cm.cm_id) + rdma_disconnect(ch->rdma_cm.cm_id); + } else { + if (ch->ib_cm.cm_id) + ret = ib_send_cm_dreq(ch->ib_cm.cm_id, + NULL, 0); + } + if (ret < 0) { shost_printk(KERN_DEBUG, target->scsi_host, PFX "Sending CM DREQ failed\n"); } @@ -968,6 +1128,7 @@ static void srp_remove_target(struct srp_target_port *target) scsi_remove_host(target->scsi_host); srp_stop_rport_timers(target->rport); srp_disconnect_target(target); + kobj_ns_drop(KOBJ_NS_TYPE_NET, target->net); for (i = 0; i < target->ch_count; i++) { ch = &target->ch[i]; srp_free_ch_ib(target, ch); @@ -2355,7 +2516,7 @@ static 
void srp_cm_rep_handler(struct ib_cm_id *cm_id, struct srp_target_port *target = ch->target; struct ib_qp_attr *qp_attr = NULL; int attr_mask = 0; - int ret; + int ret = 0; int i; if (lrsp->opcode == SRP_LOGIN_RSP) { @@ -2385,40 +2546,42 @@ static void srp_cm_rep_handler(struct ib_cm_id *cm_id, goto error; } - ret = -ENOMEM; - qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL); - if (!qp_attr) - goto error; - - qp_attr->qp_state = IB_QPS_RTR; - ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); - if (ret) - goto error_free; - - ret = ib_modify_qp(ch->qp, qp_attr, attr_mask); - if (ret) - goto error_free; - for (i = 0; i < target->queue_size; i++) { struct srp_iu *iu = ch->rx_ring[i]; ret = srp_post_recv(ch, iu); if (ret) - goto error_free; + goto error; } - qp_attr->qp_state = IB_QPS_RTS; - ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); - if (ret) - goto error_free; + if (!target->using_rdma_cm) { + ret = -ENOMEM; + qp_attr = kmalloc(sizeof(*qp_attr), GFP_KERNEL); + if (!qp_attr) + goto error; + + qp_attr->qp_state = IB_QPS_RTR; + ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); + if (ret) + goto error_free; - target->rq_tmo_jiffies = srp_compute_rq_tmo(qp_attr, attr_mask); + ret = ib_modify_qp(ch->qp, qp_attr, attr_mask); + if (ret) + goto error_free; - ret = ib_modify_qp(ch->qp, qp_attr, attr_mask); - if (ret) - goto error_free; + qp_attr->qp_state = IB_QPS_RTS; + ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask); + if (ret) + goto error_free; + + target->rq_tmo_jiffies = srp_compute_rq_tmo(qp_attr, attr_mask); + + ret = ib_modify_qp(ch->qp, qp_attr, attr_mask); + if (ret) + goto error_free; - ret = ib_send_cm_rtu(cm_id, NULL, 0); + ret = ib_send_cm_rtu(cm_id, NULL, 0); + } error_free: kfree(qp_attr); @@ -2427,41 +2590,43 @@ error: ch->status = ret; } -static void srp_cm_rej_handler(struct ib_cm_id *cm_id, - struct ib_cm_event *event, - struct srp_rdma_ch *ch) +static void srp_ib_cm_rej_handler(struct ib_cm_id *cm_id, + struct ib_cm_event *event, + struct srp_rdma_ch *ch) { struct srp_target_port *target = ch->target; struct Scsi_Host *shost = target->scsi_host; struct ib_class_port_info *cpi; int opcode; + u16 dlid; switch (event->param.rej_rcvd.reason) { case IB_CM_REJ_PORT_CM_REDIRECT: cpi = event->param.rej_rcvd.ari; - sa_path_set_dlid(&ch->path, ntohs(cpi->redirect_lid)); - ch->path.pkey = cpi->redirect_pkey; + dlid = be16_to_cpu(cpi->redirect_lid); + sa_path_set_dlid(&ch->ib_cm.path, dlid); + ch->ib_cm.path.pkey = cpi->redirect_pkey; cm_id->remote_cm_qpn = be32_to_cpu(cpi->redirect_qp) & 0x00ffffff; - memcpy(ch->path.dgid.raw, cpi->redirect_gid, 16); + memcpy(ch->ib_cm.path.dgid.raw, cpi->redirect_gid, 16); - ch->status = sa_path_get_dlid(&ch->path) ? - SRP_DLID_REDIRECT : SRP_PORT_REDIRECT; + ch->status = dlid ? SRP_DLID_REDIRECT : SRP_PORT_REDIRECT; break; case IB_CM_REJ_PORT_REDIRECT: if (srp_target_is_topspin(target)) { + union ib_gid *dgid = &ch->ib_cm.path.dgid; + /* * Topspin/Cisco SRP gateways incorrectly send * reject reason code 25 when they mean 24 * (port redirect). 
*/ - memcpy(ch->path.dgid.raw, - event->param.rej_rcvd.ari, 16); + memcpy(dgid->raw, event->param.rej_rcvd.ari, 16); shost_printk(KERN_DEBUG, shost, PFX "Topspin/Cisco redirect to target port GID %016llx%016llx\n", - be64_to_cpu(ch->path.dgid.global.subnet_prefix), - be64_to_cpu(ch->path.dgid.global.interface_id)); + be64_to_cpu(dgid->global.subnet_prefix), + be64_to_cpu(dgid->global.interface_id)); ch->status = SRP_PORT_REDIRECT; } else { @@ -2490,7 +2655,8 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id, shost_printk(KERN_WARNING, shost, PFX "SRP LOGIN from %pI6 to %pI6 REJECTED, reason 0x%08x\n", target->sgid.raw, - target->orig_dgid.raw, reason); + target->ib_cm.orig_dgid.raw, + reason); } else shost_printk(KERN_WARNING, shost, " REJ reason: IB_CM_REJ_CONSUMER_DEFINED," @@ -2510,7 +2676,7 @@ static void srp_cm_rej_handler(struct ib_cm_id *cm_id, } } -static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) +static int srp_ib_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) { struct srp_rdma_ch *ch = cm_id->context; struct srp_target_port *target = ch->target; @@ -2533,7 +2699,7 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) shost_printk(KERN_DEBUG, target->scsi_host, PFX "REJ received\n"); comp = 1; - srp_cm_rej_handler(cm_id, event, ch); + srp_ib_cm_rej_handler(cm_id, event, ch); break; case IB_CM_DREQ_RECEIVED: @@ -2571,6 +2737,135 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) return 0; } +static void srp_rdma_cm_rej_handler(struct srp_rdma_ch *ch, + struct rdma_cm_event *event) +{ + struct srp_target_port *target = ch->target; + struct Scsi_Host *shost = target->scsi_host; + int opcode; + + switch (event->status) { + case IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID: + shost_printk(KERN_WARNING, shost, + " REJ reason: IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID\n"); + ch->status = -ECONNRESET; + break; + + case IB_CM_REJ_CONSUMER_DEFINED: + opcode = *(u8 *) event->param.conn.private_data; + if (opcode == SRP_LOGIN_REJ) { + struct srp_login_rej *rej = + (struct srp_login_rej *) + event->param.conn.private_data; + u32 reason = be32_to_cpu(rej->reason); + + if (reason == SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE) + shost_printk(KERN_WARNING, shost, + PFX "SRP_LOGIN_REJ: requested max_it_iu_len too large\n"); + else + shost_printk(KERN_WARNING, shost, + PFX "SRP LOGIN REJECTED, reason 0x%08x\n", reason); + } else { + shost_printk(KERN_WARNING, shost, + " REJ reason: IB_CM_REJ_CONSUMER_DEFINED, opcode 0x%02x\n", + opcode); + } + ch->status = -ECONNRESET; + break; + + case IB_CM_REJ_STALE_CONN: + shost_printk(KERN_WARNING, shost, + " REJ reason: stale connection\n"); + ch->status = SRP_STALE_CONN; + break; + + default: + shost_printk(KERN_WARNING, shost, " REJ reason 0x%x\n", + event->status); + ch->status = -ECONNRESET; + break; + } +} + +static int srp_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct srp_rdma_ch *ch = cm_id->context; + struct srp_target_port *target = ch->target; + int comp = 0; + + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + ch->status = 0; + comp = 1; + break; + + case RDMA_CM_EVENT_ADDR_ERROR: + ch->status = -ENXIO; + comp = 1; + break; + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + ch->status = 0; + comp = 1; + break; + + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + ch->status = -EHOSTUNREACH; + comp = 1; + break; + + case RDMA_CM_EVENT_CONNECT_ERROR: + shost_printk(KERN_DEBUG, target->scsi_host, + PFX "Sending 
CM REQ failed\n"); + comp = 1; + ch->status = -ECONNRESET; + break; + + case RDMA_CM_EVENT_ESTABLISHED: + comp = 1; + srp_cm_rep_handler(NULL, event->param.conn.private_data, ch); + break; + + case RDMA_CM_EVENT_REJECTED: + shost_printk(KERN_DEBUG, target->scsi_host, PFX "REJ received\n"); + comp = 1; + + srp_rdma_cm_rej_handler(ch, event); + break; + + case RDMA_CM_EVENT_DISCONNECTED: + if (ch->connected) { + shost_printk(KERN_WARNING, target->scsi_host, + PFX "received DREQ\n"); + rdma_disconnect(ch->rdma_cm.cm_id); + comp = 1; + ch->status = 0; + queue_work(system_long_wq, &target->tl_err_work); + } + break; + + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + shost_printk(KERN_ERR, target->scsi_host, + PFX "connection closed\n"); + + comp = 1; + ch->status = 0; + break; + + default: + shost_printk(KERN_WARNING, target->scsi_host, + PFX "Unhandled CM event %d\n", event->event); + break; + } + + if (comp) + complete(&ch->done); + + return 0; +} + /** * srp_change_queue_depth - setting device queue depth * @sdev: scsi device struct @@ -2772,7 +3067,10 @@ static ssize_t show_service_id(struct device *dev, { struct srp_target_port *target = host_to_target(class_to_shost(dev)); - return sprintf(buf, "0x%016llx\n", be64_to_cpu(target->service_id)); + if (target->using_rdma_cm) + return -ENOENT; + return sprintf(buf, "0x%016llx\n", + be64_to_cpu(target->ib_cm.service_id)); } static ssize_t show_pkey(struct device *dev, struct device_attribute *attr, @@ -2780,7 +3078,9 @@ static ssize_t show_pkey(struct device *dev, struct device_attribute *attr, { struct srp_target_port *target = host_to_target(class_to_shost(dev)); - return sprintf(buf, "0x%04x\n", be16_to_cpu(target->pkey)); + if (target->using_rdma_cm) + return -ENOENT; + return sprintf(buf, "0x%04x\n", be16_to_cpu(target->ib_cm.pkey)); } static ssize_t show_sgid(struct device *dev, struct device_attribute *attr, @@ -2797,7 +3097,9 @@ static ssize_t show_dgid(struct device *dev, struct device_attribute *attr, struct srp_target_port *target = host_to_target(class_to_shost(dev)); struct srp_rdma_ch *ch = &target->ch[0]; - return sprintf(buf, "%pI6\n", ch->path.dgid.raw); + if (target->using_rdma_cm) + return -ENOENT; + return sprintf(buf, "%pI6\n", ch->ib_cm.path.dgid.raw); } static ssize_t show_orig_dgid(struct device *dev, @@ -2805,7 +3107,9 @@ static ssize_t show_orig_dgid(struct device *dev, { struct srp_target_port *target = host_to_target(class_to_shost(dev)); - return sprintf(buf, "%pI6\n", target->orig_dgid.raw); + if (target->using_rdma_cm) + return -ENOENT; + return sprintf(buf, "%pI6\n", target->ib_cm.orig_dgid.raw); } static ssize_t show_req_lim(struct device *dev, @@ -3050,6 +3354,9 @@ static bool srp_conn_unique(struct srp_host *host, if (t != target && target->id_ext == t->id_ext && target->ioc_guid == t->ioc_guid && + (!target->using_rdma_cm || + memcmp(&target->rdma_cm.dst, &t->rdma_cm.dst, + sizeof(target->rdma_cm.dst)) == 0) && target->initiator_ext == t->initiator_ext) { ret = false; break; @@ -3066,6 +3373,9 @@ out: * * id_ext=,ioc_guid=,dgid=, * pkey=,service_id= + * or + * id_ext=,ioc_guid=, + * [src=,]dest=: * * to the add_target sysfs attribute. 
*/ @@ -3086,11 +3396,19 @@ enum { SRP_OPT_COMP_VECTOR = 1 << 12, SRP_OPT_TL_RETRY_COUNT = 1 << 13, SRP_OPT_QUEUE_SIZE = 1 << 14, - SRP_OPT_ALL = (SRP_OPT_ID_EXT | - SRP_OPT_IOC_GUID | - SRP_OPT_DGID | - SRP_OPT_PKEY | - SRP_OPT_SERVICE_ID), + SRP_OPT_IP_SRC = 1 << 15, + SRP_OPT_IP_DEST = 1 << 16, +}; + +static unsigned int srp_opt_mandatory[] = { + SRP_OPT_ID_EXT | + SRP_OPT_IOC_GUID | + SRP_OPT_DGID | + SRP_OPT_PKEY | + SRP_OPT_SERVICE_ID, + SRP_OPT_ID_EXT | + SRP_OPT_IOC_GUID | + SRP_OPT_IP_DEST, }; static const match_table_t srp_opt_tokens = { @@ -3109,10 +3427,28 @@ static const match_table_t srp_opt_tokens = { { SRP_OPT_COMP_VECTOR, "comp_vector=%u" }, { SRP_OPT_TL_RETRY_COUNT, "tl_retry_count=%u" }, { SRP_OPT_QUEUE_SIZE, "queue_size=%d" }, + { SRP_OPT_IP_SRC, "src=%s" }, + { SRP_OPT_IP_DEST, "dest=%s" }, { SRP_OPT_ERR, NULL } }; -static int srp_parse_options(const char *buf, struct srp_target_port *target) +static int srp_parse_in(struct net *net, struct sockaddr_storage *sa, + const char *addr_port_str) +{ + char *addr = kstrdup(addr_port_str, GFP_KERNEL); + char *port_str = addr; + int ret; + + if (!addr) + return -ENOMEM; + strsep(&port_str, ":"); + ret = inet_pton_with_scope(net, AF_UNSPEC, addr, port_str, sa); + kfree(addr); + return ret; +} + +static int srp_parse_options(struct net *net, const char *buf, + struct srp_target_port *target) { char *options, *sep_opt; char *p; @@ -3180,7 +3516,7 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) goto out; } - ret = hex2bin(target->orig_dgid.raw, p, 16); + ret = hex2bin(target->ib_cm.orig_dgid.raw, p, 16); kfree(p); if (ret < 0) goto out; @@ -3191,7 +3527,7 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) pr_warn("bad P_Key parameter '%s'\n", p); goto out; } - target->pkey = cpu_to_be16(token); + target->ib_cm.pkey = cpu_to_be16(token); break; case SRP_OPT_SERVICE_ID: @@ -3206,7 +3542,39 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) kfree(p); goto out; } - target->service_id = cpu_to_be64(ull); + target->ib_cm.service_id = cpu_to_be64(ull); + kfree(p); + break; + + case SRP_OPT_IP_SRC: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + ret = srp_parse_in(net, &target->rdma_cm.src.ss, p); + if (ret < 0) { + pr_warn("bad source parameter '%s'\n", p); + kfree(p); + goto out; + } + target->rdma_cm.src_specified = true; + kfree(p); + break; + + case SRP_OPT_IP_DEST: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + ret = srp_parse_in(net, &target->rdma_cm.dst.ss, p); + if (ret < 0) { + pr_warn("bad dest parameter '%s'\n", p); + kfree(p); + goto out; + } + target->using_rdma_cm = true; kfree(p); break; @@ -3321,14 +3689,14 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) } } - if ((opt_mask & SRP_OPT_ALL) == SRP_OPT_ALL) - ret = 0; - else - for (i = 0; i < ARRAY_SIZE(srp_opt_tokens); ++i) - if ((srp_opt_tokens[i].token & SRP_OPT_ALL) && - !(srp_opt_tokens[i].token & opt_mask)) - pr_warn("target creation request is missing parameter '%s'\n", - srp_opt_tokens[i].pattern); + for (i = 0; i < ARRAY_SIZE(srp_opt_mandatory); i++) { + if ((opt_mask & srp_opt_mandatory[i]) == srp_opt_mandatory[i]) { + ret = 0; + break; + } + } + if (ret) + pr_warn("target creation request is missing one or more parameters\n"); if (target->scsi_host->cmd_per_lun > target->scsi_host->can_queue && (opt_mask & SRP_OPT_MAX_CMD_PER_LUN)) @@ -3369,6 +3737,7 @@ static ssize_t srp_create_target(struct 
device *dev, target = host_to_target(target_host); + target->net = kobj_ns_grab_current(KOBJ_NS_TYPE_NET); target->io_class = SRP_REV16A_IB_IO_CLASS; target->scsi_host = target_host; target->srp_host = host; @@ -3390,18 +3759,29 @@ static ssize_t srp_create_target(struct device *dev, if (ret < 0) goto put; - ret = srp_parse_options(buf, target); + ret = srp_parse_options(target->net, buf, target); if (ret) goto out; target->req_ring_size = target->queue_size - SRP_TSK_MGMT_SQ_SIZE; if (!srp_conn_unique(target->srp_host, target)) { - shost_printk(KERN_INFO, target->scsi_host, - PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;initiator_ext=%016llx\n", - be64_to_cpu(target->id_ext), - be64_to_cpu(target->ioc_guid), - be64_to_cpu(target->initiator_ext)); + if (target->using_rdma_cm) { + char dst_addr[64]; + + shost_printk(KERN_INFO, target->scsi_host, + PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;dest=%s\n", + be64_to_cpu(target->id_ext), + be64_to_cpu(target->ioc_guid), + inet_ntop(&target->rdma_cm.dst, dst_addr, + sizeof(dst_addr))); + } else { + shost_printk(KERN_INFO, target->scsi_host, + PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;initiator_ext=%016llx\n", + be64_to_cpu(target->id_ext), + be64_to_cpu(target->ioc_guid), + be64_to_cpu(target->initiator_ext)); + } ret = -EEXIST; goto out; } @@ -3502,11 +3882,18 @@ static ssize_t srp_create_target(struct device *dev, ret = srp_connect_ch(ch, multich); if (ret) { + char dst[64]; + + if (target->using_rdma_cm) + inet_ntop(&target->rdma_cm.dst, dst, + sizeof(dst)); + else + snprintf(dst, sizeof(dst), "%pI6", + target->ib_cm.orig_dgid.raw); shost_printk(KERN_ERR, target->scsi_host, - PFX "Connection %d/%d to %pI6 failed\n", + PFX "Connection %d/%d to %s failed\n", ch_start + cpu_idx, - target->ch_count, - ch->target->orig_dgid.raw); + target->ch_count, dst); if (node_idx == 0 && cpu_idx == 0) { goto free_ch; } else { @@ -3531,13 +3918,25 @@ connected: goto err_disconnect; if (target->state != SRP_TARGET_REMOVED) { - shost_printk(KERN_DEBUG, target->scsi_host, PFX - "new target: id_ext %016llx ioc_guid %016llx pkey %04x service_id %016llx sgid %pI6 dgid %pI6\n", - be64_to_cpu(target->id_ext), - be64_to_cpu(target->ioc_guid), - be16_to_cpu(target->pkey), - be64_to_cpu(target->service_id), - target->sgid.raw, target->orig_dgid.raw); + if (target->using_rdma_cm) { + char dst[64]; + + inet_ntop(&target->rdma_cm.dst, dst, sizeof(dst)); + shost_printk(KERN_DEBUG, target->scsi_host, PFX + "new target: id_ext %016llx ioc_guid %016llx sgid %pI6 dest %s\n", + be64_to_cpu(target->id_ext), + be64_to_cpu(target->ioc_guid), + target->sgid.raw, dst); + } else { + shost_printk(KERN_DEBUG, target->scsi_host, PFX + "new target: id_ext %016llx ioc_guid %016llx pkey %04x service_id %016llx sgid %pI6 dgid %pI6\n", + be64_to_cpu(target->id_ext), + be64_to_cpu(target->ioc_guid), + be16_to_cpu(target->ib_cm.pkey), + be64_to_cpu(target->ib_cm.service_id), + target->sgid.raw, + target->ib_cm.orig_dgid.raw); + } } ret = count; @@ -3547,8 +3946,16 @@ out: put: scsi_host_put(target->scsi_host); - if (ret < 0) + if (ret < 0) { + /* + * If a call to srp_remove_target() has not been scheduled, + * drop the network namespace reference now that was obtained + * earlier in this function. 
+ */ + if (target->state != SRP_TARGET_REMOVED) + kobj_ns_drop(KOBJ_NS_TYPE_NET, target->net); scsi_host_put(target->scsi_host); + } return ret; diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index a814f5ef16f9..911fc1582f61 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -45,6 +45,7 @@ #include #include #include +#include enum { SRP_PATH_REC_TIMEOUT_MS = 1000, @@ -153,11 +154,18 @@ struct srp_rdma_ch { struct completion done; int status; - struct sa_path_rec path; - struct ib_sa_query *path_query; - int path_query_id; + union { + struct ib_cm { + struct sa_path_rec path; + struct ib_sa_query *path_query; + int path_query_id; + struct ib_cm_id *cm_id; + } ib_cm; + struct rdma_cm { + struct rdma_cm_id *cm_id; + } rdma_cm; + }; - struct ib_cm_id *cm_id; struct srp_iu **tx_ring; struct srp_iu **rx_ring; struct srp_request *req_ring; @@ -182,6 +190,7 @@ struct srp_target_port { /* read only in the hot path */ u32 global_rkey; struct srp_rdma_ch *ch; + struct net *net; u32 ch_count; u32 lkey; enum srp_target_state state; @@ -194,7 +203,6 @@ struct srp_target_port { union ib_gid sgid; __be64 id_ext; __be64 ioc_guid; - __be64 service_id; __be64 initiator_ext; u16 io_class; struct srp_host *srp_host; @@ -210,8 +218,28 @@ struct srp_target_port { int comp_vector; int tl_retry_count; - union ib_gid orig_dgid; - __be16 pkey; + bool using_rdma_cm; + + union { + struct { + __be64 service_id; + union ib_gid orig_dgid; + __be16 pkey; + } ib_cm; + struct { + union { + struct sockaddr_in ip4; + struct sockaddr_in6 ip6; + struct sockaddr_storage ss; + } src; + union { + struct sockaddr_in ip4; + struct sockaddr_in6 ip6; + struct sockaddr_storage ss; + } dst; + bool src_specified; + } rdma_cm; + }; u32 rq_tmo_jiffies; diff --git a/include/scsi/srp.h b/include/scsi/srp.h index 5be834de491a..c16a3c9a4d9b 100644 --- a/include/scsi/srp.h +++ b/include/scsi/srp.h @@ -129,6 +129,23 @@ struct srp_login_req { u8 target_port_id[16]; }; +/** + * struct srp_login_req_rdma - RDMA/CM login parameters. + * + * RDMA/CM over InfiniBand can only carry 92 - 36 = 56 bytes of private + * data. The %srp_login_req_rdma structure contains the same information as + * %srp_login_req but with the reserved data removed. + */ +struct srp_login_req_rdma { + u64 tag; + __be16 req_buf_fmt; + u8 req_flags; + u8 opcode; + __be32 req_it_iu_len; + u8 initiator_port_id[16]; + u8 target_port_id[16]; +}; + /* * The SRP spec defines the size of the LOGIN_RSP structure to be 52 * bytes, so it needs to be packed to avoid having it padded to 56 -- cgit v1.2.3 From b0780ee56ee264cfdfee46023e02386282041aad Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 22 Jan 2018 14:27:13 -0800 Subject: IB/srp: Add target_can_queue login parameter Although I'm not sure this parameter is useful for regular SRP users, setting this parameter to 1 has shown to be invaluable for testing the block layer core, SCSI core and device mapper queue running mechanisms. 
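For example (hypothetical identifiers and address), a test login string that limits each SCSI target to a single outstanding command would be:

    id_ext=200100e08b00d0d0,ioc_guid=0002c90200402bd4,dest=192.168.1.1:5555,target_can_queue=1

written to the add_target sysfs attribute described earlier.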
Signed-off-by: Bart Van Assche Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srp/ib_srp.c | 22 ++++++++++++++++++++++ drivers/infiniband/ulp/srp/ib_srp.h | 1 + 2 files changed, 23 insertions(+) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 6109f2e8f0f8..b48843833d69 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -3018,6 +3018,16 @@ static int srp_reset_host(struct scsi_cmnd *scmnd) return srp_reconnect_rport(target->rport) == 0 ? SUCCESS : FAILED; } +static int srp_target_alloc(struct scsi_target *starget) +{ + struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); + struct srp_target_port *target = host_to_target(shost); + + if (target->target_can_queue) + starget->can_queue = target->target_can_queue; + return 0; +} + static int srp_slave_alloc(struct scsi_device *sdev) { struct Scsi_Host *shost = sdev->host; @@ -3231,6 +3241,7 @@ static struct scsi_host_template srp_template = { .module = THIS_MODULE, .name = "InfiniBand SRP initiator", .proc_name = DRV_NAME, + .target_alloc = srp_target_alloc, .slave_alloc = srp_slave_alloc, .slave_configure = srp_slave_configure, .info = srp_target_info, @@ -3398,6 +3409,7 @@ enum { SRP_OPT_QUEUE_SIZE = 1 << 14, SRP_OPT_IP_SRC = 1 << 15, SRP_OPT_IP_DEST = 1 << 16, + SRP_OPT_TARGET_CAN_QUEUE= 1 << 17, }; static unsigned int srp_opt_mandatory[] = { @@ -3419,6 +3431,7 @@ static const match_table_t srp_opt_tokens = { { SRP_OPT_SERVICE_ID, "service_id=%s" }, { SRP_OPT_MAX_SECT, "max_sect=%d" }, { SRP_OPT_MAX_CMD_PER_LUN, "max_cmd_per_lun=%d" }, + { SRP_OPT_TARGET_CAN_QUEUE, "target_can_queue=%d" }, { SRP_OPT_IO_CLASS, "io_class=%x" }, { SRP_OPT_INITIATOR_EXT, "initiator_ext=%s" }, { SRP_OPT_CMD_SG_ENTRIES, "cmd_sg_entries=%u" }, @@ -3607,6 +3620,15 @@ static int srp_parse_options(struct net *net, const char *buf, target->scsi_host->cmd_per_lun = token; break; + case SRP_OPT_TARGET_CAN_QUEUE: + if (match_int(args, &token) || token < 1) { + pr_warn("bad max target_can_queue parameter '%s'\n", + p); + goto out; + } + target->target_can_queue = token; + break; + case SRP_OPT_IO_CLASS: if (match_hex(args, &token)) { pr_warn("bad IO class parameter '%s'\n", p); diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 911fc1582f61..a2706086b9c7 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -211,6 +211,7 @@ struct srp_target_port { char target_name[32]; unsigned int scsi_id; unsigned int sg_tablesize; + unsigned int target_can_queue; int mr_pool_size; int mr_per_cmd; int queue_size; -- cgit v1.2.3 From f97f43c9ed1dd9d1f7afc758fc31a619752d08f3 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 24 Jan 2018 08:56:05 +0200 Subject: RDMA/srpt: Fix RCU debug build error The combination of CONFIG_DEBUG_OBJECTS_RCU_HEAD=y and CONFIG_INFINIBAND_SRPT=m produces the following build error. ERROR: "init_rcu_head" [drivers/infiniband/ulp/srpt/ib_srpt.ko] undefined! make[1]: *** [scripts/Makefile.modpost:92: __modpost] Error 1 make: *** [Makefile:1216: modules] Error 2 The reason is that init_rcu_head() is not exported and is not supposed to be used in modules. It is needed for dynamic initialization of statically allocated rcu_head structures.
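A minimal sketch of the distinction, with hypothetical struct and callback names -- the RCU debug-objects code recognizes heap memory on its own, so only statically allocated rcu_head structures need the explicit initialization:

    /* Statically allocated: would require init_rcu_head() under
     * CONFIG_DEBUG_OBJECTS_RCU_HEAD, but that helper is not exported
     * to modules. */
    static struct foo static_foo;

    /* Dynamically allocated: no init call is needed before call_rcu().
     * This is the srpt case -- both the nexus and the channel are
     * allocated with kzalloc(), so the calls can simply be deleted. */
    struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

    if (f)
            call_rcu(&f->rcu, foo_free_rcu);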
Fixes: 795bc112cd5a ("IB/srpt: Make it safe to use RCU for srpt_device.rch_list") Fixes: a11253142e6d ("IB/srpt: Rework multi-channel support") Signed-off-by: Bart Van Assche Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/srpt/ib_srpt.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index bf37816a1b12..0373b7c40902 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -1950,7 +1950,6 @@ static struct srpt_nexus *srpt_get_nexus(struct srpt_port *sport, nexus = ERR_PTR(-ENOMEM); break; } - init_rcu_head(&tmp_nexus->rcu); INIT_LIST_HEAD(&tmp_nexus->ch_list); memcpy(tmp_nexus->i_port_id, i_port_id, 16); memcpy(tmp_nexus->t_port_id, t_port_id, 16); @@ -2110,7 +2109,6 @@ static int srpt_cm_req_recv(struct ib_cm_id *cm_id, goto reject; } - init_rcu_head(&ch->rcu); kref_init(&ch->kref); ch->pkey = be16_to_cpu(pkey); ch->nexus = nexus; -- cgit v1.2.3 From 9ec1ddb80abb1118258bec620dd9127826f7a68b Mon Sep 17 00:00:00 2001 From: "Amrani, Ram" Date: Wed, 24 Jan 2018 09:29:53 +0200 Subject: MAINTAINERS: Remove Ram Amrani from Q-Logic RDMA driver Remove myself from maintaining the qedr module as my period of working with Cavium/Q-Logic has come to an end. I've had a pleasure working with the community, cheers! Signed-off-by: Michal Kalderon Signed-off-by: Doug Ledford --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 018a916ad58e..38c23ed5334f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11212,7 +11212,6 @@ F: include/linux/qed/ F: drivers/net/ethernet/qlogic/qede/ QLOGIC QL4xxx RDMA DRIVER -M: Ram Amrani M: Michal Kalderon M: Ariel Elior L: linux-rdma@vger.kernel.org -- cgit v1.2.3 From f2fe3c4e0911fca86438852876a7f305b43ac0c9 Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Thu, 25 Jan 2018 10:54:22 -0500 Subject: MAINTAINERS: Fix the location of the rdma git repo When Jason Gunthorpe and I became co-maintainers of the rdma tree, we moved the official git repo location to a name neutral location. However, that update did not make it here as well. Fix that mistake. Signed-off-by: Doug Ledford --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 38c23ed5334f..1facaa8cd6c9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6887,7 +6887,7 @@ M: Jason Gunthorpe L: linux-rdma@vger.kernel.org W: http://www.openfabrics.org/ Q: http://patchwork.kernel.org/project/linux-rdma/list/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git S: Supported F: Documentation/devicetree/bindings/infiniband/ F: Documentation/infiniband/ -- cgit v1.2.3 From 3624a8f02568f08aef299d3b117f2226f621177d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 24 Jan 2018 19:58:34 -0700 Subject: RDMA/uverbs: Use an unambiguous errno for method not supported Returning EOPNOTSUPP is problematic because it can also be returned by the method function, and we use it in quite a few places in drivers these days. Instead, dedicate EPROTONOSUPPORT to indicate that the ioctl framework is enabled but the requested object and method are not supported by the kernel. No other case will return this code, and it lets userspace know to fall back to write(). grep says we do not use it today in drivers/infiniband subsystem. 
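A sketch of the fallback this enables in userspace (placeholder fd and buffer names; error handling trimmed):

    ret = ioctl(cmd_fd, RDMA_VERBS_IOCTL, &hdr);
    if (ret == -1 && errno == EPROTONOSUPPORT) {
            /* The ioctl framework is enabled, but this object/method
             * pair is not supported by the kernel -- retry through
             * the legacy write() path. */
            ret = write(cmd_fd, cmd_buf, cmd_len);
    }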
Signed-off-by: Jason Gunthorpe Reviewed-by: Matan Barak Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_ioctl.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 71ff2644e053..d96dc1d17be1 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -243,16 +243,13 @@ static long ib_uverbs_cmd_verbs(struct ib_device *ib_dev, size_t ctx_size; uintptr_t data[UVERBS_OPTIMIZE_USING_STACK_SZ / sizeof(uintptr_t)]; - if (hdr->reserved) - return -EINVAL; - object_spec = uverbs_get_object(ib_dev, hdr->object_id); if (!object_spec) - return -EOPNOTSUPP; + return -EPROTONOSUPPORT; method_spec = uverbs_get_method(object_spec, hdr->method_id); if (!method_spec) - return -EOPNOTSUPP; + return -EPROTONOSUPPORT; if ((method_spec->flags & UVERBS_ACTION_FLAG_CREATE_ROOT) ^ !file->ucontext) return -EINVAL; @@ -305,6 +302,16 @@ static long ib_uverbs_cmd_verbs(struct ib_device *ib_dev, err = uverbs_handle_method(buf, ctx->uattrs, hdr->num_attrs, ib_dev, file, method_spec, ctx->uverbs_attr_bundle); + + /* + * EPROTONOSUPPORT is ONLY to be returned if the ioctl framework can + * not invoke the method because the request is not supported. No + * other cases should return this code. + */ + if (unlikely(err == -EPROTONOSUPPORT)) { + WARN_ON_ONCE(err == -EPROTONOSUPPORT); + err = -EINVAL; + } out: if (ctx != (void *)data) kfree(ctx); @@ -341,7 +348,7 @@ long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } if (hdr.reserved) { - err = -EOPNOTSUPP; + err = -EPROTONOSUPPORT; goto out; } -- cgit v1.2.3 From dc728f779a71769526531d3d8593ab4ab1d8e436 Mon Sep 17 00:00:00 2001 From: "Kalderon, Michal" Date: Thu, 25 Jan 2018 13:23:20 +0200 Subject: RDMA/qedr: lower print level of flushed CQEs There are races in which flushed CQEs can still be received before the QP enters the error state. This is not an error and should be treated as debug information. Signed-off-by: Michal Kalderon Signed-off-by: Ariel Elior Signed-off-by: Doug Ledford --- drivers/infiniband/hw/qedr/verbs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 5551120ac6ea..53f00dbf313f 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -3470,9 +3470,9 @@ static int qedr_poll_cq_req(struct qedr_dev *dev, break; case RDMA_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR: if (qp->state != QED_ROCE_QP_STATE_ERR) - DP_ERR(dev, - "Error: POLL CQ with RDMA_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR. CQ icid=0x%x, QP icid=0x%x\n", - cq->icid, qp->icid); + DP_DEBUG(dev, QEDR_MSG_CQ, + "Error: POLL CQ with RDMA_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR. CQ icid=0x%x, QP icid=0x%x\n", + cq->icid, qp->icid); cnt = process_req(dev, qp, cq, num_entries, wc, req->sq_cons, IB_WC_WR_FLUSH_ERR, 1); break; -- cgit v1.2.3 From 487f6683f1b738e40aca2386b9f73da4ebb8223d Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 25 Jan 2018 11:27:27 -0800 Subject: IB/mthca: Fix gup usage in mthca_map_user_db() get_user_pages() must be called with mmap_sem held, but currently it is not. In fact it is called under the user db_table->mutex. To fix this, we can convert gup to use the fast alternative and safely avoid taking mmap_sem, if possible.
Furthermore, this is safe with respect to the mutex, as the other callers that take the lock (unmap and alloc_db) are not called under mmap_sem, so no deadlock is possible. Signed-off-by: Davidlohr Bueso Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mthca/mthca_memfree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c index 2a41ed74add8..2fe503e86c1d 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.c +++ b/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -472,7 +472,7 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar, goto out; } - ret = get_user_pages(uaddr & PAGE_MASK, 1, FOLL_WRITE, pages, NULL); + ret = get_user_pages_fast(uaddr & PAGE_MASK, 1, FOLL_WRITE, pages); if (ret < 0) goto out; -- cgit v1.2.3 From cac5141199cdd2cdc188a55576966a9d4a766518 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sat, 27 Jan 2018 17:48:47 +0100 Subject: IB/iser: Delete an error message for a failed memory allocation in iser_send_data_out() Omit an extra message for a memory allocation failure in this function. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Reviewed-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/iser/iser_initiator.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index 2a07692007bd..3ae2571ba5b4 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -452,10 +452,8 @@ int iser_send_data_out(struct iscsi_conn *conn, __func__,(int)itt,(int)data_seg_len,(int)buf_offset); tx_desc = kmem_cache_zalloc(ig.desc_cache, GFP_ATOMIC); - if (tx_desc == NULL) { - iser_err("Failed to alloc desc for post dataout\n"); + if (!tx_desc) return -ENOMEM; - } tx_desc->type = ISCSI_TX_DATAOUT; tx_desc->cqe.done = iser_dataout_comp; -- cgit v1.2.3 From d769e6be246cd811b4d467b77d5ff81a09450678 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sat, 27 Jan 2018 17:55:13 +0100 Subject: IB/iser: Delete an unnecessary variable initialisation in iser_send_data_out() The variable "tx_desc" will be set to an appropriate pointer a bit later. Thus omit the explicit initialisation at the beginning. Signed-off-by: Markus Elfring Reviewed-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/iser/iser_initiator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index 3ae2571ba5b4..1425c4957a04 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -436,7 +436,7 @@ int iser_send_data_out(struct iscsi_conn *conn, { struct iser_conn *iser_conn = conn->dd_data; struct iscsi_iser_task *iser_task = task->dd_data; - struct iser_tx_desc *tx_desc = NULL; + struct iser_tx_desc *tx_desc; struct iser_mem_reg *mem_reg; unsigned long buf_offset; unsigned long data_seg_len; -- cgit v1.2.3 From 4cb24c550f1105f4a91b9ec9f98a1502c7af5cbb Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sat, 27 Jan 2018 18:25:37 +0100 Subject: IB/iser: Combine substrings for three messages The script "checkpatch.pl" reported the following warning. WARNING: quoted string split across lines Fix the affected places in the source code.
Signed-off-by: Markus Elfring Reviewed-by: Max Gurtovoy Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/iser/iser_initiator.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index 1425c4957a04..df49c4eb67f7 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -142,8 +142,7 @@ iser_prepare_write_cmd(struct iscsi_task *task, hdr->write_va = cpu_to_be64(mem_reg->sge.addr + unsol_sz); } - iser_dbg("Cmd itt:%d, WRITE tags, RKEY:%#.4X " - "VA:%#llX + unsol:%d\n", + iser_dbg("Cmd itt:%d, WRITE tags, RKEY:%#.4X VA:%#llX + unsol:%d\n", task->itt, mem_reg->rkey, (unsigned long long)mem_reg->sge.addr, unsol_sz); } @@ -473,8 +472,7 @@ int iser_send_data_out(struct iscsi_conn *conn, tx_desc->num_sge = 2; if (buf_offset + data_seg_len > iser_task->data[ISER_DIR_OUT].data_len) { - iser_err("Offset:%ld & DSL:%ld in Data-Out " - "inconsistent with total len:%ld, itt:%d\n", + iser_err("Offset:%ld & DSL:%ld in Data-Out inconsistent with total len:%ld, itt:%d\n", buf_offset, data_seg_len, iser_task->data[ISER_DIR_OUT].data_len, itt); err = -EINVAL; @@ -612,8 +610,8 @@ iser_check_remote_inv(struct iser_conn *iser_conn, iser_conn, rkey); if (unlikely(!iser_conn->snd_w_inv)) { - iser_err("conn %p: unexpected remote invalidation, " - "terminating connection\n", iser_conn); + iser_err("conn %p: unexpected remote invalidation, terminating connection\n", + iser_conn); return -EPROTO; } -- cgit v1.2.3 From f23a5350e43c810ca36b26d4ed4ecd9a08686f47 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Sun, 28 Jan 2018 11:25:29 +0200 Subject: IB/umad: Fix use of unprotected device pointer The ib_write_umad() is protected by taking the umad file mutex. However, it accesses file->port->ib_dev -- which is protected only by the port's mutex (field file_mutex). The ib_umad_remove_one() calls ib_umad_kill_port() which sets port->ib_dev to NULL under the port mutex (NOT the file mutex). It then sets the mad agent to "dead" under the umad file mutex. This is a race condition -- because there is a window where port->ib_dev is NULL, while the agent is not "dead". 
As a result, we saw stack traces like: [16490.678059] BUG: unable to handle kernel NULL pointer dereference at 00000000000000b0 [16490.678246] IP: ib_umad_write+0x29c/0xa3a [ib_umad] [16490.678333] PGD 0 P4D 0 [16490.678404] Oops: 0000 [#1] SMP PTI [16490.678466] Modules linked in: rdma_ucm(OE) ib_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_uverbs(OE) ib_umad(OE) mlx4_en(OE) ptp pps_core mlx4_ib(OE-) ib_core(OE) mlx4_core(OE) mlx_compat (OE) memtrack(OE) devlink mst_pciconf(OE) mst_pci(OE) netconsole nfsv3 nfs_acl nfs lockd grace fscache cfg80211 rfkill esp6_offload esp6 esp4_offload esp4 sunrpc kvm_intel kvm ppdev parport_pc irqbypass parport joydev i2c_piix4 virtio_balloon cirrus drm_kms_helper ttm drm e1000 serio_raw virtio_pci virtio_ring virtio ata_generic pata_acpi qemu_fw_cfg [last unloaded: mlxfw] [16490.679202] CPU: 4 PID: 3115 Comm: sminfo Tainted: G OE 4.14.13-300.fc27.x86_64 #1 [16490.679339] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu2 04/01/2014 [16490.679477] task: ffff9cf753890000 task.stack: ffffaf70c26b0000 [16490.679571] RIP: 0010:ib_umad_write+0x29c/0xa3a [ib_umad] [16490.679664] RSP: 0018:ffffaf70c26b3d90 EFLAGS: 00010202 [16490.679747] RAX: 0000000000000010 RBX: ffff9cf75610fd80 RCX: 0000000000000000 [16490.679856] RDX: 0000000000000001 RSI: 00007ffdf2bfd714 RDI: ffff9cf6bb2a9c00 In the above trace, ib_umad_write is trying to dereference the NULL file->port->ib_dev pointer. Fix this by using the agent's device pointer (the device field in struct ib_mad_agent) -- which IS protected by the umad file mutex. Cc: # v4.11 Fixes: 44c58487d51a ("IB/core: Define 'ib' and 'roce' rdma_ah_attr types") Signed-off-by: Jack Morgenstein Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/user_mad.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index f0ed883492ec..5e8ffd12c04e 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -511,7 +511,7 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, } memset(&ah_attr, 0, sizeof ah_attr); - ah_attr.type = rdma_ah_find_type(file->port->ib_dev, + ah_attr.type = rdma_ah_find_type(agent->device, file->port->port_num); rdma_ah_set_dlid(&ah_attr, be16_to_cpu(packet->mad.hdr.lid)); rdma_ah_set_sl(&ah_attr, packet->mad.hdr.sl); -- cgit v1.2.3 From b081808a66345ba725b77ecd8d759bee874cd937 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 28 Jan 2018 11:25:30 +0200 Subject: RDMA/mlx5: Avoid memory leak in case of XRCD dealloc failure A failure of the XRCD FW deallocation command leaks memory and returns an error to the user, who can do nothing about it. This patch changes the behavior to always free the memory and always return success to the user.
Fixes: e126ba97dba9 ("mlx5: Add driver for Mellanox Connect-IB adapters") Reviewed-by: Majd Dibbiny Signed-off-by: Leon Romanovsky Reviewed-by: Yuval Shaia Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index ae36db3d0deb..d5cc95ffd4ac 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -5067,13 +5067,10 @@ int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd) int err; err = mlx5_core_xrcd_dealloc(dev->mdev, xrcdn); - if (err) { + if (err) mlx5_ib_warn(dev, "failed to dealloc xrcdn 0x%x\n", xrcdn); - return err; - } kfree(xrcd); - return 0; } -- cgit v1.2.3 From 708ea056b30de17e38f2842c8e953df04b3e8b31 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 28 Jan 2018 11:25:31 +0200 Subject: IB/core: Avoid SGID attributes query while converting GID from OPA to IB SGID attributes are not used during OPA to IB GID conversion. Therefore, don't query them. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Reviewed-by: Yuval Shaia Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index b8f8d3128a53..68df93558d80 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1577,15 +1577,13 @@ static void cm_opa_to_ib_sgid(struct cm_work *work, struct sa_path_rec *path) { struct ib_device *dev = work->port->cm_dev->ib_device; - struct ib_gid_attr gid_attr; u8 port_num = work->port->port_num; if (rdma_cap_opa_ah(dev, port_num) && (ib_is_opa_gid(&path->sgid))) { union ib_gid sgid; - if (ib_get_cached_gid(dev, port_num, 0, - &sgid, &gid_attr)) { + if (ib_get_cached_gid(dev, port_num, 0, &sgid, NULL)) { dev_warn(&dev->dev, "Error updating sgid in CM request\n"); return; -- cgit v1.2.3 From 3cd96fddccb8f0cb18899e169314276e3f9f2c44 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 28 Jan 2018 11:25:32 +0200 Subject: RDMA/cma: Use existing netif_is_bond_master function When checking whether the current netdev is the bond master interface, use the kernel API netif_is_bond_master() instead of hardcoding the check. No functionality is changed. Reviewed-by: Mark Bloch Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Reviewed-by: Yuval Shaia Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 84812b9dea9b..e66963ca58bd 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -4294,7 +4294,7 @@ static int cma_netdev_callback(struct notifier_block *self, unsigned long event, if (event != NETDEV_BONDING_FAILOVER) return NOTIFY_DONE; - if (!(ndev->flags & IFF_MASTER) || !(ndev->priv_flags & IFF_BONDING)) + if (!netif_is_bond_master(ndev)) return NOTIFY_DONE; mutex_lock(&lock); -- cgit v1.2.3 From 925f7ea7a6ba2d434f01662a4171867c26bda66a Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 28 Jan 2018 11:25:33 +0200 Subject: RDMA/cm: Fix access to uninitialized variable The ndev will be initialized and held only after a successful ib_get_cached_gid(); otherwise it is garbage stack memory. Calling dev_put() in the failure path is wrong.
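The broken pattern, sketched in simplified form from cm_req_handler():

    struct ib_gid_attr gid_attr;   /* gid_attr.ndev starts as stack garbage */

    ret = ib_get_cached_gid(device, port_num, grh->sgid_index,
                            &gid, &gid_attr);
    if (ret) {
            /* WRONG: on failure gid_attr was never written, so the
             * check below reads an uninitialized pointer and dev_put()
             * may operate on garbage. */
            if (gid_attr.ndev)
                    dev_put(gid_attr.ndev);
            ib_send_cm_rej(cm_id, IB_CM_REJ_UNSUPPORTED, NULL, 0, NULL, 0);
            goto rejected;
    }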
Fixes: 16c72e402867 ("IB/cm: Refactor to avoid setting path record software only fields") Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Reviewed-by: Yuval Shaia Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 68df93558d80..e6749157fd86 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -1901,8 +1901,6 @@ static int cm_req_handler(struct cm_work *work) grh->sgid_index, &gid, &gid_attr); if (ret) { - if (gid_attr.ndev) - dev_put(gid_attr.ndev); ib_send_cm_rej(cm_id, IB_CM_REJ_UNSUPPORTED, NULL, 0, NULL, 0); goto rejected; } -- cgit v1.2.3 From 57ac30c0efd2a3ab0b541b9bb44851f7a0be5d1c Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Sun, 28 Jan 2018 12:39:43 +0000 Subject: IB/mthca: remove mthca_user.h mthca_user.h is unused since commit 486f60954c71 ("IB/mthca: Move user vendor structures") Remove it from tree. Signed-off-by: Corentin Labbe Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mthca/mthca_user.h | 112 ------------------------------- 1 file changed, 112 deletions(-) delete mode 100644 drivers/infiniband/hw/mthca/mthca_user.h diff --git a/drivers/infiniband/hw/mthca/mthca_user.h b/drivers/infiniband/hw/mthca/mthca_user.h deleted file mode 100644 index 5fe56e810739..000000000000 --- a/drivers/infiniband/hw/mthca/mthca_user.h +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2005 Topspin Communications. All rights reserved. - * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef MTHCA_USER_H -#define MTHCA_USER_H - -#include - -/* - * Increment this value if any changes that break userspace ABI - * compatibility are made. - */ -#define MTHCA_UVERBS_ABI_VERSION 1 - -/* - * Make sure that all structs defined in this file remain laid out so - * that they pack the same way on 32-bit and 64-bit architectures (to - * avoid incompatibility between 32-bit userspace and 64-bit kernels). - * In particular do not use pointer types -- pass pointers in __u64 - * instead. 
- */ - -struct mthca_alloc_ucontext_resp { - __u32 qp_tab_size; - __u32 uarc_size; -}; - -struct mthca_alloc_pd_resp { - __u32 pdn; - __u32 reserved; -}; - -struct mthca_reg_mr { -/* - * Mark the memory region with a DMA attribute that causes - * in-flight DMA to be flushed when the region is written to: - */ -#define MTHCA_MR_DMASYNC 0x1 - __u32 mr_attrs; - __u32 reserved; -}; - -struct mthca_create_cq { - __u32 lkey; - __u32 pdn; - __u64 arm_db_page; - __u64 set_db_page; - __u32 arm_db_index; - __u32 set_db_index; -}; - -struct mthca_create_cq_resp { - __u32 cqn; - __u32 reserved; -}; - -struct mthca_resize_cq { - __u32 lkey; - __u32 reserved; -}; - -struct mthca_create_srq { - __u32 lkey; - __u32 db_index; - __u64 db_page; -}; - -struct mthca_create_srq_resp { - __u32 srqn; - __u32 reserved; -}; - -struct mthca_create_qp { - __u32 lkey; - __u32 reserved; - __u64 sq_db_page; - __u64 rq_db_page; - __u32 sq_db_index; - __u32 rq_db_index; -}; - -#endif /* MTHCA_USER_H */ -- cgit v1.2.3 From 3356c8c0b8da4562e68b4e713279d372fbef6c3c Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Sun, 28 Jan 2018 15:08:55 +0000 Subject: IB/qib: remove qib_keys.c qib_keys.c was left uncompilable in commit 7c2e11fe2dbe ("IB/qib: Remove qp and mr functionality from qib") Since nothing need it, remove it from tree. Signed-off-by: Corentin Labbe Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qib/qib_keys.c | 235 ----------------------------------- 1 file changed, 235 deletions(-) delete mode 100644 drivers/infiniband/hw/qib/qib_keys.c diff --git a/drivers/infiniband/hw/qib/qib_keys.c b/drivers/infiniband/hw/qib/qib_keys.c deleted file mode 100644 index 8fdf79f8d4e4..000000000000 --- a/drivers/infiniband/hw/qib/qib_keys.c +++ /dev/null @@ -1,235 +0,0 @@ -/* - * Copyright (c) 2006, 2007, 2009 QLogic Corporation. All rights reserved. - * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. - * - * This software is available to you under a choice of one of two - * licenses. You may choose to be licensed under the terms of the GNU - * General Public License (GPL) Version 2, available from the file - * COPYING in the main directory of this source tree, or the - * OpenIB.org BSD license below: - * - * Redistribution and use in source and binary forms, with or - * without modification, are permitted provided that the following - * conditions are met: - * - * - Redistributions of source code must retain the above - * copyright notice, this list of conditions and the following - * disclaimer. - * - * - Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials - * provided with the distribution. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "qib.h" - -/** - * qib_alloc_lkey - allocate an lkey - * @mr: memory region that this lkey protects - * @dma_region: 0->normal key, 1->restricted DMA key - * - * Returns 0 if successful, otherwise returns -errno. - * - * Increments mr reference count as required. 
- * - * Sets the lkey field mr for non-dma regions. - * - */ - -int qib_alloc_lkey(struct rvt_mregion *mr, int dma_region) -{ - unsigned long flags; - u32 r; - u32 n; - int ret = 0; - struct qib_ibdev *dev = to_idev(mr->pd->device); - struct rvt_lkey_table *rkt = &dev->lk_table; - - spin_lock_irqsave(&rkt->lock, flags); - - /* special case for dma_mr lkey == 0 */ - if (dma_region) { - struct rvt_mregion *tmr; - - tmr = rcu_access_pointer(dev->dma_mr); - if (!tmr) { - qib_get_mr(mr); - rcu_assign_pointer(dev->dma_mr, mr); - mr->lkey_published = 1; - } - goto success; - } - - /* Find the next available LKEY */ - r = rkt->next; - n = r; - for (;;) { - if (rkt->table[r] == NULL) - break; - r = (r + 1) & (rkt->max - 1); - if (r == n) - goto bail; - } - rkt->next = (r + 1) & (rkt->max - 1); - /* - * Make sure lkey is never zero which is reserved to indicate an - * unrestricted LKEY. - */ - rkt->gen++; - /* - * bits are capped in qib_verbs.c to insure enough bits - * for generation number - */ - mr->lkey = (r << (32 - ib_rvt_lkey_table_size)) | - ((((1 << (24 - ib_rvt_lkey_table_size)) - 1) & rkt->gen) - << 8); - if (mr->lkey == 0) { - mr->lkey |= 1 << 8; - rkt->gen++; - } - qib_get_mr(mr); - rcu_assign_pointer(rkt->table[r], mr); - mr->lkey_published = 1; -success: - spin_unlock_irqrestore(&rkt->lock, flags); -out: - return ret; -bail: - spin_unlock_irqrestore(&rkt->lock, flags); - ret = -ENOMEM; - goto out; -} - -/** - * qib_free_lkey - free an lkey - * @mr: mr to free from tables - */ -void qib_free_lkey(struct rvt_mregion *mr) -{ - unsigned long flags; - u32 lkey = mr->lkey; - u32 r; - struct qib_ibdev *dev = to_idev(mr->pd->device); - struct rvt_lkey_table *rkt = &dev->lk_table; - - spin_lock_irqsave(&rkt->lock, flags); - if (!mr->lkey_published) - goto out; - if (lkey == 0) - RCU_INIT_POINTER(dev->dma_mr, NULL); - else { - r = lkey >> (32 - ib_rvt_lkey_table_size); - RCU_INIT_POINTER(rkt->table[r], NULL); - } - qib_put_mr(mr); - mr->lkey_published = 0; -out: - spin_unlock_irqrestore(&rkt->lock, flags); -} - -/** - * qib_rkey_ok - check the IB virtual address, length, and RKEY - * @qp: qp for validation - * @sge: SGE state - * @len: length of data - * @vaddr: virtual address to place data - * @rkey: rkey to check - * @acc: access flags - * - * Return 1 if successful, otherwise 0. 
- * - * increments the reference count upon success - */ -int qib_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge, - u32 len, u64 vaddr, u32 rkey, int acc) -{ - struct rvt_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table; - struct rvt_mregion *mr; - unsigned n, m; - size_t off; - - /* We use RKEY == zero for kernel virtual addresses */ - rcu_read_lock(); - if (rkey == 0) { - struct rvt_pd *pd = ibpd_to_rvtpd(qp->ibqp.pd); - struct qib_ibdev *dev = to_idev(pd->ibpd.device); - - if (pd->user) - goto bail; - mr = rcu_dereference(dev->dma_mr); - if (!mr) - goto bail; - if (unlikely(!atomic_inc_not_zero(&mr->refcount))) - goto bail; - rcu_read_unlock(); - - sge->mr = mr; - sge->vaddr = (void *) vaddr; - sge->length = len; - sge->sge_length = len; - sge->m = 0; - sge->n = 0; - goto ok; - } - - mr = rcu_dereference( - rkt->table[(rkey >> (32 - ib_rvt_lkey_table_size))]); - if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd)) - goto bail; - - off = vaddr - mr->iova; - if (unlikely(vaddr < mr->iova || off + len > mr->length || - (mr->access_flags & acc) == 0)) - goto bail; - if (unlikely(!atomic_inc_not_zero(&mr->refcount))) - goto bail; - rcu_read_unlock(); - - off += mr->offset; - if (mr->page_shift) { - /* - page sizes are uniform power of 2 so no loop is necessary - entries_spanned_by_off is the number of times the loop below - would have executed. - */ - size_t entries_spanned_by_off; - - entries_spanned_by_off = off >> mr->page_shift; - off -= (entries_spanned_by_off << mr->page_shift); - m = entries_spanned_by_off / RVT_SEGSZ; - n = entries_spanned_by_off % RVT_SEGSZ; - } else { - m = 0; - n = 0; - while (off >= mr->map[m]->segs[n].length) { - off -= mr->map[m]->segs[n].length; - n++; - if (n >= RVT_SEGSZ) { - m++; - n = 0; - } - } - } - sge->mr = mr; - sge->vaddr = mr->map[m]->segs[n].vaddr + off; - sge->length = mr->map[m]->segs[n].length - off; - sge->sge_length = len; - sge->m = m; - sge->n = n; -ok: - return 1; -bail: - rcu_read_unlock(); - return 0; -} - -- cgit v1.2.3 From 0812ed13217827d3c84cb7e5cb267c787d94d0a4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 26 Jan 2018 14:18:33 -0700 Subject: IB/rxe: Change RDMA_RXE kconfig to use select NET_UDP_TUNNEL is not user selectable, so it should be used as a select in kconfig. CRYPTO_CRC32 is a required library for RDMA_RXE so it should active automatically, as most other CRYPTO_ users do. Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/Kconfig b/drivers/infiniband/sw/rxe/Kconfig index 320bffc980d8..bad4a576d7cf 100644 --- a/drivers/infiniband/sw/rxe/Kconfig +++ b/drivers/infiniband/sw/rxe/Kconfig @@ -1,8 +1,8 @@ config RDMA_RXE tristate "Software RDMA over Ethernet (RoCE) driver" depends on INET && PCI && INFINIBAND - depends on NET_UDP_TUNNEL - depends on CRYPTO_CRC32 + select NET_UDP_TUNNEL + select CRYPTO_CRC32 select DMA_VIRT_OPS ---help--- This driver implements the InfiniBand RDMA transport over -- cgit v1.2.3 From beb801ac51be3e024edef435333198d59ccfbb8f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 26 Jan 2018 15:16:46 -0700 Subject: RDMA: Move enum ib_cq_creation_flags to uapi headers The flags field the enum is used with comes directly from the uapi so it belongs in the uapi headers for clarity and so userspace can use it. 
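With the enum in the uapi header, userspace can name the bits directly when filling in the extended create-CQ command (illustrative values, placeholder cookie):

    struct ib_uverbs_ex_create_cq cmd = {
            .user_handle  = (__u64)(uintptr_t)my_cq, /* placeholder cookie */
            .cqe          = 256,
            .comp_vector  = 0,
            .comp_channel = -1,                      /* no completion channel */
            .flags        = IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION,
    };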
Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx4/cq.c | 4 ++-- drivers/infiniband/hw/mlx5/cq.c | 2 +- drivers/infiniband/hw/mlx5/mlx5_ib.h | 4 ++-- include/rdma/ib_verbs.h | 5 ----- include/uapi/rdma/ib_user_verbs.h | 7 ++++++- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index bf4f14a1b4fc..9a566ee3ceff 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -170,7 +170,7 @@ err_buf: return err; } -#define CQ_CREATE_FLAGS_SUPPORTED IB_CQ_FLAGS_TIMESTAMP_COMPLETION +#define CQ_CREATE_FLAGS_SUPPORTED IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_ucontext *context, @@ -246,7 +246,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar, cq->db.dma, &cq->mcq, vector, 0, - !!(cq->create_flags & IB_CQ_FLAGS_TIMESTAMP_COMPLETION)); + !!(cq->create_flags & IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION)); if (err) goto err_dbmap; diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 18705cbcdc8c..5b974fb97611 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -1010,7 +1010,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, MLX5_SET(cqc, cqc, uar_page, index); MLX5_SET(cqc, cqc, c_eqn, eqn); MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma); - if (cq->create_flags & IB_CQ_FLAGS_IGNORE_OVERRUN) + if (cq->create_flags & IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN) MLX5_SET(cqc, cqc, oi, 1); err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 69a80f7512f0..139385129973 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1123,8 +1123,8 @@ static inline u32 check_cq_create_flags(u32 flags) * It returns non-zero value for unsupported CQ * create flags, otherwise it returns zero. 
*/ - return (flags & ~(IB_CQ_FLAGS_IGNORE_OVERRUN | - IB_CQ_FLAGS_TIMESTAMP_COMPLETION)); + return (flags & ~(IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN | + IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION)); } static inline int verify_assign_uidx(u8 cqe_version, u32 cmd_uidx, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 5e32fe781ca3..0b2942586840 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -300,11 +300,6 @@ struct ib_tm_caps { u32 max_sge; }; -enum ib_cq_creation_flags { - IB_CQ_FLAGS_TIMESTAMP_COMPLETION = 1 << 0, - IB_CQ_FLAGS_IGNORE_OVERRUN = 1 << 1, -}; - struct ib_cq_init_attr { unsigned int cqe; int comp_vector; diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index fd035641cf41..04d0e67b1312 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -402,13 +402,18 @@ struct ib_uverbs_create_cq { __u64 driver_data[0]; }; +enum ib_uverbs_ex_create_cq_flags { + IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION = 1 << 0, + IB_UVERBS_CQ_FLAGS_IGNORE_OVERRUN = 1 << 1, +}; + struct ib_uverbs_ex_create_cq { __u64 user_handle; __u32 cqe; __u32 comp_vector; __s32 comp_channel; __u32 comp_mask; - __u32 flags; + __u32 flags; /* bitmask of ib_uverbs_ex_create_cq_flags */ __u32 reserved; }; -- cgit v1.2.3 From e449644741af523fa345f41494e19a0914709a69 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 28 Jan 2018 11:17:18 +0200 Subject: RDMA/core: Use the MODNAME instead of the function name for pd callers Each of our modules only allocates a PD in one place, so there isn't any loss in detail, while MODNAME is more useful and recognizable as something to expose to the user. Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 0b2942586840..6ebb46e4ecf5 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2853,7 +2853,7 @@ enum ib_pd_flags { struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, const char *caller); #define ib_alloc_pd(device, flags) \ - __ib_alloc_pd((device), (flags), __func__) + __ib_alloc_pd((device), (flags), KBUILD_MODNAME) void ib_dealloc_pd(struct ib_pd *pd); /** -- cgit v1.2.3 From f66c8ba4c9fa193481fdb8030504287bf5ad4d69 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 28 Jan 2018 11:17:19 +0200 Subject: RDMA/core: Save kernel caller name when creating PD and CQ objects The KBUILD_MODNAME variable contains the module name and it is known for kernel users during compilation, so let's reuse it to track the owners. Followup patches will store this for resource tracking. 
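Call sites do not change; the wrapper macros capture the module name at compile time. For example, in a ULP (a sketch, the CQ parameters are illustrative):

	struct ib_cq *cq;

	/* Expands to __ib_alloc_cq(dev, NULL, 128, 0, IB_POLL_SOFTIRQ,
	 * KBUILD_MODNAME), attributing the CQ to the calling module
	 * rather than to a function name. */
	cq = ib_alloc_cq(dev, NULL, 128, 0, IB_POLL_SOFTIRQ);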
Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cq.c | 10 ++++++---- drivers/infiniband/core/verbs.c | 4 ++-- include/rdma/ib_verbs.h | 13 ++++++++++--- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index c8c5a5a7f433..d99565dbd12f 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -120,20 +120,22 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) } /** - * ib_alloc_cq - allocate a completion queue + * __ib_alloc_cq - allocate a completion queue * @dev: device to allocate the CQ for * @private: driver private data, accessible from cq->cq_context * @nr_cqe: number of CQEs to allocate * @comp_vector: HCA completion vectors for this CQ * @poll_ctx: context to poll the CQ from. + * @caller: module owner name. * * This is the proper interface to allocate a CQ for in-kernel users. A * CQ allocated with this interface will automatically be polled from the * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id * to use this CQ abstraction. */ -struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private, - int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx) +struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, + int nr_cqe, int comp_vector, + enum ib_poll_context poll_ctx, const char *caller) { struct ib_cq_init_attr cq_attr = { .cqe = nr_cqe, @@ -185,7 +187,7 @@ out_destroy_cq: cq->device->destroy_cq(cq); return ERR_PTR(ret); } -EXPORT_SYMBOL(ib_alloc_cq); +EXPORT_SYMBOL(__ib_alloc_cq); /** * ib_free_cq - free a completion queue diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index e9c3991a93ff..c2b347f6e8a2 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1782,7 +1782,7 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) } EXPORT_SYMBOL(ib_detach_mcast); -struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device) +struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller) { struct ib_xrcd *xrcd; @@ -1800,7 +1800,7 @@ struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device) return xrcd; } -EXPORT_SYMBOL(ib_alloc_xrcd); +EXPORT_SYMBOL(__ib_alloc_xrcd); int ib_dealloc_xrcd(struct ib_xrcd *xrcd) { diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 6ebb46e4ecf5..df3ab2d967f7 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -3130,8 +3130,12 @@ static inline int ib_post_recv(struct ib_qp *qp, return qp->device->post_recv(qp, recv_wr, bad_recv_wr); } -struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private, - int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx); +struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, + int nr_cqe, int comp_vector, + enum ib_poll_context poll_ctx, const char *caller); +#define ib_alloc_cq(device, priv, nr_cqe, comp_vect, poll_ctx) \ + __ib_alloc_cq((device), (priv), (nr_cqe), (comp_vect), (poll_ctx), KBUILD_MODNAME) + void ib_free_cq(struct ib_cq *cq); int ib_process_cq_direct(struct ib_cq *cq, int budget); @@ -3555,8 +3559,11 @@ int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid); /** * ib_alloc_xrcd - Allocates an XRC domain. * @device: The device on which to allocate the XRC domain. 
+ * @caller: Module name for kernel consumers */ -struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device); +struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller); +#define ib_alloc_xrcd(device) \ + __ib_alloc_xrcd((device), KBUILD_MODNAME) /** * ib_dealloc_xrcd - Deallocates an XRC domain. -- cgit v1.2.3 From 02d8883f520ee91c4c40c0a31892eb25ea2df2c9 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 28 Jan 2018 11:17:20 +0200 Subject: RDMA/restrack: Add general infrastructure to track RDMA resources The RDMA subsystem has a very strict set of objects to work with, but it completely lacks tracking facilities and has no visibility of resource utilization. The following patch adds such infrastructure to keep track of RDMA resources to help with debugging of user space applications. The primary user of this infrastructure is RDMA nldev netlink (following patches), to be exposed to userspace via rdmatool, but it is not limited to that. At this stage, the main three objects (PD, CQ and QP) are added, and more will be added later. Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/Makefile | 2 +- drivers/infiniband/core/core_priv.h | 1 + drivers/infiniband/core/device.c | 4 + drivers/infiniband/core/restrack.c | 164 ++++++++++++++++++++++++++++++++++++ include/rdma/ib_verbs.h | 19 +++++ include/rdma/restrack.h | 157 ++++++++++++++++++++++++++++++++++ 6 files changed, 346 insertions(+), 1 deletion(-) create mode 100644 drivers/infiniband/core/restrack.c create mode 100644 include/rdma/restrack.h diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index 504b926552c6..f69833db0a32 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -12,7 +12,7 @@ ib_core-y := packer.o ud_header.o verbs.o cq.o rw.o sysfs.o \ device.o fmr_pool.o cache.o netlink.o \ roce_gid_mgmt.o mr_pool.o addr.o sa_query.o \ multicast.o mad.o smi.o agent.o mad_rmpp.o \ - security.o nldev.o + security.o nldev.o restrack.o ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index aef9aa0ac0e6..2b1372da708a 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -40,6 +40,7 @@ #include #include #include +#include #include "mad_priv.h" /* Total number of ports combined across all struct ib_devices's */ diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 2826e06311a5..e8010e73a1cf 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -263,6 +263,8 @@ struct ib_device *ib_alloc_device(size_t size) if (!device) return NULL; + rdma_restrack_init(&device->res); + device->dev.class = &ib_class; device_initialize(&device->dev); @@ -596,6 +598,8 @@ void ib_unregister_device(struct ib_device *device) } up_read(&lists_rwsem); + rdma_restrack_clean(&device->res); + ib_device_unregister_rdmacg(device); ib_device_unregister_sysfs(device); diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c new file mode 100644 index 000000000000..857637bf46da --- /dev/null +++ b/drivers/infiniband/core/restrack.c @@ -0,0 +1,164 @@ +/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* + * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved.
+ */ + +#include +#include +#include +#include +#include +#include + +void rdma_restrack_init(struct rdma_restrack_root *res) +{ + init_rwsem(&res->rwsem); +} + +void rdma_restrack_clean(struct rdma_restrack_root *res) +{ + WARN_ON_ONCE(!hash_empty(res->hash)); +} + +int rdma_restrack_count(struct rdma_restrack_root *res, + enum rdma_restrack_type type, + struct pid_namespace *ns) +{ + struct rdma_restrack_entry *e; + u32 cnt = 0; + + down_read(&res->rwsem); + hash_for_each_possible(res->hash, e, node, type) { + if (ns == &init_pid_ns || + (!rdma_is_kernel_res(e) && + ns == task_active_pid_ns(e->task))) + cnt++; + } + up_read(&res->rwsem); + return cnt; +} +EXPORT_SYMBOL(rdma_restrack_count); + +static void set_kern_name(struct rdma_restrack_entry *res) +{ + enum rdma_restrack_type type = res->type; + struct ib_qp *qp; + + if (type != RDMA_RESTRACK_QP) + /* PD and CQ types already have this name embedded in */ + return; + + qp = container_of(res, struct ib_qp, res); + if (!qp->pd) { + WARN_ONCE(true, "XRC QPs are not supported\n"); + /* Survive, despite the programmer's error */ + res->kern_name = " "; + return; + } + + res->kern_name = qp->pd->res.kern_name; +} + +static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) +{ + enum rdma_restrack_type type = res->type; + struct ib_device *dev; + struct ib_xrcd *xrcd; + struct ib_pd *pd; + struct ib_cq *cq; + struct ib_qp *qp; + + switch (type) { + case RDMA_RESTRACK_PD: + pd = container_of(res, struct ib_pd, res); + dev = pd->device; + break; + case RDMA_RESTRACK_CQ: + cq = container_of(res, struct ib_cq, res); + dev = cq->device; + break; + case RDMA_RESTRACK_QP: + qp = container_of(res, struct ib_qp, res); + dev = qp->device; + break; + case RDMA_RESTRACK_XRCD: + xrcd = container_of(res, struct ib_xrcd, res); + dev = xrcd->device; + break; + default: + WARN_ONCE(true, "Wrong resource tracking type %u\n", type); + return NULL; + } + + return dev; +} + +void rdma_restrack_add(struct rdma_restrack_entry *res) +{ + struct ib_device *dev = res_to_dev(res); + + if (!dev) + return; + + if (!uaccess_kernel()) { + get_task_struct(current); + res->task = current; + res->kern_name = NULL; + } else { + set_kern_name(res); + res->task = NULL; + } + + kref_init(&res->kref); + init_completion(&res->comp); + res->valid = true; + + down_write(&dev->res.rwsem); + hash_add(dev->res.hash, &res->node, res->type); + up_write(&dev->res.rwsem); +} +EXPORT_SYMBOL(rdma_restrack_add); + +int __must_check rdma_restrack_get(struct rdma_restrack_entry *res) +{ + return kref_get_unless_zero(&res->kref); +} +EXPORT_SYMBOL(rdma_restrack_get); + +static void restrack_release(struct kref *kref) +{ + struct rdma_restrack_entry *res; + + res = container_of(kref, struct rdma_restrack_entry, kref); + complete(&res->comp); +} + +int rdma_restrack_put(struct rdma_restrack_entry *res) +{ + return kref_put(&res->kref, restrack_release); +} +EXPORT_SYMBOL(rdma_restrack_put); + +void rdma_restrack_del(struct rdma_restrack_entry *res) +{ + struct ib_device *dev; + + if (!res->valid) + return; + + dev = res_to_dev(res); + if (!dev) + return; + + rdma_restrack_put(res); + + wait_for_completion(&res->comp); + + down_write(&dev->res.rwsem); + hash_del(&res->node); + res->valid = false; + if (res->task) + put_task_struct(res->task); + up_write(&dev->res.rwsem); +} +EXPORT_SYMBOL(rdma_restrack_del); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index df3ab2d967f7..5263c86fd103 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -63,6 +63,7 @@ 
#include #include #include +#include #define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN @@ -1525,6 +1526,7 @@ struct ib_pd { * Implementation details of the RDMA core, don't use in drivers: */ struct ib_mr *__internal_mr; + struct rdma_restrack_entry res; }; struct ib_xrcd { @@ -1534,6 +1536,10 @@ struct ib_xrcd { struct mutex tgt_qp_mutex; struct list_head tgt_qp_list; + /* + * Implementation details of the RDMA core, don't use in drivers: + */ + struct rdma_restrack_entry res; }; struct ib_ah { @@ -1565,6 +1571,10 @@ struct ib_cq { struct irq_poll iop; struct work_struct work; }; + /* + * Implementation details of the RDMA core, don't use in drivers: + */ + struct rdma_restrack_entry res; }; struct ib_srq { @@ -1741,6 +1751,11 @@ struct ib_qp { struct ib_rwq_ind_table *rwq_ind_tbl; struct ib_qp_security *qp_sec; u8 port; + + /* + * Implementation details of the RDMA core, don't use in drivers: + */ + struct rdma_restrack_entry res; }; struct ib_mr { @@ -2347,6 +2362,10 @@ struct ib_device { #endif u32 index; + /* + * Implementation details of the RDMA core, don't use in drivers + */ + struct rdma_restrack_root res; /** * The following mandatory functions are used only at device diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h new file mode 100644 index 000000000000..c2d81167c858 --- /dev/null +++ b/include/rdma/restrack.h @@ -0,0 +1,157 @@ +/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause) */ +/* + * Copyright (c) 2017-2018 Mellanox Technologies. All rights reserved. + */ + +#ifndef _RDMA_RESTRACK_H_ +#define _RDMA_RESTRACK_H_ + +#include +#include +#include +#include +#include + +/** + * enum rdma_restrack_type - HW objects to track + */ +enum rdma_restrack_type { + /** + * @RDMA_RESTRACK_PD: Protection domain (PD) + */ + RDMA_RESTRACK_PD, + /** + * @RDMA_RESTRACK_CQ: Completion queue (CQ) + */ + RDMA_RESTRACK_CQ, + /** + * @RDMA_RESTRACK_QP: Queue pair (QP) + */ + RDMA_RESTRACK_QP, + /** + * @RDMA_RESTRACK_XRCD: XRC domain (XRCD) + */ + RDMA_RESTRACK_XRCD, + /** + * @RDMA_RESTRACK_MAX: Last entry, used for array declarations + */ + RDMA_RESTRACK_MAX +}; + +#define RDMA_RESTRACK_HASH_BITS 8 +/** + * struct rdma_restrack_root - main resource tracking management + * entity, per-device + */ +struct rdma_restrack_root { + /* + * @rwsem: Read/write lock to protect lists + */ + struct rw_semaphore rwsem; + /** + * @hash: global database for all resources per-device + */ + DECLARE_HASHTABLE(hash, RDMA_RESTRACK_HASH_BITS); +}; + +/** + * struct rdma_restrack_entry - metadata per-entry + */ +struct rdma_restrack_entry { + /** + * @valid: validity indicator + * + * The entries are filled during rdma_restrack_add, + * can be attempted to be freed during rdma_restrack_del. + * + * As an example for that, see mlx5 QPs with type MLX5_IB_QPT_HW_GSI + */ + bool valid; + /* + * @kref: Protect destroy of the resource + */ + struct kref kref; + /* + * @comp: Signal that all consumers of the resource have completed their work + */ + struct completion comp; + /** + * @task: owner of resource tracking entity + * + * There are two types of entities: created by user and created + * by kernel. + * + * This is relevant for the entities created by users. + * For the entities created by kernel, this pointer will be NULL. + */ + struct task_struct *task; + /** + * @kern_name: name of owner for the kernel created entities.
+ */ + const char *kern_name; + /** + * @node: hash table entry + */ + struct hlist_node node; + /** + * @type: various objects in restrack database + */ + enum rdma_restrack_type type; +}; + +/** + * rdma_restrack_init() - initialize resource tracking + * @res: resource tracking root + */ +void rdma_restrack_init(struct rdma_restrack_root *res); + +/** + * rdma_restrack_clean() - clean resource tracking + * @res: resource tracking root + */ +void rdma_restrack_clean(struct rdma_restrack_root *res); + +/** + * rdma_restrack_count() - the current usage of a specific object + * @res: resource entry + * @type: actual type of object to operate + * @ns: PID namespace + */ +int rdma_restrack_count(struct rdma_restrack_root *res, + enum rdma_restrack_type type, + struct pid_namespace *ns); + +/** + * rdma_restrack_add() - add object to the resource tracking database + * @res: resource entry + */ +void rdma_restrack_add(struct rdma_restrack_entry *res); + +/** + * rdma_restrack_del() - delete object from the resource tracking database + * @res: resource entry + * @type: actual type of object to operate + */ +void rdma_restrack_del(struct rdma_restrack_entry *res); + +/** + * rdma_is_kernel_res() - check the owner of resource + * @res: resource entry + */ +static inline bool rdma_is_kernel_res(struct rdma_restrack_entry *res) +{ + return !res->task; +} + +/** + * rdma_restrack_get() - grab a reference to protect the resource from release + * @res: resource entry + */ +int __must_check rdma_restrack_get(struct rdma_restrack_entry *res); + +/** + * rdma_restrack_put() - release resource + * @res: resource entry + */ +int rdma_restrack_put(struct rdma_restrack_entry *res); +#endif /* _RDMA_RESTRACK_H_ */ -- cgit v1.2.3 From 78a0cd648a802450602c95e164a820fe1a165247 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 28 Jan 2018 11:17:21 +0200 Subject: RDMA/core: Add resource tracking for create and destroy QPs Track create and destroy operations of QP objects. Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/core_priv.h | 27 +++++++++++++++++++++++++++ drivers/infiniband/core/uverbs_cmd.c | 3 +-- drivers/infiniband/core/verbs.c | 5 ++--- 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 2b1372da708a..c4560d84dfae 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -301,4 +301,31 @@ struct ib_device *ib_device_get_by_index(u32 ifindex); /* RDMA device netlink */ void nldev_init(void); void nldev_exit(void); + +static inline struct ib_qp *_ib_create_qp(struct ib_device *dev, + struct ib_pd *pd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata) +{ + struct ib_qp *qp; + + qp = dev->create_qp(pd, attr, udata); + if (IS_ERR(qp)) + return qp; + + qp->device = dev; + qp->pd = pd; + /* + * We don't track XRC QPs for now, because they don't have a PD + * and more importantly they are created internally by the driver, + * see mlx5 create_dev_resources() as an example.
+ */ + if (attr->qp_type < IB_QPT_XRC_INI) { + qp->res.type = RDMA_RESTRACK_QP; + rdma_restrack_add(&qp->res); + } else + qp->res.valid = false; + + return qp; +} #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 4ddd61d90507..825325c764a1 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1514,7 +1514,7 @@ static int create_qp(struct ib_uverbs_file *file, if (cmd->qp_type == IB_QPT_XRC_TGT) qp = ib_create_qp(pd, &attr); else - qp = device->create_qp(pd, &attr, uhw); + qp = _ib_create_qp(device, pd, &attr, uhw); if (IS_ERR(qp)) { ret = PTR_ERR(qp); @@ -1527,7 +1527,6 @@ static int create_qp(struct ib_uverbs_file *file, goto err_cb; qp->real_qp = qp; - qp->device = device; qp->pd = pd; qp->send_cq = attr.send_cq; qp->recv_cq = attr.recv_cq; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index c2b347f6e8a2..5324cf4788d0 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -882,7 +882,7 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, if (qp_init_attr->cap.max_rdma_ctxs) rdma_rw_init_qp(device, qp_init_attr); - qp = device->create_qp(pd, qp_init_attr, NULL); + qp = _ib_create_qp(device, pd, qp_init_attr, NULL); if (IS_ERR(qp)) return qp; @@ -892,7 +892,6 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, return ERR_PTR(ret); } - qp->device = device; qp->real_qp = qp; qp->uobject = NULL; qp->qp_type = qp_init_attr->qp_type; @@ -922,7 +921,6 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd, atomic_inc(&qp_init_attr->srq->usecnt); } - qp->pd = pd; qp->send_cq = qp_init_attr->send_cq; qp->xrcd = NULL; @@ -1538,6 +1536,7 @@ int ib_destroy_qp(struct ib_qp *qp) if (!qp->uobject) rdma_rw_cleanup_mrs(qp); + rdma_restrack_del(&qp->res); ret = qp->device->destroy_qp(qp); if (!ret) { if (pd) -- cgit v1.2.3 From 08f294a1524bc3211a28091daa6c7513828c7d33 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 28 Jan 2018 11:17:22 +0200 Subject: RDMA/core: Add resource tracking for create and destroy CQs Track create and destroy operations of CQ objects. 
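The pattern follows the QP patch: tag the entry and register it once the CQ is fully constructed, and unregister it before the object is handed back to the driver for destruction. In outline (a sketch of the flow implemented by the hunks below):

	cq->res.type = RDMA_RESTRACK_CQ;
	rdma_restrack_add(&cq->res);
	...
	rdma_restrack_del(&cq->res);	/* before cq->device->destroy_cq(cq) */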
Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cq.c | 6 ++++++ drivers/infiniband/core/uverbs_cmd.c | 2 ++ drivers/infiniband/core/uverbs_std_types.c | 3 +++ drivers/infiniband/core/verbs.c | 3 +++ 4 files changed, 14 insertions(+) diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index d99565dbd12f..bc79ca8215d7 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -159,6 +159,10 @@ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, if (!cq->wc) goto out_destroy_cq; + cq->res.type = RDMA_RESTRACK_CQ; + cq->res.kern_name = caller; + rdma_restrack_add(&cq->res); + switch (cq->poll_ctx) { case IB_POLL_DIRECT: cq->comp_handler = ib_cq_completion_direct; @@ -183,6 +187,7 @@ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, out_free_wc: kfree(cq->wc); + rdma_restrack_del(&cq->res); out_destroy_cq: cq->device->destroy_cq(cq); return ERR_PTR(ret); @@ -214,6 +219,7 @@ void ib_free_cq(struct ib_cq *cq) } kfree(cq->wc); + rdma_restrack_del(&cq->res); ret = cq->device->destroy_cq(cq); WARN_ON_ONCE(ret); } diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 825325c764a1..3e95acd29de7 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1033,6 +1033,8 @@ static struct ib_ucq_object *create_cq(struct ib_uverbs_file *file, goto err_cb; uobj_alloc_commit(&obj->uobject); + cq->res.type = RDMA_RESTRACK_CQ; + rdma_restrack_add(&cq->res); return obj; diff --git a/drivers/infiniband/core/uverbs_std_types.c b/drivers/infiniband/core/uverbs_std_types.c index c3ee5d9b336d..b571176babbe 100644 --- a/drivers/infiniband/core/uverbs_std_types.c +++ b/drivers/infiniband/core/uverbs_std_types.c @@ -35,6 +35,7 @@ #include #include #include +#include #include "rdma_core.h" #include "uverbs.h" @@ -319,6 +320,8 @@ static int uverbs_create_cq_handler(struct ib_device *ib_dev, obj->uobject.object = cq; obj->uobject.user_handle = user_handle; atomic_set(&cq->usecnt, 0); + cq->res.type = RDMA_RESTRACK_CQ; + rdma_restrack_add(&cq->res); ret = uverbs_copy_to(attrs, CREATE_CQ_RESP_CQE, &cq->cqe); if (ret) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 5324cf4788d0..8cb5c850828d 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1579,6 +1579,8 @@ struct ib_cq *ib_create_cq(struct ib_device *device, cq->event_handler = event_handler; cq->cq_context = cq_context; atomic_set(&cq->usecnt, 0); + cq->res.type = RDMA_RESTRACK_CQ; + rdma_restrack_add(&cq->res); } return cq; @@ -1597,6 +1599,7 @@ int ib_destroy_cq(struct ib_cq *cq) if (atomic_read(&cq->usecnt)) return -EBUSY; + rdma_restrack_del(&cq->res); return cq->device->destroy_cq(cq); } EXPORT_SYMBOL(ib_destroy_cq); -- cgit v1.2.3 From 9d5f8c209b3f29259e6aa9595ea2dc2dfa27549a Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 28 Jan 2018 11:17:23 +0200 Subject: RDMA/core: Add resource tracking for create and destroy PDs Track create and destroy operations of PD objects. 
Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 2 ++ drivers/infiniband/core/verbs.c | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 3e95acd29de7..256934d1f64f 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -340,6 +340,8 @@ ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file, uobj->object = pd; memset(&resp, 0, sizeof resp); resp.pd_handle = uobj->id; + pd->res.type = RDMA_RESTRACK_PD; + rdma_restrack_add(&pd->res); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof resp)) { ret = -EFAULT; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 8cb5c850828d..16ebc6372c31 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -263,6 +263,10 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; } + pd->res.type = RDMA_RESTRACK_PD; + pd->res.kern_name = caller; + rdma_restrack_add(&pd->res); + if (mr_access_flags) { struct ib_mr *mr; @@ -312,6 +316,7 @@ void ib_dealloc_pd(struct ib_pd *pd) requires the caller to guarantee we can't race here. */ WARN_ON(atomic_read(&pd->usecnt)); + rdma_restrack_del(&pd->res); /* Making delalloc_pd a void return is a WIP, no driver should return an error here. */ ret = pd->device->dealloc_pd(pd); -- cgit v1.2.3 From bf3c5a93c52368410a521af34ed3bff91a99df44 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 28 Jan 2018 11:17:24 +0200 Subject: RDMA/nldev: Provide global resource utilization Expose through the netlink interface the global per-device utilization of the supported object types. Provide both dumpit and doit callbacks. 
As an example of possible output from rdmatool for system with 5 mlx5 cards: $ rdma res 1: mlx5_0: qp 4 cq 5 pd 3 2: mlx5_1: qp 4 cq 5 pd 3 3: mlx5_2: qp 4 cq 5 pd 3 4: mlx5_3: qp 2 cq 3 pd 2 5: mlx5_4: qp 4 cq 5 pd 3 Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 147 +++++++++++++++++++++++++++++++++++++++ include/uapi/rdma/rdma_netlink.h | 7 ++ 2 files changed, 154 insertions(+) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 5d790c507c7e..c37bb041f647 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -31,6 +31,8 @@ */ #include +#include +#include #include #include @@ -52,6 +54,11 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_PORT_STATE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_PORT_PHYS_STATE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_DEV_NODE_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_SUMMARY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME] = { .type = NLA_NUL_STRING, + .len = 16 }, + [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR] = { .type = NLA_U64 }, }; static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) @@ -134,6 +141,65 @@ static int fill_port_info(struct sk_buff *msg, return 0; } +static int fill_res_info_entry(struct sk_buff *msg, + const char *name, u64 curr) +{ + struct nlattr *entry_attr; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY); + if (!entry_attr) + return -EMSGSIZE; + + if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME, name)) + goto err; + if (nla_put_u64_64bit(msg, + RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR, curr, 0)) + goto err; + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); + return -EMSGSIZE; +} + +static int fill_res_info(struct sk_buff *msg, struct ib_device *device) +{ + static const char * const names[RDMA_RESTRACK_MAX] = { + [RDMA_RESTRACK_PD] = "pd", + [RDMA_RESTRACK_CQ] = "cq", + [RDMA_RESTRACK_QP] = "qp", + }; + + struct rdma_restrack_root *res = &device->res; + struct nlattr *table_attr; + int ret, i, curr; + + if (fill_nldev_handle(msg, device)) + return -EMSGSIZE; + + table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_SUMMARY); + if (!table_attr) + return -EMSGSIZE; + + for (i = 0; i < RDMA_RESTRACK_MAX; i++) { + if (!names[i]) + continue; + curr = rdma_restrack_count(res, i, task_active_pid_ns(current)); + ret = fill_res_info_entry(msg, names[i], curr); + if (ret) + goto err; + } + + nla_nest_end(msg, table_attr); + return 0; + +err: + nla_nest_cancel(msg, table_attr); + return ret; +} + static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -329,6 +395,83 @@ out: return skb->len; } +static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + struct sk_buff *msg; + u32 index; + int ret; + + ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); + if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + goto err; + + nlh = nlmsg_put(msg, NETLINK_CB(skb).portid, 
nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET), + 0, 0); + + ret = fill_res_info(msg, device); + if (ret) + goto err_free; + + nlmsg_end(msg, nlh); + put_device(&device->dev); + return rdma_nl_unicast(msg, NETLINK_CB(skb).portid); + +err_free: + nlmsg_free(msg); +err: + put_device(&device->dev); + return ret; +} + +static int _nldev_res_get_dumpit(struct ib_device *device, + struct sk_buff *skb, + struct netlink_callback *cb, + unsigned int idx) +{ + int start = cb->args[0]; + struct nlmsghdr *nlh; + + if (idx < start) + return 0; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_GET), + 0, NLM_F_MULTI); + + if (fill_res_info(skb, device)) { + nlmsg_cancel(skb, nlh); + goto out; + } + + nlmsg_end(skb, nlh); + + idx++; + +out: + cb->args[0] = idx; + return skb->len; +} + +static int nldev_res_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return ib_enum_all_devs(_nldev_res_get_dumpit, skb, cb); +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, @@ -338,6 +481,10 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_port_get_doit, .dump = nldev_port_get_dumpit, }, + [RDMA_NLDEV_CMD_RES_GET] = { + .doit = nldev_res_get_doit, + .dump = nldev_res_get_dumpit, + }, }; void __init nldev_init(void) diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index cc002e316d09..22c39532c411 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -236,6 +236,8 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_PORT_NEW, RDMA_NLDEV_CMD_PORT_DEL, + RDMA_NLDEV_CMD_RES_GET, /* can dump */ + RDMA_NLDEV_NUM_OPS }; @@ -303,6 +305,11 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_DEV_NODE_TYPE, /* u8 */ + RDMA_NLDEV_ATTR_RES_SUMMARY, /* nested table */ + RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY, /* nested table */ + RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME, /* string */ + RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR, /* u64 */ + RDMA_NLDEV_ATTR_MAX }; #endif /* _UAPI_RDMA_NETLINK_H */ -- cgit v1.2.3 From b5fa635aab8f0d39a824c01991266a6d06f007fb Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 28 Jan 2018 11:17:25 +0200 Subject: RDMA/nldev: Provide detailed QP information Implement RDMA nldev netlink interface to get detailed information on each QP in the system. This includes the owning process or kernel ULP and detailed information from the qp_attrs. Currently only the dumpit variant is implemented. 
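Each dump message carries the device handle followed by a nested table holding one nested entry per QP. In terms of the attributes added below, the layout is roughly:

	RDMA_NLDEV_ATTR_DEV_INDEX (device handle)
	RDMA_NLDEV_ATTR_RES_QP (nested table)
	    RDMA_NLDEV_ATTR_RES_QP_ENTRY (nested, repeated per QP)
	        RDMA_NLDEV_ATTR_PORT_INDEX
	        RDMA_NLDEV_ATTR_RES_LQPN / _RQPN / _RQ_PSN / _SQ_PSN
	        RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE / _TYPE / _STATE
	        RDMA_NLDEV_ATTR_RES_PID or RDMA_NLDEV_ATTR_RES_KERN_NAME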
Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/nldev.c | 227 +++++++++++++++++++++++++++++++++++++++ include/uapi/rdma/rdma_netlink.h | 42 ++++++++ 2 files changed, 269 insertions(+) diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index c37bb041f647..fa8655e3b3ed 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -59,6 +59,18 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME] = { .type = NLA_NUL_STRING, .len = 16 }, [RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR] = { .type = NLA_U64 }, + [RDMA_NLDEV_ATTR_RES_QP] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_QP_ENTRY] = { .type = NLA_NESTED }, + [RDMA_NLDEV_ATTR_RES_LQPN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_RQPN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_RQ_PSN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_SQ_PSN] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_STATE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_RES_PID] = { .type = NLA_U32 }, + [RDMA_NLDEV_ATTR_RES_KERN_NAME] = { .type = NLA_NUL_STRING, + .len = TASK_COMM_LEN }, }; static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) @@ -200,6 +212,78 @@ err: return ret; } +static int fill_res_qp_entry(struct sk_buff *msg, + struct ib_qp *qp, uint32_t port) +{ + struct rdma_restrack_entry *res = &qp->res; + struct ib_qp_init_attr qp_init_attr; + struct nlattr *entry_attr; + struct ib_qp_attr qp_attr; + int ret; + + ret = ib_query_qp(qp, &qp_attr, 0, &qp_init_attr); + if (ret) + return ret; + + if (port && port != qp_attr.port_num) + return 0; + + entry_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_RES_QP_ENTRY); + if (!entry_attr) + goto out; + + /* In create_qp() port is not set yet */ + if (qp_attr.port_num && + nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, qp_attr.port_num)) + goto err; + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_LQPN, qp->qp_num)) + goto err; + if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC) { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQPN, + qp_attr.dest_qp_num)) + goto err; + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_RQ_PSN, + qp_attr.rq_psn)) + goto err; + } + + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_SQ_PSN, qp_attr.sq_psn)) + goto err; + + if (qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_UC || + qp->qp_type == IB_QPT_XRC_INI || qp->qp_type == IB_QPT_XRC_TGT) { + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE, + qp_attr.path_mig_state)) + goto err; + } + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_TYPE, qp->qp_type)) + goto err; + if (nla_put_u8(msg, RDMA_NLDEV_ATTR_RES_STATE, qp_attr.qp_state)) + goto err; + + /* + * Existence of task means that it is user QP and netlink + * user is invited to go and read /proc/PID/comm to get the name + * of the task; res->kern_name will be NULL.
+ */ + if (rdma_is_kernel_res(res)) { + if (nla_put_string(msg, RDMA_NLDEV_ATTR_RES_KERN_NAME, res->kern_name)) + goto err; + } else { + if (nla_put_u32(msg, RDMA_NLDEV_ATTR_RES_PID, task_pid_vnr(res->task))) + goto err; + } + + nla_nest_end(msg, entry_attr); + return 0; + +err: + nla_nest_cancel(msg, entry_attr); +out: + return -EMSGSIZE; +} + static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -472,6 +556,136 @@ static int nldev_res_get_dumpit(struct sk_buff *skb, return ib_enum_all_devs(_nldev_res_get_dumpit, skb, cb); } +static int nldev_res_get_qp_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct rdma_restrack_entry *res; + int err, ret = 0, idx = 0; + struct nlattr *table_attr; + struct ib_device *device; + int start = cb->args[0]; + struct ib_qp *qp = NULL; + struct nlmsghdr *nlh; + u32 index, port = 0; + + err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NULL); + /* + * Right now, we are expecting the device index to get QP information, + * but it is possible to extend this code to return all devices in + * one shot by checking the existence of RDMA_NLDEV_ATTR_DEV_INDEX. + * If it doesn't exist, we will iterate over all devices. + * + * But it is not needed for now. + */ + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + /* + * If no PORT_INDEX is supplied, we will return all QPs from that device + */ + if (tb[RDMA_NLDEV_ATTR_PORT_INDEX]) { + port = nla_get_u32(tb[RDMA_NLDEV_ATTR_PORT_INDEX]); + if (!rdma_is_port_valid(device, port)) { + ret = -EINVAL; + goto err_index; + } + } + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_RES_QP_GET), + 0, NLM_F_MULTI); + + if (fill_nldev_handle(skb, device)) { + ret = -EMSGSIZE; + goto err; + } + + table_attr = nla_nest_start(skb, RDMA_NLDEV_ATTR_RES_QP); + if (!table_attr) { + ret = -EMSGSIZE; + goto err; + } + + down_read(&device->res.rwsem); + hash_for_each_possible(device->res.hash, res, node, RDMA_RESTRACK_QP) { + if (idx < start) + goto next; + + if ((rdma_is_kernel_res(res) && + task_active_pid_ns(current) != &init_pid_ns) || + (!rdma_is_kernel_res(res) && + task_active_pid_ns(current) != task_active_pid_ns(res->task))) + /* + * 1. Kernel QPs should be visible in init namespace only + * 2. Present only QPs visible in the current namespace + */ + goto next; + + if (!rdma_restrack_get(res)) + /* + * Resource is under release now, but we are not + * releasing the lock now, so it will be released in + * our next pass, once we get the ->next pointer. + */ + goto next; + + qp = container_of(res, struct ib_qp, res); + + up_read(&device->res.rwsem); + ret = fill_res_qp_entry(skb, qp, port); + down_read(&device->res.rwsem); + /* + * Return the resource, but it won't be released until + * the &device->res.rwsem is released for write. + */ + rdma_restrack_put(res); + + if (ret == -EMSGSIZE) + /* + * There is a chance to optimize here. + * It can be done by using list_prepare_entry + * and list_for_each_entry_continue afterwards. + */ + break; + if (ret) + goto res_err; +next: idx++; + } + up_read(&device->res.rwsem); + + nla_nest_end(skb, table_attr); + nlmsg_end(skb, nlh); + cb->args[0] = idx; + + /* + * No more QPs to fill, cancel the message and + * return 0 to mark end of dumpit.
+ */ + if (!qp) + goto err; + + put_device(&device->dev); + return skb->len; + +res_err: + nla_nest_cancel(skb, table_attr); + up_read(&device->res.rwsem); + +err: + nlmsg_cancel(skb, nlh); + +err_index: + put_device(&device->dev); + return ret; +} + static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { [RDMA_NLDEV_CMD_GET] = { .doit = nldev_get_doit, @@ -485,6 +699,19 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_res_get_doit, .dump = nldev_res_get_dumpit, }, + [RDMA_NLDEV_CMD_RES_QP_GET] = { + .dump = nldev_res_get_qp_dumpit, + /* + * .doit is not implemented yet for two reasons: + * 1. It is not needed yet. + * 2. There is a need to provide an identifier; while it is easy + * for the QPs (device index + port index + LQPN), it is not + * the case for the rest of the resources (PD and CQ). Because it + * is better to provide a similar interface for all resources, + * let's wait until we have other resources implemented + * too. + */ + }, }; void __init nldev_init(void) diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index 22c39532c411..17e59bec169e 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ b/include/uapi/rdma/rdma_netlink.h @@ -238,6 +238,8 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_RES_GET, /* can dump */ + RDMA_NLDEV_CMD_RES_QP_GET, /* can dump */ + RDMA_NLDEV_NUM_OPS }; @@ -310,6 +312,46 @@ enum rdma_nldev_attr { RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_NAME, /* string */ RDMA_NLDEV_ATTR_RES_SUMMARY_ENTRY_CURR, /* u64 */ + RDMA_NLDEV_ATTR_RES_QP, /* nested table */ + RDMA_NLDEV_ATTR_RES_QP_ENTRY, /* nested table */ + /* + * Local QPN + */ + RDMA_NLDEV_ATTR_RES_LQPN, /* u32 */ + /* + * Remote QPN, + * Applicable for RC and UC only, IBTA 11.2.5.3 QUERY QUEUE PAIR + */ + RDMA_NLDEV_ATTR_RES_RQPN, /* u32 */ + /* + * Receive Queue PSN, + * Applicable for RC and UC only, 11.2.5.3 QUERY QUEUE PAIR + */ + RDMA_NLDEV_ATTR_RES_RQ_PSN, /* u32 */ + /* + * Send Queue PSN + */ + RDMA_NLDEV_ATTR_RES_SQ_PSN, /* u32 */ + RDMA_NLDEV_ATTR_RES_PATH_MIG_STATE, /* u8 */ + /* + * QP types as visible to RDMA/core; the reserved QPTs + * are not exported through this interface. + */ + RDMA_NLDEV_ATTR_RES_TYPE, /* u8 */ + RDMA_NLDEV_ATTR_RES_STATE, /* u8 */ + /* + * Process ID which created the object; + * in case of kernel origin, the PID won't exist. + */ + RDMA_NLDEV_ATTR_RES_PID, /* u32 */ + /* + * The name of the kernel entity that created the resource. + * It will exist only for kernel objects. + * For user created objects, the user is supposed + * to read the /proc/PID/comm file. + */ + RDMA_NLDEV_ATTR_RES_KERN_NAME, /* string */ + RDMA_NLDEV_ATTR_MAX }; #endif /* _UAPI_RDMA_NETLINK_H */ -- cgit v1.2.3