From 0e7f011a19696cc25d68a8d6631fc6c5aa60a54c Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 23 Apr 2008 16:49:54 -0500 Subject: svcrdma: Simplify receive buffer posting The svcrdma transport provider currently allocates receive buffers to the RQ through the xpo_release_rqst method. This approach is overly complicated since it means that the rqstp rq_xprt_ctxt has to be selectively set based on whether the RPC is going to be processed immediately or deferred. Instead, just post the receive buffer when we are certain that we are replying in the send_reply function. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'net/sunrpc/xprtrdma/svc_rdma_transport.c') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index af408fc12634..1e0af2f205e9 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -910,27 +910,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) return NULL; } -/* - * Post an RQ WQE to the RQ when the rqst is being released. This - * effectively returns an RQ credit to the client. The rq_xprt_ctxt - * will be null if the request is deferred due to an RDMA_READ or the - * transport had no data ready (EAGAIN). Note that an RPC deferred in - * svc_process will still return the credit, this is because the data - * is copied and no longer consume a WQE/WC. - */ static void svc_rdma_release_rqst(struct svc_rqst *rqstp) { - int err; - struct svcxprt_rdma *rdma = - container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt); - if (rqstp->rq_xprt_ctxt) { - BUG_ON(rqstp->rq_xprt_ctxt != rdma); - err = svc_rdma_post_recv(rdma); - if (err) - dprintk("svcrdma: failed to post an RQ WQE error=%d\n", - err); - } - rqstp->rq_xprt_ctxt = NULL; } /* -- cgit v1.2.3 From dbcd00eba99945acfc433508a58eadc5dcd18cad Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Tue, 6 May 2008 11:33:11 -0500 Subject: svcrdma: Fix race with dto_tasklet in svc_rdma_send The svc_rdma_send function will attempt to reap SQ WR to make room for a new request if it finds the SQ full. This function races with the dto_tasklet that also reaps SQ WR. To avoid polling and arming the CQ unnecessarily move the test_and_clear_bit of the RDMAXPRT_SQ_PENDING flag and arming of the CQ to the sq_cq_reap function. Refactor the rq_cq_reap function to match sq_cq_reap so that the code is easier to follow. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 40 ++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'net/sunrpc/xprtrdma/svc_rdma_transport.c') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 1e0af2f205e9..73734173f994 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -228,23 +228,8 @@ static void dto_tasklet_func(unsigned long data) list_del_init(&xprt->sc_dto_q); spin_unlock_irqrestore(&dto_lock, flags); - if (test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags)) { - ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP); - rq_cq_reap(xprt); - set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); - /* - * If data arrived before established event, - * don't enqueue. This defers RPC I/O until the - * RDMA connection is complete. - */ - if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags)) - svc_xprt_enqueue(&xprt->sc_xprt); - } - - if (test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) { - ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP); - sq_cq_reap(xprt); - } + rq_cq_reap(xprt); + sq_cq_reap(xprt); svc_xprt_put(&xprt->sc_xprt); spin_lock_irqsave(&dto_lock, flags); @@ -297,6 +282,10 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) struct ib_wc wc; struct svc_rdma_op_ctxt *ctxt = NULL; + if (!test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags)) + return; + + ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP); atomic_inc(&rdma_stat_rq_poll); spin_lock_bh(&xprt->sc_rq_dto_lock); @@ -316,6 +305,15 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) if (ctxt) atomic_inc(&rdma_stat_rq_prod); + + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + /* + * If data arrived before established event, + * don't enqueue. This defers RPC I/O until the + * RDMA connection is complete. + */ + if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags)) + svc_xprt_enqueue(&xprt->sc_xprt); } /* @@ -328,6 +326,11 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) struct ib_cq *cq = xprt->sc_sq_cq; int ret; + + if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) + return; + + ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP); atomic_inc(&rdma_stat_sq_poll); while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; @@ -1010,7 +1013,8 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) { spin_unlock_bh(&xprt->sc_lock); atomic_inc(&rdma_stat_sq_starve); - /* See if we can reap some SQ WR */ + + /* See if we can opportunistically reap SQ WR to make room */ sq_cq_reap(xprt); /* Wait until SQ WR available if SQ still full */ -- cgit v1.2.3 From 9d6347acd2134373c3a4c65a4d43e4f1d59aa012 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Fri, 25 Apr 2008 15:51:27 -0500 Subject: svcrdma: Fix return value in svc_rdma_send Fix the return value on close to -ENOTCONN so caller knows to free context. Also if a thread is waiting for free SQ space, check for close when waking to avoid posting WR to a closing transport. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/sunrpc/xprtrdma/svc_rdma_transport.c') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 73734173f994..17f036b23a96 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -1002,7 +1002,7 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) int ret; if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) - return 0; + return -ENOTCONN; BUG_ON(wr->send_flags != IB_SEND_SIGNALED); BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op != -- cgit v1.2.3 From 120693d12cde0cc735d784c951b53381efec918f Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Thu, 24 Apr 2008 14:17:21 -0500 Subject: svcrdma: Add put of connection ESTABLISHED reference in rdma_cma_handler The svcrdma transport takes a reference when it gets the ESTABLISHED event from the provider. This reference is supposed to be removed when the DISCONNECT event is received, however, the call to svc_xprt_put was missing in the switch statement. This results in the memory associated with the transport never being freed. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/sunrpc/xprtrdma/svc_rdma_transport.c') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 17f036b23a96..4bf8b5ad1675 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -630,6 +630,7 @@ static int rdma_cma_handler(struct rdma_cm_id *cma_id, if (xprt) { set_bit(XPT_CLOSE, &xprt->xpt_flags); svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); } break; case RDMA_CM_EVENT_DEVICE_REMOVAL: -- cgit v1.2.3 From 05a0826a6e6d95ab6e9c3e4a10b58e10f233cc2b Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Fri, 25 Apr 2008 14:11:31 -0500 Subject: svcrdma: Free context on ib_post_recv error If there is an error posting the recv WR to the RQ, free the context associated with the WR. This would leak a context when asynchronous errors occurred on the transport while conccurent threads were processing their RPC. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/sunrpc/xprtrdma/svc_rdma_transport.c') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 4bf8b5ad1675..e85ac77f4954 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -524,6 +524,8 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) recv_wr.wr_id = (u64)(unsigned long)ctxt; ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr); + if (ret) + svc_rdma_put_context(ctxt, 1); return ret; } -- cgit v1.2.3 From 58e8f62137f1c55fe3d31234167660f2ce509297 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Tue, 6 May 2008 09:45:54 -0500 Subject: svcrdma: Fix error handling during listening endpoint creation A listening endpoint isn't known to the generic transport switch until the svc_create_xprt function returns without error. Calling svc_xprt_put within the xpo_create function causes the module reference count to be erroneously decremented. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'net/sunrpc/xprtrdma/svc_rdma_transport.c') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index e85ac77f4954..d9ed5f24c362 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -667,31 +667,27 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, cma_xprt = rdma_create_xprt(serv, 1); if (!cma_xprt) - return ERR_PTR(ENOMEM); + return ERR_PTR(-ENOMEM); xprt = &cma_xprt->sc_xprt; listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP); if (IS_ERR(listen_id)) { - svc_xprt_put(&cma_xprt->sc_xprt); - dprintk("svcrdma: rdma_create_id failed = %ld\n", - PTR_ERR(listen_id)); - return (void *)listen_id; + ret = PTR_ERR(listen_id); + dprintk("svcrdma: rdma_create_id failed = %d\n", ret); + goto err0; } + ret = rdma_bind_addr(listen_id, sa); if (ret) { - rdma_destroy_id(listen_id); - svc_xprt_put(&cma_xprt->sc_xprt); dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret); - return ERR_PTR(ret); + goto err1; } cma_xprt->sc_cm_id = listen_id; ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG); if (ret) { - rdma_destroy_id(listen_id); - svc_xprt_put(&cma_xprt->sc_xprt); dprintk("svcrdma: rdma_listen failed = %d\n", ret); - return ERR_PTR(ret); + goto err1; } /* @@ -702,6 +698,12 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen); return &cma_xprt->sc_xprt; + + err1: + rdma_destroy_id(listen_id); + err0: + kfree(cma_xprt); + return ERR_PTR(ret); } /* -- cgit v1.2.3 From 10a38c33f46d128d11e299acba744bc325cde420 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 30 Apr 2008 17:32:17 -0500 Subject: svcrdma: Remove unused READ_DONE context flags bit The RDMACTXT_F_READ_DONE bit is not longer used. Remove it. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/sunrpc/xprtrdma/svc_rdma_transport.c') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index d9ed5f24c362..4a79dfda1465 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -353,7 +353,6 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) case IB_WR_RDMA_READ: if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); - set_bit(RDMACTXT_F_READ_DONE, &ctxt->flags); spin_lock_bh(&xprt->sc_read_complete_lock); list_add_tail(&ctxt->dto_q, &xprt->sc_read_complete_q); -- cgit v1.2.3 From 02e7452de74d308ca642f54f7e5ef801ced60a92 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 30 Apr 2008 19:50:56 -0500 Subject: svcrdma: Simplify RDMA_READ deferral buffer management An NFS_WRITE requires a set of RDMA_READ requests to fetch the write data from the client. There are two principal pieces of data that need to be tracked: the list of pages that comprise the completed RPC and the SGE of dma mapped pages to refer to this list of pages. Previously this whole bit was managed as a linked list of contexts with the context containing the page list buried in this list. This patch simplifies this processing by not keeping a linked list, but rather only a pionter from the last submitted RDMA_READ's context to the context that maps the set of pages that describe the RPC. This significantly simplifies this code path. SGE contexts are cleaned up inline in the DTO path instead of at read completion time. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/sunrpc/xprtrdma/svc_rdma_transport.c') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 4a79dfda1465..34141eaf25a0 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -352,13 +352,16 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) case IB_WR_RDMA_READ: if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { + struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; + BUG_ON(!read_hdr); set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); spin_lock_bh(&xprt->sc_read_complete_lock); - list_add_tail(&ctxt->dto_q, + list_add_tail(&read_hdr->dto_q, &xprt->sc_read_complete_q); spin_unlock_bh(&xprt->sc_read_complete_lock); svc_xprt_enqueue(&xprt->sc_xprt); } + svc_rdma_put_context(ctxt, 0); break; default: -- cgit v1.2.3 From 8740767376b32a7772607e1b2b07cde0c24120cc Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 30 Apr 2008 20:44:39 -0500 Subject: svcrdma: Use standard Linux lists for context cache Replace the one-off linked list implementation used to implement the context cache with the standard Linux list_head lists. Add a context counter to catch resource leaks. A WARN_ON will be added later to ensure that we've freed all contexts. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 47 ++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 21 deletions(-) (limited to 'net/sunrpc/xprtrdma/svc_rdma_transport.c') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 34141eaf25a0..817cf4de746c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -103,8 +103,8 @@ static int rdma_bump_context_cache(struct svcxprt_rdma *xprt) spin_lock_bh(&xprt->sc_ctxt_lock); if (ctxt) { at_least_one = 1; - ctxt->next = xprt->sc_ctxt_head; - xprt->sc_ctxt_head = ctxt; + INIT_LIST_HEAD(&ctxt->free_list); + list_add(&ctxt->free_list, &xprt->sc_ctxt_free); } else { /* kmalloc failed...give up for now */ xprt->sc_ctxt_cnt--; @@ -123,7 +123,7 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) while (1) { spin_lock_bh(&xprt->sc_ctxt_lock); - if (unlikely(xprt->sc_ctxt_head == NULL)) { + if (unlikely(list_empty(&xprt->sc_ctxt_free))) { /* Try to bump my cache. */ spin_unlock_bh(&xprt->sc_ctxt_lock); @@ -136,12 +136,15 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) schedule_timeout_uninterruptible(msecs_to_jiffies(500)); continue; } - ctxt = xprt->sc_ctxt_head; - xprt->sc_ctxt_head = ctxt->next; + ctxt = list_entry(xprt->sc_ctxt_free.next, + struct svc_rdma_op_ctxt, + free_list); + list_del_init(&ctxt->free_list); spin_unlock_bh(&xprt->sc_ctxt_lock); ctxt->xprt = xprt; INIT_LIST_HEAD(&ctxt->dto_q); ctxt->count = 0; + atomic_inc(&xprt->sc_ctxt_used); break; } return ctxt; @@ -163,10 +166,11 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) ctxt->sge[i].addr, ctxt->sge[i].length, ctxt->direction); + spin_lock_bh(&xprt->sc_ctxt_lock); - ctxt->next = xprt->sc_ctxt_head; - xprt->sc_ctxt_head = ctxt; + list_add(&ctxt->free_list, &xprt->sc_ctxt_free); spin_unlock_bh(&xprt->sc_ctxt_lock); + atomic_dec(&xprt->sc_ctxt_used); } /* ib_cq event handler */ @@ -412,28 +416,29 @@ static void create_context_cache(struct svcxprt_rdma *xprt, xprt->sc_ctxt_max = ctxt_max; xprt->sc_ctxt_bump = ctxt_bump; xprt->sc_ctxt_cnt = 0; - xprt->sc_ctxt_head = NULL; + atomic_set(&xprt->sc_ctxt_used, 0); + + INIT_LIST_HEAD(&xprt->sc_ctxt_free); for (i = 0; i < ctxt_count; i++) { ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); if (ctxt) { - ctxt->next = xprt->sc_ctxt_head; - xprt->sc_ctxt_head = ctxt; + INIT_LIST_HEAD(&ctxt->free_list); + list_add(&ctxt->free_list, &xprt->sc_ctxt_free); xprt->sc_ctxt_cnt++; } } } -static void destroy_context_cache(struct svc_rdma_op_ctxt *ctxt) +static void destroy_context_cache(struct svcxprt_rdma *xprt) { - struct svc_rdma_op_ctxt *next; - if (!ctxt) - return; - - do { - next = ctxt->next; + while (!list_empty(&xprt->sc_ctxt_free)) { + struct svc_rdma_op_ctxt *ctxt; + ctxt = list_entry(xprt->sc_ctxt_free.next, + struct svc_rdma_op_ctxt, + free_list); + list_del_init(&ctxt->free_list); kfree(ctxt); - ctxt = next; - } while (next); + } } static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, @@ -470,7 +475,7 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, reqs + cma_xprt->sc_sq_depth + RPCRDMA_MAX_THREADS + 1); /* max */ - if (!cma_xprt->sc_ctxt_head) { + if (list_empty(&cma_xprt->sc_ctxt_free)) { kfree(cma_xprt); return NULL; } @@ -976,7 +981,7 @@ static void svc_rdma_free(struct svc_xprt *xprt) if (rdma->sc_pd && !IS_ERR(rdma->sc_pd)) ib_dealloc_pd(rdma->sc_pd); - destroy_context_cache(rdma->sc_ctxt_head); + destroy_context_cache(rdma); kfree(rdma); } -- cgit v1.2.3 From 47698e083e40bbd3ef87f5561390ae33abb13cd0 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Tue, 6 May 2008 11:49:05 -0500 Subject: svcrdma: Shrink scope of spinlock on RQ CQ The rq_cq_reap function is only called from the dto_tasklet. The only resource shared with other threads is the sc_rq_dto_q. Move the spin lock to protect only this list. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/sunrpc/xprtrdma/svc_rdma_transport.c') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 817cf4de746c..78303f0fad92 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -292,7 +292,6 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP); atomic_inc(&rdma_stat_rq_poll); - spin_lock_bh(&xprt->sc_rq_dto_lock); while ((ret = ib_poll_cq(xprt->sc_rq_cq, 1, &wc)) > 0) { ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; ctxt->wc_status = wc.status; @@ -303,9 +302,10 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) svc_rdma_put_context(ctxt, 1); continue; } + spin_lock_bh(&xprt->sc_rq_dto_lock); list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q); + spin_unlock_bh(&xprt->sc_rq_dto_lock); } - spin_unlock_bh(&xprt->sc_rq_dto_lock); if (ctxt) atomic_inc(&rdma_stat_rq_prod); -- cgit v1.2.3 From 8da91ea8de873ee8be82377ff18637d05e882058 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 30 Apr 2008 22:00:46 -0500 Subject: svcrdma: Move destroy to kernel thread Some providers may wait while destroying adapter resources. Since it is possible that the last reference is put on the dto_tasklet, the actual destroy must be scheduled as a work item. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'net/sunrpc/xprtrdma/svc_rdma_transport.c') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 78303f0fad92..028c6cf89364 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -963,12 +963,15 @@ static void svc_rdma_detach(struct svc_xprt *xprt) rdma_destroy_id(rdma->sc_cm_id); } -static void svc_rdma_free(struct svc_xprt *xprt) +static void __svc_rdma_free(struct work_struct *work) { - struct svcxprt_rdma *rdma = (struct svcxprt_rdma *)xprt; + struct svcxprt_rdma *rdma = + container_of(work, struct svcxprt_rdma, sc_work); dprintk("svcrdma: svc_rdma_free(%p)\n", rdma); + /* We should only be called from kref_put */ - BUG_ON(atomic_read(&xprt->xpt_ref.refcount) != 0); + BUG_ON(atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0); + if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq)) ib_destroy_cq(rdma->sc_sq_cq); @@ -985,6 +988,14 @@ static void svc_rdma_free(struct svc_xprt *xprt) kfree(rdma); } +static void svc_rdma_free(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + INIT_WORK(&rdma->sc_work, __svc_rdma_free); + schedule_work(&rdma->sc_work); +} + static int svc_rdma_has_wspace(struct svc_xprt *xprt) { struct svcxprt_rdma *rdma = -- cgit v1.2.3 From 0905c0f0a2346516ecd12f0a4f33dca571b0dccd Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Thu, 1 May 2008 10:49:03 -0500 Subject: svcrdma: Add reference for each SQ/RQ WR Add a reference on the transport for every outstanding WR. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'net/sunrpc/xprtrdma/svc_rdma_transport.c') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 028c6cf89364..31b1927b5ee6 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -279,6 +279,8 @@ static void rq_comp_handler(struct ib_cq *cq, void *cq_context) * * Take all completing WC off the CQE and enqueue the associated DTO * context on the dto_q for the transport. + * + * Note that caller must hold a transport reference. */ static void rq_cq_reap(struct svcxprt_rdma *xprt) { @@ -298,13 +300,16 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) ctxt->byte_len = wc.byte_len; if (wc.status != IB_WC_SUCCESS) { /* Close the transport */ + dprintk("svcrdma: transport closing putting ctxt %p\n", ctxt); set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); svc_rdma_put_context(ctxt, 1); + svc_xprt_put(&xprt->sc_xprt); continue; } spin_lock_bh(&xprt->sc_rq_dto_lock); list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q); spin_unlock_bh(&xprt->sc_rq_dto_lock); + svc_xprt_put(&xprt->sc_xprt); } if (ctxt) @@ -322,6 +327,8 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) /* * Send Queue Completion Handler - potentially called on interrupt context. + * + * Note that caller must hold a transport reference. */ static void sq_cq_reap(struct svcxprt_rdma *xprt) { @@ -374,6 +381,7 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) wc.opcode, wc.status); break; } + svc_xprt_put(&xprt->sc_xprt); } if (ctxt) @@ -530,9 +538,12 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) recv_wr.num_sge = ctxt->count; recv_wr.wr_id = (u64)(unsigned long)ctxt; + svc_xprt_get(&xprt->sc_xprt); ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr); - if (ret) + if (ret) { + svc_xprt_put(&xprt->sc_xprt); svc_rdma_put_context(ctxt, 1); + } return ret; } @@ -1049,14 +1060,17 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) continue; } /* Bumped used SQ WR count and post */ + svc_xprt_get(&xprt->sc_xprt); ret = ib_post_send(xprt->sc_qp, wr, &bad_wr); if (!ret) atomic_inc(&xprt->sc_sq_count); - else + else { + svc_xprt_put(&xprt->sc_xprt); dprintk("svcrdma: failed to post SQ WR rc=%d, " "sc_sq_count=%d, sc_sq_depth=%d\n", ret, atomic_read(&xprt->sc_sq_count), xprt->sc_sq_depth); + } spin_unlock_bh(&xprt->sc_lock); break; } -- cgit v1.2.3 From 1711386c62c97f7fb086a2247d44cdb1f8867640 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Thu, 1 May 2008 11:13:50 -0500 Subject: svcrdma: Move the QP and cm_id destruction to svc_rdma_free Move the destruction of the QP and CM_ID to the free path so that the QP cleanup code doesn't race with the dto_tasklet handling flushed WR. The QP reference is not needed because we now have a reference for every WR. Also add a guard in the SQ and RQ completion handlers to ignore calls generated by some providers when the QP is destroyed. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 40 ++++++++++++++++---------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'net/sunrpc/xprtrdma/svc_rdma_transport.c') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 31b1927b5ee6..b412a49c46fc 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -252,11 +252,15 @@ static void rq_comp_handler(struct ib_cq *cq, void *cq_context) struct svcxprt_rdma *xprt = cq_context; unsigned long flags; + /* Guard against unconditional flush call for destroyed QP */ + if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0) + return; + /* * Set the bit regardless of whether or not it's on the list * because it may be on the list already due to an SQ * completion. - */ + */ set_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags); /* @@ -393,11 +397,15 @@ static void sq_comp_handler(struct ib_cq *cq, void *cq_context) struct svcxprt_rdma *xprt = cq_context; unsigned long flags; + /* Guard against unconditional flush call for destroyed QP */ + if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0) + return; + /* * Set the bit regardless of whether or not it's on the list * because it may be on the list already due to an RQ * completion. - */ + */ set_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags); /* @@ -852,7 +860,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_sq_depth = qp_attr.cap.max_send_wr; newxprt->sc_max_requests = qp_attr.cap.max_recv_wr; } - svc_xprt_get(&newxprt->sc_xprt); newxprt->sc_qp = newxprt->sc_cm_id->qp; /* Register all of physical memory */ @@ -926,10 +933,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret); /* Take a reference in case the DTO handler runs */ svc_xprt_get(&newxprt->sc_xprt); - if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp)) { + if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp)) ib_destroy_qp(newxprt->sc_qp); - svc_xprt_put(&newxprt->sc_xprt); - } rdma_destroy_id(newxprt->sc_cm_id); /* This call to put will destroy the transport */ svc_xprt_put(&newxprt->sc_xprt); @@ -941,10 +946,7 @@ static void svc_rdma_release_rqst(struct svc_rqst *rqstp) } /* - * When connected, an svc_xprt has at least three references: - * - * - A reference held by the QP. We still hold that here because this - * code deletes the QP and puts the reference. + * When connected, an svc_xprt has at least two references: * * - A reference held by the cm_id between the ESTABLISHED and * DISCONNECTED events. If the remote peer disconnected first, this @@ -953,7 +955,7 @@ static void svc_rdma_release_rqst(struct svc_rqst *rqstp) * - A reference held by the svc_recv code that called this function * as part of close processing. * - * At a minimum two references should still be held. + * At a minimum one references should still be held. */ static void svc_rdma_detach(struct svc_xprt *xprt) { @@ -963,15 +965,6 @@ static void svc_rdma_detach(struct svc_xprt *xprt) /* Disconnect and flush posted WQE */ rdma_disconnect(rdma->sc_cm_id); - - /* Destroy the QP if present (not a listener) */ - if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) { - ib_destroy_qp(rdma->sc_qp); - svc_xprt_put(xprt); - } - - /* Destroy the CM ID */ - rdma_destroy_id(rdma->sc_cm_id); } static void __svc_rdma_free(struct work_struct *work) @@ -983,6 +976,13 @@ static void __svc_rdma_free(struct work_struct *work) /* We should only be called from kref_put */ BUG_ON(atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0); + /* Destroy the QP if present (not a listener) */ + if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) + ib_destroy_qp(rdma->sc_qp); + + /* Destroy the CM ID */ + rdma_destroy_id(rdma->sc_cm_id); + if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq)) ib_destroy_cq(rdma->sc_sq_cq); -- cgit v1.2.3 From 356d0a1519867422c3f17f79e2183f8c2d44f8ee Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Thu, 1 May 2008 11:25:02 -0500 Subject: svcrdma: Cleanup queued, but unprocessed I/O in svc_rdma_free When the transport is closing, the DTO tasklet may queue data that never gets processed. Clean up resources associated with this I/O. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 38 +++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) (limited to 'net/sunrpc/xprtrdma/svc_rdma_transport.c') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index b412a49c46fc..b1ff08d7da6c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -976,13 +976,42 @@ static void __svc_rdma_free(struct work_struct *work) /* We should only be called from kref_put */ BUG_ON(atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0); + /* + * Destroy queued, but not processed read completions. Note + * that this cleanup has to be done before destroying the + * cm_id because the device ptr is needed to unmap the dma in + * svc_rdma_put_context. + */ + spin_lock_bh(&rdma->sc_read_complete_lock); + while (!list_empty(&rdma->sc_read_complete_q)) { + struct svc_rdma_op_ctxt *ctxt; + ctxt = list_entry(rdma->sc_read_complete_q.next, + struct svc_rdma_op_ctxt, + dto_q); + list_del_init(&ctxt->dto_q); + svc_rdma_put_context(ctxt, 1); + } + spin_unlock_bh(&rdma->sc_read_complete_lock); + + /* Destroy queued, but not processed recv completions */ + spin_lock_bh(&rdma->sc_rq_dto_lock); + while (!list_empty(&rdma->sc_rq_dto_q)) { + struct svc_rdma_op_ctxt *ctxt; + ctxt = list_entry(rdma->sc_rq_dto_q.next, + struct svc_rdma_op_ctxt, + dto_q); + list_del_init(&ctxt->dto_q); + svc_rdma_put_context(ctxt, 1); + } + spin_unlock_bh(&rdma->sc_rq_dto_lock); + + /* Warn if we leaked a resource or under-referenced */ + WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0); + /* Destroy the QP if present (not a listener) */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) ib_destroy_qp(rdma->sc_qp); - /* Destroy the CM ID */ - rdma_destroy_id(rdma->sc_cm_id); - if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq)) ib_destroy_cq(rdma->sc_sq_cq); @@ -995,6 +1024,9 @@ static void __svc_rdma_free(struct work_struct *work) if (rdma->sc_pd && !IS_ERR(rdma->sc_pd)) ib_dealloc_pd(rdma->sc_pd); + /* Destroy the CM ID */ + rdma_destroy_id(rdma->sc_cm_id); + destroy_context_cache(rdma); kfree(rdma); } -- cgit v1.2.3 From 97a3df382e01c49555ea844bd7c4e5a08f245b9d Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Thu, 1 May 2008 14:02:45 -0500 Subject: svcrdma: Use ib verbs version of dma_unmap Use the ib_verbs version of the dma_unmap service in the svc_rdma_put_context function. This should support providers using software rdma. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/sunrpc/xprtrdma/svc_rdma_transport.c') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index b1ff08d7da6c..0b72c4c7d7cb 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -162,10 +162,10 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) put_page(ctxt->pages[i]); for (i = 0; i < ctxt->count; i++) - dma_unmap_single(xprt->sc_cm_id->device->dma_device, - ctxt->sge[i].addr, - ctxt->sge[i].length, - ctxt->direction); + ib_dma_unmap_single(xprt->sc_cm_id->device, + ctxt->sge[i].addr, + ctxt->sge[i].length, + ctxt->direction); spin_lock_bh(&xprt->sc_ctxt_lock); list_add(&ctxt->free_list, &xprt->sc_ctxt_free); -- cgit v1.2.3 From af261af4db14230fb35bcdc0ba9ef78ed6cf7bc1 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 7 May 2008 13:52:42 -0500 Subject: svcrdma: Copy transport address and arm CQ before calling rdma_accept This race was found by inspection. Messages can be received from the peer immediately following the rdma_accept call, however, the CQ have not yet been armed and the transport address has not yet been set. Set the transport address in the connect request handler and arm the CQ prior to calling rdma_accept. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'net/sunrpc/xprtrdma/svc_rdma_transport.c') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 0b72c4c7d7cb..c7545203f4b3 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -570,6 +570,7 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id) { struct svcxprt_rdma *listen_xprt = new_cma_id->context; struct svcxprt_rdma *newxprt; + struct sockaddr *sa; /* Create a new transport */ newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0); @@ -582,6 +583,12 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id) dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n", newxprt, newxprt->sc_cm_id, listen_xprt); + /* Set the local and remote addresses in the transport */ + sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; + svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa)); + sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; + svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa)); + /* * Enqueue the new transport on the accept queue of the listening * transport @@ -750,7 +757,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct rdma_conn_param conn_param; struct ib_qp_init_attr qp_attr; struct ib_device_attr devattr; - struct sockaddr *sa; int ret; int i; @@ -883,6 +889,13 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) /* Swap out the handler */ newxprt->sc_cm_id->event_handler = rdma_cma_handler; + /* + * Arm the CQs for the SQ and RQ before accepting so we can't + * miss the first message + */ + ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP); + ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP); + /* Accept Connection */ set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags); memset(&conn_param, 0, sizeof conn_param); @@ -919,14 +932,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_max_requests, newxprt->sc_ord); - /* Set the local and remote addresses in the transport */ - sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; - svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa)); - sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; - svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa)); - - ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP); - ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP); return &newxprt->sc_xprt; errout: -- cgit v1.2.3 From 008fdbc57164b0ac237ad6ee2766944f02ac9c28 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 7 May 2008 15:47:42 -0500 Subject: svcrdma: Change svc_rdma_send_error return type to void The svc_rdma_send_error function is called when an RPCRDMA protocol error is detected. This function attempts to post an error reply message. Since an error posting to a transport in error is ignored, change the return type to void. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net/sunrpc/xprtrdma/svc_rdma_transport.c') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index c7545203f4b3..e132509d1db0 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -1114,8 +1114,8 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) return ret; } -int svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, - enum rpcrdma_errcode err) +void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, + enum rpcrdma_errcode err) { struct ib_send_wr err_wr; struct ib_sge sge; @@ -1153,9 +1153,8 @@ int svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, /* Post It */ ret = svc_rdma_send(xprt, &err_wr); if (ret) { - dprintk("svcrdma: Error posting send = %d\n", ret); + dprintk("svcrdma: Error %d posting send for protocol error\n", + ret); svc_rdma_put_context(ctxt, 1); } - - return ret; } -- cgit v1.2.3