From 4fabb59449aa44a585b3603ffdadd4c5f4d0c033 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Mon, 6 Jul 2015 14:35:11 +0800 Subject: rds: rds_ib_device.refcount overflow Fixes: 3e0249f9c05c ("RDS/IB: add refcount tracking to struct rds_ib_device") There lacks a dropping on rds_ib_device.refcount in case rds_ib_alloc_fmr failed(mr pool running out). this lead to the refcount overflow. A complain in line 117(see following) is seen. From vmcore: s_ib_rdma_mr_pool_depleted is 2147485544 and rds_ibdev->refcount is -2147475448. That is the evidence the mr pool is used up. so rds_ib_alloc_fmr is very likely to return ERR_PTR(-EAGAIN). 115 void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) 116 { 117 BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0); 118 if (atomic_dec_and_test(&rds_ibdev->refcount)) 119 queue_work(rds_wq, &rds_ibdev->free_work); 120 } fix is to drop refcount when rds_ib_alloc_fmr failed. Signed-off-by: Wengang Wang Reviewed-by: Haggai Eran Signed-off-by: Doug Ledford --- net/rds/ib_rdma.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/rds/ib_rdma.c') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 273b8bff6ba4..657ba9f5d308 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -759,8 +759,10 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, } ibmr = rds_ib_alloc_fmr(rds_ibdev); - if (IS_ERR(ibmr)) + if (IS_ERR(ibmr)) { + rds_ib_dev_put(rds_ibdev); return ibmr; + } ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents); if (ret == 0) -- cgit v1.2.3-70-g09d2 From e1f475a738e4c861d8aff84b737a0538680cbe05 Mon Sep 17 00:00:00 2001 From: "santosh.shilimkar@oracle.com" Date: Sat, 22 Aug 2015 15:45:25 -0700 Subject: RDS: don't update ip address tables if the address hasn't changed If the ip address tables hasn't changed, there is no need to remove them only to be added back again. Lets fix it. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_rdma.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net/rds/ib_rdma.c') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 657ba9f5d308..e49c9568b4a5 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -151,12 +151,17 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) struct rds_ib_device *rds_ibdev_old; rds_ibdev_old = rds_ib_get_device(ipaddr); - if (rds_ibdev_old) { + if (!rds_ibdev_old) + return rds_ib_add_ipaddr(rds_ibdev, ipaddr); + + if (rds_ibdev_old != rds_ibdev) { rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); rds_ib_dev_put(rds_ibdev_old); + return rds_ib_add_ipaddr(rds_ibdev, ipaddr); } + rds_ib_dev_put(rds_ibdev_old); - return rds_ib_add_ipaddr(rds_ibdev, ipaddr); + return 0; } void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) -- cgit v1.2.3-70-g09d2 From 5c240fa2ab394af1dbde280e00cc038cbc7f0409 Mon Sep 17 00:00:00 2001 From: "santosh.shilimkar@oracle.com" Date: Sat, 22 Aug 2015 15:45:31 -0700 Subject: RDS: Fix assertion level from fatal to warning Fix the asserion level since its not fatal and can be hit in normal execution paths. There is no need to take the system down. We keep the WARN_ON() to detect the condition if we get here with bad pages. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_rdma.c | 2 +- net/rds/rdma.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/rds/ib_rdma.c') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index e49c9568b4a5..7b7aac8cdb56 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -490,7 +490,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) /* FIXME we need a way to tell a r/w MR * from a r/o MR */ - BUG_ON(irqs_disabled()); + WARN_ON(!page->mapping && irqs_disabled()); set_page_dirty(page); put_page(page); } diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 6401b501a215..c1df9b1cf3b2 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -451,7 +451,7 @@ void rds_rdma_free_op(struct rm_rdma_op *ro) * is the case for a RDMA_READ which copies from remote * to local memory */ if (!ro->op_write) { - BUG_ON(irqs_disabled()); + WARN_ON(!page->mapping && irqs_disabled()); set_page_dirty(page); } put_page(page); -- cgit v1.2.3-70-g09d2 From 6116c2030fff91950f68b7fffb5959c91a05aaf6 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Tue, 25 Aug 2015 12:02:00 -0700 Subject: RDS: fix fmr pool dirty_count In rds_ib_flush_mr_pool(), dirty_count accounts the clean ones which is wrong. This can lead to a negative dirty count value. Lets fix it. Signed-off-by: Wengang Wang Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_rdma.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net/rds/ib_rdma.c') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 7b7aac8cdb56..a275b7d205ef 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -528,11 +528,13 @@ static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int fr /* * given an llist of mrs, put them all into the list_head for more processing */ -static void llist_append_to_list(struct llist_head *llist, struct list_head *list) +static unsigned int llist_append_to_list(struct llist_head *llist, + struct list_head *list) { struct rds_ib_mr *ibmr; struct llist_node *node; struct llist_node *next; + unsigned int count = 0; node = llist_del_all(llist); while (node) { @@ -540,7 +542,9 @@ static void llist_append_to_list(struct llist_head *llist, struct list_head *lis ibmr = llist_entry(node, struct rds_ib_mr, llnode); list_add_tail(&ibmr->unmap_list, list); node = next; + count++; } + return count; } /* @@ -581,7 +585,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, LIST_HEAD(unmap_list); LIST_HEAD(fmr_list); unsigned long unpinned = 0; - unsigned int nfreed = 0, ncleaned = 0, free_goal; + unsigned int nfreed = 0, dirty_to_clean = 0, free_goal; int ret = 0; rds_ib_stats_inc(s_ib_rdma_mr_pool_flush); @@ -623,8 +627,8 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, /* Get the list of all MRs to be dropped. Ordering matters - * we want to put drop_list ahead of free_list. */ - llist_append_to_list(&pool->drop_list, &unmap_list); - llist_append_to_list(&pool->free_list, &unmap_list); + dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list); + dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list); if (free_all) llist_append_to_list(&pool->clean_list, &unmap_list); @@ -652,7 +656,6 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, kfree(ibmr); nfreed++; } - ncleaned++; } if (!list_empty(&unmap_list)) { @@ -678,7 +681,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, } atomic_sub(unpinned, &pool->free_pinned); - atomic_sub(ncleaned, &pool->dirty_count); + atomic_sub(dirty_to_clean, &pool->dirty_count); atomic_sub(nfreed, &pool->item_count); out: -- cgit v1.2.3-70-g09d2 From ad1d7dc0d79d3dd2c5d2931b13edbd4fe33e5fac Mon Sep 17 00:00:00 2001 From: "santosh.shilimkar@oracle.com" Date: Tue, 25 Aug 2015 12:02:01 -0700 Subject: RDS: push FMR pool flush work to its own worker RDS FMR flush operation and also it races with connect/reconect which happes a lot with RDS. FMR flush being on common rds_wq aggrevates the problem. Lets push RDS FMR pool flush work to its own worker. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib.c | 9 ++++++++- net/rds/ib.h | 2 ++ net/rds/ib_rdma.c | 27 ++++++++++++++++++++++++--- 3 files changed, 34 insertions(+), 4 deletions(-) (limited to 'net/rds/ib_rdma.c') diff --git a/net/rds/ib.c b/net/rds/ib.c index 13814227b3b2..d020fade312c 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -366,6 +366,7 @@ void rds_ib_exit(void) rds_ib_sysctl_exit(); rds_ib_recv_exit(); rds_trans_unregister(&rds_ib_transport); + rds_ib_fmr_exit(); } struct rds_transport rds_ib_transport = { @@ -401,10 +402,14 @@ int rds_ib_init(void) INIT_LIST_HEAD(&rds_ib_devices); - ret = ib_register_client(&rds_ib_client); + ret = rds_ib_fmr_init(); if (ret) goto out; + ret = ib_register_client(&rds_ib_client); + if (ret) + goto out_fmr_exit; + ret = rds_ib_sysctl_init(); if (ret) goto out_ibreg; @@ -427,6 +432,8 @@ out_sysctl: rds_ib_sysctl_exit(); out_ibreg: rds_ib_unregister_client(); +out_fmr_exit: + rds_ib_fmr_exit(); out: return ret; } diff --git a/net/rds/ib.h b/net/rds/ib.h index 6422c52682e5..9fc95e38659a 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -313,6 +313,8 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, void rds_ib_sync_mr(void *trans_private, int dir); void rds_ib_free_mr(void *trans_private, int invalidate); void rds_ib_flush_mrs(void); +int rds_ib_fmr_init(void); +void rds_ib_fmr_exit(void); /* ib_recv.c */ int rds_ib_recv_init(void); diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index a275b7d205ef..2ac78c9879ea 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -83,6 +83,25 @@ struct rds_ib_mr_pool { struct ib_fmr_attr fmr_attr; }; +struct workqueue_struct *rds_ib_fmr_wq; + +int rds_ib_fmr_init(void) +{ + rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd"); + if (!rds_ib_fmr_wq) + return -ENOMEM; + return 0; +} + +/* By the time this is called all the IB devices should have been torn down and + * had their pools freed. As each pool is freed its work struct is waited on, + * so the pool flushing work queue should be idle by the time we get here. + */ +void rds_ib_fmr_exit(void) +{ + destroy_workqueue(rds_ib_fmr_wq); +} + static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **); static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr); static void rds_ib_mr_pool_flush_worker(struct work_struct *work); @@ -719,15 +738,17 @@ void rds_ib_free_mr(void *trans_private, int invalidate) /* If we've pinned too many pages, request a flush */ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || atomic_read(&pool->dirty_count) >= pool->max_items / 10) - schedule_delayed_work(&pool->flush_worker, 10); + queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); if (invalidate) { if (likely(!in_interrupt())) { rds_ib_flush_mr_pool(pool, 0, NULL); } else { /* We get here if the user created a MR marked - * as use_once and invalidate at the same time. */ - schedule_delayed_work(&pool->flush_worker, 10); + * as use_once and invalidate at the same time. + */ + queue_delayed_work(rds_ib_fmr_wq, + &pool->flush_worker, 10); } } -- cgit v1.2.3-70-g09d2 From ef5217a6e2e60bc3d0679f2652480b99730956fe Mon Sep 17 00:00:00 2001 From: "santosh.shilimkar@oracle.com" Date: Tue, 25 Aug 2015 12:02:02 -0700 Subject: RDS: flush the FMR pool less often FMR flush is an expensive and time consuming operation. Reduce the frequency of FMR pool flush by 50% so that more FMR work gets accumulated for more efficient flushing. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/rds/ib_rdma.c') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 2ac78c9879ea..e596dfb76038 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -737,7 +737,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate) /* If we've pinned too many pages, request a flush */ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || - atomic_read(&pool->dirty_count) >= pool->max_items / 10) + atomic_read(&pool->dirty_count) >= pool->max_items / 5) queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); if (invalidate) { -- cgit v1.2.3-70-g09d2 From 272412141908c40517cc89d5bb2eb074a2ec1474 Mon Sep 17 00:00:00 2001 From: "santosh.shilimkar@oracle.com" Date: Tue, 25 Aug 2015 12:02:03 -0700 Subject: RDS: remove superfluous from rds_ib_alloc_fmr() Memory allocated for 'ibmr' uses kzalloc_node() which already initialises the memory to zero. There is no need to do memset() 0 on that memory. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_rdma.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/rds/ib_rdma.c') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index e596dfb76038..251d1ce0b7c7 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -360,8 +360,6 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) goto out_no_cigar; } - memset(ibmr, 0, sizeof(*ibmr)); - ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | -- cgit v1.2.3-70-g09d2