From d6a488f21c0f3c44bfbb2339a75159ee55aa2b6f Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Mon, 9 Jun 2014 10:52:37 +0530 Subject: RDMA/ocrdma: Report correct value of max_fast_reg_page_list_len Fix ocrdma_query_device() to report correct value of max_fast_reg_page_list_len. Signed-off-by: Devesh Sharma Signed-off-by: Selvin Xavier Signed-off-by: Mitesh Ahuja Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index acb434d16903..4527311171c5 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -101,7 +101,7 @@ int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr) attr->max_srq_sge = dev->attr.max_srq_sge; attr->max_srq_wr = dev->attr.max_rqe; attr->local_ca_ack_delay = dev->attr.local_ca_ack_delay; - attr->max_fast_reg_page_list_len = 0; + attr->max_fast_reg_page_list_len = dev->attr.max_pages_per_frmr; attr->max_pkeys = 1; return 0; } -- cgit v1.2.3-70-g09d2 From f93439e476d012b2503dbb07fe0fc675bcbff099 Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Mon, 9 Jun 2014 10:52:38 +0530 Subject: RDMA/ocrdma: Do not skip setting deferred_arm When ib_request_notify_cq() is called for the first time, ocrdma tries to skip setting deffered_arm flag. This may lead CQ to an un-armed state thus never generating a CQ event and leaving consumer hung. This patch removes the part of code that skips setting deferred_arm. Signed-off-by: Devesh Sharma Signed-off-by: Selvin Xavier Signed-off-by: Mitesh Ahuja Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 4527311171c5..8f5f2577f288 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -2846,11 +2846,9 @@ int ocrdma_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags cq_flags) if (cq->first_arm) { ocrdma_ring_cq_db(dev, cq_id, arm_needed, sol_needed, 0); cq->first_arm = false; - goto skip_defer; } - cq->deferred_arm = true; -skip_defer: + cq->deferred_arm = true; cq->deferred_sol = sol_needed; spin_unlock_irqrestore(&cq->cq_lock, flags); -- cgit v1.2.3-70-g09d2 From 87773dd56d5405ac28119fcfadacefd35877c18f Mon Sep 17 00:00:00 2001 From: Shawn Bohrer Date: Wed, 3 Sep 2014 12:13:57 -0500 Subject: IB: ib_umem_release() should decrement mm->pinned_vm from ib_umem_get In debugging an application that receives -ENOMEM from ib_reg_mr(), I found that ib_umem_get() can fail because the pinned_vm count has wrapped causing it to always be larger than the lock limit even with RLIMIT_MEMLOCK set to RLIM_INFINITY. The wrapping of pinned_vm occurs because the process that calls ib_reg_mr() will have its mm->pinned_vm count incremented. Later a different process with a different mm_struct than the one that allocated the ib_umem struct ends up releasing it which results in decrementing the new processes mm->pinned_vm count past zero and wrapping. I'm not entirely sure what circumstances cause a different process to release the ib_umem than the one that allocated it but the kernel stack trace of the freeing process from my situation looks like the following: Call Trace: [] dump_stack+0x19/0x1b [] ib_umem_release+0x1f5/0x200 [ib_core] [] mlx4_ib_destroy_qp+0x241/0x440 [mlx4_ib] [] ib_destroy_qp+0x12c/0x170 [ib_core] [] ib_uverbs_close+0x259/0x4e0 [ib_uverbs] [] __fput+0xba/0x240 [] ____fput+0xe/0x10 [] task_work_run+0xc4/0xe0 [] do_notify_resume+0x95/0xa0 [] int_signal+0x12/0x17 The following patch fixes the issue by storing the pid struct of the process that calls ib_umem_get() so that ib_umem_release and/or ib_umem_account() can properly decrement the pinned_vm count of the correct mm_struct. Signed-off-by: Shawn Bohrer Reviewed-by: Shachar Raindel Signed-off-by: Roland Dreier --- drivers/infiniband/core/umem.c | 19 +++++++++++++------ include/rdma/ib_umem.h | 1 + 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index a3a2e9c1639b..df0c4f605a21 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -105,6 +105,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, umem->length = size; umem->offset = addr & ~PAGE_MASK; umem->page_size = PAGE_SIZE; + umem->pid = get_task_pid(current, PIDTYPE_PID); /* * We ask for writable memory if any access flags other than * "remote read" are set. "Local write" and "remote write" @@ -198,6 +199,7 @@ out: if (ret < 0) { if (need_release) __ib_umem_release(context->device, umem, 0); + put_pid(umem->pid); kfree(umem); } else current->mm->pinned_vm = locked; @@ -230,15 +232,19 @@ void ib_umem_release(struct ib_umem *umem) { struct ib_ucontext *context = umem->context; struct mm_struct *mm; + struct task_struct *task; unsigned long diff; __ib_umem_release(umem->context->device, umem, 1); - mm = get_task_mm(current); - if (!mm) { - kfree(umem); - return; - } + task = get_pid_task(umem->pid, PIDTYPE_PID); + put_pid(umem->pid); + if (!task) + goto out; + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT; @@ -262,9 +268,10 @@ void ib_umem_release(struct ib_umem *umem) } else down_write(&mm->mmap_sem); - current->mm->pinned_vm -= diff; + mm->pinned_vm -= diff; up_write(&mm->mmap_sem); mmput(mm); +out: kfree(umem); } EXPORT_SYMBOL(ib_umem_release); diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 1ea0b65c4cfb..a2bf41e0bde9 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -47,6 +47,7 @@ struct ib_umem { int writable; int hugetlb; struct work_struct work; + struct pid *pid; struct mm_struct *mm; unsigned long diff; struct sg_table sg_head; -- cgit v1.2.3-70-g09d2 From 3033771febebf2c66d5146624746ebc4a3d79314 Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 15 Sep 2014 11:43:06 -0400 Subject: IB/ipath: Change get_user_pages() usage to always NULL vmas The static helper routine, __ipath_get_user_pages(), accepts a vma arg, but current use always passes NULL. This has caused some confusion associated with the correct use of this argument, but since the current use case doesn't require the flexiblity, the best thing to do is to simplfy the code to always pass NULL to get_user_pages(). Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_user_pages.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/ipath/ipath_user_pages.c b/drivers/infiniband/hw/ipath/ipath_user_pages.c index dc66c4506916..1da1252dcdb3 100644 --- a/drivers/infiniband/hw/ipath/ipath_user_pages.c +++ b/drivers/infiniband/hw/ipath/ipath_user_pages.c @@ -54,7 +54,7 @@ static void __ipath_release_user_pages(struct page **p, size_t num_pages, /* call with current->mm->mmap_sem held */ static int __ipath_get_user_pages(unsigned long start_page, size_t num_pages, - struct page **p, struct vm_area_struct **vma) + struct page **p) { unsigned long lock_limit; size_t got; @@ -74,7 +74,7 @@ static int __ipath_get_user_pages(unsigned long start_page, size_t num_pages, ret = get_user_pages(current, current->mm, start_page + got * PAGE_SIZE, num_pages - got, 1, 1, - p + got, vma); + p + got, NULL); if (ret < 0) goto bail_release; } @@ -165,7 +165,7 @@ int ipath_get_user_pages(unsigned long start_page, size_t num_pages, down_write(¤t->mm->mmap_sem); - ret = __ipath_get_user_pages(start_page, num_pages, p, NULL); + ret = __ipath_get_user_pages(start_page, num_pages, p); up_write(¤t->mm->mmap_sem); -- cgit v1.2.3-70-g09d2 From f5c4984e065556b81edf3c9dadfbc8750de690fc Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Mon, 15 Sep 2014 11:42:53 -0400 Subject: IB/qib: Change get_user_pages() usage to always NULL vmas The static helper routine, __qib_get_user_pages(), accepts a vma arg, but current use always passes NULL. This has caused some confusion associated with the correct use of this argument, but since the current use case doesn't require the flexiblity, the best thing to do is to simplfy the code to always pass NULL to get_user_pages(). Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_user_pages.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index 2bc1d2b96298..74f90b2619f6 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -52,7 +52,7 @@ static void __qib_release_user_pages(struct page **p, size_t num_pages, * Call with current->mm->mmap_sem held. */ static int __qib_get_user_pages(unsigned long start_page, size_t num_pages, - struct page **p, struct vm_area_struct **vma) + struct page **p) { unsigned long lock_limit; size_t got; @@ -69,7 +69,7 @@ static int __qib_get_user_pages(unsigned long start_page, size_t num_pages, ret = get_user_pages(current, current->mm, start_page + got * PAGE_SIZE, num_pages - got, 1, 1, - p + got, vma); + p + got, NULL); if (ret < 0) goto bail_release; } @@ -136,7 +136,7 @@ int qib_get_user_pages(unsigned long start_page, size_t num_pages, down_write(¤t->mm->mmap_sem); - ret = __qib_get_user_pages(start_page, num_pages, p, NULL); + ret = __qib_get_user_pages(start_page, num_pages, p); up_write(¤t->mm->mmap_sem); -- cgit v1.2.3-70-g09d2 From 50e2ec9105fe1045298924a6c7b9ce870a0b6c13 Mon Sep 17 00:00:00 2001 From: Markus Stockhausen Date: Wed, 13 Aug 2014 14:07:30 +0000 Subject: IB/mlx4: Disable TSO for Connect-X rev. A0 HCAs According to , revision A0 of Connect-X does not correctly assemble TSO packets. Disable that feature on that hardware revision. Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index e1e558a3d692..20f731f08c7e 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -59,6 +59,7 @@ #define MLX4_IB_FLOW_MAX_PRIO 0xFFF #define MLX4_IB_FLOW_QPN_MASK 0xFFFFFF +#define MLX4_IB_CARD_REV_A0 0xA0 MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver"); @@ -158,7 +159,9 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM) props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; - if (dev->dev->caps.max_gso_sz && dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH) + if (dev->dev->caps.max_gso_sz && + (dev->dev->rev_id != MLX4_IB_CARD_REV_A0) && + (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH)) props->device_cap_flags |= IB_DEVICE_UD_TSO; if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY) props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; -- cgit v1.2.3-70-g09d2 From 68f9d83c7765b110de54677573e9538f3ca6ca04 Mon Sep 17 00:00:00 2001 From: Alex Estrin Date: Wed, 20 Aug 2014 11:15:08 -0400 Subject: IPoIB: Remove unnecessary port query There are two queries for port attributes one after another. A second call is not needed since port_attr structure already holds the data. Reviewed-by: Ira Weiny Signed-off-by: Alex Estrin Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index d4e005720d01..ffb83b5f7e80 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -529,21 +529,13 @@ void ipoib_mcast_join_task(struct work_struct *work) port_attr.state); return; } + priv->local_lid = port_attr.lid; if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid)) ipoib_warn(priv, "ib_query_gid() failed\n"); else memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); - { - struct ib_port_attr attr; - - if (!ib_query_port(priv->ca, priv->port, &attr)) - priv->local_lid = attr.lid; - else - ipoib_warn(priv, "ib_query_port failed\n"); - } - if (!priv->broadcast) { struct ipoib_mcast *broadcast; -- cgit v1.2.3-70-g09d2 From 85cbb7c728bf39c45a9789b88c9471c0d7a58b0e Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 19 Sep 2014 08:32:19 -0400 Subject: IB/qib: Correct reference counting in debugfs qp_stats This particular reference count is not needed with the rcu protection, and the current code leaks a reference count, causing a hang in qib_qp_destroy(). Cc: Reviewed-by: Dennis Dalessandro Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_debugfs.c | 3 ++- drivers/infiniband/hw/qib/qib_qp.c | 8 -------- 2 files changed, 2 insertions(+), 9 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/qib/qib_debugfs.c b/drivers/infiniband/hw/qib/qib_debugfs.c index 799a0c3bffc4..6abd3ed3cd51 100644 --- a/drivers/infiniband/hw/qib/qib_debugfs.c +++ b/drivers/infiniband/hw/qib/qib_debugfs.c @@ -193,6 +193,7 @@ static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos) struct qib_qp_iter *iter; loff_t n = *pos; + rcu_read_lock(); iter = qib_qp_iter_init(s->private); if (!iter) return NULL; @@ -224,7 +225,7 @@ static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr, static void _qp_stats_seq_stop(struct seq_file *s, void *iter_ptr) { - /* nothing for now */ + rcu_read_unlock(); } static int _qp_stats_seq_show(struct seq_file *s, void *iter_ptr) diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index 7fcc150d603c..6ddc0264aad2 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -1325,7 +1325,6 @@ int qib_qp_iter_next(struct qib_qp_iter *iter) struct qib_qp *pqp = iter->qp; struct qib_qp *qp; - rcu_read_lock(); for (; n < dev->qp_table_size; n++) { if (pqp) qp = rcu_dereference(pqp->next); @@ -1333,18 +1332,11 @@ int qib_qp_iter_next(struct qib_qp_iter *iter) qp = rcu_dereference(dev->qp_table[n]); pqp = qp; if (qp) { - if (iter->qp) - atomic_dec(&iter->qp->refcount); - atomic_inc(&qp->refcount); - rcu_read_unlock(); iter->qp = qp; iter->n = n; return 0; } } - rcu_read_unlock(); - if (iter->qp) - atomic_dec(&iter->qp->refcount); return ret; } -- cgit v1.2.3-70-g09d2 From 4ff0acca7344c93fd9ed778b4c3ce16d95c594e4 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 11 Sep 2014 13:18:37 +0300 Subject: mlx4: Correct error flows in rereg_mr This patch addresses feedback from Sagi Grimberg on the rereg_mr implementation of mlx4. The following are fixed: 1. Set the correct pd_flags 2. Make sure we change the iova and size MR fields only after successful write and allocation of the MTTs. 3. Make the error checking more robust Fixes: e630664c8383 ("mlx4_core: Add helper functions to support MR re-registration") Signed-off-by: Matan Barak Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/mr.c | 7 +++++-- drivers/net/ethernet/mellanox/mlx4/mr.c | 33 ++++++++++++++++++++++----------- 2 files changed, 27 insertions(+), 13 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 9b0e80e59b08..8f9325cfc85d 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -234,14 +234,13 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, 0); if (IS_ERR(mmr->umem)) { err = PTR_ERR(mmr->umem); + /* Prevent mlx4_ib_dereg_mr from free'ing invalid pointer */ mmr->umem = NULL; goto release_mpt_entry; } n = ib_umem_page_count(mmr->umem); shift = ilog2(mmr->umem->page_size); - mmr->mmr.iova = virt_addr; - mmr->mmr.size = length; err = mlx4_mr_rereg_mem_write(dev->dev, &mmr->mmr, virt_addr, length, n, shift, *pmpt_entry); @@ -249,6 +248,8 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, ib_umem_release(mmr->umem); goto release_mpt_entry; } + mmr->mmr.iova = virt_addr; + mmr->mmr.size = length; err = mlx4_ib_umem_write_mtt(dev, &mmr->mmr.mtt, mmr->umem); if (err) { @@ -262,6 +263,8 @@ int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, * return a failure. But dereg_mr will free the resources. */ err = mlx4_mr_hw_write_mpt(dev->dev, &mmr->mmr, pmpt_entry); + if (!err && flags & IB_MR_REREG_ACCESS) + mmr->mmr.access = mr_access_flags; release_mpt_entry: mlx4_mr_hw_put_mpt(dev->dev, pmpt_entry); diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c b/drivers/net/ethernet/mellanox/mlx4/mr.c index 7d717eccb7b0..193a6adb5d04 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mr.c +++ b/drivers/net/ethernet/mellanox/mlx4/mr.c @@ -298,6 +298,7 @@ static int mlx4_HW2SW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); } +/* Must protect against concurrent access */ int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, struct mlx4_mpt_entry ***mpt_entry) { @@ -305,13 +306,10 @@ int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, int key = key_to_hw_index(mmr->key) & (dev->caps.num_mpts - 1); struct mlx4_cmd_mailbox *mailbox = NULL; - /* Make sure that at this point we have single-threaded access only */ - if (mmr->enabled != MLX4_MPT_EN_HW) return -EINVAL; err = mlx4_HW2SW_MPT(dev, NULL, key); - if (err) { mlx4_warn(dev, "HW2SW_MPT failed (%d).", err); mlx4_warn(dev, "Most likely the MR has MWs bound to it.\n"); @@ -333,7 +331,6 @@ int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, 0, MLX4_CMD_QUERY_MPT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); - if (err) goto free_mailbox; @@ -378,9 +375,10 @@ int mlx4_mr_hw_write_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, err = mlx4_SW2HW_MPT(dev, mailbox, key); } - mmr->pd = be32_to_cpu((*mpt_entry)->pd_flags) & MLX4_MPT_PD_MASK; - if (!err) + if (!err) { + mmr->pd = be32_to_cpu((*mpt_entry)->pd_flags) & MLX4_MPT_PD_MASK; mmr->enabled = MLX4_MPT_EN_HW; + } return err; } EXPORT_SYMBOL_GPL(mlx4_mr_hw_write_mpt); @@ -400,11 +398,12 @@ EXPORT_SYMBOL_GPL(mlx4_mr_hw_put_mpt); int mlx4_mr_hw_change_pd(struct mlx4_dev *dev, struct mlx4_mpt_entry *mpt_entry, u32 pdn) { - u32 pd_flags = be32_to_cpu(mpt_entry->pd_flags); + u32 pd_flags = be32_to_cpu(mpt_entry->pd_flags) & ~MLX4_MPT_PD_MASK; /* The wrapper function will put the slave's id here */ if (mlx4_is_mfunc(dev)) pd_flags &= ~MLX4_MPT_PD_VF_MASK; - mpt_entry->pd_flags = cpu_to_be32((pd_flags & ~MLX4_MPT_PD_MASK) | + + mpt_entry->pd_flags = cpu_to_be32(pd_flags | (pdn & MLX4_MPT_PD_MASK) | MLX4_MPT_PD_FLAG_EN_INV); return 0; @@ -600,14 +599,18 @@ int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr, { int err; - mpt_entry->start = cpu_to_be64(mr->iova); - mpt_entry->length = cpu_to_be64(mr->size); - mpt_entry->entity_size = cpu_to_be32(mr->mtt.page_shift); + mpt_entry->start = cpu_to_be64(iova); + mpt_entry->length = cpu_to_be64(size); + mpt_entry->entity_size = cpu_to_be32(page_shift); err = mlx4_mtt_init(dev, npages, page_shift, &mr->mtt); if (err) return err; + mpt_entry->pd_flags &= cpu_to_be32(MLX4_MPT_PD_MASK | + MLX4_MPT_PD_FLAG_EN_INV); + mpt_entry->flags &= cpu_to_be32(MLX4_MPT_FLAG_FREE | + MLX4_MPT_FLAG_SW_OWNS); if (mr->mtt.order < 0) { mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL); mpt_entry->mtt_addr = 0; @@ -617,6 +620,14 @@ int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr, if (mr->mtt.page_shift == 0) mpt_entry->mtt_sz = cpu_to_be32(1 << mr->mtt.order); } + if (mr->mtt.order >= 0 && mr->mtt.page_shift == 0) { + /* fast register MR in free state */ + mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_FREE); + mpt_entry->pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_FAST_REG | + MLX4_MPT_PD_FLAG_RAE); + } else { + mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_SW_OWNS); + } mr->enabled = MLX4_MPT_EN_SW; return 0; -- cgit v1.2.3-70-g09d2 From 1be528bcb88d0b854dda1d60b31f4f8f7310f034 Mon Sep 17 00:00:00 2001 From: "devesh.sharma@emulex.com" Date: Fri, 5 Sep 2014 15:09:48 +0530 Subject: RDMA/ocrdma: Resolve L2 address when creating user AH Because of IP-based GIDs, userspace AHs must have MAC and VLAN ID resolved separately. Presently, user AHs are broken for ocrdma. This patch resolves L2 addresses while creating user AH and obtains the right DMAC and VLAN ID before creating AH. Signed-off-by: Devesh Sharma Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_ah.c | 41 +++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 11 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c index 40f8536c10b0..a9f967d47575 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c @@ -38,7 +38,7 @@ #define OCRDMA_VID_PCP_SHIFT 0xD static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, - struct ib_ah_attr *attr, int pdid) + struct ib_ah_attr *attr, union ib_gid *sgid, int pdid) { int status = 0; u16 vlan_tag; bool vlan_enabled = false; @@ -49,8 +49,7 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, memset(ð, 0, sizeof(eth)); memset(&grh, 0, sizeof(grh)); - ah->sgid_index = attr->grh.sgid_index; - + /* VLAN */ vlan_tag = attr->vlan_id; if (!vlan_tag || (vlan_tag > 0xFFF)) vlan_tag = dev->pvid; @@ -65,15 +64,14 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, eth.eth_type = cpu_to_be16(OCRDMA_ROCE_ETH_TYPE); eth_sz = sizeof(struct ocrdma_eth_basic); } + /* MAC */ memcpy(ð.smac[0], &dev->nic_info.mac_addr[0], ETH_ALEN); - memcpy(ð.dmac[0], attr->dmac, ETH_ALEN); status = ocrdma_resolve_dmac(dev, attr, ð.dmac[0]); if (status) return status; - status = ocrdma_query_gid(&dev->ibdev, 1, attr->grh.sgid_index, - (union ib_gid *)&grh.sgid[0]); - if (status) - return status; + ah->sgid_index = attr->grh.sgid_index; + memcpy(&grh.sgid[0], sgid->raw, sizeof(union ib_gid)); + memcpy(&grh.dgid[0], attr->grh.dgid.raw, sizeof(attr->grh.dgid.raw)); grh.tclass_flow = cpu_to_be32((6 << 28) | (attr->grh.traffic_class << 24) | @@ -81,8 +79,7 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, /* 0x1b is next header value in GRH */ grh.pdid_hoplimit = cpu_to_be32((pdid << 16) | (0x1b << 8) | attr->grh.hop_limit); - - memcpy(&grh.dgid[0], attr->grh.dgid.raw, sizeof(attr->grh.dgid.raw)); + /* Eth HDR */ memcpy(&ah->av->eth_hdr, ð, eth_sz); memcpy((u8 *)ah->av + eth_sz, &grh, sizeof(struct ocrdma_grh)); if (vlan_enabled) @@ -98,6 +95,8 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr) struct ocrdma_ah *ah; struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); + union ib_gid sgid; + u8 zmac[ETH_ALEN]; if (!(attr->ah_flags & IB_AH_GRH)) return ERR_PTR(-EINVAL); @@ -111,7 +110,27 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr) status = ocrdma_alloc_av(dev, ah); if (status) goto av_err; - status = set_av_attr(dev, ah, attr, pd->id); + + status = ocrdma_query_gid(&dev->ibdev, 1, attr->grh.sgid_index, &sgid); + if (status) { + pr_err("%s(): Failed to query sgid, status = %d\n", + __func__, status); + goto av_conf_err; + } + + memset(&zmac, 0, ETH_ALEN); + if (pd->uctx && + memcmp(attr->dmac, &zmac, ETH_ALEN)) { + status = rdma_addr_find_dmac_by_grh(&sgid, &attr->grh.dgid, + attr->dmac, &attr->vlan_id); + if (status) { + pr_err("%s(): Failed to resolve dmac from gid." + "status = %d\n", __func__, status); + goto av_conf_err; + } + } + + status = set_av_attr(dev, ah, attr, &sgid, pd->id); if (status) goto av_conf_err; -- cgit v1.2.3-70-g09d2 From f0c2c225dfe9dfb668fe72eadabb8a3ec74ca036 Mon Sep 17 00:00:00 2001 From: "devesh.sharma@emulex.com" Date: Fri, 5 Sep 2014 15:09:49 +0530 Subject: RDMA/ocrdma: Use right macro in query AH ocrdma_query_ah() does not use correct macro, and checks the wrong bit for the validity of address handle in vector table. Fix this. Signed-off-by: Devesh Sharma Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_ah.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c index a9f967d47575..ac02ce4e8040 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c @@ -164,7 +164,7 @@ int ocrdma_query_ah(struct ib_ah *ibah, struct ib_ah_attr *attr) struct ocrdma_av *av = ah->av; struct ocrdma_grh *grh; attr->ah_flags |= IB_AH_GRH; - if (ah->av->valid & Bit(1)) { + if (ah->av->valid & OCRDMA_AV_VALID) { grh = (struct ocrdma_grh *)((u8 *)ah->av + sizeof(struct ocrdma_eth_vlan)); attr->sl = be16_to_cpu(av->eth_hdr.vlan_tag) >> 13; -- cgit v1.2.3-70-g09d2 From c33b15f00bbfb9324dc38e5176f576a0f46e0873 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Tue, 2 Sep 2014 17:08:41 +0300 Subject: IB/iser: Fix RX/TX CQ resource leak on error flow When failing to allocate TX CQ we already allocated RX CQ, so we need to make sure we release it. Also, when failing to register notification to the RX CQ we currently leak both RX and TX CQs of the current index, fix that too. Signed-off-by: Roi Dayan Signed-off-by: Sagi Grimberg Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/iser/iser_verbs.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 3ef167f97d6f..3bfec4bbda52 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -73,7 +73,7 @@ static int iser_create_device_ib_res(struct iser_device *device) { struct iser_cq_desc *cq_desc; struct ib_device_attr *dev_attr = &device->dev_attr; - int ret, i, j; + int ret, i; ret = ib_query_device(device->ib_device, dev_attr); if (ret) { @@ -125,16 +125,20 @@ static int iser_create_device_ib_res(struct iser_device *device) iser_cq_event_callback, (void *)&cq_desc[i], ISER_MAX_RX_CQ_LEN, i); - if (IS_ERR(device->rx_cq[i])) + if (IS_ERR(device->rx_cq[i])) { + device->rx_cq[i] = NULL; goto cq_err; + } device->tx_cq[i] = ib_create_cq(device->ib_device, NULL, iser_cq_event_callback, (void *)&cq_desc[i], ISER_MAX_TX_CQ_LEN, i); - if (IS_ERR(device->tx_cq[i])) + if (IS_ERR(device->tx_cq[i])) { + device->tx_cq[i] = NULL; goto cq_err; + } if (ib_req_notify_cq(device->rx_cq[i], IB_CQ_NEXT_COMP)) goto cq_err; @@ -160,14 +164,14 @@ static int iser_create_device_ib_res(struct iser_device *device) handler_err: ib_dereg_mr(device->mr); dma_mr_err: - for (j = 0; j < device->cqs_used; j++) - tasklet_kill(&device->cq_tasklet[j]); + for (i = 0; i < device->cqs_used; i++) + tasklet_kill(&device->cq_tasklet[i]); cq_err: - for (j = 0; j < i; j++) { - if (device->tx_cq[j]) - ib_destroy_cq(device->tx_cq[j]); - if (device->rx_cq[j]) - ib_destroy_cq(device->rx_cq[j]); + for (i = 0; i < device->cqs_used; i++) { + if (device->tx_cq[i]) + ib_destroy_cq(device->tx_cq[i]); + if (device->rx_cq[i]) + ib_destroy_cq(device->rx_cq[i]); } ib_dealloc_pd(device->pd); pd_err: -- cgit v1.2.3-70-g09d2 From 91eb1df39a1fba21bbc28895a84630782cd442ed Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Tue, 2 Sep 2014 17:08:42 +0300 Subject: IB/iser: Allow bind only when connection state is UP We need to fail the bind operation if the iser connection state != UP (started teardown) and this should be done under the state lock. Signed-off-by: Sagi Grimberg Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/iser/iscsi_iser.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 61ee91d88380..93ce62fe1594 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -344,7 +344,6 @@ iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session, int is_leading) { struct iscsi_conn *conn = cls_conn->dd_data; - struct iscsi_session *session; struct iser_conn *ib_conn; struct iscsi_endpoint *ep; int error; @@ -363,9 +362,17 @@ iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session, } ib_conn = ep->dd_data; - session = conn->session; - if (iser_alloc_rx_descriptors(ib_conn, session)) - return -ENOMEM; + mutex_lock(&ib_conn->state_mutex); + if (ib_conn->state != ISER_CONN_UP) { + error = -EINVAL; + iser_err("iser_conn %p state is %d, teardown started\n", + ib_conn, ib_conn->state); + goto out; + } + + error = iser_alloc_rx_descriptors(ib_conn, conn->session); + if (error) + goto out; /* binds the iSER connection retrieved from the previously * connected ep_handle to the iSCSI layer connection. exchanges @@ -375,7 +382,9 @@ iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session, conn->dd_data = ib_conn; ib_conn->iscsi_conn = conn; - return 0; +out: + mutex_unlock(&ib_conn->state_mutex); + return error; } static int -- cgit v1.2.3-70-g09d2 From 61aabb3c91c1b03478ffc1a4a2573f825e7f35f9 Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 2 Sep 2014 17:08:43 +0300 Subject: IB/iser: Bump version to 1.4.1 Signed-off-by: Roi Dayan Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/iser/iscsi_iser.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index c877dad381cb..9f0e0e34d6ca 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -69,7 +69,7 @@ #define DRV_NAME "iser" #define PFX DRV_NAME ": " -#define DRV_VER "1.4" +#define DRV_VER "1.4.1" #define iser_dbg(fmt, arg...) \ do { \ -- cgit v1.2.3-70-g09d2 From e381835cf1b8e3b2857277dbc3b77d8c5350f70a Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Thu, 21 Aug 2014 14:28:37 +0300 Subject: IB/mlx4: Avoid null pointer dereference in mlx4_ib_scan_netdevs() When Ethernet netdev is not present for a port (e.g. when the link layer type of the port is InfiniBand) it's possible to dereference a null pointer when we do netdevice scanning. To fix that, we move a section of code that needs to run only when netdev is present to a proper if () statement. Fixes: ad4885d279b6 ("IB/mlx4: Build the port IBoE GID table properly under bonding") Reported-by: Dan Carpenter Signed-off-by: Moni Shoua Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 49 +++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 23 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 20f731f08c7e..bf09d79afd4d 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1789,31 +1789,34 @@ static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, port_state = (netif_running(curr_netdev) && netif_carrier_ok(curr_netdev)) ? IB_PORT_ACTIVE : IB_PORT_DOWN; mlx4_ib_set_default_gid(ibdev, curr_netdev, port); - } else { - reset_gid_table(ibdev, port); - } - /* if using bonding/team and a slave port is down, we don't the bond IP - * based gids in the table since flows that select port by gid may get - * the down port. - */ - if (curr_master && (port_state == IB_PORT_DOWN)) { - reset_gid_table(ibdev, port); - mlx4_ib_set_default_gid(ibdev, curr_netdev, port); - } - /* if bonding is used it is possible that we add it to masters - * only after IP address is assigned to the net bonding - * interface. - */ - if (curr_master && (old_master != curr_master)) { - reset_gid_table(ibdev, port); - mlx4_ib_set_default_gid(ibdev, curr_netdev, port); - mlx4_ib_get_dev_addr(curr_master, ibdev, port); - } + /* if using bonding/team and a slave port is down, we + * don't the bond IP based gids in the table since + * flows that select port by gid may get the down port. + */ + if (curr_master && (port_state == IB_PORT_DOWN)) { + reset_gid_table(ibdev, port); + mlx4_ib_set_default_gid(ibdev, + curr_netdev, port); + } + /* if bonding is used it is possible that we add it to + * masters only after IP address is assigned to the + * net bonding interface. + */ + if (curr_master && (old_master != curr_master)) { + reset_gid_table(ibdev, port); + mlx4_ib_set_default_gid(ibdev, + curr_netdev, port); + mlx4_ib_get_dev_addr(curr_master, ibdev, port); + } - if (!curr_master && (old_master != curr_master)) { + if (!curr_master && (old_master != curr_master)) { + reset_gid_table(ibdev, port); + mlx4_ib_set_default_gid(ibdev, + curr_netdev, port); + mlx4_ib_get_dev_addr(curr_netdev, ibdev, port); + } + } else { reset_gid_table(ibdev, port); - mlx4_ib_set_default_gid(ibdev, curr_netdev, port); - mlx4_ib_get_dev_addr(curr_netdev, ibdev, port); } } -- cgit v1.2.3-70-g09d2 From f5c4834d9328c4ed9fe5dcbec6128d6da16db69a Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Thu, 21 Aug 2014 14:28:38 +0300 Subject: IB/mlx4: Don't duplicate the default RoCE GID When reading the IPv6 addresses from the net-device, make sure to avoid adding a duplicate entry to the GID table because of equality between the default GID we generate and the default IPv6 link-local address of the device. Fixes: acc4fccf4eff ("IB/mlx4: Make sure GID index 0 is always occupied") Signed-off-by: Moni Shoua Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index bf09d79afd4d..16fb2327813a 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1679,6 +1679,7 @@ static void mlx4_ib_get_dev_addr(struct net_device *dev, struct inet6_dev *in6_dev; union ib_gid *pgid; struct inet6_ifaddr *ifp; + union ib_gid default_gid; #endif union ib_gid gid; @@ -1699,12 +1700,15 @@ static void mlx4_ib_get_dev_addr(struct net_device *dev, in_dev_put(in_dev); } #if IS_ENABLED(CONFIG_IPV6) + mlx4_make_default_gid(dev, &default_gid); /* IPv6 gids */ in6_dev = in6_dev_get(dev); if (in6_dev) { read_lock_bh(&in6_dev->lock); list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { pgid = (union ib_gid *)&ifp->addr; + if (!memcmp(pgid, &default_gid, sizeof(*pgid))) + continue; update_gid_table(ibdev, port, pgid, 0, 0); } read_unlock_bh(&in6_dev->lock); -- cgit v1.2.3-70-g09d2 From 655b2aaefc353604f9975c31960d9722e6eda449 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Thu, 21 Aug 2014 14:28:39 +0300 Subject: IB/mlx4: Reorder steps in RoCE GID table initialization There's no need to reset the gid table twice and we need to do it only for Ethernet ports. Also, no need to actively scan ndetdevs since it's being done immediatly after we register netdev notifiers. Signed-off-by: Moni Shoua Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 16fb2327813a..49965b692042 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1730,24 +1730,33 @@ static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev) struct net_device *dev; struct mlx4_ib_iboe *iboe = &ibdev->iboe; int i; + int err = 0; - for (i = 1; i <= ibdev->num_ports; ++i) - if (reset_gid_table(ibdev, i)) - return -1; + for (i = 1; i <= ibdev->num_ports; ++i) { + if (rdma_port_get_link_layer(&ibdev->ib_dev, i) == + IB_LINK_LAYER_ETHERNET) { + err = reset_gid_table(ibdev, i); + if (err) + goto out; + } + } read_lock(&dev_base_lock); spin_lock(&iboe->lock); for_each_netdev(&init_net, dev) { u8 port = mlx4_ib_get_dev_port(dev, ibdev); - if (port) + /* port will be non-zero only for ETH ports */ + if (port) { + mlx4_ib_set_default_gid(ibdev, dev, port); mlx4_ib_get_dev_addr(dev, ibdev, port); + } } spin_unlock(&iboe->lock); read_unlock(&dev_base_lock); - - return 0; +out: + return err; } static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, @@ -2202,12 +2211,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) } } #endif - for (i = 1 ; i <= ibdev->num_ports ; ++i) - reset_gid_table(ibdev, i); - rtnl_lock(); - mlx4_ib_scan_netdevs(ibdev, NULL, 0); - rtnl_unlock(); - mlx4_ib_init_gid_table(ibdev); + if (mlx4_ib_init_gid_table(ibdev)) + goto err_notif; } for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { -- cgit v1.2.3-70-g09d2 From bccb84f1dfab92ed180adf09c76cfa9ddc90edb9 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Thu, 21 Aug 2014 14:28:40 +0300 Subject: IB/mlx4: Get upper dev addresses as RoCE GIDs when port comes up When a RoCE port becomes active and the netdev of the port has upper device (e.g bond/team), GIDs derived from the upper dev should appear in the port's RoCE GID table. Signed-off-by: Moni Shoua Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 49965b692042..d404a2eafa79 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1802,14 +1802,23 @@ static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, port_state = (netif_running(curr_netdev) && netif_carrier_ok(curr_netdev)) ? IB_PORT_ACTIVE : IB_PORT_DOWN; mlx4_ib_set_default_gid(ibdev, curr_netdev, port); - /* if using bonding/team and a slave port is down, we - * don't the bond IP based gids in the table since - * flows that select port by gid may get the down port. - */ - if (curr_master && (port_state == IB_PORT_DOWN)) { - reset_gid_table(ibdev, port); - mlx4_ib_set_default_gid(ibdev, - curr_netdev, port); + if (curr_master) { + /* if using bonding/team and a slave port is down, we + * don't want the bond IP based gids in the table since + * flows that select port by gid may get the down port. + */ + if (port_state == IB_PORT_DOWN) { + reset_gid_table(ibdev, port); + mlx4_ib_set_default_gid(ibdev, + curr_netdev, + port); + } else { + /* gids from the upper dev (bond/team) + * should appear in port's gid table + */ + mlx4_ib_get_dev_addr(curr_master, + ibdev, port); + } } /* if bonding is used it is possible that we add it to * masters only after IP address is assigned to the -- cgit v1.2.3-70-g09d2 From dba3ad2addcd74ec850e510f3b8a9d046cc24ef3 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Thu, 21 Aug 2014 14:28:41 +0300 Subject: IB/mlx4: Fix lockdep splat for the iboe lock Chuck Lever reported the following stack trace: ================================= [ INFO: inconsistent lock state ] 3.16.0-rc2-00024-g2e78883 #17 Tainted: G E --------------------------------- inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. swapper/0/0 [HC0[0]:SC1[1]:HE1:SE0] takes: (&(&iboe->lock)->rlock){+.?...}, at: [] mlx4_ib_addr_event+0xdb/0x1a0 [mlx4_ib] {SOFTIRQ-ON-W} state was registered at: [] mark_irqflags+0x110/0x170 [] __lock_acquire+0x2c6/0x5b0 [] lock_acquire+0xe9/0x120 [] _raw_spin_lock+0x3e/0x80 [] mlx4_ib_scan_netdevs+0x34/0x260 [mlx4_ib] [] mlx4_ib_netdev_event+0x2b/0x40 [mlx4_ib] [] register_netdevice_notifier+0x99/0x1e0 [] mlx4_ib_add+0x743/0xbc0 [mlx4_ib] [] mlx4_add_device+0x48/0xa0 [mlx4_core] [] mlx4_register_interface+0x73/0xb0 [mlx4_core] [] cm_req_handler+0x13e/0x460 [ib_cm] [] do_one_initcall+0x112/0x1c0 [] do_init_module+0x34/0x190 [] load_module+0x5cf/0x740 [] SyS_init_module+0x99/0xd0 [] system_call_fastpath+0x16/0x1b irq event stamp: 336142 hardirqs last enabled at (336142): [] __local_bh_enable_ip+0xb5/0xc0 hardirqs last disabled at (336141): [] __local_bh_enable_ip+0x56/0xc0 softirqs last enabled at (336004): [] _local_bh_enable+0x4a/0x50 softirqs last disabled at (336005): [] irq_exit+0x44/0xd0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&(&iboe->lock)->rlock); lock(&(&iboe->lock)->rlock); *** DEADLOCK *** The above problem was caused by the spin lock being taken both in the process context and in a soft-irq context (in a netdev notifier handler). The required fix is to use spin_lock/unlock_bh() instead of spin_lock/unlock on the iboe lock. Reported-by: Chuck Lever Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index d404a2eafa79..c231112396b2 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -360,7 +360,7 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port, props->state = IB_PORT_DOWN; props->phys_state = state_to_phys_state(props->state); props->active_mtu = IB_MTU_256; - spin_lock(&iboe->lock); + spin_lock_bh(&iboe->lock); ndev = iboe->netdevs[port - 1]; if (!ndev) goto out_unlock; @@ -372,7 +372,7 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port, IB_PORT_ACTIVE : IB_PORT_DOWN; props->phys_state = state_to_phys_state(props->state); out_unlock: - spin_unlock(&iboe->lock); + spin_unlock_bh(&iboe->lock); out: mlx4_free_cmd_mailbox(mdev->dev, mailbox); return err; @@ -814,11 +814,11 @@ int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, if (!mqp->port) return 0; - spin_lock(&mdev->iboe.lock); + spin_lock_bh(&mdev->iboe.lock); ndev = mdev->iboe.netdevs[mqp->port - 1]; if (ndev) dev_hold(ndev); - spin_unlock(&mdev->iboe.lock); + spin_unlock_bh(&mdev->iboe.lock); if (ndev) { ret = 1; @@ -1265,11 +1265,11 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) mutex_lock(&mqp->mutex); ge = find_gid_entry(mqp, gid->raw); if (ge) { - spin_lock(&mdev->iboe.lock); + spin_lock_bh(&mdev->iboe.lock); ndev = ge->added ? mdev->iboe.netdevs[ge->port - 1] : NULL; if (ndev) dev_hold(ndev); - spin_unlock(&mdev->iboe.lock); + spin_unlock_bh(&mdev->iboe.lock); if (ndev) dev_put(ndev); list_del(&ge->list); @@ -1554,7 +1554,7 @@ static int mlx4_ib_addr_event(int event, struct net_device *event_netdev, return 0; iboe = &ibdev->iboe; - spin_lock(&iboe->lock); + spin_lock_bh(&iboe->lock); for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) if ((netif_is_bond_master(real_dev) && @@ -1564,7 +1564,7 @@ static int mlx4_ib_addr_event(int event, struct net_device *event_netdev, update_gid_table(ibdev, port, gid, event == NETDEV_DOWN, 0); - spin_unlock(&iboe->lock); + spin_unlock_bh(&iboe->lock); return 0; } @@ -1742,7 +1742,7 @@ static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev) } read_lock(&dev_base_lock); - spin_lock(&iboe->lock); + spin_lock_bh(&iboe->lock); for_each_netdev(&init_net, dev) { u8 port = mlx4_ib_get_dev_port(dev, ibdev); @@ -1753,7 +1753,7 @@ static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev) } } - spin_unlock(&iboe->lock); + spin_unlock_bh(&iboe->lock); read_unlock(&dev_base_lock); out: return err; @@ -1770,7 +1770,7 @@ static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, iboe = &ibdev->iboe; - spin_lock(&iboe->lock); + spin_lock_bh(&iboe->lock); mlx4_foreach_ib_transport_port(port, ibdev->dev) { enum ib_port_state port_state = IB_PORT_NOP; struct net_device *old_master = iboe->masters[port - 1]; @@ -1842,7 +1842,7 @@ static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, } } - spin_unlock(&iboe->lock); + spin_unlock_bh(&iboe->lock); if (update_qps_port > 0) mlx4_ib_update_qps(ibdev, dev, update_qps_port); -- cgit v1.2.3-70-g09d2 From 4bf9715f184969dc703bde7be94919995024a6a9 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Thu, 21 Aug 2014 14:28:42 +0300 Subject: IB/mlx4: Avoid executing gid task when device is being removed When device is being removed (e.g during VPI port link type change from ETH to IB), tasks for gid table changes should not be executed. Flush the current queue of tasks and block further tasks from entering the queue. Signed-off-by: Moni Shoua Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index c231112396b2..c61bcee9ee8a 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1390,6 +1390,9 @@ static void update_gids_task(struct work_struct *work) int err; struct mlx4_dev *dev = gw->dev->dev; + if (!gw->dev->ib_active) + return; + mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) { pr_warn("update gid table failed %ld\n", PTR_ERR(mailbox)); @@ -1420,6 +1423,9 @@ static void reset_gids_task(struct work_struct *work) int err; struct mlx4_dev *dev = gw->dev->dev; + if (!gw->dev->ib_active) + return; + mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) { pr_warn("reset gid table failed\n"); @@ -2369,6 +2375,9 @@ static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) struct mlx4_ib_dev *ibdev = ibdev_ptr; int p; + ibdev->ib_active = false; + flush_workqueue(wq); + mlx4_ib_close_sriov(ibdev); mlx4_ib_mad_cleanup(ibdev); ib_unregister_device(&ibdev->ib_dev); -- cgit v1.2.3-70-g09d2 From a59c5850f09b4c2d6ad2fc47e5e1be8d654529d6 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Tue, 2 Sep 2014 15:32:34 +0300 Subject: IB/core: When marshaling uverbs path, clear unused fields When marsheling a user path to the kernel struct ib_sa_path, need to zero smac, dmac and set the vlan id to the "no vlan" value. Fixes: dd5f03beb4f7 ("IB/core: Ethernet L2 attributes in verbs/cm structures") Reported-by: Aleksey Senin Signed-off-by: Matan Barak Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_marshall.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c index e7bee46868d1..abd97247443e 100644 --- a/drivers/infiniband/core/uverbs_marshall.c +++ b/drivers/infiniband/core/uverbs_marshall.c @@ -140,5 +140,9 @@ void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst, dst->packet_life_time = src->packet_life_time; dst->preference = src->preference; dst->packet_life_time_selector = src->packet_life_time_selector; + + memset(dst->smac, 0, sizeof(dst->smac)); + memset(dst->dmac, 0, sizeof(dst->dmac)); + dst->vlan_id = 0xffff; } EXPORT_SYMBOL(ib_copy_path_rec_from_user); -- cgit v1.2.3-70-g09d2 From 3e0629cb6c0518423c9e2671bbe8ec15dde5dcaf Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Thu, 11 Sep 2014 14:11:17 +0300 Subject: IB/mlx4: Avoid accessing netdevice when building RoCE qp1 header The source MAC is needed in RoCE when building the QP1 header. Currently, this is obtained from the source net device. However, the net device may not yet exist, or can be destroyed in parallel to this QP1 send operation (e.g through the VPI port change flow) so accessing it may cause a kernel crash. To fix this, we maintain a source MAC cache per port for the net device in struct mlx4_ib_roce. This cached MAC is initialized to be the default MAC address obtained during HCA initialization via QUERY_PORT. This cached MAC is updated via the netdev event notifier handler. Since the cached MAC is held in an atomic64 object, we do not need locking when accessing it. Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 5 +++++ drivers/infiniband/hw/mlx4/mlx4_ib.h | 1 + drivers/infiniband/hw/mlx4/qp.c | 38 +++++++++++++++++++++--------------- 3 files changed, 28 insertions(+), 16 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index c61bcee9ee8a..657ce0f810fd 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1643,6 +1643,8 @@ static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev, new_smac = mlx4_mac_to_u64(dev->dev_addr); read_unlock(&dev_base_lock); + atomic64_set(&ibdev->iboe.mac[port - 1], new_smac); + mutex_lock(&ibdev->qp1_proxy_lock[port - 1]); qp = ibdev->qp1_proxy[port - 1]; if (qp) { @@ -2190,6 +2192,9 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) goto err_steer_free_bitmap; } + for (j = 1; j <= ibdev->dev->caps.num_ports; j++) + atomic64_set(&iboe->mac[j - 1], ibdev->dev->caps.def_mac[j]); + if (ib_register_device(&ibdev->ib_dev, NULL)) goto err_steer_free_bitmap; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index e8cad3926bfc..6eb743f65f6f 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -451,6 +451,7 @@ struct mlx4_ib_iboe { spinlock_t lock; struct net_device *netdevs[MLX4_MAX_PORTS]; struct net_device *masters[MLX4_MAX_PORTS]; + atomic64_t mac[MLX4_MAX_PORTS]; struct notifier_block nb; struct notifier_block nb_inet; struct notifier_block nb_inet6; diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 67780452f0cf..25e0208588e6 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -1390,18 +1390,10 @@ static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, u8 *smac, struct mlx4_qp_context *context) { - struct net_device *ndev; u64 u64_mac; int smac_index; - - ndev = dev->iboe.netdevs[qp->port - 1]; - if (ndev) { - smac = ndev->dev_addr; - u64_mac = mlx4_mac_to_u64(smac); - } else { - u64_mac = dev->dev->caps.def_mac[qp->port]; - } + u64_mac = atomic64_read(&dev->iboe.mac[qp->port - 1]); context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6); if (!qp->pri.smac) { @@ -2083,6 +2075,16 @@ static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, return 0; } +static void mlx4_u64_to_smac(u8 *dst_mac, u64 src_mac) +{ + int i; + + for (i = ETH_ALEN; i; i--) { + dst_mac[i - 1] = src_mac & 0xff; + src_mac >>= 8; + } +} + static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, void *wqe, unsigned *mlx_seg_len) { @@ -2197,7 +2199,6 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, } if (is_eth) { - u8 *smac; struct in6_addr in6; u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13; @@ -2210,12 +2211,17 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr, memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4); memcpy(&in6, sgid.raw, sizeof(in6)); - if (!mlx4_is_mfunc(to_mdev(ib_dev)->dev)) - smac = to_mdev(sqp->qp.ibqp.device)-> - iboe.netdevs[sqp->qp.port - 1]->dev_addr; - else /* use the src mac of the tunnel */ - smac = ah->av.eth.s_mac; - memcpy(sqp->ud_header.eth.smac_h, smac, 6); + if (!mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { + u64 mac = atomic64_read(&to_mdev(ib_dev)->iboe.mac[sqp->qp.port - 1]); + u8 smac[ETH_ALEN]; + + mlx4_u64_to_smac(smac, mac); + memcpy(sqp->ud_header.eth.smac_h, smac, ETH_ALEN); + } else { + /* use the src mac of the tunnel */ + memcpy(sqp->ud_header.eth.smac_h, ah->av.eth.s_mac, ETH_ALEN); + } + if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6)) mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); if (!is_vlan) { -- cgit v1.2.3-70-g09d2 From d24d9f43384b5933867a5786934e130efa8b5c92 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Thu, 11 Sep 2014 14:11:18 +0300 Subject: IB/mlx4: Don't update QP1 in native mode For native functions (non-SR-IOV), there's no reason to update the smac_index, as QP1 is a GSI QP. Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 657ce0f810fd..6ad7f7a0e464 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1645,6 +1645,10 @@ static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev, atomic64_set(&ibdev->iboe.mac[port - 1], new_smac); + /* no need for update QP1 and mac registration in non-SRIOV */ + if (!mlx4_is_mfunc(ibdev->dev)) + return; + mutex_lock(&ibdev->qp1_proxy_lock[port - 1]); qp = ibdev->qp1_proxy[port - 1]; if (qp) { -- cgit v1.2.3-70-g09d2 From 3dec48788817fce4a029cbab14f2b407ae78478f Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Thu, 11 Sep 2014 14:11:19 +0300 Subject: IB/mlx4: Do not allow APM under RoCE Automatic Path Migration is not supported under RoCE. Therefore, return a "not-supported" error if the caller attempts to set an alternate path in a QP context. In addition, if there are no IB ports configured, do not report APM capability in the device flags returned by mlx4_ib_query_device. Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 16 +++++++++++++++- drivers/infiniband/hw/mlx4/qp.c | 6 ++++++ 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 6ad7f7a0e464..d57563b9d1fb 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -120,6 +120,17 @@ static int check_flow_steering_support(struct mlx4_dev *dev) return dmfs; } +static int num_ib_ports(struct mlx4_dev *dev) +{ + int ib_ports = 0; + int i; + + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) + ib_ports++; + + return ib_ports; +} + static int mlx4_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props) { @@ -127,6 +138,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int err = -ENOMEM; + int have_ib_ports; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); @@ -143,6 +155,8 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, memset(props, 0, sizeof *props); + have_ib_ports = num_ib_ports(dev->dev); + props->fw_ver = dev->dev->caps.fw_ver; props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | IB_DEVICE_PORT_ACTIVE_EVENT | @@ -153,7 +167,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR) props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; - if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM) + if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM && have_ib_ports) props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UD_AV_PORT) props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE; diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 25e0208588e6..393423c69c00 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -1424,6 +1424,12 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, int steer_qp = 0; int err = -EINVAL; + /* APM is not supported under RoCE */ + if (attr_mask & IB_QP_ALT_PATH && + rdma_port_get_link_layer(&dev->ib_dev, qp->port) == + IB_LINK_LAYER_ETHERNET) + return -ENOTSUPP; + context = kzalloc(sizeof *context, GFP_KERNEL); if (!context) return -ENOMEM; -- cgit v1.2.3-70-g09d2 From 25476b0209b2e48dfb689e1b4cf7278875082b1f Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Thu, 11 Sep 2014 14:11:20 +0300 Subject: IB/mlx4: Fix VF mac handling in RoCE We had several problems here. First, a race condition on QP1 mac handling between mlx4_ib_update_qps and mlx4_ib_modify_qp, which is fixed by taking the qp mutex in mlx4_ib_update_qps. Also, qp->pri.smac_port was not updated in mlx4_ib_update_qps. Last, in __mlx4_ib_modify_qp we did not properly handle the case where the mac is zero, but port is non-zero. Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 15 ++++++++++----- drivers/infiniband/hw/mlx4/qp.c | 16 ++++++++++------ 2 files changed, 20 insertions(+), 11 deletions(-) (limited to 'drivers/infiniband') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index d57563b9d1fb..c7586a1cec30 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1667,9 +1667,11 @@ static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev, qp = ibdev->qp1_proxy[port - 1]; if (qp) { int new_smac_index; - u64 old_smac = qp->pri.smac; + u64 old_smac; struct mlx4_update_qp_params update_params; + mutex_lock(&qp->mutex); + old_smac = qp->pri.smac; if (new_smac == old_smac) goto unlock; @@ -1684,17 +1686,20 @@ static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev, release_mac = new_smac; goto unlock; } - + /* if old port was zero, no mac was yet registered for this QP */ + if (qp->pri.smac_port) + release_mac = old_smac; qp->pri.smac = new_smac; + qp->pri.smac_port = port; qp->pri.smac_index = new_smac_index; - - release_mac = old_smac; } unlock: - mutex_unlock(&ibdev->qp1_proxy_lock[port - 1]); if (release_mac != MLX4_IB_INVALID_MAC) mlx4_unregister_mac(ibdev->dev, port, release_mac); + if (qp) + mutex_unlock(&qp->mutex); + mutex_unlock(&ibdev->qp1_proxy_lock[port - 1]); } static void mlx4_ib_get_dev_addr(struct net_device *dev, diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 393423c69c00..577b477cfaa6 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -964,9 +964,10 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp)) pr_warn("modify QP %06x to RESET failed.\n", qp->mqp.qpn); - if (qp->pri.smac) { + if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) { mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); qp->pri.smac = 0; + qp->pri.smac_port = 0; } if (qp->alt.smac) { mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); @@ -1325,7 +1326,8 @@ static int _mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah, * If one was already assigned, but the new mac differs, * unregister the old one and register the new one. */ - if (!smac_info->smac || smac_info->smac != smac) { + if ((!smac_info->smac && !smac_info->smac_port) || + smac_info->smac != smac) { /* register candidate now, unreg if needed, after success */ smac_index = mlx4_register_mac(dev->dev, port, smac); if (smac_index >= 0) { @@ -1396,7 +1398,7 @@ static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, struct mlx4_ib_qp * u64_mac = atomic64_read(&dev->iboe.mac[qp->port - 1]); context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6); - if (!qp->pri.smac) { + if (!qp->pri.smac && !qp->pri.smac_port) { smac_index = mlx4_register_mac(dev->dev, qp->port, u64_mac); if (smac_index >= 0) { qp->pri.candidate_smac_index = smac_index; @@ -1778,9 +1780,10 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, if (qp->flags & MLX4_IB_QP_NETIF) mlx4_ib_steer_qp_reg(dev, qp, 0); } - if (qp->pri.smac) { + if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) { mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); qp->pri.smac = 0; + qp->pri.smac_port = 0; } if (qp->alt.smac) { mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac); @@ -1804,11 +1807,12 @@ out: if (err && steer_qp) mlx4_ib_steer_qp_reg(dev, qp, 0); kfree(context); - if (qp->pri.candidate_smac) { + if (qp->pri.candidate_smac || + (!qp->pri.candidate_smac && qp->pri.candidate_smac_port)) { if (err) { mlx4_unregister_mac(dev->dev, qp->pri.candidate_smac_port, qp->pri.candidate_smac); } else { - if (qp->pri.smac) + if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac); qp->pri.smac = qp->pri.candidate_smac; qp->pri.smac_index = qp->pri.candidate_smac_index; -- cgit v1.2.3-70-g09d2