From 872bf2fb69d90e3619befee842fc26db39d8e475 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 25 Jan 2015 16:59:35 +0200 Subject: net/mlx4_core: Maintain a persistent memory for mlx4 device Maintain a persistent memory that should survive reset flow/PCI error. This comes as a preparation for coming series to support above flows. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/infiniband/hw/mlx4/main.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'drivers/infiniband/hw/mlx4/main.c') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 57ecc5b204f3..b4fa6f658800 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -198,7 +198,7 @@ static int mlx4_ib_query_device(struct ib_device *ibdev, props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & 0xffffff; - props->vendor_part_id = dev->dev->pdev->device; + props->vendor_part_id = dev->dev->persist->pdev->device; props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&props->sys_image_guid, out_mad->data + 4, 8); @@ -1375,7 +1375,7 @@ static ssize_t show_hca(struct device *device, struct device_attribute *attr, { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); - return sprintf(buf, "MT%d\n", dev->dev->pdev->device); + return sprintf(buf, "MT%d\n", dev->dev->persist->pdev->device); } static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, @@ -1937,7 +1937,8 @@ static void init_pkeys(struct mlx4_ib_dev *ibdev) int i; if (mlx4_is_master(ibdev->dev)) { - for (slave = 0; slave <= ibdev->dev->num_vfs; ++slave) { + for (slave = 0; slave <= ibdev->dev->persist->num_vfs; + ++slave) { for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { for (i = 0; i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; @@ -1994,7 +1995,7 @@ static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) { for (j = 0; j < eq_per_port; j++) { snprintf(name, sizeof(name), "mlx4-ib-%d-%d@%s", - i, j, dev->pdev->bus->name); + i, j, dev->persist->pdev->bus->name); /* Set IRQ for specific name (per ring) */ if (mlx4_assign_eq(dev, name, NULL, &ibdev->eq_table[eq])) { @@ -2058,7 +2059,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev); if (!ibdev) { - dev_err(&dev->pdev->dev, "Device struct alloc failed\n"); + dev_err(&dev->persist->pdev->dev, + "Device struct alloc failed\n"); return NULL; } @@ -2085,7 +2087,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->num_ports = num_ports; ibdev->ib_dev.phys_port_cnt = ibdev->num_ports; ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; - ibdev->ib_dev.dma_device = &dev->pdev->dev; + ibdev->ib_dev.dma_device = &dev->persist->pdev->dev; if (dev->caps.userspace_caps) ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; @@ -2236,7 +2238,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) sizeof(long), GFP_KERNEL); if (!ibdev->ib_uc_qpns_bitmap) { - dev_err(&dev->pdev->dev, "bit map alloc failed\n"); + dev_err(&dev->persist->pdev->dev, + "bit map alloc failed\n"); goto err_steer_qp_release; } -- cgit v1.2.3-70-g09d2 From a575009030931cd8a35c88ec81eb26b9e9f73539 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 3 Feb 2015 16:48:37 +0200 Subject: IB/mlx4: Add port aggregation support Register the interface with the mlx4 core driver with port aggregation support and check for port aggregation mode when the 'add' function is called. In this mode, only one physical port is reported to the upper layer (RoCE/IB core stack and ULPs). Signed-off-by: Moni Shoua Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/infiniband/hw/mlx4/main.c | 76 +++++++++++++++++++++++++++++++++++---- 1 file changed, 70 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband/hw/mlx4/main.c') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 9db258f7c804..ed21ae68a977 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -351,6 +351,7 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port, enum ib_mtu tmp; struct mlx4_cmd_mailbox *mailbox; int err = 0; + int is_bonded = mlx4_is_bonded(mdev->dev); mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); if (IS_ERR(mailbox)) @@ -374,8 +375,12 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port, props->state = IB_PORT_DOWN; props->phys_state = state_to_phys_state(props->state); props->active_mtu = IB_MTU_256; + if (is_bonded) + rtnl_lock(); /* required to get upper dev */ spin_lock_bh(&iboe->lock); ndev = iboe->netdevs[port - 1]; + if (ndev && is_bonded) + ndev = netdev_master_upper_dev_get(ndev); if (!ndev) goto out_unlock; @@ -387,6 +392,8 @@ static int eth_link_query_port(struct ib_device *ibdev, u8 port, props->phys_state = state_to_phys_state(props->state); out_unlock: spin_unlock_bh(&iboe->lock); + if (is_bonded) + rtnl_unlock(); out: mlx4_free_cmd_mailbox(mdev->dev, mailbox); return err; @@ -1440,6 +1447,7 @@ static void update_gids_task(struct work_struct *work) union ib_gid *gids; int err; struct mlx4_dev *dev = gw->dev->dev; + int is_bonded = mlx4_is_bonded(dev); if (!gw->dev->ib_active) return; @@ -1459,7 +1467,10 @@ static void update_gids_task(struct work_struct *work) if (err) pr_warn("set port command failed\n"); else - mlx4_ib_dispatch_event(gw->dev, gw->port, IB_EVENT_GID_CHANGE); + if ((gw->port == 1) || !is_bonded) + mlx4_ib_dispatch_event(gw->dev, + is_bonded ? 1 : gw->port, + IB_EVENT_GID_CHANGE); mlx4_free_cmd_mailbox(dev, mailbox); kfree(gw); @@ -1875,7 +1886,8 @@ static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, * don't want the bond IP based gids in the table since * flows that select port by gid may get the down port. */ - if (port_state == IB_PORT_DOWN) { + if (port_state == IB_PORT_DOWN && + !mlx4_is_bonded(ibdev->dev)) { reset_gid_table(ibdev, port); mlx4_ib_set_default_gid(ibdev, curr_netdev, @@ -2047,6 +2059,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) int err; struct mlx4_ib_iboe *iboe; int ib_num_ports = 0; + int num_req_counters; pr_info_once("%s", mlx4_ib_version); @@ -2086,7 +2099,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.node_type = RDMA_NODE_IB_CA; ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey; ibdev->num_ports = num_ports; - ibdev->ib_dev.phys_port_cnt = ibdev->num_ports; + ibdev->ib_dev.phys_port_cnt = mlx4_is_bonded(dev) ? + 1 : ibdev->num_ports; ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; ibdev->ib_dev.dma_device = &dev->persist->pdev->dev; @@ -2207,7 +2221,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) if (init_node_data(ibdev)) goto err_map; - for (i = 0; i < ibdev->num_ports; ++i) { + num_req_counters = mlx4_is_bonded(dev) ? 1 : ibdev->num_ports; + for (i = 0; i < num_req_counters; ++i) { mutex_init(&ibdev->qp1_proxy_lock[i]); if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) == IB_LINK_LAYER_ETHERNET) { @@ -2218,6 +2233,10 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->counters[i] = -1; } } + if (mlx4_is_bonded(dev)) + for (i = 1; i < ibdev->num_ports ; ++i) + ibdev->counters[i] = ibdev->counters[0]; + mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) ib_num_ports++; @@ -2538,6 +2557,38 @@ out: return; } +static void handle_bonded_port_state_event(struct work_struct *work) +{ + struct ib_event_work *ew = + container_of(work, struct ib_event_work, work); + struct mlx4_ib_dev *ibdev = ew->ib_dev; + enum ib_port_state bonded_port_state = IB_PORT_NOP; + int i; + struct ib_event ibev; + + kfree(ew); + spin_lock_bh(&ibdev->iboe.lock); + for (i = 0; i < MLX4_MAX_PORTS; ++i) { + struct net_device *curr_netdev = ibdev->iboe.netdevs[i]; + + enum ib_port_state curr_port_state = + (netif_running(curr_netdev) && + netif_carrier_ok(curr_netdev)) ? + IB_PORT_ACTIVE : IB_PORT_DOWN; + + bonded_port_state = (bonded_port_state != IB_PORT_ACTIVE) ? + curr_port_state : IB_PORT_ACTIVE; + } + spin_unlock_bh(&ibdev->iboe.lock); + + ibev.device = &ibdev->ib_dev; + ibev.element.port_num = 1; + ibev.event = (bonded_port_state == IB_PORT_ACTIVE) ? + IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR; + + ib_dispatch_event(&ibev); +} + static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, enum mlx4_dev_event event, unsigned long param) { @@ -2547,6 +2598,18 @@ static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, struct ib_event_work *ew; int p = 0; + if (mlx4_is_bonded(dev) && + ((event == MLX4_DEV_EVENT_PORT_UP) || + (event == MLX4_DEV_EVENT_PORT_DOWN))) { + ew = kmalloc(sizeof(*ew), GFP_ATOMIC); + if (!ew) + return; + INIT_WORK(&ew->work, handle_bonded_port_state_event); + ew->ib_dev = ibdev; + queue_work(wq, &ew->work); + return; + } + if (event == MLX4_DEV_EVENT_PORT_MGMT_CHANGE) eqe = (struct mlx4_eqe *)param; else @@ -2607,7 +2670,7 @@ static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, } ibev.device = ibdev_ptr; - ibev.element.port_num = (u8) p; + ibev.element.port_num = mlx4_is_bonded(ibdev->dev) ? 1 : (u8)p; ib_dispatch_event(&ibev); } @@ -2616,7 +2679,8 @@ static struct mlx4_interface mlx4_ib_interface = { .add = mlx4_ib_add, .remove = mlx4_ib_remove, .event = mlx4_ib_event, - .protocol = MLX4_PROT_IB_IPV6 + .protocol = MLX4_PROT_IB_IPV6, + .flags = MLX4_INTFF_BONDING }; static int __init mlx4_ib_init(void) -- cgit v1.2.3-70-g09d2 From 146d6e19832a72136089afca51e5229d1fd72dcd Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 3 Feb 2015 16:48:38 +0200 Subject: IB/mlx4: Create mirror flows in port aggregation mode In port aggregation mode flows for port #1 (the only port) should be mirrored on port #2. This is because packets can arrive from either physical ports. Signed-off-by: Moni Shoua Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/infiniband/hw/mlx4/main.c | 84 ++++++++++++++++++++++++++++++------ drivers/infiniband/hw/mlx4/mlx4_ib.h | 9 +++- 2 files changed, 80 insertions(+), 13 deletions(-) (limited to 'drivers/infiniband/hw/mlx4/main.c') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index ed21ae68a977..ca522382dedc 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -851,7 +851,7 @@ int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, struct mlx4_ib_steering { struct list_head list; - u64 reg_id; + struct mlx4_flow_reg_id reg_id; union ib_gid gid; }; @@ -1142,9 +1142,11 @@ static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain) { - int err = 0, i = 0; + int err = 0, i = 0, j = 0; struct mlx4_ib_flow *mflow; enum mlx4_net_trans_promisc_mode type[2]; + struct mlx4_dev *dev = (to_mdev(qp->device))->dev; + int is_bonded = mlx4_is_bonded(dev); memset(type, 0, sizeof(type)); @@ -1179,26 +1181,55 @@ static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, while (i < ARRAY_SIZE(type) && type[i]) { err = __mlx4_ib_create_flow(qp, flow_attr, domain, type[i], - &mflow->reg_id[i]); + &mflow->reg_id[i].id); if (err) goto err_create_flow; i++; + if (is_bonded) { + flow_attr->port = 2; + err = __mlx4_ib_create_flow(qp, flow_attr, + domain, type[j], + &mflow->reg_id[j].mirror); + flow_attr->port = 1; + if (err) + goto err_create_flow; + j++; + } + } if (i < ARRAY_SIZE(type) && flow_attr->type == IB_FLOW_ATTR_NORMAL) { - err = mlx4_ib_tunnel_steer_add(qp, flow_attr, &mflow->reg_id[i]); + err = mlx4_ib_tunnel_steer_add(qp, flow_attr, + &mflow->reg_id[i].id); if (err) goto err_create_flow; i++; + if (is_bonded) { + flow_attr->port = 2; + err = mlx4_ib_tunnel_steer_add(qp, flow_attr, + &mflow->reg_id[j].mirror); + flow_attr->port = 1; + if (err) + goto err_create_flow; + j++; + } + /* function to create mirror rule */ } return &mflow->ibflow; err_create_flow: while (i) { - (void)__mlx4_ib_destroy_flow(to_mdev(qp->device)->dev, mflow->reg_id[i]); + (void)__mlx4_ib_destroy_flow(to_mdev(qp->device)->dev, + mflow->reg_id[i].id); i--; } + + while (j) { + (void)__mlx4_ib_destroy_flow(to_mdev(qp->device)->dev, + mflow->reg_id[j].mirror); + j--; + } err_free: kfree(mflow); return ERR_PTR(err); @@ -1211,10 +1242,16 @@ static int mlx4_ib_destroy_flow(struct ib_flow *flow_id) struct mlx4_ib_dev *mdev = to_mdev(flow_id->qp->device); struct mlx4_ib_flow *mflow = to_mflow(flow_id); - while (i < ARRAY_SIZE(mflow->reg_id) && mflow->reg_id[i]) { - err = __mlx4_ib_destroy_flow(mdev->dev, mflow->reg_id[i]); + while (i < ARRAY_SIZE(mflow->reg_id) && mflow->reg_id[i].id) { + err = __mlx4_ib_destroy_flow(mdev->dev, mflow->reg_id[i].id); if (err) ret = err; + if (mflow->reg_id[i].mirror) { + err = __mlx4_ib_destroy_flow(mdev->dev, + mflow->reg_id[i].mirror); + if (err) + ret = err; + } i++; } @@ -1226,11 +1263,12 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { int err; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + struct mlx4_dev *dev = mdev->dev; struct mlx4_ib_qp *mqp = to_mqp(ibqp); - u64 reg_id; struct mlx4_ib_steering *ib_steering = NULL; enum mlx4_protocol prot = (gid->raw[1] == 0x0e) ? MLX4_PROT_IB_IPV4 : MLX4_PROT_IB_IPV6; + struct mlx4_flow_reg_id reg_id; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { @@ -1242,10 +1280,20 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, mqp->port, !!(mqp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), - prot, ®_id); + prot, ®_id.id); if (err) goto err_malloc; + reg_id.mirror = 0; + if (mlx4_is_bonded(dev)) { + err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, 2, + !!(mqp->flags & + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), + prot, ®_id.mirror); + if (err) + goto err_add; + } + err = add_gid_entry(ibqp, gid); if (err) goto err_add; @@ -1261,7 +1309,10 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) err_add: mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, - prot, reg_id); + prot, reg_id.id); + if (reg_id.mirror) + mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, + prot, reg_id.mirror); err_malloc: kfree(ib_steering); @@ -1288,10 +1339,12 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { int err; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + struct mlx4_dev *dev = mdev->dev; struct mlx4_ib_qp *mqp = to_mqp(ibqp); struct net_device *ndev; struct mlx4_ib_gid_entry *ge; - u64 reg_id = 0; + struct mlx4_flow_reg_id reg_id = {0, 0}; + enum mlx4_protocol prot = (gid->raw[1] == 0x0e) ? MLX4_PROT_IB_IPV4 : MLX4_PROT_IB_IPV6; @@ -1316,10 +1369,17 @@ static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) } err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, - prot, reg_id); + prot, reg_id.id); if (err) return err; + if (mlx4_is_bonded(dev)) { + err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, + prot, reg_id.mirror); + if (err) + return err; + } + mutex_lock(&mqp->mutex); ge = find_gid_entry(mqp, gid->raw); if (ge) { diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 6eb743f65f6f..2b49f9de2556 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -134,10 +134,17 @@ struct mlx4_ib_fmr { struct mlx4_fmr mfmr; }; +#define MAX_REGS_PER_FLOW 2 + +struct mlx4_flow_reg_id { + u64 id; + u64 mirror; +}; + struct mlx4_ib_flow { struct ib_flow ibflow; /* translating DMFS verbs sniffer rule to FW API requires two reg IDs */ - u64 reg_id[2]; + struct mlx4_flow_reg_id reg_id[MAX_REGS_PER_FLOW]; }; struct mlx4_ib_wq { -- cgit v1.2.3-70-g09d2 From c6215745b66a7fbeeda1a826f94dd864a2ccf654 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Tue, 3 Feb 2015 16:48:39 +0200 Subject: IB/mlx4: Load balance ports in port aggregation mode When the mlx4 IB (RoCE) device works in link aggregation mode, it exposes a single port to upper layers. Therefore, applications always set '1' in port_num attribute when modifying a QP or creating an address handle. To make sure that a node uses all available ports the mlx4 driver will override the port_num attribute with a round robin policy. Signed-off-by: Moni Shoua Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/infiniband/hw/mlx4/ah.c | 1 + drivers/infiniband/hw/mlx4/main.c | 1 + drivers/infiniband/hw/mlx4/mlx4_ib.h | 8 ++++++++ drivers/infiniband/hw/mlx4/qp.c | 19 +++++++++++++++++++ 4 files changed, 29 insertions(+) (limited to 'drivers/infiniband/hw/mlx4/main.c') diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index 2d8c3397774f..f50a546224ad 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "mlx4_ib.h" diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index ca522382dedc..2ed5b996b2f4 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2153,6 +2153,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock); ibdev->dev = dev; + ibdev->bond_next_port = 0; strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX); ibdev->ib_dev.owner = THIS_MODULE; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 2b49f9de2556..721540c9163d 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -534,6 +534,7 @@ struct mlx4_ib_dev { struct mlx4_ib_qp *qp1_proxy[MLX4_MAX_PORTS]; /* lock when destroying qp1_proxy and getting netdev events */ struct mutex qp1_proxy_lock[MLX4_MAX_PORTS]; + u8 bond_next_port; }; struct ib_event_work { @@ -629,6 +630,13 @@ static inline struct mlx4_ib_ah *to_mah(struct ib_ah *ibah) return container_of(ibah, struct mlx4_ib_ah, ibah); } +static inline u8 mlx4_ib_bond_next_port(struct mlx4_ib_dev *dev) +{ + dev->bond_next_port = (dev->bond_next_port + 1) % dev->num_ports; + + return dev->bond_next_port + 1; +} + int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev); void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev); diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 2f85fc762865..792f9dc86ada 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -1905,6 +1905,22 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, goto out; } + if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) { + if ((cur_state == IB_QPS_RESET) && (new_state == IB_QPS_INIT)) { + if ((ibqp->qp_type == IB_QPT_RC) || + (ibqp->qp_type == IB_QPT_UD) || + (ibqp->qp_type == IB_QPT_UC) || + (ibqp->qp_type == IB_QPT_RAW_PACKET) || + (ibqp->qp_type == IB_QPT_XRC_INI)) { + attr->port_num = mlx4_ib_bond_next_port(dev); + } + } else { + /* no sense in changing port_num + * when ports are bonded */ + attr_mask &= ~IB_QP_PORT; + } + } + if ((attr_mask & IB_QP_PORT) && (attr->port_num == 0 || attr->port_num > dev->num_ports)) { pr_debug("qpn 0x%x: invalid port number (%d) specified " @@ -1955,6 +1971,9 @@ int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state); + if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) + attr->port_num = 1; + out: mutex_unlock(&qp->mutex); return err; -- cgit v1.2.3-70-g09d2 From 824c25c1abe70a527646056f6911d181facde9cc Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Sun, 8 Feb 2015 11:49:33 +0200 Subject: IB/mlx4: Always use the correct port for mirrored multicast attachments When attaching a QP to a multicast address in bonded mode, there was an assumption that the port of the QP must be #1. This assumption isn't the case under the flow which enables maximal usage of the physical ports. Fix it by always checking the port of the original flow and create the mirrored flow on the other port. Fixes: c6215745b66a ('IB/mlx4: Load balance ports in port aggregation mode') Signed-off-by: Moni Shoua Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/infiniband/hw/mlx4/main.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/infiniband/hw/mlx4/main.c') diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 2ed5b996b2f4..3140da518a07 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1186,6 +1186,9 @@ static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, goto err_create_flow; i++; if (is_bonded) { + /* Application always sees one port so the mirror rule + * must be on port #2 + */ flow_attr->port = 2; err = __mlx4_ib_create_flow(qp, flow_attr, domain, type[j], @@ -1286,7 +1289,8 @@ static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) reg_id.mirror = 0; if (mlx4_is_bonded(dev)) { - err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, 2, + err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, + (mqp->port == 1) ? 2 : 1, !!(mqp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), prot, ®_id.mirror); -- cgit v1.2.3-70-g09d2 From 35f05dabf95ac3ebc4c15bafd6833f7a3046e66f Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 8 Feb 2015 11:49:34 +0200 Subject: IB/mlx4: Reset flow support for IB kernel ULPs The driver exposes interfaces that directly relate to HW state. Upon fatal error, consumers of these interfaces (ULPs) that rely on completion of all their posted work-request could hang, thereby introducing dependencies in shutdown order. To prevent this from happening, we manage the relevant resources (CQs, QPs) that are used by the device. Upon a fatal error, we now generate simulated completions for outstanding WQEs that were not completed at the time the HW was reset. It includes invoking the completion event handler for all involved CQs so that the ULPs will poll those CQs. When polled we return simulated CQEs with IB_WC_WR_FLUSH_ERR return code enabling ULPs to clean up their resources and not wait forever for completions upon receiving remove_one. The above change requires an extra check in the data path to make sure that when device is in error state, the simulated CQEs will be returned and no further WQEs will be posted. Signed-off-by: Yishai Hadas Signed-off-by: Or Gerlitz Signed-off-by: David S. Miller --- drivers/infiniband/hw/mlx4/cq.c | 57 ++++++++++++++++++++++++++++++++ drivers/infiniband/hw/mlx4/main.c | 64 ++++++++++++++++++++++++++++++++++++ drivers/infiniband/hw/mlx4/mlx4_ib.h | 9 +++++ drivers/infiniband/hw/mlx4/qp.c | 59 +++++++++++++++++++++++++++++---- drivers/infiniband/hw/mlx4/srq.c | 8 +++++ include/linux/mlx4/device.h | 2 ++ 6 files changed, 193 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband/hw/mlx4/main.c') diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index a3b70f6c4035..543ecdd8667b 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -188,6 +188,8 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector spin_lock_init(&cq->lock); cq->resize_buf = NULL; cq->resize_umem = NULL; + INIT_LIST_HEAD(&cq->send_qp_list); + INIT_LIST_HEAD(&cq->recv_qp_list); if (context) { struct mlx4_ib_create_cq ucmd; @@ -594,6 +596,55 @@ static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct return 0; } +static void mlx4_ib_qp_sw_comp(struct mlx4_ib_qp *qp, int num_entries, + struct ib_wc *wc, int *npolled, int is_send) +{ + struct mlx4_ib_wq *wq; + unsigned cur; + int i; + + wq = is_send ? &qp->sq : &qp->rq; + cur = wq->head - wq->tail; + + if (cur == 0) + return; + + for (i = 0; i < cur && *npolled < num_entries; i++) { + wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + wc->status = IB_WC_WR_FLUSH_ERR; + wc->vendor_err = MLX4_CQE_SYNDROME_WR_FLUSH_ERR; + wq->tail++; + (*npolled)++; + wc->qp = &qp->ibqp; + wc++; + } +} + +static void mlx4_ib_poll_sw_comp(struct mlx4_ib_cq *cq, int num_entries, + struct ib_wc *wc, int *npolled) +{ + struct mlx4_ib_qp *qp; + + *npolled = 0; + /* Find uncompleted WQEs belonging to that cq and retrun + * simulated FLUSH_ERR completions + */ + list_for_each_entry(qp, &cq->send_qp_list, cq_send_list) { + mlx4_ib_qp_sw_comp(qp, num_entries, wc, npolled, 1); + if (*npolled >= num_entries) + goto out; + } + + list_for_each_entry(qp, &cq->recv_qp_list, cq_recv_list) { + mlx4_ib_qp_sw_comp(qp, num_entries, wc + *npolled, npolled, 0); + if (*npolled >= num_entries) + goto out; + } + +out: + return; +} + static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq, struct mlx4_ib_qp **cur_qp, struct ib_wc *wc) @@ -836,8 +887,13 @@ int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) unsigned long flags; int npolled; int err = 0; + struct mlx4_ib_dev *mdev = to_mdev(cq->ibcq.device); spin_lock_irqsave(&cq->lock, flags); + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + mlx4_ib_poll_sw_comp(cq, num_entries, wc, &npolled); + goto out; + } for (npolled = 0; npolled < num_entries; ++npolled) { err = mlx4_ib_poll_one(cq, &cur_qp, wc + npolled); @@ -847,6 +903,7 @@ int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) mlx4_cq_set_ci(&cq->mcq); +out: spin_unlock_irqrestore(&cq->lock, flags); if (err == 0 || err == -EAGAIN) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 3140da518a07..eb8e215f1613 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2308,6 +2308,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) spin_lock_init(&ibdev->sm_lock); mutex_init(&ibdev->cap_mask_mutex); + INIT_LIST_HEAD(&ibdev->qp_list); + spin_lock_init(&ibdev->reset_flow_resource_lock); if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED && ib_num_ports) { @@ -2622,6 +2624,67 @@ out: return; } +static void mlx4_ib_handle_catas_error(struct mlx4_ib_dev *ibdev) +{ + struct mlx4_ib_qp *mqp; + unsigned long flags_qp; + unsigned long flags_cq; + struct mlx4_ib_cq *send_mcq, *recv_mcq; + struct list_head cq_notify_list; + struct mlx4_cq *mcq; + unsigned long flags; + + pr_warn("mlx4_ib_handle_catas_error was started\n"); + INIT_LIST_HEAD(&cq_notify_list); + + /* Go over qp list reside on that ibdev, sync with create/destroy qp.*/ + spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags); + + list_for_each_entry(mqp, &ibdev->qp_list, qps_list) { + spin_lock_irqsave(&mqp->sq.lock, flags_qp); + if (mqp->sq.tail != mqp->sq.head) { + send_mcq = to_mcq(mqp->ibqp.send_cq); + spin_lock_irqsave(&send_mcq->lock, flags_cq); + if (send_mcq->mcq.comp && + mqp->ibqp.send_cq->comp_handler) { + if (!send_mcq->mcq.reset_notify_added) { + send_mcq->mcq.reset_notify_added = 1; + list_add_tail(&send_mcq->mcq.reset_notify, + &cq_notify_list); + } + } + spin_unlock_irqrestore(&send_mcq->lock, flags_cq); + } + spin_unlock_irqrestore(&mqp->sq.lock, flags_qp); + /* Now, handle the QP's receive queue */ + spin_lock_irqsave(&mqp->rq.lock, flags_qp); + /* no handling is needed for SRQ */ + if (!mqp->ibqp.srq) { + if (mqp->rq.tail != mqp->rq.head) { + recv_mcq = to_mcq(mqp->ibqp.recv_cq); + spin_lock_irqsave(&recv_mcq->lock, flags_cq); + if (recv_mcq->mcq.comp && + mqp->ibqp.recv_cq->comp_handler) { + if (!recv_mcq->mcq.reset_notify_added) { + recv_mcq->mcq.reset_notify_added = 1; + list_add_tail(&recv_mcq->mcq.reset_notify, + &cq_notify_list); + } + } + spin_unlock_irqrestore(&recv_mcq->lock, + flags_cq); + } + } + spin_unlock_irqrestore(&mqp->rq.lock, flags_qp); + } + + list_for_each_entry(mcq, &cq_notify_list, reset_notify) { + mcq->comp(mcq); + } + spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags); + pr_warn("mlx4_ib_handle_catas_error ended\n"); +} + static void handle_bonded_port_state_event(struct work_struct *work) { struct ib_event_work *ew = @@ -2701,6 +2764,7 @@ static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: ibdev->ib_active = false; ibev.event = IB_EVENT_DEVICE_FATAL; + mlx4_ib_handle_catas_error(ibdev); break; case MLX4_DEV_EVENT_PORT_MGMT_CHANGE: diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 721540c9163d..f829fd935b79 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -110,6 +110,9 @@ struct mlx4_ib_cq { struct mutex resize_mutex; struct ib_umem *umem; struct ib_umem *resize_umem; + /* List of qps that it serves.*/ + struct list_head send_qp_list; + struct list_head recv_qp_list; }; struct mlx4_ib_mr { @@ -300,6 +303,9 @@ struct mlx4_ib_qp { struct mlx4_roce_smac_vlan_info pri; struct mlx4_roce_smac_vlan_info alt; u64 reg_id; + struct list_head qps_list; + struct list_head cq_recv_list; + struct list_head cq_send_list; }; struct mlx4_ib_srq { @@ -535,6 +541,9 @@ struct mlx4_ib_dev { /* lock when destroying qp1_proxy and getting netdev events */ struct mutex qp1_proxy_lock[MLX4_MAX_PORTS]; u8 bond_next_port; + /* protect resources needed as part of reset flow */ + spinlock_t reset_flow_resource_lock; + struct list_head qp_list; }; struct ib_event_work { diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 792f9dc86ada..dfc6ca128a7e 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -46,6 +46,11 @@ #include "mlx4_ib.h" #include "user.h" +static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, + struct mlx4_ib_cq *recv_cq); +static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, + struct mlx4_ib_cq *recv_cq); + enum { MLX4_IB_ACK_REQ_FREQ = 8, }; @@ -618,6 +623,8 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, struct mlx4_ib_sqp *sqp; struct mlx4_ib_qp *qp; enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type; + struct mlx4_ib_cq *mcq; + unsigned long flags; /* When tunneling special qps, we use a plain UD qp */ if (sqpn) { @@ -828,6 +835,24 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd, qp->mqp.event = mlx4_ib_qp_event; if (!*caller_qp) *caller_qp = qp; + + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); + mlx4_ib_lock_cqs(to_mcq(init_attr->send_cq), + to_mcq(init_attr->recv_cq)); + /* Maintain device to QPs access, needed for further handling + * via reset flow + */ + list_add_tail(&qp->qps_list, &dev->qp_list); + /* Maintain CQ to QPs access, needed for further handling + * via reset flow + */ + mcq = to_mcq(init_attr->send_cq); + list_add_tail(&qp->cq_send_list, &mcq->send_qp_list); + mcq = to_mcq(init_attr->recv_cq); + list_add_tail(&qp->cq_recv_list, &mcq->recv_qp_list); + mlx4_ib_unlock_cqs(to_mcq(init_attr->send_cq), + to_mcq(init_attr->recv_cq)); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); return 0; err_qpn: @@ -886,13 +911,13 @@ static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv __acquires(&send_cq->lock) __acquires(&recv_cq->lock) { if (send_cq == recv_cq) { - spin_lock_irq(&send_cq->lock); + spin_lock(&send_cq->lock); __acquire(&recv_cq->lock); } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { - spin_lock_irq(&send_cq->lock); + spin_lock(&send_cq->lock); spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); } else { - spin_lock_irq(&recv_cq->lock); + spin_lock(&recv_cq->lock); spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); } } @@ -902,13 +927,13 @@ static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *re { if (send_cq == recv_cq) { __release(&recv_cq->lock); - spin_unlock_irq(&send_cq->lock); + spin_unlock(&send_cq->lock); } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) { spin_unlock(&recv_cq->lock); - spin_unlock_irq(&send_cq->lock); + spin_unlock(&send_cq->lock); } else { spin_unlock(&send_cq->lock); - spin_unlock_irq(&recv_cq->lock); + spin_unlock(&recv_cq->lock); } } @@ -953,6 +978,7 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, int is_user) { struct mlx4_ib_cq *send_cq, *recv_cq; + unsigned long flags; if (qp->state != IB_QPS_RESET) { if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state), @@ -984,8 +1010,13 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, get_cqs(qp, &send_cq, &recv_cq); + spin_lock_irqsave(&dev->reset_flow_resource_lock, flags); mlx4_ib_lock_cqs(send_cq, recv_cq); + /* del from lists under both locks above to protect reset flow paths */ + list_del(&qp->qps_list); + list_del(&qp->cq_send_list); + list_del(&qp->cq_recv_list); if (!is_user) { __mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn, qp->ibqp.srq ? to_msrq(qp->ibqp.srq): NULL); @@ -996,6 +1027,7 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, mlx4_qp_remove(dev->dev, &qp->mqp); mlx4_ib_unlock_cqs(send_cq, recv_cq); + spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags); mlx4_qp_free(dev->dev, &qp->mqp); @@ -2618,8 +2650,15 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, __be32 uninitialized_var(lso_hdr_sz); __be32 blh; int i; + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); spin_lock_irqsave(&qp->sq.lock, flags); + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } ind = qp->sq_next_wqe; @@ -2917,10 +2956,18 @@ int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, int ind; int max_gs; int i; + struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); max_gs = qp->rq.max_gs; spin_lock_irqsave(&qp->rq.lock, flags); + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } + ind = qp->rq.head & (qp->rq.wqe_cnt - 1); for (nreq = 0; wr; ++nreq, wr = wr->next) { diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index 62d9285300af..dce5dfe3a70e 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -316,8 +316,15 @@ int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, int err = 0; int nreq; int i; + struct mlx4_ib_dev *mdev = to_mdev(ibsrq->device); spin_lock_irqsave(&srq->lock, flags); + if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { + err = -EIO; + *bad_wr = wr; + nreq = 0; + goto out; + } for (nreq = 0; wr; ++nreq, wr = wr->next) { if (unlikely(wr->num_sge > srq->msrq.max_gs)) { @@ -362,6 +369,7 @@ int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr, *srq->db.db = cpu_to_be32(srq->wqe_ctr); } +out: spin_unlock_irqrestore(&srq->lock, flags); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index c116cb02475c..e4ebff7e9d02 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -689,6 +689,8 @@ struct mlx4_cq { void (*comp)(struct mlx4_cq *); void *priv; } tasklet_ctx; + int reset_notify_added; + struct list_head reset_notify; }; struct mlx4_qp { -- cgit v1.2.3-70-g09d2