diff options
Diffstat (limited to 'drivers/infiniband/hw')
46 files changed, 7271 insertions, 453 deletions
diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile index fba0b3be903e..6b3a88046125 100644 --- a/drivers/infiniband/hw/Makefile +++ b/drivers/infiniband/hw/Makefile @@ -13,3 +13,4 @@ obj-$(CONFIG_INFINIBAND_HFI1) += hfi1/ obj-$(CONFIG_INFINIBAND_HNS) += hns/ obj-$(CONFIG_INFINIBAND_QEDR) += qedr/ obj-$(CONFIG_INFINIBAND_BNXT_RE) += bnxt_re/ +obj-$(CONFIG_INFINIBAND_ERDMA) += erdma/ diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 79401e6c6aa9..785c37cae3c0 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -173,7 +173,7 @@ struct bnxt_re_dev { /* Max of 2 lossless traffic class supported per port */ u16 cosq[2]; - /* QP for for handling QP1 packets */ + /* QP for handling QP1 packets */ struct bnxt_re_gsi_context gsi_ctx; struct bnxt_re_stats stats; atomic_t nq_alloc_cnt; diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index c16017f6e8db..14392c942f49 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -2468,31 +2468,24 @@ static int accept_cr(struct c4iw_ep *ep, struct sk_buff *skb, opt2 |= CCTRL_ECN_V(1); } - skb_get(skb); - rpl = cplhdr(skb); if (!is_t4(adapter_type)) { - BUILD_BUG_ON(sizeof(*rpl5) != roundup(sizeof(*rpl5), 16)); - skb_trim(skb, sizeof(*rpl5)); - rpl5 = (void *)rpl; - INIT_TP_WR(rpl5, ep->hwtid); - } else { - skb_trim(skb, sizeof(*rpl)); - INIT_TP_WR(rpl, ep->hwtid); - } - OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, - ep->hwtid)); - - if (CHELSIO_CHIP_VERSION(adapter_type) > CHELSIO_T4) { u32 isn = (prandom_u32() & ~7UL) - 1; + + skb = get_skb(skb, roundup(sizeof(*rpl5), 16), GFP_KERNEL); + rpl5 = __skb_put_zero(skb, roundup(sizeof(*rpl5), 16)); + rpl = (void *)rpl5; + INIT_TP_WR_CPL(rpl5, CPL_PASS_ACCEPT_RPL, ep->hwtid); opt2 |= T5_OPT_2_VALID_F; opt2 |= CONG_CNTRL_V(CONG_ALG_TAHOE); opt2 |= T5_ISS_F; - rpl5 = (void *)rpl; - memset_after(rpl5, 0, iss); if (peer2peer) isn += 4; rpl5->iss = cpu_to_be32(isn); pr_debug("iss %u\n", be32_to_cpu(rpl5->iss)); + } else { + skb = get_skb(skb, sizeof(*rpl), GFP_KERNEL); + rpl = __skb_put_zero(skb, sizeof(*rpl)); + INIT_TP_WR_CPL(rpl, CPL_PASS_ACCEPT_RPL, ep->hwtid); } rpl->opt0 = cpu_to_be64(opt0); diff --git a/drivers/infiniband/hw/erdma/Kconfig b/drivers/infiniband/hw/erdma/Kconfig new file mode 100644 index 000000000000..169038e3ceb1 --- /dev/null +++ b/drivers/infiniband/hw/erdma/Kconfig @@ -0,0 +1,12 @@ +# SPDX-License-Identifier: GPL-2.0-only +config INFINIBAND_ERDMA + tristate "Alibaba Elastic RDMA Adapter (ERDMA) support" + depends on PCI_MSI && 64BIT + depends on INFINIBAND_ADDR_TRANS + depends on INFINIBAND_USER_ACCESS + help + This is a RDMA/iWarp driver for Alibaba Elastic RDMA Adapter(ERDMA), + which supports RDMA features in Alibaba cloud environment. + + To compile this driver as module, choose M here. The module will be + called erdma. diff --git a/drivers/infiniband/hw/erdma/Makefile b/drivers/infiniband/hw/erdma/Makefile new file mode 100644 index 000000000000..51d2ef91905a --- /dev/null +++ b/drivers/infiniband/hw/erdma/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_INFINIBAND_ERDMA) := erdma.o + +erdma-y := erdma_cm.o erdma_main.o erdma_cmdq.o erdma_cq.o erdma_verbs.o erdma_qp.o erdma_eq.o diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h new file mode 100644 index 000000000000..2aae635c1c8d --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma.h @@ -0,0 +1,287 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ + +/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */ +/* Kai Shen <kaishen@linux.alibaba.com> */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +#ifndef __ERDMA_H__ +#define __ERDMA_H__ + +#include <linux/bitfield.h> +#include <linux/netdevice.h> +#include <linux/xarray.h> +#include <rdma/ib_verbs.h> + +#include "erdma_hw.h" + +#define DRV_MODULE_NAME "erdma" +#define ERDMA_NODE_DESC "Elastic RDMA(iWARP) stack" + +struct erdma_eq { + void *qbuf; + dma_addr_t qbuf_dma_addr; + + spinlock_t lock; + + u32 depth; + + u16 ci; + u16 rsvd; + + atomic64_t event_num; + atomic64_t notify_num; + + u64 __iomem *db_addr; + u64 *db_record; +}; + +struct erdma_cmdq_sq { + void *qbuf; + dma_addr_t qbuf_dma_addr; + + spinlock_t lock; + + u32 depth; + u16 ci; + u16 pi; + + u16 wqebb_cnt; + + u64 *db_record; +}; + +struct erdma_cmdq_cq { + void *qbuf; + dma_addr_t qbuf_dma_addr; + + spinlock_t lock; + + u32 depth; + u32 ci; + u32 cmdsn; + + u64 *db_record; + + atomic64_t armed_num; +}; + +enum { + ERDMA_CMD_STATUS_INIT, + ERDMA_CMD_STATUS_ISSUED, + ERDMA_CMD_STATUS_FINISHED, + ERDMA_CMD_STATUS_TIMEOUT +}; + +struct erdma_comp_wait { + struct completion wait_event; + u32 cmd_status; + u32 ctx_id; + u16 sq_pi; + u8 comp_status; + u8 rsvd; + u32 comp_data[4]; +}; + +enum { + ERDMA_CMDQ_STATE_OK_BIT = 0, + ERDMA_CMDQ_STATE_TIMEOUT_BIT = 1, + ERDMA_CMDQ_STATE_CTX_ERR_BIT = 2, +}; + +#define ERDMA_CMDQ_TIMEOUT_MS 15000 +#define ERDMA_REG_ACCESS_WAIT_MS 20 +#define ERDMA_WAIT_DEV_DONE_CNT 500 + +struct erdma_cmdq { + unsigned long *comp_wait_bitmap; + struct erdma_comp_wait *wait_pool; + spinlock_t lock; + + bool use_event; + + struct erdma_cmdq_sq sq; + struct erdma_cmdq_cq cq; + struct erdma_eq eq; + + unsigned long state; + + struct semaphore credits; + u16 max_outstandings; +}; + +#define COMPROMISE_CC ERDMA_CC_CUBIC +enum erdma_cc_alg { + ERDMA_CC_NEWRENO = 0, + ERDMA_CC_CUBIC, + ERDMA_CC_HPCC_RTT, + ERDMA_CC_HPCC_ECN, + ERDMA_CC_HPCC_INT, + ERDMA_CC_METHODS_NUM +}; + +struct erdma_devattr { + u32 fw_version; + + unsigned char peer_addr[ETH_ALEN]; + + int numa_node; + enum erdma_cc_alg cc; + u32 grp_num; + u32 irq_num; + + bool disable_dwqe; + u16 dwqe_pages; + u16 dwqe_entries; + + u32 max_qp; + u32 max_send_wr; + u32 max_recv_wr; + u32 max_ord; + u32 max_ird; + + u32 max_send_sge; + u32 max_recv_sge; + u32 max_sge_rd; + u32 max_cq; + u32 max_cqe; + u64 max_mr_size; + u32 max_mr; + u32 max_pd; + u32 max_mw; + u32 local_dma_key; +}; + +#define ERDMA_IRQNAME_SIZE 50 + +struct erdma_irq { + char name[ERDMA_IRQNAME_SIZE]; + u32 msix_vector; + cpumask_t affinity_hint_mask; +}; + +struct erdma_eq_cb { + bool ready; + void *dev; /* All EQs use this fields to get erdma_dev struct */ + struct erdma_irq irq; + struct erdma_eq eq; + struct tasklet_struct tasklet; +}; + +struct erdma_resource_cb { + unsigned long *bitmap; + spinlock_t lock; + u32 next_alloc_idx; + u32 max_cap; +}; + +enum { + ERDMA_RES_TYPE_PD = 0, + ERDMA_RES_TYPE_STAG_IDX = 1, + ERDMA_RES_CNT = 2, +}; + +#define ERDMA_EXTRA_BUFFER_SIZE ERDMA_DB_SIZE +#define WARPPED_BUFSIZE(size) ((size) + ERDMA_EXTRA_BUFFER_SIZE) + +struct erdma_dev { + struct ib_device ibdev; + struct net_device *netdev; + struct pci_dev *pdev; + struct notifier_block netdev_nb; + + resource_size_t func_bar_addr; + resource_size_t func_bar_len; + u8 __iomem *func_bar; + + struct erdma_devattr attrs; + /* physical port state (only one port per device) */ + enum ib_port_state state; + + /* cmdq and aeq use the same msix vector */ + struct erdma_irq comm_irq; + struct erdma_cmdq cmdq; + struct erdma_eq aeq; + struct erdma_eq_cb ceqs[ERDMA_NUM_MSIX_VEC - 1]; + + spinlock_t lock; + struct erdma_resource_cb res_cb[ERDMA_RES_CNT]; + struct xarray qp_xa; + struct xarray cq_xa; + + u32 next_alloc_qpn; + u32 next_alloc_cqn; + + spinlock_t db_bitmap_lock; + /* We provide max 64 uContexts that each has one SQ doorbell Page. */ + DECLARE_BITMAP(sdb_page, ERDMA_DWQE_TYPE0_CNT); + /* + * We provide max 496 uContexts that each has one SQ normal Db, + * and one directWQE db。 + */ + DECLARE_BITMAP(sdb_entry, ERDMA_DWQE_TYPE1_CNT); + + atomic_t num_ctx; + struct list_head cep_list; +}; + +static inline void *get_queue_entry(void *qbuf, u32 idx, u32 depth, u32 shift) +{ + idx &= (depth - 1); + + return qbuf + (idx << shift); +} + +static inline struct erdma_dev *to_edev(struct ib_device *ibdev) +{ + return container_of(ibdev, struct erdma_dev, ibdev); +} + +static inline u32 erdma_reg_read32(struct erdma_dev *dev, u32 reg) +{ + return readl(dev->func_bar + reg); +} + +static inline u64 erdma_reg_read64(struct erdma_dev *dev, u32 reg) +{ + return readq(dev->func_bar + reg); +} + +static inline void erdma_reg_write32(struct erdma_dev *dev, u32 reg, u32 value) +{ + writel(value, dev->func_bar + reg); +} + +static inline void erdma_reg_write64(struct erdma_dev *dev, u32 reg, u64 value) +{ + writeq(value, dev->func_bar + reg); +} + +static inline u32 erdma_reg_read32_filed(struct erdma_dev *dev, u32 reg, + u32 filed_mask) +{ + u32 val = erdma_reg_read32(dev, reg); + + return FIELD_GET(filed_mask, val); +} + +int erdma_cmdq_init(struct erdma_dev *dev); +void erdma_finish_cmdq_init(struct erdma_dev *dev); +void erdma_cmdq_destroy(struct erdma_dev *dev); + +void erdma_cmdq_build_reqhdr(u64 *hdr, u32 mod, u32 op); +int erdma_post_cmd_wait(struct erdma_cmdq *cmdq, u64 *req, u32 req_size, + u64 *resp0, u64 *resp1); +void erdma_cmdq_completion_handler(struct erdma_cmdq *cmdq); + +int erdma_ceqs_init(struct erdma_dev *dev); +void erdma_ceqs_uninit(struct erdma_dev *dev); +void notify_eq(struct erdma_eq *eq); +void *get_next_valid_eqe(struct erdma_eq *eq); + +int erdma_aeq_init(struct erdma_dev *dev); +void erdma_aeq_destroy(struct erdma_dev *dev); + +void erdma_aeq_event_handler(struct erdma_dev *dev); +void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb); + +#endif diff --git a/drivers/infiniband/hw/erdma/erdma_cm.c b/drivers/infiniband/hw/erdma/erdma_cm.c new file mode 100644 index 000000000000..f13f16479eca --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_cm.c @@ -0,0 +1,1430 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */ +/* Kai Shen <kaishen@linux.alibaba.com> */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ +/* Fredy Neeser */ +/* Greg Joyce <greg@opengridcomputing.com> */ +/* Copyright (c) 2008-2019, IBM Corporation */ +/* Copyright (c) 2017, Open Grid Computing, Inc. */ + +#include <linux/errno.h> +#include <linux/inetdevice.h> +#include <linux/net.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <net/addrconf.h> + +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_verbs.h> + +#include "erdma.h" +#include "erdma_cm.h" +#include "erdma_verbs.h" + +static struct workqueue_struct *erdma_cm_wq; + +static void erdma_cm_llp_state_change(struct sock *sk); +static void erdma_cm_llp_data_ready(struct sock *sk); +static void erdma_cm_llp_error_report(struct sock *sk); + +static void erdma_sk_assign_cm_upcalls(struct sock *sk) +{ + write_lock_bh(&sk->sk_callback_lock); + sk->sk_state_change = erdma_cm_llp_state_change; + sk->sk_data_ready = erdma_cm_llp_data_ready; + sk->sk_error_report = erdma_cm_llp_error_report; + write_unlock_bh(&sk->sk_callback_lock); +} + +static void erdma_sk_save_upcalls(struct sock *sk) +{ + struct erdma_cep *cep = sk_to_cep(sk); + + write_lock_bh(&sk->sk_callback_lock); + cep->sk_state_change = sk->sk_state_change; + cep->sk_data_ready = sk->sk_data_ready; + cep->sk_error_report = sk->sk_error_report; + write_unlock_bh(&sk->sk_callback_lock); +} + +static void erdma_sk_restore_upcalls(struct sock *sk, struct erdma_cep *cep) +{ + sk->sk_state_change = cep->sk_state_change; + sk->sk_data_ready = cep->sk_data_ready; + sk->sk_error_report = cep->sk_error_report; + sk->sk_user_data = NULL; +} + +static void erdma_socket_disassoc(struct socket *s) +{ + struct sock *sk = s->sk; + struct erdma_cep *cep; + + if (sk) { + write_lock_bh(&sk->sk_callback_lock); + cep = sk_to_cep(sk); + if (cep) { + erdma_sk_restore_upcalls(sk, cep); + erdma_cep_put(cep); + } else { + WARN_ON_ONCE(1); + } + write_unlock_bh(&sk->sk_callback_lock); + } else { + WARN_ON_ONCE(1); + } +} + +static void erdma_cep_socket_assoc(struct erdma_cep *cep, struct socket *s) +{ + cep->sock = s; + erdma_cep_get(cep); + s->sk->sk_user_data = cep; + + erdma_sk_save_upcalls(s->sk); + erdma_sk_assign_cm_upcalls(s->sk); +} + +static void erdma_disassoc_listen_cep(struct erdma_cep *cep) +{ + if (cep->listen_cep) { + erdma_cep_put(cep->listen_cep); + cep->listen_cep = NULL; + } +} + +static struct erdma_cep *erdma_cep_alloc(struct erdma_dev *dev) +{ + struct erdma_cep *cep = kzalloc(sizeof(*cep), GFP_KERNEL); + unsigned long flags; + + if (!cep) + return NULL; + + INIT_LIST_HEAD(&cep->listenq); + INIT_LIST_HEAD(&cep->devq); + INIT_LIST_HEAD(&cep->work_freelist); + + kref_init(&cep->ref); + cep->state = ERDMA_EPSTATE_IDLE; + init_waitqueue_head(&cep->waitq); + spin_lock_init(&cep->lock); + cep->dev = dev; + + spin_lock_irqsave(&dev->lock, flags); + list_add_tail(&cep->devq, &dev->cep_list); + spin_unlock_irqrestore(&dev->lock, flags); + + return cep; +} + +static void erdma_cm_free_work(struct erdma_cep *cep) +{ + struct list_head *w, *tmp; + struct erdma_cm_work *work; + + list_for_each_safe(w, tmp, &cep->work_freelist) { + work = list_entry(w, struct erdma_cm_work, list); + list_del(&work->list); + kfree(work); + } +} + +static void erdma_cancel_mpatimer(struct erdma_cep *cep) +{ + spin_lock_bh(&cep->lock); + if (cep->mpa_timer) { + if (cancel_delayed_work(&cep->mpa_timer->work)) { + erdma_cep_put(cep); + kfree(cep->mpa_timer); + } + cep->mpa_timer = NULL; + } + spin_unlock_bh(&cep->lock); +} + +static void erdma_put_work(struct erdma_cm_work *work) +{ + INIT_LIST_HEAD(&work->list); + spin_lock_bh(&work->cep->lock); + list_add(&work->list, &work->cep->work_freelist); + spin_unlock_bh(&work->cep->lock); +} + +static void erdma_cep_set_inuse(struct erdma_cep *cep) +{ + unsigned long flags; + + spin_lock_irqsave(&cep->lock, flags); + while (cep->in_use) { + spin_unlock_irqrestore(&cep->lock, flags); + wait_event_interruptible(cep->waitq, !cep->in_use); + if (signal_pending(current)) + flush_signals(current); + + spin_lock_irqsave(&cep->lock, flags); + } + + cep->in_use = 1; + spin_unlock_irqrestore(&cep->lock, flags); +} + +static void erdma_cep_set_free(struct erdma_cep *cep) +{ + unsigned long flags; + + spin_lock_irqsave(&cep->lock, flags); + cep->in_use = 0; + spin_unlock_irqrestore(&cep->lock, flags); + + wake_up(&cep->waitq); +} + +static void __erdma_cep_dealloc(struct kref *ref) +{ + struct erdma_cep *cep = container_of(ref, struct erdma_cep, ref); + struct erdma_dev *dev = cep->dev; + unsigned long flags; + + WARN_ON(cep->listen_cep); + + kfree(cep->private_data); + kfree(cep->mpa.pdata); + spin_lock_bh(&cep->lock); + if (!list_empty(&cep->work_freelist)) + erdma_cm_free_work(cep); + spin_unlock_bh(&cep->lock); + + spin_lock_irqsave(&dev->lock, flags); + list_del(&cep->devq); + spin_unlock_irqrestore(&dev->lock, flags); + kfree(cep); +} + +static struct erdma_cm_work *erdma_get_work(struct erdma_cep *cep) +{ + struct erdma_cm_work *work = NULL; + + spin_lock_bh(&cep->lock); + if (!list_empty(&cep->work_freelist)) { + work = list_entry(cep->work_freelist.next, struct erdma_cm_work, + list); + list_del_init(&work->list); + } + + spin_unlock_bh(&cep->lock); + return work; +} + +static int erdma_cm_alloc_work(struct erdma_cep *cep, int num) +{ + struct erdma_cm_work *work; + + while (num--) { + work = kmalloc(sizeof(*work), GFP_KERNEL); + if (!work) { + if (!(list_empty(&cep->work_freelist))) + erdma_cm_free_work(cep); + return -ENOMEM; + } + work->cep = cep; + INIT_LIST_HEAD(&work->list); + list_add(&work->list, &cep->work_freelist); + } + + return 0; +} + +static int erdma_cm_upcall(struct erdma_cep *cep, enum iw_cm_event_type reason, + int status) +{ + struct iw_cm_event event; + struct iw_cm_id *cm_id; + + memset(&event, 0, sizeof(event)); + event.status = status; + event.event = reason; + + if (reason == IW_CM_EVENT_CONNECT_REQUEST) { + event.provider_data = cep; + cm_id = cep->listen_cep->cm_id; + + event.ird = cep->dev->attrs.max_ird; + event.ord = cep->dev->attrs.max_ord; + } else { + cm_id = cep->cm_id; + } + + if (reason == IW_CM_EVENT_CONNECT_REQUEST || + reason == IW_CM_EVENT_CONNECT_REPLY) { + u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len); + + if (pd_len && cep->mpa.pdata) { + event.private_data_len = pd_len; + event.private_data = cep->mpa.pdata; + } + + getname_local(cep->sock, &event.local_addr); + getname_peer(cep->sock, &event.remote_addr); + } + + return cm_id->event_handler(cm_id, &event); +} + +void erdma_qp_cm_drop(struct erdma_qp *qp) +{ + struct erdma_cep *cep = qp->cep; + + if (!qp->cep) + return; + + erdma_cep_set_inuse(cep); + + /* already closed. */ + if (cep->state == ERDMA_EPSTATE_CLOSED) + goto out; + + if (cep->cm_id) { + switch (cep->state) { + case ERDMA_EPSTATE_AWAIT_MPAREP: + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -EINVAL); + break; + case ERDMA_EPSTATE_RDMA_MODE: + erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); + break; + case ERDMA_EPSTATE_IDLE: + case ERDMA_EPSTATE_LISTENING: + case ERDMA_EPSTATE_CONNECTING: + case ERDMA_EPSTATE_AWAIT_MPAREQ: + case ERDMA_EPSTATE_RECVD_MPAREQ: + case ERDMA_EPSTATE_CLOSED: + default: + break; + } + cep->cm_id->rem_ref(cep->cm_id); + cep->cm_id = NULL; + erdma_cep_put(cep); + } + cep->state = ERDMA_EPSTATE_CLOSED; + + if (cep->sock) { + erdma_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + } + + if (cep->qp) { + cep->qp = NULL; + erdma_qp_put(qp); + } +out: + erdma_cep_set_free(cep); +} + +void erdma_cep_put(struct erdma_cep *cep) +{ + WARN_ON(kref_read(&cep->ref) < 1); + kref_put(&cep->ref, __erdma_cep_dealloc); +} + +void erdma_cep_get(struct erdma_cep *cep) +{ + kref_get(&cep->ref); +} + +static int erdma_send_mpareqrep(struct erdma_cep *cep, const void *pdata, + u8 pd_len) +{ + struct socket *s = cep->sock; + struct mpa_rr *rr = &cep->mpa.hdr; + struct kvec iov[3]; + struct msghdr msg; + int iovec_num = 0; + int ret; + int mpa_len; + + memset(&msg, 0, sizeof(msg)); + + rr->params.pd_len = cpu_to_be16(pd_len); + + iov[iovec_num].iov_base = rr; + iov[iovec_num].iov_len = sizeof(*rr); + iovec_num++; + mpa_len = sizeof(*rr); + + iov[iovec_num].iov_base = &cep->mpa.ext_data; + iov[iovec_num].iov_len = sizeof(cep->mpa.ext_data); + iovec_num++; + mpa_len += sizeof(cep->mpa.ext_data); + + if (pd_len) { + iov[iovec_num].iov_base = (char *)pdata; + iov[iovec_num].iov_len = pd_len; + mpa_len += pd_len; + iovec_num++; + } + + ret = kernel_sendmsg(s, &msg, iov, iovec_num, mpa_len); + + return ret < 0 ? ret : 0; +} + +static inline int ksock_recv(struct socket *sock, char *buf, size_t size, + int flags) +{ + struct kvec iov = { buf, size }; + struct msghdr msg = { .msg_name = NULL, .msg_flags = flags }; + + return kernel_recvmsg(sock, &msg, &iov, 1, size, flags); +} + +static int __recv_mpa_hdr(struct erdma_cep *cep, int hdr_rcvd, char *hdr, + int hdr_size, int *rcvd_out) +{ + struct socket *s = cep->sock; + int rcvd; + + *rcvd_out = 0; + if (hdr_rcvd < hdr_size) { + rcvd = ksock_recv(s, hdr + hdr_rcvd, hdr_size - hdr_rcvd, + MSG_DONTWAIT); + if (rcvd == -EAGAIN) + return -EAGAIN; + + if (rcvd <= 0) + return -ECONNABORTED; + + hdr_rcvd += rcvd; + *rcvd_out = rcvd; + + if (hdr_rcvd < hdr_size) + return -EAGAIN; + } + + return 0; +} + +static void __mpa_rr_set_revision(__be16 *bits, u8 rev) +{ + *bits = (*bits & ~MPA_RR_MASK_REVISION) | + (cpu_to_be16(rev) & MPA_RR_MASK_REVISION); +} + +static u8 __mpa_rr_revision(__be16 mpa_rr_bits) +{ + __be16 rev = mpa_rr_bits & MPA_RR_MASK_REVISION; + + return (u8)be16_to_cpu(rev); +} + +static void __mpa_ext_set_cc(__be32 *bits, u32 cc) +{ + *bits = (*bits & ~MPA_EXT_FLAG_CC) | + (cpu_to_be32(cc) & MPA_EXT_FLAG_CC); +} + +static u8 __mpa_ext_cc(__be32 mpa_ext_bits) +{ + __be32 cc = mpa_ext_bits & MPA_EXT_FLAG_CC; + + return (u8)be32_to_cpu(cc); +} + +/* + * Receive MPA Request/Reply header. + * + * Returns 0 if complete MPA Request/Reply haeder including + * eventual private data was received. Returns -EAGAIN if + * header was partially received or negative error code otherwise. + * + * Context: May be called in process context only + */ +static int erdma_recv_mpa_rr(struct erdma_cep *cep) +{ + struct mpa_rr *hdr = &cep->mpa.hdr; + struct socket *s = cep->sock; + u16 pd_len; + int rcvd, to_rcv, ret, pd_rcvd; + + if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) { + ret = __recv_mpa_hdr(cep, cep->mpa.bytes_rcvd, + (char *)&cep->mpa.hdr, + sizeof(struct mpa_rr), &rcvd); + cep->mpa.bytes_rcvd += rcvd; + if (ret) + return ret; + } + + if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA || + __mpa_rr_revision(hdr->params.bits) != MPA_REVISION_EXT_1) + return -EPROTO; + + if (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr) < + sizeof(struct erdma_mpa_ext)) { + ret = __recv_mpa_hdr( + cep, cep->mpa.bytes_rcvd - sizeof(struct mpa_rr), + (char *)&cep->mpa.ext_data, + sizeof(struct erdma_mpa_ext), &rcvd); + cep->mpa.bytes_rcvd += rcvd; + if (ret) + return ret; + } + + pd_len = be16_to_cpu(hdr->params.pd_len); + pd_rcvd = cep->mpa.bytes_rcvd - sizeof(struct mpa_rr) - + sizeof(struct erdma_mpa_ext); + to_rcv = pd_len - pd_rcvd; + + if (!to_rcv) { + /* + * We have received the whole MPA Request/Reply message. + * Check against peer protocol violation. + */ + u32 word; + + ret = __recv_mpa_hdr(cep, 0, (char *)&word, sizeof(word), + &rcvd); + if (ret == -EAGAIN && rcvd == 0) + return 0; + + if (ret) + return ret; + + return -EPROTO; + } + + /* + * At this point, MPA header has been fully received, and pd_len != 0. + * So, begin to receive private data. + */ + if (!cep->mpa.pdata) { + cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL); + if (!cep->mpa.pdata) + return -ENOMEM; + } + + rcvd = ksock_recv(s, cep->mpa.pdata + pd_rcvd, to_rcv + 4, + MSG_DONTWAIT); + if (rcvd < 0) + return rcvd; + + if (rcvd > to_rcv) + return -EPROTO; + + cep->mpa.bytes_rcvd += rcvd; + + if (to_rcv == rcvd) + return 0; + + return -EAGAIN; +} + +/* + * erdma_proc_mpareq() + * + * Read MPA Request from socket and signal new connection to IWCM + * if success. Caller must hold lock on corresponding listening CEP. + */ +static int erdma_proc_mpareq(struct erdma_cep *cep) +{ + struct mpa_rr *req; + int ret; + + ret = erdma_recv_mpa_rr(cep); + if (ret) + return ret; + + req = &cep->mpa.hdr; + + if (memcmp(req->key, MPA_KEY_REQ, MPA_KEY_SIZE)) + return -EPROTO; + + memcpy(req->key, MPA_KEY_REP, MPA_KEY_SIZE); + + /* Currently does not support marker and crc. */ + if (req->params.bits & MPA_RR_FLAG_MARKERS || + req->params.bits & MPA_RR_FLAG_CRC) + goto reject_conn; + + cep->state = ERDMA_EPSTATE_RECVD_MPAREQ; + + /* Keep reference until IWCM accepts/rejects */ + erdma_cep_get(cep); + ret = erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0); + if (ret) + erdma_cep_put(cep); + + return ret; + +reject_conn: + req->params.bits &= ~MPA_RR_FLAG_MARKERS; + req->params.bits |= MPA_RR_FLAG_REJECT; + req->params.bits &= ~MPA_RR_FLAG_CRC; + + kfree(cep->mpa.pdata); + cep->mpa.pdata = NULL; + erdma_send_mpareqrep(cep, NULL, 0); + + return -EOPNOTSUPP; +} + +static int erdma_proc_mpareply(struct erdma_cep *cep) +{ + struct erdma_qp_attrs qp_attrs; + struct erdma_qp *qp = cep->qp; + struct mpa_rr *rep; + int ret; + + ret = erdma_recv_mpa_rr(cep); + if (ret) + goto out_err; + + erdma_cancel_mpatimer(cep); + + rep = &cep->mpa.hdr; + + if (memcmp(rep->key, MPA_KEY_REP, MPA_KEY_SIZE)) { + ret = -EPROTO; + goto out_err; + } + + if (rep->params.bits & MPA_RR_FLAG_REJECT) { + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET); + return -ECONNRESET; + } + + /* Currently does not support marker and crc. */ + if ((rep->params.bits & MPA_RR_FLAG_MARKERS) || + (rep->params.bits & MPA_RR_FLAG_CRC)) { + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED); + return -EINVAL; + } + + memset(&qp_attrs, 0, sizeof(qp_attrs)); + qp_attrs.irq_size = cep->ird; + qp_attrs.orq_size = cep->ord; + qp_attrs.state = ERDMA_QP_STATE_RTS; + + down_write(&qp->state_lock); + if (qp->attrs.state > ERDMA_QP_STATE_RTR) { + ret = -EINVAL; + up_write(&qp->state_lock); + goto out_err; + } + + qp->attrs.qp_type = ERDMA_QP_ACTIVE; + if (__mpa_ext_cc(cep->mpa.ext_data.bits) != qp->attrs.cc) + qp->attrs.cc = COMPROMISE_CC; + + ret = erdma_modify_qp_internal(qp, &qp_attrs, + ERDMA_QP_ATTR_STATE | + ERDMA_QP_ATTR_LLP_HANDLE | + ERDMA_QP_ATTR_MPA); + + up_write(&qp->state_lock); + + if (!ret) { + ret = erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0); + if (!ret) + cep->state = ERDMA_EPSTATE_RDMA_MODE; + + return 0; + } + +out_err: + if (ret != -EAGAIN) + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL); + + return ret; +} + +static void erdma_accept_newconn(struct erdma_cep *cep) +{ + struct socket *s = cep->sock; + struct socket *new_s = NULL; + struct erdma_cep *new_cep = NULL; + int ret = 0; + + if (cep->state != ERDMA_EPSTATE_LISTENING) + goto error; + + new_cep = erdma_cep_alloc(cep->dev); + if (!new_cep) + goto error; + + /* + * 4: Allocate a sufficient number of work elements + * to allow concurrent handling of local + peer close + * events, MPA header processing + MPA timeout. + */ + if (erdma_cm_alloc_work(new_cep, 4) != 0) + goto error; + + /* + * Copy saved socket callbacks from listening CEP + * and assign new socket with new CEP + */ + new_cep->sk_state_change = cep->sk_state_change; + new_cep->sk_data_ready = cep->sk_data_ready; + new_cep->sk_error_report = cep->sk_error_report; + + ret = kernel_accept(s, &new_s, O_NONBLOCK); + if (ret != 0) + goto error; + + new_cep->sock = new_s; + erdma_cep_get(new_cep); + new_s->sk->sk_user_data = new_cep; + + tcp_sock_set_nodelay(new_s->sk); + new_cep->state = ERDMA_EPSTATE_AWAIT_MPAREQ; + + ret = erdma_cm_queue_work(new_cep, ERDMA_CM_WORK_MPATIMEOUT); + if (ret) + goto error; + + new_cep->listen_cep = cep; + erdma_cep_get(cep); + + if (atomic_read(&new_s->sk->sk_rmem_alloc)) { + /* MPA REQ already queued */ + erdma_cep_set_inuse(new_cep); + ret = erdma_proc_mpareq(new_cep); + if (ret != -EAGAIN) { + erdma_cep_put(cep); + new_cep->listen_cep = NULL; + if (ret) { + erdma_cep_set_free(new_cep); + goto error; + } + } + erdma_cep_set_free(new_cep); + } + return; + +error: + if (new_cep) { + new_cep->state = ERDMA_EPSTATE_CLOSED; + erdma_cancel_mpatimer(new_cep); + + erdma_cep_put(new_cep); + new_cep->sock = NULL; + } + + if (new_s) { + erdma_socket_disassoc(new_s); + sock_release(new_s); + } +} + +static int erdma_newconn_connected(struct erdma_cep *cep) +{ + int ret = 0; + + cep->mpa.hdr.params.bits = 0; + __mpa_rr_set_revision(&cep->mpa.hdr.params.bits, MPA_REVISION_EXT_1); + + memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, MPA_KEY_SIZE); + cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.cookie); + __mpa_ext_set_cc(&cep->mpa.ext_data.bits, cep->qp->attrs.cc); + + ret = erdma_send_mpareqrep(cep, cep->private_data, cep->pd_len); + cep->state = ERDMA_EPSTATE_AWAIT_MPAREP; + cep->mpa.hdr.params.pd_len = 0; + + if (ret >= 0) + ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_MPATIMEOUT); + + return ret; +} + +static void erdma_cm_work_handler(struct work_struct *w) +{ + struct erdma_cm_work *work; + struct erdma_cep *cep; + int release_cep = 0, ret = 0; + + work = container_of(w, struct erdma_cm_work, work.work); + cep = work->cep; + + erdma_cep_set_inuse(cep); + + switch (work->type) { + case ERDMA_CM_WORK_CONNECTED: + erdma_cancel_mpatimer(cep); + if (cep->state == ERDMA_EPSTATE_CONNECTING) { + ret = erdma_newconn_connected(cep); + if (ret) { + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -EIO); + release_cep = 1; + } + } + break; + case ERDMA_CM_WORK_CONNECTTIMEOUT: + if (cep->state == ERDMA_EPSTATE_CONNECTING) { + cep->mpa_timer = NULL; + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -ETIMEDOUT); + release_cep = 1; + } + break; + case ERDMA_CM_WORK_ACCEPT: + erdma_accept_newconn(cep); + break; + case ERDMA_CM_WORK_READ_MPAHDR: + if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) { + if (cep->listen_cep) { + erdma_cep_set_inuse(cep->listen_cep); + + if (cep->listen_cep->state == + ERDMA_EPSTATE_LISTENING) + ret = erdma_proc_mpareq(cep); + else + ret = -EFAULT; + + erdma_cep_set_free(cep->listen_cep); + + if (ret != -EAGAIN) { + erdma_cep_put(cep->listen_cep); + cep->listen_cep = NULL; + if (ret) + erdma_cep_put(cep); + } + } + } else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) { + ret = erdma_proc_mpareply(cep); + } + + if (ret && ret != -EAGAIN) + release_cep = 1; + break; + case ERDMA_CM_WORK_CLOSE_LLP: + if (cep->cm_id) + erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); + release_cep = 1; + break; + case ERDMA_CM_WORK_PEER_CLOSE: + if (cep->cm_id) { + if (cep->state == ERDMA_EPSTATE_CONNECTING || + cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) { + /* + * MPA reply not received, but connection drop + */ + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -ECONNRESET); + } else if (cep->state == ERDMA_EPSTATE_RDMA_MODE) { + /* + * NOTE: IW_CM_EVENT_DISCONNECT is given just + * to transition IWCM into CLOSING. + */ + erdma_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0); + erdma_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); + } + } else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) { + /* Socket close before MPA request received. */ + erdma_disassoc_listen_cep(cep); + erdma_cep_put(cep); + } + release_cep = 1; + break; + case ERDMA_CM_WORK_MPATIMEOUT: + cep->mpa_timer = NULL; + if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) { + /* + * MPA request timed out: + * Hide any partially received private data and signal + * timeout + */ + cep->mpa.hdr.params.pd_len = 0; + + if (cep->cm_id) + erdma_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, + -ETIMEDOUT); + release_cep = 1; + } else if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ) { + /* No MPA req received after peer TCP stream setup. */ + erdma_disassoc_listen_cep(cep); + + erdma_cep_put(cep); + release_cep = 1; + } + break; + default: + WARN(1, "Undefined CM work type: %d\n", work->type); + } + + if (release_cep) { + erdma_cancel_mpatimer(cep); + cep->state = ERDMA_EPSTATE_CLOSED; + if (cep->qp) { + struct erdma_qp *qp = cep->qp; + /* + * Serialize a potential race with application + * closing the QP and calling erdma_qp_cm_drop() + */ + erdma_qp_get(qp); + erdma_cep_set_free(cep); + + erdma_qp_llp_close(qp); + erdma_qp_put(qp); + + erdma_cep_set_inuse(cep); + cep->qp = NULL; + erdma_qp_put(qp); + } + + if (cep->sock) { + erdma_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + } + + if (cep->cm_id) { + cep->cm_id->rem_ref(cep->cm_id); + cep->cm_id = NULL; + if (cep->state != ERDMA_EPSTATE_LISTENING) + erdma_cep_put(cep); + } + } + erdma_cep_set_free(cep); + erdma_put_work(work); + erdma_cep_put(cep); +} + +int erdma_cm_queue_work(struct erdma_cep *cep, enum erdma_work_type type) +{ + struct erdma_cm_work *work = erdma_get_work(cep); + unsigned long delay = 0; + + if (!work) + return -ENOMEM; + + work->type = type; + work->cep = cep; + + erdma_cep_get(cep); + + INIT_DELAYED_WORK(&work->work, erdma_cm_work_handler); + + if (type == ERDMA_CM_WORK_MPATIMEOUT) { + cep->mpa_timer = work; + + if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) + delay = MPAREP_TIMEOUT; + else + delay = MPAREQ_TIMEOUT; + } else if (type == ERDMA_CM_WORK_CONNECTTIMEOUT) { + cep->mpa_timer = work; + + delay = CONNECT_TIMEOUT; + } + + queue_delayed_work(erdma_cm_wq, &work->work, delay); + + return 0; +} + +static void erdma_cm_llp_data_ready(struct sock *sk) +{ + struct erdma_cep *cep; + + read_lock(&sk->sk_callback_lock); + + cep = sk_to_cep(sk); + if (!cep) + goto out; + + if (cep->state == ERDMA_EPSTATE_AWAIT_MPAREQ || + cep->state == ERDMA_EPSTATE_AWAIT_MPAREP) + erdma_cm_queue_work(cep, ERDMA_CM_WORK_READ_MPAHDR); + +out: + read_unlock(&sk->sk_callback_lock); +} + +static void erdma_cm_llp_error_report(struct sock *sk) +{ + struct erdma_cep *cep = sk_to_cep(sk); + + if (cep) + cep->sk_error_report(sk); +} + +static void erdma_cm_llp_state_change(struct sock *sk) +{ + struct erdma_cep *cep; + void (*orig_state_change)(struct sock *sk); + + read_lock(&sk->sk_callback_lock); + + cep = sk_to_cep(sk); + if (!cep) { + read_unlock(&sk->sk_callback_lock); + return; + } + orig_state_change = cep->sk_state_change; + + switch (sk->sk_state) { + case TCP_ESTABLISHED: + if (cep->state == ERDMA_EPSTATE_CONNECTING) + erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTED); + else + erdma_cm_queue_work(cep, ERDMA_CM_WORK_ACCEPT); + break; + case TCP_CLOSE: + case TCP_CLOSE_WAIT: + if (cep->state != ERDMA_EPSTATE_LISTENING) + erdma_cm_queue_work(cep, ERDMA_CM_WORK_PEER_CLOSE); + break; + default: + break; + } + read_unlock(&sk->sk_callback_lock); + orig_state_change(sk); +} + +static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr, + int laddrlen, struct sockaddr *raddr, + int raddrlen, int flags) +{ + int ret; + + sock_set_reuseaddr(s->sk); + ret = s->ops->bind(s, laddr, laddrlen); + if (ret) + return ret; + ret = s->ops->connect(s, raddr, raddrlen, flags); + return ret < 0 ? ret : 0; +} + +int erdma_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params) +{ + struct erdma_dev *dev = to_edev(id->device); + struct erdma_qp *qp; + struct erdma_cep *cep = NULL; + struct socket *s = NULL; + struct sockaddr *laddr = (struct sockaddr *)&id->m_local_addr; + struct sockaddr *raddr = (struct sockaddr *)&id->m_remote_addr; + u16 pd_len = params->private_data_len; + int ret; + + if (pd_len > MPA_MAX_PRIVDATA) + return -EINVAL; + + if (params->ird > dev->attrs.max_ird || + params->ord > dev->attrs.max_ord) + return -EINVAL; + + if (laddr->sa_family != AF_INET || raddr->sa_family != AF_INET) + return -EAFNOSUPPORT; + + qp = find_qp_by_qpn(dev, params->qpn); + if (!qp) + return -ENOENT; + erdma_qp_get(qp); + + ret = sock_create(AF_INET, SOCK_STREAM, IPPROTO_TCP, &s); + if (ret < 0) + goto error_put_qp; + + cep = erdma_cep_alloc(dev); + if (!cep) { + ret = -ENOMEM; + goto error_release_sock; + } + + erdma_cep_set_inuse(cep); + + /* Associate QP with CEP */ + erdma_cep_get(cep); + qp->cep = cep; + cep->qp = qp; + + /* Associate cm_id with CEP */ + id->add_ref(id); + cep->cm_id = id; + + /* + * 6: Allocate a sufficient number of work elements + * to allow concurrent handling of local + peer close + * events, MPA header processing + MPA timeout, connected event + * and connect timeout. + */ + ret = erdma_cm_alloc_work(cep, 6); + if (ret != 0) { + ret = -ENOMEM; + goto error_release_cep; + } + + cep->ird = params->ird; + cep->ord = params->ord; + cep->state = ERDMA_EPSTATE_CONNECTING; + + erdma_cep_socket_assoc(cep, s); + + if (pd_len) { + cep->pd_len = pd_len; + cep->private_data = kmalloc(pd_len, GFP_KERNEL); + if (!cep->private_data) { + ret = -ENOMEM; + goto error_disassoc; + } + + memcpy(cep->private_data, params->private_data, + params->private_data_len); + } + + ret = kernel_bindconnect(s, laddr, sizeof(*laddr), raddr, + sizeof(*raddr), O_NONBLOCK); + if (ret != -EINPROGRESS && ret != 0) { + goto error_disassoc; + } else if (ret == 0) { + ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTED); + if (ret) + goto error_disassoc; + } else { + ret = erdma_cm_queue_work(cep, ERDMA_CM_WORK_CONNECTTIMEOUT); + if (ret) + goto error_disassoc; + } + + erdma_cep_set_free(cep); + return 0; + +error_disassoc: + kfree(cep->private_data); + cep->private_data = NULL; + cep->pd_len = 0; + + erdma_socket_disassoc(s); + +error_release_cep: + /* disassoc with cm_id */ + cep->cm_id = NULL; + id->rem_ref(id); + + /* disassoc with qp */ + qp->cep = NULL; + erdma_cep_put(cep); + cep->qp = NULL; + + cep->state = ERDMA_EPSTATE_CLOSED; + + erdma_cep_set_free(cep); + + /* release the cep. */ + erdma_cep_put(cep); + +error_release_sock: + if (s) + sock_release(s); +error_put_qp: + erdma_qp_put(qp); + + return ret; +} + +int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) +{ + struct erdma_dev *dev = to_edev(id->device); + struct erdma_cep *cep = (struct erdma_cep *)id->provider_data; + struct erdma_qp *qp; + struct erdma_qp_attrs qp_attrs; + int ret; + + erdma_cep_set_inuse(cep); + erdma_cep_put(cep); + + /* Free lingering inbound private data */ + if (cep->mpa.hdr.params.pd_len) { + cep->mpa.hdr.params.pd_len = 0; + kfree(cep->mpa.pdata); + cep->mpa.pdata = NULL; + } + erdma_cancel_mpatimer(cep); + + if (cep->state != ERDMA_EPSTATE_RECVD_MPAREQ) { + erdma_cep_set_free(cep); + erdma_cep_put(cep); + + return -ECONNRESET; + } + + qp = find_qp_by_qpn(dev, params->qpn); + if (!qp) + return -ENOENT; + erdma_qp_get(qp); + + down_write(&qp->state_lock); + if (qp->attrs.state > ERDMA_QP_STATE_RTR) { + ret = -EINVAL; + up_write(&qp->state_lock); + goto error; + } + + if (params->ord > dev->attrs.max_ord || + params->ird > dev->attrs.max_ord) { + ret = -EINVAL; + up_write(&qp->state_lock); + goto error; + } + + if (params->private_data_len > MPA_MAX_PRIVDATA) { + ret = -EINVAL; + up_write(&qp->state_lock); + goto error; + } + + cep->ird = params->ird; + cep->ord = params->ord; + + cep->cm_id = id; + id->add_ref(id); + + memset(&qp_attrs, 0, sizeof(qp_attrs)); + qp_attrs.orq_size = params->ord; + qp_attrs.irq_size = params->ird; + + qp_attrs.state = ERDMA_QP_STATE_RTS; + + /* Associate QP with CEP */ + erdma_cep_get(cep); + qp->cep = cep; + cep->qp = qp; + + cep->state = ERDMA_EPSTATE_RDMA_MODE; + + qp->attrs.qp_type = ERDMA_QP_PASSIVE; + qp->attrs.pd_len = params->private_data_len; + + if (qp->attrs.cc != __mpa_ext_cc(cep->mpa.ext_data.bits)) + qp->attrs.cc = COMPROMISE_CC; + + /* move to rts */ + ret = erdma_modify_qp_internal(qp, &qp_attrs, + ERDMA_QP_ATTR_STATE | + ERDMA_QP_ATTR_ORD | + ERDMA_QP_ATTR_LLP_HANDLE | + ERDMA_QP_ATTR_IRD | + ERDMA_QP_ATTR_MPA); + up_write(&qp->state_lock); + + if (ret) + goto error; + + cep->mpa.ext_data.bits = 0; + __mpa_ext_set_cc(&cep->mpa.ext_data.bits, qp->attrs.cc); + cep->mpa.ext_data.cookie = cpu_to_be32(cep->qp->attrs.cookie); + + ret = erdma_send_mpareqrep(cep, params->private_data, + params->private_data_len); + if (!ret) { + ret = erdma_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0); + if (ret) + goto error; + + erdma_cep_set_free(cep); + + return 0; + } + +error: + erdma_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + + cep->state = ERDMA_EPSTATE_CLOSED; + + if (cep->cm_id) { + cep->cm_id->rem_ref(id); + cep->cm_id = NULL; + } + + if (qp->cep) { + erdma_cep_put(cep); + qp->cep = NULL; + } + + cep->qp = NULL; + erdma_qp_put(qp); + + erdma_cep_set_free(cep); + erdma_cep_put(cep); + + return ret; +} + +int erdma_reject(struct iw_cm_id *id, const void *pdata, u8 plen) +{ + struct erdma_cep *cep = (struct erdma_cep *)id->provider_data; + + erdma_cep_set_inuse(cep); + erdma_cep_put(cep); + + erdma_cancel_mpatimer(cep); + + if (cep->state != ERDMA_EPSTATE_RECVD_MPAREQ) { + erdma_cep_set_free(cep); + erdma_cep_put(cep); + + return -ECONNRESET; + } + + if (__mpa_rr_revision(cep->mpa.hdr.params.bits) == MPA_REVISION_EXT_1) { + cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */ + erdma_send_mpareqrep(cep, pdata, plen); + } + + erdma_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + + cep->state = ERDMA_EPSTATE_CLOSED; + + erdma_cep_set_free(cep); + erdma_cep_put(cep); + + return 0; +} + +int erdma_create_listen(struct iw_cm_id *id, int backlog) +{ + struct socket *s; + struct erdma_cep *cep = NULL; + int ret = 0; + struct erdma_dev *dev = to_edev(id->device); + int addr_family = id->local_addr.ss_family; + struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr); + + if (addr_family != AF_INET) + return -EAFNOSUPPORT; + + ret = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s); + if (ret < 0) + return ret; + + sock_set_reuseaddr(s->sk); + + /* For wildcard addr, limit binding to current device only */ + if (ipv4_is_zeronet(laddr->sin_addr.s_addr)) + s->sk->sk_bound_dev_if = dev->netdev->ifindex; + + ret = s->ops->bind(s, (struct sockaddr *)laddr, + sizeof(struct sockaddr_in)); + if (ret) + goto error; + + cep = erdma_cep_alloc(dev); + if (!cep) { + ret = -ENOMEM; + goto error; + } + erdma_cep_socket_assoc(cep, s); + + ret = erdma_cm_alloc_work(cep, backlog); + if (ret) + goto error; + + ret = s->ops->listen(s, backlog); + if (ret) + goto error; + + cep->cm_id = id; + id->add_ref(id); + + if (!id->provider_data) { + id->provider_data = + kmalloc(sizeof(struct list_head), GFP_KERNEL); + if (!id->provider_data) { + ret = -ENOMEM; + goto error; + } + INIT_LIST_HEAD((struct list_head *)id->provider_data); + } + + list_add_tail(&cep->listenq, (struct list_head *)id->provider_data); + cep->state = ERDMA_EPSTATE_LISTENING; + + return 0; + +error: + if (cep) { + erdma_cep_set_inuse(cep); + + if (cep->cm_id) { + cep->cm_id->rem_ref(cep->cm_id); + cep->cm_id = NULL; + } + cep->sock = NULL; + erdma_socket_disassoc(s); + cep->state = ERDMA_EPSTATE_CLOSED; + + erdma_cep_set_free(cep); + erdma_cep_put(cep); + } + sock_release(s); + + return ret; +} + +static void erdma_drop_listeners(struct iw_cm_id *id) +{ + struct list_head *p, *tmp; + /* + * In case of a wildcard rdma_listen on a multi-homed device, + * a listener's IWCM id is associated with more than one listening CEP. + */ + list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) { + struct erdma_cep *cep = + list_entry(p, struct erdma_cep, listenq); + + list_del(p); + + erdma_cep_set_inuse(cep); + + if (cep->cm_id) { + cep->cm_id->rem_ref(cep->cm_id); + cep->cm_id = NULL; + } + if (cep->sock) { + erdma_socket_disassoc(cep->sock); + sock_release(cep->sock); + cep->sock = NULL; + } + cep->state = ERDMA_EPSTATE_CLOSED; + erdma_cep_set_free(cep); + erdma_cep_put(cep); + } +} + +int erdma_destroy_listen(struct iw_cm_id *id) +{ + if (!id->provider_data) + return 0; + + erdma_drop_listeners(id); + kfree(id->provider_data); + id->provider_data = NULL; + + return 0; +} + +int erdma_cm_init(void) +{ + erdma_cm_wq = create_singlethread_workqueue("erdma_cm_wq"); + if (!erdma_cm_wq) + return -ENOMEM; + + return 0; +} + +void erdma_cm_exit(void) +{ + if (erdma_cm_wq) + destroy_workqueue(erdma_cm_wq); +} diff --git a/drivers/infiniband/hw/erdma/erdma_cm.h b/drivers/infiniband/hw/erdma/erdma_cm.h new file mode 100644 index 000000000000..8a3f998fec9b --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_cm.h @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ + +/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */ +/* Kai Shen <kaishen@linux.alibaba.com> */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ +/* Greg Joyce <greg@opengridcomputing.com> */ +/* Copyright (c) 2008-2019, IBM Corporation */ +/* Copyright (c) 2017, Open Grid Computing, Inc. */ + +#ifndef __ERDMA_CM_H__ +#define __ERDMA_CM_H__ + +#include <linux/tcp.h> +#include <net/sock.h> +#include <rdma/iw_cm.h> + +/* iWarp MPA protocol defs */ +#define MPA_REVISION_EXT_1 129 +#define MPA_MAX_PRIVDATA RDMA_MAX_PRIVATE_DATA +#define MPA_KEY_REQ "MPA ID Req Frame" +#define MPA_KEY_REP "MPA ID Rep Frame" +#define MPA_KEY_SIZE 16 +#define MPA_DEFAULT_HDR_LEN 28 + +struct mpa_rr_params { + __be16 bits; + __be16 pd_len; +}; + +/* + * MPA request/response Hdr bits & fields + */ +enum { + MPA_RR_FLAG_MARKERS = __cpu_to_be16(0x8000), + MPA_RR_FLAG_CRC = __cpu_to_be16(0x4000), + MPA_RR_FLAG_REJECT = __cpu_to_be16(0x2000), + MPA_RR_RESERVED = __cpu_to_be16(0x1f00), + MPA_RR_MASK_REVISION = __cpu_to_be16(0x00ff) +}; + +/* + * MPA request/reply header + */ +struct mpa_rr { + u8 key[16]; + struct mpa_rr_params params; +}; + +struct erdma_mpa_ext { + __be32 cookie; + __be32 bits; +}; + +enum { + MPA_EXT_FLAG_CC = cpu_to_be32(0x0000000f), +}; + +struct erdma_mpa_info { + struct mpa_rr hdr; /* peer mpa hdr in host byte order */ + struct erdma_mpa_ext ext_data; + char *pdata; + int bytes_rcvd; +}; + +struct erdma_sk_upcalls { + void (*sk_state_change)(struct sock *sk); + void (*sk_data_ready)(struct sock *sk, int bytes); + void (*sk_error_report)(struct sock *sk); +}; + +struct erdma_dev; + +enum erdma_cep_state { + ERDMA_EPSTATE_IDLE = 1, + ERDMA_EPSTATE_LISTENING, + ERDMA_EPSTATE_CONNECTING, + ERDMA_EPSTATE_AWAIT_MPAREQ, + ERDMA_EPSTATE_RECVD_MPAREQ, + ERDMA_EPSTATE_AWAIT_MPAREP, + ERDMA_EPSTATE_RDMA_MODE, + ERDMA_EPSTATE_CLOSED +}; + +struct erdma_cep { + struct iw_cm_id *cm_id; + struct erdma_dev *dev; + struct list_head devq; + spinlock_t lock; + struct kref ref; + int in_use; + wait_queue_head_t waitq; + enum erdma_cep_state state; + + struct list_head listenq; + struct erdma_cep *listen_cep; + + struct erdma_qp *qp; + struct socket *sock; + + struct erdma_cm_work *mpa_timer; + struct list_head work_freelist; + + struct erdma_mpa_info mpa; + int ord; + int ird; + + int pd_len; + /* hold user's private data. */ + void *private_data; + + /* Saved upcalls of socket llp.sock */ + void (*sk_state_change)(struct sock *sk); + void (*sk_data_ready)(struct sock *sk); + void (*sk_error_report)(struct sock *sk); +}; + +#define MPAREQ_TIMEOUT (HZ * 20) +#define MPAREP_TIMEOUT (HZ * 10) +#define CONNECT_TIMEOUT (HZ * 10) + +enum erdma_work_type { + ERDMA_CM_WORK_ACCEPT = 1, + ERDMA_CM_WORK_READ_MPAHDR, + ERDMA_CM_WORK_CLOSE_LLP, /* close socket */ + ERDMA_CM_WORK_PEER_CLOSE, /* socket indicated peer close */ + ERDMA_CM_WORK_MPATIMEOUT, + ERDMA_CM_WORK_CONNECTED, + ERDMA_CM_WORK_CONNECTTIMEOUT +}; + +struct erdma_cm_work { + struct delayed_work work; + struct list_head list; + enum erdma_work_type type; + struct erdma_cep *cep; +}; + +#define to_sockaddr_in(a) (*(struct sockaddr_in *)(&(a))) + +static inline int getname_peer(struct socket *s, struct sockaddr_storage *a) +{ + return s->ops->getname(s, (struct sockaddr *)a, 1); +} + +static inline int getname_local(struct socket *s, struct sockaddr_storage *a) +{ + return s->ops->getname(s, (struct sockaddr *)a, 0); +} + +int erdma_connect(struct iw_cm_id *id, struct iw_cm_conn_param *param); +int erdma_accept(struct iw_cm_id *id, struct iw_cm_conn_param *param); +int erdma_reject(struct iw_cm_id *id, const void *pdata, u8 plen); +int erdma_create_listen(struct iw_cm_id *id, int backlog); +int erdma_destroy_listen(struct iw_cm_id *id); + +void erdma_cep_get(struct erdma_cep *ceq); +void erdma_cep_put(struct erdma_cep *ceq); +int erdma_cm_queue_work(struct erdma_cep *ceq, enum erdma_work_type type); + +int erdma_cm_init(void); +void erdma_cm_exit(void); + +#define sk_to_cep(sk) ((struct erdma_cep *)((sk)->sk_user_data)) + +#endif diff --git a/drivers/infiniband/hw/erdma/erdma_cmdq.c b/drivers/infiniband/hw/erdma/erdma_cmdq.c new file mode 100644 index 000000000000..57da0c670472 --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_cmdq.c @@ -0,0 +1,493 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */ +/* Kai Shen <kaishen@linux.alibaba.com> */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/types.h> + +#include "erdma.h" +#include "erdma_hw.h" +#include "erdma_verbs.h" + +static void arm_cmdq_cq(struct erdma_cmdq *cmdq) +{ + struct erdma_dev *dev = container_of(cmdq, struct erdma_dev, cmdq); + u64 db_data = FIELD_PREP(ERDMA_CQDB_CI_MASK, cmdq->cq.ci) | + FIELD_PREP(ERDMA_CQDB_ARM_MASK, 1) | + FIELD_PREP(ERDMA_CQDB_CMDSN_MASK, cmdq->cq.cmdsn) | + FIELD_PREP(ERDMA_CQDB_IDX_MASK, cmdq->cq.cmdsn); + + *cmdq->cq.db_record = db_data; + writeq(db_data, dev->func_bar + ERDMA_CMDQ_CQDB_REG); + + atomic64_inc(&cmdq->cq.armed_num); +} + +static void kick_cmdq_db(struct erdma_cmdq *cmdq) +{ + struct erdma_dev *dev = container_of(cmdq, struct erdma_dev, cmdq); + u64 db_data = FIELD_PREP(ERDMA_CMD_HDR_WQEBB_INDEX_MASK, cmdq->sq.pi); + + *cmdq->sq.db_record = db_data; + writeq(db_data, dev->func_bar + ERDMA_CMDQ_SQDB_REG); +} + +static struct erdma_comp_wait *get_comp_wait(struct erdma_cmdq *cmdq) +{ + int comp_idx; + + spin_lock(&cmdq->lock); + comp_idx = find_first_zero_bit(cmdq->comp_wait_bitmap, + cmdq->max_outstandings); + if (comp_idx == cmdq->max_outstandings) { + spin_unlock(&cmdq->lock); + return ERR_PTR(-ENOMEM); + } + + __set_bit(comp_idx, cmdq->comp_wait_bitmap); + spin_unlock(&cmdq->lock); + + return &cmdq->wait_pool[comp_idx]; +} + +static void put_comp_wait(struct erdma_cmdq *cmdq, + struct erdma_comp_wait *comp_wait) +{ + int used; + + cmdq->wait_pool[comp_wait->ctx_id].cmd_status = ERDMA_CMD_STATUS_INIT; + spin_lock(&cmdq->lock); + used = __test_and_clear_bit(comp_wait->ctx_id, cmdq->comp_wait_bitmap); + spin_unlock(&cmdq->lock); + + WARN_ON(!used); +} + +static int erdma_cmdq_wait_res_init(struct erdma_dev *dev, + struct erdma_cmdq *cmdq) +{ + int i; + + cmdq->wait_pool = + devm_kcalloc(&dev->pdev->dev, cmdq->max_outstandings, + sizeof(struct erdma_comp_wait), GFP_KERNEL); + if (!cmdq->wait_pool) + return -ENOMEM; + + spin_lock_init(&cmdq->lock); + cmdq->comp_wait_bitmap = devm_bitmap_zalloc( + &dev->pdev->dev, cmdq->max_outstandings, GFP_KERNEL); + if (!cmdq->comp_wait_bitmap) + return -ENOMEM; + + for (i = 0; i < cmdq->max_outstandings; i++) { + init_completion(&cmdq->wait_pool[i].wait_event); + cmdq->wait_pool[i].ctx_id = i; + } + + return 0; +} + +static int erdma_cmdq_sq_init(struct erdma_dev *dev) +{ + struct erdma_cmdq *cmdq = &dev->cmdq; + struct erdma_cmdq_sq *sq = &cmdq->sq; + u32 buf_size; + + sq->wqebb_cnt = SQEBB_COUNT(ERDMA_CMDQ_SQE_SIZE); + sq->depth = cmdq->max_outstandings * sq->wqebb_cnt; + + buf_size = sq->depth << SQEBB_SHIFT; + + sq->qbuf = + dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), + &sq->qbuf_dma_addr, GFP_KERNEL); + if (!sq->qbuf) + return -ENOMEM; + + sq->db_record = (u64 *)(sq->qbuf + buf_size); + + spin_lock_init(&sq->lock); + + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_SQ_ADDR_H_REG, + upper_32_bits(sq->qbuf_dma_addr)); + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_SQ_ADDR_L_REG, + lower_32_bits(sq->qbuf_dma_addr)); + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_DEPTH_REG, sq->depth); + erdma_reg_write64(dev, ERDMA_CMDQ_SQ_DB_HOST_ADDR_REG, + sq->qbuf_dma_addr + buf_size); + + return 0; +} + +static int erdma_cmdq_cq_init(struct erdma_dev *dev) +{ + struct erdma_cmdq *cmdq = &dev->cmdq; + struct erdma_cmdq_cq *cq = &cmdq->cq; + u32 buf_size; + + cq->depth = cmdq->sq.depth; + buf_size = cq->depth << CQE_SHIFT; + + cq->qbuf = + dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), + &cq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); + if (!cq->qbuf) + return -ENOMEM; + + spin_lock_init(&cq->lock); + + cq->db_record = (u64 *)(cq->qbuf + buf_size); + + atomic64_set(&cq->armed_num, 0); + + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_CQ_ADDR_H_REG, + upper_32_bits(cq->qbuf_dma_addr)); + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_CQ_ADDR_L_REG, + lower_32_bits(cq->qbuf_dma_addr)); + erdma_reg_write64(dev, ERDMA_CMDQ_CQ_DB_HOST_ADDR_REG, + cq->qbuf_dma_addr + buf_size); + + return 0; +} + +static int erdma_cmdq_eq_init(struct erdma_dev *dev) +{ + struct erdma_cmdq *cmdq = &dev->cmdq; + struct erdma_eq *eq = &cmdq->eq; + u32 buf_size; + + eq->depth = cmdq->max_outstandings; + buf_size = eq->depth << EQE_SHIFT; + + eq->qbuf = + dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), + &eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); + if (!eq->qbuf) + return -ENOMEM; + + spin_lock_init(&eq->lock); + atomic64_set(&eq->event_num, 0); + + eq->db_addr = + (u64 __iomem *)(dev->func_bar + ERDMA_REGS_CEQ_DB_BASE_REG); + eq->db_record = (u64 *)(eq->qbuf + buf_size); + + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_ADDR_H_REG, + upper_32_bits(eq->qbuf_dma_addr)); + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_ADDR_L_REG, + lower_32_bits(eq->qbuf_dma_addr)); + erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_DEPTH_REG, eq->depth); + erdma_reg_write64(dev, ERDMA_CMDQ_EQ_DB_HOST_ADDR_REG, + eq->qbuf_dma_addr + buf_size); + + return 0; +} + +int erdma_cmdq_init(struct erdma_dev *dev) +{ + int err, i; + struct erdma_cmdq *cmdq = &dev->cmdq; + u32 sts, ctrl; + + cmdq->max_outstandings = ERDMA_CMDQ_MAX_OUTSTANDING; + cmdq->use_event = false; + + sema_init(&cmdq->credits, cmdq->max_outstandings); + + err = erdma_cmdq_wait_res_init(dev, cmdq); + if (err) + return err; + + err = erdma_cmdq_sq_init(dev); + if (err) + return err; + + err = erdma_cmdq_cq_init(dev); + if (err) + goto err_destroy_sq; + + err = erdma_cmdq_eq_init(dev); + if (err) + goto err_destroy_cq; + + ctrl = FIELD_PREP(ERDMA_REG_DEV_CTRL_INIT_MASK, 1); + erdma_reg_write32(dev, ERDMA_REGS_DEV_CTRL_REG, ctrl); + + for (i = 0; i < ERDMA_WAIT_DEV_DONE_CNT; i++) { + sts = erdma_reg_read32_filed(dev, ERDMA_REGS_DEV_ST_REG, + ERDMA_REG_DEV_ST_INIT_DONE_MASK); + if (sts) + break; + + msleep(ERDMA_REG_ACCESS_WAIT_MS); + } + + if (i == ERDMA_WAIT_DEV_DONE_CNT) { + dev_err(&dev->pdev->dev, "wait init done failed.\n"); + err = -ETIMEDOUT; + goto err_destroy_eq; + } + + set_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state); + + return 0; + +err_destroy_eq: + dma_free_coherent(&dev->pdev->dev, + (cmdq->eq.depth << EQE_SHIFT) + + ERDMA_EXTRA_BUFFER_SIZE, + cmdq->eq.qbuf, cmdq->eq.qbuf_dma_addr); + +err_destroy_cq: + dma_free_coherent(&dev->pdev->dev, + (cmdq->cq.depth << CQE_SHIFT) + + ERDMA_EXTRA_BUFFER_SIZE, + cmdq->cq.qbuf, cmdq->cq.qbuf_dma_addr); + +err_destroy_sq: + dma_free_coherent(&dev->pdev->dev, + (cmdq->sq.depth << SQEBB_SHIFT) + + ERDMA_EXTRA_BUFFER_SIZE, + cmdq->sq.qbuf, cmdq->sq.qbuf_dma_addr); + + return err; +} + +void erdma_finish_cmdq_init(struct erdma_dev *dev) +{ + /* after device init successfully, change cmdq to event mode. */ + dev->cmdq.use_event = true; + arm_cmdq_cq(&dev->cmdq); +} + +void erdma_cmdq_destroy(struct erdma_dev *dev) +{ + struct erdma_cmdq *cmdq = &dev->cmdq; + + clear_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state); + + dma_free_coherent(&dev->pdev->dev, + (cmdq->eq.depth << EQE_SHIFT) + + ERDMA_EXTRA_BUFFER_SIZE, + cmdq->eq.qbuf, cmdq->eq.qbuf_dma_addr); + dma_free_coherent(&dev->pdev->dev, + (cmdq->sq.depth << SQEBB_SHIFT) + + ERDMA_EXTRA_BUFFER_SIZE, + cmdq->sq.qbuf, cmdq->sq.qbuf_dma_addr); + dma_free_coherent(&dev->pdev->dev, + (cmdq->cq.depth << CQE_SHIFT) + + ERDMA_EXTRA_BUFFER_SIZE, + cmdq->cq.qbuf, cmdq->cq.qbuf_dma_addr); +} + +static void *get_next_valid_cmdq_cqe(struct erdma_cmdq *cmdq) +{ + __be32 *cqe = get_queue_entry(cmdq->cq.qbuf, cmdq->cq.ci, + cmdq->cq.depth, CQE_SHIFT); + u32 owner = FIELD_GET(ERDMA_CQE_HDR_OWNER_MASK, + __be32_to_cpu(READ_ONCE(*cqe))); + + return owner ^ !!(cmdq->cq.ci & cmdq->cq.depth) ? cqe : NULL; +} + +static void push_cmdq_sqe(struct erdma_cmdq *cmdq, u64 *req, size_t req_len, + struct erdma_comp_wait *comp_wait) +{ + __le64 *wqe; + u64 hdr = *req; + + comp_wait->cmd_status = ERDMA_CMD_STATUS_ISSUED; + reinit_completion(&comp_wait->wait_event); + comp_wait->sq_pi = cmdq->sq.pi; + + wqe = get_queue_entry(cmdq->sq.qbuf, cmdq->sq.pi, cmdq->sq.depth, + SQEBB_SHIFT); + memcpy(wqe, req, req_len); + + cmdq->sq.pi += cmdq->sq.wqebb_cnt; + hdr |= FIELD_PREP(ERDMA_CMD_HDR_WQEBB_INDEX_MASK, cmdq->sq.pi) | + FIELD_PREP(ERDMA_CMD_HDR_CONTEXT_COOKIE_MASK, + comp_wait->ctx_id) | + FIELD_PREP(ERDMA_CMD_HDR_WQEBB_CNT_MASK, cmdq->sq.wqebb_cnt - 1); + *wqe = cpu_to_le64(hdr); + + kick_cmdq_db(cmdq); +} + +static int erdma_poll_single_cmd_completion(struct erdma_cmdq *cmdq) +{ + struct erdma_comp_wait *comp_wait; + u32 hdr0, sqe_idx; + __be32 *cqe; + u16 ctx_id; + u64 *sqe; + int i; + + cqe = get_next_valid_cmdq_cqe(cmdq); + if (!cqe) + return -EAGAIN; + + cmdq->cq.ci++; + + dma_rmb(); + hdr0 = __be32_to_cpu(*cqe); + sqe_idx = __be32_to_cpu(*(cqe + 1)); + + sqe = get_queue_entry(cmdq->sq.qbuf, sqe_idx, cmdq->sq.depth, + SQEBB_SHIFT); + ctx_id = FIELD_GET(ERDMA_CMD_HDR_CONTEXT_COOKIE_MASK, *sqe); + comp_wait = &cmdq->wait_pool[ctx_id]; + if (comp_wait->cmd_status != ERDMA_CMD_STATUS_ISSUED) + return -EIO; + + comp_wait->cmd_status = ERDMA_CMD_STATUS_FINISHED; + comp_wait->comp_status = FIELD_GET(ERDMA_CQE_HDR_SYNDROME_MASK, hdr0); + cmdq->sq.ci += cmdq->sq.wqebb_cnt; + + for (i = 0; i < 4; i++) + comp_wait->comp_data[i] = __be32_to_cpu(*(cqe + 2 + i)); + + if (cmdq->use_event) + complete(&comp_wait->wait_event); + + return 0; +} + +static void erdma_polling_cmd_completions(struct erdma_cmdq *cmdq) +{ + unsigned long flags; + u16 comp_num; + + spin_lock_irqsave(&cmdq->cq.lock, flags); + + /* We must have less than # of max_outstandings + * completions at one time. + */ + for (comp_num = 0; comp_num < cmdq->max_outstandings; comp_num++) + if (erdma_poll_single_cmd_completion(cmdq)) + break; + + if (comp_num && cmdq->use_event) + arm_cmdq_cq(cmdq); + + spin_unlock_irqrestore(&cmdq->cq.lock, flags); +} + +void erdma_cmdq_completion_handler(struct erdma_cmdq *cmdq) +{ + int got_event = 0; + + if (!test_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state) || + !cmdq->use_event) + return; + + while (get_next_valid_eqe(&cmdq->eq)) { + cmdq->eq.ci++; + got_event++; + } + + if (got_event) { + cmdq->cq.cmdsn++; + erdma_polling_cmd_completions(cmdq); + } + + notify_eq(&cmdq->eq); +} + +static int erdma_poll_cmd_completion(struct erdma_comp_wait *comp_ctx, + struct erdma_cmdq *cmdq, u32 timeout) +{ + unsigned long comp_timeout = jiffies + msecs_to_jiffies(timeout); + + while (1) { + erdma_polling_cmd_completions(cmdq); + if (comp_ctx->cmd_status != ERDMA_CMD_STATUS_ISSUED) + break; + + if (time_is_before_jiffies(comp_timeout)) + return -ETIME; + + msleep(20); + } + + return 0; +} + +static int erdma_wait_cmd_completion(struct erdma_comp_wait *comp_ctx, + struct erdma_cmdq *cmdq, u32 timeout) +{ + unsigned long flags = 0; + + wait_for_completion_timeout(&comp_ctx->wait_event, + msecs_to_jiffies(timeout)); + + if (unlikely(comp_ctx->cmd_status != ERDMA_CMD_STATUS_FINISHED)) { + spin_lock_irqsave(&cmdq->cq.lock, flags); + comp_ctx->cmd_status = ERDMA_CMD_STATUS_TIMEOUT; + spin_unlock_irqrestore(&cmdq->cq.lock, flags); + return -ETIME; + } + + return 0; +} + +void erdma_cmdq_build_reqhdr(u64 *hdr, u32 mod, u32 op) +{ + *hdr = FIELD_PREP(ERDMA_CMD_HDR_SUB_MOD_MASK, mod) | + FIELD_PREP(ERDMA_CMD_HDR_OPCODE_MASK, op); +} + +int erdma_post_cmd_wait(struct erdma_cmdq *cmdq, u64 *req, u32 req_size, + u64 *resp0, u64 *resp1) +{ + struct erdma_comp_wait *comp_wait; + int ret; + + if (!test_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state)) + return -ENODEV; + + down(&cmdq->credits); + + comp_wait = get_comp_wait(cmdq); + if (IS_ERR(comp_wait)) { + clear_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state); + set_bit(ERDMA_CMDQ_STATE_CTX_ERR_BIT, &cmdq->state); + up(&cmdq->credits); + return PTR_ERR(comp_wait); + } + + spin_lock(&cmdq->sq.lock); + push_cmdq_sqe(cmdq, req, req_size, comp_wait); + spin_unlock(&cmdq->sq.lock); + + if (cmdq->use_event) + ret = erdma_wait_cmd_completion(comp_wait, cmdq, + ERDMA_CMDQ_TIMEOUT_MS); + else + ret = erdma_poll_cmd_completion(comp_wait, cmdq, + ERDMA_CMDQ_TIMEOUT_MS); + + if (ret) { + set_bit(ERDMA_CMDQ_STATE_TIMEOUT_BIT, &cmdq->state); + clear_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state); + goto out; + } + + if (comp_wait->comp_status) + ret = -EIO; + + if (resp0 && resp1) { + *resp0 = *((u64 *)&comp_wait->comp_data[0]); + *resp1 = *((u64 *)&comp_wait->comp_data[2]); + } + put_comp_wait(cmdq, comp_wait); + +out: + up(&cmdq->credits); + + return ret; +} diff --git a/drivers/infiniband/hw/erdma/erdma_cq.c b/drivers/infiniband/hw/erdma/erdma_cq.c new file mode 100644 index 000000000000..751c7f9f0de7 --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_cq.c @@ -0,0 +1,205 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */ +/* Kai Shen <kaishen@linux.alibaba.com> */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +#include <rdma/ib_verbs.h> + +#include "erdma_hw.h" +#include "erdma_verbs.h" + +static void *get_next_valid_cqe(struct erdma_cq *cq) +{ + __be32 *cqe = get_queue_entry(cq->kern_cq.qbuf, cq->kern_cq.ci, + cq->depth, CQE_SHIFT); + u32 owner = FIELD_GET(ERDMA_CQE_HDR_OWNER_MASK, + __be32_to_cpu(READ_ONCE(*cqe))); + + return owner ^ !!(cq->kern_cq.ci & cq->depth) ? cqe : NULL; +} + +static void notify_cq(struct erdma_cq *cq, u8 solcitied) +{ + u64 db_data = + FIELD_PREP(ERDMA_CQDB_IDX_MASK, (cq->kern_cq.notify_cnt)) | + FIELD_PREP(ERDMA_CQDB_CQN_MASK, cq->cqn) | + FIELD_PREP(ERDMA_CQDB_ARM_MASK, 1) | + FIELD_PREP(ERDMA_CQDB_SOL_MASK, solcitied) | + FIELD_PREP(ERDMA_CQDB_CMDSN_MASK, cq->kern_cq.cmdsn) | + FIELD_PREP(ERDMA_CQDB_CI_MASK, cq->kern_cq.ci); + + *cq->kern_cq.db_record = db_data; + writeq(db_data, cq->kern_cq.db); +} + +int erdma_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) +{ + struct erdma_cq *cq = to_ecq(ibcq); + unsigned long irq_flags; + int ret = 0; + + spin_lock_irqsave(&cq->kern_cq.lock, irq_flags); + + notify_cq(cq, (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED); + + if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && get_next_valid_cqe(cq)) + ret = 1; + + cq->kern_cq.notify_cnt++; + + spin_unlock_irqrestore(&cq->kern_cq.lock, irq_flags); + + return ret; +} + +static const enum ib_wc_opcode wc_mapping_table[ERDMA_NUM_OPCODES] = { + [ERDMA_OP_WRITE] = IB_WC_RDMA_WRITE, + [ERDMA_OP_READ] = IB_WC_RDMA_READ, + [ERDMA_OP_SEND] = IB_WC_SEND, + [ERDMA_OP_SEND_WITH_IMM] = IB_WC_SEND, + [ERDMA_OP_RECEIVE] = IB_WC_RECV, + [ERDMA_OP_RECV_IMM] = IB_WC_RECV_RDMA_WITH_IMM, + [ERDMA_OP_RECV_INV] = IB_WC_RECV, + [ERDMA_OP_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE, + [ERDMA_OP_INVALIDATE] = IB_WC_LOCAL_INV, + [ERDMA_OP_RSP_SEND_IMM] = IB_WC_RECV, + [ERDMA_OP_SEND_WITH_INV] = IB_WC_SEND, + [ERDMA_OP_REG_MR] = IB_WC_REG_MR, + [ERDMA_OP_LOCAL_INV] = IB_WC_LOCAL_INV, + [ERDMA_OP_READ_WITH_INV] = IB_WC_RDMA_READ, +}; + +static const struct { + enum erdma_wc_status erdma; + enum ib_wc_status base; + enum erdma_vendor_err vendor; +} map_cqe_status[ERDMA_NUM_WC_STATUS] = { + { ERDMA_WC_SUCCESS, IB_WC_SUCCESS, ERDMA_WC_VENDOR_NO_ERR }, + { ERDMA_WC_GENERAL_ERR, IB_WC_GENERAL_ERR, ERDMA_WC_VENDOR_NO_ERR }, + { ERDMA_WC_RECV_WQE_FORMAT_ERR, IB_WC_GENERAL_ERR, + ERDMA_WC_VENDOR_INVALID_RQE }, + { ERDMA_WC_RECV_STAG_INVALID_ERR, IB_WC_REM_ACCESS_ERR, + ERDMA_WC_VENDOR_RQE_INVALID_STAG }, + { ERDMA_WC_RECV_ADDR_VIOLATION_ERR, IB_WC_REM_ACCESS_ERR, + ERDMA_WC_VENDOR_RQE_ADDR_VIOLATION }, + { ERDMA_WC_RECV_RIGHT_VIOLATION_ERR, IB_WC_REM_ACCESS_ERR, + ERDMA_WC_VENDOR_RQE_ACCESS_RIGHT_ERR }, + { ERDMA_WC_RECV_PDID_ERR, IB_WC_REM_ACCESS_ERR, + ERDMA_WC_VENDOR_RQE_INVALID_PD }, + { ERDMA_WC_RECV_WARRPING_ERR, IB_WC_REM_ACCESS_ERR, + ERDMA_WC_VENDOR_RQE_WRAP_ERR }, + { ERDMA_WC_SEND_WQE_FORMAT_ERR, IB_WC_LOC_QP_OP_ERR, + ERDMA_WC_VENDOR_INVALID_SQE }, + { ERDMA_WC_SEND_WQE_ORD_EXCEED, IB_WC_GENERAL_ERR, + ERDMA_WC_VENDOR_ZERO_ORD }, + { ERDMA_WC_SEND_STAG_INVALID_ERR, IB_WC_LOC_ACCESS_ERR, + ERDMA_WC_VENDOR_SQE_INVALID_STAG }, + { ERDMA_WC_SEND_ADDR_VIOLATION_ERR, IB_WC_LOC_ACCESS_ERR, + ERDMA_WC_VENDOR_SQE_ADDR_VIOLATION }, + { ERDMA_WC_SEND_RIGHT_VIOLATION_ERR, IB_WC_LOC_ACCESS_ERR, + ERDMA_WC_VENDOR_SQE_ACCESS_ERR }, + { ERDMA_WC_SEND_PDID_ERR, IB_WC_LOC_ACCESS_ERR, + ERDMA_WC_VENDOR_SQE_INVALID_PD }, + { ERDMA_WC_SEND_WARRPING_ERR, IB_WC_LOC_ACCESS_ERR, + ERDMA_WC_VENDOR_SQE_WARP_ERR }, + { ERDMA_WC_FLUSH_ERR, IB_WC_WR_FLUSH_ERR, ERDMA_WC_VENDOR_NO_ERR }, + { ERDMA_WC_RETRY_EXC_ERR, IB_WC_RETRY_EXC_ERR, ERDMA_WC_VENDOR_NO_ERR }, +}; + +#define ERDMA_POLLCQ_NO_QP 1 + +static int erdma_poll_one_cqe(struct erdma_cq *cq, struct ib_wc *wc) +{ + struct erdma_dev *dev = to_edev(cq->ibcq.device); + u8 opcode, syndrome, qtype; + struct erdma_kqp *kern_qp; + struct erdma_cqe *cqe; + struct erdma_qp *qp; + u16 wqe_idx, depth; + u32 qpn, cqe_hdr; + u64 *id_table; + u64 *wqe_hdr; + + cqe = get_next_valid_cqe(cq); + if (!cqe) + return -EAGAIN; + + cq->kern_cq.ci++; + + /* cqbuf should be ready when we poll */ + dma_rmb(); + + qpn = be32_to_cpu(cqe->qpn); + wqe_idx = be32_to_cpu(cqe->qe_idx); + cqe_hdr = be32_to_cpu(cqe->hdr); + + qp = find_qp_by_qpn(dev, qpn); + if (!qp) + return ERDMA_POLLCQ_NO_QP; + + kern_qp = &qp->kern_qp; + + qtype = FIELD_GET(ERDMA_CQE_HDR_QTYPE_MASK, cqe_hdr); + syndrome = FIELD_GET(ERDMA_CQE_HDR_SYNDROME_MASK, cqe_hdr); + opcode = FIELD_GET(ERDMA_CQE_HDR_OPCODE_MASK, cqe_hdr); + + if (qtype == ERDMA_CQE_QTYPE_SQ) { + id_table = kern_qp->swr_tbl; + depth = qp->attrs.sq_size; + wqe_hdr = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx, + qp->attrs.sq_size, SQEBB_SHIFT); + kern_qp->sq_ci = + FIELD_GET(ERDMA_SQE_HDR_WQEBB_CNT_MASK, *wqe_hdr) + + wqe_idx + 1; + } else { + id_table = kern_qp->rwr_tbl; + depth = qp->attrs.rq_size; + } + wc->wr_id = id_table[wqe_idx & (depth - 1)]; + wc->byte_len = be32_to_cpu(cqe->size); + + wc->wc_flags = 0; + + wc->opcode = wc_mapping_table[opcode]; + if (opcode == ERDMA_OP_RECV_IMM || opcode == ERDMA_OP_RSP_SEND_IMM) { + wc->ex.imm_data = cpu_to_be32(le32_to_cpu(cqe->imm_data)); + wc->wc_flags |= IB_WC_WITH_IMM; + } else if (opcode == ERDMA_OP_RECV_INV) { + wc->ex.invalidate_rkey = be32_to_cpu(cqe->inv_rkey); + wc->wc_flags |= IB_WC_WITH_INVALIDATE; + } + + if (syndrome >= ERDMA_NUM_WC_STATUS) + syndrome = ERDMA_WC_GENERAL_ERR; + + wc->status = map_cqe_status[syndrome].base; + wc->vendor_err = map_cqe_status[syndrome].vendor; + wc->qp = &qp->ibqp; + + return 0; +} + +int erdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) +{ + struct erdma_cq *cq = to_ecq(ibcq); + unsigned long flags; + int npolled, ret; + + spin_lock_irqsave(&cq->kern_cq.lock, flags); + + for (npolled = 0; npolled < num_entries;) { + ret = erdma_poll_one_cqe(cq, wc + npolled); + + if (ret == -EAGAIN) /* no received new CQEs. */ + break; + else if (ret) /* ignore invalid CQEs. */ + continue; + + npolled++; + } + + spin_unlock_irqrestore(&cq->kern_cq.lock, flags); + + return npolled; +} diff --git a/drivers/infiniband/hw/erdma/erdma_eq.c b/drivers/infiniband/hw/erdma/erdma_eq.c new file mode 100644 index 000000000000..8f2d094e0227 --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_eq.c @@ -0,0 +1,329 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */ +/* Kai Shen <kaishen@linux.alibaba.com> */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +#include <linux/errno.h> +#include <linux/pci.h> +#include <linux/types.h> + +#include "erdma.h" +#include "erdma_hw.h" +#include "erdma_verbs.h" + +#define MAX_POLL_CHUNK_SIZE 16 + +void notify_eq(struct erdma_eq *eq) +{ + u64 db_data = FIELD_PREP(ERDMA_EQDB_CI_MASK, eq->ci) | + FIELD_PREP(ERDMA_EQDB_ARM_MASK, 1); + + *eq->db_record = db_data; + writeq(db_data, eq->db_addr); + + atomic64_inc(&eq->notify_num); +} + +void *get_next_valid_eqe(struct erdma_eq *eq) +{ + u64 *eqe = get_queue_entry(eq->qbuf, eq->ci, eq->depth, EQE_SHIFT); + u32 owner = FIELD_GET(ERDMA_CEQE_HDR_O_MASK, READ_ONCE(*eqe)); + + return owner ^ !!(eq->ci & eq->depth) ? eqe : NULL; +} + +void erdma_aeq_event_handler(struct erdma_dev *dev) +{ + struct erdma_aeqe *aeqe; + u32 cqn, qpn; + struct erdma_qp *qp; + struct erdma_cq *cq; + struct ib_event event; + u32 poll_cnt = 0; + + memset(&event, 0, sizeof(event)); + + while (poll_cnt < MAX_POLL_CHUNK_SIZE) { + aeqe = get_next_valid_eqe(&dev->aeq); + if (!aeqe) + break; + + dma_rmb(); + + dev->aeq.ci++; + atomic64_inc(&dev->aeq.event_num); + poll_cnt++; + + if (FIELD_GET(ERDMA_AEQE_HDR_TYPE_MASK, + le32_to_cpu(aeqe->hdr)) == ERDMA_AE_TYPE_CQ_ERR) { + cqn = le32_to_cpu(aeqe->event_data0); + cq = find_cq_by_cqn(dev, cqn); + if (!cq) + continue; + + event.device = cq->ibcq.device; + event.element.cq = &cq->ibcq; + event.event = IB_EVENT_CQ_ERR; + if (cq->ibcq.event_handler) + cq->ibcq.event_handler(&event, + cq->ibcq.cq_context); + } else { + qpn = le32_to_cpu(aeqe->event_data0); + qp = find_qp_by_qpn(dev, qpn); + if (!qp) + continue; + + event.device = qp->ibqp.device; + event.element.qp = &qp->ibqp; + event.event = IB_EVENT_QP_FATAL; + if (qp->ibqp.event_handler) + qp->ibqp.event_handler(&event, + qp->ibqp.qp_context); + } + } + + notify_eq(&dev->aeq); +} + +int erdma_aeq_init(struct erdma_dev *dev) +{ + struct erdma_eq *eq = &dev->aeq; + u32 buf_size; + + eq->depth = ERDMA_DEFAULT_EQ_DEPTH; + buf_size = eq->depth << EQE_SHIFT; + + eq->qbuf = + dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), + &eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); + if (!eq->qbuf) + return -ENOMEM; + + spin_lock_init(&eq->lock); + atomic64_set(&eq->event_num, 0); + atomic64_set(&eq->notify_num, 0); + + eq->db_addr = (u64 __iomem *)(dev->func_bar + ERDMA_REGS_AEQ_DB_REG); + eq->db_record = (u64 *)(eq->qbuf + buf_size); + + erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_H_REG, + upper_32_bits(eq->qbuf_dma_addr)); + erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_L_REG, + lower_32_bits(eq->qbuf_dma_addr)); + erdma_reg_write32(dev, ERDMA_REGS_AEQ_DEPTH_REG, eq->depth); + erdma_reg_write64(dev, ERDMA_AEQ_DB_HOST_ADDR_REG, + eq->qbuf_dma_addr + buf_size); + + return 0; +} + +void erdma_aeq_destroy(struct erdma_dev *dev) +{ + struct erdma_eq *eq = &dev->aeq; + + dma_free_coherent(&dev->pdev->dev, + WARPPED_BUFSIZE(eq->depth << EQE_SHIFT), eq->qbuf, + eq->qbuf_dma_addr); +} + +void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb) +{ + struct erdma_dev *dev = ceq_cb->dev; + struct erdma_cq *cq; + u32 poll_cnt = 0; + u64 *ceqe; + int cqn; + + if (!ceq_cb->ready) + return; + + while (poll_cnt < MAX_POLL_CHUNK_SIZE) { + ceqe = get_next_valid_eqe(&ceq_cb->eq); + if (!ceqe) + break; + + dma_rmb(); + ceq_cb->eq.ci++; + poll_cnt++; + cqn = FIELD_GET(ERDMA_CEQE_HDR_CQN_MASK, READ_ONCE(*ceqe)); + + cq = find_cq_by_cqn(dev, cqn); + if (!cq) + continue; + + if (rdma_is_kernel_res(&cq->ibcq.res)) + cq->kern_cq.cmdsn++; + + if (cq->ibcq.comp_handler) + cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context); + } + + notify_eq(&ceq_cb->eq); +} + +static irqreturn_t erdma_intr_ceq_handler(int irq, void *data) +{ + struct erdma_eq_cb *ceq_cb = data; + + tasklet_schedule(&ceq_cb->tasklet); + + return IRQ_HANDLED; +} + +static void erdma_intr_ceq_task(unsigned long data) +{ + erdma_ceq_completion_handler((struct erdma_eq_cb *)data); +} + +static int erdma_set_ceq_irq(struct erdma_dev *dev, u16 ceqn) +{ + struct erdma_eq_cb *eqc = &dev->ceqs[ceqn]; + int err; + + snprintf(eqc->irq.name, ERDMA_IRQNAME_SIZE, "erdma-ceq%u@pci:%s", ceqn, + pci_name(dev->pdev)); + eqc->irq.msix_vector = pci_irq_vector(dev->pdev, ceqn + 1); + + tasklet_init(&dev->ceqs[ceqn].tasklet, erdma_intr_ceq_task, + (unsigned long)&dev->ceqs[ceqn]); + + cpumask_set_cpu(cpumask_local_spread(ceqn + 1, dev->attrs.numa_node), + &eqc->irq.affinity_hint_mask); + + err = request_irq(eqc->irq.msix_vector, erdma_intr_ceq_handler, 0, + eqc->irq.name, eqc); + if (err) { + dev_err(&dev->pdev->dev, "failed to request_irq(%d)\n", err); + return err; + } + + irq_set_affinity_hint(eqc->irq.msix_vector, + &eqc->irq.affinity_hint_mask); + + return 0; +} + +static void erdma_free_ceq_irq(struct erdma_dev *dev, u16 ceqn) +{ + struct erdma_eq_cb *eqc = &dev->ceqs[ceqn]; + + irq_set_affinity_hint(eqc->irq.msix_vector, NULL); + free_irq(eqc->irq.msix_vector, eqc); +} + +static int create_eq_cmd(struct erdma_dev *dev, u32 eqn, struct erdma_eq *eq) +{ + struct erdma_cmdq_create_eq_req req; + dma_addr_t db_info_dma_addr; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON, + CMDQ_OPCODE_CREATE_EQ); + req.eqn = eqn; + req.depth = ilog2(eq->depth); + req.qbuf_addr = eq->qbuf_dma_addr; + req.qtype = ERDMA_EQ_TYPE_CEQ; + /* Vector index is the same as EQN. */ + req.vector_idx = eqn; + db_info_dma_addr = eq->qbuf_dma_addr + (eq->depth << EQE_SHIFT); + req.db_dma_addr_l = lower_32_bits(db_info_dma_addr); + req.db_dma_addr_h = upper_32_bits(db_info_dma_addr); + + return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, + sizeof(struct erdma_cmdq_create_eq_req), + NULL, NULL); +} + +static int erdma_ceq_init_one(struct erdma_dev *dev, u16 ceqn) +{ + struct erdma_eq *eq = &dev->ceqs[ceqn].eq; + u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT; + int ret; + + eq->qbuf = + dma_alloc_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), + &eq->qbuf_dma_addr, GFP_KERNEL | __GFP_ZERO); + if (!eq->qbuf) + return -ENOMEM; + + spin_lock_init(&eq->lock); + atomic64_set(&eq->event_num, 0); + atomic64_set(&eq->notify_num, 0); + + eq->depth = ERDMA_DEFAULT_EQ_DEPTH; + eq->db_addr = + (u64 __iomem *)(dev->func_bar + ERDMA_REGS_CEQ_DB_BASE_REG + + (ceqn + 1) * ERDMA_DB_SIZE); + eq->db_record = (u64 *)(eq->qbuf + buf_size); + eq->ci = 0; + dev->ceqs[ceqn].dev = dev; + + /* CEQ indexed from 1, 0 rsvd for CMDQ-EQ. */ + ret = create_eq_cmd(dev, ceqn + 1, eq); + dev->ceqs[ceqn].ready = ret ? false : true; + + return ret; +} + +static void erdma_ceq_uninit_one(struct erdma_dev *dev, u16 ceqn) +{ + struct erdma_eq *eq = &dev->ceqs[ceqn].eq; + u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT; + struct erdma_cmdq_destroy_eq_req req; + int err; + + dev->ceqs[ceqn].ready = 0; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON, + CMDQ_OPCODE_DESTROY_EQ); + /* CEQ indexed from 1, 0 rsvd for CMDQ-EQ. */ + req.eqn = ceqn + 1; + req.qtype = ERDMA_EQ_TYPE_CEQ; + req.vector_idx = ceqn + 1; + + err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, + NULL); + if (err) + return; + + dma_free_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), eq->qbuf, + eq->qbuf_dma_addr); +} + +int erdma_ceqs_init(struct erdma_dev *dev) +{ + u32 i, j; + int err; + + for (i = 0; i < dev->attrs.irq_num - 1; i++) { + err = erdma_ceq_init_one(dev, i); + if (err) + goto out_err; + + err = erdma_set_ceq_irq(dev, i); + if (err) { + erdma_ceq_uninit_one(dev, i); + goto out_err; + } + } + + return 0; + +out_err: + for (j = 0; j < i; j++) { + erdma_free_ceq_irq(dev, j); + erdma_ceq_uninit_one(dev, j); + } + + return err; +} + +void erdma_ceqs_uninit(struct erdma_dev *dev) +{ + u32 i; + + for (i = 0; i < dev->attrs.irq_num - 1; i++) { + erdma_free_ceq_irq(dev, i); + erdma_ceq_uninit_one(dev, i); + } +} diff --git a/drivers/infiniband/hw/erdma/erdma_hw.h b/drivers/infiniband/hw/erdma/erdma_hw.h new file mode 100644 index 000000000000..b210c49c669f --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_hw.h @@ -0,0 +1,508 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ + +/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */ +/* Kai Shen <kaishen@linux.alibaba.com> */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +#ifndef __ERDMA_HW_H__ +#define __ERDMA_HW_H__ + +#include <linux/kernel.h> +#include <linux/types.h> + +/* PCIe device related definition. */ +#define PCI_VENDOR_ID_ALIBABA 0x1ded + +#define ERDMA_PCI_WIDTH 64 +#define ERDMA_FUNC_BAR 0 +#define ERDMA_MISX_BAR 2 + +#define ERDMA_BAR_MASK (BIT(ERDMA_FUNC_BAR) | BIT(ERDMA_MISX_BAR)) + +/* MSI-X related. */ +#define ERDMA_NUM_MSIX_VEC 32U +#define ERDMA_MSIX_VECTOR_CMDQ 0 + +/* PCIe Bar0 Registers. */ +#define ERDMA_REGS_VERSION_REG 0x0 +#define ERDMA_REGS_DEV_CTRL_REG 0x10 +#define ERDMA_REGS_DEV_ST_REG 0x14 +#define ERDMA_REGS_NETDEV_MAC_L_REG 0x18 +#define ERDMA_REGS_NETDEV_MAC_H_REG 0x1C +#define ERDMA_REGS_CMDQ_SQ_ADDR_L_REG 0x20 +#define ERDMA_REGS_CMDQ_SQ_ADDR_H_REG 0x24 +#define ERDMA_REGS_CMDQ_CQ_ADDR_L_REG 0x28 +#define ERDMA_REGS_CMDQ_CQ_ADDR_H_REG 0x2C +#define ERDMA_REGS_CMDQ_DEPTH_REG 0x30 +#define ERDMA_REGS_CMDQ_EQ_DEPTH_REG 0x34 +#define ERDMA_REGS_CMDQ_EQ_ADDR_L_REG 0x38 +#define ERDMA_REGS_CMDQ_EQ_ADDR_H_REG 0x3C +#define ERDMA_REGS_AEQ_ADDR_L_REG 0x40 +#define ERDMA_REGS_AEQ_ADDR_H_REG 0x44 +#define ERDMA_REGS_AEQ_DEPTH_REG 0x48 +#define ERDMA_REGS_GRP_NUM_REG 0x4c +#define ERDMA_REGS_AEQ_DB_REG 0x50 +#define ERDMA_CMDQ_SQ_DB_HOST_ADDR_REG 0x60 +#define ERDMA_CMDQ_CQ_DB_HOST_ADDR_REG 0x68 +#define ERDMA_CMDQ_EQ_DB_HOST_ADDR_REG 0x70 +#define ERDMA_AEQ_DB_HOST_ADDR_REG 0x78 +#define ERDMA_REGS_STATS_TSO_IN_PKTS_REG 0x80 +#define ERDMA_REGS_STATS_TSO_OUT_PKTS_REG 0x88 +#define ERDMA_REGS_STATS_TSO_OUT_BYTES_REG 0x90 +#define ERDMA_REGS_STATS_TX_DROP_PKTS_REG 0x98 +#define ERDMA_REGS_STATS_TX_BPS_METER_DROP_PKTS_REG 0xa0 +#define ERDMA_REGS_STATS_TX_PPS_METER_DROP_PKTS_REG 0xa8 +#define ERDMA_REGS_STATS_RX_PKTS_REG 0xc0 +#define ERDMA_REGS_STATS_RX_BYTES_REG 0xc8 +#define ERDMA_REGS_STATS_RX_DROP_PKTS_REG 0xd0 +#define ERDMA_REGS_STATS_RX_BPS_METER_DROP_PKTS_REG 0xd8 +#define ERDMA_REGS_STATS_RX_PPS_METER_DROP_PKTS_REG 0xe0 +#define ERDMA_REGS_CEQ_DB_BASE_REG 0x100 +#define ERDMA_CMDQ_SQDB_REG 0x200 +#define ERDMA_CMDQ_CQDB_REG 0x300 + +/* DEV_CTRL_REG details. */ +#define ERDMA_REG_DEV_CTRL_RESET_MASK 0x00000001 +#define ERDMA_REG_DEV_CTRL_INIT_MASK 0x00000002 + +/* DEV_ST_REG details. */ +#define ERDMA_REG_DEV_ST_RESET_DONE_MASK 0x00000001U +#define ERDMA_REG_DEV_ST_INIT_DONE_MASK 0x00000002U + +/* eRDMA PCIe DBs definition. */ +#define ERDMA_BAR_DB_SPACE_BASE 4096 + +#define ERDMA_BAR_SQDB_SPACE_OFFSET ERDMA_BAR_DB_SPACE_BASE +#define ERDMA_BAR_SQDB_SPACE_SIZE (384 * 1024) + +#define ERDMA_BAR_RQDB_SPACE_OFFSET \ + (ERDMA_BAR_SQDB_SPACE_OFFSET + ERDMA_BAR_SQDB_SPACE_SIZE) +#define ERDMA_BAR_RQDB_SPACE_SIZE (96 * 1024) + +#define ERDMA_BAR_CQDB_SPACE_OFFSET \ + (ERDMA_BAR_RQDB_SPACE_OFFSET + ERDMA_BAR_RQDB_SPACE_SIZE) + +/* Doorbell page resources related. */ +/* + * Max # of parallelly issued directSQE is 3072 per device, + * hardware organizes this into 24 group, per group has 128 credits. + */ +#define ERDMA_DWQE_MAX_GRP_CNT 24 +#define ERDMA_DWQE_NUM_PER_GRP 128 + +#define ERDMA_DWQE_TYPE0_CNT 64 +#define ERDMA_DWQE_TYPE1_CNT 496 +/* type1 DB contains 2 DBs, takes 256Byte. */ +#define ERDMA_DWQE_TYPE1_CNT_PER_PAGE 16 + +#define ERDMA_SDB_SHARED_PAGE_INDEX 95 + +/* Doorbell related. */ +#define ERDMA_DB_SIZE 8 + +#define ERDMA_CQDB_IDX_MASK GENMASK_ULL(63, 56) +#define ERDMA_CQDB_CQN_MASK GENMASK_ULL(55, 32) +#define ERDMA_CQDB_ARM_MASK BIT_ULL(31) +#define ERDMA_CQDB_SOL_MASK BIT_ULL(30) +#define ERDMA_CQDB_CMDSN_MASK GENMASK_ULL(29, 28) +#define ERDMA_CQDB_CI_MASK GENMASK_ULL(23, 0) + +#define ERDMA_EQDB_ARM_MASK BIT(31) +#define ERDMA_EQDB_CI_MASK GENMASK_ULL(23, 0) + +#define ERDMA_PAGE_SIZE_SUPPORT 0x7FFFF000 + +/* WQE related. */ +#define EQE_SIZE 16 +#define EQE_SHIFT 4 +#define RQE_SIZE 32 +#define RQE_SHIFT 5 +#define CQE_SIZE 32 +#define CQE_SHIFT 5 +#define SQEBB_SIZE 32 +#define SQEBB_SHIFT 5 +#define SQEBB_MASK (~(SQEBB_SIZE - 1)) +#define SQEBB_ALIGN(size) ((size + SQEBB_SIZE - 1) & SQEBB_MASK) +#define SQEBB_COUNT(size) (SQEBB_ALIGN(size) >> SQEBB_SHIFT) + +#define ERDMA_MAX_SQE_SIZE 128 +#define ERDMA_MAX_WQEBB_PER_SQE 4 + +/* CMDQ related. */ +#define ERDMA_CMDQ_MAX_OUTSTANDING 128 +#define ERDMA_CMDQ_SQE_SIZE 64 + +/* cmdq sub module definition. */ +enum CMDQ_WQE_SUB_MOD { + CMDQ_SUBMOD_RDMA = 0, + CMDQ_SUBMOD_COMMON = 1 +}; + +enum CMDQ_RDMA_OPCODE { + CMDQ_OPCODE_QUERY_DEVICE = 0, + CMDQ_OPCODE_CREATE_QP = 1, + CMDQ_OPCODE_DESTROY_QP = 2, + CMDQ_OPCODE_MODIFY_QP = 3, + CMDQ_OPCODE_CREATE_CQ = 4, + CMDQ_OPCODE_DESTROY_CQ = 5, + CMDQ_OPCODE_REG_MR = 8, + CMDQ_OPCODE_DEREG_MR = 9 +}; + +enum CMDQ_COMMON_OPCODE { + CMDQ_OPCODE_CREATE_EQ = 0, + CMDQ_OPCODE_DESTROY_EQ = 1, + CMDQ_OPCODE_QUERY_FW_INFO = 2, +}; + +/* cmdq-SQE HDR */ +#define ERDMA_CMD_HDR_WQEBB_CNT_MASK GENMASK_ULL(54, 52) +#define ERDMA_CMD_HDR_CONTEXT_COOKIE_MASK GENMASK_ULL(47, 32) +#define ERDMA_CMD_HDR_SUB_MOD_MASK GENMASK_ULL(25, 24) +#define ERDMA_CMD_HDR_OPCODE_MASK GENMASK_ULL(23, 16) +#define ERDMA_CMD_HDR_WQEBB_INDEX_MASK GENMASK_ULL(15, 0) + +struct erdma_cmdq_destroy_cq_req { + u64 hdr; + u32 cqn; +}; + +#define ERDMA_EQ_TYPE_AEQ 0 +#define ERDMA_EQ_TYPE_CEQ 1 + +struct erdma_cmdq_create_eq_req { + u64 hdr; + u64 qbuf_addr; + u8 vector_idx; + u8 eqn; + u8 depth; + u8 qtype; + u32 db_dma_addr_l; + u32 db_dma_addr_h; +}; + +struct erdma_cmdq_destroy_eq_req { + u64 hdr; + u64 rsvd0; + u8 vector_idx; + u8 eqn; + u8 rsvd1; + u8 qtype; +}; + +/* create_cq cfg0 */ +#define ERDMA_CMD_CREATE_CQ_DEPTH_MASK GENMASK(31, 24) +#define ERDMA_CMD_CREATE_CQ_PAGESIZE_MASK GENMASK(23, 20) +#define ERDMA_CMD_CREATE_CQ_CQN_MASK GENMASK(19, 0) + +/* create_cq cfg1 */ +#define ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK GENMASK(31, 16) +#define ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK BIT(15) +#define ERDMA_CMD_CREATE_CQ_EQN_MASK GENMASK(9, 0) + +struct erdma_cmdq_create_cq_req { + u64 hdr; + u32 cfg0; + u32 qbuf_addr_l; + u32 qbuf_addr_h; + u32 cfg1; + u64 cq_db_info_addr; + u32 first_page_offset; +}; + +/* regmr/deregmr cfg0 */ +#define ERDMA_CMD_MR_VALID_MASK BIT(31) +#define ERDMA_CMD_MR_KEY_MASK GENMASK(27, 20) +#define ERDMA_CMD_MR_MPT_IDX_MASK GENMASK(19, 0) + +/* regmr cfg1 */ +#define ERDMA_CMD_REGMR_PD_MASK GENMASK(31, 12) +#define ERDMA_CMD_REGMR_TYPE_MASK GENMASK(7, 6) +#define ERDMA_CMD_REGMR_RIGHT_MASK GENMASK(5, 2) +#define ERDMA_CMD_REGMR_ACC_MODE_MASK GENMASK(1, 0) + +/* regmr cfg2 */ +#define ERDMA_CMD_REGMR_PAGESIZE_MASK GENMASK(31, 27) +#define ERDMA_CMD_REGMR_MTT_TYPE_MASK GENMASK(21, 20) +#define ERDMA_CMD_REGMR_MTT_CNT_MASK GENMASK(19, 0) + +struct erdma_cmdq_reg_mr_req { + u64 hdr; + u32 cfg0; + u32 cfg1; + u64 start_va; + u32 size; + u32 cfg2; + u64 phy_addr[4]; +}; + +struct erdma_cmdq_dereg_mr_req { + u64 hdr; + u32 cfg; +}; + +/* modify qp cfg */ +#define ERDMA_CMD_MODIFY_QP_STATE_MASK GENMASK(31, 24) +#define ERDMA_CMD_MODIFY_QP_CC_MASK GENMASK(23, 20) +#define ERDMA_CMD_MODIFY_QP_QPN_MASK GENMASK(19, 0) + +struct erdma_cmdq_modify_qp_req { + u64 hdr; + u32 cfg; + u32 cookie; + __be32 dip; + __be32 sip; + __be16 sport; + __be16 dport; + u32 send_nxt; + u32 recv_nxt; +}; + +/* create qp cfg0 */ +#define ERDMA_CMD_CREATE_QP_SQ_DEPTH_MASK GENMASK(31, 20) +#define ERDMA_CMD_CREATE_QP_QPN_MASK GENMASK(19, 0) + +/* create qp cfg1 */ +#define ERDMA_CMD_CREATE_QP_RQ_DEPTH_MASK GENMASK(31, 20) +#define ERDMA_CMD_CREATE_QP_PD_MASK GENMASK(19, 0) + +/* create qp cqn_mtt_cfg */ +#define ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK GENMASK(31, 28) +#define ERDMA_CMD_CREATE_QP_CQN_MASK GENMASK(23, 0) + +/* create qp mtt_cfg */ +#define ERDMA_CMD_CREATE_QP_PAGE_OFFSET_MASK GENMASK(31, 12) +#define ERDMA_CMD_CREATE_QP_MTT_CNT_MASK GENMASK(11, 1) +#define ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK BIT(0) + +#define ERDMA_CMDQ_CREATE_QP_RESP_COOKIE_MASK GENMASK_ULL(31, 0) + +struct erdma_cmdq_create_qp_req { + u64 hdr; + u32 cfg0; + u32 cfg1; + u32 sq_cqn_mtt_cfg; + u32 rq_cqn_mtt_cfg; + u64 sq_buf_addr; + u64 rq_buf_addr; + u32 sq_mtt_cfg; + u32 rq_mtt_cfg; + u64 sq_db_info_dma_addr; + u64 rq_db_info_dma_addr; +}; + +struct erdma_cmdq_destroy_qp_req { + u64 hdr; + u32 qpn; +}; + +/* cap qword 0 definition */ +#define ERDMA_CMD_DEV_CAP_MAX_CQE_MASK GENMASK_ULL(47, 40) +#define ERDMA_CMD_DEV_CAP_MAX_RECV_WR_MASK GENMASK_ULL(23, 16) +#define ERDMA_CMD_DEV_CAP_MAX_MR_SIZE_MASK GENMASK_ULL(7, 0) + +/* cap qword 1 definition */ +#define ERDMA_CMD_DEV_CAP_DMA_LOCAL_KEY_MASK GENMASK_ULL(63, 32) +#define ERDMA_CMD_DEV_CAP_DEFAULT_CC_MASK GENMASK_ULL(31, 28) +#define ERDMA_CMD_DEV_CAP_QBLOCK_MASK GENMASK_ULL(27, 16) +#define ERDMA_CMD_DEV_CAP_MAX_MW_MASK GENMASK_ULL(7, 0) + +#define ERDMA_NQP_PER_QBLOCK 1024 + +#define ERDMA_CMD_INFO0_FW_VER_MASK GENMASK_ULL(31, 0) + +/* CQE hdr */ +#define ERDMA_CQE_HDR_OWNER_MASK BIT(31) +#define ERDMA_CQE_HDR_OPCODE_MASK GENMASK(23, 16) +#define ERDMA_CQE_HDR_QTYPE_MASK GENMASK(15, 8) +#define ERDMA_CQE_HDR_SYNDROME_MASK GENMASK(7, 0) + +#define ERDMA_CQE_QTYPE_SQ 0 +#define ERDMA_CQE_QTYPE_RQ 1 +#define ERDMA_CQE_QTYPE_CMDQ 2 + +struct erdma_cqe { + __be32 hdr; + __be32 qe_idx; + __be32 qpn; + union { + __le32 imm_data; + __be32 inv_rkey; + }; + __be32 size; + __be32 rsvd[3]; +}; + +struct erdma_sge { + __aligned_le64 laddr; + __le32 length; + __le32 lkey; +}; + +/* Receive Queue Element */ +struct erdma_rqe { + __le16 qe_idx; + __le16 rsvd0; + __le32 qpn; + __le32 rsvd1; + __le32 rsvd2; + __le64 to; + __le32 length; + __le32 stag; +}; + +/* SQE */ +#define ERDMA_SQE_HDR_SGL_LEN_MASK GENMASK_ULL(63, 56) +#define ERDMA_SQE_HDR_WQEBB_CNT_MASK GENMASK_ULL(54, 52) +#define ERDMA_SQE_HDR_QPN_MASK GENMASK_ULL(51, 32) +#define ERDMA_SQE_HDR_OPCODE_MASK GENMASK_ULL(31, 27) +#define ERDMA_SQE_HDR_DWQE_MASK BIT_ULL(26) +#define ERDMA_SQE_HDR_INLINE_MASK BIT_ULL(25) +#define ERDMA_SQE_HDR_FENCE_MASK BIT_ULL(24) +#define ERDMA_SQE_HDR_SE_MASK BIT_ULL(23) +#define ERDMA_SQE_HDR_CE_MASK BIT_ULL(22) +#define ERDMA_SQE_HDR_WQEBB_INDEX_MASK GENMASK_ULL(15, 0) + +/* REG MR attrs */ +#define ERDMA_SQE_MR_MODE_MASK GENMASK(1, 0) +#define ERDMA_SQE_MR_ACCESS_MASK GENMASK(5, 2) +#define ERDMA_SQE_MR_MTT_TYPE_MASK GENMASK(7, 6) +#define ERDMA_SQE_MR_MTT_CNT_MASK GENMASK(31, 12) + +struct erdma_write_sqe { + __le64 hdr; + __be32 imm_data; + __le32 length; + + __le32 sink_stag; + __le32 sink_to_l; + __le32 sink_to_h; + + __le32 rsvd; + + struct erdma_sge sgl[0]; +}; + +struct erdma_send_sqe { + __le64 hdr; + union { + __be32 imm_data; + __le32 invalid_stag; + }; + + __le32 length; + struct erdma_sge sgl[0]; +}; + +struct erdma_readreq_sqe { + __le64 hdr; + __le32 invalid_stag; + __le32 length; + __le32 sink_stag; + __le32 sink_to_l; + __le32 sink_to_h; + __le32 rsvd; +}; + +struct erdma_reg_mr_sqe { + __le64 hdr; + __le64 addr; + __le32 length; + __le32 stag; + __le32 attrs; + __le32 rsvd; +}; + +/* EQ related. */ +#define ERDMA_DEFAULT_EQ_DEPTH 256 + +/* ceqe */ +#define ERDMA_CEQE_HDR_DB_MASK BIT_ULL(63) +#define ERDMA_CEQE_HDR_PI_MASK GENMASK_ULL(55, 32) +#define ERDMA_CEQE_HDR_O_MASK BIT_ULL(31) +#define ERDMA_CEQE_HDR_CQN_MASK GENMASK_ULL(19, 0) + +/* aeqe */ +#define ERDMA_AEQE_HDR_O_MASK BIT(31) +#define ERDMA_AEQE_HDR_TYPE_MASK GENMASK(23, 16) +#define ERDMA_AEQE_HDR_SUBTYPE_MASK GENMASK(7, 0) + +#define ERDMA_AE_TYPE_QP_FATAL_EVENT 0 +#define ERDMA_AE_TYPE_QP_ERQ_ERR_EVENT 1 +#define ERDMA_AE_TYPE_ACC_ERR_EVENT 2 +#define ERDMA_AE_TYPE_CQ_ERR 3 +#define ERDMA_AE_TYPE_OTHER_ERROR 4 + +struct erdma_aeqe { + __le32 hdr; + __le32 event_data0; + __le32 event_data1; + __le32 rsvd; +}; + +enum erdma_opcode { + ERDMA_OP_WRITE = 0, + ERDMA_OP_READ = 1, + ERDMA_OP_SEND = 2, + ERDMA_OP_SEND_WITH_IMM = 3, + + ERDMA_OP_RECEIVE = 4, + ERDMA_OP_RECV_IMM = 5, + ERDMA_OP_RECV_INV = 6, + + ERDMA_OP_REQ_ERR = 7, + ERDMA_OP_READ_RESPONSE = 8, + ERDMA_OP_WRITE_WITH_IMM = 9, + + ERDMA_OP_RECV_ERR = 10, + + ERDMA_OP_INVALIDATE = 11, + ERDMA_OP_RSP_SEND_IMM = 12, + ERDMA_OP_SEND_WITH_INV = 13, + + ERDMA_OP_REG_MR = 14, + ERDMA_OP_LOCAL_INV = 15, + ERDMA_OP_READ_WITH_INV = 16, + ERDMA_NUM_OPCODES = 17, + ERDMA_OP_INVALID = ERDMA_NUM_OPCODES + 1 +}; + +enum erdma_wc_status { + ERDMA_WC_SUCCESS = 0, + ERDMA_WC_GENERAL_ERR = 1, + ERDMA_WC_RECV_WQE_FORMAT_ERR = 2, + ERDMA_WC_RECV_STAG_INVALID_ERR = 3, + ERDMA_WC_RECV_ADDR_VIOLATION_ERR = 4, + ERDMA_WC_RECV_RIGHT_VIOLATION_ERR = 5, + ERDMA_WC_RECV_PDID_ERR = 6, + ERDMA_WC_RECV_WARRPING_ERR = 7, + ERDMA_WC_SEND_WQE_FORMAT_ERR = 8, + ERDMA_WC_SEND_WQE_ORD_EXCEED = 9, + ERDMA_WC_SEND_STAG_INVALID_ERR = 10, + ERDMA_WC_SEND_ADDR_VIOLATION_ERR = 11, + ERDMA_WC_SEND_RIGHT_VIOLATION_ERR = 12, + ERDMA_WC_SEND_PDID_ERR = 13, + ERDMA_WC_SEND_WARRPING_ERR = 14, + ERDMA_WC_FLUSH_ERR = 15, + ERDMA_WC_RETRY_EXC_ERR = 16, + ERDMA_NUM_WC_STATUS +}; + +enum erdma_vendor_err { + ERDMA_WC_VENDOR_NO_ERR = 0, + ERDMA_WC_VENDOR_INVALID_RQE = 1, + ERDMA_WC_VENDOR_RQE_INVALID_STAG = 2, + ERDMA_WC_VENDOR_RQE_ADDR_VIOLATION = 3, + ERDMA_WC_VENDOR_RQE_ACCESS_RIGHT_ERR = 4, + ERDMA_WC_VENDOR_RQE_INVALID_PD = 5, + ERDMA_WC_VENDOR_RQE_WRAP_ERR = 6, + ERDMA_WC_VENDOR_INVALID_SQE = 0x20, + ERDMA_WC_VENDOR_ZERO_ORD = 0x21, + ERDMA_WC_VENDOR_SQE_INVALID_STAG = 0x30, + ERDMA_WC_VENDOR_SQE_ADDR_VIOLATION = 0x31, + ERDMA_WC_VENDOR_SQE_ACCESS_ERR = 0x32, + ERDMA_WC_VENDOR_SQE_INVALID_PD = 0x33, + ERDMA_WC_VENDOR_SQE_WARP_ERR = 0x34 +}; + +#endif diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c new file mode 100644 index 000000000000..07e743d24847 --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -0,0 +1,608 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */ +/* Kai Shen <kaishen@linux.alibaba.com> */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/netdevice.h> +#include <linux/pci.h> +#include <net/addrconf.h> +#include <rdma/erdma-abi.h> +#include <rdma/ib_verbs.h> +#include <rdma/ib_user_verbs.h> + +#include "erdma.h" +#include "erdma_cm.h" +#include "erdma_hw.h" +#include "erdma_verbs.h" + +MODULE_AUTHOR("Cheng Xu <chengyou@linux.alibaba.com>"); +MODULE_DESCRIPTION("Alibaba elasticRDMA adapter driver"); +MODULE_LICENSE("Dual BSD/GPL"); + +static int erdma_netdev_event(struct notifier_block *nb, unsigned long event, + void *arg) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(arg); + struct erdma_dev *dev = container_of(nb, struct erdma_dev, netdev_nb); + + if (dev->netdev == NULL || dev->netdev != netdev) + goto done; + + switch (event) { + case NETDEV_UP: + dev->state = IB_PORT_ACTIVE; + erdma_port_event(dev, IB_EVENT_PORT_ACTIVE); + break; + case NETDEV_DOWN: + dev->state = IB_PORT_DOWN; + erdma_port_event(dev, IB_EVENT_PORT_ERR); + break; + case NETDEV_REGISTER: + case NETDEV_UNREGISTER: + case NETDEV_CHANGEADDR: + case NETDEV_CHANGEMTU: + case NETDEV_GOING_DOWN: + case NETDEV_CHANGE: + default: + break; + } + +done: + return NOTIFY_OK; +} + +static int erdma_enum_and_get_netdev(struct erdma_dev *dev) +{ + struct net_device *netdev; + int ret = -ENODEV; + + /* Already binded to a net_device, so we skip. */ + if (dev->netdev) + return 0; + + rtnl_lock(); + for_each_netdev(&init_net, netdev) { + /* + * In erdma, the paired netdev and ibdev should have the same + * MAC address. erdma can get the value from its PCIe bar + * registers. Since erdma can not get the paired netdev + * reference directly, we do a traverse here to get the paired + * netdev. + */ + if (ether_addr_equal_unaligned(netdev->perm_addr, + dev->attrs.peer_addr)) { + ret = ib_device_set_netdev(&dev->ibdev, netdev, 1); + if (ret) { + rtnl_unlock(); + ibdev_warn(&dev->ibdev, + "failed (%d) to link netdev", ret); + return ret; + } + + dev->netdev = netdev; + break; + } + } + + rtnl_unlock(); + + return ret; +} + +static int erdma_device_register(struct erdma_dev *dev) +{ + struct ib_device *ibdev = &dev->ibdev; + int ret; + + ret = erdma_enum_and_get_netdev(dev); + if (ret) + return ret; + + addrconf_addr_eui48((u8 *)&ibdev->node_guid, dev->netdev->dev_addr); + + ret = ib_register_device(ibdev, "erdma_%d", &dev->pdev->dev); + if (ret) { + dev_err(&dev->pdev->dev, + "ib_register_device failed: ret = %d\n", ret); + return ret; + } + + dev->netdev_nb.notifier_call = erdma_netdev_event; + ret = register_netdevice_notifier(&dev->netdev_nb); + if (ret) { + ibdev_err(&dev->ibdev, "failed to register notifier.\n"); + ib_unregister_device(ibdev); + } + + return ret; +} + +static irqreturn_t erdma_comm_irq_handler(int irq, void *data) +{ + struct erdma_dev *dev = data; + + erdma_cmdq_completion_handler(&dev->cmdq); + erdma_aeq_event_handler(dev); + + return IRQ_HANDLED; +} + +static void erdma_dwqe_resource_init(struct erdma_dev *dev) +{ + int total_pages, type0, type1; + + dev->attrs.grp_num = erdma_reg_read32(dev, ERDMA_REGS_GRP_NUM_REG); + + if (dev->attrs.grp_num < 4) + dev->attrs.disable_dwqe = true; + else + dev->attrs.disable_dwqe = false; + + /* One page contains 4 goups. */ + total_pages = dev->attrs.grp_num * 4; + + if (dev->attrs.grp_num >= ERDMA_DWQE_MAX_GRP_CNT) { + dev->attrs.grp_num = ERDMA_DWQE_MAX_GRP_CNT; + type0 = ERDMA_DWQE_TYPE0_CNT; + type1 = ERDMA_DWQE_TYPE1_CNT / ERDMA_DWQE_TYPE1_CNT_PER_PAGE; + } else { + type1 = total_pages / 3; + type0 = total_pages - type1 - 1; + } + + dev->attrs.dwqe_pages = type0; + dev->attrs.dwqe_entries = type1 * ERDMA_DWQE_TYPE1_CNT_PER_PAGE; +} + +static int erdma_request_vectors(struct erdma_dev *dev) +{ + int expect_irq_num = min(num_possible_cpus() + 1, ERDMA_NUM_MSIX_VEC); + int ret; + + ret = pci_alloc_irq_vectors(dev->pdev, 1, expect_irq_num, PCI_IRQ_MSIX); + if (ret < 0) { + dev_err(&dev->pdev->dev, "request irq vectors failed(%d)\n", + ret); + return ret; + } + dev->attrs.irq_num = ret; + + return 0; +} + +static int erdma_comm_irq_init(struct erdma_dev *dev) +{ + snprintf(dev->comm_irq.name, ERDMA_IRQNAME_SIZE, "erdma-common@pci:%s", + pci_name(dev->pdev)); + dev->comm_irq.msix_vector = + pci_irq_vector(dev->pdev, ERDMA_MSIX_VECTOR_CMDQ); + + cpumask_set_cpu(cpumask_first(cpumask_of_pcibus(dev->pdev->bus)), + &dev->comm_irq.affinity_hint_mask); + irq_set_affinity_hint(dev->comm_irq.msix_vector, + &dev->comm_irq.affinity_hint_mask); + + return request_irq(dev->comm_irq.msix_vector, erdma_comm_irq_handler, 0, + dev->comm_irq.name, dev); +} + +static void erdma_comm_irq_uninit(struct erdma_dev *dev) +{ + irq_set_affinity_hint(dev->comm_irq.msix_vector, NULL); + free_irq(dev->comm_irq.msix_vector, dev); +} + +static int erdma_device_init(struct erdma_dev *dev, struct pci_dev *pdev) +{ + int ret; + + erdma_dwqe_resource_init(dev); + + ret = dma_set_mask_and_coherent(&pdev->dev, + DMA_BIT_MASK(ERDMA_PCI_WIDTH)); + if (ret) + return ret; + + dma_set_max_seg_size(&pdev->dev, UINT_MAX); + + return 0; +} + +static void erdma_device_uninit(struct erdma_dev *dev) +{ + u32 ctrl = FIELD_PREP(ERDMA_REG_DEV_CTRL_RESET_MASK, 1); + + erdma_reg_write32(dev, ERDMA_REGS_DEV_CTRL_REG, ctrl); +} + +static const struct pci_device_id erdma_pci_tbl[] = { + { PCI_DEVICE(PCI_VENDOR_ID_ALIBABA, 0x107f) }, + {} +}; + +static int erdma_probe_dev(struct pci_dev *pdev) +{ + struct erdma_dev *dev; + int bars, err; + u32 version; + + err = pci_enable_device(pdev); + if (err) { + dev_err(&pdev->dev, "pci_enable_device failed(%d)\n", err); + return err; + } + + pci_set_master(pdev); + + dev = ib_alloc_device(erdma_dev, ibdev); + if (!dev) { + dev_err(&pdev->dev, "ib_alloc_device failed\n"); + err = -ENOMEM; + goto err_disable_device; + } + + pci_set_drvdata(pdev, dev); + dev->pdev = pdev; + dev->attrs.numa_node = dev_to_node(&pdev->dev); + + bars = pci_select_bars(pdev, IORESOURCE_MEM); + err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME); + if (bars != ERDMA_BAR_MASK || err) { + err = err ? err : -EINVAL; + goto err_ib_device_release; + } + + dev->func_bar_addr = pci_resource_start(pdev, ERDMA_FUNC_BAR); + dev->func_bar_len = pci_resource_len(pdev, ERDMA_FUNC_BAR); + + dev->func_bar = + devm_ioremap(&pdev->dev, dev->func_bar_addr, dev->func_bar_len); + if (!dev->func_bar) { + dev_err(&pdev->dev, "devm_ioremap failed.\n"); + err = -EFAULT; + goto err_release_bars; + } + + version = erdma_reg_read32(dev, ERDMA_REGS_VERSION_REG); + if (version == 0) { + /* we knows that it is a non-functional function. */ + err = -ENODEV; + goto err_iounmap_func_bar; + } + + err = erdma_device_init(dev, pdev); + if (err) + goto err_iounmap_func_bar; + + err = erdma_request_vectors(dev); + if (err) + goto err_iounmap_func_bar; + + err = erdma_comm_irq_init(dev); + if (err) + goto err_free_vectors; + + err = erdma_aeq_init(dev); + if (err) + goto err_uninit_comm_irq; + + err = erdma_cmdq_init(dev); + if (err) + goto err_uninit_aeq; + + err = erdma_ceqs_init(dev); + if (err) + goto err_uninit_cmdq; + + erdma_finish_cmdq_init(dev); + + return 0; + +err_uninit_cmdq: + erdma_device_uninit(dev); + erdma_cmdq_destroy(dev); + +err_uninit_aeq: + erdma_aeq_destroy(dev); + +err_uninit_comm_irq: + erdma_comm_irq_uninit(dev); + +err_free_vectors: + pci_free_irq_vectors(dev->pdev); + +err_iounmap_func_bar: + devm_iounmap(&pdev->dev, dev->func_bar); + +err_release_bars: + pci_release_selected_regions(pdev, bars); + +err_ib_device_release: + ib_dealloc_device(&dev->ibdev); + +err_disable_device: + pci_disable_device(pdev); + + return err; +} + +static void erdma_remove_dev(struct pci_dev *pdev) +{ + struct erdma_dev *dev = pci_get_drvdata(pdev); + + erdma_ceqs_uninit(dev); + + erdma_device_uninit(dev); + + erdma_cmdq_destroy(dev); + erdma_aeq_destroy(dev); + erdma_comm_irq_uninit(dev); + pci_free_irq_vectors(dev->pdev); + + devm_iounmap(&pdev->dev, dev->func_bar); + pci_release_selected_regions(pdev, ERDMA_BAR_MASK); + + ib_dealloc_device(&dev->ibdev); + + pci_disable_device(pdev); +} + +#define ERDMA_GET_CAP(name, cap) FIELD_GET(ERDMA_CMD_DEV_CAP_##name##_MASK, cap) + +static int erdma_dev_attrs_init(struct erdma_dev *dev) +{ + int err; + u64 req_hdr, cap0, cap1; + + erdma_cmdq_build_reqhdr(&req_hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_QUERY_DEVICE); + + err = erdma_post_cmd_wait(&dev->cmdq, &req_hdr, sizeof(req_hdr), &cap0, + &cap1); + if (err) + return err; + + dev->attrs.max_cqe = 1 << ERDMA_GET_CAP(MAX_CQE, cap0); + dev->attrs.max_mr_size = 1ULL << ERDMA_GET_CAP(MAX_MR_SIZE, cap0); + dev->attrs.max_mw = 1 << ERDMA_GET_CAP(MAX_MW, cap1); + dev->attrs.max_recv_wr = 1 << ERDMA_GET_CAP(MAX_RECV_WR, cap0); + dev->attrs.local_dma_key = ERDMA_GET_CAP(DMA_LOCAL_KEY, cap1); + dev->attrs.cc = ERDMA_GET_CAP(DEFAULT_CC, cap1); + dev->attrs.max_qp = ERDMA_NQP_PER_QBLOCK * ERDMA_GET_CAP(QBLOCK, cap1); + dev->attrs.max_mr = dev->attrs.max_qp << 1; + dev->attrs.max_cq = dev->attrs.max_qp << 1; + + dev->attrs.max_send_wr = ERDMA_MAX_SEND_WR; + dev->attrs.max_ord = ERDMA_MAX_ORD; + dev->attrs.max_ird = ERDMA_MAX_IRD; + dev->attrs.max_send_sge = ERDMA_MAX_SEND_SGE; + dev->attrs.max_recv_sge = ERDMA_MAX_RECV_SGE; + dev->attrs.max_sge_rd = ERDMA_MAX_SGE_RD; + dev->attrs.max_pd = ERDMA_MAX_PD; + + dev->res_cb[ERDMA_RES_TYPE_PD].max_cap = ERDMA_MAX_PD; + dev->res_cb[ERDMA_RES_TYPE_STAG_IDX].max_cap = dev->attrs.max_mr; + + erdma_cmdq_build_reqhdr(&req_hdr, CMDQ_SUBMOD_COMMON, + CMDQ_OPCODE_QUERY_FW_INFO); + + err = erdma_post_cmd_wait(&dev->cmdq, &req_hdr, sizeof(req_hdr), &cap0, + &cap1); + if (!err) + dev->attrs.fw_version = + FIELD_GET(ERDMA_CMD_INFO0_FW_VER_MASK, cap0); + + return err; +} + +static int erdma_res_cb_init(struct erdma_dev *dev) +{ + int i, j; + + for (i = 0; i < ERDMA_RES_CNT; i++) { + dev->res_cb[i].next_alloc_idx = 1; + spin_lock_init(&dev->res_cb[i].lock); + dev->res_cb[i].bitmap = + bitmap_zalloc(dev->res_cb[i].max_cap, GFP_KERNEL); + if (!dev->res_cb[i].bitmap) + goto err; + } + + return 0; + +err: + for (j = 0; j < i; j++) + bitmap_free(dev->res_cb[j].bitmap); + + return -ENOMEM; +} + +static void erdma_res_cb_free(struct erdma_dev *dev) +{ + int i; + + for (i = 0; i < ERDMA_RES_CNT; i++) + bitmap_free(dev->res_cb[i].bitmap); +} + +static const struct ib_device_ops erdma_device_ops = { + .owner = THIS_MODULE, + .driver_id = RDMA_DRIVER_ERDMA, + .uverbs_abi_ver = ERDMA_ABI_VERSION, + + .alloc_mr = erdma_ib_alloc_mr, + .alloc_pd = erdma_alloc_pd, + .alloc_ucontext = erdma_alloc_ucontext, + .create_cq = erdma_create_cq, + .create_qp = erdma_create_qp, + .dealloc_pd = erdma_dealloc_pd, + .dealloc_ucontext = erdma_dealloc_ucontext, + .dereg_mr = erdma_dereg_mr, + .destroy_cq = erdma_destroy_cq, + .destroy_qp = erdma_destroy_qp, + .get_dma_mr = erdma_get_dma_mr, + .get_port_immutable = erdma_get_port_immutable, + .iw_accept = erdma_accept, + .iw_add_ref = erdma_qp_get_ref, + .iw_connect = erdma_connect, + .iw_create_listen = erdma_create_listen, + .iw_destroy_listen = erdma_destroy_listen, + .iw_get_qp = erdma_get_ibqp, + .iw_reject = erdma_reject, + .iw_rem_ref = erdma_qp_put_ref, + .map_mr_sg = erdma_map_mr_sg, + .mmap = erdma_mmap, + .mmap_free = erdma_mmap_free, + .modify_qp = erdma_modify_qp, + .post_recv = erdma_post_recv, + .post_send = erdma_post_send, + .poll_cq = erdma_poll_cq, + .query_device = erdma_query_device, + .query_gid = erdma_query_gid, + .query_port = erdma_query_port, + .query_qp = erdma_query_qp, + .req_notify_cq = erdma_req_notify_cq, + .reg_user_mr = erdma_reg_user_mr, + + INIT_RDMA_OBJ_SIZE(ib_cq, erdma_cq, ibcq), + INIT_RDMA_OBJ_SIZE(ib_pd, erdma_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_ucontext, erdma_ucontext, ibucontext), + INIT_RDMA_OBJ_SIZE(ib_qp, erdma_qp, ibqp), +}; + +static int erdma_ib_device_add(struct pci_dev *pdev) +{ + struct erdma_dev *dev = pci_get_drvdata(pdev); + struct ib_device *ibdev = &dev->ibdev; + u64 mac; + int ret; + + ret = erdma_dev_attrs_init(dev); + if (ret) + return ret; + + ibdev->node_type = RDMA_NODE_RNIC; + memcpy(ibdev->node_desc, ERDMA_NODE_DESC, sizeof(ERDMA_NODE_DESC)); + + /* + * Current model (one-to-one device association): + * One ERDMA device per net_device or, equivalently, + * per physical port. + */ + ibdev->phys_port_cnt = 1; + ibdev->num_comp_vectors = dev->attrs.irq_num - 1; + + ib_set_device_ops(ibdev, &erdma_device_ops); + + INIT_LIST_HEAD(&dev->cep_list); + + spin_lock_init(&dev->lock); + xa_init_flags(&dev->qp_xa, XA_FLAGS_ALLOC1); + xa_init_flags(&dev->cq_xa, XA_FLAGS_ALLOC1); + dev->next_alloc_cqn = 1; + dev->next_alloc_qpn = 1; + + ret = erdma_res_cb_init(dev); + if (ret) + return ret; + + spin_lock_init(&dev->db_bitmap_lock); + bitmap_zero(dev->sdb_page, ERDMA_DWQE_TYPE0_CNT); + bitmap_zero(dev->sdb_entry, ERDMA_DWQE_TYPE1_CNT); + + atomic_set(&dev->num_ctx, 0); + + mac = erdma_reg_read32(dev, ERDMA_REGS_NETDEV_MAC_L_REG); + mac |= (u64)erdma_reg_read32(dev, ERDMA_REGS_NETDEV_MAC_H_REG) << 32; + + u64_to_ether_addr(mac, dev->attrs.peer_addr); + + ret = erdma_device_register(dev); + if (ret) + goto err_out; + + return 0; + +err_out: + xa_destroy(&dev->qp_xa); + xa_destroy(&dev->cq_xa); + + erdma_res_cb_free(dev); + + return ret; +} + +static void erdma_ib_device_remove(struct pci_dev *pdev) +{ + struct erdma_dev *dev = pci_get_drvdata(pdev); + + unregister_netdevice_notifier(&dev->netdev_nb); + ib_unregister_device(&dev->ibdev); + + erdma_res_cb_free(dev); + xa_destroy(&dev->qp_xa); + xa_destroy(&dev->cq_xa); +} + +static int erdma_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + int ret; + + ret = erdma_probe_dev(pdev); + if (ret) + return ret; + + ret = erdma_ib_device_add(pdev); + if (ret) { + erdma_remove_dev(pdev); + return ret; + } + + return 0; +} + +static void erdma_remove(struct pci_dev *pdev) +{ + erdma_ib_device_remove(pdev); + erdma_remove_dev(pdev); +} + +static struct pci_driver erdma_pci_driver = { + .name = DRV_MODULE_NAME, + .id_table = erdma_pci_tbl, + .probe = erdma_probe, + .remove = erdma_remove +}; + +MODULE_DEVICE_TABLE(pci, erdma_pci_tbl); + +static __init int erdma_init_module(void) +{ + int ret; + + ret = erdma_cm_init(); + if (ret) + return ret; + + ret = pci_register_driver(&erdma_pci_driver); + if (ret) + erdma_cm_exit(); + + return ret; +} + +static void __exit erdma_exit_module(void) +{ + pci_unregister_driver(&erdma_pci_driver); + + erdma_cm_exit(); +} + +module_init(erdma_init_module); +module_exit(erdma_exit_module); diff --git a/drivers/infiniband/hw/erdma/erdma_qp.c b/drivers/infiniband/hw/erdma/erdma_qp.c new file mode 100644 index 000000000000..bc3ec22a62c5 --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_qp.c @@ -0,0 +1,566 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause + +/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */ +/* Kai Shen <kaishen@linux.alibaba.com> */ +/* Copyright (c) 2020-2021, Alibaba Group */ +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +#include <linux/errno.h> +#include <linux/pci.h> +#include <linux/scatterlist.h> +#include <linux/types.h> + +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_verbs.h> + +#include "erdma.h" +#include "erdma_cm.h" +#include "erdma_verbs.h" + +void erdma_qp_llp_close(struct erdma_qp *qp) +{ + struct erdma_qp_attrs qp_attrs; + + down_write(&qp->state_lock); + + switch (qp->attrs.state) { + case ERDMA_QP_STATE_RTS: + case ERDMA_QP_STATE_RTR: + case ERDMA_QP_STATE_IDLE: + case ERDMA_QP_STATE_TERMINATE: + qp_attrs.state = ERDMA_QP_STATE_CLOSING; + erdma_modify_qp_internal(qp, &qp_attrs, ERDMA_QP_ATTR_STATE); + break; + case ERDMA_QP_STATE_CLOSING: + qp->attrs.state = ERDMA_QP_STATE_IDLE; + break; + default: + break; + } + + if (qp->cep) { + erdma_cep_put(qp->cep); + qp->cep = NULL; + } + + up_write(&qp->state_lock); +} + +struct ib_qp *erdma_get_ibqp(struct ib_device *ibdev, int id) +{ + struct erdma_qp *qp = find_qp_by_qpn(to_edev(ibdev), id); + + if (qp) + return &qp->ibqp; + + return NULL; +} + +static int erdma_modify_qp_state_to_rts(struct erdma_qp *qp, + struct erdma_qp_attrs *attrs, + enum erdma_qp_attr_mask mask) +{ + int ret; + struct erdma_dev *dev = qp->dev; + struct erdma_cmdq_modify_qp_req req; + struct tcp_sock *tp; + struct erdma_cep *cep = qp->cep; + struct sockaddr_storage local_addr, remote_addr; + + if (!(mask & ERDMA_QP_ATTR_LLP_HANDLE)) + return -EINVAL; + + if (!(mask & ERDMA_QP_ATTR_MPA)) + return -EINVAL; + + ret = getname_local(cep->sock, &local_addr); + if (ret < 0) + return ret; + + ret = getname_peer(cep->sock, &remote_addr); + if (ret < 0) + return ret; + + qp->attrs.state = ERDMA_QP_STATE_RTS; + + tp = tcp_sk(qp->cep->sock->sk); + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_MODIFY_QP); + + req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, qp->attrs.state) | + FIELD_PREP(ERDMA_CMD_MODIFY_QP_CC_MASK, qp->attrs.cc) | + FIELD_PREP(ERDMA_CMD_MODIFY_QP_QPN_MASK, QP_ID(qp)); + + req.cookie = be32_to_cpu(qp->cep->mpa.ext_data.cookie); + req.dip = to_sockaddr_in(remote_addr).sin_addr.s_addr; + req.sip = to_sockaddr_in(local_addr).sin_addr.s_addr; + req.dport = to_sockaddr_in(remote_addr).sin_port; + req.sport = to_sockaddr_in(local_addr).sin_port; + + req.send_nxt = tp->snd_nxt; + /* rsvd tcp seq for mpa-rsp in server. */ + if (qp->attrs.qp_type == ERDMA_QP_PASSIVE) + req.send_nxt += MPA_DEFAULT_HDR_LEN + qp->attrs.pd_len; + req.recv_nxt = tp->rcv_nxt; + + return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, + NULL); +} + +static int erdma_modify_qp_state_to_stop(struct erdma_qp *qp, + struct erdma_qp_attrs *attrs, + enum erdma_qp_attr_mask mask) +{ + struct erdma_dev *dev = qp->dev; + struct erdma_cmdq_modify_qp_req req; + + qp->attrs.state = attrs->state; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_MODIFY_QP); + + req.cfg = FIELD_PREP(ERDMA_CMD_MODIFY_QP_STATE_MASK, attrs->state) | + FIELD_PREP(ERDMA_CMD_MODIFY_QP_QPN_MASK, QP_ID(qp)); + + return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, + NULL); +} + +int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, + enum erdma_qp_attr_mask mask) +{ + int drop_conn, ret = 0; + + if (!mask) + return 0; + + if (!(mask & ERDMA_QP_ATTR_STATE)) + return 0; + + switch (qp->attrs.state) { + case ERDMA_QP_STATE_IDLE: + case ERDMA_QP_STATE_RTR: + if (attrs->state == ERDMA_QP_STATE_RTS) { + ret = erdma_modify_qp_state_to_rts(qp, attrs, mask); + } else if (attrs->state == ERDMA_QP_STATE_ERROR) { + qp->attrs.state = ERDMA_QP_STATE_ERROR; + if (qp->cep) { + erdma_cep_put(qp->cep); + qp->cep = NULL; + } + ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); + } + break; + case ERDMA_QP_STATE_RTS: + drop_conn = 0; + + if (attrs->state == ERDMA_QP_STATE_CLOSING) { + ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); + drop_conn = 1; + } else if (attrs->state == ERDMA_QP_STATE_TERMINATE) { + qp->attrs.state = ERDMA_QP_STATE_TERMINATE; + ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); + drop_conn = 1; + } else if (attrs->state == ERDMA_QP_STATE_ERROR) { + ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); + qp->attrs.state = ERDMA_QP_STATE_ERROR; + drop_conn = 1; + } + + if (drop_conn) + erdma_qp_cm_drop(qp); + + break; + case ERDMA_QP_STATE_TERMINATE: + if (attrs->state == ERDMA_QP_STATE_ERROR) + qp->attrs.state = ERDMA_QP_STATE_ERROR; + break; + case ERDMA_QP_STATE_CLOSING: + if (attrs->state == ERDMA_QP_STATE_IDLE) { + qp->attrs.state = ERDMA_QP_STATE_IDLE; + } else if (attrs->state == ERDMA_QP_STATE_ERROR) { + ret = erdma_modify_qp_state_to_stop(qp, attrs, mask); + qp->attrs.state = ERDMA_QP_STATE_ERROR; + } else if (attrs->state != ERDMA_QP_STATE_CLOSING) { + return -ECONNABORTED; + } + break; + default: + break; + } + + return ret; +} + +static void erdma_qp_safe_free(struct kref *ref) +{ + struct erdma_qp *qp = container_of(ref, struct erdma_qp, ref); + + complete(&qp->safe_free); +} + +void erdma_qp_put(struct erdma_qp *qp) +{ + WARN_ON(kref_read(&qp->ref) < 1); + kref_put(&qp->ref, erdma_qp_safe_free); +} + +void erdma_qp_get(struct erdma_qp *qp) +{ + kref_get(&qp->ref); +} + +static int fill_inline_data(struct erdma_qp *qp, + const struct ib_send_wr *send_wr, u16 wqe_idx, + u32 sgl_offset, __le32 *length_field) +{ + u32 remain_size, copy_size, data_off, bytes = 0; + char *data; + int i = 0; + + wqe_idx += (sgl_offset >> SQEBB_SHIFT); + sgl_offset &= (SQEBB_SIZE - 1); + data = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx, qp->attrs.sq_size, + SQEBB_SHIFT); + + while (i < send_wr->num_sge) { + bytes += send_wr->sg_list[i].length; + if (bytes > (int)ERDMA_MAX_INLINE) + return -EINVAL; + + remain_size = send_wr->sg_list[i].length; + data_off = 0; + + while (1) { + copy_size = min(remain_size, SQEBB_SIZE - sgl_offset); + + memcpy(data + sgl_offset, + (void *)(uintptr_t)send_wr->sg_list[i].addr + + data_off, + copy_size); + remain_size -= copy_size; + data_off += copy_size; + sgl_offset += copy_size; + wqe_idx += (sgl_offset >> SQEBB_SHIFT); + sgl_offset &= (SQEBB_SIZE - 1); + + data = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx, + qp->attrs.sq_size, SQEBB_SHIFT); + if (!remain_size) + break; + } + + i++; + } + *length_field = cpu_to_le32(bytes); + + return bytes; +} + +static int fill_sgl(struct erdma_qp *qp, const struct ib_send_wr *send_wr, + u16 wqe_idx, u32 sgl_offset, __le32 *length_field) +{ + int i = 0; + u32 bytes = 0; + char *sgl; + + if (send_wr->num_sge > qp->dev->attrs.max_send_sge) + return -EINVAL; + + if (sgl_offset & 0xF) + return -EINVAL; + + while (i < send_wr->num_sge) { + wqe_idx += (sgl_offset >> SQEBB_SHIFT); + sgl_offset &= (SQEBB_SIZE - 1); + sgl = get_queue_entry(qp->kern_qp.sq_buf, wqe_idx, + qp->attrs.sq_size, SQEBB_SHIFT); + + bytes += send_wr->sg_list[i].length; + memcpy(sgl + sgl_offset, &send_wr->sg_list[i], + sizeof(struct ib_sge)); + + sgl_offset += sizeof(struct ib_sge); + i++; + } + + *length_field = cpu_to_le32(bytes); + return 0; +} + +static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi, + const struct ib_send_wr *send_wr) +{ + u32 wqe_size, wqebb_cnt, hw_op, flags, sgl_offset; + u32 idx = *pi & (qp->attrs.sq_size - 1); + enum ib_wr_opcode op = send_wr->opcode; + struct erdma_readreq_sqe *read_sqe; + struct erdma_reg_mr_sqe *regmr_sge; + struct erdma_write_sqe *write_sqe; + struct erdma_send_sqe *send_sqe; + struct ib_rdma_wr *rdma_wr; + struct erdma_mr *mr; + __le32 *length_field; + u64 wqe_hdr, *entry; + struct ib_sge *sge; + u32 attrs; + int ret; + + entry = get_queue_entry(qp->kern_qp.sq_buf, idx, qp->attrs.sq_size, + SQEBB_SHIFT); + + /* Clear the SQE header section. */ + *entry = 0; + + qp->kern_qp.swr_tbl[idx] = send_wr->wr_id; + flags = send_wr->send_flags; + wqe_hdr = FIELD_PREP( + ERDMA_SQE_HDR_CE_MASK, + ((flags & IB_SEND_SIGNALED) || qp->kern_qp.sig_all) ? 1 : 0); + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SE_MASK, + flags & IB_SEND_SOLICITED ? 1 : 0); + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_FENCE_MASK, + flags & IB_SEND_FENCE ? 1 : 0); + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_INLINE_MASK, + flags & IB_SEND_INLINE ? 1 : 0); + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_QPN_MASK, QP_ID(qp)); + + switch (op) { + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_WRITE_WITH_IMM: + hw_op = ERDMA_OP_WRITE; + if (op == IB_WR_RDMA_WRITE_WITH_IMM) + hw_op = ERDMA_OP_WRITE_WITH_IMM; + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op); + rdma_wr = container_of(send_wr, struct ib_rdma_wr, wr); + write_sqe = (struct erdma_write_sqe *)entry; + + write_sqe->imm_data = send_wr->ex.imm_data; + write_sqe->sink_stag = cpu_to_le32(rdma_wr->rkey); + write_sqe->sink_to_h = + cpu_to_le32(upper_32_bits(rdma_wr->remote_addr)); + write_sqe->sink_to_l = + cpu_to_le32(lower_32_bits(rdma_wr->remote_addr)); + + length_field = &write_sqe->length; + wqe_size = sizeof(struct erdma_write_sqe); + sgl_offset = wqe_size; + break; + case IB_WR_RDMA_READ: + case IB_WR_RDMA_READ_WITH_INV: + read_sqe = (struct erdma_readreq_sqe *)entry; + if (unlikely(send_wr->num_sge != 1)) + return -EINVAL; + hw_op = ERDMA_OP_READ; + if (op == IB_WR_RDMA_READ_WITH_INV) { + hw_op = ERDMA_OP_READ_WITH_INV; + read_sqe->invalid_stag = + cpu_to_le32(send_wr->ex.invalidate_rkey); + } + + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op); + rdma_wr = container_of(send_wr, struct ib_rdma_wr, wr); + read_sqe->length = cpu_to_le32(send_wr->sg_list[0].length); + read_sqe->sink_stag = cpu_to_le32(send_wr->sg_list[0].lkey); + read_sqe->sink_to_l = + cpu_to_le32(lower_32_bits(send_wr->sg_list[0].addr)); + read_sqe->sink_to_h = + cpu_to_le32(upper_32_bits(send_wr->sg_list[0].addr)); + + sge = get_queue_entry(qp->kern_qp.sq_buf, idx + 1, + qp->attrs.sq_size, SQEBB_SHIFT); + sge->addr = rdma_wr->remote_addr; + sge->lkey = rdma_wr->rkey; + sge->length = send_wr->sg_list[0].length; + wqe_size = sizeof(struct erdma_readreq_sqe) + + send_wr->num_sge * sizeof(struct ib_sge); + + goto out; + case IB_WR_SEND: + case IB_WR_SEND_WITH_IMM: + case IB_WR_SEND_WITH_INV: + send_sqe = (struct erdma_send_sqe *)entry; + hw_op = ERDMA_OP_SEND; + if (op == IB_WR_SEND_WITH_IMM) { + hw_op = ERDMA_OP_SEND_WITH_IMM; + send_sqe->imm_data = send_wr->ex.imm_data; + } else if (op == IB_WR_SEND_WITH_INV) { + hw_op = ERDMA_OP_SEND_WITH_INV; + send_sqe->invalid_stag = + cpu_to_le32(send_wr->ex.invalidate_rkey); + } + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, hw_op); + length_field = &send_sqe->length; + wqe_size = sizeof(struct erdma_send_sqe); + sgl_offset = wqe_size; + + break; + case IB_WR_REG_MR: + wqe_hdr |= + FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, ERDMA_OP_REG_MR); + regmr_sge = (struct erdma_reg_mr_sqe *)entry; + mr = to_emr(reg_wr(send_wr)->mr); + + mr->access = ERDMA_MR_ACC_LR | + to_erdma_access_flags(reg_wr(send_wr)->access); + regmr_sge->addr = cpu_to_le64(mr->ibmr.iova); + regmr_sge->length = cpu_to_le32(mr->ibmr.length); + regmr_sge->stag = cpu_to_le32(reg_wr(send_wr)->key); + attrs = FIELD_PREP(ERDMA_SQE_MR_MODE_MASK, 0) | + FIELD_PREP(ERDMA_SQE_MR_ACCESS_MASK, mr->access) | + FIELD_PREP(ERDMA_SQE_MR_MTT_CNT_MASK, + mr->mem.mtt_nents); + + if (mr->mem.mtt_nents < ERDMA_MAX_INLINE_MTT_ENTRIES) { + attrs |= FIELD_PREP(ERDMA_SQE_MR_MTT_TYPE_MASK, 0); + /* Copy SGLs to SQE content to accelerate */ + memcpy(get_queue_entry(qp->kern_qp.sq_buf, idx + 1, + qp->attrs.sq_size, SQEBB_SHIFT), + mr->mem.mtt_buf, MTT_SIZE(mr->mem.mtt_nents)); + wqe_size = sizeof(struct erdma_reg_mr_sqe) + + MTT_SIZE(mr->mem.mtt_nents); + } else { + attrs |= FIELD_PREP(ERDMA_SQE_MR_MTT_TYPE_MASK, 1); + wqe_size = sizeof(struct erdma_reg_mr_sqe); + } + + regmr_sge->attrs = cpu_to_le32(attrs); + goto out; + case IB_WR_LOCAL_INV: + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, + ERDMA_OP_LOCAL_INV); + regmr_sge = (struct erdma_reg_mr_sqe *)entry; + regmr_sge->stag = cpu_to_le32(send_wr->ex.invalidate_rkey); + wqe_size = sizeof(struct erdma_reg_mr_sqe); + goto out; + default: + return -EOPNOTSUPP; + } + + if (flags & IB_SEND_INLINE) { + ret = fill_inline_data(qp, send_wr, idx, sgl_offset, + length_field); + if (ret < 0) + return -EINVAL; + wqe_size += ret; + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SGL_LEN_MASK, ret); + } else { + ret = fill_sgl(qp, send_wr, idx, sgl_offset, length_field); + if (ret) + return -EINVAL; + wqe_size += send_wr->num_sge * sizeof(struct ib_sge); + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SGL_LEN_MASK, + send_wr->num_sge); + } + +out: + wqebb_cnt = SQEBB_COUNT(wqe_size); + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_CNT_MASK, wqebb_cnt - 1); + *pi += wqebb_cnt; + wqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, *pi); + + *entry = wqe_hdr; + + return 0; +} + +static void kick_sq_db(struct erdma_qp *qp, u16 pi) +{ + u64 db_data = FIELD_PREP(ERDMA_SQE_HDR_QPN_MASK, QP_ID(qp)) | + FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, pi); + + *(u64 *)qp->kern_qp.sq_db_info = db_data; + writeq(db_data, qp->kern_qp.hw_sq_db); +} + +int erdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *send_wr, + const struct ib_send_wr **bad_send_wr) +{ + struct erdma_qp *qp = to_eqp(ibqp); + int ret = 0; + const struct ib_send_wr *wr = send_wr; + unsigned long flags; + u16 sq_pi; + + if (!send_wr) + return -EINVAL; + + spin_lock_irqsave(&qp->lock, flags); + sq_pi = qp->kern_qp.sq_pi; + + while (wr) { + if ((u16)(sq_pi - qp->kern_qp.sq_ci) >= qp->attrs.sq_size) { + ret = -ENOMEM; + *bad_send_wr = send_wr; + break; + } + + ret = erdma_push_one_sqe(qp, &sq_pi, wr); + if (ret) { + *bad_send_wr = wr; + break; + } + qp->kern_qp.sq_pi = sq_pi; + kick_sq_db(qp, sq_pi); + + wr = wr->next; + } + spin_unlock_irqrestore(&qp->lock, flags); + + return ret; +} + +static int erdma_post_recv_one(struct erdma_qp *qp, + const struct ib_recv_wr *recv_wr) +{ + struct erdma_rqe *rqe = + get_queue_entry(qp->kern_qp.rq_buf, qp->kern_qp.rq_pi, + qp->attrs.rq_size, RQE_SHIFT); + + rqe->qe_idx = cpu_to_le16(qp->kern_qp.rq_pi + 1); + rqe->qpn = cpu_to_le32(QP_ID(qp)); + + if (recv_wr->num_sge == 0) { + rqe->length = 0; + } else if (recv_wr->num_sge == 1) { + rqe->stag = cpu_to_le32(recv_wr->sg_list[0].lkey); + rqe->to = cpu_to_le64(recv_wr->sg_list[0].addr); + rqe->length = cpu_to_le32(recv_wr->sg_list[0].length); + } else { + return -EINVAL; + } + + *(u64 *)qp->kern_qp.rq_db_info = *(u64 *)rqe; + writeq(*(u64 *)rqe, qp->kern_qp.hw_rq_db); + + qp->kern_qp.rwr_tbl[qp->kern_qp.rq_pi & (qp->attrs.rq_size - 1)] = + recv_wr->wr_id; + qp->kern_qp.rq_pi++; + + return 0; +} + +int erdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *recv_wr, + const struct ib_recv_wr **bad_recv_wr) +{ + const struct ib_recv_wr *wr = recv_wr; + struct erdma_qp *qp = to_eqp(ibqp); + unsigned long flags; + int ret; + + spin_lock_irqsave(&qp->lock, flags); + + while (wr) { + ret = erdma_post_recv_one(qp, wr); + if (ret) { + *bad_recv_wr = wr; + break; + } + wr = wr->next; + } + + spin_unlock_irqrestore(&qp->lock, flags); + return ret; +} diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c new file mode 100644 index 000000000000..699bd3f59cd3 --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -0,0 +1,1460 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */ +/* Kai Shen <kaishen@linux.alibaba.com> */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ +/* Copyright (c) 2008-2019, IBM Corporation */ + +/* Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. */ + +#include <linux/errno.h> +#include <linux/pci.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <net/addrconf.h> +#include <rdma/erdma-abi.h> +#include <rdma/ib_umem.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/ib_verbs.h> +#include <rdma/uverbs_ioctl.h> + +#include "erdma.h" +#include "erdma_cm.h" +#include "erdma_hw.h" +#include "erdma_verbs.h" + +static int create_qp_cmd(struct erdma_dev *dev, struct erdma_qp *qp) +{ + struct erdma_cmdq_create_qp_req req; + struct erdma_pd *pd = to_epd(qp->ibqp.pd); + struct erdma_uqp *user_qp; + u64 resp0, resp1; + int err; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_CREATE_QP); + + req.cfg0 = FIELD_PREP(ERDMA_CMD_CREATE_QP_SQ_DEPTH_MASK, + ilog2(qp->attrs.sq_size)) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_QPN_MASK, QP_ID(qp)); + req.cfg1 = FIELD_PREP(ERDMA_CMD_CREATE_QP_RQ_DEPTH_MASK, + ilog2(qp->attrs.rq_size)) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_PD_MASK, pd->pdn); + + if (rdma_is_kernel_res(&qp->ibqp.res)) { + u32 pgsz_range = ilog2(SZ_1M) - PAGE_SHIFT; + + req.sq_cqn_mtt_cfg = + FIELD_PREP(ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK, + pgsz_range) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_CQN_MASK, qp->scq->cqn); + req.rq_cqn_mtt_cfg = + FIELD_PREP(ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK, + pgsz_range) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_CQN_MASK, qp->rcq->cqn); + + req.sq_mtt_cfg = + FIELD_PREP(ERDMA_CMD_CREATE_QP_PAGE_OFFSET_MASK, 0) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK, 1) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK, + ERDMA_MR_INLINE_MTT); + req.rq_mtt_cfg = req.sq_mtt_cfg; + + req.rq_buf_addr = qp->kern_qp.rq_buf_dma_addr; + req.sq_buf_addr = qp->kern_qp.sq_buf_dma_addr; + req.sq_db_info_dma_addr = qp->kern_qp.sq_buf_dma_addr + + (qp->attrs.sq_size << SQEBB_SHIFT); + req.rq_db_info_dma_addr = qp->kern_qp.rq_buf_dma_addr + + (qp->attrs.rq_size << RQE_SHIFT); + } else { + user_qp = &qp->user_qp; + req.sq_cqn_mtt_cfg = FIELD_PREP( + ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK, + ilog2(user_qp->sq_mtt.page_size) - PAGE_SHIFT); + req.sq_cqn_mtt_cfg |= + FIELD_PREP(ERDMA_CMD_CREATE_QP_CQN_MASK, qp->scq->cqn); + + req.rq_cqn_mtt_cfg = FIELD_PREP( + ERDMA_CMD_CREATE_QP_PAGE_SIZE_MASK, + ilog2(user_qp->rq_mtt.page_size) - PAGE_SHIFT); + req.rq_cqn_mtt_cfg |= + FIELD_PREP(ERDMA_CMD_CREATE_QP_CQN_MASK, qp->rcq->cqn); + + req.sq_mtt_cfg = user_qp->sq_mtt.page_offset; + req.sq_mtt_cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK, + user_qp->sq_mtt.mtt_nents) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK, + user_qp->sq_mtt.mtt_type); + + req.rq_mtt_cfg = user_qp->rq_mtt.page_offset; + req.rq_mtt_cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK, + user_qp->rq_mtt.mtt_nents) | + FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK, + user_qp->rq_mtt.mtt_type); + + req.sq_buf_addr = user_qp->sq_mtt.mtt_entry[0]; + req.rq_buf_addr = user_qp->rq_mtt.mtt_entry[0]; + + req.sq_db_info_dma_addr = user_qp->sq_db_info_dma_addr; + req.rq_db_info_dma_addr = user_qp->rq_db_info_dma_addr; + } + + err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), &resp0, + &resp1); + if (!err) + qp->attrs.cookie = + FIELD_GET(ERDMA_CMDQ_CREATE_QP_RESP_COOKIE_MASK, resp0); + + return err; +} + +static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr) +{ + struct erdma_cmdq_reg_mr_req req; + struct erdma_pd *pd = to_epd(mr->ibmr.pd); + u64 *phy_addr; + int i; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, CMDQ_OPCODE_REG_MR); + + req.cfg0 = FIELD_PREP(ERDMA_CMD_MR_VALID_MASK, mr->valid) | + FIELD_PREP(ERDMA_CMD_MR_KEY_MASK, mr->ibmr.lkey & 0xFF) | + FIELD_PREP(ERDMA_CMD_MR_MPT_IDX_MASK, mr->ibmr.lkey >> 8); + req.cfg1 = FIELD_PREP(ERDMA_CMD_REGMR_PD_MASK, pd->pdn) | + FIELD_PREP(ERDMA_CMD_REGMR_TYPE_MASK, mr->type) | + FIELD_PREP(ERDMA_CMD_REGMR_RIGHT_MASK, mr->access) | + FIELD_PREP(ERDMA_CMD_REGMR_ACC_MODE_MASK, 0); + req.cfg2 = FIELD_PREP(ERDMA_CMD_REGMR_PAGESIZE_MASK, + ilog2(mr->mem.page_size)) | + FIELD_PREP(ERDMA_CMD_REGMR_MTT_TYPE_MASK, mr->mem.mtt_type) | + FIELD_PREP(ERDMA_CMD_REGMR_MTT_CNT_MASK, mr->mem.page_cnt); + + if (mr->type == ERDMA_MR_TYPE_DMA) + goto post_cmd; + + if (mr->type == ERDMA_MR_TYPE_NORMAL) { + req.start_va = mr->mem.va; + req.size = mr->mem.len; + } + + if (mr->type == ERDMA_MR_TYPE_FRMR || + mr->mem.mtt_type == ERDMA_MR_INDIRECT_MTT) { + phy_addr = req.phy_addr; + *phy_addr = mr->mem.mtt_entry[0]; + } else { + phy_addr = req.phy_addr; + for (i = 0; i < mr->mem.mtt_nents; i++) + *phy_addr++ = mr->mem.mtt_entry[i]; + } + +post_cmd: + return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, + NULL); +} + +static int create_cq_cmd(struct erdma_dev *dev, struct erdma_cq *cq) +{ + struct erdma_cmdq_create_cq_req req; + u32 page_size; + struct erdma_mem *mtt; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_CREATE_CQ); + + req.cfg0 = FIELD_PREP(ERDMA_CMD_CREATE_CQ_CQN_MASK, cq->cqn) | + FIELD_PREP(ERDMA_CMD_CREATE_CQ_DEPTH_MASK, ilog2(cq->depth)); + req.cfg1 = FIELD_PREP(ERDMA_CMD_CREATE_CQ_EQN_MASK, cq->assoc_eqn); + + if (rdma_is_kernel_res(&cq->ibcq.res)) { + page_size = SZ_32M; + req.cfg0 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_PAGESIZE_MASK, + ilog2(page_size) - PAGE_SHIFT); + req.qbuf_addr_l = lower_32_bits(cq->kern_cq.qbuf_dma_addr); + req.qbuf_addr_h = upper_32_bits(cq->kern_cq.qbuf_dma_addr); + + req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK, 1) | + FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK, + ERDMA_MR_INLINE_MTT); + + req.first_page_offset = 0; + req.cq_db_info_addr = + cq->kern_cq.qbuf_dma_addr + (cq->depth << CQE_SHIFT); + } else { + mtt = &cq->user_cq.qbuf_mtt; + req.cfg0 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_PAGESIZE_MASK, + ilog2(mtt->page_size) - PAGE_SHIFT); + if (mtt->mtt_nents == 1) { + req.qbuf_addr_l = lower_32_bits(*(u64 *)mtt->mtt_buf); + req.qbuf_addr_h = upper_32_bits(*(u64 *)mtt->mtt_buf); + } else { + req.qbuf_addr_l = lower_32_bits(mtt->mtt_entry[0]); + req.qbuf_addr_h = upper_32_bits(mtt->mtt_entry[0]); + } + req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK, + mtt->mtt_nents); + req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK, + mtt->mtt_type); + + req.first_page_offset = mtt->page_offset; + req.cq_db_info_addr = cq->user_cq.db_info_dma_addr; + } + + return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, + NULL); +} + +static int erdma_alloc_idx(struct erdma_resource_cb *res_cb) +{ + int idx; + unsigned long flags; + + spin_lock_irqsave(&res_cb->lock, flags); + idx = find_next_zero_bit(res_cb->bitmap, res_cb->max_cap, + res_cb->next_alloc_idx); + if (idx == res_cb->max_cap) { + idx = find_first_zero_bit(res_cb->bitmap, res_cb->max_cap); + if (idx == res_cb->max_cap) { + res_cb->next_alloc_idx = 1; + spin_unlock_irqrestore(&res_cb->lock, flags); + return -ENOSPC; + } + } + + set_bit(idx, res_cb->bitmap); + res_cb->next_alloc_idx = idx + 1; + spin_unlock_irqrestore(&res_cb->lock, flags); + + return idx; +} + +static inline void erdma_free_idx(struct erdma_resource_cb *res_cb, u32 idx) +{ + unsigned long flags; + u32 used; + + spin_lock_irqsave(&res_cb->lock, flags); + used = __test_and_clear_bit(idx, res_cb->bitmap); + spin_unlock_irqrestore(&res_cb->lock, flags); + WARN_ON(!used); +} + +static struct rdma_user_mmap_entry * +erdma_user_mmap_entry_insert(struct erdma_ucontext *uctx, void *address, + u32 size, u8 mmap_flag, u64 *mmap_offset) +{ + struct erdma_user_mmap_entry *entry = + kzalloc(sizeof(*entry), GFP_KERNEL); + int ret; + + if (!entry) + return NULL; + + entry->address = (u64)address; + entry->mmap_flag = mmap_flag; + + size = PAGE_ALIGN(size); + + ret = rdma_user_mmap_entry_insert(&uctx->ibucontext, &entry->rdma_entry, + size); + if (ret) { + kfree(entry); + return NULL; + } + + *mmap_offset = rdma_user_mmap_get_offset(&entry->rdma_entry); + + return &entry->rdma_entry; +} + +int erdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, + struct ib_udata *unused) +{ + struct erdma_dev *dev = to_edev(ibdev); + + memset(attr, 0, sizeof(*attr)); + + attr->max_mr_size = dev->attrs.max_mr_size; + attr->vendor_id = PCI_VENDOR_ID_ALIBABA; + attr->vendor_part_id = dev->pdev->device; + attr->hw_ver = dev->pdev->revision; + attr->max_qp = dev->attrs.max_qp - 1; + attr->max_qp_wr = min(dev->attrs.max_send_wr, dev->attrs.max_recv_wr); + attr->max_qp_rd_atom = dev->attrs.max_ord; + attr->max_qp_init_rd_atom = dev->attrs.max_ird; + attr->max_res_rd_atom = dev->attrs.max_qp * dev->attrs.max_ird; + attr->device_cap_flags = IB_DEVICE_MEM_MGT_EXTENSIONS; + attr->kernel_cap_flags = IBK_LOCAL_DMA_LKEY; + ibdev->local_dma_lkey = dev->attrs.local_dma_key; + attr->max_send_sge = dev->attrs.max_send_sge; + attr->max_recv_sge = dev->attrs.max_recv_sge; + attr->max_sge_rd = dev->attrs.max_sge_rd; + attr->max_cq = dev->attrs.max_cq - 1; + attr->max_cqe = dev->attrs.max_cqe; + attr->max_mr = dev->attrs.max_mr; + attr->max_pd = dev->attrs.max_pd; + attr->max_mw = dev->attrs.max_mw; + attr->max_fast_reg_page_list_len = ERDMA_MAX_FRMR_PA; + attr->page_size_cap = ERDMA_PAGE_SIZE_SUPPORT; + attr->fw_ver = dev->attrs.fw_version; + + if (dev->netdev) + addrconf_addr_eui48((u8 *)&attr->sys_image_guid, + dev->netdev->dev_addr); + + return 0; +} + +int erdma_query_gid(struct ib_device *ibdev, u32 port, int idx, + union ib_gid *gid) +{ + struct erdma_dev *dev = to_edev(ibdev); + + memset(gid, 0, sizeof(*gid)); + ether_addr_copy(gid->raw, dev->attrs.peer_addr); + + return 0; +} + +int erdma_query_port(struct ib_device *ibdev, u32 port, + struct ib_port_attr *attr) +{ + struct erdma_dev *dev = to_edev(ibdev); + struct net_device *ndev = dev->netdev; + + memset(attr, 0, sizeof(*attr)); + + attr->gid_tbl_len = 1; + attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP; + attr->max_msg_sz = -1; + + if (!ndev) + goto out; + + ib_get_eth_speed(ibdev, port, &attr->active_speed, &attr->active_width); + attr->max_mtu = ib_mtu_int_to_enum(ndev->mtu); + attr->active_mtu = ib_mtu_int_to_enum(ndev->mtu); + if (netif_running(ndev) && netif_carrier_ok(ndev)) + dev->state = IB_PORT_ACTIVE; + else + dev->state = IB_PORT_DOWN; + attr->state = dev->state; + +out: + if (dev->state == IB_PORT_ACTIVE) + attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; + else + attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; + + return 0; +} + +int erdma_get_port_immutable(struct ib_device *ibdev, u32 port, + struct ib_port_immutable *port_immutable) +{ + port_immutable->gid_tbl_len = 1; + port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; + + return 0; +} + +int erdma_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct erdma_pd *pd = to_epd(ibpd); + struct erdma_dev *dev = to_edev(ibpd->device); + int pdn; + + pdn = erdma_alloc_idx(&dev->res_cb[ERDMA_RES_TYPE_PD]); + if (pdn < 0) + return pdn; + + pd->pdn = pdn; + + return 0; +} + +int erdma_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct erdma_pd *pd = to_epd(ibpd); + struct erdma_dev *dev = to_edev(ibpd->device); + + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_PD], pd->pdn); + + return 0; +} + +static int erdma_qp_validate_cap(struct erdma_dev *dev, + struct ib_qp_init_attr *attrs) +{ + if ((attrs->cap.max_send_wr > dev->attrs.max_send_wr) || + (attrs->cap.max_recv_wr > dev->attrs.max_recv_wr) || + (attrs->cap.max_send_sge > dev->attrs.max_send_sge) || + (attrs->cap.max_recv_sge > dev->attrs.max_recv_sge) || + (attrs->cap.max_inline_data > ERDMA_MAX_INLINE) || + !attrs->cap.max_send_wr || !attrs->cap.max_recv_wr) { + return -EINVAL; + } + + return 0; +} + +static int erdma_qp_validate_attr(struct erdma_dev *dev, + struct ib_qp_init_attr *attrs) +{ + if (attrs->qp_type != IB_QPT_RC) + return -EOPNOTSUPP; + + if (attrs->srq) + return -EOPNOTSUPP; + + if (!attrs->send_cq || !attrs->recv_cq) + return -EOPNOTSUPP; + + return 0; +} + +static void free_kernel_qp(struct erdma_qp *qp) +{ + struct erdma_dev *dev = qp->dev; + + vfree(qp->kern_qp.swr_tbl); + vfree(qp->kern_qp.rwr_tbl); + + if (qp->kern_qp.sq_buf) + dma_free_coherent( + &dev->pdev->dev, + WARPPED_BUFSIZE(qp->attrs.sq_size << SQEBB_SHIFT), + qp->kern_qp.sq_buf, qp->kern_qp.sq_buf_dma_addr); + + if (qp->kern_qp.rq_buf) + dma_free_coherent( + &dev->pdev->dev, + WARPPED_BUFSIZE(qp->attrs.rq_size << RQE_SHIFT), + qp->kern_qp.rq_buf, qp->kern_qp.rq_buf_dma_addr); +} + +static int init_kernel_qp(struct erdma_dev *dev, struct erdma_qp *qp, + struct ib_qp_init_attr *attrs) +{ + struct erdma_kqp *kqp = &qp->kern_qp; + int size; + + if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) + kqp->sig_all = 1; + + kqp->sq_pi = 0; + kqp->sq_ci = 0; + kqp->rq_pi = 0; + kqp->rq_ci = 0; + kqp->hw_sq_db = + dev->func_bar + (ERDMA_SDB_SHARED_PAGE_INDEX << PAGE_SHIFT); + kqp->hw_rq_db = dev->func_bar + ERDMA_BAR_RQDB_SPACE_OFFSET; + + kqp->swr_tbl = vmalloc(qp->attrs.sq_size * sizeof(u64)); + kqp->rwr_tbl = vmalloc(qp->attrs.rq_size * sizeof(u64)); + if (!kqp->swr_tbl || !kqp->rwr_tbl) + goto err_out; + + size = (qp->attrs.sq_size << SQEBB_SHIFT) + ERDMA_EXTRA_BUFFER_SIZE; + kqp->sq_buf = dma_alloc_coherent(&dev->pdev->dev, size, + &kqp->sq_buf_dma_addr, GFP_KERNEL); + if (!kqp->sq_buf) + goto err_out; + + size = (qp->attrs.rq_size << RQE_SHIFT) + ERDMA_EXTRA_BUFFER_SIZE; + kqp->rq_buf = dma_alloc_coherent(&dev->pdev->dev, size, + &kqp->rq_buf_dma_addr, GFP_KERNEL); + if (!kqp->rq_buf) + goto err_out; + + kqp->sq_db_info = kqp->sq_buf + (qp->attrs.sq_size << SQEBB_SHIFT); + kqp->rq_db_info = kqp->rq_buf + (qp->attrs.rq_size << RQE_SHIFT); + + return 0; + +err_out: + free_kernel_qp(qp); + return -ENOMEM; +} + +static int get_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem, + u64 start, u64 len, int access, u64 virt, + unsigned long req_page_size, u8 force_indirect_mtt) +{ + struct ib_block_iter biter; + uint64_t *phy_addr = NULL; + int ret = 0; + + mem->umem = ib_umem_get(&dev->ibdev, start, len, access); + if (IS_ERR(mem->umem)) { + ret = PTR_ERR(mem->umem); + mem->umem = NULL; + return ret; + } + + mem->va = virt; + mem->len = len; + mem->page_size = ib_umem_find_best_pgsz(mem->umem, req_page_size, virt); + mem->page_offset = start & (mem->page_size - 1); + mem->mtt_nents = ib_umem_num_dma_blocks(mem->umem, mem->page_size); + mem->page_cnt = mem->mtt_nents; + + if (mem->page_cnt > ERDMA_MAX_INLINE_MTT_ENTRIES || + force_indirect_mtt) { + mem->mtt_type = ERDMA_MR_INDIRECT_MTT; + mem->mtt_buf = + alloc_pages_exact(MTT_SIZE(mem->page_cnt), GFP_KERNEL); + if (!mem->mtt_buf) { + ret = -ENOMEM; + goto error_ret; + } + phy_addr = mem->mtt_buf; + } else { + mem->mtt_type = ERDMA_MR_INLINE_MTT; + phy_addr = mem->mtt_entry; + } + + rdma_umem_for_each_dma_block(mem->umem, &biter, mem->page_size) { + *phy_addr = rdma_block_iter_dma_address(&biter); + phy_addr++; + } + + if (mem->mtt_type == ERDMA_MR_INDIRECT_MTT) { + mem->mtt_entry[0] = + dma_map_single(&dev->pdev->dev, mem->mtt_buf, + MTT_SIZE(mem->page_cnt), DMA_TO_DEVICE); + if (dma_mapping_error(&dev->pdev->dev, mem->mtt_entry[0])) { + free_pages_exact(mem->mtt_buf, MTT_SIZE(mem->page_cnt)); + mem->mtt_buf = NULL; + ret = -ENOMEM; + goto error_ret; + } + } + + return 0; + +error_ret: + if (mem->umem) { + ib_umem_release(mem->umem); + mem->umem = NULL; + } + + return ret; +} + +static void put_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem) +{ + if (mem->mtt_buf) { + dma_unmap_single(&dev->pdev->dev, mem->mtt_entry[0], + MTT_SIZE(mem->page_cnt), DMA_TO_DEVICE); + free_pages_exact(mem->mtt_buf, MTT_SIZE(mem->page_cnt)); + } + + if (mem->umem) { + ib_umem_release(mem->umem); + mem->umem = NULL; + } +} + +static int erdma_map_user_dbrecords(struct erdma_ucontext *ctx, + u64 dbrecords_va, + struct erdma_user_dbrecords_page **dbr_page, + dma_addr_t *dma_addr) +{ + struct erdma_user_dbrecords_page *page = NULL; + int rv = 0; + + mutex_lock(&ctx->dbrecords_page_mutex); + + list_for_each_entry(page, &ctx->dbrecords_page_list, list) + if (page->va == (dbrecords_va & PAGE_MASK)) + goto found; + + page = kmalloc(sizeof(*page), GFP_KERNEL); + if (!page) { + rv = -ENOMEM; + goto out; + } + + page->va = (dbrecords_va & PAGE_MASK); + page->refcnt = 0; + + page->umem = ib_umem_get(ctx->ibucontext.device, + dbrecords_va & PAGE_MASK, PAGE_SIZE, 0); + if (IS_ERR(page->umem)) { + rv = PTR_ERR(page->umem); + kfree(page); + goto out; + } + + list_add(&page->list, &ctx->dbrecords_page_list); + +found: + *dma_addr = sg_dma_address(page->umem->sgt_append.sgt.sgl) + + (dbrecords_va & ~PAGE_MASK); + *dbr_page = page; + page->refcnt++; + +out: + mutex_unlock(&ctx->dbrecords_page_mutex); + return rv; +} + +static void +erdma_unmap_user_dbrecords(struct erdma_ucontext *ctx, + struct erdma_user_dbrecords_page **dbr_page) +{ + if (!ctx || !(*dbr_page)) + return; + + mutex_lock(&ctx->dbrecords_page_mutex); + if (--(*dbr_page)->refcnt == 0) { + list_del(&(*dbr_page)->list); + ib_umem_release((*dbr_page)->umem); + kfree(*dbr_page); + } + + *dbr_page = NULL; + mutex_unlock(&ctx->dbrecords_page_mutex); +} + +static int init_user_qp(struct erdma_qp *qp, struct erdma_ucontext *uctx, + u64 va, u32 len, u64 db_info_va) +{ + dma_addr_t db_info_dma_addr; + u32 rq_offset; + int ret; + + if (len < (PAGE_ALIGN(qp->attrs.sq_size * SQEBB_SIZE) + + qp->attrs.rq_size * RQE_SIZE)) + return -EINVAL; + + ret = get_mtt_entries(qp->dev, &qp->user_qp.sq_mtt, va, + qp->attrs.sq_size << SQEBB_SHIFT, 0, va, + (SZ_1M - SZ_4K), 1); + if (ret) + return ret; + + rq_offset = PAGE_ALIGN(qp->attrs.sq_size << SQEBB_SHIFT); + qp->user_qp.rq_offset = rq_offset; + + ret = get_mtt_entries(qp->dev, &qp->user_qp.rq_mtt, va + rq_offset, + qp->attrs.rq_size << RQE_SHIFT, 0, va + rq_offset, + (SZ_1M - SZ_4K), 1); + if (ret) + goto put_sq_mtt; + + ret = erdma_map_user_dbrecords(uctx, db_info_va, + &qp->user_qp.user_dbr_page, + &db_info_dma_addr); + if (ret) + goto put_rq_mtt; + + qp->user_qp.sq_db_info_dma_addr = db_info_dma_addr; + qp->user_qp.rq_db_info_dma_addr = db_info_dma_addr + ERDMA_DB_SIZE; + + return 0; + +put_rq_mtt: + put_mtt_entries(qp->dev, &qp->user_qp.rq_mtt); + +put_sq_mtt: + put_mtt_entries(qp->dev, &qp->user_qp.sq_mtt); + + return ret; +} + +static void free_user_qp(struct erdma_qp *qp, struct erdma_ucontext *uctx) +{ + put_mtt_entries(qp->dev, &qp->user_qp.sq_mtt); + put_mtt_entries(qp->dev, &qp->user_qp.rq_mtt); + erdma_unmap_user_dbrecords(uctx, &qp->user_qp.user_dbr_page); +} + +int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs, + struct ib_udata *udata) +{ + struct erdma_qp *qp = to_eqp(ibqp); + struct erdma_dev *dev = to_edev(ibqp->device); + struct erdma_ucontext *uctx = rdma_udata_to_drv_context( + udata, struct erdma_ucontext, ibucontext); + struct erdma_ureq_create_qp ureq; + struct erdma_uresp_create_qp uresp; + int ret; + + ret = erdma_qp_validate_cap(dev, attrs); + if (ret) + goto err_out; + + ret = erdma_qp_validate_attr(dev, attrs); + if (ret) + goto err_out; + + qp->scq = to_ecq(attrs->send_cq); + qp->rcq = to_ecq(attrs->recv_cq); + qp->dev = dev; + qp->attrs.cc = dev->attrs.cc; + + init_rwsem(&qp->state_lock); + kref_init(&qp->ref); + init_completion(&qp->safe_free); + + ret = xa_alloc_cyclic(&dev->qp_xa, &qp->ibqp.qp_num, qp, + XA_LIMIT(1, dev->attrs.max_qp - 1), + &dev->next_alloc_qpn, GFP_KERNEL); + if (ret < 0) { + ret = -ENOMEM; + goto err_out; + } + + qp->attrs.sq_size = roundup_pow_of_two(attrs->cap.max_send_wr * + ERDMA_MAX_WQEBB_PER_SQE); + qp->attrs.rq_size = roundup_pow_of_two(attrs->cap.max_recv_wr); + + if (uctx) { + ret = ib_copy_from_udata(&ureq, udata, + min(sizeof(ureq), udata->inlen)); + if (ret) + goto err_out_xa; + + ret = init_user_qp(qp, uctx, ureq.qbuf_va, ureq.qbuf_len, + ureq.db_record_va); + if (ret) + goto err_out_xa; + + memset(&uresp, 0, sizeof(uresp)); + + uresp.num_sqe = qp->attrs.sq_size; + uresp.num_rqe = qp->attrs.rq_size; + uresp.qp_id = QP_ID(qp); + uresp.rq_offset = qp->user_qp.rq_offset; + + ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (ret) + goto err_out_cmd; + } else { + init_kernel_qp(dev, qp, attrs); + } + + qp->attrs.max_send_sge = attrs->cap.max_send_sge; + qp->attrs.max_recv_sge = attrs->cap.max_recv_sge; + qp->attrs.state = ERDMA_QP_STATE_IDLE; + + ret = create_qp_cmd(dev, qp); + if (ret) + goto err_out_cmd; + + spin_lock_init(&qp->lock); + + return 0; + +err_out_cmd: + if (uctx) + free_user_qp(qp, uctx); + else + free_kernel_qp(qp); +err_out_xa: + xa_erase(&dev->qp_xa, QP_ID(qp)); +err_out: + return ret; +} + +static int erdma_create_stag(struct erdma_dev *dev, u32 *stag) +{ + int stag_idx; + + stag_idx = erdma_alloc_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX]); + if (stag_idx < 0) + return stag_idx; + + /* For now, we always let key field be zero. */ + *stag = (stag_idx << 8); + + return 0; +} + +struct ib_mr *erdma_get_dma_mr(struct ib_pd *ibpd, int acc) +{ + struct erdma_dev *dev = to_edev(ibpd->device); + struct erdma_mr *mr; + u32 stag; + int ret; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + ret = erdma_create_stag(dev, &stag); + if (ret) + goto out_free; + + mr->type = ERDMA_MR_TYPE_DMA; + + mr->ibmr.lkey = stag; + mr->ibmr.rkey = stag; + mr->ibmr.pd = ibpd; + mr->access = ERDMA_MR_ACC_LR | to_erdma_access_flags(acc); + ret = regmr_cmd(dev, mr); + if (ret) + goto out_remove_stag; + + return &mr->ibmr; + +out_remove_stag: + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX], + mr->ibmr.lkey >> 8); + +out_free: + kfree(mr); + + return ERR_PTR(ret); +} + +struct ib_mr *erdma_ib_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct erdma_mr *mr; + struct erdma_dev *dev = to_edev(ibpd->device); + int ret; + u32 stag; + + if (mr_type != IB_MR_TYPE_MEM_REG) + return ERR_PTR(-EOPNOTSUPP); + + if (max_num_sg > ERDMA_MR_MAX_MTT_CNT) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + ret = erdma_create_stag(dev, &stag); + if (ret) + goto out_free; + + mr->type = ERDMA_MR_TYPE_FRMR; + + mr->ibmr.lkey = stag; + mr->ibmr.rkey = stag; + mr->ibmr.pd = ibpd; + /* update it in FRMR. */ + mr->access = ERDMA_MR_ACC_LR | ERDMA_MR_ACC_LW | ERDMA_MR_ACC_RR | + ERDMA_MR_ACC_RW; + + mr->mem.page_size = PAGE_SIZE; /* update it later. */ + mr->mem.page_cnt = max_num_sg; + mr->mem.mtt_type = ERDMA_MR_INDIRECT_MTT; + mr->mem.mtt_buf = + alloc_pages_exact(MTT_SIZE(mr->mem.page_cnt), GFP_KERNEL); + if (!mr->mem.mtt_buf) { + ret = -ENOMEM; + goto out_remove_stag; + } + + mr->mem.mtt_entry[0] = + dma_map_single(&dev->pdev->dev, mr->mem.mtt_buf, + MTT_SIZE(mr->mem.page_cnt), DMA_TO_DEVICE); + if (dma_mapping_error(&dev->pdev->dev, mr->mem.mtt_entry[0])) { + ret = -ENOMEM; + goto out_free_mtt; + } + + ret = regmr_cmd(dev, mr); + if (ret) + goto out_dma_unmap; + + return &mr->ibmr; + +out_dma_unmap: + dma_unmap_single(&dev->pdev->dev, mr->mem.mtt_entry[0], + MTT_SIZE(mr->mem.page_cnt), DMA_TO_DEVICE); +out_free_mtt: + free_pages_exact(mr->mem.mtt_buf, MTT_SIZE(mr->mem.page_cnt)); + +out_remove_stag: + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX], + mr->ibmr.lkey >> 8); + +out_free: + kfree(mr); + + return ERR_PTR(ret); +} + +static int erdma_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct erdma_mr *mr = to_emr(ibmr); + + if (mr->mem.mtt_nents >= mr->mem.page_cnt) + return -1; + + *((u64 *)mr->mem.mtt_buf + mr->mem.mtt_nents) = addr; + mr->mem.mtt_nents++; + + return 0; +} + +int erdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) +{ + struct erdma_mr *mr = to_emr(ibmr); + int num; + + mr->mem.mtt_nents = 0; + + num = ib_sg_to_pages(&mr->ibmr, sg, sg_nents, sg_offset, + erdma_set_page); + + return num; +} + +struct ib_mr *erdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, + u64 virt, int access, struct ib_udata *udata) +{ + struct erdma_mr *mr = NULL; + struct erdma_dev *dev = to_edev(ibpd->device); + u32 stag; + int ret; + + if (!len || len > dev->attrs.max_mr_size) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + ret = get_mtt_entries(dev, &mr->mem, start, len, access, virt, + SZ_2G - SZ_4K, 0); + if (ret) + goto err_out_free; + + ret = erdma_create_stag(dev, &stag); + if (ret) + goto err_out_put_mtt; + + mr->ibmr.lkey = mr->ibmr.rkey = stag; + mr->ibmr.pd = ibpd; + mr->mem.va = virt; + mr->mem.len = len; + mr->access = ERDMA_MR_ACC_LR | to_erdma_access_flags(access); + mr->valid = 1; + mr->type = ERDMA_MR_TYPE_NORMAL; + + ret = regmr_cmd(dev, mr); + if (ret) + goto err_out_mr; + + return &mr->ibmr; + +err_out_mr: + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX], + mr->ibmr.lkey >> 8); + +err_out_put_mtt: + put_mtt_entries(dev, &mr->mem); + +err_out_free: + kfree(mr); + + return ERR_PTR(ret); +} + +int erdma_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +{ + struct erdma_mr *mr; + struct erdma_dev *dev = to_edev(ibmr->device); + struct erdma_cmdq_dereg_mr_req req; + int ret; + + mr = to_emr(ibmr); + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_DEREG_MR); + + req.cfg = FIELD_PREP(ERDMA_CMD_MR_MPT_IDX_MASK, ibmr->lkey >> 8) | + FIELD_PREP(ERDMA_CMD_MR_KEY_MASK, ibmr->lkey & 0xFF); + + ret = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, + NULL); + if (ret) + return ret; + + erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX], ibmr->lkey >> 8); + + put_mtt_entries(dev, &mr->mem); + + kfree(mr); + return 0; +} + +int erdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +{ + struct erdma_cq *cq = to_ecq(ibcq); + struct erdma_dev *dev = to_edev(ibcq->device); + struct erdma_ucontext *ctx = rdma_udata_to_drv_context( + udata, struct erdma_ucontext, ibucontext); + int err; + struct erdma_cmdq_destroy_cq_req req; + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_DESTROY_CQ); + req.cqn = cq->cqn; + + err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, + NULL); + if (err) + return err; + + if (rdma_is_kernel_res(&cq->ibcq.res)) { + dma_free_coherent(&dev->pdev->dev, + WARPPED_BUFSIZE(cq->depth << CQE_SHIFT), + cq->kern_cq.qbuf, cq->kern_cq.qbuf_dma_addr); + } else { + erdma_unmap_user_dbrecords(ctx, &cq->user_cq.user_dbr_page); + put_mtt_entries(dev, &cq->user_cq.qbuf_mtt); + } + + xa_erase(&dev->cq_xa, cq->cqn); + + return 0; +} + +int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) +{ + struct erdma_qp *qp = to_eqp(ibqp); + struct erdma_dev *dev = to_edev(ibqp->device); + struct erdma_ucontext *ctx = rdma_udata_to_drv_context( + udata, struct erdma_ucontext, ibucontext); + struct erdma_qp_attrs qp_attrs; + int err; + struct erdma_cmdq_destroy_qp_req req; + + down_write(&qp->state_lock); + qp_attrs.state = ERDMA_QP_STATE_ERROR; + erdma_modify_qp_internal(qp, &qp_attrs, ERDMA_QP_ATTR_STATE); + up_write(&qp->state_lock); + + erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, + CMDQ_OPCODE_DESTROY_QP); + req.qpn = QP_ID(qp); + + err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL, + NULL); + if (err) + return err; + + erdma_qp_put(qp); + wait_for_completion(&qp->safe_free); + + if (rdma_is_kernel_res(&qp->ibqp.res)) { + vfree(qp->kern_qp.swr_tbl); + vfree(qp->kern_qp.rwr_tbl); + dma_free_coherent( + &dev->pdev->dev, + WARPPED_BUFSIZE(qp->attrs.rq_size << RQE_SHIFT), + qp->kern_qp.rq_buf, qp->kern_qp.rq_buf_dma_addr); + dma_free_coherent( + &dev->pdev->dev, + WARPPED_BUFSIZE(qp->attrs.sq_size << SQEBB_SHIFT), + qp->kern_qp.sq_buf, qp->kern_qp.sq_buf_dma_addr); + } else { + put_mtt_entries(dev, &qp->user_qp.sq_mtt); + put_mtt_entries(dev, &qp->user_qp.rq_mtt); + erdma_unmap_user_dbrecords(ctx, &qp->user_qp.user_dbr_page); + } + + if (qp->cep) + erdma_cep_put(qp->cep); + xa_erase(&dev->qp_xa, QP_ID(qp)); + + return 0; +} + +void erdma_qp_get_ref(struct ib_qp *ibqp) +{ + erdma_qp_get(to_eqp(ibqp)); +} + +void erdma_qp_put_ref(struct ib_qp *ibqp) +{ + erdma_qp_put(to_eqp(ibqp)); +} + +int erdma_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma) +{ + struct rdma_user_mmap_entry *rdma_entry; + struct erdma_user_mmap_entry *entry; + pgprot_t prot; + int err; + + rdma_entry = rdma_user_mmap_entry_get(ctx, vma); + if (!rdma_entry) + return -EINVAL; + + entry = to_emmap(rdma_entry); + + switch (entry->mmap_flag) { + case ERDMA_MMAP_IO_NC: + /* map doorbell. */ + prot = pgprot_device(vma->vm_page_prot); + break; + default: + return -EINVAL; + } + + err = rdma_user_mmap_io(ctx, vma, PFN_DOWN(entry->address), PAGE_SIZE, + prot, rdma_entry); + + rdma_user_mmap_entry_put(rdma_entry); + return err; +} + +void erdma_mmap_free(struct rdma_user_mmap_entry *rdma_entry) +{ + struct erdma_user_mmap_entry *entry = to_emmap(rdma_entry); + + kfree(entry); +} + +#define ERDMA_SDB_PAGE 0 +#define ERDMA_SDB_ENTRY 1 +#define ERDMA_SDB_SHARED 2 + +static void alloc_db_resources(struct erdma_dev *dev, + struct erdma_ucontext *ctx) +{ + u32 bitmap_idx; + struct erdma_devattr *attrs = &dev->attrs; + + if (attrs->disable_dwqe) + goto alloc_normal_db; + + /* Try to alloc independent SDB page. */ + spin_lock(&dev->db_bitmap_lock); + bitmap_idx = find_first_zero_bit(dev->sdb_page, attrs->dwqe_pages); + if (bitmap_idx != attrs->dwqe_pages) { + set_bit(bitmap_idx, dev->sdb_page); + spin_unlock(&dev->db_bitmap_lock); + + ctx->sdb_type = ERDMA_SDB_PAGE; + ctx->sdb_idx = bitmap_idx; + ctx->sdb_page_idx = bitmap_idx; + ctx->sdb = dev->func_bar_addr + ERDMA_BAR_SQDB_SPACE_OFFSET + + (bitmap_idx << PAGE_SHIFT); + ctx->sdb_page_off = 0; + + return; + } + + bitmap_idx = find_first_zero_bit(dev->sdb_entry, attrs->dwqe_entries); + if (bitmap_idx != attrs->dwqe_entries) { + set_bit(bitmap_idx, dev->sdb_entry); + spin_unlock(&dev->db_bitmap_lock); + + ctx->sdb_type = ERDMA_SDB_ENTRY; + ctx->sdb_idx = bitmap_idx; + ctx->sdb_page_idx = attrs->dwqe_pages + + bitmap_idx / ERDMA_DWQE_TYPE1_CNT_PER_PAGE; + ctx->sdb_page_off = bitmap_idx % ERDMA_DWQE_TYPE1_CNT_PER_PAGE; + + ctx->sdb = dev->func_bar_addr + ERDMA_BAR_SQDB_SPACE_OFFSET + + (ctx->sdb_page_idx << PAGE_SHIFT); + + return; + } + + spin_unlock(&dev->db_bitmap_lock); + +alloc_normal_db: + ctx->sdb_type = ERDMA_SDB_SHARED; + ctx->sdb_idx = 0; + ctx->sdb_page_idx = ERDMA_SDB_SHARED_PAGE_INDEX; + ctx->sdb_page_off = 0; + + ctx->sdb = dev->func_bar_addr + (ctx->sdb_page_idx << PAGE_SHIFT); +} + +static void erdma_uctx_user_mmap_entries_remove(struct erdma_ucontext *uctx) +{ + rdma_user_mmap_entry_remove(uctx->sq_db_mmap_entry); + rdma_user_mmap_entry_remove(uctx->rq_db_mmap_entry); + rdma_user_mmap_entry_remove(uctx->cq_db_mmap_entry); +} + +int erdma_alloc_ucontext(struct ib_ucontext *ibctx, struct ib_udata *udata) +{ + struct erdma_ucontext *ctx = to_ectx(ibctx); + struct erdma_dev *dev = to_edev(ibctx->device); + int ret; + struct erdma_uresp_alloc_ctx uresp = {}; + + if (atomic_inc_return(&dev->num_ctx) > ERDMA_MAX_CONTEXT) { + ret = -ENOMEM; + goto err_out; + } + + INIT_LIST_HEAD(&ctx->dbrecords_page_list); + mutex_init(&ctx->dbrecords_page_mutex); + + alloc_db_resources(dev, ctx); + + ctx->rdb = dev->func_bar_addr + ERDMA_BAR_RQDB_SPACE_OFFSET; + ctx->cdb = dev->func_bar_addr + ERDMA_BAR_CQDB_SPACE_OFFSET; + + if (udata->outlen < sizeof(uresp)) { + ret = -EINVAL; + goto err_out; + } + + ctx->sq_db_mmap_entry = erdma_user_mmap_entry_insert( + ctx, (void *)ctx->sdb, PAGE_SIZE, ERDMA_MMAP_IO_NC, &uresp.sdb); + if (!ctx->sq_db_mmap_entry) { + ret = -ENOMEM; + goto err_out; + } + + ctx->rq_db_mmap_entry = erdma_user_mmap_entry_insert( + ctx, (void *)ctx->rdb, PAGE_SIZE, ERDMA_MMAP_IO_NC, &uresp.rdb); + if (!ctx->rq_db_mmap_entry) { + ret = -EINVAL; + goto err_out; + } + + ctx->cq_db_mmap_entry = erdma_user_mmap_entry_insert( + ctx, (void *)ctx->cdb, PAGE_SIZE, ERDMA_MMAP_IO_NC, &uresp.cdb); + if (!ctx->cq_db_mmap_entry) { + ret = -EINVAL; + goto err_out; + } + + uresp.dev_id = dev->pdev->device; + uresp.sdb_type = ctx->sdb_type; + uresp.sdb_offset = ctx->sdb_page_off; + + ret = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); + if (ret) + goto err_out; + + return 0; + +err_out: + erdma_uctx_user_mmap_entries_remove(ctx); + atomic_dec(&dev->num_ctx); + return ret; +} + +void erdma_dealloc_ucontext(struct ib_ucontext *ibctx) +{ + struct erdma_ucontext *ctx = to_ectx(ibctx); + struct erdma_dev *dev = to_edev(ibctx->device); + + spin_lock(&dev->db_bitmap_lock); + if (ctx->sdb_type == ERDMA_SDB_PAGE) + clear_bit(ctx->sdb_idx, dev->sdb_page); + else if (ctx->sdb_type == ERDMA_SDB_ENTRY) + clear_bit(ctx->sdb_idx, dev->sdb_entry); + + erdma_uctx_user_mmap_entries_remove(ctx); + + spin_unlock(&dev->db_bitmap_lock); + + atomic_dec(&dev->num_ctx); +} + +static int ib_qp_state_to_erdma_qp_state[IB_QPS_ERR + 1] = { + [IB_QPS_RESET] = ERDMA_QP_STATE_IDLE, + [IB_QPS_INIT] = ERDMA_QP_STATE_IDLE, + [IB_QPS_RTR] = ERDMA_QP_STATE_RTR, + [IB_QPS_RTS] = ERDMA_QP_STATE_RTS, + [IB_QPS_SQD] = ERDMA_QP_STATE_CLOSING, + [IB_QPS_SQE] = ERDMA_QP_STATE_TERMINATE, + [IB_QPS_ERR] = ERDMA_QP_STATE_ERROR +}; + +int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, + struct ib_udata *udata) +{ + struct erdma_qp_attrs new_attrs; + enum erdma_qp_attr_mask erdma_attr_mask = 0; + struct erdma_qp *qp = to_eqp(ibqp); + int ret = 0; + + if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS) + return -EOPNOTSUPP; + + memset(&new_attrs, 0, sizeof(new_attrs)); + + if (attr_mask & IB_QP_STATE) { + new_attrs.state = ib_qp_state_to_erdma_qp_state[attr->qp_state]; + + erdma_attr_mask |= ERDMA_QP_ATTR_STATE; + } + + down_write(&qp->state_lock); + + ret = erdma_modify_qp_internal(qp, &new_attrs, erdma_attr_mask); + + up_write(&qp->state_lock); + + return ret; +} + +int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + struct erdma_qp *qp; + struct erdma_dev *dev; + + if (ibqp && qp_attr && qp_init_attr) { + qp = to_eqp(ibqp); + dev = to_edev(ibqp->device); + } else { + return -EINVAL; + } + + qp_attr->cap.max_inline_data = ERDMA_MAX_INLINE; + qp_init_attr->cap.max_inline_data = ERDMA_MAX_INLINE; + + qp_attr->cap.max_send_wr = qp->attrs.sq_size; + qp_attr->cap.max_recv_wr = qp->attrs.rq_size; + qp_attr->cap.max_send_sge = qp->attrs.max_send_sge; + qp_attr->cap.max_recv_sge = qp->attrs.max_recv_sge; + + qp_attr->path_mtu = ib_mtu_int_to_enum(dev->netdev->mtu); + qp_attr->max_rd_atomic = qp->attrs.irq_size; + qp_attr->max_dest_rd_atomic = qp->attrs.orq_size; + + qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ; + + qp_init_attr->cap = qp_attr->cap; + + return 0; +} + +static int erdma_init_user_cq(struct erdma_ucontext *ctx, struct erdma_cq *cq, + struct erdma_ureq_create_cq *ureq) +{ + int ret; + struct erdma_dev *dev = to_edev(cq->ibcq.device); + + ret = get_mtt_entries(dev, &cq->user_cq.qbuf_mtt, ureq->qbuf_va, + ureq->qbuf_len, 0, ureq->qbuf_va, SZ_64M - SZ_4K, + 1); + if (ret) + return ret; + + ret = erdma_map_user_dbrecords(ctx, ureq->db_record_va, + &cq->user_cq.user_dbr_page, + &cq->user_cq.db_info_dma_addr); + if (ret) + put_mtt_entries(dev, &cq->user_cq.qbuf_mtt); + + return ret; +} + +static int erdma_init_kernel_cq(struct erdma_cq *cq) +{ + struct erdma_dev *dev = to_edev(cq->ibcq.device); + + cq->kern_cq.qbuf = + dma_alloc_coherent(&dev->pdev->dev, + WARPPED_BUFSIZE(cq->depth << CQE_SHIFT), + &cq->kern_cq.qbuf_dma_addr, GFP_KERNEL); + if (!cq->kern_cq.qbuf) + return -ENOMEM; + + cq->kern_cq.db_record = + (u64 *)(cq->kern_cq.qbuf + (cq->depth << CQE_SHIFT)); + spin_lock_init(&cq->kern_cq.lock); + /* use default cqdb addr */ + cq->kern_cq.db = dev->func_bar + ERDMA_BAR_CQDB_SPACE_OFFSET; + + return 0; +} + +int erdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) +{ + struct erdma_cq *cq = to_ecq(ibcq); + struct erdma_dev *dev = to_edev(ibcq->device); + unsigned int depth = attr->cqe; + int ret; + struct erdma_ucontext *ctx = rdma_udata_to_drv_context( + udata, struct erdma_ucontext, ibucontext); + + if (depth > dev->attrs.max_cqe) + return -EINVAL; + + depth = roundup_pow_of_two(depth); + cq->ibcq.cqe = depth; + cq->depth = depth; + cq->assoc_eqn = attr->comp_vector + 1; + + ret = xa_alloc_cyclic(&dev->cq_xa, &cq->cqn, cq, + XA_LIMIT(1, dev->attrs.max_cq - 1), + &dev->next_alloc_cqn, GFP_KERNEL); + if (ret < 0) + return ret; + + if (!rdma_is_kernel_res(&ibcq->res)) { + struct erdma_ureq_create_cq ureq; + struct erdma_uresp_create_cq uresp; + + ret = ib_copy_from_udata(&ureq, udata, + min(udata->inlen, sizeof(ureq))); + if (ret) + goto err_out_xa; + + ret = erdma_init_user_cq(ctx, cq, &ureq); + if (ret) + goto err_out_xa; + + uresp.cq_id = cq->cqn; + uresp.num_cqe = depth; + + ret = ib_copy_to_udata(udata, &uresp, + min(sizeof(uresp), udata->outlen)); + if (ret) + goto err_free_res; + } else { + ret = erdma_init_kernel_cq(cq); + if (ret) + goto err_out_xa; + } + + ret = create_cq_cmd(dev, cq); + if (ret) + goto err_free_res; + + return 0; + +err_free_res: + if (!rdma_is_kernel_res(&ibcq->res)) { + erdma_unmap_user_dbrecords(ctx, &cq->user_cq.user_dbr_page); + put_mtt_entries(dev, &cq->user_cq.qbuf_mtt); + } else { + dma_free_coherent(&dev->pdev->dev, + WARPPED_BUFSIZE(depth << CQE_SHIFT), + cq->kern_cq.qbuf, cq->kern_cq.qbuf_dma_addr); + } + +err_out_xa: + xa_erase(&dev->cq_xa, cq->cqn); + + return ret; +} + +void erdma_port_event(struct erdma_dev *dev, enum ib_event_type reason) +{ + struct ib_event event; + + event.device = &dev->ibdev; + event.element.port_num = 1; + event.event = reason; + + ib_dispatch_event(&event); +} diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h new file mode 100644 index 000000000000..c7baddb1f292 --- /dev/null +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -0,0 +1,342 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ + +/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */ +/* Kai Shen <kaishen@linux.alibaba.com> */ +/* Copyright (c) 2020-2022, Alibaba Group. */ + +#ifndef __ERDMA_VERBS_H__ +#define __ERDMA_VERBS_H__ + +#include <linux/errno.h> + +#include <rdma/ib_verbs.h> +#include <rdma/ib_user_verbs.h> +#include <rdma/iw_cm.h> + +#include "erdma.h" +#include "erdma_cm.h" +#include "erdma_hw.h" + +/* RDMA Capability. */ +#define ERDMA_MAX_PD (128 * 1024) +#define ERDMA_MAX_SEND_WR 4096 +#define ERDMA_MAX_ORD 128 +#define ERDMA_MAX_IRD 128 +#define ERDMA_MAX_SGE_RD 1 +#define ERDMA_MAX_CONTEXT (128 * 1024) +#define ERDMA_MAX_SEND_SGE 6 +#define ERDMA_MAX_RECV_SGE 1 +#define ERDMA_MAX_INLINE (sizeof(struct erdma_sge) * (ERDMA_MAX_SEND_SGE)) +#define ERDMA_MAX_FRMR_PA 512 + +enum { + ERDMA_MMAP_IO_NC = 0, /* no cache */ +}; + +struct erdma_user_mmap_entry { + struct rdma_user_mmap_entry rdma_entry; + u64 address; + u8 mmap_flag; +}; + +struct erdma_ucontext { + struct ib_ucontext ibucontext; + + u32 sdb_type; + u32 sdb_idx; + u32 sdb_page_idx; + u32 sdb_page_off; + u64 sdb; + u64 rdb; + u64 cdb; + + struct rdma_user_mmap_entry *sq_db_mmap_entry; + struct rdma_user_mmap_entry *rq_db_mmap_entry; + struct rdma_user_mmap_entry *cq_db_mmap_entry; + + /* doorbell records */ + struct list_head dbrecords_page_list; + struct mutex dbrecords_page_mutex; +}; + +struct erdma_pd { + struct ib_pd ibpd; + u32 pdn; +}; + +/* + * MemoryRegion definition. + */ +#define ERDMA_MAX_INLINE_MTT_ENTRIES 4 +#define MTT_SIZE(mtt_cnt) (mtt_cnt << 3) /* per mtt takes 8 Bytes. */ +#define ERDMA_MR_MAX_MTT_CNT 524288 +#define ERDMA_MTT_ENTRY_SIZE 8 + +#define ERDMA_MR_TYPE_NORMAL 0 +#define ERDMA_MR_TYPE_FRMR 1 +#define ERDMA_MR_TYPE_DMA 2 + +#define ERDMA_MR_INLINE_MTT 0 +#define ERDMA_MR_INDIRECT_MTT 1 + +#define ERDMA_MR_ACC_LR BIT(0) +#define ERDMA_MR_ACC_LW BIT(1) +#define ERDMA_MR_ACC_RR BIT(2) +#define ERDMA_MR_ACC_RW BIT(3) + +static inline u8 to_erdma_access_flags(int access) +{ + return (access & IB_ACCESS_REMOTE_READ ? ERDMA_MR_ACC_RR : 0) | + (access & IB_ACCESS_LOCAL_WRITE ? ERDMA_MR_ACC_LW : 0) | + (access & IB_ACCESS_REMOTE_WRITE ? ERDMA_MR_ACC_RW : 0); +} + +struct erdma_mem { + struct ib_umem *umem; + void *mtt_buf; + u32 mtt_type; + u32 page_size; + u32 page_offset; + u32 page_cnt; + u32 mtt_nents; + + u64 va; + u64 len; + + u64 mtt_entry[ERDMA_MAX_INLINE_MTT_ENTRIES]; +}; + +struct erdma_mr { + struct ib_mr ibmr; + struct erdma_mem mem; + u8 type; + u8 access; + u8 valid; +}; + +struct erdma_user_dbrecords_page { + struct list_head list; + struct ib_umem *umem; + u64 va; + int refcnt; +}; + +struct erdma_uqp { + struct erdma_mem sq_mtt; + struct erdma_mem rq_mtt; + + dma_addr_t sq_db_info_dma_addr; + dma_addr_t rq_db_info_dma_addr; + + struct erdma_user_dbrecords_page *user_dbr_page; + + u32 rq_offset; +}; + +struct erdma_kqp { + u16 sq_pi; + u16 sq_ci; + + u16 rq_pi; + u16 rq_ci; + + u64 *swr_tbl; + u64 *rwr_tbl; + + void __iomem *hw_sq_db; + void __iomem *hw_rq_db; + + void *sq_buf; + dma_addr_t sq_buf_dma_addr; + + void *rq_buf; + dma_addr_t rq_buf_dma_addr; + + void *sq_db_info; + void *rq_db_info; + + u8 sig_all; +}; + +enum erdma_qp_state { + ERDMA_QP_STATE_IDLE = 0, + ERDMA_QP_STATE_RTR = 1, + ERDMA_QP_STATE_RTS = 2, + ERDMA_QP_STATE_CLOSING = 3, + ERDMA_QP_STATE_TERMINATE = 4, + ERDMA_QP_STATE_ERROR = 5, + ERDMA_QP_STATE_UNDEF = 7, + ERDMA_QP_STATE_COUNT = 8 +}; + +enum erdma_qp_attr_mask { + ERDMA_QP_ATTR_STATE = (1 << 0), + ERDMA_QP_ATTR_LLP_HANDLE = (1 << 2), + ERDMA_QP_ATTR_ORD = (1 << 3), + ERDMA_QP_ATTR_IRD = (1 << 4), + ERDMA_QP_ATTR_SQ_SIZE = (1 << 5), + ERDMA_QP_ATTR_RQ_SIZE = (1 << 6), + ERDMA_QP_ATTR_MPA = (1 << 7) +}; + +struct erdma_qp_attrs { + enum erdma_qp_state state; + enum erdma_cc_alg cc; /* Congestion control algorithm */ + u32 sq_size; + u32 rq_size; + u32 orq_size; + u32 irq_size; + u32 max_send_sge; + u32 max_recv_sge; + u32 cookie; +#define ERDMA_QP_ACTIVE 0 +#define ERDMA_QP_PASSIVE 1 + u8 qp_type; + u8 pd_len; +}; + +struct erdma_qp { + struct ib_qp ibqp; + struct kref ref; + struct completion safe_free; + struct erdma_dev *dev; + struct erdma_cep *cep; + struct rw_semaphore state_lock; + + union { + struct erdma_kqp kern_qp; + struct erdma_uqp user_qp; + }; + + struct erdma_cq *scq; + struct erdma_cq *rcq; + + struct erdma_qp_attrs attrs; + spinlock_t lock; +}; + +struct erdma_kcq_info { + void *qbuf; + dma_addr_t qbuf_dma_addr; + u32 ci; + u32 cmdsn; + u32 notify_cnt; + + spinlock_t lock; + u8 __iomem *db; + u64 *db_record; +}; + +struct erdma_ucq_info { + struct erdma_mem qbuf_mtt; + struct erdma_user_dbrecords_page *user_dbr_page; + dma_addr_t db_info_dma_addr; +}; + +struct erdma_cq { + struct ib_cq ibcq; + u32 cqn; + + u32 depth; + u32 assoc_eqn; + + union { + struct erdma_kcq_info kern_cq; + struct erdma_ucq_info user_cq; + }; +}; + +#define QP_ID(qp) ((qp)->ibqp.qp_num) + +static inline struct erdma_qp *find_qp_by_qpn(struct erdma_dev *dev, int id) +{ + return (struct erdma_qp *)xa_load(&dev->qp_xa, id); +} + +static inline struct erdma_cq *find_cq_by_cqn(struct erdma_dev *dev, int id) +{ + return (struct erdma_cq *)xa_load(&dev->cq_xa, id); +} + +void erdma_qp_get(struct erdma_qp *qp); +void erdma_qp_put(struct erdma_qp *qp); +int erdma_modify_qp_internal(struct erdma_qp *qp, struct erdma_qp_attrs *attrs, + enum erdma_qp_attr_mask mask); +void erdma_qp_llp_close(struct erdma_qp *qp); +void erdma_qp_cm_drop(struct erdma_qp *qp); + +static inline struct erdma_ucontext *to_ectx(struct ib_ucontext *ibctx) +{ + return container_of(ibctx, struct erdma_ucontext, ibucontext); +} + +static inline struct erdma_pd *to_epd(struct ib_pd *pd) +{ + return container_of(pd, struct erdma_pd, ibpd); +} + +static inline struct erdma_mr *to_emr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct erdma_mr, ibmr); +} + +static inline struct erdma_qp *to_eqp(struct ib_qp *qp) +{ + return container_of(qp, struct erdma_qp, ibqp); +} + +static inline struct erdma_cq *to_ecq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct erdma_cq, ibcq); +} + +static inline struct erdma_user_mmap_entry * +to_emmap(struct rdma_user_mmap_entry *ibmmap) +{ + return container_of(ibmmap, struct erdma_user_mmap_entry, rdma_entry); +} + +int erdma_alloc_ucontext(struct ib_ucontext *ibctx, struct ib_udata *data); +void erdma_dealloc_ucontext(struct ib_ucontext *ibctx); +int erdma_query_device(struct ib_device *dev, struct ib_device_attr *attr, + struct ib_udata *data); +int erdma_get_port_immutable(struct ib_device *dev, u32 port, + struct ib_port_immutable *ib_port_immutable); +int erdma_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *data); +int erdma_query_port(struct ib_device *dev, u32 port, + struct ib_port_attr *attr); +int erdma_query_gid(struct ib_device *dev, u32 port, int idx, + union ib_gid *gid); +int erdma_alloc_pd(struct ib_pd *ibpd, struct ib_udata *data); +int erdma_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); +int erdma_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, + struct ib_udata *data); +int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, + struct ib_qp_init_attr *init_attr); +int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, + struct ib_udata *data); +int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); +int erdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +int erdma_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); +struct ib_mr *erdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, + u64 virt, int access, struct ib_udata *udata); +struct ib_mr *erdma_get_dma_mr(struct ib_pd *ibpd, int rights); +int erdma_dereg_mr(struct ib_mr *ibmr, struct ib_udata *data); +int erdma_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma); +void erdma_mmap_free(struct rdma_user_mmap_entry *rdma_entry); +void erdma_qp_get_ref(struct ib_qp *ibqp); +void erdma_qp_put_ref(struct ib_qp *ibqp); +struct ib_qp *erdma_get_ibqp(struct ib_device *dev, int id); +int erdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *send_wr, + const struct ib_send_wr **bad_send_wr); +int erdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *recv_wr, + const struct ib_recv_wr **bad_recv_wr); +int erdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); +struct ib_mr *erdma_ib_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, + u32 max_num_sg); +int erdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); +void erdma_port_event(struct erdma_dev *dev, enum ib_event_type reason); + +#endif diff --git a/drivers/infiniband/hw/hfi1/Kconfig b/drivers/infiniband/hw/hfi1/Kconfig index 6eb739052121..14b92e12bf29 100644 --- a/drivers/infiniband/hw/hfi1/Kconfig +++ b/drivers/infiniband/hw/hfi1/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config INFINIBAND_HFI1 tristate "Cornelis OPX Gen1 support" - depends on X86_64 && INFINIBAND_RDMAVT && I2C + depends on X86_64 && INFINIBAND_RDMAVT && I2C && !UML select MMU_NOTIFIER select CRC32 select I2C_ALGOBIT diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index 2e4cf2b11653..629beff053ad 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -1179,8 +1179,10 @@ static int setup_base_ctxt(struct hfi1_filedata *fd, goto done; ret = init_user_ctxt(fd, uctxt); - if (ret) + if (ret) { + hfi1_free_ctxt_rcv_groups(uctxt); goto done; + } user_init(uctxt); diff --git a/drivers/infiniband/hw/hfi1/ipoib_tx.c b/drivers/infiniband/hw/hfi1/ipoib_tx.c index d6bbdb8fcb50..5d9a7b09ca37 100644 --- a/drivers/infiniband/hw/hfi1/ipoib_tx.c +++ b/drivers/infiniband/hw/hfi1/ipoib_tx.c @@ -742,9 +742,7 @@ int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv) kzalloc_node(sizeof(*tx->sdma_hdr), GFP_KERNEL, priv->dd->node); - netif_tx_napi_add(dev, &txq->napi, - hfi1_ipoib_poll_tx_ring, - NAPI_POLL_WEIGHT); + netif_napi_add_tx(dev, &txq->napi, hfi1_ipoib_poll_tx_ring); } return 0; diff --git a/drivers/infiniband/hw/hfi1/netdev_rx.c b/drivers/infiniband/hw/hfi1/netdev_rx.c index 03b098a494b5..3dfa5aff2512 100644 --- a/drivers/infiniband/hw/hfi1/netdev_rx.c +++ b/drivers/infiniband/hw/hfi1/netdev_rx.c @@ -216,7 +216,7 @@ static int hfi1_netdev_rxq_init(struct hfi1_netdev_rx *rx) * right now. */ set_bit(NAPI_STATE_NO_BUSY_POLL, &rxq->napi.state); - netif_napi_add(dev, &rxq->napi, hfi1_netdev_rx_napi, 64); + netif_napi_add_weight(dev, &rxq->napi, hfi1_netdev_rx_napi, 64); rc = msix_netdev_request_rcd_irq(rxq->rcd); if (rc) goto bail_context_irq_failure; diff --git a/drivers/infiniband/hw/hfi1/pio_copy.c b/drivers/infiniband/hw/hfi1/pio_copy.c index 136f9a99e1e0..7690f996d5e3 100644 --- a/drivers/infiniband/hw/hfi1/pio_copy.c +++ b/drivers/infiniband/hw/hfi1/pio_copy.c @@ -172,7 +172,7 @@ static inline void jcopy(u8 *dest, const u8 *src, u32 n) } /* - * Read nbytes from "from" and and place them in the low bytes + * Read nbytes from "from" and place them in the low bytes * of pbuf->carry. Other bytes are left as-is. Any previous * value in pbuf->carry is lost. * diff --git a/drivers/infiniband/hw/hfi1/trace_dbg.h b/drivers/infiniband/hw/hfi1/trace_dbg.h index 707f1053f0b7..582b6f68df3d 100644 --- a/drivers/infiniband/hw/hfi1/trace_dbg.h +++ b/drivers/infiniband/hw/hfi1/trace_dbg.h @@ -26,14 +26,10 @@ DECLARE_EVENT_CLASS(hfi1_trace_template, TP_PROTO(const char *function, struct va_format *vaf), TP_ARGS(function, vaf), TP_STRUCT__entry(__string(function, function) - __dynamic_array(char, msg, MAX_MSG_LEN) + __vstring(msg, vaf->fmt, vaf->va) ), TP_fast_assign(__assign_str(function, function); - WARN_ON_ONCE(vsnprintf - (__get_dynamic_array(msg), - MAX_MSG_LEN, vaf->fmt, - *vaf->va) >= - MAX_MSG_LEN); + __assign_vstr(msg, vaf->fmt, vaf->va); ), TP_printk("(%s) %s", __get_str(function), diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 2855e9ad4b32..f848eedc6a23 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -959,6 +959,7 @@ struct hns_roce_dev { const struct hns_roce_hw *hw; void *priv; struct workqueue_struct *irq_workq; + struct work_struct ecc_work; const struct hns_roce_dfx_hw *dfx; u32 func_num; u32 is_vf; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index ba3c742258ef..cbdafaac678a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -55,6 +55,42 @@ enum { CMD_RST_PRC_EBUSY, }; +enum ecc_resource_type { + ECC_RESOURCE_QPC, + ECC_RESOURCE_CQC, + ECC_RESOURCE_MPT, + ECC_RESOURCE_SRQC, + ECC_RESOURCE_GMV, + ECC_RESOURCE_QPC_TIMER, + ECC_RESOURCE_CQC_TIMER, + ECC_RESOURCE_SCCC, + ECC_RESOURCE_COUNT, +}; + +static const struct { + const char *name; + u8 read_bt0_op; + u8 write_bt0_op; +} fmea_ram_res[] = { + { "ECC_RESOURCE_QPC", + HNS_ROCE_CMD_READ_QPC_BT0, HNS_ROCE_CMD_WRITE_QPC_BT0 }, + { "ECC_RESOURCE_CQC", + HNS_ROCE_CMD_READ_CQC_BT0, HNS_ROCE_CMD_WRITE_CQC_BT0 }, + { "ECC_RESOURCE_MPT", + HNS_ROCE_CMD_READ_MPT_BT0, HNS_ROCE_CMD_WRITE_MPT_BT0 }, + { "ECC_RESOURCE_SRQC", + HNS_ROCE_CMD_READ_SRQC_BT0, HNS_ROCE_CMD_WRITE_SRQC_BT0 }, + /* ECC_RESOURCE_GMV is handled by cmdq, not mailbox */ + { "ECC_RESOURCE_GMV", + 0, 0 }, + { "ECC_RESOURCE_QPC_TIMER", + HNS_ROCE_CMD_READ_QPC_TIMER_BT0, HNS_ROCE_CMD_WRITE_QPC_TIMER_BT0 }, + { "ECC_RESOURCE_CQC_TIMER", + HNS_ROCE_CMD_READ_CQC_TIMER_BT0, HNS_ROCE_CMD_WRITE_CQC_TIMER_BT0 }, + { "ECC_RESOURCE_SCCC", + HNS_ROCE_CMD_READ_SCCC_BT0, HNS_ROCE_CMD_WRITE_SCCC_BT0 }, +}; + static inline void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg, struct ib_sge *sg) { @@ -5855,12 +5891,12 @@ static struct hns_roce_aeqe *next_aeqe_sw_v2(struct hns_roce_eq *eq) !!(eq->cons_index & eq->entries)) ? aeqe : NULL; } -static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, - struct hns_roce_eq *eq) +static irqreturn_t hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) { struct device *dev = hr_dev->dev; struct hns_roce_aeqe *aeqe = next_aeqe_sw_v2(eq); - int aeqe_found = 0; + irqreturn_t aeqe_found = IRQ_NONE; int event_type; u32 queue_num; int sub_type; @@ -5914,7 +5950,7 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, eq->event_type = event_type; eq->sub_type = sub_type; ++eq->cons_index; - aeqe_found = 1; + aeqe_found = IRQ_HANDLED; hns_roce_v2_init_irq_work(hr_dev, eq, queue_num); @@ -5922,7 +5958,8 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, } update_eq_db(eq); - return aeqe_found; + + return IRQ_RETVAL(aeqe_found); } static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq) @@ -5937,11 +5974,11 @@ static struct hns_roce_ceqe *next_ceqe_sw_v2(struct hns_roce_eq *eq) !!(eq->cons_index & eq->entries)) ? ceqe : NULL; } -static int hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, - struct hns_roce_eq *eq) +static irqreturn_t hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, + struct hns_roce_eq *eq) { struct hns_roce_ceqe *ceqe = next_ceqe_sw_v2(eq); - int ceqe_found = 0; + irqreturn_t ceqe_found = IRQ_NONE; u32 cqn; while (ceqe) { @@ -5955,21 +5992,21 @@ static int hns_roce_v2_ceq_int(struct hns_roce_dev *hr_dev, hns_roce_cq_completion(hr_dev, cqn); ++eq->cons_index; - ceqe_found = 1; + ceqe_found = IRQ_HANDLED; ceqe = next_ceqe_sw_v2(eq); } update_eq_db(eq); - return ceqe_found; + return IRQ_RETVAL(ceqe_found); } static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr) { struct hns_roce_eq *eq = eq_ptr; struct hns_roce_dev *hr_dev = eq->hr_dev; - int int_work; + irqreturn_t int_work; if (eq->type_flag == HNS_ROCE_CEQ) /* Completion event interrupt */ @@ -5981,27 +6018,22 @@ static irqreturn_t hns_roce_v2_msix_interrupt_eq(int irq, void *eq_ptr) return IRQ_RETVAL(int_work); } -static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id) +static irqreturn_t abnormal_interrupt_basic(struct hns_roce_dev *hr_dev, + u32 int_st) { - struct hns_roce_dev *hr_dev = dev_id; - struct device *dev = hr_dev->dev; - int int_work = 0; - u32 int_st; + struct pci_dev *pdev = hr_dev->pci_dev; + struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); + const struct hnae3_ae_ops *ops = ae_dev->ops; + irqreturn_t int_work = IRQ_NONE; u32 int_en; - /* Abnormal interrupt */ - int_st = roce_read(hr_dev, ROCEE_VF_ABN_INT_ST_REG); int_en = roce_read(hr_dev, ROCEE_VF_ABN_INT_EN_REG); if (int_st & BIT(HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S)) { - struct pci_dev *pdev = hr_dev->pci_dev; - struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); - const struct hnae3_ae_ops *ops = ae_dev->ops; + dev_err(hr_dev->dev, "AEQ overflow!\n"); - dev_err(dev, "AEQ overflow!\n"); - - int_st |= 1 << HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S; - roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st); + roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, + 1 << HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S); /* Set reset level for reset_event() */ if (ops->set_default_reset_request) @@ -6013,19 +6045,165 @@ static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id) int_en |= 1 << HNS_ROCE_V2_VF_ABN_INT_EN_S; roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en); - int_work = 1; - } else if (int_st & BIT(HNS_ROCE_V2_VF_INT_ST_RAS_INT_S)) { - dev_err(dev, "RAS interrupt!\n"); + int_work = IRQ_HANDLED; + } else { + dev_err(hr_dev->dev, "there is no basic abn irq found.\n"); + } + + return IRQ_RETVAL(int_work); +} - int_st |= 1 << HNS_ROCE_V2_VF_INT_ST_RAS_INT_S; - roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, int_st); +static int fmea_ram_ecc_query(struct hns_roce_dev *hr_dev, + struct fmea_ram_ecc *ecc_info) +{ + struct hns_roce_cmq_desc desc; + struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data; + int ret; - int_en |= 1 << HNS_ROCE_V2_VF_ABN_INT_EN_S; - roce_write(hr_dev, ROCEE_VF_ABN_INT_EN_REG, int_en); + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_QUERY_RAM_ECC, true); + ret = hns_roce_cmq_send(hr_dev, &desc, 1); + if (ret) + return ret; + + ecc_info->is_ecc_err = hr_reg_read(req, QUERY_RAM_ECC_1BIT_ERR); + ecc_info->res_type = hr_reg_read(req, QUERY_RAM_ECC_RES_TYPE); + ecc_info->index = hr_reg_read(req, QUERY_RAM_ECC_TAG); + + return 0; +} + +static int fmea_recover_gmv(struct hns_roce_dev *hr_dev, u32 idx) +{ + struct hns_roce_cmq_desc desc; + struct hns_roce_cmq_req *req = (struct hns_roce_cmq_req *)desc.data; + u32 addr_upper; + u32 addr_low; + int ret; + + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_GMV_BT, true); + hr_reg_write(req, CFG_GMV_BT_IDX, idx); + + ret = hns_roce_cmq_send(hr_dev, &desc, 1); + if (ret) { + dev_err(hr_dev->dev, + "failed to execute cmd to read gmv, ret = %d.\n", ret); + return ret; + } + + addr_low = hr_reg_read(req, CFG_GMV_BT_BA_L); + addr_upper = hr_reg_read(req, CFG_GMV_BT_BA_H); + + hns_roce_cmq_setup_basic_desc(&desc, HNS_ROCE_OPC_CFG_GMV_BT, false); + hr_reg_write(req, CFG_GMV_BT_BA_L, addr_low); + hr_reg_write(req, CFG_GMV_BT_BA_H, addr_upper); + hr_reg_write(req, CFG_GMV_BT_IDX, idx); + + return hns_roce_cmq_send(hr_dev, &desc, 1); +} + +static u64 fmea_get_ram_res_addr(u32 res_type, __le64 *data) +{ + if (res_type == ECC_RESOURCE_QPC_TIMER || + res_type == ECC_RESOURCE_CQC_TIMER || + res_type == ECC_RESOURCE_SCCC) + return le64_to_cpu(*data); + + return le64_to_cpu(*data) << PAGE_SHIFT; +} - int_work = 1; +static int fmea_recover_others(struct hns_roce_dev *hr_dev, u32 res_type, + u32 index) +{ + u8 write_bt0_op = fmea_ram_res[res_type].write_bt0_op; + u8 read_bt0_op = fmea_ram_res[res_type].read_bt0_op; + struct hns_roce_cmd_mailbox *mailbox; + u64 addr; + int ret; + + mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); + if (IS_ERR(mailbox)) + return PTR_ERR(mailbox); + + ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma, read_bt0_op, index); + if (ret) { + dev_err(hr_dev->dev, + "failed to execute cmd to read fmea ram, ret = %d.\n", + ret); + goto out; + } + + addr = fmea_get_ram_res_addr(res_type, mailbox->buf); + + ret = hns_roce_cmd_mbox(hr_dev, addr, 0, write_bt0_op, index); + if (ret) + dev_err(hr_dev->dev, + "failed to execute cmd to write fmea ram, ret = %d.\n", + ret); + +out: + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + return ret; +} + +static void fmea_ram_ecc_recover(struct hns_roce_dev *hr_dev, + struct fmea_ram_ecc *ecc_info) +{ + u32 res_type = ecc_info->res_type; + u32 index = ecc_info->index; + int ret; + + BUILD_BUG_ON(ARRAY_SIZE(fmea_ram_res) != ECC_RESOURCE_COUNT); + + if (res_type >= ECC_RESOURCE_COUNT) { + dev_err(hr_dev->dev, "unsupported fmea ram ecc type %u.\n", + res_type); + return; + } + + if (res_type == ECC_RESOURCE_GMV) + ret = fmea_recover_gmv(hr_dev, index); + else + ret = fmea_recover_others(hr_dev, res_type, index); + if (ret) + dev_err(hr_dev->dev, + "failed to recover %s, index = %u, ret = %d.\n", + fmea_ram_res[res_type].name, index, ret); +} + +static void fmea_ram_ecc_work(struct work_struct *ecc_work) +{ + struct hns_roce_dev *hr_dev = + container_of(ecc_work, struct hns_roce_dev, ecc_work); + struct fmea_ram_ecc ecc_info = {}; + + if (fmea_ram_ecc_query(hr_dev, &ecc_info)) { + dev_err(hr_dev->dev, "failed to query fmea ram ecc.\n"); + return; + } + + if (!ecc_info.is_ecc_err) { + dev_err(hr_dev->dev, "there is no fmea ram ecc err found.\n"); + return; + } + + fmea_ram_ecc_recover(hr_dev, &ecc_info); +} + +static irqreturn_t hns_roce_v2_msix_interrupt_abn(int irq, void *dev_id) +{ + struct hns_roce_dev *hr_dev = dev_id; + irqreturn_t int_work = IRQ_NONE; + u32 int_st; + + int_st = roce_read(hr_dev, ROCEE_VF_ABN_INT_ST_REG); + + if (int_st) { + int_work = abnormal_interrupt_basic(hr_dev, int_st); + } else if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) { + queue_work(hr_dev->irq_workq, &hr_dev->ecc_work); + int_work = IRQ_HANDLED; } else { - dev_err(dev, "There is no abnormal irq found!\n"); + dev_err(hr_dev->dev, "there is no abnormal irq found.\n"); } return IRQ_RETVAL(int_work); @@ -6342,6 +6520,8 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev) } } + INIT_WORK(&hr_dev->ecc_work, fmea_ram_ecc_work); + hr_dev->irq_workq = alloc_ordered_workqueue("hns_roce_irq_workq", 0); if (!hr_dev->irq_workq) { dev_err(dev, "failed to create irq workqueue.\n"); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 7ffb7824d268..f96debac30fe 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -250,6 +250,7 @@ enum hns_roce_opcode_type { HNS_ROCE_OPC_CFG_GMV_TBL = 0x850f, HNS_ROCE_OPC_CFG_GMV_BT = 0x8510, HNS_ROCE_OPC_EXT_CFG = 0x8512, + HNS_ROCE_QUERY_RAM_ECC = 0x8513, HNS_SWITCH_PARAMETER_CFG = 0x1033, }; @@ -1107,6 +1108,11 @@ enum { #define CFG_GMV_BT_BA_H CMQ_REQ_FIELD_LOC(51, 32) #define CFG_GMV_BT_IDX CMQ_REQ_FIELD_LOC(95, 64) +/* Fields of HNS_ROCE_QUERY_RAM_ECC */ +#define QUERY_RAM_ECC_1BIT_ERR CMQ_REQ_FIELD_LOC(31, 0) +#define QUERY_RAM_ECC_RES_TYPE CMQ_REQ_FIELD_LOC(63, 32) +#define QUERY_RAM_ECC_TAG CMQ_REQ_FIELD_LOC(95, 64) + struct hns_roce_cfg_sgid_tb { __le32 table_idx_rsv; __le32 vf_sgid_l; @@ -1343,6 +1349,12 @@ struct hns_roce_dip { struct list_head node; /* all dips are on a list */ }; +struct fmea_ram_ecc { + u32 is_ecc_err; + u32 res_type; + u32 index; +}; + /* only for RNR timeout issue of HIP08 */ #define HNS_ROCE_CLOCK_ADJUST 1000 #define HNS_ROCE_MAX_CQ_PERIOD 65 @@ -1382,7 +1394,6 @@ struct hns_roce_dip { #define HNS_ROCE_V2_ASYNC_EQE_NUM 0x1000 #define HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S 0 -#define HNS_ROCE_V2_VF_INT_ST_RAS_INT_S 1 #define HNS_ROCE_EQ_DB_CMD_AEQ 0x0 #define HNS_ROCE_EQ_DB_CMD_AEQ_ARMED 0x1 diff --git a/drivers/infiniband/hw/irdma/cm.c b/drivers/infiniband/hw/irdma/cm.c index 646fa8677490..7b086fe63a24 100644 --- a/drivers/infiniband/hw/irdma/cm.c +++ b/drivers/infiniband/hw/irdma/cm.c @@ -1477,12 +1477,13 @@ irdma_find_listener(struct irdma_cm_core *cm_core, u32 *dst_addr, u16 dst_port, list_for_each_entry (listen_node, &cm_core->listen_list, list) { memcpy(listen_addr, listen_node->loc_addr, sizeof(listen_addr)); listen_port = listen_node->loc_port; + if (listen_port != dst_port || + !(listener_state & listen_node->listener_state)) + continue; /* compare node pair, return node handle if a match */ - if ((!memcmp(listen_addr, dst_addr, sizeof(listen_addr)) || - !memcmp(listen_addr, ip_zero, sizeof(listen_addr))) && - listen_port == dst_port && - vlan_id == listen_node->vlan_id && - (listener_state & listen_node->listener_state)) { + if (!memcmp(listen_addr, ip_zero, sizeof(listen_addr)) || + (!memcmp(listen_addr, dst_addr, sizeof(listen_addr)) && + vlan_id == listen_node->vlan_id)) { refcount_inc(&listen_node->refcnt); spin_unlock_irqrestore(&cm_core->listen_list_lock, flags); diff --git a/drivers/infiniband/hw/irdma/ctrl.c b/drivers/infiniband/hw/irdma/ctrl.c index 58c0e181ca2b..a41e0d21143a 100644 --- a/drivers/infiniband/hw/irdma/ctrl.c +++ b/drivers/infiniband/hw/irdma/ctrl.c @@ -4872,10 +4872,12 @@ int irdma_cfg_fpm_val(struct irdma_sc_dev *dev, u32 qp_count) sd_diff = sd_needed - hmc_fpm_misc->max_sds; if (sd_diff > 128) { - if (qpwanted > 128 && sd_diff > 144) + if (!(loop_count % 2) && qpwanted > 128) { qpwanted /= 2; - mrwanted /= 2; - pblewanted /= 2; + } else { + mrwanted /= 2; + pblewanted /= 2; + } continue; } if (dev->cqp->hmc_profile != IRDMA_HMC_PROFILE_FAVOR_VF && diff --git a/drivers/infiniband/hw/irdma/hw.c b/drivers/infiniband/hw/irdma/hw.c index dd3943d22dc6..4f132c6fb653 100644 --- a/drivers/infiniband/hw/irdma/hw.c +++ b/drivers/infiniband/hw/irdma/hw.c @@ -257,10 +257,6 @@ static void irdma_process_aeq(struct irdma_pci_f *rf) iwqp->last_aeq = info->ae_id; spin_unlock_irqrestore(&iwqp->lock, flags); ctx_info = &iwqp->ctx_info; - if (rdma_protocol_roce(&iwqp->iwdev->ibdev, 1)) - ctx_info->roce_info->err_rq_idx_valid = true; - else - ctx_info->iwarp_info->err_rq_idx_valid = true; } else { if (info->ae_id != IRDMA_AE_CQ_OPERATION_ERROR) continue; @@ -370,16 +366,12 @@ static void irdma_process_aeq(struct irdma_pci_f *rf) case IRDMA_AE_LCE_FUNCTION_CATASTROPHIC: case IRDMA_AE_LCE_CQ_CATASTROPHIC: case IRDMA_AE_UDA_XMIT_DGRAM_TOO_LONG: - if (rdma_protocol_roce(&iwdev->ibdev, 1)) - ctx_info->roce_info->err_rq_idx_valid = false; - else - ctx_info->iwarp_info->err_rq_idx_valid = false; - fallthrough; default: - ibdev_err(&iwdev->ibdev, "abnormal ae_id = 0x%x bool qp=%d qp_id = %d\n", - info->ae_id, info->qp, info->qp_cq_id); + ibdev_err(&iwdev->ibdev, "abnormal ae_id = 0x%x bool qp=%d qp_id = %d, ae_src=%d\n", + info->ae_id, info->qp, info->qp_cq_id, info->ae_src); if (rdma_protocol_roce(&iwdev->ibdev, 1)) { - if (!info->sq && ctx_info->roce_info->err_rq_idx_valid) { + ctx_info->roce_info->err_rq_idx_valid = info->rq; + if (info->rq) { ctx_info->roce_info->err_rq_idx = info->wqe_idx; irdma_sc_qp_setctx_roce(&iwqp->sc_qp, iwqp->host_ctx.va, ctx_info); @@ -388,7 +380,8 @@ static void irdma_process_aeq(struct irdma_pci_f *rf) irdma_cm_disconn(iwqp); break; } - if (!info->sq && ctx_info->iwarp_info->err_rq_idx_valid) { + ctx_info->iwarp_info->err_rq_idx_valid = info->rq; + if (info->rq) { ctx_info->iwarp_info->err_rq_idx = info->wqe_idx; ctx_info->tcp_info_valid = false; ctx_info->iwarp_info_valid = true; @@ -1512,10 +1505,7 @@ static int irdma_hmc_setup(struct irdma_pci_f *rf) int status; u32 qpcnt; - if (rf->rdma_ver == IRDMA_GEN_1) - qpcnt = rsrc_limits_table[rf->limits_sel].qplimit * 2; - else - qpcnt = rsrc_limits_table[rf->limits_sel].qplimit; + qpcnt = rsrc_limits_table[rf->limits_sel].qplimit; rf->sd_type = IRDMA_SD_TYPE_DIRECT; status = irdma_cfg_fpm_val(&rf->sc_dev, qpcnt); @@ -1543,7 +1533,7 @@ static void irdma_del_init_mem(struct irdma_pci_f *rf) rf->obj_mem.pa); rf->obj_mem.va = NULL; if (rf->rdma_ver != IRDMA_GEN_1) { - kfree(rf->allocated_ws_nodes); + bitmap_free(rf->allocated_ws_nodes); rf->allocated_ws_nodes = NULL; } kfree(rf->ceqlist); @@ -1972,9 +1962,8 @@ u32 irdma_initialize_hw_rsrc(struct irdma_pci_f *rf) u32 ret; if (rf->rdma_ver != IRDMA_GEN_1) { - rf->allocated_ws_nodes = - kcalloc(BITS_TO_LONGS(IRDMA_MAX_WS_NODES), - sizeof(unsigned long), GFP_KERNEL); + rf->allocated_ws_nodes = bitmap_zalloc(IRDMA_MAX_WS_NODES, + GFP_KERNEL); if (!rf->allocated_ws_nodes) return -ENOMEM; @@ -2023,7 +2012,7 @@ u32 irdma_initialize_hw_rsrc(struct irdma_pci_f *rf) return 0; mem_rsrc_kzalloc_fail: - kfree(rf->allocated_ws_nodes); + bitmap_free(rf->allocated_ws_nodes); rf->allocated_ws_nodes = NULL; return ret; diff --git a/drivers/infiniband/hw/irdma/main.h b/drivers/infiniband/hw/irdma/main.h index ef862bced20f..65e966ad3453 100644 --- a/drivers/infiniband/hw/irdma/main.h +++ b/drivers/infiniband/hw/irdma/main.h @@ -85,7 +85,7 @@ extern struct auxiliary_driver i40iw_auxiliary_drv; #define IRDMA_NO_QSET 0xffff #define IW_CFG_FPM_QP_COUNT 32768 -#define IRDMA_MAX_PAGES_PER_FMR 512 +#define IRDMA_MAX_PAGES_PER_FMR 262144 #define IRDMA_MIN_PAGES_PER_FMR 1 #define IRDMA_CQP_COMPL_RQ_WQE_FLUSHED 2 #define IRDMA_CQP_COMPL_SQ_WQE_FLUSHED 3 diff --git a/drivers/infiniband/hw/irdma/utils.c b/drivers/infiniband/hw/irdma/utils.c index ab3c5208a123..fdf4cc88cb91 100644 --- a/drivers/infiniband/hw/irdma/utils.c +++ b/drivers/infiniband/hw/irdma/utils.c @@ -652,6 +652,7 @@ static const char *const irdma_cqp_cmd_names[IRDMA_MAX_CQP_OPS] = { }; static const struct irdma_cqp_err_info irdma_noncrit_err_list[] = { + {0xffff, 0x8002, "Invalid State"}, {0xffff, 0x8006, "Flush No Wqe Pending"}, {0xffff, 0x8007, "Modify QP Bad Close"}, {0xffff, 0x8009, "LLP Closed"}, diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 96135a228f26..9b07b8af2997 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -1776,11 +1776,11 @@ static int irdma_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata) spin_unlock_irqrestore(&iwcq->lock, flags); irdma_cq_wq_destroy(iwdev->rf, cq); - irdma_cq_free_rsrc(iwdev->rf, iwcq); spin_lock_irqsave(&iwceq->ce_lock, flags); irdma_sc_cleanup_ceqes(cq, ceq); spin_unlock_irqrestore(&iwceq->ce_lock, flags); + irdma_cq_free_rsrc(iwdev->rf, iwcq); return 0; } @@ -2605,7 +2605,7 @@ static struct ib_mr *irdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, palloc = &iwpbl->pble_alloc; iwmr->page_cnt = max_num_sg; err_code = irdma_get_pble(iwdev->rf->pble_rsrc, palloc, iwmr->page_cnt, - true); + false); if (err_code) goto err_get_pble; @@ -2641,8 +2641,16 @@ static int irdma_set_page(struct ib_mr *ibmr, u64 addr) if (unlikely(iwmr->npages == iwmr->page_cnt)) return -ENOMEM; - pbl = palloc->level1.addr; - pbl[iwmr->npages++] = addr; + if (palloc->level == PBLE_LEVEL_2) { + struct irdma_pble_info *palloc_info = + palloc->level2.leaf + (iwmr->npages >> PBLE_512_SHIFT); + + palloc_info->addr[iwmr->npages & (PBLE_PER_PAGE - 1)] = addr; + } else { + pbl = palloc->level1.addr; + pbl[iwmr->npages] = addr; + } + iwmr->npages++; return 0; } diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 08371a80fdc2..be189e0525de 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -523,6 +523,10 @@ repoll: "Requestor" : "Responder", cq->mcq.cqn); mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n", err_cqe->syndrome, err_cqe->vendor_err_synd); + if (wc->status != IB_WC_WR_FLUSH_ERR && + (*cur_qp)->type == MLX5_IB_QPT_REG_UMR) + dev->umrc.state = MLX5_UMR_STATE_RECOVER; + if (opcode == MLX5_CQE_REQ_ERR) { wq = &(*cur_qp)->sq; wqe_ctr = be16_to_cpu(cqe64->wqe_counter); diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c index 39ffb363ba0c..490ec308e309 100644 --- a/drivers/infiniband/hw/mlx5/fs.c +++ b/drivers/infiniband/hw/mlx5/fs.c @@ -679,7 +679,15 @@ enum flow_table_type { #define MLX5_FS_MAX_TYPES 6 #define MLX5_FS_MAX_ENTRIES BIT(16) -static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns, +static bool mlx5_ib_shared_ft_allowed(struct ib_device *device) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + + return MLX5_CAP_GEN(dev->mdev, shared_object_to_user_object_allowed); +} + +static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_ib_dev *dev, + struct mlx5_flow_namespace *ns, struct mlx5_ib_flow_prio *prio, int priority, int num_entries, int num_groups, @@ -688,6 +696,8 @@ static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns, struct mlx5_flow_table_attr ft_attr = {}; struct mlx5_flow_table *ft; + if (mlx5_ib_shared_ft_allowed(&dev->ib_dev)) + ft_attr.uid = MLX5_SHARED_RESOURCE_UID; ft_attr.prio = priority; ft_attr.max_fte = num_entries; ft_attr.flags = flags; @@ -784,8 +794,8 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, ft = prio->flow_table; if (!ft) - return _get_prio(ns, prio, priority, max_table_size, num_groups, - flags); + return _get_prio(dev, ns, prio, priority, max_table_size, + num_groups, flags); return prio; } @@ -927,7 +937,7 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, prio = &dev->flow_db->opfcs[type]; if (!prio->flow_table) { - prio = _get_prio(ns, prio, priority, + prio = _get_prio(dev, ns, prio, priority, dev->num_ports * MAX_OPFC_RULES, 1, 0); if (IS_ERR(prio)) { err = PTR_ERR(prio); @@ -1407,8 +1417,8 @@ free_ucmd: } static struct mlx5_ib_flow_prio * -_get_flow_table(struct mlx5_ib_dev *dev, - struct mlx5_ib_flow_matcher *fs_matcher, +_get_flow_table(struct mlx5_ib_dev *dev, u16 user_priority, + enum mlx5_flow_namespace_type ns_type, bool mcast) { struct mlx5_flow_namespace *ns = NULL; @@ -1421,11 +1431,11 @@ _get_flow_table(struct mlx5_ib_dev *dev, if (mcast) priority = MLX5_IB_FLOW_MCAST_PRIO; else - priority = ib_prio_to_core_prio(fs_matcher->priority, false); + priority = ib_prio_to_core_prio(user_priority, false); esw_encap = mlx5_eswitch_get_encap_mode(dev->mdev) != DEVLINK_ESWITCH_ENCAP_MODE_NONE; - switch (fs_matcher->ns_type) { + switch (ns_type) { case MLX5_FLOW_NAMESPACE_BYPASS: max_table_size = BIT( MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, log_max_ft_size)); @@ -1452,17 +1462,17 @@ _get_flow_table(struct mlx5_ib_dev *dev, reformat_l3_tunnel_to_l2) && esw_encap) flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; - priority = fs_matcher->priority; + priority = user_priority; break; case MLX5_FLOW_NAMESPACE_RDMA_RX: max_table_size = BIT( MLX5_CAP_FLOWTABLE_RDMA_RX(dev->mdev, log_max_ft_size)); - priority = fs_matcher->priority; + priority = user_priority; break; case MLX5_FLOW_NAMESPACE_RDMA_TX: max_table_size = BIT( MLX5_CAP_FLOWTABLE_RDMA_TX(dev->mdev, log_max_ft_size)); - priority = fs_matcher->priority; + priority = user_priority; break; default: break; @@ -1470,11 +1480,11 @@ _get_flow_table(struct mlx5_ib_dev *dev, max_table_size = min_t(int, max_table_size, MLX5_FS_MAX_ENTRIES); - ns = mlx5_get_flow_namespace(dev->mdev, fs_matcher->ns_type); + ns = mlx5_get_flow_namespace(dev->mdev, ns_type); if (!ns) return ERR_PTR(-EOPNOTSUPP); - switch (fs_matcher->ns_type) { + switch (ns_type) { case MLX5_FLOW_NAMESPACE_BYPASS: prio = &dev->flow_db->prios[priority]; break; @@ -1499,7 +1509,7 @@ _get_flow_table(struct mlx5_ib_dev *dev, if (prio->flow_table) return prio; - return _get_prio(ns, prio, priority, max_table_size, + return _get_prio(dev, ns, prio, priority, max_table_size, MLX5_FS_MAX_TYPES, flags); } @@ -1618,7 +1628,8 @@ static struct mlx5_ib_flow_handler *raw_fs_rule_add( mcast = raw_fs_is_multicast(fs_matcher, cmd_in); mutex_lock(&dev->flow_db->lock); - ft_prio = _get_flow_table(dev, fs_matcher, mcast); + ft_prio = _get_flow_table(dev, fs_matcher->priority, + fs_matcher->ns_type, mcast); if (IS_ERR(ft_prio)) { err = PTR_ERR(ft_prio); goto unlock; @@ -2015,6 +2026,23 @@ static int flow_matcher_cleanup(struct ib_uobject *uobject, return 0; } +static int steering_anchor_cleanup(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_steering_anchor *obj = uobject->object; + + if (atomic_read(&obj->usecnt)) + return -EBUSY; + + mutex_lock(&obj->dev->flow_db->lock); + put_flow_table(obj->dev, obj->ft_prio, true); + mutex_unlock(&obj->dev->flow_db->lock); + + kfree(obj); + return 0; +} + static int mlx5_ib_matcher_ns(struct uverbs_attr_bundle *attrs, struct mlx5_ib_flow_matcher *obj) { @@ -2050,12 +2078,10 @@ static int mlx5_ib_matcher_ns(struct uverbs_attr_bundle *attrs, if (err) return err; - if (flags) { - mlx5_ib_ft_type_to_namespace( + if (flags) + return mlx5_ib_ft_type_to_namespace( MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX, &obj->ns_type); - return 0; - } } obj->ns_type = MLX5_FLOW_NAMESPACE_BYPASS; @@ -2121,6 +2147,75 @@ end: return err; } +static int UVERBS_HANDLER(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_STEERING_ANCHOR_CREATE_HANDLE); + struct mlx5_ib_dev *dev = mlx5_udata_to_mdev(&attrs->driver_udata); + enum mlx5_ib_uapi_flow_table_type ib_uapi_ft_type; + enum mlx5_flow_namespace_type ns_type; + struct mlx5_ib_steering_anchor *obj; + struct mlx5_ib_flow_prio *ft_prio; + u16 priority; + u32 ft_id; + int err; + + if (!capable(CAP_NET_RAW)) + return -EPERM; + + err = uverbs_get_const(&ib_uapi_ft_type, attrs, + MLX5_IB_ATTR_STEERING_ANCHOR_FT_TYPE); + if (err) + return err; + + err = mlx5_ib_ft_type_to_namespace(ib_uapi_ft_type, &ns_type); + if (err) + return err; + + err = uverbs_copy_from(&priority, attrs, + MLX5_IB_ATTR_STEERING_ANCHOR_PRIORITY); + if (err) + return err; + + obj = kzalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; + + mutex_lock(&dev->flow_db->lock); + ft_prio = _get_flow_table(dev, priority, ns_type, 0); + if (IS_ERR(ft_prio)) { + mutex_unlock(&dev->flow_db->lock); + err = PTR_ERR(ft_prio); + goto free_obj; + } + + ft_prio->refcount++; + ft_id = mlx5_flow_table_id(ft_prio->flow_table); + mutex_unlock(&dev->flow_db->lock); + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_STEERING_ANCHOR_FT_ID, + &ft_id, sizeof(ft_id)); + if (err) + goto put_flow_table; + + uobj->object = obj; + obj->dev = dev; + obj->ft_prio = ft_prio; + atomic_set(&obj->usecnt, 0); + + return 0; + +put_flow_table: + mutex_lock(&dev->flow_db->lock); + put_flow_table(dev, ft_prio, true); + mutex_unlock(&dev->flow_db->lock); +free_obj: + kfree(obj); + + return err; +} + static struct ib_flow_action * mlx5_ib_create_modify_header(struct mlx5_ib_dev *dev, enum mlx5_ib_uapi_flow_table_type ft_type, @@ -2477,6 +2572,35 @@ DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_FLOW_MATCHER, &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_CREATE), &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_DESTROY)); +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_STEERING_ANCHOR_CREATE, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_STEERING_ANCHOR_CREATE_HANDLE, + MLX5_IB_OBJECT_STEERING_ANCHOR, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_STEERING_ANCHOR_FT_TYPE, + enum mlx5_ib_uapi_flow_table_type, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_STEERING_ANCHOR_PRIORITY, + UVERBS_ATTR_TYPE(u16), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_STEERING_ANCHOR_FT_ID, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + MLX5_IB_METHOD_STEERING_ANCHOR_DESTROY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_STEERING_ANCHOR_DESTROY_HANDLE, + MLX5_IB_OBJECT_STEERING_ANCHOR, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT( + MLX5_IB_OBJECT_STEERING_ANCHOR, + UVERBS_TYPE_ALLOC_IDR(steering_anchor_cleanup), + &UVERBS_METHOD(MLX5_IB_METHOD_STEERING_ANCHOR_CREATE), + &UVERBS_METHOD(MLX5_IB_METHOD_STEERING_ANCHOR_DESTROY)); + const struct uapi_definition mlx5_ib_flow_defs[] = { UAPI_DEF_CHAIN_OBJ_TREE_NAMED( MLX5_IB_OBJECT_FLOW_MATCHER), @@ -2485,6 +2609,9 @@ const struct uapi_definition mlx5_ib_flow_defs[] = { &mlx5_ib_fs), UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION, &mlx5_ib_flow_actions), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED( + MLX5_IB_OBJECT_STEERING_ANCHOR, + UAPI_DEF_IS_OBJ_SUPPORTED(mlx5_ib_shared_ft_allowed)), {}, }; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index b68fddeac0f1..fc94a1b25485 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2738,26 +2738,24 @@ static int set_has_smi_cap(struct mlx5_ib_dev *dev) int err; int port; - for (port = 1; port <= ARRAY_SIZE(dev->port_caps); port++) { - dev->port_caps[port - 1].has_smi = false; - if (MLX5_CAP_GEN(dev->mdev, port_type) == - MLX5_CAP_PORT_TYPE_IB) { - if (MLX5_CAP_GEN(dev->mdev, ib_virt)) { - err = mlx5_query_hca_vport_context(dev->mdev, 0, - port, 0, - &vport_ctx); - if (err) { - mlx5_ib_err(dev, "query_hca_vport_context for port=%d failed %d\n", - port, err); - return err; - } - dev->port_caps[port - 1].has_smi = - vport_ctx.has_smi; - } else { - dev->port_caps[port - 1].has_smi = true; - } + if (MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_IB) + return 0; + + for (port = 1; port <= dev->num_ports; port++) { + if (!MLX5_CAP_GEN(dev->mdev, ib_virt)) { + dev->port_caps[port - 1].has_smi = true; + continue; } + err = mlx5_query_hca_vport_context(dev->mdev, 0, port, 0, + &vport_ctx); + if (err) { + mlx5_ib_err(dev, "query_hca_vport_context for port=%d failed %d\n", + port, err); + return err; + } + dev->port_caps[port - 1].has_smi = vport_ctx.has_smi; } + return 0; } @@ -4002,7 +4000,7 @@ static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev) { int err; - err = mlx5_mr_cache_cleanup(dev); + err = mlx5_mkey_cache_cleanup(dev); if (err) mlx5_ib_warn(dev, "mr cache cleanup failed\n"); @@ -4022,7 +4020,7 @@ static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev) if (ret) return ret; - ret = mlx5_mr_cache_init(dev); + ret = mlx5_mkey_cache_init(dev); if (ret) { mlx5_ib_warn(dev, "mr cache init failed %d\n", ret); mlx5r_umr_resource_cleanup(dev); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 998b67509a53..2e2ad3918385 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -259,6 +259,12 @@ struct mlx5_ib_flow_matcher { u8 match_criteria_enable; }; +struct mlx5_ib_steering_anchor { + struct mlx5_ib_flow_prio *ft_prio; + struct mlx5_ib_dev *dev; + atomic_t usecnt; +}; + struct mlx5_ib_pp { u16 index; struct mlx5_core_dev *mdev; @@ -613,6 +619,7 @@ struct mlx5_ib_mkey { unsigned int ndescs; struct wait_queue_head wait; refcount_t usecount; + struct mlx5_cache_ent *cache_ent; }; #define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE) @@ -635,20 +642,9 @@ struct mlx5_ib_mr { struct ib_mr ibmr; struct mlx5_ib_mkey mmkey; - /* User MR data */ - struct mlx5_cache_ent *cache_ent; - /* Everything after cache_ent is zero'd when MR allocated */ struct ib_umem *umem; union { - /* Used only while the MR is in the cache */ - struct { - u32 out[MLX5_ST_SZ_DW(create_mkey_out)]; - struct mlx5_async_work cb_work; - /* Cache list element */ - struct list_head list; - }; - /* Used only by kernel MRs (umem == NULL) */ struct { void *descs; @@ -688,12 +684,6 @@ struct mlx5_ib_mr { }; }; -/* Zero the fields in the mr that are variant depending on usage */ -static inline void mlx5_clear_mr(struct mlx5_ib_mr *mr) -{ - memset_after(mr, 0, cache_ent); -} - static inline bool is_odp_mr(struct mlx5_ib_mr *mr) { return IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && mr->umem && @@ -717,21 +707,29 @@ struct mlx5_ib_umr_context { struct completion done; }; +enum { + MLX5_UMR_STATE_ACTIVE, + MLX5_UMR_STATE_RECOVER, + MLX5_UMR_STATE_ERR, +}; + struct umr_common { struct ib_pd *pd; struct ib_cq *cq; struct ib_qp *qp; - /* control access to UMR QP + /* Protects from UMR QP overflow */ struct semaphore sem; + /* Protects from using UMR while the UMR is not active + */ + struct mutex lock; + unsigned int state; }; struct mlx5_cache_ent { - struct list_head head; - /* sync access to the cahce entry - */ - spinlock_t lock; - + struct xarray mkeys; + unsigned long stored; + unsigned long reserved; char name[4]; u32 order; @@ -743,18 +741,11 @@ struct mlx5_cache_ent { u8 fill_to_high_water:1; /* - * - available_mrs is the length of list head, ie the number of MRs - * available for immediate allocation. - * - total_mrs is available_mrs plus all in use MRs that could be - * returned to the cache. - * - limit is the low water mark for available_mrs, 2* limit is the + * - limit is the low water mark for stored mkeys, 2* limit is the * upper water mark. - * - pending is the number of MRs currently being created */ - u32 total_mrs; - u32 available_mrs; + u32 in_use; u32 limit; - u32 pending; /* Statistics */ u32 miss; @@ -763,9 +754,19 @@ struct mlx5_cache_ent { struct delayed_work dwork; }; -struct mlx5_mr_cache { +struct mlx5r_async_create_mkey { + union { + u32 in[MLX5_ST_SZ_BYTES(create_mkey_in)]; + u32 out[MLX5_ST_SZ_DW(create_mkey_out)]; + }; + struct mlx5_async_work cb_work; + struct mlx5_cache_ent *ent; + u32 mkey; +}; + +struct mlx5_mkey_cache { struct workqueue_struct *wq; - struct mlx5_cache_ent ent[MAX_MR_CACHE_ENTRIES]; + struct mlx5_cache_ent ent[MAX_MKEY_CACHE_ENTRIES]; struct dentry *root; unsigned long last_add; }; @@ -1064,7 +1065,7 @@ struct mlx5_ib_dev { struct mlx5_ib_resources devr; atomic_t mkey_var; - struct mlx5_mr_cache cache; + struct mlx5_mkey_cache cache; struct timer_list delay_timer; /* Prevents soft lock on massive reg MRs */ struct mutex slow_path_mutex; @@ -1309,8 +1310,8 @@ void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas, u64 access_flags); void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); int mlx5_ib_get_cqe_size(struct ib_cq *ibcq); -int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); -int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); +int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev); +int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev); struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent, @@ -1338,7 +1339,7 @@ int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq); void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev); int __init mlx5_ib_odp_init(void); void mlx5_ib_odp_cleanup(void); -void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent); +void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent); void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, struct mlx5_ib_mr *mr, int flags); @@ -1357,7 +1358,7 @@ static inline int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, static inline void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev) {} static inline int mlx5_ib_odp_init(void) { return 0; } static inline void mlx5_ib_odp_cleanup(void) {} -static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {} +static inline void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent) {} static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries, struct mlx5_ib_mr *mr, int flags) {} diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index aedfd7ff4846..129d531bd01b 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -82,15 +82,14 @@ static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr, MLX5_SET64(mkc, mkc, start_addr, start_addr); } -static void assign_mkey_variant(struct mlx5_ib_dev *dev, - struct mlx5_ib_mkey *mkey, u32 *in) +static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in) { u8 key = atomic_inc_return(&dev->mkey_var); void *mkc; mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); MLX5_SET(mkc, mkc, mkey_7_0, key); - mkey->key = key; + *mkey = key; } static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, @@ -98,7 +97,7 @@ static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, { int ret; - assign_mkey_variant(dev, mkey, in); + assign_mkey_variant(dev, &mkey->key, in); ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen); if (!ret) init_waitqueue_head(&mkey->wait); @@ -106,20 +105,21 @@ static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, return ret; } -static int -mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev, - struct mlx5_ib_mkey *mkey, - struct mlx5_async_ctx *async_ctx, - u32 *in, int inlen, u32 *out, int outlen, - struct mlx5_async_work *context) +static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create) { - MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY); - assign_mkey_variant(dev, mkey, in); - return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen, - create_mkey_callback, context); + struct mlx5_ib_dev *dev = async_create->ent->dev; + size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out); + + MLX5_SET(create_mkey_in, async_create->in, opcode, + MLX5_CMD_OP_CREATE_MKEY); + assign_mkey_variant(dev, &async_create->mkey, async_create->in); + return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen, + async_create->out, outlen, create_mkey_callback, + &async_create->cb_work); } -static int mr_cache_max_order(struct mlx5_ib_dev *dev); +static int mkey_cache_max_order(struct mlx5_ib_dev *dev); static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent); static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) @@ -142,40 +142,132 @@ static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out) mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out); } + +static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings, + void *to_store) +{ + XA_STATE(xas, &ent->mkeys, 0); + void *curr; + + xa_lock_irq(&ent->mkeys); + if (limit_pendings && + (ent->reserved - ent->stored) > MAX_PENDING_REG_MR) { + xa_unlock_irq(&ent->mkeys); + return -EAGAIN; + } + while (1) { + /* + * This is cmpxchg (NULL, XA_ZERO_ENTRY) however this version + * doesn't transparently unlock. Instead we set the xas index to + * the current value of reserved every iteration. + */ + xas_set(&xas, ent->reserved); + curr = xas_load(&xas); + if (!curr) { + if (to_store && ent->stored == ent->reserved) + xas_store(&xas, to_store); + else + xas_store(&xas, XA_ZERO_ENTRY); + if (xas_valid(&xas)) { + ent->reserved++; + if (to_store) { + if (ent->stored != ent->reserved) + __xa_store(&ent->mkeys, + ent->stored, + to_store, + GFP_KERNEL); + ent->stored++; + queue_adjust_cache_locked(ent); + WRITE_ONCE(ent->dev->cache.last_add, + jiffies); + } + } + } + xa_unlock_irq(&ent->mkeys); + + /* + * Notice xas_nomem() must always be called as it cleans + * up any cached allocation. + */ + if (!xas_nomem(&xas, GFP_KERNEL)) + break; + xa_lock_irq(&ent->mkeys); + } + if (xas_error(&xas)) + return xas_error(&xas); + if (WARN_ON(curr)) + return -EINVAL; + return 0; +} + +static void undo_push_reserve_mkey(struct mlx5_cache_ent *ent) +{ + void *old; + + ent->reserved--; + old = __xa_erase(&ent->mkeys, ent->reserved); + WARN_ON(old); +} + +static void push_to_reserved(struct mlx5_cache_ent *ent, u32 mkey) +{ + void *old; + + old = __xa_store(&ent->mkeys, ent->stored, xa_mk_value(mkey), 0); + WARN_ON(old); + ent->stored++; +} + +static u32 pop_stored_mkey(struct mlx5_cache_ent *ent) +{ + void *old, *xa_mkey; + + ent->stored--; + ent->reserved--; + + if (ent->stored == ent->reserved) { + xa_mkey = __xa_erase(&ent->mkeys, ent->stored); + WARN_ON(!xa_mkey); + return (u32)xa_to_value(xa_mkey); + } + + xa_mkey = __xa_store(&ent->mkeys, ent->stored, XA_ZERO_ENTRY, + GFP_KERNEL); + WARN_ON(!xa_mkey || xa_is_err(xa_mkey)); + old = __xa_erase(&ent->mkeys, ent->reserved); + WARN_ON(old); + return (u32)xa_to_value(xa_mkey); +} + static void create_mkey_callback(int status, struct mlx5_async_work *context) { - struct mlx5_ib_mr *mr = - container_of(context, struct mlx5_ib_mr, cb_work); - struct mlx5_cache_ent *ent = mr->cache_ent; + struct mlx5r_async_create_mkey *mkey_out = + container_of(context, struct mlx5r_async_create_mkey, cb_work); + struct mlx5_cache_ent *ent = mkey_out->ent; struct mlx5_ib_dev *dev = ent->dev; unsigned long flags; if (status) { - create_mkey_warn(dev, status, mr->out); - kfree(mr); - spin_lock_irqsave(&ent->lock, flags); - ent->pending--; + create_mkey_warn(dev, status, mkey_out->out); + kfree(mkey_out); + xa_lock_irqsave(&ent->mkeys, flags); + undo_push_reserve_mkey(ent); WRITE_ONCE(dev->fill_delay, 1); - spin_unlock_irqrestore(&ent->lock, flags); + xa_unlock_irqrestore(&ent->mkeys, flags); mod_timer(&dev->delay_timer, jiffies + HZ); return; } - mr->mmkey.type = MLX5_MKEY_MR; - mr->mmkey.key |= mlx5_idx_to_mkey( - MLX5_GET(create_mkey_out, mr->out, mkey_index)); - init_waitqueue_head(&mr->mmkey.wait); - + mkey_out->mkey |= mlx5_idx_to_mkey( + MLX5_GET(create_mkey_out, mkey_out->out, mkey_index)); WRITE_ONCE(dev->cache.last_add, jiffies); - spin_lock_irqsave(&ent->lock, flags); - list_add_tail(&mr->list, &ent->head); - ent->available_mrs++; - ent->total_mrs++; + xa_lock_irqsave(&ent->mkeys, flags); + push_to_reserved(ent, mkey_out->mkey); /* If we are doing fill_to_high_water then keep going. */ queue_adjust_cache_locked(ent); - ent->pending--; - spin_unlock_irqrestore(&ent->lock, flags); + xa_unlock_irqrestore(&ent->mkeys, flags); + kfree(mkey_out); } static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs) @@ -197,15 +289,8 @@ static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs) return ret; } -static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc) +static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc) { - struct mlx5_ib_mr *mr; - - mr = kzalloc(sizeof(*mr), GFP_KERNEL); - if (!mr) - return NULL; - mr->cache_ent = ent; - set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd); MLX5_SET(mkc, mkc, free, 1); MLX5_SET(mkc, mkc, umr_en, 1); @@ -215,133 +300,106 @@ static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc) MLX5_SET(mkc, mkc, translations_octword_size, get_mkc_octo_size(ent->access_mode, ent->ndescs)); MLX5_SET(mkc, mkc, log_page_size, ent->page); - return mr; } /* Asynchronously schedule new MRs to be populated in the cache. */ static int add_keys(struct mlx5_cache_ent *ent, unsigned int num) { - size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); - struct mlx5_ib_mr *mr; + struct mlx5r_async_create_mkey *async_create; void *mkc; - u32 *in; int err = 0; int i; - in = kzalloc(inlen, GFP_KERNEL); - if (!in) - return -ENOMEM; - - mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); for (i = 0; i < num; i++) { - mr = alloc_cache_mr(ent, mkc); - if (!mr) { - err = -ENOMEM; - break; - } - spin_lock_irq(&ent->lock); - if (ent->pending >= MAX_PENDING_REG_MR) { - err = -EAGAIN; - spin_unlock_irq(&ent->lock); - kfree(mr); - break; - } - ent->pending++; - spin_unlock_irq(&ent->lock); - err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey, - &ent->dev->async_ctx, in, inlen, - mr->out, sizeof(mr->out), - &mr->cb_work); + async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey), + GFP_KERNEL); + if (!async_create) + return -ENOMEM; + mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in, + memory_key_mkey_entry); + set_cache_mkc(ent, mkc); + async_create->ent = ent; + + err = push_mkey(ent, true, NULL); + if (err) + goto free_async_create; + + err = mlx5_ib_create_mkey_cb(async_create); if (err) { - spin_lock_irq(&ent->lock); - ent->pending--; - spin_unlock_irq(&ent->lock); mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err); - kfree(mr); - break; + goto err_undo_reserve; } } - kfree(in); + return 0; + +err_undo_reserve: + xa_lock_irq(&ent->mkeys); + undo_push_reserve_mkey(ent); + xa_unlock_irq(&ent->mkeys); +free_async_create: + kfree(async_create); return err; } /* Synchronously create a MR in the cache */ -static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent) +static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey) { size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); - struct mlx5_ib_mr *mr; void *mkc; u32 *in; int err; in = kzalloc(inlen, GFP_KERNEL); if (!in) - return ERR_PTR(-ENOMEM); + return -ENOMEM; mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + set_cache_mkc(ent, mkc); - mr = alloc_cache_mr(ent, mkc); - if (!mr) { - err = -ENOMEM; - goto free_in; - } - - err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey.key, in, inlen); + err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen); if (err) - goto free_mr; + goto free_in; - init_waitqueue_head(&mr->mmkey.wait); - mr->mmkey.type = MLX5_MKEY_MR; WRITE_ONCE(ent->dev->cache.last_add, jiffies); - spin_lock_irq(&ent->lock); - ent->total_mrs++; - spin_unlock_irq(&ent->lock); - kfree(in); - return mr; -free_mr: - kfree(mr); free_in: kfree(in); - return ERR_PTR(err); + return err; } static void remove_cache_mr_locked(struct mlx5_cache_ent *ent) { - struct mlx5_ib_mr *mr; + u32 mkey; - lockdep_assert_held(&ent->lock); - if (list_empty(&ent->head)) + lockdep_assert_held(&ent->mkeys.xa_lock); + if (!ent->stored) return; - mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); - list_del(&mr->list); - ent->available_mrs--; - ent->total_mrs--; - spin_unlock_irq(&ent->lock); - mlx5_core_destroy_mkey(ent->dev->mdev, mr->mmkey.key); - kfree(mr); - spin_lock_irq(&ent->lock); + mkey = pop_stored_mkey(ent); + xa_unlock_irq(&ent->mkeys); + mlx5_core_destroy_mkey(ent->dev->mdev, mkey); + xa_lock_irq(&ent->mkeys); } static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target, bool limit_fill) + __acquires(&ent->mkeys) __releases(&ent->mkeys) { int err; - lockdep_assert_held(&ent->lock); + lockdep_assert_held(&ent->mkeys.xa_lock); while (true) { if (limit_fill) target = ent->limit * 2; - if (target == ent->available_mrs + ent->pending) + if (target == ent->reserved) return 0; - if (target > ent->available_mrs + ent->pending) { - u32 todo = target - (ent->available_mrs + ent->pending); + if (target > ent->reserved) { + u32 todo = target - ent->reserved; - spin_unlock_irq(&ent->lock); + xa_unlock_irq(&ent->mkeys); err = add_keys(ent, todo); if (err == -EAGAIN) usleep_range(3000, 5000); - spin_lock_irq(&ent->lock); + xa_lock_irq(&ent->mkeys); if (err) { if (err != -EAGAIN) return err; @@ -366,15 +424,15 @@ static ssize_t size_write(struct file *filp, const char __user *buf, /* * Target is the new value of total_mrs the user requests, however we - * cannot free MRs that are in use. Compute the target value for - * available_mrs. + * cannot free MRs that are in use. Compute the target value for stored + * mkeys. */ - spin_lock_irq(&ent->lock); - if (target < ent->total_mrs - ent->available_mrs) { + xa_lock_irq(&ent->mkeys); + if (target < ent->in_use) { err = -EINVAL; goto err_unlock; } - target = target - (ent->total_mrs - ent->available_mrs); + target = target - ent->in_use; if (target < ent->limit || target > ent->limit*2) { err = -EINVAL; goto err_unlock; @@ -382,12 +440,12 @@ static ssize_t size_write(struct file *filp, const char __user *buf, err = resize_available_mrs(ent, target, false); if (err) goto err_unlock; - spin_unlock_irq(&ent->lock); + xa_unlock_irq(&ent->mkeys); return count; err_unlock: - spin_unlock_irq(&ent->lock); + xa_unlock_irq(&ent->mkeys); return err; } @@ -398,7 +456,7 @@ static ssize_t size_read(struct file *filp, char __user *buf, size_t count, char lbuf[20]; int err; - err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs); + err = snprintf(lbuf, sizeof(lbuf), "%ld\n", ent->stored + ent->in_use); if (err < 0) return err; @@ -427,10 +485,10 @@ static ssize_t limit_write(struct file *filp, const char __user *buf, * Upon set we immediately fill the cache to high water mark implied by * the limit. */ - spin_lock_irq(&ent->lock); + xa_lock_irq(&ent->mkeys); ent->limit = var; err = resize_available_mrs(ent, 0, true); - spin_unlock_irq(&ent->lock); + xa_unlock_irq(&ent->mkeys); if (err) return err; return count; @@ -457,17 +515,17 @@ static const struct file_operations limit_fops = { .read = limit_read, }; -static bool someone_adding(struct mlx5_mr_cache *cache) +static bool someone_adding(struct mlx5_mkey_cache *cache) { unsigned int i; - for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { struct mlx5_cache_ent *ent = &cache->ent[i]; bool ret; - spin_lock_irq(&ent->lock); - ret = ent->available_mrs < ent->limit; - spin_unlock_irq(&ent->lock); + xa_lock_irq(&ent->mkeys); + ret = ent->stored < ent->limit; + xa_unlock_irq(&ent->mkeys); if (ret) return true; } @@ -481,26 +539,26 @@ static bool someone_adding(struct mlx5_mr_cache *cache) */ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) { - lockdep_assert_held(&ent->lock); + lockdep_assert_held(&ent->mkeys.xa_lock); if (ent->disabled || READ_ONCE(ent->dev->fill_delay)) return; - if (ent->available_mrs < ent->limit) { + if (ent->stored < ent->limit) { ent->fill_to_high_water = true; mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); } else if (ent->fill_to_high_water && - ent->available_mrs + ent->pending < 2 * ent->limit) { + ent->reserved < 2 * ent->limit) { /* * Once we start populating due to hitting a low water mark * continue until we pass the high water mark. */ mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); - } else if (ent->available_mrs == 2 * ent->limit) { + } else if (ent->stored == 2 * ent->limit) { ent->fill_to_high_water = false; - } else if (ent->available_mrs > 2 * ent->limit) { + } else if (ent->stored > 2 * ent->limit) { /* Queue deletion of excess entries */ ent->fill_to_high_water = false; - if (ent->pending) + if (ent->stored != ent->reserved) queue_delayed_work(ent->dev->cache.wq, &ent->dwork, msecs_to_jiffies(1000)); else @@ -511,25 +569,24 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) static void __cache_work_func(struct mlx5_cache_ent *ent) { struct mlx5_ib_dev *dev = ent->dev; - struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_mkey_cache *cache = &dev->cache; int err; - spin_lock_irq(&ent->lock); + xa_lock_irq(&ent->mkeys); if (ent->disabled) goto out; - if (ent->fill_to_high_water && - ent->available_mrs + ent->pending < 2 * ent->limit && + if (ent->fill_to_high_water && ent->reserved < 2 * ent->limit && !READ_ONCE(dev->fill_delay)) { - spin_unlock_irq(&ent->lock); + xa_unlock_irq(&ent->mkeys); err = add_keys(ent, 1); - spin_lock_irq(&ent->lock); + xa_lock_irq(&ent->mkeys); if (ent->disabled) goto out; if (err) { /* - * EAGAIN only happens if pending is positive, so we - * will be rescheduled from reg_mr_callback(). The only + * EAGAIN only happens if there are pending MRs, so we + * will be rescheduled when storing them. The only * failure path here is ENOMEM. */ if (err != -EAGAIN) { @@ -541,7 +598,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent) msecs_to_jiffies(1000)); } } - } else if (ent->available_mrs > 2 * ent->limit) { + } else if (ent->stored > 2 * ent->limit) { bool need_delay; /* @@ -556,11 +613,11 @@ static void __cache_work_func(struct mlx5_cache_ent *ent) * the garbage collection work to try to run in next cycle, in * order to free CPU resources to other tasks. */ - spin_unlock_irq(&ent->lock); + xa_unlock_irq(&ent->mkeys); need_delay = need_resched() || someone_adding(cache) || !time_after(jiffies, READ_ONCE(cache->last_add) + 300 * HZ); - spin_lock_irq(&ent->lock); + xa_lock_irq(&ent->mkeys); if (ent->disabled) goto out; if (need_delay) { @@ -571,7 +628,7 @@ static void __cache_work_func(struct mlx5_cache_ent *ent) queue_adjust_cache_locked(ent); } out: - spin_unlock_irq(&ent->lock); + xa_unlock_irq(&ent->mkeys); } static void delayed_cache_work_func(struct work_struct *work) @@ -587,73 +644,59 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int access_flags) { struct mlx5_ib_mr *mr; + int err; - /* Matches access in alloc_cache_mr() */ if (!mlx5r_umr_can_reconfig(dev, 0, access_flags)) return ERR_PTR(-EOPNOTSUPP); - spin_lock_irq(&ent->lock); - if (list_empty(&ent->head)) { + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + xa_lock_irq(&ent->mkeys); + ent->in_use++; + + if (!ent->stored) { queue_adjust_cache_locked(ent); ent->miss++; - spin_unlock_irq(&ent->lock); - mr = create_cache_mr(ent); - if (IS_ERR(mr)) - return mr; + xa_unlock_irq(&ent->mkeys); + err = create_cache_mkey(ent, &mr->mmkey.key); + if (err) { + xa_lock_irq(&ent->mkeys); + ent->in_use--; + xa_unlock_irq(&ent->mkeys); + kfree(mr); + return ERR_PTR(err); + } } else { - mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); - list_del(&mr->list); - ent->available_mrs--; + mr->mmkey.key = pop_stored_mkey(ent); queue_adjust_cache_locked(ent); - spin_unlock_irq(&ent->lock); - - mlx5_clear_mr(mr); + xa_unlock_irq(&ent->mkeys); } + mr->mmkey.cache_ent = ent; + mr->mmkey.type = MLX5_MKEY_MR; + init_waitqueue_head(&mr->mmkey.wait); return mr; } -static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) -{ - struct mlx5_cache_ent *ent = mr->cache_ent; - - WRITE_ONCE(dev->cache.last_add, jiffies); - spin_lock_irq(&ent->lock); - list_add_tail(&mr->list, &ent->head); - ent->available_mrs++; - queue_adjust_cache_locked(ent); - spin_unlock_irq(&ent->lock); -} - static void clean_keys(struct mlx5_ib_dev *dev, int c) { - struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_mkey_cache *cache = &dev->cache; struct mlx5_cache_ent *ent = &cache->ent[c]; - struct mlx5_ib_mr *tmp_mr; - struct mlx5_ib_mr *mr; - LIST_HEAD(del_list); + u32 mkey; cancel_delayed_work(&ent->dwork); - while (1) { - spin_lock_irq(&ent->lock); - if (list_empty(&ent->head)) { - spin_unlock_irq(&ent->lock); - break; - } - mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); - list_move(&mr->list, &del_list); - ent->available_mrs--; - ent->total_mrs--; - spin_unlock_irq(&ent->lock); - mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key); - } - - list_for_each_entry_safe(mr, tmp_mr, &del_list, list) { - list_del(&mr->list); - kfree(mr); + xa_lock_irq(&ent->mkeys); + while (ent->stored) { + mkey = pop_stored_mkey(ent); + xa_unlock_irq(&ent->mkeys); + mlx5_core_destroy_mkey(dev->mdev, mkey); + xa_lock_irq(&ent->mkeys); } + xa_unlock_irq(&ent->mkeys); } -static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) +static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) { if (!mlx5_debugfs_root || dev->is_rep) return; @@ -662,9 +705,9 @@ static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) dev->cache.root = NULL; } -static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev) +static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev) { - struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_mkey_cache *cache = &dev->cache; struct mlx5_cache_ent *ent; struct dentry *dir; int i; @@ -674,13 +717,13 @@ static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev) cache->root = debugfs_create_dir("mr_cache", mlx5_debugfs_get_dev_root(dev->mdev)); - for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { ent = &cache->ent[i]; sprintf(ent->name, "%d", ent->order); dir = debugfs_create_dir(ent->name, cache->root); debugfs_create_file("size", 0600, dir, ent, &size_fops); debugfs_create_file("limit", 0600, dir, ent, &limit_fops); - debugfs_create_u32("cur", 0400, dir, &ent->available_mrs); + debugfs_create_ulong("cur", 0400, dir, &ent->stored); debugfs_create_u32("miss", 0600, dir, &ent->miss); } } @@ -692,9 +735,9 @@ static void delay_time_func(struct timer_list *t) WRITE_ONCE(dev->fill_delay, 0); } -int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) +int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) { - struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_mkey_cache *cache = &dev->cache; struct mlx5_cache_ent *ent; int i; @@ -707,22 +750,21 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx); timer_setup(&dev->delay_timer, delay_time_func, 0); - for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { ent = &cache->ent[i]; - INIT_LIST_HEAD(&ent->head); - spin_lock_init(&ent->lock); + xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ); ent->order = i + 2; ent->dev = dev; ent->limit = 0; INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); - if (i > MR_CACHE_LAST_STD_ENTRY) { - mlx5_odp_init_mr_cache_entry(ent); + if (i > MKEY_CACHE_LAST_STD_ENTRY) { + mlx5_odp_init_mkey_cache_entry(ent); continue; } - if (ent->order > mr_cache_max_order(dev)) + if (ent->order > mkey_cache_max_order(dev)) continue; ent->page = PAGE_SHIFT; @@ -734,36 +776,36 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) ent->limit = dev->mdev->profile.mr_cache[i].limit; else ent->limit = 0; - spin_lock_irq(&ent->lock); + xa_lock_irq(&ent->mkeys); queue_adjust_cache_locked(ent); - spin_unlock_irq(&ent->lock); + xa_unlock_irq(&ent->mkeys); } - mlx5_mr_cache_debugfs_init(dev); + mlx5_mkey_cache_debugfs_init(dev); return 0; } -int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev) +int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev) { unsigned int i; if (!dev->cache.wq) return 0; - for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { struct mlx5_cache_ent *ent = &dev->cache.ent[i]; - spin_lock_irq(&ent->lock); + xa_lock_irq(&ent->mkeys); ent->disabled = true; - spin_unlock_irq(&ent->lock); + xa_unlock_irq(&ent->mkeys); cancel_delayed_work_sync(&ent->dwork); } - mlx5_mr_cache_debugfs_cleanup(dev); + mlx5_mkey_cache_debugfs_cleanup(dev); mlx5_cmd_cleanup_async_ctx(&dev->async_ctx); - for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) + for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) clean_keys(dev, i); destroy_workqueue(dev->cache.wq); @@ -830,22 +872,22 @@ static int get_octo_len(u64 addr, u64 len, int page_shift) return (npages + 1) / 2; } -static int mr_cache_max_order(struct mlx5_ib_dev *dev) +static int mkey_cache_max_order(struct mlx5_ib_dev *dev) { if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) - return MR_CACHE_LAST_STD_ENTRY + 2; + return MKEY_CACHE_LAST_STD_ENTRY + 2; return MLX5_MAX_UMR_SHIFT; } -static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev, - unsigned int order) +static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev, + unsigned int order) { - struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_mkey_cache *cache = &dev->cache; if (order < cache->ent[0].order) return &cache->ent[0]; order = order - cache->ent[0].order; - if (order > MR_CACHE_LAST_STD_ENTRY) + if (order > MKEY_CACHE_LAST_STD_ENTRY) return NULL; return &cache->ent[order]; } @@ -888,7 +930,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, 0, iova); if (WARN_ON(!page_size)) return ERR_PTR(-EINVAL); - ent = mr_cache_ent_from_order( + ent = mkey_cache_ent_from_order( dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size))); /* * Matches access in alloc_cache_mr(). If the MR can't come from the @@ -1320,7 +1362,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); /* We only track the allocated sizes of MRs from the cache */ - if (!mr->cache_ent) + if (!mr->mmkey.cache_ent) return false; if (!mlx5r_umr_can_load_pas(dev, new_umem->length)) return false; @@ -1329,7 +1371,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova); if (WARN_ON(!*page_size)) return false; - return (1ULL << mr->cache_ent->order) >= + return (1ULL << mr->mmkey.cache_ent->order) >= ib_umem_num_dma_blocks(new_umem, *page_size); } @@ -1570,15 +1612,17 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) } /* Stop DMA */ - if (mr->cache_ent) { - if (mlx5r_umr_revoke_mr(mr)) { - spin_lock_irq(&mr->cache_ent->lock); - mr->cache_ent->total_mrs--; - spin_unlock_irq(&mr->cache_ent->lock); - mr->cache_ent = NULL; - } + if (mr->mmkey.cache_ent) { + xa_lock_irq(&mr->mmkey.cache_ent->mkeys); + mr->mmkey.cache_ent->in_use--; + xa_unlock_irq(&mr->mmkey.cache_ent->mkeys); + + if (mlx5r_umr_revoke_mr(mr) || + push_mkey(mr->mmkey.cache_ent, false, + xa_mk_value(mr->mmkey.key))) + mr->mmkey.cache_ent = NULL; } - if (!mr->cache_ent) { + if (!mr->mmkey.cache_ent) { rc = destroy_mkey(to_mdev(mr->ibmr.device), mr); if (rc) return rc; @@ -1595,12 +1639,10 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) mlx5_ib_free_odp_mr(mr); } - if (mr->cache_ent) { - mlx5_mr_cache_free(dev, mr); - } else { + if (!mr->mmkey.cache_ent) mlx5_free_priv_descs(mr); - kfree(mr); - } + + kfree(mr); return 0; } diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 84da5674e1ab..e305bf1dc6c2 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -1588,7 +1588,7 @@ mlx5_ib_odp_destroy_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq) return err; } -void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) +void mlx5_odp_init_mkey_cache_entry(struct mlx5_cache_ent *ent) { if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) return; diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c index 3a48364c0918..e00b94d1b1ea 100644 --- a/drivers/infiniband/hw/mlx5/umr.c +++ b/drivers/infiniband/hw/mlx5/umr.c @@ -176,6 +176,7 @@ int mlx5r_umr_resource_init(struct mlx5_ib_dev *dev) dev->umrc.pd = pd; sema_init(&dev->umrc.sem, MAX_UMR_WR); + mutex_init(&dev->umrc.lock); return 0; @@ -195,6 +196,31 @@ void mlx5r_umr_resource_cleanup(struct mlx5_ib_dev *dev) ib_dealloc_pd(dev->umrc.pd); } +static int mlx5r_umr_recover(struct mlx5_ib_dev *dev) +{ + struct umr_common *umrc = &dev->umrc; + struct ib_qp_attr attr; + int err; + + attr.qp_state = IB_QPS_RESET; + err = ib_modify_qp(umrc->qp, &attr, IB_QP_STATE); + if (err) { + mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n"); + goto err; + } + + err = mlx5r_umr_qp_rst2rts(dev, umrc->qp); + if (err) + goto err; + + umrc->state = MLX5_UMR_STATE_ACTIVE; + return 0; + +err: + umrc->state = MLX5_UMR_STATE_ERR; + return err; +} + static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe, struct mlx5r_umr_wqe *wqe, bool with_data) { @@ -231,7 +257,7 @@ static int mlx5r_umr_post_send(struct ib_qp *ibqp, u32 mkey, struct ib_cqe *cqe, id.ib_cqe = cqe; mlx5r_finish_wqe(qp, ctrl, seg, size, cur_edge, idx, id.wr_id, 0, - MLX5_FENCE_MODE_NONE, MLX5_OPCODE_UMR); + MLX5_FENCE_MODE_INITIATOR_SMALL, MLX5_OPCODE_UMR); mlx5r_ring_db(qp, 1, ctrl); @@ -270,17 +296,49 @@ static int mlx5r_umr_post_send_wait(struct mlx5_ib_dev *dev, u32 mkey, mlx5r_umr_init_context(&umr_context); down(&umrc->sem); - err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe, - with_data); - if (err) - mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err); - else { - wait_for_completion(&umr_context.done); - if (umr_context.status != IB_WC_SUCCESS) { - mlx5_ib_warn(dev, "reg umr failed (%u)\n", - umr_context.status); + while (true) { + mutex_lock(&umrc->lock); + if (umrc->state == MLX5_UMR_STATE_ERR) { + mutex_unlock(&umrc->lock); err = -EFAULT; + break; + } + + if (umrc->state == MLX5_UMR_STATE_RECOVER) { + mutex_unlock(&umrc->lock); + usleep_range(3000, 5000); + continue; + } + + err = mlx5r_umr_post_send(umrc->qp, mkey, &umr_context.cqe, wqe, + with_data); + mutex_unlock(&umrc->lock); + if (err) { + mlx5_ib_warn(dev, "UMR post send failed, err %d\n", + err); + break; } + + wait_for_completion(&umr_context.done); + + if (umr_context.status == IB_WC_SUCCESS) + break; + + if (umr_context.status == IB_WC_WR_FLUSH_ERR) + continue; + + WARN_ON_ONCE(1); + mlx5_ib_warn(dev, + "reg umr failed (%u). Trying to recover and resubmit the flushed WQEs\n", + umr_context.status); + mutex_lock(&umrc->lock); + err = mlx5r_umr_recover(dev); + mutex_unlock(&umrc->lock); + if (err) + mlx5_ib_warn(dev, "couldn't recover UMR, err %d\n", + err); + err = -EFAULT; + break; } up(&umrc->sem); return err; diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 03ed7c0fae50..d745ce9dc88a 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -3084,7 +3084,7 @@ static struct qedr_mr *__qedr_alloc_mr(struct ib_pd *ibpd, else DP_ERR(dev, "roce alloc tid returned error %d\n", rc); - goto err0; + goto err1; } /* Index only, 18 bit long, lkey = itid << 8 | key */ @@ -3108,7 +3108,7 @@ static struct qedr_mr *__qedr_alloc_mr(struct ib_pd *ibpd, rc = dev->ops->rdma_register_tid(dev->rdma_ctx, &mr->hw_mr); if (rc) { DP_ERR(dev, "roce register tid returned an error %d\n", rc); - goto err1; + goto err2; } mr->ibmr.lkey = mr->hw_mr.itid << 8 | mr->hw_mr.key; @@ -3117,8 +3117,10 @@ static struct qedr_mr *__qedr_alloc_mr(struct ib_pd *ibpd, DP_DEBUG(dev, QEDR_MSG_MR, "alloc frmr: %x\n", mr->ibmr.lkey); return mr; -err1: +err2: dev->ops->rdma_free_tid(dev->rdma_ctx, mr->hw_mr.itid); +err1: + qedr_free_pbl(dev, &mr->info.pbl_info, mr->info.pbl_table); err0: kfree(mr); return ERR_PTR(rc); diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index b37b1c6d35c6..26c615772be3 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -321,7 +321,7 @@ struct qib_verbs_txreq { * These 7 values (SDR, DDR, and QDR may be ORed for auto-speed * negotiation) are used for the 3rd argument to path_f_set_ib_cfg * with cmd QIB_IB_CFG_SPD_ENB, by direct calls or via sysfs. They - * are also the the possible values for qib_link_speed_enabled and active + * are also the possible values for qib_link_speed_enabled and active * The values were chosen to match values used within the IB spec. */ #define QIB_IB_SDR 1 diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index aa290928cf96..3937144b2ae5 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -153,7 +153,7 @@ static int qib_get_base_info(struct file *fp, void __user *ubase, kinfo->spi_tidcnt += dd->rcvtidcnt % subctxt_cnt; /* * for this use, may be cfgctxts summed over all chips that - * are are configured and present + * are configured and present */ kinfo->spi_nctxts = dd->cfgctxts; /* unit (chip/board) our context is on */ @@ -851,7 +851,7 @@ static int mmap_rcvegrbufs(struct vm_area_struct *vma, ret = -EPERM; goto bail; } - /* don't allow them to later change to writeable with mprotect */ + /* don't allow them to later change to writable with mprotect */ vma->vm_flags &= ~VM_MAYWRITE; start = vma->vm_start; @@ -941,7 +941,7 @@ static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr, goto bail; } /* - * Don't allow permission to later change to writeable + * Don't allow permission to later change to writable * with mprotect. */ vma->vm_flags &= ~VM_MAYWRITE; diff --git a/drivers/infiniband/hw/qib/qib_iba7220.c b/drivers/infiniband/hw/qib/qib_iba7220.c index 37b628a162e0..6af57067c32e 100644 --- a/drivers/infiniband/hw/qib/qib_iba7220.c +++ b/drivers/infiniband/hw/qib/qib_iba7220.c @@ -58,7 +58,7 @@ static void qib_set_ib_7220_lstate(struct qib_pportdata *, u16, u16); /* * This file contains almost all the chip-specific register information and * access functions for the QLogic QLogic_IB 7220 PCI-Express chip, with the - * exception of SerDes support, which in in qib_sd7220.c. + * exception of SerDes support, which in qib_sd7220.c. */ /* Below uses machine-generated qib_chipnum_regs.h file */ diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index ceed302cf6a0..6861c6384f18 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -2850,9 +2850,9 @@ static void qib_setup_7322_cleanup(struct qib_devdata *dd) qib_7322_free_irq(dd); kfree(dd->cspec->cntrs); - kfree(dd->cspec->sendchkenable); - kfree(dd->cspec->sendgrhchk); - kfree(dd->cspec->sendibchk); + bitmap_free(dd->cspec->sendchkenable); + bitmap_free(dd->cspec->sendgrhchk); + bitmap_free(dd->cspec->sendibchk); kfree(dd->cspec->msix_entries); for (i = 0; i < dd->num_pports; i++) { unsigned long flags; @@ -6383,18 +6383,11 @@ static int qib_init_7322_variables(struct qib_devdata *dd) features = qib_7322_boardname(dd); /* now that piobcnt2k and 4k set, we can allocate these */ - sbufcnt = dd->piobcnt2k + dd->piobcnt4k + - NUM_VL15_BUFS + BITS_PER_LONG - 1; - sbufcnt /= BITS_PER_LONG; - dd->cspec->sendchkenable = - kmalloc_array(sbufcnt, sizeof(*dd->cspec->sendchkenable), - GFP_KERNEL); - dd->cspec->sendgrhchk = - kmalloc_array(sbufcnt, sizeof(*dd->cspec->sendgrhchk), - GFP_KERNEL); - dd->cspec->sendibchk = - kmalloc_array(sbufcnt, sizeof(*dd->cspec->sendibchk), - GFP_KERNEL); + sbufcnt = dd->piobcnt2k + dd->piobcnt4k + NUM_VL15_BUFS; + + dd->cspec->sendchkenable = bitmap_zalloc(sbufcnt, GFP_KERNEL); + dd->cspec->sendgrhchk = bitmap_zalloc(sbufcnt, GFP_KERNEL); + dd->cspec->sendibchk = bitmap_zalloc(sbufcnt, GFP_KERNEL); if (!dd->cspec->sendchkenable || !dd->cspec->sendgrhchk || !dd->cspec->sendibchk) { ret = -ENOMEM; diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index d1a72e89e297..45211008449f 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -1106,8 +1106,7 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra) if (!qib_cpulist_count) { u32 count = num_online_cpus(); - qib_cpulist = kcalloc(BITS_TO_LONGS(count), sizeof(long), - GFP_KERNEL); + qib_cpulist = bitmap_zalloc(count, GFP_KERNEL); if (qib_cpulist) qib_cpulist_count = count; } @@ -1279,7 +1278,7 @@ static void __exit qib_ib_cleanup(void) #endif qib_cpulist_count = 0; - kfree(qib_cpulist); + bitmap_free(qib_cpulist); WARN_ON(!xa_empty(&qib_dev_table)); qib_dev_cleanup(); diff --git a/drivers/infiniband/hw/qib/qib_sd7220.c b/drivers/infiniband/hw/qib/qib_sd7220.c index 81b810d006c0..1dc3ccf0cf1f 100644 --- a/drivers/infiniband/hw/qib/qib_sd7220.c +++ b/drivers/infiniband/hw/qib/qib_sd7220.c @@ -587,7 +587,7 @@ static int epb_access(struct qib_devdata *dd, int sdnum, int claim) /* Need to release */ u64 pollval; /* - * The only writeable bits are the request and CS. + * The only writable bits are the request and CS. * Both should be clear */ u64 newval = 0; diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index e212929369df..67a1b4562dc2 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -482,7 +482,7 @@ int usnic_uiom_attach_dev_to_pd(struct usnic_uiom_pd *pd, struct device *dev) if (err) goto out_free_dev; - if (!iommu_capable(dev->bus, IOMMU_CAP_CACHE_COHERENCY)) { + if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) { usnic_err("IOMMU of %s does not support cache coherency\n", dev_name(dev)); err = -EINVAL; |