author     Linus Torvalds <torvalds@linux-foundation.org>   2023-02-24 15:11:03 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>   2023-02-24 15:11:03 -0800
commit     8cbd92339db08b19b93d1637e5799ff2a8dddfd2
tree       7e62d961f32e8a2a96271029b376f1e8bbd70a7c  /drivers/infiniband/hw/hfi1
parent     143c7bc6496c886ce5db2a2f9cec580494690170
parent     66fb1d5df6ace316a4a6e2c31e13fc123ea2b644
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull rdma updates from Jason Gunthorpe:
"Quite a small cycle this time, even with the rc8. I suppose everyone
went to sleep over xmas.
- Minor driver updates for hfi1, cxgb4, erdma, hns, irdma, mlx5, siw,
mana
- inline CQE support for hns
- Have mlx5 display device error codes
- Pinned DMABUF support for irdma
- Continued rxe cleanups, particularly converting the MRs to use
xarray
- Improvements to what can be cached in the mlx5 mkey cache"
* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (61 commits)
IB/mlx5: Extend debug control for CC parameters
IB/hfi1: Fix sdma.h tx->num_descs off-by-one errors
IB/hfi1: Fix math bugs in hfi1_can_pin_pages()
RDMA/irdma: Add support for dmabuf pin memory regions
RDMA/mlx5: Use query_special_contexts for mkeys
net/mlx5e: Use query_special_contexts for mkeys
net/mlx5: Change define name for 0x100 lkey value
net/mlx5: Expose bits for querying special mkeys
RDMA/rxe: Fix missing memory barriers in rxe_queue.h
RDMA/mana_ib: Fix a bug when the PF indicates more entries for registering memory on first packet
RDMA/rxe: Remove rxe_alloc()
RDMA/cma: Distinguish between sockaddr_in and sockaddr_in6 by size
Subject: RDMA/rxe: Handle zero length rdma
iw_cxgb4: Fix potential NULL dereference in c4iw_fill_res_cm_id_entry()
RDMA/mlx5: Use rdma_umem_for_each_dma_block()
RDMA/umem: Remove unused 'work' member from struct ib_umem
RDMA/irdma: Cap MSIX used to online CPUs + 1
RDMA/mlx5: Check reg_create() create for errors
RDMA/restrack: Correct spelling
RDMA/cxgb4: Fix potential null-ptr-deref in pass_establish()
...
Diffstat (limited to 'drivers/infiniband/hw/hfi1')
 drivers/infiniband/hw/hfi1/chip.c         |  59
 drivers/infiniband/hw/hfi1/exp_rcv.h      |   5
 drivers/infiniband/hw/hfi1/file_ops.c     |  81
 drivers/infiniband/hw/hfi1/init.c         |   2
 drivers/infiniband/hw/hfi1/sdma.c         |   4
 drivers/infiniband/hw/hfi1/sdma.h         |  15
 drivers/infiniband/hw/hfi1/user_exp_rcv.c |  46
 drivers/infiniband/hw/hfi1/user_pages.c   |  61
 drivers/infiniband/hw/hfi1/verbs.c        |  81
9 files changed, 194 insertions(+), 160 deletions(-)
diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c
index ebe970f76232..90b672feed83 100644
--- a/drivers/infiniband/hw/hfi1/chip.c
+++ b/drivers/infiniband/hw/hfi1/chip.c
@@ -1056,7 +1056,7 @@ static void read_link_down_reason(struct hfi1_devdata *dd, u8 *ldr);
 static void handle_temp_err(struct hfi1_devdata *dd);
 static void dc_shutdown(struct hfi1_devdata *dd);
 static void dc_start(struct hfi1_devdata *dd);
-static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
+static int qos_rmt_entries(unsigned int n_krcv_queues, unsigned int *mp,
 			   unsigned int *np);
 static void clear_full_mgmt_pkey(struct hfi1_pportdata *ppd);
 static int wait_link_transfer_active(struct hfi1_devdata *dd, int wait_ms);
@@ -13362,7 +13362,6 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
 	int ret;
 	unsigned ngroups;
 	int rmt_count;
-	int user_rmt_reduced;
 	u32 n_usr_ctxts;
 	u32 send_contexts = chip_send_contexts(dd);
 	u32 rcv_contexts = chip_rcv_contexts(dd);
@@ -13421,28 +13420,34 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
 					   (num_kernel_contexts + n_usr_ctxts),
 					   &node_affinity.real_cpu_mask);
 	/*
-	 * The RMT entries are currently allocated as shown below:
-	 * 1. QOS (0 to 128 entries);
-	 * 2. FECN (num_kernel_context - 1 + num_user_contexts +
-	 *    num_netdev_contexts);
-	 * 3. netdev (num_netdev_contexts).
-	 * It should be noted that FECN oversubscribe num_netdev_contexts
-	 * entries of RMT because both netdev and PSM could allocate any receive
-	 * context between dd->first_dyn_alloc_text and dd->num_rcv_contexts,
-	 * and PSM FECN must reserve an RMT entry for each possible PSM receive
-	 * context.
+	 * RMT entries are allocated as follows:
+	 * 1. QOS (0 to 128 entries)
+	 * 2. FECN (num_kernel_context - 1 [a] + num_user_contexts +
+	 *    num_netdev_contexts [b])
+	 * 3. netdev (NUM_NETDEV_MAP_ENTRIES)
+	 *
+	 * Notes:
+	 * [a] Kernel contexts (except control) are included in FECN if kernel
+	 *     TID_RDMA is active.
+	 * [b] Netdev and user contexts are randomly allocated from the same
+	 *     context pool, so FECN must cover all contexts in the pool.
 	 */
-	rmt_count = qos_rmt_entries(dd, NULL, NULL) + (num_netdev_contexts * 2);
-	if (HFI1_CAP_IS_KSET(TID_RDMA))
-		rmt_count += num_kernel_contexts - 1;
-	if (rmt_count + n_usr_ctxts > NUM_MAP_ENTRIES) {
-		user_rmt_reduced = NUM_MAP_ENTRIES - rmt_count;
-		dd_dev_err(dd,
-			   "RMT size is reducing the number of user receive contexts from %u to %d\n",
-			   n_usr_ctxts,
-			   user_rmt_reduced);
-		/* recalculate */
-		n_usr_ctxts = user_rmt_reduced;
+	rmt_count = qos_rmt_entries(num_kernel_contexts - 1, NULL, NULL) +
+		    (HFI1_CAP_IS_KSET(TID_RDMA) ? num_kernel_contexts - 1
+						: 0) +
+		    n_usr_ctxts +
+		    num_netdev_contexts +
+		    NUM_NETDEV_MAP_ENTRIES;
+	if (rmt_count > NUM_MAP_ENTRIES) {
+		int over = rmt_count - NUM_MAP_ENTRIES;
+		/* try to squish user contexts, minimum of 1 */
+		if (over >= n_usr_ctxts) {
+			dd_dev_err(dd, "RMT overflow: reduce the requested number of contexts\n");
+			return -EINVAL;
+		}
+		dd_dev_err(dd, "RMT overflow: reducing # user contexts from %u to %u\n",
+			   n_usr_ctxts, n_usr_ctxts - over);
+		n_usr_ctxts -= over;
 	}
 	/* the first N are kernel contexts, the rest are user/netdev contexts */
@@ -14299,15 +14304,15 @@ static void clear_rsm_rule(struct hfi1_devdata *dd, u8 rule_index)
 }
 
 /* return the number of RSM map table entries that will be used for QOS */
-static int qos_rmt_entries(struct hfi1_devdata *dd, unsigned int *mp,
+static int qos_rmt_entries(unsigned int n_krcv_queues, unsigned int *mp,
 			   unsigned int *np)
 {
 	int i;
 	unsigned int m, n;
-	u8 max_by_vl = 0;
+	uint max_by_vl = 0;
 
 	/* is QOS active at all? */
-	if (dd->n_krcv_queues <= MIN_KERNEL_KCTXTS ||
+	if (n_krcv_queues < MIN_KERNEL_KCTXTS ||
 	    num_vls == 1 ||
 	    krcvqsset <= 1)
 		goto no_qos;
@@ -14365,7 +14370,7 @@ static void init_qos(struct hfi1_devdata *dd, struct rsm_map_table *rmt)
 	if (!rmt)
 		goto bail;
 
-	rmt_entries = qos_rmt_entries(dd, &m, &n);
+	rmt_entries = qos_rmt_entries(dd->n_krcv_queues - 1, &m, &n);
 	if (rmt_entries == 0)
 		goto bail;
 	qpns_per_vl = 1 << m;
diff --git a/drivers/infiniband/hw/hfi1/exp_rcv.h b/drivers/infiniband/hw/hfi1/exp_rcv.h
index c6291bbf723c..41f7fe5d1839 100644
--- a/drivers/infiniband/hw/hfi1/exp_rcv.h
+++ b/drivers/infiniband/hw/hfi1/exp_rcv.h
@@ -133,12 +133,13 @@ static inline struct tid_group *tid_group_pop(struct exp_tid_set *set)
 	return grp;
 }
 
-static inline u32 rcventry2tidinfo(u32 rcventry)
+static inline u32 create_tid(u32 rcventry, u32 npages)
 {
 	u32 pair = rcventry & ~0x1;
 
 	return EXP_TID_SET(IDX, pair >> 1) |
-	       EXP_TID_SET(CTRL, 1 << (rcventry - pair));
+	       EXP_TID_SET(CTRL, 1 << (rcventry - pair)) |
+	       EXP_TID_SET(LEN, npages);
 }
 
 /**
diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
index 47e4c8084207..b1d6ca7e9708 100644
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -306,6 +306,17 @@ static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from)
 	return reqs;
 }
 
+static inline void mmap_cdbg(u16 ctxt, u8 subctxt, u8 type, u8 mapio, u8 vmf,
+			     u64 memaddr, void *memvirt, dma_addr_t memdma,
+			     ssize_t memlen, struct vm_area_struct *vma)
+{
+	hfi1_cdbg(PROC,
+		  "%u:%u type:%u io/vf/dma:%d/%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx",
+		  ctxt, subctxt, type, mapio, vmf, !!memdma,
+		  memaddr ?: (u64)memvirt, memlen,
+		  vma->vm_end - vma->vm_start, vma->vm_flags);
+}
+
 static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
 {
 	struct hfi1_filedata *fd = fp->private_data;
@@ -315,6 +326,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
 	u64 token = vma->vm_pgoff << PAGE_SHIFT,
 		memaddr = 0;
 	void *memvirt = NULL;
+	dma_addr_t memdma = 0;
 	u8 subctxt, mapio = 0, vmf = 0, type;
 	ssize_t memlen = 0;
 	int ret = 0;
@@ -334,6 +346,11 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
 		goto done;
 	}
 
+	/*
+	 * vm_pgoff is used as a buffer selector cookie.  Always mmap from
+	 * the beginning.
+	 */
+	vma->vm_pgoff = 0;
 	flags = vma->vm_flags;
 
 	switch (type) {
@@ -355,7 +372,8 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
 		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
 		mapio = 1;
 		break;
-	case PIO_CRED:
+	case PIO_CRED: {
+		u64 cr_page_offset;
 		if (flags & VM_WRITE) {
 			ret = -EPERM;
 			goto done;
 		}
@@ -365,10 +383,11 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
 		 * second or third page allocated for credit returns (if number
 		 * of enabled contexts > 64 and 128 respectively).
 		 */
-		memvirt = dd->cr_base[uctxt->numa_id].va;
-		memaddr = virt_to_phys(memvirt) +
-			(((u64)uctxt->sc->hw_free -
-			  (u64)dd->cr_base[uctxt->numa_id].va) & PAGE_MASK);
+		cr_page_offset = ((u64)uctxt->sc->hw_free -
+				  (u64)dd->cr_base[uctxt->numa_id].va) &
+				 PAGE_MASK;
+		memvirt = dd->cr_base[uctxt->numa_id].va + cr_page_offset;
+		memdma = dd->cr_base[uctxt->numa_id].dma + cr_page_offset;
 		memlen = PAGE_SIZE;
 		flags &= ~VM_MAYWRITE;
 		flags |= VM_DONTCOPY | VM_DONTEXPAND;
@@ -378,14 +397,16 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
 		 * memory been flagged as non-cached?
 		 */
 		/* vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); */
-		mapio = 1;
 		break;
+	}
 	case RCV_HDRQ:
 		memlen = rcvhdrq_size(uctxt);
 		memvirt = uctxt->rcvhdrq;
+		memdma = uctxt->rcvhdrq_dma;
 		break;
 	case RCV_EGRBUF: {
-		unsigned long addr;
+		unsigned long vm_start_save;
+		unsigned long vm_end_save;
 		int i;
 		/*
 		 * The RcvEgr buffer need to be handled differently
@@ -404,24 +425,34 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
 			goto done;
 		}
 		vm_flags_clear(vma, VM_MAYWRITE);
-		addr = vma->vm_start;
+		/*
+		 * Mmap multiple separate allocations into a single vma.  From
+		 * here, dma_mmap_coherent() calls dma_direct_mmap(), which
+		 * requires the mmap to exactly fill the vma starting at
+		 * vma_start.  Adjust the vma start and end for each eager
+		 * buffer segment mapped.  Restore the originals when done.
+		 */
+		vm_start_save = vma->vm_start;
+		vm_end_save = vma->vm_end;
+		vma->vm_end = vma->vm_start;
 		for (i = 0 ; i < uctxt->egrbufs.numbufs; i++) {
 			memlen = uctxt->egrbufs.buffers[i].len;
 			memvirt = uctxt->egrbufs.buffers[i].addr;
-			ret = remap_pfn_range(
-				vma, addr,
-				/*
-				 * virt_to_pfn() does the same, but
-				 * it's not available on x86_64
-				 * when CONFIG_MMU is enabled.
-				 */
-				PFN_DOWN(__pa(memvirt)),
-				memlen,
-				vma->vm_page_prot);
-			if (ret < 0)
+			memdma = uctxt->egrbufs.buffers[i].dma;
+			vma->vm_end += memlen;
+			mmap_cdbg(ctxt, subctxt, type, mapio, vmf, memaddr,
+				  memvirt, memdma, memlen, vma);
+			ret = dma_mmap_coherent(&dd->pcidev->dev, vma,
+						memvirt, memdma, memlen);
+			if (ret < 0) {
+				vma->vm_start = vm_start_save;
+				vma->vm_end = vm_end_save;
 				goto done;
-			addr += memlen;
+			}
+			vma->vm_start += memlen;
 		}
+		vma->vm_start = vm_start_save;
+		vma->vm_end = vm_end_save;
 		ret = 0;
 		goto done;
 	}
@@ -481,6 +512,7 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
 		}
 		memlen = PAGE_SIZE;
 		memvirt = (void *)hfi1_rcvhdrtail_kvaddr(uctxt);
+		memdma = uctxt->rcvhdrqtailaddr_dma;
 		flags &= ~VM_MAYWRITE;
 		break;
 	case SUBCTXT_UREGS:
@@ -529,14 +561,15 @@ static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
 	}
 
 	vm_flags_reset(vma, flags);
-	hfi1_cdbg(PROC,
-		  "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n",
-		  ctxt, subctxt, type, mapio, vmf, memaddr, memlen,
-		  vma->vm_end - vma->vm_start, vma->vm_flags);
+	mmap_cdbg(ctxt, subctxt, type, mapio, vmf, memaddr, memvirt, memdma,
+		  memlen, vma);
 	if (vmf) {
 		vma->vm_pgoff = PFN_DOWN(memaddr);
 		vma->vm_ops = &vm_ops;
 		ret = 0;
+	} else if (memdma) {
+		ret = dma_mmap_coherent(&dd->pcidev->dev, vma,
+					memvirt, memdma, memlen);
 	} else if (mapio) {
 		ret = io_remap_pfn_range(vma, vma->vm_start,
 					 PFN_DOWN(memaddr),
diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c
index 24c0f0d257fc..62b6c5020039 100644
--- a/drivers/infiniband/hw/hfi1/init.c
+++ b/drivers/infiniband/hw/hfi1/init.c
@@ -464,7 +464,7 @@ bail:
  *
  * This wrapper is the free function that matches hfi1_create_ctxtdata().
  * When a context is done being used (kernel or user), this function is called
- * for the "final" put to match the kref init from hf1i_create_ctxtdata().
+ * for the "final" put to match the kref init from hfi1_create_ctxtdata().
  * Other users of the context do a get/put sequence to make sure that the
  * structure isn't removed while in use.
  */
diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
index a95b654f5254..8ed20392e9f0 100644
--- a/drivers/infiniband/hw/hfi1/sdma.c
+++ b/drivers/infiniband/hw/hfi1/sdma.c
@@ -3160,8 +3160,7 @@ int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
 {
 	int rval = 0;
 
-	tx->num_desc++;
-	if ((unlikely(tx->num_desc == tx->desc_limit))) {
+	if ((unlikely(tx->num_desc + 1 == tx->desc_limit))) {
 		rval = _extend_sdma_tx_descs(dd, tx);
 		if (rval) {
 			__sdma_txclean(dd, tx);
@@ -3174,6 +3173,7 @@ int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
 		      SDMA_MAP_NONE,
 		      dd->sdma_pad_phys,
 		      sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1)));
+	tx->num_desc++;
 	_sdma_close_tx(dd, tx);
 	return rval;
 }
diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h
index d8170fcbfbdd..b023fc461bd5 100644
--- a/drivers/infiniband/hw/hfi1/sdma.h
+++ b/drivers/infiniband/hw/hfi1/sdma.h
@@ -631,14 +631,13 @@ static inline void sdma_txclean(struct hfi1_devdata *dd, struct sdma_txreq *tx)
 static inline void _sdma_close_tx(struct hfi1_devdata *dd,
 				  struct sdma_txreq *tx)
 {
-	tx->descp[tx->num_desc].qw[0] |=
-		SDMA_DESC0_LAST_DESC_FLAG;
-	tx->descp[tx->num_desc].qw[1] |=
-		dd->default_desc1;
+	u16 last_desc = tx->num_desc - 1;
+
+	tx->descp[last_desc].qw[0] |= SDMA_DESC0_LAST_DESC_FLAG;
+	tx->descp[last_desc].qw[1] |= dd->default_desc1;
 	if (tx->flags & SDMA_TXREQ_F_URGENT)
-		tx->descp[tx->num_desc].qw[1] |=
-			(SDMA_DESC1_HEAD_TO_HOST_FLAG |
-			 SDMA_DESC1_INT_REQ_FLAG);
+		tx->descp[last_desc].qw[1] |= (SDMA_DESC1_HEAD_TO_HOST_FLAG |
+					       SDMA_DESC1_INT_REQ_FLAG);
 }
 
 static inline int _sdma_txadd_daddr(
@@ -655,6 +654,7 @@ static inline int _sdma_txadd_daddr(
 			type, addr, len);
 	WARN_ON(len > tx->tlen);
+	tx->num_desc++;
 	tx->tlen -= len;
 
 	/* special cases for last */
 	if (!tx->tlen) {
@@ -666,7 +666,6 @@ static inline int _sdma_txadd_daddr(
 			_sdma_close_tx(dd, tx);
 		}
 	}
-	tx->num_desc++;
 	return rval;
 }
diff --git a/drivers/infiniband/hw/hfi1/user_exp_rcv.c b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
index 350884d5f089..96058baf36ed 100644
--- a/drivers/infiniband/hw/hfi1/user_exp_rcv.c
+++ b/drivers/infiniband/hw/hfi1/user_exp_rcv.c
@@ -27,8 +27,7 @@ static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
 				 const struct mmu_notifier_range *range,
 				 unsigned long cur_seq);
 static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
-			    struct tid_group *grp,
-			    unsigned int start, u16 count,
+			    struct tid_group *grp, u16 count,
 			    u32 *tidlist, unsigned int *tididx,
 			    unsigned int *pmapped);
 static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo);
@@ -250,7 +249,7 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
 	int ret = 0, need_group = 0, pinned;
 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	struct hfi1_devdata *dd = uctxt->dd;
-	unsigned int ngroups, pageidx = 0, pageset_count,
+	unsigned int ngroups, pageset_count,
 		     tididx = 0, mapped, mapped_pages = 0;
 	u32 *tidlist = NULL;
 	struct tid_user_buf *tidbuf;
@@ -332,7 +331,7 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
 				tid_group_pop(&uctxt->tid_group_list);
 
 		ret = program_rcvarray(fd, tidbuf, grp,
-				       pageidx, dd->rcv_entries.group_size,
+				       dd->rcv_entries.group_size,
 				       tidlist, &tididx, &mapped);
 		/*
 		 * If there was a failure to program the RcvArray
@@ -348,11 +347,10 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
 			tid_group_add_tail(grp, &uctxt->tid_full_list);
 			ngroups--;
 
-		pageidx += ret;
 		mapped_pages += mapped;
 	}
 
-	while (pageidx < pageset_count) {
+	while (tididx < pageset_count) {
 		struct tid_group *grp, *ptr;
 		/*
 		 * If we don't have any partially used tid groups, check
@@ -374,11 +372,11 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
 		 */
 		list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
 					 list) {
-			unsigned use = min_t(unsigned, pageset_count - pageidx,
+			unsigned use = min_t(unsigned, pageset_count - tididx,
 					     grp->size - grp->used);
 
 			ret = program_rcvarray(fd, tidbuf, grp,
-					       pageidx, use, tidlist,
+					       use, tidlist,
 					       &tididx, &mapped);
 			if (ret < 0) {
 				hfi1_cdbg(TID,
@@ -390,11 +388,10 @@ int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
 				tid_group_move(grp, &uctxt->tid_used_list,
 					       &uctxt->tid_full_list);
 
-			pageidx += ret;
 			mapped_pages += mapped;
 			need_group = 0;
 			/* Check if we are done so we break out early */
-			if (pageidx >= pageset_count)
+			if (tididx >= pageset_count)
 				break;
 		} else if (WARN_ON(ret == 0)) {
 			/*
@@ -638,7 +635,6 @@ static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages)
  *	    struct tid_pageset holding information on physically contiguous
  *	    chunks from the user buffer), and other fields.
  * @grp: RcvArray group
- * @start: starting index into sets array
  * @count: number of struct tid_pageset's to program
  * @tidlist: the array of u32 elements when the information about the
  *           programmed RcvArray entries is to be encoded.
@@ -658,14 +654,14 @@ static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages)
  * number of RcvArray entries programmed.
  */
 static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf,
-			    struct tid_group *grp,
-			    unsigned int start, u16 count,
+			    struct tid_group *grp, u16 count,
 			    u32 *tidlist, unsigned int *tididx,
 			    unsigned int *pmapped)
 {
 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	struct hfi1_devdata *dd = uctxt->dd;
 	u16 idx;
+	unsigned int start = *tididx;
 	u32 tidinfo = 0, rcventry, useidx = 0;
 	int mapped = 0;
@@ -710,8 +706,7 @@ static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf,
 			return ret;
 
 		mapped += npages;
-		tidinfo = rcventry2tidinfo(rcventry - uctxt->expected_base) |
-			EXP_TID_SET(LEN, npages);
+		tidinfo = create_tid(rcventry - uctxt->expected_base, npages);
 		tidlist[(*tididx)++] = tidinfo;
 		grp->used++;
 		grp->map |= 1 << useidx++;
@@ -795,20 +790,20 @@ static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo)
 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
 	struct hfi1_devdata *dd = uctxt->dd;
 	struct tid_rb_node *node;
-	u8 tidctrl = EXP_TID_GET(tidinfo, CTRL);
+	u32 tidctrl = EXP_TID_GET(tidinfo, CTRL);
 	u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;
 
-	if (tididx >= uctxt->expected_count) {
-		dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
-			   tididx, uctxt->ctxt);
-		return -EINVAL;
-	}
-
-	if (tidctrl == 0x3)
+	if (tidctrl == 0x3 || tidctrl == 0x0)
 		return -EINVAL;
 
 	rcventry = tididx + (tidctrl - 1);
+	if (rcventry >= uctxt->expected_count) {
+		dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
+			   rcventry, uctxt->ctxt);
+		return -EINVAL;
+	}
+
 	node = fd->entry_to_rb[rcventry];
 	if (!node || node->rcventry != (uctxt->expected_base + rcventry))
 		return -EBADF;
@@ -920,9 +915,8 @@ static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
 	spin_lock(&fdata->invalid_lock);
 	if (fdata->invalid_tid_idx < uctxt->expected_count) {
 		fdata->invalid_tids[fdata->invalid_tid_idx] =
-			rcventry2tidinfo(node->rcventry - uctxt->expected_base);
-		fdata->invalid_tids[fdata->invalid_tid_idx] |=
-			EXP_TID_SET(LEN, node->npages);
+			create_tid(node->rcventry - uctxt->expected_base,
+				   node->npages);
 		if (!fdata->invalid_tid_idx) {
 			unsigned long *ev;
diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c
index 7bce963e2ae6..36aaedc65145 100644
--- a/drivers/infiniband/hw/hfi1/user_pages.c
+++ b/drivers/infiniband/hw/hfi1/user_pages.c
@@ -29,33 +29,52 @@ MODULE_PARM_DESC(cache_size, "Send and receive side cache size limit (in MB)");
 bool hfi1_can_pin_pages(struct hfi1_devdata *dd, struct mm_struct *mm,
 			u32 nlocked, u32 npages)
 {
-	unsigned long ulimit = rlimit(RLIMIT_MEMLOCK), pinned, cache_limit,
-		size = (cache_size * (1UL << 20)); /* convert to bytes */
-	unsigned int usr_ctxts =
-			dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt;
-	bool can_lock = capable(CAP_IPC_LOCK);
+	unsigned long ulimit_pages;
+	unsigned long cache_limit_pages;
+	unsigned int usr_ctxts;
 
 	/*
-	 * Calculate per-cache size. The calculation below uses only a quarter
-	 * of the available per-context limit. This leaves space for other
-	 * pinning. Should we worry about shared ctxts?
+	 * Perform RLIMIT_MEMLOCK based checks unless CAP_IPC_LOCK is present.
 	 */
-	cache_limit = (ulimit / usr_ctxts) / 4;
-
-	/* If ulimit isn't set to "unlimited" and is smaller than cache_size. */
-	if (ulimit != (-1UL) && size > cache_limit)
-		size = cache_limit;
-
-	/* Convert to number of pages */
-	size = DIV_ROUND_UP(size, PAGE_SIZE);
-
-	pinned = atomic64_read(&mm->pinned_vm);
+	if (!capable(CAP_IPC_LOCK)) {
+		ulimit_pages =
+			DIV_ROUND_DOWN_ULL(rlimit(RLIMIT_MEMLOCK), PAGE_SIZE);
+
+		/*
+		 * Pinning these pages would exceed this process's locked memory
+		 * limit.
+		 */
+		if (atomic64_read(&mm->pinned_vm) + npages > ulimit_pages)
+			return false;
+
+		/*
+		 * Only allow 1/4 of the user's RLIMIT_MEMLOCK to be used for HFI
+		 * caches.  This fraction is then equally distributed among all
+		 * existing user contexts.  Note that if RLIMIT_MEMLOCK is
+		 * 'unlimited' (-1), the value of this limit will be > 2^42 pages
+		 * (2^64 / 2^12 / 2^8 / 2^2).
+		 *
+		 * The effectiveness of this check may be reduced if I/O occurs on
+		 * some user contexts before all user contexts are created.  This
+		 * check assumes that this process is the only one using this
+		 * context (e.g., the corresponding fd was not passed to another
+		 * process for concurrent access) as there is no per-context,
+		 * per-process tracking of pinned pages.  It also assumes that each
+		 * user context has only one cache to limit.
+		 */
+		usr_ctxts = dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt;
+		if (nlocked + npages > (ulimit_pages / usr_ctxts / 4))
+			return false;
+	}
 
-	/* First, check the absolute limit against all pinned pages. */
-	if (pinned + npages >= ulimit && !can_lock)
+	/*
+	 * Pinning these pages would exceed the size limit for this cache.
+	 */
+	cache_limit_pages = cache_size * (1024 * 1024) / PAGE_SIZE;
+	if (nlocked + npages > cache_limit_pages)
 		return false;
 
-	return ((nlocked + npages) <= size) || can_lock;
+	return true;
 }
 
 int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t npages,
diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index e6e17984553c..7f6d7fc7951d 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -1598,13 +1598,11 @@ static const char * const driver_cntr_names[] = {
 	"DRIVER_EgrHdrFull"
 };
 
-static DEFINE_MUTEX(cntr_names_lock); /* protects the *_cntr_names bufers */
 static struct rdma_stat_desc *dev_cntr_descs;
 static struct rdma_stat_desc *port_cntr_descs;
 int num_driver_cntrs = ARRAY_SIZE(driver_cntr_names);
 static int num_dev_cntrs;
 static int num_port_cntrs;
-static int cntr_names_initialized;
 
 /*
  * Convert a list of names separated by '\n' into an array of NULL terminated
@@ -1615,8 +1613,8 @@ static int init_cntr_names(const char *names_in, const size_t names_len,
 			   int num_extra_names, int *num_cntrs,
 			   struct rdma_stat_desc **cntr_descs)
 {
-	struct rdma_stat_desc *q;
-	char *names_out, *p;
+	struct rdma_stat_desc *names_out;
+	char *p;
 	int i, n;
 
 	n = 0;
@@ -1624,65 +1622,45 @@ static int init_cntr_names(const char *names_in, const size_t names_len,
 		if (names_in[i] == '\n')
 			n++;
 
-	names_out =
-		kzalloc((n + num_extra_names) * sizeof(*q) + names_len,
-			GFP_KERNEL);
+	names_out = kzalloc((n + num_extra_names) * sizeof(*names_out) +
+				    names_len,
+			    GFP_KERNEL);
 	if (!names_out) {
 		*num_cntrs = 0;
 		*cntr_descs = NULL;
 		return -ENOMEM;
 	}
 
-	p = names_out + (n + num_extra_names) * sizeof(*q);
+	p = (char *)&names_out[n + num_extra_names];
 	memcpy(p, names_in, names_len);
 
-	q = (struct rdma_stat_desc *)names_out;
 	for (i = 0; i < n; i++) {
-		q[i].name = p;
+		names_out[i].name = p;
 		p = strchr(p, '\n');
 		*p++ = '\0';
 	}
 
 	*num_cntrs = n;
-	*cntr_descs = (struct rdma_stat_desc *)names_out;
+	*cntr_descs = names_out;
 	return 0;
 }
 
-static int init_counters(struct ib_device *ibdev)
-{
-	struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
-	int i, err = 0;
-
-	mutex_lock(&cntr_names_lock);
-	if (cntr_names_initialized)
-		goto out_unlock;
-
-	err = init_cntr_names(dd->cntrnames, dd->cntrnameslen, num_driver_cntrs,
-			      &num_dev_cntrs, &dev_cntr_descs);
-	if (err)
-		goto out_unlock;
-
-	for (i = 0; i < num_driver_cntrs; i++)
-		dev_cntr_descs[num_dev_cntrs + i].name = driver_cntr_names[i];
-
-	err = init_cntr_names(dd->portcntrnames, dd->portcntrnameslen, 0,
-			      &num_port_cntrs, &port_cntr_descs);
-	if (err) {
-		kfree(dev_cntr_descs);
-		dev_cntr_descs = NULL;
-		goto out_unlock;
-	}
-	cntr_names_initialized = 1;
-
-out_unlock:
-	mutex_unlock(&cntr_names_lock);
-	return err;
-}
-
 static struct rdma_hw_stats *hfi1_alloc_hw_device_stats(struct ib_device *ibdev)
 {
-	if (init_counters(ibdev))
-		return NULL;
+	if (!dev_cntr_descs) {
+		struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+		int i, err;
+
+		err = init_cntr_names(dd->cntrnames, dd->cntrnameslen,
+				      num_driver_cntrs,
+				      &num_dev_cntrs, &dev_cntr_descs);
+		if (err)
+			return NULL;
+
+		for (i = 0; i < num_driver_cntrs; i++)
+			dev_cntr_descs[num_dev_cntrs + i].name =
+							driver_cntr_names[i];
+	}
 	return rdma_alloc_hw_stats_struct(dev_cntr_descs,
 					  num_dev_cntrs + num_driver_cntrs,
 					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
@@ -1691,8 +1669,16 @@ static struct rdma_hw_stats *hfi1_alloc_hw_device_stats(struct ib_device *ibdev)
 static struct rdma_hw_stats *hfi_alloc_hw_port_stats(struct ib_device *ibdev,
 						     u32 port_num)
 {
-	if (init_counters(ibdev))
-		return NULL;
+	if (!port_cntr_descs) {
+		struct hfi1_devdata *dd = dd_from_ibdev(ibdev);
+		int err;
+
+		err = init_cntr_names(dd->portcntrnames, dd->portcntrnameslen,
+				      0,
+				      &num_port_cntrs, &port_cntr_descs);
+		if (err)
+			return NULL;
+	}
 	return rdma_alloc_hw_stats_struct(port_cntr_descs, num_port_cntrs,
 					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
 }
@@ -1917,13 +1903,10 @@ void hfi1_unregister_ib_device(struct hfi1_devdata *dd)
 	del_timer_sync(&dev->mem_timer);
 	verbs_txreq_exit(dev);
 
-	mutex_lock(&cntr_names_lock);
 	kfree(dev_cntr_descs);
 	kfree(port_cntr_descs);
 	dev_cntr_descs = NULL;
 	port_cntr_descs = NULL;
-	cntr_names_initialized = 0;
-	mutex_unlock(&cntr_names_lock);
 }
 
 void hfi1_cnp_rcv(struct hfi1_packet *packet)
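For readers tracing the hfi1_can_pin_pages() rework above ("IB/hfi1: Fix math bugs in hfi1_can_pin_pages()"), the sketch below restates the new limit arithmetic as a stand-alone C function. It is illustrative only: the real code reads rlimit(RLIMIT_MEMLOCK), mm->pinned_vm and capable(CAP_IPC_LOCK) from kernel state, while here they are passed in as plain parameters, and the helper name can_pin_pages() is hypothetical.

```c
#include <stdbool.h>
#include <stdint.h>

#define PAGE_SIZE 4096UL

/*
 * Illustrative restatement of the reworked hfi1_can_pin_pages() checks:
 * every limit is compared in pages, and the RLIMIT_MEMLOCK checks are
 * skipped entirely when the caller holds CAP_IPC_LOCK.
 */
static bool can_pin_pages(uint64_t memlock_limit_bytes, /* RLIMIT_MEMLOCK */
			  uint64_t pinned_pages,        /* mm->pinned_vm */
			  unsigned int usr_ctxts,       /* user receive contexts */
			  unsigned long cache_size_mb,  /* cache_size module param */
			  bool cap_ipc_lock,            /* capable(CAP_IPC_LOCK) */
			  uint64_t nlocked, uint64_t npages)
{
	if (!cap_ipc_lock) {
		uint64_t ulimit_pages = memlock_limit_bytes / PAGE_SIZE;

		/* pinning must stay under the process's locked-memory limit */
		if (pinned_pages + npages > ulimit_pages)
			return false;

		/* each user context's cache gets 1/4 of that limit, split evenly */
		if (nlocked + npages > ulimit_pages / usr_ctxts / 4)
			return false;
	}

	/* the per-cache size limit (cache_size is in MB) always applies */
	if (nlocked + npages > cache_size_mb * (1024 * 1024) / PAGE_SIZE)
		return false;

	return true;
}
```

The point of the restructuring visible in the diff is that all comparisons now happen in units of pages; the old code mixed a byte-valued ulimit with page counts, which is one of the math bugs the patch removes.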