Diffstat (limited to 'include')
-rw-r--r-- | include/linux/io_uring_types.h | 30
-rw-r--r-- | include/linux/skbuff.h         | 66
-rw-r--r-- | include/linux/socket.h         |  5
-rw-r--r-- | include/uapi/linux/io_uring.h  | 45
4 files changed, 126 insertions, 20 deletions
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index d54b8b7e0746..f7fab3758cb9 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -4,6 +4,7 @@
 #include <linux/blkdev.h>
 #include <linux/task_work.h>
 #include <linux/bitmap.h>
+#include <linux/llist.h>
 #include <uapi/linux/io_uring.h>
 
 struct io_wq_work_node {
@@ -33,6 +34,9 @@ struct io_file_table {
 	unsigned int alloc_hint;
 };
 
+struct io_notif;
+struct io_notif_slot;
+
 struct io_hash_bucket {
 	spinlock_t		lock;
 	struct hlist_head	list;
@@ -43,6 +47,30 @@ struct io_hash_table {
 	unsigned		hash_bits;
 };
 
+/*
+ * Arbitrary limit, can be raised if need be
+ */
+#define IO_RINGFD_REG_MAX 16
+
+struct io_uring_task {
+	/* submission side */
+	int				cached_refs;
+	const struct io_ring_ctx	*last;
+	struct io_wq			*io_wq;
+	struct file			*registered_rings[IO_RINGFD_REG_MAX];
+
+	struct xarray			xa;
+	struct wait_queue_head		wait;
+	atomic_t			in_idle;
+	atomic_t			inflight_tracked;
+	struct percpu_counter		inflight;
+
+	struct { /* task_work */
+		struct llist_head	task_list;
+		struct callback_head	task_work;
+	} ____cacheline_aligned_in_smp;
+};
+
 struct io_uring {
 	u32 head ____cacheline_aligned_in_smp;
 	u32 tail ____cacheline_aligned_in_smp;
@@ -212,6 +240,8 @@ struct io_ring_ctx {
 		unsigned		nr_user_files;
 		unsigned		nr_user_bufs;
 		struct io_mapped_ubuf	**user_bufs;
+		struct io_notif_slot	*notif_slots;
+		unsigned		nr_notif_slots;
 
 		struct io_submit_state	submit_state;
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index d3d10556f0fa..1111adefd906 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -686,10 +686,18 @@ enum {
 	 * charged to the kernel memory.
 	 */
 	SKBFL_PURE_ZEROCOPY = BIT(2),
+
+	SKBFL_DONT_ORPHAN = BIT(3),
+
+	/* page references are managed by the ubuf_info, so it's safe to
+	 * use frags only up until ubuf_info is released
+	 */
+	SKBFL_MANAGED_FRAG_REFS = BIT(4),
 };
 
 #define SKBFL_ZEROCOPY_FRAG	(SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
-#define SKBFL_ALL_ZEROCOPY	(SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY)
+#define SKBFL_ALL_ZEROCOPY	(SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \
+				 SKBFL_DONT_ORPHAN | SKBFL_MANAGED_FRAG_REFS)
 
 /*
  * The callback notifies userspace to release buffers when skb DMA is done in
@@ -1773,13 +1781,14 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
 void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
 			   bool success);
 
-int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
-			    struct iov_iter *from, size_t length);
+int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
+			    struct sk_buff *skb, struct iov_iter *from,
+			    size_t length);
 
 static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb,
 					  struct msghdr *msg, int len)
 {
-	return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
+	return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len);
 }
 
 int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
@@ -1806,6 +1815,11 @@ static inline bool skb_zcopy_pure(const struct sk_buff *skb)
 	return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY;
 }
 
+static inline bool skb_zcopy_managed(const struct sk_buff *skb)
+{
+	return skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAG_REFS;
+}
+
 static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1,
 				       const struct sk_buff *skb2)
 {
@@ -1880,6 +1894,14 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success)
 	}
 }
 
+void __skb_zcopy_downgrade_managed(struct sk_buff *skb);
+
+static inline void skb_zcopy_downgrade_managed(struct sk_buff *skb)
+{
+	if (unlikely(skb_zcopy_managed(skb)))
+		__skb_zcopy_downgrade_managed(skb);
+}
+
 static inline void skb_mark_not_on_list(struct sk_buff *skb)
 {
 	skb->next = NULL;
@@ -2528,6 +2550,22 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb)
 	return skb_headlen(skb) + __skb_pagelen(skb);
 }
 
+static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo,
+					      int i, struct page *page,
+					      int off, int size)
+{
+	skb_frag_t *frag = &shinfo->frags[i];
+
+	/*
+	 * Propagate page pfmemalloc to the skb if we can. The problem is
+	 * that not all callers have unique ownership of the page but rely
+	 * on page_is_pfmemalloc doing the right thing(tm).
+	 */
+	frag->bv_page = page;
+	frag->bv_offset = off;
+	skb_frag_size_set(frag, size);
+}
+
 /**
  * __skb_fill_page_desc - initialise a paged fragment in an skb
  * @skb: buffer containing fragment to be initialised
@@ -2544,17 +2582,7 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb)
 static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
 					struct page *page, int off, int size)
 {
-	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-
-	/*
-	 * Propagate page pfmemalloc to the skb if we can. The problem is
-	 * that not all callers have unique ownership of the page but rely
-	 * on page_is_pfmemalloc doing the right thing(tm).
-	 */
-	frag->bv_page = page;
-	frag->bv_offset = off;
-	skb_frag_size_set(frag, size);
-
+	__skb_fill_page_desc_noacc(skb_shinfo(skb), i, page, off, size);
 	page = compound_head(page);
 	if (page_is_pfmemalloc(page))
 		skb->pfmemalloc = true;
@@ -3182,8 +3210,7 @@ static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
 {
 	if (likely(!skb_zcopy(skb)))
 		return 0;
-	if (!skb_zcopy_is_nouarg(skb) &&
-	    skb_uarg(skb)->callback == msg_zerocopy_callback)
+	if (skb_shinfo(skb)->flags & SKBFL_DONT_ORPHAN)
 		return 0;
 	return skb_copy_ubufs(skb, gfp_mask);
 }
@@ -3496,7 +3523,10 @@ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
  */
 static inline void skb_frag_unref(struct sk_buff *skb, int f)
 {
-	__skb_frag_unref(&skb_shinfo(skb)->frags[f], skb->pp_recycle);
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+	if (!skb_zcopy_managed(skb))
+		__skb_frag_unref(&shinfo->frags[f], skb->pp_recycle);
 }
 
 /**
diff --git a/include/linux/socket.h b/include/linux/socket.h
index be24f1c8568a..d4523974efbd 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -14,6 +14,8 @@ struct file;
 struct pid;
 struct cred;
 struct socket;
+struct sock;
+struct sk_buff;
 
 #define __sockaddr_check_size(size)	\
 	BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage)))
@@ -69,6 +71,9 @@ struct msghdr {
 	unsigned int	msg_flags;	/* flags on received message */
 	__kernel_size_t	msg_controllen;	/* ancillary data buffer length */
 	struct kiocb	*msg_iocb;	/* ptr to iocb for async requests */
+	struct ubuf_info *msg_ubuf;
+	int (*sg_from_iter)(struct sock *sk, struct sk_buff *skb,
+			    struct iov_iter *from, size_t length);
 };
 
 struct user_msghdr {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 4c9b11e2e991..1463cfecb56b 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -66,6 +66,10 @@ struct io_uring_sqe {
 	union {
 		__s32	splice_fd_in;
 		__u32	file_index;
+		struct {
+			__u16	notification_idx;
+			__u16	addr_len;
+		};
 	};
 	union {
 		struct {
@@ -170,7 +174,8 @@ enum io_uring_op {
 	IORING_OP_FALLOCATE,
 	IORING_OP_OPENAT,
 	IORING_OP_CLOSE,
-	IORING_OP_FILES_UPDATE,
+	IORING_OP_RSRC_UPDATE,
+	IORING_OP_FILES_UPDATE = IORING_OP_RSRC_UPDATE,
 	IORING_OP_STATX,
 	IORING_OP_READ,
 	IORING_OP_WRITE,
@@ -197,6 +202,7 @@ enum io_uring_op {
 	IORING_OP_GETXATTR,
 	IORING_OP_SOCKET,
 	IORING_OP_URING_CMD,
+	IORING_OP_SENDZC_NOTIF,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
@@ -218,6 +224,7 @@ enum io_uring_op {
 #define IORING_TIMEOUT_ETIME_SUCCESS	(1U << 5)
 #define IORING_TIMEOUT_CLOCK_MASK	(IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
 #define IORING_TIMEOUT_UPDATE_MASK	(IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
+
 /*
  * sqe->splice_flags
  * extends splice(2) flags
@@ -267,15 +274,32 @@ enum io_uring_op {
  * IORING_RECV_MULTISHOT	Multishot recv. Sets IORING_CQE_F_MORE if
  *				the handler will continue to report
  *				CQEs on behalf of the same SQE.
+ *
+ * IORING_RECVSEND_FIXED_BUF	Use registered buffers, the index is stored in
+ *				the buf_index field.
+ *
+ * IORING_RECVSEND_NOTIF_FLUSH	Flush a notification after a successful
+ *				send. Only for zerocopy sends.
  */
 #define IORING_RECVSEND_POLL_FIRST	(1U << 0)
-#define IORING_RECV_MULTISHOT	(1U << 1)
+#define IORING_RECV_MULTISHOT		(1U << 1)
+#define IORING_RECVSEND_FIXED_BUF	(1U << 2)
+#define IORING_RECVSEND_NOTIF_FLUSH	(1U << 3)
 
 /*
  * accept flags stored in sqe->ioprio
  */
 #define IORING_ACCEPT_MULTISHOT	(1U << 0)
 
+/*
+ * IORING_OP_RSRC_UPDATE flags
+ */
+enum {
+	IORING_RSRC_UPDATE_FILES,
+	IORING_RSRC_UPDATE_NOTIF,
+};
+
 /*
  * IORING_OP_MSG_RING command types, stored in sqe->addr
  */
@@ -457,6 +481,10 @@ enum {
 	/* register a range of fixed file slots for automatic slot allocation */
 	IORING_REGISTER_FILE_ALLOC_RANGE	= 25,
 
+	/* zerocopy notification API */
+	IORING_REGISTER_NOTIFIERS		= 26,
+	IORING_UNREGISTER_NOTIFIERS		= 27,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };
@@ -503,6 +531,19 @@ struct io_uring_rsrc_update2 {
 	__u32 resv2;
 };
 
+struct io_uring_notification_slot {
+	__u64 tag;
+	__u64 resv[3];
+};
+
+struct io_uring_notification_register {
+	__u32 nr_slots;
+	__u32 resv;
+	__u64 resv2;
+	__u64 data;
+	__u64 resv3;
+};
+
 /* Skip updating fd indexes set to this value in the fd table */
 #define IORING_REGISTER_FILES_SKIP	(-2)
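
For orientation, the sketch below shows how userspace might drive the notification UAPI declared above. It is not part of the patch: the helper names (sys_io_uring_register, register_notif_slots, prep_sendzc) are hypothetical, ring setup and SQE submission are elided, and passing sizeof(reg) as nr_args for IORING_REGISTER_NOTIFIERS is an assumption about the register handler. Only the opcodes, flags, and structs come from the hunks above.

/*
 * Hedged sketch, not from this patch: exercising the zerocopy
 * notification UAPI declared above. Assumes <linux/io_uring.h> from a
 * kernel carrying this series.
 */
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

/* thin wrapper over the io_uring_register(2) syscall */
static int sys_io_uring_register(int ring_fd, unsigned int opcode,
				 const void *arg, unsigned int nr_args)
{
	return (int)syscall(__NR_io_uring_register, ring_fd, opcode, arg,
			    nr_args);
}

/*
 * Register two notification slots. Each slot's tag should come back in
 * the CQE posted once the kernel is done with the buffers of sends
 * attached to that slot. nr_args == sizeof(reg) is an assumption.
 */
static int register_notif_slots(int ring_fd)
{
	struct io_uring_notification_slot slots[2] = {
		{ .tag = 0xa1 },
		{ .tag = 0xa2 },
	};
	struct io_uring_notification_register reg = {
		.nr_slots = 2,
		.data = (unsigned long)slots,
	};

	return sys_io_uring_register(ring_fd, IORING_REGISTER_NOTIFIERS,
				     &reg, sizeof(reg));
}

/*
 * Fill an SQE for a zerocopy send bound to notification slot 0 and ask
 * for the notification to be flushed once the send succeeds. Putting
 * the IORING_RECVSEND_* flags in sqe->ioprio follows the send/recv
 * flag convention in the header.
 */
static void prep_sendzc(struct io_uring_sqe *sqe, int sockfd,
			const void *buf, unsigned int len)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_SENDZC_NOTIF;
	sqe->fd = sockfd;
	sqe->addr = (unsigned long)buf;
	sqe->len = len;
	sqe->notification_idx = 0;
	sqe->ioprio = IORING_RECVSEND_NOTIF_FLUSH;
}

A send prepared this way completes like a normal send, but per the flag documentation above the buffer must stay untouched until the notification bound to slot 0 fires, i.e. until a CQE carrying that slot's registered tag arrives.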