From cd967e05715489c5d1059d8d3012c747e5cfb1c4 Mon Sep 17 00:00:00 2001
From: Patrick McHardy
Date: Wed, 17 Apr 2013 06:46:56 +0000
Subject: netlink: add symbolic value for congested state

Signed-off-by: Patrick McHardy
Signed-off-by: David S. Miller
---
 net/netlink/af_netlink.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index ce2e0064e7f6..f20a81005177 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -68,6 +68,10 @@ struct listeners {
 	unsigned long		masks[0];
 };

+/* state bits */
+#define NETLINK_CONGESTED	0x0
+
+/* flags */
 #define NETLINK_KERNEL_SOCKET	0x1
 #define NETLINK_RECV_PKTINFO	0x2
 #define NETLINK_BROADCAST_SEND_ERROR	0x4
@@ -727,7 +731,7 @@ static void netlink_overrun(struct sock *sk)
 	struct netlink_sock *nlk = nlk_sk(sk);

 	if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) {
-		if (!test_and_set_bit(0, &nlk_sk(sk)->state)) {
+		if (!test_and_set_bit(NETLINK_CONGESTED, &nlk_sk(sk)->state)) {
 			sk->sk_err = ENOBUFS;
 			sk->sk_error_report(sk);
 		}
@@ -788,7 +792,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
 	nlk = nlk_sk(sk);

 	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-	    test_bit(0, &nlk->state)) {
+	    test_bit(NETLINK_CONGESTED, &nlk->state)) {
 		DECLARE_WAITQUEUE(wait, current);
 		if (!*timeo) {
 			if (!ssk || netlink_is_kernel(ssk))
@@ -802,7 +806,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
 		add_wait_queue(&nlk->wait, &wait);

 		if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-		     test_bit(0, &nlk->state)) &&
+		     test_bit(NETLINK_CONGESTED, &nlk->state)) &&
 		    !sock_flag(sk, SOCK_DEAD))
 			*timeo = schedule_timeout(*timeo);

@@ -872,8 +876,8 @@ static void netlink_rcv_wake(struct sock *sk)
 	struct netlink_sock *nlk = nlk_sk(sk);

 	if (skb_queue_empty(&sk->sk_receive_queue))
-		clear_bit(0, &nlk->state);
-	if (!test_bit(0, &nlk->state))
+		clear_bit(NETLINK_CONGESTED, &nlk->state);
+	if (!test_bit(NETLINK_CONGESTED, &nlk->state))
 		wake_up_interruptible(&nlk->wait);
 }

@@ -957,7 +961,7 @@ static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
 	struct netlink_sock *nlk = nlk_sk(sk);

 	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
-	    !test_bit(0, &nlk->state)) {
+	    !test_bit(NETLINK_CONGESTED, &nlk->state)) {
 		skb_set_owner_r(skb, sk);
 		__netlink_sendskb(sk, skb);
 		return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1);
@@ -1235,7 +1239,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 	case NETLINK_NO_ENOBUFS:
 		if (val) {
 			nlk->flags |= NETLINK_RECV_NO_ENOBUFS;
-			clear_bit(0, &nlk->state);
+			clear_bit(NETLINK_CONGESTED, &nlk->state);
 			wake_up_interruptible(&nlk->wait);
 		} else {
 			nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS;
-- cgit v1.2.3-70-g09d2

From e32123e59871b9389d5b3fe9318611c7f1d1307a Mon Sep 17 00:00:00 2001
From: Patrick McHardy
Date: Wed, 17 Apr 2013 06:46:57 +0000
Subject: netlink: rename ssk to sk in struct netlink_skb_params

Memory mapped netlink needs to store the receiving userspace socket
when sending from the kernel to userspace. Rename 'ssk' to 'sk' to
avoid confusion.

Signed-off-by: Patrick McHardy
Signed-off-by: David S.
Miller --- include/linux/netlink.h | 2 +- net/ipv4/inet_diag.c | 6 +++--- net/ipv4/udp_diag.c | 4 ++-- net/netfilter/nfnetlink_log.c | 2 +- net/netlink/af_netlink.c | 2 +- net/sched/cls_flow.c | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/linux/netlink.h b/include/linux/netlink.h index e0f746b7b95c..d8e9264ae04a 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -19,7 +19,7 @@ struct netlink_skb_parms { struct scm_creds creds; /* Skb credentials */ __u32 portid; __u32 dst_group; - struct sock *ssk; + struct sock *sk; }; #define NETLINK_CB(skb) (*(struct netlink_skb_parms*)&((skb)->cb)) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 8620408af574..5f648751fce2 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -324,7 +324,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s } err = sk_diag_fill(sk, rep, req, - sk_user_ns(NETLINK_CB(in_skb).ssk), + sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, nlh); if (err < 0) { @@ -630,7 +630,7 @@ static int inet_csk_diag_dump(struct sock *sk, return 0; return inet_csk_diag_fill(sk, skb, r, - sk_user_ns(NETLINK_CB(cb->skb).ssk), + sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); } @@ -805,7 +805,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, } err = inet_diag_fill_req(skb, sk, req, - sk_user_ns(NETLINK_CB(cb->skb).ssk), + sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh); if (err < 0) { diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 369a781851ad..7927db0a9279 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -25,7 +25,7 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, return 0; return inet_sk_diag_fill(sk, NULL, skb, req, - sk_user_ns(NETLINK_CB(cb->skb).ssk), + sk_user_ns(NETLINK_CB(cb->skb).sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); } @@ -71,7 +71,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, goto out; err = inet_sk_diag_fill(sk, NULL, rep, req, - sk_user_ns(NETLINK_CB(in_skb).ssk), + sk_user_ns(NETLINK_CB(in_skb).sk), NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, nlh); if (err < 0) { diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 1a0be2af1dd8..50aaf716cd68 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -824,7 +824,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb, inst = instance_create(net, group_num, NETLINK_CB(skb).portid, - sk_user_ns(NETLINK_CB(skb).ssk)); + sk_user_ns(NETLINK_CB(skb).sk)); if (IS_ERR(inst)) { ret = PTR_ERR(inst); goto out; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f20a81005177..978a61f7c87f 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -891,7 +891,7 @@ static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb, if (nlk->netlink_rcv != NULL) { ret = skb->len; skb_set_owner_r(skb, sk); - NETLINK_CB(skb).ssk = ssk; + NETLINK_CB(skb).sk = ssk; nlk->netlink_rcv(skb); consume_skb(skb); } else { diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index aa36a8c8b33b..7881e2fccbc2 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -393,7 +393,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, return -EOPNOTSUPP; if ((keymask & (FLOW_KEY_SKUID|FLOW_KEY_SKGID)) && - 
sk_user_ns(NETLINK_CB(in_skb).ssk) != &init_user_ns) + sk_user_ns(NETLINK_CB(in_skb).sk) != &init_user_ns) return -EOPNOTSUPP; } -- cgit v1.2.3-70-g09d2 From 0ebd0ac5ff01ebf412e1bd3c33620ef7ffc5d866 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 17 Apr 2013 06:46:58 +0000 Subject: net: add function to allocate sk_buff head without data area Add a function to allocate a sk_buff head without any data. This will be used by memory mapped netlink to attach data from the mmaped area to the skb. Additionally change skb_release_all() to check whether the skb has a data area to allow the skb destructor to clear the data pointer in case only a head has been allocated. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/skbuff.h | 6 ++++++ net/core/skbuff.c | 30 +++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f5bed7b31954..2e0ced1af3b1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -651,6 +651,12 @@ static inline struct sk_buff *alloc_skb_fclone(unsigned int size, return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE); } +extern struct sk_buff *__alloc_skb_head(gfp_t priority, int node); +static inline struct sk_buff *alloc_skb_head(gfp_t priority) +{ + return __alloc_skb_head(priority, -1); +} + extern struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src); extern int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask); extern struct sk_buff *skb_clone(struct sk_buff *skb, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a92d9e7d10f7..898cf5c566f9 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -179,6 +179,33 @@ out: * */ +struct sk_buff *__alloc_skb_head(gfp_t gfp_mask, int node) +{ + struct sk_buff *skb; + + /* Get the HEAD */ + skb = kmem_cache_alloc_node(skbuff_head_cache, + gfp_mask & ~__GFP_DMA, node); + if (!skb) + goto out; + + /* + * Only clear those fields we need to clear, not those that we will + * actually initialise below. Hence, don't put any more fields after + * the tail pointer in struct sk_buff! + */ + memset(skb, 0, offsetof(struct sk_buff, tail)); + skb->data = NULL; + skb->truesize = sizeof(struct sk_buff); + atomic_set(&skb->users, 1); + +#ifdef NET_SKBUFF_DATA_USES_OFFSET + skb->mac_header = ~0U; +#endif +out: + return skb; +} + /** * __alloc_skb - allocate a network buffer * @size: size to allocate @@ -584,7 +611,8 @@ static void skb_release_head_state(struct sk_buff *skb) static void skb_release_all(struct sk_buff *skb) { skb_release_head_state(skb); - skb_release_data(skb); + if (likely(skb->data)) + skb_release_data(skb); } /** -- cgit v1.2.3-70-g09d2 From 1298ca4671acb10310baa550ed044c553e3a3387 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 17 Apr 2013 06:46:59 +0000 Subject: netlink: don't orphan skb in netlink_trim() Netlink doesn't account skbs to the sending socket, so the there's no need to orphan the skb before trimming it. Removing the skb_orphan() call is required for mmap'ed netlink, which uses a netlink specific skb destructor that must not be invoked before the final freeing of the skb. Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/netlink/af_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 978a61f7c87f..26779c24b1d4 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -851,7 +851,7 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) { int delta; - skb_orphan(skb); + WARN_ON(skb->sk != NULL); delta = skb->end - skb->tail; if (delta * 2 < skb->truesize) -- cgit v1.2.3-70-g09d2 From cf0a018ac669955c10e4fca24fa55dde58434e9a Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 17 Apr 2013 06:47:00 +0000 Subject: netlink: add netlink_skb_set_owner_r() For mmap'ed I/O a netlink specific skb destructor needs to be invoked after the final kfree_skb() to clean up state. This doesn't work currently since the skb's ownership is transfered to the receiving socket using skb_set_owner_r(), which orphans the skb, thereby invoking the destructor prematurely. Since netlink doesn't account skbs to the originating socket, there's no need to orphan the skb. Add a netlink specific skb_set_owner_r() variant that does not orphan the skb and use a netlink specific destructor to call sock_rfree(). Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 26779c24b1d4..58b9025978fa 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -119,6 +119,20 @@ static void netlink_consume_callback(struct netlink_callback *cb) kfree(cb); } +static void netlink_skb_destructor(struct sk_buff *skb) +{ + sock_rfree(skb); +} + +static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) +{ + WARN_ON(skb->sk != NULL); + skb->sk = sk; + skb->destructor = netlink_skb_destructor; + atomic_add(skb->truesize, &sk->sk_rmem_alloc); + sk_mem_charge(sk, skb->truesize); +} + static void netlink_sock_destruct(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); @@ -820,7 +834,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, } return 1; } - skb_set_owner_r(skb, sk); + netlink_skb_set_owner_r(skb, sk); return 0; } @@ -890,7 +904,7 @@ static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb, ret = -ECONNREFUSED; if (nlk->netlink_rcv != NULL) { ret = skb->len; - skb_set_owner_r(skb, sk); + netlink_skb_set_owner_r(skb, sk); NETLINK_CB(skb).sk = ssk; nlk->netlink_rcv(skb); consume_skb(skb); @@ -962,7 +976,7 @@ static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !test_bit(NETLINK_CONGESTED, &nlk->state)) { - skb_set_owner_r(skb, sk); + netlink_skb_set_owner_r(skb, sk); __netlink_sendskb(sk, skb); return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1); } -- cgit v1.2.3-70-g09d2 From ccdfcc398594ddf3f77348c5a10938dbe9efefbe Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 17 Apr 2013 06:47:01 +0000 Subject: netlink: mmaped netlink: ring setup Add support for mmap'ed RX and TX ring setup and teardown based on the af_packet.c code. The following patches will use this to add the real mmap'ed receive and transmit functionality. Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- include/uapi/linux/netlink.h | 32 ++++++ net/Kconfig | 9 ++ net/netlink/af_netlink.c | 268 ++++++++++++++++++++++++++++++++++++++++++- net/netlink/af_netlink.h | 20 ++++ 4 files changed, 327 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 32a354f67ba4..1a85940f8ab7 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -1,6 +1,7 @@ #ifndef _UAPI__LINUX_NETLINK_H #define _UAPI__LINUX_NETLINK_H +#include #include /* for __kernel_sa_family_t */ #include @@ -105,11 +106,42 @@ struct nlmsgerr { #define NETLINK_PKTINFO 3 #define NETLINK_BROADCAST_ERROR 4 #define NETLINK_NO_ENOBUFS 5 +#define NETLINK_RX_RING 6 +#define NETLINK_TX_RING 7 struct nl_pktinfo { __u32 group; }; +struct nl_mmap_req { + unsigned int nm_block_size; + unsigned int nm_block_nr; + unsigned int nm_frame_size; + unsigned int nm_frame_nr; +}; + +struct nl_mmap_hdr { + unsigned int nm_status; + unsigned int nm_len; + __u32 nm_group; + /* credentials */ + __u32 nm_pid; + __u32 nm_uid; + __u32 nm_gid; +}; + +enum nl_mmap_status { + NL_MMAP_STATUS_UNUSED, + NL_MMAP_STATUS_RESERVED, + NL_MMAP_STATUS_VALID, + NL_MMAP_STATUS_COPY, + NL_MMAP_STATUS_SKIP, +}; + +#define NL_MMAP_MSG_ALIGNMENT NLMSG_ALIGNTO +#define NL_MMAP_MSG_ALIGN(sz) __ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT) +#define NL_MMAP_HDRLEN NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr)) + #define NET_MAJOR 36 /* Major 36 is reserved for networking */ enum { diff --git a/net/Kconfig b/net/Kconfig index 2ddc9046868e..1a2221630e6a 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -23,6 +23,15 @@ menuconfig NET if NET +config NETLINK_MMAP + bool "Netlink: mmaped IO" + help + This option enables support for memory mapped netlink IO. This + reduces overhead by avoiding copying data between kernel- and + userspace. + + If unsure, say N. 
+ config WANT_COMPAT_NETLINK_MESSAGES bool help diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 58b9025978fa..1d3c7128e90e 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include @@ -107,6 +108,234 @@ static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask]; } +#ifdef CONFIG_NETLINK_MMAP +static __pure struct page *pgvec_to_page(const void *addr) +{ + if (is_vmalloc_addr(addr)) + return vmalloc_to_page(addr); + else + return virt_to_page(addr); +} + +static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len) +{ + unsigned int i; + + for (i = 0; i < len; i++) { + if (pg_vec[i] != NULL) { + if (is_vmalloc_addr(pg_vec[i])) + vfree(pg_vec[i]); + else + free_pages((unsigned long)pg_vec[i], order); + } + } + kfree(pg_vec); +} + +static void *alloc_one_pg_vec_page(unsigned long order) +{ + void *buffer; + gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | + __GFP_NOWARN | __GFP_NORETRY; + + buffer = (void *)__get_free_pages(gfp_flags, order); + if (buffer != NULL) + return buffer; + + buffer = vzalloc((1 << order) * PAGE_SIZE); + if (buffer != NULL) + return buffer; + + gfp_flags &= ~__GFP_NORETRY; + return (void *)__get_free_pages(gfp_flags, order); +} + +static void **alloc_pg_vec(struct netlink_sock *nlk, + struct nl_mmap_req *req, unsigned int order) +{ + unsigned int block_nr = req->nm_block_nr; + unsigned int i; + void **pg_vec, *ptr; + + pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL); + if (pg_vec == NULL) + return NULL; + + for (i = 0; i < block_nr; i++) { + pg_vec[i] = ptr = alloc_one_pg_vec_page(order); + if (pg_vec[i] == NULL) + goto err1; + } + + return pg_vec; +err1: + free_pg_vec(pg_vec, order, block_nr); + return NULL; +} + +static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, + bool closing, bool tx_ring) +{ + struct netlink_sock *nlk = nlk_sk(sk); + struct netlink_ring *ring; + struct sk_buff_head *queue; + void **pg_vec = NULL; + unsigned int order = 0; + int err; + + ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring; + queue = tx_ring ? 
&sk->sk_write_queue : &sk->sk_receive_queue; + + if (!closing) { + if (atomic_read(&nlk->mapped)) + return -EBUSY; + if (atomic_read(&ring->pending)) + return -EBUSY; + } + + if (req->nm_block_nr) { + if (ring->pg_vec != NULL) + return -EBUSY; + + if ((int)req->nm_block_size <= 0) + return -EINVAL; + if (!IS_ALIGNED(req->nm_block_size, PAGE_SIZE)) + return -EINVAL; + if (req->nm_frame_size < NL_MMAP_HDRLEN) + return -EINVAL; + if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT)) + return -EINVAL; + + ring->frames_per_block = req->nm_block_size / + req->nm_frame_size; + if (ring->frames_per_block == 0) + return -EINVAL; + if (ring->frames_per_block * req->nm_block_nr != + req->nm_frame_nr) + return -EINVAL; + + order = get_order(req->nm_block_size); + pg_vec = alloc_pg_vec(nlk, req, order); + if (pg_vec == NULL) + return -ENOMEM; + } else { + if (req->nm_frame_nr) + return -EINVAL; + } + + err = -EBUSY; + mutex_lock(&nlk->pg_vec_lock); + if (closing || atomic_read(&nlk->mapped) == 0) { + err = 0; + spin_lock_bh(&queue->lock); + + ring->frame_max = req->nm_frame_nr - 1; + ring->head = 0; + ring->frame_size = req->nm_frame_size; + ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE; + + swap(ring->pg_vec_len, req->nm_block_nr); + swap(ring->pg_vec_order, order); + swap(ring->pg_vec, pg_vec); + + __skb_queue_purge(queue); + spin_unlock_bh(&queue->lock); + + WARN_ON(atomic_read(&nlk->mapped)); + } + mutex_unlock(&nlk->pg_vec_lock); + + if (pg_vec) + free_pg_vec(pg_vec, order, req->nm_block_nr); + return err; +} + +static void netlink_mm_open(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct socket *sock = file->private_data; + struct sock *sk = sock->sk; + + if (sk) + atomic_inc(&nlk_sk(sk)->mapped); +} + +static void netlink_mm_close(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct socket *sock = file->private_data; + struct sock *sk = sock->sk; + + if (sk) + atomic_dec(&nlk_sk(sk)->mapped); +} + +static const struct vm_operations_struct netlink_mmap_ops = { + .open = netlink_mm_open, + .close = netlink_mm_close, +}; + +static int netlink_mmap(struct file *file, struct socket *sock, + struct vm_area_struct *vma) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + struct netlink_ring *ring; + unsigned long start, size, expected; + unsigned int i; + int err = -EINVAL; + + if (vma->vm_pgoff) + return -EINVAL; + + mutex_lock(&nlk->pg_vec_lock); + + expected = 0; + for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { + if (ring->pg_vec == NULL) + continue; + expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE; + } + + if (expected == 0) + goto out; + + size = vma->vm_end - vma->vm_start; + if (size != expected) + goto out; + + start = vma->vm_start; + for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { + if (ring->pg_vec == NULL) + continue; + + for (i = 0; i < ring->pg_vec_len; i++) { + struct page *page; + void *kaddr = ring->pg_vec[i]; + unsigned int pg_num; + + for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) { + page = pgvec_to_page(kaddr); + err = vm_insert_page(vma, start, page); + if (err < 0) + goto out; + start += PAGE_SIZE; + kaddr += PAGE_SIZE; + } + } + } + + atomic_inc(&nlk->mapped); + vma->vm_ops = &netlink_mmap_ops; + err = 0; +out: + mutex_unlock(&nlk->pg_vec_lock); + return 0; +} +#else /* CONFIG_NETLINK_MMAP */ +#define netlink_mmap sock_no_mmap +#endif /* CONFIG_NETLINK_MMAP */ + static void netlink_destroy_callback(struct netlink_callback *cb) { 
kfree_skb(cb->skb); @@ -146,6 +375,18 @@ static void netlink_sock_destruct(struct sock *sk) } skb_queue_purge(&sk->sk_receive_queue); +#ifdef CONFIG_NETLINK_MMAP + if (1) { + struct nl_mmap_req req; + + memset(&req, 0, sizeof(req)); + if (nlk->rx_ring.pg_vec) + netlink_set_ring(sk, &req, true, false); + memset(&req, 0, sizeof(req)); + if (nlk->tx_ring.pg_vec) + netlink_set_ring(sk, &req, true, true); + } +#endif /* CONFIG_NETLINK_MMAP */ if (!sock_flag(sk, SOCK_DEAD)) { printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); @@ -409,6 +650,9 @@ static int __netlink_create(struct net *net, struct socket *sock, mutex_init(nlk->cb_mutex); } init_waitqueue_head(&nlk->wait); +#ifdef CONFIG_NETLINK_MMAP + mutex_init(&nlk->pg_vec_lock); +#endif sk->sk_destruct = netlink_sock_destruct; sk->sk_protocol = protocol; @@ -1211,7 +1455,8 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, if (level != SOL_NETLINK) return -ENOPROTOOPT; - if (optlen >= sizeof(int) && + if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING && + optlen >= sizeof(int) && get_user(val, (unsigned int __user *)optval)) return -EFAULT; @@ -1260,6 +1505,25 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, } err = 0; break; +#ifdef CONFIG_NETLINK_MMAP + case NETLINK_RX_RING: + case NETLINK_TX_RING: { + struct nl_mmap_req req; + + /* Rings might consume more memory than queue limits, require + * CAP_NET_ADMIN. + */ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (optlen < sizeof(req)) + return -EINVAL; + if (copy_from_user(&req, optval, sizeof(req))) + return -EFAULT; + err = netlink_set_ring(sk, &req, false, + optname == NETLINK_TX_RING); + break; + } +#endif /* CONFIG_NETLINK_MMAP */ default: err = -ENOPROTOOPT; } @@ -2093,7 +2357,7 @@ static const struct proto_ops netlink_ops = { .getsockopt = netlink_getsockopt, .sendmsg = netlink_sendmsg, .recvmsg = netlink_recvmsg, - .mmap = sock_no_mmap, + .mmap = netlink_mmap, .sendpage = sock_no_sendpage, }; diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index d9acb2a1d855..ed8522265f4e 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -6,6 +6,20 @@ #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) #define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) +struct netlink_ring { + void **pg_vec; + unsigned int head; + unsigned int frames_per_block; + unsigned int frame_size; + unsigned int frame_max; + + unsigned int pg_vec_order; + unsigned int pg_vec_pages; + unsigned int pg_vec_len; + + atomic_t pending; +}; + struct netlink_sock { /* struct sock has to be the first member of netlink_sock */ struct sock sk; @@ -24,6 +38,12 @@ struct netlink_sock { void (*netlink_rcv)(struct sk_buff *skb); void (*netlink_bind)(int group); struct module *module; +#ifdef CONFIG_NETLINK_MMAP + struct mutex pg_vec_lock; + struct netlink_ring rx_ring; + struct netlink_ring tx_ring; + atomic_t mapped; +#endif /* CONFIG_NETLINK_MMAP */ }; static inline struct netlink_sock *nlk_sk(struct sock *sk) -- cgit v1.2.3-70-g09d2 From 9652e931e73be7e54a9c40e9bcd4bbdafe92a406 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 17 Apr 2013 06:47:02 +0000 Subject: netlink: add mmap'ed netlink helper functions Add helper functions for looking up mmap'ed frame headers, reading and writing their status, allocating skbs with mmap'ed data areas and a poll function. Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- include/linux/netlink.h | 7 ++ net/netlink/af_netlink.c | 185 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 190 insertions(+), 2 deletions(-) diff --git a/include/linux/netlink.h b/include/linux/netlink.h index d8e9264ae04a..07c473848dbd 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -15,10 +15,17 @@ static inline struct nlmsghdr *nlmsg_hdr(const struct sk_buff *skb) return (struct nlmsghdr *)skb->data; } +enum netlink_skb_flags { + NETLINK_SKB_MMAPED = 0x1, /* Packet data is mmaped */ + NETLINK_SKB_TX = 0x2, /* Packet was sent by userspace */ + NETLINK_SKB_DELIVERED = 0x4, /* Packet was delivered */ +}; + struct netlink_skb_parms { struct scm_creds creds; /* Skb credentials */ __u32 portid; __u32 dst_group; + __u32 flags; struct sock *sk; }; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 1d3c7128e90e..6560635fd25c 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include @@ -89,6 +90,7 @@ EXPORT_SYMBOL_GPL(nl_table); static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); static int netlink_dump(struct sock *sk); +static void netlink_skb_destructor(struct sk_buff *skb); DEFINE_RWLOCK(nl_table_lock); EXPORT_SYMBOL_GPL(nl_table_lock); @@ -109,6 +111,11 @@ static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u } #ifdef CONFIG_NETLINK_MMAP +static bool netlink_skb_is_mmaped(const struct sk_buff *skb) +{ + return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; +} + static __pure struct page *pgvec_to_page(const void *addr) { if (is_vmalloc_addr(addr)) @@ -332,8 +339,154 @@ out: mutex_unlock(&nlk->pg_vec_lock); return 0; } + +static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr) +{ +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 + struct page *p_start, *p_end; + + /* First page is flushed through netlink_{get,set}_status */ + p_start = pgvec_to_page(hdr + PAGE_SIZE); + p_end = pgvec_to_page((void *)hdr + NL_MMAP_MSG_HDRLEN + hdr->nm_len - 1); + while (p_start <= p_end) { + flush_dcache_page(p_start); + p_start++; + } +#endif +} + +static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr) +{ + smp_rmb(); + flush_dcache_page(pgvec_to_page(hdr)); + return hdr->nm_status; +} + +static void netlink_set_status(struct nl_mmap_hdr *hdr, + enum nl_mmap_status status) +{ + hdr->nm_status = status; + flush_dcache_page(pgvec_to_page(hdr)); + smp_wmb(); +} + +static struct nl_mmap_hdr * +__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos) +{ + unsigned int pg_vec_pos, frame_off; + + pg_vec_pos = pos / ring->frames_per_block; + frame_off = pos % ring->frames_per_block; + + return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size); +} + +static struct nl_mmap_hdr * +netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos, + enum nl_mmap_status status) +{ + struct nl_mmap_hdr *hdr; + + hdr = __netlink_lookup_frame(ring, pos); + if (netlink_get_status(hdr) != status) + return NULL; + + return hdr; +} + +static struct nl_mmap_hdr * +netlink_current_frame(const struct netlink_ring *ring, + enum nl_mmap_status status) +{ + return netlink_lookup_frame(ring, ring->head, status); +} + +static struct nl_mmap_hdr * +netlink_previous_frame(const struct netlink_ring *ring, + enum nl_mmap_status status) +{ + unsigned int prev; + + prev = ring->head ? 
ring->head - 1 : ring->frame_max; + return netlink_lookup_frame(ring, prev, status); +} + +static void netlink_increment_head(struct netlink_ring *ring) +{ + ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0; +} + +static void netlink_forward_ring(struct netlink_ring *ring) +{ + unsigned int head = ring->head, pos = head; + const struct nl_mmap_hdr *hdr; + + do { + hdr = __netlink_lookup_frame(ring, pos); + if (hdr->nm_status == NL_MMAP_STATUS_UNUSED) + break; + if (hdr->nm_status != NL_MMAP_STATUS_SKIP) + break; + netlink_increment_head(ring); + } while (ring->head != head); +} + +static unsigned int netlink_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + unsigned int mask; + + mask = datagram_poll(file, sock, wait); + + spin_lock_bh(&sk->sk_receive_queue.lock); + if (nlk->rx_ring.pg_vec) { + netlink_forward_ring(&nlk->rx_ring); + if (!netlink_previous_frame(&nlk->rx_ring, NL_MMAP_STATUS_UNUSED)) + mask |= POLLIN | POLLRDNORM; + } + spin_unlock_bh(&sk->sk_receive_queue.lock); + + spin_lock_bh(&sk->sk_write_queue.lock); + if (nlk->tx_ring.pg_vec) { + if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED)) + mask |= POLLOUT | POLLWRNORM; + } + spin_unlock_bh(&sk->sk_write_queue.lock); + + return mask; +} + +static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb) +{ + return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN); +} + +static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk, + struct netlink_ring *ring, + struct nl_mmap_hdr *hdr) +{ + unsigned int size; + void *data; + + size = ring->frame_size - NL_MMAP_HDRLEN; + data = (void *)hdr + NL_MMAP_HDRLEN; + + skb->head = data; + skb->data = data; + skb_reset_tail_pointer(skb); + skb->end = skb->tail + size; + skb->len = 0; + + skb->destructor = netlink_skb_destructor; + NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED; + NETLINK_CB(skb).sk = sk; +} #else /* CONFIG_NETLINK_MMAP */ +#define netlink_skb_is_mmaped(skb) false #define netlink_mmap sock_no_mmap +#define netlink_poll datagram_poll #endif /* CONFIG_NETLINK_MMAP */ static void netlink_destroy_callback(struct netlink_callback *cb) @@ -350,7 +503,35 @@ static void netlink_consume_callback(struct netlink_callback *cb) static void netlink_skb_destructor(struct sk_buff *skb) { - sock_rfree(skb); +#ifdef CONFIG_NETLINK_MMAP + struct nl_mmap_hdr *hdr; + struct netlink_ring *ring; + struct sock *sk; + + /* If a packet from the kernel to userspace was freed because of an + * error without being delivered to userspace, the kernel must reset + * the status. In the direction userspace to kernel, the status is + * always reset here after the packet was processed and freed. 
+ */ + if (netlink_skb_is_mmaped(skb)) { + hdr = netlink_mmap_hdr(skb); + sk = NETLINK_CB(skb).sk; + + if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) { + hdr->nm_len = 0; + netlink_set_status(hdr, NL_MMAP_STATUS_VALID); + } + ring = &nlk_sk(sk)->rx_ring; + + WARN_ON(atomic_read(&ring->pending) == 0); + atomic_dec(&ring->pending); + sock_put(sk); + + skb->data = NULL; + } +#endif + if (skb->sk != NULL) + sock_rfree(skb); } static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) @@ -2349,7 +2530,7 @@ static const struct proto_ops netlink_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = netlink_getname, - .poll = datagram_poll, + .poll = netlink_poll, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, -- cgit v1.2.3-70-g09d2 From 5fd96123ee19b96be7d7b57fd42227e1a146ef05 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 17 Apr 2013 06:47:03 +0000 Subject: netlink: implement memory mapped sendmsg() Add support for mmap'ed sendmsg() to netlink. Since the kernel validates received messages before processing them, the code makes sure userspace can't modify the message contents after invoking sendmsg(). To do that only a single mapping of the TX ring is allowed to exist and the socket must not be shared. If either of these two conditions does not hold, it falls back to copying. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 135 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 129 insertions(+), 6 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 6560635fd25c..90504a0e42ab 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -116,6 +116,11 @@ static bool netlink_skb_is_mmaped(const struct sk_buff *skb) return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; } +static bool netlink_tx_is_mmaped(struct sock *sk) +{ + return nlk_sk(sk)->tx_ring.pg_vec != NULL; +} + static __pure struct page *pgvec_to_page(const void *addr) { if (is_vmalloc_addr(addr)) @@ -438,6 +443,9 @@ static unsigned int netlink_poll(struct file *file, struct socket *sock, struct netlink_sock *nlk = nlk_sk(sk); unsigned int mask; + if (nlk->cb != NULL && nlk->rx_ring.pg_vec != NULL) + netlink_dump(sk); + mask = datagram_poll(file, sock, wait); spin_lock_bh(&sk->sk_receive_queue.lock); @@ -483,10 +491,110 @@ static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk, NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED; NETLINK_CB(skb).sk = sk; } + +static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg, + u32 dst_portid, u32 dst_group, + struct sock_iocb *siocb) +{ + struct netlink_sock *nlk = nlk_sk(sk); + struct netlink_ring *ring; + struct nl_mmap_hdr *hdr; + struct sk_buff *skb; + unsigned int maxlen; + bool excl = true; + int err = 0, len = 0; + + /* Netlink messages are validated by the receiver before processing. + * In order to avoid userspace changing the contents of the message + * after validation, the socket and the ring may only be used by a + * single process, otherwise we fall back to copying. 
+ */ + if (atomic_long_read(&sk->sk_socket->file->f_count) > 2 || + atomic_read(&nlk->mapped) > 1) + excl = false; + + mutex_lock(&nlk->pg_vec_lock); + + ring = &nlk->tx_ring; + maxlen = ring->frame_size - NL_MMAP_HDRLEN; + + do { + hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID); + if (hdr == NULL) { + if (!(msg->msg_flags & MSG_DONTWAIT) && + atomic_read(&nlk->tx_ring.pending)) + schedule(); + continue; + } + if (hdr->nm_len > maxlen) { + err = -EINVAL; + goto out; + } + + netlink_frame_flush_dcache(hdr); + + if (likely(dst_portid == 0 && dst_group == 0 && excl)) { + skb = alloc_skb_head(GFP_KERNEL); + if (skb == NULL) { + err = -ENOBUFS; + goto out; + } + sock_hold(sk); + netlink_ring_setup_skb(skb, sk, ring, hdr); + NETLINK_CB(skb).flags |= NETLINK_SKB_TX; + __skb_put(skb, hdr->nm_len); + netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); + atomic_inc(&ring->pending); + } else { + skb = alloc_skb(hdr->nm_len, GFP_KERNEL); + if (skb == NULL) { + err = -ENOBUFS; + goto out; + } + __skb_put(skb, hdr->nm_len); + memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, hdr->nm_len); + netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); + } + + netlink_increment_head(ring); + + NETLINK_CB(skb).portid = nlk->portid; + NETLINK_CB(skb).dst_group = dst_group; + NETLINK_CB(skb).creds = siocb->scm->creds; + + err = security_netlink_send(sk, skb); + if (err) { + kfree_skb(skb); + goto out; + } + + if (unlikely(dst_group)) { + atomic_inc(&skb->users); + netlink_broadcast(sk, skb, dst_portid, dst_group, + GFP_KERNEL); + } + err = netlink_unicast(sk, skb, dst_portid, + msg->msg_flags & MSG_DONTWAIT); + if (err < 0) + goto out; + len += err; + + } while (hdr != NULL || + (!(msg->msg_flags & MSG_DONTWAIT) && + atomic_read(&nlk->tx_ring.pending))); + + if (len > 0) + err = len; +out: + mutex_unlock(&nlk->pg_vec_lock); + return err; +} #else /* CONFIG_NETLINK_MMAP */ #define netlink_skb_is_mmaped(skb) false +#define netlink_tx_is_mmaped(sk) false #define netlink_mmap sock_no_mmap #define netlink_poll datagram_poll +#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, siocb) 0 #endif /* CONFIG_NETLINK_MMAP */ static void netlink_destroy_callback(struct netlink_callback *cb) @@ -517,11 +625,16 @@ static void netlink_skb_destructor(struct sk_buff *skb) hdr = netlink_mmap_hdr(skb); sk = NETLINK_CB(skb).sk; - if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) { - hdr->nm_len = 0; - netlink_set_status(hdr, NL_MMAP_STATUS_VALID); + if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) { + netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); + ring = &nlk_sk(sk)->tx_ring; + } else { + if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) { + hdr->nm_len = 0; + netlink_set_status(hdr, NL_MMAP_STATUS_VALID); + } + ring = &nlk_sk(sk)->rx_ring; } - ring = &nlk_sk(sk)->rx_ring; WARN_ON(atomic_read(&ring->pending) == 0); atomic_dec(&ring->pending); @@ -1230,8 +1343,9 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, nlk = nlk_sk(sk); - if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - test_bit(NETLINK_CONGESTED, &nlk->state)) { + if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || + test_bit(NETLINK_CONGESTED, &nlk->state)) && + !netlink_skb_is_mmaped(skb)) { DECLARE_WAITQUEUE(wait, current); if (!*timeo) { if (!ssk || netlink_is_kernel(ssk)) @@ -1291,6 +1405,8 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) int delta; WARN_ON(skb->sk != NULL); + if (netlink_skb_is_mmaped(skb)) + return skb; delta = skb->end - skb->tail; if (delta * 2 < skb->truesize) @@ -1815,6 +1931,13 @@ 
static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, goto out; } + if (netlink_tx_is_mmaped(sk) && + msg->msg_iov->iov_base == NULL) { + err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, + siocb); + goto out; + } + err = -EMSGSIZE; if (len > sk->sk_sndbuf - 32) goto out; -- cgit v1.2.3-70-g09d2 From f9c2288837ba072b21dba955f04a4c97eaa77b1e Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 17 Apr 2013 06:47:04 +0000 Subject: netlink: implement memory mapped recvmsg() Add support for mmap'ed recvmsg(). To allow the kernel to construct messages into the mapped area, a dataless skb is allocated and the data pointer is set to point into the ring frame. This means frames will be delivered to userspace in order of allocation instead of order of transmission. This usually doesn't matter since the order is either not determinable by userspace or message creation/transmission is serialized. The only case where this can have a visible difference is nfnetlink_queue. Userspace can't assume mmap'ed messages have ordered IDs anymore and needs to check this if using batched verdicts. For non-mapped sockets, nothing changes. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netlink.h | 2 + net/netlink/af_netlink.c | 144 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 143 insertions(+), 3 deletions(-) diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 07c473848dbd..6358da5eeee8 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -64,6 +64,8 @@ extern void __netlink_clear_multicast_users(struct sock *sk, unsigned int group) extern void netlink_clear_multicast_users(struct sock *sk, unsigned int group); extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err); extern int netlink_has_listeners(struct sock *sk, unsigned int group); +extern struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, + u32 dst_portid, gfp_t gfp_mask); extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock); extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 portid, __u32 group, gfp_t allocation); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 90504a0e42ab..d120b5d4d86a 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -116,6 +116,11 @@ static bool netlink_skb_is_mmaped(const struct sk_buff *skb) return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; } +static bool netlink_rx_is_mmaped(struct sock *sk) +{ + return nlk_sk(sk)->rx_ring.pg_vec != NULL; +} + static bool netlink_tx_is_mmaped(struct sock *sk) { return nlk_sk(sk)->tx_ring.pg_vec != NULL; @@ -589,8 +594,54 @@ out: mutex_unlock(&nlk->pg_vec_lock); return err; } + +static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb) +{ + struct nl_mmap_hdr *hdr; + + hdr = netlink_mmap_hdr(skb); + hdr->nm_len = skb->len; + hdr->nm_group = NETLINK_CB(skb).dst_group; + hdr->nm_pid = NETLINK_CB(skb).creds.pid; + hdr->nm_uid = NETLINK_CB(skb).creds.uid; + hdr->nm_gid = NETLINK_CB(skb).creds.gid; + netlink_frame_flush_dcache(hdr); + netlink_set_status(hdr, NL_MMAP_STATUS_VALID); + + NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED; + kfree_skb(skb); +} + +static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb) +{ + struct netlink_sock *nlk = nlk_sk(sk); + struct netlink_ring *ring = &nlk->rx_ring; + struct nl_mmap_hdr *hdr; + + spin_lock_bh(&sk->sk_receive_queue.lock); + hdr = netlink_current_frame(ring, 
NL_MMAP_STATUS_UNUSED); + if (hdr == NULL) { + spin_unlock_bh(&sk->sk_receive_queue.lock); + kfree_skb(skb); + sk->sk_err = ENOBUFS; + sk->sk_error_report(sk); + return; + } + netlink_increment_head(ring); + __skb_queue_tail(&sk->sk_receive_queue, skb); + spin_unlock_bh(&sk->sk_receive_queue.lock); + + hdr->nm_len = skb->len; + hdr->nm_group = NETLINK_CB(skb).dst_group; + hdr->nm_pid = NETLINK_CB(skb).creds.pid; + hdr->nm_uid = NETLINK_CB(skb).creds.uid; + hdr->nm_gid = NETLINK_CB(skb).creds.gid; + netlink_set_status(hdr, NL_MMAP_STATUS_COPY); +} + #else /* CONFIG_NETLINK_MMAP */ #define netlink_skb_is_mmaped(skb) false +#define netlink_rx_is_mmaped(sk) false #define netlink_tx_is_mmaped(sk) false #define netlink_mmap sock_no_mmap #define netlink_poll datagram_poll @@ -1381,7 +1432,14 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) { int len = skb->len; - skb_queue_tail(&sk->sk_receive_queue, skb); +#ifdef CONFIG_NETLINK_MMAP + if (netlink_skb_is_mmaped(skb)) + netlink_queue_mmaped_skb(sk, skb); + else if (netlink_rx_is_mmaped(sk)) + netlink_ring_set_copied(sk, skb); + else +#endif /* CONFIG_NETLINK_MMAP */ + skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk, len); return len; } @@ -1492,6 +1550,68 @@ retry: } EXPORT_SYMBOL(netlink_unicast); +struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, + u32 dst_portid, gfp_t gfp_mask) +{ +#ifdef CONFIG_NETLINK_MMAP + struct sock *sk = NULL; + struct sk_buff *skb; + struct netlink_ring *ring; + struct nl_mmap_hdr *hdr; + unsigned int maxlen; + + sk = netlink_getsockbyportid(ssk, dst_portid); + if (IS_ERR(sk)) + goto out; + + ring = &nlk_sk(sk)->rx_ring; + /* fast-path without atomic ops for common case: non-mmaped receiver */ + if (ring->pg_vec == NULL) + goto out_put; + + skb = alloc_skb_head(gfp_mask); + if (skb == NULL) + goto err1; + + spin_lock_bh(&sk->sk_receive_queue.lock); + /* check again under lock */ + if (ring->pg_vec == NULL) + goto out_free; + + maxlen = ring->frame_size - NL_MMAP_HDRLEN; + if (maxlen < size) + goto out_free; + + netlink_forward_ring(ring); + hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); + if (hdr == NULL) + goto err2; + netlink_ring_setup_skb(skb, sk, ring, hdr); + netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); + atomic_inc(&ring->pending); + netlink_increment_head(ring); + + spin_unlock_bh(&sk->sk_receive_queue.lock); + return skb; + +err2: + kfree_skb(skb); + spin_unlock_bh(&sk->sk_receive_queue.lock); +err1: + sock_put(sk); + return NULL; + +out_free: + kfree_skb(skb); + spin_unlock_bh(&sk->sk_receive_queue.lock); +out_put: + sock_put(sk); +out: +#endif + return alloc_skb(size, gfp_mask); +} +EXPORT_SYMBOL_GPL(netlink_alloc_skb); + int netlink_has_listeners(struct sock *sk, unsigned int group) { int res = 0; @@ -2270,9 +2390,13 @@ static int netlink_dump(struct sock *sk) alloc_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); - skb = sock_rmalloc(sk, alloc_size, 0, GFP_KERNEL); + if (!netlink_rx_is_mmaped(sk) && + atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + goto errout_skb; + skb = netlink_alloc_skb(sk, alloc_size, nlk->portid, GFP_KERNEL); if (!skb) goto errout_skb; + netlink_skb_set_owner_r(skb, sk); len = cb->dump(skb, cb); @@ -2327,6 +2451,19 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, if (cb == NULL) return -ENOBUFS; + /* Memory mapped dump requests need to be copied to avoid looping + * on the pending state in netlink_mmap_sendmsg() while the CB hold + * a reference to the skb. 
+ */ + if (netlink_skb_is_mmaped(skb)) { + skb = skb_copy(skb, GFP_KERNEL); + if (skb == NULL) { + kfree(cb); + return -ENOBUFS; + } + } else + atomic_inc(&skb->users); + cb->dump = control->dump; cb->done = control->done; cb->nlh = nlh; @@ -2387,7 +2524,8 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) if (err) payload += nlmsg_len(nlh); - skb = nlmsg_new(payload, GFP_KERNEL); + skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload), + NETLINK_CB(in_skb).portid, GFP_KERNEL); if (!skb) { struct sock *sk; -- cgit v1.2.3-70-g09d2 From cd1df525da59c64244d27b4548ff5d132489488a Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 17 Apr 2013 06:47:05 +0000 Subject: netlink: add flow control for memory mapped I/O Add flow control for memory mapped RX. Since user-space usually doesn't invoke recvmsg() when using memory mapped I/O, flow control is performed in netlink_poll(). Dumps are allowed to continue if at least half of the ring frames are unused. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 88 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 61 insertions(+), 27 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index d120b5d4d86a..2a3e9ba814c4 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -3,6 +3,7 @@ * * Authors: Alan Cox * Alexey Kuznetsov + * Patrick McHardy * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -110,6 +111,29 @@ static inline struct hlist_head *nl_portid_hashfn(struct nl_portid_hash *hash, u return &hash->table[jhash_1word(portid, hash->rnd) & hash->mask]; } +static void netlink_overrun(struct sock *sk) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) { + if (!test_and_set_bit(NETLINK_CONGESTED, &nlk_sk(sk)->state)) { + sk->sk_err = ENOBUFS; + sk->sk_error_report(sk); + } + } + atomic_inc(&sk->sk_drops); +} + +static void netlink_rcv_wake(struct sock *sk) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (skb_queue_empty(&sk->sk_receive_queue)) + clear_bit(NETLINK_CONGESTED, &nlk->state); + if (!test_bit(NETLINK_CONGESTED, &nlk->state)) + wake_up_interruptible(&nlk->wait); +} + #ifdef CONFIG_NETLINK_MMAP static bool netlink_skb_is_mmaped(const struct sk_buff *skb) { @@ -441,15 +465,48 @@ static void netlink_forward_ring(struct netlink_ring *ring) } while (ring->head != head); } +static bool netlink_dump_space(struct netlink_sock *nlk) +{ + struct netlink_ring *ring = &nlk->rx_ring; + struct nl_mmap_hdr *hdr; + unsigned int n; + + hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); + if (hdr == NULL) + return false; + + n = ring->head + ring->frame_max / 2; + if (n > ring->frame_max) + n -= ring->frame_max; + + hdr = __netlink_lookup_frame(ring, n); + + return hdr->nm_status == NL_MMAP_STATUS_UNUSED; +} + static unsigned int netlink_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); unsigned int mask; + int err; - if (nlk->cb != NULL && nlk->rx_ring.pg_vec != NULL) - netlink_dump(sk); + if (nlk->rx_ring.pg_vec != NULL) { + /* Memory mapped sockets don't call recvmsg(), so flow control + * for dumps is performed here. A dump is allowed to continue + * if at least half the ring is unused. 
+ */ + while (nlk->cb != NULL && netlink_dump_space(nlk)) { + err = netlink_dump(sk); + if (err < 0) { + sk->sk_err = err; + sk->sk_error_report(sk); + break; + } + } + netlink_rcv_wake(sk); + } mask = datagram_poll(file, sock, wait); @@ -623,8 +680,7 @@ static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb) if (hdr == NULL) { spin_unlock_bh(&sk->sk_receive_queue.lock); kfree_skb(skb); - sk->sk_err = ENOBUFS; - sk->sk_error_report(sk); + netlink_overrun(sk); return; } netlink_increment_head(ring); @@ -1329,19 +1385,6 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr, return 0; } -static void netlink_overrun(struct sock *sk) -{ - struct netlink_sock *nlk = nlk_sk(sk); - - if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) { - if (!test_and_set_bit(NETLINK_CONGESTED, &nlk_sk(sk)->state)) { - sk->sk_err = ENOBUFS; - sk->sk_error_report(sk); - } - } - atomic_inc(&sk->sk_drops); -} - static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid) { struct sock *sock; @@ -1484,16 +1527,6 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) return skb; } -static void netlink_rcv_wake(struct sock *sk) -{ - struct netlink_sock *nlk = nlk_sk(sk); - - if (skb_queue_empty(&sk->sk_receive_queue)) - clear_bit(NETLINK_CONGESTED, &nlk->state); - if (!test_bit(NETLINK_CONGESTED, &nlk->state)) - wake_up_interruptible(&nlk->wait); -} - static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb, struct sock *ssk) { @@ -1597,6 +1630,7 @@ struct sk_buff *netlink_alloc_skb(struct sock *ssk, unsigned int size, err2: kfree_skb(skb); spin_unlock_bh(&sk->sk_receive_queue.lock); + netlink_overrun(sk); err1: sock_put(sk); return NULL; -- cgit v1.2.3-70-g09d2 From 4ae9fbee1690848a6aace1e0193ab27e981e35a5 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 17 Apr 2013 06:47:06 +0000 Subject: netlink: add RX/TX-ring support to netlink diag Based on AF_PACKET. Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- include/uapi/linux/netlink_diag.h | 10 ++++++++++ net/netlink/diag.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/include/uapi/linux/netlink_diag.h b/include/uapi/linux/netlink_diag.h index 88009a31cd06..4e31db4eea41 100644 --- a/include/uapi/linux/netlink_diag.h +++ b/include/uapi/linux/netlink_diag.h @@ -25,9 +25,18 @@ struct netlink_diag_msg { __u32 ndiag_cookie[2]; }; +struct netlink_diag_ring { + __u32 ndr_block_size; + __u32 ndr_block_nr; + __u32 ndr_frame_size; + __u32 ndr_frame_nr; +}; + enum { NETLINK_DIAG_MEMINFO, NETLINK_DIAG_GROUPS, + NETLINK_DIAG_RX_RING, + NETLINK_DIAG_TX_RING, __NETLINK_DIAG_MAX, }; @@ -38,5 +47,6 @@ enum { #define NDIAG_SHOW_MEMINFO 0x00000001 /* show memory info of a socket */ #define NDIAG_SHOW_GROUPS 0x00000002 /* show groups of a netlink socket */ +#define NDIAG_SHOW_RING_CFG 0x00000004 /* show ring configuration */ #endif diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 5ffb1d1cf402..4e4aa471cd05 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -7,6 +7,34 @@ #include "af_netlink.h" +static int sk_diag_put_ring(struct netlink_ring *ring, int nl_type, + struct sk_buff *nlskb) +{ + struct netlink_diag_ring ndr; + + ndr.ndr_block_size = ring->pg_vec_pages << PAGE_SHIFT; + ndr.ndr_block_nr = ring->pg_vec_len; + ndr.ndr_frame_size = ring->frame_size; + ndr.ndr_frame_nr = ring->frame_max + 1; + + return nla_put(nlskb, nl_type, sizeof(ndr), &ndr); +} + +static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb) +{ + struct netlink_sock *nlk = nlk_sk(sk); + int ret; + + mutex_lock(&nlk->pg_vec_lock); + ret = sk_diag_put_ring(&nlk->rx_ring, NETLINK_DIAG_RX_RING, nlskb); + if (!ret) + ret = sk_diag_put_ring(&nlk->tx_ring, NETLINK_DIAG_TX_RING, + nlskb); + mutex_unlock(&nlk->pg_vec_lock); + + return ret; +} + static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb) { struct netlink_sock *nlk = nlk_sk(sk); @@ -51,6 +79,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO)) goto out_nlmsg_trim; + if ((req->ndiag_show & NDIAG_SHOW_RING_CFG) && + sk_diag_put_rings_cfg(sk, skb)) + goto out_nlmsg_trim; + return nlmsg_end(skb, nlh); out_nlmsg_trim: -- cgit v1.2.3-70-g09d2 From 5683264c3981047aa93eebabcdbb81676018a7c9 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 17 Apr 2013 06:47:07 +0000 Subject: netlink: add documentation for memory mapped I/O Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- Documentation/networking/netlink_mmap.txt | 339 ++++++++++++++++++++++++++++++ 1 file changed, 339 insertions(+) create mode 100644 Documentation/networking/netlink_mmap.txt diff --git a/Documentation/networking/netlink_mmap.txt b/Documentation/networking/netlink_mmap.txt new file mode 100644 index 000000000000..1c2dab409625 --- /dev/null +++ b/Documentation/networking/netlink_mmap.txt @@ -0,0 +1,339 @@ +This file documents how to use memory mapped I/O with netlink. + +Author: Patrick McHardy + +Overview +-------- + +Memory mapped netlink I/O can be used to increase throughput and decrease +overhead of unicast receive and transmit operations. Some netlink subsystems +require high throughput, these are mainly the netfilter subsystems +nfnetlink_queue and nfnetlink_log, but it can also help speed up large +dump operations of f.i. the routing database. + +Memory mapped netlink I/O used two circular ring buffers for RX and TX which +are mapped into the processes address space. 
+ +The RX ring is used by the kernel to directly construct netlink messages into +user-space memory without copying them as done with regular socket I/O, +additionally as long as the ring contains messages no recvmsg() or poll() +syscalls have to be issued by user-space to get more message. + +The TX ring is used to process messages directly from user-space memory, the +kernel processes all messages contained in the ring using a single sendmsg() +call. + +Usage overview +-------------- + +In order to use memory mapped netlink I/O, user-space needs three main changes: + +- ring setup +- conversion of the RX path to get messages from the ring instead of recvmsg() +- conversion of the TX path to construct messages into the ring + +Ring setup is done using setsockopt() to provide the ring parameters to the +kernel, then a call to mmap() to map the ring into the processes address space: + +- setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, ¶ms, sizeof(params)); +- setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, ¶ms, sizeof(params)); +- ring = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0) + +Usage of either ring is optional, but even if only the RX ring is used the +mapping still needs to be writable in order to update the frame status after +processing. + +Conversion of the reception path involves calling poll() on the file +descriptor, once the socket is readable the frames from the ring are +processsed in order until no more messages are available, as indicated by +a status word in the frame header. + +On kernel side, in order to make use of memory mapped I/O on receive, the +originating netlink subsystem needs to support memory mapped I/O, otherwise +it will use an allocated socket buffer as usual and the contents will be + copied to the ring on transmission, nullifying most of the performance gains. +Dumps of kernel databases automatically support memory mapped I/O. + +Conversion of the transmit path involves changing message contruction to +use memory from the TX ring instead of (usually) a buffer declared on the +stack and setting up the frame header approriately. Optionally poll() can +be used to wait for free frames in the TX ring. + +Structured and definitions for using memory mapped I/O are contained in +. + +RX and TX rings +---------------- + +Each ring contains a number of continous memory blocks, containing frames of +fixed size dependant on the parameters used for ring setup. + +Ring: [ block 0 ] + [ frame 0 ] + [ frame 1 ] + [ block 1 ] + [ frame 2 ] + [ frame 3 ] + ... + [ block n ] + [ frame 2 * n ] + [ frame 2 * n + 1 ] + +The blocks are only visible to the kernel, from the point of view of user-space +the ring just contains the frames in a continous memory zone. + +The ring parameters used for setting up the ring are defined as follows: + +struct nl_mmap_req { + unsigned int nm_block_size; + unsigned int nm_block_nr; + unsigned int nm_frame_size; + unsigned int nm_frame_nr; +}; + +Frames are grouped into blocks, where each block is a continous region of memory +and holds nm_block_size / nm_frame_size frames. The total number of frames in +the ring is nm_frame_nr. The following invariants hold: + +- frames_per_block = nm_block_size / nm_frame_size + +- nm_frame_nr = frames_per_block * nm_block_nr + +Some parameters are constrained, specifically: + +- nm_block_size must be a multiple of the architectures memory page size. + The getpagesize() function can be used to get the page size. 
+ +- nm_frame_size must be equal or larger to NL_MMAP_HDRLEN, IOW a frame must be + able to hold at least the frame header + +- nm_frame_size must be smaller or equal to nm_block_size + +- nm_frame_size must be a multiple of NL_MMAP_MSG_ALIGNMENT + +- nm_frame_nr must equal the actual number of frames as specified above. + +When the kernel can't allocate phsyically continous memory for a ring block, +it will fall back to use physically discontinous memory. This might affect +performance negatively, in order to avoid this the nm_frame_size parameter +should be chosen to be as small as possible for the required frame size and +the number of blocks should be increased instead. + +Ring frames +------------ + +Each frames contain a frame header, consisting of a synchronization word and some +meta-data, and the message itself. + +Frame: [ header message ] + +The frame header is defined as follows: + +struct nl_mmap_hdr { + unsigned int nm_status; + unsigned int nm_len; + __u32 nm_group; + /* credentials */ + __u32 nm_pid; + __u32 nm_uid; + __u32 nm_gid; +}; + +- nm_status is used for synchronizing processing between the kernel and user- + space and specifies ownership of the frame as well as the operation to perform + +- nm_len contains the length of the message contained in the data area + +- nm_group specified the destination multicast group of message + +- nm_pid, nm_uid and nm_gid contain the netlink pid, UID and GID of the sending + process. These values correspond to the data available using SOCK_PASSCRED in + the SCM_CREDENTIALS cmsg. + +The possible values in the status word are: + +- NL_MMAP_STATUS_UNUSED: + RX ring: frame belongs to the kernel and contains no message + for user-space. Approriate action is to invoke poll() + to wait for new messages. + + TX ring: frame belongs to user-space and can be used for + message construction. + +- NL_MMAP_STATUS_RESERVED: + RX ring only: frame is currently used by the kernel for message + construction and contains no valid message yet. + Appropriate action is to invoke poll() to wait for + new messages. + +- NL_MMAP_STATUS_VALID: + RX ring: frame contains a valid message. Approriate action is + to process the message and release the frame back to + the kernel by setting the status to + NL_MMAP_STATUS_UNUSED or queue the frame by setting the + status to NL_MMAP_STATUS_SKIP. + + TX ring: the frame contains a valid message from user-space to + be processed by the kernel. After completing processing + the kernel will release the frame back to user-space by + setting the status to NL_MMAP_STATUS_UNUSED. + +- NL_MMAP_STATUS_COPY: + RX ring only: a message is ready to be processed but could not be + stored in the ring, either because it exceeded the + frame size or because the originating subsystem does + not support memory mapped I/O. Appropriate action is + to invoke recvmsg() to receive the message and release + the frame back to the kernel by setting the status to + NL_MMAP_STATUS_UNUSED. + +- NL_MMAP_STATUS_SKIP: + RX ring only: user-space queued the message for later processing, but + processed some messages following it in the ring. The + kernel should skip this frame when looking for unused + frames. + +The data area of a frame begins at a offset of NL_MMAP_HDRLEN relative to the +frame header. + +TX limitations +-------------- + +Kernel processing usually involves validation of the message received by +user-space, then processing its contents. 
+
+TX limitations
+--------------
+
+Kernel processing usually involves validation of the message received from
+user-space, then processing its contents. The kernel must ensure that
+user-space is not able to modify the message contents after they have been
+validated. In order to do so, the message is copied from the ring frame
+to an allocated buffer if either of these conditions is false:
+
+- only a single mapping of the ring exists
+- the file descriptor is not shared between processes
+
+This means that for threaded programs, the kernel will fall back to copying.
+
+Example
+-------
+
+Ring setup:
+
+	unsigned int block_size = 16 * getpagesize();
+	struct nl_mmap_req req = {
+		.nm_block_size		= block_size,
+		.nm_block_nr		= 64,
+		.nm_frame_size		= 16384,
+		.nm_frame_nr		= 64 * block_size / 16384,
+	};
+	unsigned int ring_size;
+	void *rx_ring, *tx_ring;
+
+	/* Configure ring parameters */
+	if (setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &req, sizeof(req)) < 0)
+		exit(1);
+	if (setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, &req, sizeof(req)) < 0)
+		exit(1);
+
+	/* Calculate size of each individual ring */
+	ring_size = req.nm_block_nr * req.nm_block_size;
+
+	/* Map RX/TX rings. The TX ring is located after the RX ring */
+	rx_ring = mmap(NULL, 2 * ring_size, PROT_READ | PROT_WRITE,
+		       MAP_SHARED, fd, 0);
+	if ((long)rx_ring == -1L)
+		exit(1);
+	tx_ring = rx_ring + ring_size;
+
+Message reception:
+
+This example assumes the ring parameters from the ring setup (such as
+ring_size and frame_size) are available.
+
+	unsigned int frame_offset = 0;
+	struct nl_mmap_hdr *hdr;
+	struct nlmsghdr *nlh;
+	unsigned char buf[16384];
+	ssize_t len;
+
+	while (1) {
+		struct pollfd pfds[1];
+
+		pfds[0].fd = fd;
+		pfds[0].events = POLLIN | POLLERR;
+		pfds[0].revents = 0;
+
+		if (poll(pfds, 1, -1) < 0 && errno != EINTR)
+			exit(1);
+
+		/* Check for errors. Error handling omitted */
+		if (pfds[0].revents & POLLERR)
+			<handle error>
+
+		/* If no new messages, poll again */
+		if (!(pfds[0].revents & POLLIN))
+			continue;
+
+		/* Process all frames */
+		while (1) {
+			/* Get next frame header */
+			hdr = rx_ring + frame_offset;
+
+			if (hdr->nm_status == NL_MMAP_STATUS_VALID) {
+				/* Regular memory mapped frame */
+				nlh = (void *)hdr + NL_MMAP_HDRLEN;
+				len = hdr->nm_len;
+
+				/* Release empty message immediately. May happen
+				 * on error during message construction.
+				 */
+				if (len == 0)
+					goto release;
+			} else if (hdr->nm_status == NL_MMAP_STATUS_COPY) {
+				/* Frame queued to socket receive queue */
+				len = recv(fd, buf, sizeof(buf), MSG_DONTWAIT);
+				if (len <= 0)
+					break;
+				nlh = (struct nlmsghdr *)buf;
+			} else
+				/* No more messages to process, continue polling */
+				break;
+
+			process_msg(nlh);
+release:
+			/* Release frame back to the kernel */
+			hdr->nm_status = NL_MMAP_STATUS_UNUSED;
+
+			/* Advance frame offset to next frame */
+			frame_offset = (frame_offset + frame_size) % ring_size;
+		}
+	}
+
+Message transmission:
+
+This example assumes the ring parameters from the ring setup (such as
+ring_size and frame_size) are available. A single message is constructed and
+transmitted; to send multiple messages at once, they would be constructed in
+consecutive frames before a final call to sendto().
+
+	unsigned int frame_offset = 0;
+	struct nl_mmap_hdr *hdr;
+	struct nlmsghdr *nlh;
+	struct sockaddr_nl addr = {
+		.nl_family	= AF_NETLINK,
+	};
+
+	hdr = tx_ring + frame_offset;
+	if (hdr->nm_status != NL_MMAP_STATUS_UNUSED)
+		/* No frame available. Use poll() to avoid.
*/ + exit(1); + + nlh = (void *)hdr + NL_MMAP_HDRLEN; + + /* Build message */ + build_message(nlh); + + /* Fill frame header: length and status need to be set */ + hdr->nm_len = nlh->nlmsg_len; + hdr->nm_status = NL_MMAP_STATUS_VALID; + + if (sendto(fd, NULL, 0, 0, &addr, sizeof(addr)) < 0) + exit(1); + + /* Advance frame offset to next frame */ + frame_offset = (frame_offset + frame_size) % ring_size; -- cgit v1.2.3-70-g09d2 From ec464e5dc504a164c5dbff4a06812d495e44e34d Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 17 Apr 2013 06:47:08 +0000 Subject: netfilter: rename netlink related "pid" variables to "portid" Get rid of the confusing mix of pid and portid and use portid consistently for all netlink related socket identities. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter/nfnetlink.h | 9 +++++---- include/net/netfilter/nf_conntrack.h | 2 +- include/net/netfilter/nf_conntrack_expect.h | 4 ++-- net/netfilter/nf_conntrack_core.c | 8 ++++---- net/netfilter/nf_conntrack_expect.c | 8 ++++---- net/netfilter/nfnetlink.c | 13 +++++++------ 6 files changed, 23 insertions(+), 21 deletions(-) diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index ecbb8e495912..60b164171daf 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -29,10 +29,11 @@ extern int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n); extern int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n); extern int nfnetlink_has_listeners(struct net *net, unsigned int group); -extern int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, - int echo, gfp_t flags); -extern int nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error); -extern int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags); +extern int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, + unsigned int group, int echo, gfp_t flags); +extern int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error); +extern int nfnetlink_unicast(struct sk_buff *skb, struct net *net, + u32 portid, int flags); extern void nfnl_lock(__u8 subsys_id); extern void nfnl_unlock(__u8 subsys_id); diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index caca0c4d6b4b..644d9c223d24 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -184,7 +184,7 @@ extern int nf_conntrack_hash_check_insert(struct nf_conn *ct); extern void nf_ct_delete_from_lists(struct nf_conn *ct); extern void nf_ct_dying_timeout(struct nf_conn *ct); -extern void nf_conntrack_flush_report(struct net *net, u32 pid, int report); +extern void nf_conntrack_flush_report(struct net *net, u32 portid, int report); extern bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff, u_int16_t l3num, diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index cbbae7621e22..3f3aecbc8632 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -88,7 +88,7 @@ nf_ct_find_expectation(struct net *net, u16 zone, const struct nf_conntrack_tuple *tuple); void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, - u32 pid, int report); + u32 portid, int report); static inline void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) { nf_ct_unlink_expect_report(exp, 0, 0); @@ -106,7 +106,7 @@ void 
nf_ct_expect_init(struct nf_conntrack_expect *, unsigned int, u_int8_t, u_int8_t, const __be16 *, const __be16 *); void nf_ct_expect_put(struct nf_conntrack_expect *exp); int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, - u32 pid, int report); + u32 portid, int report); static inline int nf_ct_expect_related(struct nf_conntrack_expect *expect) { return nf_ct_expect_related_report(expect, 0, 0); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 007e8c43d19a..54ddc2f8e7c9 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1260,7 +1260,7 @@ void nf_ct_iterate_cleanup(struct net *net, EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup); struct __nf_ct_flush_report { - u32 pid; + u32 portid; int report; }; @@ -1275,7 +1275,7 @@ static int kill_report(struct nf_conn *i, void *data) /* If we fail to deliver the event, death_by_timeout() will retry */ if (nf_conntrack_event_report(IPCT_DESTROY, i, - fr->pid, fr->report) < 0) + fr->portid, fr->report) < 0) return 1; /* Avoid the delivery of the destroy event in death_by_timeout(). */ @@ -1298,10 +1298,10 @@ void nf_ct_free_hashtable(void *hash, unsigned int size) } EXPORT_SYMBOL_GPL(nf_ct_free_hashtable); -void nf_conntrack_flush_report(struct net *net, u32 pid, int report) +void nf_conntrack_flush_report(struct net *net, u32 portid, int report) { struct __nf_ct_flush_report fr = { - .pid = pid, + .portid = portid, .report = report, }; nf_ct_iterate_cleanup(net, kill_report, &fr); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 8c10e3db3d9b..0adfdcc68bae 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -40,7 +40,7 @@ static struct kmem_cache *nf_ct_expect_cachep __read_mostly; /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, - u32 pid, int report) + u32 portid, int report) { struct nf_conn_help *master_help = nfct_help(exp->master); struct net *net = nf_ct_exp_net(exp); @@ -54,7 +54,7 @@ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, hlist_del(&exp->lnode); master_help->expecting[exp->class]--; - nf_ct_expect_event_report(IPEXP_DESTROY, exp, pid, report); + nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report); nf_ct_expect_put(exp); NF_CT_STAT_INC(net, expect_delete); @@ -412,7 +412,7 @@ out: } int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, - u32 pid, int report) + u32 portid, int report) { int ret; @@ -425,7 +425,7 @@ int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, if (ret < 0) goto out; spin_unlock_bh(&nf_conntrack_lock); - nf_ct_expect_event_report(IPEXP_NEW, expect, pid, report); + nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); return ret; out: spin_unlock_bh(&nf_conntrack_lock); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index bc4c499adb13..1640bd7dcdf4 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -112,22 +112,23 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group) } EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); -int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, +int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, unsigned int group, int echo, gfp_t flags) { - return nlmsg_notify(net->nfnl, skb, pid, group, echo, flags); + return nlmsg_notify(net->nfnl, skb, portid, group, echo, flags); } EXPORT_SYMBOL_GPL(nfnetlink_send); -int 
nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error) +int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error) { - return netlink_set_err(net->nfnl, pid, group, error); + return netlink_set_err(net->nfnl, portid, group, error); } EXPORT_SYMBOL_GPL(nfnetlink_set_err); -int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags) +int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid, + int flags) { - return netlink_unicast(net->nfnl, skb, pid, flags); + return netlink_unicast(net->nfnl, skb, portid, flags); } EXPORT_SYMBOL_GPL(nfnetlink_unicast); -- cgit v1.2.3-70-g09d2 From 3ab1f683bf8be7aa7869cc3ffb8d1db2ec8c8307 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 17 Apr 2013 06:47:09 +0000 Subject: nfnetlink: add support for memory mapped netlink Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter/nfnetlink.h | 2 ++ net/netfilter/nfnetlink.c | 7 +++++++ net/netfilter/nfnetlink_log.c | 10 ++++++---- net/netfilter/nfnetlink_queue_core.c | 3 ++- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 60b164171daf..cadb7402d7a7 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -29,6 +29,8 @@ extern int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n); extern int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n); extern int nfnetlink_has_listeners(struct net *net, unsigned int group); +extern struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size, + u32 dst_portid, gfp_t gfp_mask); extern int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, unsigned int group, int echo, gfp_t flags); extern int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 1640bd7dcdf4..572d87dc116f 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -112,6 +112,13 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group) } EXPORT_SYMBOL_GPL(nfnetlink_has_listeners); +struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size, + u32 dst_portid, gfp_t gfp_mask) +{ + return netlink_alloc_skb(net->nfnl, size, dst_portid, gfp_mask); +} +EXPORT_SYMBOL_GPL(nfnetlink_alloc_skb); + int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid, unsigned int group, int echo, gfp_t flags) { diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 50aaf716cd68..d4199eb9b338 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -318,7 +318,7 @@ nfulnl_set_flags(struct nfulnl_instance *inst, u_int16_t flags) } static struct sk_buff * -nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size) +nfulnl_alloc_skb(u32 peer_portid, unsigned int inst_size, unsigned int pkt_size) { struct sk_buff *skb; unsigned int n; @@ -327,13 +327,14 @@ nfulnl_alloc_skb(unsigned int inst_size, unsigned int pkt_size) * message. 
WARNING: has to be <= 128k due to slab restrictions */ n = max(inst_size, pkt_size); - skb = alloc_skb(n, GFP_ATOMIC); + skb = nfnetlink_alloc_skb(&init_net, n, peer_portid, GFP_ATOMIC); if (!skb) { if (n > pkt_size) { /* try to allocate only as much as we need for current * packet */ - skb = alloc_skb(pkt_size, GFP_ATOMIC); + skb = nfnetlink_alloc_skb(&init_net, pkt_size, + peer_portid, GFP_ATOMIC); if (!skb) pr_err("nfnetlink_log: can't even alloc %u bytes\n", pkt_size); @@ -696,7 +697,8 @@ nfulnl_log_packet(u_int8_t pf, } if (!inst->skb) { - inst->skb = nfulnl_alloc_skb(inst->nlbufsiz, size); + inst->skb = nfulnl_alloc_skb(inst->peer_portid, inst->nlbufsiz, + size); if (!inst->skb) goto alloc_failure; } diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c index 5e280b3e154f..ef3cdb4bfeea 100644 --- a/net/netfilter/nfnetlink_queue_core.c +++ b/net/netfilter/nfnetlink_queue_core.c @@ -339,7 +339,8 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, if (queue->flags & NFQA_CFG_F_CONNTRACK) ct = nfqnl_ct_get(entskb, &size, &ctinfo); - skb = alloc_skb(size, GFP_ATOMIC); + skb = nfnetlink_alloc_skb(&init_net, size, queue->peer_portid, + GFP_ATOMIC); if (!skb) return NULL; -- cgit v1.2.3-70-g09d2
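
[Editorial illustration, not part of any patch in this series: a hedged
user-space sketch showing how a consumer of nfnetlink_log or nfnetlink_queue
could opt into the RX ring that nfnetlink_alloc_skb() targets. The ring
geometry mirrors the example in the documentation patch above and is purely
illustrative; the SOL_NETLINK fallback define and the main() scaffold are
assumptions, and the subsystem-specific binding is left out.]

	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/socket.h>
	#include <sys/mman.h>
	#include <linux/netlink.h>

	#ifndef SOL_NETLINK
	#define SOL_NETLINK 270
	#endif

	int main(void)
	{
		unsigned int block_size = 16 * getpagesize();
		struct nl_mmap_req req;
		void *rx_ring;
		int fd;

		/* nfnetlink subsystems use the NETLINK_NETFILTER protocol */
		fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_NETFILTER);
		if (fd < 0)
			exit(1);

		/* Illustrative ring geometry, following the documented invariants */
		memset(&req, 0, sizeof(req));
		req.nm_block_size = block_size;
		req.nm_block_nr   = 64;
		req.nm_frame_size = 16384;
		req.nm_frame_nr   = 64 * block_size / 16384;

		if (setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &req, sizeof(req)) < 0)
			exit(1);

		rx_ring = mmap(NULL, req.nm_block_nr * req.nm_block_size,
			       PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (rx_ring == MAP_FAILED)
			exit(1);

		/* Subsystem-specific configuration (e.g. binding an nfqueue or
		 * nflog instance via nfnetlink messages) would follow here,
		 * after which frames are consumed as shown in the reception
		 * example of the documentation above. */
		return 0;
	}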