| author | David S. Miller <davem@davemloft.net> | 2016-01-04 22:49:59 -0500 |
|---|---|---|
| committer | David S. Miller <davem@davemloft.net> | 2016-01-04 22:49:59 -0500 |
| commit | 6a5ef90c58daada158ba16ba330558efc3471491 (patch) | |
| tree | c3cf3d3cd580a1e759b2914cb79bf6ebcb28eb3f /net | |
| parent | ebb3cf41c1a49e334e3fe540ee24a1afb7ed30c4 (diff) | |
| parent | 3ca8e4029969d40ab90e3f1ecd83ab1cadd60fbb (diff) | |
Merge branch 'faster-soreuseport'
Craig Gallek says:
====================
Faster SO_REUSEPORT
This series contains two optimizations for the SO_REUSEPORT feature:
Faster lookup when selecting a socket for an incoming packet and
the ability to select the socket from the group using a BPF program.
This series only includes the UDP path. I plan to submit a follow-up
including the TCP path if the implementation in this series is
acceptable.
Changes in v4:
- pskb_may_pull is unnecessary with pskb_pull (per Alexei Starovoitov)
Changes in v3:
- skb_pull_inline -> pskb_pull (per Alexei Starovoitov)
- reuseport_attach* -> sk_reuseport_attach* and simple return statement
syntax change (per Daniel Borkmann)
Changes in v2:
- Fix ARM build; remove unnecessary include.
- Handle case where protocol header is not in linear section (per
Alexei Starovoitov).
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
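The classic-BPF attach path added by this series lets one socket in an SO_REUSEPORT group install a filter whose return value indexes the group's socket array. A minimal userspace sketch of that usage, steering one UDP port across four sockets by CPU (the port number, the group size, and the fallback #define of 51 for SO_ATTACH_REUSEPORT_CBPF are illustrative assumptions, not part of the patches):

```c
#include <linux/filter.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>

#ifndef SO_ATTACH_REUSEPORT_CBPF
#define SO_ATTACH_REUSEPORT_CBPF 51  /* asm-generic/socket.h value at merge time */
#endif

int main(void)
{
        /* Classic BPF: A = current CPU; return A.  The return value is used
         * as an index into the reuseport socket array; an index past the end
         * of the group falls back to the pre-existing selection behavior.
         */
        struct sock_filter code[] = {
                { BPF_LD | BPF_W | BPF_ABS, 0, 0, SKF_AD_OFF + SKF_AD_CPU },
                { BPF_RET | BPF_A, 0, 0, 0 },
        };
        struct sock_fprog prog = {
                .len = sizeof(code) / sizeof(code[0]),
                .filter = code,
        };
        struct sockaddr_in addr = {
                .sin_family = AF_INET,
                .sin_addr.s_addr = htonl(INADDR_ANY),
                .sin_port = htons(7777),  /* illustrative port */
        };
        int one = 1;
        int fds[4];

        for (int i = 0; i < 4; i++) {
                fds[i] = socket(AF_INET, SOCK_DGRAM, 0);
                setsockopt(fds[i], SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
                /* Attaching to the first, still-unbound socket allocates the
                 * group (reuseport_alloc()); later sockets join it at bind().
                 */
                if (i == 0 &&
                    setsockopt(fds[i], SOL_SOCKET, SO_ATTACH_REUSEPORT_CBPF,
                               &prog, sizeof(prog)) < 0)
                        perror("SO_ATTACH_REUSEPORT_CBPF");
                if (bind(fds[i], (struct sockaddr *)&addr, sizeof(addr)) < 0)
                        perror("bind");
        }
        /* ... recvfrom() on each fd, e.g. one thread per socket ... */
        return 0;
}
```

The program runs against the first matching socket's group during lookup, so a packet is steered in one step instead of scoring every socket bound to the port.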
Diffstat (limited to 'net')
| Mode | Path | Lines changed |
|---|---|---|
| -rw-r--r-- | net/core/Makefile | 2 |
| -rw-r--r-- | net/core/filter.c | 121 |
| -rw-r--r-- | net/core/sock.c | 29 |
| -rw-r--r-- | net/core/sock_reuseport.c | 251 |
| -rw-r--r-- | net/ipv4/udp.c | 127 |
| -rw-r--r-- | net/ipv4/udp_diag.c | 4 |
| -rw-r--r-- | net/ipv6/inet6_connection_sock.c | 4 |
| -rw-r--r-- | net/ipv6/udp.c | 56 |
8 files changed, 530 insertions, 64 deletions
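The net/core/sock.c hunk in the diff below also adds SO_ATTACH_REUSEPORT_EBPF, which takes a u32 program fd rather than a struct sock_fprog; __get_bpf() in net/core/filter.c rejects anything that is not BPF_PROG_TYPE_SOCKET_FILTER. A hedged sketch of loading and attaching such a program with the raw bpf(2) syscall (no loader library existed at the time; the fallback #define of 52 and the trivial "always index 0" program are illustrative assumptions):

```c
#include <linux/bpf.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef SO_ATTACH_REUSEPORT_EBPF
#define SO_ATTACH_REUSEPORT_EBPF 52  /* asm-generic/socket.h value at merge time */
#endif

static int attach_ebpf_selector(int sock_fd)
{
        /* r0 = 0; exit  --  always deliver to socket index 0 */
        struct bpf_insn insns[] = {
                { .code = BPF_ALU64 | BPF_MOV | BPF_K,
                  .dst_reg = BPF_REG_0, .imm = 0 },
                { .code = BPF_JMP | BPF_EXIT },
        };
        union bpf_attr attr;
        int prog_fd;

        memset(&attr, 0, sizeof(attr));
        attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
        attr.insn_cnt = sizeof(insns) / sizeof(insns[0]);
        attr.insns = (unsigned long)insns;
        attr.license = (unsigned long)"GPL";

        prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
        if (prog_fd < 0)
                return -1;

        /* The socket must be unbound (the group is allocated here) or already
         * bound with SO_REUSEPORT; otherwise setsockopt() fails with EINVAL.
         */
        return setsockopt(sock_fd, SOL_SOCKET, SO_ATTACH_REUSEPORT_EBPF,
                          &prog_fd, sizeof(prog_fd));
}
```

A real selector would compute an index from the packet or, for example, from bpf_get_smp_processor_id(); the fixed return value here only demonstrates the attach path.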
diff --git a/net/core/Makefile b/net/core/Makefile
index 086b01fbe1bd..0b835de04de3 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -9,7 +9,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
 obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
         neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
-        sock_diag.o dev_ioctl.o tso.o
+        sock_diag.o dev_ioctl.o tso.o sock_reuseport.o
 obj-$(CONFIG_XFRM) += flow.o
 obj-y += net-sysfs.o
diff --git a/net/core/filter.c b/net/core/filter.c
index c770196ae8d5..35e6fed28709 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -50,6 +50,7 @@
 #include <net/cls_cgroup.h>
 #include <net/dst_metadata.h>
 #include <net/dst.h>
+#include <net/sock_reuseport.h>
 
 /**
  *      sk_filter - run a packet through a socket filter
@@ -1167,17 +1168,32 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
         return 0;
 }
 
-/**
- *      sk_attach_filter - attach a socket filter
- *      @fprog: the filter program
- *      @sk: the socket to use
- *
- *      Attach the user's filter code. We first run some sanity checks on
- *      it to make sure it does not explode on us later. If an error
- *      occurs or there is insufficient memory for the filter a negative
- *      errno code is returned. On success the return is zero.
- */
-int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
+static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
+{
+        struct bpf_prog *old_prog;
+        int err;
+
+        if (bpf_prog_size(prog->len) > sysctl_optmem_max)
+                return -ENOMEM;
+
+        if (sk_unhashed(sk)) {
+                err = reuseport_alloc(sk);
+                if (err)
+                        return err;
+        } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
+                /* The socket wasn't bound with SO_REUSEPORT */
+                return -EINVAL;
+        }
+
+        old_prog = reuseport_attach_prog(sk, prog);
+        if (old_prog)
+                bpf_prog_destroy(old_prog);
+
+        return 0;
+}
+
+static
+struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
 {
         unsigned int fsize = bpf_classic_proglen(fprog);
         unsigned int bpf_fsize = bpf_prog_size(fprog->len);
@@ -1185,19 +1201,19 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
         int err;
 
         if (sock_flag(sk, SOCK_FILTER_LOCKED))
-                return -EPERM;
+                return ERR_PTR(-EPERM);
 
         /* Make sure new filter is there and in the right amounts. */
         if (fprog->filter == NULL)
-                return -EINVAL;
+                return ERR_PTR(-EINVAL);
 
         prog = bpf_prog_alloc(bpf_fsize, 0);
         if (!prog)
-                return -ENOMEM;
+                return ERR_PTR(-ENOMEM);
 
         if (copy_from_user(prog->insns, fprog->filter, fsize)) {
                 __bpf_prog_free(prog);
-                return -EFAULT;
+                return ERR_PTR(-EFAULT);
         }
 
         prog->len = fprog->len;
@@ -1205,13 +1221,30 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
         err = bpf_prog_store_orig_filter(prog, fprog);
         if (err) {
                 __bpf_prog_free(prog);
-                return -ENOMEM;
+                return ERR_PTR(-ENOMEM);
         }
 
         /* bpf_prepare_filter() already takes care of freeing
          * memory in case something goes wrong.
          */
-        prog = bpf_prepare_filter(prog, NULL);
+        return bpf_prepare_filter(prog, NULL);
+}
+
+/**
+ *      sk_attach_filter - attach a socket filter
+ *      @fprog: the filter program
+ *      @sk: the socket to use
+ *
+ *      Attach the user's filter code. We first run some sanity checks on
+ *      it to make sure it does not explode on us later. If an error
+ *      occurs or there is insufficient memory for the filter a negative
+ *      errno code is returned. On success the return is zero.
+ */
+int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
+{
+        struct bpf_prog *prog = __get_filter(fprog, sk);
+        int err;
+
         if (IS_ERR(prog))
                 return PTR_ERR(prog);
 
@@ -1225,23 +1258,50 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(sk_attach_filter);
 
-int sk_attach_bpf(u32 ufd, struct sock *sk)
+int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 {
-        struct bpf_prog *prog;
+        struct bpf_prog *prog = __get_filter(fprog, sk);
         int err;
 
+        if (IS_ERR(prog))
+                return PTR_ERR(prog);
+
+        err = __reuseport_attach_prog(prog, sk);
+        if (err < 0) {
+                __bpf_prog_release(prog);
+                return err;
+        }
+
+        return 0;
+}
+
+static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
+{
+        struct bpf_prog *prog;
+
         if (sock_flag(sk, SOCK_FILTER_LOCKED))
-                return -EPERM;
+                return ERR_PTR(-EPERM);
 
         prog = bpf_prog_get(ufd);
         if (IS_ERR(prog))
-                return PTR_ERR(prog);
+                return prog;
 
         if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
                 bpf_prog_put(prog);
-                return -EINVAL;
+                return ERR_PTR(-EINVAL);
         }
 
+        return prog;
+}
+
+int sk_attach_bpf(u32 ufd, struct sock *sk)
+{
+        struct bpf_prog *prog = __get_bpf(ufd, sk);
+        int err;
+
+        if (IS_ERR(prog))
+                return PTR_ERR(prog);
+
         err = __sk_attach_prog(prog, sk);
         if (err < 0) {
                 bpf_prog_put(prog);
@@ -1251,6 +1311,23 @@ int sk_attach_bpf(u32 ufd, struct sock *sk)
         return 0;
 }
 
+int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
+{
+        struct bpf_prog *prog = __get_bpf(ufd, sk);
+        int err;
+
+        if (IS_ERR(prog))
+                return PTR_ERR(prog);
+
+        err = __reuseport_attach_prog(prog, sk);
+        if (err < 0) {
+                bpf_prog_put(prog);
+                return err;
+        }
+
+        return 0;
+}
+
 #define BPF_RECOMPUTE_CSUM(flags)       ((flags) & 1)
 #define BPF_LDST_LEN                    16U
diff --git a/net/core/sock.c b/net/core/sock.c
index 565bab7baca9..51270238e269 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -134,6 +134,7 @@
 
 #include <linux/sock_diag.h>
 #include <linux/filter.h>
+#include <net/sock_reuseport.h>
 
 #include <trace/events/sock.h>
 
@@ -932,6 +933,32 @@ set_rcvbuf:
                 }
                 break;
 
+        case SO_ATTACH_REUSEPORT_CBPF:
+                ret = -EINVAL;
+                if (optlen == sizeof(struct sock_fprog)) {
+                        struct sock_fprog fprog;
+
+                        ret = -EFAULT;
+                        if (copy_from_user(&fprog, optval, sizeof(fprog)))
+                                break;
+
+                        ret = sk_reuseport_attach_filter(&fprog, sk);
+                }
+                break;
+
+        case SO_ATTACH_REUSEPORT_EBPF:
+                ret = -EINVAL;
+                if (optlen == sizeof(u32)) {
+                        u32 ufd;
+
+                        ret = -EFAULT;
+                        if (copy_from_user(&ufd, optval, sizeof(ufd)))
+                                break;
+
+                        ret = sk_reuseport_attach_bpf(ufd, sk);
+                }
+                break;
+
         case SO_DETACH_FILTER:
                 ret = sk_detach_filter(sk);
                 break;
@@ -1443,6 +1470,8 @@ void sk_destruct(struct sock *sk)
                 sk_filter_uncharge(sk, filter);
                 RCU_INIT_POINTER(sk->sk_filter, NULL);
         }
+        if (rcu_access_pointer(sk->sk_reuseport_cb))
+                reuseport_detach_sock(sk);
 
         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
new file mode 100644
index 000000000000..ae0969c0fc2e
--- /dev/null
+++ b/net/core/sock_reuseport.c
@@ -0,0 +1,251 @@
+/*
+ * To speed up listener socket lookup, create an array to store all sockets
+ * listening on the same port. This allows a decision to be made after finding
+ * the first socket. An optional BPF program can also be configured for
+ * selecting the socket index from the array of available sockets.
+ */
+
+#include <net/sock_reuseport.h>
+#include <linux/bpf.h>
+#include <linux/rcupdate.h>
+
+#define INIT_SOCKS 128
+
+static DEFINE_SPINLOCK(reuseport_lock);
+
+static struct sock_reuseport *__reuseport_alloc(u16 max_socks)
+{
+        size_t size = sizeof(struct sock_reuseport) +
+                      sizeof(struct sock *) * max_socks;
+        struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC);
+
+        if (!reuse)
+                return NULL;
+
+        reuse->max_socks = max_socks;
+
+        RCU_INIT_POINTER(reuse->prog, NULL);
+        return reuse;
+}
+
+int reuseport_alloc(struct sock *sk)
+{
+        struct sock_reuseport *reuse;
+
+        /* bh lock used since this function call may precede hlist lock in
+         * soft irq of receive path or setsockopt from process context
+         */
+        spin_lock_bh(&reuseport_lock);
+        WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb,
+                                            lockdep_is_held(&reuseport_lock)),
+                  "multiple allocations for the same socket");
+        reuse = __reuseport_alloc(INIT_SOCKS);
+        if (!reuse) {
+                spin_unlock_bh(&reuseport_lock);
+                return -ENOMEM;
+        }
+
+        reuse->socks[0] = sk;
+        reuse->num_socks = 1;
+        rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
+
+        spin_unlock_bh(&reuseport_lock);
+
+        return 0;
+}
+EXPORT_SYMBOL(reuseport_alloc);
+
+static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
+{
+        struct sock_reuseport *more_reuse;
+        u32 more_socks_size, i;
+
+        more_socks_size = reuse->max_socks * 2U;
+        if (more_socks_size > U16_MAX)
+                return NULL;
+
+        more_reuse = __reuseport_alloc(more_socks_size);
+        if (!more_reuse)
+                return NULL;
+
+        more_reuse->max_socks = more_socks_size;
+        more_reuse->num_socks = reuse->num_socks;
+        more_reuse->prog = reuse->prog;
+
+        memcpy(more_reuse->socks, reuse->socks,
+               reuse->num_socks * sizeof(struct sock *));
+
+        for (i = 0; i < reuse->num_socks; ++i)
+                rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
+                                   more_reuse);
+
+        /* Note: we use kfree_rcu here instead of reuseport_free_rcu so
+         * that reuse and more_reuse can temporarily share a reference
+         * to prog.
+         */
+        kfree_rcu(reuse, rcu);
+        return more_reuse;
+}
+
+/**
+ *  reuseport_add_sock - Add a socket to the reuseport group of another.
+ *  @sk:  New socket to add to the group.
+ *  @sk2: Socket belonging to the existing reuseport group.
+ *  May return ENOMEM and not add socket to group under memory pressure.
+ */
+int reuseport_add_sock(struct sock *sk, const struct sock *sk2)
+{
+        struct sock_reuseport *reuse;
+
+        spin_lock_bh(&reuseport_lock);
+        reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
+                                          lockdep_is_held(&reuseport_lock)),
+        WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb,
+                                            lockdep_is_held(&reuseport_lock)),
+                  "socket already in reuseport group");
+
+        if (reuse->num_socks == reuse->max_socks) {
+                reuse = reuseport_grow(reuse);
+                if (!reuse) {
+                        spin_unlock_bh(&reuseport_lock);
+                        return -ENOMEM;
+                }
+        }
+
+        reuse->socks[reuse->num_socks] = sk;
+        /* paired with smp_rmb() in reuseport_select_sock() */
+        smp_wmb();
+        reuse->num_socks++;
+        rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
+
+        spin_unlock_bh(&reuseport_lock);
+
+        return 0;
+}
+EXPORT_SYMBOL(reuseport_add_sock);
+
+static void reuseport_free_rcu(struct rcu_head *head)
+{
+        struct sock_reuseport *reuse;
+
+        reuse = container_of(head, struct sock_reuseport, rcu);
+        if (reuse->prog)
+                bpf_prog_destroy(reuse->prog);
+        kfree(reuse);
+}
+
+void reuseport_detach_sock(struct sock *sk)
+{
+        struct sock_reuseport *reuse;
+        int i;
+
+        spin_lock_bh(&reuseport_lock);
+        reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+                                          lockdep_is_held(&reuseport_lock));
+        rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
+
+        for (i = 0; i < reuse->num_socks; i++) {
+                if (reuse->socks[i] == sk) {
+                        reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
+                        reuse->num_socks--;
+                        if (reuse->num_socks == 0)
+                                call_rcu(&reuse->rcu, reuseport_free_rcu);
+                        break;
+                }
+        }
+        spin_unlock_bh(&reuseport_lock);
+}
+EXPORT_SYMBOL(reuseport_detach_sock);
+
+static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks,
+                            struct bpf_prog *prog, struct sk_buff *skb,
+                            int hdr_len)
+{
+        struct sk_buff *nskb = NULL;
+        u32 index;
+
+        if (skb_shared(skb)) {
+                nskb = skb_clone(skb, GFP_ATOMIC);
+                if (!nskb)
+                        return NULL;
+                skb = nskb;
+        }
+
+        /* temporarily advance data past protocol header */
+        if (!pskb_pull(skb, hdr_len)) {
+                consume_skb(nskb);
+                return NULL;
+        }
+        index = bpf_prog_run_save_cb(prog, skb);
+        __skb_push(skb, hdr_len);
+
+        consume_skb(nskb);
+
+        if (index >= socks)
+                return NULL;
+
+        return reuse->socks[index];
+}
+
+/**
+ *  reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
+ *  @sk: First socket in the group.
+ *  @hash: When no BPF filter is available, use this hash to select.
+ *  @skb: skb to run through BPF filter.
+ *  @hdr_len: BPF filter expects skb data pointer at payload data.  If
+ *    the skb does not yet point at the payload, this parameter represents
+ *    how far the pointer needs to advance to reach the payload.
+ *  Returns a socket that should receive the packet (or NULL on error).
+ */
+struct sock *reuseport_select_sock(struct sock *sk,
+                                   u32 hash,
+                                   struct sk_buff *skb,
+                                   int hdr_len)
+{
+        struct sock_reuseport *reuse;
+        struct bpf_prog *prog;
+        struct sock *sk2 = NULL;
+        u16 socks;
+
+        rcu_read_lock();
+        reuse = rcu_dereference(sk->sk_reuseport_cb);
+
+        /* if memory allocation failed or add call is not yet complete */
+        if (!reuse)
+                goto out;
+
+        prog = rcu_dereference(reuse->prog);
+        socks = READ_ONCE(reuse->num_socks);
+        if (likely(socks)) {
+                /* paired with smp_wmb() in reuseport_add_sock() */
+                smp_rmb();
+
+                if (prog && skb)
+                        sk2 = run_bpf(reuse, socks, prog, skb, hdr_len);
+                else
+                        sk2 = reuse->socks[reciprocal_scale(hash, socks)];
+        }
+
+out:
+        rcu_read_unlock();
+        return sk2;
+}
+EXPORT_SYMBOL(reuseport_select_sock);
+
+struct bpf_prog *
+reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
+{
+        struct sock_reuseport *reuse;
+        struct bpf_prog *old_prog;
+
+        spin_lock_bh(&reuseport_lock);
+        reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+                                          lockdep_is_held(&reuseport_lock));
+        old_prog = rcu_dereference_protected(reuse->prog,
+                                             lockdep_is_held(&reuseport_lock));
+        rcu_assign_pointer(reuse->prog, prog);
+        spin_unlock_bh(&reuseport_lock);
+
+        return old_prog;
+}
+EXPORT_SYMBOL(reuseport_attach_prog);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ac14ae44390d..835378365f25 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -113,6 +113,7 @@
 #include <trace/events/skb.h>
 #include <net/busy_poll.h>
 #include "udp_impl.h"
+#include <net/sock_reuseport.h>
 
 struct udp_table udp_table __read_mostly;
 EXPORT_SYMBOL(udp_table);
@@ -137,7 +138,8 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
                                unsigned long *bitmap,
                                struct sock *sk,
                                int (*saddr_comp)(const struct sock *sk1,
-                                                 const struct sock *sk2),
+                                                 const struct sock *sk2,
+                                                 bool match_wildcard),
                                unsigned int log)
 {
         struct sock *sk2;
@@ -152,8 +154,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
                     (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
                      sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
                     (!sk2->sk_reuseport || !sk->sk_reuseport ||
+                     rcu_access_pointer(sk->sk_reuseport_cb) ||
                      !uid_eq(uid, sock_i_uid(sk2))) &&
-                    saddr_comp(sk, sk2)) {
+                    saddr_comp(sk, sk2, true)) {
                         if (!bitmap)
                                 return 1;
                         __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap);
@@ -170,7 +173,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
                                 struct udp_hslot *hslot2,
                                 struct sock *sk,
                                 int (*saddr_comp)(const struct sock *sk1,
-                                                  const struct sock *sk2))
+                                                  const struct sock *sk2,
+                                                  bool match_wildcard))
 {
         struct sock *sk2;
         struct hlist_nulls_node *node;
@@ -186,8 +190,9 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
                     (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
                      sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
                     (!sk2->sk_reuseport || !sk->sk_reuseport ||
+                     rcu_access_pointer(sk->sk_reuseport_cb) ||
                      !uid_eq(uid, sock_i_uid(sk2))) &&
-                    saddr_comp(sk, sk2)) {
+                    saddr_comp(sk, sk2, true)) {
                         res = 1;
                         break;
                 }
@@ -196,6 +201,35 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
         return res;
 }
 
+static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
+                                  int (*saddr_same)(const struct sock *sk1,
+                                                    const struct sock *sk2,
+                                                    bool match_wildcard))
+{
+        struct net *net = sock_net(sk);
+        struct hlist_nulls_node *node;
+        kuid_t uid = sock_i_uid(sk);
+        struct sock *sk2;
+
+        sk_nulls_for_each(sk2, node, &hslot->head) {
+                if (net_eq(sock_net(sk2), net) &&
+                    sk2 != sk &&
+                    sk2->sk_family == sk->sk_family &&
+                    ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
+                    (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) &&
+                    (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+                    sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
+                    (*saddr_same)(sk, sk2, false)) {
+                        return reuseport_add_sock(sk, sk2);
+                }
+        }
+
+        /* Initial allocation may have already happened via setsockopt */
+        if (!rcu_access_pointer(sk->sk_reuseport_cb))
+                return reuseport_alloc(sk);
+        return 0;
+}
+
 /**
  *  udp_lib_get_port  -  UDP/-Lite port lookup for IPv4 and IPv6
  *
@@ -207,7 +241,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
  */
 int udp_lib_get_port(struct sock *sk, unsigned short snum,
                      int (*saddr_comp)(const struct sock *sk1,
-                                       const struct sock *sk2),
+                                       const struct sock *sk2,
+                                       bool match_wildcard),
                      unsigned int hash2_nulladdr)
 {
         struct udp_hslot *hslot, *hslot2;
@@ -290,6 +325,14 @@ found:
         udp_sk(sk)->udp_port_hash = snum;
         udp_sk(sk)->udp_portaddr_hash ^= snum;
         if (sk_unhashed(sk)) {
+                if (sk->sk_reuseport &&
+                    udp_reuseport_add_sock(sk, hslot, saddr_comp)) {
+                        inet_sk(sk)->inet_num = 0;
+                        udp_sk(sk)->udp_port_hash = 0;
+                        udp_sk(sk)->udp_portaddr_hash ^= snum;
+                        goto fail_unlock;
+                }
+
                 sk_nulls_add_node_rcu(sk, &hslot->head);
                 hslot->count++;
                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
@@ -309,13 +352,22 @@ fail:
 }
 EXPORT_SYMBOL(udp_lib_get_port);
 
-static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
+/* match_wildcard == true:  0.0.0.0 equals to any IPv4 addresses
+ * match_wildcard == false: addresses must be exactly the same, i.e.
+ *                          0.0.0.0 only equals to 0.0.0.0
+ */
+static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2,
+                                bool match_wildcard)
 {
         struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
 
-        return (!ipv6_only_sock(sk2) &&
-                (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr ||
-                 inet1->inet_rcv_saddr == inet2->inet_rcv_saddr));
+        if (!ipv6_only_sock(sk2)) {
+                if (inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)
+                        return 1;
+                if (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr)
+                        return match_wildcard;
+        }
+        return 0;
 }
 
 static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr,
@@ -459,8 +511,14 @@ begin:
                         badness = score;
                         reuseport = sk->sk_reuseport;
                         if (reuseport) {
+                                struct sock *sk2;
                                 hash = udp_ehashfn(net, daddr, hnum,
                                                    saddr, sport);
+                                sk2 = reuseport_select_sock(sk, hash, NULL, 0);
+                                if (sk2) {
+                                        result = sk2;
+                                        goto found;
+                                }
                                 matches = 1;
                         }
                 } else if (score == badness && reuseport) {
@@ -478,6 +536,7 @@ begin:
         if (get_nulls_value(node) != slot2)
                 goto begin;
         if (result) {
+found:
                 if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
                         result = NULL;
                 else if (unlikely(compute_score2(result, net, saddr, sport,
@@ -494,7 +553,7 @@ begin:
  */
 struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
                 __be16 sport, __be32 daddr, __be16 dport,
-                int dif, struct udp_table *udptable)
+                int dif, struct udp_table *udptable, struct sk_buff *skb)
 {
         struct sock *sk, *result;
         struct hlist_nulls_node *node;
@@ -540,8 +599,15 @@ begin:
                         badness = score;
                         reuseport = sk->sk_reuseport;
                         if (reuseport) {
+                                struct sock *sk2;
                                 hash = udp_ehashfn(net, daddr, hnum,
                                                    saddr, sport);
+                                sk2 = reuseport_select_sock(sk, hash, skb,
+                                                        sizeof(struct udphdr));
+                                if (sk2) {
+                                        result = sk2;
+                                        goto found;
+                                }
                                 matches = 1;
                         }
                 } else if (score == badness && reuseport) {
@@ -560,6 +626,7 @@ begin:
                 goto begin;
 
         if (result) {
+found:
                 if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
                         result = NULL;
                 else if (unlikely(compute_score(result, net, saddr, hnum, sport,
@@ -581,13 +648,14 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
         return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport,
                                  iph->daddr, dport, inet_iif(skb),
-                                 udptable);
+                                 udptable, skb);
 }
 
 struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
                              __be32 daddr, __be16 dport, int dif)
 {
-        return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table);
+        return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif,
+                                 &udp_table, NULL);
 }
 EXPORT_SYMBOL_GPL(udp4_lib_lookup);
 
@@ -635,7 +703,8 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
         struct net *net = dev_net(skb->dev);
 
         sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
-                        iph->saddr, uh->source, skb->dev->ifindex, udptable);
+                        iph->saddr, uh->source, skb->dev->ifindex, udptable,
+                        NULL);
         if (!sk) {
                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
                 return; /* No socket for error */
@@ -1398,6 +1467,8 @@ void udp_lib_unhash(struct sock *sk)
                 hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
 
                 spin_lock_bh(&hslot->lock);
+                if (rcu_access_pointer(sk->sk_reuseport_cb))
+                        reuseport_detach_sock(sk);
                 if (sk_nulls_del_node_init_rcu(sk)) {
                         hslot->count--;
                         inet_sk(sk)->inet_num = 0;
@@ -1425,22 +1496,28 @@ void udp_lib_rehash(struct sock *sk, u16 newhash)
                 hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
                 nhslot2 = udp_hashslot2(udptable, newhash);
                 udp_sk(sk)->udp_portaddr_hash = newhash;
-                if (hslot2 != nhslot2) {
+
+                if (hslot2 != nhslot2 ||
+                    rcu_access_pointer(sk->sk_reuseport_cb)) {
                         hslot = udp_hashslot(udptable, sock_net(sk),
                                              udp_sk(sk)->udp_port_hash);
                         /* we must lock primary chain too */
                         spin_lock_bh(&hslot->lock);
-
-                        spin_lock(&hslot2->lock);
-                        hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
-                        hslot2->count--;
-                        spin_unlock(&hslot2->lock);
-
-                        spin_lock(&nhslot2->lock);
-                        hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
-                                                 &nhslot2->head);
-                        nhslot2->count++;
-                        spin_unlock(&nhslot2->lock);
+                        if (rcu_access_pointer(sk->sk_reuseport_cb))
+                                reuseport_detach_sock(sk);
+
+                        if (hslot2 != nhslot2) {
+                                spin_lock(&hslot2->lock);
+                                hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+                                hslot2->count--;
+                                spin_unlock(&hslot2->lock);
+
+                                spin_lock(&nhslot2->lock);
+                                hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+                                                         &nhslot2->head);
+                                nhslot2->count++;
+                                spin_unlock(&nhslot2->lock);
+                        }
 
                         spin_unlock_bh(&hslot->lock);
                 }
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 6116604bf6e8..df1966f3b6ec 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -44,7 +44,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
                 sk = __udp4_lib_lookup(net,
                                 req->id.idiag_src[0], req->id.idiag_sport,
                                 req->id.idiag_dst[0], req->id.idiag_dport,
-                                req->id.idiag_if, tbl);
+                                req->id.idiag_if, tbl, NULL);
 #if IS_ENABLED(CONFIG_IPV6)
         else if (req->sdiag_family == AF_INET6)
                 sk = __udp6_lib_lookup(net,
@@ -52,7 +52,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
                                 req->id.idiag_sport,
                                 (struct in6_addr *)req->id.idiag_dst,
                                 req->id.idiag_dport,
-                                req->id.idiag_if, tbl);
+                                req->id.idiag_if, tbl, NULL);
 #endif
         else
                 goto out_nosk;
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index a7ca2cde2ecb..36c3f0155010 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -51,12 +51,12 @@ int inet6_csk_bind_conflict(const struct sock *sk,
                              (sk2->sk_state != TCP_TIME_WAIT &&
                               !uid_eq(uid,
                                       sock_i_uid((struct sock *)sk2))))) {
-                                if (ipv6_rcv_saddr_equal(sk, sk2))
+                                if (ipv6_rcv_saddr_equal(sk, sk2, true))
                                         break;
                         }
                         if (!relax && reuse && sk2->sk_reuse &&
                             sk2->sk_state != TCP_LISTEN &&
-                            ipv6_rcv_saddr_equal(sk, sk2))
+                            ipv6_rcv_saddr_equal(sk, sk2, true))
                                 break;
                 }
         }
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 00775ee27d86..56fcb55fda31 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -47,6 +47,7 @@
 #include <net/xfrm.h>
 #include <net/inet6_hashtables.h>
 #include <net/busy_poll.h>
+#include <net/sock_reuseport.h>
 
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
@@ -76,7 +77,14 @@ static u32 udp6_ehashfn(const struct net *net,
                                udp_ipv6_hash_secret + net_hash_mix(net));
 }
 
-int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
+/* match_wildcard == true:  IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
+ *                          only, and any IPv4 addresses if not IPv6 only
+ * match_wildcard == false: addresses must be exactly the same, i.e.
+ *                          IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
+ *                          and 0.0.0.0 equals to 0.0.0.0 only
+ */
+int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
+                         bool match_wildcard)
 {
         const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
         int sk2_ipv6only = inet_v6_ipv6only(sk2);
@@ -84,16 +92,24 @@ int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
         int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
 
         /* if both are mapped, treat as IPv4 */
-        if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED)
-                return (!sk2_ipv6only &&
-                        (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr ||
-                          sk->sk_rcv_saddr == sk2->sk_rcv_saddr));
+        if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
+                if (!sk2_ipv6only) {
+                        if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr)
+                                return 1;
+                        if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr)
+                                return match_wildcard;
+                }
+                return 0;
+        }
+
+        if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
+                return 1;
 
-        if (addr_type2 == IPV6_ADDR_ANY &&
+        if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
             !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
                 return 1;
 
-        if (addr_type == IPV6_ADDR_ANY &&
+        if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
             !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED))
                 return 1;
 
@@ -253,8 +269,14 @@ begin:
                         badness = score;
                         reuseport = sk->sk_reuseport;
                         if (reuseport) {
+                                struct sock *sk2;
                                 hash = udp6_ehashfn(net, daddr, hnum,
                                                     saddr, sport);
+                                sk2 = reuseport_select_sock(sk, hash, NULL, 0);
+                                if (sk2) {
+                                        result = sk2;
+                                        goto found;
+                                }
                                 matches = 1;
                         }
                 } else if (score == badness && reuseport) {
@@ -273,6 +295,7 @@ begin:
                 goto begin;
 
         if (result) {
+found:
                 if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
                         result = NULL;
                 else if (unlikely(compute_score2(result, net, saddr, sport,
@@ -287,7 +310,8 @@ begin:
 
 struct sock *__udp6_lib_lookup(struct net *net,
                                       const struct in6_addr *saddr, __be16 sport,
                                       const struct in6_addr *daddr, __be16 dport,
-                                      int dif, struct udp_table *udptable)
+                                      int dif, struct udp_table *udptable,
+                                      struct sk_buff *skb)
 {
         struct sock *sk, *result;
         struct hlist_nulls_node *node;
@@ -332,8 +356,15 @@ begin:
                         badness = score;
                         reuseport = sk->sk_reuseport;
                         if (reuseport) {
+                                struct sock *sk2;
                                 hash = udp6_ehashfn(net, daddr, hnum,
                                                     saddr, sport);
+                                sk2 = reuseport_select_sock(sk, hash, skb,
+                                                        sizeof(struct udphdr));
+                                if (sk2) {
+                                        result = sk2;
+                                        goto found;
+                                }
                                 matches = 1;
                         }
                 } else if (score == badness && reuseport) {
@@ -352,6 +383,7 @@ begin:
                 goto begin;
 
         if (result) {
+found:
                 if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
                         result = NULL;
                 else if (unlikely(compute_score(result, net, hnum, saddr, sport,
@@ -377,13 +409,13 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
                 return sk;
         return __udp6_lib_lookup(dev_net(skb_dst(skb)->dev), &iph->saddr, sport,
                                  &iph->daddr, dport, inet6_iif(skb),
-                                 udptable);
+                                 udptable, skb);
 }
 
 struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
                              const struct in6_addr *daddr, __be16 dport, int dif)
 {
-        return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table);
+        return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table, NULL);
 }
 EXPORT_SYMBOL_GPL(udp6_lib_lookup);
 
@@ -549,8 +581,8 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
         int err;
         struct net *net = dev_net(skb->dev);
 
-        sk = __udp6_lib_lookup(net, daddr, uh->dest,
-                               saddr, uh->source, inet6_iif(skb), udptable);
+        sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
+                               inet6_iif(skb), udptable, skb);
         if (!sk) {
                 ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
                                    ICMP6_MIB_INERRORS);
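When no BPF program is attached, reuseport_select_sock() above picks reuse->socks[reciprocal_scale(hash, socks)]: a multiply-and-shift that maps the precomputed flow hash uniformly onto [0, num_socks) without a division. A standalone sketch of that arithmetic (the helper mirrors the kernel's reciprocal_scale() in include/linux/kernel.h; the sample hash and group size are illustrative):

```c
#include <stdint.h>
#include <stdio.h>

/* Maps a u32 value uniformly onto [0, ep_ro) with a widening multiply
 * and a shift instead of a modulo, as the kernel helper does.
 */
static uint32_t reciprocal_scale(uint32_t val, uint32_t ep_ro)
{
        return (uint32_t)(((uint64_t)val * ep_ro) >> 32);
}

int main(void)
{
        uint32_t hash = 0x9e3779b9;     /* illustrative flow hash */
        uint16_t socks = 4;             /* illustrative group size */

        /* The index is always < socks; here (0x9e3779b9 * 4) >> 32 == 2. */
        printf("selected index: %u\n", reciprocal_scale(hash, socks));
        return 0;
}
```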