From e4a6a3424b75f23f6bb1cc479974fc305a4b9f78 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 13 Jul 2017 14:27:58 +0800 Subject: bpf: fix return in bpf_skb_adjust_net The bpf_skb_adjust_net() ignores the return value of bpf_skb_net_shrink/grow, and always return 0, fix it by return 'ret'. Signed-off-by: Kefeng Wang Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index c7f737058d89..f44fc22fd45a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2248,7 +2248,7 @@ static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff) bpf_skb_net_grow(skb, len_diff_abs); bpf_compute_data_end(skb); - return 0; + return ret; } BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, -- cgit v1.2.3-70-g09d2 From 5e349fc03cdd44c670aaf3affaa6f0426c5e553f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 13 Jul 2017 12:22:24 +0100 Subject: dccp: make const array error_code static Don't populate array error_code on the stack but make it static. Makes the object code smaller by almost 250 bytes: Before: text data bss dec hex filename 10366 983 0 11349 2c55 net/dccp/input.o After: text data bss dec hex filename 10161 1039 0 11200 2bc0 net/dccp/input.o Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/dccp/input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dccp/input.c b/net/dccp/input.c index 4a05d7876850..fa6be9750bb4 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -126,7 +126,7 @@ static int dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb) static u16 dccp_reset_code_convert(const u8 code) { - const u16 error_code[] = { + static const u16 error_code[] = { [DCCP_RESET_CODE_CLOSED] = 0, /* normal termination */ [DCCP_RESET_CODE_UNSPECIFIED] = 0, /* nothing known */ [DCCP_RESET_CODE_ABORTED] = ECONNRESET, -- cgit v1.2.3-70-g09d2 From 5d89fb33223e0be32e4100623b915048b02beeec Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 13 Jul 2017 13:36:40 -0700 Subject: net: set fib rule refcount after malloc The configure callback of fib_rules_ops can change the refcnt of a fib rule. For instance, mlxsw takes a refcnt when adding the processing of the rule to a work queue. Thus the rule refcnt can not be reset to to 1 afterwards. Move the refcnt setting to after the allocation. Fixes: 5361e209dd30 ("net: avoid one splat in fib_nl_delrule()") Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/fib_rules.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index a0093e1b0235..fdcb1bcd2afa 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -400,6 +400,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, err = -ENOMEM; goto errout; } + refcount_set(&rule->refcnt, 1); rule->fr_net = net; rule->pref = tb[FRA_PRIORITY] ? nla_get_u32(tb[FRA_PRIORITY]) @@ -517,8 +518,6 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, last = r; } - refcount_set(&rule->refcnt, 1); - if (last) list_add_rcu(&rule->list, &last->list); else -- cgit v1.2.3-70-g09d2 From 230cd1279d0019d52f9529c7d91c96d095cae755 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Wed, 12 Jul 2017 15:56:41 -0700 Subject: netpoll: shut up a kernel warning on refcount When we convert atomic_t to refcount_t, a new kernel warning on "increment on 0" is introduced in the netpoll code, zap_completion_queue(). In fact for this special case, we know the refcount is 0 and we just have to set it to 1 to satisfy the following dev_kfree_skb_any(), so we can just use refcount_set(..., 1) instead. Fixes: 633547973ffc ("net: convert sk_buff.users from atomic_t to refcount_t") Reported-by: Dave Jones Cc: Reshetova, Elena Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/netpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index d3408a693166..8357f164c660 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -277,7 +277,7 @@ static void zap_completion_queue(void) struct sk_buff *skb = clist; clist = clist->next; if (!skb_irq_freeable(skb)) { - refcount_inc(&skb->users); + refcount_set(&skb->users, 1); dev_kfree_skb_any(skb); /* put this one back */ } else { __kfree_skb(skb); -- cgit v1.2.3-70-g09d2 From 31a4562d7408493c6377933ff2f7d7302dbdea80 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 13 Jul 2017 16:09:10 +0300 Subject: net: bridge: fix dest lookup when vlan proto doesn't match With 802.1ad support the vlan_ingress code started checking for vlan protocol mismatch which causes the current tag to be inserted and the bridge vlan protocol & pvid to be set. The vlan tag insertion changes the skb mac_header and thus the lookup mac dest pointer which was loaded prior to calling br_allowed_ingress in br_handle_frame_finish is VLAN_HLEN bytes off now, pointing to the last two bytes of the destination mac and the first four of the source mac causing lookups to always fail and broadcasting all such packets to all ports. Same thing happens for locally originated packets when passing via br_dev_xmit. So load the dest pointer after the vlan checks and possible skb change. Fixes: 8580e2117c06 ("bridge: Prepare for 802.1ad vlan filtering support") Reported-by: Anitha Narasimha Murthy Signed-off-by: Nikolay Aleksandrov Acked-by: Toshiaki Makita Signed-off-by: David S. Miller --- net/bridge/br_device.c | 3 ++- net/bridge/br_input.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index f0f3447e8aa4..861ae2a165f4 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -34,11 +34,11 @@ static struct lock_class_key bridge_netdev_addr_lock_key; netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); - const unsigned char *dest = skb->data; struct net_bridge_fdb_entry *dst; struct net_bridge_mdb_entry *mdst; struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats); const struct nf_br_ops *nf_ops; + const unsigned char *dest; u16 vid = 0; rcu_read_lock(); @@ -61,6 +61,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid)) goto out; + dest = eth_hdr(skb)->h_dest; if (is_broadcast_ether_addr(dest)) { br_flood(br, skb, BR_PKT_BROADCAST, false, true); } else if (is_multicast_ether_addr(dest)) { diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 013f2290bfa5..7637f58c1226 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -131,11 +131,11 @@ static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_bridge_port *p = br_port_get_rcu(skb->dev); - const unsigned char *dest = eth_hdr(skb)->h_dest; enum br_pkt_type pkt_type = BR_PKT_UNICAST; struct net_bridge_fdb_entry *dst = NULL; struct net_bridge_mdb_entry *mdst; bool local_rcv, mcast_hit = false; + const unsigned char *dest; struct net_bridge *br; u16 vid = 0; @@ -153,6 +153,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false); local_rcv = !!(br->dev->flags & IFF_PROMISC); + dest = eth_hdr(skb)->h_dest; if (is_multicast_ether_addr(dest)) { /* by definition the broadcast is also a multicast address */ if (is_broadcast_ether_addr(dest)) { -- cgit v1.2.3-70-g09d2 From ccd4eb49f3392ebf989d58bd013a7bf44cdca4d6 Mon Sep 17 00:00:00 2001 From: Iván Briano Date: Thu, 13 Jul 2017 09:46:58 -0700 Subject: net/packet: Fix Tx queue selection for AF_PACKET MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When PACKET_QDISC_BYPASS is not used, Tx queue selection will be done before the packet is enqueued, taking into account any mappings set by a queuing discipline such as mqprio without hardware offloading. This selection may be affected by a previously saved queue_mapping, either on the Rx path, or done before the packet reaches the device, as it's currently the case for AF_PACKET. In order for queue selection to work as expected when using traffic control, there can't be another selection done before that point is reached, so move the call to packet_pick_tx_queue to packet_direct_xmit, leaving the default xmit path as it was before PACKET_QDISC_BYPASS was introduced. A forward declaration of packet_pick_tx_queue() is introduced to avoid the need to reorder the functions within the file. Fixes: d346a3fae3ff ("packet: introduce PACKET_QDISC_BYPASS socket option") Signed-off-by: Iván Briano Signed-off-by: David S. Miller --- net/packet/af_packet.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e3beb28203eb..008bb34ee324 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -214,6 +214,7 @@ static void prb_clear_rxhash(struct tpacket_kbdq_core *, static void prb_fill_vlan_info(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void packet_flush_mclist(struct sock *sk); +static void packet_pick_tx_queue(struct net_device *dev, struct sk_buff *skb); struct packet_skb_cb { union { @@ -260,6 +261,7 @@ static int packet_direct_xmit(struct sk_buff *skb) if (skb != orig_skb) goto drop; + packet_pick_tx_queue(dev, skb); txq = skb_get_tx_queue(dev, skb); local_bh_disable(); @@ -2747,8 +2749,6 @@ tpacket_error: goto tpacket_error; } - packet_pick_tx_queue(dev, skb); - skb->destructor = tpacket_destruct_skb; __packet_set_status(po, ph, TP_STATUS_SENDING); packet_inc_pending(&po->tx_ring); @@ -2931,8 +2931,6 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->priority = sk->sk_priority; skb->mark = sockc.mark; - packet_pick_tx_queue(dev, skb); - if (po->has_vnet_hdr) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); if (err) -- cgit v1.2.3-70-g09d2 From c4c4290c17bd099b7654d683f8ab6233b4f4a364 Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Thu, 13 Jul 2017 13:12:18 -0400 Subject: net sched actions: rename act_get_notify() to tcf_get_notify() Make name consistent with other TC event notification routines, such as tcf_add_notify() and tcf_del_notify() Signed-off-by: Roman Mashak Signed-off-by: David S. Miller --- net/sched/act_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index aed6cf2e9fd8..f2e9ed34a963 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -835,7 +835,7 @@ out_nlmsg_trim: } static int -act_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, +tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, struct list_head *actions, int event) { struct sk_buff *skb; @@ -1018,7 +1018,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, } if (event == RTM_GETACTION) - ret = act_get_notify(net, portid, n, &actions, event); + ret = tcf_get_notify(net, portid, n, &actions, event); else { /* delete */ ret = tcf_del_notify(net, n, &actions, portid); if (ret) -- cgit v1.2.3-70-g09d2 From 10b3bf54406bb7f4e78da9bb2a485c5c986678ad Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 14 Jul 2017 22:07:33 +0800 Subject: sctp: fix an array overflow when all ext chunks are set Marcelo noticed an array overflow caused by commit c28445c3cb07 ("sctp: add reconf_enable in asoc ep and netns"), in which sctp would add SCTP_CID_RECONF into extensions when reconf_enable is set in sctp_make_init and sctp_make_init_ack. Then now when all ext chunks are set, 4 ext chunk ids can be put into extensions array while extensions array size is 3. It would cause a kernel panic because of this overflow. This patch is to fix it by defining extensions array size is 4 in both sctp_make_init and sctp_make_init_ack. Fixes: c28445c3cb07 ("sctp: add reconf_enable in asoc ep and netns") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/sm_make_chunk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 4e16b02ed832..6110447fe51d 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -228,7 +228,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, sctp_adaptation_ind_param_t aiparam; sctp_supported_ext_param_t ext_param; int num_ext = 0; - __u8 extensions[3]; + __u8 extensions[4]; struct sctp_paramhdr *auth_chunks = NULL, *auth_hmacs = NULL; @@ -396,7 +396,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, sctp_adaptation_ind_param_t aiparam; sctp_supported_ext_param_t ext_param; int num_ext = 0; - __u8 extensions[3]; + __u8 extensions[4]; struct sctp_paramhdr *auth_chunks = NULL, *auth_hmacs = NULL, *auth_random = NULL; -- cgit v1.2.3-70-g09d2 From 254d900b801fc04aa524ff7bafe28fdd1dbf0ed6 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Fri, 14 Jul 2017 12:04:16 +0300 Subject: ipv4: ip_do_fragment: fix headroom tests Some time ago David Woodhouse reported skb_under_panic when we try to push ethernet header to fragmented ipv6 skbs. It was fixed for ipv6 by Florian Westphal in commit 1d325d217c7f ("ipv6: ip6_fragment: fix headroom tests and skb leak") However similar problem still exist in ipv4. It does not trigger skb_under_panic due paranoid check in ip_finish_output2, however according to Alexey Kuznetsov current state is abnormal and ip_fragment should be fixed too. Signed-off-by: Vasily Averin Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 7eb252dcecee..50c74cd890bc 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -599,6 +599,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, hlen = iph->ihl * 4; mtu = mtu - hlen; /* Size of data space */ IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; + ll_rs = LL_RESERVED_SPACE(rt->dst.dev); /* When frag_list is given, use it. First, check its validity: * some transformers could create wrong frag_list or break existing @@ -614,14 +615,15 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, if (first_len - hlen > mtu || ((first_len - hlen) & 7) || ip_is_fragment(iph) || - skb_cloned(skb)) + skb_cloned(skb) || + skb_headroom(skb) < ll_rs) goto slow_path; skb_walk_frags(skb, frag) { /* Correct geometry. */ if (frag->len > mtu || ((frag->len & 7) && frag->next) || - skb_headroom(frag) < hlen) + skb_headroom(frag) < hlen + ll_rs) goto slow_path_clean; /* Partially cloned skb? */ @@ -711,8 +713,6 @@ slow_path: left = skb->len - hlen; /* Space per frame */ ptr = hlen; /* Where to start from */ - ll_rs = LL_RESERVED_SPACE(rt->dst.dev); - /* * Fragment the datagram. */ -- cgit v1.2.3-70-g09d2 From 8b97ac5bda17cfaa257bcab6180af0f43a2e87e0 Mon Sep 17 00:00:00 2001 From: Greg Rose Date: Fri, 14 Jul 2017 12:42:49 -0700 Subject: openvswitch: Fix for force/commit action failures When there is an established connection in direction A->B, it is possible to receive a packet on port B which then executes ct(commit,force) without first performing ct() - ie, a lookup. In this case, we would expect that this packet can delete the existing entry so that we can commit a connection with direction B->A. However, currently we only perform a check in skb_nfct_cached() for whether OVS_CS_F_TRACKED is set and OVS_CS_F_INVALID is not set, ie that a lookup previously occurred. In the above scenario, a lookup has not occurred but we should still be able to statelessly look up the existing entry and potentially delete the entry if it is in the opposite direction. This patch extends the check to also hint that if the action has the force flag set, then we will lookup the existing entry so that the force check at the end of skb_nfct_cached has the ability to delete the connection. Fixes: dd41d330b03 ("openvswitch: Add force commit.") CC: Pravin Shelar CC: dev@openvswitch.org Signed-off-by: Joe Stringer Signed-off-by: Greg Rose Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 51 ++++++++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 08679ebb3068..e3c4c6c3fef7 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -629,6 +629,34 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, return ct; } +static +struct nf_conn *ovs_ct_executed(struct net *net, + const struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb, + bool *ct_executed) +{ + struct nf_conn *ct = NULL; + + /* If no ct, check if we have evidence that an existing conntrack entry + * might be found for this skb. This happens when we lose a skb->_nfct + * due to an upcall, or if the direction is being forced. If the + * connection was not confirmed, it is not cached and needs to be run + * through conntrack again. + */ + *ct_executed = (key->ct_state & OVS_CS_F_TRACKED) && + !(key->ct_state & OVS_CS_F_INVALID) && + (key->ct_zone == info->zone.id); + + if (*ct_executed || (!key->ct_state && info->force)) { + ct = ovs_ct_find_existing(net, &info->zone, info->family, skb, + !!(key->ct_state & + OVS_CS_F_NAT_MASK)); + } + + return ct; +} + /* Determine whether skb->_nfct is equal to the result of conntrack lookup. */ static bool skb_nfct_cached(struct net *net, const struct sw_flow_key *key, @@ -637,24 +665,17 @@ static bool skb_nfct_cached(struct net *net, { enum ip_conntrack_info ctinfo; struct nf_conn *ct; + bool ct_executed = true; ct = nf_ct_get(skb, &ctinfo); - /* If no ct, check if we have evidence that an existing conntrack entry - * might be found for this skb. This happens when we lose a skb->_nfct - * due to an upcall. If the connection was not confirmed, it is not - * cached and needs to be run through conntrack again. - */ - if (!ct && key->ct_state & OVS_CS_F_TRACKED && - !(key->ct_state & OVS_CS_F_INVALID) && - key->ct_zone == info->zone.id) { - ct = ovs_ct_find_existing(net, &info->zone, info->family, skb, - !!(key->ct_state - & OVS_CS_F_NAT_MASK)); - if (ct) - nf_ct_get(skb, &ctinfo); - } if (!ct) + ct = ovs_ct_executed(net, key, info, skb, &ct_executed); + + if (ct) + nf_ct_get(skb, &ctinfo); + else return false; + if (!net_eq(net, read_pnet(&ct->ct_net))) return false; if (!nf_ct_zone_equal_any(info->ct, nf_ct_zone(ct))) @@ -679,7 +700,7 @@ static bool skb_nfct_cached(struct net *net, return false; } - return true; + return ct_executed; } #ifdef CONFIG_NF_NAT_NEEDED -- cgit v1.2.3-70-g09d2 From 4aea287e90dd61a48268ff2994b56f9799441b62 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 14 Jul 2017 17:49:21 -0400 Subject: tcp_bbr: cut pacing rate only if filled pipe In bbr_set_pacing_rate(), which decides whether to cut the pacing rate, there was some code that considered exiting STARTUP to be equivalent to the notion of filling the pipe (i.e., bbr_full_bw_reached()). Specifically, as the code was structured, exiting STARTUP and going into PROBE_RTT could cause us to cut the pacing rate down to something silly and low, based on whatever bandwidth samples we've had so far, when it's possible that all of them have been small app-limited bandwidth samples that are not representative of the bandwidth available in the path. (The code was correct at the time it was written, but the state machine changed without this spot being adjusted correspondingly.) Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_bbr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index dbcc9352a48f..743e97511dc8 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -220,12 +220,11 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) */ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) { - struct bbr *bbr = inet_csk_ca(sk); u64 rate = bw; rate = bbr_rate_bytes_per_sec(sk, rate, gain); rate = min_t(u64, rate, sk->sk_max_pacing_rate); - if (bbr->mode != BBR_STARTUP || rate > sk->sk_pacing_rate) + if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) sk->sk_pacing_rate = rate; } -- cgit v1.2.3-70-g09d2 From f19fd62dafaf1ed6cf615dba655b82fa9df59074 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 14 Jul 2017 17:49:22 -0400 Subject: tcp_bbr: introduce bbr_bw_to_pacing_rate() helper Introduce a helper to convert a BBR bandwidth and gain factor to a pacing rate in bytes per second. This is a pure refactor, but is needed for two following fixes. Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_bbr.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 743e97511dc8..29e23b851b97 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -211,6 +211,16 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) return rate >> BW_SCALE; } +/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) +{ + u64 rate = bw; + + rate = bbr_rate_bytes_per_sec(sk, rate, gain); + rate = min_t(u64, rate, sk->sk_max_pacing_rate); + return rate; +} + /* Pace using current bw estimate and a gain factor. In order to help drive the * network toward lower queues while maintaining high utilization and low * latency, the average pacing rate aims to be slightly (~1%) lower than the @@ -220,10 +230,8 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) */ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) { - u64 rate = bw; + u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain); - rate = bbr_rate_bytes_per_sec(sk, rate, gain); - rate = min_t(u64, rate, sk->sk_max_pacing_rate); if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) sk->sk_pacing_rate = rate; } -- cgit v1.2.3-70-g09d2 From 79135b89b8af304456bd67916b80116ddf03d7b6 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 14 Jul 2017 17:49:23 -0400 Subject: tcp_bbr: introduce bbr_init_pacing_rate_from_rtt() helper Introduce a helper to initialize the BBR pacing rate unconditionally, based on the current cwnd and RTT estimate. This is a pure refactor, but is needed for two following fixes. Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_bbr.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 29e23b851b97..3276140c2506 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -221,6 +221,23 @@ static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) return rate; } +/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ +static void bbr_init_pacing_rate_from_rtt(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + u64 bw; + u32 rtt_us; + + if (tp->srtt_us) { /* any RTT sample yet? */ + rtt_us = max(tp->srtt_us >> 3, 1U); + } else { /* no RTT sample yet */ + rtt_us = USEC_PER_MSEC; /* use nominal default RTT */ + } + bw = (u64)tp->snd_cwnd * BW_UNIT; + do_div(bw, rtt_us); + sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain); +} + /* Pace using current bw estimate and a gain factor. In order to help drive the * network toward lower queues while maintaining high utilization and low * latency, the average pacing rate aims to be slightly (~1%) lower than the @@ -805,7 +822,6 @@ static void bbr_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); - u64 bw; bbr->prior_cwnd = 0; bbr->tso_segs_goal = 0; /* default segs per skb until first ACK */ @@ -821,11 +837,8 @@ static void bbr_init(struct sock *sk) minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ - /* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ - bw = (u64)tp->snd_cwnd * BW_UNIT; - do_div(bw, (tp->srtt_us >> 3) ? : USEC_PER_MSEC); sk->sk_pacing_rate = 0; /* force an update of sk_pacing_rate */ - bbr_set_pacing_rate(sk, bw, bbr_high_gain); + bbr_init_pacing_rate_from_rtt(sk); bbr->restore_cwnd = 0; bbr->round_start = 0; -- cgit v1.2.3-70-g09d2 From 1d3648eb5d1fe9ed3d095ed8fa19ad11ca4c8bc0 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 14 Jul 2017 17:49:24 -0400 Subject: tcp_bbr: remove sk_pacing_rate=0 transient during init Fix a corner case noticed by Eric Dumazet, where BBR's setting sk->sk_pacing_rate to 0 during initialization could theoretically cause packets in the sending host to hang if there were packets "in flight" in the pacing infrastructure at the time the BBR congestion control state is initialized. This could occur if the pacing infrastructure happened to race with bbr_init() in a way such that the pacer read the 0 rather than the immediately following non-zero pacing rate. Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control") Reported-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_bbr.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 3276140c2506..42e0017f2ebc 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -837,7 +837,6 @@ static void bbr_init(struct sock *sk) minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ - sk->sk_pacing_rate = 0; /* force an update of sk_pacing_rate */ bbr_init_pacing_rate_from_rtt(sk); bbr->restore_cwnd = 0; -- cgit v1.2.3-70-g09d2 From 32984565574da7ed3afa10647bb4020d7a9e6c93 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 14 Jul 2017 17:49:25 -0400 Subject: tcp_bbr: init pacing rate on first RTT sample Fixes the following behavior: for connections that had no RTT sample at the time of initializing congestion control, BBR was initializing the pacing rate to a high nominal rate (based an a guess of RTT=1ms, in case this is LAN traffic). Then BBR never adjusted the pacing rate downward upon obtaining an actual RTT sample, if the connection never filled the pipe (e.g. all sends were small app-limited writes()). This fix adjusts the pacing rate upon obtaining the first RTT sample. Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_bbr.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 42e0017f2ebc..69ee877574d0 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -112,7 +112,8 @@ struct bbr { cwnd_gain:10, /* current gain for setting cwnd */ full_bw_cnt:3, /* number of rounds without large bw gains */ cycle_idx:3, /* current index in pacing_gain cycle array */ - unused_b:6; + has_seen_rtt:1, /* have we seen an RTT sample yet? */ + unused_b:5; u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ u32 full_bw; /* recent bw, to estimate if pipe is full */ }; @@ -225,11 +226,13 @@ static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) static void bbr_init_pacing_rate_from_rtt(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); u64 bw; u32 rtt_us; if (tp->srtt_us) { /* any RTT sample yet? */ rtt_us = max(tp->srtt_us >> 3, 1U); + bbr->has_seen_rtt = 1; } else { /* no RTT sample yet */ rtt_us = USEC_PER_MSEC; /* use nominal default RTT */ } @@ -247,8 +250,12 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) */ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain); + if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) + bbr_init_pacing_rate_from_rtt(sk); if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) sk->sk_pacing_rate = rate; } @@ -837,6 +844,7 @@ static void bbr_init(struct sock *sk) minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ + bbr->has_seen_rtt = 0; bbr_init_pacing_rate_from_rtt(sk); bbr->restore_cwnd = 0; -- cgit v1.2.3-70-g09d2 From f55ce7b024090a51382ccab2730b96e2f7b4e9cf Mon Sep 17 00:00:00 2001 From: Mateusz Jurczyk Date: Wed, 7 Jun 2017 15:50:38 +0200 Subject: netfilter: nfnetlink: Improve input length sanitization in nfnetlink_rcv Verify that the length of the socket buffer is sufficient to cover the nlmsghdr structure before accessing the nlh->nlmsg_len field for further input sanitization. If the client only supplies 1-3 bytes of data in sk_buff, then nlh->nlmsg_len remains partially uninitialized and contains leftover memory from the corresponding kernel allocation. Operating on such data may result in indeterminate evaluation of the nlmsg_len < NLMSG_HDRLEN expression. The bug was discovered by a runtime instrumentation designed to detect use of uninitialized memory in the kernel. The patch prevents this and other similar tools (e.g. KMSAN) from flagging this behavior in the future. Signed-off-by: Mateusz Jurczyk Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 92b05e188fd1..733d3e4a30d8 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -472,8 +472,7 @@ static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh) if (msglen > skb->len) msglen = skb->len; - if (nlh->nlmsg_len < NLMSG_HDRLEN || - skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg)) + if (skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg)) return; err = nla_parse(cda, NFNL_BATCH_MAX, attr, attrlen, nfnl_batch_policy, @@ -500,7 +499,8 @@ static void nfnetlink_rcv(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); - if (nlh->nlmsg_len < NLMSG_HDRLEN || + if (skb->len < NLMSG_HDRLEN || + nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) return; -- cgit v1.2.3-70-g09d2 From cf56c2f892a8a1870a8358114ad896772da7543a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 6 Jul 2017 23:17:44 +0200 Subject: netfilter: remove old pre-netns era hook api no more users in the tree, remove this. The old api is racy wrt. module removal, all users have been converted to the netns-aware api. The old api pretended we still have global hooks but that has not been true for a long time. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 9 --- net/netfilter/core.c | 143 ---------------------------------------------- 2 files changed, 152 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index a4b97be30b28..22f081065d49 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -61,8 +61,6 @@ typedef unsigned int nf_hookfn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); struct nf_hook_ops { - struct list_head list; - /* User fills in from here down. */ nf_hookfn *hook; struct net_device *dev; @@ -160,13 +158,6 @@ int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg, void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, unsigned int n); -int nf_register_hook(struct nf_hook_ops *reg); -void nf_unregister_hook(struct nf_hook_ops *reg); -int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n); -void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n); -int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n); -void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n); - /* Functions to register get/setsockopt ranges (non-inclusive). You need to check permissions yourself! */ int nf_register_sockopt(struct nf_sockopt_ops *reg); diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 552d606e57ca..368610dbc3c0 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -227,114 +227,6 @@ void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, } EXPORT_SYMBOL(nf_unregister_net_hooks); -static LIST_HEAD(nf_hook_list); - -static int _nf_register_hook(struct nf_hook_ops *reg) -{ - struct net *net, *last; - int ret; - - for_each_net(net) { - ret = nf_register_net_hook(net, reg); - if (ret && ret != -ENOENT) - goto rollback; - } - list_add_tail(®->list, &nf_hook_list); - - return 0; -rollback: - last = net; - for_each_net(net) { - if (net == last) - break; - nf_unregister_net_hook(net, reg); - } - return ret; -} - -int nf_register_hook(struct nf_hook_ops *reg) -{ - int ret; - - rtnl_lock(); - ret = _nf_register_hook(reg); - rtnl_unlock(); - - return ret; -} -EXPORT_SYMBOL(nf_register_hook); - -static void _nf_unregister_hook(struct nf_hook_ops *reg) -{ - struct net *net; - - list_del(®->list); - for_each_net(net) - nf_unregister_net_hook(net, reg); -} - -void nf_unregister_hook(struct nf_hook_ops *reg) -{ - rtnl_lock(); - _nf_unregister_hook(reg); - rtnl_unlock(); -} -EXPORT_SYMBOL(nf_unregister_hook); - -int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n) -{ - unsigned int i; - int err = 0; - - for (i = 0; i < n; i++) { - err = nf_register_hook(®[i]); - if (err) - goto err; - } - return err; - -err: - if (i > 0) - nf_unregister_hooks(reg, i); - return err; -} -EXPORT_SYMBOL(nf_register_hooks); - -/* Caller MUST take rtnl_lock() */ -int _nf_register_hooks(struct nf_hook_ops *reg, unsigned int n) -{ - unsigned int i; - int err = 0; - - for (i = 0; i < n; i++) { - err = _nf_register_hook(®[i]); - if (err) - goto err; - } - return err; - -err: - if (i > 0) - _nf_unregister_hooks(reg, i); - return err; -} -EXPORT_SYMBOL(_nf_register_hooks); - -void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) -{ - while (n-- > 0) - nf_unregister_hook(®[n]); -} -EXPORT_SYMBOL(nf_unregister_hooks); - -/* Caller MUST take rtnl_lock */ -void _nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) -{ - while (n-- > 0) - _nf_unregister_hook(®[n]); -} -EXPORT_SYMBOL(_nf_unregister_hooks); - /* Returns 1 if okfn() needs to be executed by the caller, * -EPERM for NF_DROP, 0 otherwise. Caller must hold rcu_read_lock. */ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, @@ -450,37 +342,6 @@ void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *); EXPORT_SYMBOL(nf_nat_decode_session_hook); #endif -static int nf_register_hook_list(struct net *net) -{ - struct nf_hook_ops *elem; - int ret; - - rtnl_lock(); - list_for_each_entry(elem, &nf_hook_list, list) { - ret = nf_register_net_hook(net, elem); - if (ret && ret != -ENOENT) - goto out_undo; - } - rtnl_unlock(); - return 0; - -out_undo: - list_for_each_entry_continue_reverse(elem, &nf_hook_list, list) - nf_unregister_net_hook(net, elem); - rtnl_unlock(); - return ret; -} - -static void nf_unregister_hook_list(struct net *net) -{ - struct nf_hook_ops *elem; - - rtnl_lock(); - list_for_each_entry(elem, &nf_hook_list, list) - nf_unregister_net_hook(net, elem); - rtnl_unlock(); -} - static int __net_init netfilter_net_init(struct net *net) { int i, h, ret; @@ -500,16 +361,12 @@ static int __net_init netfilter_net_init(struct net *net) return -ENOMEM; } #endif - ret = nf_register_hook_list(net); - if (ret) - remove_proc_entry("netfilter", net->proc_net); return ret; } static void __net_exit netfilter_net_exit(struct net *net) { - nf_unregister_hook_list(net); remove_proc_entry("netfilter", net->proc_net); } -- cgit v1.2.3-70-g09d2 From 97772bcd56efa21d9d8976db6f205574ea602f51 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Jul 2017 13:07:17 +0200 Subject: netfilter: nat: fix src map lookup When doing initial conversion to rhashtable I replaced the bucket walk with a single rhashtable_lookup_fast(). When moving to rhlist I failed to properly walk the list of identical tuples, but that is what is needed for this to work correctly. The table contains the original tuples, so the reply tuples are all distinct. We currently decide that mapping is (not) in range only based on the first entry, but in case its not we need to try the reply tuple of the next entry until we either find an in-range mapping or we checked all the entries. This bug makes nat core attempt collision resolution while it might be able to use the mapping as-is. Fixes: 870190a9ec90 ("netfilter: nat: convert nat bysrc hash to rhashtable") Reported-by: Jaco Kroon Tested-by: Jaco Kroon Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_nat_core.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 832c5a08d9a5..eb541786ccb7 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -222,20 +222,21 @@ find_appropriate_src(struct net *net, .tuple = tuple, .zone = zone }; - struct rhlist_head *hl; + struct rhlist_head *hl, *h; hl = rhltable_lookup(&nf_nat_bysource_table, &key, nf_nat_bysource_params); - if (!hl) - return 0; - ct = container_of(hl, typeof(*ct), nat_bysource); + rhl_for_each_entry_rcu(ct, h, hl, nat_bysource) { + nf_ct_invert_tuplepr(result, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + result->dst = tuple->dst; - nf_ct_invert_tuplepr(result, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - result->dst = tuple->dst; + if (in_range(l3proto, l4proto, result, range)) + return 1; + } - return in_range(l3proto, l4proto, result, range); + return 0; } /* For [FUTURE] fragmentation handling, we want the least-used -- cgit v1.2.3-70-g09d2 From 974292defee033bc43ccfcb2fcefc3eba3905340 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 7 Jul 2017 13:29:03 +0200 Subject: netfilter: nf_tables: only allow in/output for arp packets arp packets cannot be forwarded. They can be bridged, but then they can be filtered using either ebtables or nftables bridge family. The bridge netfilter exposes a "call-arptables" switch which pushes packets into arptables, but lets not expose this for nftables, so better close this asap. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_tables_arp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c index 805c8ddfe860..4bbc273b45e8 100644 --- a/net/ipv4/netfilter/nf_tables_arp.c +++ b/net/ipv4/netfilter/nf_tables_arp.c @@ -72,8 +72,7 @@ static const struct nf_chain_type filter_arp = { .family = NFPROTO_ARP, .owner = THIS_MODULE, .hook_mask = (1 << NF_ARP_IN) | - (1 << NF_ARP_OUT) | - (1 << NF_ARP_FORWARD), + (1 << NF_ARP_OUT), }; static int __init nf_tables_arp_init(void) -- cgit v1.2.3-70-g09d2 From 36ac344e16e04e3e55e8fed7446095a6458c64e6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 10 Jul 2017 13:53:53 +0200 Subject: netfilter: expect: fix crash when putting uninited expectation We crash in __nf_ct_expect_check, it calls nf_ct_remove_expect on the uninitialised expectation instead of existing one, so del_timer chokes on random memory address. Fixes: ec0e3f01114ad32711243 ("netfilter: nf_ct_expect: Add nf_ct_remove_expect()") Reported-by: Sergey Kvachonok Tested-by: Sergey Kvachonok Cc: Gao Feng Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_expect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index e03d16ed550d..899c2c36da13 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -422,7 +422,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) h = nf_ct_expect_dst_hash(net, &expect->tuple); hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) { if (expect_matches(i, expect)) { - if (nf_ct_remove_expect(expect)) + if (nf_ct_remove_expect(i)) break; } else if (expect_clash(i, expect)) { ret = -EBUSY; -- cgit v1.2.3-70-g09d2 From 18bcf2907df935981266532e1e0d052aff2e6fae Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 17 Jul 2017 12:35:58 +0200 Subject: ipv4: ipv6: initialize treq->txhash in cookie_v[46]_check() KMSAN reported use of uninitialized memory in skb_set_hash_from_sk(), which originated from the TCP request socket created in cookie_v6_check(): ================================================================== BUG: KMSAN: use of uninitialized memory in tcp_transmit_skb+0xf77/0x3ec0 CPU: 1 PID: 2949 Comm: syz-execprog Not tainted 4.11.0-rc5+ #2931 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 TCP: request_sock_TCPv6: Possible SYN flooding on port 20028. Sending cookies. Check SNMP counters. Call Trace: __dump_stack lib/dump_stack.c:16 dump_stack+0x172/0x1c0 lib/dump_stack.c:52 kmsan_report+0x12a/0x180 mm/kmsan/kmsan.c:927 __msan_warning_32+0x61/0xb0 mm/kmsan/kmsan_instr.c:469 skb_set_hash_from_sk ./include/net/sock.h:2011 tcp_transmit_skb+0xf77/0x3ec0 net/ipv4/tcp_output.c:983 tcp_send_ack+0x75b/0x830 net/ipv4/tcp_output.c:3493 tcp_delack_timer_handler+0x9a6/0xb90 net/ipv4/tcp_timer.c:284 tcp_delack_timer+0x1b0/0x310 net/ipv4/tcp_timer.c:309 call_timer_fn+0x240/0x520 kernel/time/timer.c:1268 expire_timers kernel/time/timer.c:1307 __run_timers+0xc13/0xf10 kernel/time/timer.c:1601 run_timer_softirq+0x36/0xa0 kernel/time/timer.c:1614 __do_softirq+0x485/0x942 kernel/softirq.c:284 invoke_softirq kernel/softirq.c:364 irq_exit+0x1fa/0x230 kernel/softirq.c:405 exiting_irq+0xe/0x10 ./arch/x86/include/asm/apic.h:657 smp_apic_timer_interrupt+0x5a/0x80 arch/x86/kernel/apic/apic.c:966 apic_timer_interrupt+0x86/0x90 arch/x86/entry/entry_64.S:489 RIP: 0010:native_restore_fl ./arch/x86/include/asm/irqflags.h:36 RIP: 0010:arch_local_irq_restore ./arch/x86/include/asm/irqflags.h:77 RIP: 0010:__msan_poison_alloca+0xed/0x120 mm/kmsan/kmsan_instr.c:440 RSP: 0018:ffff880024917cd8 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff10 RAX: 0000000000000246 RBX: ffff8800224c0000 RCX: 0000000000000005 RDX: 0000000000000004 RSI: ffff880000000000 RDI: ffffea0000b6d770 RBP: ffff880024917d58 R08: 0000000000000dd8 R09: 0000000000000004 R10: 0000160000000000 R11: 0000000000000000 R12: ffffffff85abf810 R13: ffff880024917dd8 R14: 0000000000000010 R15: ffffffff81cabde4 poll_select_copy_remaining+0xac/0x6b0 fs/select.c:293 SYSC_select+0x4b4/0x4e0 fs/select.c:653 SyS_select+0x76/0xa0 fs/select.c:634 entry_SYSCALL_64_fastpath+0x13/0x94 arch/x86/entry/entry_64.S:204 RIP: 0033:0x4597e7 RSP: 002b:000000c420037ee0 EFLAGS: 00000246 ORIG_RAX: 0000000000000017 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00000000004597e7 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: 000000c420037ef0 R08: 000000c420037ee0 R09: 0000000000000059 R10: 0000000000000000 R11: 0000000000000246 R12: 000000000042dc20 R13: 00000000000000f3 R14: 0000000000000030 R15: 0000000000000003 chained origin: save_stack_trace+0x37/0x40 arch/x86/kernel/stacktrace.c:59 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:302 kmsan_save_stack mm/kmsan/kmsan.c:317 kmsan_internal_chain_origin+0x12a/0x1f0 mm/kmsan/kmsan.c:547 __msan_store_shadow_origin_4+0xac/0x110 mm/kmsan/kmsan_instr.c:259 tcp_create_openreq_child+0x709/0x1ae0 net/ipv4/tcp_minisocks.c:472 tcp_v6_syn_recv_sock+0x7eb/0x2a30 net/ipv6/tcp_ipv6.c:1103 tcp_get_cookie_sock+0x136/0x5f0 net/ipv4/syncookies.c:212 cookie_v6_check+0x17a9/0x1b50 net/ipv6/syncookies.c:245 tcp_v6_cookie_check net/ipv6/tcp_ipv6.c:989 tcp_v6_do_rcv+0xdd8/0x1c60 net/ipv6/tcp_ipv6.c:1298 tcp_v6_rcv+0x41a3/0x4f00 net/ipv6/tcp_ipv6.c:1487 ip6_input_finish+0x82f/0x1ee0 net/ipv6/ip6_input.c:279 NF_HOOK ./include/linux/netfilter.h:257 ip6_input+0x239/0x290 net/ipv6/ip6_input.c:322 dst_input ./include/net/dst.h:492 ip6_rcv_finish net/ipv6/ip6_input.c:69 NF_HOOK ./include/linux/netfilter.h:257 ipv6_rcv+0x1dbd/0x22e0 net/ipv6/ip6_input.c:203 __netif_receive_skb_core+0x2f6f/0x3a20 net/core/dev.c:4208 __netif_receive_skb net/core/dev.c:4246 process_backlog+0x667/0xba0 net/core/dev.c:4866 napi_poll net/core/dev.c:5268 net_rx_action+0xc95/0x1590 net/core/dev.c:5333 __do_softirq+0x485/0x942 kernel/softirq.c:284 origin: save_stack_trace+0x37/0x40 arch/x86/kernel/stacktrace.c:59 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:302 kmsan_internal_poison_shadow+0xb1/0x1a0 mm/kmsan/kmsan.c:198 kmsan_kmalloc+0x7f/0xe0 mm/kmsan/kmsan.c:337 kmem_cache_alloc+0x1c2/0x1e0 mm/slub.c:2766 reqsk_alloc ./include/net/request_sock.h:87 inet_reqsk_alloc+0xa4/0x5b0 net/ipv4/tcp_input.c:6200 cookie_v6_check+0x4f4/0x1b50 net/ipv6/syncookies.c:169 tcp_v6_cookie_check net/ipv6/tcp_ipv6.c:989 tcp_v6_do_rcv+0xdd8/0x1c60 net/ipv6/tcp_ipv6.c:1298 tcp_v6_rcv+0x41a3/0x4f00 net/ipv6/tcp_ipv6.c:1487 ip6_input_finish+0x82f/0x1ee0 net/ipv6/ip6_input.c:279 NF_HOOK ./include/linux/netfilter.h:257 ip6_input+0x239/0x290 net/ipv6/ip6_input.c:322 dst_input ./include/net/dst.h:492 ip6_rcv_finish net/ipv6/ip6_input.c:69 NF_HOOK ./include/linux/netfilter.h:257 ipv6_rcv+0x1dbd/0x22e0 net/ipv6/ip6_input.c:203 __netif_receive_skb_core+0x2f6f/0x3a20 net/core/dev.c:4208 __netif_receive_skb net/core/dev.c:4246 process_backlog+0x667/0xba0 net/core/dev.c:4866 napi_poll net/core/dev.c:5268 net_rx_action+0xc95/0x1590 net/core/dev.c:5333 __do_softirq+0x485/0x942 kernel/softirq.c:284 ================================================================== Similar error is reported for cookie_v4_check(). Fixes: 58d607d3e52f ("tcp: provide skb->hash to synack packets") Signed-off-by: Alexander Potapenko Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/syncookies.c | 1 + net/ipv6/syncookies.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 0905cf04c2a4..03ad8778c395 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -335,6 +335,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = cookie; treq->ts_off = 0; + treq->txhash = net_tx_rndhash(); req->mss = mss; ireq->ir_num = ntohs(th->dest); ireq->ir_rmt_port = th->source; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 7b75b0620730..4e7817abc0b9 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -216,6 +216,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) treq->rcv_isn = ntohl(th->seq) - 1; treq->snt_isn = cookie; treq->ts_off = 0; + treq->txhash = net_tx_rndhash(); /* * We need to lookup the dst_entry to get the correct window size. -- cgit v1.2.3-70-g09d2 From 0ddf3fb2c43d2e65aee5de158ed694ea11ef229d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 18 Jul 2017 11:57:55 +0200 Subject: udp: preserve skb->dst if required for IP options processing Eric noticed that in udp_recvmsg() we still need to access skb->dst while processing the IP options. Since commit 0a463c78d25b ("udp: avoid a cache miss on dequeue") skb->dst is no more available at recvmsg() time and bad things will happen if we enter the relevant code path. This commit address the issue, avoid clearing skb->dst if any IP options are present into the relevant skb. Since the IP CB is contained in the first skb cacheline, we can test it to decide to leverage the consume_stateless_skb() optimization, without measurable additional cost in the faster path. v1 -> v2: updated commit message tags Fixes: 0a463c78d25b ("udp: avoid a cache miss on dequeue") Reported-by: Andrey Konovalov Reported-by: Eric Dumazet Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/ipv4/udp.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 25294d43e147..b057653ceca9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1388,6 +1388,11 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) unlock_sock_fast(sk, slow); } + /* we cleared the head states previously only if the skb lacks any IP + * options, see __udp_queue_rcv_skb(). + */ + if (unlikely(IPCB(skb)->opt.optlen > 0)) + skb_release_head_state(skb); consume_stateless_skb(skb); } EXPORT_SYMBOL_GPL(skb_consume_udp); @@ -1779,8 +1784,12 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) sk_mark_napi_id_once(sk, skb); } - /* clear all pending head states while they are hot in the cache */ - skb_release_head_state(skb); + /* At recvmsg() time we need skb->dst to process IP options-related + * cmsg, elsewhere can we clear all pending head states while they are + * hot in the cache + */ + if (likely(IPCB(skb)->opt.optlen == 0)) + skb_release_head_state(skb); rc = __udp_enqueue_schedule_skb(sk, skb); if (rc < 0) { -- cgit v1.2.3-70-g09d2 From 073dd5ad34b1d3aaadaa7e5e8cbe576d9545f163 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 18 Jul 2017 22:38:56 +0300 Subject: netfilter: fix netfilter_net_init() return We accidentally return an uninitialized variable. Fixes: cf56c2f892a8 ("netfilter: remove old pre-netns era hook api") Signed-off-by: Dan Carpenter Acked-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/netfilter/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 368610dbc3c0..974cf2a3795a 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -344,7 +344,7 @@ EXPORT_SYMBOL(nf_nat_decode_session_hook); static int __net_init netfilter_net_init(struct net *net) { - int i, h, ret; + int i, h; for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) { for (h = 0; h < NF_MAX_HOOKS; h++) @@ -362,7 +362,7 @@ static int __net_init netfilter_net_init(struct net *net) } #endif - return ret; + return 0; } static void __net_exit netfilter_net_exit(struct net *net) -- cgit v1.2.3-70-g09d2 From 98de4e0ea47d106846fc0e30ce4e644283fa7fc2 Mon Sep 17 00:00:00 2001 From: "Levin, Alexander" Date: Tue, 18 Jul 2017 04:23:16 +0000 Subject: wireless: wext: terminate ifr name coming from userspace ifr name is assumed to be a valid string by the kernel, but nothing was forcing username to pass a valid string. In turn, this would cause panics as we tried to access the string past it's valid memory. Signed-off-by: Sasha Levin Signed-off-by: David S. Miller --- net/core/dev_ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 82fd4c9c4a1b..7657ad6bc13d 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -424,6 +424,8 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (copy_from_user(&iwr, arg, sizeof(iwr))) return -EFAULT; + iwr.ifr_name[sizeof(iwr.ifr_name) - 1] = 0; + return wext_handle_ioctl(net, &iwr, cmd, arg); } -- cgit v1.2.3-70-g09d2 From 63679112c536289826fec61c917621de95ba2ade Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 19 Jul 2017 13:33:24 -0700 Subject: net: Zero terminate ifr_name in dev_ifname(). The ifr.ifr_name is passed around and assumed to be NULL terminated. Signed-off-by: David S. Miller --- net/core/dev_ioctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 7657ad6bc13d..06b147d7d9e2 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -28,6 +28,7 @@ static int dev_ifname(struct net *net, struct ifreq __user *arg) if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) return -EFAULT; + ifr.ifr_name[IFNAMSIZ-1] = 0; error = netdev_get_name(net, ifr.ifr_name, ifr.ifr_ifindex); if (error) -- cgit v1.2.3-70-g09d2 From 3753654e541938717b13f2b25791c3171a3a06aa Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 19 Jul 2017 10:22:40 -0700 Subject: Revert "rtnetlink: Do not generate notifications for CHANGEADDR event" This reverts commit cd8966e75ed3c6b41a37047a904617bc44fa481f. The duplicate CHANGEADDR event message is sent regardless of link status whereas the setlink changes only generate a notification when the link is up. Not sending a notification when the link is down breaks dhcpcd which only processes hwaddr changes when the link is down. Fixes reported regression: https://bugzilla.kernel.org/show_bug.cgi?id=196355 Reported-by: Yaroslav Isakov Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d1ba90980be1..11b25fbf3dd2 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4241,6 +4241,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi switch (event) { case NETDEV_REBOOT: + case NETDEV_CHANGEADDR: case NETDEV_CHANGENAME: case NETDEV_FEAT_CHANGE: case NETDEV_BONDING_FAILOVER: -- cgit v1.2.3-70-g09d2 From 6399f1fae4ec29fab5ec76070435555e256ca3a6 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 19 Jul 2017 22:28:55 +0200 Subject: ipv6: avoid overflow of offset in ip6_find_1stfragopt In some cases, offset can overflow and can cause an infinite loop in ip6_find_1stfragopt(). Make it unsigned int to prevent the overflow, and cap it at IPV6_MAXPLEN, since packets larger than that should be invalid. This problem has been here since before the beginning of git history. Signed-off-by: Sabrina Dubroca Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/output_core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index e9065b8d3af8..abb2c307fbe8 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -78,7 +78,7 @@ EXPORT_SYMBOL(ipv6_select_ident); int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) { - u16 offset = sizeof(struct ipv6hdr); + unsigned int offset = sizeof(struct ipv6hdr); unsigned int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); int found_rhdr = 0; @@ -86,6 +86,7 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) while (offset <= packet_len) { struct ipv6_opt_hdr *exthdr; + unsigned int len; switch (**nexthdr) { @@ -111,7 +112,10 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + offset); - offset += ipv6_optlen(exthdr); + len = ipv6_optlen(exthdr); + if (len + offset >= IPV6_MAXPLEN) + return -EINVAL; + offset += len; *nexthdr = &exthdr->nexthdr; } -- cgit v1.2.3-70-g09d2 From 153711f9421be5dbc973dc57a4109dc9d54c89b1 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Thu, 20 Jul 2017 11:27:57 -0700 Subject: rtnetlink: allocate more memory for dev_set_mac_address() virtnet_set_mac_address() interprets mac address as struct sockaddr, but upper layer only allocates dev->addr_len which is ETH_ALEN + sizeof(sa_family_t) in this case. We lack a unified definition for mac address, so just fix the upper layer, this also allows drivers to interpret it to struct sockaddr freely. Reported-by: David Ahern Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 11b25fbf3dd2..9201e3621351 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2031,7 +2031,8 @@ static int do_setlink(const struct sk_buff *skb, struct sockaddr *sa; int len; - len = sizeof(sa_family_t) + dev->addr_len; + len = sizeof(sa_family_t) + max_t(size_t, dev->addr_len, + sizeof(*sa)); sa = kmalloc(len, GFP_KERNEL); if (!sa) { err = -ENOMEM; -- cgit v1.2.3-70-g09d2 From 8799a221f5944a7d74516ecf46d58c28ec1d1f75 Mon Sep 17 00:00:00 2001 From: Mahesh Bandewar Date: Wed, 19 Jul 2017 15:41:33 -0700 Subject: ipv4: initialize fib_trie prior to register_netdev_notifier call. Net stack initialization currently initializes fib-trie after the first call to netdevice_notifier() call. In fact fib_trie initialization needs to happen before first rtnl_register(). It does not cause any problem since there are no devices UP at this moment, but trying to bring 'lo' UP at initialization would make this assumption wrong and exposes the issue. Fixes following crash Call Trace: ? alternate_node_alloc+0x76/0xa0 fib_table_insert+0x1b7/0x4b0 fib_magic.isra.17+0xea/0x120 fib_add_ifaddr+0x7b/0x190 fib_netdev_event+0xc0/0x130 register_netdevice_notifier+0x1c1/0x1d0 ip_fib_init+0x72/0x85 ip_rt_init+0x187/0x1e9 ip_init+0xe/0x1a inet_init+0x171/0x26c ? ipv4_offload_init+0x66/0x66 do_one_initcall+0x43/0x160 kernel_init_freeable+0x191/0x219 ? rest_init+0x80/0x80 kernel_init+0xe/0x150 ret_from_fork+0x22/0x30 Code: f6 46 23 04 74 86 4c 89 f7 e8 ae 45 01 00 49 89 c7 4d 85 ff 0f 85 7b ff ff ff 31 db eb 08 4c 89 ff e8 16 47 01 00 48 8b 44 24 38 <45> 8b 6e 14 4d 63 76 74 48 89 04 24 0f 1f 44 00 00 48 83 c4 08 RIP: kmem_cache_alloc+0xcf/0x1c0 RSP: ffff9b1500017c28 CR2: 0000000000000014 Fixes: 7b1a74fdbb9e ("[NETNS]: Refactor fib initialization so it can handle multiple namespaces.") Fixes: 7f9b80529b8a ("[IPV4]: fib hash|trie initialization") Signed-off-by: Mahesh Bandewar Acked-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 4e678fa892dd..044d2a159a3c 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1334,13 +1334,14 @@ static struct pernet_operations fib_net_ops = { void __init ip_fib_init(void) { - rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL); - rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL); - rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL); + fib_trie_init(); register_pernet_subsys(&fib_net_ops); + register_netdevice_notifier(&fib_netdev_notifier); register_inetaddr_notifier(&fib_inetaddr_notifier); - fib_trie_init(); + rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, NULL); + rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, NULL); + rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, NULL); } -- cgit v1.2.3-70-g09d2 From e623a48ee433985f6ca0fb238f0002cc2eccdf53 Mon Sep 17 00:00:00 2001 From: Håkon Bugge Date: Thu, 20 Jul 2017 12:28:55 +0200 Subject: rds: Make sure updates to cp_send_gen can be observed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cp->cp_send_gen is treated as a normal variable, although it may be used by different threads. This is fixed by using {READ,WRITE}_ONCE when it is incremented and READ_ONCE when it is read outside the {acquire,release}_in_xmit protection. Normative reference from the Linux-Kernel Memory Model: Loads from and stores to shared (but non-atomic) variables should be protected with the READ_ONCE(), WRITE_ONCE(), and ACCESS_ONCE(). Clause 5.1.2.4/25 in the C standard is also relevant. Signed-off-by: Håkon Bugge Reviewed-by: Knut Omang Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/send.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index e81aa176f4e2..41b9f0f5bb9c 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -170,8 +170,8 @@ restart: * The acquire_in_xmit() check above ensures that only one * caller can increment c_send_gen at any time. */ - cp->cp_send_gen++; - send_gen = cp->cp_send_gen; + send_gen = READ_ONCE(cp->cp_send_gen) + 1; + WRITE_ONCE(cp->cp_send_gen, send_gen); /* * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT, @@ -431,7 +431,7 @@ over_batch: smp_mb(); if ((test_bit(0, &conn->c_map_queued) || !list_empty(&cp->cp_send_queue)) && - send_gen == cp->cp_send_gen) { + send_gen == READ_ONCE(cp->cp_send_gen)) { rds_stats_inc(s_send_lock_queue_raced); if (batch_count < send_batch_count) goto restart; -- cgit v1.2.3-70-g09d2