From 75e8d06d4308436055d1a78a2c02bf6328ba724d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 14 Jan 2015 15:33:57 +0100 Subject: netfilter: nf_tables: validate hooks in NAT expressions The user can crash the kernel if it uses any of the existing NAT expressions from the wrong hook, so add some code to validate this when loading the rule. This patch introduces nft_chain_validate_hooks() which is based on an existing function in the bridge version of the reject expression. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ net/bridge/netfilter/nft_reject_bridge.c | 29 +++++------------------ net/netfilter/nf_tables_api.c | 18 ++++++++++++++ net/netfilter/nft_masq.c | 26 ++++++++++++++------- net/netfilter/nft_nat.c | 40 ++++++++++++++++++++++++-------- net/netfilter/nft_redir.c | 25 +++++++++++++------- 6 files changed, 90 insertions(+), 50 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3ae969e3acf0..9eaaa7884586 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -530,6 +530,8 @@ enum nft_chain_type { int nft_chain_validate_dependency(const struct nft_chain *chain, enum nft_chain_type type); +int nft_chain_validate_hooks(const struct nft_chain *chain, + unsigned int hook_flags); struct nft_stats { u64 bytes; diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index b0330aecbf97..3244aead0926 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -265,22 +265,12 @@ out: data[NFT_REG_VERDICT].verdict = NF_DROP; } -static int nft_reject_bridge_validate_hooks(const struct nft_chain *chain) +static int nft_reject_bridge_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) { - struct nft_base_chain *basechain; - - if (chain->flags & NFT_BASE_CHAIN) { - basechain = nft_base_chain(chain); - - switch (basechain->ops[0].hooknum) { - case NF_BR_PRE_ROUTING: - case NF_BR_LOCAL_IN: - break; - default: - return -EOPNOTSUPP; - } - } - return 0; + return nft_chain_validate_hooks(ctx->chain, (1 << NF_BR_PRE_ROUTING) | + (1 << NF_BR_LOCAL_IN)); } static int nft_reject_bridge_init(const struct nft_ctx *ctx, @@ -290,7 +280,7 @@ static int nft_reject_bridge_init(const struct nft_ctx *ctx, struct nft_reject *priv = nft_expr_priv(expr); int icmp_code, err; - err = nft_reject_bridge_validate_hooks(ctx->chain); + err = nft_reject_bridge_validate(ctx, expr, NULL); if (err < 0) return err; @@ -341,13 +331,6 @@ nla_put_failure: return -1; } -static int nft_reject_bridge_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) -{ - return nft_reject_bridge_validate_hooks(ctx->chain); -} - static struct nft_expr_type nft_reject_bridge_type; static const struct nft_expr_ops nft_reject_bridge_ops = { .type = &nft_reject_bridge_type, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 3b3ddb4fb9ee..7e686948ddca 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3753,6 +3753,24 @@ int nft_chain_validate_dependency(const struct nft_chain *chain, } EXPORT_SYMBOL_GPL(nft_chain_validate_dependency); +int nft_chain_validate_hooks(const struct nft_chain *chain, + unsigned int hook_flags) +{ + struct nft_base_chain *basechain; + + if (chain->flags & NFT_BASE_CHAIN) { + basechain = nft_base_chain(chain); + + if ((1 << basechain->ops[0].hooknum) & hook_flags) + return 0; + + return -EOPNOTSUPP; + } + + return 0; +} +EXPORT_SYMBOL_GPL(nft_chain_validate_hooks); + /* * Loop detection - walk through the ruleset beginning at the destination chain * of a new jump until either the source chain is reached (loop) or all diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index d1ffd5eb3a9b..9aea747b43ea 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -21,6 +21,21 @@ const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = { }; EXPORT_SYMBOL_GPL(nft_masq_policy); +int nft_masq_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + int err; + + err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); + if (err < 0) + return err; + + return nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_POST_ROUTING)); +} +EXPORT_SYMBOL_GPL(nft_masq_validate); + int nft_masq_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -28,8 +43,8 @@ int nft_masq_init(const struct nft_ctx *ctx, struct nft_masq *priv = nft_expr_priv(expr); int err; - err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); - if (err < 0) + err = nft_masq_validate(ctx, expr, NULL); + if (err) return err; if (tb[NFTA_MASQ_FLAGS] == NULL) @@ -60,12 +75,5 @@ nla_put_failure: } EXPORT_SYMBOL_GPL(nft_masq_dump); -int nft_masq_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, - const struct nft_data **data) -{ - return nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); -} -EXPORT_SYMBOL_GPL(nft_masq_validate); - MODULE_LICENSE("GPL"); MODULE_AUTHOR("Arturo Borrero Gonzalez "); diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index aff54fb1c8a0..a0837c6c9283 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -88,17 +88,40 @@ static const struct nla_policy nft_nat_policy[NFTA_NAT_MAX + 1] = { [NFTA_NAT_FLAGS] = { .type = NLA_U32 }, }; -static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, - const struct nlattr * const tb[]) +static int nft_nat_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) { struct nft_nat *priv = nft_expr_priv(expr); - u32 family; int err; err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); if (err < 0) return err; + switch (priv->type) { + case NFT_NAT_SNAT: + err = nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_IN)); + break; + case NFT_NAT_DNAT: + err = nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_OUT)); + break; + } + + return err; +} + +static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_nat *priv = nft_expr_priv(expr); + u32 family; + int err; + if (tb[NFTA_NAT_TYPE] == NULL || (tb[NFTA_NAT_REG_ADDR_MIN] == NULL && tb[NFTA_NAT_REG_PROTO_MIN] == NULL)) @@ -115,6 +138,10 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return -EINVAL; } + err = nft_nat_validate(ctx, expr, NULL); + if (err < 0) + return err; + if (tb[NFTA_NAT_FAMILY] == NULL) return -EINVAL; @@ -219,13 +246,6 @@ nla_put_failure: return -1; } -static int nft_nat_validate(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nft_data **data) -{ - return nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); -} - static struct nft_expr_type nft_nat_type; static const struct nft_expr_ops nft_nat_ops = { .type = &nft_nat_type, diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index 9e8093f28311..d7e9e93a4e90 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -23,6 +23,22 @@ const struct nla_policy nft_redir_policy[NFTA_REDIR_MAX + 1] = { }; EXPORT_SYMBOL_GPL(nft_redir_policy); +int nft_redir_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) +{ + int err; + + err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); + if (err < 0) + return err; + + return nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_OUT)); +} +EXPORT_SYMBOL_GPL(nft_redir_validate); + int nft_redir_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -30,7 +46,7 @@ int nft_redir_init(const struct nft_ctx *ctx, struct nft_redir *priv = nft_expr_priv(expr); int err; - err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); + err = nft_redir_validate(ctx, expr, NULL); if (err < 0) return err; @@ -88,12 +104,5 @@ nla_put_failure: } EXPORT_SYMBOL_GPL(nft_redir_dump); -int nft_redir_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, - const struct nft_data **data) -{ - return nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); -} -EXPORT_SYMBOL_GPL(nft_redir_validate); - MODULE_LICENSE("GPL"); MODULE_AUTHOR("Arturo Borrero Gonzalez "); -- cgit v1.2.3-70-g09d2 From e8781f70a5b210a1b08cff8ce05895ebcec18d83 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 21 Jan 2015 18:04:18 +0100 Subject: netfilter: nf_tables: disable preemption when restoring chain counters With CONFIG_DEBUG_PREEMPT=y [22144.496057] BUG: using smp_processor_id() in preemptible [00000000] code: iptables-compat/10406 [22144.496061] caller is debug_smp_processor_id+0x17/0x1b [22144.496065] CPU: 2 PID: 10406 Comm: iptables-compat Not tainted 3.19.0-rc4+ # [...] [22144.496092] Call Trace: [22144.496098] [] dump_stack+0x4f/0x7b [22144.496104] [] check_preemption_disabled+0xd6/0xe8 [22144.496110] [] debug_smp_processor_id+0x17/0x1b [22144.496120] [] nft_stats_alloc+0x94/0xc7 [nf_tables] [22144.496130] [] nf_tables_newchain+0x471/0x6d8 [nf_tables] [22144.496140] [] ? nft_trans_alloc+0x18/0x34 [nf_tables] [22144.496154] [] nfnetlink_rcv_batch+0x2b4/0x457 [nfnetlink] Reported-by: Andreas Schultz Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 7e686948ddca..b54360634e95 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1134,9 +1134,11 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) /* Restore old counters on this cpu, no problem. Per-cpu statistics * are not exposed to userspace. */ + preempt_disable(); stats = this_cpu_ptr(newstats); stats->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); stats->pkts = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); + preempt_enable(); return newstats; } -- cgit v1.2.3-70-g09d2 From e73ebb0881ea5534ce606c1d71b4ac44db5c6930 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 28 Jan 2015 20:01:35 -0500 Subject: tcp: stretch ACK fixes prep LRO, GRO, delayed ACKs, and middleboxes can cause "stretch ACKs" that cover more than the RFC-specified maximum of 2 packets. These stretch ACKs can cause serious performance shortfalls in common congestion control algorithms that were designed and tuned years ago with receiver hosts that were not using LRO or GRO, and were instead politely ACKing every other packet. This patch series fixes Reno and CUBIC to handle stretch ACKs. This patch prepares for the upcoming stretch ACK bug fix patches. It adds an "acked" parameter to tcp_cong_avoid_ai() to allow for future fixes to tcp_cong_avoid_ai() to correctly handle stretch ACKs, and changes all congestion control algorithms to pass in 1 for the ACKed count. It also changes tcp_slow_start() to return the number of packet ACK "credits" that were not processed in slow start mode, and can be processed by the congestion control module in additive increase mode. In future patches we will fix tcp_cong_avoid_ai() to handle stretch ACKs, and fix Reno and CUBIC handling of stretch ACKs in slow start and additive increase mode. Reported-by: Eyal Perry Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 4 ++-- net/ipv4/tcp_bic.c | 2 +- net/ipv4/tcp_cong.c | 11 +++++++---- net/ipv4/tcp_cubic.c | 2 +- net/ipv4/tcp_scalable.c | 3 ++- net/ipv4/tcp_veno.c | 2 +- net/ipv4/tcp_yeah.c | 2 +- 7 files changed, 15 insertions(+), 11 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index f50f29faf76f..9d9111ef43ae 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -834,8 +834,8 @@ void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); int tcp_set_congestion_control(struct sock *sk, const char *name); -void tcp_slow_start(struct tcp_sock *tp, u32 acked); -void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w); +u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); +void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); u32 tcp_reno_ssthresh(struct sock *sk); void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index bb395d46a389..c037644eafb7 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -150,7 +150,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) tcp_slow_start(tp, acked); else { bictcp_update(ca, tp->snd_cwnd); - tcp_cong_avoid_ai(tp, ca->cnt); + tcp_cong_avoid_ai(tp, ca->cnt, 1); } } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 27ead0dd16bc..6826017c12d1 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -291,25 +291,28 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and * returns the leftover acks to adjust cwnd in congestion avoidance mode. */ -void tcp_slow_start(struct tcp_sock *tp, u32 acked) +u32 tcp_slow_start(struct tcp_sock *tp, u32 acked) { u32 cwnd = tp->snd_cwnd + acked; if (cwnd > tp->snd_ssthresh) cwnd = tp->snd_ssthresh + 1; + acked -= cwnd - tp->snd_cwnd; tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); + + return acked; } EXPORT_SYMBOL_GPL(tcp_slow_start); /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */ -void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w) +void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) { if (tp->snd_cwnd_cnt >= w) { if (tp->snd_cwnd < tp->snd_cwnd_clamp) tp->snd_cwnd++; tp->snd_cwnd_cnt = 0; } else { - tp->snd_cwnd_cnt++; + tp->snd_cwnd_cnt += acked; } } EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); @@ -333,7 +336,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) tcp_slow_start(tp, acked); /* In dangerous area, increase slowly. */ else - tcp_cong_avoid_ai(tp, tp->snd_cwnd); + tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1); } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 6b6002416a73..df4bc4d87e58 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -320,7 +320,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) tcp_slow_start(tp, acked); } else { bictcp_update(ca, tp->snd_cwnd); - tcp_cong_avoid_ai(tp, ca->cnt); + tcp_cong_avoid_ai(tp, ca->cnt, 1); } } diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 6824afb65d93..333bcb2415ff 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -25,7 +25,8 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (tp->snd_cwnd <= tp->snd_ssthresh) tcp_slow_start(tp, acked); else - tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)); + tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT), + 1); } static u32 tcp_scalable_ssthresh(struct sock *sk) diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index a4d2d2d88dca..112151eeee45 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -159,7 +159,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) /* In the "non-congestive state", increase cwnd * every rtt. */ - tcp_cong_avoid_ai(tp, tp->snd_cwnd); + tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1); } else { /* In the "congestive state", increase cwnd * every other rtt. diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index cd7273218598..17d35662930d 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -92,7 +92,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) } else { /* Reno */ - tcp_cong_avoid_ai(tp, tp->snd_cwnd); + tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1); } /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. -- cgit v1.2.3-70-g09d2 From 814d488c61260521b1b3cc97063700a5a6667c8f Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 28 Jan 2015 20:01:36 -0500 Subject: tcp: fix the timid additive increase on stretch ACKs tcp_cong_avoid_ai() was too timid (snd_cwnd increased too slowly) on "stretch ACKs" -- cases where the receiver ACKed more than 1 packet in a single ACK. For example, suppose w is 10 and we get a stretch ACK for 20 packets, so acked is 20. We ought to increase snd_cwnd by 2 (since acked/w = 20/10 = 2), but instead we were only increasing cwnd by 1. This patch fixes that behavior. Reported-by: Eyal Perry Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_cong.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 6826017c12d1..faaee5338bea 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -304,16 +304,19 @@ u32 tcp_slow_start(struct tcp_sock *tp, u32 acked) } EXPORT_SYMBOL_GPL(tcp_slow_start); -/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */ +/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w), + * for every packet that was ACKed. + */ void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) { + tp->snd_cwnd_cnt += acked; if (tp->snd_cwnd_cnt >= w) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt = 0; - } else { - tp->snd_cwnd_cnt += acked; + u32 delta = tp->snd_cwnd_cnt / w; + + tp->snd_cwnd_cnt -= delta * w; + tp->snd_cwnd += delta; } + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp); } EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); -- cgit v1.2.3-70-g09d2 From c22bdca94782f05b9337d8548bde51b2f38ef17f Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 28 Jan 2015 20:01:37 -0500 Subject: tcp: fix stretch ACK bugs in Reno Change Reno to properly handle stretch ACKs in additive increase mode by passing in the count of ACKed packets to tcp_cong_avoid_ai(). In addition, if snd_cwnd crosses snd_ssthresh during slow start processing, and we then exit slow start mode, we need to carry over any remaining "credit" for packets ACKed and apply that to additive increase by passing this remaining "acked" count to tcp_cong_avoid_ai(). Reported-by: Eyal Perry Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_cong.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index faaee5338bea..8670e68e2ce6 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -335,11 +335,13 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) return; /* In "safe" area, increase. */ - if (tp->snd_cwnd <= tp->snd_ssthresh) - tcp_slow_start(tp, acked); + if (tp->snd_cwnd <= tp->snd_ssthresh) { + acked = tcp_slow_start(tp, acked); + if (!acked) + return; + } /* In dangerous area, increase slowly. */ - else - tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1); + tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked); } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); -- cgit v1.2.3-70-g09d2 From 9cd981dcf174d26805a032aefa791436da709bee Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 28 Jan 2015 20:01:38 -0500 Subject: tcp: fix stretch ACK bugs in CUBIC Change CUBIC to properly handle stretch ACKs in additive increase mode by passing in the count of ACKed packets to tcp_cong_avoid_ai(). In addition, because we are now precisely accounting for stretch ACKs, including delayed ACKs, we can now remove the delayed ACK tracking and estimation code that tracked recent delayed ACK behavior in ca->delayed_ack. Reported-by: Eyal Perry Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index df4bc4d87e58..ffc045da2fd5 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -93,9 +93,7 @@ struct bictcp { u32 epoch_start; /* beginning of an epoch */ u32 ack_cnt; /* number of acks */ u32 tcp_cwnd; /* estimated tcp cwnd */ -#define ACK_RATIO_SHIFT 4 -#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT) - u16 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ + u16 unused; u8 sample_cnt; /* number of samples to decide curr_rtt */ u8 found; /* the exit point is found? */ u32 round_start; /* beginning of each round */ @@ -114,7 +112,6 @@ static inline void bictcp_reset(struct bictcp *ca) ca->bic_K = 0; ca->delay_min = 0; ca->epoch_start = 0; - ca->delayed_ack = 2 << ACK_RATIO_SHIFT; ca->ack_cnt = 0; ca->tcp_cwnd = 0; ca->found = 0; @@ -205,12 +202,12 @@ static u32 cubic_root(u64 a) /* * Compute congestion window to use. */ -static inline void bictcp_update(struct bictcp *ca, u32 cwnd) +static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked) { u32 delta, bic_target, max_cnt; u64 offs, t; - ca->ack_cnt++; /* count the number of ACKs */ + ca->ack_cnt += acked; /* count the number of ACKed packets */ if (ca->last_cwnd == cwnd && (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) @@ -221,7 +218,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) if (ca->epoch_start == 0) { ca->epoch_start = tcp_time_stamp; /* record beginning */ - ca->ack_cnt = 1; /* start counting */ + ca->ack_cnt = acked; /* start counting */ ca->tcp_cwnd = cwnd; /* syn with cubic */ if (ca->last_max_cwnd <= cwnd) { @@ -301,7 +298,6 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) } } - ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack; if (ca->cnt == 0) /* cannot be zero */ ca->cnt = 1; } @@ -317,11 +313,12 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (tp->snd_cwnd <= tp->snd_ssthresh) { if (hystart && after(ack, ca->end_seq)) bictcp_hystart_reset(sk); - tcp_slow_start(tp, acked); - } else { - bictcp_update(ca, tp->snd_cwnd); - tcp_cong_avoid_ai(tp, ca->cnt, 1); + acked = tcp_slow_start(tp, acked); + if (!acked) + return; } + bictcp_update(ca, tp->snd_cwnd, acked); + tcp_cong_avoid_ai(tp, ca->cnt, acked); } static u32 bictcp_recalc_ssthresh(struct sock *sk) @@ -411,20 +408,10 @@ static void hystart_update(struct sock *sk, u32 delay) */ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) { - const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); u32 delay; - if (icsk->icsk_ca_state == TCP_CA_Open) { - u32 ratio = ca->delayed_ack; - - ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT; - ratio += cnt; - - ca->delayed_ack = clamp(ratio, 1U, ACK_RATIO_LIMIT); - } - /* Some calls are for duplicates without timetamps */ if (rtt_us < 0) return; -- cgit v1.2.3-70-g09d2 From d6b1a8a92a1417f8859a6937d2e6ffe2dfab4e6d Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 28 Jan 2015 20:01:39 -0500 Subject: tcp: fix timing issue in CUBIC slope calculation This patch fixes a bug in CUBIC that causes cwnd to increase slightly too slowly when multiple ACKs arrive in the same jiffy. If cwnd is supposed to increase at a rate of more than once per jiffy, then CUBIC was sometimes too slow. Because the bic_target is calculated for a future point in time, calculated with time in jiffies, the cwnd can increase over the course of the jiffy while the bic_target calculated as the proper CUBIC cwnd at time t=tcp_time_stamp+rtt does not increase, because tcp_time_stamp only increases on jiffy tick boundaries. So since the cnt is set to: ca->cnt = cwnd / (bic_target - cwnd); as cwnd increases but bic_target does not increase due to jiffy granularity, the cnt becomes too large, causing cwnd to increase too slowly. For example: - suppose at the beginning of a jiffy, cwnd=40, bic_target=44 - so CUBIC sets: ca->cnt = cwnd / (bic_target - cwnd) = 40 / (44 - 40) = 40/4 = 10 - suppose we get 10 acks, each for 1 segment, so tcp_cong_avoid_ai() increases cwnd to 41 - so CUBIC sets: ca->cnt = cwnd / (bic_target - cwnd) = 41 / (44 - 41) = 41 / 3 = 13 So now CUBIC will wait for 13 packets to be ACKed before increasing cwnd to 42, insted of 10 as it should. The fix is to avoid adjusting the slope (determined by ca->cnt) multiple times within a jiffy, and instead skip to compute the Reno cwnd, the "TCP friendliness" code path. Reported-by: Eyal Perry Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index ffc045da2fd5..4b276d1ed980 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -213,6 +213,13 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked) (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) return; + /* The CUBIC function can update ca->cnt at most once per jiffy. + * On all cwnd reduction events, ca->epoch_start is set to 0, + * which will force a recalculation of ca->cnt. + */ + if (ca->epoch_start && tcp_time_stamp == ca->last_time) + goto tcp_friendliness; + ca->last_cwnd = cwnd; ca->last_time = tcp_time_stamp; @@ -280,6 +287,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked) if (ca->last_max_cwnd == 0 && ca->cnt > 20) ca->cnt = 20; /* increase cwnd 5% per RTT */ +tcp_friendliness: /* TCP Friendly */ if (tcp_friendliness) { u32 scale = beta_scale; -- cgit v1.2.3-70-g09d2 From 59ccaaaa49b5b096cdc1f16706a9f931416b2332 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Wed, 28 Jan 2015 16:23:11 -0800 Subject: bridge: dont send notification when skb->len == 0 in rtnl_bridge_notify Reported in: https://bugzilla.kernel.org/show_bug.cgi?id=92081 This patch avoids calling rtnl_notify if the device ndo_bridge_getlink handler does not return any bytes in the skb. Alternately, the skb->len check can be moved inside rtnl_notify. For the bridge vlan case described in 92081, there is also a fix needed in bridge driver to generate a proper notification. Will fix that in subsequent patch. v2: rebase patch on net tree Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9cf6fe9ddc0c..446cbaf81185 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2895,12 +2895,16 @@ static int rtnl_bridge_notify(struct net_device *dev, u16 flags) goto errout; } + if (!skb->len) + goto errout; + rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); return 0; errout: WARN_ON(err == -EMSGSIZE); kfree_skb(skb); - rtnl_set_sk_err(net, RTNLGRP_LINK, err); + if (err) + rtnl_set_sk_err(net, RTNLGRP_LINK, err); return err; } -- cgit v1.2.3-70-g09d2 From 150ae0e94634714b23919f0c333fee28a5b199d5 Mon Sep 17 00:00:00 2001 From: karl beldan Date: Wed, 28 Jan 2015 10:58:11 +0100 Subject: lib/checksum.c: fix carry in csum_tcpudp_nofold The carry from the 64->32bits folding was dropped, e.g with: saddr=0xFFFFFFFF daddr=0xFF0000FF len=0xFFFF proto=0 sum=1, csum_tcpudp_nofold returned 0 instead of 1. Signed-off-by: Karl Beldan Cc: Al Viro Cc: Eric Dumazet Cc: Arnd Bergmann Cc: Mike Frysinger Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- lib/checksum.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/lib/checksum.c b/lib/checksum.c index 129775eb6de6..fcf38943132c 100644 --- a/lib/checksum.c +++ b/lib/checksum.c @@ -47,6 +47,15 @@ static inline unsigned short from32to16(unsigned int x) return x; } +static inline u32 from64to32(u64 x) +{ + /* add up 32-bit and 32-bit for 32+c bit */ + x = (x & 0xffffffff) + (x >> 32); + /* add up carry.. */ + x = (x & 0xffffffff) + (x >> 32); + return (u32)x; +} + static unsigned int do_csum(const unsigned char *buff, int len) { int odd; @@ -195,8 +204,7 @@ __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, #else s += (proto + len) << 8; #endif - s += (s >> 32); - return (__force __wsum)s; + return (__force __wsum)from64to32(s); } EXPORT_SYMBOL(csum_tcpudp_nofold); #endif -- cgit v1.2.3-70-g09d2 From 811230cd853d62f09ed0addd0ce9a1b9b0e13fb5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 28 Jan 2015 05:47:11 -0800 Subject: tcp: ipv4: initialize unicast_sock sk_pacing_rate When I added sk_pacing_rate field, I forgot to initialize its value in the per cpu unicast_sock used in ip_send_unicast_reply() This means that for sch_fq users, RST packets, or ACK packets sent on behalf of TIME_WAIT sockets might be sent to slowly or even dropped once we reach the per flow limit. Signed-off-by: Eric Dumazet Fixes: 95bd09eb2750 ("tcp: TSO packets automatic sizing") Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b50861b22b6b..38a20a9cca1a 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1517,6 +1517,7 @@ static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { .sk_wmem_alloc = ATOMIC_INIT(1), .sk_allocation = GFP_ATOMIC, .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE), + .sk_pacing_rate = ~0U, }, .pmtudisc = IP_PMTUDISC_WANT, .uc_ttl = -1, -- cgit v1.2.3-70-g09d2 From 9ce357795ef208faa0d59894d9d119a7434e37f3 Mon Sep 17 00:00:00 2001 From: karl beldan Date: Thu, 29 Jan 2015 11:10:22 +0100 Subject: lib/checksum.c: fix build for generic csum_tcpudp_nofold Fixed commit added from64to32 under _#ifndef do_csum_ but used it under _#ifndef csum_tcpudp_nofold_, breaking some builds (Fengguang's robot reported TILEGX's). Move from64to32 under the latter. Fixes: 150ae0e94634 ("lib/checksum.c: fix carry in csum_tcpudp_nofold") Reported-by: kbuild test robot Signed-off-by: Karl Beldan Cc: Eric Dumazet Cc: David S. Miller Signed-off-by: David S. Miller --- lib/checksum.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/lib/checksum.c b/lib/checksum.c index fcf38943132c..8b39e86dbab5 100644 --- a/lib/checksum.c +++ b/lib/checksum.c @@ -47,15 +47,6 @@ static inline unsigned short from32to16(unsigned int x) return x; } -static inline u32 from64to32(u64 x) -{ - /* add up 32-bit and 32-bit for 32+c bit */ - x = (x & 0xffffffff) + (x >> 32); - /* add up carry.. */ - x = (x & 0xffffffff) + (x >> 32); - return (u32)x; -} - static unsigned int do_csum(const unsigned char *buff, int len) { int odd; @@ -190,6 +181,15 @@ csum_partial_copy(const void *src, void *dst, int len, __wsum sum) EXPORT_SYMBOL(csum_partial_copy); #ifndef csum_tcpudp_nofold +static inline u32 from64to32(u64 x) +{ + /* add up 32-bit and 32-bit for 32+c bit */ + x = (x & 0xffffffff) + (x >> 32); + /* add up carry.. */ + x = (x & 0xffffffff) + (x >> 32); + return (u32)x; +} + __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len, unsigned short proto, -- cgit v1.2.3-70-g09d2 From 8997c27ec41127bf57421cc0205413d525421ddc Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 26 Jan 2015 22:28:13 +0100 Subject: caif: remove wrong dev_net_set() call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit src_net points to the netns where the netlink message has been received. This netns may be different from the netns where the interface is created (because the user may add IFLA_NET_NS_[PID|FD]). In this case, src_net is the link netns. It seems wrong to override the netns in the newlink() handler because if it was not already src_net, it means that the user explicitly asks to create the netdevice in another netns. CC: Sjur Brændeland CC: Dmitry Tarnyagin Fixes: 8391c4aab1aa ("caif: Bugfixes in CAIF netdevice for close and flow control") Fixes: c41254006377 ("caif-hsi: Add rtnl support") Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- drivers/net/caif/caif_hsi.c | 1 - net/caif/chnl_net.c | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c index 5e40a8b68cbe..b3b922adc0e4 100644 --- a/drivers/net/caif/caif_hsi.c +++ b/drivers/net/caif/caif_hsi.c @@ -1415,7 +1415,6 @@ static int caif_hsi_newlink(struct net *src_net, struct net_device *dev, cfhsi = netdev_priv(dev); cfhsi_netlink_parms(data, cfhsi); - dev_net_set(cfhsi->ndev, src_net); get_ops = symbol_get(cfhsi_get_ops); if (!get_ops) { diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 4589ff67bfa9..67a4a36febd1 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -470,7 +470,6 @@ static int ipcaif_newlink(struct net *src_net, struct net_device *dev, ASSERT_RTNL(); caifdev = netdev_priv(dev); caif_netlink_parms(data, &caifdev->conn_req); - dev_net_set(caifdev->netdev, src_net); ret = register_netdevice(dev); if (ret) -- cgit v1.2.3-70-g09d2 From 33564bbb2cf1c05cf3882af5d62a0b2b4a09754c Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 26 Jan 2015 22:28:14 +0100 Subject: vxlan: setup the right link netns in newlink hdlr Rename the netns to src_net to avoid confusion with the netns where the interface stands. The user may specify IFLA_NET_NS_[PID|FD] to create a x-netns netndevice: IFLA_NET_NS_[PID|FD] points to the netns where the netdevice stands and src_net to the link netns. Note that before commit f01ec1c017de ("vxlan: add x-netns support"), it was possible to create a x-netns vxlan netdevice, but the netdevice was not operational. Fixes: f01ec1c017de ("vxlan: add x-netns support") Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 7fbd89fbe107..a8c755dcab14 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2432,10 +2432,10 @@ static void vxlan_sock_work(struct work_struct *work) dev_put(vxlan->dev); } -static int vxlan_newlink(struct net *net, struct net_device *dev, +static int vxlan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { - struct vxlan_net *vn = net_generic(net, vxlan_net_id); + struct vxlan_net *vn = net_generic(src_net, vxlan_net_id); struct vxlan_dev *vxlan = netdev_priv(dev); struct vxlan_rdst *dst = &vxlan->default_dst; __u32 vni; @@ -2445,7 +2445,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, if (!data[IFLA_VXLAN_ID]) return -EINVAL; - vxlan->net = dev_net(dev); + vxlan->net = src_net; vni = nla_get_u32(data[IFLA_VXLAN_ID]); dst->remote_vni = vni; @@ -2481,7 +2481,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, if (data[IFLA_VXLAN_LINK] && (dst->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]))) { struct net_device *lowerdev - = __dev_get_by_index(net, dst->remote_ifindex); + = __dev_get_by_index(src_net, dst->remote_ifindex); if (!lowerdev) { pr_info("ifindex %d does not exist\n", dst->remote_ifindex); @@ -2557,7 +2557,7 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX])) vxlan->flags |= VXLAN_F_UDP_ZERO_CSUM6_RX; - if (vxlan_find_vni(net, vni, use_ipv6 ? AF_INET6 : AF_INET, + if (vxlan_find_vni(src_net, vni, use_ipv6 ? AF_INET6 : AF_INET, vxlan->dst_port)) { pr_info("duplicate VNI %u\n", vni); return -EEXIST; -- cgit v1.2.3-70-g09d2 From e2a4800e75780ccf4e6c2487f82b688ba736eb18 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 28 Jan 2015 10:56:04 +0100 Subject: ppp: deflate: never return len larger than output buffer When we've run out of space in the output buffer to store more data, we will call zlib_deflate with a NULL output buffer until we've consumed remaining input. When this happens, olen contains the size the output buffer would have consumed iff we'd have had enough room. This can later cause skb_over_panic when ppp_generic skb_put()s the returned length. Reported-by: Iain Douglas Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- drivers/net/ppp/ppp_deflate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ppp/ppp_deflate.c b/drivers/net/ppp/ppp_deflate.c index 602c625d95d5..b5edc7f96a39 100644 --- a/drivers/net/ppp/ppp_deflate.c +++ b/drivers/net/ppp/ppp_deflate.c @@ -246,7 +246,7 @@ static int z_compress(void *arg, unsigned char *rptr, unsigned char *obuf, /* * See if we managed to reduce the size of the packet. */ - if (olen < isize) { + if (olen < isize && olen <= osize) { state->stats.comp_bytes += olen; state->stats.comp_packets++; } else { -- cgit v1.2.3-70-g09d2 From fc9a5707832e57f2317c89101fbe9d4ccbd62fa6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 28 Jan 2015 15:15:01 +0100 Subject: net: cs89x0: always build platform code if !HAS_IOPORT_MAP The cs89x0 driver can either be built as an ISA driver or a platform driver, the choice is controlled by the CS89x0_PLATFORM Kconfig symbol. Building the ISA driver on a system that does not have a way to map I/O ports fails with this error: drivers/built-in.o: In function `cs89x0_ioport_probe.constprop.1': :(.init.text+0x4794): undefined reference to `ioport_map' :(.init.text+0x4830): undefined reference to `ioport_unmap' This changes the Kconfig logic to take that option away and always force building the platform variant of this driver if CONFIG_HAS_IOPORT_MAP is not set. This is the only correct choice in this case, and it avoids the build error. Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/ethernet/cirrus/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/cirrus/Kconfig b/drivers/net/ethernet/cirrus/Kconfig index 7403dff8f14a..905ac5f5d9a6 100644 --- a/drivers/net/ethernet/cirrus/Kconfig +++ b/drivers/net/ethernet/cirrus/Kconfig @@ -32,7 +32,8 @@ config CS89x0 will be called cs89x0. config CS89x0_PLATFORM - bool "CS89x0 platform driver support" + bool "CS89x0 platform driver support" if HAS_IOPORT_MAP + default !HAS_IOPORT_MAP depends on CS89x0 help Say Y to compile the cs89x0 driver as a platform driver. This -- cgit v1.2.3-70-g09d2 From 303c28d859cc7ab0f1f37acdd4aadf87c6e86743 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 28 Jan 2015 15:15:02 +0100 Subject: net: wan: add missing virt_to_bus dependencies The cosa driver is rather outdated and does not get built on most platforms because it requires the ISA_DMA_API symbol. However there are some ARM platforms that have ISA_DMA_API but no virt_to_bus, and they get this build error when enabling the ltpc driver. drivers/net/wan/cosa.c: In function 'tx_interrupt': drivers/net/wan/cosa.c:1768:3: error: implicit declaration of function 'virt_to_bus' unsigned long addr = virt_to_bus(cosa->txbuf); ^ The same problem exists for the Hostess SV-11 and Sealevel Systems 4021 drivers. This adds another dependency in Kconfig to avoid that configuration. Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/wan/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/wan/Kconfig b/drivers/net/wan/Kconfig index 94e234975c61..a2fdd15f285a 100644 --- a/drivers/net/wan/Kconfig +++ b/drivers/net/wan/Kconfig @@ -25,7 +25,7 @@ if WAN # There is no way to detect a comtrol sv11 - force it modular for now. config HOSTESS_SV11 tristate "Comtrol Hostess SV-11 support" - depends on ISA && m && ISA_DMA_API && INET && HDLC + depends on ISA && m && ISA_DMA_API && INET && HDLC && VIRT_TO_BUS help Driver for Comtrol Hostess SV-11 network card which operates on low speed synchronous serial links at up to @@ -37,7 +37,7 @@ config HOSTESS_SV11 # The COSA/SRP driver has not been tested as non-modular yet. config COSA tristate "COSA/SRP sync serial boards support" - depends on ISA && m && ISA_DMA_API && HDLC + depends on ISA && m && ISA_DMA_API && HDLC && VIRT_TO_BUS ---help--- Driver for COSA and SRP synchronous serial boards. @@ -87,7 +87,7 @@ config LANMEDIA # There is no way to detect a Sealevel board. Force it modular config SEALEVEL_4021 tristate "Sealevel Systems 4021 support" - depends on ISA && m && ISA_DMA_API && INET && HDLC + depends on ISA && m && ISA_DMA_API && INET && HDLC && VIRT_TO_BUS help This is a driver for the Sealevel Systems ACB 56 serial I/O adapter. -- cgit v1.2.3-70-g09d2 From e9b106b8fbdb98ceab2dfa46cebe097bd1a47bac Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 28 Jan 2015 15:15:03 +0100 Subject: net: lance,ni64: don't build for ARM The ni65 and lance ethernet drivers manually program the ISA DMA controller that is only available on x86 PCs and a few compatible systems. Trying to build it on ARM results in this error: ni65.c: In function 'ni65_probe1': ni65.c:496:62: error: 'DMA1_STAT_REG' undeclared (first use in this function) ((inb(DMA1_STAT_REG) >> 4) & 0x0f) ^ ni65.c:496:62: note: each undeclared identifier is reported only once for each function it appears in ni65.c:497:63: error: 'DMA2_STAT_REG' undeclared (first use in this function) | (inb(DMA2_STAT_REG) & 0xf0); The DMA1_STAT_REG and DMA2_STAT_REG registers are only defined for alpha, mips, parisc, powerpc and x86, although it is not clear which subarchitectures actually have them at the correct location. This patch for now just disables it for ARM, to avoid randconfig build errors. We could also decide to limit it to the set of architectures on which it does compile, but that might look more deliberate than guessing based on where the drivers build. Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/amd/Kconfig b/drivers/net/ethernet/amd/Kconfig index 7a5e4aa5415e..77f1f6048ddd 100644 --- a/drivers/net/ethernet/amd/Kconfig +++ b/drivers/net/ethernet/amd/Kconfig @@ -45,7 +45,7 @@ config AMD8111_ETH config LANCE tristate "AMD LANCE and PCnet (AT1500 and NE2100) support" - depends on ISA && ISA_DMA_API + depends on ISA && ISA_DMA_API && !ARM ---help--- If you have a network (Ethernet) card of this type, say Y and read the Ethernet-HOWTO, available from @@ -142,7 +142,7 @@ config PCMCIA_NMCLAN config NI65 tristate "NI6510 support" - depends on ISA && ISA_DMA_API + depends on ISA && ISA_DMA_API && !ARM ---help--- If you have a network (Ethernet) card of this type, say Y and read the Ethernet-HOWTO, available from -- cgit v1.2.3-70-g09d2 From 96a30175f927facfb421655ef08b7a0fe546fbed Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 28 Jan 2015 15:15:04 +0100 Subject: net: am2150: fix nmclan_cs.c shared interrupt handling A recent patch tried to work around a valid warning for the use of a deprecated interface by blindly changing from the old pcmcia_request_exclusive_irq() interface to pcmcia_request_irq(). This driver has an interrupt handler that is not currently aware of shared interrupts, but can be easily converted to be. At the moment, the driver reads the interrupt status register repeatedly until it contains only zeroes in the interesting bits, and handles each bit individually. This patch adds the missing part of returning IRQ_NONE in case none of the bits are set to start with, so we can move on to the next interrupt source. Signed-off-by: Arnd Bergmann Fixes: 5f5316fcd08ef7 ("am2150: Update nmclan_cs.c to use update PCMCIA API") Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/nmclan_cs.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/amd/nmclan_cs.c b/drivers/net/ethernet/amd/nmclan_cs.c index 5b22764ba88d..27245efe9f50 100644 --- a/drivers/net/ethernet/amd/nmclan_cs.c +++ b/drivers/net/ethernet/amd/nmclan_cs.c @@ -952,6 +952,8 @@ static irqreturn_t mace_interrupt(int irq, void *dev_id) do { /* WARNING: MACE_IR is a READ/CLEAR port! */ status = inb(ioaddr + AM2150_MACE_BASE + MACE_IR); + if (!(status & ~MACE_IMR_DEFAULT) && IntrCnt == MACE_MAX_IR_ITERATIONS) + return IRQ_NONE; pr_debug("mace_interrupt: irq 0x%X status 0x%X.\n", irq, status); -- cgit v1.2.3-70-g09d2 From 3cdaa5be9e81a914e633a6be7b7d2ef75b528562 Mon Sep 17 00:00:00 2001 From: Li Wei Date: Thu, 29 Jan 2015 16:09:03 +0800 Subject: ipv4: Don't increase PMTU with Datagram Too Big message. RFC 1191 said, "a host MUST not increase its estimate of the Path MTU in response to the contents of a Datagram Too Big message." Signed-off-by: Li Wei Signed-off-by: David S. Miller --- net/ipv4/route.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d58dd0ec3e53..52e1f2bf0ca2 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -966,6 +966,9 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) if (dst->dev->mtu < mtu) return; + if (rt->rt_pmtu && rt->rt_pmtu < mtu) + return; + if (mtu < ip_rt_min_pmtu) mtu = ip_rt_min_pmtu; -- cgit v1.2.3-70-g09d2 From 579eb62ac35845686a7c4286c0a820b4eb1f96aa Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 18 Dec 2014 22:41:23 +0200 Subject: ipvs: rerouting to local clients is not needed anymore commit f5a41847acc5 ("ipvs: move ip_route_me_harder for ICMP") from 2.6.37 introduced ip_route_me_harder() call for responses to local clients, so that we can provide valid rt_src after SNAT. It was used by TCP to provide valid daddr for ip_send_reply(). After commit 0a5ebb8000c5 ("ipv4: Pass explicit daddr arg to ip_send_reply()." from 3.0 this rerouting is not needed anymore and should be avoided, especially in LOCAL_IN. Fixes 3.12.33 crash in xfrm reported by Florian Wiessner: "3.12.33 - BUG xfrm_selector_match+0x25/0x2f6" Reported-by: Smart Weblications GmbH - Florian Wiessner Tested-by: Smart Weblications GmbH - Florian Wiessner Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 990decba1fe4..b87ca32efa0b 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -659,16 +659,24 @@ static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) return err; } -static int ip_vs_route_me_harder(int af, struct sk_buff *skb) +static int ip_vs_route_me_harder(int af, struct sk_buff *skb, + unsigned int hooknum) { + if (!sysctl_snat_reroute(skb)) + return 0; + /* Reroute replies only to remote clients (FORWARD and LOCAL_OUT) */ + if (NF_INET_LOCAL_IN == hooknum) + return 0; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { - if (sysctl_snat_reroute(skb) && ip6_route_me_harder(skb) != 0) + struct dst_entry *dst = skb_dst(skb); + + if (dst->dev && !(dst->dev->flags & IFF_LOOPBACK) && + ip6_route_me_harder(skb) != 0) return 1; } else #endif - if ((sysctl_snat_reroute(skb) || - skb_rtable(skb)->rt_flags & RTCF_LOCAL) && + if (!(skb_rtable(skb)->rt_flags & RTCF_LOCAL) && ip_route_me_harder(skb, RTN_LOCAL) != 0) return 1; @@ -791,7 +799,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb, union nf_inet_addr *snet, __u8 protocol, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, - unsigned int offset, unsigned int ihl) + unsigned int offset, unsigned int ihl, + unsigned int hooknum) { unsigned int verdict = NF_DROP; @@ -821,7 +830,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb, #endif ip_vs_nat_icmp(skb, pp, cp, 1); - if (ip_vs_route_me_harder(af, skb)) + if (ip_vs_route_me_harder(af, skb, hooknum)) goto out; /* do the statistics and put it back */ @@ -916,7 +925,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related, snet.ip = iph->saddr; return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, - pp, ciph.len, ihl); + pp, ciph.len, ihl, hooknum); } #ifdef CONFIG_IP_VS_IPV6 @@ -981,7 +990,8 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, snet.in6 = ciph.saddr.in6; writable = ciph.len; return handle_response_icmp(AF_INET6, skb, &snet, ciph.protocol, cp, - pp, writable, sizeof(struct ipv6hdr)); + pp, writable, sizeof(struct ipv6hdr), + hooknum); } #endif @@ -1040,7 +1050,8 @@ static inline bool is_new_conn(const struct sk_buff *skb, */ static unsigned int handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, - struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) + struct ip_vs_conn *cp, struct ip_vs_iphdr *iph, + unsigned int hooknum) { struct ip_vs_protocol *pp = pd->pp; @@ -1078,7 +1089,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, * if it came from this machine itself. So re-compute * the routing information. */ - if (ip_vs_route_me_harder(af, skb)) + if (ip_vs_route_me_harder(af, skb, hooknum)) goto drop; IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT"); @@ -1181,7 +1192,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) cp = pp->conn_out_get(af, skb, &iph, 0); if (likely(cp)) - return handle_response(af, skb, pd, cp, &iph); + return handle_response(af, skb, pd, cp, &iph, hooknum); if (sysctl_nat_icmp_send(net) && (pp->protocol == IPPROTO_TCP || pp->protocol == IPPROTO_UDP || -- cgit v1.2.3-70-g09d2 From f5553c19ff9058136e7082c0b1f4268e705ea538 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 29 Jan 2015 19:08:09 +0100 Subject: netfilter: nf_tables: fix leaks in error path of nf_tables_newchain() Release statistics and module refcount on memory allocation problems. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index b54360634e95..1ff04bcd4871 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1264,8 +1264,10 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla); trans = nft_trans_alloc(&ctx, NFT_MSG_NEWCHAIN, sizeof(struct nft_trans_chain)); - if (trans == NULL) + if (trans == NULL) { + free_percpu(stats); return -ENOMEM; + } nft_trans_chain_stats(trans) = stats; nft_trans_chain_update(trans) = true; @@ -1321,8 +1323,10 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, hookfn = type->hooks[hooknum]; basechain = kzalloc(sizeof(*basechain), GFP_KERNEL); - if (basechain == NULL) + if (basechain == NULL) { + module_put(type->owner); return -ENOMEM; + } if (nla[NFTA_CHAIN_COUNTERS]) { stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); -- cgit v1.2.3-70-g09d2 From 8b7c36d810c61ab16997f4387fc16291410700f8 Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Thu, 29 Jan 2015 10:51:53 +0100 Subject: netlink: fix wrong subscription bitmask to group mapping in The subscription bitmask passed via struct sockaddr_nl is converted to the group number when calling the netlink_bind() and netlink_unbind() callbacks. The conversion is however incorrect since bitmask (1 << 0) needs to be mapped to group number 1. Note that you cannot specify the group number 0 (usually known as _NONE) from setsockopt() using NETLINK_ADD_MEMBERSHIP since this is rejected through -EINVAL. This problem became noticeable since 97840cb ("netfilter: nfnetlink: fix insufficient validation in nfnetlink_bind") when binding to bitmask (1 << 0) in ctnetlink. Reported-by: Andre Tomt Reported-by: Ivan Delalande Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 02fdde28dada..75532efa51cd 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1438,7 +1438,7 @@ static void netlink_undo_bind(int group, long unsigned int groups, for (undo = 0; undo < group; undo++) if (test_bit(undo, &groups)) - nlk->netlink_unbind(sock_net(sk), undo); + nlk->netlink_unbind(sock_net(sk), undo + 1); } static int netlink_bind(struct socket *sock, struct sockaddr *addr, @@ -1476,7 +1476,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, for (group = 0; group < nlk->ngroups; group++) { if (!test_bit(group, &groups)) continue; - err = nlk->netlink_bind(net, group); + err = nlk->netlink_bind(net, group + 1); if (!err) continue; netlink_undo_bind(group, groups, sk); -- cgit v1.2.3-70-g09d2 From cfbf654efc6d78dc9812e030673b86f235bf677d Mon Sep 17 00:00:00 2001 From: Saran Maruti Ramanara Date: Thu, 29 Jan 2015 11:05:58 +0100 Subject: net: sctp: fix passing wrong parameter header to param_type2af in sctp_process_param When making use of RFC5061, section 4.2.4. for setting the primary IP address, we're passing a wrong parameter header to param_type2af(), resulting always in NULL being returned. At this point, param.p points to a sctp_addip_param struct, containing a sctp_paramhdr (type = 0xc004, length = var), and crr_id as a correlation id. Followed by that, as also presented in RFC5061 section 4.2.4., comes the actual sctp_addr_param, which also contains a sctp_paramhdr, but this time with the correct type SCTP_PARAM_IPV{4,6}_ADDRESS that param_type2af() can make use of. Since we already hold a pointer to addr_param from previous line, just reuse it for param_type2af(). Fixes: d6de3097592b ("[SCTP]: Add the handling of "Set Primary IP Address" parameter to INIT") Signed-off-by: Saran Maruti Ramanara Signed-off-by: Daniel Borkmann Acked-by: Vlad Yasevich Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/sm_make_chunk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index e49e231cef52..06320c8c1c86 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -2608,7 +2608,7 @@ do_addr_param: addr_param = param.v + sizeof(sctp_addip_param_t); - af = sctp_get_af_specific(param_type2af(param.p->type)); + af = sctp_get_af_specific(param_type2af(addr_param->p.type)); if (af == NULL) break; -- cgit v1.2.3-70-g09d2 From d4bcef3fbe887ff93b58da4fcf6df1eee416e8fa Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Thu, 29 Jan 2015 20:37:07 +0900 Subject: net: Fix vlan_get_protocol for stacked vlan vlan_get_protocol() could not get network protocol if a skb has a 802.1ad vlan tag or multiple vlans, which caused incorrect checksum calculation in several drivers. Fix vlan_get_protocol() to retrieve network protocol instead of incorrect vlan protocol. As the logic is the same as skb_network_protocol(), create a common helper function __vlan_get_protocol() and call it from existing functions. Signed-off-by: Toshiaki Makita Signed-off-by: David S. Miller --- include/linux/if_vlan.h | 60 +++++++++++++++++++++++++++++++++++++------------ net/core/dev.c | 31 +------------------------ 2 files changed, 47 insertions(+), 44 deletions(-) diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 515a35e2a48a..960e666c51e4 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -472,27 +472,59 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) /** * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query + * @type: first vlan protocol + * @depth: buffer to store length of eth and vlan tags in bytes * * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ -static inline __be16 vlan_get_protocol(const struct sk_buff *skb) +static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, + int *depth) { - __be16 protocol = 0; - - if (vlan_tx_tag_present(skb) || - skb->protocol != cpu_to_be16(ETH_P_8021Q)) - protocol = skb->protocol; - else { - __be16 proto, *protop; - protop = skb_header_pointer(skb, offsetof(struct vlan_ethhdr, - h_vlan_encapsulated_proto), - sizeof(proto), &proto); - if (likely(protop)) - protocol = *protop; + unsigned int vlan_depth = skb->mac_len; + + /* if type is 802.1Q/AD then the header should already be + * present at mac_len - VLAN_HLEN (if mac_len > 0), or at + * ETH_HLEN otherwise + */ + if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { + if (vlan_depth) { + if (WARN_ON(vlan_depth < VLAN_HLEN)) + return 0; + vlan_depth -= VLAN_HLEN; + } else { + vlan_depth = ETH_HLEN; + } + do { + struct vlan_hdr *vh; + + if (unlikely(!pskb_may_pull(skb, + vlan_depth + VLAN_HLEN))) + return 0; + + vh = (struct vlan_hdr *)(skb->data + vlan_depth); + type = vh->h_vlan_encapsulated_proto; + vlan_depth += VLAN_HLEN; + } while (type == htons(ETH_P_8021Q) || + type == htons(ETH_P_8021AD)); } - return protocol; + if (depth) + *depth = vlan_depth; + + return type; +} + +/** + * vlan_get_protocol - get protocol EtherType. + * @skb: skbuff to query + * + * Returns the EtherType of the packet, regardless of whether it is + * vlan encapsulated (normal or hardware accelerated) or not. + */ +static inline __be16 vlan_get_protocol(struct sk_buff *skb) +{ + return __vlan_get_protocol(skb, skb->protocol, NULL); } static inline void vlan_set_encap_proto(struct sk_buff *skb, diff --git a/net/core/dev.c b/net/core/dev.c index 171420e75b03..c87a2264a02b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2352,7 +2352,6 @@ EXPORT_SYMBOL(skb_checksum_help); __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { - unsigned int vlan_depth = skb->mac_len; __be16 type = skb->protocol; /* Tunnel gso handlers can set protocol to ethernet. */ @@ -2366,35 +2365,7 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) type = eth->h_proto; } - /* if skb->protocol is 802.1Q/AD then the header should already be - * present at mac_len - VLAN_HLEN (if mac_len > 0), or at - * ETH_HLEN otherwise - */ - if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { - if (vlan_depth) { - if (WARN_ON(vlan_depth < VLAN_HLEN)) - return 0; - vlan_depth -= VLAN_HLEN; - } else { - vlan_depth = ETH_HLEN; - } - do { - struct vlan_hdr *vh; - - if (unlikely(!pskb_may_pull(skb, - vlan_depth + VLAN_HLEN))) - return 0; - - vh = (struct vlan_hdr *)(skb->data + vlan_depth); - type = vh->h_vlan_encapsulated_proto; - vlan_depth += VLAN_HLEN; - } while (type == htons(ETH_P_8021Q) || - type == htons(ETH_P_8021AD)); - } - - *depth = vlan_depth; - - return type; + return __vlan_get_protocol(skb, type, depth); } /** -- cgit v1.2.3-70-g09d2 From 72b1405964c19b99ad9f75340249b16305cf31ab Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Thu, 29 Jan 2015 20:37:08 +0900 Subject: igbvf: Fix checksum error when using stacked vlan When a skb has multiple vlans and it is CHECKSUM_PARTIAL, igbvf_tx_csum() fails to get the network protocol and checksum related descriptor fields are not configured correctly because skb->protocol doesn't show the L3 protocol in this case. Use vlan_get_protocol() to get the proper network protocol. Signed-off-by: Toshiaki Makita Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/igbvf/netdev.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/igbvf/netdev.c b/drivers/net/ethernet/intel/igbvf/netdev.c index 63c807c9b21c..edea13b0ee85 100644 --- a/drivers/net/ethernet/intel/igbvf/netdev.c +++ b/drivers/net/ethernet/intel/igbvf/netdev.c @@ -1907,7 +1907,8 @@ static void igbvf_watchdog_task(struct work_struct *work) static int igbvf_tso(struct igbvf_adapter *adapter, struct igbvf_ring *tx_ring, - struct sk_buff *skb, u32 tx_flags, u8 *hdr_len) + struct sk_buff *skb, u32 tx_flags, u8 *hdr_len, + __be16 protocol) { struct e1000_adv_tx_context_desc *context_desc; struct igbvf_buffer *buffer_info; @@ -1927,7 +1928,7 @@ static int igbvf_tso(struct igbvf_adapter *adapter, l4len = tcp_hdrlen(skb); *hdr_len += l4len; - if (skb->protocol == htons(ETH_P_IP)) { + if (protocol == htons(ETH_P_IP)) { struct iphdr *iph = ip_hdr(skb); iph->tot_len = 0; iph->check = 0; @@ -1958,7 +1959,7 @@ static int igbvf_tso(struct igbvf_adapter *adapter, /* ADV DTYP TUCMD MKRLOC/ISCSIHEDLEN */ tu_cmd |= (E1000_TXD_CMD_DEXT | E1000_ADVTXD_DTYP_CTXT); - if (skb->protocol == htons(ETH_P_IP)) + if (protocol == htons(ETH_P_IP)) tu_cmd |= E1000_ADVTXD_TUCMD_IPV4; tu_cmd |= E1000_ADVTXD_TUCMD_L4T_TCP; @@ -1984,7 +1985,8 @@ static int igbvf_tso(struct igbvf_adapter *adapter, static inline bool igbvf_tx_csum(struct igbvf_adapter *adapter, struct igbvf_ring *tx_ring, - struct sk_buff *skb, u32 tx_flags) + struct sk_buff *skb, u32 tx_flags, + __be16 protocol) { struct e1000_adv_tx_context_desc *context_desc; unsigned int i; @@ -2011,7 +2013,7 @@ static inline bool igbvf_tx_csum(struct igbvf_adapter *adapter, tu_cmd |= (E1000_TXD_CMD_DEXT | E1000_ADVTXD_DTYP_CTXT); if (skb->ip_summed == CHECKSUM_PARTIAL) { - switch (skb->protocol) { + switch (protocol) { case htons(ETH_P_IP): tu_cmd |= E1000_ADVTXD_TUCMD_IPV4; if (ip_hdr(skb)->protocol == IPPROTO_TCP) @@ -2211,6 +2213,7 @@ static netdev_tx_t igbvf_xmit_frame_ring_adv(struct sk_buff *skb, u8 hdr_len = 0; int count = 0; int tso = 0; + __be16 protocol = vlan_get_protocol(skb); if (test_bit(__IGBVF_DOWN, &adapter->state)) { dev_kfree_skb_any(skb); @@ -2239,13 +2242,13 @@ static netdev_tx_t igbvf_xmit_frame_ring_adv(struct sk_buff *skb, tx_flags |= (vlan_tx_tag_get(skb) << IGBVF_TX_FLAGS_VLAN_SHIFT); } - if (skb->protocol == htons(ETH_P_IP)) + if (protocol == htons(ETH_P_IP)) tx_flags |= IGBVF_TX_FLAGS_IPV4; first = tx_ring->next_to_use; tso = skb_is_gso(skb) ? - igbvf_tso(adapter, tx_ring, skb, tx_flags, &hdr_len) : 0; + igbvf_tso(adapter, tx_ring, skb, tx_flags, &hdr_len, protocol) : 0; if (unlikely(tso < 0)) { dev_kfree_skb_any(skb); return NETDEV_TX_OK; @@ -2253,7 +2256,7 @@ static netdev_tx_t igbvf_xmit_frame_ring_adv(struct sk_buff *skb, if (tso) tx_flags |= IGBVF_TX_FLAGS_TSO; - else if (igbvf_tx_csum(adapter, tx_ring, skb, tx_flags) && + else if (igbvf_tx_csum(adapter, tx_ring, skb, tx_flags, protocol) && (skb->ip_summed == CHECKSUM_PARTIAL)) tx_flags |= IGBVF_TX_FLAGS_CSUM; -- cgit v1.2.3-70-g09d2 From 0213668f060ea966ee8f4e6334f0fd27b6a1c428 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Thu, 29 Jan 2015 20:37:09 +0900 Subject: ixgbe: Fix checksum error when using stacked vlan When a skb has multiple vlans and it is CHECKSUM_PARTIAL, ixgbe_tx_csum() fails to get the network protocol and checksum related descriptor fields are not configured correctly because skb->protocol doesn't show the L3 protocol in this case. Use vlan_get_protocol() to get the proper network protocol. Signed-off-by: Toshiaki Makita Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 2ed2c7de2304..67b02bde179e 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -7227,11 +7227,11 @@ netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb, if (!vhdr) goto out_drop; - protocol = vhdr->h_vlan_encapsulated_proto; tx_flags |= ntohs(vhdr->h_vlan_TCI) << IXGBE_TX_FLAGS_VLAN_SHIFT; tx_flags |= IXGBE_TX_FLAGS_SW_VLAN; } + protocol = vlan_get_protocol(skb); if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP) && adapter->ptp_clock && -- cgit v1.2.3-70-g09d2 From 10e4fb333c9ad72491f80bed018f8007e17060d1 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Thu, 29 Jan 2015 20:37:10 +0900 Subject: ixgbevf: Fix checksum error when using stacked vlan When a skb has multiple vlans and it is CHECKSUM_PARTIAL, ixgbevf_tx_csum() fails to get the network protocol and checksum related descriptor fields are not configured correctly because skb->protocol doesn't show the L3 protocol in this case. Use first->protocol instead of skb->protocol to get the proper network protocol. Signed-off-by: Toshiaki Makita Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 62a0d8e0f17d..38c7a0be8197 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -3099,7 +3099,7 @@ static int ixgbevf_tso(struct ixgbevf_ring *tx_ring, /* ADV DTYP TUCMD MKRLOC/ISCSIHEDLEN */ type_tucmd = IXGBE_ADVTXD_TUCMD_L4T_TCP; - if (skb->protocol == htons(ETH_P_IP)) { + if (first->protocol == htons(ETH_P_IP)) { struct iphdr *iph = ip_hdr(skb); iph->tot_len = 0; iph->check = 0; @@ -3156,7 +3156,7 @@ static void ixgbevf_tx_csum(struct ixgbevf_ring *tx_ring, if (skb->ip_summed == CHECKSUM_PARTIAL) { u8 l4_hdr = 0; - switch (skb->protocol) { + switch (first->protocol) { case htons(ETH_P_IP): vlan_macip_lens |= skb_network_header_len(skb); type_tucmd |= IXGBE_ADVTXD_TUCMD_IPV4; -- cgit v1.2.3-70-g09d2 From ecf6ba83d76e0c78e89401750dc527008e14faa2 Mon Sep 17 00:00:00 2001 From: Iyappan Subramanian Date: Thu, 29 Jan 2015 14:38:23 -0800 Subject: drivers: net: xgene: fix: Out of order descriptor bytes read This patch fixes the following kernel crash, WARNING: CPU: 2 PID: 0 at net/ipv4/tcp_input.c:3079 tcp_clean_rtx_queue+0x658/0x80c() Call trace: [] dump_backtrace+0x0/0x184 [] show_stack+0x10/0x1c [] dump_stack+0x74/0x98 [] warn_slowpath_common+0x88/0xb0 [] warn_slowpath_null+0x14/0x20 [] tcp_clean_rtx_queue+0x654/0x80c [] tcp_ack+0x454/0x688 [] tcp_rcv_established+0x4a4/0x62c [] tcp_v4_do_rcv+0x16c/0x350 [] tcp_v4_rcv+0x8e8/0x904 [] ip_local_deliver_finish+0x100/0x26c [] ip_local_deliver+0xac/0xc4 [] ip_rcv_finish+0xe8/0x328 [] ip_rcv+0x24c/0x38c [] __netif_receive_skb_core+0x29c/0x7c8 [] __netif_receive_skb+0x28/0x7c [] netif_receive_skb_internal+0x5c/0xe0 [] napi_gro_receive+0xb4/0x110 [] xgene_enet_process_ring+0x144/0x338 [] xgene_enet_napi+0x1c/0x50 [] net_rx_action+0x154/0x228 [] __do_softirq+0x110/0x28c [] irq_exit+0x8c/0xc0 [] handle_IRQ+0x44/0xa8 [] gic_handle_irq+0x38/0x7c [...] Software writes poison data into the descriptor bytes[15:8] and upon receiving the interrupt, if those bytes are overwritten by the hardware with the valid data, software also reads bytes[7:0] and executes receive/tx completion logic. If the CPU executes the above two reads in out of order fashion, then the bytes[7:0] will have older data and causing the kernel panic. We have to force the order of the reads and thus this patch introduces read memory barrier between these reads. Signed-off-by: Iyappan Subramanian Signed-off-by: Keyur Chudgar Signed-off-by: David S. Miller --- drivers/net/ethernet/apm/xgene/xgene_enet_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c index 83a50280bb70..793f3b73eeff 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c @@ -369,6 +369,8 @@ static int xgene_enet_process_ring(struct xgene_enet_desc_ring *ring, if (unlikely(xgene_enet_is_desc_slot_empty(raw_desc))) break; + /* read fpqnum field after dataaddr field */ + dma_rmb(); if (is_rx_desc(raw_desc)) ret = xgene_enet_rx_frame(ring, raw_desc); else -- cgit v1.2.3-70-g09d2 From d953ca4ddf71aa91a4596b2ff7ff1598f6ad4708 Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Thu, 29 Jan 2015 12:34:49 -0800 Subject: hyperv: Fix the error processing in netvsc_send() The existing code frees the skb in EAGAIN case, in which the skb will be retried from upper layer and used again. Also, the existing code doesn't free send buffer slot in error case, because there is no completion message for unsent packets. This patch fixes these problems. (Please also include this patch for stable trees. Thanks!) Signed-off-by: Haiyang Zhang Reviewed-by: K. Y. Srinivasan Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 9f49c0129a78..7cd4eb38abfa 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -716,7 +716,7 @@ int netvsc_send(struct hv_device *device, u64 req_id; unsigned int section_index = NETVSC_INVALID_INDEX; u32 msg_size = 0; - struct sk_buff *skb; + struct sk_buff *skb = NULL; u16 q_idx = packet->q_idx; @@ -743,8 +743,6 @@ int netvsc_send(struct hv_device *device, packet); skb = (struct sk_buff *) (unsigned long)packet->send_completion_tid; - if (skb) - dev_kfree_skb_any(skb); packet->page_buf_cnt = 0; } } @@ -810,6 +808,13 @@ int netvsc_send(struct hv_device *device, packet, ret); } + if (ret != 0) { + if (section_index != NETVSC_INVALID_INDEX) + netvsc_free_send_slot(net_device, section_index); + } else if (skb) { + dev_kfree_skb_any(skb); + } + return ret; } -- cgit v1.2.3-70-g09d2 From 0d32ef8cef9aa8f375e128f78b77caceaa7e8da0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 29 Jan 2015 17:30:12 -0800 Subject: net: sched: fix panic in rate estimators Doing the following commands on a non idle network device panics the box instantly, because cpu_bstats gets overwritten by stats. tc qdisc add dev eth0 root ... some traffic (one packet is enough) ... tc qdisc replace dev eth0 root est 1sec 4sec [ 325.355596] BUG: unable to handle kernel paging request at ffff8841dc5a074c [ 325.362609] IP: [] __gnet_stats_copy_basic+0x3e/0x90 [ 325.369158] PGD 1fa7067 PUD 0 [ 325.372254] Oops: 0000 [#1] SMP [ 325.375514] Modules linked in: ... [ 325.398346] CPU: 13 PID: 14313 Comm: tc Not tainted 3.19.0-smp-DEV #1163 [ 325.412042] task: ffff8800793ab5d0 ti: ffff881ff2fa4000 task.ti: ffff881ff2fa4000 [ 325.419518] RIP: 0010:[] [] __gnet_stats_copy_basic+0x3e/0x90 [ 325.428506] RSP: 0018:ffff881ff2fa7928 EFLAGS: 00010286 [ 325.433824] RAX: 000000000000000c RBX: ffff881ff2fa796c RCX: 000000000000000c [ 325.440988] RDX: ffff8841dc5a0744 RSI: 0000000000000060 RDI: 0000000000000060 [ 325.448120] RBP: ffff881ff2fa7948 R08: ffffffff81cd4f80 R09: 0000000000000000 [ 325.455268] R10: ffff883ff223e400 R11: 0000000000000000 R12: 000000015cba0744 [ 325.462405] R13: ffffffff81cd4f80 R14: ffff883ff223e460 R15: ffff883feea0722c [ 325.469536] FS: 00007f2ee30fa700(0000) GS:ffff88407fa20000(0000) knlGS:0000000000000000 [ 325.477630] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 325.483380] CR2: ffff8841dc5a074c CR3: 0000003feeae9000 CR4: 00000000001407e0 [ 325.490510] Stack: [ 325.492524] ffff883feea0722c ffff883fef719dc0 ffff883feea0722c ffff883ff223e4a0 [ 325.499990] ffff881ff2fa79a8 ffffffff815424ee ffff883ff223e49c 000000015cba0744 [ 325.507460] 00000000f2fa7978 0000000000000000 ffff881ff2fa79a8 ffff883ff223e4a0 [ 325.514956] Call Trace: [ 325.517412] [] gen_new_estimator+0x8e/0x230 [ 325.523250] [] gen_replace_estimator+0x4a/0x60 [ 325.529349] [] tc_modify_qdisc+0x52b/0x590 [ 325.535117] [] rtnetlink_rcv_msg+0xa0/0x240 [ 325.540963] [] ? __rtnl_unlock+0x20/0x20 [ 325.546532] [] netlink_rcv_skb+0xb1/0xc0 [ 325.552145] [] rtnetlink_rcv+0x25/0x40 [ 325.557558] [] netlink_unicast+0x168/0x220 [ 325.563317] [] netlink_sendmsg+0x2ec/0x3e0 Lets play safe and not use an union : percpu 'pointers' are mostly read anyway, and we have typically few qdiscs per host. Signed-off-by: Eric Dumazet Cc: John Fastabend Fixes: 22e0f8b9322c ("net: sched: make bstats per cpu and estimator RCU safe") Signed-off-by: David S. Miller --- include/net/sch_generic.h | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 3d282cbb66bf..c605d305c577 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -79,6 +79,9 @@ struct Qdisc { struct netdev_queue *dev_queue; struct gnet_stats_rate_est64 rate_est; + struct gnet_stats_basic_cpu __percpu *cpu_bstats; + struct gnet_stats_queue __percpu *cpu_qstats; + struct Qdisc *next_sched; struct sk_buff *gso_skb; /* @@ -86,15 +89,9 @@ struct Qdisc { */ unsigned long state; struct sk_buff_head q; - union { - struct gnet_stats_basic_packed bstats; - struct gnet_stats_basic_cpu __percpu *cpu_bstats; - } __packed; + struct gnet_stats_basic_packed bstats; unsigned int __state; - union { - struct gnet_stats_queue qstats; - struct gnet_stats_queue __percpu *cpu_qstats; - } __packed; + struct gnet_stats_queue qstats; struct rcu_head rcu_head; int padded; atomic_t refcnt; -- cgit v1.2.3-70-g09d2 From bdbbb8527b6f6a358dbcb70dac247034d665b8e4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 29 Jan 2015 21:35:05 -0800 Subject: ipv4: tcp: get rid of ugly unicast_sock In commit be9f4a44e7d41 ("ipv4: tcp: remove per net tcp_sock") I tried to address contention on a socket lock, but the solution I chose was horrible : commit 3a7c384ffd57e ("ipv4: tcp: unicast_sock should not land outside of TCP stack") addressed a selinux regression. commit 0980e56e506b ("ipv4: tcp: set unicast_sock uc_ttl to -1") took care of another regression. commit b5ec8eeac46 ("ipv4: fix ip_send_skb()") fixed another regression. commit 811230cd85 ("tcp: ipv4: initialize unicast_sock sk_pacing_rate") was another shot in the dark. Really, just use a proper socket per cpu, and remove the skb_orphan() call, to re-enable flow control. This solves a serious problem with FQ packet scheduler when used in hostile environments, as we do not want to allocate a flow structure for every RST packet sent in response to a spoofed packet. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ip.h | 2 +- include/net/netns/ipv4.h | 1 + net/ipv4/ip_output.c | 30 +++--------------------------- net/ipv4/tcp_ipv4.c | 37 ++++++++++++++++++++++++++++++++----- 4 files changed, 37 insertions(+), 33 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index f7cbd703d15d..09cf5aebb283 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -181,7 +181,7 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg) return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0; } -void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, +void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 24945cefc4fd..0ffef1a38efc 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -52,6 +52,7 @@ struct netns_ipv4 { struct inet_peer_base *peers; struct tcpm_hash_bucket *tcp_metrics_hash; unsigned int tcp_metrics_hash_log; + struct sock * __percpu *tcp_sk; struct netns_frags frags; #ifdef CONFIG_NETFILTER struct xt_table *iptable_filter; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 38a20a9cca1a..c373c0708d97 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1506,24 +1506,8 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, /* * Generic function to send a packet as reply to another packet. * Used to send some TCP resets/acks so far. - * - * Use a fake percpu inet socket to avoid false sharing and contention. */ -static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { - .sk = { - .__sk_common = { - .skc_refcnt = ATOMIC_INIT(1), - }, - .sk_wmem_alloc = ATOMIC_INIT(1), - .sk_allocation = GFP_ATOMIC, - .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE), - .sk_pacing_rate = ~0U, - }, - .pmtudisc = IP_PMTUDISC_WANT, - .uc_ttl = -1, -}; - -void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, +void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, @@ -1533,9 +1517,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, struct ipcm_cookie ipc; struct flowi4 fl4; struct rtable *rt = skb_rtable(skb); + struct net *net = sock_net(sk); struct sk_buff *nskb; - struct sock *sk; - struct inet_sock *inet; int err; if (__ip_options_echo(&replyopts.opt.opt, skb, sopt)) @@ -1566,15 +1549,11 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, if (IS_ERR(rt)) return; - inet = &get_cpu_var(unicast_sock); + inet_sk(sk)->tos = arg->tos; - inet->tos = arg->tos; - sk = &inet->sk; sk->sk_priority = skb->priority; sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; - sock_net_set(sk, net); - __skb_queue_head_init(&sk->sk_write_queue); sk->sk_sndbuf = sysctl_wmem_default; err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, &ipc, &rt, MSG_DONTWAIT); @@ -1590,13 +1569,10 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; - skb_orphan(nskb); skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); ip_push_pending_frames(sk, &fl4); } out: - put_cpu_var(unicast_sock); - ip_rt_put(rt); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a3f72d7fc06c..d22f54482bab 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -683,7 +683,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.bound_dev_if = sk->sk_bound_dev_if; arg.tos = ip_hdr(skb)->tos; - ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, + ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), + skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); @@ -767,7 +768,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, if (oif) arg.bound_dev_if = oif; arg.tos = tos; - ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, + ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), + skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); @@ -2428,14 +2430,39 @@ struct proto tcp_prot = { }; EXPORT_SYMBOL(tcp_prot); +static void __net_exit tcp_sk_exit(struct net *net) +{ + int cpu; + + for_each_possible_cpu(cpu) + inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); + free_percpu(net->ipv4.tcp_sk); +} + static int __net_init tcp_sk_init(struct net *net) { + int res, cpu; + + net->ipv4.tcp_sk = alloc_percpu(struct sock *); + if (!net->ipv4.tcp_sk) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct sock *sk; + + res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, + IPPROTO_TCP, net); + if (res) + goto fail; + *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; + } net->ipv4.sysctl_tcp_ecn = 2; return 0; -} -static void __net_exit tcp_sk_exit(struct net *net) -{ +fail: + tcp_sk_exit(net); + + return res; } static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) -- cgit v1.2.3-70-g09d2 From 61132bf7fbe3a802df1f68ad08e8ca10d6b30ddc Mon Sep 17 00:00:00 2001 From: Marcelo Leitner Date: Fri, 30 Jan 2015 09:56:01 -0200 Subject: qlge: Fix qlge_update_hw_vlan_features to handle if interface is down Currently qlge_update_hw_vlan_features() will always first put the interface down, then update features and then bring it up again. But it is possible to hit this code while the adapter is down and this causes a non-paired call to napi_disable(), which will get stuck. This patch fixes it by skipping these down/up actions if the interface is already down. Fixes: a45adbe8d352 ("qlge: Enhance nested VLAN (Q-in-Q) handling.") Cc: Harish Patil Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qlge/qlge_main.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c index 6c904a6cad2a..ef5aed3b1225 100644 --- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c +++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c @@ -2351,23 +2351,29 @@ static int qlge_update_hw_vlan_features(struct net_device *ndev, { struct ql_adapter *qdev = netdev_priv(ndev); int status = 0; + bool need_restart = netif_running(ndev); - status = ql_adapter_down(qdev); - if (status) { - netif_err(qdev, link, qdev->ndev, - "Failed to bring down the adapter\n"); - return status; + if (need_restart) { + status = ql_adapter_down(qdev); + if (status) { + netif_err(qdev, link, qdev->ndev, + "Failed to bring down the adapter\n"); + return status; + } } /* update the features with resent change */ ndev->features = features; - status = ql_adapter_up(qdev); - if (status) { - netif_err(qdev, link, qdev->ndev, - "Failed to bring up the adapter\n"); - return status; + if (need_restart) { + status = ql_adapter_up(qdev); + if (status) { + netif_err(qdev, link, qdev->ndev, + "Failed to bring up the adapter\n"); + return status; + } } + return status; } -- cgit v1.2.3-70-g09d2 From 44ba582beac4ff8e05a15e38548ebf3deb509547 Mon Sep 17 00:00:00 2001 From: David L Stevens Date: Fri, 30 Jan 2015 12:29:45 -0500 Subject: sunvnet: set queue mapping when doing packet copies This patch fixes a bug where vnet_skb_shape() didn't set the already-selected queue mapping when a packet copy was required. This results in using the wrong queue index for stops/starts, hung tx queues and watchdog timeouts under heavy load. Signed-off-by: David L Stevens Acked-by: Sowmini Varadhan Signed-off-by: David S. Miller --- drivers/net/ethernet/sun/sunvnet.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/sun/sunvnet.c b/drivers/net/ethernet/sun/sunvnet.c index d2835bf7b4fb..3699b98d5b2c 100644 --- a/drivers/net/ethernet/sun/sunvnet.c +++ b/drivers/net/ethernet/sun/sunvnet.c @@ -1119,6 +1119,7 @@ static inline struct sk_buff *vnet_skb_shape(struct sk_buff *skb, int ncookies) skb_shinfo(nskb)->gso_size = skb_shinfo(skb)->gso_size; skb_shinfo(nskb)->gso_type = skb_shinfo(skb)->gso_type; } + nskb->queue_mapping = skb->queue_mapping; dev_kfree_skb(skb); skb = nskb; } -- cgit v1.2.3-70-g09d2 From e6b02be81b2ed4b2e667f40050cffa64d6bd0256 Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Fri, 30 Jan 2015 20:50:44 +0100 Subject: Documentation: Update netlink_mmap.txt Update netlink_mmap.txt wrt. commit 4682a0358639b29cf ("netlink: Always copy on mmap TX."). Signed-off-by: Richard Weinberger Signed-off-by: David S. Miller --- Documentation/networking/netlink_mmap.txt | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/Documentation/networking/netlink_mmap.txt b/Documentation/networking/netlink_mmap.txt index c6af4bac5aa8..54f10478e8e3 100644 --- a/Documentation/networking/netlink_mmap.txt +++ b/Documentation/networking/netlink_mmap.txt @@ -199,16 +199,9 @@ frame header. TX limitations -------------- -Kernel processing usually involves validation of the message received by -user-space, then processing its contents. The kernel must assure that -userspace is not able to modify the message contents after they have been -validated. In order to do so, the message is copied from the ring frame -to an allocated buffer if either of these conditions is false: - -- only a single mapping of the ring exists -- the file descriptor is not shared between processes - -This means that for threaded programs, the kernel will fall back to copying. +As of Jan 2015 the message is always copied from the ring frame to an +allocated buffer due to unresolved security concerns. +See commit 4682a0358639b29cf ("netlink: Always copy on mmap TX."). Example ------- -- cgit v1.2.3-70-g09d2 From c101cff9010083a8796353a75673fd100b077b79 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sun, 1 Feb 2015 23:54:25 +0300 Subject: isdn: off by one in connect_res() The bug here is that we use "Reject" as the index into the cau_t[] array in the else path. Since the cau_t[] has 9 elements if Reject == 9 then we are reading beyond the end of the array. My understanding of the code is that it's saying that if Reject is 1 or too high then that's invalid and we should hang up. Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/isdn/hardware/eicon/message.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/isdn/hardware/eicon/message.c b/drivers/isdn/hardware/eicon/message.c index 0b380603a578..d7c286656a25 100644 --- a/drivers/isdn/hardware/eicon/message.c +++ b/drivers/isdn/hardware/eicon/message.c @@ -1474,7 +1474,7 @@ static byte connect_res(dword Id, word Number, DIVA_CAPI_ADAPTER *a, add_ai(plci, &parms[5]); sig_req(plci, REJECT, 0); } - else if (Reject == 1 || Reject > 9) + else if (Reject == 1 || Reject >= 9) { add_ai(plci, &parms[5]); sig_req(plci, HANGUP, 0); -- cgit v1.2.3-70-g09d2 From 5a2e87b16875f9b83b7e9494cf1fce8e17dc764a Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Mon, 2 Feb 2015 15:18:42 +0200 Subject: net/mlx4_core: Fix kernel Oops (mem corruption) when working with more than 80 VFs Commit de966c592802 (net/mlx4_core: Support more than 64 VFs) was meant to allow up to 126 VFs. However, due to leaving MLX4_MFUNC_MAX too low, using more than 80 VFs resulted in memory corruptions (and Oopses) when more than 80 VFs were requested. In addition, the number of slaves was left too high. This commit fixes these issues. Fixes: de966c592802 ("net/mlx4_core: Support more than 64 VFs") Signed-off-by: Jack Morgenstein Signed-off-by: Amir Vadai Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 3 ++- include/linux/mlx4/device.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index bdd4eea2247c..210691c89b6c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -235,7 +235,8 @@ do { \ extern int mlx4_log_num_mgm_entry_size; extern int log_mtts_per_seg; -#define MLX4_MAX_NUM_SLAVES (MLX4_MAX_NUM_PF + MLX4_MAX_NUM_VF) +#define MLX4_MAX_NUM_SLAVES (min(MLX4_MAX_NUM_PF + MLX4_MAX_NUM_VF, \ + MLX4_MFUNC_MAX)) #define ALL_SLAVES 0xff struct mlx4_bitmap { diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 25c791e295fd..5f3a9aa7225d 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -97,7 +97,7 @@ enum { MLX4_MAX_NUM_PF = 16, MLX4_MAX_NUM_VF = 126, MLX4_MAX_NUM_VF_P_PORT = 64, - MLX4_MFUNC_MAX = 80, + MLX4_MFUNC_MAX = 128, MLX4_MAX_EQ_NUM = 1024, MLX4_MFUNC_EQ_NUM = 4, MLX4_MFUNC_MAX_EQES = 8, -- cgit v1.2.3-70-g09d2 From 42b5212fee4f57907e9415b18fe19c13e65574bc Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 2 Feb 2015 16:57:51 +0000 Subject: xen-netback: stop the guest rx thread after a fatal error After commit e9d8b2c2968499c1f96563e6522c56958d5a1d0d (xen-netback: disable rogue vif in kthread context), a fatal (protocol) error would leave the guest Rx thread spinning, wasting CPU time. Commit ecf08d2dbb96d5a4b4bcc53a39e8d29cc8fef02e (xen-netback: reintroduce guest Rx stall detection) made this even worse by removing a cond_resched() from this path. Since a fatal error is non-recoverable, just allow the guest Rx thread to exit. This requires taking additional refs to the task so the thread exiting early is handled safely. Signed-off-by: David Vrabel Reported-by: Julien Grall Tested-by: Julien Grall Acked-by: Wei Liu Signed-off-by: David S. Miller --- drivers/net/xen-netback/interface.c | 2 ++ drivers/net/xen-netback/netback.c | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 9259a732e8a4..037f74f0fcf6 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -578,6 +578,7 @@ int xenvif_connect(struct xenvif_queue *queue, unsigned long tx_ring_ref, goto err_rx_unbind; } queue->task = task; + get_task_struct(task); task = kthread_create(xenvif_dealloc_kthread, (void *)queue, "%s-dealloc", queue->name); @@ -634,6 +635,7 @@ void xenvif_disconnect(struct xenvif *vif) if (queue->task) { kthread_stop(queue->task); + put_task_struct(queue->task); queue->task = NULL; } diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 908e65e9b821..c8ce701a7efb 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -2109,8 +2109,7 @@ int xenvif_kthread_guest_rx(void *data) */ if (unlikely(vif->disabled && queue->id == 0)) { xenvif_carrier_off(vif); - xenvif_rx_queue_purge(queue); - continue; + break; } if (!skb_queue_empty(&queue->rx_queue)) -- cgit v1.2.3-70-g09d2 From 0508c07f5e0c94f38afd5434e8b2a55b84553077 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 3 Feb 2015 16:36:15 -0500 Subject: ipv6: Select fragment id during UFO segmentation if not set. If the IPv6 fragment id has not been set and we perform fragmentation due to UFO, select a new fragment id. We now consider a fragment id of 0 as unset and if id selection process returns 0 (after all the pertrubations), we set it to 0x80000000, thus giving us ample space not to create collisions with the next packet we may have to fragment. When doing UFO integrity checking, we also select the fragment id if it has not be set yet. This is stored into the skb_shinfo() thus allowing UFO to function correclty. This patch also removes duplicate fragment id generation code and moves ipv6_select_ident() into the header as it may be used during GSO. Signed-off-by: Vladislav Yasevich Signed-off-by: David S. Miller --- include/net/ipv6.h | 3 +++ net/ipv6/ip6_output.c | 14 -------------- net/ipv6/output_core.c | 41 +++++++++++++++++++++++++++++++++++------ net/ipv6/udp_offload.c | 10 +++++++++- 4 files changed, 47 insertions(+), 21 deletions(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 4292929392b0..9bf85d34c024 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -671,6 +671,9 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr)); } +u32 __ipv6_select_ident(u32 hashrnd, struct in6_addr *dst, + struct in6_addr *src); +void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt); void ipv6_proxy_select_ident(struct sk_buff *skb); int ip6_dst_hoplimit(struct dst_entry *dst); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ce69a12ae48c..d28f2a2efb32 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -537,20 +537,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_secmark(to, from); } -static void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt) -{ - static u32 ip6_idents_hashrnd __read_mostly; - u32 hash, id; - - net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd)); - - hash = __ipv6_addr_jhash(&rt->rt6i_dst.addr, ip6_idents_hashrnd); - hash = __ipv6_addr_jhash(&rt->rt6i_src.addr, hash); - - id = ip_idents_reserve(hash, 1); - fhdr->identification = htonl(id); -} - int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { struct sk_buff *frag; diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 97f41a3e68d9..54520a0bd5e3 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -9,6 +9,24 @@ #include #include +u32 __ipv6_select_ident(u32 hashrnd, struct in6_addr *dst, struct in6_addr *src) +{ + u32 hash, id; + + hash = __ipv6_addr_jhash(dst, hashrnd); + hash = __ipv6_addr_jhash(src, hash); + + /* Treat id of 0 as unset and if we get 0 back from ip_idents_reserve, + * set the hight order instead thus minimizing possible future + * collisions. + */ + id = ip_idents_reserve(hash, 1); + if (unlikely(!id)) + id = 1 << 31; + + return id; +} + /* This function exists only for tap drivers that must support broken * clients requesting UFO without specifying an IPv6 fragment ID. * @@ -22,7 +40,7 @@ void ipv6_proxy_select_ident(struct sk_buff *skb) static u32 ip6_proxy_idents_hashrnd __read_mostly; struct in6_addr buf[2]; struct in6_addr *addrs; - u32 hash, id; + u32 id; addrs = skb_header_pointer(skb, skb_network_offset(skb) + @@ -34,14 +52,25 @@ void ipv6_proxy_select_ident(struct sk_buff *skb) net_get_random_once(&ip6_proxy_idents_hashrnd, sizeof(ip6_proxy_idents_hashrnd)); - hash = __ipv6_addr_jhash(&addrs[1], ip6_proxy_idents_hashrnd); - hash = __ipv6_addr_jhash(&addrs[0], hash); - - id = ip_idents_reserve(hash, 1); - skb_shinfo(skb)->ip6_frag_id = htonl(id); + id = __ipv6_select_ident(ip6_proxy_idents_hashrnd, + &addrs[1], &addrs[0]); + skb_shinfo(skb)->ip6_frag_id = id; } EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); +void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt) +{ + static u32 ip6_idents_hashrnd __read_mostly; + u32 id; + + net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd)); + + id = __ipv6_select_ident(ip6_idents_hashrnd, &rt->rt6i_dst.addr, + &rt->rt6i_src.addr); + fhdr->identification = htonl(id); +} +EXPORT_SYMBOL(ipv6_select_ident); + int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) { u16 offset = sizeof(struct ipv6hdr); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index b6aa8ed18257..a56276996b72 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -52,6 +52,10 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); + /* Set the IPv6 fragment id if not set yet */ + if (!skb_shinfo(skb)->ip6_frag_id) + ipv6_proxy_select_ident(skb); + segs = NULL; goto out; } @@ -108,7 +112,11 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen); fptr->nexthdr = nexthdr; fptr->reserved = 0; - fptr->identification = skb_shinfo(skb)->ip6_frag_id; + if (skb_shinfo(skb)->ip6_frag_id) + fptr->identification = skb_shinfo(skb)->ip6_frag_id; + else + ipv6_select_ident(fptr, + (struct rt6_info *)skb_dst(skb)); /* Fragment the skb. ipv6 header and the remaining fields of the * fragment header are updated in ipv6_gso_segment() -- cgit v1.2.3-70-g09d2 From 72f6510745592c87f612f62ae4f16bb002934df4 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 3 Feb 2015 16:36:16 -0500 Subject: Revert "drivers/net, ipv6: Select IPv6 fragment idents for virtio UFO packets" This reverts commit 5188cd44c55db3e92cd9e77a40b5baa7ed4340f7. Now that GSO layer can track if fragment id has been selected and can allocate one if necessary, we don't need to do this in tap and macvtap. This reverts most of the code and only keeps the new ipv6 fragment id generation function that is still needed. Fixes: 3d0ad09412ff (drivers/net: Disable UFO through virtio) Signed-off-by: Vladislav Yasevich Signed-off-by: David S. Miller --- drivers/net/macvtap.c | 3 --- drivers/net/tun.c | 6 +----- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 7df221788cd4..0b86e46c130b 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -17,7 +17,6 @@ #include #include -#include #include #include #include @@ -589,8 +588,6 @@ static int macvtap_skb_from_vnet_hdr(struct macvtap_queue *q, pr_warn_once("macvtap: %s: using disabled UFO feature; please fix this program\n", current->comm); gso_type = SKB_GSO_UDP; - if (skb->protocol == htons(ETH_P_IPV6)) - ipv6_proxy_select_ident(skb); break; default: return -EINVAL; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 8c8dc16839a7..5ca42b79b16f 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -65,7 +65,6 @@ #include #include #include -#include #include #include #include @@ -1167,8 +1166,6 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, break; } - skb_reset_network_header(skb); - if (gso.gso_type != VIRTIO_NET_HDR_GSO_NONE) { pr_debug("GSO!\n"); switch (gso.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { @@ -1189,8 +1186,6 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, current->comm); } skb_shinfo(skb)->gso_type = SKB_GSO_UDP; - if (skb->protocol == htons(ETH_P_IPV6)) - ipv6_proxy_select_ident(skb); break; } default: @@ -1221,6 +1216,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG; } + skb_reset_network_header(skb); skb_probe_transport_header(skb, 0); rxhash = skb_get_hash(skb); -- cgit v1.2.3-70-g09d2 From e3e3c423f82a415195a7bbbfa619bfa7b20d2db6 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 3 Feb 2015 16:36:17 -0500 Subject: Revert "drivers/net: Disable UFO through virtio" This reverts commit 3d0ad09412ffe00c9afa201d01effdb6023d09b4. Now that GSO functionality can correctly track if the fragment id has been selected and select a fragment id if necessary, we can re-enable UFO on tap/macvap and virtio devices. Signed-off-by: Vladislav Yasevich Signed-off-by: David S. Miller --- drivers/net/macvtap.c | 13 ++++++++----- drivers/net/tun.c | 19 ++++++++----------- drivers/net/virtio_net.c | 24 ++++++++++-------------- 3 files changed, 26 insertions(+), 30 deletions(-) diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 0b86e46c130b..919f4fccc322 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -80,7 +80,7 @@ static struct cdev macvtap_cdev; static const struct proto_ops macvtap_socket_ops; #define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \ - NETIF_F_TSO6) + NETIF_F_TSO6 | NETIF_F_UFO) #define RX_OFFLOADS (NETIF_F_GRO | NETIF_F_LRO) #define TAP_FEATURES (NETIF_F_GSO | NETIF_F_SG) @@ -585,8 +585,6 @@ static int macvtap_skb_from_vnet_hdr(struct macvtap_queue *q, gso_type = SKB_GSO_TCPV6; break; case VIRTIO_NET_HDR_GSO_UDP: - pr_warn_once("macvtap: %s: using disabled UFO feature; please fix this program\n", - current->comm); gso_type = SKB_GSO_UDP; break; default: @@ -633,6 +631,8 @@ static void macvtap_skb_to_vnet_hdr(struct macvtap_queue *q, vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else if (sinfo->gso_type & SKB_GSO_TCPV6) vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; + else if (sinfo->gso_type & SKB_GSO_UDP) + vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP; else BUG(); if (sinfo->gso_type & SKB_GSO_TCP_ECN) @@ -962,6 +962,9 @@ static int set_offload(struct macvtap_queue *q, unsigned long arg) if (arg & TUN_F_TSO6) feature_mask |= NETIF_F_TSO6; } + + if (arg & TUN_F_UFO) + feature_mask |= NETIF_F_UFO; } /* tun/tap driver inverts the usage for TSO offloads, where @@ -972,7 +975,7 @@ static int set_offload(struct macvtap_queue *q, unsigned long arg) * When user space turns off TSO, we turn off GSO/LRO so that * user-space will not receive TSO frames. */ - if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6)) + if (feature_mask & (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_UFO)) features |= RX_OFFLOADS; else features &= ~RX_OFFLOADS; @@ -1087,7 +1090,7 @@ static long macvtap_ioctl(struct file *file, unsigned int cmd, case TUNSETOFFLOAD: /* let the user check for future flags */ if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | - TUN_F_TSO_ECN)) + TUN_F_TSO_ECN | TUN_F_UFO)) return -EINVAL; rtnl_lock(); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 5ca42b79b16f..10f9e4021b5a 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -186,7 +186,7 @@ struct tun_struct { struct net_device *dev; netdev_features_t set_features; #define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \ - NETIF_F_TSO6) + NETIF_F_TSO6|NETIF_F_UFO) int vnet_hdr_sz; int sndbuf; @@ -1176,18 +1176,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; break; case VIRTIO_NET_HDR_GSO_UDP: - { - static bool warned; - - if (!warned) { - warned = true; - netdev_warn(tun->dev, - "%s: using disabled UFO feature; please fix this program\n", - current->comm); - } skb_shinfo(skb)->gso_type = SKB_GSO_UDP; break; - } default: tun->dev->stats.rx_frame_errors++; kfree_skb(skb); @@ -1294,6 +1284,8 @@ static ssize_t tun_put_user(struct tun_struct *tun, gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else if (sinfo->gso_type & SKB_GSO_TCPV6) gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV6; + else if (sinfo->gso_type & SKB_GSO_UDP) + gso.gso_type = VIRTIO_NET_HDR_GSO_UDP; else { pr_err("unexpected GSO type: " "0x%x, gso_size %d, hdr_len %d\n", @@ -1742,6 +1734,11 @@ static int set_offload(struct tun_struct *tun, unsigned long arg) features |= NETIF_F_TSO6; arg &= ~(TUN_F_TSO4|TUN_F_TSO6); } + + if (arg & TUN_F_UFO) { + features |= NETIF_F_UFO; + arg &= ~TUN_F_UFO; + } } /* This gives the user a way to test for new features in future by diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 5ca97713bfb3..059fdf1bf5ee 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -490,17 +490,8 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq, skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; break; case VIRTIO_NET_HDR_GSO_UDP: - { - static bool warned; - - if (!warned) { - warned = true; - netdev_warn(dev, - "host using disabled UFO feature; please fix it\n"); - } skb_shinfo(skb)->gso_type = SKB_GSO_UDP; break; - } case VIRTIO_NET_HDR_GSO_TCPV6: skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; break; @@ -888,6 +879,8 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb) hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6; + else if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP; else BUG(); if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN) @@ -1748,7 +1741,7 @@ static int virtnet_probe(struct virtio_device *vdev) dev->features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST; if (virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) { - dev->hw_features |= NETIF_F_TSO + dev->hw_features |= NETIF_F_TSO | NETIF_F_UFO | NETIF_F_TSO_ECN | NETIF_F_TSO6; } /* Individual feature bits: what can host handle? */ @@ -1758,9 +1751,11 @@ static int virtnet_probe(struct virtio_device *vdev) dev->hw_features |= NETIF_F_TSO6; if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN)) dev->hw_features |= NETIF_F_TSO_ECN; + if (virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UFO)) + dev->hw_features |= NETIF_F_UFO; if (gso) - dev->features |= dev->hw_features & NETIF_F_ALL_TSO; + dev->features |= dev->hw_features & (NETIF_F_ALL_TSO|NETIF_F_UFO); /* (!csum && gso) case will be fixed by register_netdev() */ } if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_CSUM)) @@ -1798,7 +1793,8 @@ static int virtnet_probe(struct virtio_device *vdev) /* If we can receive ANY GSO packets, we must allocate large ones. */ if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) || virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) || - virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN)) + virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) || + virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO)) vi->big_packets = true; if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) @@ -1994,9 +1990,9 @@ static struct virtio_device_id id_table[] = { static unsigned int features[] = { VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, VIRTIO_NET_F_GSO, VIRTIO_NET_F_MAC, - VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_TSO6, + VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, - VIRTIO_NET_F_GUEST_ECN, + VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, -- cgit v1.2.3-70-g09d2 From 06b19b1b178570b3d843306c636cdadb6e36c0a1 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Tue, 3 Feb 2015 05:00:40 +0800 Subject: net: usb: sr9700: Use 'SR_' prefix for the common register macros The commone register macors (e.g. RSR) is too commont to drivers, it may be conflict with the architectures (e.g. xtensa, sh). The related warnings (with allmodconfig under xtensa): CC [M] drivers/net/usb/sr9700.o In file included from drivers/net/usb/sr9700.c:24:0: drivers/net/usb/sr9700.h:65:0: warning: "RSR" redefined #define RSR 0x06 ^ In file included from ./arch/xtensa/include/asm/bitops.h:22:0, from include/linux/bitops.h:36, from include/linux/kernel.h:10, from include/linux/list.h:8, from include/linux/module.h:9, from drivers/net/usb/sr9700.c:13: ./arch/xtensa/include/asm/processor.h:190:0: note: this is the location of the previous definition #define RSR(v,sr) __asm__ __volatile__ ("rsr %0,"__stringify(sr) : "=a"(v)); ^ Signed-off-by: Chen Gang Signed-off-by: David S. Miller --- drivers/net/usb/sr9700.c | 36 +++++++++++++------------- drivers/net/usb/sr9700.h | 66 ++++++++++++++++++++++++------------------------ 2 files changed, 51 insertions(+), 51 deletions(-) diff --git a/drivers/net/usb/sr9700.c b/drivers/net/usb/sr9700.c index 99b69af14274..4a1e9c489f1f 100644 --- a/drivers/net/usb/sr9700.c +++ b/drivers/net/usb/sr9700.c @@ -77,7 +77,7 @@ static int wait_phy_eeprom_ready(struct usbnet *dev, int phy) int ret; udelay(1); - ret = sr_read_reg(dev, EPCR, &tmp); + ret = sr_read_reg(dev, SR_EPCR, &tmp); if (ret < 0) return ret; @@ -98,15 +98,15 @@ static int sr_share_read_word(struct usbnet *dev, int phy, u8 reg, mutex_lock(&dev->phy_mutex); - sr_write_reg(dev, EPAR, phy ? (reg | EPAR_PHY_ADR) : reg); - sr_write_reg(dev, EPCR, phy ? (EPCR_EPOS | EPCR_ERPRR) : EPCR_ERPRR); + sr_write_reg(dev, SR_EPAR, phy ? (reg | EPAR_PHY_ADR) : reg); + sr_write_reg(dev, SR_EPCR, phy ? (EPCR_EPOS | EPCR_ERPRR) : EPCR_ERPRR); ret = wait_phy_eeprom_ready(dev, phy); if (ret < 0) goto out_unlock; - sr_write_reg(dev, EPCR, 0x0); - ret = sr_read(dev, EPDR, 2, value); + sr_write_reg(dev, SR_EPCR, 0x0); + ret = sr_read(dev, SR_EPDR, 2, value); netdev_dbg(dev->net, "read shared %d 0x%02x returned 0x%04x, %d\n", phy, reg, *value, ret); @@ -123,19 +123,19 @@ static int sr_share_write_word(struct usbnet *dev, int phy, u8 reg, mutex_lock(&dev->phy_mutex); - ret = sr_write(dev, EPDR, 2, &value); + ret = sr_write(dev, SR_EPDR, 2, &value); if (ret < 0) goto out_unlock; - sr_write_reg(dev, EPAR, phy ? (reg | EPAR_PHY_ADR) : reg); - sr_write_reg(dev, EPCR, phy ? (EPCR_WEP | EPCR_EPOS | EPCR_ERPRW) : + sr_write_reg(dev, SR_EPAR, phy ? (reg | EPAR_PHY_ADR) : reg); + sr_write_reg(dev, SR_EPCR, phy ? (EPCR_WEP | EPCR_EPOS | EPCR_ERPRW) : (EPCR_WEP | EPCR_ERPRW)); ret = wait_phy_eeprom_ready(dev, phy); if (ret < 0) goto out_unlock; - sr_write_reg(dev, EPCR, 0x0); + sr_write_reg(dev, SR_EPCR, 0x0); out_unlock: mutex_unlock(&dev->phy_mutex); @@ -188,7 +188,7 @@ static int sr_mdio_read(struct net_device *netdev, int phy_id, int loc) if (loc == MII_BMSR) { u8 value; - sr_read_reg(dev, NSR, &value); + sr_read_reg(dev, SR_NSR, &value); if (value & NSR_LINKST) rc = 1; } @@ -228,7 +228,7 @@ static u32 sr9700_get_link(struct net_device *netdev) int rc = 0; /* Get the Link Status directly */ - sr_read_reg(dev, NSR, &value); + sr_read_reg(dev, SR_NSR, &value); if (value & NSR_LINKST) rc = 1; @@ -281,8 +281,8 @@ static void sr9700_set_multicast(struct net_device *netdev) } } - sr_write_async(dev, MAR, SR_MCAST_SIZE, hashes); - sr_write_reg_async(dev, RCR, rx_ctl); + sr_write_async(dev, SR_MAR, SR_MCAST_SIZE, hashes); + sr_write_reg_async(dev, SR_RCR, rx_ctl); } static int sr9700_set_mac_address(struct net_device *netdev, void *p) @@ -297,7 +297,7 @@ static int sr9700_set_mac_address(struct net_device *netdev, void *p) } memcpy(netdev->dev_addr, addr->sa_data, netdev->addr_len); - sr_write_async(dev, PAR, 6, netdev->dev_addr); + sr_write_async(dev, SR_PAR, 6, netdev->dev_addr); return 0; } @@ -340,7 +340,7 @@ static int sr9700_bind(struct usbnet *dev, struct usb_interface *intf) mii->phy_id_mask = 0x1f; mii->reg_num_mask = 0x1f; - sr_write_reg(dev, NCR, NCR_RST); + sr_write_reg(dev, SR_NCR, NCR_RST); udelay(20); /* read MAC @@ -348,17 +348,17 @@ static int sr9700_bind(struct usbnet *dev, struct usb_interface *intf) * EEPROM automatically to PAR. In case there is no EEPROM externally, * a default MAC address is stored in PAR for making chip work properly. */ - if (sr_read(dev, PAR, ETH_ALEN, netdev->dev_addr) < 0) { + if (sr_read(dev, SR_PAR, ETH_ALEN, netdev->dev_addr) < 0) { netdev_err(netdev, "Error reading MAC address\n"); ret = -ENODEV; goto out; } /* power up and reset phy */ - sr_write_reg(dev, PRR, PRR_PHY_RST); + sr_write_reg(dev, SR_PRR, PRR_PHY_RST); /* at least 10ms, here 20ms for safe */ mdelay(20); - sr_write_reg(dev, PRR, 0); + sr_write_reg(dev, SR_PRR, 0); /* at least 1ms, here 2ms for reading right register */ udelay(2 * 1000); diff --git a/drivers/net/usb/sr9700.h b/drivers/net/usb/sr9700.h index fd687c575e74..258b030277e7 100644 --- a/drivers/net/usb/sr9700.h +++ b/drivers/net/usb/sr9700.h @@ -14,13 +14,13 @@ /* sr9700 spec. register table on Linux platform */ /* Network Control Reg */ -#define NCR 0x00 +#define SR_NCR 0x00 #define NCR_RST (1 << 0) #define NCR_LBK (3 << 1) #define NCR_FDX (1 << 3) #define NCR_WAKEEN (1 << 6) /* Network Status Reg */ -#define NSR 0x01 +#define SR_NSR 0x01 #define NSR_RXRDY (1 << 0) #define NSR_RXOV (1 << 1) #define NSR_TX1END (1 << 2) @@ -30,7 +30,7 @@ #define NSR_LINKST (1 << 6) #define NSR_SPEED (1 << 7) /* Tx Control Reg */ -#define TCR 0x02 +#define SR_TCR 0x02 #define TCR_CRC_DIS (1 << 1) #define TCR_PAD_DIS (1 << 2) #define TCR_LC_CARE (1 << 3) @@ -38,7 +38,7 @@ #define TCR_EXCECM (1 << 5) #define TCR_LF_EN (1 << 6) /* Tx Status Reg for Packet Index 1 */ -#define TSR1 0x03 +#define SR_TSR1 0x03 #define TSR1_EC (1 << 2) #define TSR1_COL (1 << 3) #define TSR1_LC (1 << 4) @@ -46,7 +46,7 @@ #define TSR1_LOC (1 << 6) #define TSR1_TLF (1 << 7) /* Tx Status Reg for Packet Index 2 */ -#define TSR2 0x04 +#define SR_TSR2 0x04 #define TSR2_EC (1 << 2) #define TSR2_COL (1 << 3) #define TSR2_LC (1 << 4) @@ -54,7 +54,7 @@ #define TSR2_LOC (1 << 6) #define TSR2_TLF (1 << 7) /* Rx Control Reg*/ -#define RCR 0x05 +#define SR_RCR 0x05 #define RCR_RXEN (1 << 0) #define RCR_PRMSC (1 << 1) #define RCR_RUNT (1 << 2) @@ -62,87 +62,87 @@ #define RCR_DIS_CRC (1 << 4) #define RCR_DIS_LONG (1 << 5) /* Rx Status Reg */ -#define RSR 0x06 +#define SR_RSR 0x06 #define RSR_AE (1 << 2) #define RSR_MF (1 << 6) #define RSR_RF (1 << 7) /* Rx Overflow Counter Reg */ -#define ROCR 0x07 +#define SR_ROCR 0x07 #define ROCR_ROC (0x7F << 0) #define ROCR_RXFU (1 << 7) /* Back Pressure Threshold Reg */ -#define BPTR 0x08 +#define SR_BPTR 0x08 #define BPTR_JPT (0x0F << 0) #define BPTR_BPHW (0x0F << 4) /* Flow Control Threshold Reg */ -#define FCTR 0x09 +#define SR_FCTR 0x09 #define FCTR_LWOT (0x0F << 0) #define FCTR_HWOT (0x0F << 4) /* rx/tx Flow Control Reg */ -#define FCR 0x0A +#define SR_FCR 0x0A #define FCR_FLCE (1 << 0) #define FCR_BKPA (1 << 4) #define FCR_TXPEN (1 << 5) #define FCR_TXPF (1 << 6) #define FCR_TXP0 (1 << 7) /* Eeprom & Phy Control Reg */ -#define EPCR 0x0B +#define SR_EPCR 0x0B #define EPCR_ERRE (1 << 0) #define EPCR_ERPRW (1 << 1) #define EPCR_ERPRR (1 << 2) #define EPCR_EPOS (1 << 3) #define EPCR_WEP (1 << 4) /* Eeprom & Phy Address Reg */ -#define EPAR 0x0C +#define SR_EPAR 0x0C #define EPAR_EROA (0x3F << 0) #define EPAR_PHY_ADR_MASK (0x03 << 6) #define EPAR_PHY_ADR (0x01 << 6) /* Eeprom & Phy Data Reg */ -#define EPDR 0x0D /* 0x0D ~ 0x0E for Data Reg Low & High */ +#define SR_EPDR 0x0D /* 0x0D ~ 0x0E for Data Reg Low & High */ /* Wakeup Control Reg */ -#define WCR 0x0F +#define SR_WCR 0x0F #define WCR_MAGICST (1 << 0) #define WCR_LINKST (1 << 2) #define WCR_MAGICEN (1 << 3) #define WCR_LINKEN (1 << 5) /* Physical Address Reg */ -#define PAR 0x10 /* 0x10 ~ 0x15 6 bytes for PAR */ +#define SR_PAR 0x10 /* 0x10 ~ 0x15 6 bytes for PAR */ /* Multicast Address Reg */ -#define MAR 0x16 /* 0x16 ~ 0x1D 8 bytes for MAR */ +#define SR_MAR 0x16 /* 0x16 ~ 0x1D 8 bytes for MAR */ /* 0x1e unused */ /* Phy Reset Reg */ -#define PRR 0x1F +#define SR_PRR 0x1F #define PRR_PHY_RST (1 << 0) /* Tx sdram Write Pointer Address Low */ -#define TWPAL 0x20 +#define SR_TWPAL 0x20 /* Tx sdram Write Pointer Address High */ -#define TWPAH 0x21 +#define SR_TWPAH 0x21 /* Tx sdram Read Pointer Address Low */ -#define TRPAL 0x22 +#define SR_TRPAL 0x22 /* Tx sdram Read Pointer Address High */ -#define TRPAH 0x23 +#define SR_TRPAH 0x23 /* Rx sdram Write Pointer Address Low */ -#define RWPAL 0x24 +#define SR_RWPAL 0x24 /* Rx sdram Write Pointer Address High */ -#define RWPAH 0x25 +#define SR_RWPAH 0x25 /* Rx sdram Read Pointer Address Low */ -#define RRPAL 0x26 +#define SR_RRPAL 0x26 /* Rx sdram Read Pointer Address High */ -#define RRPAH 0x27 +#define SR_RRPAH 0x27 /* Vendor ID register */ -#define VID 0x28 /* 0x28 ~ 0x29 2 bytes for VID */ +#define SR_VID 0x28 /* 0x28 ~ 0x29 2 bytes for VID */ /* Product ID register */ -#define PID 0x2A /* 0x2A ~ 0x2B 2 bytes for PID */ +#define SR_PID 0x2A /* 0x2A ~ 0x2B 2 bytes for PID */ /* CHIP Revision register */ -#define CHIPR 0x2C +#define SR_CHIPR 0x2C /* 0x2D --> 0xEF unused */ /* USB Device Address */ -#define USBDA 0xF0 +#define SR_USBDA 0xF0 #define USBDA_USBFA (0x7F << 0) /* RX packet Counter Reg */ -#define RXC 0xF1 +#define SR_RXC 0xF1 /* Tx packet Counter & USB Status Reg */ -#define TXC_USBS 0xF2 +#define SR_TXC_USBS 0xF2 #define TXC_USBS_TXC0 (1 << 0) #define TXC_USBS_TXC1 (1 << 1) #define TXC_USBS_TXC2 (1 << 2) @@ -150,7 +150,7 @@ #define TXC_USBS_SUSFLAG (1 << 6) #define TXC_USBS_RXFAULT (1 << 7) /* USB Control register */ -#define USBC 0xF4 +#define SR_USBC 0xF4 #define USBC_EP3NAK (1 << 4) #define USBC_EP3ACK (1 << 5) -- cgit v1.2.3-70-g09d2 From 75300ad2d91fac50852dc4123977d55e64c874cc Mon Sep 17 00:00:00 2001 From: Sanjeev Sharma Date: Tue, 3 Feb 2015 13:02:02 +0530 Subject: gianfar: correct the bad expression while writing bit-pattern This patch correct the bad expression while writing the bit-pattern from software's buffer to hardware registers. Signed-off-by: Sanjeev Sharma Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/gianfar_ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/gianfar_ethtool.c b/drivers/net/ethernet/freescale/gianfar_ethtool.c index 3e1a9c1a67a9..fda12fb32ec7 100644 --- a/drivers/net/ethernet/freescale/gianfar_ethtool.c +++ b/drivers/net/ethernet/freescale/gianfar_ethtool.c @@ -1586,7 +1586,7 @@ static int gfar_write_filer_table(struct gfar_private *priv, return -EBUSY; /* Fill regular entries */ - for (; i < MAX_FILER_IDX - 1 && (tab->fe[i].ctrl | tab->fe[i].ctrl); + for (; i < MAX_FILER_IDX - 1 && (tab->fe[i].ctrl | tab->fe[i].prop); i++) gfar_write_filer(priv, i, tab->fe[i].ctrl, tab->fe[i].prop); /* Fill the rest with fall-troughs */ -- cgit v1.2.3-70-g09d2 From 5201aa49b0fc12d9bf911090744ca04066bad4a7 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Tue, 3 Feb 2015 11:07:06 +0200 Subject: vhost/net: fix up num_buffers endian-ness In virtio 1.0 mode, when mergeable buffers are enabled on a big-endian host, num_buffers wasn't byte-swapped correctly, so large incoming packets got corrupted. To fix, fill it in within hdr - this also makes sure it gets the correct type. Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- drivers/vhost/net.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index d415d69dc237..9484d5652ca5 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -650,8 +650,10 @@ static void handle_rx(struct vhost_net *net) break; } /* TODO: Should check and handle checksum. */ + + hdr.num_buffers = cpu_to_vhost16(vq, headcount); if (likely(mergeable) && - memcpy_toiovecend(nvq->hdr, (unsigned char *)&headcount, + memcpy_toiovecend(nvq->hdr, (void *)&hdr.num_buffers, offsetof(typeof(hdr), num_buffers), sizeof hdr.num_buffers)) { vq_err(vq, "Failed num_buffers write"); -- cgit v1.2.3-70-g09d2 From db27ebb111e9f69efece08e4cb6a34ff980f8896 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 3 Feb 2015 08:55:58 -0500 Subject: net: rds: use correct size for max unacked packets and bytes Max unacked packets/bytes is an int while sizeof(long) was used in the sysctl table. This means that when they were getting read we'd also leak kernel memory to userspace along with the timeout values. Signed-off-by: Sasha Levin Signed-off-by: David S. Miller --- net/rds/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c index c3b0cd43eb56..c173f69e1479 100644 --- a/net/rds/sysctl.c +++ b/net/rds/sysctl.c @@ -71,14 +71,14 @@ static struct ctl_table rds_sysctl_rds_table[] = { { .procname = "max_unacked_packets", .data = &rds_sysctl_max_unacked_packets, - .maxlen = sizeof(unsigned long), + .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "max_unacked_bytes", .data = &rds_sysctl_max_unacked_bytes, - .maxlen = sizeof(unsigned long), + .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, -- cgit v1.2.3-70-g09d2 From 3725a269815ba6dbb415feddc47da5af7d1fac58 Mon Sep 17 00:00:00 2001 From: Kenneth Klette Jonassen Date: Tue, 3 Feb 2015 17:49:18 +0100 Subject: pkt_sched: fq: avoid hang when quantum 0 Configuring fq with quantum 0 hangs the system, presumably because of a non-interruptible infinite loop. Either way quantum 0 does not make sense. Reproduce with: sudo tc qdisc add dev lo root fq quantum 0 initial_quantum 0 ping 127.0.0.1 Signed-off-by: Kenneth Klette Jonassen Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_fq.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 9b05924cc386..333cd94ba381 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -670,8 +670,14 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_FQ_FLOW_PLIMIT]) q->flow_plimit = nla_get_u32(tb[TCA_FQ_FLOW_PLIMIT]); - if (tb[TCA_FQ_QUANTUM]) - q->quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]); + if (tb[TCA_FQ_QUANTUM]) { + u32 quantum = nla_get_u32(tb[TCA_FQ_QUANTUM]); + + if (quantum > 0) + q->quantum = quantum; + else + err = -EINVAL; + } if (tb[TCA_FQ_INITIAL_QUANTUM]) q->initial_quantum = nla_get_u32(tb[TCA_FQ_INITIAL_QUANTUM]); -- cgit v1.2.3-70-g09d2 From b057df24a7536cce6c372efe9d0e3d1558afedf4 Mon Sep 17 00:00:00 2001 From: Ignacy Gawędzki Date: Tue, 3 Feb 2015 19:05:18 +0100 Subject: cls_api.c: Fix dumping of non-existing actions' stats. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In tcf_exts_dump_stats(), ensure that exts->actions is not empty before accessing the first element of that list and calling tcf_action_copy_stats() on it. This fixes some random segvs when adding filters of type "basic" with no particular action. This also fixes the dumping of those "no-action" filters, which more often than not made calls to tcf_action_copy_stats() fail and consequently netlink attributes added by the caller to be removed by a call to nla_nest_cancel(). Fixes: 33be62715991 ("net_sched: act: use standard struct list_head") Signed-off-by: Ignacy Gawędzki Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_api.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index aad6a679fb13..baef987fe2c0 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -556,8 +556,9 @@ void tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, } EXPORT_SYMBOL(tcf_exts_change); -#define tcf_exts_first_act(ext) \ - list_first_entry(&(exts)->actions, struct tc_action, list) +#define tcf_exts_first_act(ext) \ + list_first_entry_or_null(&(exts)->actions, \ + struct tc_action, list) int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts) { @@ -603,7 +604,7 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT struct tc_action *a = tcf_exts_first_act(exts); - if (tcf_action_copy_stats(skb, a, 1) < 0) + if (a != NULL && tcf_action_copy_stats(skb, a, 1) < 0) return -1; #endif return 0; -- cgit v1.2.3-70-g09d2 From 211fcf6d2100e625ec1d4a50e8adf67370a37bad Mon Sep 17 00:00:00 2001 From: "Lendacky, Thomas" Date: Tue, 3 Feb 2015 12:49:55 -0600 Subject: amd-xgbe: Adjust for zero-based traffic class count The number of traffic classes reported by the hardware is zero-based so increment the value returned to get an actual count. Signed-off-by: Tom Lendacky Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 7bb5f07dbeef..124614096456 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -552,13 +552,14 @@ void xgbe_get_all_hw_features(struct xgbe_prv_data *pdata) break; } - /* The Queue and Channel counts are zero based so increment them + /* The Queue, Channel and TC counts are zero based so increment them * to get the actual number */ hw_feat->rx_q_cnt++; hw_feat->tx_q_cnt++; hw_feat->rx_ch_cnt++; hw_feat->tx_ch_cnt++; + hw_feat->tc_cnt++; DBGPR("<--xgbe_get_all_hw_features\n"); } -- cgit v1.2.3-70-g09d2 From cf180b8acf2627fdc9ab472ed0dc565803cd388a Mon Sep 17 00:00:00 2001 From: "Lendacky, Thomas" Date: Tue, 3 Feb 2015 14:14:32 -0600 Subject: amd-xgbe: Set RSS enablement based on hardware features The RSS support requires enablement based on the features reported by the hardware. The setting of this flag is missing. Add support to set the RSS enablement flag based on the reported hardware features. Signed-off-by: Tom Lendacky Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c index 124614096456..e5ffb2ccb67d 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c @@ -523,6 +523,7 @@ void xgbe_get_all_hw_features(struct xgbe_prv_data *pdata) hw_feat->sph = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R, SPHEN); hw_feat->tso = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R, TSOEN); hw_feat->dma_debug = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R, DBGMEMA); + hw_feat->rss = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R, RSSEN); hw_feat->tc_cnt = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R, NUMTC); hw_feat->hash_table_size = XGMAC_GET_BITS(mac_hfr1, MAC_HWF1R, HASHTBLSZ); -- cgit v1.2.3-70-g09d2 From f31ec95fa19e07a8beebcc0297284f23aa57967e Mon Sep 17 00:00:00 2001 From: Shahed Shaikh Date: Wed, 4 Feb 2015 05:41:25 -0500 Subject: qlcnic: Fix NAPI poll routine for Tx completion After d75b1ade567f ("net: less interrupt masking in NAPI") driver's NAPI poll routine is expected to return exact budget value if it wants to be re-called. Signed-off-by: Shahed Shaikh Fixes: d75b1ade567f ("net: less interrupt masking in NAPI") Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c | 27 +++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c index 18e5de72e9b4..4e1f58cf19ce 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_io.c @@ -967,7 +967,12 @@ static int qlcnic_poll(struct napi_struct *napi, int budget) tx_complete = qlcnic_process_cmd_ring(adapter, tx_ring, budget); work_done = qlcnic_process_rcv_ring(sds_ring, budget); - if ((work_done < budget) && tx_complete) { + + /* Check if we need a repoll */ + if (!tx_complete) + work_done = budget; + + if (work_done < budget) { napi_complete(&sds_ring->napi); if (test_bit(__QLCNIC_DEV_UP, &adapter->state)) { qlcnic_enable_sds_intr(adapter, sds_ring); @@ -992,6 +997,9 @@ static int qlcnic_tx_poll(struct napi_struct *napi, int budget) napi_complete(&tx_ring->napi); if (test_bit(__QLCNIC_DEV_UP, &adapter->state)) qlcnic_enable_tx_intr(adapter, tx_ring); + } else { + /* As qlcnic_process_cmd_ring() returned 0, we need a repoll*/ + work_done = budget; } return work_done; @@ -1950,7 +1958,12 @@ static int qlcnic_83xx_msix_sriov_vf_poll(struct napi_struct *napi, int budget) tx_complete = qlcnic_process_cmd_ring(adapter, tx_ring, budget); work_done = qlcnic_83xx_process_rcv_ring(sds_ring, budget); - if ((work_done < budget) && tx_complete) { + + /* Check if we need a repoll */ + if (!tx_complete) + work_done = budget; + + if (work_done < budget) { napi_complete(&sds_ring->napi); qlcnic_enable_sds_intr(adapter, sds_ring); } @@ -1973,7 +1986,12 @@ static int qlcnic_83xx_poll(struct napi_struct *napi, int budget) tx_complete = qlcnic_process_cmd_ring(adapter, tx_ring, budget); work_done = qlcnic_83xx_process_rcv_ring(sds_ring, budget); - if ((work_done < budget) && tx_complete) { + + /* Check if we need a repoll */ + if (!tx_complete) + work_done = budget; + + if (work_done < budget) { napi_complete(&sds_ring->napi); qlcnic_enable_sds_intr(adapter, sds_ring); } @@ -1995,6 +2013,9 @@ static int qlcnic_83xx_msix_tx_poll(struct napi_struct *napi, int budget) napi_complete(&tx_ring->napi); if (test_bit(__QLCNIC_DEV_UP , &adapter->state)) qlcnic_enable_tx_intr(adapter, tx_ring); + } else { + /* need a repoll */ + work_done = budget; } return work_done; -- cgit v1.2.3-70-g09d2 From d1e158e2d7a0a91110b206653f0e02376e809150 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 4 Feb 2015 15:25:09 +0100 Subject: ip6_gre: fix endianness errors in ip6gre_err info is in network byte order, change it back to host byte order before use. In particular, the current code sets the MTU of the tunnel to a wrong (too big) value. Fixes: c12b395a4664 ("gre: Support GRE over IPv6") Signed-off-by: Sabrina Dubroca Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 13cda4c6313b..01ccc28a686f 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -417,7 +417,7 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (code == ICMPV6_HDR_FIELD) teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data); - if (teli && teli == info - 2) { + if (teli && teli == be32_to_cpu(info) - 2) { tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; if (tel->encap_limit == 0) { net_warn_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", @@ -429,7 +429,7 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } break; case ICMPV6_PKT_TOOBIG: - mtu = info - offset; + mtu = be32_to_cpu(info) - offset; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; t->dev->mtu = mtu; -- cgit v1.2.3-70-g09d2 From f4575d3534617eec98c7eb8701185cec96b4374b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Feb 2015 13:31:54 -0800 Subject: flow_keys: n_proto type should be __be16 (struct flow_keys)->n_proto is in network order, use proper type for this. Fixes following sparse errors : net/core/flow_dissector.c:139:39: warning: incorrect type in assignment (different base types) net/core/flow_dissector.c:139:39: expected unsigned short [unsigned] [usertype] n_proto net/core/flow_dissector.c:139:39: got restricted __be16 [assigned] [usertype] proto net/core/flow_dissector.c:237:23: warning: incorrect type in assignment (different base types) net/core/flow_dissector.c:237:23: expected unsigned short [unsigned] [usertype] n_proto net/core/flow_dissector.c:237:23: got restricted __be16 [assigned] [usertype] proto Signed-off-by: Eric Dumazet Fixes: e0f31d849867 ("flow_keys: Record IP layer protocol in skb_flow_dissect()") Signed-off-by: David S. Miller --- include/net/flow_keys.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h index 7ee2df083542..dc8fd81412bf 100644 --- a/include/net/flow_keys.h +++ b/include/net/flow_keys.h @@ -22,9 +22,9 @@ struct flow_keys { __be32 ports; __be16 port16[2]; }; - u16 thoff; - u16 n_proto; - u8 ip_proto; + u16 thoff; + __be16 n_proto; + u8 ip_proto; }; bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, -- cgit v1.2.3-70-g09d2 From 2ce1ee1780564ba06ab4c1434aa03e347dc9169f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Feb 2015 13:37:44 -0800 Subject: net: remove some sparse warnings netdev_adjacent_add_links() and netdev_adjacent_del_links() are static. queue->qdisc has __rcu annotation, need to use RCU_INIT_POINTER() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index c87a2264a02b..7fe82929f509 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5294,7 +5294,7 @@ void netdev_upper_dev_unlink(struct net_device *dev, } EXPORT_SYMBOL(netdev_upper_dev_unlink); -void netdev_adjacent_add_links(struct net_device *dev) +static void netdev_adjacent_add_links(struct net_device *dev) { struct netdev_adjacent *iter; @@ -5319,7 +5319,7 @@ void netdev_adjacent_add_links(struct net_device *dev) } } -void netdev_adjacent_del_links(struct net_device *dev) +static void netdev_adjacent_del_links(struct net_device *dev) { struct netdev_adjacent *iter; @@ -6627,7 +6627,7 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) if (!queue) return NULL; netdev_init_one_queue(dev, queue, NULL); - queue->qdisc = &noop_qdisc; + RCU_INIT_POINTER(queue->qdisc, &noop_qdisc); queue->qdisc_sleeping = &noop_qdisc; rcu_assign_pointer(dev->ingress_queue, queue); #endif -- cgit v1.2.3-70-g09d2 From 677651462c774b5866be2bc42601303a76b021a0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Feb 2015 15:03:25 -0800 Subject: ipv6: fix sparse errors in ip6_make_flowlabel() include/net/ipv6.h:713:22: warning: incorrect type in assignment (different base types) include/net/ipv6.h:713:22: expected restricted __be32 [usertype] hash include/net/ipv6.h:713:22: got unsigned int include/net/ipv6.h:719:25: warning: restricted __be32 degrades to integer include/net/ipv6.h:719:22: warning: invalid assignment: ^= include/net/ipv6.h:719:22: left side has type restricted __be32 include/net/ipv6.h:719:22: right side has type unsigned int Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ipv6.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 9bf85d34c024..6e416f6d3e3c 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -711,7 +711,7 @@ static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, __be32 flowlabel, bool autolabel) { if (!flowlabel && (autolabel || net->ipv6.sysctl.auto_flowlabels)) { - __be32 hash; + u32 hash; hash = skb_get_hash(skb); @@ -721,7 +721,7 @@ static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, */ hash ^= hash >> 12; - flowlabel = hash & IPV6_FLOWLABEL_MASK; + flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK; } return flowlabel; -- cgit v1.2.3-70-g09d2 From a409caecb2e17fc475533738dd1c69b32e13fe09 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Feb 2015 15:12:04 -0800 Subject: sit: fix some __be16/u16 mismatches Fixes following sparse warnings : net/ipv6/sit.c:1509:32: warning: incorrect type in assignment (different base types) net/ipv6/sit.c:1509:32: expected restricted __be16 [usertype] sport net/ipv6/sit.c:1509:32: got unsigned short net/ipv6/sit.c:1514:32: warning: incorrect type in assignment (different base types) net/ipv6/sit.c:1514:32: expected restricted __be16 [usertype] dport net/ipv6/sit.c:1514:32: got unsigned short net/ipv6/sit.c:1711:38: warning: incorrect type in argument 3 (different base types) net/ipv6/sit.c:1711:38: expected unsigned short [unsigned] [usertype] value net/ipv6/sit.c:1711:38: got restricted __be16 [usertype] sport net/ipv6/sit.c:1713:38: warning: incorrect type in argument 3 (different base types) net/ipv6/sit.c:1713:38: expected unsigned short [unsigned] [usertype] value net/ipv6/sit.c:1713:38: got restricted __be16 [usertype] dport Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/sit.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 213546bd6d5d..cdbfe5af6187 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1506,12 +1506,12 @@ static bool ipip6_netlink_encap_parms(struct nlattr *data[], if (data[IFLA_IPTUN_ENCAP_SPORT]) { ret = true; - ipencap->sport = nla_get_u16(data[IFLA_IPTUN_ENCAP_SPORT]); + ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]); } if (data[IFLA_IPTUN_ENCAP_DPORT]) { ret = true; - ipencap->dport = nla_get_u16(data[IFLA_IPTUN_ENCAP_DPORT]); + ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]); } return ret; @@ -1707,9 +1707,9 @@ static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || - nla_put_u16(skb, IFLA_IPTUN_ENCAP_SPORT, + nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) || - nla_put_u16(skb, IFLA_IPTUN_ENCAP_DPORT, + nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags)) -- cgit v1.2.3-70-g09d2