From 269c4845d5b3627b95b1934107251bacbe99bb68 Mon Sep 17 00:00:00 2001 From: Gustavo Padovan Date: Fri, 15 Jun 2012 02:30:20 -0300 Subject: Bluetooth: Fix possible deadlock in SCO code sco_chan_del() only has conn != NULL when called from sco_conn_del() so just move the code from it that deal with conn to sco_conn_del(). [ 120.765529] [ 120.765529] ====================================================== [ 120.766529] [ INFO: possible circular locking dependency detected ] [ 120.766529] 3.5.0-rc1-10292-g3701f94-dirty #70 Tainted: G W [ 120.766529] ------------------------------------------------------- [ 120.766529] kworker/u:3/1497 is trying to acquire lock: [ 120.766529] (&(&conn->lock)->rlock#2){+.+...}, at: [] sco_chan_del+0x4c/0x170 [bluetooth] [ 120.766529] [ 120.766529] but task is already holding lock: [ 120.766529] (slock-AF_BLUETOOTH-BTPROTO_SCO){+.+...}, at: [] sco_conn_del+0x61/0xe0 [bluetooth] [ 120.766529] [ 120.766529] which lock already depends on the new lock. [ 120.766529] [ 120.766529] [ 120.766529] the existing dependency chain (in reverse order) is: [ 120.766529] [ 120.766529] -> #1 (slock-AF_BLUETOOTH-BTPROTO_SCO){+.+...}: [ 120.766529] [] lock_acquire+0x8e/0xb0 [ 120.766529] [] _raw_spin_lock+0x40/0x80 [ 120.766529] [] sco_connect_cfm+0x79/0x300 [bluetooth] [ 120.766529] [] hci_sync_conn_complete_evt.isra.90+0x343/0x400 [bluetooth] [ 120.766529] [] hci_event_packet+0x317/0xfb0 [bluetooth] [ 120.766529] [] hci_rx_work+0x2c8/0x890 [bluetooth] [ 120.766529] [] process_one_work+0x197/0x460 [ 120.766529] [] worker_thread+0x126/0x2d0 [ 120.766529] [] kthread+0x9d/0xb0 [ 120.766529] [] kernel_thread_helper+0x4/0x10 [ 120.766529] [ 120.766529] -> #0 (&(&conn->lock)->rlock#2){+.+...}: [ 120.766529] [] __lock_acquire+0x154a/0x1d30 [ 120.766529] [] lock_acquire+0x8e/0xb0 [ 120.766529] [] _raw_spin_lock+0x40/0x80 [ 120.766529] [] sco_chan_del+0x4c/0x170 [bluetooth] [ 120.766529] [] sco_conn_del+0x74/0xe0 [bluetooth] [ 120.766529] [] sco_disconn_cfm+0x32/0x60 [bluetooth] [ 120.766529] [] hci_disconn_complete_evt.isra.53+0x242/0x390 [bluetooth] [ 120.766529] [] hci_event_packet+0x617/0xfb0 [bluetooth] [ 120.766529] [] hci_rx_work+0x2c8/0x890 [bluetooth] [ 120.766529] [] process_one_work+0x197/0x460 [ 120.766529] [] worker_thread+0x126/0x2d0 [ 120.766529] [] kthread+0x9d/0xb0 [ 120.766529] [] kernel_thread_helper+0x4/0x10 [ 120.766529] [ 120.766529] other info that might help us debug this: [ 120.766529] [ 120.766529] Possible unsafe locking scenario: [ 120.766529] [ 120.766529] CPU0 CPU1 [ 120.766529] ---- ---- [ 120.766529] lock(slock-AF_BLUETOOTH-BTPROTO_SCO); [ 120.766529] lock(&(&conn->lock)->rlock#2); [ 120.766529] lock(slock-AF_BLUETOOTH-BTPROTO_SCO); [ 120.766529] lock(&(&conn->lock)->rlock#2); [ 120.766529] [ 120.766529] *** DEADLOCK *** Signed-off-by: Gustavo Padovan --- net/bluetooth/sco.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 40bbe25dcff7..3589e21edb09 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -131,6 +131,15 @@ static int sco_conn_del(struct hci_conn *hcon, int err) sco_sock_clear_timer(sk); sco_chan_del(sk, err); bh_unlock_sock(sk); + + sco_conn_lock(conn); + conn->sk = NULL; + sco_pi(sk)->conn = NULL; + sco_conn_unlock(conn); + + if (conn->hcon) + hci_conn_put(conn->hcon); + sco_sock_kill(sk); } @@ -821,16 +830,6 @@ static void sco_chan_del(struct sock *sk, int err) BT_DBG("sk %p, conn %p, err %d", sk, conn, err); - if (conn) { - sco_conn_lock(conn); - conn->sk = NULL; - sco_pi(sk)->conn = NULL; - sco_conn_unlock(conn); - - if (conn->hcon) - hci_conn_put(conn->hcon); - } - sk->sk_state = BT_CLOSED; sk->sk_err = err; sk->sk_state_change(sk); -- cgit v1.2.3-70-g09d2 From a9ea3ed9b71cc3271dd59e76f65748adcaa76422 Mon Sep 17 00:00:00 2001 From: Szymon Janc Date: Thu, 19 Jul 2012 14:46:08 +0200 Subject: Bluetooth: Fix legacy pairing with some devices Some devices e.g. some Android based phones don't do SDP search before pairing and cancel legacy pairing when ACL is disconnected. PIN Code Request event which changes ACL timeout to HCI_PAIRING_TIMEOUT is only received after remote user entered PIN. In that case no L2CAP is connected so default HCI_DISCONN_TIMEOUT (2 seconds) is being used to timeout ACL connection. This results in problems with legacy pairing as remote user has only few seconds to enter PIN before ACL is disconnected. Increase disconnect timeout for incomming connection to HCI_PAIRING_TIMEOUT if SSP is disabled and no linkey exists. To avoid keeping ACL alive for too long after SDP search set ACL timeout back to HCI_DISCONN_TIMEOUT when L2CAP is connected. 2012-07-19 13:24:43.413521 < HCI Command: Create Connection (0x01|0x0005) plen 13 bdaddr 00:02:72:D6:6A:3F ptype 0xcc18 rswitch 0x01 clkoffset 0x0000 Packet type: DM1 DM3 DM5 DH1 DH3 DH5 2012-07-19 13:24:43.425224 > HCI Event: Command Status (0x0f) plen 4 Create Connection (0x01|0x0005) status 0x00 ncmd 1 2012-07-19 13:24:43.885222 > HCI Event: Role Change (0x12) plen 8 status 0x00 bdaddr 00:02:72:D6:6A:3F role 0x01 Role: Slave 2012-07-19 13:24:44.054221 > HCI Event: Connect Complete (0x03) plen 11 status 0x00 handle 42 bdaddr 00:02:72:D6:6A:3F type ACL encrypt 0x00 2012-07-19 13:24:44.054313 < HCI Command: Read Remote Supported Features (0x01|0x001b) plen 2 handle 42 2012-07-19 13:24:44.055176 > HCI Event: Page Scan Repetition Mode Change (0x20) plen 7 bdaddr 00:02:72:D6:6A:3F mode 0 2012-07-19 13:24:44.056217 > HCI Event: Max Slots Change (0x1b) plen 3 handle 42 slots 5 2012-07-19 13:24:44.059218 > HCI Event: Command Status (0x0f) plen 4 Read Remote Supported Features (0x01|0x001b) status 0x00 ncmd 0 2012-07-19 13:24:44.062192 > HCI Event: Command Status (0x0f) plen 4 Unknown (0x00|0x0000) status 0x00 ncmd 1 2012-07-19 13:24:44.067219 > HCI Event: Read Remote Supported Features (0x0b) plen 11 status 0x00 handle 42 Features: 0xbf 0xfe 0xcf 0xfe 0xdb 0xff 0x7b 0x87 2012-07-19 13:24:44.067248 < HCI Command: Read Remote Extended Features (0x01|0x001c) plen 3 handle 42 page 1 2012-07-19 13:24:44.071217 > HCI Event: Command Status (0x0f) plen 4 Read Remote Extended Features (0x01|0x001c) status 0x00 ncmd 1 2012-07-19 13:24:44.076218 > HCI Event: Read Remote Extended Features (0x23) plen 13 status 0x00 handle 42 page 1 max 1 Features: 0x01 0x00 0x00 0x00 0x00 0x00 0x00 0x00 2012-07-19 13:24:44.076249 < HCI Command: Remote Name Request (0x01|0x0019) plen 10 bdaddr 00:02:72:D6:6A:3F mode 2 clkoffset 0x0000 2012-07-19 13:24:44.081218 > HCI Event: Command Status (0x0f) plen 4 Remote Name Request (0x01|0x0019) status 0x00 ncmd 1 2012-07-19 13:24:44.105214 > HCI Event: Remote Name Req Complete (0x07) plen 255 status 0x00 bdaddr 00:02:72:D6:6A:3F name 'uw000951-0' 2012-07-19 13:24:44.105284 < HCI Command: Authentication Requested (0x01|0x0011) plen 2 handle 42 2012-07-19 13:24:44.111207 > HCI Event: Command Status (0x0f) plen 4 Authentication Requested (0x01|0x0011) status 0x00 ncmd 1 2012-07-19 13:24:44.112220 > HCI Event: Link Key Request (0x17) plen 6 bdaddr 00:02:72:D6:6A:3F 2012-07-19 13:24:44.112249 < HCI Command: Link Key Request Negative Reply (0x01|0x000c) plen 6 bdaddr 00:02:72:D6:6A:3F 2012-07-19 13:24:44.115215 > HCI Event: Command Complete (0x0e) plen 10 Link Key Request Negative Reply (0x01|0x000c) ncmd 1 status 0x00 bdaddr 00:02:72:D6:6A:3F 2012-07-19 13:24:44.116215 > HCI Event: PIN Code Request (0x16) plen 6 bdaddr 00:02:72:D6:6A:3F 2012-07-19 13:24:48.099184 > HCI Event: Auth Complete (0x06) plen 3 status 0x13 handle 42 Error: Remote User Terminated Connection 2012-07-19 13:24:48.179182 > HCI Event: Disconn Complete (0x05) plen 4 status 0x00 handle 42 reason 0x13 Reason: Remote User Terminated Connection Cc: stable@vger.kernel.org Signed-off-by: Szymon Janc Acked-by: Johan Hedberg Signed-off-by: Gustavo Padovan --- net/bluetooth/hci_event.c | 7 ++++++- net/bluetooth/l2cap_core.c | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 41ff978a33f9..248632cec11d 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1762,7 +1762,12 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) if (conn->type == ACL_LINK) { conn->state = BT_CONFIG; hci_conn_hold(conn); - conn->disc_timeout = HCI_DISCONN_TIMEOUT; + + if (!conn->out && !hci_conn_ssp_enabled(conn) && + !hci_find_link_key(hdev, &ev->bdaddr)) + conn->disc_timeout = HCI_PAIRING_TIMEOUT; + else + conn->disc_timeout = HCI_DISCONN_TIMEOUT; } else conn->state = BT_CONNECTED; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index a8964db04bfb..daa149b7003c 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1181,6 +1181,7 @@ static void l2cap_le_conn_ready(struct l2cap_conn *conn) sk = chan->sk; hci_conn_hold(conn->hcon); + conn->hcon->disc_timeout = HCI_DISCONN_TIMEOUT; bacpy(&bt_sk(sk)->src, conn->src); bacpy(&bt_sk(sk)->dst, conn->dst); -- cgit v1.2.3-70-g09d2 From c810089c27e48b816181b454fcc493d19fdbc2ba Mon Sep 17 00:00:00 2001 From: Ram Malovany Date: Thu, 19 Jul 2012 10:26:09 +0300 Subject: Bluetooth: Fix using NULL inquiry entry If entry wasn't found in the hci_inquiry_cache_lookup_resolve do not resolve the name.This will fix a kernel crash when trying to use NULL pointer. Cc: stable@vger.kernel.org Signed-off-by: Ram Malovany Signed-off-by: Gustavo Padovan --- net/bluetooth/hci_event.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 248632cec11d..b64cfa213bd6 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1365,6 +1365,9 @@ static bool hci_resolve_next_name(struct hci_dev *hdev) return false; e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY, NAME_NEEDED); + if (!e) + return false; + if (hci_resolve_name(hdev, e) == 0) { e->name_state = NAME_PENDING; return true; -- cgit v1.2.3-70-g09d2 From 7cc8380eb10347016d95bf6f9d842c2ae6d12932 Mon Sep 17 00:00:00 2001 From: Ram Malovany Date: Thu, 19 Jul 2012 10:26:10 +0300 Subject: Bluetooth: Fix using a NULL inquiry cache entry If the device was not found in a list of found devices names of which are pending.This may happen in a case when HCI Remote Name Request was sent as a part of incoming connection establishment procedure. Hence there is no need to continue resolving a next name as it will be done upon receiving another Remote Name Request Complete Event. This will fix a kernel crash when trying to use this entry to resolve the next name. Cc: stable@vger.kernel.org Signed-off-by: Ram Malovany Signed-off-by: Gustavo Padovan --- net/bluetooth/hci_event.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index b64cfa213bd6..fe9a3d6d30b0 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1396,12 +1396,18 @@ static void hci_check_pending_name(struct hci_dev *hdev, struct hci_conn *conn, return; e = hci_inquiry_cache_lookup_resolve(hdev, bdaddr, NAME_PENDING); - if (e) { + /* If the device was not found in a list of found devices names of which + * are pending. there is no need to continue resolving a next name as it + * will be done upon receiving another Remote Name Request Complete + * Event */ + if (!e) + return; + + list_del(&e->list); + if (name) { e->name_state = NAME_KNOWN; - list_del(&e->list); - if (name) - mgmt_remote_name(hdev, bdaddr, ACL_LINK, 0x00, - e->data.rssi, name, name_len); + mgmt_remote_name(hdev, bdaddr, ACL_LINK, 0x00, + e->data.rssi, name, name_len); } if (hci_resolve_next_name(hdev)) -- cgit v1.2.3-70-g09d2 From c3e7c0d90b14a3e7ac091d24cef09efb516d587b Mon Sep 17 00:00:00 2001 From: Ram Malovany Date: Thu, 19 Jul 2012 10:26:11 +0300 Subject: Bluetooth: Set name_state to unknown when entry name is empty When the name of the given entry is empty , the state needs to be updated accordingly. Cc: stable@vger.kernel.org Signed-off-by: Ram Malovany Signed-off-by: Gustavo Padovan --- net/bluetooth/hci_event.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index fe9a3d6d30b0..715d7e33fba0 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1408,6 +1408,8 @@ static void hci_check_pending_name(struct hci_dev *hdev, struct hci_conn *conn, e->name_state = NAME_KNOWN; mgmt_remote_name(hdev, bdaddr, ACL_LINK, 0x00, e->data.rssi, name, name_len); + } else { + e->name_state = NAME_NOT_KNOWN; } if (hci_resolve_next_name(hdev)) -- cgit v1.2.3-70-g09d2 From d08fd0e712a834d4abb869c0215a702e290bc51e Mon Sep 17 00:00:00 2001 From: Andrei Emeltchenko Date: Thu, 19 Jul 2012 17:03:43 +0300 Subject: Bluetooth: smp: Fix possible NULL dereference smp_chan_create might return NULL so we need to check before dereferencing smp. Signed-off-by: Andrei Emeltchenko Signed-off-by: Gustavo Padovan --- net/bluetooth/smp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 16ef0dc85a0a..901a616c8083 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -579,8 +579,11 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb) if (!test_and_set_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) smp = smp_chan_create(conn); + else + smp = conn->smp_chan; - smp = conn->smp_chan; + if (!smp) + return SMP_UNSPECIFIED; smp->preq[0] = SMP_CMD_PAIRING_REQ; memcpy(&smp->preq[1], req, sizeof(*req)); -- cgit v1.2.3-70-g09d2 From 49dfbb9129c4edb318578de35cc45c555df37884 Mon Sep 17 00:00:00 2001 From: Jaganath Kanakkassery Date: Thu, 19 Jul 2012 12:54:04 +0530 Subject: Bluetooth: Fix socket not getting freed if l2cap channel create fails If l2cap_chan_create() fails then it will return from l2cap_sock_kill since zapped flag of sk is reset. Signed-off-by: Jaganath Kanakkassery Signed-off-by: Gustavo Padovan --- net/bluetooth/l2cap_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index a4bb27e8427e..b94abd30e6f9 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1174,7 +1174,7 @@ static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, int p chan = l2cap_chan_create(); if (!chan) { - l2cap_sock_kill(sk); + sk_free(sk); return NULL; } -- cgit v1.2.3-70-g09d2 From 4185392da4b4b494e51934c51b999b4df424afba Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 6 Aug 2012 15:49:47 -0700 Subject: openvswitch: Relax set header validation. When installing a flow with an action to set a particular field we need to validate that the packets that are part of the flow actually contain that header. With IP we use zeroed addresses and with TCP/UDP the check is for zeroed ports. This check is overly broad and can catch packets like DHCP requests that have a zero source address in a legitimate header. This changes the check to look for a zeroed protocol number for IP or for both ports be zero for TCP/UDP before considering the header to not exist. Reported-by: Ethan Jackson Signed-off-by: Jesse Gross --- net/openvswitch/datapath.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index d8277d29e710..cf58cedad083 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -425,10 +425,10 @@ static int validate_sample(const struct nlattr *attr, static int validate_tp_port(const struct sw_flow_key *flow_key) { if (flow_key->eth.type == htons(ETH_P_IP)) { - if (flow_key->ipv4.tp.src && flow_key->ipv4.tp.dst) + if (flow_key->ipv4.tp.src || flow_key->ipv4.tp.dst) return 0; } else if (flow_key->eth.type == htons(ETH_P_IPV6)) { - if (flow_key->ipv6.tp.src && flow_key->ipv6.tp.dst) + if (flow_key->ipv6.tp.src || flow_key->ipv6.tp.dst) return 0; } @@ -460,7 +460,7 @@ static int validate_set(const struct nlattr *a, if (flow_key->eth.type != htons(ETH_P_IP)) return -EINVAL; - if (!flow_key->ipv4.addr.src || !flow_key->ipv4.addr.dst) + if (!flow_key->ip.proto) return -EINVAL; ipv4_key = nla_data(ovs_key); -- cgit v1.2.3-70-g09d2 From e9324b2ce656e1910d2385b9b47a2f926456dbe3 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 9 Aug 2012 10:08:45 +0000 Subject: netfilter: nf_ct_sip: fix helper name Commit 3a8fc53a (netfilter: nf_ct_helper: allocate 16 bytes for the helper and policy names) introduced a bug in the SIP helper, the helper name is sprinted to the sip_names array instead of instead of into the helper structure. This breaks the helper match and the /proc/net/nf_conntrack_expect output. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_sip.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 758a1bacc126..2fb666920cc9 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1515,7 +1515,6 @@ static int sip_help_udp(struct sk_buff *skb, unsigned int protoff, } static struct nf_conntrack_helper sip[MAX_PORTS][4] __read_mostly; -static char sip_names[MAX_PORTS][4][sizeof("sip-65535")] __read_mostly; static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1] = { [SIP_EXPECT_SIGNALLING] = { @@ -1585,9 +1584,9 @@ static int __init nf_conntrack_sip_init(void) sip[i][j].me = THIS_MODULE; if (ports[i] == SIP_PORT) - sprintf(sip_names[i][j], "sip"); + sprintf(sip[i][j].name, "sip"); else - sprintf(sip_names[i][j], "sip-%u", i); + sprintf(sip[i][j].name, "sip-%u", i); pr_debug("port #%u: %u\n", i, ports[i]); -- cgit v1.2.3-70-g09d2 From 02b69cbdc2fb2e1bfbfd9ac0c246d7be1b08d3cd Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 9 Aug 2012 10:08:46 +0000 Subject: netfilter: nf_ct_sip: fix IPv6 address parsing Within SIP messages IPv6 addresses are enclosed in square brackets in most cases, with the exception of the "received=" header parameter. Currently the helper fails to parse enclosed addresses. This patch: - changes the SIP address parsing function to enforce square brackets when required, and accept them when not required but present, as recommended by RFC 5118. - adds a new SDP address parsing function that never accepts square brackets since SDP doesn't use them. With these changes, the SIP helper correctly parses all test messages from RFC 5118 (Session Initiation Protocol (SIP) Torture Test Messages for Internet Protocol Version 6 (IPv6)). Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nf_conntrack_sip.h | 2 +- net/ipv4/netfilter/nf_nat_sip.c | 4 +- net/netfilter/nf_conntrack_sip.c | 87 ++++++++++++++++++++++++------ 3 files changed, 73 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter/nf_conntrack_sip.h b/include/linux/netfilter/nf_conntrack_sip.h index 0dfc8b7210a3..89f2a627f3f0 100644 --- a/include/linux/netfilter/nf_conntrack_sip.h +++ b/include/linux/netfilter/nf_conntrack_sip.h @@ -164,7 +164,7 @@ extern int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr unsigned int dataoff, unsigned int datalen, const char *name, unsigned int *matchoff, unsigned int *matchlen, - union nf_inet_addr *addr); + union nf_inet_addr *addr, bool delim); extern int ct_sip_parse_numerical_param(const struct nf_conn *ct, const char *dptr, unsigned int off, unsigned int datalen, const char *name, diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index ea4a23813d26..eef8f29e8bf8 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -173,7 +173,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff, * the reply. */ if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen, "maddr=", &poff, &plen, - &addr) > 0 && + &addr, true) > 0 && addr.ip == ct->tuplehash[dir].tuple.src.u3.ip && addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) { buflen = sprintf(buffer, "%pI4", @@ -187,7 +187,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff, * from which the server received the request. */ if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen, "received=", &poff, &plen, - &addr) > 0 && + &addr, false) > 0 && addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip && addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { buflen = sprintf(buffer, "%pI4", diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 2fb666920cc9..5c0a112aeee6 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -183,12 +183,12 @@ static int media_len(const struct nf_conn *ct, const char *dptr, return len + digits_len(ct, dptr, limit, shift); } -static int parse_addr(const struct nf_conn *ct, const char *cp, - const char **endp, union nf_inet_addr *addr, - const char *limit) +static int sip_parse_addr(const struct nf_conn *ct, const char *cp, + const char **endp, union nf_inet_addr *addr, + const char *limit, bool delim) { const char *end; - int ret = 0; + int ret; if (!ct) return 0; @@ -197,16 +197,28 @@ static int parse_addr(const struct nf_conn *ct, const char *cp, switch (nf_ct_l3num(ct)) { case AF_INET: ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end); + if (ret == 0) + return 0; break; case AF_INET6: + if (cp < limit && *cp == '[') + cp++; + else if (delim) + return 0; + ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end); + if (ret == 0) + return 0; + + if (end < limit && *end == ']') + end++; + else if (delim) + return 0; break; default: BUG(); } - if (ret == 0 || end == cp) - return 0; if (endp) *endp = end; return 1; @@ -219,7 +231,7 @@ static int epaddr_len(const struct nf_conn *ct, const char *dptr, union nf_inet_addr addr; const char *aux = dptr; - if (!parse_addr(ct, dptr, &dptr, &addr, limit)) { + if (!sip_parse_addr(ct, dptr, &dptr, &addr, limit, true)) { pr_debug("ip: %s parse failed.!\n", dptr); return 0; } @@ -296,7 +308,7 @@ int ct_sip_parse_request(const struct nf_conn *ct, return 0; dptr += shift; - if (!parse_addr(ct, dptr, &end, addr, limit)) + if (!sip_parse_addr(ct, dptr, &end, addr, limit, true)) return -1; if (end < limit && *end == ':') { end++; @@ -550,7 +562,7 @@ int ct_sip_parse_header_uri(const struct nf_conn *ct, const char *dptr, if (ret == 0) return ret; - if (!parse_addr(ct, dptr + *matchoff, &c, addr, limit)) + if (!sip_parse_addr(ct, dptr + *matchoff, &c, addr, limit, true)) return -1; if (*c == ':') { c++; @@ -599,7 +611,7 @@ int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr, unsigned int dataoff, unsigned int datalen, const char *name, unsigned int *matchoff, unsigned int *matchlen, - union nf_inet_addr *addr) + union nf_inet_addr *addr, bool delim) { const char *limit = dptr + datalen; const char *start, *end; @@ -613,7 +625,7 @@ int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr, return 0; start += strlen(name); - if (!parse_addr(ct, start, &end, addr, limit)) + if (!sip_parse_addr(ct, start, &end, addr, limit, delim)) return 0; *matchoff = start - dptr; *matchlen = end - start; @@ -675,6 +687,47 @@ static int ct_sip_parse_transport(struct nf_conn *ct, const char *dptr, return 1; } +static int sdp_parse_addr(const struct nf_conn *ct, const char *cp, + const char **endp, union nf_inet_addr *addr, + const char *limit) +{ + const char *end; + int ret; + + memset(addr, 0, sizeof(*addr)); + switch (nf_ct_l3num(ct)) { + case AF_INET: + ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end); + break; + case AF_INET6: + ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end); + break; + default: + BUG(); + } + + if (ret == 0) + return 0; + if (endp) + *endp = end; + return 1; +} + +/* skip ip address. returns its length. */ +static int sdp_addr_len(const struct nf_conn *ct, const char *dptr, + const char *limit, int *shift) +{ + union nf_inet_addr addr; + const char *aux = dptr; + + if (!sdp_parse_addr(ct, dptr, &dptr, &addr, limit)) { + pr_debug("ip: %s parse failed.!\n", dptr); + return 0; + } + + return dptr - aux; +} + /* SDP header parsing: a SDP session description contains an ordered set of * headers, starting with a section containing general session parameters, * optionally followed by multiple media descriptions. @@ -686,10 +739,10 @@ static int ct_sip_parse_transport(struct nf_conn *ct, const char *dptr, */ static const struct sip_header ct_sdp_hdrs[] = { [SDP_HDR_VERSION] = SDP_HDR("v=", NULL, digits_len), - [SDP_HDR_OWNER_IP4] = SDP_HDR("o=", "IN IP4 ", epaddr_len), - [SDP_HDR_CONNECTION_IP4] = SDP_HDR("c=", "IN IP4 ", epaddr_len), - [SDP_HDR_OWNER_IP6] = SDP_HDR("o=", "IN IP6 ", epaddr_len), - [SDP_HDR_CONNECTION_IP6] = SDP_HDR("c=", "IN IP6 ", epaddr_len), + [SDP_HDR_OWNER_IP4] = SDP_HDR("o=", "IN IP4 ", sdp_addr_len), + [SDP_HDR_CONNECTION_IP4] = SDP_HDR("c=", "IN IP4 ", sdp_addr_len), + [SDP_HDR_OWNER_IP6] = SDP_HDR("o=", "IN IP6 ", sdp_addr_len), + [SDP_HDR_CONNECTION_IP6] = SDP_HDR("c=", "IN IP6 ", sdp_addr_len), [SDP_HDR_MEDIA] = SDP_HDR("m=", NULL, media_len), }; @@ -775,8 +828,8 @@ static int ct_sip_parse_sdp_addr(const struct nf_conn *ct, const char *dptr, if (ret <= 0) return ret; - if (!parse_addr(ct, dptr + *matchoff, NULL, addr, - dptr + *matchoff + *matchlen)) + if (!sdp_parse_addr(ct, dptr + *matchoff, NULL, addr, + dptr + *matchoff + *matchlen)) return -1; return 1; } -- cgit v1.2.3-70-g09d2 From f22eb25cf5b1157b29ef88c793b71972efc47143 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 9 Aug 2012 10:08:47 +0000 Subject: netfilter: nf_nat_sip: fix via header translation with multiple parameters Via-headers are parsed beginning at the first character after the Via-address. When the address is translated first and its length decreases, the offset to start parsing at is incorrect and header parameters might be missed. Update the offset after translating the Via-address to fix this. Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_nat_sip.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index eef8f29e8bf8..4ad9cf173992 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -148,7 +148,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff, if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, hdr, NULL, &matchoff, &matchlen, &addr, &port) > 0) { - unsigned int matchend, poff, plen, buflen, n; + unsigned int olen, matchend, poff, plen, buflen, n; char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; /* We're only interested in headers related to this @@ -163,11 +163,12 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff, goto next; } + olen = *datalen; if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, &addr, port)) return NF_DROP; - matchend = matchoff + matchlen; + matchend = matchoff + matchlen + *datalen - olen; /* The maddr= parameter (RFC 2361) specifies where to send * the reply. */ -- cgit v1.2.3-70-g09d2 From 68e035c950dbceaf660144bf74054dfdfb6aad15 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 14 Aug 2012 12:47:37 +0200 Subject: netfilter: ctnetlink: fix missing locking while changing conntrack from nfqueue Since 9cb017665 netfilter: add glue code to integrate nfnetlink_queue and ctnetlink, we can modify the conntrack entry via nfnl_queue. However, the change of the conntrack entry via nfnetlink_queue requires appropriate locking to avoid concurrent updates. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 14f67a2cbcb5..da4fc37a8578 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1896,10 +1896,15 @@ static int ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct) { struct nlattr *cda[CTA_MAX+1]; + int ret; nla_parse_nested(cda, CTA_MAX, attr, ct_nla_policy); - return ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct); + spin_lock_bh(&nf_conntrack_lock); + ret = ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct); + spin_unlock_bh(&nf_conntrack_lock); + + return ret; } static struct nfq_ct_hook ctnetlink_nfqueue_hook = { -- cgit v1.2.3-70-g09d2 From 47be03a28cc6c80e3aa2b3e8ed6d960ff0c5c0af Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:37 +0000 Subject: netpoll: use GFP_ATOMIC in slave_enable_netpoll() and __netpoll_setup() slave_enable_netpoll() and __netpoll_setup() may be called with read_lock() held, so should use GFP_ATOMIC to allocate memory. Eric suggested to pass gfp flags to __netpoll_setup(). Cc: Eric Dumazet Cc: "David S. Miller" Reported-by: Dan Carpenter Signed-off-by: Eric Dumazet Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 6 +++--- drivers/net/team/team.c | 16 +++++++++------- include/linux/netdevice.h | 3 ++- include/linux/netpoll.h | 2 +- net/8021q/vlan_dev.c | 7 ++++--- net/bridge/br_device.c | 12 ++++++------ net/bridge/br_if.c | 2 +- net/bridge/br_private.h | 4 ++-- net/core/netpoll.c | 8 ++++---- 9 files changed, 32 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 6fae5f3ec7f6..8697136e27c0 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1235,12 +1235,12 @@ static inline int slave_enable_netpoll(struct slave *slave) struct netpoll *np; int err = 0; - np = kzalloc(sizeof(*np), GFP_KERNEL); + np = kzalloc(sizeof(*np), GFP_ATOMIC); err = -ENOMEM; if (!np) goto out; - err = __netpoll_setup(np, slave->dev); + err = __netpoll_setup(np, slave->dev, GFP_ATOMIC); if (err) { kfree(np); goto out; @@ -1292,7 +1292,7 @@ static void bond_netpoll_cleanup(struct net_device *bond_dev) read_unlock(&bond->lock); } -static int bond_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) +static int bond_netpoll_setup(struct net_device *dev, struct netpoll_info *ni, gfp_t gfp) { struct bonding *bond = netdev_priv(dev); struct slave *slave; diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 87707ab39430..341b65dbbcd3 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -795,16 +795,17 @@ static void team_port_leave(struct team *team, struct team_port *port) } #ifdef CONFIG_NET_POLL_CONTROLLER -static int team_port_enable_netpoll(struct team *team, struct team_port *port) +static int team_port_enable_netpoll(struct team *team, struct team_port *port, + gfp_t gfp) { struct netpoll *np; int err; - np = kzalloc(sizeof(*np), GFP_KERNEL); + np = kzalloc(sizeof(*np), gfp); if (!np) return -ENOMEM; - err = __netpoll_setup(np, port->dev); + err = __netpoll_setup(np, port->dev, gfp); if (err) { kfree(np); return err; @@ -833,7 +834,8 @@ static struct netpoll_info *team_netpoll_info(struct team *team) } #else -static int team_port_enable_netpoll(struct team *team, struct team_port *port) +static int team_port_enable_netpoll(struct team *team, struct team_port *port, + gfp_t gfp) { return 0; } @@ -913,7 +915,7 @@ static int team_port_add(struct team *team, struct net_device *port_dev) } if (team_netpoll_info(team)) { - err = team_port_enable_netpoll(team, port); + err = team_port_enable_netpoll(team, port, GFP_KERNEL); if (err) { netdev_err(dev, "Failed to enable netpoll on device %s\n", portname); @@ -1443,7 +1445,7 @@ static void team_netpoll_cleanup(struct net_device *dev) } static int team_netpoll_setup(struct net_device *dev, - struct netpoll_info *npifo) + struct netpoll_info *npifo, gfp_t gfp) { struct team *team = netdev_priv(dev); struct team_port *port; @@ -1451,7 +1453,7 @@ static int team_netpoll_setup(struct net_device *dev, mutex_lock(&team->lock); list_for_each_entry(port, &team->port_list, list) { - err = team_port_enable_netpoll(team, port); + err = team_port_enable_netpoll(team, port, gfp); if (err) { __team_netpoll_cleanup(team); break; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index a9db4f33407f..3560d688161e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -953,7 +953,8 @@ struct net_device_ops { #ifdef CONFIG_NET_POLL_CONTROLLER void (*ndo_poll_controller)(struct net_device *dev); int (*ndo_netpoll_setup)(struct net_device *dev, - struct netpoll_info *info); + struct netpoll_info *info, + gfp_t gfp); void (*ndo_netpoll_cleanup)(struct net_device *dev); #endif int (*ndo_set_vf_mac)(struct net_device *dev, diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 28f5389c924b..bf2d51eec0f3 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -43,7 +43,7 @@ struct netpoll_info { void netpoll_send_udp(struct netpoll *np, const char *msg, int len); void netpoll_print_options(struct netpoll *np); int netpoll_parse_options(struct netpoll *np, char *opt); -int __netpoll_setup(struct netpoll *np, struct net_device *ndev); +int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp); int netpoll_setup(struct netpoll *np); int netpoll_trap(void); void netpoll_set_trap(int trap); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 73a2a83ee2da..ee4ae0944cef 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -669,19 +669,20 @@ static void vlan_dev_poll_controller(struct net_device *dev) return; } -static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo) +static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo, + gfp_t gfp) { struct vlan_dev_priv *info = vlan_dev_priv(dev); struct net_device *real_dev = info->real_dev; struct netpoll *netpoll; int err = 0; - netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); + netpoll = kzalloc(sizeof(*netpoll), gfp); err = -ENOMEM; if (!netpoll) goto out; - err = __netpoll_setup(netpoll, real_dev); + err = __netpoll_setup(netpoll, real_dev, gfp); if (err) { kfree(netpoll); goto out; diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 333484537600..ed0e0f9dc788 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -213,7 +213,8 @@ static void br_netpoll_cleanup(struct net_device *dev) } } -static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) +static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni, + gfp_t gfp) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *p, *n; @@ -222,8 +223,7 @@ static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) list_for_each_entry_safe(p, n, &br->port_list, list) { if (!p->dev) continue; - - err = br_netpoll_enable(p); + err = br_netpoll_enable(p, gfp); if (err) goto fail; } @@ -236,17 +236,17 @@ fail: goto out; } -int br_netpoll_enable(struct net_bridge_port *p) +int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp) { struct netpoll *np; int err = 0; - np = kzalloc(sizeof(*p->np), GFP_KERNEL); + np = kzalloc(sizeof(*p->np), gfp); err = -ENOMEM; if (!np) goto out; - err = __netpoll_setup(np, p->dev); + err = __netpoll_setup(np, p->dev, gfp); if (err) { kfree(np); goto out; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index e1144e1617be..171fd6b9bfe6 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -361,7 +361,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) if (err) goto err2; - if (br_netpoll_info(br) && ((err = br_netpoll_enable(p)))) + if (br_netpoll_info(br) && ((err = br_netpoll_enable(p, GFP_KERNEL)))) goto err3; err = netdev_set_master(dev, br->dev); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index a768b2408edf..f507d2af9646 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -316,7 +316,7 @@ static inline void br_netpoll_send_skb(const struct net_bridge_port *p, netpoll_send_skb(np, skb); } -extern int br_netpoll_enable(struct net_bridge_port *p); +extern int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp); extern void br_netpoll_disable(struct net_bridge_port *p); #else static inline struct netpoll_info *br_netpoll_info(struct net_bridge *br) @@ -329,7 +329,7 @@ static inline void br_netpoll_send_skb(const struct net_bridge_port *p, { } -static inline int br_netpoll_enable(struct net_bridge_port *p) +static inline int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp) { return 0; } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index b4c90e42b443..37cc854774a4 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -715,7 +715,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt) } EXPORT_SYMBOL(netpoll_parse_options); -int __netpoll_setup(struct netpoll *np, struct net_device *ndev) +int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp) { struct netpoll_info *npinfo; const struct net_device_ops *ops; @@ -734,7 +734,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) } if (!ndev->npinfo) { - npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); + npinfo = kmalloc(sizeof(*npinfo), gfp); if (!npinfo) { err = -ENOMEM; goto out; @@ -752,7 +752,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) ops = np->dev->netdev_ops; if (ops->ndo_netpoll_setup) { - err = ops->ndo_netpoll_setup(ndev, npinfo); + err = ops->ndo_netpoll_setup(ndev, npinfo, gfp); if (err) goto free_npinfo; } @@ -857,7 +857,7 @@ int netpoll_setup(struct netpoll *np) refill_skbs(); rtnl_lock(); - err = __netpoll_setup(np, ndev); + err = __netpoll_setup(np, ndev, GFP_KERNEL); rtnl_unlock(); if (err) -- cgit v1.2.3-70-g09d2 From 38e6bc185d9544dfad1774b3f8902a0b061aea25 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:38 +0000 Subject: netpoll: make __netpoll_cleanup non-block Like the previous patch, slave_disable_netpoll() and __netpoll_cleanup() may be called with read_lock() held too, so we should make them non-block, by moving the cleanup and kfree() to call_rcu_bh() callbacks. Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 4 +--- include/linux/netpoll.h | 3 +++ net/8021q/vlan_dev.c | 6 +----- net/bridge/br_device.c | 6 +----- net/core/netpoll.c | 42 +++++++++++++++++++++++++++++++---------- 5 files changed, 38 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 8697136e27c0..e42891683e3b 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1257,9 +1257,7 @@ static inline void slave_disable_netpoll(struct slave *slave) return; slave->np = NULL; - synchronize_rcu_bh(); - __netpoll_cleanup(np); - kfree(np); + __netpoll_free_rcu(np); } static inline bool slave_dev_support_netpoll(struct net_device *slave_dev) { diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index bf2d51eec0f3..907812efb4d9 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -23,6 +23,7 @@ struct netpoll { u8 remote_mac[ETH_ALEN]; struct list_head rx; /* rx_np list element */ + struct rcu_head rcu; }; struct netpoll_info { @@ -38,6 +39,7 @@ struct netpoll_info { struct delayed_work tx_work; struct netpoll *netpoll; + struct rcu_head rcu; }; void netpoll_send_udp(struct netpoll *np, const char *msg, int len); @@ -48,6 +50,7 @@ int netpoll_setup(struct netpoll *np); int netpoll_trap(void); void netpoll_set_trap(int trap); void __netpoll_cleanup(struct netpoll *np); +void __netpoll_free_rcu(struct netpoll *np); void netpoll_cleanup(struct netpoll *np); int __netpoll_rx(struct sk_buff *skb); void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index ee4ae0944cef..b65623f90660 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -704,11 +704,7 @@ static void vlan_dev_netpoll_cleanup(struct net_device *dev) info->netpoll = NULL; - /* Wait for transmitting packets to finish before freeing. */ - synchronize_rcu_bh(); - - __netpoll_cleanup(netpoll); - kfree(netpoll); + __netpoll_free_rcu(netpoll); } #endif /* CONFIG_NET_POLL_CONTROLLER */ diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index ed0e0f9dc788..f41ba4048c9a 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -267,11 +267,7 @@ void br_netpoll_disable(struct net_bridge_port *p) p->np = NULL; - /* Wait for transmitting packets to finish before freeing. */ - synchronize_rcu_bh(); - - __netpoll_cleanup(np); - kfree(np); + __netpoll_free_rcu(np); } #endif diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 37cc854774a4..dc17f1db1479 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -878,6 +878,24 @@ static int __init netpoll_init(void) } core_initcall(netpoll_init); +static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head) +{ + struct netpoll_info *npinfo = + container_of(rcu_head, struct netpoll_info, rcu); + + skb_queue_purge(&npinfo->arp_tx); + skb_queue_purge(&npinfo->txq); + + /* we can't call cancel_delayed_work_sync here, as we are in softirq */ + cancel_delayed_work(&npinfo->tx_work); + + /* clean after last, unfinished work */ + __skb_queue_purge(&npinfo->txq); + /* now cancel it again */ + cancel_delayed_work(&npinfo->tx_work); + kfree(npinfo); +} + void __netpoll_cleanup(struct netpoll *np) { struct netpoll_info *npinfo; @@ -903,20 +921,24 @@ void __netpoll_cleanup(struct netpoll *np) ops->ndo_netpoll_cleanup(np->dev); RCU_INIT_POINTER(np->dev->npinfo, NULL); + call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info); + } +} +EXPORT_SYMBOL_GPL(__netpoll_cleanup); - /* avoid racing with NAPI reading npinfo */ - synchronize_rcu_bh(); +static void rcu_cleanup_netpoll(struct rcu_head *rcu_head) +{ + struct netpoll *np = container_of(rcu_head, struct netpoll, rcu); - skb_queue_purge(&npinfo->arp_tx); - skb_queue_purge(&npinfo->txq); - cancel_delayed_work_sync(&npinfo->tx_work); + __netpoll_cleanup(np); + kfree(np); +} - /* clean after last, unfinished work */ - __skb_queue_purge(&npinfo->txq); - kfree(npinfo); - } +void __netpoll_free_rcu(struct netpoll *np) +{ + call_rcu_bh(&np->rcu, rcu_cleanup_netpoll); } -EXPORT_SYMBOL_GPL(__netpoll_cleanup); +EXPORT_SYMBOL_GPL(__netpoll_free_rcu); void netpoll_cleanup(struct netpoll *np) { -- cgit v1.2.3-70-g09d2 From 57c5d46191e75312934c00eba65b13a31ca95120 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:40 +0000 Subject: netpoll: take rcu_read_lock_bh() in netpoll_rx() In __netpoll_rx(), it dereferences ->npinfo without rcu_dereference_bh(), this patch fixes it by using the 'npinfo' passed from netpoll_rx() where it is already dereferenced with rcu_dereference_bh(). Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/linux/netpoll.h | 4 ++-- net/core/netpoll.c | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 907812efb4d9..5d881c388273 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -52,7 +52,7 @@ void netpoll_set_trap(int trap); void __netpoll_cleanup(struct netpoll *np); void __netpoll_free_rcu(struct netpoll *np); void netpoll_cleanup(struct netpoll *np); -int __netpoll_rx(struct sk_buff *skb); +int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo); void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, struct net_device *dev); static inline void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) @@ -77,7 +77,7 @@ static inline bool netpoll_rx(struct sk_buff *skb) spin_lock(&npinfo->rx_lock); /* check rx_flags again with the lock held */ - if (npinfo->rx_flags && __netpoll_rx(skb)) + if (npinfo->rx_flags && __netpoll_rx(skb, npinfo)) ret = true; spin_unlock(&npinfo->rx_lock); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index dc17f1db1479..d055bb01328b 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -543,13 +543,12 @@ static void arp_reply(struct sk_buff *skb) spin_unlock_irqrestore(&npinfo->rx_lock, flags); } -int __netpoll_rx(struct sk_buff *skb) +int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo) { int proto, len, ulen; int hits = 0; const struct iphdr *iph; struct udphdr *uh; - struct netpoll_info *npinfo = skb->dev->npinfo; struct netpoll *np, *tmp; if (list_empty(&npinfo->rx_np)) -- cgit v1.2.3-70-g09d2 From 2899656b494dcd118123af1126826b115c8ea6f9 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:42 +0000 Subject: netpoll: take rcu_read_lock_bh() in netpoll_send_skb_on_dev() This patch fixes several problems in the call path of netpoll_send_skb_on_dev(): 1. Disable IRQ's before calling netpoll_send_skb_on_dev(). 2. All the callees of netpoll_send_skb_on_dev() should use rcu_dereference_bh() to dereference ->npinfo. 3. Rename arp_reply() to netpoll_arp_reply(), the former is too generic. Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/linux/netpoll.h | 3 +++ net/core/netpoll.c | 31 +++++++++++++++++-------------- 2 files changed, 20 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 2d178baa49df..61aee86cf21d 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -57,7 +57,10 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, struct net_device *dev); static inline void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { + unsigned long flags; + local_irq_save(flags); netpoll_send_skb_on_dev(np, skb, np->dev); + local_irq_restore(flags); } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index d055bb01328b..174346ac15a0 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -54,7 +54,7 @@ static atomic_t trapped; MAX_UDP_CHUNK) static void zap_completion_queue(void); -static void arp_reply(struct sk_buff *skb); +static void netpoll_arp_reply(struct sk_buff *skb, struct netpoll_info *npinfo); static unsigned int carrier_timeout = 4; module_param(carrier_timeout, uint, 0644); @@ -170,7 +170,8 @@ static void poll_napi(struct net_device *dev) list_for_each_entry(napi, &dev->napi_list, dev_list) { if (napi->poll_owner != smp_processor_id() && spin_trylock(&napi->poll_lock)) { - budget = poll_one_napi(dev->npinfo, napi, budget); + budget = poll_one_napi(rcu_dereference_bh(dev->npinfo), + napi, budget); spin_unlock(&napi->poll_lock); if (!budget) @@ -185,13 +186,14 @@ static void service_arp_queue(struct netpoll_info *npi) struct sk_buff *skb; while ((skb = skb_dequeue(&npi->arp_tx))) - arp_reply(skb); + netpoll_arp_reply(skb, npi); } } static void netpoll_poll_dev(struct net_device *dev) { const struct net_device_ops *ops; + struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo); if (!dev || !netif_running(dev)) return; @@ -206,17 +208,18 @@ static void netpoll_poll_dev(struct net_device *dev) poll_napi(dev); if (dev->flags & IFF_SLAVE) { - if (dev->npinfo) { + if (ni) { struct net_device *bond_dev = dev->master; struct sk_buff *skb; - while ((skb = skb_dequeue(&dev->npinfo->arp_tx))) { + struct netpoll_info *bond_ni = rcu_dereference_bh(bond_dev->npinfo); + while ((skb = skb_dequeue(&ni->arp_tx))) { skb->dev = bond_dev; - skb_queue_tail(&bond_dev->npinfo->arp_tx, skb); + skb_queue_tail(&bond_ni->arp_tx, skb); } } } - service_arp_queue(dev->npinfo); + service_arp_queue(ni); zap_completion_queue(); } @@ -302,6 +305,7 @@ static int netpoll_owner_active(struct net_device *dev) return 0; } +/* call with IRQ disabled */ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, struct net_device *dev) { @@ -309,8 +313,11 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, unsigned long tries; const struct net_device_ops *ops = dev->netdev_ops; /* It is up to the caller to keep npinfo alive. */ - struct netpoll_info *npinfo = np->dev->npinfo; + struct netpoll_info *npinfo; + + WARN_ON_ONCE(!irqs_disabled()); + npinfo = rcu_dereference_bh(np->dev->npinfo); if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { __kfree_skb(skb); return; @@ -319,11 +326,9 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, /* don't get messages out of order, and no recursion */ if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) { struct netdev_queue *txq; - unsigned long flags; txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb)); - local_irq_save(flags); /* try until next clock tick */ for (tries = jiffies_to_usecs(1)/USEC_PER_POLL; tries > 0; --tries) { @@ -347,10 +352,9 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, } WARN_ONCE(!irqs_disabled(), - "netpoll_send_skb(): %s enabled interrupts in poll (%pF)\n", + "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n", dev->name, ops->ndo_start_xmit); - local_irq_restore(flags); } if (status != NETDEV_TX_OK) { @@ -423,9 +427,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) } EXPORT_SYMBOL(netpoll_send_udp); -static void arp_reply(struct sk_buff *skb) +static void netpoll_arp_reply(struct sk_buff *skb, struct netpoll_info *npinfo) { - struct netpoll_info *npinfo = skb->dev->npinfo; struct arphdr *arp; unsigned char *arp_ptr; int size, type = ARPOP_REPLY, ptype = ETH_P_ARP; -- cgit v1.2.3-70-g09d2 From d30362c0712eb567334b3b66de7c40d4372f2c6f Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:43 +0000 Subject: bridge: add some comments for NETDEV_RELEASE Add comments on why we don't notify NETDEV_RELEASE. Cc: David Miller Cc: Stephen Hemminger Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/bridge/br_if.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 171fd6b9bfe6..1c8fdc3558cd 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -427,6 +427,10 @@ int br_del_if(struct net_bridge *br, struct net_device *dev) if (!p || p->br != br) return -EINVAL; + /* Since more than one interface can be attached to a bridge, + * there still maybe an alternate path for netconsole to use; + * therefore there is no reason for a NETDEV_RELEASE event. + */ del_nbp(p); spin_lock_bh(&br->lock); -- cgit v1.2.3-70-g09d2 From 4e3828c4bfd90b00a951cad7c8da27d1966beefe Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:44 +0000 Subject: bridge: use list_for_each_entry() in netpoll functions We don't delete 'p' from the list in the loop, so we can just use list_for_each_entry(). Cc: David Miller Cc: Stephen Hemminger Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/bridge/br_device.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index f41ba4048c9a..32211fa5b506 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -206,21 +206,20 @@ static void br_poll_controller(struct net_device *br_dev) static void br_netpoll_cleanup(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); - struct net_bridge_port *p, *n; + struct net_bridge_port *p; - list_for_each_entry_safe(p, n, &br->port_list, list) { + list_for_each_entry(p, &br->port_list, list) br_netpoll_disable(p); - } } static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni, gfp_t gfp) { struct net_bridge *br = netdev_priv(dev); - struct net_bridge_port *p, *n; + struct net_bridge_port *p; int err = 0; - list_for_each_entry_safe(p, n, &br->port_list, list) { + list_for_each_entry(p, &br->port_list, list) { if (!p->dev) continue; err = br_netpoll_enable(p, gfp); -- cgit v1.2.3-70-g09d2 From e15c3c2294605f09f9b336b2f3b97086ab4b8145 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:45 +0000 Subject: netpoll: check netpoll tx status on the right device Although this doesn't matter actually, because netpoll_tx_running() doesn't use the parameter, the code will be more readable. For team_dev_queue_xmit() we have to move it down to avoid compile errors. Cc: David Miller Signed-off-by: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 2 +- include/linux/if_team.h | 30 +++++++++++++++--------------- net/bridge/br_forward.c | 2 +- 3 files changed, 17 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e42891683e3b..d688a8af432c 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -398,7 +398,7 @@ int bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping)); skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping; - if (unlikely(netpoll_tx_running(slave_dev))) + if (unlikely(netpoll_tx_running(bond->dev))) bond_netpoll_send_skb(bond_get_slave_by_dev(bond, slave_dev), skb); else dev_queue_xmit(skb); diff --git a/include/linux/if_team.h b/include/linux/if_team.h index 6960fc1841a7..aa2e167e1ef4 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -96,21 +96,6 @@ static inline void team_netpoll_send_skb(struct team_port *port, } #endif -static inline int team_dev_queue_xmit(struct team *team, struct team_port *port, - struct sk_buff *skb) -{ - BUILD_BUG_ON(sizeof(skb->queue_mapping) != - sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping)); - skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping); - - skb->dev = port->dev; - if (unlikely(netpoll_tx_running(port->dev))) { - team_netpoll_send_skb(port, skb); - return 0; - } - return dev_queue_xmit(skb); -} - struct team_mode_ops { int (*init)(struct team *team); void (*exit)(struct team *team); @@ -200,6 +185,21 @@ struct team { long mode_priv[TEAM_MODE_PRIV_LONGS]; }; +static inline int team_dev_queue_xmit(struct team *team, struct team_port *port, + struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(skb->queue_mapping) != + sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping)); + skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping); + + skb->dev = port->dev; + if (unlikely(netpoll_tx_running(team->dev))) { + team_netpoll_send_skb(port, skb); + return 0; + } + return dev_queue_xmit(skb); +} + static inline struct hlist_head *team_port_index_hash(struct team *team, int port_index) { diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index e9466d412707..02015a505d2a 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -65,7 +65,7 @@ static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) { skb->dev = to->dev; - if (unlikely(netpoll_tx_running(to->dev))) { + if (unlikely(netpoll_tx_running(to->br->dev))) { if (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb)) kfree_skb(skb); else { -- cgit v1.2.3-70-g09d2 From f3da38932b46d1bf171634cc7aae3da405eaf7a6 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:47 +0000 Subject: vlan: clean up some variable names To be consistent, s/info/vlan/. Cc: Benjamin LaHaise Cc: Patrick McHardy Cc: David Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/8021q/vlan_dev.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index b65623f90660..0347d48aa7c9 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -672,8 +672,8 @@ static void vlan_dev_poll_controller(struct net_device *dev) static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo, gfp_t gfp) { - struct vlan_dev_priv *info = vlan_dev_priv(dev); - struct net_device *real_dev = info->real_dev; + struct vlan_dev_priv *vlan = vlan_dev_priv(dev); + struct net_device *real_dev = vlan->real_dev; struct netpoll *netpoll; int err = 0; @@ -688,7 +688,7 @@ static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *n goto out; } - info->netpoll = netpoll; + vlan->netpoll = netpoll; out: return err; @@ -696,13 +696,13 @@ out: static void vlan_dev_netpoll_cleanup(struct net_device *dev) { - struct vlan_dev_priv *info = vlan_dev_priv(dev); - struct netpoll *netpoll = info->netpoll; + struct vlan_dev_priv *vlan= vlan_dev_priv(dev); + struct netpoll *netpoll = vlan->netpoll; if (!netpoll) return; - info->netpoll = NULL; + vlan->netpoll = NULL; __netpoll_free_rcu(netpoll); } -- cgit v1.2.3-70-g09d2 From 6eacf8ad8d01c49b95b994b0bf379db2b5b29460 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:48 +0000 Subject: vlan: clean up vlan_dev_hard_start_xmit() Clean up vlan_dev_hard_start_xmit() function. Cc: Benjamin LaHaise Cc: Patrick McHardy Cc: David Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/8021q/vlan_dev.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 0347d48aa7c9..402442402af7 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -137,9 +137,21 @@ static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, return rc; } +static inline netdev_tx_t vlan_netpoll_send_skb(struct vlan_dev_priv *vlan, struct sk_buff *skb) +{ +#ifdef CONFIG_NET_POLL_CONTROLLER + if (vlan->netpoll) + netpoll_send_skb(vlan->netpoll, skb); +#else + BUG(); +#endif + return NETDEV_TX_OK; +} + static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) { + struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); unsigned int len; int ret; @@ -150,29 +162,30 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, * OTHER THINGS LIKE FDDI/TokenRing/802.3 SNAPs... */ if (veth->h_vlan_proto != htons(ETH_P_8021Q) || - vlan_dev_priv(dev)->flags & VLAN_FLAG_REORDER_HDR) { + vlan->flags & VLAN_FLAG_REORDER_HDR) { u16 vlan_tci; - vlan_tci = vlan_dev_priv(dev)->vlan_id; + vlan_tci = vlan->vlan_id; vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb); skb = __vlan_hwaccel_put_tag(skb, vlan_tci); } - skb->dev = vlan_dev_priv(dev)->real_dev; + skb->dev = vlan->real_dev; len = skb->len; - if (netpoll_tx_running(dev)) - return skb->dev->netdev_ops->ndo_start_xmit(skb, skb->dev); + if (unlikely(netpoll_tx_running(dev))) + return vlan_netpoll_send_skb(vlan, skb); + ret = dev_queue_xmit(skb); if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { struct vlan_pcpu_stats *stats; - stats = this_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats); + stats = this_cpu_ptr(vlan->vlan_pcpu_stats); u64_stats_update_begin(&stats->syncp); stats->tx_packets++; stats->tx_bytes += len; u64_stats_update_end(&stats->syncp); } else { - this_cpu_inc(vlan_dev_priv(dev)->vlan_pcpu_stats->tx_dropped); + this_cpu_inc(vlan->vlan_pcpu_stats->tx_dropped); } return ret; -- cgit v1.2.3-70-g09d2 From 689971b44613883ee74ae9c1b31a864aaa3a8e17 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:49 +0000 Subject: netpoll: handle vlan tags in netpoll tx and rx path Without this patch, I can't get netconsole logs remotely over vlan. The reason is probably we don't handle vlan tags in either netpoll tx or rx path. I am not sure if I use these vlan functions correctly, at least this patch works. Cc: Benjamin LaHaise Cc: Patrick McHardy Cc: David Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/netpoll.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 174346ac15a0..e4ba3e70c174 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -334,6 +335,14 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, tries > 0; --tries) { if (__netif_tx_trylock(txq)) { if (!netif_xmit_stopped(txq)) { + if (vlan_tx_tag_present(skb) && + !(netif_skb_features(skb) & NETIF_F_HW_VLAN_TX)) { + skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb)); + if (unlikely(!skb)) + break; + skb->vlan_tci = 0; + } + status = ops->ndo_start_xmit(skb, dev); if (status == NETDEV_TX_OK) txq_trans_update(txq); @@ -567,6 +576,12 @@ int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo) return 1; } + if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) { + skb = vlan_untag(skb); + if (unlikely(!skb)) + goto out; + } + proto = ntohs(eth_hdr(skb)->h_proto); if (proto != ETH_P_IP) goto out; -- cgit v1.2.3-70-g09d2 From 6bdb7fe31046ac50b47e83c35cd6c6b6160a475d Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 10 Aug 2012 01:24:50 +0000 Subject: netpoll: re-enable irq in poll_napi() napi->poll() needs IRQ enabled, so we have to re-enable IRQ before calling it. Cc: David Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/netpoll.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index e4ba3e70c174..346b1eb83a1f 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -168,16 +168,24 @@ static void poll_napi(struct net_device *dev) struct napi_struct *napi; int budget = 16; + WARN_ON_ONCE(!irqs_disabled()); + list_for_each_entry(napi, &dev->napi_list, dev_list) { + local_irq_enable(); if (napi->poll_owner != smp_processor_id() && spin_trylock(&napi->poll_lock)) { + rcu_read_lock_bh(); budget = poll_one_napi(rcu_dereference_bh(dev->npinfo), napi, budget); + rcu_read_unlock_bh(); spin_unlock(&napi->poll_lock); - if (!budget) + if (!budget) { + local_irq_disable(); break; + } } + local_irq_disable(); } } -- cgit v1.2.3-70-g09d2 From 7bd86cc282a458b66c41e3f6676de6656c99b8db Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 12 Aug 2012 20:09:59 +0000 Subject: ipv4: Cache local output routes Commit caacf05e5ad1abf causes big drop of UDP loop back performance. The cause of the regression is that we do not cache the local output routes. Each time we send a datagram from unconnected UDP socket, the kernel allocates a dst_entry and adds it to the rt_uncached_list. It creates lock contention on the rt_uncached_lock. Reported-by: Alex Shi Signed-off-by: Yan, Zheng Signed-off-by: David S. Miller --- net/ipv4/route.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e4ba974f143c..fd9ecb52c66b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2028,7 +2028,6 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) } dev_out = net->loopback_dev; fl4->flowi4_oif = dev_out->ifindex; - res.fi = NULL; flags |= RTCF_LOCAL; goto make_route; } -- cgit v1.2.3-70-g09d2 From 4855d6f3116e891b66198838b683dce3dcf6e874 Mon Sep 17 00:00:00 2001 From: Igor Maravic Date: Sun, 12 Aug 2012 22:31:58 +0000 Subject: net: ipv6: proc: Fix error handling Fix error handling in case making of dir dev_snmp6 failes Signed-off-by: Igor Maravic Signed-off-by: David S. Miller --- net/ipv6/proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index da2e92d05c15..745a32042950 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -307,10 +307,10 @@ static int __net_init ipv6_proc_init_net(struct net *net) goto proc_dev_snmp6_fail; return 0; +proc_dev_snmp6_fail: + proc_net_remove(net, "snmp6"); proc_snmp6_fail: proc_net_remove(net, "sockstat6"); -proc_dev_snmp6_fail: - proc_net_remove(net, "dev_snmp6"); return -ENOMEM; } -- cgit v1.2.3-70-g09d2 From 6024935f5ff5f1646bce8404416318e5fd4a0c4a Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 13 Aug 2012 02:49:59 +0000 Subject: llc2: Fix silent failure of llc_station_init() llc_station_init() creates and processes an event skb with no effect other than to change the state from DOWN to UP. Allocation failure is reported, but then ignored by its caller, llc2_init(). Remove this possibility by simply initialising the state as UP. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- include/net/llc.h | 2 +- net/llc/llc_station.c | 19 ++----------------- 2 files changed, 3 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/net/llc.h b/include/net/llc.h index 226c846cab08..f2d0fc570527 100644 --- a/include/net/llc.h +++ b/include/net/llc.h @@ -133,7 +133,7 @@ extern int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb, extern void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb); extern void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb); -extern int llc_station_init(void); +extern void llc_station_init(void); extern void llc_station_exit(void); #ifdef CONFIG_PROC_FS diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c index 6828e39ec2ec..45ddbb93c5d0 100644 --- a/net/llc/llc_station.c +++ b/net/llc/llc_station.c @@ -687,12 +687,8 @@ static void llc_station_rcv(struct sk_buff *skb) llc_station_state_process(skb); } -int __init llc_station_init(void) +void __init llc_station_init(void) { - int rc = -ENOBUFS; - struct sk_buff *skb; - struct llc_station_state_ev *ev; - skb_queue_head_init(&llc_main_station.mac_pdu_q); skb_queue_head_init(&llc_main_station.ev_q.list); spin_lock_init(&llc_main_station.ev_q.lock); @@ -700,20 +696,9 @@ int __init llc_station_init(void) (unsigned long)&llc_main_station); llc_main_station.ack_timer.expires = jiffies + sysctl_llc_station_ack_timeout; - skb = alloc_skb(0, GFP_ATOMIC); - if (!skb) - goto out; - rc = 0; llc_set_station_handler(llc_station_rcv); - ev = llc_station_ev(skb); - memset(ev, 0, sizeof(*ev)); llc_main_station.maximum_retry = 1; - llc_main_station.state = LLC_STATION_STATE_DOWN; - ev->type = LLC_STATION_EV_TYPE_SIMPLE; - ev->prim_type = LLC_STATION_EV_ENABLE_WITHOUT_DUP_ADDR_CHECK; - rc = llc_station_next_state(skb); -out: - return rc; + llc_main_station.state = LLC_STATION_STATE_UP; } void __exit llc_station_exit(void) -- cgit v1.2.3-70-g09d2 From f4f8720febf0d785a054fc09bde5e3ad09728a58 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 13 Aug 2012 02:50:43 +0000 Subject: llc2: Call llc_station_exit() on llc2_init() failure path Otherwise the station packet handler will remain registered even though the module is unloaded. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/llc/af_llc.c | 5 +++-- net/llc/llc_station.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index f6fe4d400502..8c2919ca36f6 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -1206,7 +1206,7 @@ static int __init llc2_init(void) rc = llc_proc_init(); if (rc != 0) { printk(llc_proc_err_msg); - goto out_unregister_llc_proto; + goto out_station; } rc = llc_sysctl_init(); if (rc) { @@ -1226,7 +1226,8 @@ out_sysctl: llc_sysctl_exit(); out_proc: llc_proc_exit(); -out_unregister_llc_proto: +out_station: + llc_station_exit(); proto_unregister(&llc_proto); goto out; } diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c index 45ddbb93c5d0..bba5184fafd7 100644 --- a/net/llc/llc_station.c +++ b/net/llc/llc_station.c @@ -701,7 +701,7 @@ void __init llc_station_init(void) llc_main_station.state = LLC_STATION_STATE_UP; } -void __exit llc_station_exit(void) +void llc_station_exit(void) { llc_set_station_handler(NULL); } -- cgit v1.2.3-70-g09d2 From aadf31de16a7b2878af00a02e6557df84efa784b Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 13 Aug 2012 02:50:55 +0000 Subject: llc: Fix races between llc2 handler use and (un)registration When registering the handlers, any state they rely on must be completely initialised first. When unregistering, we must wait until they are definitely no longer running. llc_rcv() must also avoid reading the handler pointers again after checking for NULL. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/llc/llc_input.c | 21 +++++++++++++++++---- net/llc/llc_station.c | 2 +- 2 files changed, 18 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index e32cab44ea95..dd3e83328ad5 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -42,6 +42,7 @@ static void (*llc_type_handlers[2])(struct llc_sap *sap, void llc_add_pack(int type, void (*handler)(struct llc_sap *sap, struct sk_buff *skb)) { + smp_wmb(); /* ensure initialisation is complete before it's called */ if (type == LLC_DEST_SAP || type == LLC_DEST_CONN) llc_type_handlers[type - 1] = handler; } @@ -50,11 +51,19 @@ void llc_remove_pack(int type) { if (type == LLC_DEST_SAP || type == LLC_DEST_CONN) llc_type_handlers[type - 1] = NULL; + synchronize_net(); } void llc_set_station_handler(void (*handler)(struct sk_buff *skb)) { + /* Ensure initialisation is complete before it's called */ + if (handler) + smp_wmb(); + llc_station_handler = handler; + + if (!handler) + synchronize_net(); } /** @@ -150,6 +159,8 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, int dest; int (*rcv)(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); + void (*sta_handler)(struct sk_buff *skb); + void (*sap_handler)(struct llc_sap *sap, struct sk_buff *skb); if (!net_eq(dev_net(dev), &init_net)) goto drop; @@ -182,7 +193,8 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, */ rcv = rcu_dereference(sap->rcv_func); dest = llc_pdu_type(skb); - if (unlikely(!dest || !llc_type_handlers[dest - 1])) { + sap_handler = dest ? ACCESS_ONCE(llc_type_handlers[dest - 1]) : NULL; + if (unlikely(!sap_handler)) { if (rcv) rcv(skb, dev, pt, orig_dev); else @@ -193,7 +205,7 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, if (cskb) rcv(cskb, dev, pt, orig_dev); } - llc_type_handlers[dest - 1](sap, skb); + sap_handler(sap, skb); } llc_sap_put(sap); out: @@ -202,9 +214,10 @@ drop: kfree_skb(skb); goto out; handle_station: - if (!llc_station_handler) + sta_handler = ACCESS_ONCE(llc_station_handler); + if (!sta_handler) goto drop; - llc_station_handler(skb); + sta_handler(skb); goto out; } diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c index bba5184fafd7..b2f2bac2c2a2 100644 --- a/net/llc/llc_station.c +++ b/net/llc/llc_station.c @@ -696,9 +696,9 @@ void __init llc_station_init(void) (unsigned long)&llc_main_station); llc_main_station.ack_timer.expires = jiffies + sysctl_llc_station_ack_timeout; - llc_set_station_handler(llc_station_rcv); llc_main_station.maximum_retry = 1; llc_main_station.state = LLC_STATION_STATE_UP; + llc_set_station_handler(llc_station_rcv); } void llc_station_exit(void) -- cgit v1.2.3-70-g09d2 From 4acd4945cd1e1f92b20d14e349c6c6a52acbd42d Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 14 Aug 2012 08:54:51 +0000 Subject: ipv6: addrconf: Avoid calling netdevice notifiers with RCU read-side lock Cong Wang reports that lockdep detected suspicious RCU usage while enabling IPV6 forwarding: [ 1123.310275] =============================== [ 1123.442202] [ INFO: suspicious RCU usage. ] [ 1123.558207] 3.6.0-rc1+ #109 Not tainted [ 1123.665204] ------------------------------- [ 1123.768254] include/linux/rcupdate.h:430 Illegal context switch in RCU read-side critical section! [ 1123.992320] [ 1123.992320] other info that might help us debug this: [ 1123.992320] [ 1124.307382] [ 1124.307382] rcu_scheduler_active = 1, debug_locks = 0 [ 1124.522220] 2 locks held by sysctl/5710: [ 1124.648364] #0: (rtnl_mutex){+.+.+.}, at: [] rtnl_trylock+0x15/0x17 [ 1124.882211] #1: (rcu_read_lock){.+.+.+}, at: [] rcu_lock_acquire+0x0/0x29 [ 1125.085209] [ 1125.085209] stack backtrace: [ 1125.332213] Pid: 5710, comm: sysctl Not tainted 3.6.0-rc1+ #109 [ 1125.441291] Call Trace: [ 1125.545281] [] lockdep_rcu_suspicious+0x109/0x112 [ 1125.667212] [] rcu_preempt_sleep_check+0x45/0x47 [ 1125.781838] [] __might_sleep+0x1e/0x19b [...] [ 1127.445223] [] call_netdevice_notifiers+0x4a/0x4f [...] [ 1127.772188] [] dev_disable_lro+0x32/0x6b [ 1127.885174] [] dev_forward_change+0x30/0xcb [ 1128.013214] [] addrconf_forward_change+0x85/0xc5 [...] addrconf_forward_change() uses RCU iteration over the netdev list, which is unnecessary since it already holds the RTNL lock. We also cannot reasonably require netdevice notifier functions not to sleep. Reported-by: Cong Wang Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 79181819a24f..6bc85f7c31e3 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -494,8 +494,7 @@ static void addrconf_forward_change(struct net *net, __s32 newf) struct net_device *dev; struct inet6_dev *idev; - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { + for_each_netdev(net, dev) { idev = __in6_dev_get(dev); if (idev) { int changed = (!idev->cnf.forwarding) ^ (!newf); @@ -504,7 +503,6 @@ static void addrconf_forward_change(struct net *net, __s32 newf) dev_forward_change(idev); } } - rcu_read_unlock(); } static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) -- cgit v1.2.3-70-g09d2 From 61a0cfb008f57ecf7eb28ee762952fb42dc15d15 Mon Sep 17 00:00:00 2001 From: Andre Guedes Date: Wed, 1 Aug 2012 20:34:15 -0300 Subject: Bluetooth: Fix use-after-free bug in SMP If SMP fails, we should always cancel security_timer delayed work. Otherwise, security_timer function may run after l2cap_conn object has been freed. This patch fixes the following warning reported by ODEBUG: WARNING: at lib/debugobjects.c:261 debug_print_object+0x7c/0x8d() Hardware name: Bochs ODEBUG: free active (active state 0) object type: timer_list hint: delayed_work_timer_fn+0x0/0x27 Modules linked in: btusb bluetooth Pid: 440, comm: kworker/u:2 Not tainted 3.5.0-rc1+ #4 Call Trace: [] ? free_obj_work+0x4a/0x7f [] warn_slowpath_common+0x7e/0x97 [] warn_slowpath_fmt+0x41/0x43 [] debug_print_object+0x7c/0x8d [] ? __queue_work+0x241/0x241 [] debug_check_no_obj_freed+0x92/0x159 [] slab_free_hook+0x6f/0x77 [] ? l2cap_conn_del+0x148/0x157 [bluetooth] [] kfree+0x59/0xac [] l2cap_conn_del+0x148/0x157 [bluetooth] [] l2cap_recv_frame+0xa77/0xfa4 [bluetooth] [] ? trace_hardirqs_on_caller+0x112/0x1ad [] l2cap_recv_acldata+0xe2/0x264 [bluetooth] [] hci_rx_work+0x235/0x33c [bluetooth] [] ? process_one_work+0x126/0x2fe [] process_one_work+0x185/0x2fe [] ? process_one_work+0x126/0x2fe [] ? lock_acquired+0x1b5/0x1cf [] ? le_scan_work+0x11d/0x11d [bluetooth] [] ? spin_lock_irq+0x9/0xb [] worker_thread+0xcf/0x175 [] ? rescuer_thread+0x175/0x175 [] kthread+0x95/0x9d [] kernel_threadi_helper+0x4/0x10 [] ? retint_restore_args+0x13/0x13 [] ? flush_kthread_worker+0xdb/0xdb [] ? gs_change+0x13/0x13 This bug can be reproduced using hctool lecc or l2test tools and bluetoothd not running. Signed-off-by: Andre Guedes Signed-off-by: Gustavo Padovan --- net/bluetooth/smp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 901a616c8083..98ffc1b6a6fa 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -267,10 +267,10 @@ static void smp_failure(struct l2cap_conn *conn, u8 reason, u8 send) mgmt_auth_failed(conn->hcon->hdev, conn->dst, hcon->type, hcon->dst_type, reason); - if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) { - cancel_delayed_work_sync(&conn->security_timer); + cancel_delayed_work_sync(&conn->security_timer); + + if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags)) smp_chan_destroy(conn); - } } #define JUST_WORKS 0x00 -- cgit v1.2.3-70-g09d2 From c03307eab68d583ea6db917681afa14ed1fb3b84 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 14 Aug 2012 08:19:33 -0700 Subject: bridge: fix rcu dereference outside of rcu_read_lock Alternative solution for problem found by Linux Driver Verification project (linuxtesting.org). As it noted in the comment before the br_handle_frame_finish function, this function should be called under rcu_read_lock. The problem callgraph: br_dev_xmit -> br_nf_pre_routing_finish_bridge_slow -> -> br_handle_frame_finish -> br_port_get_rcu -> rcu_dereference And in this case there is no read-lock section. Reported-by: Denis Efremov Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 32211fa5b506..070e8a68cfc6 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -31,9 +31,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) struct net_bridge_mdb_entry *mdst; struct br_cpu_netstats *brstats = this_cpu_ptr(br->stats); + rcu_read_lock(); #ifdef CONFIG_BRIDGE_NETFILTER if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) { br_nf_pre_routing_finish_bridge_slow(skb); + rcu_read_unlock(); return NETDEV_TX_OK; } #endif @@ -48,7 +50,6 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) skb_reset_mac_header(skb); skb_pull(skb, ETH_HLEN); - rcu_read_lock(); if (is_broadcast_ether_addr(dest)) br_flood_deliver(br, skb); else if (is_multicast_ether_addr(dest)) { -- cgit v1.2.3-70-g09d2 From e862f1a9b7df4e8196ebec45ac62295138aa3fc2 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:44 +0000 Subject: atm: fix info leak in getsockopt(SO_ATMPVC) The ATM code fails to initialize the two padding bytes of struct sockaddr_atmpvc inserted for alignment. Add an explicit memset(0) before filling the structure to avoid the info leak. Signed-off-by: Mathias Krause Signed-off-by: David S. Miller --- net/atm/common.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/atm/common.c b/net/atm/common.c index b4b44dbed645..0c0ad930a632 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -812,6 +812,7 @@ int vcc_getsockopt(struct socket *sock, int level, int optname, if (!vcc->dev || !test_bit(ATM_VF_ADDR, &vcc->flags)) return -ENOTCONN; + memset(&pvc, 0, sizeof(pvc)); pvc.sap_family = AF_ATMPVC; pvc.sap_addr.itf = vcc->dev->number; pvc.sap_addr.vpi = vcc->vpi; -- cgit v1.2.3-70-g09d2 From 3c0c5cfdcd4d69ffc4b9c0907cec99039f30a50a Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:45 +0000 Subject: atm: fix info leak via getsockname() The ATM code fails to initialize the two padding bytes of struct sockaddr_atmpvc inserted for alignment. Add an explicit memset(0) before filling the structure to avoid the info leak. Signed-off-by: Mathias Krause Signed-off-by: David S. Miller --- net/atm/pvc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 3a734919c36c..ae0324021407 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -95,6 +95,7 @@ static int pvc_getname(struct socket *sock, struct sockaddr *sockaddr, return -ENOTCONN; *sockaddr_len = sizeof(struct sockaddr_atmpvc); addr = (struct sockaddr_atmpvc *)sockaddr; + memset(addr, 0, sizeof(*addr)); addr->sap_family = AF_ATMPVC; addr->sap_addr.itf = vcc->dev->number; addr->sap_addr.vpi = vcc->vpi; -- cgit v1.2.3-70-g09d2 From e15ca9a0ef9a86f0477530b0f44a725d67f889ee Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:46 +0000 Subject: Bluetooth: HCI - Fix info leak in getsockopt(HCI_FILTER) The HCI code fails to initialize the two padding bytes of struct hci_ufilter before copying it to userland -- that for leaking two bytes kernel stack. Add an explicit memset(0) before filling the structure to avoid the info leak. Signed-off-by: Mathias Krause Cc: Marcel Holtmann Cc: Gustavo Padovan Cc: Johan Hedberg Signed-off-by: David S. Miller --- net/bluetooth/hci_sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index a7f04de03d79..a27bbc3cd4b7 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1009,6 +1009,7 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname, { struct hci_filter *f = &hci_pi(sk)->filter; + memset(&uf, 0, sizeof(uf)); uf.type_mask = f->type_mask; uf.opcode = f->opcode; uf.event_mask[0] = *((u32 *) f->event_mask + 0); -- cgit v1.2.3-70-g09d2 From 3f68ba07b1da811bf383b4b701b129bfcb2e4988 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:47 +0000 Subject: Bluetooth: HCI - Fix info leak via getsockname() The HCI code fails to initialize the hci_channel member of struct sockaddr_hci and that for leaks two bytes kernel stack via the getsockname() syscall. Initialize hci_channel with 0 to avoid the info leak. Signed-off-by: Mathias Krause Cc: Marcel Holtmann Cc: Gustavo Padovan Cc: Johan Hedberg Signed-off-by: David S. Miller --- net/bluetooth/hci_sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index a27bbc3cd4b7..19fdac78e555 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -694,6 +694,7 @@ static int hci_sock_getname(struct socket *sock, struct sockaddr *addr, *addr_len = sizeof(*haddr); haddr->hci_family = AF_BLUETOOTH; haddr->hci_dev = hdev->id; + haddr->hci_channel= 0; release_sock(sk); return 0; -- cgit v1.2.3-70-g09d2 From 9ad2de43f1aee7e7274a4e0d41465489299e344b Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:48 +0000 Subject: Bluetooth: RFCOMM - Fix info leak in getsockopt(BT_SECURITY) The RFCOMM code fails to initialize the key_size member of struct bt_security before copying it to userland -- that for leaking one byte kernel stack. Initialize key_size with 0 to avoid the info leak. Signed-off-by: Mathias Krause Cc: Marcel Holtmann Cc: Gustavo Padovan Cc: Johan Hedberg Signed-off-by: David S. Miller --- net/bluetooth/rfcomm/sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 7e1e59645c05..64f55ca61472 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -822,6 +822,7 @@ static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, c } sec.level = rfcomm_pi(sk)->sec_level; + sec.key_size = 0; len = min_t(unsigned int, len, sizeof(sec)); if (copy_to_user(optval, (char *) &sec, len)) -- cgit v1.2.3-70-g09d2 From f9432c5ec8b1e9a09b9b0e5569e3c73db8de432a Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:49 +0000 Subject: Bluetooth: RFCOMM - Fix info leak in ioctl(RFCOMMGETDEVLIST) The RFCOMM code fails to initialize the two padding bytes of struct rfcomm_dev_list_req inserted for alignment before copying it to userland. Additionally there are two padding bytes in each instance of struct rfcomm_dev_info. The ioctl() that for disclosures two bytes plus dev_num times two bytes uninitialized kernel heap memory. Allocate the memory using kzalloc() to fix this issue. Signed-off-by: Mathias Krause Cc: Marcel Holtmann Cc: Gustavo Padovan Cc: Johan Hedberg Signed-off-by: David S. Miller --- net/bluetooth/rfcomm/tty.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index cb960773c002..56f182393c4c 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -456,7 +456,7 @@ static int rfcomm_get_dev_list(void __user *arg) size = sizeof(*dl) + dev_num * sizeof(*di); - dl = kmalloc(size, GFP_KERNEL); + dl = kzalloc(size, GFP_KERNEL); if (!dl) return -ENOMEM; -- cgit v1.2.3-70-g09d2 From 9344a972961d1a6d2c04d9008b13617bcb6ec2ef Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:50 +0000 Subject: Bluetooth: RFCOMM - Fix info leak via getsockname() The RFCOMM code fails to initialize the trailing padding byte of struct sockaddr_rc added for alignment. It that for leaks one byte kernel stack via the getsockname() syscall. Add an explicit memset(0) before filling the structure to avoid the info leak. Signed-off-by: Mathias Krause Cc: Marcel Holtmann Cc: Gustavo Padovan Cc: Johan Hedberg Signed-off-by: David S. Miller --- net/bluetooth/rfcomm/sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 64f55ca61472..1a17850d093c 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -528,6 +528,7 @@ static int rfcomm_sock_getname(struct socket *sock, struct sockaddr *addr, int * BT_DBG("sock %p, sk %p", sock, sk); + memset(sa, 0, sizeof(*sa)); sa->rc_family = AF_BLUETOOTH; sa->rc_channel = rfcomm_pi(sk)->channel; if (peer) -- cgit v1.2.3-70-g09d2 From 792039c73cf176c8e39a6e8beef2c94ff46522ed Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:51 +0000 Subject: Bluetooth: L2CAP - Fix info leak via getsockname() The L2CAP code fails to initialize the l2_bdaddr_type member of struct sockaddr_l2 and the padding byte added for alignment. It that for leaks two bytes kernel stack via the getsockname() syscall. Add an explicit memset(0) before filling the structure to avoid the info leak. Signed-off-by: Mathias Krause Cc: Marcel Holtmann Cc: Gustavo Padovan Cc: Johan Hedberg Signed-off-by: David S. Miller --- net/bluetooth/l2cap_sock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index b94abd30e6f9..1497edd191a2 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -245,6 +245,7 @@ static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, int *l BT_DBG("sock %p, sk %p", sock, sk); + memset(la, 0, sizeof(struct sockaddr_l2)); addr->sa_family = AF_BLUETOOTH; *len = sizeof(struct sockaddr_l2); -- cgit v1.2.3-70-g09d2 From 04d4fbca1017c11381e7d82acea21dd741e748bc Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:52 +0000 Subject: l2tp: fix info leak via getsockname() The L2TP code for IPv6 fails to initialize the l2tp_unused member of struct sockaddr_l2tpip6 and that for leaks two bytes kernel stack via the getsockname() syscall. Initialize l2tp_unused with 0 to avoid the info leak. Signed-off-by: Mathias Krause Cc: James Chapman Signed-off-by: David S. Miller --- net/l2tp/l2tp_ip6.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 35e1e4bde587..927547171bc7 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -410,6 +410,7 @@ static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr, lsa->l2tp_family = AF_INET6; lsa->l2tp_flowinfo = 0; lsa->l2tp_scope_id = 0; + lsa->l2tp_unused = 0; if (peer) { if (!lsk->peer_conn_id) return -ENOTCONN; -- cgit v1.2.3-70-g09d2 From 3592aaeb80290bda0f2cf0b5456c97bfc638b192 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:53 +0000 Subject: llc: fix info leak via getsockname() The LLC code wrongly returns 0, i.e. "success", when the socket is zapped. Together with the uninitialized uaddrlen pointer argument from sys_getsockname this leads to an arbitrary memory leak of up to 128 bytes kernel stack via the getsockname() syscall. Return an error instead when the socket is zapped to prevent the info leak. Also remove the unnecessary memset(0). We don't directly write to the memory pointed by uaddr but memcpy() a local structure at the end of the function that is properly initialized. Signed-off-by: Mathias Krause Cc: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/llc/af_llc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 8c2919ca36f6..c2190005a114 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -969,14 +969,13 @@ static int llc_ui_getname(struct socket *sock, struct sockaddr *uaddr, struct sockaddr_llc sllc; struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); - int rc = 0; + int rc = -EBADF; memset(&sllc, 0, sizeof(sllc)); lock_sock(sk); if (sock_flag(sk, SOCK_ZAPPED)) goto out; *uaddrlen = sizeof(sllc); - memset(uaddr, 0, *uaddrlen); if (peer) { rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) -- cgit v1.2.3-70-g09d2 From 276bdb82dedb290511467a5a4fdbe9f0b52dce6f Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:54 +0000 Subject: dccp: check ccid before dereferencing ccid_hc_rx_getsockopt() and ccid_hc_tx_getsockopt() might be called with a NULL ccid pointer leading to a NULL pointer dereference. This could lead to a privilege escalation if the attacker is able to map page 0 and prepare it with a fake ccid_ops pointer. Signed-off-by: Mathias Krause Cc: Gerrit Renker Cc: stable@vger.kernel.org Signed-off-by: David S. Miller --- net/dccp/ccid.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index 75c3582a7678..fb85d371a8de 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -246,7 +246,7 @@ static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk, u32 __user *optval, int __user *optlen) { int rc = -ENOPROTOOPT; - if (ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL) + if (ccid != NULL && ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL) rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len, optval, optlen); return rc; @@ -257,7 +257,7 @@ static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk, u32 __user *optval, int __user *optlen) { int rc = -ENOPROTOOPT; - if (ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL) + if (ccid != NULL && ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL) rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len, optval, optlen); return rc; -- cgit v1.2.3-70-g09d2 From 7b07f8eb75aa3097cdfd4f6eac3da49db787381d Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:55 +0000 Subject: dccp: fix info leak via getsockopt(DCCP_SOCKOPT_CCID_TX_INFO) The CCID3 code fails to initialize the trailing padding bytes of struct tfrc_tx_info added for alignment on 64 bit architectures. It that for potentially leaks four bytes kernel stack via the getsockopt() syscall. Add an explicit memset(0) before filling the structure to avoid the info leak. Signed-off-by: Mathias Krause Cc: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/ccids/ccid3.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index d65e98798eca..119c04317d48 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -535,6 +535,7 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, case DCCP_SOCKOPT_CCID_TX_INFO: if (len < sizeof(tfrc)) return -EINVAL; + memset(&tfrc, 0, sizeof(tfrc)); tfrc.tfrctx_x = hc->tx_x; tfrc.tfrctx_x_recv = hc->tx_x_recv; tfrc.tfrctx_x_calc = hc->tx_x_calc; -- cgit v1.2.3-70-g09d2 From 2d8a041b7bfe1097af21441cb77d6af95f4f4680 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:56 +0000 Subject: ipvs: fix info leak in getsockopt(IP_VS_SO_GET_TIMEOUT) If at least one of CONFIG_IP_VS_PROTO_TCP or CONFIG_IP_VS_PROTO_UDP is not set, __ip_vs_get_timeouts() does not fully initialize the structure that gets copied to userland and that for leaks up to 12 bytes of kernel stack. Add an explicit memset(0) before passing the structure to __ip_vs_get_timeouts() to avoid the info leak. Signed-off-by: Mathias Krause Cc: Wensong Zhang Cc: Simon Horman Cc: Julian Anastasov Signed-off-by: David S. Miller --- net/netfilter/ipvs/ip_vs_ctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 84444dda194b..72bf32a84874 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2759,6 +2759,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { struct ip_vs_timeout_user t; + memset(&t, 0, sizeof(t)); __ip_vs_get_timeouts(net, &t); if (copy_to_user(user, &t, sizeof(t)) != 0) ret = -EFAULT; -- cgit v1.2.3-70-g09d2 From 43da5f2e0d0c69ded3d51907d9552310a6b545e8 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 15 Aug 2012 11:31:57 +0000 Subject: net: fix info leak in compat dev_ifconf() The implementation of dev_ifconf() for the compat ioctl interface uses an intermediate ifc structure allocated in userland for the duration of the syscall. Though, it fails to initialize the padding bytes inserted for alignment and that for leaks four bytes of kernel stack. Add an explicit memset(0) before filling the structure to avoid the info leak. Signed-off-by: Mathias Krause Signed-off-by: David S. Miller --- net/socket.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index dfe5b66c97e0..a5471f804d99 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2657,6 +2657,7 @@ static int dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32) if (copy_from_user(&ifc32, uifc32, sizeof(struct compat_ifconf))) return -EFAULT; + memset(&ifc, 0, sizeof(ifc)); if (ifc32.ifcbuf == 0) { ifc32.ifc_len = 0; ifc.ifc_len = 0; -- cgit v1.2.3-70-g09d2 From 2614f86490122bf51eb7c12ec73927f1900f4e7d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 16 Aug 2012 02:25:24 +0200 Subject: netfilter: nf_ct_expect: fix possible access to uninitialized timer In __nf_ct_expect_check, the function refresh_timer returns 1 if a matching expectation is found and its timer is successfully refreshed. This results in nf_ct_expect_related returning 0. Note that at this point: - the passed expectation is not inserted in the expectation table and its timer was not initialized, since we have refreshed one matching/existing expectation. - nf_ct_expect_alloc uses kmem_cache_alloc, so the expectation timer is in some undefined state just after the allocation, until it is appropriately initialized. This can be a problem for the SIP helper during the expectation addition: ... if (nf_ct_expect_related(rtp_exp) == 0) { if (nf_ct_expect_related(rtcp_exp) != 0) nf_ct_unexpect_related(rtp_exp); ... Note that nf_ct_expect_related(rtp_exp) may return 0 for the timer refresh case that is detailed above. Then, if nf_ct_unexpect_related(rtcp_exp) returns != 0, nf_ct_unexpect_related(rtp_exp) is called, which does: spin_lock_bh(&nf_conntrack_lock); if (del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); } spin_unlock_bh(&nf_conntrack_lock); Note that del_timer always returns false if the timer has been initialized. However, the timer was not initialized since setup_timer was not called, therefore, the expectation timer remains in some undefined state. If I'm not missing anything, this may lead to the removal an unexistent expectation. To fix this, the optimization that allows refreshing an expectation is removed. Now nf_conntrack_expect_related looks more consistent to me since it always add the expectation in case that it returns success. Thanks to Patrick McHardy for participating in the discussion of this patch. I think this may be the source of the problem described by: http://marc.info/?l=netfilter-devel&m=134073514719421&w=2 Reported-by: Rafal Fitt Acked-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_expect.c | 29 ++++++----------------------- 1 file changed, 6 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 45cf602a76bc..527651a53a45 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -361,23 +361,6 @@ static void evict_oldest_expect(struct nf_conn *master, } } -static inline int refresh_timer(struct nf_conntrack_expect *i) -{ - struct nf_conn_help *master_help = nfct_help(i->master); - const struct nf_conntrack_expect_policy *p; - - if (!del_timer(&i->timeout)) - return 0; - - p = &rcu_dereference_protected( - master_help->helper, - lockdep_is_held(&nf_conntrack_lock) - )->expect_policy[i->class]; - i->timeout.expires = jiffies + p->timeout * HZ; - add_timer(&i->timeout); - return 1; -} - static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) { const struct nf_conntrack_expect_policy *p; @@ -386,7 +369,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) struct nf_conn_help *master_help = nfct_help(master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(expect); - struct hlist_node *n; + struct hlist_node *n, *next; unsigned int h; int ret = 1; @@ -395,12 +378,12 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) goto out; } h = nf_ct_expect_dst_hash(&expect->tuple); - hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) { + hlist_for_each_entry_safe(i, n, next, &net->ct.expect_hash[h], hnode) { if (expect_matches(i, expect)) { - /* Refresh timer: if it's dying, ignore.. */ - if (refresh_timer(i)) { - ret = 0; - goto out; + if (del_timer(&i->timeout)) { + nf_ct_unlink_expect(i); + nf_ct_expect_put(i); + break; } } else if (expect_clash(i, expect)) { ret = -EBUSY; -- cgit v1.2.3-70-g09d2 From 16c0b164bd24d44db137693a36b428ba28970c62 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 15 Aug 2012 20:44:27 +0000 Subject: act_mirred: do not drop packets when fails to mirror it We drop packet unconditionally when we fail to mirror it. This is not intended in some cases. Consdier for kvm guest, we may mirror the traffic of the bridge to a tap device used by a VM. When kernel fails to mirror the packet in conditions such as when qemu crashes or stop polling the tap, it's hard for the management software to detect such condition and clean the the mirroring before. This would lead all packets to the bridge to be dropped and break the netowrk of other virtual machines. To solve the issue, the patch does not drop packets when kernel fails to mirror it, and only drop the redirected packets. Signed-off-by: Jason Wang Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/act_mirred.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index fe81cc18e9e0..9c0fd0c78814 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -200,13 +200,12 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, out: if (err) { m->tcf_qstats.overlimits++; - /* should we be asking for packet to be dropped? - * may make sense for redirect case only - */ - retval = TC_ACT_SHOT; - } else { + if (m->tcfm_eaction != TCA_EGRESS_MIRROR) + retval = TC_ACT_SHOT; + else + retval = m->tcf_action; + } else retval = m->tcf_action; - } spin_unlock(&m->tcf_lock); return retval; -- cgit v1.2.3-70-g09d2 From f796c20cf67aa54c9130d0dc41307c0025719b85 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 14 Aug 2012 12:34:24 +0000 Subject: net: netprio: fix files lock and remove useless d_path bits Add lock to prevent a race with a file closing and also remove useless and ugly sscanf code. The extra code was never needed and the case it supposedly protected against is in fact handled correctly by sock_from_file as pointed out by Al Viro. CC: Neil Horman Reported-by: Al Viro Signed-off-by: John Fastabend Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/core/netprio_cgroup.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index ed0c0431fcd8..f65dba3afd99 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -277,12 +277,6 @@ out_free_devname: void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *p; - char *tmp = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); - - if (!tmp) { - pr_warn("Unable to attach cgrp due to alloc failure!\n"); - return; - } cgroup_taskset_for_each(p, cgrp, tset) { unsigned int fd; @@ -296,32 +290,24 @@ void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) continue; } - rcu_read_lock(); + spin_lock(&files->file_lock); fdt = files_fdtable(files); for (fd = 0; fd < fdt->max_fds; fd++) { - char *path; struct file *file; struct socket *sock; - unsigned long s; - int rv, err = 0; + int err; file = fcheck_files(files, fd); if (!file) continue; - path = d_path(&file->f_path, tmp, PAGE_SIZE); - rv = sscanf(path, "socket:[%lu]", &s); - if (rv <= 0) - continue; - sock = sock_from_file(file, &err); - if (!err) + if (sock) sock_update_netprioidx(sock->sk, p); } - rcu_read_unlock(); + spin_unlock(&files->file_lock); task_unlock(p); } - kfree(tmp); } static struct cftype ss_files[] = { -- cgit v1.2.3-70-g09d2 From 48a87cc26c13b68f6cce4e9d769fcb17a6b3e4b8 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 14 Aug 2012 12:34:30 +0000 Subject: net: netprio: fd passed in SCM_RIGHTS datagram not set correctly A socket fd passed in a SCM_RIGHTS datagram was not getting updated with the new tasks cgrp prioidx. This leaves IO on the socket tagged with the old tasks priority. To fix this add a check in the scm recvmsg path to update the sock cgrp prioidx with the new tasks value. Thanks to Al Viro for catching this. CC: Neil Horman Reported-by: Al Viro Signed-off-by: John Fastabend Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/core/scm.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/core/scm.c b/net/core/scm.c index 8f6ccfd68ef4..040cebeed45b 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -265,6 +265,7 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); isk, current); fd_install(new_fd, fp[i]); } -- cgit v1.2.3-70-g09d2 From 476ad154f3b41dd7d9a08a2f641e28388abc2fd1 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 14 Aug 2012 12:34:35 +0000 Subject: net: netprio: fix cgrp create and write priomap race A race exists where creating cgroups and also updating the priomap may result in losing a priomap update. This is because priomap writers are not protected by rtnl_lock. Move priority writer into rtnl_lock()/rtnl_unlock(). CC: Neil Horman Reported-by: Al Viro Signed-off-by: John Fastabend Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/core/netprio_cgroup.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index f65dba3afd99..c75e3f9d060f 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -101,12 +101,10 @@ static int write_update_netdev_table(struct net_device *dev) u32 max_len; struct netprio_map *map; - rtnl_lock(); max_len = atomic_read(&max_prioidx) + 1; map = rtnl_dereference(dev->priomap); if (!map || map->priomap_len < max_len) ret = extend_netdev_table(dev, max_len); - rtnl_unlock(); return ret; } @@ -256,17 +254,17 @@ static int write_priomap(struct cgroup *cgrp, struct cftype *cft, if (!dev) goto out_free_devname; + rtnl_lock(); ret = write_update_netdev_table(dev); if (ret < 0) goto out_put_dev; - rcu_read_lock(); - map = rcu_dereference(dev->priomap); + map = rtnl_dereference(dev->priomap); if (map) map->priomap[prioidx] = priority; - rcu_read_unlock(); out_put_dev: + rtnl_unlock(); dev_put(dev); out_free_devname: -- cgit v1.2.3-70-g09d2 From c0de08d04215031d68fa13af36f347a6cfa252ca Mon Sep 17 00:00:00 2001 From: Eric Leblond Date: Thu, 16 Aug 2012 22:02:58 +0000 Subject: af_packet: don't emit packet on orig fanout group If a packet is emitted on one socket in one group of fanout sockets, it is transmitted again. It is thus read again on one of the sockets of the fanout group. This result in a loop for software which generate packets when receiving one. This retransmission is not the intended behavior: a fanout group must behave like a single socket. The packet should not be transmitted on a socket if it originates from a socket belonging to the same fanout group. This patch fixes the issue by changing the transmission check to take fanout group info account. Reported-by: Aleksandr Kotov Signed-off-by: Eric Leblond Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 16 ++++++++++++++-- net/packet/af_packet.c | 9 +++++++++ 3 files changed, 25 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3560d688161e..59dc05f38247 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1522,6 +1522,8 @@ struct packet_type { struct sk_buff **(*gro_receive)(struct sk_buff **head, struct sk_buff *skb); int (*gro_complete)(struct sk_buff *skb); + bool (*id_match)(struct packet_type *ptype, + struct sock *sk); void *af_packet_priv; struct list_head list; }; diff --git a/net/core/dev.c b/net/core/dev.c index a39354ee1432..debd9372472f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1642,6 +1642,19 @@ static inline int deliver_skb(struct sk_buff *skb, return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } +static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) +{ + if (ptype->af_packet_priv == NULL) + return false; + + if (ptype->id_match) + return ptype->id_match(ptype, skb->sk); + else if ((struct sock *)ptype->af_packet_priv == skb->sk) + return true; + + return false; +} + /* * Support routine. Sends outgoing frames to any network * taps currently in use. @@ -1659,8 +1672,7 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) * they originated from - MvS (miquels@drinkel.ow.org) */ if ((ptype->dev == dev || !ptype->dev) && - (ptype->af_packet_priv == NULL || - (struct sock *)ptype->af_packet_priv != skb->sk)) { + (!skb_loop_sk(ptype, skb))) { if (pt_prev) { deliver_skb(skb2, pt_prev, skb->dev); pt_prev = ptype; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8ac890a1a4c0..aee7196aac36 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1273,6 +1273,14 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po) spin_unlock(&f->lock); } +bool match_fanout_group(struct packet_type *ptype, struct sock * sk) +{ + if (ptype->af_packet_priv == (void*)((struct packet_sock *)sk)->fanout) + return true; + + return false; +} + static int fanout_add(struct sock *sk, u16 id, u16 type_flags) { struct packet_sock *po = pkt_sk(sk); @@ -1325,6 +1333,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) match->prot_hook.dev = po->prot_hook.dev; match->prot_hook.func = packet_rcv_fanout; match->prot_hook.af_packet_priv = match; + match->prot_hook.id_match = match_fanout_group; dev_add_pack(&match->prot_hook); list_add(&match->list, &fanout_list); } -- cgit v1.2.3-70-g09d2 From d92c7f8aabae913de16eb855b19cd2002c341896 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Fri, 17 Aug 2012 10:33:12 +0000 Subject: caif: Do not dereference NULL in chnl_recv_cb() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In net/caif/chnl_net.c::chnl_recv_cb() we call skb_header_pointer() which may return NULL, but we do not check for a NULL pointer before dereferencing it. This patch adds such a NULL check and properly free's allocated memory and return an error (-EINVAL) on failure - much better than crashing.. Signed-off-by: Jesper Juhl Acked-by: Sjur Brændeland Signed-off-by: David S. Miller --- net/caif/chnl_net.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 69771c04ba8f..e597733affb8 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -94,6 +94,10 @@ static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt) /* check the version of IP */ ip_version = skb_header_pointer(skb, 0, 1, &buf); + if (!ip_version) { + kfree_skb(skb); + return -EINVAL; + } switch (*ip_version >> 4) { case 4: -- cgit v1.2.3-70-g09d2 From 9d7b0fc1ef1f17aff57c0dc9a59453d8fca255c3 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 20 Aug 2012 02:56:56 -0700 Subject: net: ipv6: fix oops in inet_putpeer() Commit 97bab73f (inet: Hide route peer accesses behind helpers.) introduced a bug in xfrm6_policy_destroy(). The xfrm_dst's _rt6i_peer member is not initialized, causing a false positive result from inetpeer_ptr_is_peer(), which in turn causes a NULL pointer dereference in inet_putpeer(). Pid: 314, comm: kworker/0:1 Not tainted 3.6.0-rc1+ #17 To Be Filled By O.E.M. To Be Filled By O.E.M./P4S800D-X EIP: 0060:[] EFLAGS: 00010246 CPU: 0 EIP is at inet_putpeer+0xe/0x16 EAX: 00000000 EBX: f3481700 ECX: 00000000 EDX: 000dd641 ESI: f3481700 EDI: c05e949c EBP: f551def4 ESP: f551def4 DS: 007b ES: 007b FS: 0000 GS: 00e0 SS: 0068 CR0: 8005003b CR2: 00000070 CR3: 3243d000 CR4: 00000750 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 f551df04 c0423de1 00000000 f3481700 f551df18 c038d5f7 f254b9f8 f551df28 f34f85d8 f551df20 c03ef48d f551df3c c0396870 f30697e8 f24e1738 c05e98f4 f5509540 c05cd2b4 f551df7c c0142d2b c043feb5 f5509540 00000000 c05cd2e8 [] xfrm6_dst_destroy+0x42/0xdb [] dst_destroy+0x1d/0xa4 [] xfrm_bundle_flo_delete+0x2b/0x36 [] flow_cache_gc_task+0x85/0x9f [] process_one_work+0x122/0x441 [] ? apic_timer_interrupt+0x31/0x38 [] ? flow_cache_new_hashrnd+0x2b/0x2b [] worker_thread+0x113/0x3cc Fix by adding a init_dst() callback to struct xfrm_policy_afinfo to properly initialize the dst's peer pointer. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/net/xfrm.h | 2 ++ net/ipv6/xfrm6_policy.c | 8 ++++++++ net/xfrm/xfrm_policy.c | 2 ++ 3 files changed, 12 insertions(+) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 62b619e82a90..976a81abe1a2 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -292,6 +292,8 @@ struct xfrm_policy_afinfo { struct flowi *fl, int reverse); int (*get_tos)(const struct flowi *fl); + void (*init_dst)(struct net *net, + struct xfrm_dst *dst); int (*init_path)(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len); diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index ef39812107b1..f8c4c08ffb60 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -73,6 +73,13 @@ static int xfrm6_get_tos(const struct flowi *fl) return 0; } +static void xfrm6_init_dst(struct net *net, struct xfrm_dst *xdst) +{ + struct rt6_info *rt = (struct rt6_info *)xdst; + + rt6_init_peer(rt, net->ipv6.peers); +} + static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len) { @@ -286,6 +293,7 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .get_saddr = xfrm6_get_saddr, .decode_session = _decode_session6, .get_tos = xfrm6_get_tos, + .init_dst = xfrm6_init_dst, .init_path = xfrm6_init_path, .fill_dst = xfrm6_fill_dst, .blackhole_route = ip6_blackhole_route, diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index c5a5165a5927..5a2aa17e4d3c 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1357,6 +1357,8 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) memset(dst + 1, 0, sizeof(*xdst) - sizeof(*dst)); xdst->flo.ops = &xfrm_bundle_fc_ops; + if (afinfo->init_dst) + afinfo->init_dst(net, xdst); } else xdst = ERR_PTR(-ENOBUFS); -- cgit v1.2.3-70-g09d2 From 3de7a37b02f607716753dc669c1dca4f71db08fc Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 18 Aug 2012 14:36:44 +0000 Subject: net/core/dev.c: fix kernel-doc warning Fix kernel-doc warning: Warning(net/core/dev.c:5745): No description found for parameter 'dev' Signed-off-by: Randy Dunlap Cc: "David S. Miller" Cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index debd9372472f..83988362805e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5744,6 +5744,7 @@ EXPORT_SYMBOL(netdev_refcnt_read); /** * netdev_wait_allrefs - wait until all references are gone. + * @dev: target net_device * * This is called when unregistering network devices. * -- cgit v1.2.3-70-g09d2 From fae6ef87faeb8853896920c68ee703d715799d28 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sun, 19 Aug 2012 03:30:38 +0000 Subject: net: tcp: move sk_rx_dst_set call after tcp_create_openreq_child() This commit removes the sk_rx_dst_set calls from tcp_create_openreq_child(), because at that point the icsk_af_ops field of ipv6_mapped TCP sockets has not been set to its proper final value. Instead, to make sure we get the right sk_rx_dst_set variant appropriate for the address family of the new connection, we have tcp_v{4,6}_syn_recv_sock() directly call the appropriate function shortly after the call to tcp_create_openreq_child() returns. This also moves inet6_sk_rx_dst_set() to avoid a forward declaration with the new approach. Signed-off-by: Neal Cardwell Reported-by: Artem Savkov Cc: Eric Dumazet Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_minisocks.c | 2 -- net/ipv6/tcp_ipv6.c | 25 +++++++++++++------------ 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 767823764016..5bf2040b25b1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1462,6 +1462,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, goto exit_nonewsk; newsk->sk_gso_type = SKB_GSO_TCPV4; + inet_sk_rx_dst_set(newsk, skb); newtp = tcp_sk(newsk); newinet = inet_sk(newsk); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index d9c9dcef2de3..6ff7f10dce9d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -387,8 +387,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct tcp_sock *oldtp = tcp_sk(sk); struct tcp_cookie_values *oldcvp = oldtp->cookie_values; - newicsk->icsk_af_ops->sk_rx_dst_set(newsk, skb); - /* TCP Cookie Transactions require space for the cookie pair, * as it differs for each connection. There is no need to * copy any s_data_payload stored at the original socket. diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index bb9ce2b2f377..a3e60cc04a8a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -94,6 +94,18 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk, } #endif +static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + const struct rt6_info *rt = (const struct rt6_info *)dst; + + dst_hold(dst); + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + if (rt->rt6i_node) + inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum; +} + static void tcp_v6_hash(struct sock *sk) { if (sk->sk_state != TCP_CLOSE) { @@ -1270,6 +1282,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->sk_gso_type = SKB_GSO_TCPV6; __ip6_dst_store(newsk, dst, NULL, NULL); + inet6_sk_rx_dst_set(newsk, skb); newtcp6sk = (struct tcp6_sock *)newsk; inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; @@ -1729,18 +1742,6 @@ static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_destructor= tcp_twsk_destructor, }; -static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) -{ - struct dst_entry *dst = skb_dst(skb); - const struct rt6_info *rt = (const struct rt6_info *)dst; - - dst_hold(dst); - sk->sk_rx_dst = dst; - inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; - if (rt->rt6i_node) - inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum; -} - static const struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, -- cgit v1.2.3-70-g09d2 From 2dba62c30ec3040ef1b647a8976fd7e6854cc7a7 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 19 Aug 2012 10:16:08 +0000 Subject: netfilter: nfnetlink_log: fix NLA_PUT macro removal bug Commit 1db20a52 (nfnetlink_log: Stop using NLA_PUT*().) incorrectly converted a NLA_PUT_BE16 macro to nla_put_be32() in nfnetlink_log: - NLA_PUT_BE16(inst->skb, NFULA_HWTYPE, htons(skb->dev->type)); + if (nla_put_be32(inst->skb, NFULA_HWTYPE, htons(skb->dev->type)) Signed-off-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 169ab59ed9d4..592091c1260b 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -480,7 +480,7 @@ __build_packet_message(struct nfulnl_instance *inst, } if (indev && skb_mac_header_was_set(skb)) { - if (nla_put_be32(inst->skb, NFULA_HWTYPE, htons(skb->dev->type)) || + if (nla_put_be16(inst->skb, NFULA_HWTYPE, htons(skb->dev->type)) || nla_put_be16(inst->skb, NFULA_HWLEN, htons(skb->dev->hard_header_len)) || nla_put(inst->skb, NFULA_HWHEADER, skb->dev->hard_header_len, -- cgit v1.2.3-70-g09d2 From d1c338a509cea5378df59629ad47382810c38623 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 19 Aug 2012 12:29:16 -0700 Subject: libceph: delay debugfs initialization until we learn global_id The debugfs directory includes the cluster fsid and our unique global_id. We need to delay the initialization of the debug entry until we have learned both the fsid and our global_id from the monitor or else the second client can't create its debugfs entry and will fail (and multiple client instances aren't properly reflected in debugfs). Reported by: Yan, Zheng Signed-off-by: Sage Weil Reviewed-by: Yehuda Sadeh --- fs/ceph/debugfs.c | 1 + net/ceph/ceph_common.c | 1 - net/ceph/debugfs.c | 4 ++++ net/ceph/mon_client.c | 51 +++++++++++++++++++++++++++++++++++++++++++++----- 4 files changed, 51 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index fb962efdacee..6d59006bfa27 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -201,6 +201,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) int err = -ENOMEM; dout("ceph_fs_debugfs_init\n"); + BUG_ON(!fsc->client->debugfs_dir); fsc->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb", 0600, diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 69e38db28e5f..a8020293f342 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -84,7 +84,6 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) return -1; } } else { - pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid); memcpy(&client->fsid, fsid, sizeof(*fsid)); } return 0; diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 54b531a01121..38b5dc1823d4 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -189,6 +189,9 @@ int ceph_debugfs_client_init(struct ceph_client *client) snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid, client->monc.auth->global_id); + dout("ceph_debugfs_client_init %p %s\n", client, name); + + BUG_ON(client->debugfs_dir); client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); if (!client->debugfs_dir) goto out; @@ -234,6 +237,7 @@ out: void ceph_debugfs_client_cleanup(struct ceph_client *client) { + dout("ceph_debugfs_client_cleanup %p\n", client); debugfs_remove(client->debugfs_osdmap); debugfs_remove(client->debugfs_monmap); debugfs_remove(client->osdc.debugfs_file); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 105d533b55f3..900ea0f043fc 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -310,6 +310,17 @@ int ceph_monc_open_session(struct ceph_mon_client *monc) } EXPORT_SYMBOL(ceph_monc_open_session); +/* + * We require the fsid and global_id in order to initialize our + * debugfs dir. + */ +static bool have_debugfs_info(struct ceph_mon_client *monc) +{ + dout("have_debugfs_info fsid %d globalid %lld\n", + (int)monc->client->have_fsid, monc->auth->global_id); + return monc->client->have_fsid && monc->auth->global_id > 0; +} + /* * The monitor responds with mount ack indicate mount success. The * included client ticket allows the client to talk to MDSs and OSDs. @@ -320,9 +331,12 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, struct ceph_client *client = monc->client; struct ceph_monmap *monmap = NULL, *old = monc->monmap; void *p, *end; + int had_debugfs_info, init_debugfs = 0; mutex_lock(&monc->mutex); + had_debugfs_info = have_debugfs_info(monc); + dout("handle_monmap\n"); p = msg->front.iov_base; end = p + msg->front.iov_len; @@ -344,12 +358,22 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, if (!client->have_fsid) { client->have_fsid = true; + if (!had_debugfs_info && have_debugfs_info(monc)) { + pr_info("client%lld fsid %pU\n", + ceph_client_id(monc->client), + &monc->client->fsid); + init_debugfs = 1; + } mutex_unlock(&monc->mutex); - /* - * do debugfs initialization without mutex to avoid - * creating a locking dependency - */ - ceph_debugfs_client_init(client); + + if (init_debugfs) { + /* + * do debugfs initialization without mutex to avoid + * creating a locking dependency + */ + ceph_debugfs_client_init(monc->client); + } + goto out_unlocked; } out: @@ -865,8 +889,10 @@ static void handle_auth_reply(struct ceph_mon_client *monc, { int ret; int was_auth = 0; + int had_debugfs_info, init_debugfs = 0; mutex_lock(&monc->mutex); + had_debugfs_info = have_debugfs_info(monc); if (monc->auth->ops) was_auth = monc->auth->ops->is_authenticated(monc->auth); monc->pending_auth = 0; @@ -889,7 +915,22 @@ static void handle_auth_reply(struct ceph_mon_client *monc, __send_subscribe(monc); __resend_generic_request(monc); } + + if (!had_debugfs_info && have_debugfs_info(monc)) { + pr_info("client%lld fsid %pU\n", + ceph_client_id(monc->client), + &monc->client->fsid); + init_debugfs = 1; + } mutex_unlock(&monc->mutex); + + if (init_debugfs) { + /* + * do debugfs initialization without mutex to avoid + * creating a locking dependency + */ + ceph_debugfs_client_init(monc->client); + } } static int __validate_auth(struct ceph_mon_client *monc) -- cgit v1.2.3-70-g09d2 From be1e44441a560c43c136a562d49a1c9623c91197 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 9 Aug 2012 18:12:28 -0400 Subject: svcrpc: fix BUG() in svc_tcp_clear_pages Examination of svc_tcp_clear_pages shows that it assumes sk_tcplen is consistent with sk_pages[] (in particular, sk_pages[n] can't be NULL if sk_tcplen would lead us to expect n pages of data). svc_tcp_restore_pages zeroes out sk_pages[] while leaving sk_tcplen. This is OK, since both functions are serialized by XPT_BUSY. However, that means the inconsistency must be repaired before dropping XPT_BUSY. Therefore we should be ensuring that svc_tcp_save_pages repairs the problem before exiting svc_tcp_recv_record on error. Symptoms were a BUG() in svc_tcp_clear_pages. Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 18bc130255a7..998aa8c1807c 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1129,9 +1129,9 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) if (len >= 0) svsk->sk_tcplen += len; if (len != want) { + svc_tcp_save_pages(svsk, rqstp); if (len < 0 && len != -EAGAIN) goto err_other; - svc_tcp_save_pages(svsk, rqstp); dprintk("svc: incomplete TCP record (%d of %d)\n", svsk->sk_tcplen, svsk->sk_reclen); goto err_noclose; -- cgit v1.2.3-70-g09d2 From f06f00a24d76e168ecb38d352126fd203937b601 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 20 Aug 2012 16:04:40 -0400 Subject: svcrpc: sends on closed socket should stop immediately svc_tcp_sendto sets XPT_CLOSE if we fail to transmit the entire reply. However, the XPT_CLOSE won't be acted on immediately. Meanwhile other threads could send further replies before the socket is really shut down. This can manifest as data corruption: for example, if a truncated read reply is followed by another rpc reply, that second reply will look to the client like further read data. Symptoms were data corruption preceded by svc_tcp_sendto logging something like kernel: rpc-srv/tcp: nfsd: sent only 963696 when sending 1048708 bytes - shutting down socket Cc: stable@vger.kernel.org Reported-by: Malahal Naineni Tested-by: Malahal Naineni Signed-off-by: J. Bruce Fields --- net/sunrpc/svc_xprt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 88f2bf671960..0d693a89434f 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -794,7 +794,8 @@ int svc_send(struct svc_rqst *rqstp) /* Grab mutex to serialize outgoing data. */ mutex_lock(&xprt->xpt_mutex); - if (test_bit(XPT_DEAD, &xprt->xpt_flags)) + if (test_bit(XPT_DEAD, &xprt->xpt_flags) + || test_bit(XPT_CLOSE, &xprt->xpt_flags)) len = -ENOTCONN; else len = xprt->xpt_ops->xpo_sendto(rqstp); -- cgit v1.2.3-70-g09d2 From d10f27a750312ed5638c876e4bd6aa83664cccd8 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 17 Aug 2012 17:31:53 -0400 Subject: svcrpc: fix svc_xprt_enqueue/svc_recv busy-looping The rpc server tries to ensure that there will be room to send a reply before it receives a request. It does this by tracking, in xpt_reserved, an upper bound on the total size of the replies that is has already committed to for the socket. Currently it is adding in the estimate for a new reply *before* it checks whether there is space available. If it finds that there is not space, it then subtracts the estimate back out. This may lead the subsequent svc_xprt_enqueue to decide that there is space after all. The results is a svc_recv() that will repeatedly return -EAGAIN, causing server threads to loop without doing any actual work. Cc: stable@vger.kernel.org Reported-by: Michael Tokarev Tested-by: Michael Tokarev Signed-off-by: J. Bruce Fields --- net/sunrpc/svc_xprt.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 0d693a89434f..bac973a31367 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -316,7 +316,6 @@ static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt) */ void svc_xprt_enqueue(struct svc_xprt *xprt) { - struct svc_serv *serv = xprt->xpt_server; struct svc_pool *pool; struct svc_rqst *rqstp; int cpu; @@ -362,8 +361,6 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) rqstp, rqstp->rq_xprt); rqstp->rq_xprt = xprt; svc_xprt_get(xprt); - rqstp->rq_reserved = serv->sv_max_mesg; - atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); pool->sp_stats.threads_woken++; wake_up(&rqstp->rq_wait); } else { @@ -640,8 +637,6 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) if (xprt) { rqstp->rq_xprt = xprt; svc_xprt_get(xprt); - rqstp->rq_reserved = serv->sv_max_mesg; - atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); /* As there is a shortage of threads and this request * had to be queued, don't allow the thread to wait so @@ -738,6 +733,8 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) else len = xprt->xpt_ops->xpo_recvfrom(rqstp); dprintk("svc: got len=%d\n", len); + rqstp->rq_reserved = serv->sv_max_mesg; + atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); } svc_xprt_received(xprt); -- cgit v1.2.3-70-g09d2 From 144d56e91044181ec0ef67aeca91e9a8b5718348 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 20 Aug 2012 00:22:46 +0000 Subject: tcp: fix possible socket refcount problem Commit 6f458dfb40 (tcp: improve latencies of timer triggered events) added bug leading to following trace : [ 2866.131281] IPv4: Attempt to release TCP socket in state 1 ffff880019ec0000 [ 2866.131726] [ 2866.132188] ========================= [ 2866.132281] [ BUG: held lock freed! ] [ 2866.132281] 3.6.0-rc1+ #622 Not tainted [ 2866.132281] ------------------------- [ 2866.132281] kworker/0:1/652 is freeing memory ffff880019ec0000-ffff880019ec0a1f, with a lock still held there! [ 2866.132281] (sk_lock-AF_INET-RPC){+.+...}, at: [] tcp_sendmsg+0x29/0xcc6 [ 2866.132281] 4 locks held by kworker/0:1/652: [ 2866.132281] #0: (rpciod){.+.+.+}, at: [] process_one_work+0x1de/0x47f [ 2866.132281] #1: ((&task->u.tk_work)){+.+.+.}, at: [] process_one_work+0x1de/0x47f [ 2866.132281] #2: (sk_lock-AF_INET-RPC){+.+...}, at: [] tcp_sendmsg+0x29/0xcc6 [ 2866.132281] #3: (&icsk->icsk_retransmit_timer){+.-...}, at: [] run_timer_softirq+0x1ad/0x35f [ 2866.132281] [ 2866.132281] stack backtrace: [ 2866.132281] Pid: 652, comm: kworker/0:1 Not tainted 3.6.0-rc1+ #622 [ 2866.132281] Call Trace: [ 2866.132281] [] debug_check_no_locks_freed+0x112/0x159 [ 2866.132281] [] ? __sk_free+0xfd/0x114 [ 2866.132281] [] kmem_cache_free+0x6b/0x13a [ 2866.132281] [] __sk_free+0xfd/0x114 [ 2866.132281] [] sk_free+0x1c/0x1e [ 2866.132281] [] tcp_write_timer+0x51/0x56 [ 2866.132281] [] run_timer_softirq+0x218/0x35f [ 2866.132281] [] ? run_timer_softirq+0x1ad/0x35f [ 2866.132281] [] ? rb_commit+0x58/0x85 [ 2866.132281] [] ? tcp_write_timer_handler+0x148/0x148 [ 2866.132281] [] __do_softirq+0xcb/0x1f9 [ 2866.132281] [] ? _raw_spin_unlock+0x29/0x2e [ 2866.132281] [] call_softirq+0x1c/0x30 [ 2866.132281] [] do_softirq+0x4a/0xa6 [ 2866.132281] [] irq_exit+0x51/0xad [ 2866.132281] [] do_IRQ+0x9d/0xb4 [ 2866.132281] [] common_interrupt+0x6f/0x6f [ 2866.132281] [] ? sched_clock_cpu+0x58/0xd1 [ 2866.132281] [] ? _raw_spin_unlock_irqrestore+0x4c/0x56 [ 2866.132281] [] mod_timer+0x178/0x1a9 [ 2866.132281] [] sk_reset_timer+0x19/0x26 [ 2866.132281] [] tcp_rearm_rto+0x99/0xa4 [ 2866.132281] [] tcp_event_new_data_sent+0x6e/0x70 [ 2866.132281] [] tcp_write_xmit+0x7de/0x8e4 [ 2866.132281] [] ? __alloc_skb+0xa0/0x1a1 [ 2866.132281] [] __tcp_push_pending_frames+0x2e/0x8a [ 2866.132281] [] tcp_sendmsg+0xb32/0xcc6 [ 2866.132281] [] inet_sendmsg+0xaa/0xd5 [ 2866.132281] [] ? inet_autobind+0x5f/0x5f [ 2866.132281] [] ? trace_clock_local+0x9/0xb [ 2866.132281] [] sock_sendmsg+0xa3/0xc4 [ 2866.132281] [] ? rb_reserve_next_event+0x26f/0x2d5 [ 2866.132281] [] ? native_sched_clock+0x29/0x6f [ 2866.132281] [] ? sched_clock+0x9/0xd [ 2866.132281] [] ? trace_clock_local+0x9/0xb [ 2866.132281] [] kernel_sendmsg+0x37/0x43 [ 2866.132281] [] xs_send_kvec+0x77/0x80 [ 2866.132281] [] xs_sendpages+0x6f/0x1a0 [ 2866.132281] [] ? try_to_del_timer_sync+0x55/0x61 [ 2866.132281] [] xs_tcp_send_request+0x55/0xf1 [ 2866.132281] [] xprt_transmit+0x89/0x1db [ 2866.132281] [] ? call_connect+0x3c/0x3c [ 2866.132281] [] call_transmit+0x1c5/0x20e [ 2866.132281] [] __rpc_execute+0x6f/0x225 [ 2866.132281] [] ? call_connect+0x3c/0x3c [ 2866.132281] [] rpc_async_schedule+0x28/0x34 [ 2866.132281] [] process_one_work+0x24d/0x47f [ 2866.132281] [] ? process_one_work+0x1de/0x47f [ 2866.132281] [] ? __rpc_execute+0x225/0x225 [ 2866.132281] [] worker_thread+0x236/0x317 [ 2866.132281] [] ? process_scheduled_works+0x2f/0x2f [ 2866.132281] [] kthread+0x9a/0xa2 [ 2866.132281] [] kernel_thread_helper+0x4/0x10 [ 2866.132281] [] ? retint_restore_args+0x13/0x13 [ 2866.132281] [] ? __init_kthread_worker+0x5a/0x5a [ 2866.132281] [] ? gs_change+0x13/0x13 [ 2866.308506] IPv4: Attempt to release TCP socket in state 1 ffff880019ec0000 [ 2866.309689] ============================================================================= [ 2866.310254] BUG TCP (Not tainted): Object already free [ 2866.310254] ----------------------------------------------------------------------------- [ 2866.310254] The bug comes from the fact that timer set in sk_reset_timer() can run before we actually do the sock_hold(). socket refcount reaches zero and we free the socket too soon. timer handler is not allowed to reduce socket refcnt if socket is owned by the user, or we need to change sk_reset_timer() implementation. We should take a reference on the socket in case TCP_DELACK_TIMER_DEFERRED or TCP_DELACK_TIMER_DEFERRED bit are set in tsq_flags Also fix a typo in tcp_delack_timer(), where TCP_WRITE_TIMER_DEFERRED was used instead of TCP_DELACK_TIMER_DEFERRED. For consistency, use same socket refcount change for TCP_MTU_REDUCED_DEFERRED, even if not fired from a timer. Reported-by: Fengguang Wu Tested-by: Fengguang Wu Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 8 +++++--- net/ipv4/tcp_output.c | 14 +++++++++----- net/ipv4/tcp_timer.c | 6 ++++-- 3 files changed, 18 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5bf2040b25b1..00a748d14062 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -417,10 +417,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ tp->mtu_info = info; - if (!sock_owned_by_user(sk)) + if (!sock_owned_by_user(sk)) { tcp_v4_mtu_reduced(sk); - else - set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags); + } else { + if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags)) + sock_hold(sk); + } goto out; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 20dfd892c86f..d04632673a9e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -910,14 +910,18 @@ void tcp_release_cb(struct sock *sk) if (flags & (1UL << TCP_TSQ_DEFERRED)) tcp_tsq_handler(sk); - if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) + if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) { tcp_write_timer_handler(sk); - - if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) + __sock_put(sk); + } + if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) { tcp_delack_timer_handler(sk); - - if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) + __sock_put(sk); + } + if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) { sk->sk_prot->mtu_reduced(sk); + __sock_put(sk); + } } EXPORT_SYMBOL(tcp_release_cb); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 6df36ad55a38..b774a03bd1dc 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -252,7 +252,8 @@ static void tcp_delack_timer(unsigned long data) inet_csk(sk)->icsk_ack.blocked = 1; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); /* deleguate our work to tcp_release_cb() */ - set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags); + if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) + sock_hold(sk); } bh_unlock_sock(sk); sock_put(sk); @@ -481,7 +482,8 @@ static void tcp_write_timer(unsigned long data) tcp_write_timer_handler(sk); } else { /* deleguate our work to tcp_release_cb() */ - set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags); + if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags)) + sock_hold(sk); } bh_unlock_sock(sk); sock_put(sk); -- cgit v1.2.3-70-g09d2 From 1a7b27c97ce675b42eeb7bfaf6e15c34f35c8f95 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Mon, 20 Aug 2012 02:52:09 +0000 Subject: ipv4: Use newinet->inet_opt in inet_csk_route_child_sock() Since 0e734419923bd ("ipv4: Use inet_csk_route_child_sock() in DCCP and TCP."), inet_csk_route_child_sock() is called instead of inet_csk_route_req(). However, after creating the child-sock in tcp/dccp_v4_syn_recv_sock(), ireq->opt is set to NULL, before calling inet_csk_route_child_sock(). Thus, inside inet_csk_route_child_sock() opt is always NULL and the SRR-options are not respected anymore. Packets sent by the server won't have the correct destination-IP. This patch fixes it by accessing newinet->inet_opt instead of ireq->opt inside inet_csk_route_child_sock(). Reported-by: Luca Boccassi Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index db0cf17c00f7..7f75f21d7b83 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -404,12 +404,15 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, { const struct inet_request_sock *ireq = inet_rsk(req); struct inet_sock *newinet = inet_sk(newsk); - struct ip_options_rcu *opt = ireq->opt; + struct ip_options_rcu *opt; struct net *net = sock_net(sk); struct flowi4 *fl4; struct rtable *rt; fl4 = &newinet->cork.fl.u.ip4; + + rcu_read_lock(); + opt = rcu_dereference(newinet->inet_opt); flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, sk->sk_protocol, inet_sk_flowi_flags(sk), @@ -421,11 +424,13 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, goto no_route; if (opt && opt->opt.is_strictroute && rt->rt_gateway) goto route_err; + rcu_read_unlock(); return &rt->dst; route_err: ip_rt_put(rt); no_route: + rcu_read_unlock(); IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } -- cgit v1.2.3-70-g09d2 From a9915a1b52df52ad87f3b33422da95cf25372f09 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 20 Aug 2012 07:26:45 +0000 Subject: ipv4: fix ip header ident selection in __ip_make_skb() Christian Casteyde reported a kmemcheck 32-bit read from uninitialized memory in __ip_select_ident(). It turns out that __ip_make_skb() called ip_select_ident() before properly initializing iph->daddr. This is a bug uncovered by commit 1d861aa4b3fb (inet: Minimize use of cached route inetpeer.) Addresses https://bugzilla.kernel.org/show_bug.cgi?id=46131 Reported-by: Christian Casteyde Signed-off-by: Eric Dumazet Cc: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 147ccc3e93db..c196d749daf2 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1338,10 +1338,10 @@ struct sk_buff *__ip_make_skb(struct sock *sk, iph->ihl = 5; iph->tos = inet->tos; iph->frag_off = df; - ip_select_ident(iph, &rt->dst, sk); iph->ttl = ttl; iph->protocol = sk->sk_protocol; ip_copy_addrs(iph, fl4); + ip_select_ident(iph, &rt->dst, sk); if (opt) { iph->ihl += opt->optlen>>2; -- cgit v1.2.3-70-g09d2 From e0e3cea46d31d23dc40df0a49a7a2c04fe8edfea Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Aug 2012 06:21:17 +0000 Subject: af_netlink: force credentials passing [CVE-2012-3520] Pablo Neira Ayuso discovered that avahi and potentially NetworkManager accept spoofed Netlink messages because of a kernel bug. The kernel passes all-zero SCM_CREDENTIALS ancillary data to the receiver if the sender did not provide such data, instead of not including any such data at all or including the correct data from the peer (as it is the case with AF_UNIX). This bug was introduced in commit 16e572626961 (af_unix: dont send SCM_CREDENTIALS by default) This patch forces passing credentials for netlink, as before the regression. Another fix would be to not add SCM_CREDENTIALS in netlink messages if not provided by the sender, but it might break some programs. With help from Florian Weimer & Petr Matousek This issue is designated as CVE-2012-3520 Signed-off-by: Eric Dumazet Cc: Petr Matousek Cc: Florian Weimer Cc: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/scm.h | 4 +++- net/netlink/af_netlink.c | 2 +- net/unix/af_unix.c | 4 ++-- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/scm.h b/include/net/scm.h index 079d7887dac1..7dc0854f0b38 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -70,9 +70,11 @@ static __inline__ void scm_destroy(struct scm_cookie *scm) } static __inline__ int scm_send(struct socket *sock, struct msghdr *msg, - struct scm_cookie *scm) + struct scm_cookie *scm, bool forcecreds) { memset(scm, 0, sizeof(*scm)); + if (forcecreds) + scm_set_cred(scm, task_tgid(current), current_cred()); unix_get_peersec_dgram(sock, scm); if (msg->msg_controllen <= 0) return 0; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5463969da45b..1445d73533ed 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1362,7 +1362,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &scm; - err = scm_send(sock, msg, siocb->scm); + err = scm_send(sock, msg, siocb->scm, true); if (err < 0) return err; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e4768c180da2..c5ee4ff61364 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1450,7 +1450,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &tmp_scm; wait_for_unix_gc(); - err = scm_send(sock, msg, siocb->scm); + err = scm_send(sock, msg, siocb->scm, false); if (err < 0) return err; @@ -1619,7 +1619,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, if (NULL == siocb->scm) siocb->scm = &tmp_scm; wait_for_unix_gc(); - err = scm_send(sock, msg, siocb->scm); + err = scm_send(sock, msg, siocb->scm, false); if (err < 0) return err; -- cgit v1.2.3-70-g09d2 From 6d4221b53707486dfad3f5bfe568d2ce7f4c9863 Mon Sep 17 00:00:00 2001 From: Jim Schutt Date: Fri, 10 Aug 2012 10:37:38 -0700 Subject: libceph: avoid truncation due to racing banners Because the Ceph client messenger uses a non-blocking connect, it is possible for the sending of the client banner to race with the arrival of the banner sent by the peer. When ceph_sock_state_change() notices the connect has completed, it schedules work to process the socket via con_work(). During this time the peer is writing its banner, and arrival of the peer banner races with con_work(). If con_work() calls try_read() before the peer banner arrives, there is nothing for it to do, after which con_work() calls try_write() to send the client's banner. In this case Ceph's protocol negotiation can complete succesfully. The server-side messenger immediately sends its banner and addresses after accepting a connect request, *before* actually attempting to read or verify the banner from the client. As a result, it is possible for the banner from the server to arrive before con_work() calls try_read(). If that happens, try_read() will read the banner and prepare protocol negotiation info via prepare_write_connect(). prepare_write_connect() calls con_out_kvec_reset(), which discards the as-yet-unsent client banner. Next, con_work() calls try_write(), which sends the protocol negotiation info rather than the banner that the peer is expecting. The result is that the peer sees an invalid banner, and the client reports "negotiation failed". Fix this by moving con_out_kvec_reset() out of prepare_write_connect() to its callers at all locations except the one where the banner might still need to be sent. [elder@inktak.com: added note about server-side behavior] Signed-off-by: Jim Schutt Reviewed-by: Alex Elder --- net/ceph/messenger.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index b9796750034a..24c5eea8c45b 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -915,7 +915,6 @@ static int prepare_write_connect(struct ceph_connection *con) con->out_connect.authorizer_len = auth ? cpu_to_le32(auth->authorizer_buf_len) : 0; - con_out_kvec_reset(con); con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect); if (auth && auth->authorizer_buf_len) @@ -1557,6 +1556,7 @@ static int process_connect(struct ceph_connection *con) return -1; } con->auth_retry = 1; + con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) return ret; @@ -1577,6 +1577,7 @@ static int process_connect(struct ceph_connection *con) ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr.in_addr)); reset_connection(con); + con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) return ret; @@ -1601,6 +1602,7 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->out_connect.connect_seq), le32_to_cpu(con->in_reply.connect_seq)); con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); + con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) return ret; @@ -1617,6 +1619,7 @@ static int process_connect(struct ceph_connection *con) le32_to_cpu(con->in_reply.global_seq)); get_global_seq(con->msgr, le32_to_cpu(con->in_reply.global_seq)); + con_out_kvec_reset(con); ret = prepare_write_connect(con); if (ret < 0) return ret; @@ -2135,7 +2138,11 @@ more: BUG_ON(con->state != CON_STATE_CONNECTING); con->state = CON_STATE_NEGOTIATING; - /* Banner is good, exchange connection info */ + /* + * Received banner is good, exchange connection info. + * Do not reset out_kvec, as sending our banner raced + * with receiving peer banner after connect completed. + */ ret = prepare_write_connect(con); if (ret < 0) goto out; -- cgit v1.2.3-70-g09d2 From 27f011243a6e4e8b81078df1d83608dae31e3d38 Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Mon, 20 Aug 2012 11:28:25 -0700 Subject: mac80211: fix DS to MBSS address translation The destination address of unicast frames forwarded through a mesh gate was being replaced with the broadcast address. Instead leave the original destination address as the mesh DA. If the nexthop address is not in the mpath table it will be resolved. If that fails, the frame will be forwarded to known mesh gates. Reported-by: Cedric Voncken Signed-off-by: Thomas Pedersen Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index acf712ffb5e6..c5e8c9c31f76 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1811,37 +1811,31 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, meshhdrlen = ieee80211_new_mesh_header(&mesh_hdr, sdata, NULL, NULL); } else { - int is_mesh_mcast = 1; - const u8 *mesh_da; + /* DS -> MBSS (802.11-2012 13.11.3.3). + * For unicast with unknown forwarding information, + * destination might be in the MBSS or if that fails + * forwarded to another mesh gate. In either case + * resolution will be handled in ieee80211_xmit(), so + * leave the original DA. This also works for mcast */ + const u8 *mesh_da = skb->data; + + if (mppath) + mesh_da = mppath->mpp; + else if (mpath) + mesh_da = mpath->dst; + rcu_read_unlock(); - if (is_multicast_ether_addr(skb->data)) - /* DA TA mSA AE:SA */ - mesh_da = skb->data; - else { - static const u8 bcast[ETH_ALEN] = - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; - if (mppath) { - /* RA TA mDA mSA AE:DA SA */ - mesh_da = mppath->mpp; - is_mesh_mcast = 0; - } else if (mpath) { - mesh_da = mpath->dst; - is_mesh_mcast = 0; - } else { - /* DA TA mSA AE:SA */ - mesh_da = bcast; - } - } hdrlen = ieee80211_fill_mesh_addresses(&hdr, &fc, mesh_da, sdata->vif.addr); - rcu_read_unlock(); - if (is_mesh_mcast) + if (is_multicast_ether_addr(mesh_da)) + /* DA TA mSA AE:SA */ meshhdrlen = ieee80211_new_mesh_header(&mesh_hdr, sdata, skb->data + ETH_ALEN, NULL); else + /* RA TA mDA mSA AE:DA SA */ meshhdrlen = ieee80211_new_mesh_header(&mesh_hdr, sdata, -- cgit v1.2.3-70-g09d2 From 9b04f350057863d1fad1ba071e09362a1da3503e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Aug 2012 20:48:29 +0000 Subject: ipv4: properly update pmtu Sylvain Munault reported following info : - TCP connection get "stuck" with data in send queue when doing "large" transfers ( like typing 'ps ax' on a ssh connection ) - Only happens on path where the PMTU is lower than the MTU of the interface - Is not present right after boot, it only appears 10-20min after boot or so. (and that's inside the _same_ TCP connection, it works fine at first and then in the same ssh session, it'll get stuck) - Definitely seems related to fragments somehow since I see a router sending ICMP message saying fragmentation is needed. - Exact same setup works fine with kernel 3.5.1 Problem happens when the 10 minutes (ip_rt_mtu_expires) expiration period is over. ip_rt_update_pmtu() calls dst_set_expires() to rearm a new expiration, but dst_set_expires() does nothing because dst.expires is already set. It seems we want to set the expires field to a new value, regardless of prior one. With help from Julian Anastasov. Reported-by: Sylvain Munaut Signed-off-by: Eric Dumazet CC: Julian Anastasov Tested-by: Sylvain Munaut Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index fd9ecb52c66b..8c8c748ebb28 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -956,7 +956,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, dst->obsolete = DST_OBSOLETE_KILL; } else { rt->rt_pmtu = mtu; - dst_set_expires(&rt->dst, ip_rt_mtu_expires); + rt->dst.expires = max(1UL, jiffies + ip_rt_mtu_expires); } } -- cgit v1.2.3-70-g09d2 From a0dfb2634e5671770f598cda08002d8cda66ac77 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Thu, 23 Aug 2012 19:51:21 +0800 Subject: af_packet: match_fanout_group() can be static cc: Eric Leblond Signed-off-by: Fengguang Wu Signed-off-by: David S. Miller --- net/packet/af_packet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index aee7196aac36..c5c9e2a54218 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1273,7 +1273,7 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po) spin_unlock(&f->lock); } -bool match_fanout_group(struct packet_type *ptype, struct sock * sk) +static bool match_fanout_group(struct packet_type *ptype, struct sock * sk) { if (ptype->af_packet_priv == (void*)((struct packet_sock *)sk)->fanout) return true; -- cgit v1.2.3-70-g09d2 From 78df76a065ae3b5dbcb9a29912adc02f697de498 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 24 Aug 2012 05:40:47 +0000 Subject: ipv4: take rt_uncached_lock only if needed Multicast traffic allocates dst with DST_NOCACHE, but dst is not inserted into rt_uncached_list. This slowdown multicast workloads on SMP because rt_uncached_lock is contended. Change the test before taking the lock to actually check the dst was inserted into rt_uncached_list. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 8c8c748ebb28..24fd4c596643 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1263,7 +1263,7 @@ static void ipv4_dst_destroy(struct dst_entry *dst) { struct rtable *rt = (struct rtable *) dst; - if (dst->flags & DST_NOCACHE) { + if (!list_empty(&rt->rt_uncached)) { spin_lock_bh(&rt_uncached_lock); list_del(&rt->rt_uncached); spin_unlock_bh(&rt_uncached_lock); -- cgit v1.2.3-70-g09d2 From 20e1db19db5d6b9e4e83021595eab0dc8f107bef Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 23 Aug 2012 02:09:11 +0000 Subject: netlink: fix possible spoofing from non-root processes Non-root user-space processes can send Netlink messages to other processes that are well-known for being subscribed to Netlink asynchronous notifications. This allows ilegitimate non-root process to send forged messages to Netlink subscribers. The userspace process usually verifies the legitimate origin in two ways: a) Socket credentials. If UID != 0, then the message comes from some ilegitimate process and the message needs to be dropped. b) Netlink portID. In general, portID == 0 means that the origin of the messages comes from the kernel. Thus, discarding any message not coming from the kernel. However, ctnetlink sets the portID in event messages that has been triggered by some user-space process, eg. conntrack utility. So other processes subscribed to ctnetlink events, eg. conntrackd, know that the event was triggered by some user-space action. Neither of the two ways to discard ilegitimate messages coming from non-root processes can help for ctnetlink. This patch adds capability validation in case that dst_pid is set in netlink_sendmsg(). This approach is aggressive since existing applications using any Netlink bus to deliver messages between two user-space processes will break. Note that the exception is NETLINK_USERSOCK, since it is reserved for netlink-to-netlink userspace communication. Still, if anyone wants that his Netlink bus allows netlink-to-netlink userspace, then they can set NL_NONROOT_SEND. However, by default, I don't think it makes sense to allow to use NETLINK_ROUTE to communicate two processes that are sending no matter what information that is not related to link/neighbouring/routing. They should be using NETLINK_USERSOCK instead for that. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 1445d73533ed..527023823b5c 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1373,7 +1373,8 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, dst_pid = addr->nl_pid; dst_group = ffs(addr->nl_groups); err = -EPERM; - if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND)) + if ((dst_group || dst_pid) && + !netlink_capable(sock, NL_NONROOT_SEND)) goto out; } else { dst_pid = nlk->dst_pid; @@ -2147,6 +2148,7 @@ static void __init netlink_add_usersock_entry(void) rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners); nl_table[NETLINK_USERSOCK].module = THIS_MODULE; nl_table[NETLINK_USERSOCK].registered = 1; + nl_table[NETLINK_USERSOCK].nl_nonroot = NL_NONROOT_SEND; netlink_table_ungrab(); } -- cgit v1.2.3-70-g09d2 From 7c4a56fec379ac0d7754e0d4da6a7361f1a4fe64 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 23 Aug 2012 07:05:17 +0000 Subject: tcp: fix cwnd reduction for non-sack recovery The cwnd reduction in fast recovery is based on the number of packets newly delivered per ACK. For non-sack connections every DUPACK signifies a packet has been delivered, but the sender mistakenly skips counting them for cwnd reduction. The fix is to compute newly_acked_sacked after DUPACKs are accounted in sacked_out for non-sack connections. Signed-off-by: Yuchung Cheng Acked-by: Nandita Dukkipati Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 85308b90df80..6e38c6c23caa 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2926,13 +2926,14 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) * tcp_xmit_retransmit_queue(). */ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, - int newly_acked_sacked, bool is_dupack, + int prior_sacked, bool is_dupack, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); + int newly_acked_sacked = 0; int fast_rexmit = 0; if (WARN_ON(!tp->packets_out && tp->sacked_out)) @@ -2992,6 +2993,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, tcp_add_reno_sack(sk); } else do_lost = tcp_try_undo_partial(sk, pkts_acked); + newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; break; case TCP_CA_Loss: if (flag & FLAG_DATA_ACKED) @@ -3013,6 +3015,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, if (is_dupack) tcp_add_reno_sack(sk); } + newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; if (icsk->icsk_ca_state <= TCP_CA_Disorder) tcp_try_undo_dsack(sk); @@ -3590,7 +3593,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) int prior_packets; int prior_sacked = tp->sacked_out; int pkts_acked = 0; - int newly_acked_sacked = 0; bool frto_cwnd = false; /* If the ack is older than previous acks @@ -3666,8 +3668,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); pkts_acked = prior_packets - tp->packets_out; - newly_acked_sacked = (prior_packets - prior_sacked) - - (tp->packets_out - tp->sacked_out); if (tp->frto_counter) frto_cwnd = tcp_process_frto(sk, flag); @@ -3681,7 +3681,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, prior_in_flight); is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked, + tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, is_dupack, flag); } else { if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) @@ -3698,7 +3698,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked, + tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, is_dupack, flag); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than @@ -3718,8 +3718,7 @@ old_ack: */ if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); - newly_acked_sacked = tp->sacked_out - prior_sacked; - tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked, + tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, is_dupack, flag); } -- cgit v1.2.3-70-g09d2 From cc110922da7e902b62d18641a370fec01a9fa794 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Thu, 23 Aug 2012 21:32:43 -0300 Subject: Bluetooth: Change signature of smp_conn_security() To make it clear that it may be called from contexts that may not have any knowledge of L2CAP, we change the connection parameter, to receive a hci_conn. This also makes it clear that it is checking the security of the link. Signed-off-by: Vinicius Costa Gomes Signed-off-by: Gustavo Padovan --- include/net/bluetooth/smp.h | 2 +- net/bluetooth/l2cap_core.c | 11 ++++++----- net/bluetooth/l2cap_sock.c | 2 +- net/bluetooth/smp.c | 4 ++-- 4 files changed, 10 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/smp.h b/include/net/bluetooth/smp.h index ca356a734920..8b27927b2a55 100644 --- a/include/net/bluetooth/smp.h +++ b/include/net/bluetooth/smp.h @@ -136,7 +136,7 @@ struct smp_chan { }; /* SMP Commands */ -int smp_conn_security(struct l2cap_conn *conn, __u8 sec_level); +int smp_conn_security(struct hci_conn *hcon, __u8 sec_level); int smp_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb); int smp_distribute_keys(struct l2cap_conn *conn, __u8 force); int smp_user_confirm_reply(struct hci_conn *conn, u16 mgmt_op, __le32 passkey); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index daa149b7003c..4ea1710a4783 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1199,14 +1199,15 @@ clean: static void l2cap_conn_ready(struct l2cap_conn *conn) { struct l2cap_chan *chan; + struct hci_conn *hcon = conn->hcon; BT_DBG("conn %p", conn); - if (!conn->hcon->out && conn->hcon->type == LE_LINK) + if (!hcon->out && hcon->type == LE_LINK) l2cap_le_conn_ready(conn); - if (conn->hcon->out && conn->hcon->type == LE_LINK) - smp_conn_security(conn, conn->hcon->pending_sec_level); + if (hcon->out && hcon->type == LE_LINK) + smp_conn_security(hcon, hcon->pending_sec_level); mutex_lock(&conn->chan_lock); @@ -1219,8 +1220,8 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) continue; } - if (conn->hcon->type == LE_LINK) { - if (smp_conn_security(conn, chan->sec_level)) + if (hcon->type == LE_LINK) { + if (smp_conn_security(hcon, chan->sec_level)) l2cap_chan_ready(chan); } else if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index b94abd30e6f9..45cb0b0dd2c7 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -615,7 +615,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, ch break; } - if (smp_conn_security(conn, sec.level)) + if (smp_conn_security(conn->hcon, sec.level)) break; sk->sk_state = BT_CONFIG; chan->state = BT_CONFIG; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 98ffc1b6a6fa..8c225ef349cd 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -760,9 +760,9 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb) return 0; } -int smp_conn_security(struct l2cap_conn *conn, __u8 sec_level) +int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) { - struct hci_conn *hcon = conn->hcon; + struct l2cap_conn *conn = hcon->l2cap_data; struct smp_chan *smp = conn->smp_chan; __u8 authreq; -- cgit v1.2.3-70-g09d2 From d8343f125710fb596f7a88cd756679f14f4e77b9 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Thu, 23 Aug 2012 21:32:44 -0300 Subject: Bluetooth: Fix sending a HCI Authorization Request over LE links MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the case that the link is already in the connected state and a Pairing request arrives from the mgmt interface, hci_conn_security() would be called but it was not considering LE links. Reported-by: João Paulo Rechi Vita Signed-off-by: Vinicius Costa Gomes Signed-off-by: Gustavo Padovan --- net/bluetooth/hci_conn.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 5ad7da217474..3c094e78dde9 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -29,6 +29,7 @@ #include #include #include +#include static void hci_le_connect(struct hci_conn *conn) { @@ -619,6 +620,9 @@ int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) { BT_DBG("hcon %p", conn); + if (conn->type == LE_LINK) + return smp_conn_security(conn, sec_level); + /* For sdp we don't need the link key. */ if (sec_level == BT_SECURITY_SDP) return 1; -- cgit v1.2.3-70-g09d2 From 072a9c48600409d72aeb0d5b29fbb75861a06631 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Fri, 24 Aug 2012 21:41:11 +0000 Subject: netpoll: revert 6bdb7fe3104 and fix be_poll() instead Against -net. In the patch "netpoll: re-enable irq in poll_napi()", I tried to fix the following warning: [100718.051041] ------------[ cut here ]------------ [100718.051048] WARNING: at kernel/softirq.c:159 local_bh_enable_ip+0x7d/0xb0() (Not tainted) [100718.051049] Hardware name: ProLiant BL460c G7 ... [100718.051068] Call Trace: [100718.051073] [] ? warn_slowpath_common+0x87/0xc0 [100718.051075] [] ? warn_slowpath_null+0x1a/0x20 [100718.051077] [] ? local_bh_enable_ip+0x7d/0xb0 [100718.051080] [] ? _spin_unlock_bh+0x1b/0x20 [100718.051085] [] ? be_process_mcc+0x74/0x230 [be2net] [100718.051088] [] ? be_poll_tx_mcc+0x16c/0x290 [be2net] [100718.051090] [] ? netpoll_poll_dev+0xd6/0x490 [100718.051095] [] ? bond_poll_controller+0x75/0x80 [bonding] [100718.051097] [] ? netpoll_poll_dev+0x45/0x490 [100718.051100] [] ? ksize+0x19/0x80 [100718.051102] [] ? netpoll_send_skb_on_dev+0x157/0x240 by reenabling IRQ before calling ->poll, but it seems more problems are introduced after that patch: http://ozlabs.org/~akpm/stuff/IMG_20120824_122054.jpg http://marc.info/?l=linux-netdev&m=134563282530588&w=2 So it is safe to fix be2net driver code directly. This patch reverts the offending commit and fixes be_poll() by avoid disabling BH there, this is okay because be_poll() can be called either by poll_napi() which already disables IRQ, or by net_rx_action() which already disables BH. Reported-by: Andrew Morton Reported-by: Sylvain Munaut Cc: Sylvain Munaut Cc: Andrew Morton Cc: David Miller Cc: Sathya Perla Cc: Subbu Seetharaman Cc: Ajit Khaparde Signed-off-by: Cong Wang Tested-by: Sylvain Munaut Signed-off-by: David S. Miller --- drivers/net/ethernet/emulex/benet/be_cmds.c | 6 ++++-- drivers/net/ethernet/emulex/benet/be_main.c | 2 ++ net/core/netpoll.c | 10 +--------- 3 files changed, 7 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c b/drivers/net/ethernet/emulex/benet/be_cmds.c index 7fac97b4bb59..8c63d06ab12b 100644 --- a/drivers/net/ethernet/emulex/benet/be_cmds.c +++ b/drivers/net/ethernet/emulex/benet/be_cmds.c @@ -259,7 +259,7 @@ int be_process_mcc(struct be_adapter *adapter) int num = 0, status = 0; struct be_mcc_obj *mcc_obj = &adapter->mcc_obj; - spin_lock_bh(&adapter->mcc_cq_lock); + spin_lock(&adapter->mcc_cq_lock); while ((compl = be_mcc_compl_get(adapter))) { if (compl->flags & CQE_FLAGS_ASYNC_MASK) { /* Interpret flags as an async trailer */ @@ -280,7 +280,7 @@ int be_process_mcc(struct be_adapter *adapter) if (num) be_cq_notify(adapter, mcc_obj->cq.id, mcc_obj->rearm_cq, num); - spin_unlock_bh(&adapter->mcc_cq_lock); + spin_unlock(&adapter->mcc_cq_lock); return status; } @@ -295,7 +295,9 @@ static int be_mcc_wait_compl(struct be_adapter *adapter) if (be_error(adapter)) return -EIO; + local_bh_disable(); status = be_process_mcc(adapter); + local_bh_enable(); if (atomic_read(&mcc_obj->q.used) == 0) break; diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 90a903d83d87..78b8aa8069f0 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -3763,7 +3763,9 @@ static void be_worker(struct work_struct *work) /* when interrupts are not yet enabled, just reap any pending * mcc completions */ if (!netif_running(adapter->netdev)) { + local_bh_disable(); be_process_mcc(adapter); + local_bh_enable(); goto reschedule; } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 346b1eb83a1f..e4ba3e70c174 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -168,24 +168,16 @@ static void poll_napi(struct net_device *dev) struct napi_struct *napi; int budget = 16; - WARN_ON_ONCE(!irqs_disabled()); - list_for_each_entry(napi, &dev->napi_list, dev_list) { - local_irq_enable(); if (napi->poll_owner != smp_processor_id() && spin_trylock(&napi->poll_lock)) { - rcu_read_lock_bh(); budget = poll_one_napi(rcu_dereference_bh(dev->npinfo), napi, budget); - rcu_read_unlock_bh(); spin_unlock(&napi->poll_lock); - if (!budget) { - local_irq_disable(); + if (!budget) break; - } } - local_irq_disable(); } } -- cgit v1.2.3-70-g09d2 From 0a54e939d8c2647c711ef2369c3e73c3b7f3028c Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 29 Aug 2012 06:49:11 +0000 Subject: ipvs: fix error return code Initialize return variable before exiting on an error path. A simplified version of the semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // ( if@p1 (\(ret < 0\|ret != 0\)) { ... return ret; } | ret@p1 = 0 ) ... when != ret = e1 when != &ret *if(...) { ... when != ret = e2 when forall return ret; } // Signed-off-by: Julia Lawall Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_ctl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 72bf32a84874..f51013c07b9f 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1171,8 +1171,10 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, goto out_err; } svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (!svc->stats.cpustats) + if (!svc->stats.cpustats) { + ret = -ENOMEM; goto out_err; + } /* I'm the first user of the service */ atomic_set(&svc->usecnt, 0); -- cgit v1.2.3-70-g09d2 From ef6acf68c259d907517dcc0ffefcd4e30276ae29 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 29 Aug 2012 06:49:16 +0000 Subject: netfilter: ctnetlink: fix error return code in init path Initialize return variable before exiting on an error path. A simplified version of the semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // ( if@p1 (\(ret < 0\|ret != 0\)) { ... return ret; } | ret@p1 = 0 ) ... when != ret = e1 when != &ret *if(...) { ... when != ret = e2 when forall return ret; } // Signed-off-by: Julia Lawall Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index da4fc37a8578..9807f3278fcb 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2790,7 +2790,8 @@ static int __init ctnetlink_init(void) goto err_unreg_subsys; } - if (register_pernet_subsys(&ctnetlink_net_ops)) { + ret = register_pernet_subsys(&ctnetlink_net_ops); + if (ret < 0) { pr_err("ctnetlink_init: cannot register pernet operations\n"); goto err_unreg_exp_subsys; } -- cgit v1.2.3-70-g09d2 From 6fc09f10f12477bdaa54e94f9cb428bef6d81315 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 29 Aug 2012 06:49:17 +0000 Subject: netfilter: nfnetlink_log: fix error return code in init path Initialize return variable before exiting on an error path. A simplified version of the semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // ( if@p1 (\(ret < 0\|ret != 0\)) { ... return ret; } | ret@p1 = 0 ) ... when != ret = e1 when != &ret *if(...) { ... when != ret = e2 when forall return ret; } // Signed-off-by: Julia Lawall Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_log.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 592091c1260b..14e2f3903142 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -996,8 +996,10 @@ static int __init nfnetlink_log_init(void) #ifdef CONFIG_PROC_FS if (!proc_create("nfnetlink_log", 0440, - proc_net_netfilter, &nful_file_ops)) + proc_net_netfilter, &nful_file_ops)) { + status = -ENOMEM; goto cleanup_logger; + } #endif return status; -- cgit v1.2.3-70-g09d2 From 3f509c689a07a4aa989b426893d8491a7ffcc410 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 29 Aug 2012 15:24:09 +0000 Subject: netfilter: nf_nat_sip: fix incorrect handling of EBUSY for RTCP expectation We're hitting bug while trying to reinsert an already existing expectation: kernel BUG at kernel/timer.c:895! invalid opcode: 0000 [#1] SMP [...] Call Trace: [] nf_ct_expect_related_report+0x4a0/0x57a [nf_conntrack] [] ? in4_pton+0x72/0x131 [] ip_nat_sdp_media+0xeb/0x185 [nf_nat_sip] [] set_expected_rtp_rtcp+0x32d/0x39b [nf_conntrack_sip] [] process_sdp+0x30c/0x3ec [nf_conntrack_sip] [] ? irq_exit+0x9a/0x9c [] ? ip_nat_sdp_media+0x185/0x185 [nf_nat_sip] We have to remove the RTP expectation if the RTCP expectation hits EBUSY since we keep trying with other ports until we succeed. Reported-by: Rafal Fitt Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nf_nat_sip.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index 4ad9cf173992..9c87cde28ff8 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -502,7 +502,10 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff, ret = nf_ct_expect_related(rtcp_exp); if (ret == 0) break; - else if (ret != -EBUSY) { + else if (ret == -EBUSY) { + nf_ct_unexpect_related(rtp_exp); + continue; + } else if (ret < 0) { nf_ct_unexpect_related(rtp_exp); port = 0; break; -- cgit v1.2.3-70-g09d2 From 99469c32f79a32d8481f87be0d3c66dad286f4ec Mon Sep 17 00:00:00 2001 From: "xeb@mail.ru" Date: Fri, 24 Aug 2012 01:07:38 +0000 Subject: l2tp: avoid to use synchronize_rcu in tunnel free function Avoid to use synchronize_rcu in l2tp_tunnel_free because context may be atomic. Signed-off-by: Dmitry Kozlov Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 3 +-- net/l2tp/l2tp_core.h | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 393355d37b47..513cab08a986 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1347,11 +1347,10 @@ static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) /* Remove from tunnel list */ spin_lock_bh(&pn->l2tp_tunnel_list_lock); list_del_rcu(&tunnel->list); + kfree_rcu(tunnel, rcu); spin_unlock_bh(&pn->l2tp_tunnel_list_lock); - synchronize_rcu(); atomic_dec(&l2tp_tunnel_count); - kfree(tunnel); } /* Create a socket for the tunnel, if one isn't set up by diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index a38ec6cdeee1..56d583e083a7 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -163,6 +163,7 @@ struct l2tp_tunnel_cfg { struct l2tp_tunnel { int magic; /* Should be L2TP_TUNNEL_MAGIC */ + struct rcu_head rcu; rwlock_t hlist_lock; /* protect session_hlist */ struct hlist_head session_hlist[L2TP_HASH_SIZE]; /* hashed list of sessions, -- cgit v1.2.3-70-g09d2 From acbb219d5f53821b2d0080d047800410c0420ea1 Mon Sep 17 00:00:00 2001 From: Francesco Ruggeri Date: Fri, 24 Aug 2012 07:38:35 +0000 Subject: net: ipv4: ipmr_expire_timer causes crash when removing net namespace When tearing down a net namespace, ipv4 mr_table structures are freed without first deactivating their timers. This can result in a crash in run_timer_softirq. This patch mimics the corresponding behaviour in ipv6. Locking and synchronization seem to be adequate. We are about to kfree mrt, so existing code should already make sure that no other references to mrt are pending or can be created by incoming traffic. The functions invoked here do not cause new references to mrt or other race conditions to be created. Invoking del_timer_sync guarantees that ipmr_expire_timer is inactive. Both ipmr_expire_process (whose completion we may have to wait in del_timer_sync) and mroute_clean_tables internally use mfc_unres_lock or other synchronizations when needed, and they both only modify mrt. Tested in Linux 3.4.8. Signed-off-by: Francesco Ruggeri Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 8eec8f4a0536..ebdf06f938bf 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -124,6 +124,8 @@ static DEFINE_SPINLOCK(mfc_unres_lock); static struct kmem_cache *mrt_cachep __read_mostly; static struct mr_table *ipmr_new_table(struct net *net, u32 id); +static void ipmr_free_table(struct mr_table *mrt); + static int ip_mr_forward(struct net *net, struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *cache, int local); @@ -131,6 +133,7 @@ static int ipmr_cache_report(struct mr_table *mrt, struct sk_buff *pkt, vifi_t vifi, int assert); static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm); +static void mroute_clean_tables(struct mr_table *mrt); static void ipmr_expire_process(unsigned long arg); #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES @@ -271,7 +274,7 @@ static void __net_exit ipmr_rules_exit(struct net *net) list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { list_del(&mrt->list); - kfree(mrt); + ipmr_free_table(mrt); } fib_rules_unregister(net->ipv4.mr_rules_ops); } @@ -299,7 +302,7 @@ static int __net_init ipmr_rules_init(struct net *net) static void __net_exit ipmr_rules_exit(struct net *net) { - kfree(net->ipv4.mrt); + ipmr_free_table(net->ipv4.mrt); } #endif @@ -336,6 +339,13 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) return mrt; } +static void ipmr_free_table(struct mr_table *mrt) +{ + del_timer_sync(&mrt->ipmr_expire_timer); + mroute_clean_tables(mrt); + kfree(mrt); +} + /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) -- cgit v1.2.3-70-g09d2 From c5ae7d41927dbd208c885d4794e30617ad6cdf12 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Aug 2012 12:33:07 +0000 Subject: ipv4: must use rcu protection while calling fib_lookup Following lockdep splat was reported by Pavel Roskin : [ 1570.586223] =============================== [ 1570.586225] [ INFO: suspicious RCU usage. ] [ 1570.586228] 3.6.0-rc3-wl-main #98 Not tainted [ 1570.586229] ------------------------------- [ 1570.586231] /home/proski/src/linux/net/ipv4/route.c:645 suspicious rcu_dereference_check() usage! [ 1570.586233] [ 1570.586233] other info that might help us debug this: [ 1570.586233] [ 1570.586236] [ 1570.586236] rcu_scheduler_active = 1, debug_locks = 0 [ 1570.586238] 2 locks held by Chrome_IOThread/4467: [ 1570.586240] #0: (slock-AF_INET){+.-...}, at: [] release_sock+0x2c/0xa0 [ 1570.586253] #1: (fnhe_lock){+.-...}, at: [] update_or_create_fnhe+0x2c/0x270 [ 1570.586260] [ 1570.586260] stack backtrace: [ 1570.586263] Pid: 4467, comm: Chrome_IOThread Not tainted 3.6.0-rc3-wl-main #98 [ 1570.586265] Call Trace: [ 1570.586271] [] lockdep_rcu_suspicious+0xfd/0x130 [ 1570.586275] [] update_or_create_fnhe+0x15c/0x270 [ 1570.586278] [] __ip_rt_update_pmtu+0x73/0xb0 [ 1570.586282] [] ip_rt_update_pmtu+0x29/0x90 [ 1570.586285] [] inet_csk_update_pmtu+0x2c/0x80 [ 1570.586290] [] tcp_v4_mtu_reduced+0x2e/0xc0 [ 1570.586293] [] tcp_release_cb+0xa4/0xb0 [ 1570.586296] [] release_sock+0x55/0xa0 [ 1570.586300] [] tcp_sendmsg+0x4af/0xf50 [ 1570.586305] [] inet_sendmsg+0x120/0x230 [ 1570.586308] [] ? inet_sk_rebuild_header+0x40/0x40 [ 1570.586312] [] ? sock_update_classid+0xbd/0x3b0 [ 1570.586315] [] ? sock_update_classid+0x130/0x3b0 [ 1570.586320] [] do_sock_write+0xc5/0xe0 [ 1570.586323] [] sock_aio_write+0x53/0x80 [ 1570.586328] [] do_sync_write+0xa3/0xe0 [ 1570.586332] [] vfs_write+0x165/0x180 [ 1570.586335] [] sys_write+0x45/0x90 [ 1570.586340] [] system_call_fastpath+0x16/0x1b Signed-off-by: Eric Dumazet Reported-by: Pavel Roskin Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 24fd4c596643..82cf2a722b23 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -934,12 +934,14 @@ static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) if (mtu < ip_rt_min_pmtu) mtu = ip_rt_min_pmtu; + rcu_read_lock(); if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) { struct fib_nh *nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, 0, mtu, jiffies + ip_rt_mtu_expires); } + rcu_read_unlock(); return mtu; } -- cgit v1.2.3-70-g09d2 From 5b423f6a40a0327f9d40bc8b97ce9be266f74368 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 29 Aug 2012 16:25:49 +0000 Subject: netfilter: nf_conntrack: fix racy timer handling with reliable events Existing code assumes that del_timer returns true for alive conntrack entries. However, this is not true if reliable events are enabled. In that case, del_timer may return true for entries that were just inserted in the dying list. Note that packets / ctnetlink may hold references to conntrack entries that were just inserted to such list. This patch fixes the issue by adding an independent timer for event delivery. This increases the size of the ecache extension. Still we can revisit this later and use variable size extensions to allocate this area on demand. Tested-by: Oliver Smith Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_ecache.h | 1 + net/netfilter/nf_conntrack_core.c | 16 +++++++++++----- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index e1ce1048fe5f..4a045cda9c60 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -18,6 +18,7 @@ struct nf_conntrack_ecache { u16 ctmask; /* bitmask of ct events to be delivered */ u16 expmask; /* bitmask of expect events to be delivered */ u32 pid; /* netlink pid of destroyer */ + struct timer_list timeout; }; static inline struct nf_conntrack_ecache * diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index cf4875565d67..2ceec64b19f9 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -249,12 +249,15 @@ static void death_by_event(unsigned long ul_conntrack) { struct nf_conn *ct = (void *)ul_conntrack; struct net *net = nf_ct_net(ct); + struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); + + BUG_ON(ecache == NULL); if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) { /* bad luck, let's retry again */ - ct->timeout.expires = jiffies + + ecache->timeout.expires = jiffies + (random32() % net->ct.sysctl_events_retry_timeout); - add_timer(&ct->timeout); + add_timer(&ecache->timeout); return; } /* we've got the event delivered, now it's dying */ @@ -268,6 +271,9 @@ static void death_by_event(unsigned long ul_conntrack) void nf_ct_insert_dying_list(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); + struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct); + + BUG_ON(ecache == NULL); /* add this conntrack to the dying list */ spin_lock_bh(&nf_conntrack_lock); @@ -275,10 +281,10 @@ void nf_ct_insert_dying_list(struct nf_conn *ct) &net->ct.dying); spin_unlock_bh(&nf_conntrack_lock); /* set a new timer to retry event delivery */ - setup_timer(&ct->timeout, death_by_event, (unsigned long)ct); - ct->timeout.expires = jiffies + + setup_timer(&ecache->timeout, death_by_event, (unsigned long)ct); + ecache->timeout.expires = jiffies + (random32() % net->ct.sysctl_events_retry_timeout); - add_timer(&ct->timeout); + add_timer(&ecache->timeout); } EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list); -- cgit v1.2.3-70-g09d2 From 48f125ce1cc3ff275f9587b5bf56bf0f90766c7d Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 29 Aug 2012 06:49:12 +0000 Subject: net: ipv6: fix error return code Initialize return variable before exiting on an error path. The initial initialization of the return variable is also dropped, because that value is never used. A simplified version of the semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // ( if@p1 (\(ret < 0\|ret != 0\)) { ... return ret; } | ret@p1 = 0 ) ... when != ret = e1 when != &ret *if(...) { ... when != ret = e2 when forall return ret; } // Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- net/ipv6/esp6.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 6dc7fd353ef5..282f3723ee19 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -167,8 +167,6 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) struct esp_data *esp = x->data; /* skb is pure payload to encrypt */ - err = -ENOMEM; - aead = esp->aead; alen = crypto_aead_authsize(aead); @@ -203,8 +201,10 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) } tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); - if (!tmp) + if (!tmp) { + err = -ENOMEM; goto error; + } seqhi = esp_tmp_seqhi(tmp); iv = esp_tmp_iv(aead, tmp, seqhilen); -- cgit v1.2.3-70-g09d2 From 599901c3e4204e9d9c5a24df5402cd91617a2a26 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 29 Aug 2012 06:49:15 +0000 Subject: net/xfrm/xfrm_state.c: fix error return code Initialize return variable before exiting on an error path. A simplified version of the semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // ( if@p1 (\(ret < 0\|ret != 0\)) { ... return ret; } | ret@p1 = 0 ) ... when != ret = e1 when != &ret *if(...) { ... when != ret = e2 when forall return ret; } // Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- net/xfrm/xfrm_state.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 87cd0e4d4282..210be48d8ae3 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1994,8 +1994,10 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay) goto error; x->outer_mode = xfrm_get_mode(x->props.mode, family); - if (x->outer_mode == NULL) + if (x->outer_mode == NULL) { + err = -EPROTONOSUPPORT; goto error; + } if (init_replay) { err = xfrm_init_replay(x); -- cgit v1.2.3-70-g09d2 From 39855b5ba9a72a80de96009011b7f8b2fb01612b Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Fri, 31 Aug 2012 15:28:28 -0700 Subject: openvswitch: Fix typo Signed-off-by: Joe Stringer Signed-off-by: Jesse Gross --- net/openvswitch/actions.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index f3f96badf5aa..954405ceae9e 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -45,7 +45,7 @@ static int make_writable(struct sk_buff *skb, int write_len) return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); } -/* remove VLAN header from packet and update csum accrodingly. */ +/* remove VLAN header from packet and update csum accordingly. */ static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) { struct vlan_hdr *vhdr; -- cgit v1.2.3-70-g09d2 From e812347ccf9e8ce073b0ba0c49d03b124707b2b4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 2 Sep 2012 23:57:18 +0000 Subject: net: sock_edemux() should take care of timewait sockets sock_edemux() can handle either a regular socket or a timewait socket Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 8f67ced8d6a8..7f64467535d1 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1523,7 +1523,12 @@ EXPORT_SYMBOL(sock_rfree); void sock_edemux(struct sk_buff *skb) { - sock_put(skb->sk); + struct sock *sk = skb->sk; + + if (sk->sk_state == TCP_TIME_WAIT) + inet_twsk_put(inet_twsk(sk)); + else + sock_put(sk); } EXPORT_SYMBOL(sock_edemux); -- cgit v1.2.3-70-g09d2 From 4c3a5bdae293f75cdf729c6c00124e8489af2276 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Mon, 3 Sep 2012 04:27:42 +0000 Subject: sctp: Don't charge for data in sndbuf again when transmitting packet SCTP charges wmem_alloc via sctp_set_owner_w() in sctp_sendmsg() and via skb_set_owner_w() in sctp_packet_transmit(). If a sender runs out of sndbuf it will sleep in sctp_wait_for_sndbuf() and expects to be waken up by __sctp_write_space(). Buffer space charged via sctp_set_owner_w() is released in sctp_wfree() which calls __sctp_write_space() directly. Buffer space charged via skb_set_owner_w() is released via sock_wfree() which calls sk->sk_write_space() _if_ SOCK_USE_WRITE_QUEUE is not set. sctp_endpoint_init() sets SOCK_USE_WRITE_QUEUE on all sockets. Therefore if sctp_packet_transmit() manages to queue up more than sndbuf bytes, sctp_wait_for_sndbuf() will never be woken up again unless it is interrupted by a signal. This could be fixed by clearing the SOCK_USE_WRITE_QUEUE flag but ... Charging for the data twice does not make sense in the first place, it leads to overcharging sndbuf by a factor 2. Therefore this patch only charges a single byte in wmem_alloc when transmitting an SCTP packet to ensure that the socket stays alive until the packet has been released. This means that control chunks are no longer accounted for in wmem_alloc which I believe is not a problem as skb->truesize will typically lead to overcharging anyway and thus compensates for any control overhead. Signed-off-by: Thomas Graf CC: Vlad Yasevich CC: Neil Horman CC: David Miller Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/output.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/output.c b/net/sctp/output.c index 838e18b4d7ea..be50aa234dcd 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -364,6 +364,25 @@ finish: return retval; } +static void sctp_packet_release_owner(struct sk_buff *skb) +{ + sk_free(skb->sk); +} + +static void sctp_packet_set_owner_w(struct sk_buff *skb, struct sock *sk) +{ + skb_orphan(skb); + skb->sk = sk; + skb->destructor = sctp_packet_release_owner; + + /* + * The data chunks have already been accounted for in sctp_sendmsg(), + * therefore only reserve a single byte to keep socket around until + * the packet has been transmitted. + */ + atomic_inc(&sk->sk_wmem_alloc); +} + /* All packets are sent to the network through this function from * sctp_outq_tail(). * @@ -405,7 +424,7 @@ int sctp_packet_transmit(struct sctp_packet *packet) /* Set the owning socket so that we know where to get the * destination IP address. */ - skb_set_owner_w(nskb, sk); + sctp_packet_set_owner_w(nskb, sk); if (!sctp_transport_dst_check(tp)) { sctp_transport_route(tp, NULL, sctp_sk(sk)); -- cgit v1.2.3-70-g09d2 From b379135c40163ae79ba7a54e6928b53983e74ee8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 1 Sep 2012 03:19:57 +0000 Subject: fq_codel: dont reinit flow state When fq_codel builds a new flow, it should not reset codel state. Codel algo needs to get previous values (lastcount, drop_next) to get proper behavior. Signed-off-by: Dave Taht Signed-off-by: Eric Dumazet Acked-by: Dave Taht Signed-off-by: David S. Miller --- net/sched/sch_fq_codel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 9fc1c62ec80e..4e606fcb2534 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -191,7 +191,6 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (list_empty(&flow->flowchain)) { list_add_tail(&flow->flowchain, &q->new_flows); - codel_vars_init(&flow->cvars); q->new_flow_count++; flow->deficit = q->quantum; flow->dropped = 0; @@ -418,6 +417,7 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) struct fq_codel_flow *flow = q->flows + i; INIT_LIST_HEAD(&flow->flowchain); + codel_vars_init(&flow->cvars); } } if (sch->limit >= 1) -- cgit v1.2.3-70-g09d2 From c303aa94cdae483a7577230e61720e126e600a52 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Mon, 3 Sep 2012 19:06:27 -0700 Subject: openvswitch: Fix FLOW_BUFSIZE definition. The vlan encapsulation fields in the maximum flow defintion were never updated when the representation changed before upstreaming. In theory this could cause a kernel panic when a maximum length flow is used. In practice this has never happened (to my knowledge) because skb allocations are padded out to a cache line so you would need the right combination of flow and packet being sent to userspace. Signed-off-by: Jesse Gross --- net/openvswitch/flow.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 9b75617ca4e0..c30df1a10c67 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -145,15 +145,17 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies); * OVS_KEY_ATTR_PRIORITY 4 -- 4 8 * OVS_KEY_ATTR_IN_PORT 4 -- 4 8 * OVS_KEY_ATTR_ETHERNET 12 -- 4 16 + * OVS_KEY_ATTR_ETHERTYPE 2 2 4 8 (outer VLAN ethertype) * OVS_KEY_ATTR_8021Q 4 -- 4 8 - * OVS_KEY_ATTR_ETHERTYPE 2 2 4 8 + * OVS_KEY_ATTR_ENCAP 0 -- 4 4 (VLAN encapsulation) + * OVS_KEY_ATTR_ETHERTYPE 2 2 4 8 (inner VLAN ethertype) * OVS_KEY_ATTR_IPV6 40 -- 4 44 * OVS_KEY_ATTR_ICMPV6 2 2 4 8 * OVS_KEY_ATTR_ND 28 -- 4 32 * ------------------------------------------------- - * total 132 + * total 144 */ -#define FLOW_BUFSIZE 132 +#define FLOW_BUFSIZE 144 int ovs_flow_to_nlattrs(const struct sw_flow_key *, struct sk_buff *); int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, -- cgit v1.2.3-70-g09d2 From 3d2abdfdf14f4d6decc2023708211e19b096f4ca Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Tue, 4 Sep 2012 17:44:45 +0300 Subject: mac80211: clear bssid on auth/assoc failure ifmgd->bssid wasn't cleared properly in some auth/assoc failure cases, causing mac80211 and the low-level driver to go out of sync. Clear ifmgd->bssid on failure, and notify the driver. Cc: stable@kernel.org # 3.4+ Signed-off-by: Eliad Peller Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index a4a5acdbaa4d..f76b83341cf9 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3248,6 +3248,8 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, goto out_unlock; err_clear: + memset(ifmgd->bssid, 0, ETH_ALEN); + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID); ifmgd->auth_data = NULL; err_free: kfree(auth_data); @@ -3439,6 +3441,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, err = 0; goto out; err_clear: + memset(ifmgd->bssid, 0, ETH_ALEN); + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID); ifmgd->assoc_data = NULL; err_free: kfree(assoc_data); -- cgit v1.2.3-70-g09d2 From b4e4f47e940bc93c5b1125a4429ff53956754800 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sun, 2 Sep 2012 21:41:04 +0800 Subject: nl80211: fix possible memory leak nl80211_connect() connkeys is malloced in nl80211_parse_connkeys() and should be freed in the error handling case, otherwise it will cause memory leak. spatch with a semantic match is used to found this problem. (http://coccinelle.lip6.fr/) Signed-off-by: Wei Yongjun Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 97026f3b215a..1e37dbf00cb3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5633,8 +5633,10 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) sizeof(connect.ht_capa_mask)); if (info->attrs[NL80211_ATTR_HT_CAPABILITY]) { - if (!info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]) + if (!info->attrs[NL80211_ATTR_HT_CAPABILITY_MASK]) { + kfree(connkeys); return -EINVAL; + } memcpy(&connect.ht_capa, nla_data(info->attrs[NL80211_ATTR_HT_CAPABILITY]), sizeof(connect.ht_capa)); -- cgit v1.2.3-70-g09d2 From 6cf5c951175abcec4da470c50565cc0afe6cd11d Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Tue, 4 Sep 2012 04:13:18 +0000 Subject: netrom: copy_datagram_iovec can fail Check for an error from this and if so bail properly. Signed-off-by: Alan Cox Signed-off-by: David S. Miller --- net/netrom/af_netrom.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 06592d8b4a2b..1b9024ee963c 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1169,7 +1169,12 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + er = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (er < 0) { + skb_free_datagram(sk, skb); + release_sock(sk); + return er; + } if (sax != NULL) { sax->sax25_family = AF_NETROM; -- cgit v1.2.3-70-g09d2 From 37159ef2c1ae1e696b24b260b241209a19f92c60 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 4 Sep 2012 07:18:57 +0000 Subject: l2tp: fix a lockdep splat Fixes following lockdep splat : [ 1614.734896] ============================================= [ 1614.734898] [ INFO: possible recursive locking detected ] [ 1614.734901] 3.6.0-rc3+ #782 Not tainted [ 1614.734903] --------------------------------------------- [ 1614.734905] swapper/11/0 is trying to acquire lock: [ 1614.734907] (slock-AF_INET){+.-...}, at: [] l2tp_xmit_skb+0x172/0xa50 [l2tp_core] [ 1614.734920] [ 1614.734920] but task is already holding lock: [ 1614.734922] (slock-AF_INET){+.-...}, at: [] tcp_v4_err+0x163/0x6b0 [ 1614.734932] [ 1614.734932] other info that might help us debug this: [ 1614.734935] Possible unsafe locking scenario: [ 1614.734935] [ 1614.734937] CPU0 [ 1614.734938] ---- [ 1614.734940] lock(slock-AF_INET); [ 1614.734943] lock(slock-AF_INET); [ 1614.734946] [ 1614.734946] *** DEADLOCK *** [ 1614.734946] [ 1614.734949] May be due to missing lock nesting notation [ 1614.734949] [ 1614.734952] 7 locks held by swapper/11/0: [ 1614.734954] #0: (rcu_read_lock){.+.+..}, at: [] __netif_receive_skb+0x251/0xd00 [ 1614.734964] #1: (rcu_read_lock){.+.+..}, at: [] ip_local_deliver_finish+0x4c/0x4e0 [ 1614.734972] #2: (rcu_read_lock){.+.+..}, at: [] icmp_socket_deliver+0x46/0x230 [ 1614.734982] #3: (slock-AF_INET){+.-...}, at: [] tcp_v4_err+0x163/0x6b0 [ 1614.734989] #4: (rcu_read_lock){.+.+..}, at: [] ip_queue_xmit+0x0/0x680 [ 1614.734997] #5: (rcu_read_lock_bh){.+....}, at: [] ip_finish_output+0x135/0x890 [ 1614.735004] #6: (rcu_read_lock_bh){.+....}, at: [] dev_queue_xmit+0x0/0xe00 [ 1614.735012] [ 1614.735012] stack backtrace: [ 1614.735016] Pid: 0, comm: swapper/11 Not tainted 3.6.0-rc3+ #782 [ 1614.735018] Call Trace: [ 1614.735020] [] __lock_acquire+0x144c/0x1b10 [ 1614.735033] [] ? check_usage+0x9b/0x4d0 [ 1614.735037] [] ? mark_held_locks+0x82/0x130 [ 1614.735042] [] lock_acquire+0x90/0x200 [ 1614.735047] [] ? l2tp_xmit_skb+0x172/0xa50 [l2tp_core] [ 1614.735051] [] ? trace_hardirqs_on+0xd/0x10 [ 1614.735060] [] _raw_spin_lock+0x41/0x50 [ 1614.735065] [] ? l2tp_xmit_skb+0x172/0xa50 [l2tp_core] [ 1614.735069] [] l2tp_xmit_skb+0x172/0xa50 [l2tp_core] [ 1614.735075] [] l2tp_eth_dev_xmit+0x32/0x60 [l2tp_eth] [ 1614.735079] [] dev_hard_start_xmit+0x502/0xa70 [ 1614.735083] [] ? dev_hard_start_xmit+0x5e/0xa70 [ 1614.735087] [] ? dev_queue_xmit+0x141/0xe00 [ 1614.735093] [] sch_direct_xmit+0xfe/0x290 [ 1614.735098] [] dev_queue_xmit+0x1e5/0xe00 [ 1614.735102] [] ? dev_hard_start_xmit+0xa70/0xa70 [ 1614.735106] [] ? eth_header+0x3a/0xf0 [ 1614.735111] [] ? fib_get_table+0x2e/0x280 [ 1614.735117] [] arp_xmit+0x22/0x60 [ 1614.735121] [] arp_send+0x43/0x50 [ 1614.735125] [] arp_solicit+0x18f/0x450 [ 1614.735132] [] neigh_probe+0x4a/0x70 [ 1614.735137] [] __neigh_event_send+0xea/0x300 [ 1614.735141] [] neigh_resolve_output+0x163/0x260 [ 1614.735146] [] ip_finish_output+0x505/0x890 [ 1614.735150] [] ? ip_finish_output+0x135/0x890 [ 1614.735154] [] ip_output+0x59/0xf0 [ 1614.735158] [] ip_local_out+0x2d/0xa0 [ 1614.735162] [] ip_queue_xmit+0x1c3/0x680 [ 1614.735165] [] ? ip_local_out+0xa0/0xa0 [ 1614.735172] [] tcp_transmit_skb+0x402/0xa60 [ 1614.735177] [] tcp_retransmit_skb+0x1a1/0x620 [ 1614.735181] [] tcp_retransmit_timer+0x393/0x960 [ 1614.735185] [] ? tcp_v4_err+0x163/0x6b0 [ 1614.735189] [] tcp_v4_err+0x657/0x6b0 [ 1614.735194] [] ? icmp_socket_deliver+0x46/0x230 [ 1614.735199] [] icmp_socket_deliver+0xce/0x230 [ 1614.735203] [] ? icmp_socket_deliver+0x46/0x230 [ 1614.735208] [] icmp_unreach+0xe4/0x2c0 [ 1614.735213] [] icmp_rcv+0x350/0x4a0 [ 1614.735217] [] ip_local_deliver_finish+0x135/0x4e0 [ 1614.735221] [] ? ip_local_deliver_finish+0x4c/0x4e0 [ 1614.735225] [] ip_local_deliver+0x4a/0x90 [ 1614.735229] [] ip_rcv_finish+0x187/0x730 [ 1614.735233] [] ip_rcv+0x21d/0x300 [ 1614.735237] [] __netif_receive_skb+0x46b/0xd00 [ 1614.735241] [] ? __netif_receive_skb+0x251/0xd00 [ 1614.735245] [] process_backlog+0xb8/0x180 [ 1614.735249] [] net_rx_action+0x159/0x330 [ 1614.735257] [] __do_softirq+0xd0/0x3e0 [ 1614.735264] [] ? tick_program_event+0x24/0x30 [ 1614.735270] [] call_softirq+0x1c/0x30 [ 1614.735278] [] do_softirq+0x8d/0xc0 [ 1614.735282] [] irq_exit+0xae/0xe0 [ 1614.735287] [] smp_apic_timer_interrupt+0x6e/0x99 [ 1614.735291] [] apic_timer_interrupt+0x6c/0x80 [ 1614.735293] [] ? trace_hardirqs_off+0xd/0x10 [ 1614.735306] [] ? intel_idle+0xf5/0x150 [ 1614.735310] [] ? intel_idle+0xee/0x150 [ 1614.735317] [] cpuidle_enter+0x19/0x20 [ 1614.735321] [] cpuidle_idle_call+0xa8/0x630 [ 1614.735327] [] cpu_idle+0x8a/0xe0 [ 1614.735333] [] start_secondary+0x220/0x222 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/l2tp/l2tp_core.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 513cab08a986..1a9f3723c13c 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1501,6 +1501,8 @@ out: return err; } +static struct lock_class_key l2tp_socket_class; + int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp) { struct l2tp_tunnel *tunnel = NULL; @@ -1605,6 +1607,8 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 tunnel->old_sk_destruct = sk->sk_destruct; sk->sk_destruct = &l2tp_tunnel_destruct; tunnel->sock = sk; + lockdep_set_class_and_name(&sk->sk_lock.slock, &l2tp_socket_class, "l2tp_sock"); + sk->sk_allocation = GFP_ATOMIC; /* Add tunnel to our list */ -- cgit v1.2.3-70-g09d2 From 3b59df46a449ec9975146d71318c4777ad086744 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 4 Sep 2012 00:03:29 +0000 Subject: xfrm: Workaround incompatibility of ESN and async crypto ESN for esp is defined in RFC 4303. This RFC assumes that the sequence number counters are always up to date. However, this is not true if an async crypto algorithm is employed. If the sequence number counters are not up to date on sequence number check, we may incorrectly update the upper 32 bit of the sequence number. This leads to a DOS. We workaround this by comparing the upper sequence number, (used for authentication) with the upper sequence number computed after the async processing. We drop the packet if these numbers are different. To do this, we introduce a recheck function that does this check in the ESN case. Signed-off-by: Steffen Klassert Acked-by: Herbert Xu Signed-off-by: David S. Miller --- include/net/xfrm.h | 3 +++ net/xfrm/xfrm_input.c | 2 +- net/xfrm/xfrm_replay.c | 15 +++++++++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 976a81abe1a2..639dd1316d37 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -273,6 +273,9 @@ struct xfrm_replay { int (*check)(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq); + int (*recheck)(struct xfrm_state *x, + struct sk_buff *skb, + __be32 net_seq); void (*notify)(struct xfrm_state *x, int event); int (*overflow)(struct xfrm_state *x, struct sk_buff *skb); }; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 54a0dc2e2f8d..ab2bb42fe094 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -212,7 +212,7 @@ resume: /* only the first xfrm gets the encap type */ encap_type = 0; - if (async && x->repl->check(x, skb, seq)) { + if (async && x->repl->recheck(x, skb, seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); goto drop_unlock; } diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 2f6d11d04a2b..3efb07d3eb27 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -420,6 +420,18 @@ err: return -EINVAL; } +static int xfrm_replay_recheck_esn(struct xfrm_state *x, + struct sk_buff *skb, __be32 net_seq) +{ + if (unlikely(XFRM_SKB_CB(skb)->seq.input.hi != + htonl(xfrm_replay_seqhi(x, net_seq)))) { + x->stats.replay_window++; + return -EINVAL; + } + + return xfrm_replay_check_esn(x, skb, net_seq); +} + static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) { unsigned int bitnr, nr, i; @@ -479,6 +491,7 @@ static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) static struct xfrm_replay xfrm_replay_legacy = { .advance = xfrm_replay_advance, .check = xfrm_replay_check, + .recheck = xfrm_replay_check, .notify = xfrm_replay_notify, .overflow = xfrm_replay_overflow, }; @@ -486,6 +499,7 @@ static struct xfrm_replay xfrm_replay_legacy = { static struct xfrm_replay xfrm_replay_bmp = { .advance = xfrm_replay_advance_bmp, .check = xfrm_replay_check_bmp, + .recheck = xfrm_replay_check_bmp, .notify = xfrm_replay_notify_bmp, .overflow = xfrm_replay_overflow_bmp, }; @@ -493,6 +507,7 @@ static struct xfrm_replay xfrm_replay_bmp = { static struct xfrm_replay xfrm_replay_esn = { .advance = xfrm_replay_advance_esn, .check = xfrm_replay_check_esn, + .recheck = xfrm_replay_recheck_esn, .notify = xfrm_replay_notify_bmp, .overflow = xfrm_replay_overflow_esn, }; -- cgit v1.2.3-70-g09d2 From c0cc88a7627c333de50b07b7c60b1d49d9d2e6cc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 4 Sep 2012 15:54:55 -0400 Subject: l2tp: fix a typo in l2tp_eth_dev_recv() While investigating l2tp bug, I hit a bug in eth_type_trans(), because not enough bytes were pulled in skb head. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/l2tp/l2tp_eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index f9ee74deeac2..3bfb34aaee29 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -153,7 +153,7 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, skb->data, length); } - if (!pskb_may_pull(skb, sizeof(ETH_HLEN))) + if (!pskb_may_pull(skb, ETH_HLEN)) goto error; secpath_reset(skb); -- cgit v1.2.3-70-g09d2 From 7ce8c7a3433c6d6f4adfec0611d250782f0b4b0c Mon Sep 17 00:00:00 2001 From: LEO Airwarosu Yoichi Shinoda Date: Mon, 27 Aug 2012 22:28:16 +0900 Subject: mac80211: Various small fixes for cfg.c: mpath_set_pinfo() Various small fixes for net/mac80211/cfg.c:mpath_set_pinfo(): Initialize *pinfo before filling members in, handle MESH_PATH_RESOLVED correctly, and remove bogus assignment; result in correct display of FLAGS values and meaningful EXPTIME for expired paths in iw utility. Signed-off-by: Yoichi Shinoda Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index d41974aacf51..a58c0b649ba1 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1378,6 +1378,8 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop, else memset(next_hop, 0, ETH_ALEN); + memset(pinfo, 0, sizeof(*pinfo)); + pinfo->generation = mesh_paths_generation; pinfo->filled = MPATH_INFO_FRAME_QLEN | @@ -1396,7 +1398,6 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop, pinfo->discovery_timeout = jiffies_to_msecs(mpath->discovery_timeout); pinfo->discovery_retries = mpath->discovery_retries; - pinfo->flags = 0; if (mpath->flags & MESH_PATH_ACTIVE) pinfo->flags |= NL80211_MPATH_FLAG_ACTIVE; if (mpath->flags & MESH_PATH_RESOLVING) @@ -1405,10 +1406,8 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop, pinfo->flags |= NL80211_MPATH_FLAG_SN_VALID; if (mpath->flags & MESH_PATH_FIXED) pinfo->flags |= NL80211_MPATH_FLAG_FIXED; - if (mpath->flags & MESH_PATH_RESOLVING) - pinfo->flags |= NL80211_MPATH_FLAG_RESOLVING; - - pinfo->flags = mpath->flags; + if (mpath->flags & MESH_PATH_RESOLVED) + pinfo->flags |= NL80211_MPATH_FLAG_RESOLVED; } static int ieee80211_get_mpath(struct wiphy *wiphy, struct net_device *dev, -- cgit v1.2.3-70-g09d2 From d013ef2aba8fe765ca683598e404203215632373 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Wed, 5 Sep 2012 10:53:18 +0000 Subject: tcp: fix possible socket refcount problem for ipv6 commit 144d56e91044181ec0ef67aeca91e9a8b5718348 ("tcp: fix possible socket refcount problem") is missing the IPv6 part. As tcp_release_cb is shared by both protocols we should hold sock reference for the TCP_MTU_REDUCED_DEFERRED bit. Signed-off-by: Julian Anastasov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index a3e60cc04a8a..acd32e3f1b68 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -403,8 +403,9 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, tp->mtu_info = ntohl(info); if (!sock_owned_by_user(sk)) tcp_v6_mtu_reduced(sk); - else - set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags); + else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, + &tp->tsq_flags)) + sock_hold(sk); goto out; } -- cgit v1.2.3-70-g09d2 From ed6fe9d614fc1bca95eb8c0ccd0e92db00ef9d5d Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sat, 1 Sep 2012 12:34:07 -0400 Subject: Fix order of arguments to compat_put_time[spec|val] Commit 644595f89620 ("compat: Handle COMPAT_USE_64BIT_TIME in net/socket.c") introduced a bug where the helper functions to take either a 64-bit or compat time[spec|val] got the arguments in the wrong order, passing the kernel stack pointer off as a user pointer (and vice versa). Because of the user address range check, that in turn then causes an EFAULT due to the user pointer range checking failing for the kernel address. Incorrectly resuling in a failed system call for 32-bit processes with a 64-bit kernel. On odder architectures like HP-PA (with separate user/kernel address spaces), it can be used read kernel memory. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- net/socket.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index a5471f804d99..edc3c4af9085 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2604,7 +2604,7 @@ static int do_siocgstamp(struct net *net, struct socket *sock, err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv); set_fs(old_fs); if (!err) - err = compat_put_timeval(up, &ktv); + err = compat_put_timeval(&ktv, up); return err; } @@ -2620,7 +2620,7 @@ static int do_siocgstampns(struct net *net, struct socket *sock, err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts); set_fs(old_fs); if (!err) - err = compat_put_timespec(up, &kts); + err = compat_put_timespec(&kts, up); return err; } -- cgit v1.2.3-70-g09d2 From 0626af3139572610b56376580d11eb65d45d9dd7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 4 Sep 2012 07:49:03 +0000 Subject: netfilter: take care of timewait sockets Sami Farin reported crashes in xt_LOG because it assumes skb->sk is a full blown socket. Since (41063e9 ipv4: Early TCP socket demux), we can have skb->sk pointing to a timewait socket. Same fix is needed in nfnetlink_log. Diagnosed-by: Florian Westphal Reported-by: Sami Farin Signed-off-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_log.c | 14 ++++++++------ net/netfilter/xt_LOG.c | 33 +++++++++++++++++---------------- 2 files changed, 25 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 14e2f3903142..5cfb5bedb2b8 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -381,6 +381,7 @@ __build_packet_message(struct nfulnl_instance *inst, struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; sk_buff_data_t old_tail = inst->skb->tail; + struct sock *sk; nlh = nlmsg_put(inst->skb, 0, 0, NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET, @@ -499,18 +500,19 @@ __build_packet_message(struct nfulnl_instance *inst, } /* UID */ - if (skb->sk) { - read_lock_bh(&skb->sk->sk_callback_lock); - if (skb->sk->sk_socket && skb->sk->sk_socket->file) { - struct file *file = skb->sk->sk_socket->file; + sk = skb->sk; + if (sk && sk->sk_state != TCP_TIME_WAIT) { + read_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket && sk->sk_socket->file) { + struct file *file = sk->sk_socket->file; __be32 uid = htonl(file->f_cred->fsuid); __be32 gid = htonl(file->f_cred->fsgid); - read_unlock_bh(&skb->sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); if (nla_put_be32(inst->skb, NFULA_UID, uid) || nla_put_be32(inst->skb, NFULA_GID, gid)) goto nla_put_failure; } else - read_unlock_bh(&skb->sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } /* local sequence number */ diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c index ff5f75fddb15..2a4f9693e799 100644 --- a/net/netfilter/xt_LOG.c +++ b/net/netfilter/xt_LOG.c @@ -145,6 +145,19 @@ static int dump_tcp_header(struct sbuff *m, const struct sk_buff *skb, return 0; } +static void dump_sk_uid_gid(struct sbuff *m, struct sock *sk) +{ + if (!sk || sk->sk_state == TCP_TIME_WAIT) + return; + + read_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket && sk->sk_socket->file) + sb_add(m, "UID=%u GID=%u ", + sk->sk_socket->file->f_cred->fsuid, + sk->sk_socket->file->f_cred->fsgid); + read_unlock_bh(&sk->sk_callback_lock); +} + /* One level of recursion won't kill us */ static void dump_ipv4_packet(struct sbuff *m, const struct nf_loginfo *info, @@ -361,14 +374,8 @@ static void dump_ipv4_packet(struct sbuff *m, } /* Max length: 15 "UID=4294967295 " */ - if ((logflags & XT_LOG_UID) && !iphoff && skb->sk) { - read_lock_bh(&skb->sk->sk_callback_lock); - if (skb->sk->sk_socket && skb->sk->sk_socket->file) - sb_add(m, "UID=%u GID=%u ", - skb->sk->sk_socket->file->f_cred->fsuid, - skb->sk->sk_socket->file->f_cred->fsgid); - read_unlock_bh(&skb->sk->sk_callback_lock); - } + if ((logflags & XT_LOG_UID) && !iphoff) + dump_sk_uid_gid(m, skb->sk); /* Max length: 16 "MARK=0xFFFFFFFF " */ if (!iphoff && skb->mark) @@ -717,14 +724,8 @@ static void dump_ipv6_packet(struct sbuff *m, } /* Max length: 15 "UID=4294967295 " */ - if ((logflags & XT_LOG_UID) && recurse && skb->sk) { - read_lock_bh(&skb->sk->sk_callback_lock); - if (skb->sk->sk_socket && skb->sk->sk_socket->file) - sb_add(m, "UID=%u GID=%u ", - skb->sk->sk_socket->file->f_cred->fsuid, - skb->sk->sk_socket->file->f_cred->fsgid); - read_unlock_bh(&skb->sk->sk_callback_lock); - } + if ((logflags & XT_LOG_UID) && recurse) + dump_sk_uid_gid(m, skb->sk); /* Max length: 16 "MARK=0xFFFFFFFF " */ if (!recurse && skb->mark) -- cgit v1.2.3-70-g09d2 From f39c1bfb5a03e2d255451bff05be0d7255298fa4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 7 Sep 2012 11:08:50 -0400 Subject: SUNRPC: Fix a UDP transport regression Commit 43cedbf0e8dfb9c5610eb7985d5f21263e313802 (SUNRPC: Ensure that we grab the XPRT_LOCK before calling xprt_alloc_slot) is causing hangs in the case of NFS over UDP mounts. Since neither the UDP or the RDMA transport mechanism use dynamic slot allocation, we can skip grabbing the socket lock for those transports. Add a new rpc_xprt_op to allow switching between the TCP and UDP/RDMA case. Note that the NFSv4.1 back channel assigns the slot directly through rpc_run_bc_task, so we can ignore that case. Reported-by: Dick Streefland Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org [>= 3.1] --- include/linux/sunrpc/xprt.h | 3 +++ net/sunrpc/xprt.c | 34 ++++++++++++++++++++-------------- net/sunrpc/xprtrdma/transport.c | 1 + net/sunrpc/xprtsock.c | 3 +++ 4 files changed, 27 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index cff40aa7db62..bf8c49ff7530 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -114,6 +114,7 @@ struct rpc_xprt_ops { void (*set_buffer_size)(struct rpc_xprt *xprt, size_t sndsize, size_t rcvsize); int (*reserve_xprt)(struct rpc_xprt *xprt, struct rpc_task *task); void (*release_xprt)(struct rpc_xprt *xprt, struct rpc_task *task); + void (*alloc_slot)(struct rpc_xprt *xprt, struct rpc_task *task); void (*rpcbind)(struct rpc_task *task); void (*set_port)(struct rpc_xprt *xprt, unsigned short port); void (*connect)(struct rpc_task *task); @@ -281,6 +282,8 @@ void xprt_connect(struct rpc_task *task); void xprt_reserve(struct rpc_task *task); int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task); int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task); +void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task); +void xprt_lock_and_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task); int xprt_prepare_transmit(struct rpc_task *task); void xprt_transmit(struct rpc_task *task); void xprt_end_transmit(struct rpc_task *task); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index a5a402a7d21f..5d7f61d7559c 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -969,11 +969,11 @@ static bool xprt_dynamic_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) return false; } -static void xprt_alloc_slot(struct rpc_task *task) +void xprt_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) { - struct rpc_xprt *xprt = task->tk_xprt; struct rpc_rqst *req; + spin_lock(&xprt->reserve_lock); if (!list_empty(&xprt->free)) { req = list_entry(xprt->free.next, struct rpc_rqst, rq_list); list_del(&req->rq_list); @@ -994,12 +994,29 @@ static void xprt_alloc_slot(struct rpc_task *task) default: task->tk_status = -EAGAIN; } + spin_unlock(&xprt->reserve_lock); return; out_init_req: task->tk_status = 0; task->tk_rqstp = req; xprt_request_init(task, xprt); + spin_unlock(&xprt->reserve_lock); +} +EXPORT_SYMBOL_GPL(xprt_alloc_slot); + +void xprt_lock_and_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) +{ + /* Note: grabbing the xprt_lock_write() ensures that we throttle + * new slot allocation if the transport is congested (i.e. when + * reconnecting a stream transport or when out of socket write + * buffer space). + */ + if (xprt_lock_write(xprt, task)) { + xprt_alloc_slot(xprt, task); + xprt_release_write(xprt, task); + } } +EXPORT_SYMBOL_GPL(xprt_lock_and_alloc_slot); static void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) { @@ -1083,20 +1100,9 @@ void xprt_reserve(struct rpc_task *task) if (task->tk_rqstp != NULL) return; - /* Note: grabbing the xprt_lock_write() here is not strictly needed, - * but ensures that we throttle new slot allocation if the transport - * is congested (e.g. if reconnecting or if we're out of socket - * write buffer space). - */ task->tk_timeout = 0; task->tk_status = -EAGAIN; - if (!xprt_lock_write(xprt, task)) - return; - - spin_lock(&xprt->reserve_lock); - xprt_alloc_slot(task); - spin_unlock(&xprt->reserve_lock); - xprt_release_write(xprt, task); + xprt->ops->alloc_slot(xprt, task); } static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 06cdbff79e4a..5d9202dc7cb1 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -713,6 +713,7 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) static struct rpc_xprt_ops xprt_rdma_procs = { .reserve_xprt = xprt_rdma_reserve_xprt, .release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */ + .alloc_slot = xprt_alloc_slot, .release_request = xprt_release_rqst_cong, /* ditto */ .set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */ .rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 400567243f84..a35b8e52e551 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2473,6 +2473,7 @@ static void bc_destroy(struct rpc_xprt *xprt) static struct rpc_xprt_ops xs_local_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xs_tcp_release_xprt, + .alloc_slot = xprt_alloc_slot, .rpcbind = xs_local_rpcbind, .set_port = xs_local_set_port, .connect = xs_connect, @@ -2489,6 +2490,7 @@ static struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_udp_set_buffer_size, .reserve_xprt = xprt_reserve_xprt_cong, .release_xprt = xprt_release_xprt_cong, + .alloc_slot = xprt_alloc_slot, .rpcbind = rpcb_getport_async, .set_port = xs_set_port, .connect = xs_connect, @@ -2506,6 +2508,7 @@ static struct rpc_xprt_ops xs_udp_ops = { static struct rpc_xprt_ops xs_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xs_tcp_release_xprt, + .alloc_slot = xprt_lock_and_alloc_slot, .rpcbind = rpcb_getport_async, .set_port = xs_set_port, .connect = xs_connect, -- cgit v1.2.3-70-g09d2 From 979402b16cde048ced4839e21cc49e0779352b80 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 Sep 2012 23:34:44 +0000 Subject: udp: increment UDP_MIB_INERRORS if copy failed In UDP recvmsg(), we miss an increase of UDP_MIB_INERRORS if the copy of skb to userspace failed for whatever reason. Reported-by: Shawn Bohrer Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/udp.c | 5 +++++ net/ipv6/udp.c | 11 +++++++++++ 2 files changed, 16 insertions(+) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6f6d1aca3c3d..2814f66dac64 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1226,6 +1226,11 @@ try_again: if (unlikely(err)) { trace_kfree_skb(skb, udp_recvmsg); + if (!peeked) { + atomic_inc(&sk->sk_drops); + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_INERRORS, is_udplite); + } goto out_free; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 99d0077b56b8..07e2bfef6845 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -394,6 +394,17 @@ try_again: } if (unlikely(err)) { trace_kfree_skb(skb, udpv6_recvmsg); + if (!peeked) { + atomic_inc(&sk->sk_drops); + if (is_udp4) + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_INERRORS, + is_udplite); + else + UDP6_INC_STATS_USER(sock_net(sk), + UDP_MIB_INERRORS, + is_udplite); + } goto out_free; } if (!peeked) { -- cgit v1.2.3-70-g09d2 From 6862234238e84648c305526af2edd98badcad1e0 Mon Sep 17 00:00:00 2001 From: Chema Gonzalez Date: Fri, 7 Sep 2012 13:40:50 +0000 Subject: net: small bug on rxhash calculation In the current rxhash calculation function, while the sorting of the ports/addrs is coherent (you get the same rxhash for packets sharing the same 4-tuple, in both directions), ports and addrs are sorted independently. This implies packets from a connection between the same addresses but crossed ports hash to the same rxhash. For example, traffic between A=S:l and B=L:s is hashed (in both directions) from {L, S, {s, l}}. The same rxhash is obtained for packets between C=S:s and D=L:l. This patch ensures that you either swap both addrs and ports, or you swap none. Traffic between A and B, and traffic between C and D, get their rxhash from different sources ({L, S, {l, s}} for A<->B, and {L, S, {s, l}} for C<->D) The patch is co-written with Eric Dumazet Signed-off-by: Chema Gonzalez Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 83988362805e..d7fe32c946c1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2647,15 +2647,16 @@ void __skb_get_rxhash(struct sk_buff *skb) if (!skb_flow_dissect(skb, &keys)) return; - if (keys.ports) { - if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]) - swap(keys.port16[0], keys.port16[1]); + if (keys.ports) skb->l4_rxhash = 1; - } /* get a consistent hash (same value on both flow directions) */ - if ((__force u32)keys.dst < (__force u32)keys.src) + if (((__force u32)keys.dst < (__force u32)keys.src) || + (((__force u32)keys.dst == (__force u32)keys.src) && + ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) { swap(keys.dst, keys.src); + swap(keys.port16[0], keys.port16[1]); + } hash = jhash_3words((__force u32)keys.dst, (__force u32)keys.src, -- cgit v1.2.3-70-g09d2 From 64f509ce71b08d037998e93dd51180c19b2f464c Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Fri, 31 Aug 2012 09:55:53 +0000 Subject: netfilter: Mark SYN/ACK packets as invalid from original direction Clients should not send such packets. By accepting them, we open up a hole by wich ephemeral ports can be discovered in an off-path attack. See: "Reflection scan: an Off-Path Attack on TCP" by Jan Wrobel, http://arxiv.org/abs/1201.2074 Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_tcp.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index a5ac11ebef33..aba98f942979 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -158,21 +158,18 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sCL -> sSS */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ -/*synack*/ { sIV, sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, +/*synack*/ { sIV, sIV, sSR, sIV, sIV, sIV, sIV, sIV, sIV, sSR }, /* * sNO -> sIV Too late and no reason to do anything * sSS -> sIV Client can't send SYN and then SYN/ACK * sS2 -> sSR SYN/ACK sent to SYN2 in simultaneous open - * sSR -> sIG - * sES -> sIG Error: SYNs in window outside the SYN_SENT state - * are errors. Receiver will reply with RST - * and close the connection. - * Or we are not in sync and hold a dead connection. - * sFW -> sIG - * sCW -> sIG - * sLA -> sIG - * sTW -> sIG - * sCL -> sIG + * sSR -> sSR Late retransmitted SYN/ACK in simultaneous open + * sES -> sIV Invalid SYN/ACK packets sent by the client + * sFW -> sIV + * sCW -> sIV + * sLA -> sIV + * sTW -> sIV + * sCL -> sIV */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, -- cgit v1.2.3-70-g09d2 From 4a70bbfaef0361d27272629d1a250a937edcafe4 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Fri, 31 Aug 2012 09:55:54 +0000 Subject: netfilter: Validate the sequence number of dataless ACK packets as well We spare nothing by not validating the sequence number of dataless ACK packets and enabling it makes harder off-path attacks. See: "Reflection scan: an Off-Path Attack on TCP" by Jan Wrobel, http://arxiv.org/abs/1201.2074 Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_tcp.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index aba98f942979..e046b3756aab 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -630,15 +630,9 @@ static bool tcp_in_window(const struct nf_conn *ct, ack = sack = receiver->td_end; } - if (seq == end - && (!tcph->rst - || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT))) + if (tcph->rst && seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT) /* - * Packets contains no data: we assume it is valid - * and check the ack value only. - * However RST segments are always validated by their - * SEQ number, except when seq == 0 (reset sent answering - * SYN. + * RST sent answering SYN. */ seq = end = sender->td_end; -- cgit v1.2.3-70-g09d2 From 566f26aa705609d05940289036ab914c8a3be707 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sun, 9 Sep 2012 18:38:27 +0000 Subject: caif: move the dereference below the NULL test The dereference should be moved below the NULL test. spatch with a semantic match is used to found this. (http://coccinelle.lip6.fr/) Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- net/caif/cfsrvl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/caif/cfsrvl.c b/net/caif/cfsrvl.c index dd485f6128e8..ba217e90765e 100644 --- a/net/caif/cfsrvl.c +++ b/net/caif/cfsrvl.c @@ -211,9 +211,10 @@ void caif_client_register_refcnt(struct cflayer *adapt_layer, void (*put)(struct cflayer *lyr)) { struct cfsrvl *service; - service = container_of(adapt_layer->dn, struct cfsrvl, layer); - WARN_ON(adapt_layer == NULL || adapt_layer->dn == NULL); + if (WARN_ON(adapt_layer == NULL || adapt_layer->dn == NULL)) + return; + service = container_of(adapt_layer->dn, struct cfsrvl, layer); service->hold = hold; service->put = put; } -- cgit v1.2.3-70-g09d2 From 1c463e57b32e3b6a3250fe75d1d56cb598d664e6 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 10 Sep 2012 09:13:07 -0700 Subject: net: fix net/core/sock.c build error Fix net/core/sock.c build error when CONFIG_INET is not enabled: net/built-in.o: In function `sock_edemux': (.text+0xd396): undefined reference to `inet_twsk_put' Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- net/core/sock.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 7f64467535d1..305792076121 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1525,9 +1525,11 @@ void sock_edemux(struct sk_buff *skb) { struct sock *sk = skb->sk; +#ifdef CONFIG_INET if (sk->sk_state == TCP_TIME_WAIT) inet_twsk_put(inet_twsk(sk)); else +#endif sock_put(sk); } EXPORT_SYMBOL(sock_edemux); -- cgit v1.2.3-70-g09d2 From bdfc87f7d1e253e0a61e2fc6a75ea9d76f7fc03a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 11 Sep 2012 13:11:12 +0000 Subject: net-sched: sch_cbq: avoid infinite loop Its possible to setup a bad cbq configuration leading to an infinite loop in cbq_classify() DEV_OUT=eth0 ICMP="match ip protocol 1 0xff" U32="protocol ip u32" DST="match ip dst" tc qdisc add dev $DEV_OUT root handle 1: cbq avpkt 1000 \ bandwidth 100mbit tc class add dev $DEV_OUT parent 1: classid 1:1 cbq \ rate 512kbit allot 1500 prio 5 bounded isolated tc filter add dev $DEV_OUT parent 1: prio 3 $U32 \ $ICMP $DST 192.168.3.234 flowid 1: Reported-by: Denys Fedoryschenko Tested-by: Denys Fedoryschenko Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_cbq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 6aabd77d1cfd..564b9fc8efd3 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -250,10 +250,11 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL) cl = defmap[TC_PRIO_BESTEFFORT]; - if (cl == NULL || cl->level >= head->level) + if (cl == NULL) goto fallback; } - + if (cl->level >= head->level) + goto fallback; #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: -- cgit v1.2.3-70-g09d2 From 16af511a666827eaf5802144f09e2fb7b0942c99 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 12 Sep 2012 02:04:53 +0000 Subject: netfilter: log: Fix log-level processing auto75914331@hushmail.com reports that iptables does not correctly output the KERN_. $IPTABLES -A RULE_0_in -j LOG --log-level notice --log-prefix "DENY in: " result with linux 3.6-rc5 Sep 12 06:37:29 xxxxx kernel: <5>DENY in: IN=eth0 OUT= MAC=....... result with linux 3.5.3 and older: Sep 9 10:43:01 xxxxx kernel: DENY in: IN=eth0 OUT= MAC...... commit 04d2c8c83d0 ("printk: convert the format for KERN_ to a 2 byte pattern") updated the syslog header style but did not update netfilter uses. Do so. Use KERN_SOH and string concatenation instead of "%c" KERN_SOH_ASCII as suggested by Eric Dumazet. Signed-off-by: Joe Perches cc: auto75914331@hushmail.com Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebt_log.c | 2 +- net/netfilter/xt_LOG.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index f88ee537fb2b..92de5e5f9db2 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -80,7 +80,7 @@ ebt_log_packet(u_int8_t pf, unsigned int hooknum, unsigned int bitmask; spin_lock_bh(&ebt_log_lock); - printk("<%c>%s IN=%s OUT=%s MAC source = %pM MAC dest = %pM proto = 0x%04x", + printk(KERN_SOH "%c%s IN=%s OUT=%s MAC source = %pM MAC dest = %pM proto = 0x%04x", '0' + loginfo->u.log.level, prefix, in ? in->name : "", out ? out->name : "", eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c index 2a4f9693e799..91e9af4d1f42 100644 --- a/net/netfilter/xt_LOG.c +++ b/net/netfilter/xt_LOG.c @@ -443,8 +443,8 @@ log_packet_common(struct sbuff *m, const struct nf_loginfo *loginfo, const char *prefix) { - sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, - prefix, + sb_add(m, KERN_SOH "%c%sIN=%s OUT=%s ", + '0' + loginfo->u.log.level, prefix, in ? in->name : "", out ? out->name : ""); #ifdef CONFIG_BRIDGE_NETFILTER -- cgit v1.2.3-70-g09d2 From e29fe837bfa3ca0a9f4ef1d4a90e6e0a74b6b8a0 Mon Sep 17 00:00:00 2001 From: David Ward Date: Thu, 13 Sep 2012 05:22:32 +0000 Subject: net_sched: gred: correct comment about qavg calculation in RIO mode Signed-off-by: David Ward Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/sch_gred.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index e901583e4ea5..fca73cdf44d9 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -176,7 +176,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) skb->tc_index = (skb->tc_index & ~GRED_VQ_MASK) | dp; } - /* sum up all the qaves of prios <= to ours to get the new qave */ + /* sum up all the qaves of prios < ours to get the new qave */ if (!gred_wred_mode(t) && gred_rio_mode(t)) { int i; -- cgit v1.2.3-70-g09d2 From c22e464022f935b0cbd8724b1d99d800d49518a9 Mon Sep 17 00:00:00 2001 From: David Ward Date: Thu, 13 Sep 2012 05:22:33 +0000 Subject: net_sched: gred: eliminate redundant DP prio comparisons Each pair of DPs only needs to be compared once when searching for a non-unique prio value. Signed-off-by: David Ward Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/sch_gred.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index fca73cdf44d9..e19d4ebfea1c 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -102,9 +102,8 @@ static inline int gred_wred_mode_check(struct Qdisc *sch) if (q == NULL) continue; - for (n = 0; n < table->DPs; n++) - if (table->tab[n] && table->tab[n] != q && - table->tab[n]->prio == q->prio) + for (n = i + 1; n < table->DPs; n++) + if (table->tab[n] && table->tab[n]->prio == q->prio) return 1; } -- cgit v1.2.3-70-g09d2 From 1fe37b106b039d9358fd1211c39b1fa199e547a8 Mon Sep 17 00:00:00 2001 From: David Ward Date: Thu, 13 Sep 2012 05:22:34 +0000 Subject: net_sched: gred: fix qave reporting via netlink q->vars.qavg is a Wlog scaled value, but q->backlog is not. In order to pass q->vars.qavg as the backlog value, we need to un-scale it. Additionally, the qave value returned via netlink should not be Wlog scaled, so we need to un-scale the result of red_calc_qavg(). This caused artificially high values for "Average Queue" to be shown by 'tc -s -d qdisc', but did not affect the actual operation of GRED. Signed-off-by: David Ward Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/sch_gred.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index e19d4ebfea1c..b2570b59d85e 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -534,6 +534,7 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) for (i = 0; i < MAX_DPs; i++) { struct gred_sched_data *q = table->tab[i]; struct tc_gred_qopt opt; + unsigned long qavg; memset(&opt, 0, sizeof(opt)); @@ -565,7 +566,9 @@ static int gred_dump(struct Qdisc *sch, struct sk_buff *skb) if (gred_wred_mode(table)) gred_load_wred_set(table, q); - opt.qave = red_calc_qavg(&q->parms, &q->vars, q->vars.qavg); + qavg = red_calc_qavg(&q->parms, &q->vars, + q->vars.qavg >> q->parms.Wlog); + opt.qave = qavg >> q->parms.Wlog; append_opt: if (nla_append(skb, sizeof(opt), &opt) < 0) -- cgit v1.2.3-70-g09d2 From ba1bf474eae07a128fae6252bf7d68eaef0eacf8 Mon Sep 17 00:00:00 2001 From: David Ward Date: Thu, 13 Sep 2012 05:22:35 +0000 Subject: net_sched: gred: actually perform idling in WRED mode gred_dequeue() and gred_drop() do not seem to get called when the queue is empty, meaning that we never start idling while in WRED mode. And since qidlestart is not stored by gred_store_wred_set(), we would never stop idling while in WRED mode if we ever started. This messes up the average queue size calculation that influences packet marking/dropping behavior. Now, we start WRED mode idling as we are removing the last packet from the queue. Also we now actually stop WRED mode idling when we are enqueuing a packet. Cc: Bruce Osler Signed-off-by: David Ward Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/sch_gred.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index b2570b59d85e..d42234c0f13b 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -136,6 +136,7 @@ static inline void gred_store_wred_set(struct gred_sched *table, struct gred_sched_data *q) { table->wred_set.qavg = q->vars.qavg; + table->wred_set.qidlestart = q->vars.qidlestart; } static inline int gred_use_ecn(struct gred_sched *t) @@ -259,16 +260,18 @@ static struct sk_buff *gred_dequeue(struct Qdisc *sch) } else { q->backlog -= qdisc_pkt_len(skb); - if (!q->backlog && !gred_wred_mode(t)) - red_start_of_idle_period(&q->vars); + if (gred_wred_mode(t)) { + if (!sch->qstats.backlog) + red_start_of_idle_period(&t->wred_set); + } else { + if (!q->backlog) + red_start_of_idle_period(&q->vars); + } } return skb; } - if (gred_wred_mode(t) && !red_is_idling(&t->wred_set)) - red_start_of_idle_period(&t->wred_set); - return NULL; } @@ -290,19 +293,20 @@ static unsigned int gred_drop(struct Qdisc *sch) q->backlog -= len; q->stats.other++; - if (!q->backlog && !gred_wred_mode(t)) - red_start_of_idle_period(&q->vars); + if (gred_wred_mode(t)) { + if (!sch->qstats.backlog) + red_start_of_idle_period(&t->wred_set); + } else { + if (!q->backlog) + red_start_of_idle_period(&q->vars); + } } qdisc_drop(skb, sch); return len; } - if (gred_wred_mode(t) && !red_is_idling(&t->wred_set)) - red_start_of_idle_period(&t->wred_set); - return 0; - } static void gred_reset(struct Qdisc *sch) -- cgit v1.2.3-70-g09d2 From 6af773e786ad617b0264ebe06ba60675c01f3e51 Mon Sep 17 00:00:00 2001 From: Nishank Trivedi Date: Wed, 12 Sep 2012 13:32:49 +0000 Subject: pktgen: fix crash with vlan and packet size less than 46 If vlan option is being specified in the pktgen and packet size being requested is less than 46 bytes, despite being illogical request, pktgen should not crash the kernel. BUG: unable to handle kernel paging request at ffff88021fb82000 Process kpktgend_0 (pid: 1184, threadinfo ffff880215f1a000, task ffff880218544530) Call Trace: [] ? pktgen_finalize_skb+0x222/0x300 [pktgen] [] ? build_skb+0x34/0x1c0 [] pktgen_thread_worker+0x5d1/0x1790 [pktgen] [] ? igb_xmit_frame_ring+0xa30/0xa30 [igb] [] ? wake_up_bit+0x40/0x40 [] ? wake_up_bit+0x40/0x40 [] ? spin+0x240/0x240 [pktgen] [] kthread+0x93/0xa0 [] kernel_thread_helper+0x4/0x10 [] ? flush_kthread_worker+0x80/0x80 [] ? gs_change+0x13/0x13 The root cause of why pktgen is not able to handle this case is due to comparison of signed (datalen) and unsigned data (sizeof), which eventually passes a huge number to skb_put(). Signed-off-by: Nishank Trivedi Signed-off-by: David S. Miller --- net/core/pktgen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index cce9e53528b1..148e73d2c451 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2721,7 +2721,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, /* Eth + IPh + UDPh + mpls */ datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 - pkt_dev->pkt_overhead; - if (datalen < sizeof(struct pktgen_hdr)) + if (datalen < 0 || datalen < sizeof(struct pktgen_hdr)) datalen = sizeof(struct pktgen_hdr); udph->source = htons(pkt_dev->cur_udp_src); -- cgit v1.2.3-70-g09d2 From bafa6d9d89072c1a18853afe9ee5de05c491c13a Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 7 Sep 2012 00:45:29 +0000 Subject: ipv4/route: arg delay is useless in rt_cache_flush() Since route cache deletion (89aef8921bfbac22f), delay is no more used. Remove it. Signed-off-by: Nicolas Dichtel Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/route.h | 2 +- net/ipv4/arp.c | 2 +- net/ipv4/devinet.c | 6 +++--- net/ipv4/fib_frontend.c | 20 ++++++++++---------- net/ipv4/fib_rules.c | 2 +- net/ipv4/fib_trie.c | 6 +++--- net/ipv4/route.c | 19 +++---------------- 7 files changed, 22 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/include/net/route.h b/include/net/route.h index 776a27f1ab78..da22243d2760 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -108,7 +108,7 @@ extern struct ip_rt_acct __percpu *ip_rt_acct; struct in_device; extern int ip_rt_init(void); -extern void rt_cache_flush(struct net *net, int how); +extern void rt_cache_flush(struct net *net); extern void rt_flush_dev(struct net_device *dev); extern struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp); extern struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 77e87aff419a..47800459e4cb 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1225,7 +1225,7 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event, switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&arp_tbl, dev); - rt_cache_flush(dev_net(dev), 0); + rt_cache_flush(dev_net(dev)); break; default: break; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 44bf82e3aef7..9b55b6f5a585 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1503,7 +1503,7 @@ static int devinet_conf_proc(ctl_table *ctl, int write, if (i == IPV4_DEVCONF_ACCEPT_LOCAL - 1 || i == IPV4_DEVCONF_ROUTE_LOCALNET - 1) if ((new_value == 0) && (old_value != 0)) - rt_cache_flush(net, 0); + rt_cache_flush(net); } return ret; @@ -1537,7 +1537,7 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write, dev_disable_lro(idev->dev); } rtnl_unlock(); - rt_cache_flush(net, 0); + rt_cache_flush(net); } } @@ -1554,7 +1554,7 @@ static int ipv4_doint_and_flush(ctl_table *ctl, int write, struct net *net = ctl->extra2; if (write && *valp != val) - rt_cache_flush(net, 0); + rt_cache_flush(net); return ret; } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index c43ae3fba792..8e2b475da9fa 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -148,7 +148,7 @@ static void fib_flush(struct net *net) } if (flushed) - rt_cache_flush(net, -1); + rt_cache_flush(net); } /* @@ -999,11 +999,11 @@ static void nl_fib_lookup_exit(struct net *net) net->ipv4.fibnl = NULL; } -static void fib_disable_ip(struct net_device *dev, int force, int delay) +static void fib_disable_ip(struct net_device *dev, int force) { if (fib_sync_down_dev(dev, force)) fib_flush(dev_net(dev)); - rt_cache_flush(dev_net(dev), delay); + rt_cache_flush(dev_net(dev)); arp_ifdown(dev); } @@ -1020,7 +1020,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, fib_sync_up(dev); #endif atomic_inc(&net->ipv4.dev_addr_genid); - rt_cache_flush(dev_net(dev), -1); + rt_cache_flush(dev_net(dev)); break; case NETDEV_DOWN: fib_del_ifaddr(ifa, NULL); @@ -1029,9 +1029,9 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, /* Last address was deleted from this interface. * Disable IP. */ - fib_disable_ip(dev, 1, 0); + fib_disable_ip(dev, 1); } else { - rt_cache_flush(dev_net(dev), -1); + rt_cache_flush(dev_net(dev)); } break; } @@ -1045,7 +1045,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo struct net *net = dev_net(dev); if (event == NETDEV_UNREGISTER) { - fib_disable_ip(dev, 2, -1); + fib_disable_ip(dev, 2); rt_flush_dev(dev); return NOTIFY_DONE; } @@ -1062,14 +1062,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo fib_sync_up(dev); #endif atomic_inc(&net->ipv4.dev_addr_genid); - rt_cache_flush(dev_net(dev), -1); + rt_cache_flush(dev_net(dev)); break; case NETDEV_DOWN: - fib_disable_ip(dev, 0, 0); + fib_disable_ip(dev, 0); break; case NETDEV_CHANGEMTU: case NETDEV_CHANGE: - rt_cache_flush(dev_net(dev), 0); + rt_cache_flush(dev_net(dev)); break; case NETDEV_UNREGISTER_BATCH: break; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index a83d74e498d2..274309d3aded 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -259,7 +259,7 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule) static void fib4_rule_flush_cache(struct fib_rules_ops *ops) { - rt_cache_flush(ops->fro_net, -1); + rt_cache_flush(ops->fro_net); } static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = { diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 57bd978483e1..d1b93595b4a7 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1286,7 +1286,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) fib_release_info(fi_drop); if (state & FA_S_ACCESSED) - rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); + rt_cache_flush(cfg->fc_nlinfo.nl_net); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); @@ -1333,7 +1333,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) list_add_tail_rcu(&new_fa->fa_list, (fa ? &fa->fa_list : fa_head)); - rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); + rt_cache_flush(cfg->fc_nlinfo.nl_net); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, 0); succeeded: @@ -1708,7 +1708,7 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) trie_leaf_remove(t, l); if (fa->fa_state & FA_S_ACCESSED) - rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); + rt_cache_flush(cfg->fc_nlinfo.nl_net); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 82cf2a722b23..f6436d3b207a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -461,11 +461,7 @@ static void rt_cache_invalidate(struct net *net) atomic_add(shuffle + 1U, &net->ipv4.rt_genid); } -/* - * delay < 0 : invalidate cache (fast : entries will be deleted later) - * delay >= 0 : invalidate & flush cache (can be long) - */ -void rt_cache_flush(struct net *net, int delay) +void rt_cache_flush(struct net *net) { rt_cache_invalidate(net); } @@ -2345,7 +2341,7 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) void ip_rt_multicast_event(struct in_device *in_dev) { - rt_cache_flush(dev_net(in_dev->dev), 0); + rt_cache_flush(dev_net(in_dev->dev)); } #ifdef CONFIG_SYSCTL @@ -2354,16 +2350,7 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, size_t *lenp, loff_t *ppos) { if (write) { - int flush_delay; - ctl_table ctl; - struct net *net; - - memcpy(&ctl, __ctl, sizeof(ctl)); - ctl.data = &flush_delay; - proc_dointvec(&ctl, write, buffer, lenp, ppos); - - net = (struct net *)__ctl->extra1; - rt_cache_flush(net, flush_delay); + rt_cache_flush((struct net *)__ctl->extra1); return 0; } -- cgit v1.2.3-70-g09d2 From 2885da72966fcb89f48d554339d347fb02b5ea78 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 7 Sep 2012 22:27:11 +0200 Subject: net: rt_cache_flush() cleanup We dont use jhash anymore since route cache removal, so we can get rid of get_random_bytes() calls for rt_genid changes. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f6436d3b207a..be27cfa96e88 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -447,23 +447,9 @@ static inline bool rt_is_expired(const struct rtable *rth) return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); } -/* - * Perturbation of rt_genid by a small quantity [1..256] - * Using 8 bits of shuffling ensure we can call rt_cache_invalidate() - * many times (2^24) without giving recent rt_genid. - * Jenkins hash is strong enough that litle changes of rt_genid are OK. - */ -static void rt_cache_invalidate(struct net *net) -{ - unsigned char shuffle; - - get_random_bytes(&shuffle, sizeof(shuffle)); - atomic_add(shuffle + 1U, &net->ipv4.rt_genid); -} - void rt_cache_flush(struct net *net) { - rt_cache_invalidate(net); + atomic_inc(&net->ipv4.rt_genid); } static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, @@ -2520,8 +2506,7 @@ static __net_initdata struct pernet_operations sysctl_route_ops = { static __net_init int rt_genid_init(struct net *net) { - get_random_bytes(&net->ipv4.rt_genid, - sizeof(net->ipv4.rt_genid)); + atomic_set(&net->ipv4.rt_genid, 0); get_random_bytes(&net->ipv4.dev_addr_genid, sizeof(net->ipv4.dev_addr_genid)); return 0; -- cgit v1.2.3-70-g09d2 From b42664f898c976247f7f609b8bb9c94d7475ca10 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 10 Sep 2012 22:09:44 +0000 Subject: netns: move net->ipv4.rt_genid to net->rt_genid This commit prepares the use of rt_genid by both IPv4 and IPv6. Initialization is left in IPv4 part. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/net/net_namespace.h | 10 ++++++++++ include/net/netns/ipv4.h | 1 - net/ipv4/route.c | 9 ++------- 3 files changed, 12 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index ae1cd6c9ba52..fd87963a0ea5 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -102,6 +102,7 @@ struct net { #endif struct netns_ipvs *ipvs; struct sock *diag_nlsk; + atomic_t rt_genid; }; @@ -300,5 +301,14 @@ static inline void unregister_net_sysctl_table(struct ctl_table_header *header) } #endif +static inline int rt_genid(struct net *net) +{ + return atomic_read(&net->rt_genid); +} + +static inline void rt_genid_bump(struct net *net) +{ + atomic_inc(&net->rt_genid); +} #endif /* __NET_NET_NAMESPACE_H */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 1474dd65c66f..eb24dbccd81e 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -65,7 +65,6 @@ struct netns_ipv4 { unsigned int sysctl_ping_group_range[2]; long sysctl_tcp_mem[3]; - atomic_t rt_genid; atomic_t dev_addr_genid; #ifdef CONFIG_IP_MROUTE diff --git a/net/ipv4/route.c b/net/ipv4/route.c index be27cfa96e88..fd9af60397b5 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -202,11 +202,6 @@ EXPORT_SYMBOL(ip_tos2prio); static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) -static inline int rt_genid(struct net *net) -{ - return atomic_read(&net->ipv4.rt_genid); -} - #ifdef CONFIG_PROC_FS static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) { @@ -449,7 +444,7 @@ static inline bool rt_is_expired(const struct rtable *rth) void rt_cache_flush(struct net *net) { - atomic_inc(&net->ipv4.rt_genid); + rt_genid_bump(net); } static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, @@ -2506,7 +2501,7 @@ static __net_initdata struct pernet_operations sysctl_route_ops = { static __net_init int rt_genid_init(struct net *net) { - atomic_set(&net->ipv4.rt_genid, 0); + atomic_set(&net->rt_genid, 0); get_random_bytes(&net->ipv4.dev_addr_genid, sizeof(net->ipv4.dev_addr_genid)); return 0; -- cgit v1.2.3-70-g09d2 From ee8372dd1989287c5eedb69d44bac43f69e496f1 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 10 Sep 2012 22:09:45 +0000 Subject: xfrm: invalidate dst on policy insertion/deletion When a policy is inserted or deleted, all dst should be recalculated. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/xfrm/xfrm_policy.c | 1 + security/selinux/include/xfrm.h | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 5a2aa17e4d3c..ab2ce7d5152d 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -585,6 +585,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) xfrm_pol_hold(policy); net->xfrm.policy_count[dir]++; atomic_inc(&flow_cache_genid); + rt_genid_bump(net); if (delpol) __xfrm_policy_unlink(delpol, dir); policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir); diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h index c220f314709c..65f67cb0aefb 100644 --- a/security/selinux/include/xfrm.h +++ b/security/selinux/include/xfrm.h @@ -51,6 +51,7 @@ int selinux_xfrm_decode_session(struct sk_buff *skb, u32 *sid, int ckall); static inline void selinux_xfrm_notify_policyload(void) { atomic_inc(&flow_cache_genid); + rt_genid_bump(&init_net); } #else static inline int selinux_xfrm_enabled(void) -- cgit v1.2.3-70-g09d2 From 6f3118b571b8a4c06c7985dc3172c3526cb86253 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 10 Sep 2012 22:09:46 +0000 Subject: ipv6: use net->rt_genid to check dst validity IPv6 dst should take care of rt_genid too. When a xfrm policy is inserted or deleted, all dst should be invalidated. To force the validation, dst entries should be created with ->obsolete set to DST_OBSOLETE_FORCE_CHK. This was already the case for all functions calling ip6_dst_alloc(), except for ip6_rt_copy(). As a consequence, we can remove the specific code in inet6_connection_sock. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 5 ++--- net/ipv6/inet6_connection_sock.c | 23 +---------------------- net/ipv6/route.c | 13 +++++++++---- 3 files changed, 12 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 0fedbd8d747a..9fc7114159e8 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -111,9 +111,8 @@ struct rt6_info { struct inet6_dev *rt6i_idev; unsigned long _rt6i_peer; -#ifdef CONFIG_XFRM - u32 rt6i_flow_cache_genid; -#endif + u32 rt6i_genid; + /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 0251a6005be8..c4f934176cab 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -175,33 +175,12 @@ void __inet6_csk_dst_store(struct sock *sk, struct dst_entry *dst, const struct in6_addr *saddr) { __ip6_dst_store(sk, dst, daddr, saddr); - -#ifdef CONFIG_XFRM - { - struct rt6_info *rt = (struct rt6_info *)dst; - rt->rt6i_flow_cache_genid = atomic_read(&flow_cache_genid); - } -#endif } static inline struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) { - struct dst_entry *dst; - - dst = __sk_dst_check(sk, cookie); - -#ifdef CONFIG_XFRM - if (dst) { - struct rt6_info *rt = (struct rt6_info *)dst; - if (rt->rt6i_flow_cache_genid != atomic_read(&flow_cache_genid)) { - __sk_dst_reset(sk); - dst = NULL; - } - } -#endif - - return dst; + return __sk_dst_check(sk, cookie); } static struct dst_entry *inet6_csk_route_socket(struct sock *sk, diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8e80fd279100..fb29e2215a19 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -281,13 +281,14 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net, struct fib6_table *table) { struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, - 0, DST_OBSOLETE_NONE, flags); + 0, DST_OBSOLETE_FORCE_CHK, flags); if (rt) { struct dst_entry *dst = &rt->dst; memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers); + rt->rt6i_genid = rt_genid(net); } return rt; } @@ -1031,6 +1032,13 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) rt = (struct rt6_info *) dst; + /* All IPV6 dsts are created with ->obsolete set to the value + * DST_OBSOLETE_FORCE_CHK which forces validation calls down + * into this function always. + */ + if (rt->rt6i_genid != rt_genid(dev_net(rt->dst.dev))) + return NULL; + if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) { if (rt->rt6i_peer_genid != rt6_peer_genid()) { if (!rt6_has_peer(rt)) @@ -1397,8 +1405,6 @@ int ip6_route_add(struct fib6_config *cfg) goto out; } - rt->dst.obsolete = -1; - if (cfg->fc_flags & RTF_EXPIRES) rt6_set_expires(rt, jiffies + clock_t_to_jiffies(cfg->fc_expires)); @@ -2080,7 +2086,6 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, rt->dst.input = ip6_input; rt->dst.output = ip6_output; rt->rt6i_idev = idev; - rt->dst.obsolete = -1; rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; if (anycast) -- cgit v1.2.3-70-g09d2 From 2c20cbd7e3aa6e9dddc07975d3f3a89fe1f69c00 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Mon, 10 Sep 2012 22:09:47 +0000 Subject: ipv6: use DST_* macro to set obselete field Signed-off-by: Nicolas Dichtel Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/route.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index fb29e2215a19..854e4018d205 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -226,7 +226,7 @@ static struct rt6_info ip6_null_entry_template = { .dst = { .__refcnt = ATOMIC_INIT(1), .__use = 1, - .obsolete = -1, + .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -ENETUNREACH, .input = ip6_pkt_discard, .output = ip6_pkt_discard_out, @@ -246,7 +246,7 @@ static struct rt6_info ip6_prohibit_entry_template = { .dst = { .__refcnt = ATOMIC_INIT(1), .__use = 1, - .obsolete = -1, + .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EACCES, .input = ip6_pkt_prohibit, .output = ip6_pkt_prohibit_out, @@ -261,7 +261,7 @@ static struct rt6_info ip6_blk_hole_entry_template = { .dst = { .__refcnt = ATOMIC_INIT(1), .__use = 1, - .obsolete = -1, + .obsolete = DST_OBSOLETE_FORCE_CHK, .error = -EINVAL, .input = dst_discard, .output = dst_discard, -- cgit v1.2.3-70-g09d2 From 864745d291b5ba80ea0bd0edcbe67273de368836 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 13 Sep 2012 11:41:26 +0000 Subject: xfrm_user: return error pointer instead of NULL When dump_one_state() returns an error, e.g. because of a too small buffer to dump the whole xfrm state, xfrm_state_netlink() returns NULL instead of an error pointer. But its callers expect an error pointer and therefore continue to operate on a NULL skbuff. This could lead to a privilege escalation (execution of user code in kernel context) if the attacker has CAP_NET_ADMIN and is able to map address 0. Signed-off-by: Mathias Krause Acked-by: Steffen Klassert Signed-off-by: David S. Miller --- net/xfrm/xfrm_user.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index e75d8e47f35c..dac08e2a5a93 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -878,6 +878,7 @@ static struct sk_buff *xfrm_state_netlink(struct sk_buff *in_skb, { struct xfrm_dump_info info; struct sk_buff *skb; + int err; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) @@ -888,9 +889,10 @@ static struct sk_buff *xfrm_state_netlink(struct sk_buff *in_skb, info.nlmsg_seq = seq; info.nlmsg_flags = 0; - if (dump_one_state(x, 0, &info)) { + err = dump_one_state(x, 0, &info); + if (err) { kfree_skb(skb); - return NULL; + return ERR_PTR(err); } return skb; -- cgit v1.2.3-70-g09d2 From c25463722509fef0ed630b271576a8c9a70236f3 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Fri, 14 Sep 2012 09:58:32 +0000 Subject: xfrm_user: return error pointer instead of NULL #2 When dump_one_policy() returns an error, e.g. because of a too small buffer to dump the whole xfrm policy, xfrm_policy_netlink() returns NULL instead of an error pointer. But its caller expects an error pointer and therefore continues to operate on a NULL skbuff. Signed-off-by: Mathias Krause Acked-by: Steffen Klassert Signed-off-by: David S. Miller --- net/xfrm/xfrm_user.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index dac08e2a5a93..d12b62547ad0 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1548,6 +1548,7 @@ static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb, { struct xfrm_dump_info info; struct sk_buff *skb; + int err; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) @@ -1558,9 +1559,10 @@ static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb, info.nlmsg_seq = seq; info.nlmsg_flags = 0; - if (dump_one_policy(xp, dir, 0, &info) < 0) { + err = dump_one_policy(xp, dir, 0, &info); + if (err) { kfree_skb(skb); - return NULL; + return ERR_PTR(err); } return skb; -- cgit v1.2.3-70-g09d2 From 0e698bf6624c469cd4f3f391247b142963ca9c4e Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sat, 15 Sep 2012 22:44:16 +0000 Subject: net: fix memory leak on oom with zerocopy If orphan flags fails, we don't free the skb on receive, which leaks the skb memory. Return value was also wrong: netif_receive_skb is supposed to return NET_RX_DROP, not ENOMEM. Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index d7fe32c946c1..ac7609d85187 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3322,7 +3322,7 @@ ncls: if (pt_prev) { if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) - ret = -ENOMEM; + goto drop; else ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } else { -- cgit v1.2.3-70-g09d2 From 1d57f19539c074105791da6384a8ad674bba8037 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 17 Sep 2012 12:51:39 +0000 Subject: tcp: fix regression in urgent data handling Stephan Springl found that commit 1402d366019fed "tcp: introduce tcp_try_coalesce" introduced a regression for rlogin It turns out problem comes from TCP urgent data handling and a change in behavior in input path. rlogin sends two one-byte packets with URG ptr set, and when next data frame is coalesced, we lack sk_data_ready() calls to wakeup consumer. Signed-off-by: Eric Dumazet Reported-by: Stephan Springl Cc: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6e38c6c23caa..d377f4854cb8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4661,7 +4661,7 @@ queue_and_out: if (eaten > 0) kfree_skb_partial(skb, fragstolen); - else if (!sock_flag(sk, SOCK_DEAD)) + if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk, 0); return; } @@ -5556,8 +5556,7 @@ no_ack: #endif if (eaten) kfree_skb_partial(skb, fragstolen); - else - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk, 0); return 0; } } -- cgit v1.2.3-70-g09d2 From 433a19548061bb5457b6ab77ed7ea58ca6e43ddb Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Mon, 17 Sep 2012 22:40:10 +0000 Subject: xfrm: fix a read lock imbalance in make_blackhole if xfrm_policy_get_afinfo returns 0, it has already released the read lock, xfrm_policy_put_afinfo should not be called again. Signed-off-by: Li RongQing Signed-off-by: David S. Miller --- net/xfrm/xfrm_policy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index ab2ce7d5152d..387848e90078 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1764,7 +1764,7 @@ static struct dst_entry *make_blackhole(struct net *net, u16 family, if (!afinfo) { dst_release(dst_orig); - ret = ERR_PTR(-EINVAL); + return ERR_PTR(-EINVAL); } else { ret = afinfo->blackhole_route(net, dst_orig); } -- cgit v1.2.3-70-g09d2 From dbd6b11e15a2f96030da17dbeda943a8a98ee990 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Fri, 14 Sep 2012 00:40:54 +0000 Subject: batman-adv: make batadv_test_bit() return 0 or 1 only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On some architectures test_bit() can return other values than 0 or 1: With a generic x86 OpenWrt image in a kvm setup (batadv_)test_bit() frequently returns -1 for me, leading to batadv_iv_ogm_update_seqnos() wrongly signaling a protected seqno window. This patch tries to fix this issue by making batadv_test_bit() return 0 or 1 only. Signed-off-by: Linus Lüssing Acked-by: Sven Eckelmann Signed-off-by: Antonio Quartulli Signed-off-by: David S. Miller --- net/batman-adv/bitarray.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index a081ce1c0514..cebaae7e148b 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -20,8 +20,8 @@ #ifndef _NET_BATMAN_ADV_BITARRAY_H_ #define _NET_BATMAN_ADV_BITARRAY_H_ -/* returns true if the corresponding bit in the given seq_bits indicates true - * and curr_seqno is within range of last_seqno +/* Returns 1 if the corresponding bit in the given seq_bits indicates true + * and curr_seqno is within range of last_seqno. Otherwise returns 0. */ static inline int batadv_test_bit(const unsigned long *seq_bits, uint32_t last_seqno, uint32_t curr_seqno) @@ -32,7 +32,7 @@ static inline int batadv_test_bit(const unsigned long *seq_bits, if (diff < 0 || diff >= BATADV_TQ_LOCAL_WINDOW_SIZE) return 0; else - return test_bit(diff, seq_bits); + return test_bit(diff, seq_bits) != 0; } /* turn corresponding bit on, so we can remember that we got the packet */ -- cgit v1.2.3-70-g09d2 From 15c041759bfcd9ab0a4e43f1c16e2644977d0467 Mon Sep 17 00:00:00 2001 From: Michal Kubeček Date: Fri, 14 Sep 2012 04:59:52 +0000 Subject: tcp: flush DMA queue before sk_wait_data if rcv_wnd is zero If recv() syscall is called for a TCP socket so that - IOAT DMA is used - MSG_WAITALL flag is used - requested length is bigger than sk_rcvbuf - enough data has already arrived to bring rcv_wnd to zero then when tcp_recvmsg() gets to calling sk_wait_data(), receive window can be still zero while sk_async_wait_queue exhausts enough space to keep it zero. As this queue isn't cleaned until the tcp_service_net_dma() call, sk_wait_data() cannot receive any data and blocks forever. If zero receive window and non-empty sk_async_wait_queue is detected before calling sk_wait_data(), process the queue first. Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2109ff4a1daf..bf9a8ab29459 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1762,8 +1762,14 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } #ifdef CONFIG_NET_DMA - if (tp->ucopy.dma_chan) - dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); + if (tp->ucopy.dma_chan) { + if (tp->rcv_wnd == 0 && + !skb_queue_empty(&sk->sk_async_wait_queue)) { + tcp_service_net_dma(sk, true); + tcp_cleanup_rbuf(sk, copied); + } else + dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); + } #endif if (copied >= target) { /* Do not sleep, just process backlog. */ -- cgit v1.2.3-70-g09d2 From 71261956973ba9e0637848a5adb4a5819b4bae83 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 15 Sep 2012 00:41:35 +0000 Subject: pkt_sched: fix virtual-start-time update in QFQ If the old timestamps of a class, say cl, are stale when the class becomes active, then QFQ may assign to cl a much higher start time than the maximum value allowed. This may happen when QFQ assigns to the start time of cl the finish time of a group whose classes are characterized by a higher value of the ratio max_class_pkt/weight_of_the_class with respect to that of cl. Inserting a class with a too high start time into the bucket list corrupts the data structure and may eventually lead to crashes. This patch limits the maximum start time assigned to a class. Signed-off-by: Paolo Valente Signed-off-by: David S. Miller --- net/sched/sch_qfq.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index e4723d31fdd5..211a21217045 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -865,7 +865,10 @@ static void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl) if (mask) { struct qfq_group *next = qfq_ffs(q, mask); if (qfq_gt(roundedF, next->F)) { - cl->S = next->F; + if (qfq_gt(limit, next->F)) + cl->S = next->F; + else /* preserve timestamp correctness */ + cl->S = limit; return; } } -- cgit v1.2.3-70-g09d2 From 8ea853fd0b721f14eacff1a5b364fe3e60d2dd82 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Tue, 18 Sep 2012 16:53:21 +0000 Subject: net/core: fix comment in skb_try_coalesce It should be the skb which is not cloned Signed-off-by: Li RongQing Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fe00d1208167..e33ebae519c8 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3502,7 +3502,9 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, if (!skb_cloned(from)) skb_shinfo(from)->nr_frags = 0; - /* if the skb is cloned this does nothing since we set nr_frags to 0 */ + /* if the skb is not cloned this does nothing + * since we set nr_frags to 0. + */ for (i = 0; i < skb_shinfo(from)->nr_frags; i++) skb_frag_ref(from, i); -- cgit v1.2.3-70-g09d2 From bc26ccd8fc756749de95606d28314efd0ce5aec3 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Wed, 19 Sep 2012 09:40:00 +0000 Subject: tcp: restore rcv_wscale in a repair mode (v2) rcv_wscale is a symetric parameter with snd_wscale. Both this parameters are set on a connection handshake. Without this value a remote window size can not be interpreted correctly, because a value from a packet should be shifted on rcv_wscale. And one more thing is that wscale_ok should be set too. This patch doesn't break a backward compatibility. If someone uses it in a old scheme, a rcv window will be restored with the same bug (rcv_wscale = 0). v2: Save backward compatibility on big-endian system. Before the first two bytes were snd_wscale and the second two bytes were rcv_wscale. Now snd_wscale is opt_val & 0xFFFF and rcv_wscale >> 16. This approach is independent on byte ordering. Cc: David S. Miller Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy CC: Pavel Emelyanov Signed-off-by: Andrew Vagin Acked-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bf9a8ab29459..5f6419341821 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2331,10 +2331,17 @@ static int tcp_repair_options_est(struct tcp_sock *tp, tp->rx_opt.mss_clamp = opt.opt_val; break; case TCPOPT_WINDOW: - if (opt.opt_val > 14) - return -EFBIG; + { + u16 snd_wscale = opt.opt_val & 0xFFFF; + u16 rcv_wscale = opt.opt_val >> 16; + + if (snd_wscale > 14 || rcv_wscale > 14) + return -EFBIG; - tp->rx_opt.snd_wscale = opt.opt_val; + tp->rx_opt.snd_wscale = snd_wscale; + tp->rx_opt.rcv_wscale = rcv_wscale; + tp->rx_opt.wscale_ok = 1; + } break; case TCPOPT_SACK_PERM: if (opt.opt_val != 0) -- cgit v1.2.3-70-g09d2 From 4c87308bdea31a7b4828a51f6156e6f721a1fcc9 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 19 Sep 2012 11:33:38 +0000 Subject: xfrm_user: fix info leak in copy_to_user_auth() copy_to_user_auth() fails to initialize the remainder of alg_name and therefore discloses up to 54 bytes of heap memory via netlink to userland. Use strncpy() instead of strcpy() to fill the trailing bytes of alg_name with null bytes. Signed-off-by: Mathias Krause Acked-by: Steffen Klassert Signed-off-by: David S. Miller --- net/xfrm/xfrm_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index d12b62547ad0..40dd50d6c4cc 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -742,7 +742,7 @@ static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb) return -EMSGSIZE; algo = nla_data(nla); - strcpy(algo->alg_name, auth->alg_name); + strncpy(algo->alg_name, auth->alg_name, sizeof(algo->alg_name)); memcpy(algo->alg_key, auth->alg_key, (auth->alg_key_len + 7) / 8); algo->alg_key_len = auth->alg_key_len; -- cgit v1.2.3-70-g09d2 From f778a636713a435d3a922c60b1622a91136560c1 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 19 Sep 2012 11:33:39 +0000 Subject: xfrm_user: fix info leak in copy_to_user_state() The memory reserved to dump the xfrm state includes the padding bytes of struct xfrm_usersa_info added by the compiler for alignment (7 for amd64, 3 for i386). Add an explicit memset(0) before filling the buffer to avoid the info leak. Signed-off-by: Mathias Krause Acked-by: Steffen Klassert Signed-off-by: David S. Miller --- net/xfrm/xfrm_user.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 40dd50d6c4cc..d585459dc8bb 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -689,6 +689,7 @@ out: static void copy_to_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p) { + memset(p, 0, sizeof(*p)); memcpy(&p->id, &x->id, sizeof(p->id)); memcpy(&p->sel, &x->sel, sizeof(p->sel)); memcpy(&p->lft, &x->lft, sizeof(p->lft)); -- cgit v1.2.3-70-g09d2 From 7b789836f434c87168eab067cfbed1ec4783dffd Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 19 Sep 2012 11:33:40 +0000 Subject: xfrm_user: fix info leak in copy_to_user_policy() The memory reserved to dump the xfrm policy includes multiple padding bytes added by the compiler for alignment (padding bytes in struct xfrm_selector and struct xfrm_userpolicy_info). Add an explicit memset(0) before filling the buffer to avoid the heap info leak. Signed-off-by: Mathias Krause Acked-by: Steffen Klassert Signed-off-by: David S. Miller --- net/xfrm/xfrm_user.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index d585459dc8bb..84dd85ceeeea 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1320,6 +1320,7 @@ static void copy_from_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy static void copy_to_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_info *p, int dir) { + memset(p, 0, sizeof(*p)); memcpy(&p->sel, &xp->selector, sizeof(p->sel)); memcpy(&p->lft, &xp->lft, sizeof(p->lft)); memcpy(&p->curlft, &xp->curlft, sizeof(p->curlft)); -- cgit v1.2.3-70-g09d2 From 1f86840f897717f86d523a13e99a447e6a5d2fa5 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 19 Sep 2012 11:33:41 +0000 Subject: xfrm_user: fix info leak in copy_to_user_tmpl() The memory used for the template copy is a local stack variable. As struct xfrm_user_tmpl contains multiple holes added by the compiler for alignment, not initializing the memory will lead to leaking stack bytes to userland. Add an explicit memset(0) to avoid the info leak. Initial version of the patch by Brad Spengler. Cc: Brad Spengler Signed-off-by: Mathias Krause Acked-by: Steffen Klassert Signed-off-by: David S. Miller --- net/xfrm/xfrm_user.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 84dd85ceeeea..8024b3dea8c2 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1425,6 +1425,7 @@ static int copy_to_user_tmpl(struct xfrm_policy *xp, struct sk_buff *skb) struct xfrm_user_tmpl *up = &vec[i]; struct xfrm_tmpl *kp = &xp->xfrm_vec[i]; + memset(up, 0, sizeof(*up)); memcpy(&up->id, &kp->id, sizeof(up->id)); up->family = kp->encap_family; memcpy(&up->saddr, &kp->saddr, sizeof(up->saddr)); -- cgit v1.2.3-70-g09d2 From ecd7918745234e423dd87fcc0c077da557909720 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 20 Sep 2012 10:01:49 +0000 Subject: xfrm_user: ensure user supplied esn replay window is valid The current code fails to ensure that the netlink message actually contains as many bytes as the header indicates. If a user creates a new state or updates an existing one but does not supply the bytes for the whole ESN replay window, the kernel copies random heap bytes into the replay bitmap, the ones happen to follow the XFRMA_REPLAY_ESN_VAL netlink attribute. This leads to following issues: 1. The replay window has random bits set confusing the replay handling code later on. 2. A malicious user could use this flaw to leak up to ~3.5kB of heap memory when she has access to the XFRM netlink interface (requires CAP_NET_ADMIN). Known users of the ESN replay window are strongSwan and Steffen's iproute2 patch (). The latter uses the interface with a bitmap supplied while the former does not. strongSwan is therefore prone to run into issue 1. To fix both issues without breaking existing userland allow using the XFRMA_REPLAY_ESN_VAL netlink attribute with either an empty bitmap or a fully specified one. For the former case we initialize the in-kernel bitmap with zero, for the latter we copy the user supplied bitmap. For state updates the full bitmap must be supplied. To prevent overflows in the bitmap length calculation the maximum size of bmp_len is limited to 128 by this patch -- resulting in a maximum replay window of 4096 packets. This should be sufficient for all real life scenarios (RFC 4303 recommends a default replay window size of 64). Cc: Steffen Klassert Cc: Martin Willi Cc: Ben Hutchings Signed-off-by: Mathias Krause Signed-off-by: David S. Miller --- include/linux/xfrm.h | 2 ++ net/xfrm/xfrm_user.c | 31 +++++++++++++++++++++++++------ 2 files changed, 27 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h index 22e61fdf75a2..28e493b5b94c 100644 --- a/include/linux/xfrm.h +++ b/include/linux/xfrm.h @@ -84,6 +84,8 @@ struct xfrm_replay_state { __u32 bitmap; }; +#define XFRMA_REPLAY_ESN_MAX 4096 + struct xfrm_replay_state_esn { unsigned int bmp_len; __u32 oseq; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 8024b3dea8c2..5927065e97cf 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -123,9 +123,21 @@ static inline int verify_replay(struct xfrm_usersa_info *p, struct nlattr **attrs) { struct nlattr *rt = attrs[XFRMA_REPLAY_ESN_VAL]; + struct xfrm_replay_state_esn *rs; - if ((p->flags & XFRM_STATE_ESN) && !rt) - return -EINVAL; + if (p->flags & XFRM_STATE_ESN) { + if (!rt) + return -EINVAL; + + rs = nla_data(rt); + + if (rs->bmp_len > XFRMA_REPLAY_ESN_MAX / sizeof(rs->bmp[0]) / 8) + return -EINVAL; + + if (nla_len(rt) < xfrm_replay_state_esn_len(rs) && + nla_len(rt) != sizeof(*rs)) + return -EINVAL; + } if (!rt) return 0; @@ -370,14 +382,15 @@ static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_es struct nlattr *rp) { struct xfrm_replay_state_esn *up; + int ulen; if (!replay_esn || !rp) return 0; up = nla_data(rp); + ulen = xfrm_replay_state_esn_len(up); - if (xfrm_replay_state_esn_len(replay_esn) != - xfrm_replay_state_esn_len(up)) + if (nla_len(rp) < ulen || xfrm_replay_state_esn_len(replay_esn) != ulen) return -EINVAL; return 0; @@ -388,22 +401,28 @@ static int xfrm_alloc_replay_state_esn(struct xfrm_replay_state_esn **replay_esn struct nlattr *rta) { struct xfrm_replay_state_esn *p, *pp, *up; + int klen, ulen; if (!rta) return 0; up = nla_data(rta); + klen = xfrm_replay_state_esn_len(up); + ulen = nla_len(rta) >= klen ? klen : sizeof(*up); - p = kmemdup(up, xfrm_replay_state_esn_len(up), GFP_KERNEL); + p = kzalloc(klen, GFP_KERNEL); if (!p) return -ENOMEM; - pp = kmemdup(up, xfrm_replay_state_esn_len(up), GFP_KERNEL); + pp = kzalloc(klen, GFP_KERNEL); if (!pp) { kfree(p); return -ENOMEM; } + memcpy(p, up, ulen); + memcpy(pp, up, ulen); + *replay_esn = p; *preplay_esn = pp; -- cgit v1.2.3-70-g09d2 From e3ac104d41a97b42316915020ba228c505447d21 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Wed, 19 Sep 2012 11:33:43 +0000 Subject: xfrm_user: don't copy esn replay window twice for new states The ESN replay window was already fully initialized in xfrm_alloc_replay_state_esn(). No need to copy it again. Cc: Steffen Klassert Signed-off-by: Mathias Krause Acked-by: Steffen Klassert Signed-off-by: David S. Miller --- net/xfrm/xfrm_user.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 5927065e97cf..289f4bf18ff0 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -461,10 +461,11 @@ static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info * * somehow made shareable and move it to xfrm_state.c - JHS * */ -static void xfrm_update_ae_params(struct xfrm_state *x, struct nlattr **attrs) +static void xfrm_update_ae_params(struct xfrm_state *x, struct nlattr **attrs, + int update_esn) { struct nlattr *rp = attrs[XFRMA_REPLAY_VAL]; - struct nlattr *re = attrs[XFRMA_REPLAY_ESN_VAL]; + struct nlattr *re = update_esn ? attrs[XFRMA_REPLAY_ESN_VAL] : NULL; struct nlattr *lt = attrs[XFRMA_LTIME_VAL]; struct nlattr *et = attrs[XFRMA_ETIMER_THRESH]; struct nlattr *rt = attrs[XFRMA_REPLAY_THRESH]; @@ -574,7 +575,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, goto error; /* override default values from above */ - xfrm_update_ae_params(x, attrs); + xfrm_update_ae_params(x, attrs, 0); return x; @@ -1848,7 +1849,7 @@ static int xfrm_new_ae(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; spin_lock_bh(&x->lock); - xfrm_update_ae_params(x, attrs); + xfrm_update_ae_params(x, attrs, 1); spin_unlock_bh(&x->lock); c.event = nlh->nlmsg_type; -- cgit v1.2.3-70-g09d2 From c0d680e577ff171e7b37dbdb1b1bf5451e851f04 Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Wed, 19 Sep 2012 15:49:00 +0000 Subject: net: do not disable sg for packets requiring no checksum A change in a series of VLAN-related changes appears to have inadvertently disabled the use of the scatter gather feature of network cards for transmission of non-IP ethernet protocols like ATA over Ethernet (AoE). Below is a reference to the commit that introduces a "harmonize_features" function that turns off scatter gather when the NIC does not support hardware checksumming for the ethernet protocol of an sk buff. commit f01a5236bd4b140198fbcc550f085e8361fd73fa Author: Jesse Gross Date: Sun Jan 9 06:23:31 2011 +0000 net offloading: Generalize netif_get_vlan_features(). The can_checksum_protocol function is not equipped to consider a protocol that does not require checksumming. Calling it for a protocol that requires no checksum is inappropriate. The patch below has harmonize_features call can_checksum_protocol when the protocol needs a checksum, so that the network layer is not forced to perform unnecessary skb linearization on the transmission of AoE packets. Unnecessary linearization results in decreased performance and increased memory pressure, as reported here: http://www.spinics.net/lists/linux-mm/msg15184.html The problem has probably not been widely experienced yet, because only recently has the kernel.org-distributed aoe driver acquired the ability to use payloads of over a page in size, with the patchset recently included in the mm tree: https://lkml.org/lkml/2012/8/28/140 The coraid.com-distributed aoe driver already could use payloads of greater than a page in size, but its users generally do not use the newest kernels. Signed-off-by: Ed Cashin Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index ac7609d85187..89e33a5d4d93 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2134,7 +2134,8 @@ static bool can_checksum_protocol(netdev_features_t features, __be16 protocol) static netdev_features_t harmonize_features(struct sk_buff *skb, __be16 protocol, netdev_features_t features) { - if (!can_checksum_protocol(features, protocol)) { + if (skb->ip_summed != CHECKSUM_NONE && + !can_checksum_protocol(features, protocol)) { features &= ~NETIF_F_ALL_CSUM; features &= ~NETIF_F_SG; } else if (illegal_highdma(skb->dev, skb)) { -- cgit v1.2.3-70-g09d2 From f950c0ecc78f745e490d615280e031de4dbb1306 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 20 Sep 2012 18:29:56 +0000 Subject: ipv6: fix return value check in fib6_add() In case of error, the function fib6_add_1() returns ERR_PTR() or NULL pointer. The ERR_PTR() case check is missing in fib6_add(). dpatch engine is used to generated this patch. (https://github.com/weiyj/dpatch) Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 13690d650c3e..286acfc21250 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -819,6 +819,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info) offsetof(struct rt6_info, rt6i_src), allow_create, replace_required); + if (IS_ERR(sn)) { + err = PTR_ERR(sn); + sn = NULL; + } if (!sn) { /* If it is failed, discard just allocated root, and then (in st_failure) stale node -- cgit v1.2.3-70-g09d2 From bf5b30b8a4416de04f1ac1196281ddb318669464 Mon Sep 17 00:00:00 2001 From: Zhao Hongjiang Date: Thu, 20 Sep 2012 22:37:25 +0000 Subject: net: change return values from -EACCES to -EPERM Change return value from -EACCES to -EPERM when the permission check fails. Signed-off-by: Zhao Hongjiang Signed-off-by: David S. Miller --- net/bluetooth/bnep/sock.c | 4 ++-- net/bluetooth/cmtp/sock.c | 4 ++-- net/bluetooth/hci_sock.c | 16 ++++++++-------- net/bluetooth/hidp/sock.c | 4 ++-- net/ipv4/devinet.c | 4 ++-- net/netrom/af_netrom.c | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c index 5e5f5b410e0b..1eaacf10d19d 100644 --- a/net/bluetooth/bnep/sock.c +++ b/net/bluetooth/bnep/sock.c @@ -58,7 +58,7 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long switch (cmd) { case BNEPCONNADD: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; if (copy_from_user(&ca, argp, sizeof(ca))) return -EFAULT; @@ -84,7 +84,7 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long case BNEPCONNDEL: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; if (copy_from_user(&cd, argp, sizeof(cd))) return -EFAULT; diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c index 311668d14571..32dc83dcb6b2 100644 --- a/net/bluetooth/cmtp/sock.c +++ b/net/bluetooth/cmtp/sock.c @@ -72,7 +72,7 @@ static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long switch (cmd) { case CMTPCONNADD: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; if (copy_from_user(&ca, argp, sizeof(ca))) return -EFAULT; @@ -97,7 +97,7 @@ static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long case CMTPCONNDEL: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; if (copy_from_user(&cd, argp, sizeof(cd))) return -EFAULT; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 19fdac78e555..d5ace1eda3ed 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -490,7 +490,7 @@ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, switch (cmd) { case HCISETRAW: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) return -EPERM; @@ -510,12 +510,12 @@ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, case HCIBLOCKADDR: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; return hci_sock_blacklist_add(hdev, (void __user *) arg); case HCIUNBLOCKADDR: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; return hci_sock_blacklist_del(hdev, (void __user *) arg); default: @@ -546,22 +546,22 @@ static int hci_sock_ioctl(struct socket *sock, unsigned int cmd, case HCIDEVUP: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; return hci_dev_open(arg); case HCIDEVDOWN: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; return hci_dev_close(arg); case HCIDEVRESET: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; return hci_dev_reset(arg); case HCIDEVRESTAT: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; return hci_dev_reset_stat(arg); case HCISETSCAN: @@ -573,7 +573,7 @@ static int hci_sock_ioctl(struct socket *sock, unsigned int cmd, case HCISETACLMTU: case HCISETSCOMTU: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; return hci_dev_cmd(cmd, argp); case HCIINQUIRY: diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index 18b3f6892a36..b24fb3bd8625 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -56,7 +56,7 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long switch (cmd) { case HIDPCONNADD: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; if (copy_from_user(&ca, argp, sizeof(ca))) return -EFAULT; @@ -91,7 +91,7 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long case HIDPCONNDEL: if (!capable(CAP_NET_ADMIN)) - return -EACCES; + return -EPERM; if (copy_from_user(&cd, argp, sizeof(cd))) return -EFAULT; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 9b55b6f5a585..e12fad773852 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -725,7 +725,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) break; case SIOCSIFFLAGS: - ret = -EACCES; + ret = -EPERM; if (!capable(CAP_NET_ADMIN)) goto out; break; @@ -733,7 +733,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCSIFBRDADDR: /* Set the broadcast address */ case SIOCSIFDSTADDR: /* Set the destination address */ case SIOCSIFNETMASK: /* Set the netmask for the interface */ - ret = -EACCES; + ret = -EPERM; if (!capable(CAP_NET_ADMIN)) goto out; ret = -EINVAL; diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 1b9024ee963c..7261eb81974f 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -601,7 +601,7 @@ static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (!capable(CAP_NET_BIND_SERVICE)) { dev_put(dev); release_sock(sk); - return -EACCES; + return -EPERM; } nr->user_addr = addr->fsa_digipeater[0]; nr->source_addr = addr->fsa_ax25.sax25_call; -- cgit v1.2.3-70-g09d2