Diffstat (limited to 'net')
114 files changed, 1508 insertions, 974 deletions
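The first hunk below (net/atm/clip.c) makes clip_encap() refuse a VCC whose CLIP state was never set up, instead of dereferencing a NULL pointer. As a rough user-space sketch of that guard, using hypothetical stand-in types rather than the kernel's struct atm_vcc and CLIP bookkeeping:

/* Illustration only: stand-in types, not the kernel's definitions. */
#include <errno.h>
#include <stddef.h>

struct clip_vcc { int encap; };

struct atm_vcc {
	void *user_back;	/* CLIP state, when present, hangs off here */
};

#define CLIP_VCC(vcc) ((struct clip_vcc *)((vcc)->user_back))

static int clip_encap(struct atm_vcc *vcc, int mode)
{
	/* A VCC that was never set up for CLIP carries no state, so
	 * CLIP_VCC(vcc) is NULL and dereferencing it would crash; the
	 * patch rejects this case with -EBADFD instead.
	 */
	if (!CLIP_VCC(vcc))
		return -EBADFD;

	CLIP_VCC(vcc)->encap = mode;
	return 0;
}

int main(void)
{
	struct atm_vcc vcc = { .user_back = NULL };

	/* With no CLIP state attached, the call now fails cleanly. */
	return clip_encap(&vcc, 1) == -EBADFD ? 0 : 1;
}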
diff --git a/net/atm/clip.c b/net/atm/clip.c index 17e55dfecbe2..e07f551a863c 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -317,6 +317,9 @@ static int clip_constructor(struct neighbour *neigh)  static int clip_encap(struct atm_vcc *vcc, int mode)  { +	if (!CLIP_VCC(vcc)) +		return -EBADFD; +  	CLIP_VCC(vcc)->encap = mode;  	return 0;  } diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index b4548c739a64..2dda439c8cb8 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -91,10 +91,50 @@ static void hci_connect_le_scan_cleanup(struct hci_conn *conn)  	 * autoconnect action, remove them completely. If they are, just unmark  	 * them as waiting for connection, by clearing explicit_connect field.  	 */ -	if (params->auto_connect == HCI_AUTO_CONN_EXPLICIT) +	params->explicit_connect = false; + +	list_del_init(&params->action); + +	switch (params->auto_connect) { +	case HCI_AUTO_CONN_EXPLICIT:  		hci_conn_params_del(conn->hdev, bdaddr, bdaddr_type); -	else -		params->explicit_connect = false; +		/* return instead of break to avoid duplicate scan update */ +		return; +	case HCI_AUTO_CONN_DIRECT: +	case HCI_AUTO_CONN_ALWAYS: +		list_add(&params->action, &conn->hdev->pend_le_conns); +		break; +	case HCI_AUTO_CONN_REPORT: +		list_add(&params->action, &conn->hdev->pend_le_reports); +		break; +	default: +		break; +	} + +	hci_update_background_scan(conn->hdev); +} + +static void hci_conn_cleanup(struct hci_conn *conn) +{ +	struct hci_dev *hdev = conn->hdev; + +	if (test_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags)) +		hci_conn_params_del(conn->hdev, &conn->dst, conn->dst_type); + +	hci_chan_list_flush(conn); + +	hci_conn_hash_del(hdev, conn); + +	if (hdev->notify) +		hdev->notify(hdev, HCI_NOTIFY_CONN_DEL); + +	hci_conn_del_sysfs(conn); + +	debugfs_remove_recursive(conn->debugfs); + +	hci_dev_put(hdev); + +	hci_conn_put(conn);  }  /* This function requires the caller holds hdev->lock */ @@ -102,8 +142,13 @@ static void hci_connect_le_scan_remove(struct hci_conn *conn)  {  	hci_connect_le_scan_cleanup(conn); -	hci_conn_hash_del(conn->hdev, conn); -	hci_update_background_scan(conn->hdev); +	/* We can't call hci_conn_del here since that would deadlock +	 * with trying to call cancel_delayed_work_sync(&conn->disc_work). +	 * Instead, call just hci_conn_cleanup() which contains the bare +	 * minimum cleanup operations needed for a connection in this +	 * state. +	 */ +	hci_conn_cleanup(conn);  }  static void hci_acl_create_connection(struct hci_conn *conn) @@ -581,27 +626,17 @@ int hci_conn_del(struct hci_conn *conn)  		}  	} -	hci_chan_list_flush(conn); -  	if (conn->amp_mgr)  		amp_mgr_put(conn->amp_mgr); -	hci_conn_hash_del(hdev, conn); -	if (hdev->notify) -		hdev->notify(hdev, HCI_NOTIFY_CONN_DEL); -  	skb_queue_purge(&conn->data_q); -	hci_conn_del_sysfs(conn); - -	debugfs_remove_recursive(conn->debugfs); - -	if (test_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags)) -		hci_conn_params_del(conn->hdev, &conn->dst, conn->dst_type); - -	hci_dev_put(hdev); - -	hci_conn_put(conn); +	/* Remove the connection from the list and cleanup its remaining +	 * state. This is a separate function since for some cases like +	 * BT_CONNECT_SCAN we *only* want the cleanup part without the +	 * rest of hci_conn_del. 
+	 */ +	hci_conn_cleanup(conn);  	return 0; } @@ -973,15 +1008,23 @@ static int hci_explicit_conn_params_set(struct hci_request *req,  	if (is_connected(hdev, addr, addr_type))  		return -EISCONN; -	params = hci_conn_params_add(hdev, addr, addr_type); -	if (!params) -		return -EIO; +	params = hci_conn_params_lookup(hdev, addr, addr_type); +	if (!params) { +		params = hci_conn_params_add(hdev, addr, addr_type); +		if (!params) +			return -ENOMEM; -	/* If we created new params, or existing params were marked as disabled, -	 * mark them to be used just once to connect. -	 */ -	if (params->auto_connect == HCI_AUTO_CONN_DISABLED) { +		/* If we created new params, mark them to be deleted in +		 * hci_connect_le_scan_cleanup. It's different case than +		 * existing disabled params, those will stay after cleanup. +		 */  		params->auto_connect = HCI_AUTO_CONN_EXPLICIT; +	} + +	/* We're trying to connect, so make sure params are at pend_le_conns */ +	if (params->auto_connect == HCI_AUTO_CONN_DISABLED || +	    params->auto_connect == HCI_AUTO_CONN_REPORT || +	    params->auto_connect == HCI_AUTO_CONN_EXPLICIT) {  		list_del_init(&params->action);  		list_add(&params->action, &hdev->pend_le_conns);  	} diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index adcbc74c2432..e837539452fb 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2861,13 +2861,6 @@ struct hci_conn_params *hci_explicit_connect_lookup(struct hci_dev *hdev,  			return param;  	} -	list_for_each_entry(param, &hdev->pend_le_reports, action) { -		if (bacmp(&param->addr, addr) == 0 && -		    param->addr_type == addr_type && -		    param->explicit_connect) -			return param; -	} -  	return NULL;  } diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 186041866315..bc31099d3b5b 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -55,7 +55,12 @@ static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb)  	wake_up_bit(&hdev->flags, HCI_INQUIRY);  	hci_dev_lock(hdev); -	hci_discovery_set_state(hdev, DISCOVERY_STOPPED); +	/* Set discovery state to stopped if we're not doing LE active +	 * scanning. +	 */ +	if (!hci_dev_test_flag(hdev, HCI_LE_SCAN) || +	    hdev->le_scan_type != LE_SCAN_ACTIVE) +		hci_discovery_set_state(hdev, DISCOVERY_STOPPED);  	hci_dev_unlock(hdev);  	hci_conn_check_pending(hdev); @@ -4648,8 +4653,8 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev,  	/* If we're not connectable only connect devices that we have in  	 * our pend_le_conns list.  	 */ -	params = hci_explicit_connect_lookup(hdev, addr, addr_type); - +	params = hci_pend_le_action_lookup(&hdev->pend_le_conns, addr, +					   addr_type);  	if (!params)  		return NULL; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index ccaf5a436d8f..c4fe2fee753f 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3545,6 +3545,7 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data,  				       auth_type);  	} else {  		u8 addr_type; +		struct hci_conn_params *p;  		/* Convert from L2CAP channel address type to HCI address type  		 */ @@ -3562,7 +3563,10 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data,  		 * If connection parameters already exist, then they  		 * will be kept and this function does nothing.  		 
*/ -	hci_conn_params_add(hdev, &cp->addr.bdaddr, addr_type); +	p = hci_conn_params_add(hdev, &cp->addr.bdaddr, addr_type); + +	if (p->auto_connect == HCI_AUTO_CONN_EXPLICIT) +		p->auto_connect = HCI_AUTO_CONN_DISABLED;  		conn = hci_connect_le_scan(hdev, &cp->addr.bdaddr,  					   addr_type, sec_level, @@ -6117,14 +6121,21 @@ static int hci_conn_params_set(struct hci_request *req, bdaddr_t *addr,  		__hci_update_background_scan(req);  		break;  	case HCI_AUTO_CONN_REPORT: -		list_add(&params->action, &hdev->pend_le_reports); +		if (params->explicit_connect) +			list_add(&params->action, &hdev->pend_le_conns); +		else +			list_add(&params->action, &hdev->pend_le_reports);  		__hci_update_background_scan(req);  		break;  	case HCI_AUTO_CONN_DIRECT:  	case HCI_AUTO_CONN_ALWAYS:  		if (!is_connected(hdev, addr, addr_type)) {  			list_add(&params->action, &hdev->pend_le_conns); -			__hci_update_background_scan(req); +			/* If we are in scan phase of connecting, we were +			 * already added to pend_le_conns and scanning. +			 */ +			if (params->auto_connect != HCI_AUTO_CONN_EXPLICIT) +				__hci_update_background_scan(req);  		}  		break;  	} @@ -6379,7 +6390,8 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev,  			goto unlock;  		} -		if (params->auto_connect == HCI_AUTO_CONN_DISABLED) { +		if (params->auto_connect == HCI_AUTO_CONN_DISABLED || +		    params->auto_connect == HCI_AUTO_CONN_EXPLICIT) {  			err = cmd->cmd_complete(cmd,  						MGMT_STATUS_INVALID_PARAMS);  			mgmt_pending_remove(cmd); @@ -6415,6 +6427,10 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev,  			if (p->auto_connect == HCI_AUTO_CONN_DISABLED)  				continue;  			device_removed(sk, hdev, &p->addr, p->addr_type); +			if (p->explicit_connect) { +				p->auto_connect = HCI_AUTO_CONN_EXPLICIT; +				continue; +			}  			list_del(&p->action);  			list_del(&p->list);  			kfree(p); diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index ad82324f710f..0510a577a7b5 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -2311,12 +2311,6 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level)  	if (!conn)  		return 1; -	chan = conn->smp; -	if (!chan) { -		BT_ERR("SMP security requested but not available"); -		return 1; -	} -  	if (!hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED))  		return 1; @@ -2330,6 +2324,12 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level)  		if (smp_ltk_encrypt(conn, hcon->pending_sec_level))  			return 0; +	chan = conn->smp; +	if (!chan) { +		BT_ERR("SMP security requested but not available"); +		return 1; +	} +  	l2cap_chan_lock(chan);  	/* If SMP is already in progress ignore this request */ diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 66efdc21f548..480b3de1a0e3 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1006,7 +1006,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,  	ih = igmpv3_report_hdr(skb);  	num = ntohs(ih->ngrec); -	len = sizeof(*ih); +	len = skb_transport_offset(skb) + sizeof(*ih);  	for (i = 0; i < num; i++) {  		len += sizeof(*grec); @@ -1067,7 +1067,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,  	icmp6h = icmp6_hdr(skb);  	num = ntohs(icmp6h->icmp6_dataun.un_data16[1]); -	len = sizeof(*icmp6h); +	len = skb_transport_offset(skb) + sizeof(*icmp6h);  	for (i = 0; i < num; i++) {  		__be16 *nsrcs, _nsrcs; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 525f454f7531..b9b0e3b5da49 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c 
@@ -1353,11 +1353,12 @@ static void prepare_write_keepalive(struct ceph_connection *con)  	dout("prepare_write_keepalive %p\n", con);  	con_out_kvec_reset(con);  	if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { -		struct timespec ts = CURRENT_TIME; -		struct ceph_timespec ceph_ts; -		ceph_encode_timespec(&ceph_ts, &ts); +		struct timespec now = CURRENT_TIME; +  		con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); -		con_out_kvec_add(con, sizeof(ceph_ts), &ceph_ts); +		ceph_encode_timespec(&con->out_temp_keepalive2, &now); +		con_out_kvec_add(con, sizeof(con->out_temp_keepalive2), +				 &con->out_temp_keepalive2);  	} else {  		con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive);  	} diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 80b94e37c94a..f79ccac6699f 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -285,6 +285,7 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,  	switch (op->op) {  	case CEPH_OSD_OP_READ:  	case CEPH_OSD_OP_WRITE: +	case CEPH_OSD_OP_WRITEFULL:  		ceph_osd_data_release(&op->extent.osd_data);  		break;  	case CEPH_OSD_OP_CALL: @@ -485,13 +486,14 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req,  	size_t payload_len = 0;  	BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE && -	       opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE); +	       opcode != CEPH_OSD_OP_WRITEFULL && opcode != CEPH_OSD_OP_ZERO && +	       opcode != CEPH_OSD_OP_TRUNCATE);  	op->extent.offset = offset;  	op->extent.length = length;  	op->extent.truncate_size = truncate_size;  	op->extent.truncate_seq = truncate_seq; -	if (opcode == CEPH_OSD_OP_WRITE) +	if (opcode == CEPH_OSD_OP_WRITE || opcode == CEPH_OSD_OP_WRITEFULL)  		payload_len += length;  	op->payload_len = payload_len; @@ -670,9 +672,11 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,  		break;  	case CEPH_OSD_OP_READ:  	case CEPH_OSD_OP_WRITE: +	case CEPH_OSD_OP_WRITEFULL:  	case CEPH_OSD_OP_ZERO:  	case CEPH_OSD_OP_TRUNCATE: -		if (src->op == CEPH_OSD_OP_WRITE) +		if (src->op == CEPH_OSD_OP_WRITE || +		    src->op == CEPH_OSD_OP_WRITEFULL)  			request_data_len = src->extent.length;  		dst->extent.offset = cpu_to_le64(src->extent.offset);  		dst->extent.length = cpu_to_le64(src->extent.length); @@ -681,7 +685,8 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,  		dst->extent.truncate_seq =  			cpu_to_le32(src->extent.truncate_seq);  		osd_data = &src->extent.osd_data; -		if (src->op == CEPH_OSD_OP_WRITE) +		if (src->op == CEPH_OSD_OP_WRITE || +		    src->op == CEPH_OSD_OP_WRITEFULL)  			ceph_osdc_msg_data_add(req->r_request, osd_data);  		else  			ceph_osdc_msg_data_add(req->r_reply, osd_data); diff --git a/net/core/dev.c b/net/core/dev.c index 877c84834d81..c14748d051e7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -99,6 +99,7 @@  #include <linux/rtnetlink.h>  #include <linux/stat.h>  #include <net/dst.h> +#include <net/dst_metadata.h>  #include <net/pkt_sched.h>  #include <net/checksum.h>  #include <net/xfrm.h> @@ -682,6 +683,32 @@ int dev_get_iflink(const struct net_device *dev)  EXPORT_SYMBOL(dev_get_iflink);  /** + *	dev_fill_metadata_dst - Retrieve tunnel egress information. + *	@dev: targeted interface + *	@skb: The packet. + * + *	For better visibility of tunnel traffic OVS needs to retrieve + *	egress tunnel information for a packet. Following API allows + *	user to get this info. 
+ */ +int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) +{ +	struct ip_tunnel_info *info; + +	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst) +		return -EINVAL; + +	info = skb_tunnel_info_unclone(skb); +	if (!info) +		return -ENOMEM; +	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX))) +		return -EINVAL; + +	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb); +} +EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); + +/**   *	__dev_get_by_name	- find a device by its name   *	@net: the applicable net namespace   *	@name: name to find @@ -4713,6 +4740,8 @@ void napi_disable(struct napi_struct *n)  	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))  		msleep(1); +	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state)) +		msleep(1);  	hrtimer_cancel(&n->timer); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index b495ab1797fa..29edf74846fc 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1284,7 +1284,7 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)  	gstrings.len = ret; -	data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); +	data = kcalloc(gstrings.len, ETH_GSTRING_LEN, GFP_USER);  	if (!data)  		return -ENOMEM; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index bf77e3639ce0..365de66436ac 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -631,15 +631,17 @@ static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb,  {  	int idx = 0;  	struct fib_rule *rule; +	int err = 0;  	rcu_read_lock();  	list_for_each_entry_rcu(rule, &ops->rules_list, list) {  		if (idx < cb->args[1])  			goto skip; -		if (fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, -				     cb->nlh->nlmsg_seq, RTM_NEWRULE, -				     NLM_F_MULTI, ops) < 0) +		err = fib_nl_fill_rule(skb, rule, NETLINK_CB(cb->skb).portid, +				       cb->nlh->nlmsg_seq, RTM_NEWRULE, +				       NLM_F_MULTI, ops); +		if (err)  			break;  skip:  		idx++; @@ -648,7 +650,7 @@ skip:  	cb->args[1] = idx;  	rules_ops_put(ops); -	return skb->len; +	return err;  }  static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb) @@ -664,7 +666,9 @@ static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)  		if (ops == NULL)  			return -EAFNOSUPPORT; -		return dump_rules(skb, cb, ops); +		dump_rules(skb, cb, ops); + +		return skb->len;  	}  	rcu_read_lock(); diff --git a/net/core/filter.c b/net/core/filter.c index 13079f03902e..bb18c3680001 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -478,9 +478,9 @@ do_pass:  				bpf_src = BPF_X;  			} else {  				insn->dst_reg = BPF_REG_A; -				insn->src_reg = BPF_REG_X;  				insn->imm = fp->k;  				bpf_src = BPF_SRC(fp->code); +				insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0;  			}  			/* Common case where 'jump_false' is next insn. */ @@ -1415,6 +1415,7 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)  		return dev_forward_skb(dev, skb2);  	skb2->dev = dev; +	skb_sender_cpu_clear(skb2);  	return dev_queue_xmit(skb2);  } @@ -1854,9 +1855,13 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,  		goto out;  	/* We're copying the filter that has been originally attached, -	 * so no conversion/decode needed anymore. +	 * so no conversion/decode needed anymore. eBPF programs that +	 * have no original program cannot be dumped through this.  	 
*/ +	ret = -EACCES;  	fprog = filter->prog->orig_prog; +	if (!fprog) +		goto out;  	ret = fprog->len;  	if (!len) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index b279077c3089..830f8a7c1cb1 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -31,7 +31,6 @@  static const char fmt_hex[] = "%#x\n";  static const char fmt_long_hex[] = "%#lx\n";  static const char fmt_dec[] = "%d\n"; -static const char fmt_udec[] = "%u\n";  static const char fmt_ulong[] = "%lu\n";  static const char fmt_u64[] = "%llu\n"; @@ -202,7 +201,7 @@ static ssize_t speed_show(struct device *dev,  	if (netif_running(netdev)) {  		struct ethtool_cmd cmd;  		if (!__ethtool_get_settings(netdev, &cmd)) -			ret = sprintf(buf, fmt_udec, ethtool_cmd_speed(&cmd)); +			ret = sprintf(buf, fmt_dec, ethtool_cmd_speed(&cmd));  	}  	rtnl_unlock();  	return ret; @@ -1481,6 +1480,15 @@ static int of_dev_node_match(struct device *dev, const void *data)  	return ret == 0 ? dev->of_node == data : ret;  } +/* + * of_find_net_device_by_node - lookup the net device for the device node + * @np: OF device node + * + * Looks up the net_device structure corresponding with the device node. + * If successful, returns a pointer to the net_device with the embedded + * struct device refcount incremented by one, or NULL on failure. The + * refcount must be dropped when done with the net_device. + */  struct net_device *of_find_net_device_by_node(struct device_node *np)  {  	struct device *dev; diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 6aa3db8dfc3b..8bdada242a7d 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -142,7 +142,7 @@ static void queue_process(struct work_struct *work)   */  static int poll_one_napi(struct napi_struct *napi, int budget)  { -	int work; +	int work = 0;  	/* net_rx_action's ->poll() invocations and our's are  	 * synchronized by this test which is only made while @@ -151,7 +151,12 @@ static int poll_one_napi(struct napi_struct *napi, int budget)  	if (!test_bit(NAPI_STATE_SCHED, &napi->state))  		return budget; -	set_bit(NAPI_STATE_NPSVC, &napi->state); +	/* If we set this bit but see that it has already been set, +	 * that indicates that napi has been disabled and we need +	 * to abort this operation +	 */ +	if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state)) +		goto out;  	work = napi->poll(napi, budget);  	WARN_ONCE(work > budget, "%pF exceeded budget in poll\n", napi->poll); @@ -159,6 +164,7 @@ static int poll_one_napi(struct napi_struct *napi, int budget)  	clear_bit(NAPI_STATE_NPSVC, &napi->state); +out:  	return budget - work;  } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a466821d1441..0ec48403ed68 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3047,6 +3047,7 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)  	u32 portid = NETLINK_CB(cb->skb).portid;  	u32 seq = cb->nlh->nlmsg_seq;  	u32 filter_mask = 0; +	int err;  	if (nlmsg_len(cb->nlh) > sizeof(struct ifinfomsg)) {  		struct nlattr *extfilt; @@ -3067,20 +3068,25 @@ static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)  		struct net_device *br_dev = netdev_master_upper_dev_get(dev);  		if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { -			if (idx >= cb->args[0] && -			    br_dev->netdev_ops->ndo_bridge_getlink( -				    skb, portid, seq, dev, filter_mask, -				    NLM_F_MULTI) < 0) -				break; +			if (idx >= cb->args[0]) { +				err = br_dev->netdev_ops->ndo_bridge_getlink( +						skb, portid, seq, dev, +						
filter_mask, NLM_F_MULTI); +				if (err < 0 && err != -EOPNOTSUPP) +					break; +			}  			idx++;  		}  		if (ops->ndo_bridge_getlink) { -			if (idx >= cb->args[0] && -			    ops->ndo_bridge_getlink(skb, portid, seq, dev, -						    filter_mask, -						    NLM_F_MULTI) < 0) -				break; +			if (idx >= cb->args[0]) { +				err = ops->ndo_bridge_getlink(skb, portid, +							      seq, dev, +							      filter_mask, +							      NLM_F_MULTI); +				if (err < 0 && err != -EOPNOTSUPP) +					break; +			}  			idx++;  		}  	} diff --git a/net/core/skbuff.c b/net/core/skbuff.c index dad4dd37e2aa..fab4599ba8b2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2958,11 +2958,12 @@ EXPORT_SYMBOL_GPL(skb_append_pagefrags);   */  unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)  { +	unsigned char *data = skb->data; +  	BUG_ON(len > skb->len); -	skb->len -= len; -	BUG_ON(skb->len < skb->data_len); -	skb_postpull_rcsum(skb, skb->data, len); -	return skb->data += len; +	__skb_pull(skb, len); +	skb_postpull_rcsum(skb, data, len); +	return skb->data;  }  EXPORT_SYMBOL_GPL(skb_pull_rcsum); diff --git a/net/core/sock.c b/net/core/sock.c index ca2984afe16e..3307c02244d3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2740,10 +2740,8 @@ static void req_prot_cleanup(struct request_sock_ops *rsk_prot)  		return;  	kfree(rsk_prot->slab_name);  	rsk_prot->slab_name = NULL; -	if (rsk_prot->slab) { -		kmem_cache_destroy(rsk_prot->slab); -		rsk_prot->slab = NULL; -	} +	kmem_cache_destroy(rsk_prot->slab); +	rsk_prot->slab = NULL;  }  static int req_prot_init(const struct proto *prot) @@ -2828,10 +2826,8 @@ void proto_unregister(struct proto *prot)  	list_del(&prot->node);  	mutex_unlock(&proto_list_mutex); -	if (prot->slab != NULL) { -		kmem_cache_destroy(prot->slab); -		prot->slab = NULL; -	} +	kmem_cache_destroy(prot->slab); +	prot->slab = NULL;  	req_prot_cleanup(prot->rsk_prot); diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index bd9e718c2a20..3de0d0362d7f 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -398,12 +398,8 @@ out_err:  void dccp_ackvec_exit(void)  { -	if (dccp_ackvec_slab != NULL) { -		kmem_cache_destroy(dccp_ackvec_slab); -		dccp_ackvec_slab = NULL; -	} -	if (dccp_ackvec_record_slab != NULL) { -		kmem_cache_destroy(dccp_ackvec_record_slab); -		dccp_ackvec_record_slab = NULL; -	} +	kmem_cache_destroy(dccp_ackvec_slab); +	dccp_ackvec_slab = NULL; +	kmem_cache_destroy(dccp_ackvec_record_slab); +	dccp_ackvec_record_slab = NULL;  } diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index 83498975165f..90f77d08cc37 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -95,8 +95,7 @@ static struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_f  static void ccid_kmem_cache_destroy(struct kmem_cache *slab)  { -	if (slab != NULL) -		kmem_cache_destroy(slab); +	kmem_cache_destroy(slab);  }  static int __init ccid_activate(struct ccid_operations *ccid_ops) diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 30addee2dd03..838f524cf11a 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -48,8 +48,6 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)  			tw->tw_ipv6only = sk->sk_ipv6only;  		}  #endif -		/* Linkage updates. */ -		__inet_twsk_hashdance(tw, sk, &dccp_hashinfo);  		/* Get the TIME_WAIT timeout firing. */  		if (timeo < rto) @@ -60,6 +58,8 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)  			timeo = DCCP_TIMEWAIT_LEN;  		inet_twsk_schedule(tw, timeo); +		/* Linkage updates. 
*/ +		__inet_twsk_hashdance(tw, sk, &dccp_hashinfo);  		inet_twsk_put(tw);  	} else {  		/* Sorry, if we're out of memory, just CLOSE this diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 76e3800765f8..adb5325f4934 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -22,6 +22,7 @@  #include <linux/of_platform.h>  #include <linux/of_net.h>  #include <linux/sysfs.h> +#include <linux/phy_fixed.h>  #include "dsa_priv.h"  char dsa_driver_version[] = "0.1"; @@ -305,7 +306,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)  	if (ret < 0)  		goto out; -	ds->slave_mii_bus = mdiobus_alloc(); +	ds->slave_mii_bus = devm_mdiobus_alloc(parent);  	if (ds->slave_mii_bus == NULL) {  		ret = -ENOMEM;  		goto out; @@ -314,7 +315,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)  	ret = mdiobus_register(ds->slave_mii_bus);  	if (ret < 0) -		goto out_free; +		goto out;  	/* @@ -367,10 +368,7 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent)  	return ret; -out_free: -	mdiobus_free(ds->slave_mii_bus);  out: -	kfree(ds);  	return ret;  } @@ -400,7 +398,7 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,  	/*  	 * Allocate and initialise switch state.  	 */ -	ds = kzalloc(sizeof(*ds) + drv->priv_size, GFP_KERNEL); +	ds = devm_kzalloc(parent, sizeof(*ds) + drv->priv_size, GFP_KERNEL);  	if (ds == NULL)  		return ERR_PTR(-ENOMEM); @@ -420,10 +418,47 @@ dsa_switch_setup(struct dsa_switch_tree *dst, int index,  static void dsa_switch_destroy(struct dsa_switch *ds)  { +	struct device_node *port_dn; +	struct phy_device *phydev; +	struct dsa_chip_data *cd = ds->pd; +	int port; +  #ifdef CONFIG_NET_DSA_HWMON  	if (ds->hwmon_dev)  		hwmon_device_unregister(ds->hwmon_dev);  #endif + +	/* Disable configuration of the CPU and DSA ports */ +	for (port = 0; port < DSA_MAX_PORTS; port++) { +		if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) +			continue; + +		port_dn = cd->port_dn[port]; +		if (of_phy_is_fixed_link(port_dn)) { +			phydev = of_phy_find_device(port_dn); +			if (phydev) { +				int addr = phydev->addr; + +				phy_device_free(phydev); +				of_node_put(port_dn); +				fixed_phy_del(addr); +			} +		} +	} + +	/* Destroy network devices for physical switch ports. 
*/ +	for (port = 0; port < DSA_MAX_PORTS; port++) { +		if (!(ds->phys_port_mask & (1 << port))) +			continue; + +		if (!ds->ports[port]) +			continue; + +		unregister_netdev(ds->ports[port]); +		free_netdev(ds->ports[port]); +	} + +	mdiobus_unregister(ds->slave_mii_bus);  }  #ifdef CONFIG_PM_SLEEP @@ -634,6 +669,10 @@ static void dsa_of_free_platform_data(struct dsa_platform_data *pd)  			port_index++;  		}  		kfree(pd->chip[i].rtable); + +		/* Drop our reference to the MDIO bus device */ +		if (pd->chip[i].host_dev) +			put_device(pd->chip[i].host_dev);  	}  	kfree(pd->chip);  } @@ -661,16 +700,22 @@ static int dsa_of_probe(struct device *dev)  		return -EPROBE_DEFER;  	ethernet = of_parse_phandle(np, "dsa,ethernet", 0); -	if (!ethernet) -		return -EINVAL; +	if (!ethernet) { +		ret = -EINVAL; +		goto out_put_mdio; +	}  	ethernet_dev = of_find_net_device_by_node(ethernet); -	if (!ethernet_dev) -		return -EPROBE_DEFER; +	if (!ethernet_dev) { +		ret = -EPROBE_DEFER; +		goto out_put_mdio; +	}  	pd = kzalloc(sizeof(*pd), GFP_KERNEL); -	if (!pd) -		return -ENOMEM; +	if (!pd) { +		ret = -ENOMEM; +		goto out_put_ethernet; +	}  	dev->platform_data = pd;  	pd->of_netdev = ethernet_dev; @@ -691,7 +736,9 @@ static int dsa_of_probe(struct device *dev)  		cd = &pd->chip[chip_index];  		cd->of_node = child; -		cd->host_dev = &mdio_bus->dev; + +		/* When assigning the host device, increment its refcount */ +		cd->host_dev = get_device(&mdio_bus->dev);  		sw_addr = of_get_property(child, "reg", NULL);  		if (!sw_addr) @@ -711,6 +758,12 @@ static int dsa_of_probe(struct device *dev)  				ret = -EPROBE_DEFER;  				goto out_free_chip;  			} + +			/* Drop the mdio_bus device ref, replacing the host +			 * device with the mdio_bus_switch device, keeping +			 * the refcount from of_mdio_find_bus() above. +			 */ +			put_device(cd->host_dev);  			cd->host_dev = &mdio_bus_switch->dev;  		} @@ -744,6 +797,10 @@ static int dsa_of_probe(struct device *dev)  		}  	} +	/* The individual chips hold their own refcount on the mdio bus, +	 * so drop ours */ +	put_device(&mdio_bus->dev); +  	return 0;  out_free_chip:  out_free:  	kfree(pd);  	dev->platform_data = NULL; +out_put_ethernet: +	put_device(&ethernet_dev->dev); +out_put_mdio: +	put_device(&mdio_bus->dev);  	return ret;  } @@ -762,6 +823,7 @@ static void dsa_of_remove(struct device *dev)  		return;  	dsa_of_free_platform_data(pd); +	put_device(&pd->of_netdev->dev);  	kfree(pd);  }  #else @@ -775,10 +837,11 @@ static inline void dsa_of_remove(struct device *dev)  {  }  #endif -static void dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, -			  struct device *parent, struct dsa_platform_data *pd) +static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev, +			 struct device *parent, struct dsa_platform_data *pd)  {  	int i; +	unsigned configured = 0;  	dst->pd = pd;  	dst->master_netdev = dev; @@ -798,9 +861,17 @@ static void dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev,  		dst->ds[i] = ds;  		if (ds->drv->poll_link != NULL)  			dst->link_poll_needed = 1; + +		++configured;  	}  	/* +	 * If no switch was found, exit cleanly +	 */ +	if (!configured) +		return -EPROBE_DEFER; + +	/*  	 * If we use a tagging format that doesn't have an ethertype  	 * field, make sure that all packets from this point on get  	 * sent to the tag format's receive function. 
@@ -816,6 +887,8 @@ static void dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev,  		dst->link_poll_timer.expires = round_jiffies(jiffies + HZ);  		add_timer(&dst->link_poll_timer);  	} + +	return 0;  }  static int dsa_probe(struct platform_device *pdev) @@ -856,7 +929,7 @@ static int dsa_probe(struct platform_device *pdev)  		goto out;  	} -	dst = kzalloc(sizeof(*dst), GFP_KERNEL); +	dst = devm_kzalloc(&pdev->dev, sizeof(*dst), GFP_KERNEL);  	if (dst == NULL) {  		dev_put(dev);  		ret = -ENOMEM; @@ -865,7 +938,9 @@ static int dsa_probe(struct platform_device *pdev)  	platform_set_drvdata(pdev, dst); -	dsa_setup_dst(dst, dev, &pdev->dev, pd); +	ret = dsa_setup_dst(dst, dev, &pdev->dev, pd); +	if (ret) +		goto out;  	return 0; @@ -887,7 +962,7 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst)  	for (i = 0; i < dst->pd->nr_chips; i++) {  		struct dsa_switch *ds = dst->ds[i]; -		if (ds != NULL) +		if (ds)  			dsa_switch_destroy(ds);  	}  } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index cce97385f743..7d91f4612ac0 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -458,12 +458,17 @@ static int dsa_slave_stp_update(struct net_device *dev, u8 state)  static int dsa_slave_port_attr_set(struct net_device *dev,  				   struct switchdev_attr *attr)  { -	int ret = 0; +	struct dsa_slave_priv *p = netdev_priv(dev); +	struct dsa_switch *ds = p->parent; +	int ret;  	switch (attr->id) {  	case SWITCHDEV_ATTR_PORT_STP_STATE: -		if (attr->trans == SWITCHDEV_TRANS_COMMIT) -			ret = dsa_slave_stp_update(dev, attr->u.stp_state); +		if (attr->trans == SWITCHDEV_TRANS_PREPARE) +			ret = ds->drv->port_stp_update ? 0 : -EOPNOTSUPP; +		else +			ret = ds->drv->port_stp_update(ds, p->port, +						       attr->u.stp_state);  		break;  	default:  		ret = -EOPNOTSUPP; diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index d25efc93d8f1..b6ca0890d018 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -78,7 +78,7 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev,  	trailer = skb_tail_pointer(skb) - 4;  	if (trailer[0] != 0x80 || (trailer[1] & 0xf8) != 0x00 || -	    (trailer[3] & 0xef) != 0x00 || trailer[3] != 0x00) +	    (trailer[2] & 0xef) != 0x00 || trailer[3] != 0x00)  		goto out_drop;  	source_port = trailer[1] & 7; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 30409b75e925..0c9c3482e419 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -113,6 +113,8 @@  #include <net/arp.h>  #include <net/ax25.h>  #include <net/netrom.h> +#include <net/dst_metadata.h> +#include <net/ip_tunnels.h>  #include <linux/uaccess.h> @@ -296,7 +298,8 @@ static void arp_send_dst(int type, int ptype, __be32 dest_ip,  			 struct net_device *dev, __be32 src_ip,  			 const unsigned char *dest_hw,  			 const unsigned char *src_hw, -			 const unsigned char *target_hw, struct sk_buff *oskb) +			 const unsigned char *target_hw, +			 struct dst_entry *dst)  {  	struct sk_buff *skb; @@ -309,9 +312,7 @@ static void arp_send_dst(int type, int ptype, __be32 dest_ip,  	if (!skb)  		return; -	if (oskb) -		skb_dst_copy(skb, oskb); - +	skb_dst_set(skb, dst_clone(dst));  	arp_xmit(skb);  } @@ -333,6 +334,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)  	__be32 target = *(__be32 *)neigh->primary_key;  	int probes = atomic_read(&neigh->probes);  	struct in_device *in_dev; +	struct dst_entry *dst = NULL;  	rcu_read_lock();  	in_dev = __in_dev_get_rcu(dev); @@ -381,9 +383,10 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)  		}  	} 
+	if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE)) +		dst = skb_dst(skb);  	arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, -		     dst_hw, dev->dev_addr, NULL, -		     dev->priv_flags & IFF_XMIT_DST_RELEASE ? NULL : skb); +		     dst_hw, dev->dev_addr, NULL, dst);  }  static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) @@ -649,6 +652,7 @@ static int arp_process(struct sock *sk, struct sk_buff *skb)  	int addr_type;  	struct neighbour *n;  	struct net *net = dev_net(dev); +	struct dst_entry *reply_dst = NULL;  	bool is_garp = false;  	/* arp_rcv below verifies the ARP header and verifies the device @@ -749,13 +753,18 @@ static int arp_process(struct sock *sk, struct sk_buff *skb)   *  cache.   */ +	if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb)) +		reply_dst = (struct dst_entry *) +			    iptunnel_metadata_reply(skb_metadata_dst(skb), +						    GFP_ATOMIC); +  	/* Special case: IPv4 duplicate address detection packet (RFC2131) */  	if (sip == 0) {  		if (arp->ar_op == htons(ARPOP_REQUEST) &&  		    inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL &&  		    !arp_ignore(in_dev, sip, tip)) -			arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, -				 dev->dev_addr, sha); +			arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, +				     sha, dev->dev_addr, sha, reply_dst);  		goto out;  	} @@ -774,9 +783,10 @@ static int arp_process(struct sock *sk, struct sk_buff *skb)  			if (!dont_send) {  				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);  				if (n) { -					arp_send(ARPOP_REPLY, ETH_P_ARP, sip, -						 dev, tip, sha, dev->dev_addr, -						 sha); +					arp_send_dst(ARPOP_REPLY, ETH_P_ARP, +						     sip, dev, tip, sha, +						     dev->dev_addr, sha, +						     reply_dst);  					neigh_release(n);  				}  			} @@ -794,13 +804,14 @@ static int arp_process(struct sock *sk, struct sk_buff *skb)  				if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||  				    skb->pkt_type == PACKET_HOST ||  				    NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) { -					arp_send(ARPOP_REPLY, ETH_P_ARP, sip, -						 dev, tip, sha, dev->dev_addr, -						 sha); +					arp_send_dst(ARPOP_REPLY, ETH_P_ARP, +						     sip, dev, tip, sha, +						     dev->dev_addr, sha, +						     reply_dst);  				} else {  					pneigh_enqueue(&arp_tbl,  						       in_dev->arp_parms, skb); -					return 0; +					goto out_free_dst;  				}  				goto out;  			} @@ -854,6 +865,8 @@ static int arp_process(struct sock *sk, struct sk_buff *skb)  out:  	consume_skb(skb); +out_free_dst: +	dst_release(reply_dst);  	return 0;  } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 6fcbd215cdbc..690bcbc59f26 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -340,6 +340,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,  	fl4.flowi4_tos = tos;  	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;  	fl4.flowi4_tun_key.tun_id = 0; +	fl4.flowi4_flags = 0;  	no_addr = idev->ifa_list == NULL; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 26d6ffb6d23c..744e5936c10d 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1426,7 +1426,7 @@ found:  			    nh->nh_flags & RTNH_F_LINKDOWN &&  			    !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))  				continue; -			if (!(flp->flowi4_flags & FLOWI_FLAG_VRFSRC)) { +			if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {  				if (flp->flowi4_oif &&  				    flp->flowi4_oif != nh->nh_oif)  					continue; @@ -1569,7 +1569,7 @@ static struct key_vector *leaf_walk_rcu(struct key_vector 
**tn, t_key key)  	do {  		/* record parent and next child index */  		pn = n; -		cindex = key ? get_index(key, pn) : 0; +		cindex = (key > pn->key) ? get_index(key, pn) : 0;  		if (cindex >> pn->bits)  			break; diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 5aa46d4b44ef..5a8ee3282550 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -36,7 +36,8 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,  				  SKB_GSO_TCP_ECN |  				  SKB_GSO_GRE |  				  SKB_GSO_GRE_CSUM | -				  SKB_GSO_IPIP))) +				  SKB_GSO_IPIP | +				  SKB_GSO_SIT)))  		goto out;  	if (!skb->encapsulation) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 79fe05befcae..e5eb8ac4089d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -427,7 +427,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)  	fl4.flowi4_mark = mark;  	fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);  	fl4.flowi4_proto = IPPROTO_ICMP; -	fl4.flowi4_oif = vrf_master_ifindex(skb->dev) ? : skb->dev->ifindex; +	fl4.flowi4_oif = vrf_master_ifindex(skb->dev);  	security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));  	rt = ip_route_output_key(net, &fl4);  	if (IS_ERR(rt)) @@ -461,7 +461,7 @@ static struct rtable *icmp_route_lookup(struct net *net,  	fl4->flowi4_proto = IPPROTO_ICMP;  	fl4->fl4_icmp_type = type;  	fl4->fl4_icmp_code = code; -	fl4->flowi4_oif = vrf_master_ifindex(skb_in->dev) ? : skb_in->dev->ifindex; +	fl4->flowi4_oif = vrf_master_ifindex(skb_in->dev);  	security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));  	rt = __ip_route_output_key(net, fl4); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 134957159c27..61b45a17fc73 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -577,21 +577,22 @@ EXPORT_SYMBOL(inet_rtx_syn_ack);  static bool reqsk_queue_unlink(struct request_sock_queue *queue,  			       struct request_sock *req)  { -	struct listen_sock *lopt = queue->listen_opt;  	struct request_sock **prev; +	struct listen_sock *lopt;  	bool found = false;  	spin_lock(&queue->syn_wait_lock); - -	for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL; -	     prev = &(*prev)->dl_next) { -		if (*prev == req) { -			*prev = req->dl_next; -			found = true; -			break; +	lopt = queue->listen_opt; +	if (lopt) { +		for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL; +		     prev = &(*prev)->dl_next) { +			if (*prev == req) { +				*prev = req->dl_next; +				found = true; +				break; +			}  		}  	} -  	spin_unlock(&queue->syn_wait_lock);  	if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer))  		reqsk_put(req); @@ -685,20 +686,20 @@ void reqsk_queue_hash_req(struct request_sock_queue *queue,  	req->num_timeout = 0;  	req->sk = NULL; +	setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); +	mod_timer_pinned(&req->rsk_timer, jiffies + timeout); +	req->rsk_hash = hash; +  	/* before letting lookups find us, make sure all req fields  	 * are committed to memory and refcnt initialized.  	 
*/  	smp_wmb();  	atomic_set(&req->rsk_refcnt, 2); -	setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); -	req->rsk_hash = hash;  	spin_lock(&queue->syn_wait_lock);  	req->dl_next = lopt->syn_table[hash];  	lopt->syn_table[hash] = req;  	spin_unlock(&queue->syn_wait_lock); - -	mod_timer_pinned(&req->rsk_timer, jiffies + timeout);  }  EXPORT_SYMBOL(reqsk_queue_hash_req); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index ae22cc24fbe8..c67f9bd7699c 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -123,13 +123,15 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,  	/*  	 * Step 2: Hash TW into tcp ehash chain.  	 * Notes : -	 * - tw_refcnt is set to 3 because : +	 * - tw_refcnt is set to 4 because :  	 * - We have one reference from bhash chain.  	 * - We have one reference from ehash chain. +	 * - We have one reference from timer. +	 * - One reference for ourself (our caller will release it).  	 * We can use atomic_set() because prior spin_lock()/spin_unlock()  	 * committed into memory all tw fields.  	 */ -	atomic_set(&tw->tw_refcnt, 1 + 1 + 1); +	atomic_set(&tw->tw_refcnt, 4);  	inet_twsk_add_node_rcu(tw, &ehead->chain);  	/* Step 3: Remove SK from hash chain */ @@ -217,7 +219,7 @@ void inet_twsk_deschedule_put(struct inet_timewait_sock *tw)  }  EXPORT_SYMBOL(inet_twsk_deschedule_put); -void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo) +void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)  {  	/* timeout := RTO * 3.5  	 * @@ -245,12 +247,14 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo)  	 */  	tw->tw_kill = timeo <= 4*HZ; -	if (!mod_timer_pinned(&tw->tw_timer, jiffies + timeo)) { -		atomic_inc(&tw->tw_refcnt); +	if (!rearm) { +		BUG_ON(mod_timer_pinned(&tw->tw_timer, jiffies + timeo));  		atomic_inc(&tw->tw_dr->tw_count); +	} else { +		mod_timer_pending(&tw->tw_timer, jiffies + timeo);  	}  } -EXPORT_SYMBOL_GPL(inet_twsk_schedule); +EXPORT_SYMBOL_GPL(__inet_twsk_schedule);  void inet_twsk_purge(struct inet_hashinfo *hashinfo,  		     struct inet_timewait_death_row *twdr, int family) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index bd0679d90519..614521437e30 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -498,10 +498,26 @@ static struct sk_buff *gre_handle_offloads(struct sk_buff *skb,  					csum ? 
SKB_GSO_GRE_CSUM : SKB_GSO_GRE);  } +static struct rtable *gre_get_rt(struct sk_buff *skb, +				 struct net_device *dev, +				 struct flowi4 *fl, +				 const struct ip_tunnel_key *key) +{ +	struct net *net = dev_net(dev); + +	memset(fl, 0, sizeof(*fl)); +	fl->daddr = key->u.ipv4.dst; +	fl->saddr = key->u.ipv4.src; +	fl->flowi4_tos = RT_TOS(key->tos); +	fl->flowi4_mark = skb->mark; +	fl->flowi4_proto = IPPROTO_GRE; + +	return ip_route_output_key(net, fl); +} +  static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)  {  	struct ip_tunnel_info *tun_info; -	struct net *net = dev_net(dev);  	const struct ip_tunnel_key *key;  	struct flowi4 fl;  	struct rtable *rt; @@ -516,14 +532,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)  		goto err_free_skb;  	key = &tun_info->key; -	memset(&fl, 0, sizeof(fl)); -	fl.daddr = key->u.ipv4.dst; -	fl.saddr = key->u.ipv4.src; -	fl.flowi4_tos = RT_TOS(key->tos); -	fl.flowi4_mark = skb->mark; -	fl.flowi4_proto = IPPROTO_GRE; - -	rt = ip_route_output_key(net, &fl); +	rt = gre_get_rt(skb, dev, &fl, key);  	if (IS_ERR(rt))  		goto err_free_skb; @@ -566,6 +575,24 @@ err_free_skb:  	dev->stats.tx_dropped++;  } +static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) +{ +	struct ip_tunnel_info *info = skb_tunnel_info(skb); +	struct rtable *rt; +	struct flowi4 fl4; + +	if (ip_tunnel_info_af(info) != AF_INET) +		return -EINVAL; + +	rt = gre_get_rt(skb, dev, &fl4, &info->key); +	if (IS_ERR(rt)) +		return PTR_ERR(rt); + +	ip_rt_put(rt); +	info->key.u.ipv4.src = fl4.saddr; +	return 0; +} +  static netdev_tx_t ipgre_xmit(struct sk_buff *skb,  			      struct net_device *dev)  { @@ -1023,6 +1050,7 @@ static const struct net_device_ops gre_tap_netdev_ops = {  	.ndo_change_mtu		= ip_tunnel_change_mtu,  	.ndo_get_stats64	= ip_tunnel_get_stats64,  	.ndo_get_iflink		= ip_tunnel_get_iflink, +	.ndo_fill_metadata_dst	= gre_fill_metadata_dst,  };  static void ipgre_tap_setup(struct net_device *dev) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 29ed6c5a5185..84dce6a92f93 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -46,12 +46,13 @@  #include <net/net_namespace.h>  #include <net/netns/generic.h>  #include <net/rtnetlink.h> +#include <net/dst_metadata.h>  int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,  		  __be32 src, __be32 dst, __u8 proto,  		  __u8 tos, __u8 ttl, __be16 df, bool xnet)  { -	int pkt_len = skb->len; +	int pkt_len = skb->len - skb_inner_network_offset(skb);  	struct iphdr *iph;  	int err; @@ -119,6 +120,33 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)  }  EXPORT_SYMBOL_GPL(iptunnel_pull_header); +struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, +					     gfp_t flags) +{ +	struct metadata_dst *res; +	struct ip_tunnel_info *dst, *src; + +	if (!md || md->u.tun_info.mode & IP_TUNNEL_INFO_TX) +		return NULL; + +	res = metadata_dst_alloc(0, flags); +	if (!res) +		return NULL; + +	dst = &res->u.tun_info; +	src = &md->u.tun_info; +	dst->key.tun_id = src->key.tun_id; +	if (src->mode & IP_TUNNEL_INFO_IPV6) +		memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src, +		       sizeof(struct in6_addr)); +	else +		dst->key.u.ipv4.dst = src->key.u.ipv4.src; +	dst->mode = src->mode | IP_TUNNEL_INFO_TX; + +	return res; +} +EXPORT_SYMBOL_GPL(iptunnel_metadata_reply); +  struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,  					 bool csum_help,  					 int gso_type_mask) @@ -198,8 
+226,6 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {  	[LWTUNNEL_IP_SRC]	= { .type = NLA_U32 },  	[LWTUNNEL_IP_TTL]	= { .type = NLA_U8 },  	[LWTUNNEL_IP_TOS]	= { .type = NLA_U8 }, -	[LWTUNNEL_IP_SPORT]	= { .type = NLA_U16 }, -	[LWTUNNEL_IP_DPORT]	= { .type = NLA_U16 },  	[LWTUNNEL_IP_FLAGS]	= { .type = NLA_U16 },  }; @@ -239,12 +265,6 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr,  	if (tb[LWTUNNEL_IP_TOS])  		tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); -	if (tb[LWTUNNEL_IP_SPORT]) -		tun_info->key.tp_src = nla_get_be16(tb[LWTUNNEL_IP_SPORT]); - -	if (tb[LWTUNNEL_IP_DPORT]) -		tun_info->key.tp_dst = nla_get_be16(tb[LWTUNNEL_IP_DPORT]); -  	if (tb[LWTUNNEL_IP_FLAGS])  		tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP_FLAGS]); @@ -266,8 +286,6 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb,  	    nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||  	    nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) ||  	    nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) || -	    nla_put_u16(skb, LWTUNNEL_IP_SPORT, tun_info->key.tp_src) || -	    nla_put_u16(skb, LWTUNNEL_IP_DPORT, tun_info->key.tp_dst) ||  	    nla_put_u16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags))  		return -ENOMEM; @@ -281,8 +299,6 @@ static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate)  		+ nla_total_size(4)	/* LWTUNNEL_IP_SRC */  		+ nla_total_size(1)	/* LWTUNNEL_IP_TOS */  		+ nla_total_size(1)	/* LWTUNNEL_IP_TTL */ -		+ nla_total_size(2)	/* LWTUNNEL_IP_SPORT */ -		+ nla_total_size(2)	/* LWTUNNEL_IP_DPORT */  		+ nla_total_size(2);	/* LWTUNNEL_IP_FLAGS */  } @@ -305,8 +321,6 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {  	[LWTUNNEL_IP6_SRC]		= { .len = sizeof(struct in6_addr) },  	[LWTUNNEL_IP6_HOPLIMIT]		= { .type = NLA_U8 },  	[LWTUNNEL_IP6_TC]		= { .type = NLA_U8 }, -	[LWTUNNEL_IP6_SPORT]		= { .type = NLA_U16 }, -	[LWTUNNEL_IP6_DPORT]		= { .type = NLA_U16 },  	[LWTUNNEL_IP6_FLAGS]		= { .type = NLA_U16 },  }; @@ -346,12 +360,6 @@ static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr,  	if (tb[LWTUNNEL_IP6_TC])  		tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]); -	if (tb[LWTUNNEL_IP6_SPORT]) -		tun_info->key.tp_src = nla_get_be16(tb[LWTUNNEL_IP6_SPORT]); - -	if (tb[LWTUNNEL_IP6_DPORT]) -		tun_info->key.tp_dst = nla_get_be16(tb[LWTUNNEL_IP6_DPORT]); -  	if (tb[LWTUNNEL_IP6_FLAGS])  		tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP6_FLAGS]); @@ -373,8 +381,6 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb,  	    nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) ||  	    nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.tos) ||  	    nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.ttl) || -	    nla_put_u16(skb, LWTUNNEL_IP6_SPORT, tun_info->key.tp_src) || -	    nla_put_u16(skb, LWTUNNEL_IP6_DPORT, tun_info->key.tp_dst) ||  	    nla_put_u16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags))  		return -ENOMEM; @@ -388,8 +394,6 @@ static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate)  		+ nla_total_size(16)	/* LWTUNNEL_IP6_SRC */  		+ nla_total_size(1)	/* LWTUNNEL_IP6_HOPLIMIT */  		+ nla_total_size(1)	/* LWTUNNEL_IP6_TC */ -		+ nla_total_size(2)	/* LWTUNNEL_IP6_SPORT */ -		+ nla_total_size(2)	/* LWTUNNEL_IP6_DPORT */  		+ nla_total_size(2);	/* LWTUNNEL_IP6_FLAGS */  } diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 690d27d3f2f9..a35584176535 100644 --- a/net/ipv4/netfilter/Kconfig +++ 
b/net/ipv4/netfilter/Kconfig @@ -75,6 +75,7 @@ endif # NF_TABLES  config NF_DUP_IPV4  	tristate "Netfilter IPv4 packet duplication to alternate destination" +	depends on !NF_CONNTRACK || NF_CONNTRACK  	help  	  This option enables the nf_dup_ipv4 core, which duplicates an IPv4  	  packet to be rerouted to another destination. diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 8618fd150c96..c4ffc9de1654 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -61,9 +61,7 @@ static bool rpfilter_lookup_reverse(struct flowi4 *fl4,  	if (FIB_RES_DEV(res) == dev)  		dev_match = true;  #endif -	if (dev_match || flags & XT_RPFILTER_LOOSE) -		return FIB_RES_NH(res).nh_scope <= RT_SCOPE_HOST; -	return dev_match; +	return dev_match || flags & XT_RPFILTER_LOOSE;  }  static bool rpfilter_is_local(const struct sk_buff *skb) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 5f4a5565ad8b..c81deb85acb4 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1737,6 +1737,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,  	fl4.flowi4_mark = skb->mark;  	fl4.flowi4_tos = tos;  	fl4.flowi4_scope = RT_SCOPE_UNIVERSE; +	fl4.flowi4_flags = 0;  	fl4.daddr = daddr;  	fl4.saddr = saddr;  	err = fib_lookup(net, &fl4, &res, 0); @@ -2045,6 +2046,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)  	struct fib_result res;  	struct rtable *rth;  	int orig_oif; +	int err = -ENETUNREACH;  	res.tclassid	= 0;  	res.fi		= NULL; @@ -2153,7 +2155,8 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)  		goto make_route;  	} -	if (fib_lookup(net, fl4, &res, 0)) { +	err = fib_lookup(net, fl4, &res, 0); +	if (err) {  		res.fi = NULL;  		res.table = NULL;  		if (fl4->flowi4_oif) { @@ -2181,7 +2184,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)  			res.type = RTN_UNICAST;  			goto make_route;  		} -		rth = ERR_PTR(-ENETUNREACH); +		rth = ERR_PTR(err);  		goto out;  	} diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index c6ded6b2a79f..448c2615fece 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -154,14 +154,20 @@ static void bictcp_init(struct sock *sk)  static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)  {  	if (event == CA_EVENT_TX_START) { -		s32 delta = tcp_time_stamp - tcp_sk(sk)->lsndtime;  		struct bictcp *ca = inet_csk_ca(sk); +		u32 now = tcp_time_stamp; +		s32 delta; + +		delta = now - tcp_sk(sk)->lsndtime;  		/* We were application limited (idle) for a while.  		 * Shift epoch_start to keep cwnd growth to cubic curve.  		 */ -		if (ca->epoch_start && delta > 0) +		if (ca->epoch_start && delta > 0) {  			ca->epoch_start += delta; +			if (after(ca->epoch_start, now)) +				ca->epoch_start = now; +		}  		return;  	}  } diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 7092a61c4dc8..7e538f71f5fb 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -209,7 +209,7 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags)  		/* alpha = (1 - g) * alpha + g * F */ -		alpha -= alpha >> dctcp_shift_g; +		alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g);  		if (bytes_ecn) {  			/* If dctcp_shift_g == 1, a 32bit value would overflow  			 * after 8 Mbytes. 
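The tcp_dctcp.c hunk above fixes the integer EWMA alpha = (1 - g) * alpha + g * F: once alpha drops below 2^dctcp_shift_g, the shifted term truncates to zero, so the old "alpha -= alpha >> dctcp_shift_g" stops decaying and alpha can never reach zero. A user-space sketch of both behaviours, with min_not_zero() reimplemented here for illustration (in the kernel it is an existing macro) and the default shift of 4 assumed:

/* Illustration only; not kernel code. */
#include <stdio.h>

/* Modeled on the kernel's min_not_zero(): the smaller of the two
 * values, ignoring a zero argument.
 */
static unsigned int min_not_zero(unsigned int a, unsigned int b)
{
	if (!a)
		return b;
	if (!b)
		return a;
	return a < b ? a : b;
}

int main(void)
{
	const unsigned int shift_g = 4;	/* default dctcp_shift_g, g = 1/16 */
	unsigned int old_alpha = 20, new_alpha = 20;

	/* Decay alpha with no ECN marks seen (F == 0). */
	for (int i = 0; i < 8; i++) {
		old_alpha -= old_alpha >> shift_g;	/* pre-patch */
		new_alpha -= min_not_zero(new_alpha,
					  new_alpha >> shift_g);	/* patched */
		printf("round %d: old=%2u new=%2u\n", i, old_alpha, new_alpha);
	}
	/* Once alpha < 16, (alpha >> 4) == 0: the old code is stuck at a
	 * nonzero floor (15 here), while the patched code subtracts the
	 * remaining alpha itself and converges to 0.
	 */
	return 0;
}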
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 6d8795b066ac..def765911ff8 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -162,9 +162,9 @@ kill_with_rst:  		if (tcp_death_row.sysctl_tw_recycle &&  		    tcptw->tw_ts_recent_stamp &&  		    tcp_tw_remember_stamp(tw)) -			inet_twsk_schedule(tw, tw->tw_timeout); +			inet_twsk_reschedule(tw, tw->tw_timeout);  		else -			inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); +			inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);  		return TCP_TW_ACK;  	} @@ -201,7 +201,7 @@ kill:  				return TCP_TW_SUCCESS;  			}  		} -		inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); +		inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);  		if (tmp_opt.saw_tstamp) {  			tcptw->tw_ts_recent	  = tmp_opt.rcv_tsval; @@ -251,7 +251,7 @@ kill:  		 * Do not reschedule in the last case.  		 */  		if (paws_reject || th->ack) -			inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); +			inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);  		return tcp_timewait_check_oow_rate_limit(  			tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); @@ -322,9 +322,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)  		} while (0);  #endif -		/* Linkage updates. */ -		__inet_twsk_hashdance(tw, sk, &tcp_hashinfo); -  		/* Get the TIME_WAIT timeout firing. */  		if (timeo < rto)  			timeo = rto; @@ -338,6 +335,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)  		}  		inet_twsk_schedule(tw, timeo); +		/* Linkage updates. */ +		__inet_twsk_hashdance(tw, sk, &tcp_hashinfo);  		inet_twsk_put(tw);  	} else {  		/* Sorry, if we're out of memory, just CLOSE this diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f9a8a12b62ee..3dbee0d83b15 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2897,6 +2897,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)  	skb_reserve(skb, MAX_TCP_HEADER);  	tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),  			     TCPHDR_ACK | TCPHDR_RST); +	skb_mstamp_get(&skb->skb_mstamp);  	/* Send it off. 
*/  	if (tcp_transmit_skb(sk, skb, 0, priority))  		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); @@ -3404,7 +3405,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)  	 */  	tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);  	skb_mstamp_get(&skb->skb_mstamp); -	NET_INC_STATS_BH(sock_net(sk), mib); +	NET_INC_STATS(sock_net(sk), mib);  	return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);  } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c0a15e7f359f..f7d1d5e19e95 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1024,7 +1024,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)  		if (netif_index_is_vrf(net, ipc.oif)) {  			flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,  					   RT_SCOPE_UNIVERSE, sk->sk_protocol, -					   (flow_flags | FLOWI_FLAG_VRFSRC), +					   (flow_flags | FLOWI_FLAG_VRFSRC | +					    FLOWI_FLAG_SKIP_NH_OIF),  					   faddr, saddr, dport,  					   inet->inet_sport); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 2878dbfffeb7..41a261355662 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -30,6 +30,8 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb)  	mtu = dst_mtu(skb_dst(skb));  	if (skb->len > mtu) { +		skb->protocol = htons(ETH_P_IP); +  		if (skb->sk)  			xfrm_local_error(skb, mtu);  		else diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index bb919b28619f..c10a9ee68433 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -33,6 +33,8 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,  	if (saddr)  		fl4->saddr = saddr->a4; +	fl4->flowi4_flags = FLOWI_FLAG_SKIP_NH_OIF; +  	rt = __ip_route_output_key(net, fl4);  	if (!IS_ERR(rt))  		return &rt->dst; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 030fefdc9aed..36b85bd05ac8 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3119,6 +3119,8 @@ static void addrconf_gre_config(struct net_device *dev)  	}  	addrconf_addr_gen(idev, true); +	if (dev->flags & IFF_POINTOPOINT) +		addrconf_add_mroute(dev);  }  #endif @@ -5127,13 +5129,12 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)  			rt = addrconf_get_prefix_route(&ifp->peer_addr, 128,  						       ifp->idev->dev, 0, 0); -			if (rt && ip6_del_rt(rt)) -				dst_free(&rt->dst); +			if (rt) +				ip6_del_rt(rt);  		}  		dst_hold(&ifp->rt->dst); -		if (ip6_del_rt(ifp->rt)) -			dst_free(&ifp->rt->dst); +		ip6_del_rt(ifp->rt);  		rt_genid_bump_ipv6(net);  		break; diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 9f777ec59a59..ed33abf57abd 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -32,6 +32,7 @@ struct fib6_rule {  struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,  				   int flags, pol_lookup_t lookup)  { +	struct rt6_info *rt;  	struct fib_lookup_arg arg = {  		.lookup_ptr = lookup,  		.flags = FIB_LOOKUP_NOREF, @@ -40,11 +41,21 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,  	fib_rules_lookup(net->ipv6.fib6_rules_ops,  			 flowi6_to_flowi(fl6), flags, &arg); -	if (arg.result) -		return arg.result; +	rt = arg.result; -	dst_hold(&net->ipv6.ip6_null_entry->dst); -	return &net->ipv6.ip6_null_entry->dst; +	if (!rt) { +		dst_hold(&net->ipv6.ip6_null_entry->dst); +		return &net->ipv6.ip6_null_entry->dst; +	} + +	if (rt->rt6i_flags & RTF_REJECT && +	    rt->dst.error == -EAGAIN) { +		ip6_rt_put(rt); +		rt = net->ipv6.ip6_null_entry; +		dst_hold(&rt->dst); +	} + +	return 
&rt->dst;  }  static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 418d9823692b..6cedc62b2abb 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -155,6 +155,11 @@ static void node_free(struct fib6_node *fn)  	kmem_cache_free(fib6_node_kmem, fn);  } +static void rt6_rcu_free(struct rt6_info *rt) +{ +	call_rcu(&rt->dst.rcu_head, dst_rcu_free); +} +  static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)  {  	int cpu; @@ -169,7 +174,7 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt)  		ppcpu_rt = per_cpu_ptr(non_pcpu_rt->rt6i_pcpu, cpu);  		pcpu_rt = *ppcpu_rt;  		if (pcpu_rt) { -			dst_free(&pcpu_rt->dst); +			rt6_rcu_free(pcpu_rt);  			*ppcpu_rt = NULL;  		}  	} @@ -181,7 +186,7 @@ static void rt6_release(struct rt6_info *rt)  {  	if (atomic_dec_and_test(&rt->rt6i_ref)) {  		rt6_free_pcpu(rt); -		dst_free(&rt->dst); +		rt6_rcu_free(rt);  	}  } @@ -280,7 +285,17 @@ struct fib6_table *fib6_get_table(struct net *net, u32 id)  struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,  				   int flags, pol_lookup_t lookup)  { -	return (struct dst_entry *) lookup(net, net->ipv6.fib6_main_tbl, fl6, flags); +	struct rt6_info *rt; + +	rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, flags); +	if (rt->rt6i_flags & RTF_REJECT && +	    rt->dst.error == -EAGAIN) { +		ip6_rt_put(rt); +		rt = net->ipv6.ip6_null_entry; +		dst_hold(&rt->dst); +	} + +	return &rt->dst;  }  static void __net_init fib6_tables_init(struct net *net) @@ -846,7 +861,7 @@ add:  		*ins = rt;  		rt->rt6i_node = fn;  		atomic_inc(&rt->rt6i_ref); -		inet6_rt_notify(RTM_NEWROUTE, rt, info); +		inet6_rt_notify(RTM_NEWROUTE, rt, info, 0);  		info->nl_net->ipv6.rt6_stats->fib_rt_entries++;  		if (!(fn->fn_flags & RTN_RTINFO)) { @@ -872,7 +887,7 @@ add:  		rt->rt6i_node = fn;  		rt->dst.rt6_next = iter->dst.rt6_next;  		atomic_inc(&rt->rt6i_ref); -		inet6_rt_notify(RTM_NEWROUTE, rt, info); +		inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);  		if (!(fn->fn_flags & RTN_RTINFO)) {  			info->nl_net->ipv6.rt6_stats->fib_route_nodes++;  			fn->fn_flags |= RTN_RTINFO; @@ -933,6 +948,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,  	int replace_required = 0;  	int sernum = fib6_new_sernum(info->nl_net); +	if (WARN_ON_ONCE((rt->dst.flags & DST_NOCACHE) && +			 !atomic_read(&rt->dst.__refcnt))) +		return -EINVAL; +  	if (info->nlh) {  		if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))  			allow_create = 0; @@ -1025,6 +1044,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt,  		fib6_start_gc(info->nl_net, rt);  		if (!(rt->rt6i_flags & RTF_CACHE))  			fib6_prune_clones(info->nl_net, pn); +		rt->dst.flags &= ~DST_NOCACHE;  	}  out: @@ -1049,7 +1069,8 @@ out:  			atomic_inc(&pn->leaf->rt6i_ref);  		}  #endif -		dst_free(&rt->dst); +		if (!(rt->dst.flags & DST_NOCACHE)) +			dst_free(&rt->dst);  	}  	return err; @@ -1060,7 +1081,8 @@ out:  st_failure:  	if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)))  		fib6_repair_tree(info->nl_net, fn); -	dst_free(&rt->dst); +	if (!(rt->dst.flags & DST_NOCACHE)) +		dst_free(&rt->dst);  	return err;  #endif  } @@ -1410,7 +1432,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,  	fib6_purge_rt(rt, fn, net); -	inet6_rt_notify(RTM_DELROUTE, rt, info); +	inet6_rt_notify(RTM_DELROUTE, rt, info, 0);  	rt6_release(rt);  } diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 4038c694ec03..3c7b9310b33f 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c 
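[Editor's note -- annotation, not part of the patch series. The
net/ipv6/fib6_rules.c and net/ipv6/ip6_fib.c hunks above make both
fib6_rule_lookup() paths treat a matched RTF_REJECT route whose
dst.error is -EAGAIN (i.e. an RTN_THROW route, per the net/ipv6/route.c
hunk further down) as "no result" and fall back to the null entry,
instead of handing the reject route to the caller. A minimal sketch of
the shared pattern, with a hypothetical helper name:

	static struct dst_entry *rt6_throw_to_null(struct net *net,
						   struct rt6_info *rt)
	{
		if (rt->rt6i_flags & RTF_REJECT && rt->dst.error == -EAGAIN) {
			ip6_rt_put(rt);		/* drop ref on the table route */
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);	/* caller expects a held dst */
		}
		return &rt->dst;
	}

The ip6_gre.c hunks that follow belong to the same series as the
net/ipv6/ip6_tunnel.c changes below: tunnel error logging is lowered
from warn to debug, and the single cached dst is replaced by the new
per-cpu ip6_tnl_dst cache.]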
@@ -404,13 +404,13 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,  		struct ipv6_tlv_tnl_enc_lim *tel;  		__u32 mtu;  	case ICMPV6_DEST_UNREACH: -		net_warn_ratelimited("%s: Path to destination invalid or inactive!\n", -				     t->parms.name); +		net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", +				    t->parms.name);  		break;  	case ICMPV6_TIME_EXCEED:  		if (code == ICMPV6_EXC_HOPLIMIT) { -			net_warn_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", -					     t->parms.name); +			net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", +					    t->parms.name);  		}  		break;  	case ICMPV6_PARAMPROB: @@ -421,12 +421,12 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,  		if (teli && teli == be32_to_cpu(info) - 2) {  			tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli];  			if (tel->encap_limit == 0) { -				net_warn_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", -						     t->parms.name); +				net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", +						    t->parms.name);  			}  		} else { -			net_warn_ratelimited("%s: Recipient unable to parse tunneled packet!\n", -					     t->parms.name); +			net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", +					    t->parms.name);  		}  		break;  	case ICMPV6_PKT_TOOBIG: @@ -634,20 +634,20 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,  	}  	if (!fl6->flowi6_mark) -		dst = ip6_tnl_dst_check(tunnel); +		dst = ip6_tnl_dst_get(tunnel);  	if (!dst) { -		ndst = ip6_route_output(net, NULL, fl6); +		dst = ip6_route_output(net, NULL, fl6); -		if (ndst->error) +		if (dst->error)  			goto tx_err_link_failure; -		ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0); -		if (IS_ERR(ndst)) { -			err = PTR_ERR(ndst); -			ndst = NULL; +		dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); +		if (IS_ERR(dst)) { +			err = PTR_ERR(dst); +			dst = NULL;  			goto tx_err_link_failure;  		} -		dst = ndst; +		ndst = dst;  	}  	tdev = dst->dev; @@ -702,12 +702,9 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,  		skb = new_skb;  	} -	if (fl6->flowi6_mark) { -		skb_dst_set(skb, dst); -		ndst = NULL; -	} else { -		skb_dst_set_noref(skb, dst); -	} +	if (!fl6->flowi6_mark && ndst) +		ip6_tnl_dst_set(tunnel, ndst); +	skb_dst_set(skb, dst);  	proto = NEXTHDR_GRE;  	if (encap_limit >= 0) { @@ -762,14 +759,12 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,  	skb_set_inner_protocol(skb, protocol);  	ip6tunnel_xmit(NULL, skb, dev); -	if (ndst) -		ip6_tnl_dst_store(tunnel, ndst);  	return 0;  tx_err_link_failure:  	stats->tx_carrier_errors++;  	dst_link_failure(skb);  tx_err_dst_release: -	dst_release(ndst); +	dst_release(dst);  	return err;  } @@ -1223,6 +1218,9 @@ static const struct net_device_ops ip6gre_netdev_ops = {  static void ip6gre_dev_free(struct net_device *dev)  { +	struct ip6_tnl *t = netdev_priv(dev); + +	ip6_tnl_dst_destroy(t);  	free_percpu(dev->tstats);  	free_netdev(dev);  } @@ -1245,9 +1243,10 @@ static void ip6gre_tunnel_setup(struct net_device *dev)  	netif_keep_dst(dev);  } -static int ip6gre_tunnel_init(struct net_device *dev) +static int ip6gre_tunnel_init_common(struct net_device *dev)  {  	struct ip6_tnl *tunnel; +	int ret;  	tunnel = netdev_priv(dev); @@ -1255,16 +1254,37 @@ static int ip6gre_tunnel_init(struct net_device *dev)  	tunnel->net = dev_net(dev);  	strcpy(tunnel->parms.name, dev->name); +	dev->tstats = 
netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); +	if (!dev->tstats) +		return -ENOMEM; + +	ret = ip6_tnl_dst_init(tunnel); +	if (ret) { +		free_percpu(dev->tstats); +		dev->tstats = NULL; +		return ret; +	} + +	return 0; +} + +static int ip6gre_tunnel_init(struct net_device *dev) +{ +	struct ip6_tnl *tunnel; +	int ret; + +	ret = ip6gre_tunnel_init_common(dev); +	if (ret) +		return ret; + +	tunnel = netdev_priv(dev); +  	memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr));  	memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr));  	if (ipv6_addr_any(&tunnel->parms.raddr))  		dev->header_ops = &ip6gre_header_ops; -	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); -	if (!dev->tstats) -		return -ENOMEM; -  	return 0;  } @@ -1460,19 +1480,16 @@ static void ip6gre_netlink_parms(struct nlattr *data[],  static int ip6gre_tap_init(struct net_device *dev)  {  	struct ip6_tnl *tunnel; +	int ret; -	tunnel = netdev_priv(dev); +	ret = ip6gre_tunnel_init_common(dev); +	if (ret) +		return ret; -	tunnel->dev = dev; -	tunnel->net = dev_net(dev); -	strcpy(tunnel->parms.name, dev->name); +	tunnel = netdev_priv(dev);  	ip6gre_tnl_link_config(tunnel, 1); -	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); -	if (!dev->tstats) -		return -ENOMEM; -  	return 0;  } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 26ea47930740..f84ec4e9b2de 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -376,6 +376,9 @@ int ip6_forward(struct sk_buff *skb)  	if (skb->pkt_type != PACKET_HOST)  		goto drop; +	if (unlikely(skb->sk)) +		goto drop; +  	if (skb_warn_if_lro(skb))  		goto drop; @@ -581,25 +584,29 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb,  		if (np->frag_size)  			mtu = np->frag_size;  	} +	if (mtu < hlen + sizeof(struct frag_hdr) + 8) +		goto fail_toobig;  	mtu -= hlen + sizeof(struct frag_hdr);  	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,  				    &ipv6_hdr(skb)->saddr); +	hroom = LL_RESERVED_SPACE(rt->dst.dev);  	if (skb_has_frag_list(skb)) {  		int first_len = skb_pagelen(skb);  		struct sk_buff *frag2;  		if (first_len - hlen > mtu ||  		    ((first_len - hlen) & 7) || -		    skb_cloned(skb)) +		    skb_cloned(skb) || +		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))  			goto slow_path;  		skb_walk_frags(skb, frag) {  			/* Correct geometry. */  			if (frag->len > mtu ||  			    ((frag->len & 7) && frag->next) || -			    skb_headroom(frag) < hlen) +			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))  				goto slow_path_clean;  			/* Partially cloned skb? 
*/ @@ -616,8 +623,6 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb,  		err = 0;  		offset = 0; -		frag = skb_shinfo(skb)->frag_list; -		skb_frag_list_init(skb);  		/* BUILD HEADER */  		*prevhdr = NEXTHDR_FRAGMENT; @@ -625,8 +630,11 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb,  		if (!tmp_hdr) {  			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),  				      IPSTATS_MIB_FRAGFAILS); -			return -ENOMEM; +			err = -ENOMEM; +			goto fail;  		} +		frag = skb_shinfo(skb)->frag_list; +		skb_frag_list_init(skb);  		__skb_pull(skb, hlen);  		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr)); @@ -723,7 +731,6 @@ slow_path:  	 */  	*prevhdr = NEXTHDR_FRAGMENT; -	hroom = LL_RESERVED_SPACE(rt->dst.dev);  	troom = rt->dst.dev->needed_tailroom;  	/* @@ -872,7 +879,8 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk,  #ifdef CONFIG_IPV6_SUBTREES  	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||  #endif -	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) { +	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) && +	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {  		dst_release(dst);  		dst = NULL;  	} diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index b0ab420612bc..eabffbb89795 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -126,36 +126,92 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev)   * Locking : hash tables are protected by RCU and RTNL   */ -struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t) +static void ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst, +				    struct dst_entry *dst)  { -	struct dst_entry *dst = t->dst_cache; +	write_seqlock_bh(&idst->lock); +	dst_release(rcu_dereference_protected( +			    idst->dst, +			    lockdep_is_held(&idst->lock.lock))); +	if (dst) { +		dst_hold(dst); +		idst->cookie = rt6_get_cookie((struct rt6_info *)dst); +	} else { +		idst->cookie = 0; +	} +	rcu_assign_pointer(idst->dst, dst); +	write_sequnlock_bh(&idst->lock); +} + +struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t) +{ +	struct ip6_tnl_dst *idst; +	struct dst_entry *dst; +	unsigned int seq; +	u32 cookie; + +	idst = raw_cpu_ptr(t->dst_cache); + +	rcu_read_lock(); +	do { +		seq = read_seqbegin(&idst->lock); +		dst = rcu_dereference(idst->dst); +		cookie = idst->cookie; +	} while (read_seqretry(&idst->lock, seq)); + +	if (dst && !atomic_inc_not_zero(&dst->__refcnt)) +		dst = NULL; +	rcu_read_unlock(); -	if (dst && dst->obsolete && -	    !dst->ops->check(dst, t->dst_cookie)) { -		t->dst_cache = NULL; +	if (dst && dst->obsolete && !dst->ops->check(dst, cookie)) { +		ip6_tnl_per_cpu_dst_set(idst, NULL);  		dst_release(dst); -		return NULL; +		dst = NULL;  	} -  	return dst;  } -EXPORT_SYMBOL_GPL(ip6_tnl_dst_check); +EXPORT_SYMBOL_GPL(ip6_tnl_dst_get);  void ip6_tnl_dst_reset(struct ip6_tnl *t)  { -	dst_release(t->dst_cache); -	t->dst_cache = NULL; +	int i; + +	for_each_possible_cpu(i) +		ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), NULL);  }  EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset); -void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst) +void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst) +{ +	ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), dst); + +} +EXPORT_SYMBOL_GPL(ip6_tnl_dst_set); + +void ip6_tnl_dst_destroy(struct ip6_tnl *t) +{ +	if (!t->dst_cache) +		return; + +	ip6_tnl_dst_reset(t); +	free_percpu(t->dst_cache); +} +EXPORT_SYMBOL_GPL(ip6_tnl_dst_destroy); + +int ip6_tnl_dst_init(struct ip6_tnl *t)  { -	struct rt6_info 
*rt = (struct rt6_info *) dst; -	t->dst_cookie = rt6_get_cookie(rt); -	dst_release(t->dst_cache); -	t->dst_cache = dst; +	int i; + +	t->dst_cache = alloc_percpu(struct ip6_tnl_dst); +	if (!t->dst_cache) +		return -ENOMEM; + +	for_each_possible_cpu(i) +		seqlock_init(&per_cpu_ptr(t->dst_cache, i)->lock); + +	return 0;  } -EXPORT_SYMBOL_GPL(ip6_tnl_dst_store); +EXPORT_SYMBOL_GPL(ip6_tnl_dst_init);  /**   * ip6_tnl_lookup - fetch tunnel matching the end-point addresses @@ -271,6 +327,9 @@ ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)  static void ip6_dev_free(struct net_device *dev)  { +	struct ip6_tnl *t = netdev_priv(dev); + +	ip6_tnl_dst_destroy(t);  	free_percpu(dev->tstats);  	free_netdev(dev);  } @@ -510,14 +569,14 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,  		struct ipv6_tlv_tnl_enc_lim *tel;  		__u32 mtu;  	case ICMPV6_DEST_UNREACH: -		net_warn_ratelimited("%s: Path to destination invalid or inactive!\n", -				     t->parms.name); +		net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", +				    t->parms.name);  		rel_msg = 1;  		break;  	case ICMPV6_TIME_EXCEED:  		if ((*code) == ICMPV6_EXC_HOPLIMIT) { -			net_warn_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", -					     t->parms.name); +			net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", +					    t->parms.name);  			rel_msg = 1;  		}  		break; @@ -529,13 +588,13 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,  		if (teli && teli == *info - 2) {  			tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli];  			if (tel->encap_limit == 0) { -				net_warn_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", -						     t->parms.name); +				net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", +						    t->parms.name);  				rel_msg = 1;  			}  		} else { -			net_warn_ratelimited("%s: Recipient unable to parse tunneled packet!\n", -					     t->parms.name); +			net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", +					    t->parms.name);  		}  		break;  	case ICMPV6_PKT_TOOBIG: @@ -1010,23 +1069,23 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,  		memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr));  		neigh_release(neigh);  	} else if (!fl6->flowi6_mark) -		dst = ip6_tnl_dst_check(t); +		dst = ip6_tnl_dst_get(t);  	if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr))  		goto tx_err_link_failure;  	if (!dst) { -		ndst = ip6_route_output(net, NULL, fl6); +		dst = ip6_route_output(net, NULL, fl6); -		if (ndst->error) +		if (dst->error)  			goto tx_err_link_failure; -		ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0); -		if (IS_ERR(ndst)) { -			err = PTR_ERR(ndst); -			ndst = NULL; +		dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); +		if (IS_ERR(dst)) { +			err = PTR_ERR(dst); +			dst = NULL;  			goto tx_err_link_failure;  		} -		dst = ndst; +		ndst = dst;  	}  	tdev = dst->dev; @@ -1072,12 +1131,11 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,  		consume_skb(skb);  		skb = new_skb;  	} -	if (fl6->flowi6_mark) { -		skb_dst_set(skb, dst); -		ndst = NULL; -	} else { -		skb_dst_set_noref(skb, dst); -	} + +	if (!fl6->flowi6_mark && ndst) +		ip6_tnl_dst_set(t, ndst); +	skb_dst_set(skb, dst); +  	skb->transport_header = skb->network_header;  	proto = fl6->flowi6_proto; @@ -1101,14 +1159,12 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,  	ipv6h->saddr = fl6->saddr;  	ipv6h->daddr = fl6->daddr;  	
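	/* [Editor's note -- annotation, not part of the patch: at this
	 * point every path holds exactly one reference on 'dst', either
	 * taken under the seqlock in ip6_tnl_dst_get() or obtained from
	 * the fresh route lookup.  ip6_tnl_dst_set() above took its own
	 * reference when caching 'ndst', and skb_dst_set() has handed
	 * ours to the skb, which is why the old ip6_tnl_dst_store() call
	 * after ip6tunnel_xmit() is deleted just below and the error path
	 * now releases 'dst' instead of 'ndst'.]
	 */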
ip6tunnel_xmit(NULL, skb, dev); -	if (ndst) -		ip6_tnl_dst_store(t, ndst);  	return 0;  tx_err_link_failure:  	stats->tx_carrier_errors++;  	dst_link_failure(skb);  tx_err_dst_release: -	dst_release(ndst); +	dst_release(dst);  	return err;  } @@ -1573,12 +1629,21 @@ static inline int  ip6_tnl_dev_init_gen(struct net_device *dev)  {  	struct ip6_tnl *t = netdev_priv(dev); +	int ret;  	t->dev = dev;  	t->net = dev_net(dev);  	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);  	if (!dev->tstats)  		return -ENOMEM; + +	ret = ip6_tnl_dst_init(t); +	if (ret) { +		free_percpu(dev->tstats); +		dev->tstats = NULL; +		return ret; +	} +  	return 0;  } diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 96833e4b3193..f6a024e141e5 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -58,6 +58,7 @@ endif # NF_TABLES  config NF_DUP_IPV6  	tristate "Netfilter IPv6 packet duplication to alternate destination" +	depends on !NF_CONNTRACK || NF_CONNTRACK  	help  	  This option enables the nf_dup_ipv6 core, which duplicates an IPv6  	  packet to be rerouted to another destination. diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 701cd2bae0a9..c7196ad1d69f 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -646,6 +646,7 @@ void nf_ct_frag6_consume_orig(struct sk_buff *skb)  		s = s2;  	}  } +EXPORT_SYMBOL_GPL(nf_ct_frag6_consume_orig);  static int nf_ct_net_init(struct net *net)  { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 53617d715188..946880ad48ac 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -142,6 +142,9 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)  	struct net_device *loopback_dev = net->loopback_dev;  	int cpu; +	if (dev == loopback_dev) +		return; +  	for_each_possible_cpu(cpu) {  		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);  		struct rt6_info *rt; @@ -151,14 +154,12 @@ static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)  			struct inet6_dev *rt_idev = rt->rt6i_idev;  			struct net_device *rt_dev = rt->dst.dev; -			if (rt_idev && (rt_idev->dev == dev || !dev) && -			    rt_idev->dev != loopback_dev) { +			if (rt_idev->dev == dev) {  				rt->rt6i_idev = in6_dev_get(loopback_dev);  				in6_dev_put(rt_idev);  			} -			if (rt_dev && (rt_dev == dev || !dev) && -			    rt_dev != loopback_dev) { +			if (rt_dev == dev) {  				rt->dst.dev = loopback_dev;  				dev_hold(rt->dst.dev);  				dev_put(rt_dev); @@ -247,12 +248,6 @@ static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,  {  } -static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst, -					 unsigned long old) -{ -	return NULL; -} -  static struct dst_ops ip6_dst_blackhole_ops = {  	.family			=	AF_INET6,  	.destroy		=	ip6_dst_destroy, @@ -261,7 +256,7 @@ static struct dst_ops ip6_dst_blackhole_ops = {  	.default_advmss		=	ip6_default_advmss,  	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,  	.redirect		=	ip6_rt_blackhole_redirect, -	.cow_metrics		=	ip6_rt_blackhole_cow_metrics, +	.cow_metrics		=	dst_cow_metrics_generic,  	.neigh_lookup		=	ip6_neigh_lookup,  }; @@ -318,6 +313,15 @@ static const struct rt6_info ip6_blk_hole_entry_template = {  #endif +static void rt6_info_init(struct rt6_info *rt) +{ +	struct dst_entry *dst = &rt->dst; + +	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); +	INIT_LIST_HEAD(&rt->rt6i_siblings); +	
INIT_LIST_HEAD(&rt->rt6i_uncached); +} +  /* allocate dst with ip6_dst_ops */  static struct rt6_info *__ip6_dst_alloc(struct net *net,  					struct net_device *dev, @@ -326,13 +330,9 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net,  	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,  					0, DST_OBSOLETE_FORCE_CHK, flags); -	if (rt) { -		struct dst_entry *dst = &rt->dst; +	if (rt) +		rt6_info_init(rt); -		memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst)); -		INIT_LIST_HEAD(&rt->rt6i_siblings); -		INIT_LIST_HEAD(&rt->rt6i_uncached); -	}  	return rt;  } @@ -1068,6 +1068,9 @@ static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,  	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);  	saved_fn = fn; +	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) +		oif = 0; +  redo_rt6_select:  	rt = rt6_select(fn, oif, strict);  	if (rt->rt6i_nsiblings) @@ -1190,13 +1193,16 @@ struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,  				    struct flowi6 *fl6)  {  	int flags = 0; +	bool any_src;  	fl6->flowi6_iif = LOOPBACK_IFINDEX; -	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr)) +	any_src = ipv6_addr_any(&fl6->saddr); +	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) || +	    (fl6->flowi6_oif && any_src))  		flags |= RT6_LOOKUP_F_IFACE; -	if (!ipv6_addr_any(&fl6->saddr)) +	if (!any_src)  		flags |= RT6_LOOKUP_F_HAS_SADDR;  	else if (sk)  		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs); @@ -1212,24 +1218,20 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori  	rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);  	if (rt) { -		new = &rt->dst; - -		memset(new + 1, 0, sizeof(*rt) - sizeof(*new)); +		rt6_info_init(rt); +		new = &rt->dst;  		new->__use = 1;  		new->input = dst_discard;  		new->output = dst_discard_sk; -		if (dst_metrics_read_only(&ort->dst)) -			new->_metrics = ort->dst._metrics; -		else -			dst_copy_metrics(new, &ort->dst); +		dst_copy_metrics(new, &ort->dst);  		rt->rt6i_idev = ort->rt6i_idev;  		if (rt->rt6i_idev)  			in6_dev_hold(rt->rt6i_idev);  		rt->rt6i_gateway = ort->rt6i_gateway; -		rt->rt6i_flags = ort->rt6i_flags; +		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;  		rt->rt6i_metric = 0;  		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); @@ -1322,8 +1324,7 @@ static void ip6_link_failure(struct sk_buff *skb)  	if (rt) {  		if (rt->rt6i_flags & RTF_CACHE) {  			dst_hold(&rt->dst); -			if (ip6_del_rt(rt)) -				dst_free(&rt->dst); +			ip6_del_rt(rt);  		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {  			rt->rt6i_node->fn_sernum = -1;  		} @@ -1886,9 +1887,11 @@ int ip6_route_info_create(struct fib6_config *cfg, struct rt6_info **rt_ret)  			rt->dst.input = ip6_pkt_prohibit;  			break;  		case RTN_THROW: +		case RTN_UNREACHABLE:  		default:  			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN -					: -ENETUNREACH; +					: (cfg->fc_type == RTN_UNREACHABLE) +					? 
-EHOSTUNREACH : -ENETUNREACH;  			rt->dst.output = ip6_pkt_discard_out;  			rt->dst.input = ip6_pkt_discard;  			break; @@ -2028,7 +2031,8 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)  	struct fib6_table *table;  	struct net *net = dev_net(rt->dst.dev); -	if (rt == net->ipv6.ip6_null_entry) { +	if (rt == net->ipv6.ip6_null_entry || +	    rt->dst.flags & DST_NOCACHE) {  		err = -ENOENT;  		goto out;  	} @@ -2515,6 +2519,7 @@ struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,  	rt->rt6i_dst.addr = *addr;  	rt->rt6i_dst.plen = 128;  	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL); +	rt->dst.flags |= DST_NOCACHE;  	atomic_set(&rt->dst.__refcnt, 1); @@ -2618,7 +2623,8 @@ void rt6_ifdown(struct net *net, struct net_device *dev)  	fib6_clean_all(net, fib6_ifdown, &adn);  	icmp6_clean_all(fib6_ifdown, &adn); -	rt6_uncached_list_flush_dev(net, dev); +	if (dev) +		rt6_uncached_list_flush_dev(net, dev);  }  struct rt6_mtu_change_arg { @@ -3303,7 +3309,8 @@ errout:  	return err;  } -void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) +void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info, +		     unsigned int nlm_flags)  {  	struct sk_buff *skb;  	struct net *net = info->nl_net; @@ -3318,7 +3325,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)  		goto errout;  	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0, -				event, info->portid, seq, 0, 0, 0); +				event, info->portid, seq, 0, 0, nlm_flags);  	if (err < 0) {  		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */  		WARN_ON(err == -EMSGSIZE); diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 09c76a7b474d..e15feb7b413d 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -79,6 +79,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb)  	if (!skb->ignore_df && skb->len > mtu) {  		skb->dev = dst->dev; +		skb->protocol = htons(ETH_P_IPV6);  		if (xfrm6_local_dontfrag(skb))  			xfrm6_local_rxpmtu(skb, mtu); @@ -136,6 +137,7 @@ static int __xfrm6_output(struct sock *sk, struct sk_buff *skb)  	struct dst_entry *dst = skb_dst(skb);  	struct xfrm_state *x = dst->xfrm;  	int mtu; +	bool toobig;  #ifdef CONFIG_NETFILTER  	if (!x) { @@ -144,25 +146,29 @@ static int __xfrm6_output(struct sock *sk, struct sk_buff *skb)  	}  #endif +	if (x->props.mode != XFRM_MODE_TUNNEL) +		goto skip_frag; +  	if (skb->protocol == htons(ETH_P_IPV6))  		mtu = ip6_skb_dst_mtu(skb);  	else  		mtu = dst_mtu(skb_dst(skb)); -	if (skb->len > mtu && xfrm6_local_dontfrag(skb)) { +	toobig = skb->len > mtu && !skb_is_gso(skb); + +	if (toobig && xfrm6_local_dontfrag(skb)) {  		xfrm6_local_rxpmtu(skb, mtu);  		return -EMSGSIZE; -	} else if (!skb->ignore_df && skb->len > mtu && skb->sk) { +	} else if (!skb->ignore_df && toobig && skb->sk) {  		xfrm_local_error(skb, mtu);  		return -EMSGSIZE;  	} -	if (x->props.mode == XFRM_MODE_TUNNEL && -	    ((skb->len > mtu && !skb_is_gso(skb)) || -		dst_allfrag(skb_dst(skb)))) { +	if (toobig || dst_allfrag(skb_dst(skb)))  		return ip6_fragment(sk, skb,  				    x->outer_mode->afinfo->output_finish); -	} + +skip_frag:  	return x->outer_mode->afinfo->output_finish(sk, skb);  } diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 30caa289c5db..da55e0c85bb8 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -37,6 +37,7 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif,  	memset(&fl6, 0, sizeof(fl6));  	fl6.flowi6_oif = oif; +	fl6.flowi6_flags 
= FLOWI_FLAG_SKIP_NH_OIF;  	memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr));  	if (saddr)  		memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr)); @@ -178,7 +179,8 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)  			return;  		case IPPROTO_ICMPV6: -			if (!onlyproto && pskb_may_pull(skb, nh + offset + 2 - skb->data)) { +			if (!onlyproto && (nh + offset + 2 < skb->data || +			    pskb_may_pull(skb, nh + offset + 2 - skb->data))) {  				u8 *icmp;  				nh = skb_network_header(skb); @@ -192,7 +194,8 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)  #if IS_ENABLED(CONFIG_IPV6_MIP6)  		case IPPROTO_MH:  			offset += ipv6_optlen(exthdr); -			if (!onlyproto && pskb_may_pull(skb, nh + offset + 3 - skb->data)) { +			if (!onlyproto && (nh + offset + 3 < skb->data || +			    pskb_may_pull(skb, nh + offset + 3 - skb->data))) {  				struct ip6_mh *mh;  				nh = skb_network_header(skb); diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c index a26c401ef4a4..43964594aa12 100644 --- a/net/irda/irlmp.c +++ b/net/irda/irlmp.c @@ -1839,7 +1839,7 @@ static void *irlmp_seq_hb_idx(struct irlmp_iter_state *iter, loff_t *off)  	for (element = hashbin_get_first(iter->hashbin);  	     element != NULL;  	     element = hashbin_get_next(iter->hashbin)) { -		if (!off || *off-- == 0) { +		if (!off || (*off)-- == 0) {  			/* NB: hashbin left locked */  			return element;  		} diff --git a/net/key/af_key.c b/net/key/af_key.c index 83a70688784b..f9c9ecb0cdd3 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -261,7 +261,7 @@ static int pfkey_broadcast(struct sk_buff *skb,  		err2 = pfkey_broadcast_one(skb, &skb2, GFP_ATOMIC, sk); -		/* Error is cleare after succecful sending to at least one +		/* Error is cleared after successful sending to at least one  		 * registered KM */  		if ((broadcast_flags & BROADCAST_REGISTERED) && err)  			err = err2; diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index f6b090df3930..afca2eb4dfa7 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1319,7 +1319,7 @@ static void l2tp_tunnel_del_work(struct work_struct *work)  	tunnel = container_of(work, struct l2tp_tunnel, del_work);  	sk = l2tp_tunnel_sock_lookup(tunnel);  	if (!sk) -		return; +		goto out;  	sock = sk->sk_socket; @@ -1341,6 +1341,8 @@ static void l2tp_tunnel_del_work(struct work_struct *work)  	}  	l2tp_tunnel_sock_put(sk); +out: +	l2tp_tunnel_dec_refcount(tunnel);  }  /* Create a socket for the tunnel, if one isn't set up by @@ -1636,8 +1638,13 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create);   */  int l2tp_tunnel_delete(struct l2tp_tunnel *tunnel)  { +	l2tp_tunnel_inc_refcount(tunnel);  	l2tp_tunnel_closeall(tunnel); -	return (false == queue_work(l2tp_wq, &tunnel->del_work)); +	if (false == queue_work(l2tp_wq, &tunnel->del_work)) { +		l2tp_tunnel_dec_refcount(tunnel); +		return 1; +	} +	return 0;  }  EXPORT_SYMBOL_GPL(l2tp_tunnel_delete); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 17b1fe961c5d..7a77a1470f25 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2474,6 +2474,7 @@ static int ieee80211_set_cqm_rssi_config(struct wiphy *wiphy,  	bss_conf->cqm_rssi_thold = rssi_thold;  	bss_conf->cqm_rssi_hyst = rssi_hyst; +	sdata->u.mgd.last_cqm_event_signal = 0;  	/* tell the driver upon association, unless already associated */  	if (sdata->u.mgd.associated && @@ -2518,15 +2519,17 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy,  			continue;  		for (j = 0; j < IEEE80211_HT_MCS_MASK_LEN; j++) { -			if (~sdata->rc_rateidx_mcs_mask[i][j]) +		
	if (~sdata->rc_rateidx_mcs_mask[i][j]) {  				sdata->rc_has_mcs_mask[i] = true; +				break; +			} +		} -			if (~sdata->rc_rateidx_vht_mcs_mask[i][j]) +		for (j = 0; j < NL80211_VHT_NSS_MAX; j++) { +			if (~sdata->rc_rateidx_vht_mcs_mask[i][j]) {  				sdata->rc_has_vht_mcs_mask[i] = true; - -			if (sdata->rc_has_mcs_mask[i] && -			    sdata->rc_has_vht_mcs_mask[i])  				break; +			}  		}  	} diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index ced6bf3be8d6..1560c8482bcb 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -149,7 +149,7 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf,  	for (i = 0; i < NUM_IEEE80211_HW_FLAGS; i++) {  		if (test_bit(i, local->hw.flags)) -			pos += scnprintf(pos, end - pos, "%s", +			pos += scnprintf(pos, end - pos, "%s\n",  					 hw_flag_names[i]);  	} diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 8ba583243509..3ed7ddfbf8e8 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -101,6 +101,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,  	 * when it wakes up for the next time.  	 */  	set_sta_flag(sta, WLAN_STA_CLEAR_PS_FILT); +	ieee80211_clear_fast_xmit(sta);  	/*  	 * This code races in the following way: diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 84e0e8c7fb23..7892eb8ed4c8 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1218,8 +1218,10 @@ ieee80211_tx_prepare(struct ieee80211_sub_if_data *sdata,  	if (!tx->sta)  		info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT; -	else if (test_and_clear_sta_flag(tx->sta, WLAN_STA_CLEAR_PS_FILT)) +	else if (test_and_clear_sta_flag(tx->sta, WLAN_STA_CLEAR_PS_FILT)) {  		info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT; +		ieee80211_check_fast_xmit(tx->sta); +	}  	info->flags |= IEEE80211_TX_CTL_FIRST_FRAGMENT; @@ -2451,7 +2453,8 @@ void ieee80211_check_fast_xmit(struct sta_info *sta)  	if (test_sta_flag(sta, WLAN_STA_PS_STA) ||  	    test_sta_flag(sta, WLAN_STA_PS_DRIVER) || -	    test_sta_flag(sta, WLAN_STA_PS_DELIVER)) +	    test_sta_flag(sta, WLAN_STA_PS_DELIVER) || +	    test_sta_flag(sta, WLAN_STA_CLEAR_PS_FILT))  		goto out;  	if (sdata->noack_map) diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 8e47f8113495..21a085686dc1 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -152,6 +152,8 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg)  #endif  	synchronize_net();  	nf_queue_nf_hook_drop(net, &entry->ops); +	/* other cpu might still process nfqueue verdict that used reg */ +	synchronize_net();  	kfree(entry);  }  EXPORT_SYMBOL(nf_unregister_net_hook); diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index a1fe5377a2b3..5a30ce6e8c90 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -297,7 +297,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,  	      ip_set_timeout_expired(ext_timeout(n, set))))  		n =  NULL; -	e = kzalloc(set->dsize, GFP_KERNEL); +	e = kzalloc(set->dsize, GFP_ATOMIC);  	if (!e)  		return -ENOMEM;  	e->id = d->id; diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 675d12c69e32..a5d41dfa9f05 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -107,12 +107,17 @@ EXPORT_SYMBOL(nf_log_register);  void nf_log_unregister(struct nf_logger *logger)  { +	const struct nf_logger *log;  	int i;  	mutex_lock(&nf_log_mutex); -	for (i = 0; i < NFPROTO_NUMPROTO; i++) -		
RCU_INIT_POINTER(loggers[i][logger->type], NULL); +	for (i = 0; i < NFPROTO_NUMPROTO; i++) { +		log = nft_log_dereference(loggers[i][logger->type]); +		if (log == logger) +			RCU_INIT_POINTER(loggers[i][logger->type], NULL); +	}  	mutex_unlock(&nf_log_mutex); +	synchronize_rcu();  }  EXPORT_SYMBOL(nf_log_unregister); diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 66def315eb56..9c8fab00164b 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -619,6 +619,13 @@ struct nft_xt {  static struct nft_expr_type nft_match_type; +static bool nft_match_cmp(const struct xt_match *match, +			  const char *name, u32 rev, u32 family) +{ +	return strcmp(match->name, name) == 0 && match->revision == rev && +	       (match->family == NFPROTO_UNSPEC || match->family == family); +} +  static const struct nft_expr_ops *  nft_match_select_ops(const struct nft_ctx *ctx,  		     const struct nlattr * const tb[]) @@ -626,7 +633,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,  	struct nft_xt *nft_match;  	struct xt_match *match;  	char *mt_name; -	__u32 rev, family; +	u32 rev, family;  	if (tb[NFTA_MATCH_NAME] == NULL ||  	    tb[NFTA_MATCH_REV] == NULL || @@ -641,8 +648,7 @@ nft_match_select_ops(const struct nft_ctx *ctx,  	list_for_each_entry(nft_match, &nft_match_list, head) {  		struct xt_match *match = nft_match->ops.data; -		if (strcmp(match->name, mt_name) == 0 && -		    match->revision == rev && match->family == family) { +		if (nft_match_cmp(match, mt_name, rev, family)) {  			if (!try_module_get(match->me))  				return ERR_PTR(-ENOENT); @@ -693,6 +699,13 @@ static LIST_HEAD(nft_target_list);  static struct nft_expr_type nft_target_type; +static bool nft_target_cmp(const struct xt_target *tg, +			   const char *name, u32 rev, u32 family) +{ +	return strcmp(tg->name, name) == 0 && tg->revision == rev && +	       (tg->family == NFPROTO_UNSPEC || tg->family == family); +} +  static const struct nft_expr_ops *  nft_target_select_ops(const struct nft_ctx *ctx,  		      const struct nlattr * const tb[]) @@ -700,7 +713,7 @@ nft_target_select_ops(const struct nft_ctx *ctx,  	struct nft_xt *nft_target;  	struct xt_target *target;  	char *tg_name; -	__u32 rev, family; +	u32 rev, family;  	if (tb[NFTA_TARGET_NAME] == NULL ||  	    tb[NFTA_TARGET_REV] == NULL || @@ -715,8 +728,7 @@ nft_target_select_ops(const struct nft_ctx *ctx,  	list_for_each_entry(nft_target, &nft_target_list, head) {  		struct xt_target *target = nft_target->ops.data; -		if (strcmp(target->name, tg_name) == 0 && -		    target->revision == rev && target->family == family) { +		if (nft_target_cmp(target, tg_name, rev, family)) {  			if (!try_module_get(target->me))  				return ERR_PTR(-ENOENT); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 7f86d3b55060..fafe33bdb619 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -125,6 +125,24 @@ static inline u32 netlink_group_mask(u32 group)  	return group ? 
1 << (group - 1) : 0;  } +static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb, +					   gfp_t gfp_mask) +{ +	unsigned int len = skb_end_offset(skb); +	struct sk_buff *new; + +	new = alloc_skb(len, gfp_mask); +	if (new == NULL) +		return NULL; + +	NETLINK_CB(new).portid = NETLINK_CB(skb).portid; +	NETLINK_CB(new).dst_group = NETLINK_CB(skb).dst_group; +	NETLINK_CB(new).creds = NETLINK_CB(skb).creds; + +	memcpy(skb_put(new, len), skb->data, len); +	return new; +} +  int netlink_add_tap(struct netlink_tap *nt)  {  	if (unlikely(nt->dev->type != ARPHRD_NETLINK)) @@ -206,7 +224,11 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,  	int ret = -ENOMEM;  	dev_hold(dev); -	nskb = skb_clone(skb, GFP_ATOMIC); + +	if (netlink_skb_is_mmaped(skb) || is_vmalloc_addr(skb->head)) +		nskb = netlink_to_full_skb(skb, GFP_ATOMIC); +	else +		nskb = skb_clone(skb, GFP_ATOMIC);  	if (nskb) {  		nskb->dev = dev;  		nskb->protocol = htons((u16) sk->sk_protocol); @@ -279,11 +301,6 @@ static void netlink_rcv_wake(struct sock *sk)  }  #ifdef CONFIG_NETLINK_MMAP -static bool netlink_skb_is_mmaped(const struct sk_buff *skb) -{ -	return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; -} -  static bool netlink_rx_is_mmaped(struct sock *sk)  {  	return nlk_sk(sk)->rx_ring.pg_vec != NULL; @@ -846,7 +863,6 @@ static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb)  }  #else /* CONFIG_NETLINK_MMAP */ -#define netlink_skb_is_mmaped(skb)	false  #define netlink_rx_is_mmaped(sk)	false  #define netlink_tx_is_mmaped(sk)	false  #define netlink_mmap			sock_no_mmap @@ -1094,8 +1110,8 @@ static int netlink_insert(struct sock *sk, u32 portid)  	lock_sock(sk); -	err = -EBUSY; -	if (nlk_sk(sk)->portid) +	err = nlk_sk(sk)->portid == portid ? 0 : -EBUSY; +	if (nlk_sk(sk)->bound)  		goto err;  	err = -ENOMEM; @@ -1115,10 +1131,14 @@ static int netlink_insert(struct sock *sk, u32 portid)  			err = -EOVERFLOW;  		if (err == -EEXIST)  			err = -EADDRINUSE; -		nlk_sk(sk)->portid = 0;  		sock_put(sk); +		goto err;  	} +	/* We need to ensure that the socket is hashed and visible. */ +	smp_wmb(); +	nlk_sk(sk)->bound = portid; +  err:  	release_sock(sk);  	return err; @@ -1503,6 +1523,7 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr,  	struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;  	int err;  	long unsigned int groups = nladdr->nl_groups; +	bool bound;  	if (addr_len < sizeof(struct sockaddr_nl))  		return -EINVAL; @@ -1519,9 +1540,14 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr,  			return err;  	} -	if (nlk->portid) +	bound = nlk->bound; +	if (bound) { +		/* Ensure nlk->portid is up-to-date. */ +		smp_rmb(); +  		if (nladdr->nl_pid != nlk->portid)  			return -EINVAL; +	}  	if (nlk->netlink_bind && groups) {  		int group; @@ -1537,7 +1563,10 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr,  		}  	} -	if (!nlk->portid) { +	/* No need for barriers here as we return to user-space without +	 * using any of the bound attributes. +	 */ +	if (!bound) {  		err = nladdr->nl_pid ?  			netlink_insert(sk, nladdr->nl_pid) :  			netlink_autobind(sock); @@ -1585,7 +1614,10 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr,  	    !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND))  		return -EPERM; -	if (!nlk->portid) +	/* No need for barriers here as we return to user-space without +	 * using any of the bound attributes. 
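+	 * [Editor's note, annotation only: the new nlk->bound flag pairs
+	 * the smp_wmb() in netlink_insert() with the smp_rmb() at the
+	 * readers that go on to use nlk->portid (netlink_bind() and
+	 * netlink_sendmsg()); this connect path only triggers autobind,
+	 * so it may read the flag without a barrier.]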
+	 */ +	if (!nlk->bound)  		err = netlink_autobind(sock);  	if (err == 0) { @@ -2339,7 +2371,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname,  		int pos, idx, shift;  		err = 0; -		netlink_table_grab(); +		netlink_lock_table();  		for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) {  			if (len - pos < sizeof(u32))  				break; @@ -2354,7 +2386,7 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname,  		}  		if (put_user(ALIGN(nlk->ngroups / 8, sizeof(u32)), optlen))  			err = -EFAULT; -		netlink_table_ungrab(); +		netlink_unlock_table();  		break;  	}  	case NETLINK_CAP_ACK: @@ -2426,10 +2458,13 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)  		dst_group = nlk->dst_group;  	} -	if (!nlk->portid) { +	if (!nlk->bound) {  		err = netlink_autobind(sock);  		if (err)  			goto out; +	} else { +		/* Ensure nlk is hashed and visible. */ +		smp_rmb();  	}  	/* It's a really convoluted way for userland to ask for mmaped @@ -2750,6 +2785,7 @@ static int netlink_dump(struct sock *sk)  	struct sk_buff *skb = NULL;  	struct nlmsghdr *nlh;  	int len, err = -ENOBUFS; +	int alloc_min_size;  	int alloc_size;  	mutex_lock(nlk->cb_mutex); @@ -2758,9 +2794,6 @@ static int netlink_dump(struct sock *sk)  		goto errout_skb;  	} -	cb = &nlk->cb; -	alloc_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); -  	if (!netlink_rx_is_mmaped(sk) &&  	    atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)  		goto errout_skb; @@ -2770,23 +2803,35 @@ static int netlink_dump(struct sock *sk)  	 * to reduce number of system calls on dump operations, if user  	 * ever provided a big enough buffer.  	 */ -	if (alloc_size < nlk->max_recvmsg_len) { -		skb = netlink_alloc_skb(sk, -					nlk->max_recvmsg_len, -					nlk->portid, +	cb = &nlk->cb; +	alloc_min_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); + +	if (alloc_min_size < nlk->max_recvmsg_len) { +		alloc_size = nlk->max_recvmsg_len; +		skb = netlink_alloc_skb(sk, alloc_size, nlk->portid,  					GFP_KERNEL |  					__GFP_NOWARN |  					__GFP_NORETRY); -		/* available room should be exact amount to avoid MSG_TRUNC */ -		if (skb) -			skb_reserve(skb, skb_tailroom(skb) - -					 nlk->max_recvmsg_len);  	} -	if (!skb) +	if (!skb) { +		alloc_size = alloc_min_size;  		skb = netlink_alloc_skb(sk, alloc_size, nlk->portid,  					GFP_KERNEL); +	}  	if (!skb)  		goto errout_skb; + +	/* Trim skb to allocated size. User is expected to provide buffer as +	 * large as max(min_dump_alloc, 16KiB (mac_recvmsg_len capped at +	 * netlink_recvmsg())). dump will pack as many smaller messages as +	 * could fit within the allocated skb. skb is typically allocated +	 * with larger space than required (could be as much as near 2x the +	 * requested size with align to next power of 2 approach). Allowing +	 * dump to use the excess space makes it difficult for a user to have a +	 * reasonable static buffer based on the expected largest dump of a +	 * single netdev. The outcome is MSG_TRUNC error. 
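+	 * [Editor's note, annotation only: "mac_recvmsg_len" above is a
+	 * typo carried in the upstream commit for max_recvmsg_len.  The
+	 * resulting flow is: try one large allocation of
+	 * nlk->max_recvmsg_len first, fall back to alloc_min_size, then
+	 * trim the skb with skb_reserve() so cb->dump() sees exactly the
+	 * space that was asked for.]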
+	 */ +	skb_reserve(skb, skb_tailroom(skb) - alloc_size);  	netlink_skb_set_owner_r(skb, sk);  	len = cb->dump(skb, cb); diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 89008405d6b4..14437d9b1965 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -35,6 +35,7 @@ struct netlink_sock {  	unsigned long		state;  	size_t			max_recvmsg_len;  	wait_queue_head_t	wait; +	bool			bound;  	bool			cb_running;  	struct netlink_callback	cb;  	struct mutex		*cb_mutex; @@ -59,6 +60,15 @@ static inline struct netlink_sock *nlk_sk(struct sock *sk)  	return container_of(sk, struct netlink_sock, sk);  } +static inline bool netlink_skb_is_mmaped(const struct sk_buff *skb) +{ +#ifdef CONFIG_NETLINK_MMAP +	return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; +#else +	return false; +#endif /* CONFIG_NETLINK_MMAP */ +} +  struct netlink_table {  	struct rhashtable	hash;  	struct hlist_head	mc_list; diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 2a071f470d57..d143aa9f6654 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -5,7 +5,8 @@  config OPENVSWITCH  	tristate "Open vSwitch"  	depends on INET -	depends on (!NF_CONNTRACK || NF_CONNTRACK) +	depends on !NF_CONNTRACK || \ +		   (NF_CONNTRACK && (!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6))  	select LIBCRC32C  	select MPLS  	select NET_MPLS_GSO diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 315f5330b6e5..dba635d086b2 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -684,7 +684,7 @@ static void ovs_fragment(struct vport *vport, struct sk_buff *skb, u16 mru,  {  	if (skb_network_offset(skb) > MAX_L2_LEN) {  		OVS_NLERR(1, "L2 header too long to fragment"); -		return; +		goto err;  	}  	if (ethertype == htons(ETH_P_IP)) { @@ -708,8 +708,7 @@ static void ovs_fragment(struct vport *vport, struct sk_buff *skb, u16 mru,  		struct rt6_info ovs_rt;  		if (!v6ops) { -			kfree_skb(skb); -			return; +			goto err;  		}  		prepare_frag(vport, skb); @@ -728,8 +727,12 @@ static void ovs_fragment(struct vport *vport, struct sk_buff *skb, u16 mru,  		WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.",  			  ovs_vport_name(vport), ntohs(ethertype), mru,  			  vport->dev->mtu); -		kfree_skb(skb); +		goto err;  	} + +	return; +err: +	kfree_skb(skb);  }  static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, @@ -765,7 +768,6 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,  			    struct sw_flow_key *key, const struct nlattr *attr,  			    const struct nlattr *actions, int actions_len)  { -	struct ip_tunnel_info info;  	struct dp_upcall_info upcall;  	const struct nlattr *a;  	int rem; @@ -793,11 +795,9 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,  			if (vport) {  				int err; -				upcall.egress_tun_info = &info; -				err = ovs_vport_get_egress_tun_info(vport, skb, -								    &upcall); -				if (err) -					upcall.egress_tun_info = NULL; +				err = dev_fill_metadata_dst(vport->dev, skb); +				if (!err) +					upcall.egress_tun_info = skb_tunnel_info(skb);  			}  			break; @@ -968,7 +968,7 @@ static int execute_masked_set_action(struct sk_buff *skb,  	case OVS_KEY_ATTR_CT_STATE:  	case OVS_KEY_ATTR_CT_ZONE:  	case OVS_KEY_ATTR_CT_MARK: -	case OVS_KEY_ATTR_CT_LABEL: +	case OVS_KEY_ATTR_CT_LABELS:  		err = -EINVAL;  		break;  	} @@ -1099,12 +1099,18 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,  			break;  		case OVS_ACTION_ATTR_CT: +			if 
(!is_flow_key_valid(key)) { +				err = ovs_flow_key_update(skb, key); +				if (err) +					return err; +			} +  			err = ovs_ct_execute(ovs_dp_get_net(dp), skb, key,  					     nla_data(a));  			/* Hide stolen IP fragments from user space. */ -			if (err == -EINPROGRESS) -				return 0; +			if (err) +				return err == -EINPROGRESS ? 0 : err;  			break;  		} diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index e8e524ad8a01..50095820edb7 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -37,9 +37,9 @@ struct md_mark {  };  /* Metadata label for masked write to conntrack label. */ -struct md_label { -	struct ovs_key_ct_label value; -	struct ovs_key_ct_label mask; +struct md_labels { +	struct ovs_key_ct_labels value; +	struct ovs_key_ct_labels mask;  };  /* Conntrack action context for execution. */ @@ -47,10 +47,10 @@ struct ovs_conntrack_info {  	struct nf_conntrack_helper *helper;  	struct nf_conntrack_zone zone;  	struct nf_conn *ct; -	u32 flags; +	u8 commit : 1;  	u16 family;  	struct md_mark mark; -	struct md_label label; +	struct md_labels labels;  };  static u16 key_to_nfproto(const struct sw_flow_key *key) @@ -109,21 +109,21 @@ static u32 ovs_ct_get_mark(const struct nf_conn *ct)  #endif  } -static void ovs_ct_get_label(const struct nf_conn *ct, -			     struct ovs_key_ct_label *label) +static void ovs_ct_get_labels(const struct nf_conn *ct, +			      struct ovs_key_ct_labels *labels)  {  	struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL;  	if (cl) {  		size_t len = cl->words * sizeof(long); -		if (len > OVS_CT_LABEL_LEN) -			len = OVS_CT_LABEL_LEN; -		else if (len < OVS_CT_LABEL_LEN) -			memset(label, 0, OVS_CT_LABEL_LEN); -		memcpy(label, cl->bits, len); +		if (len > OVS_CT_LABELS_LEN) +			len = OVS_CT_LABELS_LEN; +		else if (len < OVS_CT_LABELS_LEN) +			memset(labels, 0, OVS_CT_LABELS_LEN); +		memcpy(labels, cl->bits, len);  	} else { -		memset(label, 0, OVS_CT_LABEL_LEN); +		memset(labels, 0, OVS_CT_LABELS_LEN);  	}  } @@ -134,7 +134,7 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,  	key->ct.state = state;  	key->ct.zone = zone->id;  	key->ct.mark = ovs_ct_get_mark(ct); -	ovs_ct_get_label(ct, &key->ct.label); +	ovs_ct_get_labels(ct, &key->ct.labels);  }  /* Update 'key' based on skb->nfct. 
If 'post_ct' is true, then OVS has @@ -151,6 +151,8 @@ static void ovs_ct_update_key(const struct sk_buff *skb,  	ct = nf_ct_get(skb, &ctinfo);  	if (ct) {  		state = ovs_ct_get_state(ctinfo); +		if (!nf_ct_is_confirmed(ct)) +			state |= OVS_CS_F_NEW;  		if (ct->master)  			state |= OVS_CS_F_RELATED;  		zone = nf_ct_zone(ct); @@ -167,7 +169,7 @@ void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)  int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb)  { -	if (nla_put_u8(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state)) +	if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state))  		return -EMSGSIZE;  	if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && @@ -179,8 +181,8 @@ int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb)  		return -EMSGSIZE;  	if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && -	    nla_put(skb, OVS_KEY_ATTR_CT_LABEL, sizeof(key->ct.label), -		    &key->ct.label)) +	    nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(key->ct.labels), +		    &key->ct.labels))  		return -EMSGSIZE;  	return 0; @@ -213,18 +215,15 @@ static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key,  #endif  } -static int ovs_ct_set_label(struct sk_buff *skb, struct sw_flow_key *key, -			    const struct ovs_key_ct_label *label, -			    const struct ovs_key_ct_label *mask) +static int ovs_ct_set_labels(struct sk_buff *skb, struct sw_flow_key *key, +			     const struct ovs_key_ct_labels *labels, +			     const struct ovs_key_ct_labels *mask)  {  	enum ip_conntrack_info ctinfo;  	struct nf_conn_labels *cl;  	struct nf_conn *ct;  	int err; -	if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) -		return -ENOTSUPP; -  	/* The connection could be invalid, in which case set_label is no-op.*/  	ct = nf_ct_get(skb, &ctinfo);  	if (!ct) @@ -235,15 +234,15 @@ static int ovs_ct_set_label(struct sk_buff *skb, struct sw_flow_key *key,  		nf_ct_labels_ext_add(ct);  		cl = nf_ct_labels_find(ct);  	} -	if (!cl || cl->words * sizeof(long) < OVS_CT_LABEL_LEN) +	if (!cl || cl->words * sizeof(long) < OVS_CT_LABELS_LEN)  		return -ENOSPC; -	err = nf_connlabels_replace(ct, (u32 *)label, (u32 *)mask, -				    OVS_CT_LABEL_LEN / sizeof(u32)); +	err = nf_connlabels_replace(ct, (u32 *)labels, (u32 *)mask, +				    OVS_CT_LABELS_LEN / sizeof(u32));  	if (err)  		return err; -	ovs_ct_get_label(ct, &key->ct.label); +	ovs_ct_get_labels(ct, &key->ct.labels);  	return 0;  } @@ -275,13 +274,15 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)  	case NFPROTO_IPV6: {  		u8 nexthdr = ipv6_hdr(skb)->nexthdr;  		__be16 frag_off; +		int ofs; -		protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), -					   &nexthdr, &frag_off); -		if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { +		ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, +				       &frag_off); +		if (ofs < 0 || (frag_off & htons(~0x7)) != 0) {  			pr_debug("proto header not found\n");  			return NF_ACCEPT;  		} +		protoff = ofs;  		break;  	}  	default: @@ -292,6 +293,9 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)  	return helper->help(skb, protoff, ct, ctinfo);  } +/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero + * value if 'skb' is freed. 
+ */  static int handle_fragments(struct net *net, struct sw_flow_key *key,  			    u16 zone, struct sk_buff *skb)  { @@ -307,8 +311,8 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key,  			return err;  		ovs_cb.mru = IPCB(skb)->frag_max_size; -	} else if (key->eth.type == htons(ETH_P_IPV6)) {  #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) +	} else if (key->eth.type == htons(ETH_P_IPV6)) {  		enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;  		struct sk_buff *reasm; @@ -317,17 +321,25 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key,  		if (!reasm)  			return -EINPROGRESS; -		if (skb == reasm) +		if (skb == reasm) { +			kfree_skb(skb);  			return -EINVAL; +		} + +		/* Don't free 'skb' even though it is one of the original +		 * fragments, as we're going to morph it into the head. +		 */ +		skb_get(skb); +		nf_ct_frag6_consume_orig(reasm);  		key->ip.proto = ipv6_hdr(reasm)->nexthdr;  		skb_morph(skb, reasm); +		skb->next = reasm->next;  		consume_skb(reasm);  		ovs_cb.mru = IP6CB(skb)->frag_max_size; -#else -		return -EPFNOSUPPORT;  #endif  	} else { +		kfree_skb(skb);  		return -EPFNOSUPPORT;  	} @@ -375,7 +387,7 @@ static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb,  	return true;  } -static int __ovs_ct_lookup(struct net *net, const struct sw_flow_key *key, +static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,  			   const struct ovs_conntrack_info *info,  			   struct sk_buff *skb)  { @@ -406,6 +418,8 @@ static int __ovs_ct_lookup(struct net *net, const struct sw_flow_key *key,  		}  	} +	ovs_ct_update_key(skb, key, true); +  	return 0;  } @@ -428,8 +442,6 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,  		err = __ovs_ct_lookup(net, key, info, skb);  		if (err)  			return err; - -		ovs_ct_update_key(skb, key, true);  	}  	return 0; @@ -458,22 +470,23 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,  	if (nf_conntrack_confirm(skb) != NF_ACCEPT)  		return -EINVAL; -	ovs_ct_update_key(skb, key, true); -  	return 0;  } -static bool label_nonzero(const struct ovs_key_ct_label *label) +static bool labels_nonzero(const struct ovs_key_ct_labels *labels)  {  	size_t i; -	for (i = 0; i < sizeof(*label); i++) -		if (label->ct_label[i]) +	for (i = 0; i < sizeof(*labels); i++) +		if (labels->ct_labels[i])  			return true;  	return false;  } +/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero + * value if 'skb' is freed. 
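+ * [Editor's note, annotation only: this freeing contract is new in the
+ * series -- handle_fragments() and ovs_ct_execute() now consume 'skb'
+ * on every error path (see the added kfree_skb() calls), which is why
+ * do_execute_actions() above can simply propagate any nonzero 'err'
+ * other than -EINPROGRESS.]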
+ */  int ovs_ct_execute(struct net *net, struct sk_buff *skb,  		   struct sw_flow_key *key,  		   const struct ovs_conntrack_info *info) @@ -491,7 +504,7 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,  			return err;  	} -	if (info->flags & OVS_CT_F_COMMIT) +	if (info->commit)  		err = ovs_ct_commit(net, key, info, skb);  	else  		err = ovs_ct_lookup(net, key, info, skb); @@ -504,11 +517,13 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,  		if (err)  			goto err;  	} -	if (label_nonzero(&info->label.mask)) -		err = ovs_ct_set_label(skb, key, &info->label.value, -				       &info->label.mask); +	if (labels_nonzero(&info->labels.mask)) +		err = ovs_ct_set_labels(skb, key, &info->labels.value, +					&info->labels.mask);  err:  	skb_push(skb, nh_ofs); +	if (err) +		kfree_skb(skb);  	return err;  } @@ -537,14 +552,13 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,  }  static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { -	[OVS_CT_ATTR_FLAGS]	= { .minlen = sizeof(u32), -				    .maxlen = sizeof(u32) }, +	[OVS_CT_ATTR_COMMIT]	= { .minlen = 0, .maxlen = 0 },  	[OVS_CT_ATTR_ZONE]	= { .minlen = sizeof(u16),  				    .maxlen = sizeof(u16) },  	[OVS_CT_ATTR_MARK]	= { .minlen = sizeof(struct md_mark),  				    .maxlen = sizeof(struct md_mark) }, -	[OVS_CT_ATTR_LABEL]	= { .minlen = sizeof(struct md_label), -				    .maxlen = sizeof(struct md_label) }, +	[OVS_CT_ATTR_LABELS]	= { .minlen = sizeof(struct md_labels), +				    .maxlen = sizeof(struct md_labels) },  	[OVS_CT_ATTR_HELPER]	= { .minlen = 1,  				    .maxlen = NF_CT_HELPER_NAME_LEN }  }; @@ -574,8 +588,8 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,  		}  		switch (type) { -		case OVS_CT_ATTR_FLAGS: -			info->flags = nla_get_u32(a); +		case OVS_CT_ATTR_COMMIT: +			info->commit = true;  			break;  #ifdef CONFIG_NF_CONNTRACK_ZONES  		case OVS_CT_ATTR_ZONE: @@ -586,15 +600,23 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,  		case OVS_CT_ATTR_MARK: {  			struct md_mark *mark = nla_data(a); +			if (!mark->mask) { +				OVS_NLERR(log, "ct_mark mask cannot be 0"); +				return -EINVAL; +			}  			info->mark = *mark;  			break;  		}  #endif  #ifdef CONFIG_NF_CONNTRACK_LABELS -		case OVS_CT_ATTR_LABEL: { -			struct md_label *label = nla_data(a); +		case OVS_CT_ATTR_LABELS: { +			struct md_labels *labels = nla_data(a); -			info->label = *label; +			if (!labels_nonzero(&labels->mask)) { +				OVS_NLERR(log, "ct_labels mask cannot be 0"); +				return -EINVAL; +			} +			info->labels = *labels;  			break;  		}  #endif @@ -631,7 +653,7 @@ bool ovs_ct_verify(struct net *net, enum ovs_key_attr attr)  	    attr == OVS_KEY_ATTR_CT_MARK)  		return true;  	if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && -	    attr == OVS_KEY_ATTR_CT_LABEL) { +	    attr == OVS_KEY_ATTR_CT_LABELS) {  		struct ovs_net *ovs_net = net_generic(net, ovs_net_id);  		return ovs_net->xt_label; @@ -699,18 +721,19 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,  	if (!start)  		return -EMSGSIZE; -	if (nla_put_u32(skb, OVS_CT_ATTR_FLAGS, ct_info->flags)) +	if (ct_info->commit && nla_put_flag(skb, OVS_CT_ATTR_COMMIT))  		return -EMSGSIZE;  	if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&  	    nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id))  		return -EMSGSIZE; -	if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && +	if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && ct_info->mark.mask &&  	    nla_put(skb, OVS_CT_ATTR_MARK, 
sizeof(ct_info->mark),  		    &ct_info->mark))  		return -EMSGSIZE;  	if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && -	    nla_put(skb, OVS_CT_ATTR_LABEL, sizeof(ct_info->label), -		    &ct_info->label)) +	    labels_nonzero(&ct_info->labels.mask) && +	    nla_put(skb, OVS_CT_ATTR_LABELS, sizeof(ct_info->labels), +		    &ct_info->labels))  		return -EMSGSIZE;  	if (ct_info->helper) {  		if (nla_put_string(skb, OVS_CT_ATTR_HELPER, @@ -735,7 +758,7 @@ void ovs_ct_free_action(const struct nlattr *a)  void ovs_ct_init(struct net *net)  { -	unsigned int n_bits = sizeof(struct ovs_key_ct_label) * BITS_PER_BYTE; +	unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE;  	struct ovs_net *ovs_net = net_generic(net, ovs_net_id);  	if (nf_connlabels_get(net, n_bits)) { diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h index 43f5dd7a5577..a7544f405c16 100644 --- a/net/openvswitch/conntrack.h +++ b/net/openvswitch/conntrack.h @@ -34,6 +34,10 @@ int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *,  void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key);  int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb);  void ovs_ct_free_action(const struct nlattr *a); + +#define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \ +			   OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \ +			   OVS_CS_F_INVALID | OVS_CS_F_TRACKED)  #else  #include <linux/errno.h> @@ -63,6 +67,7 @@ static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb,  				 struct sw_flow_key *key,  				 const struct ovs_conntrack_info *info)  { +	kfree_skb(skb);  	return -ENOTSUPP;  } @@ -72,7 +77,7 @@ static inline void ovs_ct_fill_key(const struct sk_buff *skb,  	key->ct.state = 0;  	key->ct.zone = 0;  	key->ct.mark = 0; -	memset(&key->ct.label, 0, sizeof(key->ct.label)); +	memset(&key->ct.labels, 0, sizeof(key->ct.labels));  }  static inline int ovs_ct_put_key(const struct sw_flow_key *key, @@ -82,5 +87,7 @@ static inline int ovs_ct_put_key(const struct sw_flow_key *key,  }  static inline void ovs_ct_free_action(const struct nlattr *a) { } + +#define CT_SUPPORTED_MASK 0  #endif /* CONFIG_NF_CONNTRACK */  #endif /* ovs_conntrack.h */ diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 6fbd2decb19e..c5d08ee37730 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -490,9 +490,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,  	if (upcall_info->egress_tun_info) {  		nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY); -		err = ovs_nla_put_egress_tunnel_key(user_skb, -						    upcall_info->egress_tun_info, -						    upcall_info->egress_tun_opts); +		err = ovs_nla_put_tunnel_info(user_skb, +					      upcall_info->egress_tun_info);  		BUG_ON(err);  		nla_nest_end(user_skb, nla);  	} @@ -952,7 +951,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)  	if (error)  		goto err_kfree_flow; -	ovs_flow_mask_key(&new_flow->key, &key, &mask); +	ovs_flow_mask_key(&new_flow->key, &key, true, &mask);  	/* Extract flow identifier. 
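 	 * If userspace supplied an OVS_FLOW_ATTR_UFID attribute it
 	 * becomes the flow's identifier; otherwise the unmasked key
 	 * fills that role.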
*/  	error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], @@ -1080,7 +1079,7 @@ static struct sw_flow_actions *get_flow_actions(struct net *net,  	struct sw_flow_key masked_key;  	int error; -	ovs_flow_mask_key(&masked_key, key, mask); +	ovs_flow_mask_key(&masked_key, key, true, mask);  	error = ovs_nla_copy_actions(net, a, &masked_key, &acts, log);  	if (error) {  		OVS_NLERR(log, diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index f88038a99f44..67bdecd9fdc1 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -117,7 +117,6 @@ struct ovs_skb_cb {   */  struct dp_upcall_info {  	struct ip_tunnel_info *egress_tun_info; -	const void *egress_tun_opts;  	const struct nlattr *userdata;  	const struct nlattr *actions;  	int actions_len; diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index fe527d2dd4b7..8cfa15a08668 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -116,7 +116,7 @@ struct sw_flow_key {  		u16 zone;  		u32 mark;  		u8 state; -		struct ovs_key_ct_label label; +		struct ovs_key_ct_labels labels;  	} ct;  } __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index c92d6a262bc5..38536c137c54 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -57,6 +57,7 @@ struct ovs_len_tbl {  };  #define OVS_ATTR_NESTED -1 +#define OVS_ATTR_VARIABLE -2  static void update_range(struct sw_flow_match *match,  			 size_t offset, size_t size, bool is_mask) @@ -290,10 +291,10 @@ size_t ovs_key_attr_size(void)  		+ nla_total_size(4)   /* OVS_KEY_ATTR_SKB_MARK */  		+ nla_total_size(4)   /* OVS_KEY_ATTR_DP_HASH */  		+ nla_total_size(4)   /* OVS_KEY_ATTR_RECIRC_ID */ -		+ nla_total_size(1)   /* OVS_KEY_ATTR_CT_STATE */ +		+ nla_total_size(4)   /* OVS_KEY_ATTR_CT_STATE */  		+ nla_total_size(2)   /* OVS_KEY_ATTR_CT_ZONE */  		+ nla_total_size(4)   /* OVS_KEY_ATTR_CT_MARK */ -		+ nla_total_size(16)  /* OVS_KEY_ATTR_CT_LABEL */ +		+ nla_total_size(16)  /* OVS_KEY_ATTR_CT_LABELS */  		+ nla_total_size(12)  /* OVS_KEY_ATTR_ETHERNET */  		+ nla_total_size(2)   /* OVS_KEY_ATTR_ETHERTYPE */  		+ nla_total_size(4)   /* OVS_KEY_ATTR_VLAN */ @@ -304,6 +305,10 @@ size_t ovs_key_attr_size(void)  		+ nla_total_size(28); /* OVS_KEY_ATTR_ND */  } +static const struct ovs_len_tbl ovs_vxlan_ext_key_lens[OVS_VXLAN_EXT_MAX + 1] = { +	[OVS_VXLAN_EXT_GBP]	    = { .len = sizeof(u32) }, +}; +  static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = {  	[OVS_TUNNEL_KEY_ATTR_ID]	    = { .len = sizeof(u64) },  	[OVS_TUNNEL_KEY_ATTR_IPV4_SRC]	    = { .len = sizeof(u32) }, @@ -315,8 +320,9 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1]  	[OVS_TUNNEL_KEY_ATTR_TP_SRC]	    = { .len = sizeof(u16) },  	[OVS_TUNNEL_KEY_ATTR_TP_DST]	    = { .len = sizeof(u16) },  	[OVS_TUNNEL_KEY_ATTR_OAM]	    = { .len = 0 }, -	[OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS]   = { .len = OVS_ATTR_NESTED }, -	[OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS]    = { .len = OVS_ATTR_NESTED }, +	[OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS]   = { .len = OVS_ATTR_VARIABLE }, +	[OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS]    = { .len = OVS_ATTR_NESTED, +						.next = ovs_vxlan_ext_key_lens },  };  /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute.  
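  * Besides a fixed byte count, .len may hold the sentinel
  * OVS_ATTR_NESTED (a nested attribute, optionally with a .next
  * sub-table describing its members) or OVS_ATTR_VARIABLE (any
  * length accepted); check_attr_len() below treats either sentinel
  * as a length match.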
*/ @@ -343,12 +349,19 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {  	[OVS_KEY_ATTR_TUNNEL]	 = { .len = OVS_ATTR_NESTED,  				     .next = ovs_tunnel_key_lens, },  	[OVS_KEY_ATTR_MPLS]	 = { .len = sizeof(struct ovs_key_mpls) }, -	[OVS_KEY_ATTR_CT_STATE]	 = { .len = sizeof(u8) }, +	[OVS_KEY_ATTR_CT_STATE]	 = { .len = sizeof(u32) },  	[OVS_KEY_ATTR_CT_ZONE]	 = { .len = sizeof(u16) },  	[OVS_KEY_ATTR_CT_MARK]	 = { .len = sizeof(u32) }, -	[OVS_KEY_ATTR_CT_LABEL]	 = { .len = sizeof(struct ovs_key_ct_label) }, +	[OVS_KEY_ATTR_CT_LABELS] = { .len = sizeof(struct ovs_key_ct_labels) },  }; +static bool check_attr_len(unsigned int attr_len, unsigned int expected_len) +{ +	return expected_len == attr_len || +	       expected_len == OVS_ATTR_NESTED || +	       expected_len == OVS_ATTR_VARIABLE; +} +  static bool is_all_zero(const u8 *fp, size_t size)  {  	int i; @@ -388,7 +401,7 @@ static int __parse_flow_nlattrs(const struct nlattr *attr,  		}  		expected_len = ovs_key_lens[type].len; -		if (nla_len(nla) != expected_len && expected_len != OVS_ATTR_NESTED) { +		if (!check_attr_len(nla_len(nla), expected_len)) {  			OVS_NLERR(log, "Key %d has unexpected len %d expected %d",  				  type, nla_len(nla), expected_len);  			return -EINVAL; @@ -473,29 +486,50 @@ static int genev_tun_opt_from_nlattr(const struct nlattr *a,  	return 0;  } -static const struct nla_policy vxlan_opt_policy[OVS_VXLAN_EXT_MAX + 1] = { -	[OVS_VXLAN_EXT_GBP]	= { .type = NLA_U32 }, -}; - -static int vxlan_tun_opt_from_nlattr(const struct nlattr *a, +static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr,  				     struct sw_flow_match *match, bool is_mask,  				     bool log)  { -	struct nlattr *tb[OVS_VXLAN_EXT_MAX+1]; +	struct nlattr *a; +	int rem;  	unsigned long opt_key_offset;  	struct vxlan_metadata opts; -	int err;  	BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); -	err = nla_parse_nested(tb, OVS_VXLAN_EXT_MAX, a, vxlan_opt_policy); -	if (err < 0) -		return err; -  	memset(&opts, 0, sizeof(opts)); +	nla_for_each_nested(a, attr, rem) { +		int type = nla_type(a); -	if (tb[OVS_VXLAN_EXT_GBP]) -		opts.gbp = nla_get_u32(tb[OVS_VXLAN_EXT_GBP]); +		if (type > OVS_VXLAN_EXT_MAX) { +			OVS_NLERR(log, "VXLAN extension %d out of range max %d", +				  type, OVS_VXLAN_EXT_MAX); +			return -EINVAL; +		} + +		if (!check_attr_len(nla_len(a), +				    ovs_vxlan_ext_key_lens[type].len)) { +			OVS_NLERR(log, "VXLAN extension %d has unexpected len %d expected %d", +				  type, nla_len(a), +				  ovs_vxlan_ext_key_lens[type].len); +			return -EINVAL; +		} + +		switch (type) { +		case OVS_VXLAN_EXT_GBP: +			opts.gbp = nla_get_u32(a); +			break; +		default: +			OVS_NLERR(log, "Unknown VXLAN extension attribute %d", +				  type); +			return -EINVAL; +		} +	} +	if (rem) { +		OVS_NLERR(log, "VXLAN extension message has %d unknown bytes.", +			  rem); +		return -EINVAL; +	}  	if (!is_mask)  		SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), false); @@ -528,8 +562,8 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,  			return -EINVAL;  		} -		if (ovs_tunnel_key_lens[type].len != nla_len(a) && -		    ovs_tunnel_key_lens[type].len != OVS_ATTR_NESTED) { +		if (!check_attr_len(nla_len(a), +				    ovs_tunnel_key_lens[type].len)) {  			OVS_NLERR(log, "Tunnel attr %d has unexpected len %d expected %d",  				  type, nla_len(a), ovs_tunnel_key_lens[type].len);  			return -EINVAL; @@ -683,7 +717,7 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb,  	if ((output->tun_flags & TUNNEL_OAM) &&  	    
nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM))  		return -EMSGSIZE; -	if (tun_opts) { +	if (swkey_tun_opts_len) {  		if (output->tun_flags & TUNNEL_GENEVE_OPT &&  		    nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS,  			    swkey_tun_opts_len, tun_opts)) @@ -715,13 +749,12 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb,  	return 0;  } -int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb, -				  const struct ip_tunnel_info *egress_tun_info, -				  const void *egress_tun_opts) +int ovs_nla_put_tunnel_info(struct sk_buff *skb, +			    struct ip_tunnel_info *tun_info)  { -	return __ipv4_tun_to_nlattr(skb, &egress_tun_info->key, -				    egress_tun_opts, -				    egress_tun_info->options_len); +	return __ipv4_tun_to_nlattr(skb, &tun_info->key, +				    ip_tunnel_info_opts(tun_info), +				    tun_info->options_len);  }  static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, @@ -780,7 +813,13 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,  	if (*attrs & (1 << OVS_KEY_ATTR_CT_STATE) &&  	    ovs_ct_verify(net, OVS_KEY_ATTR_CT_STATE)) { -		u8 ct_state = nla_get_u8(a[OVS_KEY_ATTR_CT_STATE]); +		u32 ct_state = nla_get_u32(a[OVS_KEY_ATTR_CT_STATE]); + +		if (ct_state & ~CT_SUPPORTED_MASK) { +			OVS_NLERR(log, "ct_state flags %08x unsupported", +				  ct_state); +			return -EINVAL; +		}  		SW_FLOW_KEY_PUT(match, ct.state, ct_state, is_mask);  		*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE); @@ -799,14 +838,14 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match,  		SW_FLOW_KEY_PUT(match, ct.mark, mark, is_mask);  		*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_MARK);  	} -	if (*attrs & (1 << OVS_KEY_ATTR_CT_LABEL) && -	    ovs_ct_verify(net, OVS_KEY_ATTR_CT_LABEL)) { -		const struct ovs_key_ct_label *cl; +	if (*attrs & (1 << OVS_KEY_ATTR_CT_LABELS) && +	    ovs_ct_verify(net, OVS_KEY_ATTR_CT_LABELS)) { +		const struct ovs_key_ct_labels *cl; -		cl = nla_data(a[OVS_KEY_ATTR_CT_LABEL]); -		SW_FLOW_KEY_MEMCPY(match, ct.label, cl->ct_label, +		cl = nla_data(a[OVS_KEY_ATTR_CT_LABELS]); +		SW_FLOW_KEY_MEMCPY(match, ct.labels, cl->ct_labels,  				   sizeof(*cl), is_mask); -		*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABEL); +		*attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABELS);  	}  	return 0;  } @@ -1052,10 +1091,16 @@ static void nlattr_set(struct nlattr *attr, u8 val,  	/* The nlattr stream should already have been validated */  	nla_for_each_nested(nla, attr, rem) { -		if (tbl && tbl[nla_type(nla)].len == OVS_ATTR_NESTED) -			nlattr_set(nla, val, tbl[nla_type(nla)].next); -		else +		if (tbl[nla_type(nla)].len == OVS_ATTR_NESTED) { +			if (tbl[nla_type(nla)].next) +				tbl = tbl[nla_type(nla)].next; +			nlattr_set(nla, val, tbl); +		} else {  			memset(nla_data(nla), val, nla_len(nla)); +		} + +		if (nla_type(nla) == OVS_KEY_ATTR_CT_STATE) +			*(u32 *)nla_data(nla) &= CT_SUPPORTED_MASK;  	}  } @@ -1922,8 +1967,7 @@ static int validate_set(const struct nlattr *a,  		key_len /= 2;  	if (key_type > OVS_KEY_ATTR_MAX || -	    (ovs_key_lens[key_type].len != key_len && -	     ovs_key_lens[key_type].len != OVS_ATTR_NESTED)) +	    !check_attr_len(key_len, ovs_key_lens[key_type].len))  		return -EINVAL;  	if (masked && !validate_masked(nla_data(ovs_key), key_len)) @@ -1937,7 +1981,7 @@ static int validate_set(const struct nlattr *a,  	case OVS_KEY_ATTR_PRIORITY:  	case OVS_KEY_ATTR_SKB_MARK:  	case OVS_KEY_ATTR_CT_MARK: -	case OVS_KEY_ATTR_CT_LABEL: +	case OVS_KEY_ATTR_CT_LABELS:  	case OVS_KEY_ATTR_ETHERNET:  		break; @@ -2338,10 +2382,7 @@ static int 
set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)  		if (!start)  			return -EMSGSIZE; -		err = ipv4_tun_to_nlattr(skb, &tun_info->key, -					 tun_info->options_len ? -					     ip_tunnel_info_opts(tun_info) : NULL, -					 tun_info->options_len); +		err = ovs_nla_put_tunnel_info(skb, tun_info);  		if (err)  			return err;  		nla_nest_end(skb, start); diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 6ca3f0baf449..47dd142eca1c 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -55,9 +55,9 @@ int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb);  int ovs_nla_get_match(struct net *, struct sw_flow_match *,  		      const struct nlattr *key, const struct nlattr *mask,  		      bool log); -int ovs_nla_put_egress_tunnel_key(struct sk_buff *, -				  const struct ip_tunnel_info *, -				  const void *egress_tun_opts); + +int ovs_nla_put_tunnel_info(struct sk_buff *skb, +			    struct ip_tunnel_info *tun_info);  bool ovs_nla_get_ufid(struct sw_flow_id *, const struct nlattr *, bool log);  int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index d22d8e948d0f..c7f74aab34b9 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -57,20 +57,21 @@ static u16 range_n_bytes(const struct sw_flow_key_range *range)  }  void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, -		       const struct sw_flow_mask *mask) +		       bool full, const struct sw_flow_mask *mask)  { -	const long *m = (const long *)((const u8 *)&mask->key + -				mask->range.start); -	const long *s = (const long *)((const u8 *)src + -				mask->range.start); -	long *d = (long *)((u8 *)dst + mask->range.start); +	int start = full ? 0 : mask->range.start; +	int len = full ? sizeof *dst : range_n_bytes(&mask->range); +	const long *m = (const long *)((const u8 *)&mask->key + start); +	const long *s = (const long *)((const u8 *)src + start); +	long *d = (long *)((u8 *)dst + start);  	int i; -	/* The memory outside of the 'mask->range' are not set since -	 * further operations on 'dst' only uses contents within -	 * 'mask->range'. +	/* If 'full' is true then all of 'dst' is fully initialized. Otherwise, +	 * if 'full' is false the memory outside of the 'mask->range' is left +	 * uninitialized. This can be used as an optimization when further +	 * operations on 'dst' only use contents within 'mask->range'.  	 */ -	for (i = 0; i < range_n_bytes(&mask->range); i += sizeof(long)) +	for (i = 0; i < len; i += sizeof(long))  		*d++ = *s++ & *m++;  } @@ -92,7 +93,8 @@ struct sw_flow *ovs_flow_alloc(void)  	/* Initialize the default stat node. */  	stats = kmem_cache_alloc_node(flow_stats_cache, -				      GFP_KERNEL | __GFP_ZERO, 0); +				      GFP_KERNEL | __GFP_ZERO, +				      node_online(0) ? 
0 : NUMA_NO_NODE);  	if (!stats)  		goto err; @@ -475,7 +477,7 @@ static struct sw_flow *masked_flow_lookup(struct table_instance *ti,  	u32 hash;  	struct sw_flow_key masked_key; -	ovs_flow_mask_key(&masked_key, unmasked, mask); +	ovs_flow_mask_key(&masked_key, unmasked, false, mask);  	hash = flow_hash(&masked_key, &mask->range);  	head = find_bucket(ti, hash);  	hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) { diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 616eda10d955..2dd9900f533d 100644 --- a/net/openvswitch/flow_table.h +++ b/net/openvswitch/flow_table.h @@ -86,5 +86,5 @@ struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *,  bool ovs_flow_cmp(const struct sw_flow *, const struct sw_flow_match *);  void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, -		       const struct sw_flow_mask *mask); +		       bool full, const struct sw_flow_mask *mask);  #endif /* flow_table.h */ diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 2735e9c4a3b8..5f8aaaaa0785 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -52,18 +52,6 @@ static int geneve_get_options(const struct vport *vport,  	return 0;  } -static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, -				      struct dp_upcall_info *upcall) -{ -	struct geneve_port *geneve_port = geneve_vport(vport); -	struct net *net = ovs_dp_get_net(vport->dp); -	__be16 dport = htons(geneve_port->port_no); -	__be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); - -	return ovs_tunnel_get_egress_info(upcall, ovs_dp_get_net(vport->dp), -					  skb, IPPROTO_UDP, sport, dport); -} -  static struct vport *geneve_tnl_create(const struct vport_parms *parms)  {  	struct net *net = ovs_dp_get_net(parms->dp); @@ -130,7 +118,6 @@ static struct vport_ops ovs_geneve_vport_ops = {  	.get_options	= geneve_get_options,  	.send		= ovs_netdev_send,  	.owner          = THIS_MODULE, -	.get_egress_tun_info	= geneve_get_egress_tun_info,  };  static int __init ovs_geneve_tnl_init(void) diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 4d24481669c9..64225bf5eb40 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -84,18 +84,10 @@ static struct vport *gre_create(const struct vport_parms *parms)  	return ovs_netdev_link(vport, parms->name);  } -static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, -				   struct dp_upcall_info *upcall) -{ -	return ovs_tunnel_get_egress_info(upcall, ovs_dp_get_net(vport->dp), -					  skb, IPPROTO_GRE, 0, 0); -} -  static struct vport_ops ovs_gre_vport_ops = {  	.type		= OVS_VPORT_TYPE_GRE,  	.create		= gre_create,  	.send		= ovs_netdev_send, -	.get_egress_tun_info	= gre_get_egress_tun_info,  	.destroy	= ovs_netdev_tunnel_destroy,  	.owner		= THIS_MODULE,  }; diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 388b8a6bf112..b3934126daa8 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -106,12 +106,45 @@ static void internal_dev_destructor(struct net_device *dev)  	free_netdev(dev);  } +static struct rtnl_link_stats64 * +internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) +{ +	int i; + +	memset(stats, 0, sizeof(*stats)); +	stats->rx_errors  = dev->stats.rx_errors; +	stats->tx_errors  = dev->stats.tx_errors; +	stats->tx_dropped = dev->stats.tx_dropped; +	stats->rx_dropped = 
dev->stats.rx_dropped; + +	for_each_possible_cpu(i) { +		const struct pcpu_sw_netstats *percpu_stats; +		struct pcpu_sw_netstats local_stats; +		unsigned int start; + +		percpu_stats = per_cpu_ptr(dev->tstats, i); + +		do { +			start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); +			local_stats = *percpu_stats; +		} while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); + +		stats->rx_bytes         += local_stats.rx_bytes; +		stats->rx_packets       += local_stats.rx_packets; +		stats->tx_bytes         += local_stats.tx_bytes; +		stats->tx_packets       += local_stats.tx_packets; +	} + +	return stats; +} +  static const struct net_device_ops internal_dev_netdev_ops = {  	.ndo_open = internal_dev_open,  	.ndo_stop = internal_dev_stop,  	.ndo_start_xmit = internal_dev_xmit,  	.ndo_set_mac_address = eth_mac_addr,  	.ndo_change_mtu = internal_dev_change_mtu, +	.ndo_get_stats64 = internal_get_stats,  };  static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { @@ -161,6 +194,11 @@ static struct vport *internal_dev_create(const struct vport_parms *parms)  		err = -ENOMEM;  		goto error_free_vport;  	} +	vport->dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); +	if (!vport->dev->tstats) { +		err = -ENOMEM; +		goto error_free_netdev; +	}  	dev_net_set(vport->dev, ovs_dp_get_net(vport->dp));  	internal_dev = internal_dev_priv(vport->dev); @@ -173,7 +211,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms)  	rtnl_lock();  	err = register_netdevice(vport->dev);  	if (err) -		goto error_free_netdev; +		goto error_unlock;  	dev_set_promiscuity(vport->dev, 1);  	rtnl_unlock(); @@ -181,8 +219,10 @@ static struct vport *internal_dev_create(const struct vport_parms *parms)  	return vport; -error_free_netdev: +error_unlock:  	rtnl_unlock(); +	free_percpu(vport->dev->tstats); +error_free_netdev:  	free_netdev(vport->dev);  error_free_vport:  	ovs_vport_free(vport); @@ -198,7 +238,7 @@ static void internal_dev_destroy(struct vport *vport)  	/* unregister_netdevice() waits for an RCU grace period. 
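 	 * That same grace period is presumably what makes it safe to
 	 * free the per-cpu tstats buffer just below, once no reader can
 	 * still reach the device.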
*/  	unregister_netdevice(vport->dev); - +	free_percpu(vport->dev->tstats);  	rtnl_unlock();  } diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index c11413d5075f..e1c9c0888037 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -146,31 +146,12 @@ static struct vport *vxlan_create(const struct vport_parms *parms)  	return ovs_netdev_link(vport, parms->name);  } -static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, -				     struct dp_upcall_info *upcall) -{ -	struct vxlan_dev *vxlan = netdev_priv(vport->dev); -	struct net *net = ovs_dp_get_net(vport->dp); -	__be16 dst_port = vxlan_dev_dst_port(vxlan); -	__be16 src_port; -	int port_min; -	int port_max; - -	inet_get_local_port_range(net, &port_min, &port_max); -	src_port = udp_flow_src_port(net, skb, 0, 0, true); - -	return ovs_tunnel_get_egress_info(upcall, net, -					  skb, IPPROTO_UDP, -					  src_port, dst_port); -} -  static struct vport_ops ovs_vxlan_netdev_vport_ops = {  	.type			= OVS_VPORT_TYPE_VXLAN,  	.create			= vxlan_create,  	.destroy		= ovs_netdev_tunnel_destroy,  	.get_options		= vxlan_get_options,  	.send			= ovs_netdev_send, -	.get_egress_tun_info	= vxlan_get_egress_tun_info,  };  static int __init ovs_vxlan_tnl_init(void) diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index dc81dc619aa2..320c765ce44a 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -280,35 +280,19 @@ void ovs_vport_del(struct vport *vport)   */  void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats)  { -	struct net_device *dev = vport->dev; -	int i; +	const struct rtnl_link_stats64 *dev_stats; +	struct rtnl_link_stats64 temp; -	memset(stats, 0, sizeof(*stats)); -	stats->rx_errors  = dev->stats.rx_errors; -	stats->tx_errors  = dev->stats.tx_errors; -	stats->tx_dropped = dev->stats.tx_dropped; -	stats->rx_dropped = dev->stats.rx_dropped; +	dev_stats = dev_get_stats(vport->dev, &temp); +	stats->rx_errors  = dev_stats->rx_errors; +	stats->tx_errors  = dev_stats->tx_errors; +	stats->tx_dropped = dev_stats->tx_dropped; +	stats->rx_dropped = dev_stats->rx_dropped; -	stats->rx_dropped += atomic_long_read(&dev->rx_dropped); -	stats->tx_dropped += atomic_long_read(&dev->tx_dropped); - -	for_each_possible_cpu(i) { -		const struct pcpu_sw_netstats *percpu_stats; -		struct pcpu_sw_netstats local_stats; -		unsigned int start; - -		percpu_stats = per_cpu_ptr(dev->tstats, i); - -		do { -			start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); -			local_stats = *percpu_stats; -		} while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); - -		stats->rx_bytes		+= local_stats.rx_bytes; -		stats->rx_packets	+= local_stats.rx_packets; -		stats->tx_bytes		+= local_stats.tx_bytes; -		stats->tx_packets	+= local_stats.tx_packets; -	} +	stats->rx_bytes	  = dev_stats->rx_bytes; +	stats->rx_packets = dev_stats->rx_packets; +	stats->tx_bytes	  = dev_stats->tx_bytes; +	stats->tx_packets = dev_stats->tx_packets;  }  /** @@ -460,6 +444,15 @@ int ovs_vport_receive(struct vport *vport, struct sk_buff *skb,  	OVS_CB(skb)->input_vport = vport;  	OVS_CB(skb)->mru = 0; +	if (unlikely(dev_net(skb->dev) != ovs_dp_get_net(vport->dp))) { +		u32 mark; + +		mark = skb->mark; +		skb_scrub_packet(skb, true); +		skb->mark = mark; +		tun_info = NULL; +	} +  	/* Extract flow from 'skb' into 'key'. 
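 	 * Note that 'tun_info' may have been cleared just above when
 	 * the packet crossed a netns boundary, in which case the
 	 * extracted key carries no tunnel metadata.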
*/  	error = ovs_flow_key_extract(tun_info, skb, &key);  	if (unlikely(error)) { @@ -486,61 +479,3 @@ void ovs_vport_deferred_free(struct vport *vport)  	call_rcu(&vport->rcu, free_vport_rcu);  }  EXPORT_SYMBOL_GPL(ovs_vport_deferred_free); - -int ovs_tunnel_get_egress_info(struct dp_upcall_info *upcall, -			       struct net *net, -			       struct sk_buff *skb, -			       u8 ipproto, -			       __be16 tp_src, -			       __be16 tp_dst) -{ -	struct ip_tunnel_info *egress_tun_info = upcall->egress_tun_info; -	const struct ip_tunnel_info *tun_info = skb_tunnel_info(skb); -	const struct ip_tunnel_key *tun_key; -	u32 skb_mark = skb->mark; -	struct rtable *rt; -	struct flowi4 fl; - -	if (unlikely(!tun_info)) -		return -EINVAL; -	if (ip_tunnel_info_af(tun_info) != AF_INET) -		return -EINVAL; - -	tun_key = &tun_info->key; - -	/* Route lookup to get srouce IP address. -	 * The process may need to be changed if the corresponding process -	 * in vports ops changed. -	 */ -	rt = ovs_tunnel_route_lookup(net, tun_key, skb_mark, &fl, ipproto); -	if (IS_ERR(rt)) -		return PTR_ERR(rt); - -	ip_rt_put(rt); - -	/* Generate egress_tun_info based on tun_info, -	 * saddr, tp_src and tp_dst -	 */ -	ip_tunnel_key_init(&egress_tun_info->key, -			   fl.saddr, tun_key->u.ipv4.dst, -			   tun_key->tos, -			   tun_key->ttl, -			   tp_src, tp_dst, -			   tun_key->tun_id, -			   tun_key->tun_flags); -	egress_tun_info->options_len = tun_info->options_len; -	egress_tun_info->mode = tun_info->mode; -	upcall->egress_tun_opts = ip_tunnel_info_opts(egress_tun_info); -	return 0; -} -EXPORT_SYMBOL_GPL(ovs_tunnel_get_egress_info); - -int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, -				  struct dp_upcall_info *upcall) -{ -	/* get_egress_tun_info() is only implemented on tunnel ports. */ -	if (unlikely(!vport->ops->get_egress_tun_info)) -		return -EINVAL; - -	return vport->ops->get_egress_tun_info(vport, skb, upcall); -} diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index a413f3ae6a7b..d341ad6f3afe 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -27,7 +27,6 @@  #include <linux/skbuff.h>  #include <linux/spinlock.h>  #include <linux/u64_stats_sync.h> -#include <net/route.h>  #include "datapath.h" @@ -53,16 +52,6 @@ int ovs_vport_set_upcall_portids(struct vport *, const struct nlattr *pids);  int ovs_vport_get_upcall_portids(const struct vport *, struct sk_buff *);  u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *); -int ovs_tunnel_get_egress_info(struct dp_upcall_info *upcall, -			       struct net *net, -			       struct sk_buff *, -			       u8 ipproto, -			       __be16 tp_src, -			       __be16 tp_dst); - -int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, -				  struct dp_upcall_info *upcall); -  /**   * struct vport_portids - array of netlink portids of a vport.   *                        must be protected by rcu. @@ -140,8 +129,6 @@ struct vport_parms {   * have any configuration.   * @send: Send a packet on the device.   * zero for dropped packets or negative for error. - * @get_egress_tun_info: Get the egress tunnel 5-tuple and other info for - * a packet.   
*/  struct vport_ops {  	enum ovs_vport_type type; @@ -154,9 +141,6 @@ struct vport_ops {  	int (*get_options)(const struct vport *, struct sk_buff *);  	void (*send)(struct vport *, struct sk_buff *); -	int (*get_egress_tun_info)(struct vport *, struct sk_buff *, -				   struct dp_upcall_info *upcall); -  	struct module *owner;  	struct list_head list;  }; @@ -215,25 +199,6 @@ static inline const char *ovs_vport_name(struct vport *vport)  int ovs_vport_ops_register(struct vport_ops *ops);  void ovs_vport_ops_unregister(struct vport_ops *ops); -static inline struct rtable *ovs_tunnel_route_lookup(struct net *net, -						     const struct ip_tunnel_key *key, -						     u32 mark, -						     struct flowi4 *fl, -						     u8 protocol) -{ -	struct rtable *rt; - -	memset(fl, 0, sizeof(*fl)); -	fl->daddr = key->u.ipv4.dst; -	fl->saddr = key->u.ipv4.src; -	fl->flowi4_tos = RT_TOS(key->tos); -	fl->flowi4_mark = mark; -	fl->flowi4_proto = protocol; - -	rt = ip_route_output_key(net, fl); -	return rt; -} -  static inline void ovs_vport_send(struct vport *vport, struct sk_buff *skb)  {  	vport->ops->send(vport, skb); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 7b8e39a22387..aa4b15c35884 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -230,6 +230,8 @@ struct packet_skb_cb {  	} sa;  }; +#define vio_le() virtio_legacy_is_little_endian() +  #define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))  #define GET_PBDQC_FROM_RB(x)	((struct tpacket_kbdq_core *)(&(x)->prb_bdqc)) @@ -2680,15 +2682,15 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)  			goto out_unlock;  		if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && -		    (__virtio16_to_cpu(false, vnet_hdr.csum_start) + -		     __virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2 > -		      __virtio16_to_cpu(false, vnet_hdr.hdr_len))) -			vnet_hdr.hdr_len = __cpu_to_virtio16(false, -				 __virtio16_to_cpu(false, vnet_hdr.csum_start) + -				__virtio16_to_cpu(false, vnet_hdr.csum_offset) + 2); +		    (__virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) + +		     __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2 > +		      __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len))) +			vnet_hdr.hdr_len = __cpu_to_virtio16(vio_le(), +				 __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) + +				__virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2);  		err = -EINVAL; -		if (__virtio16_to_cpu(false, vnet_hdr.hdr_len) > len) +		if (__virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len) > len)  			goto out_unlock;  		if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) { @@ -2731,7 +2733,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)  	hlen = LL_RESERVED_SPACE(dev);  	tlen = dev->needed_tailroom;  	skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, -			       __virtio16_to_cpu(false, vnet_hdr.hdr_len), +			       __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len),  			       msg->msg_flags & MSG_DONTWAIT, &err);  	if (skb == NULL)  		goto out_unlock; @@ -2778,8 +2780,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)  	if (po->has_vnet_hdr) {  		if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { -			u16 s = __virtio16_to_cpu(false, vnet_hdr.csum_start); -			u16 o = __virtio16_to_cpu(false, vnet_hdr.csum_offset); +			u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start); +			u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset);  			if (!skb_partial_csum_set(skb, s, o)) {  				err = -EINVAL;  				goto out_free; @@ -2787,7 +2789,7 @@ 
static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)  		}  		skb_shinfo(skb)->gso_size = -			__virtio16_to_cpu(false, vnet_hdr.gso_size); +			__virtio16_to_cpu(vio_le(), vnet_hdr.gso_size);  		skb_shinfo(skb)->gso_type = gso_type;  		/* Header must be checked, and gso_segs computed. */ @@ -3161,9 +3163,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,  			/* This is a hint as to how much should be linear. */  			vnet_hdr.hdr_len = -				__cpu_to_virtio16(false, skb_headlen(skb)); +				__cpu_to_virtio16(vio_le(), skb_headlen(skb));  			vnet_hdr.gso_size = -				__cpu_to_virtio16(false, sinfo->gso_size); +				__cpu_to_virtio16(vio_le(), sinfo->gso_size);  			if (sinfo->gso_type & SKB_GSO_TCPV4)  				vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;  			else if (sinfo->gso_type & SKB_GSO_TCPV6) @@ -3181,9 +3183,9 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,  		if (skb->ip_summed == CHECKSUM_PARTIAL) {  			vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; -			vnet_hdr.csum_start = __cpu_to_virtio16(false, +			vnet_hdr.csum_start = __cpu_to_virtio16(vio_le(),  					  skb_checksum_start_offset(skb)); -			vnet_hdr.csum_offset = __cpu_to_virtio16(false, +			vnet_hdr.csum_offset = __cpu_to_virtio16(vio_le(),  							 skb->csum_offset);  		} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {  			vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID; diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index fbc5ef88bc0e..27a992154804 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -214,8 +214,15 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,  			}  			to_copy = min(tc->t_tinc_data_rem, left); -			pskb_pull(clone, offset); -			pskb_trim(clone, to_copy); +			if (!pskb_pull(clone, offset) || +			    pskb_trim(clone, to_copy)) { +				pr_warn("rds_tcp_data_recv: pull/trim failed " +					"left %zu data_rem %zu skb_len %d\n", +					left, tc->t_tinc_data_rem, skb->len); +				kfree_skb(clone); +				desc->error = -ENOMEM; +				goto out; +			}  			skb_queue_tail(&tinc->ti_skb_list, clone);  			rdsdebug("skb %p data %p len %d off %u to_copy %zu -> " diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 2d1be4a760fd..32fcdecdb9e2 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -31,13 +31,17 @@  #define MIRRED_TAB_MASK     7  static LIST_HEAD(mirred_list); +static DEFINE_SPINLOCK(mirred_list_lock);  static void tcf_mirred_release(struct tc_action *a, int bind)  {  	struct tcf_mirred *m = to_mirred(a);  	struct net_device *dev = rcu_dereference_protected(m->tcfm_dev, 1); +	/* We could be called either in a RCU callback or with RTNL lock held. 
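+	 * Either way, the device notifier may walk the list
+	 * concurrently, so a dedicated spinlock rather than RTNL
+	 * protects it; the walk in mirred_device_event() pairs with us
+	 * roughly like this:
+	 *
+	 *	spin_lock_bh(&mirred_list_lock);
+	 *	list_for_each_entry(m, &mirred_list, tcfm_list)
+	 *		...
+	 *	spin_unlock_bh(&mirred_list_lock);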
*/ +	spin_lock_bh(&mirred_list_lock);  	list_del(&m->tcfm_list); +	spin_unlock_bh(&mirred_list_lock);  	if (dev)  		dev_put(dev);  } @@ -103,10 +107,10 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,  	} else {  		if (bind)  			return 0; -		if (!ovr) { -			tcf_hash_release(a, bind); + +		tcf_hash_release(a, bind); +		if (!ovr)  			return -EEXIST; -		}  	}  	m = to_mirred(a); @@ -123,7 +127,9 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,  	}  	if (ret == ACT_P_CREATED) { +		spin_lock_bh(&mirred_list_lock);  		list_add(&m->tcfm_list, &mirred_list); +		spin_unlock_bh(&mirred_list_lock);  		tcf_hash_insert(a);  	} @@ -173,6 +179,7 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,  	skb2->skb_iif = skb->dev->ifindex;  	skb2->dev = dev; +	skb_sender_cpu_clear(skb2);  	err = dev_queue_xmit(skb2);  	if (err) { @@ -221,7 +228,8 @@ static int mirred_device_event(struct notifier_block *unused,  	struct tcf_mirred *m;  	ASSERT_RTNL(); -	if (event == NETDEV_UNREGISTER) +	if (event == NETDEV_UNREGISTER) { +		spin_lock_bh(&mirred_list_lock);  		list_for_each_entry(m, &mirred_list, tcfm_list) {  			if (rcu_access_pointer(m->tcfm_dev) == dev) {  				dev_put(dev); @@ -231,6 +239,8 @@ static int mirred_device_event(struct notifier_block *unused,  				RCU_INIT_POINTER(m->tcfm_dev, NULL);  			}  		} +		spin_unlock_bh(&mirred_list_lock); +	}  	return NOTIFY_DONE;  } diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 715e01e5910a..f23a3b68bba6 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -33,7 +33,6 @@  struct fw_head {  	u32			mask; -	bool			mask_set;  	struct fw_filter __rcu	*ht[HTSIZE];  	struct rcu_head		rcu;  }; @@ -84,7 +83,7 @@ static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp,  			}  		}  	} else { -		/* old method */ +		/* Old method: classify the packet using its skb mark. */  		if (id && (TC_H_MAJ(id) == 0 ||  			   !(TC_H_MAJ(id ^ tp->q->handle)))) {  			res->classid = id; @@ -114,14 +113,9 @@ static unsigned long fw_get(struct tcf_proto *tp, u32 handle)  static int fw_init(struct tcf_proto *tp)  { -	struct fw_head *head; - -	head = kzalloc(sizeof(struct fw_head), GFP_KERNEL); -	if (head == NULL) -		return -ENOBUFS; - -	head->mask_set = false; -	rcu_assign_pointer(tp->root, head); +	/* We don't allocate fw_head here, because in the old method +	 * we don't need it at all. +	 */  	return 0;  } @@ -252,7 +246,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,  	int err;  	if (!opt) -		return handle ? -EINVAL : 0; +		return handle ? -EINVAL : 0; /* Succeed if it is old method. 
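+		 * That is, classification purely by skb mark, with no
+		 * options to parse.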
*/  	err = nla_parse_nested(tb, TCA_FW_MAX, opt, fw_policy);  	if (err < 0) @@ -302,11 +296,17 @@ static int fw_change(struct net *net, struct sk_buff *in_skb,  	if (!handle)  		return -EINVAL; -	if (!head->mask_set) { -		head->mask = 0xFFFFFFFF; +	if (!head) { +		u32 mask = 0xFFFFFFFF;  		if (tb[TCA_FW_MASK]) -			head->mask = nla_get_u32(tb[TCA_FW_MASK]); -		head->mask_set = true; +			mask = nla_get_u32(tb[TCA_FW_MASK]); + +		head = kzalloc(sizeof(*head), GFP_KERNEL); +		if (!head) +			return -ENOBUFS; +		head->mask = mask; + +		rcu_assign_pointer(tp->root, head);  	}  	f = kzalloc(sizeof(struct fw_filter), GFP_KERNEL); diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 9d15cb6b8cb1..86b04e31e60b 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -368,6 +368,15 @@ static unsigned int hhf_drop(struct Qdisc *sch)  	return bucket - q->buckets;  } +static unsigned int hhf_qdisc_drop(struct Qdisc *sch) +{ +	unsigned int prev_backlog; + +	prev_backlog = sch->qstats.backlog; +	hhf_drop(sch); +	return prev_backlog - sch->qstats.backlog; +} +  static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)  {  	struct hhf_sched_data *q = qdisc_priv(sch); @@ -696,7 +705,7 @@ static struct Qdisc_ops hhf_qdisc_ops __read_mostly = {  	.enqueue	=	hhf_enqueue,  	.dequeue	=	hhf_dequeue,  	.peek		=	qdisc_peek_dequeued, -	.drop		=	hhf_drop, +	.drop		=	hhf_qdisc_drop,  	.init		=	hhf_init,  	.reset		=	hhf_reset,  	.destroy	=	hhf_destroy, diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 197c3f59ecbf..b00f1f9611d6 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1208,20 +1208,22 @@ void sctp_assoc_update(struct sctp_association *asoc,   *   within this document.   *   * Our basic strategy is to round-robin transports in priorities - * according to sctp_state_prio_map[] e.g., if no such + * according to sctp_trans_score() e.g., if no such   * transport with state SCTP_ACTIVE exists, round-robin through   * SCTP_UNKNOWN, etc. You get the picture.   */ -static const u8 sctp_trans_state_to_prio_map[] = { -	[SCTP_ACTIVE]	= 3,	/* best case */ -	[SCTP_UNKNOWN]	= 2, -	[SCTP_PF]	= 1, -	[SCTP_INACTIVE] = 0,	/* worst case */ -}; -  static u8 sctp_trans_score(const struct sctp_transport *trans)  { -	return sctp_trans_state_to_prio_map[trans->state]; +	switch (trans->state) { +	case SCTP_ACTIVE: +		return 3;	/* best case */ +	case SCTP_UNKNOWN: +		return 2; +	case SCTP_PF: +		return 1; +	default: /* case SCTP_INACTIVE */ +		return 0;	/* worst case */ +	}  }  static struct sctp_transport *sctp_trans_elect_tie(struct sctp_transport *trans1, diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index b7143337e4fa..3d9ea9a48289 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1186,7 +1186,7 @@ static void sctp_v4_del_protocol(void)  	unregister_inetaddr_notifier(&sctp_inetaddr_notifier);  } -static int __net_init sctp_net_init(struct net *net) +static int __net_init sctp_defaults_init(struct net *net)  {  	int status; @@ -1279,12 +1279,6 @@ static int __net_init sctp_net_init(struct net *net)  	sctp_dbg_objcnt_init(net); -	/* Initialize the control inode/socket for handling OOTB packets.  */ -	if ((status = sctp_ctl_sock_init(net))) { -		pr_err("Failed to initialize the SCTP control sock\n"); -		goto err_ctl_sock_init; -	} -  	/* Initialize the local address list. 
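 	 * (The OOTB control socket is deliberately no longer created
 	 * here; it moved to the separate sctp_ctrlsock_ops pernet
 	 * subsystem, which sctp_init() registers only after the
 	 * protosws.)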
*/  	INIT_LIST_HEAD(&net->sctp.local_addr_list);  	spin_lock_init(&net->sctp.local_addr_lock); @@ -1300,9 +1294,6 @@ static int __net_init sctp_net_init(struct net *net)  	return 0; -err_ctl_sock_init: -	sctp_dbg_objcnt_exit(net); -	sctp_proc_exit(net);  err_init_proc:  	cleanup_sctp_mibs(net);  err_init_mibs: @@ -1311,15 +1302,12 @@ err_sysctl_register:  	return status;  } -static void __net_exit sctp_net_exit(struct net *net) +static void __net_exit sctp_defaults_exit(struct net *net)  {  	/* Free the local address list */  	sctp_free_addr_wq(net);  	sctp_free_local_addr_list(net); -	/* Free the control endpoint.  */ -	inet_ctl_sock_destroy(net->sctp.ctl_sock); -  	sctp_dbg_objcnt_exit(net);  	sctp_proc_exit(net); @@ -1327,9 +1315,32 @@ static void __net_exit sctp_net_exit(struct net *net)  	sctp_sysctl_net_unregister(net);  } -static struct pernet_operations sctp_net_ops = { -	.init = sctp_net_init, -	.exit = sctp_net_exit, +static struct pernet_operations sctp_defaults_ops = { +	.init = sctp_defaults_init, +	.exit = sctp_defaults_exit, +}; + +static int __net_init sctp_ctrlsock_init(struct net *net) +{ +	int status; + +	/* Initialize the control inode/socket for handling OOTB packets.  */ +	status = sctp_ctl_sock_init(net); +	if (status) +		pr_err("Failed to initialize the SCTP control sock\n"); + +	return status; +} + +static void __net_init sctp_ctrlsock_exit(struct net *net) +{ +	/* Free the control endpoint.  */ +	inet_ctl_sock_destroy(net->sctp.ctl_sock); +} + +static struct pernet_operations sctp_ctrlsock_ops = { +	.init = sctp_ctrlsock_init, +	.exit = sctp_ctrlsock_exit,  };  /* Initialize the universe into something sensible.  */ @@ -1462,8 +1473,11 @@ static __init int sctp_init(void)  	sctp_v4_pf_init();  	sctp_v6_pf_init(); -	status = sctp_v4_protosw_init(); +	status = register_pernet_subsys(&sctp_defaults_ops); +	if (status) +		goto err_register_defaults; +	status = sctp_v4_protosw_init();  	if (status)  		goto err_protosw_init; @@ -1471,9 +1485,9 @@ static __init int sctp_init(void)  	if (status)  		goto err_v6_protosw_init; -	status = register_pernet_subsys(&sctp_net_ops); +	status = register_pernet_subsys(&sctp_ctrlsock_ops);  	if (status) -		goto err_register_pernet_subsys; +		goto err_register_ctrlsock;  	status = sctp_v4_add_protocol();  	if (status) @@ -1489,12 +1503,14 @@ out:  err_v6_add_protocol:  	sctp_v4_del_protocol();  err_add_protocol: -	unregister_pernet_subsys(&sctp_net_ops); -err_register_pernet_subsys: +	unregister_pernet_subsys(&sctp_ctrlsock_ops); +err_register_ctrlsock:  	sctp_v6_protosw_exit();  err_v6_protosw_init:  	sctp_v4_protosw_exit();  err_protosw_init: +	unregister_pernet_subsys(&sctp_defaults_ops); +err_register_defaults:  	sctp_v4_pf_exit();  	sctp_v6_pf_exit();  	sctp_sysctl_unregister(); @@ -1527,12 +1543,14 @@ static __exit void sctp_exit(void)  	sctp_v6_del_protocol();  	sctp_v4_del_protocol(); -	unregister_pernet_subsys(&sctp_net_ops); +	unregister_pernet_subsys(&sctp_ctrlsock_ops);  	/* Free protosw registrations */  	sctp_v6_protosw_exit();  	sctp_v4_protosw_exit(); +	unregister_pernet_subsys(&sctp_defaults_ops); +  	/* Unregister with socket layer. 
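 	 * Teardown mirrors sctp_init() in reverse: the ctrlsock pernet
 	 * ops were unregistered first, then the protosws, then the
 	 * defaults pernet ops above.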
*/  	sctp_v6_pf_exit();  	sctp_v4_pf_exit(); diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 35df1266bf07..6098d4c42fa9 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -244,12 +244,13 @@ void sctp_generate_t3_rtx_event(unsigned long peer)  	int error;  	struct sctp_transport *transport = (struct sctp_transport *) peer;  	struct sctp_association *asoc = transport->asoc; -	struct net *net = sock_net(asoc->base.sk); +	struct sock *sk = asoc->base.sk; +	struct net *net = sock_net(sk);  	/* Check whether a task is in the sock.  */ -	bh_lock_sock(asoc->base.sk); -	if (sock_owned_by_user(asoc->base.sk)) { +	bh_lock_sock(sk); +	if (sock_owned_by_user(sk)) {  		pr_debug("%s: sock is busy\n", __func__);  		/* Try again later.  */ @@ -272,10 +273,10 @@ void sctp_generate_t3_rtx_event(unsigned long peer)  			   transport, GFP_ATOMIC);  	if (error) -		asoc->base.sk->sk_err = -error; +		sk->sk_err = -error;  out_unlock: -	bh_unlock_sock(asoc->base.sk); +	bh_unlock_sock(sk);  	sctp_transport_put(transport);  } @@ -285,11 +286,12 @@ out_unlock:  static void sctp_generate_timeout_event(struct sctp_association *asoc,  					sctp_event_timeout_t timeout_type)  { -	struct net *net = sock_net(asoc->base.sk); +	struct sock *sk = asoc->base.sk; +	struct net *net = sock_net(sk);  	int error = 0; -	bh_lock_sock(asoc->base.sk); -	if (sock_owned_by_user(asoc->base.sk)) { +	bh_lock_sock(sk); +	if (sock_owned_by_user(sk)) {  		pr_debug("%s: sock is busy: timer %d\n", __func__,  			 timeout_type); @@ -312,10 +314,10 @@ static void sctp_generate_timeout_event(struct sctp_association *asoc,  			   (void *)timeout_type, GFP_ATOMIC);  	if (error) -		asoc->base.sk->sk_err = -error; +		sk->sk_err = -error;  out_unlock: -	bh_unlock_sock(asoc->base.sk); +	bh_unlock_sock(sk);  	sctp_association_put(asoc);  } @@ -365,10 +367,11 @@ void sctp_generate_heartbeat_event(unsigned long data)  	int error = 0;  	struct sctp_transport *transport = (struct sctp_transport *) data;  	struct sctp_association *asoc = transport->asoc; -	struct net *net = sock_net(asoc->base.sk); +	struct sock *sk = asoc->base.sk; +	struct net *net = sock_net(sk); -	bh_lock_sock(asoc->base.sk); -	if (sock_owned_by_user(asoc->base.sk)) { +	bh_lock_sock(sk); +	if (sock_owned_by_user(sk)) {  		pr_debug("%s: sock is busy\n", __func__);  		/* Try again later.  */ @@ -388,11 +391,11 @@ void sctp_generate_heartbeat_event(unsigned long data)  			   asoc->state, asoc->ep, asoc,  			   transport, GFP_ATOMIC); -	 if (error) -		 asoc->base.sk->sk_err = -error; +	if (error) +		sk->sk_err = -error;  out_unlock: -	bh_unlock_sock(asoc->base.sk); +	bh_unlock_sock(sk);  	sctp_transport_put(transport);  } @@ -403,10 +406,11 @@ void sctp_generate_proto_unreach_event(unsigned long data)  {  	struct sctp_transport *transport = (struct sctp_transport *) data;  	struct sctp_association *asoc = transport->asoc; -	struct net *net = sock_net(asoc->base.sk); +	struct sock *sk = asoc->base.sk; +	struct net *net = sock_net(sk); -	bh_lock_sock(asoc->base.sk); -	if (sock_owned_by_user(asoc->base.sk)) { +	bh_lock_sock(sk); +	if (sock_owned_by_user(sk)) {  		pr_debug("%s: sock is busy\n", __func__);  		/* Try again later.  
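 		 * The handlers now lock and unlock through one cached
 		 * 'sk' pointer, so that if the association were migrated
 		 * to a new socket in between, the unlock would still
 		 * pair with the socket actually locked, i.e.:
 		 *
 		 *	struct sock *sk = asoc->base.sk;
 		 *	bh_lock_sock(sk);
 		 *	...
 		 *	bh_unlock_sock(sk);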
*/ @@ -427,7 +431,7 @@ void sctp_generate_proto_unreach_event(unsigned long data)  		   asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC);  out_unlock: -	bh_unlock_sock(asoc->base.sk); +	bh_unlock_sock(sk);  	sctp_association_put(asoc);  } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index b140c092d226..f14f24ee9983 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -297,7 +297,7 @@ static int rpc_complete_task(struct rpc_task *task)  	clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate);  	ret = atomic_dec_and_test(&task->tk_count);  	if (waitqueue_active(wq)) -		__wake_up_locked_key(wq, TASK_NORMAL, 1, &k); +		__wake_up_locked_key(wq, TASK_NORMAL, &k);  	spin_unlock_irqrestore(&wq->lock, flags);  	return ret;  } @@ -1092,14 +1092,10 @@ void  rpc_destroy_mempool(void)  {  	rpciod_stop(); -	if (rpc_buffer_mempool) -		mempool_destroy(rpc_buffer_mempool); -	if (rpc_task_mempool) -		mempool_destroy(rpc_task_mempool); -	if (rpc_task_slabp) -		kmem_cache_destroy(rpc_task_slabp); -	if (rpc_buffer_slabp) -		kmem_cache_destroy(rpc_buffer_slabp); +	mempool_destroy(rpc_buffer_mempool); +	mempool_destroy(rpc_task_mempool); +	kmem_cache_destroy(rpc_task_slabp); +	kmem_cache_destroy(rpc_buffer_slabp);  	rpc_destroy_wait_queue(&delay_queue);  } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index ab5dd621ae0c..2e98f4a243e5 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -614,6 +614,7 @@ static void xprt_autoclose(struct work_struct *work)  	clear_bit(XPRT_CLOSE_WAIT, &xprt->state);  	xprt->ops->close(xprt);  	xprt_release_write(xprt, NULL); +	wake_up_bit(&xprt->state, XPRT_LOCKED);  }  /** @@ -723,6 +724,7 @@ void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie)  	xprt->ops->release_xprt(xprt, NULL);  out:  	spin_unlock_bh(&xprt->transport_lock); +	wake_up_bit(&xprt->state, XPRT_LOCKED);  }  /** @@ -1394,6 +1396,10 @@ out:  static void xprt_destroy(struct rpc_xprt *xprt)  {  	dprintk("RPC:       destroying transport %p\n", xprt); + +	/* Exclude transport connect/disconnect handlers */ +	wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_UNINTERRUPTIBLE); +  	del_timer_sync(&xprt->timer);  	rpc_xprt_debugfs_unregister(xprt); diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index cb25c89da623..f1e8dafbd507 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -39,25 +39,6 @@ static int  fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,  	    struct rpcrdma_create_data_internal *cdata)  { -	struct ib_device_attr *devattr = &ia->ri_devattr; -	struct ib_mr *mr; - -	/* Obtain an lkey to use for the regbufs, which are -	 * protected from remote access. -	 */ -	if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) { -		ia->ri_dma_lkey = ia->ri_device->local_dma_lkey; -	} else { -		mr = ib_get_dma_mr(ia->ri_pd, IB_ACCESS_LOCAL_WRITE); -		if (IS_ERR(mr)) { -			pr_err("%s: ib_get_dma_mr for failed with %lX\n", -			       __func__, PTR_ERR(mr)); -			return -ENOMEM; -		} -		ia->ri_dma_lkey = ia->ri_dma_mr->lkey; -		ia->ri_dma_mr = mr; -	} -  	return 0;  } diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index d6653f5d0830..5318951b3b53 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -189,11 +189,6 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,  	struct ib_device_attr *devattr = &ia->ri_devattr;  	int depth, delta; -	/* Obtain an lkey to use for the regbufs, which are -	 * protected from remote access. 
-	 */ -	ia->ri_dma_lkey = ia->ri_device->local_dma_lkey; -  	ia->ri_max_frmr_depth =  			min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,  			      devattr->max_fast_reg_page_list_len); diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c index 72cf8b15bbb4..617b76f22154 100644 --- a/net/sunrpc/xprtrdma/physical_ops.c +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -23,7 +23,6 @@ static int  physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,  		 struct rpcrdma_create_data_internal *cdata)  { -	struct ib_device_attr *devattr = &ia->ri_devattr;  	struct ib_mr *mr;  	/* Obtain an rkey to use for RPC data payloads. @@ -37,15 +36,8 @@ physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,  		       __func__, PTR_ERR(mr));  		return -ENOMEM;  	} -	ia->ri_dma_mr = mr; - -	/* Obtain an lkey to use for regbufs. -	 */ -	if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) -		ia->ri_dma_lkey = ia->ri_device->local_dma_lkey; -	else -		ia->ri_dma_lkey = ia->ri_dma_mr->lkey; +	ia->ri_dma_mr = mr;  	return 0;  } diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index cb5174284074..f0c3ff67ca98 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -136,7 +136,8 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,  	ctxt->direction = DMA_FROM_DEVICE;  	ctxt->read_hdr = head;  	pages_needed = min_t(int, pages_needed, xprt->sc_max_sge_rd); -	read = min_t(int, pages_needed << PAGE_SHIFT, rs_length); +	read = min_t(int, (pages_needed << PAGE_SHIFT) - *page_offset, +		     rs_length);  	for (pno = 0; pno < pages_needed; pno++) {  		int len = min_t(int, rs_length, PAGE_SIZE - pg_off); @@ -235,7 +236,8 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,  	ctxt->direction = DMA_FROM_DEVICE;  	ctxt->frmr = frmr;  	pages_needed = min_t(int, pages_needed, xprt->sc_frmr_pg_list_len); -	read = min_t(int, pages_needed << PAGE_SHIFT, rs_length); +	read = min_t(int, (pages_needed << PAGE_SHIFT) - *page_offset, +		     rs_length);  	frmr->kva = page_address(rqstp->rq_arg.pages[pg_no]);  	frmr->direction = DMA_FROM_DEVICE; @@ -531,7 +533,7 @@ static int rdma_read_complete(struct svc_rqst *rqstp,  	rqstp->rq_arg.page_base = head->arg.page_base;  	/* rq_respages starts after the last arg page */ -	rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; +	rqstp->rq_respages = &rqstp->rq_pages[page_no];  	rqstp->rq_next_page = rqstp->rq_respages + 1;  	/* Rebuild rq_arg head and tail. 
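 	 * Note the fix above: rq_respages is now computed from the
 	 * rqstp's own rq_pages array rather than from rq_arg.pages,
 	 * which need not point at the start of that array.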
*/ diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 64443eb754ad..41e452bc580c 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -270,8 +270,8 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)  	xprt_clear_connected(xprt); -	rpcrdma_buffer_destroy(&r_xprt->rx_buf);  	rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); +	rpcrdma_buffer_destroy(&r_xprt->rx_buf);  	rpcrdma_ia_close(&r_xprt->rx_ia);  	xprt_rdma_free_addresses(xprt); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 682996779970..5502d4dade74 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -543,11 +543,8 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)  	}  	if (memreg == RPCRDMA_FRMR) { -		/* Requires both frmr reg and local dma lkey */ -		if (((devattr->device_cap_flags & -		     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != -		    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) || -		      (devattr->max_fast_reg_page_list_len == 0)) { +		if (!(devattr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) || +		    (devattr->max_fast_reg_page_list_len == 0)) {  			dprintk("RPC:       %s: FRMR registration "  				"not supported by HCA\n", __func__);  			memreg = RPCRDMA_MTHCAFMR; @@ -557,6 +554,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)  		if (!ia->ri_device->alloc_fmr) {  			dprintk("RPC:       %s: MTHCAFMR registration "  				"not supported by HCA\n", __func__); +			rc = -EINVAL;  			goto out3;  		}  	} @@ -755,19 +753,22 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)  	cancel_delayed_work_sync(&ep->rep_connect_worker); -	if (ia->ri_id->qp) { +	if (ia->ri_id->qp)  		rpcrdma_ep_disconnect(ep, ia); + +	rpcrdma_clean_cq(ep->rep_attr.recv_cq); +	rpcrdma_clean_cq(ep->rep_attr.send_cq); + +	if (ia->ri_id->qp) {  		rdma_destroy_qp(ia->ri_id);  		ia->ri_id->qp = NULL;  	} -	rpcrdma_clean_cq(ep->rep_attr.recv_cq);  	rc = ib_destroy_cq(ep->rep_attr.recv_cq);  	if (rc)  		dprintk("RPC:       %s: ib_destroy_cq returned %i\n",  			__func__, rc); -	rpcrdma_clean_cq(ep->rep_attr.send_cq);  	rc = ib_destroy_cq(ep->rep_attr.send_cq);  	if (rc)  		dprintk("RPC:       %s: ib_destroy_cq returned %i\n", @@ -1252,7 +1253,7 @@ rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)  		goto out_free;  	iov->length = size; -	iov->lkey = ia->ri_dma_lkey; +	iov->lkey = ia->ri_pd->local_dma_lkey;  	rb->rg_size = size;  	rb->rg_owner = NULL;  	return rb; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 02512221b8bc..c09414e6f91b 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -65,7 +65,6 @@ struct rpcrdma_ia {  	struct rdma_cm_id 	*ri_id;  	struct ib_pd		*ri_pd;  	struct ib_mr		*ri_dma_mr; -	u32			ri_dma_lkey;  	struct completion	ri_done;  	int			ri_async_rc;  	unsigned int		ri_max_frmr_depth; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 7be90bc1a7c2..1a85e0ed0b48 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -777,7 +777,6 @@ static void xs_sock_mark_closed(struct rpc_xprt *xprt)  	xs_sock_reset_connection_flags(xprt);  	/* Mark transport as closed and wake up all pending tasks */  	xprt_disconnect_done(xprt); -	xprt_force_disconnect(xprt);  }  /** @@ -881,8 +880,11 @@ static void xs_xprt_free(struct rpc_xprt *xprt)   */  static void xs_destroy(struct rpc_xprt *xprt)  { +	struct sock_xprt 
*transport = container_of(xprt, +			struct sock_xprt, xprt);  dprintk("RPC:       xs_destroy xprt %p\n", xprt); +	cancel_delayed_work_sync(&transport->connect_worker);  xs_close(xprt);  xs_xprt_free(xprt);  module_put(THIS_MODULE); @@ -1435,6 +1437,7 @@ out:  static void xs_tcp_state_change(struct sock *sk)  {  	struct rpc_xprt *xprt; +	struct sock_xprt *transport;  	read_lock_bh(&sk->sk_callback_lock);  	if (!(xprt = xprt_from_sock(sk))) @@ -1446,13 +1449,12 @@ static void xs_tcp_state_change(struct sock *sk)  			sock_flag(sk, SOCK_ZAPPED),  			sk->sk_shutdown); +	transport = container_of(xprt, struct sock_xprt, xprt);  	trace_rpc_socket_state_change(xprt, sk->sk_socket);  	switch (sk->sk_state) {  	case TCP_ESTABLISHED:  		spin_lock(&xprt->transport_lock);  		if (!xprt_test_and_set_connected(xprt)) { -			struct sock_xprt *transport = container_of(xprt, -					struct sock_xprt, xprt);  			/* Reset TCP record info */  			transport->tcp_offset = 0; @@ -1461,6 +1463,8 @@  			transport->tcp_flags =  				TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID;  			xprt->connect_cookie++; +			clear_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); +			xprt_clear_connecting(xprt);  			xprt_wake_pending_tasks(xprt, -EAGAIN);  		} @@ -1496,6 +1500,9 @@  		smp_mb__after_atomic();  		break;  	case TCP_CLOSE: +		if (test_and_clear_bit(XPRT_SOCK_CONNECTING, +					&transport->sock_state)) +			xprt_clear_connecting(xprt);  		xs_sock_mark_closed(xprt);  }   out: @@ -2179,6 +2186,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)  	/* Tell the socket layer to start connecting... */  	xprt->stat.connect_count++;  	xprt->stat.connect_start = jiffies; +	set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state);  	ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK);  	switch (ret) {  	case 0: @@ -2240,7 +2248,6 @@ static void xs_tcp_setup_socket(struct work_struct *work)  	case -EINPROGRESS:  	case -EALREADY:  		xprt_unlock_connect(xprt, transport); -		xprt_clear_connecting(xprt);  		return;  	case -EINVAL:  		/* Happens, for instance, if the user specified a link diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index fda38f830a10..77f5d17e2612 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -16,6 +16,7 @@  #include <linux/notifier.h>  #include <linux/netdevice.h>  #include <linux/if_bridge.h> +#include <linux/if_vlan.h>  #include <net/ip_fib.h>  #include <net/switchdev.h> @@ -634,6 +635,8 @@ static int switchdev_port_br_afspec(struct net_device *dev,  		if (nla_len(attr) != sizeof(struct bridge_vlan_info))  			return -EINVAL;  		vinfo = nla_data(attr); +		if (!vinfo->vid || vinfo->vid >= VLAN_VID_MASK) +			return -EINVAL;  		vlan->flags = vinfo->flags;  		if (vinfo->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {  			if (vlan->vid_begin) diff --git a/net/sysctl_net.c b/net/sysctl_net.c index e7000be321b0..ed98c1fc3de1 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -94,10 +94,14 @@ __init int net_sysctl_init(void)  		goto out;  	ret = register_pernet_subsys(&sysctl_pernet_ops);  	if (ret) -		goto out; +		goto out1;  	register_sysctl_root(&net_sysctl_root);  out:  	return ret; +out1: +	unregister_sysctl_table(net_header); +	net_header = NULL; +	goto out; +}  struct ctl_table_header *register_net_sysctl(struct net *net,
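The net_sysctl_init() hunk just above is the usual kernel unwind idiom: when a later registration fails, control jumps to a label that rolls back the earlier step and then leaves through the common exit. A self-contained sketch of the shape of that control flow, with hypothetical register_a()/register_b() standing in for the two sysctl registrations:

#include <stdio.h>

/* Hypothetical setup steps; 0 means success. */
static int register_a(void) { return 0; }
static int register_b(void) { return -1; /* simulated failure */ }
static void unregister_a(void) { puts("rolled back a"); }

static int init_both(void)
{
	int ret;

	ret = register_a();
	if (ret)
		goto out;
	ret = register_b();
	if (ret)
		goto out1;	/* undo a, then take the common exit */
out:
	return ret;
out1:
	unregister_a();
	goto out;
}

int main(void)
{
	printf("init_both() = %d\n", init_both());
	return 0;
}

diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 41042de3ae9b..eadba62afa85 100644 --- a/net/tipc/bcast.c +++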
b/net/tipc/bcast.c @@ -42,7 +42,8 @@  #include "core.h"  #define	MAX_PKT_DEFAULT_MCAST	1500	/* bcast link max packet size (fixed) */ -#define	BCLINK_WIN_DEFAULT	20	/* bcast link window size (default) */ +#define	BCLINK_WIN_DEFAULT	50	/* bcast link window size (default) */ +#define	BCLINK_WIN_MIN	        32	/* bcast minimum link window size */  const char tipc_bclink_name[] = "broadcast-link"; @@ -908,9 +909,10 @@ int tipc_bclink_set_queue_limits(struct net *net, u32 limit)  	if (!bcl)  		return -ENOPROTOOPT; -	if ((limit < TIPC_MIN_LINK_WIN) || (limit > TIPC_MAX_LINK_WIN)) +	if (limit < BCLINK_WIN_MIN) +		limit = BCLINK_WIN_MIN; +	if (limit > TIPC_MAX_LINK_WIN)  		return -EINVAL; -  	tipc_bclink_lock(net);  	tipc_link_set_queue_limits(bcl, limit);  	tipc_bclink_unlock(net); diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 562c926a51cc..5f73450159df 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -121,7 +121,7 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf)  {  	struct sk_buff *head = *headbuf;  	struct sk_buff *frag = *buf; -	struct sk_buff *tail; +	struct sk_buff *tail = NULL;  	struct tipc_msg *msg;  	u32 fragid;  	int delta; @@ -141,9 +141,15 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf)  		if (unlikely(skb_unclone(frag, GFP_ATOMIC)))  			goto err;  		head = *headbuf = frag; -		skb_frag_list_init(head); -		TIPC_SKB_CB(head)->tail = NULL;  		*buf = NULL; +		TIPC_SKB_CB(head)->tail = NULL; +		if (skb_is_nonlinear(head)) { +			skb_walk_frags(head, tail) { +				TIPC_SKB_CB(head)->tail = tail; +			} +		} else { +			skb_frag_list_init(head); +		}  		return 0;  	} @@ -539,6 +545,7 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err)  	*err = -TIPC_ERR_NO_NAME;  	if (skb_linearize(skb))  		return false; +	msg = buf_msg(skb);  	if (msg_reroute_cnt(msg))  		return false;  	dnode = addr_domain(net, msg_lookup_scope(msg)); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index a82c5848d4bc..5351a3f97e8e 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -357,7 +357,7 @@ static inline u32 msg_importance(struct tipc_msg *m)  	if (likely((usr <= TIPC_CRITICAL_IMPORTANCE) && !msg_errcode(m)))  		return usr;  	if ((usr == MSG_FRAGMENTER) || (usr == MSG_BUNDLER)) -		return msg_bits(m, 5, 13, 0x7); +		return msg_bits(m, 9, 0, 0x7);  	return TIPC_SYSTEM_IMPORTANCE;  } @@ -366,7 +366,7 @@ static inline void msg_set_importance(struct tipc_msg *m, u32 i)  	int usr = msg_user(m);  	if (likely((usr == MSG_FRAGMENTER) || (usr == MSG_BUNDLER))) -		msg_set_bits(m, 5, 13, 0x7, i); +		msg_set_bits(m, 9, 0, 0x7, i);  	else if (i < TIPC_SYSTEM_IMPORTANCE)  		msg_set_user(m, i);  	else diff --git a/net/tipc/node.c b/net/tipc/node.c index 703875fd6cde..2c32a83037a3 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1116,7 +1116,7 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,  	}  	/* Ignore duplicate packets */ -	if (less(oseqno, rcv_nxt)) +	if ((usr != LINK_PROTOCOL) && less(oseqno, rcv_nxt))  		return true;  	/* Initiate or update failover mode if applicable */ @@ -1146,8 +1146,8 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,  	if (!pl || !tipc_link_is_up(pl))  		return true; -	/* Initiate or update synch mode if applicable */ -	if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG)) { +	/* Initiate synch mode if applicable */ +	if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG) && (oseqno == 1)) {  		syncpt = iseqno + exp_pkts - 1;  		if (!tipc_link_is_up(l)) {  			
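			tipc_link_fsm_evt(l, LINK_ESTABLISH_EVT);

The msg.h hunk above moves the 3-bit importance field of FRAGMENTER and BUNDLER headers from word 5, bit 13 to word 9, bit 0. A simplified sketch of the mask-and-shift accessors this relies on, using a plain 32-bit word array as a stand-in for struct tipc_msg (the real helpers additionally convert to and from network byte order):

#include <stdint.h>
#include <stdio.h>

/* Read a field of width 'mask' at bit 'pos' of header word 'w'. */
static uint32_t msg_bits(const uint32_t *hdr, int w, int pos, uint32_t mask)
{
	return (hdr[w] >> pos) & mask;
}

/* Clear the field, then set its new value. */
static void msg_set_bits(uint32_t *hdr, int w, int pos, uint32_t mask,
			 uint32_t val)
{
	hdr[w] &= ~(mask << pos);
	hdr[w] |= (val & mask) << pos;
}

int main(void)
{
	uint32_t hdr[10] = { 0 };

	/* Importance now lives in word 9, bits 0-2. */
	msg_set_bits(hdr, 9, 0, 0x7, 4);
	printf("importance = %u\n", msg_bits(hdr, 9, 0, 0x7));
	return 0;
}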
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index c170d3138953..6e648d90297a 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -52,6 +52,8 @@  /* IANA assigned UDP port */  #define UDP_PORT_DEFAULT	6118 +#define UDP_MIN_HEADROOM        28 +  static const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = {  	[TIPC_NLA_UDP_UNSPEC]	= {.type = NLA_UNSPEC},  	[TIPC_NLA_UDP_LOCAL]	= {.type = NLA_BINARY, @@ -156,6 +158,9 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,  	struct sk_buff *clone;  	struct rtable *rt; +	if (skb_headroom(skb) < UDP_MIN_HEADROOM) +		pskb_expand_head(skb, UDP_MIN_HEADROOM, 0, GFP_ATOMIC); +  	clone = skb_clone(skb, GFP_ATOMIC);  	skb_set_inner_protocol(clone, htons(ETH_P_TIPC));  	ub = rcu_dereference_rtnl(b->media_ptr); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 03ee4d359f6a..94f658235fb4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2064,6 +2064,11 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state)  		goto out;  	} +	if (flags & MSG_PEEK) +		skip = sk_peek_offset(sk, flags); +	else +		skip = 0; +  	do {  		int chunk;  		struct sk_buff *skb, *last; @@ -2112,7 +2117,6 @@ unlock:  			break;  		} -		skip = sk_peek_offset(sk, flags);  		while (skip >= unix_skb_len(skb)) {  			skip -= unix_skb_len(skb);  			last = skb; @@ -2181,6 +2185,17 @@ unlock:  			sk_peek_offset_fwd(sk, chunk); +			if (UNIXCB(skb).fp) +				break; +			skip = 0; +			last = skb; +			last_len = skb->len; +			unix_state_lock(sk); +			skb = skb_peek_next(skb, &sk->sk_receive_queue); +			if (skb) +				goto again; +			unix_state_unlock(sk);  			break;  		}  	} while (size); diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index df5fc6b340f1..00e8a349aabc 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1948,13 +1948,13 @@ int __vsock_core_init(const struct vsock_transport *t, struct module *owner)  	err = misc_register(&vsock_device);  	if (err) {  		pr_err("Failed to register misc device\n"); -		return -ENOENT; +		goto err_reset_transport;  	}  	err = proto_register(&vsock_proto, 1);	/* we want our slab */  	if (err) {  		pr_err("Cannot register vsock protocol\n"); -		goto err_misc_deregister; +		goto err_deregister_misc;  	}  	err = sock_register(&vsock_family_ops); @@ -1969,8 +1969,9 @@ int __vsock_core_init(const struct vsock_transport *t, struct module *owner)  err_unregister_proto:  	proto_unregister(&vsock_proto); -err_misc_deregister: +err_deregister_misc:  	misc_deregister(&vsock_device); +err_reset_transport:  	transport = NULL;  err_busy:  	mutex_unlock(&vsock_register_mutex); diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 1f63daff3965..7555cad83a75 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -40,13 +40,11 @@  static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg);  static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg); -static void vmci_transport_peer_attach_cb(u32 sub_id, -					  const struct vmci_event_data *ed, -					  void *client_data);  static void vmci_transport_peer_detach_cb(u32 sub_id,  					  const struct vmci_event_data *ed,  					  void *client_data);  static void vmci_transport_recv_pkt_work(struct work_struct *work); +static void vmci_transport_cleanup(struct work_struct *work);  static int vmci_transport_recv_listen(struct sock *sk,  				      struct 
vmci_transport_packet *pkt);  static int vmci_transport_recv_connecting_server( @@ -75,6 +73,10 @@ struct vmci_transport_recv_pkt_info {  	struct vmci_transport_packet pkt;  }; +static LIST_HEAD(vmci_transport_cleanup_list); +static DEFINE_SPINLOCK(vmci_transport_cleanup_lock); +static DECLARE_WORK(vmci_transport_cleanup_work, vmci_transport_cleanup); +  static struct vmci_handle vmci_transport_stream_handle = { VMCI_INVALID_ID,  							   VMCI_INVALID_ID };  static u32 vmci_transport_qp_resumed_sub_id = VMCI_INVALID_ID; @@ -791,44 +793,6 @@ out:  	return err;  } -static void vmci_transport_peer_attach_cb(u32 sub_id, -					  const struct vmci_event_data *e_data, -					  void *client_data) -{ -	struct sock *sk = client_data; -	const struct vmci_event_payload_qp *e_payload; -	struct vsock_sock *vsk; - -	e_payload = vmci_event_data_const_payload(e_data); - -	vsk = vsock_sk(sk); - -	/* We don't ask for delayed CBs when we subscribe to this event (we -	 * pass 0 as flags to vmci_event_subscribe()).  VMCI makes no -	 * guarantees in that case about what context we might be running in, -	 * so it could be BH or process, blockable or non-blockable.  So we -	 * need to account for all possible contexts here. -	 */ -	local_bh_disable(); -	bh_lock_sock(sk); - -	/* XXX This is lame, we should provide a way to lookup sockets by -	 * qp_handle. -	 */ -	if (vmci_handle_is_equal(vmci_trans(vsk)->qp_handle, -				 e_payload->handle)) { -		/* XXX This doesn't do anything, but in the future we may want -		 * to set a flag here to verify the attach really did occur and -		 * we weren't just sent a datagram claiming it was. -		 */ -		goto out; -	} - -out: -	bh_unlock_sock(sk); -	local_bh_enable(); -} -  static void vmci_transport_handle_detach(struct sock *sk)  {  	struct vsock_sock *vsk; @@ -871,28 +835,38 @@ static void vmci_transport_peer_detach_cb(u32 sub_id,  					  const struct vmci_event_data *e_data,  					  void *client_data)  { -	struct sock *sk = client_data; +	struct vmci_transport *trans = client_data;  	const struct vmci_event_payload_qp *e_payload; -	struct vsock_sock *vsk;  	e_payload = vmci_event_data_const_payload(e_data); -	vsk = vsock_sk(sk); -	if (vmci_handle_is_invalid(e_payload->handle)) -		return; - -	/* Same rules for locking as for peer_attach_cb(). */ -	local_bh_disable(); -	bh_lock_sock(sk);  	/* XXX This is lame, we should provide a way to lookup sockets by  	 * qp_handle.  	 */ -	if (vmci_handle_is_equal(vmci_trans(vsk)->qp_handle, -				 e_payload->handle)) -		vmci_transport_handle_detach(sk); +	if (vmci_handle_is_invalid(e_payload->handle) || +	    vmci_handle_is_equal(trans->qp_handle, e_payload->handle)) +		return; -	bh_unlock_sock(sk); -	local_bh_enable(); +	/* We don't ask for delayed CBs when we subscribe to this event (we +	 * pass 0 as flags to vmci_event_subscribe()).  VMCI makes no +	 * guarantees in that case about what context we might be running in, +	 * so it could be BH or process, blockable or non-blockable.  So we +	 * need to account for all possible contexts here. +	 */ +	spin_lock_bh(&trans->lock); +	if (!trans->sk) +		goto out; + +	/* Apart from here, trans->lock is only grabbed as part of sk destruct, +	 * where trans->sk isn't locked. 
+	 */ +	bh_lock_sock(trans->sk); + +	vmci_transport_handle_detach(trans->sk); + +	bh_unlock_sock(trans->sk); + out: +	spin_unlock_bh(&trans->lock);  }  static void vmci_transport_qp_resumed_cb(u32 sub_id, @@ -1181,7 +1155,7 @@ vmci_transport_recv_connecting_server(struct sock *listener,  	 */  	err = vmci_event_subscribe(VMCI_EVENT_QP_PEER_DETACH,  				   vmci_transport_peer_detach_cb, -				   pending, &detach_sub_id); +				   vmci_trans(vpending), &detach_sub_id);  	if (err < VMCI_SUCCESS) {  		vmci_transport_send_reset(pending, pkt);  		err = vmci_transport_error_to_vsock_error(err); @@ -1321,7 +1295,6 @@ vmci_transport_recv_connecting_client(struct sock *sk,  		    || vmci_trans(vsk)->qpair  		    || vmci_trans(vsk)->produce_size != 0  		    || vmci_trans(vsk)->consume_size != 0 -		    || vmci_trans(vsk)->attach_sub_id != VMCI_INVALID_ID  		    || vmci_trans(vsk)->detach_sub_id != VMCI_INVALID_ID) {  			skerr = EPROTO;  			err = -EINVAL; @@ -1389,7 +1362,6 @@ static int vmci_transport_recv_connecting_client_negotiate(  	struct vsock_sock *vsk;  	struct vmci_handle handle;  	struct vmci_qp *qpair; -	u32 attach_sub_id;  	u32 detach_sub_id;  	bool is_local;  	u32 flags; @@ -1399,7 +1371,6 @@ static int vmci_transport_recv_connecting_client_negotiate(  	vsk = vsock_sk(sk);  	handle = VMCI_INVALID_HANDLE; -	attach_sub_id = VMCI_INVALID_ID;  	detach_sub_id = VMCI_INVALID_ID;  	/* If we have gotten here then we should be past the point where old @@ -1444,23 +1415,15 @@ static int vmci_transport_recv_connecting_client_negotiate(  		goto destroy;  	} -	/* Subscribe to attach and detach events first. +	/* Subscribe to detach events first.  	 *  	 * XXX We attach once for each queue pair created for now so it is easy  	 * to find the socket (it's provided), but later we should only  	 * subscribe once and add a way to lookup sockets by queue pair handle.  	 
*/ -	err = vmci_event_subscribe(VMCI_EVENT_QP_PEER_ATTACH, -				   vmci_transport_peer_attach_cb, -				   sk, &attach_sub_id); -	if (err < VMCI_SUCCESS) { -		err = vmci_transport_error_to_vsock_error(err); -		goto destroy; -	} -  	err = vmci_event_subscribe(VMCI_EVENT_QP_PEER_DETACH,  				   vmci_transport_peer_detach_cb, -				   sk, &detach_sub_id); +				   vmci_trans(vsk), &detach_sub_id);  	if (err < VMCI_SUCCESS) {  		err = vmci_transport_error_to_vsock_error(err);  		goto destroy; @@ -1496,7 +1459,6 @@ static int vmci_transport_recv_connecting_client_negotiate(  	vmci_trans(vsk)->produce_size = vmci_trans(vsk)->consume_size =  		pkt->u.size; -	vmci_trans(vsk)->attach_sub_id = attach_sub_id;  	vmci_trans(vsk)->detach_sub_id = detach_sub_id;  	vmci_trans(vsk)->notify_ops->process_negotiate(sk); @@ -1504,9 +1466,6 @@ static int vmci_transport_recv_connecting_client_negotiate(  	return 0;  destroy: -	if (attach_sub_id != VMCI_INVALID_ID) -		vmci_event_unsubscribe(attach_sub_id); -  	if (detach_sub_id != VMCI_INVALID_ID)  		vmci_event_unsubscribe(detach_sub_id); @@ -1607,9 +1566,11 @@ static int vmci_transport_socket_init(struct vsock_sock *vsk,  	vmci_trans(vsk)->qp_handle = VMCI_INVALID_HANDLE;  	vmci_trans(vsk)->qpair = NULL;  	vmci_trans(vsk)->produce_size = vmci_trans(vsk)->consume_size = 0; -	vmci_trans(vsk)->attach_sub_id = vmci_trans(vsk)->detach_sub_id = -		VMCI_INVALID_ID; +	vmci_trans(vsk)->detach_sub_id = VMCI_INVALID_ID;  	vmci_trans(vsk)->notify_ops = NULL; +	INIT_LIST_HEAD(&vmci_trans(vsk)->elem); +	vmci_trans(vsk)->sk = &vsk->sk; +	spin_lock_init(&vmci_trans(vsk)->lock);  	if (psk) {  		vmci_trans(vsk)->queue_pair_size =  			vmci_trans(psk)->queue_pair_size; @@ -1629,29 +1590,57 @@ static int vmci_transport_socket_init(struct vsock_sock *vsk,  	return 0;  } -static void vmci_transport_destruct(struct vsock_sock *vsk) +static void vmci_transport_free_resources(struct list_head *transport_list)  { -	if (vmci_trans(vsk)->attach_sub_id != VMCI_INVALID_ID) { -		vmci_event_unsubscribe(vmci_trans(vsk)->attach_sub_id); -		vmci_trans(vsk)->attach_sub_id = VMCI_INVALID_ID; -	} +	while (!list_empty(transport_list)) { +		struct vmci_transport *transport = +		    list_first_entry(transport_list, struct vmci_transport, +				     elem); +		list_del(&transport->elem); -	if (vmci_trans(vsk)->detach_sub_id != VMCI_INVALID_ID) { -		vmci_event_unsubscribe(vmci_trans(vsk)->detach_sub_id); -		vmci_trans(vsk)->detach_sub_id = VMCI_INVALID_ID; -	} +		if (transport->detach_sub_id != VMCI_INVALID_ID) { +			vmci_event_unsubscribe(transport->detach_sub_id); +			transport->detach_sub_id = VMCI_INVALID_ID; +		} -	if (!vmci_handle_is_invalid(vmci_trans(vsk)->qp_handle)) { -		vmci_qpair_detach(&vmci_trans(vsk)->qpair); -		vmci_trans(vsk)->qp_handle = VMCI_INVALID_HANDLE; -		vmci_trans(vsk)->produce_size = 0; -		vmci_trans(vsk)->consume_size = 0; +		if (!vmci_handle_is_invalid(transport->qp_handle)) { +			vmci_qpair_detach(&transport->qpair); +			transport->qp_handle = VMCI_INVALID_HANDLE; +			transport->produce_size = 0; +			transport->consume_size = 0; +		} + +		kfree(transport);  	} +} + +static void vmci_transport_cleanup(struct work_struct *work) +{ +	LIST_HEAD(pending); + +	spin_lock_bh(&vmci_transport_cleanup_lock); +	list_replace_init(&vmci_transport_cleanup_list, &pending); +	spin_unlock_bh(&vmci_transport_cleanup_lock); +	vmci_transport_free_resources(&pending); +} + +static void vmci_transport_destruct(struct vsock_sock *vsk) +{ +	/* Ensure that the detach callback doesn't use the sk/vsk +	 * we 
are about to destruct. +	 */ +	spin_lock_bh(&vmci_trans(vsk)->lock); +	vmci_trans(vsk)->sk = NULL; +	spin_unlock_bh(&vmci_trans(vsk)->lock);  if (vmci_trans(vsk)->notify_ops)  		vmci_trans(vsk)->notify_ops->socket_destruct(vsk); -	kfree(vsk->trans); +	spin_lock_bh(&vmci_transport_cleanup_lock); +	list_add(&vmci_trans(vsk)->elem, &vmci_transport_cleanup_list); +	spin_unlock_bh(&vmci_transport_cleanup_lock); +	schedule_work(&vmci_transport_cleanup_work); +  	vsk->trans = NULL;  } @@ -2146,6 +2135,9 @@ module_init(vmci_transport_init);  static void __exit vmci_transport_exit(void)  { +	cancel_work_sync(&vmci_transport_cleanup_work); +	vmci_transport_free_resources(&vmci_transport_cleanup_list); +  	if (!vmci_handle_is_invalid(vmci_transport_stream_handle)) {  		if (vmci_datagram_destroy_handle(  			vmci_transport_stream_handle) != VMCI_SUCCESS) @@ -2164,6 +2156,7 @@ module_exit(vmci_transport_exit);  MODULE_AUTHOR("VMware, Inc.");  MODULE_DESCRIPTION("VMCI transport for Virtual Sockets"); +MODULE_VERSION("1.0.2.0-k");  MODULE_LICENSE("GPL v2");  MODULE_ALIAS("vmware_vsock");  MODULE_ALIAS_NETPROTO(PF_VSOCK); diff --git a/net/vmw_vsock/vmci_transport.h b/net/vmw_vsock/vmci_transport.h index ce6c9623d5f0..2ad46f39649f 100644 --- a/net/vmw_vsock/vmci_transport.h +++ b/net/vmw_vsock/vmci_transport.h @@ -119,10 +119,12 @@ struct vmci_transport {  	u64 queue_pair_size;  	u64 queue_pair_min_size;  	u64 queue_pair_max_size; -	u32 attach_sub_id;  	u32 detach_sub_id;  	union vmci_transport_notify notify;  	struct vmci_transport_notify_ops *notify_ops; +	struct list_head elem; +	struct sock *sk; +	spinlock_t lock; /* protects sk. */  };  int vmci_transport_register(void); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index a8de9e300200..24e06a2377f6 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1928,8 +1928,10 @@ static int xfrm_new_ae(struct sk_buff *skb, struct nlmsghdr *nlh,  	struct nlattr *rp = attrs[XFRMA_REPLAY_VAL];  	struct nlattr *re = attrs[XFRMA_REPLAY_ESN_VAL];  	struct nlattr *lt = attrs[XFRMA_LTIME_VAL]; +	struct nlattr *et = attrs[XFRMA_ETIMER_THRESH]; +	struct nlattr *rt = attrs[XFRMA_REPLAY_THRESH]; -	if (!lt && !rp && !re) +	if (!lt && !rp && !re && !et && !rt)  		return err;  	/* pedantic mode - thou shalt sayeth replaceth */
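The xfrm_new_ae() hunk above widens the "no updatable attribute" guard: an update carrying only XFRMA_ETIMER_THRESH or XFRMA_REPLAY_THRESH used to slip past this early return and then be rejected further down. The test itself is plain pointer presence over the parsed attribute table; a minimal stand-alone sketch, with a hypothetical attribute enum and a bare pointer array standing in for struct nlattr **attrs:

#include <stdbool.h>
#include <stdio.h>

enum { ATTR_REPLAY, ATTR_REPLAY_ESN, ATTR_LTIME,
       ATTR_ETIMER_THRESH, ATTR_REPLAY_THRESH, ATTR_MAX };

/* True if the request carries at least one attribute we can apply. */
static bool has_update(void *attrs[ATTR_MAX])
{
	return attrs[ATTR_REPLAY] || attrs[ATTR_REPLAY_ESN] ||
	       attrs[ATTR_LTIME] || attrs[ATTR_ETIMER_THRESH] ||
	       attrs[ATTR_REPLAY_THRESH];
}

int main(void)
{
	void *attrs[ATTR_MAX] = { 0 };
	int threshold = 100;

	attrs[ATTR_ETIMER_THRESH] = &threshold;	/* only a timer update */
	printf("update accepted: %d\n", has_update(attrs));
	return 0;
}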

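Finally, the vmci_transport rework above stops freeing the transport inline in the socket destructor: the destructor now only severs trans->sk under the lock and queues the transport on vmci_transport_cleanup_list, which a work item drains in process context. A compact userspace sketch of that hand-off pattern (a singly linked list instead of list_head, locking and schedule_work() elided; the kernel version guards the list with a spinlock):

#include <stdio.h>
#include <stdlib.h>

struct transport {
	struct transport *next;
	int id;
};

static struct transport *cleanup_list;

/* Destructor side: defer the free instead of doing it inline. */
static void transport_defer_free(struct transport *t)
{
	t->next = cleanup_list;
	cleanup_list = t;
}

/* Work-item side: detach the whole list first (the list_replace_init()
 * step in vmci_transport_cleanup()), then free each entry at leisure.
 */
static void transport_cleanup_work(void)
{
	struct transport *pending = cleanup_list;

	cleanup_list = NULL;
	while (pending) {
		struct transport *t = pending;

		pending = t->next;
		printf("freeing transport %d\n", t->id);
		free(t);
	}
}

int main(void)
{
	struct transport *t = calloc(1, sizeof(*t));

	if (!t)
		return 1;
	t->id = 1;
	transport_defer_free(t);
	transport_cleanup_work();
	return 0;
}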