summaryrefslogtreecommitdiff
path: root/net/core/dev.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-02-10 20:01:30 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2015-02-10 20:01:30 -0800
commitc5ce28df0e7c01a1de23c36ebdefcd803f2b6cbb (patch)
tree9830baf38832769e1cf621708889111bbe3c93df /net/core/dev.c
parent29afc4e9a408f2304e09c6dd0dbcfbd2356d0faa (diff)
parent9399f0c51489ae8c16d6559b82a452fdc1895e91 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller: 1) More iov_iter conversion work from Al Viro. [ The "crypto: switch af_alg_make_sg() to iov_iter" commit was wrong, and this pull actually adds an extra commit on top of the branch I'm pulling to fix that up, so that the pre-merge state is ok. - Linus ] 2) Various optimizations to the ipv4 forwarding information base trie lookup implementation. From Alexander Duyck. 3) Remove sock_iocb altogether, from CHristoph Hellwig. 4) Allow congestion control algorithm selection via routing metrics. From Daniel Borkmann. 5) Make ipv4 uncached route list per-cpu, from Eric Dumazet. 6) Handle rfs hash collisions more gracefully, also from Eric Dumazet. 7) Add xmit_more support to r8169, e1000, and e1000e drivers. From Florian Westphal. 8) Transparent Ethernet Bridging support for GRO, from Jesse Gross. 9) Add BPF packet actions to packet scheduler, from Jiri Pirko. 10) Add support for uniqu flow IDs to openvswitch, from Joe Stringer. 11) New NetCP ethernet driver, from Muralidharan Karicheri and Wingman Kwok. 12) More sanely handle out-of-window dupacks, which can result in serious ACK storms. From Neal Cardwell. 13) Various rhashtable bug fixes and enhancements, from Herbert Xu, Patrick McHardy, and Thomas Graf. 14) Support xmit_more in be2net, from Sathya Perla. 15) Group Policy extensions for vxlan, from Thomas Graf. 16) Remove Checksum Offload support for vxlan, from Tom Herbert. 17) Like ipv4, support lockless transmit over ipv6 UDP sockets. From Vlad Yasevich. * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1494+1 commits) crypto: fix af_alg_make_sg() conversion to iov_iter ipv4: Namespecify TCP PMTU mechanism i40e: Fix for stats init function call in Rx setup tcp: don't include Fast Open option in SYN-ACK on pure SYN-data openvswitch: Only set TUNNEL_VXLAN_OPT if VXLAN-GBP metadata is set ipv6: Make __ipv6_select_ident static ipv6: Fix fragment id assignment on LE arches. bridge: Fix inability to add non-vlan fdb entry net: Mellanox: Delete unnecessary checks before the function call "vunmap" cxgb4: Add support in cxgb4 to get expansion rom version via ethtool ethtool: rename reserved1 memeber in ethtool_drvinfo for expansion ROM version net: dsa: Remove redundant phy_attach() IB/mlx4: Reset flow support for IB kernel ULPs IB/mlx4: Always use the correct port for mirrored multicast attachments net/bonding: Fix potential bad memory access during bonding events tipc: remove tipc_snprintf tipc: nl compat add noop and remove legacy nl framework tipc: convert legacy nl stats show to nl compat tipc: convert legacy nl net id get to nl compat tipc: convert legacy nl net id set to nl compat ...
Diffstat (limited to 'net/core/dev.c')
-rw-r--r--net/core/dev.c227
1 files changed, 146 insertions, 81 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 7fe82929f509..d030575532a2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -371,9 +371,10 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
static inline struct list_head *ptype_head(const struct packet_type *pt)
{
if (pt->type == htons(ETH_P_ALL))
- return &ptype_all;
+ return pt->dev ? &pt->dev->ptype_all : &ptype_all;
else
- return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
+ return pt->dev ? &pt->dev->ptype_specific :
+ &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}
/**
@@ -1734,6 +1735,23 @@ static inline int deliver_skb(struct sk_buff *skb,
return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}
+static inline void deliver_ptype_list_skb(struct sk_buff *skb,
+ struct packet_type **pt,
+ struct net_device *dev, __be16 type,
+ struct list_head *ptype_list)
+{
+ struct packet_type *ptype, *pt_prev = *pt;
+
+ list_for_each_entry_rcu(ptype, ptype_list, list) {
+ if (ptype->type != type)
+ continue;
+ if (pt_prev)
+ deliver_skb(skb, pt_prev, dev);
+ pt_prev = ptype;
+ }
+ *pt = pt_prev;
+}
+
static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
{
if (!ptype->af_packet_priv || !skb->sk)
@@ -1757,45 +1775,54 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
struct packet_type *ptype;
struct sk_buff *skb2 = NULL;
struct packet_type *pt_prev = NULL;
+ struct list_head *ptype_list = &ptype_all;
rcu_read_lock();
- list_for_each_entry_rcu(ptype, &ptype_all, list) {
+again:
+ list_for_each_entry_rcu(ptype, ptype_list, list) {
/* Never send packets back to the socket
* they originated from - MvS (miquels@drinkel.ow.org)
*/
- if ((ptype->dev == dev || !ptype->dev) &&
- (!skb_loop_sk(ptype, skb))) {
- if (pt_prev) {
- deliver_skb(skb2, pt_prev, skb->dev);
- pt_prev = ptype;
- continue;
- }
+ if (skb_loop_sk(ptype, skb))
+ continue;
- skb2 = skb_clone(skb, GFP_ATOMIC);
- if (!skb2)
- break;
+ if (pt_prev) {
+ deliver_skb(skb2, pt_prev, skb->dev);
+ pt_prev = ptype;
+ continue;
+ }
- net_timestamp_set(skb2);
+ /* need to clone skb, done only once */
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (!skb2)
+ goto out_unlock;
- /* skb->nh should be correctly
- set by sender, so that the second statement is
- just protection against buggy protocols.
- */
- skb_reset_mac_header(skb2);
-
- if (skb_network_header(skb2) < skb2->data ||
- skb_network_header(skb2) > skb_tail_pointer(skb2)) {
- net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
- ntohs(skb2->protocol),
- dev->name);
- skb_reset_network_header(skb2);
- }
+ net_timestamp_set(skb2);
- skb2->transport_header = skb2->network_header;
- skb2->pkt_type = PACKET_OUTGOING;
- pt_prev = ptype;
+ /* skb->nh should be correctly
+ * set by sender, so that the second statement is
+ * just protection against buggy protocols.
+ */
+ skb_reset_mac_header(skb2);
+
+ if (skb_network_header(skb2) < skb2->data ||
+ skb_network_header(skb2) > skb_tail_pointer(skb2)) {
+ net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
+ ntohs(skb2->protocol),
+ dev->name);
+ skb_reset_network_header(skb2);
}
+
+ skb2->transport_header = skb2->network_header;
+ skb2->pkt_type = PACKET_OUTGOING;
+ pt_prev = ptype;
}
+
+ if (ptype_list == &ptype_all) {
+ ptype_list = &dev->ptype_all;
+ goto again;
+ }
+out_unlock:
if (pt_prev)
pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
rcu_read_unlock();
@@ -2549,7 +2576,7 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
if (skb->encapsulation)
features &= dev->hw_enc_features;
- if (!vlan_tx_tag_present(skb)) {
+ if (!skb_vlan_tag_present(skb)) {
if (unlikely(protocol == htons(ETH_P_8021Q) ||
protocol == htons(ETH_P_8021AD))) {
struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
@@ -2588,7 +2615,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev,
unsigned int len;
int rc;
- if (!list_empty(&ptype_all))
+ if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
dev_queue_xmit_nit(skb, dev);
len = skb->len;
@@ -2630,7 +2657,7 @@ out:
static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
netdev_features_t features)
{
- if (vlan_tx_tag_present(skb) &&
+ if (skb_vlan_tag_present(skb) &&
!vlan_hw_offload_capable(features, skb->vlan_proto))
skb = __vlan_hwaccel_push_inside(skb);
return skb;
@@ -3003,6 +3030,8 @@ static inline void ____napi_schedule(struct softnet_data *sd,
/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
+u32 rps_cpu_mask __read_mostly;
+EXPORT_SYMBOL(rps_cpu_mask);
struct static_key rps_needed __read_mostly;
@@ -3059,16 +3088,17 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow **rflowp)
{
- struct netdev_rx_queue *rxqueue;
- struct rps_map *map;
+ const struct rps_sock_flow_table *sock_flow_table;
+ struct netdev_rx_queue *rxqueue = dev->_rx;
struct rps_dev_flow_table *flow_table;
- struct rps_sock_flow_table *sock_flow_table;
+ struct rps_map *map;
int cpu = -1;
- u16 tcpu;
+ u32 tcpu;
u32 hash;
if (skb_rx_queue_recorded(skb)) {
u16 index = skb_get_rx_queue(skb);
+
if (unlikely(index >= dev->real_num_rx_queues)) {
WARN_ONCE(dev->real_num_rx_queues > 1,
"%s received packet on queue %u, but number "
@@ -3076,39 +3106,40 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
dev->name, index, dev->real_num_rx_queues);
goto done;
}
- rxqueue = dev->_rx + index;
- } else
- rxqueue = dev->_rx;
+ rxqueue += index;
+ }
+ /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
+
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
map = rcu_dereference(rxqueue->rps_map);
- if (map) {
- if (map->len == 1 &&
- !rcu_access_pointer(rxqueue->rps_flow_table)) {
- tcpu = map->cpus[0];
- if (cpu_online(tcpu))
- cpu = tcpu;
- goto done;
- }
- } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
+ if (!flow_table && !map)
goto done;
- }
skb_reset_network_header(skb);
hash = skb_get_hash(skb);
if (!hash)
goto done;
- flow_table = rcu_dereference(rxqueue->rps_flow_table);
sock_flow_table = rcu_dereference(rps_sock_flow_table);
if (flow_table && sock_flow_table) {
- u16 next_cpu;
struct rps_dev_flow *rflow;
+ u32 next_cpu;
+ u32 ident;
+
+ /* First check into global flow table if there is a match */
+ ident = sock_flow_table->ents[hash & sock_flow_table->mask];
+ if ((ident ^ hash) & ~rps_cpu_mask)
+ goto try_rps;
+
+ next_cpu = ident & rps_cpu_mask;
+ /* OK, now we know there is a match,
+ * we can look at the local (per receive queue) flow table
+ */
rflow = &flow_table->flows[hash & flow_table->mask];
tcpu = rflow->cpu;
- next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
-
/*
* If the desired CPU (where last recvmsg was done) is
* different from current CPU (one in the rx-queue flow
@@ -3135,6 +3166,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
}
}
+try_rps:
+
if (map) {
tcpu = map->cpus[reciprocal_scale(hash, map->len)];
if (cpu_online(tcpu)) {
@@ -3586,7 +3619,6 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
struct net_device *orig_dev;
- struct net_device *null_or_dev;
bool deliver_exact = false;
int ret = NET_RX_DROP;
__be16 type;
@@ -3629,11 +3661,15 @@ another_round:
goto skip_taps;
list_for_each_entry_rcu(ptype, &ptype_all, list) {
- if (!ptype->dev || ptype->dev == skb->dev) {
- if (pt_prev)
- ret = deliver_skb(skb, pt_prev, orig_dev);
- pt_prev = ptype;
- }
+ if (pt_prev)
+ ret = deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = ptype;
+ }
+
+ list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
+ if (pt_prev)
+ ret = deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = ptype;
}
skip_taps:
@@ -3647,7 +3683,7 @@ ncls:
if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
goto drop;
- if (vlan_tx_tag_present(skb)) {
+ if (skb_vlan_tag_present(skb)) {
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
@@ -3679,8 +3715,8 @@ ncls:
}
}
- if (unlikely(vlan_tx_tag_present(skb))) {
- if (vlan_tx_tag_get_id(skb))
+ if (unlikely(skb_vlan_tag_present(skb))) {
+ if (skb_vlan_tag_get_id(skb))
skb->pkt_type = PACKET_OTHERHOST;
/* Note: we might in the future use prio bits
* and set skb->priority like in vlan_do_receive()
@@ -3689,19 +3725,21 @@ ncls:
skb->vlan_tci = 0;
}
+ type = skb->protocol;
+
/* deliver only exact match when indicated */
- null_or_dev = deliver_exact ? skb->dev : NULL;
+ if (likely(!deliver_exact)) {
+ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+ &ptype_base[ntohs(type) &
+ PTYPE_HASH_MASK]);
+ }
- type = skb->protocol;
- list_for_each_entry_rcu(ptype,
- &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
- if (ptype->type == type &&
- (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
- ptype->dev == orig_dev)) {
- if (pt_prev)
- ret = deliver_skb(skb, pt_prev, orig_dev);
- pt_prev = ptype;
- }
+ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+ &orig_dev->ptype_specific);
+
+ if (unlikely(skb->dev != orig_dev)) {
+ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+ &skb->dev->ptype_specific);
}
if (pt_prev) {
@@ -5294,6 +5332,26 @@ void netdev_upper_dev_unlink(struct net_device *dev,
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);
+/**
+ * netdev_bonding_info_change - Dispatch event about slave change
+ * @dev: device
+ * @netdev_bonding_info: info to dispatch
+ *
+ * Send NETDEV_BONDING_INFO to netdev notifiers with info.
+ * The caller must hold the RTNL lock.
+ */
+void netdev_bonding_info_change(struct net_device *dev,
+ struct netdev_bonding_info *bonding_info)
+{
+ struct netdev_notifier_bonding_info info;
+
+ memcpy(&info.bonding_info, bonding_info,
+ sizeof(struct netdev_bonding_info));
+ call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
+ &info.info);
+}
+EXPORT_SYMBOL(netdev_bonding_info_change);
+
static void netdev_adjacent_add_links(struct net_device *dev)
{
struct netdev_adjacent *iter;
@@ -6143,13 +6201,16 @@ static int netif_alloc_rx_queues(struct net_device *dev)
{
unsigned int i, count = dev->num_rx_queues;
struct netdev_rx_queue *rx;
+ size_t sz = count * sizeof(*rx);
BUG_ON(count < 1);
- rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
- if (!rx)
- return -ENOMEM;
-
+ rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+ if (!rx) {
+ rx = vzalloc(sz);
+ if (!rx)
+ return -ENOMEM;
+ }
dev->_rx = rx;
for (i = 0; i < count; i++)
@@ -6547,6 +6608,8 @@ void netdev_run_todo(void)
/* paranoia */
BUG_ON(netdev_refcnt_read(dev));
+ BUG_ON(!list_empty(&dev->ptype_all));
+ BUG_ON(!list_empty(&dev->ptype_specific));
WARN_ON(rcu_access_pointer(dev->ip_ptr));
WARN_ON(rcu_access_pointer(dev->ip6_ptr));
WARN_ON(dev->dn_ptr);
@@ -6729,6 +6792,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
INIT_LIST_HEAD(&dev->adj_list.lower);
INIT_LIST_HEAD(&dev->all_adj_list.upper);
INIT_LIST_HEAD(&dev->all_adj_list.lower);
+ INIT_LIST_HEAD(&dev->ptype_all);
+ INIT_LIST_HEAD(&dev->ptype_specific);
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
@@ -6779,7 +6844,7 @@ void free_netdev(struct net_device *dev)
netif_free_tx_queues(dev);
#ifdef CONFIG_SYSFS
- kfree(dev->_rx);
+ kvfree(dev->_rx);
#endif
kfree(rcu_dereference_protected(dev->ingress_queue, 1));
@@ -7064,11 +7129,11 @@ static int dev_cpu_callback(struct notifier_block *nfb,
/* Process offline CPU's input_pkt_queue */
while ((skb = __skb_dequeue(&oldsd->process_queue))) {
- netif_rx_internal(skb);
+ netif_rx_ni(skb);
input_queue_head_incr(oldsd);
}
while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
- netif_rx_internal(skb);
+ netif_rx_ni(skb);
input_queue_head_incr(oldsd);
}