From f56e7bba22fad16c0d4fac996623ce1c13244f8f Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 22 Apr 2015 21:49:17 -0700 Subject: igb: Pull timestamp from fragment before adding it to skb This change makes it so that we pull the timestamp from the fragment before we add it to the skb. By doing this we can avoid a possible issue in which the fragment can possibly be less than IGB_RX_HDR_LEN due to the timestamp being pulled after the copybreak check. While making this change I realized we could also pull the rest of the igb_pull_tail function into igb_add_rx_frag since in the case of igb, unlike ixgbe, we are able to unmap the entire buffer before calling add_rx_frag so merging the two allows for sharing of code between the two merged functions. Reported-by: Cong Wang Signed-off-by: Alexander Duyck Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb_main.c | 94 ++++++++----------------------- 1 file changed, 25 insertions(+), 69 deletions(-) (limited to 'drivers/net/ethernet/intel/igb/igb_main.c') diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 2f70a9b152bd..fc7729e78f3d 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -6621,22 +6621,25 @@ static bool igb_add_rx_frag(struct igb_ring *rx_ring, struct sk_buff *skb) { struct page *page = rx_buffer->page; + unsigned char *va = page_address(page) + rx_buffer->page_offset; unsigned int size = le16_to_cpu(rx_desc->wb.upper.length); #if (PAGE_SIZE < 8192) unsigned int truesize = IGB_RX_BUFSZ; #else - unsigned int truesize = ALIGN(size, L1_CACHE_BYTES); + unsigned int truesize = SKB_DATA_ALIGN(size); #endif + unsigned int pull_len; - if ((size <= IGB_RX_HDR_LEN) && !skb_is_nonlinear(skb)) { - unsigned char *va = page_address(page) + rx_buffer->page_offset; + if (unlikely(skb_is_nonlinear(skb))) + goto add_tail_frag; - if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) { - igb_ptp_rx_pktstamp(rx_ring->q_vector, va, skb); - va += IGB_TS_HDR_LEN; - size -= IGB_TS_HDR_LEN; - } + if (unlikely(igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP))) { + igb_ptp_rx_pktstamp(rx_ring->q_vector, va, skb); + va += IGB_TS_HDR_LEN; + size -= IGB_TS_HDR_LEN; + } + if (likely(size <= IGB_RX_HDR_LEN)) { memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long))); /* page is not reserved, we can reuse buffer as-is */ @@ -6648,8 +6651,21 @@ static bool igb_add_rx_frag(struct igb_ring *rx_ring, return false; } + /* we need the header to contain the greater of either ETH_HLEN or + * 60 bytes if the skb->len is less than 60 for skb_pad. + */ + pull_len = eth_get_headlen(va, IGB_RX_HDR_LEN); + + /* align pull length to size of long to optimize memcpy performance */ + memcpy(__skb_put(skb, pull_len), va, ALIGN(pull_len, sizeof(long))); + + /* update all of the pointers */ + va += pull_len; + size -= pull_len; + +add_tail_frag: skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, - rx_buffer->page_offset, size, truesize); + (unsigned long)va & ~PAGE_MASK, size, truesize); return igb_can_reuse_rx_page(rx_buffer, page, truesize); } @@ -6790,62 +6806,6 @@ static bool igb_is_non_eop(struct igb_ring *rx_ring, return true; } -/** - * igb_pull_tail - igb specific version of skb_pull_tail - * @rx_ring: rx descriptor ring packet is being transacted on - * @rx_desc: pointer to the EOP Rx descriptor - * @skb: pointer to current skb being adjusted - * - * This function is an igb specific version of __pskb_pull_tail. The - * main difference between this version and the original function is that - * this function can make several assumptions about the state of things - * that allow for significant optimizations versus the standard function. - * As a result we can do things like drop a frag and maintain an accurate - * truesize for the skb. - */ -static void igb_pull_tail(struct igb_ring *rx_ring, - union e1000_adv_rx_desc *rx_desc, - struct sk_buff *skb) -{ - struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0]; - unsigned char *va; - unsigned int pull_len; - - /* it is valid to use page_address instead of kmap since we are - * working with pages allocated out of the lomem pool per - * alloc_page(GFP_ATOMIC) - */ - va = skb_frag_address(frag); - - if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) { - /* retrieve timestamp from buffer */ - igb_ptp_rx_pktstamp(rx_ring->q_vector, va, skb); - - /* update pointers to remove timestamp header */ - skb_frag_size_sub(frag, IGB_TS_HDR_LEN); - frag->page_offset += IGB_TS_HDR_LEN; - skb->data_len -= IGB_TS_HDR_LEN; - skb->len -= IGB_TS_HDR_LEN; - - /* move va to start of packet data */ - va += IGB_TS_HDR_LEN; - } - - /* we need the header to contain the greater of either ETH_HLEN or - * 60 bytes if the skb->len is less than 60 for skb_pad. - */ - pull_len = eth_get_headlen(va, IGB_RX_HDR_LEN); - - /* align pull length to size of long to optimize memcpy performance */ - skb_copy_to_linear_data(skb, va, ALIGN(pull_len, sizeof(long))); - - /* update all of the pointers */ - skb_frag_size_sub(frag, pull_len); - frag->page_offset += pull_len; - skb->data_len -= pull_len; - skb->tail += pull_len; -} - /** * igb_cleanup_headers - Correct corrupted or empty headers * @rx_ring: rx descriptor ring packet is being transacted on @@ -6873,10 +6833,6 @@ static bool igb_cleanup_headers(struct igb_ring *rx_ring, } } - /* place header in linear portion of buffer */ - if (skb_is_nonlinear(skb)) - igb_pull_tail(rx_ring, rx_desc, skb); - /* if eth_skb_pad returns an error the skb was freed */ if (eth_skb_pad(skb)) return true; -- cgit v1.2.3-70-g09d2 From 6fb469023cd995d7be5ab3bf12b79387710382ff Mon Sep 17 00:00:00 2001 From: Todd Fujinaka Date: Wed, 20 May 2015 15:40:20 -0700 Subject: igb: bump version to igb-5.3.0 Signed-off-by: Todd Fujinaka Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/net/ethernet/intel/igb/igb_main.c') diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index fc7729e78f3d..41e274046896 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -57,8 +57,8 @@ #include "igb.h" #define MAJ 5 -#define MIN 2 -#define BUILD 18 +#define MIN 3 +#define BUILD 0 #define DRV_VERSION __stringify(MAJ) "." __stringify(MIN) "." \ __stringify(BUILD) "-k" char igb_driver_name[] = "igb"; -- cgit v1.2.3-70-g09d2 From 72ddef0506da852dc82f078f37ced8ef4d74a2bf Mon Sep 17 00:00:00 2001 From: Shota Suzuki Date: Wed, 1 Jul 2015 09:25:52 +0900 Subject: igb: Fix oops caused by missing queue pairing When initializing igb driver (e.g. 82576, I350), IGB_FLAG_QUEUE_PAIRS is set if adapter->rss_queues exceeds half of max_rss_queues in igb_init_queue_configuration(). On the other hand, IGB_FLAG_QUEUE_PAIRS is not set even if the number of queues exceeds half of max_combined in igb_set_channels() when changing the number of queues by "ethtool -L". In this case, if numvecs is larger than MAX_MSIX_ENTRIES (10), the size of adapter->msix_entries[], an overflow can occur in igb_set_interrupt_capability(), which in turn leads to an oops. Fix this problem as follows: - When changing the number of queues by "ethtool -L", set IGB_FLAG_QUEUE_PAIRS in the same way as initializing igb driver. - When increasing the size of q_vector, reallocate it appropriately. (With IGB_FLAG_QUEUE_PAIRS set, the size of q_vector gets larger.) Another possible way to fix this problem is to cap the queues at its initial number, which is the number of the initial online cpus. But this is not the optimal way because we cannot increase queues when another cpu becomes online. Note that before commit cd14ef54d25b ("igb: Change to use statically allocated array for MSIx entries"), this problem did not cause oops but just made the number of queues become 1 because of entering msi_only mode in igb_set_interrupt_capability(). Fixes: 907b7835799f ("igb: Add ethtool support to configure number of channels") CC: stable Signed-off-by: Shota Suzuki Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb.h | 1 + drivers/net/ethernet/intel/igb/igb_ethtool.c | 5 ++++- drivers/net/ethernet/intel/igb/igb_main.c | 16 ++++++++++++++-- 3 files changed, 19 insertions(+), 3 deletions(-) (limited to 'drivers/net/ethernet/intel/igb/igb_main.c') diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h index c2bd4f98a837..212d668dabb3 100644 --- a/drivers/net/ethernet/intel/igb/igb.h +++ b/drivers/net/ethernet/intel/igb/igb.h @@ -540,6 +540,7 @@ void igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, unsigned char *va, struct sk_buff *skb); int igb_ptp_set_ts_config(struct net_device *netdev, struct ifreq *ifr); int igb_ptp_get_ts_config(struct net_device *netdev, struct ifreq *ifr); +void igb_set_flag_queue_pairs(struct igb_adapter *, const u32); #ifdef CONFIG_IGB_HWMON void igb_sysfs_exit(struct igb_adapter *adapter); int igb_sysfs_init(struct igb_adapter *adapter); diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c index b7b9c670bb3c..74262768b09b 100644 --- a/drivers/net/ethernet/intel/igb/igb_ethtool.c +++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c @@ -3008,6 +3008,7 @@ static int igb_set_channels(struct net_device *netdev, { struct igb_adapter *adapter = netdev_priv(netdev); unsigned int count = ch->combined_count; + unsigned int max_combined = 0; /* Verify they are not requesting separate vectors */ if (!count || ch->rx_count || ch->tx_count) @@ -3018,11 +3019,13 @@ static int igb_set_channels(struct net_device *netdev, return -EINVAL; /* Verify the number of channels doesn't exceed hw limits */ - if (count > igb_max_channels(adapter)) + max_combined = igb_max_channels(adapter); + if (count > max_combined) return -EINVAL; if (count != adapter->rss_queues) { adapter->rss_queues = count; + igb_set_flag_queue_pairs(adapter, max_combined); /* Hardware has to reinitialize queues and interrupts to * match the new configuration. diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 41e274046896..7b97e8479290 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -1205,10 +1205,14 @@ static int igb_alloc_q_vector(struct igb_adapter *adapter, /* allocate q_vector and rings */ q_vector = adapter->q_vector[v_idx]; - if (!q_vector) + if (!q_vector) { q_vector = kzalloc(size, GFP_KERNEL); - else + } else if (size > ksize(q_vector)) { + kfree_rcu(q_vector, rcu); + q_vector = kzalloc(size, GFP_KERNEL); + } else { memset(q_vector, 0, size); + } if (!q_vector) return -ENOMEM; @@ -2888,6 +2892,14 @@ static void igb_init_queue_configuration(struct igb_adapter *adapter) adapter->rss_queues = min_t(u32, max_rss_queues, num_online_cpus()); + igb_set_flag_queue_pairs(adapter, max_rss_queues); +} + +void igb_set_flag_queue_pairs(struct igb_adapter *adapter, + const u32 max_rss_queues) +{ + struct e1000_hw *hw = &adapter->hw; + /* Determine if we need to pair queues. */ switch (hw->mac.type) { case e1000_82575: -- cgit v1.2.3-70-g09d2 From f468adc944ef40f23cacdd898e8cfbb5ba4b75a4 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Tue, 7 Jul 2015 18:53:45 +0300 Subject: igb: missing rtnl_unlock in igb_sriov_reinit() Signed-off-by: Vasily Averin Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb_main.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/net/ethernet/intel/igb/igb_main.c') diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 7b97e8479290..0069c328d7ad 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -7506,6 +7506,7 @@ static int igb_sriov_reinit(struct pci_dev *dev) igb_init_queue_configuration(adapter); if (igb_init_interrupt_scheme(adapter, true)) { + rtnl_unlock(); dev_err(&pdev->dev, "Unable to allocate memory for queues\n"); return -ENOMEM; } -- cgit v1.2.3-70-g09d2 From 6423fc34160939142d72ffeaa2db6408317f54df Mon Sep 17 00:00:00 2001 From: Stefan Assmann Date: Fri, 10 Jul 2015 15:01:12 +0200 Subject: igb: do not re-init SR-IOV during probe During driver probing the following code path is triggered. igb_probe ->igb_sw_init ->igb_probe_vfs ->igb_pci_enable_sriov ->igb_sriov_reinit Doing the SR-IOV re-init is not necessary during probing since we're starting from scratch. Here we can call igb_enable_sriov() right away. Running igb_sriov_reinit() during igb_probe() also seems to cause occasional packet loss on some onboard 82576 NICs. Reproduced on Dell and HP servers with onboard 82576 NICs. Example: Intel Corporation 82576 Gigabit Network Connection [8086:10c9] (rev 01) Subsystem: Dell Device [1028:0481] Signed-off-by: Stefan Assmann Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/net/ethernet/intel/igb/igb_main.c') diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 0069c328d7ad..67ac0eabe6c4 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2851,7 +2851,7 @@ static void igb_probe_vfs(struct igb_adapter *adapter) return; pci_sriov_set_totalvfs(pdev, 7); - igb_pci_enable_sriov(pdev, max_vfs); + igb_enable_sriov(pdev, max_vfs); #endif /* CONFIG_PCI_IOV */ } -- cgit v1.2.3-70-g09d2 From c23d92b80e0b44d4c17085f0413e7574a7583615 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 29 Jul 2015 14:38:15 -0600 Subject: igb: Teardown SR-IOV before unregister_netdev() When the .remove() callback for a PF is called, SR-IOV support for the device is disabled, which requires unbinding and removing the VFs. The VFs may be in-use either by the host kernel or userspace, such as assigned to a VM through vfio-pci. In this latter case, the VFs may be removed either by shutting down the VM or hot-unplugging the devices from the VM. Unfortunately in the case of a Windows 2012 R2 guest, hot-unplug is broken due to the ordering of the PF driver teardown. Disabling SR-IOV prior to unregister_netdev() avoids this issue. Signed-off-by: Alex Williamson Acked-by: Mitch Williams Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/net/ethernet/intel/igb/igb_main.c') diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 67ac0eabe6c4..8d22298236ad 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2809,14 +2809,14 @@ static void igb_remove(struct pci_dev *pdev) */ igb_release_hw_control(adapter); - unregister_netdev(netdev); - - igb_clear_interrupt_scheme(adapter); - #ifdef CONFIG_PCI_IOV igb_disable_sriov(pdev); #endif + unregister_netdev(netdev); + + igb_clear_interrupt_scheme(adapter); + pci_iounmap(pdev, hw->hw_addr); if (hw->flash_address) iounmap(hw->flash_address); -- cgit v1.2.3-70-g09d2 From 3eb14ea8d9584e96680729c2c7a9129bafa13a39 Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Mon, 3 Aug 2015 11:36:26 +0800 Subject: igb: Fix a deadlock in igb_sriov_reinit When igb_init_interrupt_scheme in igb_sriov_reinit is failed, the lock acquired by rtnl_lock() is not released, which causes a deadlock. This patch adds rtnl_unlock() in error handling to fix it. Signed-off-by: Jia-Ju Bai Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb_main.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/net/ethernet/intel/igb/igb_main.c') diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 8d22298236ad..17c1c842c33d 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -7413,6 +7413,7 @@ static int igb_resume(struct device *dev) if (igb_init_interrupt_scheme(adapter, true)) { dev_err(&pdev->dev, "Unable to allocate memory for queues\n"); + rtnl_unlock(); return -ENOMEM; } -- cgit v1.2.3-70-g09d2 From 42ad1a03b4caf4d95b980bce17c46242e6728ddc Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Wed, 5 Aug 2015 22:05:16 +0800 Subject: igb: Fix a memory leak in igb_probe In error handling code of igb_probe, the memory adapter->shadow_vfta allocated by kcalloc in igb_sw_init is not freed. So when register_netdev or igb_init_i2c is failed, a memory leak will occur. This patch adds kfree to fix it. Signed-off-by: Jia-Ju Bai Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb_main.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/net/ethernet/intel/igb/igb_main.c') diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 17c1c842c33d..1ebdb461fed3 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2649,6 +2649,7 @@ err_eeprom: if (hw->flash_address) iounmap(hw->flash_address); err_sw_init: + kfree(adapter->shadow_vfta); igb_clear_interrupt_scheme(adapter); pci_iounmap(pdev, hw->hw_addr); err_ioremap: -- cgit v1.2.3-70-g09d2 From ceee3450b3a85db05a107d54fbea031c77d30401 Mon Sep 17 00:00:00 2001 From: Todd Fujinaka Date: Fri, 7 Aug 2015 17:27:39 -0700 Subject: igb: make sure SR-IOV init uses the right number of queues Recent changes to igb_probe_vfs() could lead to the PF holding onto all of the queues. Reorder igb_probe_vfs() to be before gb_init_queue_configuration() and add some more error checking. Signed-off-by: Todd Fujinaka Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher --- drivers/net/ethernet/intel/igb/igb_main.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'drivers/net/ethernet/intel/igb/igb_main.c') diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 1ebdb461fed3..1902ef8f4a0b 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -179,6 +179,8 @@ static void igb_check_vf_rate_limit(struct igb_adapter *); #ifdef CONFIG_PCI_IOV static int igb_vf_configure(struct igb_adapter *adapter, int vf); static int igb_pci_enable_sriov(struct pci_dev *dev, int num_vfs); +static int igb_disable_sriov(struct pci_dev *dev); +static int igb_pci_disable_sriov(struct pci_dev *dev); #endif #ifdef CONFIG_PM @@ -2651,6 +2653,9 @@ err_eeprom: err_sw_init: kfree(adapter->shadow_vfta); igb_clear_interrupt_scheme(adapter); +#ifdef CONFIG_PCI_IOV + igb_disable_sriov(pdev); +#endif pci_iounmap(pdev, hw->hw_addr); err_ioremap: free_netdev(netdev); @@ -2981,6 +2986,8 @@ static int igb_sw_init(struct igb_adapter *adapter) } #endif /* CONFIG_PCI_IOV */ + igb_probe_vfs(adapter); + igb_init_queue_configuration(adapter); /* Setup and initialize a copy of the hw vlan table array */ @@ -2993,8 +3000,6 @@ static int igb_sw_init(struct igb_adapter *adapter) return -ENOMEM; } - igb_probe_vfs(adapter); - /* Explicitly disable IRQ since the NIC can be in any state. */ igb_irq_disable(adapter); -- cgit v1.2.3-70-g09d2 From 2f064f3485cd29633ad1b3cfb00cc519509a3d72 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 21 Aug 2015 14:11:51 -0700 Subject: mm: make page pfmemalloc check more robust Commit c48a11c7ad26 ("netvm: propagate page->pfmemalloc to skb") added checks for page->pfmemalloc to __skb_fill_page_desc(): if (page->pfmemalloc && !page->mapping) skb->pfmemalloc = true; It assumes page->mapping == NULL implies that page->pfmemalloc can be trusted. However, __delete_from_page_cache() can set set page->mapping to NULL and leave page->index value alone. Due to being in union, a non-zero page->index will be interpreted as true page->pfmemalloc. So the assumption is invalid if the networking code can see such a page. And it seems it can. We have encountered this with a NFS over loopback setup when such a page is attached to a new skbuf. There is no copying going on in this case so the page confuses __skb_fill_page_desc which interprets the index as pfmemalloc flag and the network stack drops packets that have been allocated using the reserves unless they are to be queued on sockets handling the swapping which is the case here and that leads to hangs when the nfs client waits for a response from the server which has been dropped and thus never arrive. The struct page is already heavily packed so rather than finding another hole to put it in, let's do a trick instead. We can reuse the index again but define it to an impossible value (-1UL). This is the page index so it should never see the value that large. Replace all direct users of page->pfmemalloc by page_is_pfmemalloc which will hide this nastiness from unspoiled eyes. The information will get lost if somebody wants to use page->index obviously but that was the case before and the original code expected that the information should be persisted somewhere else if that is really needed (e.g. what SLAB and SLUB do). [akpm@linux-foundation.org: fix blooper in slub] Fixes: c48a11c7ad26 ("netvm: propagate page->pfmemalloc to skb") Signed-off-by: Michal Hocko Debugged-by: Vlastimil Babka Debugged-by: Jiri Bohac Cc: Eric Dumazet Cc: David Miller Acked-by: Mel Gorman Cc: [3.6+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/ethernet/intel/fm10k/fm10k_main.c | 2 +- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 +- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 2 +- include/linux/mm.h | 28 +++++++++++++++++++++++ include/linux/mm_types.h | 9 -------- include/linux/skbuff.h | 14 ++++-------- mm/page_alloc.c | 9 +++++--- mm/slab.c | 4 ++-- mm/slub.c | 2 +- net/core/skbuff.c | 2 +- 11 files changed, 47 insertions(+), 29 deletions(-) (limited to 'drivers/net/ethernet/intel/igb/igb_main.c') diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_main.c b/drivers/net/ethernet/intel/fm10k/fm10k_main.c index 982fdcdc795b..b5b2925103ec 100644 --- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c +++ b/drivers/net/ethernet/intel/fm10k/fm10k_main.c @@ -216,7 +216,7 @@ static void fm10k_reuse_rx_page(struct fm10k_ring *rx_ring, static inline bool fm10k_page_is_reserved(struct page *page) { - return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc; + return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page); } static bool fm10k_can_reuse_rx_page(struct fm10k_rx_buffer *rx_buffer, diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 2f70a9b152bd..830466c49987 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -6566,7 +6566,7 @@ static void igb_reuse_rx_page(struct igb_ring *rx_ring, static inline bool igb_page_is_reserved(struct page *page) { - return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc; + return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page); } static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer, diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 9aa6104e34ea..ae21e0b06c3a 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -1832,7 +1832,7 @@ static void ixgbe_reuse_rx_page(struct ixgbe_ring *rx_ring, static inline bool ixgbe_page_is_reserved(struct page *page) { - return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc; + return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page); } /** diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index e71cdde9cb01..1d7b00b038a2 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -765,7 +765,7 @@ static void ixgbevf_reuse_rx_page(struct ixgbevf_ring *rx_ring, static inline bool ixgbevf_page_is_reserved(struct page *page) { - return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc; + return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page); } /** diff --git a/include/linux/mm.h b/include/linux/mm.h index 2e872f92dbac..bf6f117fcf4d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1002,6 +1002,34 @@ static inline int page_mapped(struct page *page) return atomic_read(&(page)->_mapcount) >= 0; } +/* + * Return true only if the page has been allocated with + * ALLOC_NO_WATERMARKS and the low watermark was not + * met implying that the system is under some pressure. + */ +static inline bool page_is_pfmemalloc(struct page *page) +{ + /* + * Page index cannot be this large so this must be + * a pfmemalloc page. + */ + return page->index == -1UL; +} + +/* + * Only to be called by the page allocator on a freshly allocated + * page. + */ +static inline void set_page_pfmemalloc(struct page *page) +{ + page->index = -1UL; +} + +static inline void clear_page_pfmemalloc(struct page *page) +{ + page->index = 0; +} + /* * Different kinds of faults, as returned by handle_mm_fault(). * Used to decide whether a process gets delivered SIGBUS or diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0038ac7466fd..15549578d559 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -63,15 +63,6 @@ struct page { union { pgoff_t index; /* Our offset within mapping. */ void *freelist; /* sl[aou]b first free object */ - bool pfmemalloc; /* If set by the page allocator, - * ALLOC_NO_WATERMARKS was set - * and the low watermark was not - * met implying that the system - * is under some pressure. The - * caller should try ensure - * this page is only used to - * free other pages. - */ }; union { diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 22b6d9ca1654..9b88536487e6 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1602,20 +1602,16 @@ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i, skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; /* - * Propagate page->pfmemalloc to the skb if we can. The problem is - * that not all callers have unique ownership of the page. If - * pfmemalloc is set, we check the mapping as a mapping implies - * page->index is set (index and pfmemalloc share space). - * If it's a valid mapping, we cannot use page->pfmemalloc but we - * do not lose pfmemalloc information as the pages would not be - * allocated using __GFP_MEMALLOC. + * Propagate page pfmemalloc to the skb if we can. The problem is + * that not all callers have unique ownership of the page but rely + * on page_is_pfmemalloc doing the right thing(tm). */ frag->page.p = page; frag->page_offset = off; skb_frag_size_set(frag, size); page = compound_head(page); - if (page->pfmemalloc && !page->mapping) + if (page_is_pfmemalloc(page)) skb->pfmemalloc = true; } @@ -2263,7 +2259,7 @@ static inline struct page *dev_alloc_page(void) static inline void skb_propagate_pfmemalloc(struct page *page, struct sk_buff *skb) { - if (page && page->pfmemalloc) + if (page_is_pfmemalloc(page)) skb->pfmemalloc = true; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index df959b7d6085..5b5240b7f642 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1343,12 +1343,15 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, set_page_owner(page, order, gfp_flags); /* - * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to + * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to * allocate the page. The expectation is that the caller is taking * steps that will free more memory. The caller should avoid the page * being used for !PFMEMALLOC purposes. */ - page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); + if (alloc_flags & ALLOC_NO_WATERMARKS) + set_page_pfmemalloc(page); + else + clear_page_pfmemalloc(page); return 0; } @@ -3345,7 +3348,7 @@ refill: atomic_add(size - 1, &page->_count); /* reset page count bias and offset to start of new frag */ - nc->pfmemalloc = page->pfmemalloc; + nc->pfmemalloc = page_is_pfmemalloc(page); nc->pagecnt_bias = size; nc->offset = size; } diff --git a/mm/slab.c b/mm/slab.c index 200e22412a16..bbd0b47dc6a9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1603,7 +1603,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, } /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ - if (unlikely(page->pfmemalloc)) + if (page_is_pfmemalloc(page)) pfmemalloc_active = true; nr_pages = (1 << cachep->gfporder); @@ -1614,7 +1614,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, add_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_pages); __SetPageSlab(page); - if (page->pfmemalloc) + if (page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { diff --git a/mm/slub.c b/mm/slub.c index 816df0016555..f68c0e50f3c0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1427,7 +1427,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) inc_slabs_node(s, page_to_nid(page), page->objects); page->slab_cache = s; __SetPageSlab(page); - if (page->pfmemalloc) + if (page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); start = page_address(page); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index bf9a5d93c2d1..7b84330e5d30 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -340,7 +340,7 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size) if (skb && frag_size) { skb->head_frag = 1; - if (virt_to_head_page(data)->pfmemalloc) + if (page_is_pfmemalloc(virt_to_head_page(data))) skb->pfmemalloc = 1; } return skb; -- cgit v1.2.3-70-g09d2