From be8c6d86d558860bbc493ad3fbc51fc96ab0c2e4 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 19 May 2022 17:05:06 +0200 Subject: net: mac802154: Rename the synchronous xmit worker There are currently two driver hooks: one is synchronous, the other is not. We cannot rely on driver implementations to provide a synchronous API (which is related to the bus medium more than a wish to have a synchronized implementation) so we are going to introduce a sync API above any kind of driver transmit function. In order to clarify what this worker is for (synchronous driver implementation), let's rename it so that people don't get bothered by the fact that their driver does not make use of the "xmit worker" which is a too generic name. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220519150516.443078-2-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- net/mac802154/ieee802154_i.h | 2 +- net/mac802154/main.c | 2 +- net/mac802154/tx.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index 1381e6a5e180..d7632c6d225f 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -123,7 +123,7 @@ ieee802154_sdata_running(struct ieee802154_sub_if_data *sdata) extern struct ieee802154_mlme_ops mac802154_mlme_wpan; void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb); -void ieee802154_xmit_worker(struct work_struct *work); +void ieee802154_xmit_sync_worker(struct work_struct *work); netdev_tx_t ieee802154_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t diff --git a/net/mac802154/main.c b/net/mac802154/main.c index bd7bdb1219dd..392771bba9dd 100644 --- a/net/mac802154/main.c +++ b/net/mac802154/main.c @@ -95,7 +95,7 @@ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops) skb_queue_head_init(&local->skb_queue); - INIT_WORK(&local->tx_work, ieee802154_xmit_worker); + INIT_WORK(&local->tx_work, ieee802154_xmit_sync_worker); /* init supported flags with 802.15.4 default ranges */ phy->supported.max_minbe = 8; diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index c829e4a75325..97df5985b830 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -22,7 +22,7 @@ #include "ieee802154_i.h" #include "driver-ops.h" -void ieee802154_xmit_worker(struct work_struct *work) +void ieee802154_xmit_sync_worker(struct work_struct *work) { struct ieee802154_local *local = container_of(work, struct ieee802154_local, tx_work); -- cgit v1.2.3-70-g09d2 From 983a974b40f66d202b075ab8ac584c698a3fc141 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 19 May 2022 17:05:07 +0200 Subject: net: mac802154: Rename the main tx_work struct This entry is dedicated to synchronous transmissions done by drivers without async hook. Make this clearer that this is not a work that any driver can use by at least prefixing it with "sync_". While at it, let's enhance the comment explaining why we choose one or the other. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220519150516.443078-3-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- net/mac802154/ieee802154_i.h | 2 +- net/mac802154/main.c | 2 +- net/mac802154/tx.c | 9 ++++++--- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index d7632c6d225f..a8b7b9049f14 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -55,7 +55,7 @@ struct ieee802154_local { struct sk_buff_head skb_queue; struct sk_buff *tx_skb; - struct work_struct tx_work; + struct work_struct sync_tx_work; /* A negative Linux error code or a null/positive MLME error status */ int tx_result; }; diff --git a/net/mac802154/main.c b/net/mac802154/main.c index 392771bba9dd..40fab08df24b 100644 --- a/net/mac802154/main.c +++ b/net/mac802154/main.c @@ -95,7 +95,7 @@ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops) skb_queue_head_init(&local->skb_queue); - INIT_WORK(&local->tx_work, ieee802154_xmit_sync_worker); + INIT_WORK(&local->sync_tx_work, ieee802154_xmit_sync_worker); /* init supported flags with 802.15.4 default ranges */ phy->supported.max_minbe = 8; diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 97df5985b830..a01689ddd547 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -25,7 +25,7 @@ void ieee802154_xmit_sync_worker(struct work_struct *work) { struct ieee802154_local *local = - container_of(work, struct ieee802154_local, tx_work); + container_of(work, struct ieee802154_local, sync_tx_work); struct sk_buff *skb = local->tx_skb; struct net_device *dev = skb->dev; int res; @@ -76,7 +76,10 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) /* Stop the netif queue on each sub_if_data object. */ ieee802154_stop_queue(&local->hw); - /* async is priority, otherwise sync is fallback */ + /* Drivers should preferably implement the async callback. In some rare + * cases they only provide a sync callback which we will use as a + * fallback. + */ if (local->ops->xmit_async) { unsigned int len = skb->len; @@ -90,7 +93,7 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) dev->stats.tx_bytes += len; } else { local->tx_skb = skb; - queue_work(local->workqueue, &local->tx_work); + queue_work(local->workqueue, &local->sync_tx_work); } return NETDEV_TX_OK; -- cgit v1.2.3-70-g09d2 From d08d951a9ae7e30fc0421a1af2b4193ce4aa6b3e Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 19 May 2022 17:05:08 +0200 Subject: net: mac802154: Enhance the error path in the main tx helper Before adding more logic in the error path, let's move the wake queue call there, rename the default label and create an additional one. There is no functional change. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220519150516.443078-4-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- net/mac802154/tx.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index a01689ddd547..4a46ce8d2ac8 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -65,7 +65,7 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) consume_skb(skb); skb = nskb; } else { - goto err_tx; + goto err_free_skb; } } @@ -84,10 +84,8 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) unsigned int len = skb->len; ret = drv_xmit_async(local, skb); - if (ret) { - ieee802154_wake_queue(&local->hw); - goto err_tx; - } + if (ret) + goto err_wake_netif_queue; dev->stats.tx_packets++; dev->stats.tx_bytes += len; @@ -98,7 +96,9 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) return NETDEV_TX_OK; -err_tx: +err_wake_netif_queue: + ieee802154_wake_queue(&local->hw); +err_free_skb: kfree_skb(skb); return NETDEV_TX_OK; } -- cgit v1.2.3-70-g09d2 From bde000ae459f2829ed88e967f7fa7665b4e3afaf Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 19 May 2022 17:05:09 +0200 Subject: net: mac802154: Follow the count of ongoing transmissions In order to create a synchronous API for MLME command purposes, we need to be able to track the end of the ongoing transmissions. Let's introduce an atomic variable which is incremented when a transmission starts and decremented when relevant so that we know at any moment whether there is an ongoing transmission. The counter gets decremented in the following situations: - The operation is asynchronous and there was a failure during the offloading process. - The operation is synchronous and the synchronous operation failed. - The operation finished, either successfully or not. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220519150516.443078-5-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/net/cfg802154.h | 3 +++ net/mac802154/tx.c | 3 +++ net/mac802154/util.c | 2 ++ 3 files changed, 8 insertions(+) (limited to 'net') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index d8d8719315fd..678ff00c7d70 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -214,6 +214,9 @@ struct wpan_phy { /* the network namespace this phy lives in currently */ possible_net_t _net; + /* Transmission monitoring */ + atomic_t ongoing_txs; + char priv[] __aligned(NETDEV_ALIGN); }; diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 4a46ce8d2ac8..33f64ecd96c7 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -44,6 +44,7 @@ void ieee802154_xmit_sync_worker(struct work_struct *work) err_tx: /* Restart the netif queue on each sub_if_data object. */ ieee802154_wake_queue(&local->hw); + atomic_dec(&local->phy->ongoing_txs); kfree_skb(skb); netdev_dbg(dev, "transmission failed\n"); } @@ -75,6 +76,7 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) /* Stop the netif queue on each sub_if_data object. */ ieee802154_stop_queue(&local->hw); + atomic_inc(&local->phy->ongoing_txs); /* Drivers should preferably implement the async callback. In some rare * cases they only provide a sync callback which we will use as a @@ -98,6 +100,7 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) err_wake_netif_queue: ieee802154_wake_queue(&local->hw); + atomic_dec(&local->phy->ongoing_txs); err_free_skb: kfree_skb(skb); return NETDEV_TX_OK; diff --git a/net/mac802154/util.c b/net/mac802154/util.c index 9f024d85563b..76dc663e2af4 100644 --- a/net/mac802154/util.c +++ b/net/mac802154/util.c @@ -88,6 +88,7 @@ void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb, } dev_consume_skb_any(skb); + atomic_dec(&hw->phy->ongoing_txs); } EXPORT_SYMBOL(ieee802154_xmit_complete); @@ -99,6 +100,7 @@ void ieee802154_xmit_error(struct ieee802154_hw *hw, struct sk_buff *skb, local->tx_result = reason; ieee802154_wake_queue(hw); dev_kfree_skb_any(skb); + atomic_dec(&hw->phy->ongoing_txs); } EXPORT_SYMBOL(ieee802154_xmit_error); -- cgit v1.2.3-70-g09d2 From 20a19d1df3e4079cbaa045ec89bbefb831d4705d Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 19 May 2022 17:05:10 +0200 Subject: net: mac802154: Bring the ability to hold the transmit queue Create a hold_txs atomic variable and increment/decrement it when relevant, ie. when we want to hold the queue or release it: currently all the "stopped" situations are suitable, but very soon we will more extensively use this feature for MLME purposes. Upon release, the atomic counter is decremented and checked. If it is back to 0, then the netif queue gets woken up. This makes the whole process fully transparent, provided that all the users of ieee802154_wake/stop_queue() now call ieee802154_hold/release_queue() instead. In no situation individual drivers should call any of these helpers manually in order to avoid messing with the counters. There are other functions more suited for this purpose which have been introduced, such as the _xmit_complete() and _xmit_error() helpers which will handle all that for them. One advantage is that, as no more drivers call the stop/wake helpers directly, we can safely stop exporting them and only declare the hold/release ones in a header only accessible to the core. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220519150516.443078-6-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/net/cfg802154.h | 6 +++-- include/net/mac802154.h | 27 ----------------------- net/ieee802154/core.c | 2 ++ net/mac802154/cfg.c | 4 ++-- net/mac802154/ieee802154_i.h | 19 ++++++++++++++++ net/mac802154/tx.c | 6 ++--- net/mac802154/util.c | 52 ++++++++++++++++++++++++++++++++++++++------ 7 files changed, 75 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index 678ff00c7d70..e87f1b07f20f 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -11,7 +11,7 @@ #include #include -#include +#include #include #include @@ -214,8 +214,10 @@ struct wpan_phy { /* the network namespace this phy lives in currently */ possible_net_t _net; - /* Transmission monitoring */ + /* Transmission monitoring and control */ + spinlock_t queue_lock; atomic_t ongoing_txs; + atomic_t hold_txs; char priv[] __aligned(NETDEV_ALIGN); }; diff --git a/include/net/mac802154.h b/include/net/mac802154.h index bdac0ddbdcdb..357d25ef627a 100644 --- a/include/net/mac802154.h +++ b/include/net/mac802154.h @@ -460,33 +460,6 @@ void ieee802154_unregister_hw(struct ieee802154_hw *hw); */ void ieee802154_rx_irqsafe(struct ieee802154_hw *hw, struct sk_buff *skb, u8 lqi); -/** - * ieee802154_wake_queue - wake ieee802154 queue - * @hw: pointer as obtained from ieee802154_alloc_hw(). - * - * Tranceivers usually have either one transmit framebuffer or one framebuffer - * for both transmitting and receiving. Hence, the core currently only handles - * one frame at a time for each phy, which means we had to stop the queue to - * avoid new skb to come during the transmission. The queue then needs to be - * woken up after the operation. - * - * Drivers should use this function instead of netif_wake_queue. - */ -void ieee802154_wake_queue(struct ieee802154_hw *hw); - -/** - * ieee802154_stop_queue - stop ieee802154 queue - * @hw: pointer as obtained from ieee802154_alloc_hw(). - * - * Tranceivers usually have either one transmit framebuffer or one framebuffer - * for both transmitting and receiving. Hence, the core currently only handles - * one frame at a time for each phy, which means we need to tell upper layers to - * stop giving us new skbs while we are busy with the transmitted one. The queue - * must then be stopped before transmitting. - * - * Drivers should use this function instead of netif_stop_queue. - */ -void ieee802154_stop_queue(struct ieee802154_hw *hw); /** * ieee802154_xmit_complete - frame transmission complete diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index de259b5170ab..47a4de6df88b 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -130,6 +130,8 @@ wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size) init_waitqueue_head(&rdev->dev_wait); + spin_lock_init(&rdev->wpan_phy.queue_lock); + return &rdev->wpan_phy; } EXPORT_SYMBOL(wpan_phy_new); diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c index 1e4a9f74ed43..b51100fd9e3f 100644 --- a/net/mac802154/cfg.c +++ b/net/mac802154/cfg.c @@ -46,7 +46,7 @@ static int ieee802154_suspend(struct wpan_phy *wpan_phy) if (!local->open_count) goto suspend; - ieee802154_stop_queue(&local->hw); + ieee802154_hold_queue(local); synchronize_net(); /* stop hardware - this must stop RX */ @@ -72,7 +72,7 @@ static int ieee802154_resume(struct wpan_phy *wpan_phy) return ret; wake_up: - ieee802154_wake_queue(&local->hw); + ieee802154_release_queue(local); local->suspended = false; return 0; } diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index a8b7b9049f14..0c7ff9e0b632 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -130,6 +130,25 @@ netdev_tx_t ieee802154_subif_start_xmit(struct sk_buff *skb, struct net_device *dev); enum hrtimer_restart ieee802154_xmit_ifs_timer(struct hrtimer *timer); +/** + * ieee802154_hold_queue - hold ieee802154 queue + * @local: main mac object + * + * Hold a queue by incrementing an atomic counter and requesting the netif + * queues to be stopped. The queues cannot be woken up while the counter has not + * been reset with as any ieee802154_release_queue() calls as needed. + */ +void ieee802154_hold_queue(struct ieee802154_local *local); + +/** + * ieee802154_release_queue - release ieee802154 queue + * @local: main mac object + * + * Release a queue which is held by decrementing an atomic counter and wake it + * up only if the counter reaches 0. + */ +void ieee802154_release_queue(struct ieee802154_local *local); + /* MIB callbacks */ void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan); diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 33f64ecd96c7..6a53c83cf039 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -43,7 +43,7 @@ void ieee802154_xmit_sync_worker(struct work_struct *work) err_tx: /* Restart the netif queue on each sub_if_data object. */ - ieee802154_wake_queue(&local->hw); + ieee802154_release_queue(local); atomic_dec(&local->phy->ongoing_txs); kfree_skb(skb); netdev_dbg(dev, "transmission failed\n"); @@ -75,7 +75,7 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) } /* Stop the netif queue on each sub_if_data object. */ - ieee802154_stop_queue(&local->hw); + ieee802154_hold_queue(local); atomic_inc(&local->phy->ongoing_txs); /* Drivers should preferably implement the async callback. In some rare @@ -99,7 +99,7 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) return NETDEV_TX_OK; err_wake_netif_queue: - ieee802154_wake_queue(&local->hw); + ieee802154_release_queue(local); atomic_dec(&local->phy->ongoing_txs); err_free_skb: kfree_skb(skb); diff --git a/net/mac802154/util.c b/net/mac802154/util.c index 76dc663e2af4..0ed8b5bcbe8a 100644 --- a/net/mac802154/util.c +++ b/net/mac802154/util.c @@ -13,7 +13,17 @@ /* privid for wpan_phys to determine whether they belong to us or not */ const void *const mac802154_wpan_phy_privid = &mac802154_wpan_phy_privid; -void ieee802154_wake_queue(struct ieee802154_hw *hw) +/** + * ieee802154_wake_queue - wake ieee802154 queue + * @local: main mac object + * + * Tranceivers usually have either one transmit framebuffer or one framebuffer + * for both transmitting and receiving. Hence, the core currently only handles + * one frame at a time for each phy, which means we had to stop the queue to + * avoid new skb to come during the transmission. The queue then needs to be + * woken up after the operation. + */ +static void ieee802154_wake_queue(struct ieee802154_hw *hw) { struct ieee802154_local *local = hw_to_local(hw); struct ieee802154_sub_if_data *sdata; @@ -27,9 +37,18 @@ void ieee802154_wake_queue(struct ieee802154_hw *hw) } rcu_read_unlock(); } -EXPORT_SYMBOL(ieee802154_wake_queue); -void ieee802154_stop_queue(struct ieee802154_hw *hw) +/** + * ieee802154_stop_queue - stop ieee802154 queue + * @local: main mac object + * + * Tranceivers usually have either one transmit framebuffer or one framebuffer + * for both transmitting and receiving. Hence, the core currently only handles + * one frame at a time for each phy, which means we need to tell upper layers to + * stop giving us new skbs while we are busy with the transmitted one. The queue + * must then be stopped before transmitting. + */ +static void ieee802154_stop_queue(struct ieee802154_hw *hw) { struct ieee802154_local *local = hw_to_local(hw); struct ieee802154_sub_if_data *sdata; @@ -43,14 +62,33 @@ void ieee802154_stop_queue(struct ieee802154_hw *hw) } rcu_read_unlock(); } -EXPORT_SYMBOL(ieee802154_stop_queue); + +void ieee802154_hold_queue(struct ieee802154_local *local) +{ + unsigned long flags; + + spin_lock_irqsave(&local->phy->queue_lock, flags); + if (!atomic_fetch_inc(&local->phy->hold_txs)) + ieee802154_stop_queue(&local->hw); + spin_unlock_irqrestore(&local->phy->queue_lock, flags); +} + +void ieee802154_release_queue(struct ieee802154_local *local) +{ + unsigned long flags; + + spin_lock_irqsave(&local->phy->queue_lock, flags); + if (!atomic_dec_and_test(&local->phy->hold_txs)) + ieee802154_wake_queue(&local->hw); + spin_unlock_irqrestore(&local->phy->queue_lock, flags); +} enum hrtimer_restart ieee802154_xmit_ifs_timer(struct hrtimer *timer) { struct ieee802154_local *local = container_of(timer, struct ieee802154_local, ifs_timer); - ieee802154_wake_queue(&local->hw); + ieee802154_release_queue(local); return HRTIMER_NORESTART; } @@ -84,7 +122,7 @@ void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb, hw->phy->sifs_period * NSEC_PER_USEC, HRTIMER_MODE_REL); } else { - ieee802154_wake_queue(hw); + ieee802154_release_queue(local); } dev_consume_skb_any(skb); @@ -98,7 +136,7 @@ void ieee802154_xmit_error(struct ieee802154_hw *hw, struct sk_buff *skb, struct ieee802154_local *local = hw_to_local(hw); local->tx_result = reason; - ieee802154_wake_queue(hw); + ieee802154_release_queue(local); dev_kfree_skb_any(skb); atomic_dec(&hw->phy->ongoing_txs); } -- cgit v1.2.3-70-g09d2 From 226730e1aa2844b6e5499a6fe6bea4db17a894a8 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 19 May 2022 17:05:11 +0200 Subject: net: mac802154: Create a hot tx path Let's rename the current Tx path to show that this is the "hot" Tx path. We will soon introduce a slower Tx path for MLME commands. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220519150516.443078-7-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- net/mac802154/tx.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 6a53c83cf039..607019b8f8ab 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -106,6 +106,12 @@ err_free_skb: return NETDEV_TX_OK; } +static netdev_tx_t +ieee802154_hot_tx(struct ieee802154_local *local, struct sk_buff *skb) +{ + return ieee802154_tx(local, skb); +} + netdev_tx_t ieee802154_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -113,7 +119,7 @@ ieee802154_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev) skb->skb_iif = dev->ifindex; - return ieee802154_tx(sdata->local, skb); + return ieee802154_hot_tx(sdata->local, skb); } netdev_tx_t @@ -135,5 +141,5 @@ ieee802154_subif_start_xmit(struct sk_buff *skb, struct net_device *dev) skb->skb_iif = dev->ifindex; - return ieee802154_tx(sdata->local, skb); + return ieee802154_hot_tx(sdata->local, skb); } -- cgit v1.2.3-70-g09d2 From a40612f399eabe23424acf5985643a35d4f2832e Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 19 May 2022 17:05:12 +0200 Subject: net: mac802154: Introduce a helper to disable the queue Sometimes calling the stop queue helper is not enough because it does not hold any lock. In order to be safe and avoid racy situations when trying to (soon) sync the Tx queue, for instance before sending an MLME frame, let's now introduce an helper which actually hold the necessary locks when doing so. Suggested-by: Alexander Aring Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220519150516.443078-8-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- net/mac802154/ieee802154_i.h | 12 ++++++++++++ net/mac802154/util.c | 14 ++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'net') diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index 0c7ff9e0b632..e34db1d49ef4 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -149,6 +149,18 @@ void ieee802154_hold_queue(struct ieee802154_local *local); */ void ieee802154_release_queue(struct ieee802154_local *local); +/** + * ieee802154_disable_queue - disable ieee802154 queue + * @local: main mac object + * + * When trying to sync the Tx queue, we cannot just stop the queue + * (which is basically a bit being set without proper lock handling) + * because it would be racy. We actually need to call netif_tx_disable() + * instead, which is done by this helper. Restarting the queue can + * however still be done with a regular wake call. + */ +void ieee802154_disable_queue(struct ieee802154_local *local); + /* MIB callbacks */ void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan); diff --git a/net/mac802154/util.c b/net/mac802154/util.c index 0ed8b5bcbe8a..999534f64485 100644 --- a/net/mac802154/util.c +++ b/net/mac802154/util.c @@ -83,6 +83,20 @@ void ieee802154_release_queue(struct ieee802154_local *local) spin_unlock_irqrestore(&local->phy->queue_lock, flags); } +void ieee802154_disable_queue(struct ieee802154_local *local) +{ + struct ieee802154_sub_if_data *sdata; + + rcu_read_lock(); + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + if (!sdata->dev) + continue; + + netif_tx_disable(sdata->dev); + } + rcu_read_unlock(); +} + enum hrtimer_restart ieee802154_xmit_ifs_timer(struct hrtimer *timer) { struct ieee802154_local *local = -- cgit v1.2.3-70-g09d2 From f0feb34904735ffa21fe7b0c50f9f9527ec74b7a Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 19 May 2022 17:05:13 +0200 Subject: net: mac802154: Introduce a tx queue flushing mechanism Right now we are able to stop a queue but we have no indication if a transmission is ongoing or not. Thanks to recent additions, we can track the number of ongoing transmissions so we know if the last transmission is over. Adding on top of it an internal wait queue also allows to be woken up asynchronously when this happens. If, beforehands, we marked the queue to be held and stopped it, we end up flushing and stopping the tx queue. Thanks to this feature, we will soon be able to introduce a synchronous transmit API. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220519150516.443078-9-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/net/cfg802154.h | 1 + net/ieee802154/core.c | 1 + net/mac802154/cfg.c | 2 +- net/mac802154/ieee802154_i.h | 1 + net/mac802154/tx.c | 26 ++++++++++++++++++++++++-- net/mac802154/util.c | 6 ++++-- 6 files changed, 32 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index e87f1b07f20f..0804d79669a4 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -218,6 +218,7 @@ struct wpan_phy { spinlock_t queue_lock; atomic_t ongoing_txs; atomic_t hold_txs; + wait_queue_head_t sync_txq; char priv[] __aligned(NETDEV_ALIGN); }; diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index 47a4de6df88b..57546e07e06a 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -129,6 +129,7 @@ wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size) wpan_phy_net_set(&rdev->wpan_phy, &init_net); init_waitqueue_head(&rdev->dev_wait); + init_waitqueue_head(&rdev->wpan_phy.sync_txq); spin_lock_init(&rdev->wpan_phy.queue_lock); diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c index b51100fd9e3f..93df24f75572 100644 --- a/net/mac802154/cfg.c +++ b/net/mac802154/cfg.c @@ -46,7 +46,7 @@ static int ieee802154_suspend(struct wpan_phy *wpan_phy) if (!local->open_count) goto suspend; - ieee802154_hold_queue(local); + ieee802154_sync_and_hold_queue(local); synchronize_net(); /* stop hardware - this must stop RX */ diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index e34db1d49ef4..a057827fc48a 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -124,6 +124,7 @@ extern struct ieee802154_mlme_ops mac802154_mlme_wpan; void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb); void ieee802154_xmit_sync_worker(struct work_struct *work); +int ieee802154_sync_and_hold_queue(struct ieee802154_local *local); netdev_tx_t ieee802154_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 607019b8f8ab..38f74b8b6740 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -44,7 +44,8 @@ void ieee802154_xmit_sync_worker(struct work_struct *work) err_tx: /* Restart the netif queue on each sub_if_data object. */ ieee802154_release_queue(local); - atomic_dec(&local->phy->ongoing_txs); + if (!atomic_dec_and_test(&local->phy->ongoing_txs)) + wake_up(&local->phy->sync_txq); kfree_skb(skb); netdev_dbg(dev, "transmission failed\n"); } @@ -100,12 +101,33 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) err_wake_netif_queue: ieee802154_release_queue(local); - atomic_dec(&local->phy->ongoing_txs); + if (!atomic_dec_and_test(&local->phy->ongoing_txs)) + wake_up(&local->phy->sync_txq); err_free_skb: kfree_skb(skb); return NETDEV_TX_OK; } +static int ieee802154_sync_queue(struct ieee802154_local *local) +{ + int ret; + + ieee802154_hold_queue(local); + ieee802154_disable_queue(local); + wait_event(local->phy->sync_txq, !atomic_read(&local->phy->ongoing_txs)); + ret = local->tx_result; + ieee802154_release_queue(local); + + return ret; +} + +int ieee802154_sync_and_hold_queue(struct ieee802154_local *local) +{ + ieee802154_hold_queue(local); + + return ieee802154_sync_queue(local); +} + static netdev_tx_t ieee802154_hot_tx(struct ieee802154_local *local, struct sk_buff *skb) { diff --git a/net/mac802154/util.c b/net/mac802154/util.c index 999534f64485..5e1fcc7b0123 100644 --- a/net/mac802154/util.c +++ b/net/mac802154/util.c @@ -140,7 +140,8 @@ void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb, } dev_consume_skb_any(skb); - atomic_dec(&hw->phy->ongoing_txs); + if (!atomic_dec_and_test(&hw->phy->ongoing_txs)) + wake_up(&hw->phy->sync_txq); } EXPORT_SYMBOL(ieee802154_xmit_complete); @@ -152,7 +153,8 @@ void ieee802154_xmit_error(struct ieee802154_hw *hw, struct sk_buff *skb, local->tx_result = reason; ieee802154_release_queue(local); dev_kfree_skb_any(skb); - atomic_dec(&hw->phy->ongoing_txs); + if (!atomic_dec_and_test(&hw->phy->ongoing_txs)) + wake_up(&hw->phy->sync_txq); } EXPORT_SYMBOL(ieee802154_xmit_error); -- cgit v1.2.3-70-g09d2 From ddd9ee7cda122ac55571b8b11b51ef1e71918b63 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 19 May 2022 17:05:14 +0200 Subject: net: mac802154: Introduce a synchronous API for MLME commands This is the slow path, we need to wait for each command to be processed before continuing so let's introduce an helper which does the transmission and blocks until it gets notified of its asynchronous completion. This helper is going to be used when introducing scan support. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220519150516.443078-10-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- net/mac802154/ieee802154_i.h | 4 ++++ net/mac802154/tx.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) (limited to 'net') diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index a057827fc48a..8a4816ae71e7 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -125,6 +125,10 @@ extern struct ieee802154_mlme_ops mac802154_mlme_wpan; void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb); void ieee802154_xmit_sync_worker(struct work_struct *work); int ieee802154_sync_and_hold_queue(struct ieee802154_local *local); +int ieee802154_mlme_op_pre(struct ieee802154_local *local); +int ieee802154_mlme_tx(struct ieee802154_local *local, struct sk_buff *skb); +void ieee802154_mlme_op_post(struct ieee802154_local *local); +int ieee802154_mlme_tx_one(struct ieee802154_local *local, struct sk_buff *skb); netdev_tx_t ieee802154_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 38f74b8b6740..4827391600f6 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -128,6 +128,50 @@ int ieee802154_sync_and_hold_queue(struct ieee802154_local *local) return ieee802154_sync_queue(local); } +int ieee802154_mlme_op_pre(struct ieee802154_local *local) +{ + return ieee802154_sync_and_hold_queue(local); +} + +int ieee802154_mlme_tx(struct ieee802154_local *local, struct sk_buff *skb) +{ + int ret; + + /* Avoid possible calls to ->ndo_stop() when we asynchronously perform + * MLME transmissions. + */ + rtnl_lock(); + + /* Ensure the device was not stopped, otherwise error out */ + if (!local->open_count) { + rtnl_unlock(); + return -ENETDOWN; + } + + ieee802154_tx(local, skb); + ret = ieee802154_sync_queue(local); + + rtnl_unlock(); + + return ret; +} + +void ieee802154_mlme_op_post(struct ieee802154_local *local) +{ + ieee802154_release_queue(local); +} + +int ieee802154_mlme_tx_one(struct ieee802154_local *local, struct sk_buff *skb) +{ + int ret; + + ieee802154_mlme_op_pre(local); + ret = ieee802154_mlme_tx(local, skb); + ieee802154_mlme_op_post(local); + + return ret; +} + static netdev_tx_t ieee802154_hot_tx(struct ieee802154_local *local, struct sk_buff *skb) { -- cgit v1.2.3-70-g09d2 From 2b13db13af50a5dcdb944723c828915a50f0c3b2 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 19 May 2022 17:05:15 +0200 Subject: net: mac802154: Add a warning in the hot path We should never start a transmission after the queue has been stopped. But because it might work we don't kill the function here but rather warn loudly the user that something is wrong. Set a flag when the queue should remain stopped. Reset this flag when the queue actually gets restarded. Just check this value to know if a transmission is legitimate, warn if it is not. Turn the flags variable into an unsigned long to allow the use of atomic helpers on it. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220519150516.443078-11-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/net/cfg802154.h | 5 ++++- net/mac802154/tx.c | 16 +++++++++++++++- net/mac802154/util.c | 1 + 3 files changed, 20 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index 0804d79669a4..428cece22205 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -166,11 +166,14 @@ wpan_phy_cca_cmp(const struct wpan_phy_cca *a, const struct wpan_phy_cca *b) * level setting. * @WPAN_PHY_FLAG_CCA_MODE: Indicates that transceiver will support cca mode * setting. + * @WPAN_PHY_FLAG_STATE_QUEUE_STOPPED: Indicates that the transmit queue was + * temporarily stopped. */ enum wpan_phy_flags { WPAN_PHY_FLAG_TXPOWER = BIT(1), WPAN_PHY_FLAG_CCA_ED_LEVEL = BIT(2), WPAN_PHY_FLAG_CCA_MODE = BIT(3), + WPAN_PHY_FLAG_STATE_QUEUE_STOPPED = BIT(4), }; struct wpan_phy { @@ -182,7 +185,7 @@ struct wpan_phy { */ const void *privid; - u32 flags; + unsigned long flags; /* * This is a PIB according to 802.15.4-2011. diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 4827391600f6..6188f42276e7 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -123,9 +123,13 @@ static int ieee802154_sync_queue(struct ieee802154_local *local) int ieee802154_sync_and_hold_queue(struct ieee802154_local *local) { + int ret; + ieee802154_hold_queue(local); + ret = ieee802154_sync_queue(local); + set_bit(WPAN_PHY_FLAG_STATE_QUEUE_STOPPED, &local->phy->flags); - return ieee802154_sync_queue(local); + return ret; } int ieee802154_mlme_op_pre(struct ieee802154_local *local) @@ -172,9 +176,19 @@ int ieee802154_mlme_tx_one(struct ieee802154_local *local, struct sk_buff *skb) return ret; } +static bool ieee802154_queue_is_stopped(struct ieee802154_local *local) +{ + return test_bit(WPAN_PHY_FLAG_STATE_QUEUE_STOPPED, &local->phy->flags); +} + static netdev_tx_t ieee802154_hot_tx(struct ieee802154_local *local, struct sk_buff *skb) { + /* Warn if the net interface tries to transmit frames while the + * ieee802154 core assumes the queue is stopped. + */ + WARN_ON_ONCE(ieee802154_queue_is_stopped(local)); + return ieee802154_tx(local, skb); } diff --git a/net/mac802154/util.c b/net/mac802154/util.c index 5e1fcc7b0123..60eb7bd3bfc1 100644 --- a/net/mac802154/util.c +++ b/net/mac802154/util.c @@ -29,6 +29,7 @@ static void ieee802154_wake_queue(struct ieee802154_hw *hw) struct ieee802154_sub_if_data *sdata; rcu_read_lock(); + clear_bit(WPAN_PHY_FLAG_STATE_QUEUE_STOPPED, &local->phy->flags); list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!sdata->dev) continue; -- cgit v1.2.3-70-g09d2 From 4f790184139beb8b35a1f4f9a4b6731c6eef1763 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 19 May 2022 17:05:16 +0200 Subject: net: mac802154: Add a warning in the slow path In order to be able to detect possible conflicts between the net interface core and the ieee802154 core, let's add a warning in the slow path: we want to be sure that whenever we start an asynchronous MLME transmission (which can be fully asynchronous) the net core somehow agrees that this transmission is possible, ie. the device was not stopped. Warning in this case would allow us to track down more easily possible issues with the MLME logic if we ever get reports. Unlike in the hot path, such a situation cannot be handled. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220519150516.443078-12-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- net/mac802154/tx.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'net') diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 6188f42276e7..5b471e932271 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -132,6 +132,25 @@ int ieee802154_sync_and_hold_queue(struct ieee802154_local *local) return ret; } +static bool ieee802154_netif_is_down(struct ieee802154_local *local) +{ + struct ieee802154_sub_if_data *sdata; + bool is_down = true; + + rcu_read_lock(); + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + if (!sdata->dev) + continue; + + is_down = !netif_running(sdata->dev); + if (is_down) + break; + } + rcu_read_unlock(); + + return is_down; +} + int ieee802154_mlme_op_pre(struct ieee802154_local *local) { return ieee802154_sync_and_hold_queue(local); @@ -152,6 +171,14 @@ int ieee802154_mlme_tx(struct ieee802154_local *local, struct sk_buff *skb) return -ENETDOWN; } + /* Warn if the ieee802154 core thinks MLME frames can be sent while the + * net interface expects this cannot happen. + */ + if (WARN_ON_ONCE(ieee802154_netif_is_down(local))) { + rtnl_unlock(); + return -ENETDOWN; + } + ieee802154_tx(local, skb); ret = ieee802154_sync_queue(local); -- cgit v1.2.3-70-g09d2 From 2ec2f6bed4d12a7ed74ed179a5d2879b117105d1 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 13 Jun 2022 00:37:34 -0400 Subject: mac802154: util: fix release queue handling The semantic of atomic_dec_and_test() is to return true if zero is reached and we need call ieee802154_wake_queue() when zero is reached. Fixes: 20a19d1df3e4 ("net: mac802154: Bring the ability to hold the transmit queue") Signed-off-by: Alexander Aring Reviewed-by: Miquel Raynal Link: https://lore.kernel.org/r/20220613043735.1039895-2-aahringo@redhat.com Signed-off-by: Stefan Schmidt --- net/mac802154/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac802154/util.c b/net/mac802154/util.c index 60eb7bd3bfc1..60f6c0f10641 100644 --- a/net/mac802154/util.c +++ b/net/mac802154/util.c @@ -79,7 +79,7 @@ void ieee802154_release_queue(struct ieee802154_local *local) unsigned long flags; spin_lock_irqsave(&local->phy->queue_lock, flags); - if (!atomic_dec_and_test(&local->phy->hold_txs)) + if (atomic_dec_and_test(&local->phy->hold_txs)) ieee802154_wake_queue(&local->hw); spin_unlock_irqrestore(&local->phy->queue_lock, flags); } -- cgit v1.2.3-70-g09d2 From 6c1c78d0182fcbfe953bf5a0c3e71204d176b887 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 13 Jun 2022 00:37:35 -0400 Subject: mac802154: fix atomic_dec_and_test checks We need to call wake_up() when hold_txs reaches zero. The semantic of atomic_dec_and_test() is that it returns true when it's zero. Fixes: f0feb3490473 ("net: mac802154: Introduce a tx queue flushing mechanism") Signed-off-by: Alexander Aring Reviewed-by: Miquel Raynal Link: https://lore.kernel.org/r/20220613043735.1039895-3-aahringo@redhat.com Signed-off-by: Stefan Schmidt --- net/mac802154/tx.c | 4 ++-- net/mac802154/util.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 5b471e932271..8ddcd2e841ca 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -44,7 +44,7 @@ void ieee802154_xmit_sync_worker(struct work_struct *work) err_tx: /* Restart the netif queue on each sub_if_data object. */ ieee802154_release_queue(local); - if (!atomic_dec_and_test(&local->phy->ongoing_txs)) + if (atomic_dec_and_test(&local->phy->ongoing_txs)) wake_up(&local->phy->sync_txq); kfree_skb(skb); netdev_dbg(dev, "transmission failed\n"); @@ -101,7 +101,7 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) err_wake_netif_queue: ieee802154_release_queue(local); - if (!atomic_dec_and_test(&local->phy->ongoing_txs)) + if (atomic_dec_and_test(&local->phy->ongoing_txs)) wake_up(&local->phy->sync_txq); err_free_skb: kfree_skb(skb); diff --git a/net/mac802154/util.c b/net/mac802154/util.c index 60f6c0f10641..f08605f59b60 100644 --- a/net/mac802154/util.c +++ b/net/mac802154/util.c @@ -141,7 +141,7 @@ void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb, } dev_consume_skb_any(skb); - if (!atomic_dec_and_test(&hw->phy->ongoing_txs)) + if (atomic_dec_and_test(&hw->phy->ongoing_txs)) wake_up(&hw->phy->sync_txq); } EXPORT_SYMBOL(ieee802154_xmit_complete); @@ -154,7 +154,7 @@ void ieee802154_xmit_error(struct ieee802154_hw *hw, struct sk_buff *skb, local->tx_result = reason; ieee802154_release_queue(local); dev_kfree_skb_any(skb); - if (!atomic_dec_and_test(&hw->phy->ongoing_txs)) + if (atomic_dec_and_test(&hw->phy->ongoing_txs)) wake_up(&hw->phy->sync_txq); } EXPORT_SYMBOL(ieee802154_xmit_error); -- cgit v1.2.3-70-g09d2 From fbdaa5ba6bd6955f7e7f9228e4d815cc5e43fe5b Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 17 Jun 2022 21:29:14 +0200 Subject: net: mac802154: Fix a Tx warning check The purpose of the netif_is_down() helper was to ensure that the network interface used was still up when performing the transmission. What it actually did was to check if _all_ interfaces were up. This was not noticed at that time because I did not use interfaces at all before discussing with Alexander Aring about how to handle coordinators properly. Drop the helper and call netif_running() on the right sub interface object directly. Fixes: 4f790184139b ("net: mac802154: Add a warning in the slow path") Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20220617192914.1275611-1-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- net/mac802154/ieee802154_i.h | 8 ++++++-- net/mac802154/tx.c | 31 ++++++++----------------------- 2 files changed, 14 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index 8a4816ae71e7..010365a6364e 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -126,9 +126,13 @@ void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb); void ieee802154_xmit_sync_worker(struct work_struct *work); int ieee802154_sync_and_hold_queue(struct ieee802154_local *local); int ieee802154_mlme_op_pre(struct ieee802154_local *local); -int ieee802154_mlme_tx(struct ieee802154_local *local, struct sk_buff *skb); +int ieee802154_mlme_tx(struct ieee802154_local *local, + struct ieee802154_sub_if_data *sdata, + struct sk_buff *skb); void ieee802154_mlme_op_post(struct ieee802154_local *local); -int ieee802154_mlme_tx_one(struct ieee802154_local *local, struct sk_buff *skb); +int ieee802154_mlme_tx_one(struct ieee802154_local *local, + struct ieee802154_sub_if_data *sdata, + struct sk_buff *skb); netdev_tx_t ieee802154_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 8ddcd2e841ca..9d8d43cf1e64 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -132,31 +132,14 @@ int ieee802154_sync_and_hold_queue(struct ieee802154_local *local) return ret; } -static bool ieee802154_netif_is_down(struct ieee802154_local *local) -{ - struct ieee802154_sub_if_data *sdata; - bool is_down = true; - - rcu_read_lock(); - list_for_each_entry_rcu(sdata, &local->interfaces, list) { - if (!sdata->dev) - continue; - - is_down = !netif_running(sdata->dev); - if (is_down) - break; - } - rcu_read_unlock(); - - return is_down; -} - int ieee802154_mlme_op_pre(struct ieee802154_local *local) { return ieee802154_sync_and_hold_queue(local); } -int ieee802154_mlme_tx(struct ieee802154_local *local, struct sk_buff *skb) +int ieee802154_mlme_tx(struct ieee802154_local *local, + struct ieee802154_sub_if_data *sdata, + struct sk_buff *skb) { int ret; @@ -174,7 +157,7 @@ int ieee802154_mlme_tx(struct ieee802154_local *local, struct sk_buff *skb) /* Warn if the ieee802154 core thinks MLME frames can be sent while the * net interface expects this cannot happen. */ - if (WARN_ON_ONCE(ieee802154_netif_is_down(local))) { + if (WARN_ON_ONCE(!netif_running(sdata->dev))) { rtnl_unlock(); return -ENETDOWN; } @@ -192,12 +175,14 @@ void ieee802154_mlme_op_post(struct ieee802154_local *local) ieee802154_release_queue(local); } -int ieee802154_mlme_tx_one(struct ieee802154_local *local, struct sk_buff *skb) +int ieee802154_mlme_tx_one(struct ieee802154_local *local, + struct ieee802154_sub_if_data *sdata, + struct sk_buff *skb) { int ret; ieee802154_mlme_op_pre(local); - ret = ieee802154_mlme_tx(local, skb); + ret = ieee802154_mlme_tx(local, sdata, skb); ieee802154_mlme_op_post(local); return ret; -- cgit v1.2.3-70-g09d2 From 1d9e4c91db17c9bf6f94ac234a4d4f2bffd52b97 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Tue, 6 Sep 2022 19:02:04 +0200 Subject: wifi: mac80211: add pointer from link STA to STA While often not needed, this considerably simplifies going from a link to the STA. This helps in cases such as debugfs where a single pointer should allow accessing a specific link and the STA. Signed-off-by: Benjamin Berg Signed-off-by: Johannes Berg --- include/net/mac80211.h | 3 +++ net/mac80211/sta_info.c | 1 + 2 files changed, 4 insertions(+) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index ac2bad57933f..7778a92d9582 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2176,6 +2176,7 @@ struct ieee80211_sta_aggregates { * All link specific info for a STA link for a non MLD STA(single) * or a MLD STA(multiple entries) are stored here. * + * @sta: reference to owning STA * @addr: MAC address of the Link STA. For non-MLO STA this is same as the addr * in ieee80211_sta. For MLO Link STA this addr can be same or different * from addr in ieee80211_sta (representing MLD STA addr) @@ -2196,6 +2197,8 @@ struct ieee80211_sta_aggregates { * */ struct ieee80211_link_sta { + struct ieee80211_sta *sta; + u8 addr[ETH_ALEN]; u8 link_id; enum ieee80211_smps_mode smps_mode; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index cebfd148bb40..71b1488bd390 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -511,6 +511,7 @@ static void sta_info_add_link(struct sta_info *sta, link_info->sta = sta; link_info->link_id = link_id; link_info->pub = link_sta; + link_info->pub->sta = &sta->sta; link_sta->link_id = link_id; rcu_assign_pointer(sta->link[link_id], link_info); rcu_assign_pointer(sta->sta.link[link_id], link_sta); -- cgit v1.2.3-70-g09d2 From d2caad527c191563116809990081ab4fc0dafdb6 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Tue, 6 Sep 2022 14:26:52 +0200 Subject: wifi: mac80211: add API to show the link STAs in debugfs Create debugfs data per-link. For drivers, there is a new operation link_sta_add_debugfs which will always be called. For non-MLO, the station directory will be used directly rather than creating a corresponding subdirectory. As such, non-MLO drivers can simply continue to create the data from sta_debugfs_add. Signed-off-by: Benjamin Berg [add missing inlines if !CONFIG_MAC80211_DEBUGFS] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 11 ++++ net/mac80211/debugfs_sta.c | 127 +++++++++++++++++++++++++++++++++++++-------- net/mac80211/debugfs_sta.h | 12 +++++ net/mac80211/driver-ops.c | 27 +++++++++- net/mac80211/driver-ops.h | 16 ++++++ net/mac80211/sta_info.c | 25 +++++++++ net/mac80211/sta_info.h | 5 ++ 7 files changed, 200 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 7778a92d9582..c413050ec8dd 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3790,6 +3790,13 @@ struct ieee80211_prep_tx_info { * should be within a CONFIG_MAC80211_DEBUGFS conditional. This * callback can sleep. * + * @link_sta_add_debugfs: Drivers can use this callback to add debugfs files + * when a link is added to a mac80211 station. This callback + * should be within a CPTCFG_MAC80211_DEBUGFS conditional. This + * callback can sleep. + * For non-MLO the callback will be called once for the deflink with the + * station's directory rather than a separate subdirectory. + * * @sta_notify: Notifies low level driver about power state transition of an * associated station, AP, IBSS/WDS/mesh peer etc. For a VIF operating * in AP mode, this callback will not be called when the flag @@ -4260,6 +4267,10 @@ struct ieee80211_ops { struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct dentry *dir); + void (*link_sta_add_debugfs)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_link_sta *link_sta, + struct dentry *dir); #endif void (*sta_notify)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum sta_notify_cmd, struct ieee80211_sta *sta); diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index d3397c1248d3..68c07d4b95a5 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -5,7 +5,7 @@ * Copyright 2007 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2016 Intel Deutschland GmbH - * Copyright (C) 2018 - 2021 Intel Corporation + * Copyright (C) 2018 - 2022 Intel Corporation */ #include @@ -435,8 +435,16 @@ static ssize_t sta_agg_status_write(struct file *file, const char __user *userbu } STA_OPS_RW(agg_status); -static ssize_t sta_ht_capa_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) +/* link sta attributes */ +#define LINK_STA_OPS(name) \ +static const struct file_operations link_sta_ ##name## _ops = { \ + .read = link_sta_##name##_read, \ + .open = simple_open, \ + .llseek = generic_file_llseek, \ +} + +static ssize_t link_sta_ht_capa_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) { #define PRINT_HT_CAP(_cond, _str) \ do { \ @@ -446,8 +454,8 @@ static ssize_t sta_ht_capa_read(struct file *file, char __user *userbuf, char *buf, *p; int i; ssize_t bufsz = 512; - struct sta_info *sta = file->private_data; - struct ieee80211_sta_ht_cap *htc = &sta->sta.deflink.ht_cap; + struct link_sta_info *link_sta = file->private_data; + struct ieee80211_sta_ht_cap *htc = &link_sta->pub->ht_cap; ssize_t ret; buf = kzalloc(bufsz, GFP_KERNEL); @@ -524,14 +532,14 @@ static ssize_t sta_ht_capa_read(struct file *file, char __user *userbuf, kfree(buf); return ret; } -STA_OPS(ht_capa); +LINK_STA_OPS(ht_capa); -static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) +static ssize_t link_sta_vht_capa_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) { char *buf, *p; - struct sta_info *sta = file->private_data; - struct ieee80211_sta_vht_cap *vhtc = &sta->sta.deflink.vht_cap; + struct link_sta_info *link_sta = file->private_data; + struct ieee80211_sta_vht_cap *vhtc = &link_sta->pub->vht_cap; ssize_t ret; ssize_t bufsz = 512; @@ -638,15 +646,15 @@ static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf, kfree(buf); return ret; } -STA_OPS(vht_capa); +LINK_STA_OPS(vht_capa); -static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) +static ssize_t link_sta_he_capa_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) { char *buf, *p; size_t buf_sz = PAGE_SIZE; - struct sta_info *sta = file->private_data; - struct ieee80211_sta_he_cap *hec = &sta->sta.deflink.he_cap; + struct link_sta_info *link_sta = file->private_data; + struct ieee80211_sta_he_cap *hec = &link_sta->pub->he_cap; struct ieee80211_he_mcs_nss_supp *nss = &hec->he_mcs_nss_supp; u8 ppe_size; u8 *cap; @@ -1011,7 +1019,7 @@ out: kfree(buf); return ret; } -STA_OPS(he_capa); +LINK_STA_OPS(he_capa); #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0400, \ @@ -1048,12 +1056,7 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) DEBUGFS_ADD(num_ps_buf_frames); DEBUGFS_ADD(last_seq_ctrl); DEBUGFS_ADD(agg_status); - DEBUGFS_ADD(ht_capa); - DEBUGFS_ADD(vht_capa); - DEBUGFS_ADD(he_capa); - - DEBUGFS_ADD_COUNTER(rx_duplicates, deflink.rx_stats.num_duplicates); - DEBUGFS_ADD_COUNTER(rx_fragments, deflink.rx_stats.fragments); + /* FIXME: Kept here as the statistics are only done on the deflink */ DEBUGFS_ADD_COUNTER(tx_filtered, deflink.status_stats.filtered); if (local->ops->wake_tx_queue) { @@ -1076,3 +1079,83 @@ void ieee80211_sta_debugfs_remove(struct sta_info *sta) debugfs_remove_recursive(sta->debugfs_dir); sta->debugfs_dir = NULL; } + +#undef DEBUGFS_ADD +#undef DEBUGFS_ADD_COUNTER + +#define DEBUGFS_ADD(name) \ + debugfs_create_file(#name, 0400, \ + link_sta->debugfs_dir, link_sta, &link_sta_ ##name## _ops) +#define DEBUGFS_ADD_COUNTER(name, field) \ + debugfs_create_ulong(#name, 0400, link_sta->debugfs_dir, &link_sta->field) + +void ieee80211_link_sta_debugfs_add(struct link_sta_info *link_sta) +{ + if (WARN_ON(!link_sta->sta->debugfs_dir)) + return; + + /* For non-MLO, leave the files in the main directory. */ + if (link_sta->sta->sta.valid_links) { + char link_dir_name[10]; + + snprintf(link_dir_name, sizeof(link_dir_name), + "link-%d", link_sta->link_id); + + link_sta->debugfs_dir = + debugfs_create_dir(link_dir_name, + link_sta->sta->debugfs_dir); + } else { + if (WARN_ON(link_sta != &link_sta->sta->deflink)) + return; + + link_sta->debugfs_dir = link_sta->sta->debugfs_dir; + } + + DEBUGFS_ADD(ht_capa); + DEBUGFS_ADD(vht_capa); + DEBUGFS_ADD(he_capa); + + DEBUGFS_ADD_COUNTER(rx_duplicates, rx_stats.num_duplicates); + DEBUGFS_ADD_COUNTER(rx_fragments, rx_stats.fragments); +} + +void ieee80211_link_sta_debugfs_remove(struct link_sta_info *link_sta) +{ + if (!link_sta->debugfs_dir || !link_sta->sta->debugfs_dir) { + link_sta->debugfs_dir = NULL; + return; + } + + if (link_sta->debugfs_dir == link_sta->sta->debugfs_dir) { + WARN_ON(link_sta != &link_sta->sta->deflink); + link_sta->sta->debugfs_dir = NULL; + return; + } + + debugfs_remove_recursive(link_sta->debugfs_dir); + link_sta->debugfs_dir = NULL; +} + +void ieee80211_link_sta_debugfs_drv_add(struct link_sta_info *link_sta) +{ + if (WARN_ON(!link_sta->debugfs_dir)) + return; + + drv_link_sta_add_debugfs(link_sta->sta->local, link_sta->sta->sdata, + link_sta->pub, link_sta->debugfs_dir); +} + +void ieee80211_link_sta_debugfs_drv_remove(struct link_sta_info *link_sta) +{ + if (!link_sta->debugfs_dir) + return; + + if (WARN_ON(link_sta->debugfs_dir == link_sta->sta->debugfs_dir)) + return; + + /* Recreate the directory excluding the driver data */ + debugfs_remove_recursive(link_sta->debugfs_dir); + link_sta->debugfs_dir = NULL; + + ieee80211_link_sta_debugfs_add(link_sta); +} diff --git a/net/mac80211/debugfs_sta.h b/net/mac80211/debugfs_sta.h index d2e7c27ad6d1..cde8148bdb18 100644 --- a/net/mac80211/debugfs_sta.h +++ b/net/mac80211/debugfs_sta.h @@ -7,9 +7,21 @@ #ifdef CONFIG_MAC80211_DEBUGFS void ieee80211_sta_debugfs_add(struct sta_info *sta); void ieee80211_sta_debugfs_remove(struct sta_info *sta); + +void ieee80211_link_sta_debugfs_add(struct link_sta_info *link_sta); +void ieee80211_link_sta_debugfs_remove(struct link_sta_info *link_sta); + +void ieee80211_link_sta_debugfs_drv_add(struct link_sta_info *link_sta); +void ieee80211_link_sta_debugfs_drv_remove(struct link_sta_info *link_sta); #else static inline void ieee80211_sta_debugfs_add(struct sta_info *sta) {} static inline void ieee80211_sta_debugfs_remove(struct sta_info *sta) {} + +static inline void ieee80211_link_sta_debugfs_add(struct link_sta_info *link_sta) {} +static inline void ieee80211_link_sta_debugfs_remove(struct link_sta_info *link_sta) {} + +static inline void ieee80211_link_sta_debugfs_drv_add(struct link_sta_info *link_sta) {} +static inline void ieee80211_link_sta_debugfs_drv_remove(struct link_sta_info *link_sta) {} #endif #endif /* __MAC80211_DEBUGFS_STA_H */ diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index 5392ffa18270..d737db4e07e2 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -7,6 +7,7 @@ #include "ieee80211_i.h" #include "trace.h" #include "driver-ops.h" +#include "debugfs_sta.h" int drv_start(struct ieee80211_local *local) { @@ -497,6 +498,11 @@ int drv_change_sta_links(struct ieee80211_local *local, struct ieee80211_sta *sta, u16 old_links, u16 new_links) { + struct sta_info *info = container_of(sta, struct sta_info, sta); + struct link_sta_info *link_sta; + unsigned long links_to_add; + unsigned long links_to_rem; + unsigned int link_id; int ret = -EOPNOTSUPP; might_sleep(); @@ -510,11 +516,30 @@ int drv_change_sta_links(struct ieee80211_local *local, if (old_links == new_links) return 0; + links_to_add = ~old_links & new_links; + links_to_rem = old_links & ~new_links; + + for_each_set_bit(link_id, &links_to_rem, IEEE80211_MLD_MAX_NUM_LINKS) { + link_sta = rcu_dereference_protected(info->link[link_id], + lockdep_is_held(&local->sta_mtx)); + + ieee80211_link_sta_debugfs_drv_remove(link_sta); + } + trace_drv_change_sta_links(local, sdata, sta, old_links, new_links); if (local->ops->change_sta_links) ret = local->ops->change_sta_links(&local->hw, &sdata->vif, sta, old_links, new_links); trace_drv_return_int(local, ret); - return ret; + if (ret) + return ret; + + for_each_set_bit(link_id, &links_to_add, IEEE80211_MLD_MAX_NUM_LINKS) { + link_sta = rcu_dereference_protected(info->link[link_id], + lockdep_is_held(&local->sta_mtx)); + ieee80211_link_sta_debugfs_drv_add(link_sta); + } + + return 0; } diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 81e40b0a3b16..809bad53e15b 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -480,6 +480,22 @@ static inline void drv_sta_add_debugfs(struct ieee80211_local *local, local->ops->sta_add_debugfs(&local->hw, &sdata->vif, sta, dir); } + +static inline void drv_link_sta_add_debugfs(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_link_sta *link_sta, + struct dentry *dir) +{ + might_sleep(); + + sdata = get_bss_sdata(sdata); + if (!check_sdata_in_driver(sdata)) + return; + + if (local->ops->link_sta_add_debugfs) + local->ops->link_sta_add_debugfs(&local->hw, &sdata->vif, + link_sta, dir); +} #endif static inline void drv_sta_pre_rcu_remove(struct ieee80211_local *local, diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 71b1488bd390..e6beaea4075e 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -366,6 +366,9 @@ static void sta_remove_link(struct sta_info *sta, unsigned int link_id, if (unhash) link_sta_info_hash_del(sta->local, link_sta); + if (test_sta_flag(sta, WLAN_STA_INSERTED)) + ieee80211_link_sta_debugfs_remove(link_sta); + if (link_sta != &sta->deflink) alloc = container_of(link_sta, typeof(*alloc), info); @@ -875,6 +878,26 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) ieee80211_sta_debugfs_add(sta); rate_control_add_sta_debugfs(sta); + if (sta->sta.valid_links) { + int i; + + for (i = 0; i < ARRAY_SIZE(sta->link); i++) { + struct link_sta_info *link_sta; + + link_sta = rcu_dereference_protected(sta->link[i], + lockdep_is_held(&local->sta_mtx)); + + if (!link_sta) + continue; + + ieee80211_link_sta_debugfs_add(link_sta); + if (sdata->vif.active_links & BIT(i)) + ieee80211_link_sta_debugfs_drv_add(link_sta); + } + } else { + ieee80211_link_sta_debugfs_add(&sta->deflink); + ieee80211_link_sta_debugfs_drv_add(&sta->deflink); + } sinfo->generation = local->sta_generation; cfg80211_new_sta(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL); @@ -2824,6 +2847,8 @@ int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id) sta_info_add_link(sta, link_id, &alloc->info, &alloc->sta); + ieee80211_link_sta_debugfs_add(&alloc->info); + return 0; } diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 2517ea714dc4..6e672bf9c79d 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -513,6 +513,7 @@ struct ieee80211_fragment_cache { * @status_stats.avg_ack_signal: average ACK signal * @cur_max_bandwidth: maximum bandwidth to use for TX to the station, * taken from HT/VHT capabilities or VHT operating mode notification + * @debugfs_dir: debug filesystem directory dentry * @pub: public (driver visible) link STA data * TODO Move other link params from sta_info as required for MLD operation */ @@ -560,6 +561,10 @@ struct link_sta_info { enum ieee80211_sta_rx_bandwidth cur_max_bandwidth; +#ifdef CONFIG_MAC80211_DEBUGFS + struct dentry *debugfs_dir; +#endif + struct ieee80211_link_sta *pub; }; -- cgit v1.2.3-70-g09d2 From c2b6b1c13e17cdef76e01349f4aa6f035cfad063 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Tue, 6 Sep 2022 17:24:10 +0200 Subject: wifi: mac80211: include link address in debugfs Add the link address to the per-link information, but only if we are using MLO. Signed-off-by: Benjamin Berg Signed-off-by: Johannes Berg --- net/mac80211/debugfs_sta.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'net') diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 68c07d4b95a5..0d8ff0303028 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -443,6 +443,19 @@ static const struct file_operations link_sta_ ##name## _ops = { \ .llseek = generic_file_llseek, \ } +static ssize_t link_sta_addr_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct link_sta_info *link_sta = file->private_data; + u8 mac[3 * ETH_ALEN + 1]; + + snprintf(mac, sizeof(mac), "%pM\n", link_sta->pub->addr); + + return simple_read_from_buffer(userbuf, count, ppos, mac, 3 * ETH_ALEN); +} + +LINK_STA_OPS(addr); + static ssize_t link_sta_ht_capa_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { @@ -1104,6 +1117,8 @@ void ieee80211_link_sta_debugfs_add(struct link_sta_info *link_sta) link_sta->debugfs_dir = debugfs_create_dir(link_dir_name, link_sta->sta->debugfs_dir); + + DEBUGFS_ADD(addr); } else { if (WARN_ON(link_sta != &link_sta->sta->deflink)) return; -- cgit v1.2.3-70-g09d2 From 9b41a9d7dca0159723172a47097b3f2352e37e44 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Oct 2022 13:52:26 +0200 Subject: wifi: mac80211: recalc station aggregate data during link switch During link switching, the active links change, so we need to recalculate the aggregate data in the stations. Signed-off-by: Johannes Berg --- net/mac80211/link.c | 17 +++++++++++++++++ net/mac80211/sta_info.c | 33 ++++++++++++++++++++++++--------- net/mac80211/sta_info.h | 2 ++ 3 files changed, 43 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/mac80211/link.c b/net/mac80211/link.c index e309708abae8..d1f5a9f7c647 100644 --- a/net/mac80211/link.c +++ b/net/mac80211/link.c @@ -357,6 +357,11 @@ static int _ieee80211_set_active_links(struct ieee80211_sub_if_data *sdata, list_for_each_entry(sta, &local->sta_list, list) { if (sdata != sta->sdata) continue; + + /* this is very temporary, but do it anyway */ + __ieee80211_sta_recalc_aggregates(sta, + old_active | active_links); + ret = drv_change_sta_links(local, sdata, &sta->sta, old_active, old_active | active_links); @@ -369,10 +374,22 @@ static int _ieee80211_set_active_links(struct ieee80211_sub_if_data *sdata, list_for_each_entry(sta, &local->sta_list, list) { if (sdata != sta->sdata) continue; + + __ieee80211_sta_recalc_aggregates(sta, active_links); + ret = drv_change_sta_links(local, sdata, &sta->sta, old_active | active_links, active_links); WARN_ON_ONCE(ret); + + /* + * Do it again, just in case - the driver might very + * well have called ieee80211_sta_recalc_aggregates() + * from there when filling in the new links, which + * would set it wrong since the vif's active links are + * not switched yet... + */ + __ieee80211_sta_recalc_aggregates(sta, active_links); } for_each_set_bit(link_id, &add, IEEE80211_MLD_MAX_NUM_LINKS) { diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index e6beaea4075e..2bb6a71c72ef 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2151,22 +2151,30 @@ void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid, } EXPORT_SYMBOL(ieee80211_sta_register_airtime); -void ieee80211_sta_recalc_aggregates(struct ieee80211_sta *pubsta) +void __ieee80211_sta_recalc_aggregates(struct sta_info *sta, u16 active_links) { - struct sta_info *sta = container_of(pubsta, struct sta_info, sta); - struct ieee80211_link_sta *link_sta; - int link_id, i; bool first = true; + int link_id; - if (!pubsta->valid_links || !pubsta->mlo) { - pubsta->cur = &pubsta->deflink.agg; + if (!sta->sta.valid_links || !sta->sta.mlo) { + sta->sta.cur = &sta->sta.deflink.agg; return; } rcu_read_lock(); - for_each_sta_active_link(&sta->sdata->vif, pubsta, link_sta, link_id) { + for (link_id = 0; link_id < ARRAY_SIZE((sta)->link); link_id++) { + struct ieee80211_link_sta *link_sta; + int i; + + if (!(active_links & BIT(link_id))) + continue; + + link_sta = rcu_dereference(sta->sta.link[link_id]); + if (!link_sta) + continue; + if (first) { - sta->cur = pubsta->deflink.agg; + sta->cur = sta->sta.deflink.agg; first = false; continue; } @@ -2185,7 +2193,14 @@ void ieee80211_sta_recalc_aggregates(struct ieee80211_sta *pubsta) } rcu_read_unlock(); - pubsta->cur = &sta->cur; + sta->sta.cur = &sta->cur; +} + +void ieee80211_sta_recalc_aggregates(struct ieee80211_sta *pubsta) +{ + struct sta_info *sta = container_of(pubsta, struct sta_info, sta); + + __ieee80211_sta_recalc_aggregates(sta, sta->sdata->vif.active_links); } EXPORT_SYMBOL(ieee80211_sta_recalc_aggregates); diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 6e672bf9c79d..69820b551668 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -927,6 +927,8 @@ void ieee80211_sta_set_max_amsdu_subframes(struct sta_info *sta, const u8 *ext_capab, unsigned int ext_capab_len); +void __ieee80211_sta_recalc_aggregates(struct sta_info *sta, u16 active_links); + enum sta_stats_type { STA_STATS_RATE_TYPE_INVALID = 0, STA_STATS_RATE_TYPE_LEGACY, -- cgit v1.2.3-70-g09d2 From 53ad07e9823bca10c26e71d662b58c3e80e8ff2a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 6 Sep 2022 11:27:57 +0200 Subject: wifi: cfg80211: support reporting failed links For assoc and connect result APIs, support reporting failed links; they should still come with the BSS pointer in the case of assoc, so they're released correctly. In the case of connect result, this is optional. Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 7 +++++++ net/wireless/mlme.c | 4 ++++ net/wireless/nl80211.c | 5 ++++- net/wireless/sme.c | 14 ++++++++++++++ 4 files changed, 29 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e09ff87146c1..4d35a4234417 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -6933,6 +6933,8 @@ void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr); * @ap_mld_addr: AP MLD address (in case of MLO) * @links: per-link information indexed by link ID, use links[0] for * non-MLO connections + * @links.status: Set this (along with a BSS pointer) for links that + * were rejected by the AP. */ struct cfg80211_rx_assoc_resp { const u8 *buf; @@ -6944,6 +6946,7 @@ struct cfg80211_rx_assoc_resp { struct { const u8 *addr; struct cfg80211_bss *bss; + u16 status; } links[IEEE80211_MLD_MAX_NUM_LINKS]; }; @@ -7454,6 +7457,9 @@ struct cfg80211_fils_resp_params { * if the bss is expired during the connection, esp. for those drivers * implementing connect op. Only one parameter among @bssid and @bss needs * to be specified. + * @links.status: per-link status code, to report a status code that's not + * %WLAN_STATUS_SUCCESS for a given link, it must also be in the + * @valid_links bitmap and may have a BSS pointer (which is then released) */ struct cfg80211_connect_resp_params { int status; @@ -7470,6 +7476,7 @@ struct cfg80211_connect_resp_params { const u8 *addr; const u8 *bssid; struct cfg80211_bss *bss; + u16 status; } links[IEEE80211_MLD_MAX_NUM_LINKS]; }; diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 581df7f4c524..58e1fb18f85a 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -42,6 +42,10 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, unsigned int link_id; for (link_id = 0; link_id < ARRAY_SIZE(data->links); link_id++) { + cr.links[link_id].status = data->links[link_id].status; + WARN_ON_ONCE(cr.links[link_id].status != WLAN_STATUS_SUCCESS && + (!cr.ap_mld_addr || !cr.links[link_id].bss)); + cr.links[link_id].bss = data->links[link_id].bss; if (!cr.links[link_id].bss) continue; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 8ff8b1c040f0..ad7393cd3d18 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -17745,6 +17745,7 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, link_info_size += (cr->links[link].bssid || cr->links[link].bss) ? nla_total_size(ETH_ALEN) : 0; + link_info_size += nla_total_size(sizeof(u16)); } } @@ -17813,7 +17814,9 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, nla_put(msg, NL80211_ATTR_BSSID, ETH_ALEN, bssid)) || (cr->links[link].addr && nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, - cr->links[link].addr))) + cr->links[link].addr)) || + nla_put_u16(msg, NL80211_ATTR_STATUS_CODE, + cr->links[link].status)) goto nla_put_failure; nla_nest_end(msg, nested_mlo_links); diff --git a/net/wireless/sme.c b/net/wireless/sme.c index d513536617bd..f94497e9db43 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -793,6 +793,10 @@ void __cfg80211_connect_result(struct net_device *dev, } for_each_valid_link(cr, link) { + /* don't do extra lookups for failures */ + if (cr->links[link].status != WLAN_STATUS_SUCCESS) + continue; + if (cr->links[link].bss) continue; @@ -829,6 +833,16 @@ void __cfg80211_connect_result(struct net_device *dev, } memset(wdev->links, 0, sizeof(wdev->links)); + for_each_valid_link(cr, link) { + if (cr->links[link].status == WLAN_STATUS_SUCCESS) + continue; + cr->valid_links &= ~BIT(link); + /* don't require bss pointer for failed links */ + if (!cr->links[link].bss) + continue; + cfg80211_unhold_bss(bss_from_pub(cr->links[link].bss)); + cfg80211_put_bss(wdev->wiphy, cr->links[link].bss); + } wdev->valid_links = cr->valid_links; for_each_valid_link(cr, link) wdev->links[link].client.current_bss = -- cgit v1.2.3-70-g09d2 From 45ebac4f059b92906e7e86dd1a780739f883857c Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Tue, 6 Sep 2022 11:48:56 +0300 Subject: wifi: mac80211: Parse station profile from association response When processing an association response frame for a Multi-Link connection, extract the per station profile for each additional link, and use it for parsing the link elements. As the Multi-Link element might be fragmented, add support for reassembling a fragmented element. To simplify memory management logic, extend 'struct ieee802_11_elems' to hold a scratch buffer, which is used for the defragmentation. Once an element is reconstructed in the scratch area, point the corresponding element pointer to it. Currently only defragmentation of Multi-Link element and the contained per-STA profile subelement is supported. Signed-off-by: Ilan Peer Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + net/mac80211/ieee80211_i.h | 26 +++++++- net/mac80211/mlme.c | 17 ++++- net/mac80211/util.c | 150 ++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 188 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index b935a85b2f44..f51e939f28f2 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4666,6 +4666,7 @@ static inline bool ieee80211_mle_size_ok(const u8 *data, size_t len) enum ieee80211_mle_subelems { IEEE80211_MLE_SUBELEM_PER_STA_PROFILE = 0, + IEEE80211_MLE_SUBELEM_FRAGMENT = 254, }; #define IEEE80211_MLE_STA_CONTROL_LINK_ID 0x000f diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 4e1d4c339f2d..5dcbc8de53fd 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1707,8 +1707,27 @@ struct ieee802_11_elems { u8 tx_pwr_env_num; u8 eht_cap_len; + /* mult-link element can be de-fragmented and thus u8 is not sufficient */ + size_t multi_link_len; + + /* + * store the per station profile pointer and length in case that the + * parsing also handled Multi-Link element parsing for a specific link + * ID. + */ + struct ieee80211_mle_per_sta_profile *prof; + size_t sta_prof_len; + /* whether a parse error occurred while retrieving these elements */ bool parse_error; + + /* + * scratch buffer that can be used for various element parsing related + * tasks, e.g., element de-fragmentation etc. + */ + size_t scratch_len; + u8 *scratch_pos; + u8 scratch[]; }; static inline struct ieee80211_local *hw_to_local( @@ -2197,9 +2216,13 @@ static inline void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, * represent a non-transmitting BSS in which case the data * for that non-transmitting BSS is returned * @link_id: the link ID to parse elements for, if a STA profile - * is present in the multi-link element, or -1 to ignore + * is present in the multi-link element, or -1 to ignore; + * note that the code currently assumes parsing an association + * (or re-association) response frame if this is given * @from_ap: frame is received from an AP (currently used only * for EHT capabilities parsing) + * @scratch_len: if non zero, specifies the requested length of the scratch + * buffer; otherwise, 'len' is used. */ struct ieee80211_elems_parse_params { const u8 *start; @@ -2210,6 +2233,7 @@ struct ieee80211_elems_parse_params { struct cfg80211_bss *bss; int link_id; bool from_ap; + size_t scratch_len; }; struct ieee802_11_elems * diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 54b8d5065bbd..a7e06c8ddaf3 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3923,11 +3923,12 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data; struct ieee80211_bss_conf *bss_conf = link->conf; struct ieee80211_local *local = sdata->local; + unsigned int link_id = link->link_id; struct ieee80211_elems_parse_params parse_params = { .start = elem_start, .len = elem_len, .bss = cbss, - .link_id = link == &sdata->deflink ? -1 : link->link_id, + .link_id = link_id == assoc_data->assoc_link_id ? -1 : link_id, .from_ap = true, }; bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; @@ -3942,8 +3943,18 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, if (!elems) return false; - /* FIXME: use from STA profile element after parsing that */ - capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); + if (link_id == assoc_data->assoc_link_id) { + capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); + } else if (!elems->prof) { + ret = false; + goto out; + } else { + const u8 *ptr = elems->prof->variable + + elems->prof->sta_info_len - 1; + + /* FIXME: need to also handle the status code */ + capab_info = get_unaligned_le16(ptr); + } if (!is_s1g && !elems->supp_rates) { sdata_info(sdata, "no SuppRates element in AssocResp\n"); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index bf7461c41bef..40b75fa82b15 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1026,8 +1026,10 @@ ieee80211_parse_extension_element(u32 *crc, elems->eht_operation = data; break; case WLAN_EID_EXT_EHT_MULTI_LINK: - if (ieee80211_mle_size_ok(data, len)) + if (ieee80211_mle_size_ok(data, len)) { elems->multi_link = (void *)data; + elems->multi_link_len = len; + } break; } } @@ -1497,6 +1499,145 @@ static size_t ieee802_11_find_bssid_profile(const u8 *start, size_t len, return found ? profile_len : 0; } +static void ieee80211_defragment_element(struct ieee802_11_elems *elems, + void **elem_ptr, size_t *len, + size_t total_len, u8 frag_id) +{ + u8 *data = *elem_ptr, *pos, *start; + const struct element *elem; + + /* + * Since 'data' points to the data of the element, not the element + * itself, allow 254 in case it was an extended element where the + * extended ID isn't part of the data we see here and thus not part of + * 'len' either. + */ + if (!data || (*len != 254 && *len != 255)) + return; + + start = elems->scratch_pos; + + if (WARN_ON(*len > (elems->scratch + elems->scratch_len - + elems->scratch_pos))) + return; + + memcpy(elems->scratch_pos, data, *len); + elems->scratch_pos += *len; + + pos = data + *len; + total_len -= *len; + for_each_element(elem, pos, total_len) { + if (elem->id != frag_id) + break; + + if (WARN_ON(elem->datalen > + (elems->scratch + elems->scratch_len - + elems->scratch_pos))) + return; + + memcpy(elems->scratch_pos, elem->data, elem->datalen); + elems->scratch_pos += elem->datalen; + + *len += elem->datalen; + } + + *elem_ptr = start; +} + +static void ieee80211_mle_get_sta_prof(struct ieee802_11_elems *elems, + u8 link_id) +{ + const struct ieee80211_multi_link_elem *ml = elems->multi_link; + size_t ml_len = elems->multi_link_len; + const struct element *sub; + + if (!ml || !ml_len) + return; + + if (le16_get_bits(ml->control, IEEE80211_ML_CONTROL_TYPE) != + IEEE80211_ML_CONTROL_TYPE_BASIC) + return; + + for_each_mle_subelement(sub, (u8 *)ml, ml_len) { + struct ieee80211_mle_per_sta_profile *prof = (void *)sub->data; + u16 control; + + if (sub->id != IEEE80211_MLE_SUBELEM_PER_STA_PROFILE) + continue; + + if (!ieee80211_mle_sta_prof_size_ok(sub->data, sub->datalen)) + return; + + control = le16_to_cpu(prof->control); + + if (link_id != u16_get_bits(control, + IEEE80211_MLE_STA_CONTROL_LINK_ID)) + continue; + + if (!(control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE)) + return; + + elems->prof = prof; + elems->sta_prof_len = sub->datalen; + + /* the sub element can be fragmented */ + ieee80211_defragment_element(elems, (void **)&elems->prof, + &elems->sta_prof_len, + ml_len - (sub->data - (u8 *)ml), + IEEE80211_MLE_SUBELEM_FRAGMENT); + return; + } +} + +static void ieee80211_mle_parse_link(struct ieee802_11_elems *elems, + struct ieee80211_elems_parse_params *params) +{ + struct ieee80211_mle_per_sta_profile *prof; + struct ieee80211_elems_parse_params sub = { + .action = params->action, + .from_ap = params->from_ap, + .link_id = -1, + }; + const struct element *non_inherit = NULL; + const u8 *end; + + if (params->link_id == -1) + return; + + ieee80211_defragment_element(elems, (void **)&elems->multi_link, + &elems->multi_link_len, + elems->total_len - ((u8 *)elems->multi_link - + elems->ie_start), + WLAN_EID_FRAGMENT); + + ieee80211_mle_get_sta_prof(elems, params->link_id); + prof = elems->prof; + + if (!prof) + return; + + /* check if we have the 4 bytes for the fixed part in assoc response */ + if (elems->sta_prof_len < sizeof(*prof) + prof->sta_info_len - 1 + 4) { + elems->prof = NULL; + elems->sta_prof_len = 0; + return; + } + + /* + * Skip the capability information and the status code that are expected + * as part of the station profile in association response frames. Note + * the -1 is because the 'sta_info_len' is accounted to as part of the + * per-STA profile, but not part of the 'u8 variable[]' portion. + */ + sub.start = prof->variable + prof->sta_info_len - 1 + 4; + end = (const u8 *)prof + elems->sta_prof_len; + sub.len = end - sub.start; + + non_inherit = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, + sub.start, sub.len); + _ieee802_11_parse_elems_full(&sub, elems, non_inherit); +} + struct ieee802_11_elems * ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) { @@ -1504,12 +1645,15 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) const struct element *non_inherit = NULL; u8 *nontransmitted_profile; int nontransmitted_profile_len = 0; + size_t scratch_len = params->scratch_len ?: 2 * params->len; - elems = kzalloc(sizeof(*elems), GFP_ATOMIC); + elems = kzalloc(sizeof(*elems) + scratch_len, GFP_ATOMIC); if (!elems) return NULL; elems->ie_start = params->start; elems->total_len = params->len; + elems->scratch_len = scratch_len; + elems->scratch_pos = elems->scratch; nontransmitted_profile = kmalloc(params->len, GFP_ATOMIC); if (nontransmitted_profile) { @@ -1537,6 +1681,8 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) _ieee802_11_parse_elems_full(&sub, elems, NULL); } + ieee80211_mle_parse_link(elems, params); + if (elems->tim && !elems->parse_error) { const struct ieee80211_tim_ie *tim_ie = elems->tim; -- cgit v1.2.3-70-g09d2 From c2d052a3c41aa7815714a18911819efde0d7d384 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Thu, 8 Sep 2022 11:31:10 +0300 Subject: wifi: mac80211: Process association status for affiliated links In case the AP returned a non success status for one of the links, do not activate the link. Signed-off-by: Ilan Peer Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 ++ net/mac80211/mlme.c | 40 +++++++++++++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 5dcbc8de53fd..517a50abdb09 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -412,6 +412,8 @@ struct ieee80211_mgd_assoc_data { u8 *elems; /* pointing to inside ie[] below */ ieee80211_conn_flags_t conn_flags; + + u16 status; } link[IEEE80211_MLD_MAX_NUM_LINKS]; u8 ap_addr[ETH_ALEN] __aligned(2); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index a7e06c8ddaf3..2e4bb75c68c0 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2754,7 +2754,8 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; struct ieee80211_link_data *link; - if (!cbss) + if (!cbss || + assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS) continue; link = sdata_dereference(sdata->link[link_id], sdata); @@ -2782,7 +2783,8 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link; struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; - if (!cbss) + if (!cbss || + assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS) continue; link = sdata_dereference(sdata->link[link_id], sdata); @@ -3945,6 +3947,12 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, if (link_id == assoc_data->assoc_link_id) { capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); + + /* + * we should not get to this flow unless the association was + * successful, so set the status directly to success + */ + assoc_data->link[link_id].status = WLAN_STATUS_SUCCESS; } else if (!elems->prof) { ret = false; goto out; @@ -3952,8 +3960,19 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, const u8 *ptr = elems->prof->variable + elems->prof->sta_info_len - 1; - /* FIXME: need to also handle the status code */ + /* + * During parsing, we validated that these fields exist, + * otherwise elems->prof would have been set to NULL. + */ capab_info = get_unaligned_le16(ptr); + assoc_data->link[link_id].status = get_unaligned_le16(ptr + 2); + + if (assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS) { + link_info(link, "association response status code=%u\n", + assoc_data->link[link_id].status); + ret = true; + goto out; + } } if (!is_s1g && !elems->supp_rates) { @@ -4874,6 +4893,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, unsigned int link_id; struct sta_info *sta; u64 changed[IEEE80211_MLD_MAX_NUM_LINKS] = {}; + u16 valid_links = 0; int err; mutex_lock(&sdata->local->sta_mtx); @@ -4886,8 +4906,6 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, goto out_err; if (sdata->vif.valid_links) { - u16 valid_links = 0; - for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { if (!assoc_data->link[link_id].bss) continue; @@ -4957,6 +4975,12 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, &changed[link_id])) goto out_err; + if (assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS) { + valid_links &= ~BIT(link_id); + ieee80211_sta_remove_link(sta, link_id); + continue; + } + if (link_id != assoc_data->assoc_link_id) { err = ieee80211_sta_activate_link(sta, link_id); if (err) @@ -4964,6 +4988,9 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } } + /* links might have changed due to rejected ones, set them again */ + ieee80211_vif_set_links(sdata, valid_links); + rate_control_rate_init(sta); if (ifmgd->flags & IEEE80211_STA_MFP_ENABLED) { @@ -5197,10 +5224,13 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; + if (!assoc_data->link[link_id].bss) continue; + resp.links[link_id].bss = assoc_data->link[link_id].bss; resp.links[link_id].addr = link->conf->addr; + resp.links[link_id].status = assoc_data->link[link_id].status; /* get uapsd queues configuration - same for all links */ resp.uapsd_queues = 0; -- cgit v1.2.3-70-g09d2 From 2d5e6171493695e962592657430a749f9f0d0728 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 6 Sep 2022 22:18:19 +0200 Subject: wifi: mac80211: wme: use ap_addr instead of deflink BSSID We use this to look up the destination station, so it needs to be the MLD address of the AP for an MLO; use ap_addr instead of the BSSID. Signed-off-by: Johannes Berg --- net/mac80211/wme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index ecc1de2e68a5..9fab97f6fbea 100644 --- a/net/mac80211/wme.c +++ b/net/mac80211/wme.c @@ -211,7 +211,7 @@ u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, if (sta) break; - ra = sdata->deflink.u.mgd.bssid; + ra = sdata->vif.cfg.ap_addr; break; case NL80211_IFTYPE_ADHOC: ra = skb->data; -- cgit v1.2.3-70-g09d2 From f7ee304111584b1822c960b0b5e7ebbe3d4e406b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 6 Sep 2022 22:25:50 +0200 Subject: wifi: mac80211: transmit AddBA with MLD address This management frame is intended for the MLD so we treat it in mac80211 as MLD addressed as well, and should therefore use the MLD address of the AP for the BSSID field in the frame, address translation applies. Signed-off-by: Johannes Berg --- net/mac80211/agg-tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 07c892aa8c73..9c40f8d3bce8 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -82,7 +82,7 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, sdata->vif.type == NL80211_IFTYPE_MESH_POINT) memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); else if (sdata->vif.type == NL80211_IFTYPE_STATION) - memcpy(mgmt->bssid, sdata->deflink.u.mgd.bssid, ETH_ALEN); + memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) memcpy(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN); -- cgit v1.2.3-70-g09d2 From 1e0f8cc96b7162075d2e3b6bef856497884a3ae8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 6 Sep 2022 22:37:03 +0200 Subject: wifi: nl80211: use link ID in NL80211_CMD_SET_BSS We clearly need the link ID here, to know the right BSS to configure. Use/require it. Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 ++ net/wireless/nl80211.c | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 4d35a4234417..659dd1bee70f 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2105,6 +2105,7 @@ struct mpath_info { * * Used to change BSS parameters (mainly for AP mode). * + * @link_id: link_id or -1 for non-MLD * @use_cts_prot: Whether to use CTS protection * (0 = no, 1 = yes, -1 = do not change) * @use_short_preamble: Whether the use of short preambles is allowed @@ -2122,6 +2123,7 @@ struct mpath_info { * @p2p_opp_ps: P2P opportunistic PS (-1 = no change) */ struct bss_parameters { + int link_id; int use_cts_prot; int use_short_preamble; int use_short_slot_time; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index ad7393cd3d18..1d0277758d0e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7780,6 +7780,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) int err; memset(¶ms, 0, sizeof(params)); + params.link_id = nl80211_link_id_or_invalid(info->attrs); /* default to not changing parameters */ params.use_cts_prot = -1; params.use_short_preamble = -1; @@ -16564,7 +16565,8 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_bss, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_MLO_VALID_LINK_ID), }, { .cmd = NL80211_CMD_GET_REG, -- cgit v1.2.3-70-g09d2 From 9a886df0c36928039fc760eb8129c1d7f4bc418e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 6 Sep 2022 22:42:25 +0200 Subject: wifi: mac80211: use link_id in ieee80211_change_bss() We should set the parameters here per link, except unfortunately ap_isolate, but we can't really change that anymore so it'll remain a quirk in the API in that you need to change it on one of the valid links and it'll apply to all. Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 687b4c878d4a..ef5700eac05c 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2554,47 +2554,50 @@ static int ieee80211_change_bss(struct wiphy *wiphy, struct bss_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_link_data *link; struct ieee80211_supported_band *sband; u32 changed = 0; - if (!sdata_dereference(sdata->deflink.u.ap.beacon, sdata)) + link = ieee80211_link_or_deflink(sdata, params->link_id, true); + if (IS_ERR(link)) + return PTR_ERR(link); + + if (!sdata_dereference(link->u.ap.beacon, sdata)) return -ENOENT; - sband = ieee80211_get_sband(sdata); + sband = ieee80211_get_link_sband(link); if (!sband) return -EINVAL; if (params->use_cts_prot >= 0) { - sdata->vif.bss_conf.use_cts_prot = params->use_cts_prot; + link->conf->use_cts_prot = params->use_cts_prot; changed |= BSS_CHANGED_ERP_CTS_PROT; } if (params->use_short_preamble >= 0) { - sdata->vif.bss_conf.use_short_preamble = - params->use_short_preamble; + link->conf->use_short_preamble = params->use_short_preamble; changed |= BSS_CHANGED_ERP_PREAMBLE; } - if (!sdata->vif.bss_conf.use_short_slot && + if (!link->conf->use_short_slot && (sband->band == NL80211_BAND_5GHZ || sband->band == NL80211_BAND_6GHZ)) { - sdata->vif.bss_conf.use_short_slot = true; + link->conf->use_short_slot = true; changed |= BSS_CHANGED_ERP_SLOT; } if (params->use_short_slot_time >= 0) { - sdata->vif.bss_conf.use_short_slot = - params->use_short_slot_time; + link->conf->use_short_slot = params->use_short_slot_time; changed |= BSS_CHANGED_ERP_SLOT; } if (params->basic_rates) { - ieee80211_parse_bitrates(sdata->vif.bss_conf.chandef.width, + ieee80211_parse_bitrates(link->conf->chandef.width, wiphy->bands[sband->band], params->basic_rates, params->basic_rates_len, - &sdata->vif.bss_conf.basic_rates); + &link->conf->basic_rates); changed |= BSS_CHANGED_BASIC_RATES; - ieee80211_check_rate_mask(&sdata->deflink); + ieee80211_check_rate_mask(link); } if (params->ap_isolate >= 0) { @@ -2606,30 +2609,29 @@ static int ieee80211_change_bss(struct wiphy *wiphy, } if (params->ht_opmode >= 0) { - sdata->vif.bss_conf.ht_operation_mode = - (u16) params->ht_opmode; + link->conf->ht_operation_mode = (u16)params->ht_opmode; changed |= BSS_CHANGED_HT; } if (params->p2p_ctwindow >= 0) { - sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow &= + link->conf->p2p_noa_attr.oppps_ctwindow &= ~IEEE80211_P2P_OPPPS_CTWINDOW_MASK; - sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow |= + link->conf->p2p_noa_attr.oppps_ctwindow |= params->p2p_ctwindow & IEEE80211_P2P_OPPPS_CTWINDOW_MASK; changed |= BSS_CHANGED_P2P_PS; } if (params->p2p_opp_ps > 0) { - sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow |= + link->conf->p2p_noa_attr.oppps_ctwindow |= IEEE80211_P2P_OPPPS_ENABLE_BIT; changed |= BSS_CHANGED_P2P_PS; } else if (params->p2p_opp_ps == 0) { - sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow &= + link->conf->p2p_noa_attr.oppps_ctwindow &= ~IEEE80211_P2P_OPPPS_ENABLE_BIT; changed |= BSS_CHANGED_P2P_PS; } - ieee80211_link_info_change_notify(sdata, &sdata->deflink, changed); + ieee80211_link_info_change_notify(sdata, link, changed); return 0; } -- cgit v1.2.3-70-g09d2 From 0143ea09b63d58ed3e19ffdfc444cc23d8d877b6 Mon Sep 17 00:00:00 2001 From: Haim Dreyfuss Date: Sun, 11 Sep 2022 22:01:42 +0300 Subject: wifi: mac80211: advertise TWT requester only with HW support Currently, we rely only on the AP capability. If the AP supports TWT responder we will advertise TWT requester even if the driver or HW doesn't support it. Fix this by checking the HW capability. Signed-off-by: Haim Dreyfuss Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 2e4bb75c68c0..b6f378e7edea 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3870,9 +3870,15 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband, } } -static bool ieee80211_twt_req_supported(const struct link_sta_info *link_sta, +static bool ieee80211_twt_req_supported(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + const struct link_sta_info *link_sta, const struct ieee802_11_elems *elems) { + const struct ieee80211_sta_he_cap *own_he_cap = + ieee80211_get_he_iftype_cap(sband, + ieee80211_vif_type_p2p(&sdata->vif)); + if (elems->ext_capab_len < 10) return false; @@ -3880,14 +3886,19 @@ static bool ieee80211_twt_req_supported(const struct link_sta_info *link_sta, return false; return link_sta->pub->he_cap.he_cap_elem.mac_cap_info[0] & - IEEE80211_HE_MAC_CAP0_TWT_RES; + IEEE80211_HE_MAC_CAP0_TWT_RES && + own_he_cap && + (own_he_cap->he_cap_elem.mac_cap_info[0] & + IEEE80211_HE_MAC_CAP0_TWT_REQ); } -static int ieee80211_recalc_twt_req(struct ieee80211_link_data *link, +static int ieee80211_recalc_twt_req(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + struct ieee80211_link_data *link, struct link_sta_info *link_sta, struct ieee802_11_elems *elems) { - bool twt = ieee80211_twt_req_supported(link_sta, elems); + bool twt = ieee80211_twt_req_supported(sdata, sband, link_sta, elems); if (link->conf->twt_requester != twt) { link->conf->twt_requester = twt; @@ -4129,7 +4140,8 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, else bss_conf->twt_protected = false; - *changed |= ieee80211_recalc_twt_req(link, link_sta, elems); + *changed |= ieee80211_recalc_twt_req(sdata, sband, link, + link_sta, elems); if (elems->eht_operation && elems->eht_cap && !(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_EHT)) { @@ -5472,6 +5484,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, struct ieee802_11_elems *elems; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; + struct ieee80211_supported_band *sband; struct ieee80211_channel *chan; struct link_sta_info *link_sta; struct sta_info *sta; @@ -5734,7 +5747,12 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, goto free; } - changed |= ieee80211_recalc_twt_req(link, link_sta, elems); + if (WARN_ON(!link->conf->chandef.chan)) + goto free; + + sband = local->hw.wiphy->bands[link->conf->chandef.chan->band]; + + changed |= ieee80211_recalc_twt_req(sdata, sband, link, link_sta, elems); if (ieee80211_config_bw(link, elems->ht_cap_elem, elems->vht_cap_elem, elems->ht_operation, -- cgit v1.2.3-70-g09d2 From 9beed8de806644b59729d862e66e8a4eef894fa8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 26 Sep 2022 15:45:04 +0200 Subject: wifi: mac80211: set internal scan request BSSID If any driver relies entirely on the scan request BSSID, then that would be wrong for internal scans. Initialize it to the broadcast address since we don't otherwise use the field. Signed-off-by: Johannes Berg --- net/mac80211/main.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 46f3eddc2388..25e72812000e 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1155,6 +1155,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) if (!local->int_scan_req) return -ENOMEM; + eth_broadcast_addr(local->int_scan_req->bssid); + for (band = 0; band < NUM_NL80211_BANDS; band++) { if (!local->hw.wiphy->bands[band]) continue; -- cgit v1.2.3-70-g09d2 From 3903963ed93dda7923144fd8b25fdfe0c3fd7437 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 16 Sep 2022 19:15:52 +0200 Subject: wifi: mac80211: fix AddBA response addressing Since this frame is addressed from/to an MLD, it should be built with the correct AP MLD address (in station mode) to be encrypted properly. Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 9414d3bbd65f..eaef88a603e5 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -242,7 +242,7 @@ static void ieee80211_send_addba_resp(struct sta_info *sta, u8 *da, u16 tid, sdata->vif.type == NL80211_IFTYPE_MESH_POINT) memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); else if (sdata->vif.type == NL80211_IFTYPE_STATION) - memcpy(mgmt->bssid, sdata->deflink.u.mgd.bssid, ETH_ALEN); + memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) memcpy(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN); -- cgit v1.2.3-70-g09d2 From 78a6a43aaf87180ec7425a2a90468e1b4d09a1ec Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 21 Sep 2022 02:44:58 +0200 Subject: wifi: mac80211: mlme: fix null-ptr deref on failed assoc If association to an AP without a link 0 fails, then we crash in tracing because it assumes that either ap_mld_addr or link 0 BSS is valid, since we clear sdata->vif.valid_links and then don't add the ap_mld_addr to the struct. Since we clear also sdata->vif.cfg.ap_addr, keep a local copy of it and assign it earlier, before clearing valid_links, to fix this. Fixes: 81151ce462e5 ("wifi: mac80211: support MLO authentication/association with one link") Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index b6f378e7edea..1ad0bf3bfcae 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -5082,6 +5082,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, struct cfg80211_rx_assoc_resp resp = { .uapsd_queues = -1, }; + u8 ap_mld_addr[ETH_ALEN] __aligned(2); unsigned int link_id; sdata_assert_lock(sdata); @@ -5251,6 +5252,11 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, resp.uapsd_queues |= ieee80211_ac_to_qos_mask[ac]; } + if (sdata->vif.valid_links) { + ether_addr_copy(ap_mld_addr, sdata->vif.cfg.ap_addr); + resp.ap_mld_addr = ap_mld_addr; + } + ieee80211_destroy_assoc_data(sdata, status_code == WLAN_STATUS_SUCCESS ? ASSOC_SUCCESS : @@ -5260,8 +5266,6 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, resp.len = len; resp.req_ies = ifmgd->assoc_req_ies; resp.req_ies_len = ifmgd->assoc_req_ies_len; - if (sdata->vif.valid_links) - resp.ap_mld_addr = sdata->vif.cfg.ap_addr; cfg80211_rx_assoc_resp(sdata->dev, &resp); notify_driver: drv_mgd_complete_tx(sdata->local, sdata, &info); -- cgit v1.2.3-70-g09d2 From 85176a3fcd9748558cff72d4cdff5465b8732282 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 21 Sep 2022 23:01:46 +0200 Subject: wifi: mac80211: check link ID in auth/assoc continuation Ensure that the link ID matches in auth/assoc continuation, otherwise we need to reset all the data. Fixes: 81151ce462e5 ("wifi: mac80211: support MLO authentication/association with one link") Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 1 + net/mac80211/mlme.c | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 517a50abdb09..b704656027db 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -390,6 +390,7 @@ struct ieee80211_mgd_auth_data { bool done, waiting; bool peer_confirmed; bool timeout_started; + int link_id; u8 ap_addr[ETH_ALEN] __aligned(2); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 1ad0bf3bfcae..c46f355265ee 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -6702,6 +6702,7 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, req->ap_mld_addr ?: req->bss->bssid, ETH_ALEN); auth_data->bss = req->bss; + auth_data->link_id = req->link_id; if (req->auth_data_len >= 4) { if (req->auth_type == NL80211_AUTHTYPE_SAE) { @@ -6720,7 +6721,8 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, * removal and re-addition of the STA entry in * ieee80211_prep_connection(). */ - cont_auth = ifmgd->auth_data && req->bss == ifmgd->auth_data->bss; + cont_auth = ifmgd->auth_data && req->bss == ifmgd->auth_data->bss && + ifmgd->auth_data->link_id == req->link_id; if (req->ie && req->ie_len) { memcpy(&auth_data->data[auth_data->data_len], @@ -7044,7 +7046,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, /* keep sta info, bssid if matching */ match = ether_addr_equal(ifmgd->auth_data->ap_addr, - assoc_data->ap_addr); + assoc_data->ap_addr) && + ifmgd->auth_data->link_id == req->link_id; ieee80211_destroy_auth_data(sdata, match); } -- cgit v1.2.3-70-g09d2 From 7a693ce0033707a49e951769cc230ef870ff76a8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 20 Sep 2022 22:43:53 +0200 Subject: wifi: mac80211: mlme: mark assoc link in output It's useful to know which link was used for the association, mark it when printing the links. Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index c46f355265ee..50a376f86ec2 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -4946,9 +4946,11 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, if (sdata->vif.valid_links) link_info(link, - "local address %pM, AP link address %pM\n", + "local address %pM, AP link address %pM%s\n", link->conf->addr, - assoc_data->link[link_id].bss->bssid); + assoc_data->link[link_id].bss->bssid, + link_id == assoc_data->assoc_link_id ? + " (assoc)" : ""); link_sta = rcu_dereference_protected(sta->link[link_id], lockdep_is_held(&local->sta_mtx)); -- cgit v1.2.3-70-g09d2 From e406121e1860e7789ebfa0e1b6853e48dde8b879 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 19 Sep 2022 19:19:30 +0200 Subject: wifi: mac80211: change AddBA deny error message If the station has no HT, we deny the aggregation session but the error message talks about QoS; change it to say HT instead. Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index eaef88a603e5..f8bec5ee5b44 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -299,7 +299,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, if (!sta->sta.deflink.ht_cap.ht_supported && sta->sdata->vif.bss_conf.chandef.chan->band != NL80211_BAND_6GHZ) { ht_dbg(sta->sdata, - "STA %pM erroneously requests BA session on tid %d w/o QoS\n", + "STA %pM erroneously requests BA session on tid %d w/o HT\n", sta->sta.addr, tid); /* send a response anyway, it's an error case if we get here */ goto end; -- cgit v1.2.3-70-g09d2 From cb04b5ef855c210a6e8e3e64fae9e2da9100f72e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 15 Sep 2022 14:57:47 +0200 Subject: wifi: mac80211: don't clear DTIM period after setting it Fix the code that sets the DTIM period to always propagate it into link->conf->dtim_period and not overwrite it, while still preferring to set it from the beacon data if available. Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 50a376f86ec2..a651d3ffd8e6 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2717,18 +2717,10 @@ static u32 ieee80211_link_set_associated(struct ieee80211_link_data *link, } if (link->u.mgd.have_beacon) { - /* - * If the AP is buggy we may get here with no DTIM period - * known, so assume it's 1 which is the only safe assumption - * in that case, although if the TIM IE is broken powersave - * probably just won't work at all. - */ - bss_conf->dtim_period = link->u.mgd.dtim_period ?: 1; bss_conf->beacon_rate = bss->beacon_rate; changed |= BSS_CHANGED_BEACON_INFO; } else { bss_conf->beacon_rate = NULL; - bss_conf->dtim_period = 0; } /* Tell the driver to monitor connection quality (if supported) */ @@ -4934,10 +4926,11 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { + struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; struct ieee80211_link_data *link; struct link_sta_info *link_sta; - if (!assoc_data->link[link_id].bss) + if (!cbss) continue; link = sdata_dereference(sdata->link[link_id], sdata); @@ -4957,19 +4950,25 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, if (WARN_ON(!link_sta)) goto out_err; - if (link_id != assoc_data->assoc_link_id) { - struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; + if (!link->u.mgd.have_beacon) { const struct cfg80211_bss_ies *ies; rcu_read_lock(); - ies = rcu_dereference(cbss->ies); + ies = rcu_dereference(cbss->beacon_ies); + if (ies) + link->u.mgd.have_beacon = true; + else + ies = rcu_dereference(cbss->ies); ieee80211_get_dtim(ies, &link->conf->sync_dtim_count, &link->u.mgd.dtim_period); - link->conf->dtim_period = link->u.mgd.dtim_period ?: 1; link->conf->beacon_int = cbss->beacon_interval; rcu_read_unlock(); + } + link->conf->dtim_period = link->u.mgd.dtim_period ?: 1; + + if (link_id != assoc_data->assoc_link_id) { err = ieee80211_prep_channel(sdata, link, cbss, &link->u.mgd.conn_flags); if (err) { -- cgit v1.2.3-70-g09d2 From 69e0d04e2b0b3a6ffdd2794eb65b3daf6ae5cbf7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 30 Sep 2022 12:54:30 +0200 Subject: wifi: mac80211: prohibit IEEE80211_HT_CAP_DELAY_BA with MLO This won't work right at least with the code as it is, so at least for now just assume it's never set for MLO. It may very well never change, almost no drivers support it. Signed-off-by: Johannes Berg --- net/mac80211/main.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 25e72812000e..425793dd7c9c 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1087,6 +1087,16 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) channels += sband->n_channels; + /* + * Due to the way the aggregation code handles this and it + * being an HT capability, we can't really support delayed + * BA in MLO (yet). + */ + if (WARN_ON(sband->ht_cap.ht_supported && + (sband->ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA) && + hw->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO)) + return -EINVAL; + if (max_bitrates < sband->n_bitrates) max_bitrates = sband->n_bitrates; supp_ht = supp_ht || sband->ht_cap.ht_supported; -- cgit v1.2.3-70-g09d2 From 2e82be13c6a33cb8befd557f168c92fb363107df Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 19 Sep 2022 19:01:19 +0200 Subject: wifi: mac80211: agg-rx: avoid band check If the deflink of the station is on 6 GHz, then it won't have HT. If at the same time we're using MLO, then vif.bss_conf isn't used, and thus vif.bss_conf.chandef.chan is NULL, causing the code to crash. Fix this by just checking for both HT and HE, and refusing the aggregation session if both are not present. This might be a bit wrong since it would accept an aggregation session from a peer that has HE but no HT on 2.4 or 5 GHz, but such a peer shouldn't exist in the first place, and it probably supports aggregation if it has HE support. Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index f8bec5ee5b44..bd41022f4894 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -297,7 +297,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, } if (!sta->sta.deflink.ht_cap.ht_supported && - sta->sdata->vif.bss_conf.chandef.chan->band != NL80211_BAND_6GHZ) { + !sta->sta.deflink.he_cap.has_he) { ht_dbg(sta->sdata, "STA %pM erroneously requests BA session on tid %d w/o HT\n", sta->sta.addr, tid); -- cgit v1.2.3-70-g09d2 From e8d0b807b4a223ae499d43e4baa6c45f946f42d5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 30 Sep 2022 13:24:11 +0200 Subject: wifi: mac80211: remove support for AddBA with fragmentation HE added support for dynamic fragmentation inside aggregation sessions, but no existing driver ever advertises it. Thus, remove the code for now, it cannot work as-is in MLO. For it to properly work in MLO, we'd need to validate that the frag level is identical across all the link bands/iftypes, which is a good amount of complex code that's just not worth it as long as no driver has support for it. Signed-off-by: Johannes Berg --- net/mac80211/agg-rx.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index bd41022f4894..c6fa53230450 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -183,34 +183,15 @@ static void ieee80211_add_addbaext(struct ieee80211_sub_if_data *sdata, const struct ieee80211_addba_ext_ie *req, u16 buf_size) { - struct ieee80211_supported_band *sband; struct ieee80211_addba_ext_ie *resp; - const struct ieee80211_sta_he_cap *he_cap; - u8 frag_level, cap_frag_level; u8 *pos; - sband = ieee80211_get_sband(sdata); - if (!sband) - return; - he_cap = ieee80211_get_he_iftype_cap(sband, - ieee80211_vif_type_p2p(&sdata->vif)); - if (!he_cap) - return; - pos = skb_put_zero(skb, 2 + sizeof(struct ieee80211_addba_ext_ie)); *pos++ = WLAN_EID_ADDBA_EXT; *pos++ = sizeof(struct ieee80211_addba_ext_ie); resp = (struct ieee80211_addba_ext_ie *)pos; resp->data = req->data & IEEE80211_ADDBA_EXT_NO_FRAG; - frag_level = u32_get_bits(req->data, - IEEE80211_ADDBA_EXT_FRAG_LEVEL_MASK); - cap_frag_level = u32_get_bits(he_cap->he_cap_elem.mac_cap_info[0], - IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_MASK); - if (frag_level > cap_frag_level) - frag_level = cap_frag_level; - resp->data |= u8_encode_bits(frag_level, - IEEE80211_ADDBA_EXT_FRAG_LEVEL_MASK); resp->data |= u8_encode_bits(buf_size >> IEEE80211_ADDBA_EXT_BUF_SIZE_SHIFT, IEEE80211_ADDBA_EXT_BUF_SIZE_MASK); } -- cgit v1.2.3-70-g09d2 From 9d13aff91ecd3f077b432df35291c945bde585be Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Oct 2022 14:08:41 +0200 Subject: wifi: mac80211: fix ifdef symbol name This should of course be CONFIG_, not CPTCFG_, which is an artifact from working with backports. Fixes: 9dd1953846c7 ("wifi: nl80211/mac80211: clarify link ID in control port TX") Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 27c964be102e..d9e7a2ed5d2c 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2971,7 +2971,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, if (pre_conf_link_id != link_id && link_id != IEEE80211_LINK_UNSPECIFIED) { -#ifdef CPTCFG_MAC80211_VERBOSE_DEBUG +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG net_info_ratelimited("%s: dropped frame to %pM with bad link ID request (%d vs. %d)\n", sdata->name, hdr.addr1, pre_conf_link_id, link_id); -- cgit v1.2.3-70-g09d2 From 4857ed9385fbd25060051cc42a93c3074de2f4dd Mon Sep 17 00:00:00 2001 From: Peter Seiderer Date: Thu, 15 Sep 2022 21:52:43 +0200 Subject: wifi: mac80211: minstrel_ht: remove unused has_mrr member from struct minstrel_priv Remove unused has_mrr (has multi-rate retry capabilities) member from struct minstrel_priv (only set once in minstrel_ht_alloc, never used again). Signed-off-by: Peter Seiderer Signed-off-by: Johannes Berg --- net/mac80211/rc80211_minstrel_ht.c | 3 --- net/mac80211/rc80211_minstrel_ht.h | 1 - 2 files changed, 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 7f3f5f51081d..f936d6b3f093 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -1963,9 +1963,6 @@ minstrel_ht_alloc(struct ieee80211_hw *hw) /* safe default, does not necessarily have to match hw properties */ mp->max_retry = 7; - if (hw->max_rates >= 4) - mp->has_mrr = true; - mp->hw = hw; mp->update_interval = HZ / 20; diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h index 1766ff0c78d3..4be0401f7721 100644 --- a/net/mac80211/rc80211_minstrel_ht.h +++ b/net/mac80211/rc80211_minstrel_ht.h @@ -74,7 +74,6 @@ struct minstrel_priv { struct ieee80211_hw *hw; - bool has_mrr; unsigned int cw_min; unsigned int cw_max; unsigned int max_retry; -- cgit v1.2.3-70-g09d2 From 0ff57171d6d225558c81a69439d5323e35b40549 Mon Sep 17 00:00:00 2001 From: Vinayak Yadawad Date: Wed, 7 Sep 2022 18:14:48 +0530 Subject: cfg80211: Update Transition Disable policy during port authorization In case of 4way handshake offload, transition disable policy updated by the AP during EAPOL 3/4 is not updated to the upper layer. This results in mismatch between transition disable policy between the upper layer and the driver. This patch addresses this issue by updating transition disable policy as part of port authorization indication. Signed-off-by: Vinayak Yadawad Signed-off-by: Johannes Berg --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c | 2 +- include/net/cfg80211.h | 4 +++- include/uapi/linux/nl80211.h | 3 +++ net/wireless/core.h | 5 ++++- net/wireless/nl80211.c | 8 +++++++- net/wireless/nl80211.h | 3 ++- net/wireless/sme.c | 12 ++++++++---- net/wireless/util.c | 4 +++- 8 files changed, 31 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index bf184c0e64cb..3f2336062217 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -6268,7 +6268,7 @@ done: brcmf_dbg(CONN, "Report roaming result\n"); if (profile->use_fwsup == BRCMF_PROFILE_FWSUP_1X && profile->is_ft) { - cfg80211_port_authorized(ndev, profile->bssid, GFP_KERNEL); + cfg80211_port_authorized(ndev, profile->bssid, NULL, 0, GFP_KERNEL); brcmf_dbg(CONN, "Report port authorized\n"); } diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 659dd1bee70f..11a370e64143 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -7683,6 +7683,8 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, * * @dev: network device * @bssid: the BSSID of the AP + * @td_bitmap: transition disable policy + * @td_bitmap_len: Length of transition disable policy * @gfp: allocation flags * * This function should be called by a driver that supports 4 way handshake @@ -7693,7 +7695,7 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, * indicate the 802.11 association. */ void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid, - gfp_t gfp); + const u8* td_bitmap, u8 td_bitmap_len, gfp_t gfp); /** * cfg80211_disconnected - notify cfg80211 that connection was dropped diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c32e7616a366..c14a91bbca7c 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2749,6 +2749,8 @@ enum nl80211_commands { * When used with %NL80211_CMD_FRAME_TX_STATUS, indicates the ack RX * timestamp. When used with %NL80211_CMD_FRAME RX notification, indicates * the incoming frame RX timestamp. + * @NL80211_ATTR_TD_BITMAP: Transition Disable bitmap, for subsequent + * (re)associations. * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3276,6 +3278,7 @@ enum nl80211_attrs { NL80211_ATTR_TX_HW_TIMESTAMP, NL80211_ATTR_RX_HW_TIMESTAMP, + NL80211_ATTR_TD_BITMAP, /* add attributes here, update the policy in nl80211.c */ diff --git a/net/wireless/core.h b/net/wireless/core.h index 775e16cb99ed..af85d8909935 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -271,6 +271,8 @@ struct cfg80211_event { } ij; struct { u8 bssid[ETH_ALEN]; + const u8 *td_bitmap; + u8 td_bitmap_len; } pa; }; }; @@ -409,7 +411,8 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev, bool wextev); void __cfg80211_roamed(struct wireless_dev *wdev, struct cfg80211_roam_info *info); -void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid); +void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid, + const u8 *td_bitmap, u8 td_bitmap_len); int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); void cfg80211_autodisconnect_wk(struct work_struct *work); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 1d0277758d0e..fe368af39554 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -17942,7 +17942,8 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, } void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, - struct net_device *netdev, const u8 *bssid) + struct net_device *netdev, const u8 *bssid, + const u8 *td_bitmap, u8 td_bitmap_len) { struct sk_buff *msg; void *hdr; @@ -17962,6 +17963,11 @@ void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid)) goto nla_put_failure; + if ((td_bitmap_len > 0) && td_bitmap) + if (nla_put(msg, NL80211_ATTR_TD_BITMAP, + td_bitmap_len, td_bitmap)) + goto nla_put_failure; + genlmsg_end(msg, hdr); genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 855d540ddfb9..ba9457e94c43 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -83,7 +83,8 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, struct net_device *netdev, struct cfg80211_roam_info *info, gfp_t gfp); void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, - struct net_device *netdev, const u8 *bssid); + struct net_device *netdev, const u8 *bssid, + const u8 *td_bitmap, u8 td_bitmap_len); void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, struct net_device *netdev, u16 reason, const u8 *ie, size_t ie_len, bool from_ap); diff --git a/net/wireless/sme.c b/net/wireless/sme.c index f94497e9db43..4b5b6ee0fe01 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -1251,7 +1251,8 @@ out: } EXPORT_SYMBOL(cfg80211_roamed); -void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid) +void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid, + const u8 *td_bitmap, u8 td_bitmap_len) { ASSERT_WDEV_LOCK(wdev); @@ -1264,11 +1265,11 @@ void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid) return; nl80211_send_port_authorized(wiphy_to_rdev(wdev->wiphy), wdev->netdev, - bssid); + bssid, td_bitmap, td_bitmap_len); } void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid, - gfp_t gfp) + const u8 *td_bitmap, u8 td_bitmap_len, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); @@ -1278,12 +1279,15 @@ void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid, if (WARN_ON(!bssid)) return; - ev = kzalloc(sizeof(*ev), gfp); + ev = kzalloc(sizeof(*ev) + td_bitmap_len, gfp); if (!ev) return; ev->type = EVENT_PORT_AUTHORIZED; memcpy(ev->pa.bssid, bssid, ETH_ALEN); + ev->pa.td_bitmap = ((u8 *)ev) + sizeof(*ev); + ev->pa.td_bitmap_len = td_bitmap_len; + memcpy((void *)ev->pa.td_bitmap, td_bitmap, td_bitmap_len); /* * Use the wdev event list so that if there are pending diff --git a/net/wireless/util.c b/net/wireless/util.c index 01493568a21d..f09d528e5199 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -988,7 +988,9 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev) __cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev); break; case EVENT_PORT_AUTHORIZED: - __cfg80211_port_authorized(wdev, ev->pa.bssid); + __cfg80211_port_authorized(wdev, ev->pa.bssid, + ev->pa.td_bitmap, + ev->pa.td_bitmap_len); break; } wdev_unlock(wdev); -- cgit v1.2.3-70-g09d2 From c850e31f79f049af5022f07cd9961605b4470d0b Mon Sep 17 00:00:00 2001 From: Alexander Wetzel Date: Sun, 9 Oct 2022 18:30:38 +0200 Subject: wifi: mac80211: add internal handler for wake_tx_queue Start to align the TX handling to only use internal TX queues (iTXQs): Provide a handler for drivers not having a custom wake_tx_queue callback and update the documentation. Signed-off-by: Alexander Wetzel Signed-off-by: Johannes Berg --- include/net/mac80211.h | 51 +++++++++++++++++++++++++++++++------------------- net/mac80211/util.c | 46 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index cda4584dfd51..721c450a9ccd 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -89,15 +89,13 @@ /** * DOC: mac80211 software tx queueing * - * mac80211 provides an optional intermediate queueing implementation designed - * to allow the driver to keep hardware queues short and provide some fairness - * between different stations/interfaces. - * In this model, the driver pulls data frames from the mac80211 queue instead - * of letting mac80211 push them via drv_tx(). - * Other frames (e.g. control or management) are still pushed using drv_tx(). + * mac80211 uses an intermediate queueing implementation, designed to allow the + * driver to keep hardware queues short and to provide some fairness between + * different stations/interfaces. * - * Drivers indicate that they use this model by implementing the .wake_tx_queue - * driver operation. + * Drivers must provide the .wake_tx_queue driver operation by either + * linking it to ieee80211_handle_wake_tx_queue() or implementing a custom + * handler. * * Intermediate queues (struct ieee80211_txq) are kept per-sta per-tid, with * another per-sta for non-data/non-mgmt and bufferable management frames, and @@ -106,9 +104,12 @@ * The driver is expected to initialize its private per-queue data for stations * and interfaces in the .add_interface and .sta_add ops. * - * The driver can't access the queue directly. To dequeue a frame from a - * txq, it calls ieee80211_tx_dequeue(). Whenever mac80211 adds a new frame to a - * queue, it calls the .wake_tx_queue driver op. + * The driver can't access the internal TX queues (iTXQs) directly. + * Whenever mac80211 adds a new frame to a queue, it calls the .wake_tx_queue + * driver op. + * Drivers implementing a custom .wake_tx_queue op can get them by calling + * ieee80211_tx_dequeue(). Drivers using ieee80211_handle_wake_tx_queue() will + * simply get the individual frames pushed via the .tx driver operation. * * Drivers can optionally delegate responsibility for scheduling queues to * mac80211, to take advantage of airtime fairness accounting. In this case, to @@ -1826,7 +1827,7 @@ struct ieee80211_vif_cfg { * for this interface. * @drv_priv: data area for driver use, will always be aligned to * sizeof(void \*). - * @txq: the multicast data TX queue (if driver uses the TXQ abstraction) + * @txq: the multicast data TX queue * @txqs_stopped: per AC flag to indicate that intermediate TXQs are stopped, * protected by fq->lock. * @offload_flags: 802.3 -> 802.11 enapsulation offload flags, see @@ -2259,8 +2260,8 @@ struct ieee80211_link_sta { * For non MLO STA it will point to the deflink data. For MLO STA * ieee80211_sta_recalc_aggregates() must be called to update it. * @support_p2p_ps: indicates whether the STA supports P2P PS mechanism or not. - * @txq: per-TID data TX queues (if driver uses the TXQ abstraction); note that - * the last entry (%IEEE80211_NUM_TIDS) is used for non-data frames + * @txq: per-TID data TX queues; note that the last entry (%IEEE80211_NUM_TIDS) + * is used for non-data frames * @deflink: This holds the default link STA information, for non MLO STA all link * specific STA information is accessed through @deflink or through * link[0] which points to address of @deflink. For MLO Link STA @@ -5713,7 +5714,7 @@ void ieee80211_key_replay(struct ieee80211_key_conf *keyconf); * @hw: pointer as obtained from ieee80211_alloc_hw(). * @queue: queue number (counted from zero). * - * Drivers should use this function instead of netif_wake_queue. + * Drivers must use this function instead of netif_wake_queue. */ void ieee80211_wake_queue(struct ieee80211_hw *hw, int queue); @@ -5722,7 +5723,7 @@ void ieee80211_wake_queue(struct ieee80211_hw *hw, int queue); * @hw: pointer as obtained from ieee80211_alloc_hw(). * @queue: queue number (counted from zero). * - * Drivers should use this function instead of netif_stop_queue. + * Drivers must use this function instead of netif_stop_queue. */ void ieee80211_stop_queue(struct ieee80211_hw *hw, int queue); @@ -5731,7 +5732,7 @@ void ieee80211_stop_queue(struct ieee80211_hw *hw, int queue); * @hw: pointer as obtained from ieee80211_alloc_hw(). * @queue: queue number (counted from zero). * - * Drivers should use this function instead of netif_stop_queue. + * Drivers must use this function instead of netif_queue_stopped. * * Return: %true if the queue is stopped. %false otherwise. */ @@ -5742,7 +5743,7 @@ int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue); * ieee80211_stop_queues - stop all queues * @hw: pointer as obtained from ieee80211_alloc_hw(). * - * Drivers should use this function instead of netif_stop_queue. + * Drivers must use this function instead of netif_tx_stop_all_queues. */ void ieee80211_stop_queues(struct ieee80211_hw *hw); @@ -5750,7 +5751,7 @@ void ieee80211_stop_queues(struct ieee80211_hw *hw); * ieee80211_wake_queues - wake all queues * @hw: pointer as obtained from ieee80211_alloc_hw(). * - * Drivers should use this function instead of netif_wake_queue. + * Drivers must use this function instead of netif_tx_wake_all_queues. */ void ieee80211_wake_queues(struct ieee80211_hw *hw); @@ -6971,6 +6972,18 @@ static inline struct sk_buff *ieee80211_tx_dequeue_ni(struct ieee80211_hw *hw, return skb; } +/** + * ieee80211_handle_wake_tx_queue - mac80211 handler for wake_tx_queue callback + * + * @hw: pointer as obtained from wake_tx_queue() callback(). + * @txq: pointer as obtained from wake_tx_queue() callback(). + * + * Drivers can use this function for the mandatory mac80211 wake_tx_queue + * callback in struct ieee80211_ops. They should not call this function. + */ +void ieee80211_handle_wake_tx_queue(struct ieee80211_hw *hw, + struct ieee80211_txq *txq); + /** * ieee80211_next_txq - get next tx queue to pull packets from * diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 40b75fa82b15..a4bf86f17c39 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -288,6 +288,52 @@ __le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw, } EXPORT_SYMBOL(ieee80211_ctstoself_duration); +static void wake_tx_push_queue(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_txq *queue) +{ + int q = sdata->vif.hw_queue[queue->ac]; + struct ieee80211_tx_control control = { + .sta = queue->sta, + }; + struct sk_buff *skb; + unsigned long flags; + bool q_stopped; + + while (1) { + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + q_stopped = local->queue_stop_reasons[q]; + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); + + if (q_stopped) + break; + + skb = ieee80211_tx_dequeue(&local->hw, queue); + if (!skb) + break; + + drv_tx(local, &control, skb); + } +} + +/* wake_tx_queue handler for driver not implementing a custom one*/ +void ieee80211_handle_wake_tx_queue(struct ieee80211_hw *hw, + struct ieee80211_txq *txq) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->vif); + struct ieee80211_txq *queue; + + /* Use ieee80211_next_txq() for airtime fairness accounting */ + ieee80211_txq_schedule_start(hw, txq->ac); + while ((queue = ieee80211_next_txq(hw, txq->ac))) { + wake_tx_push_queue(local, sdata, queue); + ieee80211_return_txq(hw, queue, false); + } + ieee80211_txq_schedule_end(hw, txq->ac); +} +EXPORT_SYMBOL(ieee80211_handle_wake_tx_queue); + static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac) { struct ieee80211_local *local = sdata->local; -- cgit v1.2.3-70-g09d2 From 107395f9cf4419fa29b553a00872a27e579ae304 Mon Sep 17 00:00:00 2001 From: Alexander Wetzel Date: Sun, 9 Oct 2022 18:30:40 +0200 Subject: wifi: mac80211: Drop support for TX push path All drivers are now using mac80211 internal queues (iTXQs). Drop mac80211 internal support for the old push path. Signed-off-by: Alexander Wetzel Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 3 -- net/mac80211/debugfs.c | 4 +-- net/mac80211/debugfs_netdev.c | 3 +- net/mac80211/debugfs_sta.c | 6 ++-- net/mac80211/ieee80211_i.h | 1 - net/mac80211/iface.c | 69 ++----------------------------------------- net/mac80211/main.c | 11 ++----- net/mac80211/rx.c | 3 -- net/mac80211/sta_info.c | 51 +++++++++++++------------------- net/mac80211/tdls.c | 1 - net/mac80211/tx.c | 28 +++--------------- net/mac80211/util.c | 53 ++++----------------------------- net/mac80211/wme.c | 63 +++++---------------------------------- net/mac80211/wme.h | 4 +-- 14 files changed, 48 insertions(+), 252 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index ef5700eac05c..c848fe04dd44 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -4340,9 +4340,6 @@ static int ieee80211_get_txq_stats(struct wiphy *wiphy, struct ieee80211_sub_if_data *sdata; int ret = 0; - if (!local->ops->wake_tx_queue) - return 1; - spin_lock_bh(&local->fq.lock); rcu_read_lock(); diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 78c7d60e8667..dfb9f55e2685 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -663,9 +663,7 @@ void debugfs_hw_add(struct ieee80211_local *local) DEBUGFS_ADD_MODE(force_tx_status, 0600); DEBUGFS_ADD_MODE(aql_enable, 0600); DEBUGFS_ADD(aql_pending); - - if (local->ops->wake_tx_queue) - DEBUGFS_ADD_MODE(aqm, 0600); + DEBUGFS_ADD_MODE(aqm, 0600); DEBUGFS_ADD_MODE(airtime_flags, 0600); diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 5b014786fd2d..c87e1137e5da 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -677,8 +677,7 @@ static void add_common_files(struct ieee80211_sub_if_data *sdata) DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_5ghz); DEBUGFS_ADD(hw_queues); - if (sdata->local->ops->wake_tx_queue && - sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && + if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && sdata->vif.type != NL80211_IFTYPE_NAN) DEBUGFS_ADD(aqm); } diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 0d8ff0303028..7a3d7893e19d 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -1072,10 +1072,8 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) /* FIXME: Kept here as the statistics are only done on the deflink */ DEBUGFS_ADD_COUNTER(tx_filtered, deflink.status_stats.filtered); - if (local->ops->wake_tx_queue) { - DEBUGFS_ADD(aqm); - DEBUGFS_ADD(airtime); - } + DEBUGFS_ADD(aqm); + DEBUGFS_ADD(airtime); if (wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index b704656027db..63ff0d2524b6 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2307,7 +2307,6 @@ void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted); -void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue); void ieee80211_add_pending_skb(struct ieee80211_local *local, struct sk_buff *skb); void ieee80211_add_pending_skbs(struct ieee80211_local *local, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index dd9ac1f7d2ea..7c4ce716c939 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -458,12 +458,6 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do if (cancel_scan) ieee80211_scan_cancel(local); - /* - * Stop TX on this interface first. - */ - if (!local->ops->wake_tx_queue && sdata->dev) - netif_tx_stop_all_queues(sdata->dev); - ieee80211_roc_purge(local, sdata); switch (sdata->vif.type) { @@ -811,13 +805,6 @@ static void ieee80211_uninit(struct net_device *dev) ieee80211_teardown_sdata(IEEE80211_DEV_TO_SUB_IF(dev)); } -static u16 ieee80211_netdev_select_queue(struct net_device *dev, - struct sk_buff *skb, - struct net_device *sb_dev) -{ - return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb); -} - static void ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { @@ -831,7 +818,6 @@ static const struct net_device_ops ieee80211_dataif_ops = { .ndo_start_xmit = ieee80211_subif_start_xmit, .ndo_set_rx_mode = ieee80211_set_multicast_list, .ndo_set_mac_address = ieee80211_change_mac, - .ndo_select_queue = ieee80211_netdev_select_queue, .ndo_get_stats64 = ieee80211_get_stats64, }; @@ -939,7 +925,6 @@ static const struct net_device_ops ieee80211_dataif_8023_ops = { .ndo_start_xmit = ieee80211_subif_start_xmit_8023, .ndo_set_rx_mode = ieee80211_set_multicast_list, .ndo_set_mac_address = ieee80211_change_mac, - .ndo_select_queue = ieee80211_netdev_select_queue, .ndo_get_stats64 = ieee80211_get_stats64, .ndo_fill_forward_path = ieee80211_netdev_fill_forward_path, }; @@ -1441,35 +1426,6 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) ieee80211_recalc_ps(local); - if (sdata->vif.type == NL80211_IFTYPE_MONITOR || - sdata->vif.type == NL80211_IFTYPE_AP_VLAN || - local->ops->wake_tx_queue) { - /* XXX: for AP_VLAN, actually track AP queues */ - if (dev) - netif_tx_start_all_queues(dev); - } else if (dev) { - unsigned long flags; - int n_acs = IEEE80211_NUM_ACS; - int ac; - - if (local->hw.queues < IEEE80211_NUM_ACS) - n_acs = 1; - - spin_lock_irqsave(&local->queue_stop_reason_lock, flags); - if (sdata->vif.cab_queue == IEEE80211_INVAL_HW_QUEUE || - (local->queue_stop_reasons[sdata->vif.cab_queue] == 0 && - skb_queue_empty(&local->pending[sdata->vif.cab_queue]))) { - for (ac = 0; ac < n_acs; ac++) { - int ac_queue = sdata->vif.hw_queue[ac]; - - if (local->queue_stop_reasons[ac_queue] == 0 && - skb_queue_empty(&local->pending[ac_queue])) - netif_start_subqueue(dev, ac); - } - } - spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); - } - set_bit(SDATA_STATE_RUNNING, &sdata->state); return 0; @@ -1499,17 +1455,12 @@ static void ieee80211_if_setup(struct net_device *dev) { ether_setup(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->priv_flags |= IFF_NO_QUEUE; dev->netdev_ops = &ieee80211_dataif_ops; dev->needs_free_netdev = true; dev->priv_destructor = ieee80211_if_free; } -static void ieee80211_if_setup_no_queue(struct net_device *dev) -{ - ieee80211_if_setup(dev); - dev->priv_flags |= IFF_NO_QUEUE; -} - static void ieee80211_iface_process_skb(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) @@ -2094,9 +2045,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, struct net_device *ndev = NULL; struct ieee80211_sub_if_data *sdata = NULL; struct txq_info *txqi; - void (*if_setup)(struct net_device *dev); int ret, i; - int txqs = 1; ASSERT_RTNL(); @@ -2119,30 +2068,18 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, sizeof(void *)); int txq_size = 0; - if (local->ops->wake_tx_queue && - type != NL80211_IFTYPE_AP_VLAN && + if (type != NL80211_IFTYPE_AP_VLAN && (type != NL80211_IFTYPE_MONITOR || (params->flags & MONITOR_FLAG_ACTIVE))) txq_size += sizeof(struct txq_info) + local->hw.txq_data_size; - if (local->ops->wake_tx_queue) { - if_setup = ieee80211_if_setup_no_queue; - } else { - if_setup = ieee80211_if_setup; - if (local->hw.queues >= IEEE80211_NUM_ACS) - txqs = IEEE80211_NUM_ACS; - } - ndev = alloc_netdev_mqs(size + txq_size, name, name_assign_type, - if_setup, txqs, 1); + ieee80211_if_setup, 1, 1); if (!ndev) return -ENOMEM; - if (!local->ops->wake_tx_queue && local->hw.wiphy->tx_queue_len) - ndev->tx_queue_len = local->hw.wiphy->tx_queue_len; - dev_net_set(ndev, wiphy_net(local->hw.wiphy)); ndev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 425793dd7c9c..b7279d88cddb 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -630,7 +630,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, if (WARN_ON(!ops->tx || !ops->start || !ops->stop || !ops->config || !ops->add_interface || !ops->remove_interface || - !ops->configure_filter)) + !ops->configure_filter || !ops->wake_tx_queue)) return NULL; if (WARN_ON(ops->sta_state && (ops->sta_add || ops->sta_remove))) @@ -719,9 +719,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, if (!ops->set_key) wiphy->flags |= WIPHY_FLAG_IBSS_RSN; - if (ops->wake_tx_queue) - wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_TXQS); - + wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_TXQS); wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_RRM); wiphy->bss_priv_size = sizeof(struct ieee80211_bss); @@ -834,10 +832,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, atomic_set(&local->agg_queue_stop[i], 0); } tasklet_setup(&local->tx_pending_tasklet, ieee80211_tx_pending); - - if (ops->wake_tx_queue) - tasklet_setup(&local->wake_txqs_tasklet, ieee80211_wake_txqs); - + tasklet_setup(&local->wake_txqs_tasklet, ieee80211_wake_txqs); tasklet_setup(&local->tasklet, ieee80211_tasklet_handler); skb_queue_head_init(&local->skb_queue); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 589521717c35..ac8691026903 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1571,9 +1571,6 @@ static void sta_ps_start(struct sta_info *sta) ieee80211_clear_fast_xmit(sta); - if (!sta->sta.txq[0]) - return; - for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) { struct ieee80211_txq *txq = sta->sta.txq[tid]; struct txq_info *txqi = to_txq_info(txq); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 2bb6a71c72ef..6e0fd82855ae 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -140,17 +140,15 @@ static void __cleanup_single_sta(struct sta_info *sta) atomic_dec(&ps->num_sta_ps); } - if (sta->sta.txq[0]) { - for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { - struct txq_info *txqi; + for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { + struct txq_info *txqi; - if (!sta->sta.txq[i]) - continue; + if (!sta->sta.txq[i]) + continue; - txqi = to_txq_info(sta->sta.txq[i]); + txqi = to_txq_info(sta->sta.txq[i]); - ieee80211_txq_purge(local, txqi); - } + ieee80211_txq_purge(local, txqi); } for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { @@ -428,8 +426,7 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta) sta_dbg(sta->sdata, "Destroyed STA %pM\n", sta->sta.addr); - if (sta->sta.txq[0]) - kfree(to_txq_info(sta->sta.txq[0])); + kfree(to_txq_info(sta->sta.txq[0])); kfree(rcu_dereference_raw(sta->sta.rates)); #ifdef CONFIG_MAC80211_MESH kfree(sta->mesh); @@ -531,6 +528,8 @@ __sta_info_alloc(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee80211_hw *hw = &local->hw; struct sta_info *sta; + void *txq_data; + int size; int i; sta = kzalloc(sizeof(*sta) + hw->sta_data_size, gfp); @@ -600,21 +599,18 @@ __sta_info_alloc(struct ieee80211_sub_if_data *sdata, sta->last_connected = ktime_get_seconds(); - if (local->ops->wake_tx_queue) { - void *txq_data; - int size = sizeof(struct txq_info) + - ALIGN(hw->txq_data_size, sizeof(void *)); + size = sizeof(struct txq_info) + + ALIGN(hw->txq_data_size, sizeof(void *)); - txq_data = kcalloc(ARRAY_SIZE(sta->sta.txq), size, gfp); - if (!txq_data) - goto free; + txq_data = kcalloc(ARRAY_SIZE(sta->sta.txq), size, gfp); + if (!txq_data) + goto free; - for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { - struct txq_info *txq = txq_data + i * size; + for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { + struct txq_info *txq = txq_data + i * size; - /* might not do anything for the bufferable MMPDU TXQ */ - ieee80211_txq_init(sdata, sta, txq, i); - } + /* might not do anything for the (bufferable) MMPDU TXQ */ + ieee80211_txq_init(sdata, sta, txq, i); } if (sta_prepare_rate_control(local, sta, gfp)) @@ -688,8 +684,7 @@ __sta_info_alloc(struct ieee80211_sub_if_data *sdata, return sta; free_txq: - if (sta->sta.txq[0]) - kfree(to_txq_info(sta->sta.txq[0])); + kfree(to_txq_info(sta->sta.txq[0])); free: sta_info_free_link(&sta->deflink); #ifdef CONFIG_MAC80211_MESH @@ -1982,9 +1977,6 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, * TIM recalculation. */ - if (!sta->sta.txq[0]) - return; - for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { if (!sta->sta.txq[tid] || !(driver_release_tids & BIT(tid)) || @@ -2484,7 +2476,7 @@ static void sta_set_tidstats(struct sta_info *sta, tidstats->tx_msdu_failed = sta->deflink.status_stats.msdu_failed[tid]; } - if (local->ops->wake_tx_queue && tid < IEEE80211_NUM_TIDS) { + if (tid < IEEE80211_NUM_TIDS) { spin_lock_bh(&local->fq.lock); rcu_read_lock(); @@ -2812,9 +2804,6 @@ unsigned long ieee80211_sta_last_active(struct sta_info *sta) static void sta_update_codel_params(struct sta_info *sta, u32 thr) { - if (!sta->sdata->local->ops->wake_tx_queue) - return; - if (thr && thr < STA_SLOW_THRESHOLD * sta->local->num_sta) { sta->cparams.target = MS2TIME(50); sta->cparams.interval = MS2TIME(300); diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index f4b4d25eef95..b255f3b5bf01 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -1016,7 +1016,6 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev, skb->priority = 256 + 5; break; } - skb_set_queue_mapping(skb, ieee80211_select_queue(sdata, skb)); /* * Set the WLAN_TDLS_TEARDOWN flag to indicate a teardown in progress. diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index e137355bea67..bb2e54610101 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1599,9 +1599,6 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local) bool supp_vht = false; enum nl80211_band band; - if (!local->ops->wake_tx_queue) - return 0; - ret = fq_init(fq, 4096); if (ret) return ret; @@ -1649,9 +1646,6 @@ void ieee80211_txq_teardown_flows(struct ieee80211_local *local) { struct fq *fq = &local->fq; - if (!local->ops->wake_tx_queue) - return; - kfree(local->cvars); local->cvars = NULL; @@ -1668,8 +1662,7 @@ static bool ieee80211_queue_skb(struct ieee80211_local *local, struct ieee80211_vif *vif; struct txq_info *txqi; - if (!local->ops->wake_tx_queue || - sdata->vif.type == NL80211_IFTYPE_MONITOR) + if (sdata->vif.type == NL80211_IFTYPE_MONITOR) return false; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) @@ -4184,12 +4177,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, if (IS_ERR(sta)) sta = NULL; - if (local->ops->wake_tx_queue) { - u16 queue = __ieee80211_select_queue(sdata, sta, skb); - skb_set_queue_mapping(skb, queue); - skb_get_hash(skb); - } - + skb_set_queue_mapping(skb, ieee80211_select_queue(sdata, sta, skb)); ieee80211_aggr_check(sdata, sta, skb); sk_pacing_shift_update(skb->sk, sdata->local->hw.tx_sk_pacing_shift); @@ -4495,11 +4483,7 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, struct tid_ampdu_tx *tid_tx; u8 tid; - if (local->ops->wake_tx_queue) { - u16 queue = __ieee80211_select_queue(sdata, sta, skb); - skb_set_queue_mapping(skb, queue); - skb_get_hash(skb); - } + skb_set_queue_mapping(skb, ieee80211_select_queue(sdata, sta, skb)); if (unlikely(test_bit(SCAN_SW_SCANNING, &local->scanning)) && test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state)) @@ -4753,9 +4737,6 @@ void ieee80211_tx_pending(struct tasklet_struct *t) if (!txok) break; } - - if (skb_queue_empty(&local->pending[i])) - ieee80211_propagate_queue_wake(local, i); } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); @@ -5948,10 +5929,9 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, } if (!IS_ERR(sta)) { - u16 queue = __ieee80211_select_queue(sdata, sta, skb); + u16 queue = ieee80211_select_queue(sdata, sta, skb); skb_set_queue_mapping(skb, queue); - skb_get_hash(skb); /* * for MLO STA, the SA should be the AP MLD address, but diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 7a80843f1a0a..53217dc3f932 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -446,39 +446,6 @@ void ieee80211_wake_txqs(struct tasklet_struct *t) spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } -void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue) -{ - struct ieee80211_sub_if_data *sdata; - int n_acs = IEEE80211_NUM_ACS; - - if (local->ops->wake_tx_queue) - return; - - if (local->hw.queues < IEEE80211_NUM_ACS) - n_acs = 1; - - list_for_each_entry_rcu(sdata, &local->interfaces, list) { - int ac; - - if (!sdata->dev) - continue; - - if (sdata->vif.cab_queue != IEEE80211_INVAL_HW_QUEUE && - local->queue_stop_reasons[sdata->vif.cab_queue] != 0) - continue; - - for (ac = 0; ac < n_acs; ac++) { - int ac_queue = sdata->vif.hw_queue[ac]; - - if (ac_queue == queue || - (sdata->vif.cab_queue == queue && - local->queue_stop_reasons[ac_queue] == 0 && - skb_queue_empty(&local->pending[ac_queue]))) - netif_wake_subqueue(sdata->dev, ac); - } - } -} - static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted, @@ -509,11 +476,7 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, /* someone still has this queue stopped */ return; - if (skb_queue_empty(&local->pending[queue])) { - rcu_read_lock(); - ieee80211_propagate_queue_wake(local, queue); - rcu_read_unlock(); - } else + if (!skb_queue_empty(&local->pending[queue])) tasklet_schedule(&local->tx_pending_tasklet); /* @@ -523,12 +486,10 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, * release someone's lock, but it is fine because all the callers of * __ieee80211_wake_queue call it right before releasing the lock. */ - if (local->ops->wake_tx_queue) { - if (reason == IEEE80211_QUEUE_STOP_REASON_DRIVER) - tasklet_schedule(&local->wake_txqs_tasklet); - else - _ieee80211_wake_txqs(local, flags); - } + if (reason == IEEE80211_QUEUE_STOP_REASON_DRIVER) + tasklet_schedule(&local->wake_txqs_tasklet); + else + _ieee80211_wake_txqs(local, flags); } void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, @@ -585,10 +546,6 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue, for (ac = 0; ac < n_acs; ac++) { if (sdata->vif.hw_queue[ac] == queue || sdata->vif.cab_queue == queue) { - if (!local->ops->wake_tx_queue) { - netif_stop_subqueue(sdata->dev, ac); - continue; - } spin_lock(&local->fq.lock); sdata->vif.txqs_stopped[ac] = true; spin_unlock(&local->fq.lock); diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index 9fab97f6fbea..a12c63638680 100644 --- a/net/mac80211/wme.c +++ b/net/mac80211/wme.c @@ -122,6 +122,9 @@ u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u8 *p; + /* Ensure hash is set prior to potential SW encryption */ + skb_get_hash(skb); + if ((info->control.flags & IEEE80211_TX_CTRL_DONT_REORDER) || local->hw.queues < IEEE80211_NUM_ACS) return 0; @@ -141,12 +144,15 @@ u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata, return ieee80211_downgrade_queue(sdata, NULL, skb); } -u16 __ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, struct sk_buff *skb) +u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, struct sk_buff *skb) { struct mac80211_qos_map *qos_map; bool qos; + /* Ensure hash is set prior to potential SW encryption */ + skb_get_hash(skb); + /* all mesh/ocb stations are required to support WME */ if (sta && (sdata->vif.type == NL80211_IFTYPE_MESH_POINT || sdata->vif.type == NL80211_IFTYPE_OCB)) @@ -176,59 +182,6 @@ u16 __ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, return ieee80211_downgrade_queue(sdata, sta, skb); } - -/* Indicate which queue to use. */ -u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, - struct sk_buff *skb) -{ - struct ieee80211_local *local = sdata->local; - struct sta_info *sta = NULL; - const u8 *ra = NULL; - u16 ret; - - /* when using iTXQ, we can do this later */ - if (local->ops->wake_tx_queue) - return 0; - - if (local->hw.queues < IEEE80211_NUM_ACS || skb->len < 6) { - skb->priority = 0; /* required for correct WPA/11i MIC */ - return 0; - } - - rcu_read_lock(); - switch (sdata->vif.type) { - case NL80211_IFTYPE_AP_VLAN: - sta = rcu_dereference(sdata->u.vlan.sta); - if (sta) - break; - fallthrough; - case NL80211_IFTYPE_AP: - ra = skb->data; - break; - case NL80211_IFTYPE_STATION: - /* might be a TDLS station */ - sta = sta_info_get(sdata, skb->data); - if (sta) - break; - - ra = sdata->vif.cfg.ap_addr; - break; - case NL80211_IFTYPE_ADHOC: - ra = skb->data; - break; - default: - break; - } - - if (!sta && ra && !is_multicast_ether_addr(ra)) - sta = sta_info_get(sdata, ra); - - ret = __ieee80211_select_queue(sdata, sta, skb); - - rcu_read_unlock(); - return ret; -} - /** * ieee80211_set_qos_hdr - Fill in the QoS header if there is one. * diff --git a/net/mac80211/wme.h b/net/mac80211/wme.h index 2e3dec0b6087..81f0039527a9 100644 --- a/net/mac80211/wme.h +++ b/net/mac80211/wme.h @@ -13,10 +13,8 @@ u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct ieee80211_hdr *hdr); -u16 __ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, struct sk_buff *skb); u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, - struct sk_buff *skb); + struct sta_info *sta, struct sk_buff *skb); void ieee80211_set_qos_hdr(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); -- cgit v1.2.3-70-g09d2 From e9d8d9c4084dadfa5a68a2e6e4e4dda7a0a728ba Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 7 Oct 2022 10:53:04 +0200 Subject: mac802154: move receive parameters above start This patch moves all receive parameters above the drv_start() functionality to make it accessibile in the drv_start() function. Signed-off-by: Alexander Aring Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/r/20221007085310.503366-3-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- net/mac802154/driver-ops.h | 190 ++++++++++++++++++++++----------------------- 1 file changed, 95 insertions(+), 95 deletions(-) (limited to 'net') diff --git a/net/mac802154/driver-ops.h b/net/mac802154/driver-ops.h index d23f0db98015..c9d54088a567 100644 --- a/net/mac802154/driver-ops.h +++ b/net/mac802154/driver-ops.h @@ -24,203 +24,221 @@ drv_xmit_sync(struct ieee802154_local *local, struct sk_buff *skb) return local->ops->xmit_sync(&local->hw, skb); } -static inline int drv_start(struct ieee802154_local *local) +static inline int drv_set_pan_id(struct ieee802154_local *local, __le16 pan_id) { + struct ieee802154_hw_addr_filt filt; int ret; might_sleep(); - trace_802154_drv_start(local); - local->started = true; - smp_mb(); - ret = local->ops->start(&local->hw); + if (!local->ops->set_hw_addr_filt) { + WARN_ON(1); + return -EOPNOTSUPP; + } + + filt.pan_id = pan_id; + + trace_802154_drv_set_pan_id(local, pan_id); + ret = local->ops->set_hw_addr_filt(&local->hw, &filt, + IEEE802154_AFILT_PANID_CHANGED); trace_802154_drv_return_int(local, ret); return ret; } -static inline void drv_stop(struct ieee802154_local *local) +static inline int +drv_set_extended_addr(struct ieee802154_local *local, __le64 extended_addr) { - might_sleep(); + struct ieee802154_hw_addr_filt filt; + int ret; - trace_802154_drv_stop(local); - local->ops->stop(&local->hw); - trace_802154_drv_return_void(local); + might_sleep(); - /* sync away all work on the tasklet before clearing started */ - tasklet_disable(&local->tasklet); - tasklet_enable(&local->tasklet); + if (!local->ops->set_hw_addr_filt) { + WARN_ON(1); + return -EOPNOTSUPP; + } - barrier(); + filt.ieee_addr = extended_addr; - local->started = false; + trace_802154_drv_set_extended_addr(local, extended_addr); + ret = local->ops->set_hw_addr_filt(&local->hw, &filt, + IEEE802154_AFILT_IEEEADDR_CHANGED); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int -drv_set_channel(struct ieee802154_local *local, u8 page, u8 channel) +drv_set_short_addr(struct ieee802154_local *local, __le16 short_addr) { + struct ieee802154_hw_addr_filt filt; int ret; might_sleep(); - trace_802154_drv_set_channel(local, page, channel); - ret = local->ops->set_channel(&local->hw, page, channel); + if (!local->ops->set_hw_addr_filt) { + WARN_ON(1); + return -EOPNOTSUPP; + } + + filt.short_addr = short_addr; + + trace_802154_drv_set_short_addr(local, short_addr); + ret = local->ops->set_hw_addr_filt(&local->hw, &filt, + IEEE802154_AFILT_SADDR_CHANGED); trace_802154_drv_return_int(local, ret); return ret; } -static inline int drv_set_tx_power(struct ieee802154_local *local, s32 mbm) +static inline int +drv_set_pan_coord(struct ieee802154_local *local, bool is_coord) { + struct ieee802154_hw_addr_filt filt; int ret; might_sleep(); - if (!local->ops->set_txpower) { + if (!local->ops->set_hw_addr_filt) { WARN_ON(1); return -EOPNOTSUPP; } - trace_802154_drv_set_tx_power(local, mbm); - ret = local->ops->set_txpower(&local->hw, mbm); + filt.pan_coord = is_coord; + + trace_802154_drv_set_pan_coord(local, is_coord); + ret = local->ops->set_hw_addr_filt(&local->hw, &filt, + IEEE802154_AFILT_PANC_CHANGED); trace_802154_drv_return_int(local, ret); return ret; } -static inline int drv_set_cca_mode(struct ieee802154_local *local, - const struct wpan_phy_cca *cca) +static inline int +drv_set_promiscuous_mode(struct ieee802154_local *local, bool on) { int ret; might_sleep(); - if (!local->ops->set_cca_mode) { + if (!local->ops->set_promiscuous_mode) { WARN_ON(1); return -EOPNOTSUPP; } - trace_802154_drv_set_cca_mode(local, cca); - ret = local->ops->set_cca_mode(&local->hw, cca); + trace_802154_drv_set_promiscuous_mode(local, on); + ret = local->ops->set_promiscuous_mode(&local->hw, on); trace_802154_drv_return_int(local, ret); return ret; } -static inline int drv_set_lbt_mode(struct ieee802154_local *local, bool mode) +static inline int drv_start(struct ieee802154_local *local) { int ret; might_sleep(); - if (!local->ops->set_lbt) { - WARN_ON(1); - return -EOPNOTSUPP; - } - - trace_802154_drv_set_lbt_mode(local, mode); - ret = local->ops->set_lbt(&local->hw, mode); + trace_802154_drv_start(local); + local->started = true; + smp_mb(); + ret = local->ops->start(&local->hw); trace_802154_drv_return_int(local, ret); return ret; } +static inline void drv_stop(struct ieee802154_local *local) +{ + might_sleep(); + + trace_802154_drv_stop(local); + local->ops->stop(&local->hw); + trace_802154_drv_return_void(local); + + /* sync away all work on the tasklet before clearing started */ + tasklet_disable(&local->tasklet); + tasklet_enable(&local->tasklet); + + barrier(); + + local->started = false; +} + static inline int -drv_set_cca_ed_level(struct ieee802154_local *local, s32 mbm) +drv_set_channel(struct ieee802154_local *local, u8 page, u8 channel) { int ret; might_sleep(); - if (!local->ops->set_cca_ed_level) { - WARN_ON(1); - return -EOPNOTSUPP; - } - - trace_802154_drv_set_cca_ed_level(local, mbm); - ret = local->ops->set_cca_ed_level(&local->hw, mbm); + trace_802154_drv_set_channel(local, page, channel); + ret = local->ops->set_channel(&local->hw, page, channel); trace_802154_drv_return_int(local, ret); return ret; } -static inline int drv_set_pan_id(struct ieee802154_local *local, __le16 pan_id) +static inline int drv_set_tx_power(struct ieee802154_local *local, s32 mbm) { - struct ieee802154_hw_addr_filt filt; int ret; might_sleep(); - if (!local->ops->set_hw_addr_filt) { + if (!local->ops->set_txpower) { WARN_ON(1); return -EOPNOTSUPP; } - filt.pan_id = pan_id; - - trace_802154_drv_set_pan_id(local, pan_id); - ret = local->ops->set_hw_addr_filt(&local->hw, &filt, - IEEE802154_AFILT_PANID_CHANGED); + trace_802154_drv_set_tx_power(local, mbm); + ret = local->ops->set_txpower(&local->hw, mbm); trace_802154_drv_return_int(local, ret); return ret; } -static inline int -drv_set_extended_addr(struct ieee802154_local *local, __le64 extended_addr) +static inline int drv_set_cca_mode(struct ieee802154_local *local, + const struct wpan_phy_cca *cca) { - struct ieee802154_hw_addr_filt filt; int ret; might_sleep(); - if (!local->ops->set_hw_addr_filt) { + if (!local->ops->set_cca_mode) { WARN_ON(1); return -EOPNOTSUPP; } - filt.ieee_addr = extended_addr; - - trace_802154_drv_set_extended_addr(local, extended_addr); - ret = local->ops->set_hw_addr_filt(&local->hw, &filt, - IEEE802154_AFILT_IEEEADDR_CHANGED); + trace_802154_drv_set_cca_mode(local, cca); + ret = local->ops->set_cca_mode(&local->hw, cca); trace_802154_drv_return_int(local, ret); return ret; } -static inline int -drv_set_short_addr(struct ieee802154_local *local, __le16 short_addr) +static inline int drv_set_lbt_mode(struct ieee802154_local *local, bool mode) { - struct ieee802154_hw_addr_filt filt; int ret; might_sleep(); - if (!local->ops->set_hw_addr_filt) { + if (!local->ops->set_lbt) { WARN_ON(1); return -EOPNOTSUPP; } - filt.short_addr = short_addr; - - trace_802154_drv_set_short_addr(local, short_addr); - ret = local->ops->set_hw_addr_filt(&local->hw, &filt, - IEEE802154_AFILT_SADDR_CHANGED); + trace_802154_drv_set_lbt_mode(local, mode); + ret = local->ops->set_lbt(&local->hw, mode); trace_802154_drv_return_int(local, ret); return ret; } static inline int -drv_set_pan_coord(struct ieee802154_local *local, bool is_coord) +drv_set_cca_ed_level(struct ieee802154_local *local, s32 mbm) { - struct ieee802154_hw_addr_filt filt; int ret; might_sleep(); - if (!local->ops->set_hw_addr_filt) { + if (!local->ops->set_cca_ed_level) { WARN_ON(1); return -EOPNOTSUPP; } - filt.pan_coord = is_coord; - - trace_802154_drv_set_pan_coord(local, is_coord); - ret = local->ops->set_hw_addr_filt(&local->hw, &filt, - IEEE802154_AFILT_PANC_CHANGED); + trace_802154_drv_set_cca_ed_level(local, mbm); + ret = local->ops->set_cca_ed_level(&local->hw, mbm); trace_802154_drv_return_int(local, ret); return ret; } @@ -264,22 +282,4 @@ drv_set_max_frame_retries(struct ieee802154_local *local, s8 max_frame_retries) return ret; } -static inline int -drv_set_promiscuous_mode(struct ieee802154_local *local, bool on) -{ - int ret; - - might_sleep(); - - if (!local->ops->set_promiscuous_mode) { - WARN_ON(1); - return -EOPNOTSUPP; - } - - trace_802154_drv_set_promiscuous_mode(local, on); - ret = local->ops->set_promiscuous_mode(&local->hw, on); - trace_802154_drv_return_int(local, ret); - return ret; -} - #endif /* __MAC802154_DRIVER_OPS */ -- cgit v1.2.3-70-g09d2 From ac8037c35bd1fb1799d39f52205f813e6585b58b Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 7 Oct 2022 10:53:05 +0200 Subject: mac802154: set filter at drv_start() The current filtering level is set on the first interface up on a wpan phy. If we support scan functionality we need to change the filtering level on the fly on an operational phy and switching back again. This patch will move the receive mode parameter e.g. address filter and promiscuous mode to the drv_start() functionality to allow changing the receive mode on an operational phy not on first ifup only. In future this should be handled on driver layer because each hardware has it's own way to enter a specific filtering level. However this should offer to switch to mode IEEE802154_FILTERING_NONE and back to IEEE802154_FILTERING_4_FRAME_FIELDS. Only IEEE802154_FILTERING_4_FRAME_FIELDS and IEEE802154_FILTERING_NONE are somewhat supported by current hardware. All other filtering levels can be supported in future but will end in IEEE802154_FILTERING_NONE as the receive part can kind of "emulate" those receive paths by doing additional filtering routines. There are in total three filtering levels in the code: - the per-interface default level (should not be changed) - the required per-interface level (mac commands may play with it) - the actual per-PHY (hw) level that is currently in use Signed-off-by: Alexander Aring [ Link: https://lore.kernel.org/r/20221007085310.503366-4-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/net/cfg802154.h | 7 +++-- net/mac802154/cfg.c | 2 +- net/mac802154/driver-ops.h | 71 +++++++++++++++++++++++++++++++++++++++++++- net/mac802154/ieee802154_i.h | 12 ++++++++ net/mac802154/iface.c | 44 ++++++++++----------------- net/mac802154/rx.c | 12 +++++++- 6 files changed, 115 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index 428cece22205..e1481f9cf049 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -223,6 +223,11 @@ struct wpan_phy { atomic_t hold_txs; wait_queue_head_t sync_txq; + /* Current filtering level on reception. + * Only allowed to be changed if phy is not operational. + */ + enum ieee802154_filtering_level filtering; + char priv[] __aligned(NETDEV_ALIGN); }; @@ -374,8 +379,6 @@ struct wpan_dev { bool lbt; - bool promiscuous_mode; - /* fallback for acknowledgment bit setting */ bool ackreq; }; diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c index 93df24f75572..dc2d918fac68 100644 --- a/net/mac802154/cfg.c +++ b/net/mac802154/cfg.c @@ -67,7 +67,7 @@ static int ieee802154_resume(struct wpan_phy *wpan_phy) goto wake_up; /* restart hardware */ - ret = drv_start(local); + ret = drv_start(local, local->phy->filtering, &local->addr_filt); if (ret) return ret; diff --git a/net/mac802154/driver-ops.h b/net/mac802154/driver-ops.h index c9d54088a567..a7af3f0ddb3e 100644 --- a/net/mac802154/driver-ops.h +++ b/net/mac802154/driver-ops.h @@ -129,12 +129,81 @@ drv_set_promiscuous_mode(struct ieee802154_local *local, bool on) return ret; } -static inline int drv_start(struct ieee802154_local *local) +static inline int drv_start(struct ieee802154_local *local, + enum ieee802154_filtering_level level, + const struct ieee802154_hw_addr_filt *addr_filt) { int ret; might_sleep(); + /* setup receive mode parameters e.g. address mode */ + if (local->hw.flags & IEEE802154_HW_AFILT) { + ret = drv_set_pan_id(local, addr_filt->pan_id); + if (ret < 0) + return ret; + + ret = drv_set_short_addr(local, addr_filt->short_addr); + if (ret < 0) + return ret; + + ret = drv_set_extended_addr(local, addr_filt->ieee_addr); + if (ret < 0) + return ret; + } + + switch (level) { + case IEEE802154_FILTERING_NONE: + fallthrough; + case IEEE802154_FILTERING_1_FCS: + fallthrough; + case IEEE802154_FILTERING_2_PROMISCUOUS: + /* TODO: Requires a different receive mode setup e.g. + * at86rf233 hardware. + */ + fallthrough; + case IEEE802154_FILTERING_3_SCAN: + if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) { + ret = drv_set_promiscuous_mode(local, true); + if (ret < 0) + return ret; + } else { + return -EOPNOTSUPP; + } + + /* In practice other filtering levels can be requested, but as + * for now most hardware/drivers only support + * IEEE802154_FILTERING_NONE, we fallback to this actual + * filtering level in hardware and make our own additional + * filtering in mac802154 receive path. + * + * TODO: Move this logic to the device drivers as hardware may + * support more higher level filters. Hardware may also require + * a different order how register are set, which could currently + * be buggy, so all received parameters need to be moved to the + * start() callback and let the driver go into the mode before + * it will turn on receive handling. + */ + local->phy->filtering = IEEE802154_FILTERING_NONE; + break; + case IEEE802154_FILTERING_4_FRAME_FIELDS: + /* Do not error out if IEEE802154_HW_PROMISCUOUS because we + * expect the hardware to operate at the level + * IEEE802154_FILTERING_4_FRAME_FIELDS anyway. + */ + if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) { + ret = drv_set_promiscuous_mode(local, false); + if (ret < 0) + return ret; + } + + local->phy->filtering = IEEE802154_FILTERING_4_FRAME_FIELDS; + break; + default: + WARN_ON(1); + return -EINVAL; + } + trace_802154_drv_start(local); local->started = true; smp_mb(); diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index 010365a6364e..509e0172fe82 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -26,6 +26,8 @@ struct ieee802154_local { struct ieee802154_hw hw; const struct ieee802154_ops *ops; + /* hardware address filter */ + struct ieee802154_hw_addr_filt addr_filt; /* ieee802154 phy */ struct wpan_phy *phy; @@ -82,6 +84,16 @@ struct ieee802154_sub_if_data { struct ieee802154_local *local; struct net_device *dev; + /* Each interface starts and works in nominal state at a given filtering + * level given by iface_default_filtering, which is set once for all at + * the interface creation and should not evolve over time. For some MAC + * operations however, the filtering level may change temporarily, as + * reflected in the required_filtering field. The actual filtering at + * the PHY level may be different and is shown in struct wpan_phy. + */ + enum ieee802154_filtering_level iface_default_filtering; + enum ieee802154_filtering_level required_filtering; + unsigned long state; char name[IFNAMSIZ]; diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 500ed1b81250..d9b50884d34e 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -147,25 +147,12 @@ static int ieee802154_setup_hw(struct ieee802154_sub_if_data *sdata) struct wpan_dev *wpan_dev = &sdata->wpan_dev; int ret; - if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) { - ret = drv_set_promiscuous_mode(local, - wpan_dev->promiscuous_mode); - if (ret < 0) - return ret; - } + sdata->required_filtering = sdata->iface_default_filtering; if (local->hw.flags & IEEE802154_HW_AFILT) { - ret = drv_set_pan_id(local, wpan_dev->pan_id); - if (ret < 0) - return ret; - - ret = drv_set_extended_addr(local, wpan_dev->extended_addr); - if (ret < 0) - return ret; - - ret = drv_set_short_addr(local, wpan_dev->short_addr); - if (ret < 0) - return ret; + local->addr_filt.pan_id = wpan_dev->pan_id; + local->addr_filt.ieee_addr = wpan_dev->extended_addr; + local->addr_filt.short_addr = wpan_dev->short_addr; } if (local->hw.flags & IEEE802154_HW_LBT) { @@ -206,7 +193,8 @@ static int mac802154_slave_open(struct net_device *dev) if (res) goto err; - res = drv_start(local); + res = drv_start(local, sdata->required_filtering, + &local->addr_filt); if (res) goto err; } @@ -223,15 +211,16 @@ err: static int ieee802154_check_mac_settings(struct ieee802154_local *local, - struct wpan_dev *wpan_dev, - struct wpan_dev *nwpan_dev) + struct ieee802154_sub_if_data *sdata, + struct ieee802154_sub_if_data *nsdata) { + struct wpan_dev *nwpan_dev = &nsdata->wpan_dev; + struct wpan_dev *wpan_dev = &sdata->wpan_dev; + ASSERT_RTNL(); - if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) { - if (wpan_dev->promiscuous_mode != nwpan_dev->promiscuous_mode) - return -EBUSY; - } + if (sdata->iface_default_filtering != nsdata->iface_default_filtering) + return -EBUSY; if (local->hw.flags & IEEE802154_HW_AFILT) { if (wpan_dev->pan_id != nwpan_dev->pan_id || @@ -285,8 +274,7 @@ ieee802154_check_concurrent_iface(struct ieee802154_sub_if_data *sdata, /* check all phy mac sublayer settings are the same. * We have only one phy, different values makes trouble. */ - ret = ieee802154_check_mac_settings(local, wpan_dev, - &nsdata->wpan_dev); + ret = ieee802154_check_mac_settings(local, sdata, nsdata); if (ret < 0) return ret; } @@ -586,7 +574,7 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, sdata->dev->priv_destructor = mac802154_wpan_free; sdata->dev->netdev_ops = &mac802154_wpan_ops; sdata->dev->ml_priv = &mac802154_mlme_wpan; - wpan_dev->promiscuous_mode = false; + sdata->iface_default_filtering = IEEE802154_FILTERING_4_FRAME_FIELDS; wpan_dev->header_ops = &ieee802154_header_ops; mutex_init(&sdata->sec_mtx); @@ -600,7 +588,7 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, case NL802154_IFTYPE_MONITOR: sdata->dev->needs_free_netdev = true; sdata->dev->netdev_ops = &mac802154_monitor_ops; - wpan_dev->promiscuous_mode = true; + sdata->iface_default_filtering = IEEE802154_FILTERING_NONE; break; default: BUG(); diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index b8ce84618a55..8543c28948a0 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -268,10 +268,20 @@ void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb) ieee802154_monitors_rx(local, skb); + /* TODO: Avoid delivering frames received at the level + * IEEE802154_FILTERING_NONE on interfaces not expecting it because of + * the missing auto ACK handling feature. + */ + + /* TODO: Handle upcomming receive path where the PHY is at the + * IEEE802154_FILTERING_NONE level during a scan. + */ + /* Check if transceiver doesn't validate the checksum. * If not we validate the checksum here. */ - if (local->hw.flags & IEEE802154_HW_RX_DROP_BAD_CKSUM) { + if (local->hw.flags & IEEE802154_HW_RX_DROP_BAD_CKSUM || + local->phy->filtering == IEEE802154_FILTERING_NONE) { crc = crc_ccitt(0, skb->data, skb->len); if (crc) { rcu_read_unlock(); -- cgit v1.2.3-70-g09d2 From a4b5b4c56dd8b1dd46b2f13cb09f5f8031978f86 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 7 Oct 2022 10:53:08 +0200 Subject: mac802154: Drop IEEE802154_HW_RX_DROP_BAD_CKSUM This IEEE802154_HW_RX_DROP_BAD_CKSUM flag was only used by hwsim to reflect the fact that it would not validate the checksum (FCS). So this was only useful while the only filtering level hwsim was capable of was "NONE". Now that the driver has been improved we no longer need this flag. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20221007085310.503366-7-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/mac802154_hwsim.c | 3 ++- include/net/mac802154.h | 4 ---- net/mac802154/rx.c | 7 ++----- 3 files changed, 4 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/drivers/net/ieee802154/mac802154_hwsim.c b/drivers/net/ieee802154/mac802154_hwsim.c index 75d802e0b685..1db7da3ccc1a 100644 --- a/drivers/net/ieee802154/mac802154_hwsim.c +++ b/drivers/net/ieee802154/mac802154_hwsim.c @@ -287,6 +287,7 @@ static int hwsim_hw_start(struct ieee802154_hw *hw) struct hwsim_phy *phy = hw->priv; phy->suspended = false; + return 0; } @@ -933,7 +934,7 @@ static int hwsim_add_one(struct genl_info *info, struct device *dev, phy->idx = idx; INIT_LIST_HEAD(&phy->edges); - hw->flags = IEEE802154_HW_PROMISCUOUS | IEEE802154_HW_RX_DROP_BAD_CKSUM; + hw->flags = IEEE802154_HW_PROMISCUOUS; hw->parent = dev; err = ieee802154_register_hw(hw); diff --git a/include/net/mac802154.h b/include/net/mac802154.h index 357d25ef627a..4a3a9de9da73 100644 --- a/include/net/mac802154.h +++ b/include/net/mac802154.h @@ -111,9 +111,6 @@ struct ieee802154_hw { * promiscuous mode setting. * * @IEEE802154_HW_RX_OMIT_CKSUM: Indicates that receiver omits FCS. - * - * @IEEE802154_HW_RX_DROP_BAD_CKSUM: Indicates that receiver will not filter - * frames with bad checksum. */ enum ieee802154_hw_flags { IEEE802154_HW_TX_OMIT_CKSUM = BIT(0), @@ -123,7 +120,6 @@ enum ieee802154_hw_flags { IEEE802154_HW_AFILT = BIT(4), IEEE802154_HW_PROMISCUOUS = BIT(5), IEEE802154_HW_RX_OMIT_CKSUM = BIT(6), - IEEE802154_HW_RX_DROP_BAD_CKSUM = BIT(7), }; /* Indicates that receiver omits FCS and xmitter will add FCS on it's own. */ diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index 8543c28948a0..80dd52bc6bf1 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -277,11 +277,8 @@ void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb) * IEEE802154_FILTERING_NONE level during a scan. */ - /* Check if transceiver doesn't validate the checksum. - * If not we validate the checksum here. - */ - if (local->hw.flags & IEEE802154_HW_RX_DROP_BAD_CKSUM || - local->phy->filtering == IEEE802154_FILTERING_NONE) { + /* Level 1 filtering: Check the FCS by software when relevant */ + if (local->hw.phy->filtering == IEEE802154_FILTERING_NONE) { crc = crc_ccitt(0, skb->data, skb->len); if (crc) { rcu_read_unlock(); -- cgit v1.2.3-70-g09d2 From 0218277df5a527c9e9474aa0e4e4e2f4bbc6f5cc Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 7 Oct 2022 10:53:09 +0200 Subject: mac802154: Avoid delivering frames received in a non satisfying filtering mode We must avoid the situation where one interface disables address filtering and AACK on the PHY while another interface expects to run with AACK and address filtering enabled. Just ignore the frames on the concerned interface if this happens. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20221007085310.503366-8-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- net/mac802154/rx.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index 80dd52bc6bf1..6640bd237352 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -209,6 +209,13 @@ __ieee802154_rx_handle_packet(struct ieee802154_local *local, if (!ieee802154_sdata_running(sdata)) continue; + /* Do not deliver packets received on interfaces expecting + * AACK=1 if the address filters where disabled. + */ + if (local->hw.phy->filtering < IEEE802154_FILTERING_4_FRAME_FIELDS && + sdata->required_filtering == IEEE802154_FILTERING_4_FRAME_FIELDS) + continue; + ieee802154_subif_frame(sdata, skb, &hdr); skb = NULL; break; @@ -268,11 +275,6 @@ void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb) ieee802154_monitors_rx(local, skb); - /* TODO: Avoid delivering frames received at the level - * IEEE802154_FILTERING_NONE on interfaces not expecting it because of - * the missing auto ACK handling feature. - */ - /* TODO: Handle upcomming receive path where the PHY is at the * IEEE802154_FILTERING_NONE level during a scan. */ -- cgit v1.2.3-70-g09d2 From 3a22550ab50a2bed1779c7d03c9f0239d33cc514 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Mon, 5 Sep 2022 22:27:24 +0200 Subject: net: mac802154: Avoid displaying misleading debug information With DEBUG defined, any frame received will see its MHR fields (fc and addresses, mainly) being printed in the kernel log buffer, unconditionally. In most cases this is fine, but in some specific cases (like Acknowledgment frames, where both the source and destination addressing fields are omitted), it displays garbage which is misleading. Only print the addressing fields when they are present, which clarifies the logs. Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/r/20220905202724.1322046-1-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- net/mac802154/rx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index 6640bd237352..d1f7b8df41fe 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -114,8 +114,10 @@ fail: static void ieee802154_print_addr(const char *name, const struct ieee802154_addr *addr) { - if (addr->mode == IEEE802154_ADDR_NONE) + if (addr->mode == IEEE802154_ADDR_NONE) { pr_debug("%s not present\n", name); + return; + } pr_debug("%s PAN ID: %04x\n", name, le16_to_cpu(addr->pan_id)); if (addr->mode == IEEE802154_ADDR_SHORT) { -- cgit v1.2.3-70-g09d2 From f00909e2e6fe4ac6b2420e3863a0c533fe4f15e0 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Mon, 17 Oct 2022 17:35:40 +0800 Subject: net: ip6_gre: Remove the unused function ip6gre_tnl_addr_conflict() The function ip6gre_tnl_addr_conflict() is defined in the ip6_gre.c file, but not called elsewhere, so delete this unused function. net/ipv6/ip6_gre.c:887:20: warning: unused function 'ip6gre_tnl_addr_conflict'. Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2419 Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Link: https://lore.kernel.org/r/20221017093540.26806-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Paolo Abeni --- net/ipv6/ip6_gre.c | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 48b4ff0294f6..02b1b54165e8 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -870,26 +870,6 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) return 0; } -/** - * ip6gre_tnl_addr_conflict - compare packet addresses to tunnel's own - * @t: the outgoing tunnel device - * @hdr: IPv6 header from the incoming packet - * - * Description: - * Avoid trivial tunneling loop by checking that tunnel exit-point - * doesn't match source of incoming packet. - * - * Return: - * 1 if conflict, - * 0 else - **/ - -static inline bool ip6gre_tnl_addr_conflict(const struct ip6_tnl *t, - const struct ipv6hdr *hdr) -{ - return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); -} - static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); -- cgit v1.2.3-70-g09d2 From e91001bae0d1725d9a49b5bfb5f46f6d1ca6bf1d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 17 Oct 2022 23:08:09 +0100 Subject: esp6: remove redundant variable err Variable err is being assigned a value that is not read, the assignment is redundant and so is the variable. Remove it. Cleans up clang scan warning: net/ipv6/esp6_offload.c:64:7: warning: Although the value stored to 'err' is used in the enclosing expression, the value is never actually read from 'err' [deadcode.DeadStores] Signed-off-by: Colin Ian King Signed-off-by: Steffen Klassert --- net/ipv6/esp6_offload.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 79d43548279c..97edf461bc72 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -56,12 +56,11 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head, __be32 seq; __be32 spi; int nhoff; - int err; if (!pskb_pull(skb, offset)) return NULL; - if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0) + if (xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq) != 0) goto out; xo = xfrm_offload(skb); -- cgit v1.2.3-70-g09d2 From 262985fad1bd819d1323c6dbd72a8d9ed1c6090c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 18 Oct 2022 09:40:00 +0300 Subject: bridge: mcast: Use spin_lock() instead of spin_lock_bh() IGMPv3 / MLDv2 Membership Reports are only processed from the data path with softIRQ disabled, so there is no need to call spin_lock_bh(). Use spin_lock() instead. This is consistent with how other IGMP / MLD packets are processed. Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index db4f2641d1cd..09140bc8c15e 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -2669,7 +2669,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge_mcast *brmctx, if (!pmctx || igmpv2) continue; - spin_lock_bh(&brmctx->br->multicast_lock); + spin_lock(&brmctx->br->multicast_lock); if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto unlock_continue; @@ -2717,7 +2717,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge_mcast *brmctx, if (changed) br_mdb_notify(brmctx->br->dev, mdst, pg, RTM_NEWMDB); unlock_continue: - spin_unlock_bh(&brmctx->br->multicast_lock); + spin_unlock(&brmctx->br->multicast_lock); } return err; @@ -2807,7 +2807,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge_mcast *brmctx, if (!pmctx || mldv1) continue; - spin_lock_bh(&brmctx->br->multicast_lock); + spin_lock(&brmctx->br->multicast_lock); if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto unlock_continue; @@ -2859,7 +2859,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge_mcast *brmctx, if (changed) br_mdb_notify(brmctx->br->dev, mdst, pg, RTM_NEWMDB); unlock_continue: - spin_unlock_bh(&brmctx->br->multicast_lock); + spin_unlock(&brmctx->br->multicast_lock); } return err; -- cgit v1.2.3-70-g09d2 From d1942cd47dbdfb5a6187f660418a4e1be38d1312 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 18 Oct 2022 09:40:01 +0300 Subject: bridge: mcast: Simplify MDB entry creation Before creating a new MDB entry, br_multicast_new_group() will call br_mdb_ip_get() to see if one exists and return it if so. Therefore, simply call br_multicast_new_group() and omit the call to br_mdb_ip_get(). Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_mdb.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 589ff497d50c..321be94c445a 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -866,7 +866,6 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, unsigned long now = jiffies; unsigned char flags = 0; u8 filter_mode; - int err; __mdb_entry_to_br_ip(entry, &group, mdb_attrs); @@ -892,13 +891,9 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, return -EINVAL; } - mp = br_mdb_ip_get(br, &group); - if (!mp) { - mp = br_multicast_new_group(br, &group); - err = PTR_ERR_OR_ZERO(mp); - if (err) - return err; - } + mp = br_multicast_new_group(br, &group); + if (IS_ERR(mp)) + return PTR_ERR(mp); /* host join */ if (!port) { -- cgit v1.2.3-70-g09d2 From de1deb156970bade8b877124c4142edac1696ee6 Mon Sep 17 00:00:00 2001 From: Zhang Changzhong Date: Fri, 30 Sep 2022 14:52:08 +0800 Subject: can: j1939: j1939_session_tx_eoma(): fix debug info Use "%s" instead of "%p" to print function name in debug info. Signed-off-by: Zhang Changzhong Acked-by: Oleksij Rempel Link: https://lore.kernel.org/all/1664520728-4644-1-git-send-email-zhangchangzhong@huawei.com Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index d7d86c944d76..6ec48a41988c 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -985,7 +985,7 @@ static int j1939_session_tx_eoma(struct j1939_session *session) /* wait for the EOMA packet to come in */ j1939_tp_set_rxtimeout(session, 1250); - netdev_dbg(session->priv->ndev, "%p: 0x%p\n", __func__, session); + netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); return 0; } -- cgit v1.2.3-70-g09d2 From ab3f7828c9793a5dfa99a54dc19ae3491c38bfa3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 18 Oct 2022 02:06:33 -0700 Subject: openvswitch: Use kmalloc_size_roundup() to match ksize() usage Round up allocations with kmalloc_size_roundup() so that openvswitch's use of ksize() is always accurate and no special handling of the memory is needed by KASAN, UBSAN_BOUNDS, nor FORTIFY_SOURCE. Cc: Pravin B Shelar Cc: dev@openvswitch.org Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221018090628.never.537-kees@kernel.org Signed-off-by: Jakub Kicinski --- net/openvswitch/flow_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 4a07ab094a84..ead5418c126e 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2309,7 +2309,7 @@ static struct sw_flow_actions *nla_alloc_flow_actions(int size) WARN_ON_ONCE(size > MAX_ACTIONS_BUFSIZE); - sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL); + sfa = kmalloc(kmalloc_size_roundup(sizeof(*sfa) + size), GFP_KERNEL); if (!sfa) return ERR_PTR(-ENOMEM); -- cgit v1.2.3-70-g09d2 From 6fdfdef7fdb57e6b9f768c9ca0718dcb5e727a85 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Wed, 19 Oct 2022 21:07:33 +0300 Subject: sctp: remove unnecessary NULL check in sctp_association_init() '&asoc->ulpq' passed to sctp_ulpq_init() as the first argument, then sctp_qlpq_init() initializes it and eventually returns the address of the struct member back. Therefore, in this case, the return pointer cannot be NULL. Moreover, it seems sctp_ulpq_init() has always been used only in sctp_association_init(), so there's really no need to return ulpq anymore. Detected using the static analysis tool - Svace. Signed-off-by: Alexey Kodanev Reviewed-by: Xin Long Link: https://lore.kernel.org/r/20221019180735.161388-1-aleksei.kodanev@bell-sw.com Signed-off-by: Jakub Kicinski --- include/net/sctp/ulpqueue.h | 3 +-- net/sctp/associola.c | 4 +--- net/sctp/ulpqueue.c | 5 +---- 3 files changed, 3 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/sctp/ulpqueue.h b/include/net/sctp/ulpqueue.h index 0eaf8650e3b2..60f6641290c3 100644 --- a/include/net/sctp/ulpqueue.h +++ b/include/net/sctp/ulpqueue.h @@ -35,8 +35,7 @@ struct sctp_ulpq { }; /* Prototypes. */ -struct sctp_ulpq *sctp_ulpq_init(struct sctp_ulpq *, - struct sctp_association *); +void sctp_ulpq_init(struct sctp_ulpq *ulpq, struct sctp_association *asoc); void sctp_ulpq_flush(struct sctp_ulpq *ulpq); void sctp_ulpq_free(struct sctp_ulpq *); diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 3460abceba44..63ba5551c13f 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -226,8 +226,7 @@ static struct sctp_association *sctp_association_init( /* Create an output queue. */ sctp_outq_init(asoc, &asoc->outqueue); - if (!sctp_ulpq_init(&asoc->ulpq, asoc)) - goto fail_init; + sctp_ulpq_init(&asoc->ulpq, asoc); if (sctp_stream_init(&asoc->stream, asoc->c.sinit_num_ostreams, 0, gfp)) goto stream_free; @@ -277,7 +276,6 @@ static struct sctp_association *sctp_association_init( stream_free: sctp_stream_free(&asoc->stream); -fail_init: sock_put(asoc->base.sk); sctp_endpoint_put(asoc->ep); return NULL; diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 0a8510a0c5e6..24960dcb6a21 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -38,8 +38,7 @@ static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq); /* 1st Level Abstractions */ /* Initialize a ULP queue from a block of memory. */ -struct sctp_ulpq *sctp_ulpq_init(struct sctp_ulpq *ulpq, - struct sctp_association *asoc) +void sctp_ulpq_init(struct sctp_ulpq *ulpq, struct sctp_association *asoc) { memset(ulpq, 0, sizeof(struct sctp_ulpq)); @@ -48,8 +47,6 @@ struct sctp_ulpq *sctp_ulpq_init(struct sctp_ulpq *ulpq, skb_queue_head_init(&ulpq->reasm_uo); skb_queue_head_init(&ulpq->lobby); ulpq->pd_mode = 0; - - return ulpq; } -- cgit v1.2.3-70-g09d2 From b66aeddbe30c26e56e33c65a74d073dc319beed5 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Wed, 19 Oct 2022 21:07:34 +0300 Subject: sctp: remove unnecessary NULL check in sctp_ulpq_tail_event() After commit 013b96ec6461 ("sctp: Pass sk_buff_head explicitly to sctp_ulpq_tail_event().") there is one more unneeded check of skb_list for NULL. Detected using the static analysis tool - Svace. Signed-off-by: Alexey Kodanev Reviewed-by: Xin Long Link: https://lore.kernel.org/r/20221019180735.161388-2-aleksei.kodanev@bell-sw.com Signed-off-by: Jakub Kicinski --- net/sctp/ulpqueue.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 24960dcb6a21..b05daafd369a 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -256,10 +256,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sk_buff_head *skb_list) return 1; out_free: - if (skb_list) - sctp_queue_purge_ulpevents(skb_list); - else - sctp_ulpevent_free(event); + sctp_queue_purge_ulpevents(skb_list); return 0; } -- cgit v1.2.3-70-g09d2 From 377eb9aab084bf298eed9b55ba173dad36fdf7f2 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Wed, 19 Oct 2022 21:07:35 +0300 Subject: sctp: remove unnecessary NULL checks in sctp_enqueue_event() After commit 178ca044aa60 ("sctp: Make sctp_enqueue_event tak an skb list."), skb_list cannot be NULL. Detected using the static analysis tool - Svace. Signed-off-by: Alexey Kodanev Reviewed-by: Xin Long Link: https://lore.kernel.org/r/20221019180735.161388-3-aleksei.kodanev@bell-sw.com Signed-off-by: Jakub Kicinski --- net/sctp/stream_interleave.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c index bb22b71df7a3..94727feb07b3 100644 --- a/net/sctp/stream_interleave.c +++ b/net/sctp/stream_interleave.c @@ -490,11 +490,8 @@ static int sctp_enqueue_event(struct sctp_ulpq *ulpq, if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe)) goto out_free; - if (skb_list) - skb_queue_splice_tail_init(skb_list, - &sk->sk_receive_queue); - else - __skb_queue_tail(&sk->sk_receive_queue, skb); + skb_queue_splice_tail_init(skb_list, + &sk->sk_receive_queue); if (!sp->data_ready_signalled) { sp->data_ready_signalled = 1; @@ -504,10 +501,7 @@ static int sctp_enqueue_event(struct sctp_ulpq *ulpq, return 1; out_free: - if (skb_list) - sctp_queue_purge_ulpevents(skb_list); - else - sctp_ulpevent_free(event); + sctp_queue_purge_ulpevents(skb_list); return 0; } -- cgit v1.2.3-70-g09d2 From 4161634bce9537ed173b3c8fd0bf9f0218bcf41c Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 19 Oct 2022 15:44:23 +0200 Subject: mac802154: Ensure proper scan-level filtering We now have a fine grained filtering information so let's ensure proper filtering in scan mode, which means that only beacons are processed. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20221019134423.877169-4-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- net/mac802154/rx.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index d1f7b8df41fe..dc9ca2c0ea84 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -34,6 +34,7 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata, struct sk_buff *skb, const struct ieee802154_hdr *hdr) { struct wpan_dev *wpan_dev = &sdata->wpan_dev; + struct wpan_phy *wpan_phy = sdata->local->hw.phy; __le16 span, sshort; int rc; @@ -42,6 +43,17 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata, span = wpan_dev->pan_id; sshort = wpan_dev->short_addr; + /* Level 3 filtering: Only beacons are accepted during scans */ + if (sdata->required_filtering == IEEE802154_FILTERING_3_SCAN && + sdata->required_filtering > wpan_phy->filtering) { + if (mac_cb(skb)->type != IEEE802154_FC_TYPE_BEACON) { + dev_dbg(&sdata->dev->dev, + "drop non-beacon frame (0x%x) during scan\n", + mac_cb(skb)->type); + goto fail; + } + } + switch (mac_cb(skb)->dest.mode) { case IEEE802154_ADDR_NONE: if (mac_cb(skb)->dest.mode != IEEE802154_ADDR_NONE) @@ -277,10 +289,6 @@ void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb) ieee802154_monitors_rx(local, skb); - /* TODO: Handle upcomming receive path where the PHY is at the - * IEEE802154_FILTERING_NONE level during a scan. - */ - /* Level 1 filtering: Check the FCS by software when relevant */ if (local->hw.phy->filtering == IEEE802154_FILTERING_NONE) { crc = crc_ccitt(0, skb->data, skb->len); -- cgit v1.2.3-70-g09d2 From b5fc29233d28be7a3322848ebe73ac327559cdb9 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 19 Oct 2022 15:35:59 -0700 Subject: inet6: Remove inet6_destroy_sock() in sk->sk_prot->destroy(). After commit d38afeec26ed ("tcp/udp: Call inet6_destroy_sock() in IPv6 sk->sk_destruct()."), we call inet6_destroy_sock() in sk->sk_destruct() by setting inet6_sock_destruct() to it to make sure we do not leak inet6-specific resources. Now we can remove unnecessary inet6_destroy_sock() calls in sk->sk_prot->destroy(). DCCP and SCTP have their own sk->sk_destruct() function, so we change them separately in the following patches. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Matthieu Baerts Signed-off-by: David S. Miller --- net/ipv6/ping.c | 6 ------ net/ipv6/raw.c | 2 -- net/ipv6/tcp_ipv6.c | 8 +------- net/ipv6/udp.c | 2 -- net/l2tp/l2tp_ip6.c | 2 -- net/mptcp/protocol.c | 7 ------- 6 files changed, 1 insertion(+), 26 deletions(-) (limited to 'net') diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 86c26e48d065..808983bc2ec9 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -23,11 +23,6 @@ #include #include -static void ping_v6_destroy(struct sock *sk) -{ - inet6_destroy_sock(sk); -} - /* Compatibility glue so we can support IPv6 when it's compiled as a module */ static int dummy_ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) @@ -205,7 +200,6 @@ struct proto pingv6_prot = { .owner = THIS_MODULE, .init = ping_init_sock, .close = ping_close, - .destroy = ping_v6_destroy, .pre_connect = ping_v6_pre_connect, .connect = ip6_datagram_connect_v6_only, .disconnect = __udp_disconnect, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 722de9dd0ff7..a06a9f847db5 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1173,8 +1173,6 @@ static void raw6_destroy(struct sock *sk) lock_sock(sk); ip6_flush_pending_frames(sk); release_sock(sk); - - inet6_destroy_sock(sk); } static int rawv6_init_sk(struct sock *sk) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2a3f9296df1e..f676be14e6b6 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1966,12 +1966,6 @@ static int tcp_v6_init_sock(struct sock *sk) return 0; } -static void tcp_v6_destroy_sock(struct sock *sk) -{ - tcp_v4_destroy_sock(sk); - inet6_destroy_sock(sk); -} - #ifdef CONFIG_PROC_FS /* Proc filesystem TCPv6 sock list dumping. */ static void get_openreq6(struct seq_file *seq, @@ -2164,7 +2158,7 @@ struct proto tcpv6_prot = { .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v6_init_sock, - .destroy = tcp_v6_destroy_sock, + .destroy = tcp_v4_destroy_sock, .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 129ec5a9b0eb..2260406740d3 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1661,8 +1661,6 @@ void udpv6_destroy_sock(struct sock *sk) udp_encap_disable(); } } - - inet6_destroy_sock(sk); } /* diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 9dbd801ddb98..2478aa60145f 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -257,8 +257,6 @@ static void l2tp_ip6_destroy_sock(struct sock *sk) if (tunnel) l2tp_tunnel_delete(tunnel); - - inet6_destroy_sock(sk); } static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f599ad44ed24..2e16c897c229 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3898,12 +3898,6 @@ static const struct proto_ops mptcp_v6_stream_ops = { static struct proto mptcp_v6_prot; -static void mptcp_v6_destroy(struct sock *sk) -{ - mptcp_destroy(sk); - inet6_destroy_sock(sk); -} - static struct inet_protosw mptcp_v6_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_MPTCP, @@ -3919,7 +3913,6 @@ int __init mptcp_proto_v6_init(void) mptcp_v6_prot = mptcp_prot; strcpy(mptcp_v6_prot.name, "MPTCPv6"); mptcp_v6_prot.slab = NULL; - mptcp_v6_prot.destroy = mptcp_v6_destroy; mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock); err = proto_register(&mptcp_v6_prot, 1); -- cgit v1.2.3-70-g09d2 From 1651951ebea54970e0bda60c638fc2eee7a6218f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 19 Oct 2022 15:36:00 -0700 Subject: dccp: Call inet6_destroy_sock() via sk->sk_destruct(). After commit d38afeec26ed ("tcp/udp: Call inet6_destroy_sock() in IPv6 sk->sk_destruct()."), we call inet6_destroy_sock() in sk->sk_destruct() by setting inet6_sock_destruct() to it to make sure we do not leak inet6-specific resources. DCCP sets its own sk->sk_destruct() in the dccp_init_sock(), and DCCPv6 socket shares it by calling the same init function via dccp_v6_init_sock(). To call inet6_sock_destruct() from DCCPv6 sk->sk_destruct(), we export it and set dccp_v6_sk_destruct() in the init function. Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/dccp/dccp.h | 1 + net/dccp/ipv6.c | 15 ++++++++------- net/dccp/proto.c | 8 +++++++- net/ipv6/af_inet6.c | 1 + 4 files changed, 17 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 7dfc00c9fb32..9ddc3a9e89e4 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -278,6 +278,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, const unsigned int len); +void dccp_destruct_common(struct sock *sk); int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized); void dccp_destroy_sock(struct sock *sk); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index e57b43006074..ae62b1591dea 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1021,6 +1021,12 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = { .sockaddr_len = sizeof(struct sockaddr_in6), }; +static void dccp_v6_sk_destruct(struct sock *sk) +{ + dccp_destruct_common(sk); + inet6_sock_destruct(sk); +} + /* NOTE: A lot of things set to zero explicitly by call to * sk_alloc() so need not be done here. */ @@ -1033,17 +1039,12 @@ static int dccp_v6_init_sock(struct sock *sk) if (unlikely(!dccp_v6_ctl_sock_initialized)) dccp_v6_ctl_sock_initialized = 1; inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops; + sk->sk_destruct = dccp_v6_sk_destruct; } return err; } -static void dccp_v6_destroy_sock(struct sock *sk) -{ - dccp_destroy_sock(sk); - inet6_destroy_sock(sk); -} - static struct timewait_sock_ops dccp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct dccp6_timewait_sock), }; @@ -1066,7 +1067,7 @@ static struct proto dccp_v6_prot = { .accept = inet_csk_accept, .get_port = inet_csk_get_port, .shutdown = dccp_shutdown, - .destroy = dccp_v6_destroy_sock, + .destroy = dccp_destroy_sock, .orphan_count = &dccp_orphan_count, .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp6_sock), diff --git a/net/dccp/proto.c b/net/dccp/proto.c index c548ca3e9b0e..9494b0d224f9 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -171,12 +171,18 @@ const char *dccp_packet_name(const int type) EXPORT_SYMBOL_GPL(dccp_packet_name); -static void dccp_sk_destruct(struct sock *sk) +void dccp_destruct_common(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); dp->dccps_hc_tx_ccid = NULL; +} +EXPORT_SYMBOL_GPL(dccp_destruct_common); + +static void dccp_sk_destruct(struct sock *sk) +{ + dccp_destruct_common(sk); inet_sock_destruct(sk); } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 024191004982..6540551ea7ec 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -114,6 +114,7 @@ void inet6_sock_destruct(struct sock *sk) inet6_cleanup_sock(sk); inet_sock_destruct(sk); } +EXPORT_SYMBOL_GPL(inet6_sock_destruct); static int inet6_create(struct net *net, struct socket *sock, int protocol, int kern) -- cgit v1.2.3-70-g09d2 From 6431b0f6ff1633ae598667e4cdd93830074a03e8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 19 Oct 2022 15:36:01 -0700 Subject: sctp: Call inet6_destroy_sock() via sk->sk_destruct(). After commit d38afeec26ed ("tcp/udp: Call inet6_destroy_sock() in IPv6 sk->sk_destruct()."), we call inet6_destroy_sock() in sk->sk_destruct() by setting inet6_sock_destruct() to it to make sure we do not leak inet6-specific resources. SCTP sets its own sk->sk_destruct() in the sctp_init_sock(), and SCTPv6 socket reuses it as the init function. To call inet6_sock_destruct() from SCTPv6 sk->sk_destruct(), we set sctp_v6_destruct_sock() in a new init function. Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/sctp/socket.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 83628c347744..3e83963d1b8a 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5098,13 +5098,17 @@ static void sctp_destroy_sock(struct sock *sk) } /* Triggered when there are no references on the socket anymore */ -static void sctp_destruct_sock(struct sock *sk) +static void sctp_destruct_common(struct sock *sk) { struct sctp_sock *sp = sctp_sk(sk); /* Free up the HMAC transform. */ crypto_free_shash(sp->hmac); +} +static void sctp_destruct_sock(struct sock *sk) +{ + sctp_destruct_common(sk); inet_sock_destruct(sk); } @@ -9427,7 +9431,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, sctp_sk(newsk)->reuse = sp->reuse; newsk->sk_shutdown = sk->sk_shutdown; - newsk->sk_destruct = sctp_destruct_sock; + newsk->sk_destruct = sk->sk_destruct; newsk->sk_family = sk->sk_family; newsk->sk_protocol = IPPROTO_SCTP; newsk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; @@ -9662,11 +9666,20 @@ struct proto sctp_prot = { #if IS_ENABLED(CONFIG_IPV6) -#include -static void sctp_v6_destroy_sock(struct sock *sk) +static void sctp_v6_destruct_sock(struct sock *sk) +{ + sctp_destruct_common(sk); + inet6_sock_destruct(sk); +} + +static int sctp_v6_init_sock(struct sock *sk) { - sctp_destroy_sock(sk); - inet6_destroy_sock(sk); + int ret = sctp_init_sock(sk); + + if (!ret) + sk->sk_destruct = sctp_v6_destruct_sock; + + return ret; } struct proto sctpv6_prot = { @@ -9676,8 +9689,8 @@ struct proto sctpv6_prot = { .disconnect = sctp_disconnect, .accept = sctp_accept, .ioctl = sctp_ioctl, - .init = sctp_init_sock, - .destroy = sctp_v6_destroy_sock, + .init = sctp_v6_init_sock, + .destroy = sctp_destroy_sock, .shutdown = sctp_shutdown, .setsockopt = sctp_setsockopt, .getsockopt = sctp_getsockopt, -- cgit v1.2.3-70-g09d2 From 1f8c4eeb945553baf868bbec7a8c59810df97a07 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 19 Oct 2022 15:36:02 -0700 Subject: inet6: Remove inet6_destroy_sock(). The last user of inet6_destroy_sock() is its wrapper inet6_cleanup_sock(). Let's rename inet6_destroy_sock() to inet6_cleanup_sock(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/transp_v6.h | 2 -- net/ipv6/af_inet6.c | 8 +------- 2 files changed, 1 insertion(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h index b830463e3dff..d27b1caf3753 100644 --- a/include/net/transp_v6.h +++ b/include/net/transp_v6.h @@ -58,8 +58,6 @@ ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, __u16 srcp, #define LOOPBACK4_IPV6 cpu_to_be32(0x7f000006) -void inet6_destroy_sock(struct sock *sk); - #define IPV6_SEQ_DGRAM_HEADER \ " sl " \ "local_address " \ diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 6540551ea7ec..68075295d587 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -490,7 +490,7 @@ int inet6_release(struct socket *sock) } EXPORT_SYMBOL(inet6_release); -void inet6_destroy_sock(struct sock *sk) +void inet6_cleanup_sock(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct sk_buff *skb; @@ -515,12 +515,6 @@ void inet6_destroy_sock(struct sock *sk) txopt_put(opt); } } -EXPORT_SYMBOL_GPL(inet6_destroy_sock); - -void inet6_cleanup_sock(struct sock *sk) -{ - inet6_destroy_sock(sk); -} EXPORT_SYMBOL_GPL(inet6_cleanup_sock); /* -- cgit v1.2.3-70-g09d2 From b45a337f061e131bd01b6df2d89f1228d4350a85 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 19 Oct 2022 15:36:03 -0700 Subject: inet6: Clean up failure path in do_ipv6_setsockopt(). We can reuse the unlock label above and need not repeat the same code. Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv6/ipv6_sockglue.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 532f4478c884..9ce51680290b 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -1005,10 +1005,8 @@ unlock: return retv; e_inval: - sockopt_release_sock(sk); - if (needs_rtnl) - rtnl_unlock(); - return -EINVAL; + retv = -EINVAL; + goto unlock; } int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, -- cgit v1.2.3-70-g09d2 From 404c76783f322120266a4ef659abe418dc74f5c6 Mon Sep 17 00:00:00 2001 From: Amit Cohen Date: Thu, 20 Oct 2022 17:20:03 +0200 Subject: ethtool: Add support for 800Gbps link modes Add support for 800Gbps speed, link modes of 100Gbps per lane. As mentioned in slide 21 in IEEE documentation [1], all adopted 802.3df copper and optical PMDs baselines using 100G/lane will be supported. Add the relevant PMDs which are mentioned in slide 5 in IEEE documentation [1] and were approved on 10-2022 [2]: BP - KR8 Cu Cable - CR8 MMF 50m - VR8 MMF 100m - SR8 SMF 500m - DR8 SMF 2km - DR8-2 [1]: https://www.ieee802.org/3/df/public/22_10/22_1004/shrikhande_3df_01a_221004.pdf [2]: https://ieee802.org/3/df/KeyMotions_3df_221005.pdf Signed-off-by: Amit Cohen Reviewed-by: Ido Schimmel Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- drivers/net/phy/phy-core.c | 11 ++++++++++- include/uapi/linux/ethtool.h | 8 ++++++++ net/ethtool/common.c | 14 ++++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index 2c8bf438ea61..5d08c627a516 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -13,7 +13,7 @@ */ const char *phy_speed_to_str(int speed) { - BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 93, + BUILD_BUG_ON_MSG(__ETHTOOL_LINK_MODE_MASK_NBITS != 99, "Enum ethtool_link_mode_bit_indices and phylib are out of sync. " "If a speed or mode has been added please update phy_speed_to_str " "and the PHY settings array.\n"); @@ -49,6 +49,8 @@ const char *phy_speed_to_str(int speed) return "200Gbps"; case SPEED_400000: return "400Gbps"; + case SPEED_800000: + return "800Gbps"; case SPEED_UNKNOWN: return "Unknown"; default: @@ -157,6 +159,13 @@ EXPORT_SYMBOL_GPL(phy_interface_num_ports); .bit = ETHTOOL_LINK_MODE_ ## b ## _BIT} static const struct phy_setting settings[] = { + /* 800G */ + PHY_SETTING( 800000, FULL, 800000baseCR8_Full ), + PHY_SETTING( 800000, FULL, 800000baseKR8_Full ), + PHY_SETTING( 800000, FULL, 800000baseDR8_Full ), + PHY_SETTING( 800000, FULL, 800000baseDR8_2_Full ), + PHY_SETTING( 800000, FULL, 800000baseSR8_Full ), + PHY_SETTING( 800000, FULL, 800000baseVR8_Full ), /* 400G */ PHY_SETTING( 400000, FULL, 400000baseCR8_Full ), PHY_SETTING( 400000, FULL, 400000baseKR8_Full ), diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index dc2aa3d75b39..f341de2ae612 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1737,6 +1737,13 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_100baseFX_Half_BIT = 90, ETHTOOL_LINK_MODE_100baseFX_Full_BIT = 91, ETHTOOL_LINK_MODE_10baseT1L_Full_BIT = 92, + ETHTOOL_LINK_MODE_800000baseCR8_Full_BIT = 93, + ETHTOOL_LINK_MODE_800000baseKR8_Full_BIT = 94, + ETHTOOL_LINK_MODE_800000baseDR8_Full_BIT = 95, + ETHTOOL_LINK_MODE_800000baseDR8_2_Full_BIT = 96, + ETHTOOL_LINK_MODE_800000baseSR8_Full_BIT = 97, + ETHTOOL_LINK_MODE_800000baseVR8_Full_BIT = 98, + /* must be last entry */ __ETHTOOL_LINK_MODE_MASK_NBITS }; @@ -1848,6 +1855,7 @@ enum ethtool_link_mode_bit_indices { #define SPEED_100000 100000 #define SPEED_200000 200000 #define SPEED_400000 400000 +#define SPEED_800000 800000 #define SPEED_UNKNOWN -1 diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 566adf85e658..ee3e02da0013 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -202,6 +202,12 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { __DEFINE_LINK_MODE_NAME(100, FX, Half), __DEFINE_LINK_MODE_NAME(100, FX, Full), __DEFINE_LINK_MODE_NAME(10, T1L, Full), + __DEFINE_LINK_MODE_NAME(800000, CR8, Full), + __DEFINE_LINK_MODE_NAME(800000, KR8, Full), + __DEFINE_LINK_MODE_NAME(800000, DR8, Full), + __DEFINE_LINK_MODE_NAME(800000, DR8_2, Full), + __DEFINE_LINK_MODE_NAME(800000, SR8, Full), + __DEFINE_LINK_MODE_NAME(800000, VR8, Full), }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); @@ -238,6 +244,8 @@ static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); #define __LINK_MODE_LANES_X 1 #define __LINK_MODE_LANES_FX 1 #define __LINK_MODE_LANES_T1L 1 +#define __LINK_MODE_LANES_VR8 8 +#define __LINK_MODE_LANES_DR8_2 8 #define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex) \ [ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \ @@ -352,6 +360,12 @@ const struct link_mode_info link_mode_params[] = { __DEFINE_LINK_MODE_PARAMS(100, FX, Half), __DEFINE_LINK_MODE_PARAMS(100, FX, Full), __DEFINE_LINK_MODE_PARAMS(10, T1L, Full), + __DEFINE_LINK_MODE_PARAMS(800000, CR8, Full), + __DEFINE_LINK_MODE_PARAMS(800000, KR8, Full), + __DEFINE_LINK_MODE_PARAMS(800000, DR8, Full), + __DEFINE_LINK_MODE_PARAMS(800000, DR8_2, Full), + __DEFINE_LINK_MODE_PARAMS(800000, SR8, Full), + __DEFINE_LINK_MODE_PARAMS(800000, VR8, Full), }; static_assert(ARRAY_SIZE(link_mode_params) == __ETHTOOL_LINK_MODE_MASK_NBITS); -- cgit v1.2.3-70-g09d2 From a5ef058dc4d9a3e60d1808a0700e18e0e37e408e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 20 Oct 2022 19:48:51 +0200 Subject: net: introduce and use custom sockopt socket flag We will soon introduce custom setsockopt for UDP sockets, too. Instead of doing even more complex arbitrary checks inside sock_use_custom_sol_socket(), add a new socket flag and set it for the relevant socket types (currently only MPTCP). Reviewed-by: Matthieu Baerts Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/linux/net.h | 1 + net/mptcp/protocol.c | 4 ++++ net/socket.c | 8 +------- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/linux/net.h b/include/linux/net.h index 711c3593c3b8..59350fd85823 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -41,6 +41,7 @@ struct net; #define SOCK_NOSPACE 2 #define SOCK_PASSCRED 3 #define SOCK_PASSSEC 4 +#define SOCK_CUSTOM_SOCKOPT 5 #ifndef ARCH_HAS_SOCKET_TYPES /** diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2e16c897c229..e60d144bfb2d 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2708,6 +2708,8 @@ static int mptcp_init_sock(struct sock *sk) if (ret) return ret; + set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); + /* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will * propagate the correct value */ @@ -3684,6 +3686,8 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, struct mptcp_subflow_context *subflow; struct sock *newsk = newsock->sk; + set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags); + lock_sock(newsk); /* PM/worker can now acquire the first subflow socket diff --git a/net/socket.c b/net/socket.c index 00da9ce3dba0..55c5d536e5f6 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2199,13 +2199,7 @@ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size, static bool sock_use_custom_sol_socket(const struct socket *sock) { - const struct sock *sk = sock->sk; - - /* Use sock->ops->setsockopt() for MPTCP */ - return IS_ENABLED(CONFIG_MPTCP) && - sk->sk_protocol == IPPROTO_MPTCP && - sk->sk_type == SOCK_STREAM && - (sk->sk_family == AF_INET || sk->sk_family == AF_INET6); + return test_bit(SOCK_CUSTOM_SOCKOPT, &sock->flags); } /* -- cgit v1.2.3-70-g09d2 From 8a3854c7b8e4532063b14bed34115079b7d0cb36 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 20 Oct 2022 19:48:52 +0200 Subject: udp: track the forward memory release threshold in an hot cacheline When the receiver process and the BH runs on different cores, udp_rmem_release() experience a cache miss while accessing sk_rcvbuf, as the latter shares the same cacheline with sk_forward_alloc, written by the BH. With this patch, UDP tracks the rcvbuf value and its update via custom SOL_SOCKET socket options, and copies the forward memory threshold value used by udp_rmem_release() in a different cacheline, already accessed by the above function and uncontended. Since the UDP socket init operation grown a bit, factor out the common code between v4 and v6 in a shared helper. Overall the above give a 10% peek throughput increase under UDP flood. Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/linux/udp.h | 3 +++ include/net/udp.h | 9 +++++++++ net/ipv4/udp.c | 18 +++++++++++++++--- net/ipv6/udp.c | 4 ++-- 4 files changed, 29 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/udp.h b/include/linux/udp.h index e96da4157d04..5cdba00a904a 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -87,6 +87,9 @@ struct udp_sock { /* This field is dirtied by udp_recvmsg() */ int forward_deficit; + + /* This fields follows rcvbuf value, and is touched by udp_recvmsg */ + int forward_threshold; }; #define UDP_MAX_SEGMENTS (1 << 6UL) diff --git a/include/net/udp.h b/include/net/udp.h index fee053bcd17c..de4b528522bb 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -174,6 +174,15 @@ INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *)); struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, netdev_features_t features, bool is_ipv6); +static inline void udp_lib_init_sock(struct sock *sk) +{ + struct udp_sock *up = udp_sk(sk); + + skb_queue_head_init(&up->reader_queue); + up->forward_threshold = sk->sk_rcvbuf >> 2; + set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); +} + /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */ static inline int udp_lib_hash(struct sock *sk) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c83e5271030b..e77c8f0e9087 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1448,7 +1448,7 @@ static void udp_rmem_release(struct sock *sk, int size, int partial, if (likely(partial)) { up->forward_deficit += size; size = up->forward_deficit; - if (size < (sk->sk_rcvbuf >> 2) && + if (size < READ_ONCE(up->forward_threshold) && !skb_queue_empty(&up->reader_queue)) return; } else { @@ -1622,7 +1622,7 @@ static void udp_destruct_sock(struct sock *sk) int udp_init_sock(struct sock *sk) { - skb_queue_head_init(&udp_sk(sk)->reader_queue); + udp_lib_init_sock(sk); sk->sk_destruct = udp_destruct_sock; return 0; } @@ -2671,6 +2671,18 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, int err = 0; int is_udplite = IS_UDPLITE(sk); + if (level == SOL_SOCKET) { + err = sk_setsockopt(sk, level, optname, optval, optlen); + + if (optname == SO_RCVBUF || optname == SO_RCVBUFFORCE) { + sockopt_lock_sock(sk); + /* paired with READ_ONCE in udp_rmem_release() */ + WRITE_ONCE(up->forward_threshold, sk->sk_rcvbuf >> 2); + sockopt_release_sock(sk); + } + return err; + } + if (optlen < sizeof(int)) return -EINVAL; @@ -2784,7 +2796,7 @@ EXPORT_SYMBOL(udp_lib_setsockopt); int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { - if (level == SOL_UDP || level == SOL_UDPLITE) + if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET) return udp_lib_setsockopt(sk, level, optname, optval, optlen, udp_push_pending_frames); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 2260406740d3..297f7cc06044 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -64,7 +64,7 @@ static void udpv6_destruct_sock(struct sock *sk) int udpv6_init_sock(struct sock *sk) { - skb_queue_head_init(&udp_sk(sk)->reader_queue); + udp_lib_init_sock(sk); sk->sk_destruct = udpv6_destruct_sock; return 0; } @@ -1669,7 +1669,7 @@ void udpv6_destroy_sock(struct sock *sk) int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { - if (level == SOL_UDP || level == SOL_UDPLITE) + if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET) return udp_lib_setsockopt(sk, level, optname, optval, optlen, udp_v6_push_pending_frames); -- cgit v1.2.3-70-g09d2 From 0cafd77dcd032d1687efaba5598cf07bce85997f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 20 Oct 2022 23:20:18 +0000 Subject: net: add a refcount tracker for kernel sockets Commit ffa84b5ffb37 ("net: add netns refcount tracker to struct sock") added a tracker to sockets, but did not track kernel sockets. We still have syzbot reports hinting about netns being destroyed while some kernel TCP sockets had not been dismantled. This patch tracks kernel sockets, and adds a ref_tracker_dir_print() call to net_free() right before the netns is freed. Normally, each layer is responsible for properly releasing its kernel sockets before last call to net_free(). This debugging facility is enabled with CONFIG_NET_NS_REFCNT_TRACKER=y Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Tested-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/net_namespace.h | 30 ++++++++++++++++++++++-------- net/core/net_namespace.c | 5 +++++ net/core/sock.c | 14 ++++++++++++++ net/netlink/af_netlink.c | 11 +++++++++++ net/rds/tcp.c | 3 +++ 5 files changed, 55 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 8c3587d5c308..78beaa765c73 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -92,7 +92,9 @@ struct net { struct ns_common ns; struct ref_tracker_dir refcnt_tracker; - + struct ref_tracker_dir notrefcnt_tracker; /* tracker for objects not + * refcounted against netns + */ struct list_head dev_base_head; struct proc_dir_entry *proc_net; struct proc_dir_entry *proc_net_stat; @@ -320,19 +322,31 @@ static inline int check_net(const struct net *net) #endif -static inline void netns_tracker_alloc(struct net *net, - netns_tracker *tracker, gfp_t gfp) +static inline void __netns_tracker_alloc(struct net *net, + netns_tracker *tracker, + bool refcounted, + gfp_t gfp) { #ifdef CONFIG_NET_NS_REFCNT_TRACKER - ref_tracker_alloc(&net->refcnt_tracker, tracker, gfp); + ref_tracker_alloc(refcounted ? &net->refcnt_tracker : + &net->notrefcnt_tracker, + tracker, gfp); #endif } -static inline void netns_tracker_free(struct net *net, - netns_tracker *tracker) +static inline void netns_tracker_alloc(struct net *net, netns_tracker *tracker, + gfp_t gfp) +{ + __netns_tracker_alloc(net, tracker, true, gfp); +} + +static inline void __netns_tracker_free(struct net *net, + netns_tracker *tracker, + bool refcounted) { #ifdef CONFIG_NET_NS_REFCNT_TRACKER - ref_tracker_free(&net->refcnt_tracker, tracker); + ref_tracker_free(refcounted ? &net->refcnt_tracker : + &net->notrefcnt_tracker, tracker); #endif } @@ -346,7 +360,7 @@ static inline struct net *get_net_track(struct net *net, static inline void put_net_track(struct net *net, netns_tracker *tracker) { - netns_tracker_free(net, tracker); + __netns_tracker_free(net, tracker, true); put_net(net); } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 0ec2f5906a27..12c68edf7682 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -309,6 +309,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) refcount_set(&net->ns.count, 1); ref_tracker_dir_init(&net->refcnt_tracker, 128); + ref_tracker_dir_init(&net->notrefcnt_tracker, 128); refcount_set(&net->passive, 1); get_random_bytes(&net->hash_mix, sizeof(u32)); @@ -429,6 +430,10 @@ static void net_free(struct net *net) { if (refcount_dec_and_test(&net->passive)) { kfree(rcu_access_pointer(net->gen)); + + /* There should not be any trackers left there. */ + ref_tracker_dir_exit(&net->notrefcnt_tracker); + kmem_cache_free(net_cachep, net); } } diff --git a/net/core/sock.c b/net/core/sock.c index a3ba0358c77c..aa608dc0930b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2094,6 +2094,9 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, if (likely(sk->sk_net_refcnt)) { get_net_track(net, &sk->ns_tracker, priority); sock_inuse_add(net, 1); + } else { + __netns_tracker_alloc(net, &sk->ns_tracker, + false, priority); } sock_net_set(sk, net); @@ -2149,6 +2152,9 @@ static void __sk_destruct(struct rcu_head *head) if (likely(sk->sk_net_refcnt)) put_net_track(sock_net(sk), &sk->ns_tracker); + else + __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); + sk_prot_free(sk->sk_prot_creator, sk); } @@ -2237,6 +2243,14 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) if (likely(newsk->sk_net_refcnt)) { get_net_track(sock_net(newsk), &newsk->ns_tracker, priority); sock_inuse_add(sock_net(newsk), 1); + } else { + /* Kernel sockets are not elevating the struct net refcount. + * Instead, use a tracker to more easily detect if a layer + * is not properly dismantling its kernel sockets at netns + * destroy time. + */ + __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, + false, priority); } sk_node_init(&newsk->sk_node); sock_lock_init(newsk); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index a662e8a5ff84..f0c94d394ab1 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -812,6 +812,17 @@ static int netlink_release(struct socket *sock) } sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1); + + /* Because struct net might disappear soon, do not keep a pointer. */ + if (!sk->sk_net_refcnt && sock_net(sk) != &init_net) { + __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); + /* Because of deferred_put_nlk_sk and use of work queue, + * it is possible netns will be freed before this socket. + */ + sock_net_set(sk, &init_net); + __netns_tracker_alloc(&init_net, &sk->ns_tracker, + false, GFP_KERNEL); + } call_rcu(&nlk->rcu, deferred_put_nlk_sk); return 0; } diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 4444fd82b66d..c5b86066ff66 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -503,6 +503,9 @@ bool rds_tcp_tune(struct socket *sock) release_sock(sk); return false; } + /* Update ns_tracker to current stack trace and refcounted tracker */ + __netns_tracker_free(net, &sk->ns_tracker, false); + sk->sk_net_refcnt = 1; netns_tracker_alloc(net, &sk->ns_tracker, GFP_KERNEL); sock_inuse_add(net, 1); -- cgit v1.2.3-70-g09d2 From 233baf9a1bc46f18ad3bec688f52ea5f818a8a25 Mon Sep 17 00:00:00 2001 From: xu xin Date: Thu, 20 Oct 2022 06:54:41 +0000 Subject: net: remove useless parameter of __sock_cmsg_send The parameter 'msg' has never been used by __sock_cmsg_send, so we can remove it safely. Reported-by: Zeal Robot Signed-off-by: xu xin Reviewed-by: Zhang Yunkai Acked-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- net/core/sock.c | 4 ++-- net/ipv4/ip_sockglue.c | 2 +- net/ipv6/datagram.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 9e464f6409a7..b1dacc4d68c9 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1901,7 +1901,7 @@ static inline void sockcm_init(struct sockcm_cookie *sockc, *sockc = (struct sockcm_cookie) { .tsflags = sk->sk_tsflags }; } -int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, +int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, struct sockcm_cookie *sockc); int sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct sockcm_cookie *sockc); diff --git a/net/core/sock.c b/net/core/sock.c index aa608dc0930b..2786c1107e53 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2744,7 +2744,7 @@ failure: } EXPORT_SYMBOL(sock_alloc_send_pskb); -int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, +int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, struct sockcm_cookie *sockc) { u32 tsflags; @@ -2798,7 +2798,7 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg, return -EINVAL; if (cmsg->cmsg_level != SOL_SOCKET) continue; - ret = __sock_cmsg_send(sk, msg, cmsg, sockc); + ret = __sock_cmsg_send(sk, cmsg, sockc); if (ret) return ret; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 6e19cad154f5..5f16807d3235 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -267,7 +267,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, } #endif if (cmsg->cmsg_level == SOL_SOCKET) { - err = __sock_cmsg_send(sk, msg, cmsg, &ipc->sockc); + err = __sock_cmsg_send(sk, cmsg, &ipc->sockc); if (err) return err; continue; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 5ecb56522f9d..df7e032ce87d 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -771,7 +771,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, } if (cmsg->cmsg_level == SOL_SOCKET) { - err = __sock_cmsg_send(sk, msg, cmsg, &ipc6->sockc); + err = __sock_cmsg_send(sk, cmsg, &ipc6->sockc); if (err) return err; continue; -- cgit v1.2.3-70-g09d2 From 4727bab4e9bbeafeff6acdfcb077a7a548cbde30 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Fri, 21 Oct 2022 10:58:22 +0800 Subject: net: skb: move skb_pp_recycle() to skbuff.c skb_pp_recycle() is only used by skb_free_head() in skbuff.c, so move it to skbuff.c. Signed-off-by: Yunsheng Lin Acked-by: Ilias Apalodimas Signed-off-by: David S. Miller --- include/linux/skbuff.h | 7 ------- net/core/skbuff.c | 7 +++++++ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7be5bb4c94b6..59c9fd55699d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -5050,12 +5050,5 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb) } #endif -static inline bool skb_pp_recycle(struct sk_buff *skb, void *data) -{ - if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) - return false; - return page_pool_return_skb_page(virt_to_page(data)); -} - #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1d9719e72f9d..9b3b19816d2d 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -748,6 +748,13 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } +static bool skb_pp_recycle(struct sk_buff *skb, void *data) +{ + if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) + return false; + return page_pool_return_skb_page(virt_to_page(data)); +} + static void skb_free_head(struct sk_buff *skb) { unsigned char *head = skb->head; -- cgit v1.2.3-70-g09d2 From f7fe25a6f00522791cce38bac552d295339d0c79 Mon Sep 17 00:00:00 2001 From: Antony Antony Date: Fri, 21 Oct 2022 15:42:01 +0200 Subject: xfrm: update x->lastused for every packet x->lastused was only updated for outgoing mobile IPv6 packet. With this fix update it for every, in and out, packet. This is useful to check if the a SA is still in use, or when was the last time an SA was used. lastused time of in SA can used to check IPsec path is functional. Signed-off-by: Antony Antony Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_input.c | 1 + net/xfrm/xfrm_output.c | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 97074f6f2bde..c06e54a10540 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -671,6 +671,7 @@ resume: x->curlft.bytes += skb->len; x->curlft.packets++; + x->lastused = ktime_get_real_seconds(); spin_unlock(&x->lock); diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 9a5e79a38c67..78cb8d0a6a18 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -209,8 +209,6 @@ static int xfrm6_ro_output(struct xfrm_state *x, struct sk_buff *skb) __skb_pull(skb, hdr_len); memmove(ipv6_hdr(skb), iph, hdr_len); - x->lastused = ktime_get_real_seconds(); - return 0; #else WARN_ON_ONCE(1); @@ -534,6 +532,7 @@ static int xfrm_output_one(struct sk_buff *skb, int err) x->curlft.bytes += skb->len; x->curlft.packets++; + x->lastused = ktime_get_real_seconds(); spin_unlock_bh(&x->lock); -- cgit v1.2.3-70-g09d2 From 4a6a676f8c16ec17d2f8d69ce3b5d680277ed0d2 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 21 Oct 2022 00:58:39 -0700 Subject: act_skbedit: skbedit queue mapping for receive queue Add support for skbedit queue mapping action on receive side. This is supported only in hardware, so the skip_sw flag is enforced. This enables offloading filters for receive queue selection in the hardware using the skbedit action. Traffic arrives on the Rx queue requested in the skbedit action parameter. A new tc action flag TCA_ACT_FLAGS_AT_INGRESS is introduced to identify the traffic direction the action queue_mapping is requested on during filter addition. This is used to disallow offloading the skbedit queue mapping action on transmit side. Example: $tc filter add dev $IFACE ingress protocol ip flower dst_ip $DST_IP\ action skbedit queue_mapping $rxq_id skip_sw Reviewed-by: Sridhar Samudrala Signed-off-by: Amritha Nambiar Signed-off-by: Paolo Abeni --- include/net/act_api.h | 1 + include/net/flow_offload.h | 2 ++ include/net/tc_act/tc_skbedit.h | 29 +++++++++++++++++++++++++++++ net/sched/act_skbedit.c | 14 ++++++++++++-- net/sched/cls_api.c | 7 +++++++ 5 files changed, 51 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 61f2ceb3939e..c94ea1a306e0 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -67,6 +67,7 @@ struct tc_action { #define TCA_ACT_FLAGS_BIND (1U << (TCA_ACT_FLAGS_USER_BITS + 1)) #define TCA_ACT_FLAGS_REPLACE (1U << (TCA_ACT_FLAGS_USER_BITS + 2)) #define TCA_ACT_FLAGS_NO_RTNL (1U << (TCA_ACT_FLAGS_USER_BITS + 3)) +#define TCA_ACT_FLAGS_AT_INGRESS (1U << (TCA_ACT_FLAGS_USER_BITS + 4)) /* Update lastuse only if needed, to avoid dirtying a cache line. * We use a temp variable to avoid fetching jiffies twice. diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index e343f9f8363e..7a60bc6d72c9 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -155,6 +155,7 @@ enum flow_action_id { FLOW_ACTION_MARK, FLOW_ACTION_PTYPE, FLOW_ACTION_PRIORITY, + FLOW_ACTION_RX_QUEUE_MAPPING, FLOW_ACTION_WAKE, FLOW_ACTION_QUEUE, FLOW_ACTION_SAMPLE, @@ -247,6 +248,7 @@ struct flow_action_entry { u32 csum_flags; /* FLOW_ACTION_CSUM */ u32 mark; /* FLOW_ACTION_MARK */ u16 ptype; /* FLOW_ACTION_PTYPE */ + u16 rx_queue; /* FLOW_ACTION_RX_QUEUE_MAPPING */ u32 priority; /* FLOW_ACTION_PRIORITY */ struct { /* FLOW_ACTION_QUEUE */ u32 ctx; diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h index dc1079f28e13..9649600fb3dc 100644 --- a/include/net/tc_act/tc_skbedit.h +++ b/include/net/tc_act/tc_skbedit.h @@ -95,12 +95,41 @@ static inline u32 tcf_skbedit_priority(const struct tc_action *a) return priority; } +static inline u16 tcf_skbedit_rx_queue_mapping(const struct tc_action *a) +{ + u16 rx_queue; + + rcu_read_lock(); + rx_queue = rcu_dereference(to_skbedit(a)->params)->queue_mapping; + rcu_read_unlock(); + + return rx_queue; +} + /* Return true iff action is queue_mapping */ static inline bool is_tcf_skbedit_queue_mapping(const struct tc_action *a) { return is_tcf_skbedit_with_flag(a, SKBEDIT_F_QUEUE_MAPPING); } +/* Return true if action is on ingress traffic */ +static inline bool is_tcf_skbedit_ingress(u32 flags) +{ + return flags & TCA_ACT_FLAGS_AT_INGRESS; +} + +static inline bool is_tcf_skbedit_tx_queue_mapping(const struct tc_action *a) +{ + return is_tcf_skbedit_queue_mapping(a) && + !is_tcf_skbedit_ingress(a->tcfa_flags); +} + +static inline bool is_tcf_skbedit_rx_queue_mapping(const struct tc_action *a) +{ + return is_tcf_skbedit_queue_mapping(a) && + is_tcf_skbedit_ingress(a->tcfa_flags); +} + /* Return true iff action is inheritdsfield */ static inline bool is_tcf_skbedit_inheritdsfield(const struct tc_action *a) { diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 7f598784fd30..1710780c908a 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -148,6 +148,11 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, } if (tb[TCA_SKBEDIT_QUEUE_MAPPING] != NULL) { + if (is_tcf_skbedit_ingress(act_flags) && + !(act_flags & TCA_ACT_FLAGS_SKIP_SW)) { + NL_SET_ERR_MSG_MOD(extack, "\"queue_mapping\" option on receive side is hardware only, use skip_sw"); + return -EOPNOTSUPP; + } flags |= SKBEDIT_F_QUEUE_MAPPING; queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]); } @@ -374,9 +379,12 @@ static int tcf_skbedit_offload_act_setup(struct tc_action *act, void *entry_data } else if (is_tcf_skbedit_priority(act)) { entry->id = FLOW_ACTION_PRIORITY; entry->priority = tcf_skbedit_priority(act); - } else if (is_tcf_skbedit_queue_mapping(act)) { - NL_SET_ERR_MSG_MOD(extack, "Offload not supported when \"queue_mapping\" option is used"); + } else if (is_tcf_skbedit_tx_queue_mapping(act)) { + NL_SET_ERR_MSG_MOD(extack, "Offload not supported when \"queue_mapping\" option is used on transmit side"); return -EOPNOTSUPP; + } else if (is_tcf_skbedit_rx_queue_mapping(act)) { + entry->id = FLOW_ACTION_RX_QUEUE_MAPPING; + entry->rx_queue = tcf_skbedit_rx_queue_mapping(act); } else if (is_tcf_skbedit_inheritdsfield(act)) { NL_SET_ERR_MSG_MOD(extack, "Offload not supported when \"inheritdsfield\" option is used"); return -EOPNOTSUPP; @@ -394,6 +402,8 @@ static int tcf_skbedit_offload_act_setup(struct tc_action *act, void *entry_data fl_action->id = FLOW_ACTION_PTYPE; else if (is_tcf_skbedit_priority(act)) fl_action->id = FLOW_ACTION_PRIORITY; + else if (is_tcf_skbedit_rx_queue_mapping(act)) + fl_action->id = FLOW_ACTION_RX_QUEUE_MAPPING; else return -EOPNOTSUPP; } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 50566db45949..23d1cfa4f58c 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1953,6 +1953,11 @@ static void tfilter_put(struct tcf_proto *tp, void *fh) tp->ops->put(tp, fh); } +static bool is_qdisc_ingress(__u32 classid) +{ + return (TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_INGRESS)); +} + static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { @@ -2144,6 +2149,8 @@ replay: flags |= TCA_ACT_FLAGS_REPLACE; if (!rtnl_held) flags |= TCA_ACT_FLAGS_NO_RTNL; + if (is_qdisc_ingress(parent)) + flags |= TCA_ACT_FLAGS_AT_INGRESS; err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, flags, extack); if (err == 0) { -- cgit v1.2.3-70-g09d2 From cc2bbbfd9a5064cb8fc2996962d90b782f906223 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 24 Oct 2022 20:19:31 +0300 Subject: xfrm: Remove not-used total variable Total variable is not used in xfrm_byidx_resize() and can be safely removed. Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index e392d8d05e0c..d80519c4e389 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -605,7 +605,7 @@ static void xfrm_bydst_resize(struct net *net, int dir) xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head)); } -static void xfrm_byidx_resize(struct net *net, int total) +static void xfrm_byidx_resize(struct net *net) { unsigned int hmask = net->xfrm.policy_idx_hmask; unsigned int nhashmask = xfrm_new_hash_mask(hmask); @@ -683,7 +683,7 @@ static void xfrm_hash_resize(struct work_struct *work) xfrm_bydst_resize(net, dir); } if (xfrm_byidx_should_resize(net, total)) - xfrm_byidx_resize(net, total); + xfrm_byidx_resize(net); mutex_unlock(&hash_resize_mutex); } -- cgit v1.2.3-70-g09d2 From b261eda84ec136240a9ca753389853a3a1bccca2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 21 Oct 2022 13:44:34 -0700 Subject: soreuseport: Fix socket selection for SO_INCOMING_CPU. Kazuho Oku reported that setsockopt(SO_INCOMING_CPU) does not work with setsockopt(SO_REUSEPORT) since v4.6. With the combination of SO_REUSEPORT and SO_INCOMING_CPU, we could build a highly efficient server application. setsockopt(SO_INCOMING_CPU) associates a CPU with a TCP listener or UDP socket, and then incoming packets processed on the CPU will likely be distributed to the socket. Technically, a socket could even receive packets handled on another CPU if no sockets in the reuseport group have the same CPU receiving the flow. The logic exists in compute_score() so that a socket will get a higher score if it has the same CPU with the flow. However, the score gets ignored after the blamed two commits, which introduced a faster socket selection algorithm for SO_REUSEPORT. This patch introduces a counter of sockets with SO_INCOMING_CPU in a reuseport group to check if we should iterate all sockets to find a proper one. We increment the counter when * calling listen() if the socket has SO_INCOMING_CPU and SO_REUSEPORT * enabling SO_INCOMING_CPU if the socket is in a reuseport group Also, we decrement it when * detaching a socket out of the group to apply SO_INCOMING_CPU to migrated TCP requests * disabling SO_INCOMING_CPU if the socket is in a reuseport group When the counter reaches 0, we can get back to the O(1) selection algorithm. The overall changes are negligible for the non-SO_INCOMING_CPU case, and the only notable thing is that we have to update sk_incomnig_cpu under reuseport_lock. Otherwise, the race prevents transitioning to the O(n) algorithm and results in the wrong socket selection. cpu1 (setsockopt) cpu2 (listen) +-----------------+ +-------------+ lock_sock(sk1) lock_sock(sk2) reuseport_update_incoming_cpu(sk1, val) . | /* set CPU as 0 */ |- WRITE_ONCE(sk1->incoming_cpu, val) | | spin_lock_bh(&reuseport_lock) | reuseport_grow(sk2, reuse) | . | |- more_socks_size = reuse->max_socks * 2U; | |- if (more_socks_size > U16_MAX && | | reuse->num_closed_socks) | | . | | |- RCU_INIT_POINTER(sk1->sk_reuseport_cb, NULL); | | `- __reuseport_detach_closed_sock(sk1, reuse) | | . | | `- reuseport_put_incoming_cpu(sk1, reuse) | | . | | | /* Read shutdown()ed sk1's sk_incoming_cpu | | | * without lock_sock(). | | | */ | | `- if (sk1->sk_incoming_cpu >= 0) | | . | | | /* decrement not-yet-incremented | | | * count, which is never incremented. | | | */ | | `- __reuseport_put_incoming_cpu(reuse); | | | `- spin_lock_bh(&reuseport_lock) | |- spin_lock_bh(&reuseport_lock) | |- reuse = rcu_dereference_protected(sk1->sk_reuseport_cb, ...) |- if (!reuse) | . | | /* Cannot increment reuse->incoming_cpu. */ | `- goto out; | `- spin_unlock_bh(&reuseport_lock) Fixes: e32ea7e74727 ("soreuseport: fast reuseport UDP socket selection") Fixes: c125e80b8868 ("soreuseport: fast reuseport TCP socket selection") Reported-by: Kazuho Oku Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- include/net/sock_reuseport.h | 2 + net/core/sock.c | 2 +- net/core/sock_reuseport.c | 94 +++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 92 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index efc9085c6892..6ec140b0a61b 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -16,6 +16,7 @@ struct sock_reuseport { u16 max_socks; /* length of socks */ u16 num_socks; /* elements in socks */ u16 num_closed_socks; /* closed elements in socks */ + u16 incoming_cpu; /* The last synq overflow event timestamp of this * reuse->socks[] group. */ @@ -58,5 +59,6 @@ static inline bool reuseport_has_conns(struct sock *sk) } void reuseport_has_conns_set(struct sock *sk); +void reuseport_update_incoming_cpu(struct sock *sk, int val); #endif /* _SOCK_REUSEPORT_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 2786c1107e53..4571914a4aa8 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1436,7 +1436,7 @@ set_sndbuf: break; } case SO_INCOMING_CPU: - WRITE_ONCE(sk->sk_incoming_cpu, val); + reuseport_update_incoming_cpu(sk, val); break; case SO_CNX_ADVICE: diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index fb90e1e00773..5a165286e4d8 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -37,6 +37,70 @@ void reuseport_has_conns_set(struct sock *sk) } EXPORT_SYMBOL(reuseport_has_conns_set); +static void __reuseport_get_incoming_cpu(struct sock_reuseport *reuse) +{ + /* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */ + WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu + 1); +} + +static void __reuseport_put_incoming_cpu(struct sock_reuseport *reuse) +{ + /* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */ + WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu - 1); +} + +static void reuseport_get_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse) +{ + if (sk->sk_incoming_cpu >= 0) + __reuseport_get_incoming_cpu(reuse); +} + +static void reuseport_put_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse) +{ + if (sk->sk_incoming_cpu >= 0) + __reuseport_put_incoming_cpu(reuse); +} + +void reuseport_update_incoming_cpu(struct sock *sk, int val) +{ + struct sock_reuseport *reuse; + int old_sk_incoming_cpu; + + if (unlikely(!rcu_access_pointer(sk->sk_reuseport_cb))) { + /* Paired with REAE_ONCE() in sk_incoming_cpu_update() + * and compute_score(). + */ + WRITE_ONCE(sk->sk_incoming_cpu, val); + return; + } + + spin_lock_bh(&reuseport_lock); + + /* This must be done under reuseport_lock to avoid a race with + * reuseport_grow(), which accesses sk->sk_incoming_cpu without + * lock_sock() when detaching a shutdown()ed sk. + * + * Paired with READ_ONCE() in reuseport_select_sock_by_hash(). + */ + old_sk_incoming_cpu = sk->sk_incoming_cpu; + WRITE_ONCE(sk->sk_incoming_cpu, val); + + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + + /* reuseport_grow() has detached a closed sk. */ + if (!reuse) + goto out; + + if (old_sk_incoming_cpu < 0 && val >= 0) + __reuseport_get_incoming_cpu(reuse); + else if (old_sk_incoming_cpu >= 0 && val < 0) + __reuseport_put_incoming_cpu(reuse); + +out: + spin_unlock_bh(&reuseport_lock); +} + static int reuseport_sock_index(struct sock *sk, const struct sock_reuseport *reuse, bool closed) @@ -64,6 +128,7 @@ static void __reuseport_add_sock(struct sock *sk, /* paired with smp_rmb() in reuseport_(select|migrate)_sock() */ smp_wmb(); reuse->num_socks++; + reuseport_get_incoming_cpu(sk, reuse); } static bool __reuseport_detach_sock(struct sock *sk, @@ -76,6 +141,7 @@ static bool __reuseport_detach_sock(struct sock *sk, reuse->socks[i] = reuse->socks[reuse->num_socks - 1]; reuse->num_socks--; + reuseport_put_incoming_cpu(sk, reuse); return true; } @@ -86,6 +152,7 @@ static void __reuseport_add_closed_sock(struct sock *sk, reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk; /* paired with READ_ONCE() in inet_csk_bind_conflict() */ WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1); + reuseport_get_incoming_cpu(sk, reuse); } static bool __reuseport_detach_closed_sock(struct sock *sk, @@ -99,6 +166,7 @@ static bool __reuseport_detach_closed_sock(struct sock *sk, reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks]; /* paired with READ_ONCE() in inet_csk_bind_conflict() */ WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1); + reuseport_put_incoming_cpu(sk, reuse); return true; } @@ -166,6 +234,7 @@ int reuseport_alloc(struct sock *sk, bool bind_inany) reuse->bind_inany = bind_inany; reuse->socks[0] = sk; reuse->num_socks = 1; + reuseport_get_incoming_cpu(sk, reuse); rcu_assign_pointer(sk->sk_reuseport_cb, reuse); out: @@ -209,6 +278,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) more_reuse->reuseport_id = reuse->reuseport_id; more_reuse->bind_inany = reuse->bind_inany; more_reuse->has_conns = reuse->has_conns; + more_reuse->incoming_cpu = reuse->incoming_cpu; memcpy(more_reuse->socks, reuse->socks, reuse->num_socks * sizeof(struct sock *)); @@ -458,18 +528,32 @@ static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse, u32 hash, u16 num_socks) { + struct sock *first_valid_sk = NULL; int i, j; i = j = reciprocal_scale(hash, num_socks); - while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) { + do { + struct sock *sk = reuse->socks[i]; + + if (sk->sk_state != TCP_ESTABLISHED) { + /* Paired with WRITE_ONCE() in __reuseport_(get|put)_incoming_cpu(). */ + if (!READ_ONCE(reuse->incoming_cpu)) + return sk; + + /* Paired with WRITE_ONCE() in reuseport_update_incoming_cpu(). */ + if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) + return sk; + + if (!first_valid_sk) + first_valid_sk = sk; + } + i++; if (i >= num_socks) i = 0; - if (i == j) - return NULL; - } + } while (i != j); - return reuse->socks[i]; + return first_valid_sk; } /** -- cgit v1.2.3-70-g09d2 From d3d429047cc66ff49780c93e4fccd9527723d385 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Fri, 21 Oct 2022 17:45:03 -0700 Subject: mptcp: sockopt: make 'tcp_fastopen_connect' generic There are other socket options that need to act only on the first subflow, e.g. all TCP_FASTOPEN* socket options. This is similar to the getsockopt version. In the next commit, this new mptcp_setsockopt_first_sf_only() helper is used by other another option. Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: Paolo Abeni --- net/mptcp/sockopt.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index c7cb68c725b2..8d3b09d75c3a 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -769,17 +769,17 @@ static int mptcp_setsockopt_sol_tcp_defer(struct mptcp_sock *msk, sockptr_t optv return tcp_setsockopt(listener->sk, SOL_TCP, TCP_DEFER_ACCEPT, optval, optlen); } -static int mptcp_setsockopt_sol_tcp_fastopen_connect(struct mptcp_sock *msk, sockptr_t optval, - unsigned int optlen) +static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct socket *sock; - /* Limit to first subflow */ + /* Limit to first subflow, before the connection establishment */ sock = __mptcp_nmpc_socket(msk); if (!sock) return -EINVAL; - return tcp_setsockopt(sock->sk, SOL_TCP, TCP_FASTOPEN_CONNECT, optval, optlen); + return tcp_setsockopt(sock->sk, level, optname, optval, optlen); } static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, @@ -811,7 +811,8 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, case TCP_DEFER_ACCEPT: return mptcp_setsockopt_sol_tcp_defer(msk, optval, optlen); case TCP_FASTOPEN_CONNECT: - return mptcp_setsockopt_sol_tcp_fastopen_connect(msk, optval, optlen); + return mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, + optval, optlen); } return -EOPNOTSUPP; -- cgit v1.2.3-70-g09d2 From e64d4deb4de006faf7613c6603f5bf1bd1844ab3 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Fri, 21 Oct 2022 17:45:04 -0700 Subject: mptcp: add TCP_FASTOPEN_NO_COOKIE support The goal of this socket option is to configure MPTCP + TFO without cookie per socket. It was already possible to enable TFO without a cookie per netns by setting net.ipv4.tcp_fastopen sysctl knob to the right value. Per route was also supported by setting 'fastopen_no_cookie' option. This patch adds a per socket support like it is possible to do with TCP thanks to TCP_FASTOPEN_NO_COOKIE socket option. The only thing to do here is to relay the request to the first subflow like it is already done for TCP_FASTOPEN_CONNECT. Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: Paolo Abeni --- net/mptcp/sockopt.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 8d3b09d75c3a..1857281a0dd5 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -560,6 +560,7 @@ static bool mptcp_supported_sockopt(int level, int optname) case TCP_TX_DELAY: case TCP_INQ: case TCP_FASTOPEN_CONNECT: + case TCP_FASTOPEN_NO_COOKIE: return true; } @@ -568,8 +569,8 @@ static bool mptcp_supported_sockopt(int level, int optname) /* TCP_REPAIR, TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ, TCP_REPAIR_OPTIONS, * TCP_REPAIR_WINDOW are not supported, better avoid this mess */ - /* TCP_FASTOPEN_KEY, TCP_FASTOPEN, TCP_FASTOPEN_NO_COOKIE, - * are not supported fastopen is currently unsupported + /* TCP_FASTOPEN_KEY, TCP_FASTOPEN are not supported because + * fastopen for the listener side is currently unsupported */ } return false; @@ -811,6 +812,7 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, case TCP_DEFER_ACCEPT: return mptcp_setsockopt_sol_tcp_defer(msk, optval, optlen); case TCP_FASTOPEN_CONNECT: + case TCP_FASTOPEN_NO_COOKIE: return mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); } @@ -1175,6 +1177,7 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, case TCP_CC_INFO: case TCP_DEFER_ACCEPT: case TCP_FASTOPEN_CONNECT: + case TCP_FASTOPEN_NO_COOKIE: return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); case TCP_INQ: -- cgit v1.2.3-70-g09d2 From caea64675d8b91deb59140a92f190124399ccd37 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Fri, 21 Oct 2022 17:45:05 -0700 Subject: mptcp: sockopt: use new helper for TCP_DEFER_ACCEPT mptcp_setsockopt_sol_tcp_defer() was doing the same thing as mptcp_setsockopt_first_sf_only() except for the returned code in case of error. Ignoring the error is needed to mimic how TCP_DEFER_ACCEPT is handled when used with "plain" TCP sockets. The specific function for TCP_DEFER_ACCEPT can be replaced by the new mptcp_setsockopt_first_sf_only() helper and errors can be ignored to stay compatible with TCP. A bit of cleanup. Suggested-by: Mat Martineau Reviewed-by: Mat Martineau Acked-by: Paolo Abeni Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: Paolo Abeni --- net/mptcp/sockopt.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 1857281a0dd5..f85e9bbfe86f 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -758,18 +758,6 @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, return -EOPNOTSUPP; } -static int mptcp_setsockopt_sol_tcp_defer(struct mptcp_sock *msk, sockptr_t optval, - unsigned int optlen) -{ - struct socket *listener; - - listener = __mptcp_nmpc_socket(msk); - if (!listener) - return 0; /* TCP_DEFER_ACCEPT does not fail */ - - return tcp_setsockopt(listener->sk, SOL_TCP, TCP_DEFER_ACCEPT, optval, optlen); -} - static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname, sockptr_t optval, unsigned int optlen) { @@ -810,7 +798,9 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, case TCP_NODELAY: return mptcp_setsockopt_sol_tcp_nodelay(msk, optval, optlen); case TCP_DEFER_ACCEPT: - return mptcp_setsockopt_sol_tcp_defer(msk, optval, optlen); + /* See tcp.c: TCP_DEFER_ACCEPT does not fail */ + mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); + return 0; case TCP_FASTOPEN_CONNECT: case TCP_FASTOPEN_NO_COOKIE: return mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, -- cgit v1.2.3-70-g09d2 From ac1f8c049319847b1b4c6b387fdb2e3f7fb84ffc Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 28 Sep 2022 23:55:06 +0200 Subject: netfilter: nft_payload: move struct nft_payload_set definition where it belongs Not required to expose this header in nf_tables_core.h, move it to where it is used, ie. nft_payload. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 10 ---------- net/netfilter/nft_payload.c | 10 ++++++++++ 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 1223af68cd9a..990c3767a350 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -66,16 +66,6 @@ struct nft_payload { u8 dreg; }; -struct nft_payload_set { - enum nft_payload_bases base:8; - u8 offset; - u8 len; - u8 sreg; - u8 csum_type; - u8 csum_offset; - u8 csum_flags; -}; - extern const struct nft_expr_ops nft_payload_fast_ops; extern const struct nft_expr_ops nft_bitwise_fast_ops; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 088244f9d838..07621d509a68 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -665,6 +665,16 @@ static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src, return 0; } +struct nft_payload_set { + enum nft_payload_bases base:8; + u8 offset; + u8 len; + u8 sreg; + u8 csum_type; + u8 csum_offset; + u8 csum_flags; +}; + static void nft_payload_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) -- cgit v1.2.3-70-g09d2 From d037abc2414b4539401e0e6aa278bedc4628ad69 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 21 Oct 2022 16:17:53 +0200 Subject: netfilter: nft_objref: make it builtin nft_objref is needed to reference named objects, it makes no sense to disable it. Before: text data bss dec filename 4014 424 0 4438 nft_objref.o 4174 1128 0 5302 nft_objref.ko 359351 15276 864 375491 nf_tables.ko After: text data bss dec filename 3815 408 0 4223 nft_objref.o 363161 15692 864 379717 nf_tables.ko Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 1 + net/netfilter/Kconfig | 6 ------ net/netfilter/Makefile | 4 ++-- net/netfilter/nf_tables_core.c | 1 + net/netfilter/nft_objref.c | 22 +--------------------- 5 files changed, 5 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 990c3767a350..83d763631f81 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -18,6 +18,7 @@ extern struct nft_expr_type nft_meta_type; extern struct nft_expr_type nft_rt_type; extern struct nft_expr_type nft_exthdr_type; extern struct nft_expr_type nft_last_type; +extern struct nft_expr_type nft_objref_type; #ifdef CONFIG_NETWORK_SECMARK extern struct nft_object_type nft_secmark_obj_type; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 4b8d04640ff3..0846bd75b1da 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -568,12 +568,6 @@ config NFT_TUNNEL This option adds the "tunnel" expression that you can use to set tunneling policies. -config NFT_OBJREF - tristate "Netfilter nf_tables stateful object reference module" - help - This option adds the "objref" expression that allows you to refer to - stateful objects, such as counters and quotas. - config NFT_QUEUE depends on NETFILTER_NETLINK_QUEUE tristate "Netfilter nf_tables queue module" diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 0f060d100880..7a6b518ba2b4 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -86,7 +86,8 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \ nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o nft_last.o \ - nft_counter.o nft_chain_route.o nf_tables_offload.o \ + nft_counter.o nft_objref.o \ + nft_chain_route.o nf_tables_offload.o \ nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \ nft_set_pipapo.o @@ -104,7 +105,6 @@ obj-$(CONFIG_NFT_CT) += nft_ct.o obj-$(CONFIG_NFT_FLOW_OFFLOAD) += nft_flow_offload.o obj-$(CONFIG_NFT_LIMIT) += nft_limit.o obj-$(CONFIG_NFT_NAT) += nft_nat.o -obj-$(CONFIG_NFT_OBJREF) += nft_objref.o obj-$(CONFIG_NFT_QUEUE) += nft_queue.o obj-$(CONFIG_NFT_QUOTA) += nft_quota.o obj-$(CONFIG_NFT_REJECT) += nft_reject.o diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index cee3e4e905ec..6dcead50208c 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -340,6 +340,7 @@ static struct nft_expr_type *nft_basic_types[] = { &nft_exthdr_type, &nft_last_type, &nft_counter_type, + &nft_objref_type, }; static struct nft_object_type *nft_basic_objects[] = { diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 5d8d91b3904d..74e0eea4abac 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -82,7 +82,6 @@ static void nft_objref_activate(const struct nft_ctx *ctx, obj->use++; } -static struct nft_expr_type nft_objref_type; static const struct nft_expr_ops nft_objref_ops = { .type = &nft_objref_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_object *)), @@ -195,7 +194,6 @@ static void nft_objref_map_destroy(const struct nft_ctx *ctx, nf_tables_destroy_set(ctx, priv->set); } -static struct nft_expr_type nft_objref_type; static const struct nft_expr_ops nft_objref_map_ops = { .type = &nft_objref_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)), @@ -233,28 +231,10 @@ static const struct nla_policy nft_objref_policy[NFTA_OBJREF_MAX + 1] = { [NFTA_OBJREF_SET_ID] = { .type = NLA_U32 }, }; -static struct nft_expr_type nft_objref_type __read_mostly = { +struct nft_expr_type nft_objref_type __read_mostly = { .name = "objref", .select_ops = nft_objref_select_ops, .policy = nft_objref_policy, .maxattr = NFTA_OBJREF_MAX, .owner = THIS_MODULE, }; - -static int __init nft_objref_module_init(void) -{ - return nft_register_expr(&nft_objref_type); -} - -static void __exit nft_objref_module_exit(void) -{ - nft_unregister_expr(&nft_objref_type); -} - -module_init(nft_objref_module_init); -module_exit(nft_objref_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Pablo Neira Ayuso "); -MODULE_ALIAS_NFT_EXPR("objref"); -MODULE_DESCRIPTION("nftables stateful object reference module"); -- cgit v1.2.3-70-g09d2 From c247897d7c194e6fb4a7c8314af2226bec017a59 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 17 Oct 2022 13:03:29 +0200 Subject: netfilter: nft_payload: access GRE payload via inner offset Parse GRE v0 packets to properly set up inner offset, this allow for matching on inner headers. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_payload.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 07621d509a68..03a1f271bf4f 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -19,6 +19,7 @@ /* For layer 4 checksum field offset. */ #include #include +#include #include #include #include @@ -100,6 +101,37 @@ static int __nft_payload_inner_offset(struct nft_pktinfo *pkt) pkt->inneroff = thoff + __tcp_hdrlen(th); } break; + case IPPROTO_GRE: { + u32 offset = sizeof(struct gre_base_hdr), version; + struct gre_base_hdr *gre, _gre; + + gre = skb_header_pointer(pkt->skb, thoff, sizeof(_gre), &_gre); + if (!gre) + return -1; + + version = gre->flags & GRE_VERSION; + switch (version) { + case GRE_VERSION_0: + if (gre->flags & GRE_ROUTING) + return -1; + + if (gre->flags & GRE_CSUM) { + offset += sizeof_field(struct gre_full_hdr, csum) + + sizeof_field(struct gre_full_hdr, reserved1); + } + if (gre->flags & GRE_KEY) + offset += sizeof_field(struct gre_full_hdr, key); + + if (gre->flags & GRE_SEQ) + offset += sizeof_field(struct gre_full_hdr, seq); + break; + default: + return -1; + } + + pkt->inneroff = thoff + offset; + } + break; default: return -1; } -- cgit v1.2.3-70-g09d2 From 3927ce8850cacc1790cd4c5d02ca42e24df9fa6b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 17 Oct 2022 13:03:30 +0200 Subject: netfilter: nft_payload: access ipip payload for inner offset ipip is an special case, transport and inner header offset are set to the same offset to use the upcoming inner expression for matching on inner tunnel headers. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_payload.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 03a1f271bf4f..84b490d6cc75 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -132,6 +132,9 @@ static int __nft_payload_inner_offset(struct nft_pktinfo *pkt) pkt->inneroff = thoff + offset; } break; + case IPPROTO_IPIP: + pkt->inneroff = thoff; + break; default: return -1; } -- cgit v1.2.3-70-g09d2 From 3a07327d10a09379315c844c63f27941f5081e0a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 25 Oct 2022 13:48:15 +0200 Subject: netfilter: nft_inner: support for inner tunnel header matching This new expression allows you to match on the inner headers that are encapsulated by any of the existing tunneling protocols. This expression parses the inner packet to set the link, network and transport offsets, so the existing expressions (with a few updates) can be reused to match on the inner headers. The inner expression supports for different tunnel combinations such as: - ethernet frame over IPv4/IPv6 packet, eg. VxLAN. - IPv4/IPv6 packet over IPv4/IPv6 packet, eg. IPIP. - IPv4/IPv6 packet over IPv4/IPv6 + transport header, eg. GRE. - transport header (ESP or SCTP) over transport header (usually UDP) The following fields are used to describe the tunnel protocol: - flags, which describe how to parse the inner headers: NFT_PAYLOAD_CTX_INNER_TUN, the tunnel provides its own header. NFT_PAYLOAD_CTX_INNER_ETHER, the ethernet frame is available as inner header. NFT_PAYLOAD_CTX_INNER_NH, the network header is available as inner header. NFT_PAYLOAD_CTX_INNER_TH, the transport header is available as inner header. For example, VxLAN sets on all of these flags. While GRE only sets on NFT_PAYLOAD_CTX_INNER_NH and NFT_PAYLOAD_CTX_INNER_TH. Then, ESP over UDP only sets on NFT_PAYLOAD_CTX_INNER_TH. The tunnel description is composed of the following attributes: - header size: in case the tunnel comes with its own header, eg. VxLAN. - type: this provides a hint to userspace on how to delinearize the rule. This is useful for VxLAN and Geneve since they run over UDP, since transport does not provide a hint. This is also useful in case hardware offload is ever supported. The type is not currently interpreted by the kernel. - expression: currently only payload supported. Follow up patch adds also inner meta support which is required by autogenerated dependencies. The exthdr expression should be supported too at some point. There is a new inner_ops operation that needs to be set on to allow to use an existing expression from the inner expression. This patch adds a new NFT_PAYLOAD_TUN_HEADER base which allows to match on the tunnel header fields, eg. vxlan vni. The payload expression is embedded into nft_inner private area and this private data area is passed to the payload inner eval function via direct call. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 + include/net/netfilter/nf_tables_core.h | 24 +++ include/uapi/linux/netfilter/nf_tables.h | 26 +++ net/netfilter/Makefile | 2 +- net/netfilter/nf_tables_api.c | 37 ++++ net/netfilter/nf_tables_core.c | 1 + net/netfilter/nft_inner.c | 336 +++++++++++++++++++++++++++++++ net/netfilter/nft_payload.c | 89 +++++++- 8 files changed, 518 insertions(+), 2 deletions(-) create mode 100644 net/netfilter/nft_inner.c (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index f6db510689a8..2dbfe7524a7e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -375,6 +375,10 @@ static inline void *nft_expr_priv(const struct nft_expr *expr) return (void *)expr->data; } +struct nft_expr_info; + +int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, + struct nft_expr_info *info); int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src); void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, @@ -864,6 +868,7 @@ struct nft_expr_type { const struct nlattr * const tb[]); void (*release_ops)(const struct nft_expr_ops *ops); const struct nft_expr_ops *ops; + const struct nft_expr_ops *inner_ops; struct list_head list; const char *name; struct module *owner; diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 83d763631f81..be2b2b5d0a52 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -19,6 +19,7 @@ extern struct nft_expr_type nft_rt_type; extern struct nft_expr_type nft_exthdr_type; extern struct nft_expr_type nft_last_type; extern struct nft_expr_type nft_objref_type; +extern struct nft_expr_type nft_inner_type; #ifdef CONFIG_NETWORK_SECMARK extern struct nft_object_type nft_secmark_obj_type; @@ -139,4 +140,27 @@ void nft_rt_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); void nft_counter_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); + +enum { + NFT_PAYLOAD_CTX_INNER_TUN = (1 << 0), + NFT_PAYLOAD_CTX_INNER_LL = (1 << 1), + NFT_PAYLOAD_CTX_INNER_NH = (1 << 2), + NFT_PAYLOAD_CTX_INNER_TH = (1 << 3), +}; + +struct nft_inner_tun_ctx { + u16 inner_tunoff; + u16 inner_lloff; + u16 inner_nhoff; + u16 inner_thoff; + __be16 llproto; + u8 l4proto; + u8 flags; +}; + +int nft_payload_inner_offset(const struct nft_pktinfo *pkt); +void nft_payload_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *ctx); + #endif /* _NET_NF_TABLES_CORE_H */ diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 466fd3f4447c..05a15dce8271 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -760,6 +760,7 @@ enum nft_payload_bases { NFT_PAYLOAD_NETWORK_HEADER, NFT_PAYLOAD_TRANSPORT_HEADER, NFT_PAYLOAD_INNER_HEADER, + NFT_PAYLOAD_TUN_HEADER, }; /** @@ -779,6 +780,31 @@ enum nft_payload_csum_flags { NFT_PAYLOAD_L4CSUM_PSEUDOHDR = (1 << 0), }; +enum nft_inner_type { + NFT_INNER_UNSPEC = 0, + NFT_INNER_VXLAN, +}; + +enum nft_inner_flags { + NFT_INNER_HDRSIZE = (1 << 0), + NFT_INNER_LL = (1 << 1), + NFT_INNER_NH = (1 << 2), + NFT_INNER_TH = (1 << 3), +}; +#define NFT_INNER_MASK (NFT_INNER_HDRSIZE | NFT_INNER_LL | \ + NFT_INNER_NH | NFT_INNER_TH) + +enum nft_inner_attributes { + NFTA_INNER_UNSPEC, + NFTA_INNER_NUM, + NFTA_INNER_TYPE, + NFTA_INNER_FLAGS, + NFTA_INNER_HDRSIZE, + NFTA_INNER_EXPR, + __NFTA_INNER_MAX +}; +#define NFTA_INNER_MAX (__NFTA_INNER_MAX - 1) + /** * enum nft_payload_attributes - nf_tables payload expression netlink attributes * diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 7a6b518ba2b4..1d4db1943936 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -86,7 +86,7 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \ nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o nft_last.o \ - nft_counter.o nft_objref.o \ + nft_counter.o nft_objref.o nft_inner.o \ nft_chain_route.o nf_tables_offload.o \ nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \ nft_set_pipapo.o diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 58d9cbc9ccdc..6b79f5e18f08 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2857,6 +2857,43 @@ err1: return err; } +int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, + struct nft_expr_info *info) +{ + struct nlattr *tb[NFTA_EXPR_MAX + 1]; + const struct nft_expr_type *type; + int err; + + err = nla_parse_nested_deprecated(tb, NFTA_EXPR_MAX, nla, + nft_expr_policy, NULL); + if (err < 0) + return err; + + if (!tb[NFTA_EXPR_DATA]) + return -EINVAL; + + type = __nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]); + if (IS_ERR(type)) + return PTR_ERR(type); + + if (!type->inner_ops) + return -EOPNOTSUPP; + + err = nla_parse_nested_deprecated(info->tb, type->maxattr, + tb[NFTA_EXPR_DATA], + type->policy, NULL); + if (err < 0) + goto err_nla_parse; + + info->attr = nla; + info->ops = type->inner_ops; + + return 0; + +err_nla_parse: + return err; +} + static int nf_tables_newexpr(const struct nft_ctx *ctx, const struct nft_expr_info *expr_info, struct nft_expr *expr) diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 6dcead50208c..709a736c301c 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -341,6 +341,7 @@ static struct nft_expr_type *nft_basic_types[] = { &nft_last_type, &nft_counter_type, &nft_objref_type, + &nft_inner_type, }; static struct nft_object_type *nft_basic_objects[] = { diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c new file mode 100644 index 000000000000..1e4079b5b431 --- /dev/null +++ b/net/netfilter/nft_inner.c @@ -0,0 +1,336 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Pablo Neira Ayuso + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Same layout as nft_expr but it embeds the private expression data area. */ +struct __nft_expr { + const struct nft_expr_ops *ops; + union { + struct nft_payload payload; + } __attribute__((aligned(__alignof__(u64)))); +}; + +enum { + NFT_INNER_EXPR_PAYLOAD, +}; + +struct nft_inner { + u8 flags; + u8 hdrsize; + u8 type; + u8 expr_type; + + struct __nft_expr expr; +}; + +static int nft_inner_parse_l2l3(const struct nft_inner *priv, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *ctx, u32 off) +{ + __be16 llproto, outer_llproto; + u32 nhoff, thoff; + + if (priv->flags & NFT_INNER_LL) { + struct vlan_ethhdr *veth, _veth; + struct ethhdr *eth, _eth; + u32 hdrsize; + + eth = skb_header_pointer(pkt->skb, off, sizeof(_eth), &_eth); + if (!eth) + return -1; + + switch (eth->h_proto) { + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + llproto = eth->h_proto; + hdrsize = sizeof(_eth); + break; + case htons(ETH_P_8021Q): + veth = skb_header_pointer(pkt->skb, off, sizeof(_veth), &_veth); + if (!eth) + return -1; + + outer_llproto = veth->h_vlan_encapsulated_proto; + llproto = veth->h_vlan_proto; + hdrsize = sizeof(_veth); + break; + default: + return -1; + } + + ctx->inner_lloff = off; + ctx->flags |= NFT_PAYLOAD_CTX_INNER_LL; + off += hdrsize; + } else { + struct iphdr *iph; + u32 _version; + + iph = skb_header_pointer(pkt->skb, off, sizeof(_version), &_version); + if (!iph) + return -1; + + switch (iph->version) { + case 4: + llproto = htons(ETH_P_IP); + break; + case 6: + llproto = htons(ETH_P_IPV6); + break; + default: + return -1; + } + } + + ctx->llproto = llproto; + if (llproto == htons(ETH_P_8021Q)) + llproto = outer_llproto; + + nhoff = off; + + switch (llproto) { + case htons(ETH_P_IP): { + struct iphdr *iph, _iph; + + iph = skb_header_pointer(pkt->skb, nhoff, sizeof(_iph), &_iph); + if (!iph) + return -1; + + if (iph->ihl < 5 || iph->version != 4) + return -1; + + ctx->inner_nhoff = nhoff; + ctx->flags |= NFT_PAYLOAD_CTX_INNER_NH; + + thoff = nhoff + (iph->ihl * 4); + if ((ntohs(iph->frag_off) & IP_OFFSET) == 0) { + ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH; + ctx->inner_thoff = thoff; + ctx->l4proto = iph->protocol; + } + } + break; + case htons(ETH_P_IPV6): { + struct ipv6hdr *ip6h, _ip6h; + int fh_flags = IP6_FH_F_AUTH; + unsigned short fragoff; + int l4proto; + + ip6h = skb_header_pointer(pkt->skb, nhoff, sizeof(_ip6h), &_ip6h); + if (!ip6h) + return -1; + + if (ip6h->version != 6) + return -1; + + ctx->inner_nhoff = nhoff; + ctx->flags |= NFT_PAYLOAD_CTX_INNER_NH; + + thoff = nhoff; + l4proto = ipv6_find_hdr(pkt->skb, &thoff, -1, &fragoff, &fh_flags); + if (l4proto < 0 || thoff > U16_MAX) + return -1; + + if (fragoff == 0) { + thoff = nhoff + sizeof(_ip6h); + ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH; + ctx->inner_thoff = thoff; + ctx->l4proto = l4proto; + } + } + break; + default: + return -1; + } + + return 0; +} + +static int nft_inner_parse_tunhdr(const struct nft_inner *priv, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *ctx, u32 *off) +{ + if (pkt->tprot != IPPROTO_UDP || + pkt->tprot != IPPROTO_GRE) + return -1; + + ctx->inner_tunoff = *off; + ctx->flags |= NFT_PAYLOAD_CTX_INNER_TUN; + *off += priv->hdrsize; + + return 0; +} + +static int nft_inner_parse(const struct nft_inner *priv, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *tun_ctx) +{ + struct nft_inner_tun_ctx ctx = {}; + u32 off = pkt->inneroff; + + if (priv->flags & NFT_INNER_HDRSIZE && + nft_inner_parse_tunhdr(priv, pkt, &ctx, &off) < 0) + return -1; + + if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) { + if (nft_inner_parse_l2l3(priv, pkt, &ctx, off) < 0) + return -1; + } else if (priv->flags & NFT_INNER_TH) { + ctx.inner_thoff = off; + ctx.flags |= NFT_PAYLOAD_CTX_INNER_TH; + } + + *tun_ctx = ctx; + + return 0; +} + +static void nft_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_inner *priv = nft_expr_priv(expr); + struct nft_inner_tun_ctx tun_ctx = {}; + + if (nft_payload_inner_offset(pkt) < 0) + goto err; + + if (nft_inner_parse(priv, pkt, &tun_ctx) < 0) + goto err; + + switch (priv->expr_type) { + case NFT_INNER_EXPR_PAYLOAD: + nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx); + break; + default: + WARN_ON_ONCE(1); + goto err; + } + return; +err: + regs->verdict.code = NFT_BREAK; +} + +static const struct nla_policy nft_inner_policy[NFTA_INNER_MAX + 1] = { + [NFTA_INNER_NUM] = { .type = NLA_U32 }, + [NFTA_INNER_FLAGS] = { .type = NLA_U32 }, + [NFTA_INNER_HDRSIZE] = { .type = NLA_U32 }, + [NFTA_INNER_TYPE] = { .type = NLA_U32 }, + [NFTA_INNER_EXPR] = { .type = NLA_NESTED }, +}; + +struct nft_expr_info { + const struct nft_expr_ops *ops; + const struct nlattr *attr; + struct nlattr *tb[NFT_EXPR_MAXATTR + 1]; +}; + +static int nft_inner_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_inner *priv = nft_expr_priv(expr); + u32 flags, hdrsize, type, num; + struct nft_expr_info expr_info; + int err; + + if (!tb[NFTA_INNER_FLAGS] || + !tb[NFTA_INNER_HDRSIZE] || + !tb[NFTA_INNER_TYPE] || + !tb[NFTA_INNER_EXPR]) + return -EINVAL; + + flags = ntohl(nla_get_be32(tb[NFTA_INNER_FLAGS])); + if (flags & ~NFT_INNER_MASK) + return -EOPNOTSUPP; + + num = ntohl(nla_get_be32(tb[NFTA_INNER_NUM])); + if (num != 0) + return -EOPNOTSUPP; + + hdrsize = ntohl(nla_get_be32(tb[NFTA_INNER_HDRSIZE])); + type = ntohl(nla_get_be32(tb[NFTA_INNER_TYPE])); + + if (type > U8_MAX) + return -EINVAL; + + if (flags & NFT_INNER_HDRSIZE) { + if (hdrsize == 0 || hdrsize > 64) + return -EOPNOTSUPP; + } + + priv->flags = flags; + priv->hdrsize = hdrsize; + priv->type = type; + + err = nft_expr_inner_parse(ctx, tb[NFTA_INNER_EXPR], &expr_info); + if (err < 0) + return err; + + priv->expr.ops = expr_info.ops; + + if (!strcmp(expr_info.ops->type->name, "payload")) + priv->expr_type = NFT_INNER_EXPR_PAYLOAD; + else + return -EINVAL; + + err = expr_info.ops->init(ctx, (struct nft_expr *)&priv->expr, + (const struct nlattr * const*)expr_info.tb); + if (err < 0) + return err; + + return 0; +} + +static int nft_inner_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_inner *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_INNER_NUM, htonl(0)) || + nla_put_be32(skb, NFTA_INNER_TYPE, htonl(priv->type)) || + nla_put_be32(skb, NFTA_INNER_FLAGS, htonl(priv->flags)) || + nla_put_be32(skb, NFTA_INNER_HDRSIZE, htonl(priv->hdrsize))) + goto nla_put_failure; + + if (nft_expr_dump(skb, NFTA_INNER_EXPR, + (struct nft_expr *)&priv->expr) < 0) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static const struct nft_expr_ops nft_inner_ops = { + .type = &nft_inner_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_inner)), + .eval = nft_inner_eval, + .init = nft_inner_init, + .dump = nft_inner_dump, +}; + +struct nft_expr_type nft_inner_type __read_mostly = { + .name = "inner", + .ops = &nft_inner_ops, + .policy = nft_inner_policy, + .maxattr = NFTA_INNER_MAX, + .owner = THIS_MODULE, +}; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 84b490d6cc75..9d2ac764a14c 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -144,7 +144,7 @@ static int __nft_payload_inner_offset(struct nft_pktinfo *pkt) return 0; } -static int nft_payload_inner_offset(const struct nft_pktinfo *pkt) +int nft_payload_inner_offset(const struct nft_pktinfo *pkt) { if (!(pkt->flags & NFT_PKTINFO_INNER) && __nft_payload_inner_offset((struct nft_pktinfo *)pkt) < 0) @@ -587,6 +587,92 @@ const struct nft_expr_ops nft_payload_fast_ops = { .offload = nft_payload_offload, }; +void nft_payload_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *tun_ctx) +{ + const struct nft_payload *priv = nft_expr_priv(expr); + const struct sk_buff *skb = pkt->skb; + u32 *dest = ®s->data[priv->dreg]; + int offset; + + if (priv->len % NFT_REG32_SIZE) + dest[priv->len / NFT_REG32_SIZE] = 0; + + switch (priv->base) { + case NFT_PAYLOAD_TUN_HEADER: + if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TUN)) + goto err; + + offset = tun_ctx->inner_tunoff; + break; + case NFT_PAYLOAD_LL_HEADER: + if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_LL)) + goto err; + + offset = tun_ctx->inner_lloff; + break; + case NFT_PAYLOAD_NETWORK_HEADER: + if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_NH)) + goto err; + + offset = tun_ctx->inner_nhoff; + break; + case NFT_PAYLOAD_TRANSPORT_HEADER: + if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TH)) + goto err; + + offset = tun_ctx->inner_thoff; + break; + default: + WARN_ON_ONCE(1); + goto err; + } + offset += priv->offset; + + if (skb_copy_bits(skb, offset, dest, priv->len) < 0) + goto err; + + return; +err: + regs->verdict.code = NFT_BREAK; +} + +static int nft_payload_inner_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_payload *priv = nft_expr_priv(expr); + u32 base; + + base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); + switch (base) { + case NFT_PAYLOAD_TUN_HEADER: + case NFT_PAYLOAD_LL_HEADER: + case NFT_PAYLOAD_NETWORK_HEADER: + case NFT_PAYLOAD_TRANSPORT_HEADER: + break; + default: + return -EOPNOTSUPP; + } + + priv->base = base; + priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); + priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); + + return nft_parse_register_store(ctx, tb[NFTA_PAYLOAD_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + priv->len); +} + +static const struct nft_expr_ops nft_payload_inner_ops = { + .type = &nft_payload_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)), + .init = nft_payload_inner_init, + .dump = nft_payload_dump, + /* direct call to nft_payload_inner_eval(). */ +}; + static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum) { *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), fsum), tsum)); @@ -930,6 +1016,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx, struct nft_expr_type nft_payload_type __read_mostly = { .name = "payload", .select_ops = nft_payload_select_ops, + .inner_ops = &nft_payload_inner_ops, .policy = nft_payload_policy, .maxattr = NFTA_PAYLOAD_MAX, .owner = THIS_MODULE, -- cgit v1.2.3-70-g09d2 From 0e795b37ba044893107f887b037594645a6fc584 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 17 Oct 2022 13:03:32 +0200 Subject: netfilter: nft_inner: add percpu inner context Add NFT_PKTINFO_INNER_FULL flag to annotate that inner offsets are available. Store nft_inner_tun_ctx object in percpu area to cache existing inner offsets for this skbuff. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 1 + include/net/netfilter/nf_tables_core.h | 1 + net/netfilter/nft_inner.c | 26 ++++++++++++++++++++++---- 3 files changed, 24 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 2dbfe7524a7e..38e2b396e38a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -24,6 +24,7 @@ struct module; enum { NFT_PKTINFO_L4PROTO = (1 << 0), NFT_PKTINFO_INNER = (1 << 1), + NFT_PKTINFO_INNER_FULL = (1 << 2), }; struct nft_pktinfo { diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index be2b2b5d0a52..3e825381ac5c 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -149,6 +149,7 @@ enum { }; struct nft_inner_tun_ctx { + u16 type; u16 inner_tunoff; u16 inner_lloff; u16 inner_nhoff; diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c index 1e4079b5b431..29f2eefe0357 100644 --- a/net/netfilter/nft_inner.c +++ b/net/netfilter/nft_inner.c @@ -21,6 +21,8 @@ #include #include +static DEFINE_PER_CPU(struct nft_inner_tun_ctx, nft_pcpu_tun_ctx); + /* Same layout as nft_expr but it embeds the private expression data area. */ struct __nft_expr { const struct nft_expr_ops *ops; @@ -180,7 +182,7 @@ static int nft_inner_parse_tunhdr(const struct nft_inner *priv, } static int nft_inner_parse(const struct nft_inner *priv, - const struct nft_pktinfo *pkt, + struct nft_pktinfo *pkt, struct nft_inner_tun_ctx *tun_ctx) { struct nft_inner_tun_ctx ctx = {}; @@ -199,25 +201,41 @@ static int nft_inner_parse(const struct nft_inner *priv, } *tun_ctx = ctx; + tun_ctx->type = priv->type; + pkt->flags |= NFT_PKTINFO_INNER_FULL; return 0; } +static bool nft_inner_parse_needed(const struct nft_inner *priv, + const struct nft_pktinfo *pkt, + const struct nft_inner_tun_ctx *tun_ctx) +{ + if (!(pkt->flags & NFT_PKTINFO_INNER_FULL)) + return true; + + if (priv->type != tun_ctx->type) + return true; + + return false; +} + static void nft_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { + struct nft_inner_tun_ctx *tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx); const struct nft_inner *priv = nft_expr_priv(expr); - struct nft_inner_tun_ctx tun_ctx = {}; if (nft_payload_inner_offset(pkt) < 0) goto err; - if (nft_inner_parse(priv, pkt, &tun_ctx) < 0) + if (nft_inner_parse_needed(priv, pkt, tun_ctx) && + nft_inner_parse(priv, (struct nft_pktinfo *)pkt, tun_ctx) < 0) goto err; switch (priv->expr_type) { case NFT_INNER_EXPR_PAYLOAD: - nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx); + nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, tun_ctx); break; default: WARN_ON_ONCE(1); -- cgit v1.2.3-70-g09d2 From a150d122b6bdb84df532057aa3b2faf8c6485792 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 17 Oct 2022 13:03:33 +0200 Subject: netfilter: nft_meta: add inner match support Add support for inner meta matching on: - NFT_META_PROTOCOL: to match on the ethertype, this can be used regardless tunnel protocol provides no link layer header, in that case nft_inner sets on the ethertype based on the IP header version field. - NFT_META_L4PROTO: to match on the layer 4 protocol. These meta expression are usually autogenerated as dependencies by userspace nftables. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_meta.h | 6 ++++ net/netfilter/nft_inner.c | 8 ++++++ net/netfilter/nft_meta.c | 62 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+) (limited to 'net') diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index 9b51cc67de54..f3a5285a511c 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -46,4 +46,10 @@ int nft_meta_set_validate(const struct nft_ctx *ctx, bool nft_meta_get_reduce(struct nft_regs_track *track, const struct nft_expr *expr); + +struct nft_inner_tun_ctx; +void nft_meta_inner_eval(const struct nft_expr *expr, + struct nft_regs *regs, const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *tun_ctx); + #endif diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c index 29f2eefe0357..c43a2fe0ceb7 100644 --- a/net/netfilter/nft_inner.c +++ b/net/netfilter/nft_inner.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -28,11 +29,13 @@ struct __nft_expr { const struct nft_expr_ops *ops; union { struct nft_payload payload; + struct nft_meta meta; } __attribute__((aligned(__alignof__(u64)))); }; enum { NFT_INNER_EXPR_PAYLOAD, + NFT_INNER_EXPR_META, }; struct nft_inner { @@ -237,6 +240,9 @@ static void nft_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, case NFT_INNER_EXPR_PAYLOAD: nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, tun_ctx); break; + case NFT_INNER_EXPR_META: + nft_meta_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, tun_ctx); + break; default: WARN_ON_ONCE(1); goto err; @@ -306,6 +312,8 @@ static int nft_inner_init(const struct nft_ctx *ctx, if (!strcmp(expr_info.ops->type->name, "payload")) priv->expr_type = NFT_INNER_EXPR_PAYLOAD; + else if (!strcmp(expr_info.ops->type->name, "meta")) + priv->expr_type = NFT_INNER_EXPR_META; else return -EINVAL; diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 55d2d49c3425..8c39adeebb5c 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -831,9 +831,71 @@ nft_meta_select_ops(const struct nft_ctx *ctx, return ERR_PTR(-EINVAL); } +static int nft_meta_inner_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_meta *priv = nft_expr_priv(expr); + unsigned int len; + + priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); + switch (priv->key) { + case NFT_META_PROTOCOL: + len = sizeof(u16); + break; + case NFT_META_L4PROTO: + len = sizeof(u32); + break; + default: + return -EOPNOTSUPP; + } + priv->len = len; + + return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); +} + +void nft_meta_inner_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *tun_ctx) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + u32 *dest = ®s->data[priv->dreg]; + + switch (priv->key) { + case NFT_META_PROTOCOL: + nft_reg_store16(dest, (__force u16)tun_ctx->llproto); + break; + case NFT_META_L4PROTO: + if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TH)) + goto err; + + nft_reg_store8(dest, tun_ctx->l4proto); + break; + default: + WARN_ON_ONCE(1); + goto err; + } + return; + +err: + regs->verdict.code = NFT_BREAK; +} +EXPORT_SYMBOL_GPL(nft_meta_inner_eval); + +static const struct nft_expr_ops nft_meta_inner_ops = { + .type = &nft_meta_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), + .init = nft_meta_inner_init, + .dump = nft_meta_get_dump, + /* direct call to nft_meta_inner_eval(). */ +}; + struct nft_expr_type nft_meta_type __read_mostly = { .name = "meta", .select_ops = nft_meta_select_ops, + .inner_ops = &nft_meta_inner_ops, .policy = nft_meta_policy, .maxattr = NFTA_META_MAX, .owner = THIS_MODULE, -- cgit v1.2.3-70-g09d2 From 0db14b95660b63dceeb7e89f2e3ffa97d331fce0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 17 Oct 2022 13:03:34 +0200 Subject: netfilter: nft_inner: add geneve support Geneve tunnel header may contain options, parse geneve header and update offset to point to the link layer header according to the opt_len field. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 1 + net/netfilter/nft_inner.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 05a15dce8271..e4b739d57480 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -783,6 +783,7 @@ enum nft_payload_csum_flags { enum nft_inner_type { NFT_INNER_UNSPEC = 0, NFT_INNER_VXLAN, + NFT_INNER_GENEVE, }; enum nft_inner_flags { diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c index c43a2fe0ceb7..19fdc8c70cd1 100644 --- a/net/netfilter/nft_inner.c +++ b/net/netfilter/nft_inner.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -181,6 +182,22 @@ static int nft_inner_parse_tunhdr(const struct nft_inner *priv, ctx->flags |= NFT_PAYLOAD_CTX_INNER_TUN; *off += priv->hdrsize; + switch (priv->type) { + case NFT_INNER_GENEVE: { + struct genevehdr *gnvh, _gnvh; + + gnvh = skb_header_pointer(pkt->skb, pkt->inneroff, + sizeof(_gnvh), &_gnvh); + if (!gnvh) + return -1; + + *off += gnvh->opt_len * 4; + } + break; + default: + break; + } + return 0; } -- cgit v1.2.3-70-g09d2 From 91619eb60aeccd3181d9b88975add706a9b763c1 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 17 Oct 2022 13:03:35 +0200 Subject: netfilter: nft_inner: set tunnel offset to GRE header offset Set inner tunnel offset to the GRE header, this is redundant to existing transport header offset, but this normalizes the handling of the tunnel header regardless its location in the layering. GRE version 0 is overloaded with RFCs, the type decorator in the inner expression might also be useful to interpret matching fields from the netlink delinearize path in userspace. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_inner.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c index 19fdc8c70cd1..eae7caeff316 100644 --- a/net/netfilter/nft_inner.c +++ b/net/netfilter/nft_inner.c @@ -174,8 +174,13 @@ static int nft_inner_parse_tunhdr(const struct nft_inner *priv, const struct nft_pktinfo *pkt, struct nft_inner_tun_ctx *ctx, u32 *off) { - if (pkt->tprot != IPPROTO_UDP || - pkt->tprot != IPPROTO_GRE) + if (pkt->tprot == IPPROTO_GRE) { + ctx->inner_tunoff = pkt->thoff; + ctx->flags |= NFT_PAYLOAD_CTX_INNER_TUN; + return 0; + } + + if (pkt->tprot != IPPROTO_UDP) return -1; ctx->inner_tunoff = *off; -- cgit v1.2.3-70-g09d2 From b5f0de6df6dce8d641ef58ef7012f3304dffb9a1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 18 Oct 2022 02:56:03 -0700 Subject: net: dev: Convert sa_data to flexible array in struct sockaddr One of the worst offenders of "fake flexible arrays" is struct sockaddr, as it is the classic example of why GCC and Clang have been traditionally forced to treat all trailing arrays as fake flexible arrays: in the distant misty past, sa_data became too small, and code started just treating it as a flexible array, even though it was fixed-size. The special case by the compiler is specifically that sizeof(sa->sa_data) and FORTIFY_SOURCE (which uses __builtin_object_size(sa->sa_data, 1)) do not agree (14 and -1 respectively), which makes FORTIFY_SOURCE treat it as a flexible array. However, the coming -fstrict-flex-arrays compiler flag will remove these special cases so that FORTIFY_SOURCE can gain coverage over all the trailing arrays in the kernel that are _not_ supposed to be treated as a flexible array. To deal with this change, convert sa_data to a true flexible array. To keep the structure size the same, move sa_data into a union with a newly introduced sa_data_min with the original size. The result is that FORTIFY_SOURCE can continue to have no idea how large sa_data may actually be, but anything using sizeof(sa->sa_data) must switch to sizeof(sa->sa_data_min). Cc: Jens Axboe Cc: Pavel Begunkov Cc: David Ahern Cc: Dylan Yudaken Cc: Yajun Deng Cc: Petr Machata Cc: Hangbin Liu Cc: Leon Romanovsky Cc: syzbot Cc: Willem de Bruijn Cc: Pablo Neira Ayuso Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221018095503.never.671-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 5 ++++- net/core/dev.c | 2 +- net/core/dev_ioctl.c | 2 +- net/packet/af_packet.c | 10 +++++----- 4 files changed, 11 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/linux/socket.h b/include/linux/socket.h index de3701a2a212..13c3a237b9c9 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -33,7 +33,10 @@ typedef __kernel_sa_family_t sa_family_t; struct sockaddr { sa_family_t sa_family; /* address family, AF_xxx */ - char sa_data[14]; /* 14 bytes of protocol address */ + union { + char sa_data_min[14]; /* Minimum 14 bytes of protocol address */ + DECLARE_FLEX_ARRAY(char, sa_data); + }; }; struct linger { diff --git a/net/core/dev.c b/net/core/dev.c index 3be256051e99..fff62068a53d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8822,7 +8822,7 @@ EXPORT_SYMBOL(dev_set_mac_address_user); int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) { - size_t size = sizeof(sa->sa_data); + size_t size = sizeof(sa->sa_data_min); struct net_device *dev; int ret = 0; diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 7674bb9f3076..5cdbfbf9a7dc 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -342,7 +342,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, if (ifr->ifr_hwaddr.sa_family != dev->type) return -EINVAL; memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, - min(sizeof(ifr->ifr_hwaddr.sa_data), + min(sizeof(ifr->ifr_hwaddr.sa_data_min), (size_t)dev->addr_len)); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return 0; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 6ce8dd19f33c..8c5b3da0c29f 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3277,7 +3277,7 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; - char name[sizeof(uaddr->sa_data) + 1]; + char name[sizeof(uaddr->sa_data_min) + 1]; /* * Check legality @@ -3288,8 +3288,8 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, /* uaddr->sa_data comes from the userspace, it's not guaranteed to be * zero-terminated. */ - memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data)); - name[sizeof(uaddr->sa_data)] = 0; + memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min)); + name[sizeof(uaddr->sa_data_min)] = 0; return packet_do_bind(sk, name, 0, pkt_sk(sk)->num); } @@ -3561,11 +3561,11 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, return -EOPNOTSUPP; uaddr->sa_family = AF_PACKET; - memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data)); + memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data_min)); rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex)); if (dev) - strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data)); + strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data_min)); rcu_read_unlock(); return sizeof(*uaddr); -- cgit v1.2.3-70-g09d2 From c83597fa5dc6b322e9bdf929e5f4136a3f4aa4db Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 25 Oct 2022 21:28:45 -0700 Subject: bpf: Refactor some inode/task/sk storage functions for reuse Refactor codes so that inode/task/sk storage implementation can maximally share the same code. I also added some comments in new function bpf_local_storage_unlink_nolock() to make codes easy to understand. There is no functionality change. Acked-by: David Vernet Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221026042845.672944-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 17 ++-- kernel/bpf/bpf_inode_storage.c | 38 +------- kernel/bpf/bpf_local_storage.c | 190 ++++++++++++++++++++++++-------------- kernel/bpf/bpf_task_storage.c | 38 +------- net/core/bpf_sk_storage.c | 35 +------ 5 files changed, 137 insertions(+), 181 deletions(-) (limited to 'net') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 7ea18d4da84b..6d37a40cd90e 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -116,21 +116,22 @@ static struct bpf_local_storage_cache name = { \ .idx_lock = __SPIN_LOCK_UNLOCKED(name.idx_lock), \ } -u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache); -void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, - u16 idx); - /* Helper functions for bpf_local_storage */ int bpf_local_storage_map_alloc_check(union bpf_attr *attr); -struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr); +struct bpf_map * +bpf_local_storage_map_alloc(union bpf_attr *attr, + struct bpf_local_storage_cache *cache); struct bpf_local_storage_data * bpf_local_storage_lookup(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, bool cacheit_lockit); -void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, +bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage); + +void bpf_local_storage_map_free(struct bpf_map *map, + struct bpf_local_storage_cache *cache, int __percpu *busy_counter); int bpf_local_storage_map_check_btf(const struct bpf_map *map, @@ -141,10 +142,6 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); -bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem, - bool uncharge_omem, bool use_trace_rcu); - void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu); void bpf_selem_link_map(struct bpf_local_storage_map *smap, diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 5f7683b19199..6a1d4d22816a 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -56,11 +56,9 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode, void bpf_inode_storage_free(struct inode *inode) { - struct bpf_local_storage_elem *selem; struct bpf_local_storage *local_storage; bool free_inode_storage = false; struct bpf_storage_blob *bsb; - struct hlist_node *n; bsb = bpf_inode(inode); if (!bsb) @@ -74,30 +72,11 @@ void bpf_inode_storage_free(struct inode *inode) return; } - /* Neither the bpf_prog nor the bpf-map's syscall - * could be modifying the local_storage->list now. - * Thus, no elem can be added-to or deleted-from the - * local_storage->list by the bpf_prog or by the bpf-map's syscall. - * - * It is racing with bpf_local_storage_map_free() alone - * when unlinking elem from the local_storage->list and - * the map's bucket->list. - */ raw_spin_lock_bh(&local_storage->lock); - hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { - /* Always unlink from map before unlinking from - * local_storage. - */ - bpf_selem_unlink_map(selem); - free_inode_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, false, false); - } + free_inode_storage = bpf_local_storage_unlink_nolock(local_storage); raw_spin_unlock_bh(&local_storage->lock); rcu_read_unlock(); - /* free_inoode_storage should always be true as long as - * local_storage->list was non-empty. - */ if (free_inode_storage) kfree_rcu(local_storage, rcu); } @@ -226,23 +205,12 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr) { - struct bpf_local_storage_map *smap; - - smap = bpf_local_storage_map_alloc(attr); - if (IS_ERR(smap)) - return ERR_CAST(smap); - - smap->cache_idx = bpf_local_storage_cache_idx_get(&inode_cache); - return &smap->map; + return bpf_local_storage_map_alloc(attr, &inode_cache); } static void inode_storage_map_free(struct bpf_map *map) { - struct bpf_local_storage_map *smap; - - smap = (struct bpf_local_storage_map *)map; - bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx); - bpf_local_storage_map_free(smap, NULL); + bpf_local_storage_map_free(map, &inode_cache, NULL); } BTF_ID_LIST_SINGLE(inode_storage_map_btf_ids, struct, diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 781d14167140..93d9b1b17bc8 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -113,9 +113,9 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu) * The caller must ensure selem->smap is still valid to be * dereferenced for its smap->elem_size and smap->cache_idx. */ -bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem, - bool uncharge_mem, bool use_trace_rcu) +static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, + struct bpf_local_storage_elem *selem, + bool uncharge_mem, bool use_trace_rcu) { struct bpf_local_storage_map *smap; bool free_local_storage; @@ -501,7 +501,7 @@ unlock_err: return ERR_PTR(err); } -u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) +static u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) { u64 min_usage = U64_MAX; u16 i, res = 0; @@ -525,76 +525,14 @@ u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache) return res; } -void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, - u16 idx) +static void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, + u16 idx) { spin_lock(&cache->idx_lock); cache->idx_usage_counts[idx]--; spin_unlock(&cache->idx_lock); } -void bpf_local_storage_map_free(struct bpf_local_storage_map *smap, - int __percpu *busy_counter) -{ - struct bpf_local_storage_elem *selem; - struct bpf_local_storage_map_bucket *b; - unsigned int i; - - /* Note that this map might be concurrently cloned from - * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone - * RCU read section to finish before proceeding. New RCU - * read sections should be prevented via bpf_map_inc_not_zero. - */ - synchronize_rcu(); - - /* bpf prog and the userspace can no longer access this map - * now. No new selem (of this map) can be added - * to the owner->storage or to the map bucket's list. - * - * The elem of this map can be cleaned up here - * or when the storage is freed e.g. - * by bpf_sk_storage_free() during __sk_destruct(). - */ - for (i = 0; i < (1U << smap->bucket_log); i++) { - b = &smap->buckets[i]; - - rcu_read_lock(); - /* No one is adding to b->list now */ - while ((selem = hlist_entry_safe( - rcu_dereference_raw(hlist_first_rcu(&b->list)), - struct bpf_local_storage_elem, map_node))) { - if (busy_counter) { - migrate_disable(); - this_cpu_inc(*busy_counter); - } - bpf_selem_unlink(selem, false); - if (busy_counter) { - this_cpu_dec(*busy_counter); - migrate_enable(); - } - cond_resched_rcu(); - } - rcu_read_unlock(); - } - - /* While freeing the storage we may still need to access the map. - * - * e.g. when bpf_sk_storage_free() has unlinked selem from the map - * which then made the above while((selem = ...)) loop - * exit immediately. - * - * However, while freeing the storage one still needs to access the - * smap->elem_size to do the uncharging in - * bpf_selem_unlink_storage_nolock(). - * - * Hence, wait another rcu grace period for the storage to be freed. - */ - synchronize_rcu(); - - kvfree(smap->buckets); - bpf_map_area_free(smap); -} - int bpf_local_storage_map_alloc_check(union bpf_attr *attr) { if (attr->map_flags & ~BPF_LOCAL_STORAGE_CREATE_FLAG_MASK || @@ -614,7 +552,7 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr) return 0; } -struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) +static struct bpf_local_storage_map *__bpf_local_storage_map_alloc(union bpf_attr *attr) { struct bpf_local_storage_map *smap; unsigned int i; @@ -664,3 +602,117 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, return 0; } + +bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage) +{ + struct bpf_local_storage_elem *selem; + bool free_storage = false; + struct hlist_node *n; + + /* Neither the bpf_prog nor the bpf_map's syscall + * could be modifying the local_storage->list now. + * Thus, no elem can be added to or deleted from the + * local_storage->list by the bpf_prog or by the bpf_map's syscall. + * + * It is racing with bpf_local_storage_map_free() alone + * when unlinking elem from the local_storage->list and + * the map's bucket->list. + */ + hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { + /* Always unlink from map before unlinking from + * local_storage. + */ + bpf_selem_unlink_map(selem); + /* If local_storage list has only one element, the + * bpf_selem_unlink_storage_nolock() will return true. + * Otherwise, it will return false. The current loop iteration + * intends to remove all local storage. So the last iteration + * of the loop will set the free_cgroup_storage to true. + */ + free_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, false, false); + } + + return free_storage; +} + +struct bpf_map * +bpf_local_storage_map_alloc(union bpf_attr *attr, + struct bpf_local_storage_cache *cache) +{ + struct bpf_local_storage_map *smap; + + smap = __bpf_local_storage_map_alloc(attr); + if (IS_ERR(smap)) + return ERR_CAST(smap); + + smap->cache_idx = bpf_local_storage_cache_idx_get(cache); + return &smap->map; +} + +void bpf_local_storage_map_free(struct bpf_map *map, + struct bpf_local_storage_cache *cache, + int __percpu *busy_counter) +{ + struct bpf_local_storage_map_bucket *b; + struct bpf_local_storage_elem *selem; + struct bpf_local_storage_map *smap; + unsigned int i; + + smap = (struct bpf_local_storage_map *)map; + bpf_local_storage_cache_idx_free(cache, smap->cache_idx); + + /* Note that this map might be concurrently cloned from + * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone + * RCU read section to finish before proceeding. New RCU + * read sections should be prevented via bpf_map_inc_not_zero. + */ + synchronize_rcu(); + + /* bpf prog and the userspace can no longer access this map + * now. No new selem (of this map) can be added + * to the owner->storage or to the map bucket's list. + * + * The elem of this map can be cleaned up here + * or when the storage is freed e.g. + * by bpf_sk_storage_free() during __sk_destruct(). + */ + for (i = 0; i < (1U << smap->bucket_log); i++) { + b = &smap->buckets[i]; + + rcu_read_lock(); + /* No one is adding to b->list now */ + while ((selem = hlist_entry_safe( + rcu_dereference_raw(hlist_first_rcu(&b->list)), + struct bpf_local_storage_elem, map_node))) { + if (busy_counter) { + migrate_disable(); + this_cpu_inc(*busy_counter); + } + bpf_selem_unlink(selem, false); + if (busy_counter) { + this_cpu_dec(*busy_counter); + migrate_enable(); + } + cond_resched_rcu(); + } + rcu_read_unlock(); + } + + /* While freeing the storage we may still need to access the map. + * + * e.g. when bpf_sk_storage_free() has unlinked selem from the map + * which then made the above while((selem = ...)) loop + * exit immediately. + * + * However, while freeing the storage one still needs to access the + * smap->elem_size to do the uncharging in + * bpf_selem_unlink_storage_nolock(). + * + * Hence, wait another rcu grace period for the storage to be freed. + */ + synchronize_rcu(); + + kvfree(smap->buckets); + bpf_map_area_free(smap); +} diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index ba3fe72d1fa5..8e832db8151a 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -71,10 +71,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map, void bpf_task_storage_free(struct task_struct *task) { - struct bpf_local_storage_elem *selem; struct bpf_local_storage *local_storage; bool free_task_storage = false; - struct hlist_node *n; unsigned long flags; rcu_read_lock(); @@ -85,32 +83,13 @@ void bpf_task_storage_free(struct task_struct *task) return; } - /* Neither the bpf_prog nor the bpf-map's syscall - * could be modifying the local_storage->list now. - * Thus, no elem can be added-to or deleted-from the - * local_storage->list by the bpf_prog or by the bpf-map's syscall. - * - * It is racing with bpf_local_storage_map_free() alone - * when unlinking elem from the local_storage->list and - * the map's bucket->list. - */ bpf_task_storage_lock(); raw_spin_lock_irqsave(&local_storage->lock, flags); - hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { - /* Always unlink from map before unlinking from - * local_storage. - */ - bpf_selem_unlink_map(selem); - free_task_storage = bpf_selem_unlink_storage_nolock( - local_storage, selem, false, false); - } + free_task_storage = bpf_local_storage_unlink_nolock(local_storage); raw_spin_unlock_irqrestore(&local_storage->lock, flags); bpf_task_storage_unlock(); rcu_read_unlock(); - /* free_task_storage should always be true as long as - * local_storage->list was non-empty. - */ if (free_task_storage) kfree_rcu(local_storage, rcu); } @@ -337,23 +316,12 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) { - struct bpf_local_storage_map *smap; - - smap = bpf_local_storage_map_alloc(attr); - if (IS_ERR(smap)) - return ERR_CAST(smap); - - smap->cache_idx = bpf_local_storage_cache_idx_get(&task_cache); - return &smap->map; + return bpf_local_storage_map_alloc(attr, &task_cache); } static void task_storage_map_free(struct bpf_map *map) { - struct bpf_local_storage_map *smap; - - smap = (struct bpf_local_storage_map *)map; - bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx); - bpf_local_storage_map_free(smap, &bpf_task_storage_busy); + bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy); } BTF_ID_LIST_SINGLE(task_storage_map_btf_ids, struct, bpf_local_storage_map) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 94374d529ea4..49884e7de080 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -48,10 +48,8 @@ static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map) /* Called by __sk_destruct() & bpf_sk_storage_clone() */ void bpf_sk_storage_free(struct sock *sk) { - struct bpf_local_storage_elem *selem; struct bpf_local_storage *sk_storage; bool free_sk_storage = false; - struct hlist_node *n; rcu_read_lock(); sk_storage = rcu_dereference(sk->sk_bpf_storage); @@ -60,24 +58,8 @@ void bpf_sk_storage_free(struct sock *sk) return; } - /* Netiher the bpf_prog nor the bpf-map's syscall - * could be modifying the sk_storage->list now. - * Thus, no elem can be added-to or deleted-from the - * sk_storage->list by the bpf_prog or by the bpf-map's syscall. - * - * It is racing with bpf_local_storage_map_free() alone - * when unlinking elem from the sk_storage->list and - * the map's bucket->list. - */ raw_spin_lock_bh(&sk_storage->lock); - hlist_for_each_entry_safe(selem, n, &sk_storage->list, snode) { - /* Always unlink from map before unlinking from - * sk_storage. - */ - bpf_selem_unlink_map(selem); - free_sk_storage = bpf_selem_unlink_storage_nolock( - sk_storage, selem, true, false); - } + free_sk_storage = bpf_local_storage_unlink_nolock(sk_storage); raw_spin_unlock_bh(&sk_storage->lock); rcu_read_unlock(); @@ -87,23 +69,12 @@ void bpf_sk_storage_free(struct sock *sk) static void bpf_sk_storage_map_free(struct bpf_map *map) { - struct bpf_local_storage_map *smap; - - smap = (struct bpf_local_storage_map *)map; - bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx); - bpf_local_storage_map_free(smap, NULL); + bpf_local_storage_map_free(map, &sk_cache, NULL); } static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) { - struct bpf_local_storage_map *smap; - - smap = bpf_local_storage_map_alloc(attr); - if (IS_ERR(smap)) - return ERR_CAST(smap); - - smap->cache_idx = bpf_local_storage_cache_idx_get(&sk_cache); - return &smap->map; + return bpf_local_storage_map_alloc(attr, &sk_cache); } static int notsupp_get_next_key(struct bpf_map *map, void *key, -- cgit v1.2.3-70-g09d2 From 982e2b7329fe5c5387453524f40eec706d2c60bf Mon Sep 17 00:00:00 2001 From: Stefan Schmidt Date: Wed, 26 Oct 2022 09:40:34 +0200 Subject: net: mac802154: Fixup function parameter name in docs The function parameter name was wrong in kdocs. net/mac802154/util.c:27: warning: Function parameter or member 'hw' not described in 'ieee802154_wake_queue' net/mac802154/util.c:27: warning: Excess function parameter 'local' description in 'ieee802154_wake_queue' net/mac802154/util.c:53: warning: Function parameter or member 'hw' not described in 'ieee802154_stop_queue' net/mac802154/util.c:53: warning: Excess function parameter 'local' description in 'ieee802154_stop_queue' Fixing name and description. Signed-off-by: Stefan Schmidt --- net/mac802154/util.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac802154/util.c b/net/mac802154/util.c index f08605f59b60..ebc9a8521765 100644 --- a/net/mac802154/util.c +++ b/net/mac802154/util.c @@ -15,7 +15,7 @@ const void *const mac802154_wpan_phy_privid = &mac802154_wpan_phy_privid; /** * ieee802154_wake_queue - wake ieee802154 queue - * @local: main mac object + * @hw: main hardware object * * Tranceivers usually have either one transmit framebuffer or one framebuffer * for both transmitting and receiving. Hence, the core currently only handles @@ -41,7 +41,7 @@ static void ieee802154_wake_queue(struct ieee802154_hw *hw) /** * ieee802154_stop_queue - stop ieee802154 queue - * @local: main mac object + * @hw: main hardware object * * Tranceivers usually have either one transmit framebuffer or one framebuffer * for both transmitting and receiving. Hence, the core currently only handles -- cgit v1.2.3-70-g09d2 From d1e96cc4fbe031c19d6fd9d8d2e63c03452fa290 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 25 Oct 2022 18:05:46 +0000 Subject: mptcp: fix tracking issue in mptcp_subflow_create_socket() My recent patch missed that mptcp_subflow_create_socket() was creating a 'kernel' socket, then converted it to 'user' socket. Fixes: 0cafd77dcd03 ("net: add a refcount tracker for kernel sockets") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Matthieu Baerts Reviewed-by: Kuniyuki Iwashima Reviewed-by: Mat Martineau Link: https://lore.kernel.org/r/20221025180546.652251-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 07dd23d0fe04..120f792fda97 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1595,7 +1595,9 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) /* kernel sockets do not by default acquire net ref, but TCP timer * needs it. + * Update ns_tracker to current stack trace and refcounted tracker. */ + __netns_tracker_free(net, &sf->sk->ns_tracker, false); sf->sk->sk_net_refcnt = 1; get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL); sock_inuse_add(net, 1); -- cgit v1.2.3-70-g09d2 From b65ef50e0647cb6d7317ee5122f461286bb7f8d5 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 24 Oct 2022 14:50:46 +0100 Subject: net/rds: remove variable total_copied Variable total_copied is just being incremented and it's never used anywhere else. The variable and the increment are redundant so remove it. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20221024135046.2159523-1-colin.i.king@gmail.com Signed-off-by: Jakub Kicinski --- net/rds/message.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/rds/message.c b/net/rds/message.c index 44dbc612ef54..b47e4f0a1639 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -366,7 +366,6 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter * struct scatterlist *sg; int ret = 0; int length = iov_iter_count(from); - int total_copied = 0; struct rds_msg_zcopy_info *info; rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from)); @@ -404,7 +403,6 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter * ret = -EFAULT; goto err; } - total_copied += copied; length -= copied; sg_set_page(sg, pages, copied, start); rm->data.op_nents++; -- cgit v1.2.3-70-g09d2 From 12d6c1d3a2ad0c199ec57c201cdc71e8e157a232 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 25 Oct 2022 15:39:35 -0700 Subject: skbuff: Proactively round up to kmalloc bucket size Instead of discovering the kmalloc bucket size _after_ allocation, round up proactively so the allocation is explicitly made for the full size, allowing the compiler to correctly reason about the resulting size of the buffer through the existing __alloc_size() hint. This will allow for kernels built with CONFIG_UBSAN_BOUNDS or the coming dynamic bounds checking under CONFIG_FORTIFY_SOURCE to gain back the __alloc_size() hints that were temporarily reverted in commit 93dd04ab0b2b ("slab: remove __alloc_size attribute from __kmalloc_track_caller") Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: netdev@vger.kernel.org Cc: Greg Kroah-Hartman Cc: Nick Desaulniers Cc: David Rientjes Acked-by: Vlastimil Babka Link: https://patchwork.kernel.org/project/netdevbpf/patch/20221021234713.you.031-kees@kernel.org/ Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221025223811.up.360-kees@kernel.org Signed-off-by: Paolo Abeni --- net/core/skbuff.c | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9b3b19816d2d..7c41025ae76a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -506,14 +506,14 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, */ size = SKB_DATA_ALIGN(size); size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc); + osize = kmalloc_size_roundup(size); + data = kmalloc_reserve(osize, gfp_mask, node, &pfmemalloc); if (unlikely(!data)) goto nodata; - /* kmalloc(size) might give us more room than requested. + /* kmalloc_size_roundup() might give us more room than requested. * Put skb_shared_info exactly at the end of allocated zone, * to allow max possible filling before reallocation. */ - osize = ksize(data); size = SKB_WITH_OVERHEAD(osize); prefetchw(data + size); @@ -1821,10 +1821,11 @@ EXPORT_SYMBOL(__pskb_copy_fclone); int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask) { - int i, osize = skb_end_offset(skb); - int size = osize + nhead + ntail; + unsigned int osize = skb_end_offset(skb); + unsigned int size = osize + nhead + ntail; long off; u8 *data; + int i; BUG_ON(nhead < 0); @@ -1832,15 +1833,16 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb_zcopy_downgrade_managed(skb); - size = SKB_DATA_ALIGN(size); - if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; - data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), - gfp_mask, NUMA_NO_NODE, NULL); + + size = SKB_DATA_ALIGN(size); + size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + size = kmalloc_size_roundup(size); + data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) goto nodata; - size = SKB_WITH_OVERHEAD(ksize(data)); + size = SKB_WITH_OVERHEAD(size); /* Copy only real data... and, alas, header. This should be * optimized for the cases when header is void. @@ -6174,21 +6176,20 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, const int headlen, gfp_t gfp_mask) { int i; - int size = skb_end_offset(skb); + unsigned int size = skb_end_offset(skb); int new_hlen = headlen - off; u8 *data; - size = SKB_DATA_ALIGN(size); - if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; - data = kmalloc_reserve(size + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), - gfp_mask, NUMA_NO_NODE, NULL); + + size = SKB_DATA_ALIGN(size); + size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + size = kmalloc_size_roundup(size); + data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) return -ENOMEM; - - size = SKB_WITH_OVERHEAD(ksize(data)); + size = SKB_WITH_OVERHEAD(size); /* Copy real data, and all frags */ skb_copy_from_linear_data_offset(skb, off, data, new_hlen); @@ -6293,22 +6294,21 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, int pos, gfp_t gfp_mask) { int i, k = 0; - int size = skb_end_offset(skb); + unsigned int size = skb_end_offset(skb); u8 *data; const int nfrags = skb_shinfo(skb)->nr_frags; struct skb_shared_info *shinfo; - size = SKB_DATA_ALIGN(size); - if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; - data = kmalloc_reserve(size + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), - gfp_mask, NUMA_NO_NODE, NULL); + + size = SKB_DATA_ALIGN(size); + size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + size = kmalloc_size_roundup(size); + data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) return -ENOMEM; - - size = SKB_WITH_OVERHEAD(ksize(data)); + size = SKB_WITH_OVERHEAD(size); memcpy((struct skb_shared_info *)(data + size), skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); -- cgit v1.2.3-70-g09d2 From bd456f283b66704920fae8e655ebc769cb743420 Mon Sep 17 00:00:00 2001 From: Mubashir Adnan Qureshi Date: Wed, 26 Oct 2022 13:51:11 +0000 Subject: tcp: add sysctls for TCP PLB parameters PLB (Protective Load Balancing) is a host based mechanism for load balancing across switch links. It leverages congestion signals(e.g. ECN) from transport layer to randomly change the path of the connection experiencing congestion. PLB changes the path of the connection by changing the outgoing IPv6 flow label for IPv6 connections (implemented in Linux by calling sk_rethink_txhash()). Because of this implementation mechanism, PLB can currently only work for IPv6 traffic. For more information, see the SIGCOMM 2022 paper: https://doi.org/10.1145/3544216.3544226 This commit adds new sysctl knobs and sets their default values for TCP PLB. Signed-off-by: Mubashir Adnan Qureshi Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.rst | 75 ++++++++++++++++++++++++++++++++++ include/net/netns/ipv4.h | 5 +++ net/ipv4/sysctl_net_ipv4.c | 43 +++++++++++++++++++ net/ipv4/tcp_ipv4.c | 8 ++++ 4 files changed, 131 insertions(+) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index e7b3fa7bb3f7..815efc89ad73 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1069,6 +1069,81 @@ tcp_child_ehash_entries - INTEGER Default: 0 +tcp_plb_enabled - BOOLEAN + If set and the underlying congestion control (e.g. DCTCP) supports + and enables PLB feature, TCP PLB (Protective Load Balancing) is + enabled. PLB is described in the following paper: + https://doi.org/10.1145/3544216.3544226. Based on PLB parameters, + upon sensing sustained congestion, TCP triggers a change in + flow label field for outgoing IPv6 packets. A change in flow label + field potentially changes the path of outgoing packets for switches + that use ECMP/WCMP for routing. + + PLB changes socket txhash which results in a change in IPv6 Flow Label + field, and currently no-op for IPv4 headers. It is possible + to apply PLB for IPv4 with other network header fields (e.g. TCP + or IPv4 options) or using encapsulation where outer header is used + by switches to determine next hop. In either case, further host + and switch side changes will be needed. + + When set, PLB assumes that congestion signal (e.g. ECN) is made + available and used by congestion control module to estimate a + congestion measure (e.g. ce_ratio). PLB needs a congestion measure to + make repathing decisions. + + Default: FALSE + +tcp_plb_idle_rehash_rounds - INTEGER + Number of consecutive congested rounds (RTT) seen after which + a rehash can be performed, given there are no packets in flight. + This is referred to as M in PLB paper: + https://doi.org/10.1145/3544216.3544226. + + Possible Values: 0 - 31 + + Default: 3 + +tcp_plb_rehash_rounds - INTEGER + Number of consecutive congested rounds (RTT) seen after which + a forced rehash can be performed. Be careful when setting this + parameter, as a small value increases the risk of retransmissions. + This is referred to as N in PLB paper: + https://doi.org/10.1145/3544216.3544226. + + Possible Values: 0 - 31 + + Default: 12 + +tcp_plb_suspend_rto_sec - INTEGER + Time, in seconds, to suspend PLB in event of an RTO. In order to avoid + having PLB repath onto a connectivity "black hole", after an RTO a TCP + connection suspends PLB repathing for a random duration between 1x and + 2x of this parameter. Randomness is added to avoid concurrent rehashing + of multiple TCP connections. This should be set corresponding to the + amount of time it takes to repair a failed link. + + Possible Values: 0 - 255 + + Default: 60 + +tcp_plb_cong_thresh - INTEGER + Fraction of packets marked with congestion over a round (RTT) to + tag that round as congested. This is referred to as K in the PLB paper: + https://doi.org/10.1145/3544216.3544226. + + The 0-1 fraction range is mapped to 0-256 range to avoid floating + point operations. For example, 128 means that if at least 50% of + the packets in a round were marked as congested then the round + will be tagged as congested. + + Setting threshold to 0 means that PLB repaths every RTT regardless + of congestion. This is not intended behavior for PLB and should be + used only for experimentation purpose. + + Possible Values: 0 - 256 + + Default: 128 + UDP variables ============= diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 1b8004679445..25f90bba4889 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -183,6 +183,11 @@ struct netns_ipv4 { unsigned long tfo_active_disable_stamp; u32 tcp_challenge_timestamp; u32 tcp_challenge_count; + u8 sysctl_tcp_plb_enabled; + u8 sysctl_tcp_plb_idle_rehash_rounds; + u8 sysctl_tcp_plb_rehash_rounds; + u8 sysctl_tcp_plb_suspend_rto_sec; + int sysctl_tcp_plb_cong_thresh; int sysctl_udp_wmem_min; int sysctl_udp_rmem_min; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 9b8a6db7a66b..0af28cedd071 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -40,6 +40,8 @@ static int one_day_secs = 24 * 3600; static u32 fib_multipath_hash_fields_all_mask __maybe_unused = FIB_MULTIPATH_HASH_FIELD_ALL_MASK; static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024; +static int tcp_plb_max_rounds = 31; +static int tcp_plb_max_cong_thresh = 256; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -1384,6 +1386,47 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, + { + .procname = "tcp_plb_enabled", + .data = &init_net.ipv4.sysctl_tcp_plb_enabled, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "tcp_plb_idle_rehash_rounds", + .data = &init_net.ipv4.sysctl_tcp_plb_idle_rehash_rounds, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra2 = &tcp_plb_max_rounds, + }, + { + .procname = "tcp_plb_rehash_rounds", + .data = &init_net.ipv4.sysctl_tcp_plb_rehash_rounds, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra2 = &tcp_plb_max_rounds, + }, + { + .procname = "tcp_plb_suspend_rto_sec", + .data = &init_net.ipv4.sysctl_tcp_plb_suspend_rto_sec, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + }, + { + .procname = "tcp_plb_cong_thresh", + .data = &init_net.ipv4.sysctl_tcp_plb_cong_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &tcp_plb_max_cong_thresh, + }, { } }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 87d440f47a70..58b838b56c7f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3218,6 +3218,14 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; atomic_set(&net->ipv4.tfo_active_disable_times, 0); + /* Set default values for PLB */ + net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ + net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; + net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; + net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; + /* Default congestion threshold for PLB to mark a round is 50% */ + net->ipv4.sysctl_tcp_plb_cong_thresh = 128; + /* Reno is always built in */ if (!net_eq(net, &init_net) && bpf_try_module_get(init_net.ipv4.tcp_congestion_control, -- cgit v1.2.3-70-g09d2 From 1a91bb7c3ebf95e908ec33220defbcda1ecc072f Mon Sep 17 00:00:00 2001 From: Mubashir Adnan Qureshi Date: Wed, 26 Oct 2022 13:51:12 +0000 Subject: tcp: add PLB functionality for TCP Congestion control algorithms track PLB state and cause the connection to trigger a path change when either of the 2 conditions is satisfied: - No packets are in flight and (# consecutive congested rounds >= sysctl_tcp_plb_idle_rehash_rounds) - (# consecutive congested rounds >= sysctl_tcp_plb_rehash_rounds) A round (RTT) is marked as congested when congestion signal (ECN ce_ratio) over an RTT is greater than sysctl_tcp_plb_cong_thresh. In the event of RTO, PLB (via tcp_write_timeout()) triggers a path change and disables congestion-triggered path changes for random time between (sysctl_tcp_plb_suspend_rto_sec, 2*sysctl_tcp_plb_suspend_rto_sec) to avoid hopping onto the "connectivity blackhole". RTO-triggered path changes can still happen during this cool-off period. Signed-off-by: Mubashir Adnan Qureshi Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 28 ++++++++++++++ net/ipv4/Makefile | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_plb.c | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 137 insertions(+), 2 deletions(-) create mode 100644 net/ipv4/tcp_plb.c (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 14d45661a84d..6b814e788f00 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2140,6 +2140,34 @@ extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, extern void tcp_rack_reo_timeout(struct sock *sk); extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs); +/* tcp_plb.c */ + +/* + * Scaling factor for fractions in PLB. For example, tcp_plb_update_state + * expects cong_ratio which represents fraction of traffic that experienced + * congestion over a single RTT. In order to avoid floating point operations, + * this fraction should be mapped to (1 << TCP_PLB_SCALE) and passed in. + */ +#define TCP_PLB_SCALE 8 + +/* State for PLB (Protective Load Balancing) for a single TCP connection. */ +struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +}; + +static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +{ + plb->consec_cong_rounds = 0; + plb->pause_until = 0; +} +void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, + const int cong_ratio); +void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb); +void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb); + /* At how many usecs into the future should the RTO fire? */ static inline s64 tcp_rto_delta_us(const struct sock *sk) { diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index bbdd9c44f14e..af7d2cf490fb 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -10,7 +10,7 @@ obj-y := route.o inetpeer.o protocol.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ tcp_rate.o tcp_recovery.o tcp_ulp.o \ - tcp_offload.o datagram.o raw.o udp.o udplite.o \ + tcp_offload.o tcp_plb.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 58b838b56c7f..ebab9e8b184c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3224,7 +3224,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; /* Default congestion threshold for PLB to mark a round is 50% */ - net->ipv4.sysctl_tcp_plb_cong_thresh = 128; + net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; /* Reno is always built in */ if (!net_eq(net, &init_net) && diff --git a/net/ipv4/tcp_plb.c b/net/ipv4/tcp_plb.c new file mode 100644 index 000000000000..f4ced370acad --- /dev/null +++ b/net/ipv4/tcp_plb.c @@ -0,0 +1,107 @@ +/* Protective Load Balancing (PLB) + * + * PLB was designed to reduce link load imbalance across datacenter + * switches. PLB is a host-based optimization; it leverages congestion + * signals from the transport layer to randomly change the path of the + * connection experiencing sustained congestion. PLB prefers to repath + * after idle periods to minimize packet reordering. It repaths by + * changing the IPv6 Flow Label on the packets of a connection, which + * datacenter switches include as part of ECMP/WCMP hashing. + * + * PLB is described in detail in: + * + * Mubashir Adnan Qureshi, Yuchung Cheng, Qianwen Yin, Qiaobin Fu, + * Gautam Kumar, Masoud Moshref, Junhua Yan, Van Jacobson, + * David Wetherall,Abdul Kabbani: + * "PLB: Congestion Signals are Simple and Effective for + * Network Load Balancing" + * In ACM SIGCOMM 2022, Amsterdam Netherlands. + * + */ + +#include + +/* Called once per round-trip to update PLB state for a connection. */ +void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, + const int cong_ratio) +{ + struct net *net = sock_net(sk); + + if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)) + return; + + if (cong_ratio >= 0) { + if (cong_ratio < READ_ONCE(net->ipv4.sysctl_tcp_plb_cong_thresh)) + plb->consec_cong_rounds = 0; + else if (plb->consec_cong_rounds < + READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds)) + plb->consec_cong_rounds++; + } +} +EXPORT_SYMBOL_GPL(tcp_plb_update_state); + +/* Check whether recent congestion has been persistent enough to warrant + * a load balancing decision that switches the connection to another path. + */ +void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb) +{ + struct net *net = sock_net(sk); + u32 max_suspend; + bool forced_rehash = false, idle_rehash = false; + + if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)) + return; + + forced_rehash = plb->consec_cong_rounds >= + READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds); + /* If sender goes idle then we check whether to rehash. */ + idle_rehash = READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds) && + !tcp_sk(sk)->packets_out && + plb->consec_cong_rounds >= + READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds); + + if (!forced_rehash && !idle_rehash) + return; + + /* Note that tcp_jiffies32 can wrap; we detect wraps by checking for + * cases where the max suspension end is before the actual suspension + * end. We clear pause_until to 0 to indicate there is no recent + * RTO event that constrains PLB rehashing. + */ + max_suspend = 2 * READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ; + if (plb->pause_until && + (!before(tcp_jiffies32, plb->pause_until) || + before(tcp_jiffies32 + max_suspend, plb->pause_until))) + plb->pause_until = 0; + + if (plb->pause_until) + return; + + sk_rethink_txhash(sk); + plb->consec_cong_rounds = 0; +} +EXPORT_SYMBOL_GPL(tcp_plb_check_rehash); + +/* Upon RTO, disallow load balancing for a while, to avoid having load + * balancing decisions switch traffic to a black-holed path that was + * previously avoided with a sk_rethink_txhash() call at RTO time. + */ +void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb) +{ + struct net *net = sock_net(sk); + u32 pause; + + if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)) + return; + + pause = READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ; + pause += prandom_u32_max(pause); + plb->pause_until = tcp_jiffies32 + pause; + + /* Reset PLB state upon RTO, since an RTO causes a sk_rethink_txhash() call + * that may switch this connection to a path with completely different + * congestion characteristics. + */ + plb->consec_cong_rounds = 0; +} +EXPORT_SYMBOL_GPL(tcp_plb_update_state_upon_rto); -- cgit v1.2.3-70-g09d2 From c30f8e0b048087ffbf20e51a4be160f7ec279ff6 Mon Sep 17 00:00:00 2001 From: Mubashir Adnan Qureshi Date: Wed, 26 Oct 2022 13:51:13 +0000 Subject: tcp: add support for PLB in DCTCP PLB support is added to TCP DCTCP code. As DCTCP uses ECN as the congestion signal, PLB also uses ECN to make decisions whether to change the path or not upon sustained congestion. Signed-off-by: Mubashir Adnan Qureshi Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_dctcp.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 2a6c0dd665a4..e0a2ca7456ff 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -54,6 +54,7 @@ struct dctcp { u32 next_seq; u32 ce_state; u32 loss_cwnd; + struct tcp_plb_state plb; }; static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */ @@ -91,6 +92,8 @@ static void dctcp_init(struct sock *sk) ca->ce_state = 0; dctcp_reset(tp, ca); + tcp_plb_init(sk, &ca->plb); + return; } @@ -117,14 +120,28 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags) /* Expired RTT */ if (!before(tp->snd_una, ca->next_seq)) { + u32 delivered = tp->delivered - ca->old_delivered; u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce; u32 alpha = ca->dctcp_alpha; + u32 ce_ratio = 0; + + if (delivered > 0) { + /* dctcp_alpha keeps EWMA of fraction of ECN marked + * packets. Because of EWMA smoothing, PLB reaction can + * be slow so we use ce_ratio which is an instantaneous + * measure of congestion. ce_ratio is the fraction of + * ECN marked packets in the previous RTT. + */ + if (delivered_ce > 0) + ce_ratio = (delivered_ce << TCP_PLB_SCALE) / delivered; + tcp_plb_update_state(sk, &ca->plb, (int)ce_ratio); + tcp_plb_check_rehash(sk, &ca->plb); + } /* alpha = (1 - g) * alpha + g * F */ alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g); if (delivered_ce) { - u32 delivered = tp->delivered - ca->old_delivered; /* If dctcp_shift_g == 1, a 32bit value would overflow * after 8 M packets. @@ -172,8 +189,12 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state); break; case CA_EVENT_LOSS: + tcp_plb_update_state_upon_rto(sk, &ca->plb); dctcp_react_to_loss(sk); break; + case CA_EVENT_TX_START: + tcp_plb_check_rehash(sk, &ca->plb); /* Maybe rehash when inflight is 0 */ + break; default: /* Don't care for the rest. */ break; -- cgit v1.2.3-70-g09d2 From 29c1c44646aec5d5134f2365259a84becc1ee7d3 Mon Sep 17 00:00:00 2001 From: Mubashir Adnan Qureshi Date: Wed, 26 Oct 2022 13:51:14 +0000 Subject: tcp: add u32 counter in tcp_sock and an SNMP counter for PLB A u32 counter is added to tcp_sock for counting the number of PLB triggered rehashes for a TCP connection. An SNMP counter is also added to count overall PLB triggered rehash events for a host. These counters are hooked up to PLB implementation for DCTCP. TCP_NLA_REHASH is added to SCM_TIMESTAMPING_OPT_STATS that reports the rehash attempts triggered due to PLB or timeouts. This gives a historical view of sustained congestion or timeouts experienced by the TCP connection. Signed-off-by: Mubashir Adnan Qureshi Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + include/uapi/linux/snmp.h | 1 + include/uapi/linux/tcp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/tcp.c | 3 +++ net/ipv4/tcp_plb.c | 2 ++ 6 files changed, 9 insertions(+) (limited to 'net') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 41b1da621a45..ca7f05a130d2 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -423,6 +423,7 @@ struct tcp_sock { u32 probe_seq_start; u32 probe_seq_end; } mtu_probe; + u32 plb_rehash; /* PLB-triggered rehash attempts */ u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG * while socket was owned by user. */ diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 4d7470036a8b..6600cb0164c2 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -292,6 +292,7 @@ enum LINUX_MIB_TCPDSACKIGNOREDDUBIOUS, /* TCPDSACKIgnoredDubious */ LINUX_MIB_TCPMIGRATEREQSUCCESS, /* TCPMigrateReqSuccess */ LINUX_MIB_TCPMIGRATEREQFAILURE, /* TCPMigrateReqFailure */ + LINUX_MIB_TCPPLBREHASH, /* TCPPLBRehash */ __LINUX_MIB_MAX }; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 8fc09e8638b3..c9abe86eda5f 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -315,6 +315,7 @@ enum { TCP_NLA_BYTES_NOTSENT, /* Bytes in write queue not yet sent */ TCP_NLA_EDT, /* Earliest departure time (CLOCK_MONOTONIC) */ TCP_NLA_TTL, /* TTL or hop limit of a packet received */ + TCP_NLA_REHASH, /* PLB and timeout triggered rehash attempts */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 5386f460bd20..f88daace9de3 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -297,6 +297,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS), SNMP_MIB_ITEM("TCPMigrateReqSuccess", LINUX_MIB_TCPMIGRATEREQSUCCESS), SNMP_MIB_ITEM("TCPMigrateReqFailure", LINUX_MIB_TCPMIGRATEREQFAILURE), + SNMP_MIB_ITEM("TCPPLBRehash", LINUX_MIB_TCPPLBREHASH), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ef14efa1fb70..1da7c53b6cb5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3176,6 +3176,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->sacked_out = 0; tp->tlp_high_seq = 0; tp->last_oow_ack_time = 0; + tp->plb_rehash = 0; /* There's a bubble in the pipe until at least the first ACK. */ tp->app_limited = ~0U; tp->rack.mstamp = 0; @@ -3973,6 +3974,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_REHASH */ 0; } @@ -4049,6 +4051,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, nla_put_u8(stats, TCP_NLA_TTL, tcp_skb_ttl_or_hop_limit(ack_skb)); + nla_put_u32(stats, TCP_NLA_REHASH, tp->plb_rehash + tp->timeout_rehash); return stats; } diff --git a/net/ipv4/tcp_plb.c b/net/ipv4/tcp_plb.c index f4ced370acad..bb1a08fda113 100644 --- a/net/ipv4/tcp_plb.c +++ b/net/ipv4/tcp_plb.c @@ -79,6 +79,8 @@ void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb) sk_rethink_txhash(sk); plb->consec_cong_rounds = 0; + tcp_sk(sk)->plb_rehash++; + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPLBREHASH); } EXPORT_SYMBOL_GPL(tcp_plb_check_rehash); -- cgit v1.2.3-70-g09d2 From 71fc704768f601ed3fa36310822a5e03f310f781 Mon Sep 17 00:00:00 2001 From: Mubashir Adnan Qureshi Date: Wed, 26 Oct 2022 13:51:15 +0000 Subject: tcp: add rcv_wnd and plb_rehash to TCP_INFO rcv_wnd can be useful to diagnose TCP performance where receiver window becomes the bottleneck. rehash reports the PLB and timeout triggered rehash attempts by the TCP connection. Signed-off-by: Mubashir Adnan Qureshi Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 5 +++++ net/ipv4/tcp.c | 2 ++ 2 files changed, 7 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index c9abe86eda5f..879eeb0a084b 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -284,6 +284,11 @@ struct tcp_info { __u32 tcpi_snd_wnd; /* peer's advertised receive window after * scaling (bytes) */ + __u32 tcpi_rcv_wnd; /* local advertised receive window after + * scaling (bytes) + */ + + __u32 tcpi_rehash; /* PLB or timeout triggered rehash attempts */ }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1da7c53b6cb5..de8f0cd7cb32 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3940,6 +3940,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_reord_seen = tp->reord_seen; info->tcpi_rcv_ooopack = tp->rcv_ooopack; info->tcpi_snd_wnd = tp->snd_wnd; + info->tcpi_rcv_wnd = tp->rcv_wnd; + info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash; info->tcpi_fastopen_client_fail = tp->fastopen_client_fail; unlock_sock_fast(sk, slow); } -- cgit v1.2.3-70-g09d2 From d120d1a63b2c484d6175873d8ee736a633f74b70 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Oct 2022 15:22:15 +0200 Subject: net: Remove the obsolte u64_stats_fetch_*_irq() users (net). Now that the 32bit UP oddity is gone and 32bit uses always a sequence count, there is no need for the fetch_irq() variants anymore. Convert to the regular interface. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Acked-by: Peter Zijlstra (Intel) Signed-off-by: Jakub Kicinski --- net/8021q/vlan_dev.c | 4 ++-- net/bridge/br_multicast.c | 4 ++-- net/bridge/br_vlan.c | 4 ++-- net/core/dev.c | 4 ++-- net/core/devlink.c | 4 ++-- net/core/drop_monitor.c | 8 ++++---- net/core/gen_stats.c | 16 ++++++++-------- net/dsa/slave.c | 4 ++-- net/ipv4/af_inet.c | 4 ++-- net/ipv6/seg6_local.c | 4 ++-- net/mac80211/sta_info.c | 8 ++++---- net/mpls/af_mpls.c | 4 ++-- net/netfilter/ipvs/ip_vs_ctl.c | 4 ++-- net/netfilter/nf_tables_api.c | 4 ++-- net/openvswitch/datapath.c | 4 ++-- net/openvswitch/flow_table.c | 9 ++++----- 16 files changed, 44 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index e1bb41a443c4..296d0145932f 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -712,13 +712,13 @@ static void vlan_dev_get_stats64(struct net_device *dev, p = per_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats, i); do { - start = u64_stats_fetch_begin_irq(&p->syncp); + start = u64_stats_fetch_begin(&p->syncp); rxpackets = u64_stats_read(&p->rx_packets); rxbytes = u64_stats_read(&p->rx_bytes); rxmulticast = u64_stats_read(&p->rx_multicast); txpackets = u64_stats_read(&p->tx_packets); txbytes = u64_stats_read(&p->tx_bytes); - } while (u64_stats_fetch_retry_irq(&p->syncp, start)); + } while (u64_stats_fetch_retry(&p->syncp, start)); stats->rx_packets += rxpackets; stats->rx_bytes += rxbytes; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 09140bc8c15e..5e988f0ed2c0 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -4899,9 +4899,9 @@ void br_multicast_get_stats(const struct net_bridge *br, unsigned int start; do { - start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + start = u64_stats_fetch_begin(&cpu_stats->syncp); memcpy(&temp, &cpu_stats->mstats, sizeof(temp)); - } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); + } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); mcast_stats_add_dir(tdst.igmp_v1queries, temp.igmp_v1queries); mcast_stats_add_dir(tdst.igmp_v2queries, temp.igmp_v2queries); diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 6e53dc991409..f2fc284abab3 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1378,12 +1378,12 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v, cpu_stats = per_cpu_ptr(v->stats, i); do { - start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + start = u64_stats_fetch_begin(&cpu_stats->syncp); rxpackets = u64_stats_read(&cpu_stats->rx_packets); rxbytes = u64_stats_read(&cpu_stats->rx_bytes); txbytes = u64_stats_read(&cpu_stats->tx_bytes); txpackets = u64_stats_read(&cpu_stats->tx_packets); - } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); + } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); u64_stats_add(&stats->rx_packets, rxpackets); u64_stats_add(&stats->rx_bytes, rxbytes); diff --git a/net/core/dev.c b/net/core/dev.c index fff62068a53d..cfb68db040a4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10477,12 +10477,12 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, stats = per_cpu_ptr(netstats, cpu); do { - start = u64_stats_fetch_begin_irq(&stats->syncp); + start = u64_stats_fetch_begin(&stats->syncp); rx_packets = u64_stats_read(&stats->rx_packets); rx_bytes = u64_stats_read(&stats->rx_bytes); tx_packets = u64_stats_read(&stats->tx_packets); tx_bytes = u64_stats_read(&stats->tx_bytes); - } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); + } while (u64_stats_fetch_retry(&stats->syncp, start)); s->rx_packets += rx_packets; s->rx_bytes += rx_bytes; diff --git a/net/core/devlink.c b/net/core/devlink.c index 89baa7c0938b..0a16ad45520e 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8304,10 +8304,10 @@ static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats, cpu_stats = per_cpu_ptr(trap_stats, i); do { - start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + start = u64_stats_fetch_begin(&cpu_stats->syncp); rx_packets = u64_stats_read(&cpu_stats->rx_packets); rx_bytes = u64_stats_read(&cpu_stats->rx_bytes); - } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); + } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); u64_stats_add(&stats->rx_packets, rx_packets); u64_stats_add(&stats->rx_bytes, rx_bytes); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index f084a4a6b7ab..11aa6e8a3098 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -1432,9 +1432,9 @@ static void net_dm_stats_read(struct net_dm_stats *stats) u64 dropped; do { - start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + start = u64_stats_fetch_begin(&cpu_stats->syncp); dropped = u64_stats_read(&cpu_stats->dropped); - } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); + } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); u64_stats_add(&stats->dropped, dropped); } @@ -1476,9 +1476,9 @@ static void net_dm_hw_stats_read(struct net_dm_stats *stats) u64 dropped; do { - start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + start = u64_stats_fetch_begin(&cpu_stats->syncp); dropped = u64_stats_read(&cpu_stats->dropped); - } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); + } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); u64_stats_add(&stats->dropped, dropped); } diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index c8d137ef5980..b71ccaec0991 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -135,10 +135,10 @@ static void gnet_stats_add_basic_cpu(struct gnet_stats_basic_sync *bstats, u64 bytes, packets; do { - start = u64_stats_fetch_begin_irq(&bcpu->syncp); + start = u64_stats_fetch_begin(&bcpu->syncp); bytes = u64_stats_read(&bcpu->bytes); packets = u64_stats_read(&bcpu->packets); - } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start)); + } while (u64_stats_fetch_retry(&bcpu->syncp, start)); t_bytes += bytes; t_packets += packets; @@ -162,10 +162,10 @@ void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats, } do { if (running) - start = u64_stats_fetch_begin_irq(&b->syncp); + start = u64_stats_fetch_begin(&b->syncp); bytes = u64_stats_read(&b->bytes); packets = u64_stats_read(&b->packets); - } while (running && u64_stats_fetch_retry_irq(&b->syncp, start)); + } while (running && u64_stats_fetch_retry(&b->syncp, start)); _bstats_update(bstats, bytes, packets); } @@ -187,10 +187,10 @@ static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets, u64 bytes, packets; do { - start = u64_stats_fetch_begin_irq(&bcpu->syncp); + start = u64_stats_fetch_begin(&bcpu->syncp); bytes = u64_stats_read(&bcpu->bytes); packets = u64_stats_read(&bcpu->packets); - } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start)); + } while (u64_stats_fetch_retry(&bcpu->syncp, start)); t_bytes += bytes; t_packets += packets; @@ -201,10 +201,10 @@ static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets, } do { if (running) - start = u64_stats_fetch_begin_irq(&b->syncp); + start = u64_stats_fetch_begin(&b->syncp); *ret_bytes = u64_stats_read(&b->bytes); *ret_packets = u64_stats_read(&b->packets); - } while (running && u64_stats_fetch_retry_irq(&b->syncp, start)); + } while (running && u64_stats_fetch_retry(&b->syncp, start)); } static int diff --git a/net/dsa/slave.c b/net/dsa/slave.c index a9fde48cffd4..83e419afa89e 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -976,12 +976,12 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev, s = per_cpu_ptr(dev->tstats, i); do { - start = u64_stats_fetch_begin_irq(&s->syncp); + start = u64_stats_fetch_begin(&s->syncp); tx_packets = u64_stats_read(&s->tx_packets); tx_bytes = u64_stats_read(&s->tx_bytes); rx_packets = u64_stats_read(&s->rx_packets); rx_bytes = u64_stats_read(&s->rx_bytes); - } while (u64_stats_fetch_retry_irq(&s->syncp, start)); + } while (u64_stats_fetch_retry(&s->syncp, start)); data[0] += tx_packets; data[1] += tx_bytes; data[2] += rx_packets; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 3dd02396517d..585f13b6fef6 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1706,9 +1706,9 @@ u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt, bhptr = per_cpu_ptr(mib, cpu); syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); do { - start = u64_stats_fetch_begin_irq(syncp); + start = u64_stats_fetch_begin(syncp); v = *(((u64 *)bhptr) + offt); - } while (u64_stats_fetch_retry_irq(syncp, start)); + } while (u64_stats_fetch_retry(syncp, start)); return v; } diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 8370726ae7bf..487f8e98deaa 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -1644,13 +1644,13 @@ static int put_nla_counters(struct sk_buff *skb, struct seg6_local_lwt *slwt) pcounters = per_cpu_ptr(slwt->pcpu_counters, i); do { - start = u64_stats_fetch_begin_irq(&pcounters->syncp); + start = u64_stats_fetch_begin(&pcounters->syncp); packets = u64_stats_read(&pcounters->packets); bytes = u64_stats_read(&pcounters->bytes); errors = u64_stats_read(&pcounters->errors); - } while (u64_stats_fetch_retry_irq(&pcounters->syncp, start)); + } while (u64_stats_fetch_retry(&pcounters->syncp, start)); counters.packets += packets; counters.bytes += bytes; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 6e0fd82855ae..04e0f132b1d9 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2427,9 +2427,9 @@ static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats, u64 value; do { - start = u64_stats_fetch_begin_irq(&rxstats->syncp); + start = u64_stats_fetch_begin(&rxstats->syncp); value = rxstats->msdu[tid]; - } while (u64_stats_fetch_retry_irq(&rxstats->syncp, start)); + } while (u64_stats_fetch_retry(&rxstats->syncp, start)); return value; } @@ -2495,9 +2495,9 @@ static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats) u64 value; do { - start = u64_stats_fetch_begin_irq(&rxstats->syncp); + start = u64_stats_fetch_begin(&rxstats->syncp); value = rxstats->bytes; - } while (u64_stats_fetch_retry_irq(&rxstats->syncp, start)); + } while (u64_stats_fetch_retry(&rxstats->syncp, start)); return value; } diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index b52afe316dc4..35b5f806fdda 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1079,9 +1079,9 @@ static void mpls_get_stats(struct mpls_dev *mdev, p = per_cpu_ptr(mdev->stats, i); do { - start = u64_stats_fetch_begin_irq(&p->syncp); + start = u64_stats_fetch_begin(&p->syncp); local = p->stats; - } while (u64_stats_fetch_retry_irq(&p->syncp, start)); + } while (u64_stats_fetch_retry(&p->syncp, start)); stats->rx_packets += local.rx_packets; stats->rx_bytes += local.rx_bytes; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 988222fff9f0..4d62059a6021 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2296,13 +2296,13 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) u64 conns, inpkts, outpkts, inbytes, outbytes; do { - start = u64_stats_fetch_begin_irq(&u->syncp); + start = u64_stats_fetch_begin(&u->syncp); conns = u->cnt.conns; inpkts = u->cnt.inpkts; outpkts = u->cnt.outpkts; inbytes = u->cnt.inbytes; outbytes = u->cnt.outbytes; - } while (u64_stats_fetch_retry_irq(&u->syncp, start)); + } while (u64_stats_fetch_retry(&u->syncp, start)); seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n", i, (u64)conns, (u64)inpkts, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6b79f5e18f08..62da204eed41 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1534,10 +1534,10 @@ static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) for_each_possible_cpu(cpu) { cpu_stats = per_cpu_ptr(stats, cpu); do { - seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + seq = u64_stats_fetch_begin(&cpu_stats->syncp); pkts = cpu_stats->pkts; bytes = cpu_stats->bytes; - } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq)); + } while (u64_stats_fetch_retry(&cpu_stats->syncp, seq)); total.pkts += pkts; total.bytes += bytes; } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 155263e73512..f95b716ea96d 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -716,9 +716,9 @@ static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, percpu_stats = per_cpu_ptr(dp->stats_percpu, i); do { - start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); + start = u64_stats_fetch_begin(&percpu_stats->syncp); local_stats = *percpu_stats; - } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); + } while (u64_stats_fetch_retry(&percpu_stats->syncp, start)); stats->n_hit += local_stats.n_hit; stats->n_missed += local_stats.n_missed; diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index d4a2db0b2299..0a0e4c283f02 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -205,9 +205,9 @@ static void tbl_mask_array_reset_counters(struct mask_array *ma) stats = per_cpu_ptr(ma->masks_usage_stats, cpu); do { - start = u64_stats_fetch_begin_irq(&stats->syncp); + start = u64_stats_fetch_begin(&stats->syncp); counter = stats->usage_cntrs[i]; - } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); + } while (u64_stats_fetch_retry(&stats->syncp, start)); ma->masks_usage_zero_cntr[i] += counter; } @@ -1136,10 +1136,9 @@ void ovs_flow_masks_rebalance(struct flow_table *table) stats = per_cpu_ptr(ma->masks_usage_stats, cpu); do { - start = u64_stats_fetch_begin_irq(&stats->syncp); + start = u64_stats_fetch_begin(&stats->syncp); counter = stats->usage_cntrs[i]; - } while (u64_stats_fetch_retry_irq(&stats->syncp, - start)); + } while (u64_stats_fetch_retry(&stats->syncp, start)); masks_and_count[i].counter += counter; } -- cgit v1.2.3-70-g09d2 From 58ba426388d9fe56aa638f555b01d6e63cada88c Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 27 Oct 2022 17:10:14 -0400 Subject: net/packet: add PACKET_FANOUT_FLAG_IGNORE_OUTGOING Extend packet socket option PACKET_IGNORE_OUTGOING to fanout groups. The socket option sets ptype.ignore_outgoing, which makes dev_queue_xmit_nit skip the socket. When the socket joins a fanout group, the option is not reflected in the struct ptype of the group. dev_queue_xmit_nit only tests the fanout ptype, so the flag is ignored once a socket joins a fanout group. Inheriting the option from a socket would change established behavior. Different sockets in the group can set different flags, and can also change them at runtime. Testing in packet_rcv_fanout defeats the purpose of the original patch, which is to avoid skb_clone in dev_queue_xmit_nit (esp. for MSG_ZEROCOPY packets). Instead, introduce a new fanout group flag with the same behavior. Tested with https://github.com/wdebruij/kerneltools/blob/master/tests/test_psock_fanout_ignore_outgoing.c Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20221027211014.3581513-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_packet.h | 1 + net/packet/af_packet.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index c07caf7b40db..a8516b3594a4 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -70,6 +70,7 @@ struct sockaddr_ll { #define PACKET_FANOUT_EBPF 7 #define PACKET_FANOUT_FLAG_ROLLOVER 0x1000 #define PACKET_FANOUT_FLAG_UNIQUEID 0x2000 +#define PACKET_FANOUT_FLAG_IGNORE_OUTGOING 0x4000 #define PACKET_FANOUT_FLAG_DEFRAG 0x8000 struct tpacket_stats { diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8c5b3da0c29f..44f20cf8a0c0 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1777,6 +1777,7 @@ static int fanout_add(struct sock *sk, struct fanout_args *args) match->prot_hook.af_packet_net = read_pnet(&match->net); match->prot_hook.id_match = match_fanout_group; match->max_num_members = args->max_num_members; + match->prot_hook.ignore_outgoing = type_flags & PACKET_FANOUT_FLAG_IGNORE_OUTGOING; list_add(&match->list, &fanout_list); } err = -EINVAL; -- cgit v1.2.3-70-g09d2 From 738136a0e3757a8534df3ad97d6ff6d7f429f6c1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 27 Oct 2022 14:25:53 -0700 Subject: netlink: split up copies in the ack construction Clean up the use of unsafe_memcpy() by adding a flexible array at the end of netlink message header and splitting up the header and data copies. Reviewed-by: Kees Cook Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/netlink.h | 21 +++++++++++++++++++++ include/uapi/linux/netlink.h | 2 ++ net/netlink/af_netlink.c | 29 ++++++++++++++++++++--------- 3 files changed, 43 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/netlink.h b/include/net/netlink.h index 4418b1981e31..784b4688fc6f 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -931,6 +931,27 @@ static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 se return __nlmsg_put(skb, portid, seq, type, payload, flags); } +/** + * nlmsg_append - Add more data to a nlmsg in a skb + * @skb: socket buffer to store message in + * @size: length of message payload + * + * Append data to an existing nlmsg, used when constructing a message + * with multiple fixed-format headers (which is rare). + * Returns NULL if the tailroom of the skb is insufficient to store + * the extra payload. + */ +static inline void *nlmsg_append(struct sk_buff *skb, u32 size) +{ + if (unlikely(skb_tailroom(skb) < NLMSG_ALIGN(size))) + return NULL; + + if (NLMSG_ALIGN(size) - size) + memset(skb_tail_pointer(skb) + size, 0, + NLMSG_ALIGN(size) - size); + return __skb_put(skb, NLMSG_ALIGN(size)); +} + /** * nlmsg_put_answer - Add a new callback based netlink message to an skb * @skb: socket buffer to store message in diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index e2ae82e3f9f7..5da0da59bf01 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -48,6 +48,7 @@ struct sockaddr_nl { * @nlmsg_flags: Additional flags * @nlmsg_seq: Sequence number * @nlmsg_pid: Sending process port ID + * @nlmsg_data: Message payload */ struct nlmsghdr { __u32 nlmsg_len; @@ -55,6 +56,7 @@ struct nlmsghdr { __u16 nlmsg_flags; __u32 nlmsg_seq; __u32 nlmsg_pid; + __u8 nlmsg_data[]; }; /* Flags values */ diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f0c94d394ab1..b10d5e50b99d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2499,19 +2499,24 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, flags |= NLM_F_ACK_TLVS; skb = nlmsg_new(payload + tlvlen, GFP_KERNEL); - if (!skb) { - NETLINK_CB(in_skb).sk->sk_err = ENOBUFS; - sk_error_report(NETLINK_CB(in_skb).sk); - return; - } + if (!skb) + goto err_bad_put; rep = nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, - NLMSG_ERROR, payload, flags); + NLMSG_ERROR, sizeof(*errmsg), flags); + if (!rep) + goto err_bad_put; errmsg = nlmsg_data(rep); errmsg->error = err; - unsafe_memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) - ? nlh->nlmsg_len : sizeof(*nlh), - /* Bounds checked by the skb layer. */); + errmsg->msg = *nlh; + + if (!(flags & NLM_F_CAPPED)) { + if (!nlmsg_append(skb, nlmsg_len(nlh))) + goto err_bad_put; + + memcpy(errmsg->msg.nlmsg_data, nlh->nlmsg_data, + nlmsg_len(nlh)); + } if (tlvlen) netlink_ack_tlv_fill(in_skb, skb, nlh, err, extack); @@ -2519,6 +2524,12 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, nlmsg_end(skb, rep); nlmsg_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid); + + return; + +err_bad_put: + NETLINK_CB(in_skb).sk->sk_err = ENOBUFS; + sk_error_report(NETLINK_CB(in_skb).sk); } EXPORT_SYMBOL(netlink_ack); -- cgit v1.2.3-70-g09d2 From 1d997f1013079c05b642c739901e3584a3ae558d Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 28 Oct 2022 04:42:21 -0400 Subject: rtnetlink: pass netlink message header and portid to rtnl_configure_link() This patch pass netlink message header and portid to rtnl_configure_link() All the functions in this call chain need to add the parameters so we can use them in the last call rtnl_notify(), and notify the userspace about the new link info if NLM_F_ECHO flag is set. - rtnl_configure_link() - __dev_notify_flags() - rtmsg_ifinfo() - rtmsg_ifinfo_event() - rtmsg_ifinfo_build_skb() - rtmsg_ifinfo_send() - rtnl_notify() Also move __dev_notify_flags() declaration to net/core/dev.h, as Jakub suggested. Signed-off-by: Hangbin Liu Reviewed-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- drivers/net/can/vxcan.c | 2 +- drivers/net/geneve.c | 2 +- drivers/net/veth.c | 2 +- drivers/net/vxlan/vxlan_core.c | 4 ++-- drivers/net/wwan/wwan_core.c | 2 +- include/linux/netdevice.h | 2 -- include/linux/rtnetlink.h | 9 +++++---- include/net/netlink.h | 11 +++++++++++ include/net/rtnetlink.h | 3 ++- net/core/dev.c | 25 +++++++++++++------------ net/core/dev.h | 4 ++++ net/core/rtnetlink.c | 35 +++++++++++++++++++---------------- net/ipv4/ip_gre.c | 2 +- 13 files changed, 61 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c index 26a472d2ea58..4068d962203d 100644 --- a/drivers/net/can/vxcan.c +++ b/drivers/net/can/vxcan.c @@ -236,7 +236,7 @@ static int vxcan_newlink(struct net *net, struct net_device *dev, netif_carrier_off(peer); - err = rtnl_configure_link(peer, ifmp); + err = rtnl_configure_link(peer, ifmp, 0, NULL); if (err < 0) goto unregister_network_device; diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index f393e454f45c..89ff7f8e8c7e 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1907,7 +1907,7 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name, if (err) goto err; - err = rtnl_configure_link(dev, NULL); + err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) goto err; diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 740506c44427..ac7c0653695f 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1773,7 +1773,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, veth_disable_gro(peer); netif_carrier_off(peer); - err = rtnl_configure_link(peer, ifmp); + err = rtnl_configure_link(peer, ifmp, 0, NULL); if (err < 0) goto err_configure_peer; diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 6ab669dcd1c6..92224b36787a 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -3794,7 +3794,7 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, goto errout; } - err = rtnl_configure_link(dev, NULL); + err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) goto unlink; @@ -4416,7 +4416,7 @@ struct net_device *vxlan_dev_create(struct net *net, const char *name, return ERR_PTR(err); } - err = rtnl_configure_link(dev, NULL); + err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) { LIST_HEAD(list_kill); diff --git a/drivers/net/wwan/wwan_core.c b/drivers/net/wwan/wwan_core.c index 62e9f7d6c9fe..d72ee18476d1 100644 --- a/drivers/net/wwan/wwan_core.c +++ b/drivers/net/wwan/wwan_core.c @@ -1058,7 +1058,7 @@ static void wwan_create_default_link(struct wwan_device *wwandev, goto unlock; } - rtnl_configure_link(dev, NULL); /* Link initialized, notify new link */ + rtnl_configure_link(dev, NULL, 0, NULL); /* Link initialized, notify new link */ unlock: rtnl_unlock(); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 99e58b773266..4b5052db978f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3855,8 +3855,6 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); int dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); -void __dev_notify_flags(struct net_device *, unsigned int old_flags, - unsigned int gchanges); int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); int __dev_change_net_namespace(struct net_device *dev, struct net *net, diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index ae2c6a3cec5d..92ad75549e9c 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -12,21 +12,22 @@ extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid); extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, - u32 group, struct nlmsghdr *nlh, gfp_t flags); + u32 group, const struct nlmsghdr *nlh, gfp_t flags); extern void rtnl_set_sk_err(struct net *net, u32 group, int error); extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics); extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error); -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags); +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags, + u32 portid, const struct nlmsghdr *nlh); void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, gfp_t flags, int *new_nsid, int new_ifindex); struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned change, u32 event, gfp_t flags, int *new_nsid, - int new_ifindex); + int new_ifindex, u32 portid, u32 seq); void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, - gfp_t flags); + gfp_t flags, u32 portid, const struct nlmsghdr *nlh); /* RTNL is used as a global lock for all changes to network configuration */ diff --git a/include/net/netlink.h b/include/net/netlink.h index 784b4688fc6f..464e2e026f7b 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -899,6 +899,17 @@ static inline int nlmsg_report(const struct nlmsghdr *nlh) return nlh ? !!(nlh->nlmsg_flags & NLM_F_ECHO) : 0; } +/** + * nlmsg_seq - return the seq number of netlink message + * @nlh: netlink message header + * + * Returns 0 if netlink message is NULL + */ +static inline u32 nlmsg_seq(const struct nlmsghdr *nlh) +{ + return nlh ? nlh->nlmsg_seq : 0; +} + /** * nlmsg_for_each_attr - iterate over a stream of attributes * @pos: loop counter, set to current attribute diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index bf8bb3357825..cd94f65dc2a9 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -187,7 +187,8 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, struct nlattr *tb[], struct netlink_ext_ack *extack); int rtnl_delete_link(struct net_device *dev); -int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm); +int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, + u32 portid, const struct nlmsghdr *nlh); int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len, struct netlink_ext_ack *exterr); diff --git a/net/core/dev.c b/net/core/dev.c index cfb68db040a4..19e0db536022 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1333,7 +1333,7 @@ void netdev_state_change(struct net_device *dev) call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); - rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); + rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL); } } EXPORT_SYMBOL(netdev_state_change); @@ -1469,7 +1469,7 @@ int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) if (ret < 0) return ret; - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL); call_netdevice_notifiers(NETDEV_UP, dev); return ret; @@ -1541,7 +1541,7 @@ void dev_close_many(struct list_head *head, bool unlink) __dev_close_many(head); list_for_each_entry_safe(dev, tmp, head, close_list) { - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL); call_netdevice_notifiers(NETDEV_DOWN, dev); if (unlink) list_del_init(&dev->close_list); @@ -8351,7 +8351,7 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) dev_change_rx_flags(dev, IFF_PROMISC); } if (notify) - __dev_notify_flags(dev, old_flags, IFF_PROMISC); + __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL); return 0; } @@ -8406,7 +8406,7 @@ static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) dev_set_rx_mode(dev); if (notify) __dev_notify_flags(dev, old_flags, - dev->gflags ^ old_gflags); + dev->gflags ^ old_gflags, 0, NULL); } return 0; } @@ -8569,12 +8569,13 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags, } void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, - unsigned int gchanges) + unsigned int gchanges, u32 portid, + const struct nlmsghdr *nlh) { unsigned int changes = dev->flags ^ old_flags; if (gchanges) - rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); + rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh); if (changes & IFF_UP) { if (dev->flags & IFF_UP) @@ -8616,7 +8617,7 @@ int dev_change_flags(struct net_device *dev, unsigned int flags, return ret; changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); - __dev_notify_flags(dev, old_flags, changes); + __dev_notify_flags(dev, old_flags, changes, 0, NULL); return ret; } EXPORT_SYMBOL(dev_change_flags); @@ -10101,7 +10102,7 @@ int register_netdevice(struct net_device *dev) */ if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); out: return ret; @@ -10849,7 +10850,7 @@ void unregister_netdevice_many(struct list_head *head) if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, - GFP_KERNEL, NULL, 0); + GFP_KERNEL, NULL, 0, 0, 0); /* * Flush the unicast and multicast chains @@ -10864,7 +10865,7 @@ void unregister_netdevice_many(struct list_head *head) dev->netdev_ops->ndo_uninit(dev); if (skb) - rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); + rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, 0, NULL); /* Notifier chain MUST detach us all upper devices. */ WARN_ON(netdev_has_any_upper_dev(dev)); @@ -11042,7 +11043,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); synchronize_net(); err = 0; diff --git a/net/core/dev.h b/net/core/dev.h index cbb8a925175a..6b3c7302f570 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -88,6 +88,10 @@ int dev_change_carrier(struct net_device *dev, bool new_carrier); void __dev_set_rx_mode(struct net_device *dev); +void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, + unsigned int gchanges, u32 portid, + const struct nlmsghdr *nlh); + static inline void netif_set_gso_max_size(struct net_device *dev, unsigned int size) { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 74864dc46a7e..c9dd9730f3c6 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -760,7 +760,7 @@ int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) EXPORT_SYMBOL(rtnl_unicast); void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, - struct nlmsghdr *nlh, gfp_t flags) + const struct nlmsghdr *nlh, gfp_t flags) { struct sock *rtnl = net->rtnl; @@ -3180,7 +3180,8 @@ out: return err; } -int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) +int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, + u32 portid, const struct nlmsghdr *nlh) { unsigned int old_flags; int err; @@ -3194,10 +3195,10 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) } if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) { - __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags)); + __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags), portid, nlh); } else { dev->rtnl_link_state = RTNL_LINK_INITIALIZED; - __dev_notify_flags(dev, old_flags, ~0U); + __dev_notify_flags(dev, old_flags, ~0U, portid, nlh); } return 0; } @@ -3369,7 +3370,7 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, goto out; } - err = rtnl_configure_link(dev, ifm); + err = rtnl_configure_link(dev, ifm, 0, NULL); if (err < 0) goto out_unregister; if (link_net) { @@ -3896,7 +3897,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned int change, u32 event, gfp_t flags, int *new_nsid, - int new_ifindex) + int new_ifindex, u32 portid, u32 seq) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -3907,7 +3908,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, goto errout; err = rtnl_fill_ifinfo(skb, dev, dev_net(dev), - type, 0, 0, change, 0, 0, event, + type, portid, seq, change, 0, 0, event, new_nsid, new_ifindex, -1, flags); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ @@ -3922,16 +3923,18 @@ errout: return NULL; } -void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) +void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags, + u32 portid, const struct nlmsghdr *nlh) { struct net *net = dev_net(dev); - rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); + rtnl_notify(skb, net, portid, RTNLGRP_LINK, nlh, flags); } static void rtmsg_ifinfo_event(int type, struct net_device *dev, unsigned int change, u32 event, - gfp_t flags, int *new_nsid, int new_ifindex) + gfp_t flags, int *new_nsid, int new_ifindex, + u32 portid, const struct nlmsghdr *nlh) { struct sk_buff *skb; @@ -3939,23 +3942,23 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev, return; skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid, - new_ifindex); + new_ifindex, portid, nlmsg_seq(nlh)); if (skb) - rtmsg_ifinfo_send(skb, dev, flags); + rtmsg_ifinfo_send(skb, dev, flags, portid, nlh); } void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, - gfp_t flags) + gfp_t flags, u32 portid, const struct nlmsghdr *nlh) { rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, - NULL, 0); + NULL, 0, portid, nlh); } void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, gfp_t flags, int *new_nsid, int new_ifindex) { rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, - new_nsid, new_ifindex); + new_nsid, new_ifindex, 0, NULL); } static int nlmsg_populate_fdb_fill(struct sk_buff *skb, @@ -6140,7 +6143,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_CHANGELOWERSTATE: case NETDEV_CHANGE_TX_QUEUE_LEN: rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), - GFP_KERNEL, NULL, 0); + GFP_KERNEL, NULL, 0, 0, NULL); break; default: break; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f866d6282b2b..d8ee5238c395 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1665,7 +1665,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, if (err) goto out; - err = rtnl_configure_link(dev, NULL); + err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) goto out; -- cgit v1.2.3-70-g09d2 From 77f4aa9a2a1766a0b9343fd812b71f18d05178da Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 28 Oct 2022 04:42:22 -0400 Subject: net: add new helper unregister_netdevice_many_notify Add new helper unregister_netdevice_many_notify(), pass netlink message header and portid, which could be used to notify userspace when flag NLM_F_ECHO is set. Make the unregister_netdevice_many() as a wrapper of new function unregister_netdevice_many_notify(). Suggested-by: Guillaume Nault Signed-off-by: Hangbin Liu Reviewed-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- net/core/dev.c | 27 +++++++++++++++++---------- net/core/dev.h | 3 +++ 2 files changed, 20 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 19e0db536022..2e4f1c97b59e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10781,14 +10781,8 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) } EXPORT_SYMBOL(unregister_netdevice_queue); -/** - * unregister_netdevice_many - unregister many devices - * @head: list of devices - * - * Note: As most callers use a stack allocated list_head, - * we force a list_del() to make sure stack wont be corrupted later. - */ -void unregister_netdevice_many(struct list_head *head) +void unregister_netdevice_many_notify(struct list_head *head, + u32 portid, const struct nlmsghdr *nlh) { struct net_device *dev, *tmp; LIST_HEAD(close_head); @@ -10850,7 +10844,8 @@ void unregister_netdevice_many(struct list_head *head) if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, - GFP_KERNEL, NULL, 0, 0, 0); + GFP_KERNEL, NULL, 0, + portid, nlmsg_seq(nlh)); /* * Flush the unicast and multicast chains @@ -10865,7 +10860,7 @@ void unregister_netdevice_many(struct list_head *head) dev->netdev_ops->ndo_uninit(dev); if (skb) - rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, 0, NULL); + rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh); /* Notifier chain MUST detach us all upper devices. */ WARN_ON(netdev_has_any_upper_dev(dev)); @@ -10888,6 +10883,18 @@ void unregister_netdevice_many(struct list_head *head) list_del(head); } + +/** + * unregister_netdevice_many - unregister many devices + * @head: list of devices + * + * Note: As most callers use a stack allocated list_head, + * we force a list_del() to make sure stack wont be corrupted later. + */ +void unregister_netdevice_many(struct list_head *head) +{ + unregister_netdevice_many_notify(head, 0, NULL); +} EXPORT_SYMBOL(unregister_netdevice_many); /** diff --git a/net/core/dev.h b/net/core/dev.h index 6b3c7302f570..814ed5b7b960 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -92,6 +92,9 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, unsigned int gchanges, u32 portid, const struct nlmsghdr *nlh); +void unregister_netdevice_many_notify(struct list_head *head, + u32 portid, const struct nlmsghdr *nlh); + static inline void netif_set_gso_max_size(struct net_device *dev, unsigned int size) { -- cgit v1.2.3-70-g09d2 From d88e136cab37d6a5aa3691a2f636d37bd6520cc2 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 28 Oct 2022 04:42:23 -0400 Subject: rtnetlink: Honour NLM_F_ECHO flag in rtnl_newlink_create This patch pass the netlink header message in rtnl_newlink_create() to the new updated rtnl_configure_link(), so that the kernel could reply unicast when userspace set NLM_F_ECHO flag to request the new created interface info. Suggested-by: Guillaume Nault Signed-off-by: Hangbin Liu Reviewed-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c9dd9730f3c6..839ff8b7eadc 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3312,11 +3312,13 @@ static int rtnl_group_changelink(const struct sk_buff *skb, static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, const struct rtnl_link_ops *ops, + const struct nlmsghdr *nlh, struct nlattr **tb, struct nlattr **data, struct netlink_ext_ack *extack) { unsigned char name_assign_type = NET_NAME_USER; struct net *net = sock_net(skb->sk); + u32 portid = NETLINK_CB(skb).portid; struct net *dest_net, *link_net; struct net_device *dev; char ifname[IFNAMSIZ]; @@ -3370,7 +3372,7 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, goto out; } - err = rtnl_configure_link(dev, ifm, 0, NULL); + err = rtnl_configure_link(dev, ifm, portid, nlh); if (err < 0) goto out_unregister; if (link_net) { @@ -3579,7 +3581,7 @@ replay: return -EOPNOTSUPP; } - return rtnl_newlink_create(skb, ifm, ops, tb, data, extack); + return rtnl_newlink_create(skb, ifm, ops, nlh, tb, data, extack); } static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, -- cgit v1.2.3-70-g09d2 From f3a63cce1b4fbde7738395c5a2dea83f05de3407 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 28 Oct 2022 04:42:24 -0400 Subject: rtnetlink: Honour NLM_F_ECHO flag in rtnl_delete_link This patch use the new helper unregister_netdevice_many_notify() for rtnl_delete_link(), so that the kernel could reply unicast when userspace set NLM_F_ECHO flag to request the new created interface info. At the same time, the parameters of rtnl_delete_link() need to be updated since we need nlmsghdr and portid info. Suggested-by: Guillaume Nault Signed-off-by: Hangbin Liu Reviewed-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- include/net/rtnetlink.h | 2 +- net/core/rtnetlink.c | 7 ++++--- net/openvswitch/vport-geneve.c | 2 +- net/openvswitch/vport-gre.c | 2 +- net/openvswitch/vport-netdev.c | 2 +- net/openvswitch/vport-vxlan.c | 2 +- 6 files changed, 9 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index cd94f65dc2a9..d9076a7a430c 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -186,7 +186,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[], struct netlink_ext_ack *extack); -int rtnl_delete_link(struct net_device *dev); +int rtnl_delete_link(struct net_device *dev, u32 portid, const struct nlmsghdr *nlh); int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, u32 portid, const struct nlmsghdr *nlh); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 839ff8b7eadc..d2f27548fc0b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3110,7 +3110,7 @@ static int rtnl_group_dellink(const struct net *net, int group) return 0; } -int rtnl_delete_link(struct net_device *dev) +int rtnl_delete_link(struct net_device *dev, u32 portid, const struct nlmsghdr *nlh) { const struct rtnl_link_ops *ops; LIST_HEAD(list_kill); @@ -3120,7 +3120,7 @@ int rtnl_delete_link(struct net_device *dev) return -EOPNOTSUPP; ops->dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); + unregister_netdevice_many_notify(&list_kill, portid, nlh); return 0; } @@ -3130,6 +3130,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); + u32 portid = NETLINK_CB(skb).portid; struct net *tgt_net = net; struct net_device *dev = NULL; struct ifinfomsg *ifm; @@ -3171,7 +3172,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - err = rtnl_delete_link(dev); + err = rtnl_delete_link(dev, portid, nlh); out: if (netnsid >= 0) diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 89a8e1501809..b10e1602c6b1 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -91,7 +91,7 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms) err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); if (err < 0) { - rtnl_delete_link(dev); + rtnl_delete_link(dev, 0, NULL); rtnl_unlock(); ovs_vport_free(vport); goto error; diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index e6b5e76a962a..4014c9b5eb79 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -57,7 +57,7 @@ static struct vport *gre_tnl_create(const struct vport_parms *parms) err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); if (err < 0) { - rtnl_delete_link(dev); + rtnl_delete_link(dev, 0, NULL); rtnl_unlock(); ovs_vport_free(vport); return ERR_PTR(err); diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 2f61d5bdce1a..903537a5da22 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -172,7 +172,7 @@ void ovs_netdev_tunnel_destroy(struct vport *vport) * if it's not already shutting down. */ if (vport->dev->reg_state == NETREG_REGISTERED) - rtnl_delete_link(vport->dev); + rtnl_delete_link(vport->dev, 0, NULL); netdev_put(vport->dev, &vport->dev_tracker); vport->dev = NULL; rtnl_unlock(); diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 188e9c1360a1..0b881b043bcf 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -120,7 +120,7 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); if (err < 0) { - rtnl_delete_link(dev); + rtnl_delete_link(dev, 0, NULL); rtnl_unlock(); ovs_vport_free(vport); goto error; -- cgit v1.2.3-70-g09d2 From 0e84afe8ebfbb9eade3f4f6de4720887bf908e26 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 29 Oct 2022 15:45:16 +0000 Subject: net: dropreason: add SKB_CONSUMED reason This will allow to simply use in the future: kfree_skb_reason(skb, reason); Instead of repeating sequences like: if (dropped) kfree_skb_reason(skb, reason); else consume_skb(skb); For instance, following patch in the series is adding @reason to skb_release_data() and skb_release_all(), so that we can propagate a meaningful @reason whenever consume_skb()/kfree_skb() have to take care of a potential frag_list. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/dropreason.h | 2 ++ net/core/skbuff.c | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/dropreason.h b/include/net/dropreason.h index c1cbcdbaf149..0bd18c14dae0 100644 --- a/include/net/dropreason.h +++ b/include/net/dropreason.h @@ -80,6 +80,8 @@ enum skb_drop_reason { * @SKB_NOT_DROPPED_YET: skb is not dropped yet (used for no-drop case) */ SKB_NOT_DROPPED_YET = 0, + /** @SKB_CONSUMED: packet has been consumed */ + SKB_CONSUMED, /** @SKB_DROP_REASON_NOT_SPECIFIED: drop reason is not specified */ SKB_DROP_REASON_NOT_SPECIFIED, /** @SKB_DROP_REASON_NO_SOCKET: socket not found */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1d84a17eada5..7ce797cd121f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -94,6 +94,7 @@ EXPORT_SYMBOL(sysctl_max_skb_frags); #undef FN #define FN(reason) [SKB_DROP_REASON_##reason] = #reason, const char * const drop_reasons[] = { + [SKB_CONSUMED] = "CONSUMED", DEFINE_DROP_REASON(FN, FN) }; EXPORT_SYMBOL(drop_reasons); @@ -894,7 +895,10 @@ kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX); - trace_kfree_skb(skb, __builtin_return_address(0), reason); + if (reason == SKB_CONSUMED) + trace_consume_skb(skb); + else + trace_kfree_skb(skb, __builtin_return_address(0), reason); __kfree_skb(skb); } EXPORT_SYMBOL(kfree_skb_reason); -- cgit v1.2.3-70-g09d2 From 511a3eda2f8d4719114ee3f2c781c37233bd171f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 29 Oct 2022 15:45:17 +0000 Subject: net: dropreason: propagate drop_reason to skb_release_data() When an skb with a frag list is consumed, we currently pretend all skbs in the frag list were dropped. In order to fix this, add a @reason argument to skb_release_data() and skb_release_all(). Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7ce797cd121f..42a35b59fb1e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -769,7 +769,7 @@ static void skb_free_head(struct sk_buff *skb) } } -static void skb_release_data(struct sk_buff *skb) +static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason) { struct skb_shared_info *shinfo = skb_shinfo(skb); int i; @@ -792,7 +792,7 @@ static void skb_release_data(struct sk_buff *skb) free_head: if (shinfo->frag_list) - kfree_skb_list(shinfo->frag_list); + kfree_skb_list_reason(shinfo->frag_list, reason); skb_free_head(skb); exit: @@ -855,11 +855,11 @@ void skb_release_head_state(struct sk_buff *skb) } /* Free everything but the sk_buff shell. */ -static void skb_release_all(struct sk_buff *skb) +static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason) { skb_release_head_state(skb); if (likely(skb->head)) - skb_release_data(skb); + skb_release_data(skb, reason); } /** @@ -873,7 +873,7 @@ static void skb_release_all(struct sk_buff *skb) void __kfree_skb(struct sk_buff *skb) { - skb_release_all(skb); + skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED); kfree_skbmem(skb); } EXPORT_SYMBOL(__kfree_skb); @@ -1056,7 +1056,7 @@ EXPORT_SYMBOL(consume_skb); void __consume_stateless_skb(struct sk_buff *skb) { trace_consume_skb(skb); - skb_release_data(skb); + skb_release_data(skb, SKB_CONSUMED); kfree_skbmem(skb); } @@ -1081,7 +1081,7 @@ static void napi_skb_cache_put(struct sk_buff *skb) void __kfree_skb_defer(struct sk_buff *skb) { - skb_release_all(skb); + skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED); napi_skb_cache_put(skb); } @@ -1119,7 +1119,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget) return; } - skb_release_all(skb); + skb_release_all(skb, SKB_CONSUMED); napi_skb_cache_put(skb); } EXPORT_SYMBOL(napi_consume_skb); @@ -1250,7 +1250,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg); */ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) { - skb_release_all(dst); + skb_release_all(dst, SKB_CONSUMED); return __skb_clone(dst, src); } EXPORT_SYMBOL_GPL(skb_morph); @@ -1873,7 +1873,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); - skb_release_data(skb); + skb_release_data(skb, SKB_CONSUMED); } else { skb_free_head(skb); } @@ -6213,7 +6213,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, skb_frag_ref(skb, i); if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); - skb_release_data(skb); + skb_release_data(skb, SKB_CONSUMED); } else { /* we can reuse existing recount- all we did was * relocate values @@ -6356,7 +6356,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, kfree(data); return -ENOMEM; } - skb_release_data(skb); + skb_release_data(skb, SKB_CONSUMED); skb->head = data; skb->head_frag = 0; -- cgit v1.2.3-70-g09d2 From 4ecbb1c27c363686d11a241cd682a454a8454c2b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 29 Oct 2022 15:45:18 +0000 Subject: net: dropreason: add SKB_DROP_REASON_DUP_FRAG This is used to track when a duplicate segment received by various reassembly units is dropped. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/dropreason.h | 3 +++ net/ipv4/ip_fragment.c | 13 +++++++++---- net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +- net/ipv6/reassembly.c | 13 +++++++++---- 4 files changed, 22 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/dropreason.h b/include/net/dropreason.h index 0bd18c14dae0..602d555a5f83 100644 --- a/include/net/dropreason.h +++ b/include/net/dropreason.h @@ -68,6 +68,7 @@ FN(IP_INADDRERRORS) \ FN(IP_INNOROUTES) \ FN(PKT_TOO_BIG) \ + FN(DUP_FRAG) \ FNe(MAX) /** @@ -300,6 +301,8 @@ enum skb_drop_reason { * MTU) */ SKB_DROP_REASON_PKT_TOO_BIG, + /** @SKB_DROP_REASON_DUP_FRAG: duplicate fragment */ + SKB_DROP_REASON_DUP_FRAG, /** * @SKB_DROP_REASON_MAX: the maximum of drop reason, which shouldn't be * used as a real 'reason' diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index fb153569889e..676bd8d25955 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -278,10 +278,14 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) struct net_device *dev; unsigned int fragsize; int err = -ENOENT; + SKB_DR(reason); u8 ecn; - if (qp->q.flags & INET_FRAG_COMPLETE) + /* If reassembly is already done, @skb must be a duplicate frag. */ + if (qp->q.flags & INET_FRAG_COMPLETE) { + SKB_DR_SET(reason, DUP_FRAG); goto err; + } if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && unlikely(ip_frag_too_far(qp)) && @@ -382,8 +386,9 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) insert_error: if (err == IPFRAG_DUP) { - kfree_skb(skb); - return -EINVAL; + SKB_DR_SET(reason, DUP_FRAG); + err = -EINVAL; + goto err; } err = -EINVAL; __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); @@ -391,7 +396,7 @@ discard_qp: inet_frag_kill(&qp->q); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); err: - kfree_skb(skb); + kfree_skb_reason(skb, reason); return err; } diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 38db0064d661..d13240f13607 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -253,7 +253,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, if (err) { if (err == IPFRAG_DUP) { /* No error for duplicates, pretend they got queued. */ - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_DUP_FRAG); return -EINPROGRESS; } goto insert_error; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index ff866f2a879e..5bc8a28e67f9 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -112,10 +112,14 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev_tail; struct net_device *dev; int err = -ENOENT; + SKB_DR(reason); u8 ecn; - if (fq->q.flags & INET_FRAG_COMPLETE) + /* If reassembly is already done, @skb must be a duplicate frag. */ + if (fq->q.flags & INET_FRAG_COMPLETE) { + SKB_DR_SET(reason, DUP_FRAG); goto err; + } err = -EINVAL; offset = ntohs(fhdr->frag_off) & ~0x7; @@ -226,8 +230,9 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, insert_error: if (err == IPFRAG_DUP) { - kfree_skb(skb); - return -EINVAL; + SKB_DR_SET(reason, DUP_FRAG); + err = -EINVAL; + goto err; } err = -EINVAL; __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), @@ -237,7 +242,7 @@ discard_fq: __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); err: - kfree_skb(skb); + kfree_skb_reason(skb, reason); return err; } -- cgit v1.2.3-70-g09d2 From 77adfd3a1d44c4730fd2af99b497e04ddc2b5837 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 29 Oct 2022 15:45:19 +0000 Subject: net: dropreason: add SKB_DROP_REASON_FRAG_REASM_TIMEOUT Used to track skbs freed after a timeout happened in a reassmbly unit. Passing a @reason argument to inet_frag_rbtree_purge() allows to use correct consumed status for frags that have been successfully re-assembled. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/dropreason.h | 3 +++ include/net/inet_frag.h | 6 +++++- include/net/ipv6_frag.h | 3 ++- net/ipv4/inet_fragment.c | 14 ++++++++++---- net/ipv4/ip_fragment.c | 6 ++++-- 5 files changed, 24 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/dropreason.h b/include/net/dropreason.h index 602d555a5f83..1d45a74148c3 100644 --- a/include/net/dropreason.h +++ b/include/net/dropreason.h @@ -69,6 +69,7 @@ FN(IP_INNOROUTES) \ FN(PKT_TOO_BIG) \ FN(DUP_FRAG) \ + FN(FRAG_REASM_TIMEOUT) \ FNe(MAX) /** @@ -303,6 +304,8 @@ enum skb_drop_reason { SKB_DROP_REASON_PKT_TOO_BIG, /** @SKB_DROP_REASON_DUP_FRAG: duplicate fragment */ SKB_DROP_REASON_DUP_FRAG, + /** @SKB_DROP_REASON_FRAG_REASM_TIMEOUT: fragment reassembly timeout */ + SKB_DROP_REASON_FRAG_REASM_TIMEOUT, /** * @SKB_DROP_REASON_MAX: the maximum of drop reason, which shouldn't be * used as a real 'reason' diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 0b0876610553..b23ddec3cd5c 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -7,6 +7,7 @@ #include #include #include +#include /* Per netns frag queues directory */ struct fqdir { @@ -34,12 +35,14 @@ struct fqdir { * @INET_FRAG_LAST_IN: final fragment has arrived * @INET_FRAG_COMPLETE: frag queue has been processed and is due for destruction * @INET_FRAG_HASH_DEAD: inet_frag_kill() has not removed fq from rhashtable + * @INET_FRAG_DROP: if skbs must be dropped (instead of being consumed) */ enum { INET_FRAG_FIRST_IN = BIT(0), INET_FRAG_LAST_IN = BIT(1), INET_FRAG_COMPLETE = BIT(2), INET_FRAG_HASH_DEAD = BIT(3), + INET_FRAG_DROP = BIT(4), }; struct frag_v4_compare_key { @@ -139,7 +142,8 @@ void inet_frag_destroy(struct inet_frag_queue *q); struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key); /* Free all skbs in the queue; return the sum of their truesizes. */ -unsigned int inet_frag_rbtree_purge(struct rb_root *root); +unsigned int inet_frag_rbtree_purge(struct rb_root *root, + enum skb_drop_reason reason); static inline void inet_frag_put(struct inet_frag_queue *q) { diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h index 5052c66e22d2..7321ffe3a108 100644 --- a/include/net/ipv6_frag.h +++ b/include/net/ipv6_frag.h @@ -76,6 +76,7 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) if (fq->q.flags & INET_FRAG_COMPLETE) goto out; + fq->q.flags |= INET_FRAG_DROP; inet_frag_kill(&fq->q); dev = dev_get_by_index_rcu(net, fq->iif); @@ -101,7 +102,7 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) spin_unlock(&fq->q.lock); icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); - kfree_skb(head); + kfree_skb_reason(head, SKB_DROP_REASON_FRAG_REASM_TIMEOUT); goto out_rcu_unlock; out: diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index c9f9ac5013a7..7072fc0783ef 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -133,6 +133,7 @@ static void inet_frags_free_cb(void *ptr, void *arg) count = del_timer_sync(&fq->timer) ? 1 : 0; spin_lock_bh(&fq->lock); + fq->flags |= INET_FRAG_DROP; if (!(fq->flags & INET_FRAG_COMPLETE)) { fq->flags |= INET_FRAG_COMPLETE; count++; @@ -260,7 +261,8 @@ static void inet_frag_destroy_rcu(struct rcu_head *head) kmem_cache_free(f->frags_cachep, q); } -unsigned int inet_frag_rbtree_purge(struct rb_root *root) +unsigned int inet_frag_rbtree_purge(struct rb_root *root, + enum skb_drop_reason reason) { struct rb_node *p = rb_first(root); unsigned int sum = 0; @@ -274,7 +276,7 @@ unsigned int inet_frag_rbtree_purge(struct rb_root *root) struct sk_buff *next = FRAG_CB(skb)->next_frag; sum += skb->truesize; - kfree_skb(skb); + kfree_skb_reason(skb, reason); skb = next; } } @@ -284,17 +286,21 @@ EXPORT_SYMBOL(inet_frag_rbtree_purge); void inet_frag_destroy(struct inet_frag_queue *q) { - struct fqdir *fqdir; unsigned int sum, sum_truesize = 0; + enum skb_drop_reason reason; struct inet_frags *f; + struct fqdir *fqdir; WARN_ON(!(q->flags & INET_FRAG_COMPLETE)); + reason = (q->flags & INET_FRAG_DROP) ? + SKB_DROP_REASON_FRAG_REASM_TIMEOUT : + SKB_CONSUMED; WARN_ON(del_timer(&q->timer) != 0); /* Release all fragment data. */ fqdir = q->fqdir; f = fqdir->f; - sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments); + sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments, reason); sum = sum_truesize + f->qsize; call_rcu(&q->rcu, inet_frag_destroy_rcu); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 676bd8d25955..85e8113259c3 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -153,6 +153,7 @@ static void ip_expire(struct timer_list *t) if (qp->q.flags & INET_FRAG_COMPLETE) goto out; + qp->q.flags |= INET_FRAG_DROP; ipq_kill(qp); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); @@ -194,7 +195,7 @@ out: spin_unlock(&qp->q.lock); out_rcu_unlock: rcu_read_unlock(); - kfree_skb(head); + kfree_skb_reason(head, SKB_DROP_REASON_FRAG_REASM_TIMEOUT); ipq_put(qp); } @@ -254,7 +255,8 @@ static int ip_frag_reinit(struct ipq *qp) return -ETIMEDOUT; } - sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments); + sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments, + SKB_DROP_REASON_NOT_SPECIFIED); sub_frag_mem_limit(qp->q.fqdir, sum_truesize); qp->q.flags = 0; -- cgit v1.2.3-70-g09d2 From 3bdfb04f13ebdd4ae50fc5dc595663874781e48c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 29 Oct 2022 15:45:20 +0000 Subject: net: dropreason: add SKB_DROP_REASON_FRAG_TOO_FAR IPv4 reassembly unit can decide to drop frags based on /proc/sys/net/ipv4/ipfrag_max_dist sysctl. Add a specific drop reason to track this specific and weird case. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/dropreason.h | 6 ++++++ net/ipv4/ip_fragment.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/dropreason.h b/include/net/dropreason.h index 1d45a74148c3..70539288f995 100644 --- a/include/net/dropreason.h +++ b/include/net/dropreason.h @@ -70,6 +70,7 @@ FN(PKT_TOO_BIG) \ FN(DUP_FRAG) \ FN(FRAG_REASM_TIMEOUT) \ + FN(FRAG_TOO_FAR) \ FNe(MAX) /** @@ -306,6 +307,11 @@ enum skb_drop_reason { SKB_DROP_REASON_DUP_FRAG, /** @SKB_DROP_REASON_FRAG_REASM_TIMEOUT: fragment reassembly timeout */ SKB_DROP_REASON_FRAG_REASM_TIMEOUT, + /** + * @SKB_DROP_REASON_FRAG_TOO_FAR: ipv4 fragment too far. + * (/proc/sys/net/ipv4/ipfrag_max_dist) + */ + SKB_DROP_REASON_FRAG_TOO_FAR, /** * @SKB_DROP_REASON_MAX: the maximum of drop reason, which shouldn't be * used as a real 'reason' diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 85e8113259c3..69c00ffdcf3e 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -256,7 +256,7 @@ static int ip_frag_reinit(struct ipq *qp) } sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments, - SKB_DROP_REASON_NOT_SPECIFIED); + SKB_DROP_REASON_FRAG_TOO_FAR); sub_frag_mem_limit(qp->q.fqdir, sum_truesize); qp->q.flags = 0; -- cgit v1.2.3-70-g09d2 From 4d1c7d87030be2a0043620b240cdb88199396366 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 26 Oct 2022 11:35:00 +0200 Subject: mac802154: Move an skb free within the rx path It may appear clearer to free the skb at the end of the path rather than in the middle, within a helper. Move kfree_skb() from the end of __ieee802154_rx_handle_packet() to right after it in the calling function ieee802154_rx(). Doing so implies reworking a little bit the exit path. Suggested-by: Alexander Aring Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20221026093502.602734-2-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- net/mac802154/rx.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index 2ae23a2f4a09..499f71df257f 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -234,8 +234,6 @@ __ieee802154_rx_handle_packet(struct ieee802154_local *local, skb = NULL; break; } - - kfree_skb(skb); } static void @@ -274,7 +272,7 @@ void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb) WARN_ON_ONCE(softirq_count() == 0); if (local->suspended) - goto drop; + goto free_skb; /* TODO: When a transceiver omits the checksum here, we * add an own calculated one. This is currently an ugly @@ -292,20 +290,17 @@ void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb) /* Level 1 filtering: Check the FCS by software when relevant */ if (local->hw.phy->filtering == IEEE802154_FILTERING_NONE) { crc = crc_ccitt(0, skb->data, skb->len); - if (crc) { - rcu_read_unlock(); + if (crc) goto drop; - } } /* remove crc */ skb_trim(skb, skb->len - 2); __ieee802154_rx_handle_packet(local, skb); - rcu_read_unlock(); - - return; drop: + rcu_read_unlock(); +free_skb: kfree_skb(skb); } -- cgit v1.2.3-70-g09d2 From eb30a7a5c88f65d06c35c2268af0075c0d6d803e Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 26 Oct 2022 11:35:01 +0200 Subject: mac802154: Clarify an expression While going through the whole interface opening logic in my head I was consistently bothered by the condition checking whether there was only one interface of type NODE/COORD opened at the same time. What actually bothered me was the fact that in one case we would use the wpan_dev pointer directly while in the other case we would use the sdata pointer, making it harder to differentiate both. In practice the condition should be straightforward to read. IMHO dropping the wpan_dev indirection allows to clarify the check. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20221026093502.602734-3-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- net/mac802154/iface.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index d9b50884d34e..f21132dd82cf 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -254,7 +254,6 @@ ieee802154_check_concurrent_iface(struct ieee802154_sub_if_data *sdata, enum nl802154_iftype iftype) { struct ieee802154_local *local = sdata->local; - struct wpan_dev *wpan_dev = &sdata->wpan_dev; struct ieee802154_sub_if_data *nsdata; /* we hold the RTNL here so can safely walk the list */ @@ -267,7 +266,7 @@ ieee802154_check_concurrent_iface(struct ieee802154_sub_if_data *sdata, * exist really an use case if we need to support * multiple node types at the same time. */ - if (wpan_dev->iftype == NL802154_IFTYPE_NODE && + if (sdata->wpan_dev.iftype == NL802154_IFTYPE_NODE && nsdata->wpan_dev.iftype == NL802154_IFTYPE_NODE) return -EBUSY; -- cgit v1.2.3-70-g09d2 From 2622e785f7579e8c92d0fcf55e9d6b3955e4f1a6 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 26 Oct 2022 11:35:02 +0200 Subject: mac802154: Allow the creation of coordinator interfaces As a first strep in introducing proper PAN management and association, we need to be able to create coordinator interfaces which might act as coordinator or PAN coordinator. Hence, let's add the minimum support to allow the creation of these interfaces. Even though the necessary logic to handle several interfaces on the same device is added to make this future move easier, in practice only several interfaces of type MONITOR are allowed at the same time. The other combinations are not allowed (interface creation is possible but only one can be opened at a time) because, with a single PHY featuring a single set of address filters, we cannot afford handling two distinct interfaces (with different address filters or filtering requirements): * Having 2 NODEs, 2 COORDs or 1 NODE + 1 COORD -> cannot work because the address filters would be different * Having 1 MONITOR + either 1 NODE or 1 COORD -> cannot work because the filtering levels are incompatible Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20221026093502.602734-4-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- net/mac802154/iface.c | 14 ++++++++------ net/mac802154/main.c | 2 +- net/mac802154/rx.c | 11 +++++++---- 3 files changed, 16 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index f21132dd82cf..7de2f843379c 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -261,13 +261,13 @@ ieee802154_check_concurrent_iface(struct ieee802154_sub_if_data *sdata, if (nsdata != sdata && ieee802154_sdata_running(nsdata)) { int ret; - /* TODO currently we don't support multiple node types - * we need to run skb_clone at rx path. Check if there - * exist really an use case if we need to support - * multiple node types at the same time. + /* TODO currently we don't support multiple node/coord + * types we need to run skb_clone at rx path. Check if + * there exist really an use case if we need to support + * multiple node/coord types at the same time. */ - if (sdata->wpan_dev.iftype == NL802154_IFTYPE_NODE && - nsdata->wpan_dev.iftype == NL802154_IFTYPE_NODE) + if (sdata->wpan_dev.iftype != NL802154_IFTYPE_MONITOR && + nsdata->wpan_dev.iftype != NL802154_IFTYPE_MONITOR) return -EBUSY; /* check all phy mac sublayer settings are the same. @@ -564,6 +564,7 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, wpan_dev->short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST); switch (type) { + case NL802154_IFTYPE_COORD: case NL802154_IFTYPE_NODE: ieee802154_be64_to_le64(&wpan_dev->extended_addr, sdata->dev->dev_addr); @@ -623,6 +624,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name, ieee802154_le64_to_be64(ndev->perm_addr, &local->hw.phy->perm_extended_addr); switch (type) { + case NL802154_IFTYPE_COORD: case NL802154_IFTYPE_NODE: ndev->type = ARPHRD_IEEE802154; if (ieee802154_is_valid_extended_unicast_addr(extended_addr)) { diff --git a/net/mac802154/main.c b/net/mac802154/main.c index 40fab08df24b..3ed31daf7b9c 100644 --- a/net/mac802154/main.c +++ b/net/mac802154/main.c @@ -107,7 +107,7 @@ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops) phy->supported.lbt = NL802154_SUPPORTED_BOOL_FALSE; /* always supported */ - phy->supported.iftypes = BIT(NL802154_IFTYPE_NODE); + phy->supported.iftypes = BIT(NL802154_IFTYPE_NODE) | BIT(NL802154_IFTYPE_COORD); return &local->hw; } diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index 499f71df257f..7ca41fd259f1 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -208,6 +208,7 @@ __ieee802154_rx_handle_packet(struct ieee802154_local *local, int ret; struct ieee802154_sub_if_data *sdata; struct ieee802154_hdr hdr; + struct sk_buff *skb2; ret = ieee802154_parse_frame_start(skb, &hdr); if (ret) { @@ -217,7 +218,7 @@ __ieee802154_rx_handle_packet(struct ieee802154_local *local, } list_for_each_entry_rcu(sdata, &local->interfaces, list) { - if (sdata->wpan_dev.iftype != NL802154_IFTYPE_NODE) + if (sdata->wpan_dev.iftype == NL802154_IFTYPE_MONITOR) continue; if (!ieee802154_sdata_running(sdata)) @@ -230,9 +231,11 @@ __ieee802154_rx_handle_packet(struct ieee802154_local *local, sdata->required_filtering == IEEE802154_FILTERING_4_FRAME_FIELDS) continue; - ieee802154_subif_frame(sdata, skb, &hdr); - skb = NULL; - break; + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) { + skb2->dev = sdata->dev; + ieee802154_subif_frame(sdata, skb2, &hdr); + } } } -- cgit v1.2.3-70-g09d2 From 66394126bf2045d25b70c9327a6372186bba7d66 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 28 Oct 2022 10:05:57 +0200 Subject: netfilter: nft_payload: use __be16 to store gre version GRE_VERSION and GRE_VERSION0 are expressed in network byte order, use __be16. Uncovered by sparse: net/netfilter/nft_payload.c:112:25: warning: incorrect type in assignment (different base types) net/netfilter/nft_payload.c:112:25: expected unsigned int [usertype] version net/netfilter/nft_payload.c:112:25: got restricted __be16 net/netfilter/nft_payload.c:114:22: warning: restricted __be16 degrades to integer Fixes: c247897d7c19 ("netfilter: nft_payload: access GRE payload via inner offset") Reported-by: Jakub Kicinski Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_payload.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 9d2ac764a14c..53e64d8aa01f 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -102,8 +102,9 @@ static int __nft_payload_inner_offset(struct nft_pktinfo *pkt) } break; case IPPROTO_GRE: { - u32 offset = sizeof(struct gre_base_hdr), version; + u32 offset = sizeof(struct gre_base_hdr); struct gre_base_hdr *gre, _gre; + __be16 version; gre = skb_header_pointer(pkt->skb, thoff, sizeof(_gre), &_gre); if (!gre) -- cgit v1.2.3-70-g09d2 From 7394c2dd62dec8bdbe347d195c2dc1b332cc9ec5 Mon Sep 17 00:00:00 2001 From: Peng Wu Date: Tue, 1 Nov 2022 01:37:28 +0000 Subject: netfilter: nft_inner: fix return value check in nft_inner_parse_l2l3() In nft_inner_parse_l2l3(), the return value of skb_header_pointer() is 'veth' instead of 'eth' when case 'htons(ETH_P_8021Q)' and fix it. Fixes: 3a07327d10a0 ("netfilter: nft_inner: support for inner tunnel header matching") Signed-off-by: Peng Wu Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_inner.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c index eae7caeff316..809f0d0787ec 100644 --- a/net/netfilter/nft_inner.c +++ b/net/netfilter/nft_inner.c @@ -72,7 +72,7 @@ static int nft_inner_parse_l2l3(const struct nft_inner *priv, break; case htons(ETH_P_8021Q): veth = skb_header_pointer(pkt->skb, off, sizeof(_veth), &_veth); - if (!eth) + if (!veth) return -1; outer_llproto = veth->h_vlan_encapsulated_proto; -- cgit v1.2.3-70-g09d2 From 44827016be44c6b2634a92ebbdb3d95610ff5268 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 29 Oct 2022 02:46:04 +0100 Subject: net: core: inet[46]_pton strlen len types inet[46]_pton check the input length against a sane length limit (INET[6]_ADDRSTRLEN), but the strlen value gets truncated due to being stored in an int, so there's a theoretical potential for a >4G string to pass the limit test. Use size_t since that's what strlen actually returns. I've had a hunt for callers that could hit this, but I've not managed to find anything that doesn't get checked with some other limit first; but it's possible that I've missed something in the depth of the storage target paths. Signed-off-by: Dr. David Alan Gilbert Link: https://lore.kernel.org/r/20221029014604.114024-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- net/core/utils.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/utils.c b/net/core/utils.c index 938495bc1d34..c994e95172ac 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -302,7 +302,7 @@ static int inet4_pton(const char *src, u16 port_num, struct sockaddr_storage *addr) { struct sockaddr_in *addr4 = (struct sockaddr_in *)addr; - int srclen = strlen(src); + size_t srclen = strlen(src); if (srclen > INET_ADDRSTRLEN) return -EINVAL; @@ -322,7 +322,7 @@ static int inet6_pton(struct net *net, const char *src, u16 port_num, { struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr; const char *scope_delim; - int srclen = strlen(src); + size_t srclen = strlen(src); if (srclen > INET6_ADDRSTRLEN) return -EINVAL; -- cgit v1.2.3-70-g09d2 From b0e01253a764faab9ecf56c0759e6b06470eb9ff Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 1 Nov 2022 03:52:34 +0000 Subject: tcp: refine tcp_prune_ofo_queue() logic After commits 36a6503fedda ("tcp: refine tcp_prune_ofo_queue() to not drop all packets") and 72cd43ba64fc1 ("tcp: free batches of packets in tcp_prune_ofo_queue()") tcp_prune_ofo_queue() drops a fraction of ooo queue, to make room for incoming packet. However it makes no sense to drop packets that are before the incoming packet, in sequence space. In order to recover from packet losses faster, it makes more sense to only drop ooo packets which are after the incoming packet. Tested: packetdrill test: 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [3800], 4) = 0 +0 bind(3, ..., ...) = 0 +0 listen(3, 1) = 0 +0 < S 0:0(0) win 32792 +0 > S. 0:0(0) ack 1 +.1 < . 1:1(0) ack 1 win 1024 +0 accept(3, ..., ...) = 4 +.01 < . 200:300(100) ack 1 win 1024 +0 > . 1:1(0) ack 1 +.01 < . 400:500(100) ack 1 win 1024 +0 > . 1:1(0) ack 1 +.01 < . 600:700(100) ack 1 win 1024 +0 > . 1:1(0) ack 1 +.01 < . 800:900(100) ack 1 win 1024 +0 > . 1:1(0) ack 1 +.01 < . 1000:1100(100) ack 1 win 1024 +0 > . 1:1(0) ack 1 +.01 < . 1200:1300(100) ack 1 win 1024 +0 > . 1:1(0) ack 1 // this packet is dropped because we have no room left. +.01 < . 1400:1500(100) ack 1 win 1024 +.01 < . 1:200(199) ack 1 win 1024 // Make sure kernel did not drop 200:300 sequence +0 > . 1:1(0) ack 300 // Make room, since our RCVBUF is very small +0 read(4, ..., 299) = 299 +.01 < . 300:400(100) ack 1 win 1024 +0 > . 1:1(0) ack 500 +.01 < . 500:600(100) ack 1 win 1024 +0 > . 1:1(0) ack 700 +0 read(4, ..., 400) = 400 +.01 < . 700:800(100) ack 1 win 1024 +0 > . 1:1(0) ack 900 +.01 < . 900:1000(100) ack 1 win 1024 +0 > . 1:1(0) ack 1100 +.01 < . 1100:1200(100) ack 1 win 1024 // This checks that 1200:1300 has not been removed from ooo queue +0 > . 1:1(0) ack 1300 Suggested-by: Jakub Kicinski Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20221101035234.3910189-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 51 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0640453fce54..d764b5854dfc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4764,8 +4764,8 @@ static void tcp_ofo_queue(struct sock *sk) } } -static bool tcp_prune_ofo_queue(struct sock *sk); -static int tcp_prune_queue(struct sock *sk); +static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb); +static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb); static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, unsigned int size) @@ -4773,11 +4773,11 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || !sk_rmem_schedule(sk, skb, size)) { - if (tcp_prune_queue(sk) < 0) + if (tcp_prune_queue(sk, skb) < 0) return -1; while (!sk_rmem_schedule(sk, skb, size)) { - if (!tcp_prune_ofo_queue(sk)) + if (!tcp_prune_ofo_queue(sk, skb)) return -1; } } @@ -5329,6 +5329,8 @@ new_range: * Clean the out-of-order queue to make room. * We drop high sequences packets to : * 1) Let a chance for holes to be filled. + * This means we do not drop packets from ooo queue if their sequence + * is before incoming packet sequence. * 2) not add too big latencies if thousands of packets sit there. * (But if application shrinks SO_RCVBUF, we could still end up * freeing whole queue here) @@ -5336,24 +5338,31 @@ new_range: * * Return true if queue has shrunk. */ -static bool tcp_prune_ofo_queue(struct sock *sk) +static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb) { struct tcp_sock *tp = tcp_sk(sk); struct rb_node *node, *prev; + bool pruned = false; int goal; if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) return false; - NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); goal = sk->sk_rcvbuf >> 3; node = &tp->ooo_last_skb->rbnode; + do { + struct sk_buff *skb = rb_to_skb(node); + + /* If incoming skb would land last in ofo queue, stop pruning. */ + if (after(TCP_SKB_CB(in_skb)->seq, TCP_SKB_CB(skb)->seq)) + break; + pruned = true; prev = rb_prev(node); rb_erase(node, &tp->out_of_order_queue); - goal -= rb_to_skb(node)->truesize; - tcp_drop_reason(sk, rb_to_skb(node), - SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE); + goal -= skb->truesize; + tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE); + tp->ooo_last_skb = rb_to_skb(prev); if (!prev || goal <= 0) { if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !tcp_under_memory_pressure(sk)) @@ -5362,16 +5371,18 @@ static bool tcp_prune_ofo_queue(struct sock *sk) } node = prev; } while (node); - tp->ooo_last_skb = rb_to_skb(prev); - /* Reset SACK state. A conforming SACK implementation will - * do the same at a timeout based retransmit. When a connection - * is in a sad state like this, we care only about integrity - * of the connection not performance. - */ - if (tp->rx_opt.sack_ok) - tcp_sack_reset(&tp->rx_opt); - return true; + if (pruned) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); + /* Reset SACK state. A conforming SACK implementation will + * do the same at a timeout based retransmit. When a connection + * is in a sad state like this, we care only about integrity + * of the connection not performance. + */ + if (tp->rx_opt.sack_ok) + tcp_sack_reset(&tp->rx_opt); + } + return pruned; } /* Reduce allocated memory if we can, trying to get @@ -5381,7 +5392,7 @@ static bool tcp_prune_ofo_queue(struct sock *sk) * until the socket owning process reads some of the data * to stabilize the situation. */ -static int tcp_prune_queue(struct sock *sk) +static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -5408,7 +5419,7 @@ static int tcp_prune_queue(struct sock *sk) /* Collapsing did not help, destructive actions follow. * This must not ever occur. */ - tcp_prune_ofo_queue(sk); + tcp_prune_ofo_queue(sk, in_skb); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; -- cgit v1.2.3-70-g09d2 From 7e8cdc97148c6ba66671e88ad9f7d434f4df3438 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Sun, 30 Oct 2022 16:03:37 +0100 Subject: nfc: Add KCOV annotations Add remote KCOV annotations for NFC processing that is done in background threads. This enables efficient coverage-guided fuzzing of the NFC subsystem. The intention is to add annotations to background threads that process skb's that were allocated in syscall context (thus have a KCOV handle associated with the current fuzz test). This includes nci_recv_frame() that is called by the virtual nci driver in the syscall context. Signed-off-by: Dmitry Vyukov Cc: Bongsu Jeon Cc: Krzysztof Kozlowski Cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- net/nfc/nci/core.c | 8 +++++++- net/nfc/nci/hci.c | 4 +++- net/nfc/rawsock.c | 3 +++ 3 files changed, 13 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 6a193cce2a75..dbe5258e13ff 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -24,6 +24,7 @@ #include #include #include +#include #include "../nfc.h" #include @@ -1472,6 +1473,7 @@ static void nci_tx_work(struct work_struct *work) skb = skb_dequeue(&ndev->tx_q); if (!skb) return; + kcov_remote_start_common(skb_get_kcov_handle(skb)); /* Check if data flow control is used */ if (atomic_read(&conn_info->credits_cnt) != @@ -1487,6 +1489,7 @@ static void nci_tx_work(struct work_struct *work) mod_timer(&ndev->data_timer, jiffies + msecs_to_jiffies(NCI_DATA_TIMEOUT)); + kcov_remote_stop(); } } @@ -1497,7 +1500,8 @@ static void nci_rx_work(struct work_struct *work) struct nci_dev *ndev = container_of(work, struct nci_dev, rx_work); struct sk_buff *skb; - while ((skb = skb_dequeue(&ndev->rx_q))) { + for (; (skb = skb_dequeue(&ndev->rx_q)); kcov_remote_stop()) { + kcov_remote_start_common(skb_get_kcov_handle(skb)); /* Send copy to sniffer */ nfc_send_to_raw_sock(ndev->nfc_dev, skb, @@ -1551,6 +1555,7 @@ static void nci_cmd_work(struct work_struct *work) if (!skb) return; + kcov_remote_start_common(skb_get_kcov_handle(skb)); atomic_dec(&ndev->cmd_cnt); pr_debug("NCI TX: MT=cmd, PBF=%d, GID=0x%x, OID=0x%x, plen=%d\n", @@ -1563,6 +1568,7 @@ static void nci_cmd_work(struct work_struct *work) mod_timer(&ndev->cmd_timer, jiffies + msecs_to_jiffies(NCI_CMD_TIMEOUT)); + kcov_remote_stop(); } } diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index 78c4b6addf15..de175318a3a0 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -14,6 +14,7 @@ #include #include #include +#include struct nci_data { u8 conn_id; @@ -409,7 +410,8 @@ static void nci_hci_msg_rx_work(struct work_struct *work) const struct nci_hcp_message *message; u8 pipe, type, instruction; - while ((skb = skb_dequeue(&hdev->msg_rx_queue)) != NULL) { + for (; (skb = skb_dequeue(&hdev->msg_rx_queue)); kcov_remote_stop()) { + kcov_remote_start_common(skb_get_kcov_handle(skb)); pipe = NCI_HCP_MSG_GET_PIPE(skb->data[0]); skb_pull(skb, NCI_HCI_HCP_PACKET_HEADER_LEN); message = (struct nci_hcp_message *)skb->data; diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 8dd569765f96..5125392bb68e 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "nfc.h" @@ -189,6 +190,7 @@ static void rawsock_tx_work(struct work_struct *work) } skb = skb_dequeue(&sk->sk_write_queue); + kcov_remote_start_common(skb_get_kcov_handle(skb)); sock_hold(sk); rc = nfc_data_exchange(dev, target_idx, skb, @@ -197,6 +199,7 @@ static void rawsock_tx_work(struct work_struct *work) rawsock_report_error(sk, rc); sock_put(sk); } + kcov_remote_stop(); } static int rawsock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) -- cgit v1.2.3-70-g09d2 From ec32c0c42d0a7208571c4c77f140bffaa4e5e304 Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Tue, 1 Nov 2022 10:48:29 +0100 Subject: net: dcb: add new pcp selector to app object Add new PCP selector for the 8021Qaz APP managed object. As the PCP selector is not part of the 8021Qaz standard, a new non-std extension attribute DCB_ATTR_DCB_APP has been introduced. Also two helper functions to translate between selector and app attribute type has been added. The new selector has been given a value of 255, to minimize the risk of future overlap of std- and non-std attributes. The new DCB_ATTR_DCB_APP is sent alongside the ieee std attribute in the app table. This means that the dcb_app struct can now both contain std- and non-std app attributes. Currently there is no overlap between the selector values of the two attributes. The purpose of adding the PCP selector, is to be able to offload PCP-based queue classification to the 8021Q Priority Code Point table, see 6.9.3 of IEEE Std 802.1Q-2018. PCP and DEI is encoded in the protocol field as 8*dei+pcp, so that a mapping of PCP 2 and DEI 1 to priority 3 is encoded as {255, 10, 3}. Signed-off-by: Daniel Machon Reviewed-by: Petr Machata Signed-off-by: Paolo Abeni --- include/uapi/linux/dcbnl.h | 6 +++++ net/dcb/dcbnl.c | 58 ++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 60 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h index a791a94013a6..dc7ef96207ca 100644 --- a/include/uapi/linux/dcbnl.h +++ b/include/uapi/linux/dcbnl.h @@ -218,6 +218,9 @@ struct cee_pfc { #define IEEE_8021QAZ_APP_SEL_ANY 4 #define IEEE_8021QAZ_APP_SEL_DSCP 5 +/* Non-std selector values */ +#define DCB_APP_SEL_PCP 255 + /* This structure contains the IEEE 802.1Qaz APP managed object. This * object is also used for the CEE std as well. * @@ -247,6 +250,8 @@ struct dcb_app { __u16 protocol; }; +#define IEEE_8021QAZ_APP_SEL_MAX 255 + /** * struct dcb_peer_app_info - APP feature information sent by the peer * @@ -425,6 +430,7 @@ enum ieee_attrs { enum ieee_attrs_app { DCB_ATTR_IEEE_APP_UNSPEC, DCB_ATTR_IEEE_APP, + DCB_ATTR_DCB_APP, __DCB_ATTR_IEEE_APP_MAX }; #define DCB_ATTR_IEEE_APP_MAX (__DCB_ATTR_IEEE_APP_MAX - 1) diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index dc4fb699b56c..0c17bb28ae60 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -179,6 +179,38 @@ static const struct nla_policy dcbnl_featcfg_nest[DCB_FEATCFG_ATTR_MAX + 1] = { static LIST_HEAD(dcb_app_list); static DEFINE_SPINLOCK(dcb_lock); +static enum ieee_attrs_app dcbnl_app_attr_type_get(u8 selector) +{ + switch (selector) { + case IEEE_8021QAZ_APP_SEL_ETHERTYPE: + case IEEE_8021QAZ_APP_SEL_STREAM: + case IEEE_8021QAZ_APP_SEL_DGRAM: + case IEEE_8021QAZ_APP_SEL_ANY: + case IEEE_8021QAZ_APP_SEL_DSCP: + return DCB_ATTR_IEEE_APP; + case DCB_APP_SEL_PCP: + return DCB_ATTR_DCB_APP; + default: + return DCB_ATTR_IEEE_APP_UNSPEC; + } +} + +static bool dcbnl_app_attr_type_validate(enum ieee_attrs_app type) +{ + switch (type) { + case DCB_ATTR_IEEE_APP: + case DCB_ATTR_DCB_APP: + return true; + default: + return false; + } +} + +static bool dcbnl_app_selector_validate(enum ieee_attrs_app type, u8 selector) +{ + return dcbnl_app_attr_type_get(selector) == type; +} + static struct sk_buff *dcbnl_newmsg(int type, u8 cmd, u32 port, u32 seq, u32 flags, struct nlmsghdr **nlhp) { @@ -1116,8 +1148,9 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == netdev->ifindex) { - err = nla_put(skb, DCB_ATTR_IEEE_APP, sizeof(itr->app), - &itr->app); + enum ieee_attrs_app type = + dcbnl_app_attr_type_get(itr->app.selector); + err = nla_put(skb, type, sizeof(itr->app), &itr->app); if (err) { spin_unlock_bh(&dcb_lock); return -EMSGSIZE; @@ -1493,9 +1526,10 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh, int rem; nla_for_each_nested(attr, ieee[DCB_ATTR_IEEE_APP_TABLE], rem) { + enum ieee_attrs_app type = nla_type(attr); struct dcb_app *app_data; - if (nla_type(attr) != DCB_ATTR_IEEE_APP) + if (!dcbnl_app_attr_type_validate(type)) continue; if (nla_len(attr) < sizeof(struct dcb_app)) { @@ -1504,6 +1538,13 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh, } app_data = nla_data(attr); + + if (!dcbnl_app_selector_validate(type, + app_data->selector)) { + err = -EINVAL; + goto err; + } + if (ops->ieee_setapp) err = ops->ieee_setapp(netdev, app_data); else @@ -1554,11 +1595,20 @@ static int dcbnl_ieee_del(struct net_device *netdev, struct nlmsghdr *nlh, int rem; nla_for_each_nested(attr, ieee[DCB_ATTR_IEEE_APP_TABLE], rem) { + enum ieee_attrs_app type = nla_type(attr); struct dcb_app *app_data; - if (nla_type(attr) != DCB_ATTR_IEEE_APP) + if (!dcbnl_app_attr_type_validate(type)) continue; + app_data = nla_data(attr); + + if (!dcbnl_app_selector_validate(type, + app_data->selector)) { + err = -EINVAL; + goto err; + } + if (ops->ieee_delapp) err = ops->ieee_delapp(netdev, app_data); else -- cgit v1.2.3-70-g09d2 From 6182d5875c330a5a611687caa05f47752455720c Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Tue, 1 Nov 2022 10:48:30 +0100 Subject: net: dcb: add new apptrust attribute Add new apptrust extension attributes to the 8021Qaz APP managed object. Two new attributes, DCB_ATTR_DCB_APP_TRUST_TABLE and DCB_ATTR_DCB_APP_TRUST, has been added. Trusted selectors are passed in the nested attribute DCB_ATTR_DCB_APP_TRUST, in order of precedence. The new attributes are meant to allow drivers, whose hw supports the notion of trust, to be able to set whether a particular app selector is trusted - and in which order. Signed-off-by: Daniel Machon Reviewed-by: Petr Machata Signed-off-by: Paolo Abeni --- include/net/dcbnl.h | 4 +++ include/uapi/linux/dcbnl.h | 2 ++ net/dcb/dcbnl.c | 76 ++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 80 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/dcbnl.h b/include/net/dcbnl.h index 2b2d86fb3131..8841ab6c2de7 100644 --- a/include/net/dcbnl.h +++ b/include/net/dcbnl.h @@ -109,6 +109,10 @@ struct dcbnl_rtnl_ops { /* buffer settings */ int (*dcbnl_getbuffer)(struct net_device *, struct dcbnl_buffer *); int (*dcbnl_setbuffer)(struct net_device *, struct dcbnl_buffer *); + + /* apptrust */ + int (*dcbnl_setapptrust)(struct net_device *, u8 *, int); + int (*dcbnl_getapptrust)(struct net_device *, u8 *, int *); }; #endif /* __NET_DCBNL_H__ */ diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h index dc7ef96207ca..99047223ab26 100644 --- a/include/uapi/linux/dcbnl.h +++ b/include/uapi/linux/dcbnl.h @@ -410,6 +410,7 @@ enum dcbnl_attrs { * @DCB_ATTR_IEEE_PEER_ETS: peer ETS configuration - get only * @DCB_ATTR_IEEE_PEER_PFC: peer PFC configuration - get only * @DCB_ATTR_IEEE_PEER_APP: peer APP tlv - get only + * @DCB_ATTR_DCB_APP_TRUST_TABLE: selector trust table */ enum ieee_attrs { DCB_ATTR_IEEE_UNSPEC, @@ -423,6 +424,7 @@ enum ieee_attrs { DCB_ATTR_IEEE_QCN, DCB_ATTR_IEEE_QCN_STATS, DCB_ATTR_DCB_BUFFER, + DCB_ATTR_DCB_APP_TRUST_TABLE, __DCB_ATTR_IEEE_MAX }; #define DCB_ATTR_IEEE_MAX (__DCB_ATTR_IEEE_MAX - 1) diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 0c17bb28ae60..cec0632f96db 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -166,6 +166,7 @@ static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = { [DCB_ATTR_IEEE_QCN] = {.len = sizeof(struct ieee_qcn)}, [DCB_ATTR_IEEE_QCN_STATS] = {.len = sizeof(struct ieee_qcn_stats)}, [DCB_ATTR_DCB_BUFFER] = {.len = sizeof(struct dcbnl_buffer)}, + [DCB_ATTR_DCB_APP_TRUST_TABLE] = {.type = NLA_NESTED}, }; /* DCB number of traffic classes nested attributes. */ @@ -1062,9 +1063,9 @@ nla_put_failure: /* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb GET commands. */ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) { - struct nlattr *ieee, *app; - struct dcb_app_type *itr; const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; + struct nlattr *ieee, *app, *apptrust; + struct dcb_app_type *itr; int dcbx; int err; @@ -1166,6 +1167,30 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) spin_unlock_bh(&dcb_lock); nla_nest_end(skb, app); + if (ops->dcbnl_getapptrust) { + u8 selectors[IEEE_8021QAZ_APP_SEL_MAX + 1] = {0}; + int nselectors, i; + + apptrust = nla_nest_start(skb, DCB_ATTR_DCB_APP_TRUST_TABLE); + if (!apptrust) + return -EMSGSIZE; + + err = ops->dcbnl_getapptrust(netdev, selectors, &nselectors); + if (!err) { + for (i = 0; i < nselectors; i++) { + enum ieee_attrs_app type = + dcbnl_app_attr_type_get(selectors[i]); + err = nla_put_u8(skb, type, selectors[i]); + if (err) { + nla_nest_cancel(skb, apptrust); + return err; + } + } + } + + nla_nest_end(skb, apptrust); + } + /* get peer info if available */ if (ops->ieee_peer_getets) { struct ieee_ets ets; @@ -1554,6 +1579,53 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh, } } + if (ieee[DCB_ATTR_DCB_APP_TRUST_TABLE]) { + u8 selectors[IEEE_8021QAZ_APP_SEL_MAX + 1] = {0}; + struct nlattr *attr; + int nselectors = 0; + int rem; + + if (!ops->dcbnl_setapptrust) { + err = -EOPNOTSUPP; + goto err; + } + + nla_for_each_nested(attr, ieee[DCB_ATTR_DCB_APP_TRUST_TABLE], + rem) { + enum ieee_attrs_app type = nla_type(attr); + u8 selector; + int i; + + if (!dcbnl_app_attr_type_validate(type) || + nla_len(attr) != 1 || + nselectors >= sizeof(selectors)) { + err = -EINVAL; + goto err; + } + + selector = nla_get_u8(attr); + + if (!dcbnl_app_selector_validate(type, selector)) { + err = -EINVAL; + goto err; + } + + /* Duplicate selector ? */ + for (i = 0; i < nselectors; i++) { + if (selectors[i] == selector) { + err = -EINVAL; + goto err; + } + } + + selectors[nselectors++] = selector; + } + + err = ops->dcbnl_setapptrust(netdev, selectors, nselectors); + if (err) + goto err; + } + err: err = nla_put_u8(skb, DCB_ATTR_IEEE, err); dcbnl_ieee_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_SET, seq, 0); -- cgit v1.2.3-70-g09d2 From 07ec7b502800ba9f7b8b15cb01dd6556bb41aaca Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 27 Oct 2022 15:55:37 -0700 Subject: bpf: make sure skb->len != 0 when redirecting to a tunneling device syzkaller managed to trigger another case where skb->len == 0 when we enter __dev_queue_xmit: WARNING: CPU: 0 PID: 2470 at include/linux/skbuff.h:2576 skb_assert_len include/linux/skbuff.h:2576 [inline] WARNING: CPU: 0 PID: 2470 at include/linux/skbuff.h:2576 __dev_queue_xmit+0x2069/0x35e0 net/core/dev.c:4295 Call Trace: dev_queue_xmit+0x17/0x20 net/core/dev.c:4406 __bpf_tx_skb net/core/filter.c:2115 [inline] __bpf_redirect_no_mac net/core/filter.c:2140 [inline] __bpf_redirect+0x5fb/0xda0 net/core/filter.c:2163 ____bpf_clone_redirect net/core/filter.c:2447 [inline] bpf_clone_redirect+0x247/0x390 net/core/filter.c:2419 bpf_prog_48159a89cb4a9a16+0x59/0x5e bpf_dispatcher_nop_func include/linux/bpf.h:897 [inline] __bpf_prog_run include/linux/filter.h:596 [inline] bpf_prog_run include/linux/filter.h:603 [inline] bpf_test_run+0x46c/0x890 net/bpf/test_run.c:402 bpf_prog_test_run_skb+0xbdc/0x14c0 net/bpf/test_run.c:1170 bpf_prog_test_run+0x345/0x3c0 kernel/bpf/syscall.c:3648 __sys_bpf+0x43a/0x6c0 kernel/bpf/syscall.c:5005 __do_sys_bpf kernel/bpf/syscall.c:5091 [inline] __se_sys_bpf kernel/bpf/syscall.c:5089 [inline] __x64_sys_bpf+0x7c/0x90 kernel/bpf/syscall.c:5089 do_syscall_64+0x54/0x70 arch/x86/entry/common.c:48 entry_SYSCALL_64_after_hwframe+0x61/0xc6 The reproducer doesn't really reproduce outside of syzkaller environment, so I'm taking a guess here. It looks like we do generate correct ETH_HLEN-sized packet, but we redirect the packet to the tunneling device. Before we do so, we __skb_pull l2 header and arrive again at skb->len == 0. Doesn't seem like we can do anything better than having an explicit check after __skb_pull? Cc: Eric Dumazet Reported-by: syzbot+f635e86ec3fa0a37e019@syzkaller.appspotmail.com Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20221027225537.353077-1-sdf@google.com Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index bb0136e7a8e4..cb3b635e35be 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2126,6 +2126,10 @@ static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, if (mlen) { __skb_pull(skb, mlen); + if (unlikely(!skb->len)) { + kfree_skb(skb); + return -ERANGE; + } /* At ingress, the mac header has already been pulled once. * At egress, skb_pospull_rcsum has to be done in case that -- cgit v1.2.3-70-g09d2 From a35ec8e38cdd1766f29924ca391a01de20163931 Mon Sep 17 00:00:00 2001 From: "Hans J. Schultz" Date: Tue, 1 Nov 2022 21:39:21 +0200 Subject: bridge: Add MAC Authentication Bypass (MAB) support Hosts that support 802.1X authentication are able to authenticate themselves by exchanging EAPOL frames with an authenticator (Ethernet bridge, in this case) and an authentication server. Access to the network is only granted by the authenticator to successfully authenticated hosts. The above is implemented in the bridge using the "locked" bridge port option. When enabled, link-local frames (e.g., EAPOL) can be locally received by the bridge, but all other frames are dropped unless the host is authenticated. That is, unless the user space control plane installed an FDB entry according to which the source address of the frame is located behind the locked ingress port. The entry can be dynamic, in which case learning needs to be enabled so that the entry will be refreshed by incoming traffic. There are deployments in which not all the devices connected to the authenticator (the bridge) support 802.1X. Such devices can include printers and cameras. One option to support such deployments is to unlock the bridge ports connecting these devices, but a slightly more secure option is to use MAB. When MAB is enabled, the MAC address of the connected device is used as the user name and password for the authentication. For MAB to work, the user space control plane needs to be notified about MAC addresses that are trying to gain access so that they will be compared against an allow list. This can be implemented via the regular learning process with the sole difference that learned FDB entries are installed with a new "locked" flag indicating that the entry cannot be used to authenticate the device. The flag cannot be set by user space, but user space can clear the flag by replacing the entry, thereby authenticating the device. Locked FDB entries implement the following semantics with regards to roaming, aging and forwarding: 1. Roaming: Locked FDB entries can roam to unlocked (authorized) ports, in which case the "locked" flag is cleared. FDB entries cannot roam to locked ports regardless of MAB being enabled or not. Therefore, locked FDB entries are only created if an FDB entry with the given {MAC, VID} does not already exist. This behavior prevents unauthenticated devices from disrupting traffic destined to already authenticated devices. 2. Aging: Locked FDB entries age and refresh by incoming traffic like regular entries. 3. Forwarding: Locked FDB entries forward traffic like regular entries. If user space detects an unauthorized MAC behind a locked port and wishes to prevent traffic with this MAC DA from reaching the host, it can do so using tc or a different mechanism. Enable the above behavior using a new bridge port option called "mab". It can only be enabled on a bridge port that is both locked and has learning enabled. Locked FDB entries are flushed from the port once MAB is disabled. A new option is added because there are pure 802.1X deployments that are not interested in notifications about locked FDB entries. Signed-off-by: Hans J. Schultz Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/linux/if_bridge.h | 1 + include/uapi/linux/if_link.h | 1 + include/uapi/linux/neighbour.h | 8 +++++++- net/bridge/br_fdb.c | 24 ++++++++++++++++++++++++ net/bridge/br_input.c | 21 +++++++++++++++++++-- net/bridge/br_netlink.c | 21 ++++++++++++++++++++- net/bridge/br_private.h | 3 ++- net/core/rtnetlink.c | 5 +++++ 8 files changed, 79 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index d62ef428e3aa..1668ac4d7adc 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -59,6 +59,7 @@ struct br_ip_list { #define BR_MRP_LOST_IN_CONT BIT(19) #define BR_TX_FWD_OFFLOAD BIT(20) #define BR_PORT_LOCKED BIT(21) +#define BR_PORT_MAB BIT(22) #define BR_DEFAULT_AGEING_TIME (300 * HZ) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 5e7a1041df3a..d92b3f79eba3 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -561,6 +561,7 @@ enum { IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, IFLA_BRPORT_LOCKED, + IFLA_BRPORT_MAB, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index a998bf761635..5e67a7eaf4a7 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -52,7 +52,8 @@ enum { #define NTF_STICKY (1 << 6) #define NTF_ROUTER (1 << 7) /* Extended flags under NDA_FLAGS_EXT: */ -#define NTF_EXT_MANAGED (1 << 0) +#define NTF_EXT_MANAGED (1 << 0) +#define NTF_EXT_LOCKED (1 << 1) /* * Neighbor Cache Entry States. @@ -86,6 +87,11 @@ enum { * NTF_EXT_MANAGED flagged neigbor entries are managed by the kernel on behalf * of a user space control plane, and automatically refreshed so that (if * possible) they remain in NUD_REACHABLE state. + * + * NTF_EXT_LOCKED flagged bridge FDB entries are entries generated by the + * bridge in response to a host trying to communicate via a locked bridge port + * with MAB enabled. Their purpose is to notify user space that a host requires + * authentication. */ struct nda_cacheinfo { diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index e7f4fccb6adb..3b83af4458b8 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -105,6 +105,7 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, struct nda_cacheinfo ci; struct nlmsghdr *nlh; struct ndmsg *ndm; + u32 ext_flags = 0; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) @@ -125,11 +126,16 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, ndm->ndm_flags |= NTF_EXT_LEARNED; if (test_bit(BR_FDB_STICKY, &fdb->flags)) ndm->ndm_flags |= NTF_STICKY; + if (test_bit(BR_FDB_LOCKED, &fdb->flags)) + ext_flags |= NTF_EXT_LOCKED; if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr)) goto nla_put_failure; if (nla_put_u32(skb, NDA_MASTER, br->dev->ifindex)) goto nla_put_failure; + if (nla_put_u32(skb, NDA_FLAGS_EXT, ext_flags)) + goto nla_put_failure; + ci.ndm_used = jiffies_to_clock_t(now - fdb->used); ci.ndm_confirmed = 0; ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated); @@ -171,6 +177,7 @@ static inline size_t fdb_nlmsg_size(void) return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ + nla_total_size(sizeof(u32)) /* NDA_MASTER */ + + nla_total_size(sizeof(u32)) /* NDA_FLAGS_EXT */ + nla_total_size(sizeof(u16)) /* NDA_VLAN */ + nla_total_size(sizeof(struct nda_cacheinfo)) + nla_total_size(0) /* NDA_FDB_EXT_ATTRS */ @@ -879,6 +886,11 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, &fdb->flags))) clear_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags); + /* Clear locked flag when roaming to an + * unlocked port. + */ + if (unlikely(test_bit(BR_FDB_LOCKED, &fdb->flags))) + clear_bit(BR_FDB_LOCKED, &fdb->flags); } if (unlikely(test_bit(BR_FDB_ADDED_BY_USER, &flags))) @@ -1082,6 +1094,9 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, modified = true; } + if (test_and_clear_bit(BR_FDB_LOCKED, &fdb->flags)) + modified = true; + if (fdb_handle_notify(fdb, notify)) modified = true; @@ -1150,6 +1165,7 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_bridge_port *p = NULL; struct net_bridge_vlan *v; struct net_bridge *br = NULL; + u32 ext_flags = 0; int err = 0; trace_br_fdb_add(ndm, dev, addr, vid, nlh_flags); @@ -1178,6 +1194,14 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], vg = nbp_vlan_group(p); } + if (tb[NDA_FLAGS_EXT]) + ext_flags = nla_get_u32(tb[NDA_FLAGS_EXT]); + + if (ext_flags & NTF_EXT_LOCKED) { + NL_SET_ERR_MSG_MOD(extack, "Cannot add FDB entry with \"locked\" flag set"); + return -EINVAL; + } + if (tb[NDA_FDB_EXT_ATTRS]) { attr = tb[NDA_FDB_EXT_ATTRS]; err = nla_parse_nested(nfea_tb, NFEA_MAX, attr, diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 68b3e850bcb9..d04d2205ad4e 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -109,9 +109,26 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb struct net_bridge_fdb_entry *fdb_src = br_fdb_find_rcu(br, eth_hdr(skb)->h_source, vid); - if (!fdb_src || READ_ONCE(fdb_src->dst) != p || - test_bit(BR_FDB_LOCAL, &fdb_src->flags)) + if (!fdb_src) { + /* FDB miss. Create locked FDB entry if MAB is enabled + * and drop the packet. + */ + if (p->flags & BR_PORT_MAB) + br_fdb_update(br, p, eth_hdr(skb)->h_source, + vid, BIT(BR_FDB_LOCKED)); goto drop; + } else if (READ_ONCE(fdb_src->dst) != p || + test_bit(BR_FDB_LOCAL, &fdb_src->flags)) { + /* FDB mismatch. Drop the packet without roaming. */ + goto drop; + } else if test_bit(BR_FDB_LOCKED, &fdb_src->flags) { + /* FDB match, but entry is locked. Refresh it and drop + * the packet. + */ + br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, + BIT(BR_FDB_LOCKED)); + goto drop; + } } nbp_switchdev_frame_mark(p, skb); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index d087fd4c784a..4316cc82ae17 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -188,6 +188,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_NEIGH_SUPPRESS */ + nla_total_size(1) /* IFLA_BRPORT_ISOLATED */ + nla_total_size(1) /* IFLA_BRPORT_LOCKED */ + + nla_total_size(1) /* IFLA_BRPORT_MAB */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ @@ -274,7 +275,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u8(skb, IFLA_BRPORT_MRP_IN_OPEN, !!(p->flags & BR_MRP_LOST_IN_CONT)) || nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)) || - nla_put_u8(skb, IFLA_BRPORT_LOCKED, !!(p->flags & BR_PORT_LOCKED))) + nla_put_u8(skb, IFLA_BRPORT_LOCKED, !!(p->flags & BR_PORT_LOCKED)) || + nla_put_u8(skb, IFLA_BRPORT_MAB, !!(p->flags & BR_PORT_MAB))) return -EMSGSIZE; timerval = br_timer_value(&p->message_age_timer); @@ -876,6 +878,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 }, [IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 }, [IFLA_BRPORT_LOCKED] = { .type = NLA_U8 }, + [IFLA_BRPORT_MAB] = { .type = NLA_U8 }, [IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 }, [IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = { .type = NLA_U32 }, }; @@ -943,6 +946,22 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[], br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS, BR_NEIGH_SUPPRESS); br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED); br_set_port_flag(p, tb, IFLA_BRPORT_LOCKED, BR_PORT_LOCKED); + br_set_port_flag(p, tb, IFLA_BRPORT_MAB, BR_PORT_MAB); + + if ((p->flags & BR_PORT_MAB) && + (!(p->flags & BR_PORT_LOCKED) || !(p->flags & BR_LEARNING))) { + NL_SET_ERR_MSG(extack, "Bridge port must be locked and have learning enabled when MAB is enabled"); + p->flags = old_flags; + return -EINVAL; + } else if (!(p->flags & BR_PORT_MAB) && (old_flags & BR_PORT_MAB)) { + struct net_bridge_fdb_flush_desc desc = { + .flags = BIT(BR_FDB_LOCKED), + .flags_mask = BIT(BR_FDB_LOCKED), + .port_ifindex = p->dev->ifindex, + }; + + br_fdb_flush(p->br, &desc); + } changed_mask = old_flags ^ p->flags; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 06e5f6faa431..4ce8b8e5ae0b 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -251,7 +251,8 @@ enum { BR_FDB_ADDED_BY_EXT_LEARN, BR_FDB_OFFLOADED, BR_FDB_NOTIFY, - BR_FDB_NOTIFY_INACTIVE + BR_FDB_NOTIFY_INACTIVE, + BR_FDB_LOCKED, }; struct net_bridge_fdb_key { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d2f27548fc0b..b64fffeb3844 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4051,6 +4051,11 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm, return err; } + if (tb[NDA_FLAGS_EXT]) { + netdev_info(dev, "invalid flags given to default FDB implementation\n"); + return err; + } + if (vid) { netdev_info(dev, "vlans aren't supported yet for dev_uc|mc_add()\n"); return err; -- cgit v1.2.3-70-g09d2 From 3830c5719af66fac9849cf5fb04b03d4e4bb46ff Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:01:59 +0100 Subject: net: devlink: convert devlink port type-specific pointers to union Instead of storing type_dev as a void pointer, convert it to union and use it to store either struct net_device or struct ib_device pointer. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 13 ++++++++++--- net/core/devlink.c | 17 +++++++++++++---- 2 files changed, 23 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index ba6b8b094943..6c55aabaedf1 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -121,12 +121,19 @@ struct devlink_port { struct list_head region_list; struct devlink *devlink; unsigned int index; - spinlock_t type_lock; /* Protects type and type_dev - * pointer consistency. + spinlock_t type_lock; /* Protects type and type_eth/ib + * structures consistency. */ enum devlink_port_type type; enum devlink_port_type desired_type; - void *type_dev; + union { + struct { + struct net_device *netdev; + } type_eth; + struct { + struct ib_device *ibdev; + } type_ib; + }; struct devlink_port_attrs attrs; u8 attrs_set:1, switch_port:1, diff --git a/net/core/devlink.c b/net/core/devlink.c index 0a16ad45520e..868d04c2164f 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1303,7 +1303,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, goto nla_put_failure_type_locked; if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) { struct net *net = devlink_net(devlink_port->devlink); - struct net_device *netdev = devlink_port->type_dev; + struct net_device *netdev = devlink_port->type_eth.netdev; if (netdev && net_eq(net, dev_net(netdev)) && (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX, @@ -1313,7 +1313,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, goto nla_put_failure_type_locked; } if (devlink_port->type == DEVLINK_PORT_TYPE_IB) { - struct ib_device *ibdev = devlink_port->type_dev; + struct ib_device *ibdev = devlink_port->type_ib.ibdev; if (ibdev && nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME, @@ -10003,7 +10003,16 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port, devlink_port_type_warn_cancel(devlink_port); spin_lock_bh(&devlink_port->type_lock); devlink_port->type = type; - devlink_port->type_dev = type_dev; + switch (type) { + case DEVLINK_PORT_TYPE_ETH: + devlink_port->type_eth.netdev = type_dev; + break; + case DEVLINK_PORT_TYPE_IB: + devlink_port->type_ib.ibdev = type_dev; + break; + default: + break; + } spin_unlock_bh(&devlink_port->type_lock); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); } @@ -12016,7 +12025,7 @@ devlink_trap_report_metadata_set(struct devlink_trap_metadata *metadata, spin_lock(&in_devlink_port->type_lock); if (in_devlink_port->type == DEVLINK_PORT_TYPE_ETH) - metadata->input_dev = in_devlink_port->type_dev; + metadata->input_dev = in_devlink_port->type_eth.netdev; spin_unlock(&in_devlink_port->type_lock); } -- cgit v1.2.3-70-g09d2 From 8573a04404ddacb2d966eef09bf38b2ad6dbe86f Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:00 +0100 Subject: net: devlink: move port_type_warn_schedule() call to __devlink_port_type_set() As __devlink_port_type_set() is going to be called directly from netdevice notifier event handle in one of the follow-up patches, move the port_type_warn_schedule() call there. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 868d04c2164f..3ba3435e2cd5 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -10000,7 +10000,11 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port, { ASSERT_DEVLINK_PORT_REGISTERED(devlink_port); - devlink_port_type_warn_cancel(devlink_port); + if (type == DEVLINK_PORT_TYPE_NOTSET) + devlink_port_type_warn_schedule(devlink_port); + else + devlink_port_type_warn_cancel(devlink_port); + spin_lock_bh(&devlink_port->type_lock); devlink_port->type = type; switch (type) { @@ -10095,7 +10099,6 @@ EXPORT_SYMBOL_GPL(devlink_port_type_ib_set); void devlink_port_type_clear(struct devlink_port *devlink_port) { __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL); - devlink_port_type_warn_schedule(devlink_port); } EXPORT_SYMBOL_GPL(devlink_port_type_clear); -- cgit v1.2.3-70-g09d2 From 45791e0d00c445936bb19535fe847083b1edd26d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:01 +0100 Subject: net: devlink: move port_type_netdev_checks() call to __devlink_port_type_set() As __devlink_port_type_set() is going to be called directly from netdevice notifier event handle in one of the follow-up patches, move the port_type_netdev_checks() call there. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 63 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 3ba3435e2cd5..ff81a5a5087c 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -9994,33 +9994,6 @@ void devlink_port_unregister(struct devlink_port *devlink_port) } EXPORT_SYMBOL_GPL(devlink_port_unregister); -static void __devlink_port_type_set(struct devlink_port *devlink_port, - enum devlink_port_type type, - void *type_dev) -{ - ASSERT_DEVLINK_PORT_REGISTERED(devlink_port); - - if (type == DEVLINK_PORT_TYPE_NOTSET) - devlink_port_type_warn_schedule(devlink_port); - else - devlink_port_type_warn_cancel(devlink_port); - - spin_lock_bh(&devlink_port->type_lock); - devlink_port->type = type; - switch (type) { - case DEVLINK_PORT_TYPE_ETH: - devlink_port->type_eth.netdev = type_dev; - break; - case DEVLINK_PORT_TYPE_IB: - devlink_port->type_ib.ibdev = type_dev; - break; - default: - break; - } - spin_unlock_bh(&devlink_port->type_lock); - devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); -} - static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port, struct net_device *netdev) { @@ -10058,6 +10031,38 @@ static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port, } } +static void __devlink_port_type_set(struct devlink_port *devlink_port, + enum devlink_port_type type, + void *type_dev) +{ + struct net_device *netdev = type_dev; + + ASSERT_DEVLINK_PORT_REGISTERED(devlink_port); + + if (type == DEVLINK_PORT_TYPE_NOTSET) { + devlink_port_type_warn_schedule(devlink_port); + } else { + devlink_port_type_warn_cancel(devlink_port); + if (type == DEVLINK_PORT_TYPE_ETH && netdev) + devlink_port_type_netdev_checks(devlink_port, netdev); + } + + spin_lock_bh(&devlink_port->type_lock); + devlink_port->type = type; + switch (type) { + case DEVLINK_PORT_TYPE_ETH: + devlink_port->type_eth.netdev = netdev; + break; + case DEVLINK_PORT_TYPE_IB: + devlink_port->type_ib.ibdev = type_dev; + break; + default: + break; + } + spin_unlock_bh(&devlink_port->type_lock); + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); +} + /** * devlink_port_type_eth_set - Set port type to Ethernet * @@ -10067,9 +10072,7 @@ static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port, void devlink_port_type_eth_set(struct devlink_port *devlink_port, struct net_device *netdev) { - if (netdev) - devlink_port_type_netdev_checks(devlink_port, netdev); - else + if (!netdev) dev_warn(devlink_port->devlink->dev, "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n", devlink_port->index); -- cgit v1.2.3-70-g09d2 From d41c9dbd12745cfc1cb2946cd99016d83c2c5364 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:02 +0100 Subject: net: devlink: take RTNL in port_fill() function only if it is not held Follow-up patch is going to introduce a netdevice notifier event processing which is called with RTNL mutex held. Processing of this will eventually lead to call to port_notity() and port_fill() which currently takes RTNL mutex internally. So as a temporary solution, propagate a bool indicating if the mutex is already held. This will go away in one of the follow-up patches. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 46 +++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index ff81a5a5087c..3387dfbb80c5 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1278,7 +1278,8 @@ out: static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink_port *devlink_port, enum devlink_command cmd, u32 portid, u32 seq, - int flags, struct netlink_ext_ack *extack) + int flags, struct netlink_ext_ack *extack, + bool rtnl_held) { struct devlink *devlink = devlink_port->devlink; void *hdr; @@ -1293,7 +1294,8 @@ static int devlink_nl_port_fill(struct sk_buff *msg, goto nla_put_failure; /* Hold rtnl lock while accessing port's netdev attributes. */ - rtnl_lock(); + if (!rtnl_held) + rtnl_lock(); spin_lock_bh(&devlink_port->type_lock); if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type)) goto nla_put_failure_type_locked; @@ -1321,7 +1323,8 @@ static int devlink_nl_port_fill(struct sk_buff *msg, goto nla_put_failure_type_locked; } spin_unlock_bh(&devlink_port->type_lock); - rtnl_unlock(); + if (!rtnl_held) + rtnl_unlock(); if (devlink_nl_port_attrs_put(msg, devlink_port)) goto nla_put_failure; if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack)) @@ -1336,14 +1339,15 @@ static int devlink_nl_port_fill(struct sk_buff *msg, nla_put_failure_type_locked: spin_unlock_bh(&devlink_port->type_lock); - rtnl_unlock(); + if (!rtnl_held) + rtnl_unlock(); nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } -static void devlink_port_notify(struct devlink_port *devlink_port, - enum devlink_command cmd) +static void __devlink_port_notify(struct devlink_port *devlink_port, + enum devlink_command cmd, bool rtnl_held) { struct devlink *devlink = devlink_port->devlink; struct sk_buff *msg; @@ -1358,7 +1362,8 @@ static void devlink_port_notify(struct devlink_port *devlink_port, if (!msg) return; - err = devlink_nl_port_fill(msg, devlink_port, cmd, 0, 0, 0, NULL); + err = devlink_nl_port_fill(msg, devlink_port, cmd, 0, 0, 0, NULL, + rtnl_held); if (err) { nlmsg_free(msg); return; @@ -1368,6 +1373,12 @@ static void devlink_port_notify(struct devlink_port *devlink_port, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } +static void devlink_port_notify(struct devlink_port *devlink_port, + enum devlink_command cmd) +{ + __devlink_port_notify(devlink_port, cmd, false); +} + static void devlink_rate_notify(struct devlink_rate *devlink_rate, enum devlink_command cmd) { @@ -1531,7 +1542,7 @@ static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW, info->snd_portid, info->snd_seq, 0, - info->extack); + info->extack, false); if (err) { nlmsg_free(msg); return err; @@ -1561,7 +1572,8 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, DEVLINK_CMD_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI, cb->extack); + NLM_F_MULTI, cb->extack, + false); if (err) { devl_unlock(devlink); devlink_put(devlink); @@ -1773,7 +1785,8 @@ static int devlink_port_new_notify(struct devlink *devlink, } err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW, - info->snd_portid, info->snd_seq, 0, NULL); + info->snd_portid, info->snd_seq, 0, NULL, + false); if (err) goto out; @@ -10033,7 +10046,7 @@ static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port, static void __devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type type, - void *type_dev) + void *type_dev, bool rtnl_held) { struct net_device *netdev = type_dev; @@ -10060,7 +10073,7 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port, break; } spin_unlock_bh(&devlink_port->type_lock); - devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); + __devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW, rtnl_held); } /** @@ -10077,7 +10090,8 @@ void devlink_port_type_eth_set(struct devlink_port *devlink_port, "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n", devlink_port->index); - __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, netdev); + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, netdev, + false); } EXPORT_SYMBOL_GPL(devlink_port_type_eth_set); @@ -10090,7 +10104,8 @@ EXPORT_SYMBOL_GPL(devlink_port_type_eth_set); void devlink_port_type_ib_set(struct devlink_port *devlink_port, struct ib_device *ibdev) { - __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_IB, ibdev); + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_IB, ibdev, + false); } EXPORT_SYMBOL_GPL(devlink_port_type_ib_set); @@ -10101,7 +10116,8 @@ EXPORT_SYMBOL_GPL(devlink_port_type_ib_set); */ void devlink_port_type_clear(struct devlink_port *devlink_port) { - __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL); + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL, + false); } EXPORT_SYMBOL_GPL(devlink_port_type_clear); -- cgit v1.2.3-70-g09d2 From 02a68a47eadedf95748facfca6ced31fb0181d52 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:03 +0100 Subject: net: devlink: track netdev with devlink_port assigned Currently, ethernet drivers are using devlink_port_type_eth_set() and devlink_port_type_clear() to set devlink port type and link to related netdev. Instead of calling them directly, let the driver use SET_NETDEV_DEVLINK_PORT macro to assign devlink_port pointer and let devlink to track it. Note the devlink port pointer is static during the time netdevice is registered. In devlink code, use per-namespace netdev notifier to track the netdevices with devlink_port assigned and change the internal devlink_port type and related type pointer accordingly. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 19 ++++++++++++ net/core/dev.c | 14 +++++---- net/core/devlink.c | 75 ++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 99 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4b5052db978f..f048a30ea10b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1999,6 +1999,11 @@ enum netdev_ml_priv_type { * registered * @offload_xstats_l3: L3 HW stats for this netdevice. * + * @devlink_port: Pointer to related devlink port structure. + * Assigned by a driver before netdev registration using + * SET_NETDEV_DEVLINK_PORT macro. This pointer is static + * during the time netdevice is registered. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2349,9 +2354,22 @@ struct net_device { netdevice_tracker watchdog_dev_tracker; netdevice_tracker dev_registered_tracker; struct rtnl_hw_stats64 *offload_xstats_l3; + + struct devlink_port *devlink_port; }; #define to_net_dev(d) container_of(d, struct net_device, dev) +/* + * Driver should use this to assign devlink port instance to a netdevice + * before it registers the netdevice. Therefore devlink_port is static + * during the netdev lifetime after it is registered. + */ +#define SET_NETDEV_DEVLINK_PORT(dev, port) \ +({ \ + WARN_ON((dev)->reg_state != NETREG_UNINITIALIZED); \ + ((dev)->devlink_port = (port)); \ +}) + static inline bool netif_elide_gro(const struct net_device *dev) { if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog) @@ -2785,6 +2803,7 @@ enum netdev_cmd { NETDEV_PRE_TYPE_CHANGE, NETDEV_POST_TYPE_CHANGE, NETDEV_POST_INIT, + NETDEV_PRE_UNINIT, NETDEV_RELEASE, NETDEV_NOTIFY_PEERS, NETDEV_JOIN, diff --git a/net/core/dev.c b/net/core/dev.c index 2e4f1c97b59e..3bacee3bee78 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1621,10 +1621,10 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd) N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER) N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE) N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE) - N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER) - N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO) - N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO) - N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) + N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) + N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) + N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE) + N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN) N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO) N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO) N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE) @@ -10060,7 +10060,7 @@ int register_netdevice(struct net_device *dev) dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED; write_unlock(&dev_base_lock); if (ret) - goto err_uninit; + goto err_uninit_notify; __netdev_update_features(dev); @@ -10107,6 +10107,8 @@ int register_netdevice(struct net_device *dev) out: return ret; +err_uninit_notify: + call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); err_uninit: if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); @@ -10856,6 +10858,8 @@ void unregister_netdevice_many_notify(struct list_head *head, netdev_name_node_alt_flush(dev); netdev_name_node_free(dev->name_node); + call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev); + if (dev->netdev_ops->ndo_uninit) dev->netdev_ops->ndo_uninit(dev); diff --git a/net/core/devlink.c b/net/core/devlink.c index 3387dfbb80c5..6f06c05c7b1a 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -71,6 +71,7 @@ struct devlink { refcount_t refcount; struct completion comp; struct rcu_head rcu; + struct notifier_block netdevice_nb; char priv[] __aligned(NETDEV_ALIGN); }; @@ -9615,6 +9616,9 @@ void devlink_set_features(struct devlink *devlink, u64 features) } EXPORT_SYMBOL_GPL(devlink_set_features); +static int devlink_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr); + /** * devlink_alloc_ns - Allocate new devlink instance resources * in specific namespace @@ -9645,10 +9649,13 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b, &last_id, GFP_KERNEL); - if (ret < 0) { - kfree(devlink); - return NULL; - } + if (ret < 0) + goto err_xa_alloc; + + devlink->netdevice_nb.notifier_call = devlink_netdevice_event; + ret = register_netdevice_notifier_net(net, &devlink->netdevice_nb); + if (ret) + goto err_register_netdevice_notifier; devlink->dev = dev; devlink->ops = ops; @@ -9675,6 +9682,12 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, init_completion(&devlink->comp); return devlink; + +err_register_netdevice_notifier: + xa_erase(&devlinks, devlink->index); +err_xa_alloc: + kfree(devlink); + return NULL; } EXPORT_SYMBOL_GPL(devlink_alloc_ns); @@ -9828,6 +9841,10 @@ void devlink_free(struct devlink *devlink) WARN_ON(!list_empty(&devlink->port_list)); xa_destroy(&devlink->snapshot_ids); + + unregister_netdevice_notifier_net(devlink_net(devlink), + &devlink->netdevice_nb); + xa_erase(&devlinks, devlink->index); kfree(devlink); @@ -10121,6 +10138,56 @@ void devlink_port_type_clear(struct devlink_port *devlink_port) } EXPORT_SYMBOL_GPL(devlink_port_type_clear); +static int devlink_netdevice_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct devlink_port *devlink_port = netdev->devlink_port; + struct devlink *devlink; + + devlink = container_of(nb, struct devlink, netdevice_nb); + + if (!devlink_port || devlink_port->devlink != devlink) + return NOTIFY_OK; + + switch (event) { + case NETDEV_POST_INIT: + /* Set the type but not netdev pointer. It is going to be set + * later on by NETDEV_REGISTER event. Happens once during + * netdevice register + */ + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, + NULL, true); + break; + case NETDEV_REGISTER: + /* Set the netdev on top of previously set type. Note this + * event happens also during net namespace change so here + * we take into account netdev pointer appearing in this + * namespace. + */ + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, + netdev, true); + break; + case NETDEV_UNREGISTER: + /* Clear netdev pointer, but not the type. This event happens + * also during net namespace change so we need to clear + * pointer to netdev that is going to another net namespace. + */ + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, + NULL, true); + break; + case NETDEV_PRE_UNINIT: + /* Clear the type and the netdev pointer. Happens one during + * netdevice unregister. + */ + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, + NULL, true); + break; + } + + return NOTIFY_OK; +} + static int __devlink_port_attrs_set(struct devlink_port *devlink_port, enum devlink_port_flavour flavour) { -- cgit v1.2.3-70-g09d2 From ac73d4bf2cdaf2cb8a43df8ee4a5c066d2c5d7b4 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:04 +0100 Subject: net: make drivers to use SET_NETDEV_DEVLINK_PORT to set devlink_port Benefit from the previously implemented tracking of netdev events in devlink code and instead of calling devlink_port_type_eth_set() and devlink_port_type_clear() to set devlink port type and link to related netdev, use SET_NETDEV_DEVLINK_PORT() macro to assign devlink_port pointer to netdevice which is about to be registered. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 6 +--- .../ethernet/freescale/dpaa2/dpaa2-eth-devlink.c | 11 +------ drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 1 + drivers/net/ethernet/fungible/funeth/funeth_main.c | 5 +--- drivers/net/ethernet/intel/ice/ice_devlink.c | 14 ++------- drivers/net/ethernet/intel/ice/ice_main.c | 3 +- drivers/net/ethernet/intel/ice/ice_repr.c | 3 +- .../ethernet/marvell/prestera/prestera_devlink.c | 10 ------- .../ethernet/marvell/prestera/prestera_devlink.h | 3 -- .../net/ethernet/marvell/prestera/prestera_main.c | 4 +-- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 9 ++---- .../net/ethernet/mellanox/mlx5/core/en/devlink.c | 7 ----- .../net/ethernet/mellanox/mlx5/core/en/devlink.h | 1 - drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 3 +- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 34 +++++----------------- drivers/net/ethernet/mellanox/mlxsw/core.c | 20 +++---------- drivers/net/ethernet/mellanox/mlxsw/core.h | 7 ++--- drivers/net/ethernet/mellanox/mlxsw/minimal.c | 6 ++-- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 5 ++-- drivers/net/ethernet/mscc/ocelot_net.c | 1 + drivers/net/ethernet/mscc/ocelot_vsc7514.c | 9 ------ drivers/net/ethernet/netronome/nfp/nfp_devlink.c | 12 ++------ drivers/net/ethernet/netronome/nfp/nfp_net_main.c | 11 ++----- drivers/net/ethernet/netronome/nfp/nfp_port.h | 2 -- .../net/ethernet/pensando/ionic/ionic_devlink.c | 2 +- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 6 ++-- drivers/net/netdevsim/dev.c | 2 -- drivers/net/netdevsim/netdev.c | 1 + net/dsa/dsa2.c | 9 ------ net/dsa/slave.c | 1 + 30 files changed, 41 insertions(+), 167 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 04cf7684f1b0..9604294b67aa 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -13122,9 +13122,6 @@ static void bnxt_remove_one(struct pci_dev *pdev) if (BNXT_PF(bp)) bnxt_sriov_disable(bp); - if (BNXT_PF(bp)) - devlink_port_type_clear(&bp->dl_port); - bnxt_ptp_clear(bp); pci_disable_pcie_error_reporting(pdev); unregister_netdev(dev); @@ -13537,6 +13534,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) return -ENOMEM; bp = netdev_priv(dev); + SET_NETDEV_DEVLINK_PORT(dev, &bp->dl_port); bp->board_idx = ent->driver_data; bp->msg_enable = BNXT_DEF_MSG_ENABLE; bnxt_set_max_func_irqs(bp, max_irqs); @@ -13712,8 +13710,6 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (rc) goto init_err_cleanup; - if (BNXT_PF(bp)) - devlink_port_type_eth_set(&bp->dl_port, bp->dev); bnxt_dl_fw_reporters_create(bp); bnxt_print_device_info(bp); diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-devlink.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-devlink.c index 7fefe1574b6a..5c6dd3029e2f 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-devlink.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-devlink.c @@ -226,25 +226,16 @@ int dpaa2_eth_dl_port_add(struct dpaa2_eth_priv *priv) { struct devlink_port *devlink_port = &priv->devlink_port; struct devlink_port_attrs attrs = {}; - int err; attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; devlink_port_attrs_set(devlink_port, &attrs); - - err = devlink_port_register(priv->devlink, devlink_port, 0); - if (err) - return err; - - devlink_port_type_eth_set(devlink_port, priv->net_dev); - - return 0; + return devlink_port_register(priv->devlink, devlink_port, 0); } void dpaa2_eth_dl_port_del(struct dpaa2_eth_priv *priv) { struct devlink_port *devlink_port = &priv->devlink_port; - devlink_port_type_clear(devlink_port); devlink_port_unregister(devlink_port); } diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 892a1403c469..273f1d77c012 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -4791,6 +4791,7 @@ static int dpaa2_eth_probe(struct fsl_mc_device *dpni_dev) priv = netdev_priv(net_dev); priv->net_dev = net_dev; + SET_NETDEV_DEVLINK_PORT(net_dev, &priv->devlink_port); priv->iommu_domain = iommu_get_domain_for_dev(dev); diff --git a/drivers/net/ethernet/fungible/funeth/funeth_main.c b/drivers/net/ethernet/fungible/funeth/funeth_main.c index 095f51c4d9d9..208dc89f4972 100644 --- a/drivers/net/ethernet/fungible/funeth/funeth_main.c +++ b/drivers/net/ethernet/fungible/funeth/funeth_main.c @@ -1760,6 +1760,7 @@ static int fun_create_netdev(struct fun_ethdev *ed, unsigned int portid) goto free_rss; SET_NETDEV_DEV(netdev, fdev->dev); + SET_NETDEV_DEVLINK_PORT(netdev, &fp->dl_port); netdev->netdev_ops = &fun_netdev_ops; netdev->hw_features = NETIF_F_SG | NETIF_F_RXHASH | NETIF_F_RXCSUM; @@ -1800,9 +1801,6 @@ static int fun_create_netdev(struct fun_ethdev *ed, unsigned int portid) rc = register_netdev(netdev); if (rc) goto unreg_devlink; - - devlink_port_type_eth_set(&fp->dl_port, netdev); - return 0; unreg_devlink: @@ -1827,7 +1825,6 @@ static void fun_destroy_netdev(struct net_device *netdev) struct funeth_priv *fp; fp = netdev_priv(netdev); - devlink_port_type_clear(&fp->dl_port); unregister_netdev(netdev); devlink_port_unregister(&fp->dl_port); fun_ktls_cleanup(fp); diff --git a/drivers/net/ethernet/intel/ice/ice_devlink.c b/drivers/net/ethernet/intel/ice/ice_devlink.c index e6ec20079ced..455489e9457d 100644 --- a/drivers/net/ethernet/intel/ice/ice_devlink.c +++ b/drivers/net/ethernet/intel/ice/ice_devlink.c @@ -1033,12 +1033,7 @@ int ice_devlink_create_pf_port(struct ice_pf *pf) */ void ice_devlink_destroy_pf_port(struct ice_pf *pf) { - struct devlink_port *devlink_port; - - devlink_port = &pf->devlink_port; - - devlink_port_type_clear(devlink_port); - devlink_port_unregister(devlink_port); + devlink_port_unregister(&pf->devlink_port); } /** @@ -1094,12 +1089,7 @@ int ice_devlink_create_vf_port(struct ice_vf *vf) */ void ice_devlink_destroy_vf_port(struct ice_vf *vf) { - struct devlink_port *devlink_port; - - devlink_port = &vf->devlink_port; - - devlink_port_type_clear(devlink_port); - devlink_port_unregister(devlink_port); + devlink_port_unregister(&vf->devlink_port); } #define ICE_DEVLINK_READ_BLK_SIZE (1024 * 1024) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 1f27dc20b4f1..74d25fda11bd 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -4603,6 +4603,7 @@ static int ice_register_netdev(struct ice_pf *pf) if (err) goto err_devlink_create; + SET_NETDEV_DEVLINK_PORT(vsi->netdev, &pf->devlink_port); err = register_netdev(vsi->netdev); if (err) goto err_register_netdev; @@ -4611,8 +4612,6 @@ static int ice_register_netdev(struct ice_pf *pf) netif_carrier_off(vsi->netdev); netif_tx_stop_all_queues(vsi->netdev); - devlink_port_type_eth_set(&pf->devlink_port, vsi->netdev); - return 0; err_register_netdev: ice_devlink_destroy_pf_port(pf); diff --git a/drivers/net/ethernet/intel/ice/ice_repr.c b/drivers/net/ethernet/intel/ice/ice_repr.c index bd31748aae1b..663a7a0e1814 100644 --- a/drivers/net/ethernet/intel/ice/ice_repr.c +++ b/drivers/net/ethernet/intel/ice/ice_repr.c @@ -339,12 +339,11 @@ static int ice_repr_add(struct ice_vf *vf) repr->netdev->max_mtu = ICE_MAX_MTU; SET_NETDEV_DEV(repr->netdev, ice_pf_to_dev(vf->pf)); + SET_NETDEV_DEVLINK_PORT(repr->netdev, &vf->devlink_port); err = ice_repr_reg_netdev(repr->netdev); if (err) goto err_netdev; - devlink_port_type_eth_set(&vf->devlink_port, repr->netdev); - ice_virtchnl_set_repr_ops(vf); return 0; diff --git a/drivers/net/ethernet/marvell/prestera/prestera_devlink.c b/drivers/net/ethernet/marvell/prestera/prestera_devlink.c index 06279cd6da67..637b8fee65e7 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_devlink.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_devlink.c @@ -445,16 +445,6 @@ void prestera_devlink_port_unregister(struct prestera_port *port) devlink_port_unregister(&port->dl_port); } -void prestera_devlink_port_set(struct prestera_port *port) -{ - devlink_port_type_eth_set(&port->dl_port, port->dev); -} - -void prestera_devlink_port_clear(struct prestera_port *port) -{ - devlink_port_type_clear(&port->dl_port); -} - struct devlink_port *prestera_devlink_get_port(struct net_device *dev) { struct prestera_port *port = netdev_priv(dev); diff --git a/drivers/net/ethernet/marvell/prestera/prestera_devlink.h b/drivers/net/ethernet/marvell/prestera/prestera_devlink.h index b322295bad3a..04e8556f748a 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_devlink.h +++ b/drivers/net/ethernet/marvell/prestera/prestera_devlink.h @@ -15,9 +15,6 @@ void prestera_devlink_unregister(struct prestera_switch *sw); int prestera_devlink_port_register(struct prestera_port *port); void prestera_devlink_port_unregister(struct prestera_port *port); -void prestera_devlink_port_set(struct prestera_port *port); -void prestera_devlink_port_clear(struct prestera_port *port); - struct devlink_port *prestera_devlink_get_port(struct net_device *dev); void prestera_devlink_trap_report(struct prestera_port *port, diff --git a/drivers/net/ethernet/marvell/prestera/prestera_main.c b/drivers/net/ethernet/marvell/prestera/prestera_main.c index 24f9d6024745..a76bf3606aa6 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_main.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_main.c @@ -644,6 +644,7 @@ static int prestera_port_create(struct prestera_switch *sw, u32 id) dev->netdev_ops = &prestera_netdev_ops; dev->ethtool_ops = &prestera_ethtool_ops; SET_NETDEV_DEV(dev, sw->dev->dev); + SET_NETDEV_DEVLINK_PORT(dev, &port->dl_port); if (port->caps.transceiver != PRESTERA_PORT_TCVR_SFP) netif_carrier_off(dev); @@ -737,8 +738,6 @@ static int prestera_port_create(struct prestera_switch *sw, u32 id) if (err) goto err_register_netdev; - prestera_devlink_port_set(port); - err = prestera_port_sfp_bind(port); if (err) goto err_sfp_bind; @@ -761,7 +760,6 @@ static void prestera_port_destroy(struct prestera_port *port) struct net_device *dev = port->dev; cancel_delayed_work_sync(&port->cached_hw_stats.caching_dw); - prestera_devlink_port_clear(port); unregister_netdev(dev); prestera_port_list_del(port); prestera_devlink_port_unregister(port); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index ca4b93a01034..8800d3f1f55c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2337,11 +2337,8 @@ void mlx4_en_destroy_netdev(struct net_device *dev) en_dbg(DRV, priv, "Destroying netdev on port:%d\n", priv->port); /* Unregister device - this will close the port if it was up */ - if (priv->registered) { - devlink_port_type_clear(mlx4_get_devlink_port(mdev->dev, - priv->port)); + if (priv->registered) unregister_netdev(dev); - } if (priv->allocated) mlx4_free_hwq_res(mdev->dev, &priv->res, MLX4_EN_PAGE_SIZE); @@ -3474,6 +3471,8 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, mdev->profile.prof[priv->port].tx_ppp, mdev->profile.prof[priv->port].tx_pause); + SET_NETDEV_DEVLINK_PORT(dev, + mlx4_get_devlink_port(mdev->dev, priv->port)); err = register_netdev(dev); if (err) { en_err(priv, "Netdev registration failed for port %d\n", port); @@ -3481,8 +3480,6 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, } priv->registered = 1; - devlink_port_type_eth_set(mlx4_get_devlink_port(mdev->dev, priv->port), - dev); return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c index b69f9d10ccbd..ce0e56f856d6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c @@ -51,13 +51,6 @@ int mlx5e_devlink_port_register(struct mlx5e_priv *priv) return ret; } -void mlx5e_devlink_port_type_eth_set(struct mlx5e_priv *priv) -{ - struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv); - - devlink_port_type_eth_set(dl_port, priv->netdev); -} - void mlx5e_devlink_port_unregister(struct mlx5e_priv *priv) { struct devlink_port *dl_port = mlx5e_devlink_get_dl_port(priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h index 10b50feb9883..1c203257ac30 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h @@ -9,7 +9,6 @@ int mlx5e_devlink_port_register(struct mlx5e_priv *priv); void mlx5e_devlink_port_unregister(struct mlx5e_priv *priv); -void mlx5e_devlink_port_type_eth_set(struct mlx5e_priv *priv); struct devlink_port *mlx5e_get_devlink_port(struct net_device *dev); static inline struct devlink_port * diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 364f04309149..97e876472a06 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -5932,14 +5932,13 @@ static int mlx5e_probe(struct auxiliary_device *adev, goto err_profile_cleanup; } + SET_NETDEV_DEVLINK_PORT(netdev, mlx5e_devlink_get_dl_port(priv)); err = register_netdev(netdev); if (err) { mlx5_core_err(mdev, "register_netdev failed, %d\n", err); goto err_resume; } - mlx5e_devlink_port_type_eth_set(priv); - mlx5e_dcbnl_init_app(priv); mlx5_uplink_netdev_set(mdev, netdev); return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 794cd8dfe9c9..5412633b9c8c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -1253,37 +1253,20 @@ mlx5e_vport_uplink_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep * { struct mlx5e_priv *priv = netdev_priv(mlx5_uplink_netdev_get(dev)); struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); - struct devlink_port *dl_port; - int err; rpriv->netdev = priv->netdev; - - err = mlx5e_netdev_change_profile(priv, &mlx5e_uplink_rep_profile, - rpriv); - if (err) - return err; - - dl_port = mlx5_esw_offloads_devlink_port(dev->priv.eswitch, rpriv->rep->vport); - if (dl_port) - devlink_port_type_eth_set(dl_port, rpriv->netdev); - - return 0; + return mlx5e_netdev_change_profile(priv, &mlx5e_uplink_rep_profile, + rpriv); } static void mlx5e_vport_uplink_rep_unload(struct mlx5e_rep_priv *rpriv) { struct net_device *netdev = rpriv->netdev; - struct devlink_port *dl_port; - struct mlx5_core_dev *dev; struct mlx5e_priv *priv; priv = netdev_priv(netdev); - dev = priv->mdev; - dl_port = mlx5_esw_offloads_devlink_port(dev->priv.eswitch, rpriv->rep->vport); - if (dl_port) - devlink_port_type_clear(dl_port); mlx5e_netdev_attach_nic_profile(priv); } @@ -1326,6 +1309,11 @@ mlx5e_vport_vf_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) goto err_cleanup_profile; } + dl_port = mlx5_esw_offloads_devlink_port(dev->priv.eswitch, + rpriv->rep->vport); + if (dl_port) + SET_NETDEV_DEVLINK_PORT(netdev, dl_port); + err = register_netdev(netdev); if (err) { netdev_warn(netdev, @@ -1334,9 +1322,6 @@ mlx5e_vport_vf_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) goto err_detach_netdev; } - dl_port = mlx5_esw_offloads_devlink_port(dev->priv.eswitch, rpriv->rep->vport); - if (dl_port) - devlink_port_type_eth_set(dl_port, netdev); return 0; err_detach_netdev: @@ -1382,8 +1367,6 @@ mlx5e_vport_rep_unload(struct mlx5_eswitch_rep *rep) struct mlx5e_rep_priv *rpriv = mlx5e_rep_to_rep_priv(rep); struct net_device *netdev = rpriv->netdev; struct mlx5e_priv *priv = netdev_priv(netdev); - struct mlx5_core_dev *dev = priv->mdev; - struct devlink_port *dl_port; void *ppriv = priv->ppriv; if (rep->vport == MLX5_VPORT_UPLINK) { @@ -1391,9 +1374,6 @@ mlx5e_vport_rep_unload(struct mlx5_eswitch_rep *rep) goto free_ppriv; } - dl_port = mlx5_esw_offloads_devlink_port(dev->priv.eswitch, rpriv->rep->vport); - if (dl_port) - devlink_port_type_clear(dl_port); unregister_netdev(netdev); mlx5e_detach_netdev(priv); priv->profile->cleanup(priv); diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index e2a985ec2c76..a83f6bc30072 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -3172,29 +3172,17 @@ void mlxsw_core_cpu_port_fini(struct mlxsw_core *mlxsw_core) } EXPORT_SYMBOL(mlxsw_core_cpu_port_fini); -void mlxsw_core_port_eth_set(struct mlxsw_core *mlxsw_core, u16 local_port, - void *port_driver_priv, struct net_device *dev) +void mlxsw_core_port_netdev_link(struct mlxsw_core *mlxsw_core, u16 local_port, + void *port_driver_priv, struct net_device *dev) { struct mlxsw_core_port *mlxsw_core_port = &mlxsw_core->ports[local_port]; struct devlink_port *devlink_port = &mlxsw_core_port->devlink_port; mlxsw_core_port->port_driver_priv = port_driver_priv; - devlink_port_type_eth_set(devlink_port, dev); + SET_NETDEV_DEVLINK_PORT(dev, devlink_port); } -EXPORT_SYMBOL(mlxsw_core_port_eth_set); - -void mlxsw_core_port_clear(struct mlxsw_core *mlxsw_core, u16 local_port, - void *port_driver_priv) -{ - struct mlxsw_core_port *mlxsw_core_port = - &mlxsw_core->ports[local_port]; - struct devlink_port *devlink_port = &mlxsw_core_port->devlink_port; - - mlxsw_core_port->port_driver_priv = port_driver_priv; - devlink_port_type_clear(devlink_port); -} -EXPORT_SYMBOL(mlxsw_core_port_clear); +EXPORT_SYMBOL(mlxsw_core_port_netdev_link); struct devlink_port * mlxsw_core_port_devlink_port_get(struct mlxsw_core *mlxsw_core, diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h index ca0c3d2bee6b..e0a6fcbbcb19 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.h +++ b/drivers/net/ethernet/mellanox/mlxsw/core.h @@ -264,10 +264,9 @@ int mlxsw_core_cpu_port_init(struct mlxsw_core *mlxsw_core, const unsigned char *switch_id, unsigned char switch_id_len); void mlxsw_core_cpu_port_fini(struct mlxsw_core *mlxsw_core); -void mlxsw_core_port_eth_set(struct mlxsw_core *mlxsw_core, u16 local_port, - void *port_driver_priv, struct net_device *dev); -void mlxsw_core_port_clear(struct mlxsw_core *mlxsw_core, u16 local_port, - void *port_driver_priv); +void mlxsw_core_port_netdev_link(struct mlxsw_core *mlxsw_core, u16 local_port, + void *port_driver_priv, + struct net_device *dev); struct devlink_port * mlxsw_core_port_devlink_port_get(struct mlxsw_core *mlxsw_core, u16 local_port); diff --git a/drivers/net/ethernet/mellanox/mlxsw/minimal.c b/drivers/net/ethernet/mellanox/mlxsw/minimal.c index 55b3c42bb007..177cf7e4db34 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/minimal.c +++ b/drivers/net/ethernet/mellanox/mlxsw/minimal.c @@ -265,6 +265,8 @@ mlxsw_m_port_create(struct mlxsw_m *mlxsw_m, u16 local_port, u8 slot_index, SET_NETDEV_DEV(dev, mlxsw_m->bus_info->dev); dev_net_set(dev, mlxsw_core_net(mlxsw_m->core)); mlxsw_m_port = netdev_priv(dev); + mlxsw_core_port_netdev_link(mlxsw_m->core, local_port, + mlxsw_m_port, dev); mlxsw_m_port->dev = dev; mlxsw_m_port->mlxsw_m = mlxsw_m; mlxsw_m_port->local_port = local_port; @@ -298,9 +300,6 @@ mlxsw_m_port_create(struct mlxsw_m *mlxsw_m, u16 local_port, u8 slot_index, goto err_register_netdev; } - mlxsw_core_port_eth_set(mlxsw_m->core, mlxsw_m_port->local_port, - mlxsw_m_port, dev); - return 0; err_register_netdev: @@ -316,7 +315,6 @@ static void mlxsw_m_port_remove(struct mlxsw_m *mlxsw_m, u16 local_port) { struct mlxsw_m_port *mlxsw_m_port = mlxsw_m->ports[local_port]; - mlxsw_core_port_clear(mlxsw_m->core, local_port, mlxsw_m); unregister_netdev(mlxsw_m_port->dev); /* This calls ndo_stop */ mlxsw_m->ports[local_port] = NULL; free_netdev(mlxsw_m_port->dev); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 6ed496f6cbfb..b13739e1f8f3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1651,6 +1651,8 @@ static int mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u16 local_port, SET_NETDEV_DEV(dev, mlxsw_sp->bus_info->dev); dev_net_set(dev, mlxsw_sp_net(mlxsw_sp)); mlxsw_sp_port = netdev_priv(dev); + mlxsw_core_port_netdev_link(mlxsw_sp->core, local_port, + mlxsw_sp_port, dev); mlxsw_sp_port->dev = dev; mlxsw_sp_port->mlxsw_sp = mlxsw_sp; mlxsw_sp_port->local_port = local_port; @@ -1839,8 +1841,6 @@ static int mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u16 local_port, goto err_register_netdev; } - mlxsw_core_port_eth_set(mlxsw_sp->core, mlxsw_sp_port->local_port, - mlxsw_sp_port, dev); mlxsw_core_schedule_dw(&mlxsw_sp_port->periodic_hw_stats.update_dw, 0); return 0; @@ -1897,7 +1897,6 @@ static void mlxsw_sp_port_remove(struct mlxsw_sp *mlxsw_sp, u16 local_port) cancel_delayed_work_sync(&mlxsw_sp_port->periodic_hw_stats.update_dw); cancel_delayed_work_sync(&mlxsw_sp_port->ptp.shaper_dw); - mlxsw_core_port_clear(mlxsw_sp->core, local_port, mlxsw_sp); unregister_netdev(mlxsw_sp_port->dev); /* This calls ndo_stop */ mlxsw_sp_port_ptp_clear(mlxsw_sp_port); mlxsw_sp_port_vlan_classification_set(mlxsw_sp_port, true, true); diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 50858cc10fef..5efc07751c8d 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -1873,6 +1873,7 @@ int ocelot_probe_port(struct ocelot *ocelot, int port, struct regmap *target, if (ocelot->fdma) ocelot_fdma_netdev_init(ocelot, dev); + SET_NETDEV_DEVLINK_PORT(dev, &ocelot->devlink_ports[port]); err = register_netdev(dev); if (err) { dev_err(ocelot->dev, "register_netdev failed\n"); diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c index 6f22aea08a64..93431d2ff4f1 100644 --- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c +++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c @@ -377,9 +377,6 @@ static int mscc_ocelot_init_ports(struct platform_device *pdev, return -ENOMEM; for_each_available_child_of_node(ports, portnp) { - struct ocelot_port_private *priv; - struct ocelot_port *ocelot_port; - struct devlink_port *dlp; struct regmap *target; struct resource *res; char res_name[8]; @@ -420,12 +417,6 @@ static int mscc_ocelot_init_ports(struct platform_device *pdev, } devlink_ports_registered |= BIT(port); - - ocelot_port = ocelot->ports[port]; - priv = container_of(ocelot_port, struct ocelot_port_private, - port); - dlp = &ocelot->devlink_ports[port]; - devlink_port_type_eth_set(dlp, priv->dev); } /* Initialize unused devlink ports at the end */ diff --git a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c index 405786c00334..f3f0f11d8b52 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c @@ -334,6 +334,8 @@ int nfp_devlink_port_register(struct nfp_app *app, struct nfp_port *port) int serial_len; int ret; + SET_NETDEV_DEVLINK_PORT(port->netdev, &port->dl_port); + rtnl_lock(); ret = nfp_devlink_fill_eth_port(port, ð_port); rtnl_unlock(); @@ -361,16 +363,6 @@ void nfp_devlink_port_unregister(struct nfp_port *port) devl_port_unregister(&port->dl_port); } -void nfp_devlink_port_type_eth_set(struct nfp_port *port) -{ - devlink_port_type_eth_set(&port->dl_port, port->netdev); -} - -void nfp_devlink_port_type_clear(struct nfp_port *port) -{ - devlink_port_type_clear(&port->dl_port); -} - struct devlink_port *nfp_devlink_get_devlink_port(struct net_device *netdev) { struct nfp_port *port; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c index 3bae92dc899e..7abf0c576868 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c @@ -156,22 +156,17 @@ nfp_net_pf_init_vnic(struct nfp_pf *pf, struct nfp_net *nn, unsigned int id) nfp_net_debugfs_vnic_add(nn, pf->ddir); - if (nn->port) - nfp_devlink_port_type_eth_set(nn->port); - nfp_net_info(nn); if (nfp_net_is_data_vnic(nn)) { err = nfp_app_vnic_init(pf->app, nn); if (err) - goto err_devlink_port_type_clean; + goto err_debugfs_vnic_clean; } return 0; -err_devlink_port_type_clean: - if (nn->port) - nfp_devlink_port_type_clear(nn->port); +err_debugfs_vnic_clean: nfp_net_debugfs_dir_clean(&nn->debugfs_dir); nfp_net_clean(nn); err_devlink_port_clean: @@ -220,8 +215,6 @@ static void nfp_net_pf_clean_vnic(struct nfp_pf *pf, struct nfp_net *nn) { if (nfp_net_is_data_vnic(nn)) nfp_app_vnic_clean(pf->app, nn); - if (nn->port) - nfp_devlink_port_type_clear(nn->port); nfp_net_debugfs_dir_clean(&nn->debugfs_dir); nfp_net_clean(nn); if (nn->port) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_port.h b/drivers/net/ethernet/netronome/nfp/nfp_port.h index 6793cdf9ff11..f8cd157ca1d7 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_port.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_port.h @@ -129,8 +129,6 @@ int nfp_net_refresh_port_table_sync(struct nfp_pf *pf); int nfp_devlink_port_register(struct nfp_app *app, struct nfp_port *port); void nfp_devlink_port_unregister(struct nfp_port *port); -void nfp_devlink_port_type_eth_set(struct nfp_port *port); -void nfp_devlink_port_type_clear(struct nfp_port *port); /* Mac stats (0x0000 - 0x0200) * all counters are 64bit. diff --git a/drivers/net/ethernet/pensando/ionic/ionic_devlink.c b/drivers/net/ethernet/pensando/ionic/ionic_devlink.c index 4297ed9024c0..567f778433e2 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_devlink.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_devlink.c @@ -90,7 +90,7 @@ int ionic_devlink_register(struct ionic *ionic) return err; } - devlink_port_type_eth_set(&ionic->dl_port, ionic->lif->netdev); + SET_NETDEV_DEVLINK_PORT(ionic->lif->netdev, &ionic->dl_port); devlink_register(dl); return 0; } diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 8800c8a010ff..b2a08f3f0e93 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -2534,7 +2534,6 @@ static void am65_cpsw_unregister_devlink(struct am65_cpsw_common *common) static int am65_cpsw_nuss_register_ndevs(struct am65_cpsw_common *common) { struct device *dev = common->dev; - struct devlink_port *dl_port; struct am65_cpsw_port *port; int ret = 0, i; @@ -2561,15 +2560,14 @@ static int am65_cpsw_nuss_register_ndevs(struct am65_cpsw_common *common) if (!port->ndev) continue; + SET_NETDEV_DEVLINK_PORT(port->ndev, &port->devlink_port); + ret = register_netdev(port->ndev); if (ret) { dev_err(dev, "error registering slave net device%i %d\n", i, ret); goto err_cleanup_ndev; } - - dl_port = &port->devlink_port; - devlink_port_type_eth_set(dl_port, port->ndev); } ret = am65_cpsw_register_notifiers(common); diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index a7880c7ce94c..387c05953a8b 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -1406,7 +1406,6 @@ static int __nsim_dev_port_add(struct nsim_dev *nsim_dev, enum nsim_dev_port_typ goto err_nsim_destroy; } - devlink_port_type_eth_set(devlink_port, nsim_dev_port->ns->netdev); list_add(&nsim_dev_port->list, &nsim_dev->port_list); return 0; @@ -1429,7 +1428,6 @@ static void __nsim_dev_port_del(struct nsim_dev_port *nsim_dev_port) list_del(&nsim_dev_port->list); if (nsim_dev_port_is_vf(nsim_dev_port)) devl_rate_leaf_destroy(&nsim_dev_port->devlink_port); - devlink_port_type_clear(devlink_port); nsim_destroy(nsim_dev_port->ns); nsim_dev_port_debugfs_exit(nsim_dev_port); devl_port_unregister(devlink_port); diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index e470e3398abc..e7cbae743e86 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -360,6 +360,7 @@ nsim_create(struct nsim_dev *nsim_dev, struct nsim_dev_port *nsim_dev_port) ns->nsim_dev_port = nsim_dev_port; ns->nsim_bus_dev = nsim_dev->nsim_bus_dev; SET_NETDEV_DEV(dev, &ns->nsim_bus_dev->dev); + SET_NETDEV_DEVLINK_PORT(dev, &nsim_dev_port->devlink_port); nsim_ethtool_init(ns); if (nsim_dev_port_is_pf(nsim_dev_port)) err = nsim_init_netdevsim(ns); diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index e504a18fc125..b80dbd02e154 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -529,7 +529,6 @@ static void dsa_port_devlink_teardown(struct dsa_port *dp) static int dsa_port_setup(struct dsa_port *dp) { - struct devlink_port *dlp = &dp->devlink_port; bool dsa_port_link_registered = false; struct dsa_switch *ds = dp->ds; bool dsa_port_enabled = false; @@ -585,10 +584,6 @@ static int dsa_port_setup(struct dsa_port *dp) case DSA_PORT_TYPE_USER: of_get_mac_address(dp->dn, dp->mac); err = dsa_slave_create(dp); - if (err) - break; - - devlink_port_type_eth_set(dlp, dp->slave); break; } @@ -608,13 +603,9 @@ static int dsa_port_setup(struct dsa_port *dp) static void dsa_port_teardown(struct dsa_port *dp) { - struct devlink_port *dlp = &dp->devlink_port; - if (!dp->setup) return; - devlink_port_type_clear(dlp); - switch (dp->type) { case DSA_PORT_TYPE_UNUSED: break; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 83e419afa89e..158bb3f1aa0b 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -2406,6 +2406,7 @@ int dsa_slave_create(struct dsa_port *port) SET_NETDEV_DEVTYPE(slave_dev, &dsa_type); SET_NETDEV_DEV(slave_dev, port->ds->dev); + SET_NETDEV_DEVLINK_PORT(slave_dev, &port->devlink_port); slave_dev->dev.of_node = port->dn; slave_dev->vlan_features = master->vlan_features; -- cgit v1.2.3-70-g09d2 From c80965784dbf2fd624be654c1e73c24beada7441 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:05 +0100 Subject: net: devlink: remove netdev arg from devlink_port_type_eth_set() Since devlink_port_type_eth_set() should no longer be called by any driver with netdev pointer as it should rather use SET_NETDEV_DEVLINK_PORT, remove the netdev arg. Add a warn to type_clear() Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx4/main.c | 2 +- include/net/devlink.h | 3 +-- net/core/devlink.c | 23 ++++++++++++++--------- 3 files changed, 16 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index d3fc86cd3c1d..3ae246391549 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -3043,7 +3043,7 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port) */ if (!IS_ENABLED(CONFIG_MLX4_EN) && dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) - devlink_port_type_eth_set(&info->devlink_port, NULL); + devlink_port_type_eth_set(&info->devlink_port); else if (!IS_ENABLED(CONFIG_MLX4_INFINIBAND) && dev->caps.port_type[port] == MLX4_PORT_TYPE_IB) devlink_port_type_ib_set(&info->devlink_port, NULL); diff --git a/include/net/devlink.h b/include/net/devlink.h index 6c55aabaedf1..b1582b32183a 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1582,8 +1582,7 @@ int devlink_port_register(struct devlink *devlink, unsigned int port_index); void devl_port_unregister(struct devlink_port *devlink_port); void devlink_port_unregister(struct devlink_port *devlink_port); -void devlink_port_type_eth_set(struct devlink_port *devlink_port, - struct net_device *netdev); +void devlink_port_type_eth_set(struct devlink_port *devlink_port); void devlink_port_type_ib_set(struct devlink_port *devlink_port, struct ib_device *ibdev); void devlink_port_type_clear(struct devlink_port *devlink_port); diff --git a/net/core/devlink.c b/net/core/devlink.c index 6f06c05c7b1a..70a374c828ae 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -10097,17 +10097,15 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port, * devlink_port_type_eth_set - Set port type to Ethernet * * @devlink_port: devlink port - * @netdev: related netdevice + * + * If driver is calling this, most likely it is doing something wrong. */ -void devlink_port_type_eth_set(struct devlink_port *devlink_port, - struct net_device *netdev) +void devlink_port_type_eth_set(struct devlink_port *devlink_port) { - if (!netdev) - dev_warn(devlink_port->devlink->dev, - "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n", - devlink_port->index); - - __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, netdev, + dev_warn(devlink_port->devlink->dev, + "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n", + devlink_port->index); + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL, false); } EXPORT_SYMBOL_GPL(devlink_port_type_eth_set); @@ -10130,9 +10128,16 @@ EXPORT_SYMBOL_GPL(devlink_port_type_ib_set); * devlink_port_type_clear - Clear port type * * @devlink_port: devlink port + * + * If driver is calling this for clearing Ethernet type, most likely + * it is doing something wrong. */ void devlink_port_type_clear(struct devlink_port *devlink_port) { + if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) + dev_warn(devlink_port->devlink->dev, + "devlink port type for port %d cleared without a software interface reference, device type not supported by the kernel?\n", + devlink_port->index); __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL, false); } -- cgit v1.2.3-70-g09d2 From d0f5172629339f4a9cbbe5f9ae51cea48b4af333 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:06 +0100 Subject: net: devlink: remove net namespace check from devlink_nl_port_fill() It is ensured by the netdevice notifier event processing, that only netdev pointers from the same net namespaces are filled. Remove the net namespace check from devlink_nl_port_fill() as it is no longer needed. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 70a374c828ae..d948bb2fdd5f 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1305,10 +1305,9 @@ static int devlink_nl_port_fill(struct sk_buff *msg, devlink_port->desired_type)) goto nla_put_failure_type_locked; if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) { - struct net *net = devlink_net(devlink_port->devlink); struct net_device *netdev = devlink_port->type_eth.netdev; - if (netdev && net_eq(net, dev_net(netdev)) && + if (netdev && (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX, netdev->ifindex) || nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME, -- cgit v1.2.3-70-g09d2 From 31265c1e29eb28f17df50d04ee421b5b6369fefd Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:07 +0100 Subject: net: devlink: store copy netdevice ifindex and ifname to allow port_fill() without RTNL held To avoid a need to take RTNL mutex in port_fill() function, benefit from the introduce infrastructure that tracks netdevice notifier events. Store the ifindex and ifname upon register and change name events. Remove the rtnl_held bool propagated down to port_fill() function as it is no longer needed. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 2 ++ net/core/devlink.c | 68 ++++++++++++++++++++------------------------------- 2 files changed, 29 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index b1582b32183a..7befad57afd4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -129,6 +129,8 @@ struct devlink_port { union { struct { struct net_device *netdev; + int ifindex; + char ifname[IFNAMSIZ]; } type_eth; struct { struct ib_device *ibdev; diff --git a/net/core/devlink.c b/net/core/devlink.c index d948bb2fdd5f..38de3a1dff36 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1279,8 +1279,7 @@ out: static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink_port *devlink_port, enum devlink_command cmd, u32 portid, u32 seq, - int flags, struct netlink_ext_ack *extack, - bool rtnl_held) + int flags, struct netlink_ext_ack *extack) { struct devlink *devlink = devlink_port->devlink; void *hdr; @@ -1294,9 +1293,6 @@ static int devlink_nl_port_fill(struct sk_buff *msg, if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) goto nla_put_failure; - /* Hold rtnl lock while accessing port's netdev attributes. */ - if (!rtnl_held) - rtnl_lock(); spin_lock_bh(&devlink_port->type_lock); if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type)) goto nla_put_failure_type_locked; @@ -1305,13 +1301,11 @@ static int devlink_nl_port_fill(struct sk_buff *msg, devlink_port->desired_type)) goto nla_put_failure_type_locked; if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) { - struct net_device *netdev = devlink_port->type_eth.netdev; - - if (netdev && + if (devlink_port->type_eth.netdev && (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX, - netdev->ifindex) || + devlink_port->type_eth.ifindex) || nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME, - netdev->name))) + devlink_port->type_eth.ifname))) goto nla_put_failure_type_locked; } if (devlink_port->type == DEVLINK_PORT_TYPE_IB) { @@ -1323,8 +1317,6 @@ static int devlink_nl_port_fill(struct sk_buff *msg, goto nla_put_failure_type_locked; } spin_unlock_bh(&devlink_port->type_lock); - if (!rtnl_held) - rtnl_unlock(); if (devlink_nl_port_attrs_put(msg, devlink_port)) goto nla_put_failure; if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack)) @@ -1339,15 +1331,13 @@ static int devlink_nl_port_fill(struct sk_buff *msg, nla_put_failure_type_locked: spin_unlock_bh(&devlink_port->type_lock); - if (!rtnl_held) - rtnl_unlock(); nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } -static void __devlink_port_notify(struct devlink_port *devlink_port, - enum devlink_command cmd, bool rtnl_held) +static void devlink_port_notify(struct devlink_port *devlink_port, + enum devlink_command cmd) { struct devlink *devlink = devlink_port->devlink; struct sk_buff *msg; @@ -1362,8 +1352,7 @@ static void __devlink_port_notify(struct devlink_port *devlink_port, if (!msg) return; - err = devlink_nl_port_fill(msg, devlink_port, cmd, 0, 0, 0, NULL, - rtnl_held); + err = devlink_nl_port_fill(msg, devlink_port, cmd, 0, 0, 0, NULL); if (err) { nlmsg_free(msg); return; @@ -1373,12 +1362,6 @@ static void __devlink_port_notify(struct devlink_port *devlink_port, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } -static void devlink_port_notify(struct devlink_port *devlink_port, - enum devlink_command cmd) -{ - __devlink_port_notify(devlink_port, cmd, false); -} - static void devlink_rate_notify(struct devlink_rate *devlink_rate, enum devlink_command cmd) { @@ -1542,7 +1525,7 @@ static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW, info->snd_portid, info->snd_seq, 0, - info->extack, false); + info->extack); if (err) { nlmsg_free(msg); return err; @@ -1572,8 +1555,7 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, DEVLINK_CMD_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI, cb->extack, - false); + NLM_F_MULTI, cb->extack); if (err) { devl_unlock(devlink); devlink_put(devlink); @@ -1785,8 +1767,7 @@ static int devlink_port_new_notify(struct devlink *devlink, } err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW, - info->snd_portid, info->snd_seq, 0, NULL, - false); + info->snd_portid, info->snd_seq, 0, NULL); if (err) goto out; @@ -10062,7 +10043,7 @@ static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port, static void __devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type type, - void *type_dev, bool rtnl_held) + void *type_dev) { struct net_device *netdev = type_dev; @@ -10081,6 +10062,13 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port, switch (type) { case DEVLINK_PORT_TYPE_ETH: devlink_port->type_eth.netdev = netdev; + if (netdev) { + ASSERT_RTNL(); + devlink_port->type_eth.ifindex = netdev->ifindex; + BUILD_BUG_ON(sizeof(devlink_port->type_eth.ifname) != + sizeof(netdev->name)); + strcpy(devlink_port->type_eth.ifname, netdev->name); + } break; case DEVLINK_PORT_TYPE_IB: devlink_port->type_ib.ibdev = type_dev; @@ -10089,7 +10077,7 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port, break; } spin_unlock_bh(&devlink_port->type_lock); - __devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW, rtnl_held); + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); } /** @@ -10104,8 +10092,7 @@ void devlink_port_type_eth_set(struct devlink_port *devlink_port) dev_warn(devlink_port->devlink->dev, "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n", devlink_port->index); - __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL, - false); + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL); } EXPORT_SYMBOL_GPL(devlink_port_type_eth_set); @@ -10118,8 +10105,7 @@ EXPORT_SYMBOL_GPL(devlink_port_type_eth_set); void devlink_port_type_ib_set(struct devlink_port *devlink_port, struct ib_device *ibdev) { - __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_IB, ibdev, - false); + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_IB, ibdev); } EXPORT_SYMBOL_GPL(devlink_port_type_ib_set); @@ -10137,8 +10123,7 @@ void devlink_port_type_clear(struct devlink_port *devlink_port) dev_warn(devlink_port->devlink->dev, "devlink port type for port %d cleared without a software interface reference, device type not supported by the kernel?\n", devlink_port->index); - __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL, - false); + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL); } EXPORT_SYMBOL_GPL(devlink_port_type_clear); @@ -10161,16 +10146,17 @@ static int devlink_netdevice_event(struct notifier_block *nb, * netdevice register */ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, - NULL, true); + NULL); break; case NETDEV_REGISTER: + case NETDEV_CHANGENAME: /* Set the netdev on top of previously set type. Note this * event happens also during net namespace change so here * we take into account netdev pointer appearing in this * namespace. */ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, - netdev, true); + netdev); break; case NETDEV_UNREGISTER: /* Clear netdev pointer, but not the type. This event happens @@ -10178,14 +10164,14 @@ static int devlink_netdevice_event(struct notifier_block *nb, * pointer to netdev that is going to another net namespace. */ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, - NULL, true); + NULL); break; case NETDEV_PRE_UNINIT: /* Clear the type and the netdev pointer. Happens one during * netdevice unregister. */ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, - NULL, true); + NULL); break; } -- cgit v1.2.3-70-g09d2 From e705a621c071b43e4ea971abb70d7677dc640c27 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:08 +0100 Subject: net: devlink: add not cleared type warning to port unregister By the time port unregister is called. There should be no type set. Make sure that the driver cleared it before and warn in case it didn't. This enforces symmetricity with type set and port register. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 38de3a1dff36..4a0ba86b86ed 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -9977,6 +9977,7 @@ EXPORT_SYMBOL_GPL(devlink_port_register); void devl_port_unregister(struct devlink_port *devlink_port) { lockdep_assert_held(&devlink_port->devlink->lock); + WARN_ON(devlink_port->type != DEVLINK_PORT_TYPE_NOTSET); devlink_port_type_warn_cancel(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); -- cgit v1.2.3-70-g09d2 From 8eba37f7e9bc82fac08f31d318e36f341494442d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:09 +0100 Subject: net: devlink: use devlink_port pointer instead of ndo_get_devlink_port Use newly introduced devlink_port pointer instead of getting it calling to ndo_get_devlink_port op. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 12 ++---------- net/core/net-sysfs.c | 4 ++-- net/ethtool/ioctl.c | 11 ++--------- 3 files changed, 6 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 4a0ba86b86ed..3a454d0045e5 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -12505,14 +12505,6 @@ free_msg: nlmsg_free(msg); } -static struct devlink_port *netdev_to_devlink_port(struct net_device *dev) -{ - if (!dev->netdev_ops->ndo_get_devlink_port) - return NULL; - - return dev->netdev_ops->ndo_get_devlink_port(dev); -} - void devlink_compat_running_version(struct devlink *devlink, char *buf, size_t len) { @@ -12558,7 +12550,7 @@ int devlink_compat_phys_port_name_get(struct net_device *dev, */ ASSERT_RTNL(); - devlink_port = netdev_to_devlink_port(dev); + devlink_port = dev->devlink_port; if (!devlink_port) return -EOPNOTSUPP; @@ -12574,7 +12566,7 @@ int devlink_compat_switch_id_get(struct net_device *dev, * devlink_port instance cannot disappear in the middle. No need to take * any devlink lock as only permanent values are accessed. */ - devlink_port = netdev_to_devlink_port(dev); + devlink_port = dev->devlink_port; if (!devlink_port || !devlink_port->switch_port) return -EOPNOTSUPP; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 8409d41405df..679b84cc8794 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -532,7 +532,7 @@ static ssize_t phys_port_name_show(struct device *dev, * returning early without hitting the trylock/restart below. */ if (!netdev->netdev_ops->ndo_get_phys_port_name && - !netdev->netdev_ops->ndo_get_devlink_port) + !netdev->devlink_port) return -EOPNOTSUPP; if (!rtnl_trylock()) @@ -562,7 +562,7 @@ static ssize_t phys_switch_id_show(struct device *dev, * because recurse is false when calling dev_get_port_parent_id. */ if (!netdev->netdev_ops->ndo_get_port_parent_id && - !netdev->netdev_ops->ndo_get_devlink_port) + !netdev->devlink_port) return -EOPNOTSUPP; if (!rtnl_trylock()) diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 57e7238a4136..b6835136c53f 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -44,16 +44,9 @@ struct ethtool_devlink_compat { static struct devlink *netdev_to_devlink_get(struct net_device *dev) { - struct devlink_port *devlink_port; - - if (!dev->netdev_ops->ndo_get_devlink_port) - return NULL; - - devlink_port = dev->netdev_ops->ndo_get_devlink_port(dev); - if (!devlink_port) + if (!dev->devlink_port) return NULL; - - return devlink_try_get(devlink_port->devlink); + return devlink_try_get(dev->devlink_port->devlink); } /* -- cgit v1.2.3-70-g09d2 From 77df1db80da384c565106321f5934967690da7dd Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:10 +0100 Subject: net: remove unused ndo_get_devlink_port Remove ndo_get_devlink_port which is no longer used alongside with the implementations in drivers. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 8 -------- drivers/net/ethernet/fungible/funeth/funeth_main.c | 8 -------- drivers/net/ethernet/intel/ice/ice_main.c | 15 --------------- drivers/net/ethernet/intel/ice/ice_repr.c | 9 --------- drivers/net/ethernet/marvell/prestera/prestera_devlink.c | 7 ------- drivers/net/ethernet/marvell/prestera/prestera_devlink.h | 2 -- drivers/net/ethernet/marvell/prestera/prestera_main.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c | 10 ---------- drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h | 1 - drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 1 - drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 10 ---------- drivers/net/ethernet/mellanox/mlxsw/minimal.c | 11 ----------- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 11 ----------- drivers/net/ethernet/mscc/ocelot_net.c | 10 ---------- drivers/net/ethernet/netronome/nfp/nfp_app.h | 2 -- drivers/net/ethernet/netronome/nfp/nfp_devlink.c | 11 ----------- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 -- drivers/net/ethernet/netronome/nfp/nfp_net_repr.c | 1 - drivers/net/ethernet/ti/am65-cpsw-nuss.c | 8 -------- drivers/net/netdevsim/netdev.c | 9 --------- include/linux/netdevice.h | 5 ----- net/dsa/slave.c | 8 -------- 22 files changed, 150 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 9604294b67aa..03272aa48511 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -13073,13 +13073,6 @@ int bnxt_get_port_parent_id(struct net_device *dev, return 0; } -static struct devlink_port *bnxt_get_devlink_port(struct net_device *dev) -{ - struct bnxt *bp = netdev_priv(dev); - - return &bp->dl_port; -} - static const struct net_device_ops bnxt_netdev_ops = { .ndo_open = bnxt_open, .ndo_start_xmit = bnxt_start_xmit, @@ -13111,7 +13104,6 @@ static const struct net_device_ops bnxt_netdev_ops = { .ndo_xdp_xmit = bnxt_xdp_xmit, .ndo_bridge_getlink = bnxt_bridge_getlink, .ndo_bridge_setlink = bnxt_bridge_setlink, - .ndo_get_devlink_port = bnxt_get_devlink_port, }; static void bnxt_remove_one(struct pci_dev *pdev) diff --git a/drivers/net/ethernet/fungible/funeth/funeth_main.c b/drivers/net/ethernet/fungible/funeth/funeth_main.c index 208dc89f4972..b4cce30e526a 100644 --- a/drivers/net/ethernet/fungible/funeth/funeth_main.c +++ b/drivers/net/ethernet/fungible/funeth/funeth_main.c @@ -1178,13 +1178,6 @@ static int fun_xdp(struct net_device *dev, struct netdev_bpf *xdp) } } -static struct devlink_port *fun_get_devlink_port(struct net_device *netdev) -{ - struct funeth_priv *fp = netdev_priv(netdev); - - return &fp->dl_port; -} - static int fun_init_vports(struct fun_ethdev *ed, unsigned int n) { if (ed->num_vports) @@ -1350,7 +1343,6 @@ static const struct net_device_ops fun_netdev_ops = { .ndo_set_vf_vlan = fun_set_vf_vlan, .ndo_set_vf_rate = fun_set_vf_rate, .ndo_get_vf_config = fun_get_vf_config, - .ndo_get_devlink_port = fun_get_devlink_port, }; #define GSO_ENCAP_FLAGS (NETIF_F_GSO_GRE | NETIF_F_GSO_IPXIP4 | \ diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 74d25fda11bd..a9fc89aebebe 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -298,20 +298,6 @@ static int ice_clear_promisc(struct ice_vsi *vsi, u8 promisc_m) return status; } -/** - * ice_get_devlink_port - Get devlink port from netdev - * @netdev: the netdevice structure - */ -static struct devlink_port *ice_get_devlink_port(struct net_device *netdev) -{ - struct ice_pf *pf = ice_netdev_to_pf(netdev); - - if (!ice_is_switchdev_running(pf)) - return NULL; - - return &pf->devlink_port; -} - /** * ice_vsi_sync_fltr - Update the VSI filter list to the HW * @vsi: ptr to the VSI @@ -9107,5 +9093,4 @@ static const struct net_device_ops ice_netdev_ops = { .ndo_bpf = ice_xdp, .ndo_xdp_xmit = ice_xdp_xmit, .ndo_xsk_wakeup = ice_xsk_wakeup, - .ndo_get_devlink_port = ice_get_devlink_port, }; diff --git a/drivers/net/ethernet/intel/ice/ice_repr.c b/drivers/net/ethernet/intel/ice/ice_repr.c index 663a7a0e1814..0483eb14c288 100644 --- a/drivers/net/ethernet/intel/ice/ice_repr.c +++ b/drivers/net/ethernet/intel/ice/ice_repr.c @@ -134,14 +134,6 @@ static int ice_repr_stop(struct net_device *netdev) return 0; } -static struct devlink_port * -ice_repr_get_devlink_port(struct net_device *netdev) -{ - struct ice_repr *repr = ice_netdev_to_repr(netdev); - - return &repr->vf->devlink_port; -} - /** * ice_repr_sp_stats64 - get slow path stats for port representor * @dev: network interface device structure @@ -250,7 +242,6 @@ static const struct net_device_ops ice_repr_netdev_ops = { .ndo_open = ice_repr_open, .ndo_stop = ice_repr_stop, .ndo_start_xmit = ice_eswitch_port_start_xmit, - .ndo_get_devlink_port = ice_repr_get_devlink_port, .ndo_setup_tc = ice_repr_setup_tc, .ndo_has_offload_stats = ice_repr_ndo_has_offload_stats, .ndo_get_offload_stats = ice_repr_ndo_get_offload_stats, diff --git a/drivers/net/ethernet/marvell/prestera/prestera_devlink.c b/drivers/net/ethernet/marvell/prestera/prestera_devlink.c index 637b8fee65e7..84ad05c9f12d 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_devlink.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_devlink.c @@ -445,13 +445,6 @@ void prestera_devlink_port_unregister(struct prestera_port *port) devlink_port_unregister(&port->dl_port); } -struct devlink_port *prestera_devlink_get_port(struct net_device *dev) -{ - struct prestera_port *port = netdev_priv(dev); - - return &port->dl_port; -} - int prestera_devlink_traps_register(struct prestera_switch *sw) { const u32 groups_count = ARRAY_SIZE(prestera_trap_groups_arr); diff --git a/drivers/net/ethernet/marvell/prestera/prestera_devlink.h b/drivers/net/ethernet/marvell/prestera/prestera_devlink.h index 04e8556f748a..bf84ad6fd87e 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_devlink.h +++ b/drivers/net/ethernet/marvell/prestera/prestera_devlink.h @@ -15,8 +15,6 @@ void prestera_devlink_unregister(struct prestera_switch *sw); int prestera_devlink_port_register(struct prestera_port *port); void prestera_devlink_port_unregister(struct prestera_port *port); -struct devlink_port *prestera_devlink_get_port(struct net_device *dev); - void prestera_devlink_trap_report(struct prestera_port *port, struct sk_buff *skb, u8 cpu_code); int prestera_devlink_traps_register(struct prestera_switch *sw); diff --git a/drivers/net/ethernet/marvell/prestera/prestera_main.c b/drivers/net/ethernet/marvell/prestera/prestera_main.c index a76bf3606aa6..edbdda8f958d 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_main.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_main.c @@ -569,7 +569,6 @@ static const struct net_device_ops prestera_netdev_ops = { .ndo_change_mtu = prestera_port_change_mtu, .ndo_get_stats64 = prestera_port_get_stats64, .ndo_set_mac_address = prestera_port_set_mac_address, - .ndo_get_devlink_port = prestera_devlink_get_port, }; int prestera_port_autoneg_set(struct prestera_port *port, u64 link_modes) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c index ce0e56f856d6..83adaabf59f5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.c @@ -62,13 +62,3 @@ void mlx5e_devlink_port_unregister(struct mlx5e_priv *priv) if (!(priv->mdev->priv.flags & MLX5_PRIV_FLAGS_MLX5E_LOCKED_FLOW)) devl_unlock(devlink); } - -struct devlink_port *mlx5e_get_devlink_port(struct net_device *dev) -{ - struct mlx5e_priv *priv = netdev_priv(dev); - - if (!netif_device_present(dev)) - return NULL; - - return mlx5e_devlink_get_dl_port(priv); -} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h index 1c203257ac30..4f238d4fff55 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/devlink.h @@ -9,7 +9,6 @@ int mlx5e_devlink_port_register(struct mlx5e_priv *priv); void mlx5e_devlink_port_unregister(struct mlx5e_priv *priv); -struct devlink_port *mlx5e_get_devlink_port(struct net_device *dev); static inline struct devlink_port * mlx5e_devlink_get_dl_port(struct mlx5e_priv *priv) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 97e876472a06..78d34c039fa7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4897,7 +4897,6 @@ const struct net_device_ops mlx5e_netdev_ops = { .ndo_has_offload_stats = mlx5e_has_offload_stats, .ndo_get_offload_stats = mlx5e_get_offload_stats, #endif - .ndo_get_devlink_port = mlx5e_get_devlink_port, }; static u32 mlx5e_choose_lro_timeout(struct mlx5_core_dev *mdev, u32 wanted_timeout) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 5412633b9c8c..1b53e8852c86 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -607,15 +607,6 @@ static int mlx5e_rep_change_mtu(struct net_device *netdev, int new_mtu) return mlx5e_change_mtu(netdev, new_mtu, NULL); } -static struct devlink_port *mlx5e_rep_get_devlink_port(struct net_device *netdev) -{ - struct mlx5e_priv *priv = netdev_priv(netdev); - struct mlx5e_rep_priv *rpriv = priv->ppriv; - struct mlx5_core_dev *dev = priv->mdev; - - return mlx5_esw_offloads_devlink_port(dev->priv.eswitch, rpriv->rep->vport); -} - static int mlx5e_rep_change_carrier(struct net_device *dev, bool new_carrier) { struct mlx5e_priv *priv = netdev_priv(dev); @@ -644,7 +635,6 @@ static const struct net_device_ops mlx5e_netdev_ops_rep = { .ndo_stop = mlx5e_rep_close, .ndo_start_xmit = mlx5e_xmit, .ndo_setup_tc = mlx5e_rep_setup_tc, - .ndo_get_devlink_port = mlx5e_rep_get_devlink_port, .ndo_get_stats64 = mlx5e_rep_get_stats, .ndo_has_offload_stats = mlx5e_rep_has_offload_stats, .ndo_get_offload_stats = mlx5e_rep_get_offload_stats, diff --git a/drivers/net/ethernet/mellanox/mlxsw/minimal.c b/drivers/net/ethernet/mellanox/mlxsw/minimal.c index 177cf7e4db34..6b56eadd736e 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/minimal.c +++ b/drivers/net/ethernet/mellanox/mlxsw/minimal.c @@ -81,20 +81,9 @@ static int mlxsw_m_port_stop(struct net_device *dev) return 0; } -static struct devlink_port * -mlxsw_m_port_get_devlink_port(struct net_device *dev) -{ - struct mlxsw_m_port *mlxsw_m_port = netdev_priv(dev); - struct mlxsw_m *mlxsw_m = mlxsw_m_port->mlxsw_m; - - return mlxsw_core_port_devlink_port_get(mlxsw_m->core, - mlxsw_m_port->local_port); -} - static const struct net_device_ops mlxsw_m_port_netdev_ops = { .ndo_open = mlxsw_m_port_open, .ndo_stop = mlxsw_m_port_stop, - .ndo_get_devlink_port = mlxsw_m_port_get_devlink_port, }; static void mlxsw_m_module_get_drvinfo(struct net_device *dev, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index b13739e1f8f3..04dc79da6024 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1259,16 +1259,6 @@ static int mlxsw_sp_set_features(struct net_device *dev, return 0; } -static struct devlink_port * -mlxsw_sp_port_get_devlink_port(struct net_device *dev) -{ - struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); - struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; - - return mlxsw_core_port_devlink_port_get(mlxsw_sp->core, - mlxsw_sp_port->local_port); -} - static int mlxsw_sp_port_hwtstamp_set(struct mlxsw_sp_port *mlxsw_sp_port, struct ifreq *ifr) { @@ -1342,7 +1332,6 @@ static const struct net_device_ops mlxsw_sp_port_netdev_ops = { .ndo_vlan_rx_add_vid = mlxsw_sp_port_add_vid, .ndo_vlan_rx_kill_vid = mlxsw_sp_port_kill_vid, .ndo_set_features = mlxsw_sp_set_features, - .ndo_get_devlink_port = mlxsw_sp_port_get_devlink_port, .ndo_eth_ioctl = mlxsw_sp_port_ioctl, }; diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 5efc07751c8d..f50dada2bb8e 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -194,15 +194,6 @@ void ocelot_port_devlink_teardown(struct ocelot *ocelot, int port) devlink_port_unregister(dlp); } -static struct devlink_port *ocelot_get_devlink_port(struct net_device *dev) -{ - struct ocelot_port_private *priv = netdev_priv(dev); - struct ocelot *ocelot = priv->port.ocelot; - int port = priv->port.index; - - return &ocelot->devlink_ports[port]; -} - int ocelot_setup_tc_cls_flower(struct ocelot_port_private *priv, struct flow_cls_offload *f, bool ingress) @@ -925,7 +916,6 @@ static const struct net_device_ops ocelot_port_netdev_ops = { .ndo_set_features = ocelot_set_features, .ndo_setup_tc = ocelot_setup_tc, .ndo_eth_ioctl = ocelot_ioctl, - .ndo_get_devlink_port = ocelot_get_devlink_port, }; struct net_device *ocelot_port_to_netdev(struct ocelot *ocelot, int port) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h b/drivers/net/ethernet/netronome/nfp/nfp_app.h index dd56207df246..90707346a4ef 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_app.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h @@ -445,6 +445,4 @@ int nfp_app_nic_vnic_alloc(struct nfp_app *app, struct nfp_net *nn, int nfp_app_nic_vnic_init_phy_port(struct nfp_pf *pf, struct nfp_app *app, struct nfp_net *nn, unsigned int id); -struct devlink_port *nfp_devlink_get_devlink_port(struct net_device *netdev); - #endif diff --git a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c index f3f0f11d8b52..8bfd48d50ef0 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c @@ -362,14 +362,3 @@ void nfp_devlink_port_unregister(struct nfp_port *port) { devl_port_unregister(&port->dl_port); } - -struct devlink_port *nfp_devlink_get_devlink_port(struct net_device *netdev) -{ - struct nfp_port *port; - - port = nfp_port_from_netdev(netdev); - if (!port) - return NULL; - - return &port->dl_port; -} diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index a5ca5c4a7896..8c1a870bc0e5 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -2013,7 +2013,6 @@ const struct net_device_ops nfp_nfd3_netdev_ops = { .ndo_get_phys_port_name = nfp_net_get_phys_port_name, .ndo_bpf = nfp_net_xdp, .ndo_xsk_wakeup = nfp_net_xsk_wakeup, - .ndo_get_devlink_port = nfp_devlink_get_devlink_port, .ndo_bridge_getlink = nfp_net_bridge_getlink, .ndo_bridge_setlink = nfp_net_bridge_setlink, }; @@ -2044,7 +2043,6 @@ const struct net_device_ops nfp_nfdk_netdev_ops = { .ndo_features_check = nfp_net_features_check, .ndo_get_phys_port_name = nfp_net_get_phys_port_name, .ndo_bpf = nfp_net_xdp, - .ndo_get_devlink_port = nfp_devlink_get_devlink_port, .ndo_bridge_getlink = nfp_net_bridge_getlink, .ndo_bridge_setlink = nfp_net_bridge_setlink, }; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c index a6b6ca1fd55e..3af1229a3f08 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c @@ -275,7 +275,6 @@ const struct net_device_ops nfp_repr_netdev_ops = { .ndo_set_features = nfp_port_set_features, .ndo_set_mac_address = eth_mac_addr, .ndo_get_port_parent_id = nfp_port_get_port_parent_id, - .ndo_get_devlink_port = nfp_devlink_get_devlink_port, }; void diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index b2a08f3f0e93..0dbdce4dc077 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -1380,13 +1380,6 @@ static void am65_cpsw_nuss_ndo_get_stats(struct net_device *dev, stats->tx_dropped = dev->stats.tx_dropped; } -static struct devlink_port *am65_cpsw_ndo_get_devlink_port(struct net_device *ndev) -{ - struct am65_cpsw_port *port = am65_ndev_to_port(ndev); - - return &port->devlink_port; -} - static const struct net_device_ops am65_cpsw_nuss_netdev_ops = { .ndo_open = am65_cpsw_nuss_ndo_slave_open, .ndo_stop = am65_cpsw_nuss_ndo_slave_stop, @@ -1400,7 +1393,6 @@ static const struct net_device_ops am65_cpsw_nuss_netdev_ops = { .ndo_vlan_rx_kill_vid = am65_cpsw_nuss_ndo_slave_kill_vid, .ndo_eth_ioctl = am65_cpsw_nuss_ndo_slave_ioctl, .ndo_setup_tc = am65_cpsw_qos_ndo_setup_tc, - .ndo_get_devlink_port = am65_cpsw_ndo_get_devlink_port, }; static void am65_cpsw_nuss_mac_config(struct phylink_config *config, unsigned int mode, diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index e7cbae743e86..6db6a75ff9b9 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -238,13 +238,6 @@ nsim_set_features(struct net_device *dev, netdev_features_t features) return 0; } -static struct devlink_port *nsim_get_devlink_port(struct net_device *dev) -{ - struct netdevsim *ns = netdev_priv(dev); - - return &ns->nsim_dev_port->devlink_port; -} - static const struct net_device_ops nsim_netdev_ops = { .ndo_start_xmit = nsim_start_xmit, .ndo_set_rx_mode = nsim_set_rx_mode, @@ -263,7 +256,6 @@ static const struct net_device_ops nsim_netdev_ops = { .ndo_setup_tc = nsim_setup_tc, .ndo_set_features = nsim_set_features, .ndo_bpf = nsim_bpf, - .ndo_get_devlink_port = nsim_get_devlink_port, }; static const struct net_device_ops nsim_vf_netdev_ops = { @@ -275,7 +267,6 @@ static const struct net_device_ops nsim_vf_netdev_ops = { .ndo_get_stats64 = nsim_get_stats64, .ndo_setup_tc = nsim_setup_tc, .ndo_set_features = nsim_set_features, - .ndo_get_devlink_port = nsim_get_devlink_port, }; static void nsim_setup(struct net_device *dev) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f048a30ea10b..d45713a06568 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1366,10 +1366,6 @@ struct netdev_net_notifier { * queue id bound to an AF_XDP socket. The flags field specifies if * only RX, only Tx, or both should be woken up using the flags * XDP_WAKEUP_RX and XDP_WAKEUP_TX. - * struct devlink_port *(*ndo_get_devlink_port)(struct net_device *dev); - * Get devlink port instance associated with a given netdev. - * Called with a reference on the netdevice and devlink locks only, - * rtnl_lock is not held. * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, * int cmd); * Add, change, delete or get information on an IPv4 tunnel. @@ -1600,7 +1596,6 @@ struct net_device_ops { struct xdp_buff *xdp); int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags); - struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev); int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 158bb3f1aa0b..4176482cd03f 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -2165,13 +2165,6 @@ static const struct dcbnl_rtnl_ops __maybe_unused dsa_slave_dcbnl_ops = { .ieee_delapp = dsa_slave_dcbnl_ieee_delapp, }; -static struct devlink_port *dsa_slave_get_devlink_port(struct net_device *dev) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - - return &dp->devlink_port; -} - static void dsa_slave_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *s) { @@ -2219,7 +2212,6 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_get_stats64 = dsa_slave_get_stats64, .ndo_vlan_rx_add_vid = dsa_slave_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = dsa_slave_vlan_rx_kill_vid, - .ndo_get_devlink_port = dsa_slave_get_devlink_port, .ndo_change_mtu = dsa_slave_change_mtu, .ndo_fill_forward_path = dsa_slave_fill_forward_path, }; -- cgit v1.2.3-70-g09d2 From dca56c3038c34a3e5acfe0aadb1f2bc9d724ae79 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:11 +0100 Subject: net: expose devlink port over rtnetlink Expose devlink port handle related to netdev over rtnetlink. Introduce a new nested IFLA attribute to carry the info. Call into devlink code to fill-up the nest with existing devlink attributes that are used over devlink netlink. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 14 ++++++++++++++ include/uapi/linux/if_link.h | 2 ++ net/core/devlink.c | 18 ++++++++++++++++++ net/core/rtnetlink.c | 39 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 73 insertions(+) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 7befad57afd4..fa6e936af1a5 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1873,6 +1873,9 @@ int devlink_compat_phys_port_name_get(struct net_device *dev, int devlink_compat_switch_id_get(struct net_device *dev, struct netdev_phys_item_id *ppid); +int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port); +size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port); + #else static inline struct devlink *devlink_try_get(struct devlink *devlink) @@ -1909,6 +1912,17 @@ devlink_compat_switch_id_get(struct net_device *dev, return -EOPNOTSUPP; } +static inline int +devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port) +{ + return 0; +} + +static inline size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port) +{ + return 0; +} + #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index d92b3f79eba3..1021a7e47a86 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -372,6 +372,8 @@ enum { IFLA_TSO_MAX_SEGS, IFLA_ALLMULTI, /* Allmulti count: > 0 means acts ALLMULTI */ + IFLA_DEVLINK_PORT, + __IFLA_MAX }; diff --git a/net/core/devlink.c b/net/core/devlink.c index 3a454d0045e5..2dcf2bcc3527 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -880,6 +880,24 @@ nla_put_failure: return -EMSGSIZE; } +int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port) +{ + if (devlink_nl_put_handle(msg, devlink_port->devlink)) + return -EMSGSIZE; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) + return -EMSGSIZE; + return 0; +} + +size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port) +{ + struct devlink *devlink = devlink_port->devlink; + + return nla_total_size(strlen(devlink->dev->bus->name) + 1) /* DEVLINK_ATTR_BUS_NAME */ + + nla_total_size(strlen(dev_name(devlink->dev)) + 1) /* DEVLINK_ATTR_DEV_NAME */ + + nla_total_size(4); /* DEVLINK_ATTR_PORT_INDEX */ +} + struct devlink_reload_combination { enum devlink_reload_action action; enum devlink_reload_limit limit; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b64fffeb3844..64289bc98887 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -53,6 +53,7 @@ #include #include #include +#include #include "dev.h" @@ -1038,6 +1039,16 @@ static size_t rtnl_proto_down_size(const struct net_device *dev) return size; } +static size_t rtnl_devlink_port_size(const struct net_device *dev) +{ + size_t size = nla_total_size(0); /* nest IFLA_DEVLINK_PORT */ + + if (dev->devlink_port) + size += devlink_nl_port_handle_size(dev->devlink_port); + + return size; +} + static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { @@ -1091,6 +1102,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_MAX_MTU */ + rtnl_prop_list_size(dev) + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */ + + rtnl_devlink_port_size(dev) + 0; } @@ -1728,6 +1740,30 @@ nla_put_failure: return -EMSGSIZE; } +static int rtnl_fill_devlink_port(struct sk_buff *skb, + const struct net_device *dev) +{ + struct nlattr *devlink_port_nest; + int ret; + + devlink_port_nest = nla_nest_start(skb, IFLA_DEVLINK_PORT); + if (!devlink_port_nest) + return -EMSGSIZE; + + if (dev->devlink_port) { + ret = devlink_nl_port_handle_fill(skb, dev->devlink_port); + if (ret < 0) + goto nest_cancel; + } + + nla_nest_end(skb, devlink_port_nest); + return 0; + +nest_cancel: + nla_nest_cancel(skb, devlink_port_nest); + return ret; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, @@ -1865,6 +1901,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, dev->dev.parent->bus->name)) goto nla_put_failure; + if (rtnl_fill_devlink_port(skb, dev)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; -- cgit v1.2.3-70-g09d2 From db559117828d2448fe81ada051c60bcf39f822e9 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Nov 2022 00:39:56 +0530 Subject: bpf: Consolidate spin_lock, timer management into btf_record Now that kptr_off_tab has been refactored into btf_record, and can hold more than one specific field type, accomodate bpf_spin_lock and bpf_timer as well. While they don't require any more metadata than offset, having all special fields in one place allows us to share the same code for allocated user defined types and handle both map values and these allocated objects in a similar fashion. As an optimization, we still keep spin_lock_off and timer_off offsets in the btf_record structure, just to avoid having to find the btf_field struct each time their offset is needed. This is mostly needed to manipulate such objects in a map value at runtime. It's ok to hardcode just one offset as more than one field is disallowed. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221103191013.1236066-8-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 53 ++++--- include/linux/btf.h | 3 +- kernel/bpf/arraymap.c | 19 +-- kernel/bpf/bpf_local_storage.c | 2 +- kernel/bpf/btf.c | 323 ++++++++++++++++++++++------------------- kernel/bpf/hashtab.c | 24 +-- kernel/bpf/helpers.c | 6 +- kernel/bpf/local_storage.c | 2 +- kernel/bpf/map_in_map.c | 5 +- kernel/bpf/syscall.c | 135 ++++++++--------- kernel/bpf/verifier.c | 82 +++-------- net/core/bpf_sk_storage.c | 4 +- 12 files changed, 314 insertions(+), 344 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5f2a42033a37..aae85019abde 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -166,13 +166,13 @@ struct bpf_map_ops { enum { /* Support at most 8 pointers in a BTF type */ - BTF_FIELDS_MAX = 8, - BPF_MAP_OFF_ARR_MAX = BTF_FIELDS_MAX + - 1 + /* for bpf_spin_lock */ - 1, /* for bpf_timer */ + BTF_FIELDS_MAX = 10, + BPF_MAP_OFF_ARR_MAX = BTF_FIELDS_MAX, }; enum btf_field_type { + BPF_SPIN_LOCK = (1 << 0), + BPF_TIMER = (1 << 1), BPF_KPTR_UNREF = (1 << 2), BPF_KPTR_REF = (1 << 3), BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, @@ -196,6 +196,8 @@ struct btf_field { struct btf_record { u32 cnt; u32 field_mask; + int spin_lock_off; + int timer_off; struct btf_field fields[]; }; @@ -220,10 +222,8 @@ struct bpf_map { u32 max_entries; u64 map_extra; /* any per-map-type extra fields */ u32 map_flags; - int spin_lock_off; /* >=0 valid offset, <0 error */ - struct btf_record *record; - int timer_off; /* >=0 valid offset, <0 error */ u32 id; + struct btf_record *record; int numa_node; u32 btf_key_type_id; u32 btf_value_type_id; @@ -257,9 +257,29 @@ struct bpf_map { bool frozen; /* write-once; write-protected by freeze_mutex */ }; +static inline const char *btf_field_type_name(enum btf_field_type type) +{ + switch (type) { + case BPF_SPIN_LOCK: + return "bpf_spin_lock"; + case BPF_TIMER: + return "bpf_timer"; + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + return "kptr"; + default: + WARN_ON_ONCE(1); + return "unknown"; + } +} + static inline u32 btf_field_type_size(enum btf_field_type type) { switch (type) { + case BPF_SPIN_LOCK: + return sizeof(struct bpf_spin_lock); + case BPF_TIMER: + return sizeof(struct bpf_timer); case BPF_KPTR_UNREF: case BPF_KPTR_REF: return sizeof(u64); @@ -272,6 +292,10 @@ static inline u32 btf_field_type_size(enum btf_field_type type) static inline u32 btf_field_type_align(enum btf_field_type type) { switch (type) { + case BPF_SPIN_LOCK: + return __alignof__(struct bpf_spin_lock); + case BPF_TIMER: + return __alignof__(struct bpf_timer); case BPF_KPTR_UNREF: case BPF_KPTR_REF: return __alignof__(u64); @@ -288,22 +312,8 @@ static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_f return rec->field_mask & type; } -static inline bool map_value_has_spin_lock(const struct bpf_map *map) -{ - return map->spin_lock_off >= 0; -} - -static inline bool map_value_has_timer(const struct bpf_map *map) -{ - return map->timer_off >= 0; -} - static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { - if (unlikely(map_value_has_spin_lock(map))) - memset(dst + map->spin_lock_off, 0, sizeof(struct bpf_spin_lock)); - if (unlikely(map_value_has_timer(map))) - memset(dst + map->timer_off, 0, sizeof(struct bpf_timer)); if (!IS_ERR_OR_NULL(map->record)) { struct btf_field *fields = map->record->fields; u32 cnt = map->record->cnt; @@ -1740,6 +1750,7 @@ void btf_record_free(struct btf_record *rec); void bpf_map_free_record(struct bpf_map *map); struct btf_record *btf_record_dup(const struct btf_record *rec); bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); +void bpf_obj_free_timer(const struct btf_record *rec, void *obj); void bpf_obj_free_fields(const struct btf_record *rec, void *obj); struct bpf_map *bpf_map_get(u32 ufd); diff --git a/include/linux/btf.h b/include/linux/btf.h index 9e62717cdc7a..282006abd062 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -163,7 +163,8 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, u32 expected_offset, u32 expected_size); int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); int btf_find_timer(const struct btf *btf, const struct btf_type *t); -struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t); +struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, + u32 field_mask, u32 value_size); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 417f84342e98..672eb17ac421 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -306,13 +306,6 @@ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key return 0; } -static void check_and_free_fields(struct bpf_array *arr, void *val) -{ - if (map_value_has_timer(&arr->map)) - bpf_timer_cancel_and_free(val + arr->map.timer_off); - bpf_obj_free_fields(arr->map.record, val); -} - /* Called from syscall or from eBPF program */ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) @@ -334,13 +327,13 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, return -EEXIST; if (unlikely((map_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map))) + !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { val = this_cpu_ptr(array->pptrs[index & array->index_mask]); copy_map_value(map, val, value); - check_and_free_fields(array, val); + bpf_obj_free_fields(array->map.record, val); } else { val = array->value + (u64)array->elem_size * (index & array->index_mask); @@ -348,7 +341,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, copy_map_value_locked(map, val, value, false); else copy_map_value(map, val, value); - check_and_free_fields(array, val); + bpf_obj_free_fields(array->map.record, val); } return 0; } @@ -385,7 +378,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { copy_map_value_long(map, per_cpu_ptr(pptr, cpu), value + off); - check_and_free_fields(array, per_cpu_ptr(pptr, cpu)); + bpf_obj_free_fields(array->map.record, per_cpu_ptr(pptr, cpu)); off += size; } rcu_read_unlock(); @@ -409,11 +402,11 @@ static void array_map_free_timers(struct bpf_map *map) int i; /* We don't reset or free fields other than timer on uref dropping to zero. */ - if (!map_value_has_timer(map)) + if (!btf_record_has_field(map->record, BPF_TIMER)) return; for (i = 0; i < array->map.max_entries; i++) - bpf_timer_cancel_and_free(array_map_elem_ptr(array, i) + map->timer_off); + bpf_obj_free_timer(map->record, array_map_elem_ptr(array, i)); } /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 93d9b1b17bc8..37020078d1c1 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -382,7 +382,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST) || /* BPF_F_LOCK can only be used in a value with spin_lock */ unlikely((map_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(&smap->map))) + !btf_record_has_field(smap->map.record, BPF_SPIN_LOCK))) return ERR_PTR(-EINVAL); if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8391a77138ee..3dad828db13c 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3205,16 +3205,20 @@ enum { struct btf_field_info { enum btf_field_type type; u32 off; - u32 type_id; + struct { + u32 type_id; + } kptr; }; static int btf_find_struct(const struct btf *btf, const struct btf_type *t, - u32 off, int sz, struct btf_field_info *info) + u32 off, int sz, enum btf_field_type field_type, + struct btf_field_info *info) { if (!__btf_type_is_struct(t)) return BTF_FIELD_IGNORE; if (t->size != sz) return BTF_FIELD_IGNORE; + info->type = field_type; info->off = off; return BTF_FIELD_FOUND; } @@ -3251,28 +3255,66 @@ static int btf_find_kptr(const struct btf *btf, const struct btf_type *t, if (!__btf_type_is_struct(t)) return -EINVAL; - info->type_id = res_id; - info->off = off; info->type = type; + info->off = off; + info->kptr.type_id = res_id; return BTF_FIELD_FOUND; } -static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t, - const char *name, int sz, int align, - enum btf_field_info_type field_type, +static int btf_get_field_type(const char *name, u32 field_mask, u32 *seen_mask, + int *align, int *sz) +{ + int type = 0; + + if (field_mask & BPF_SPIN_LOCK) { + if (!strcmp(name, "bpf_spin_lock")) { + if (*seen_mask & BPF_SPIN_LOCK) + return -E2BIG; + *seen_mask |= BPF_SPIN_LOCK; + type = BPF_SPIN_LOCK; + goto end; + } + } + if (field_mask & BPF_TIMER) { + if (!strcmp(name, "bpf_timer")) { + if (*seen_mask & BPF_TIMER) + return -E2BIG; + *seen_mask |= BPF_TIMER; + type = BPF_TIMER; + goto end; + } + } + /* Only return BPF_KPTR when all other types with matchable names fail */ + if (field_mask & BPF_KPTR) { + type = BPF_KPTR_REF; + goto end; + } + return 0; +end: + *sz = btf_field_type_size(type); + *align = btf_field_type_align(type); + return type; +} + +static int btf_find_struct_field(const struct btf *btf, + const struct btf_type *t, u32 field_mask, struct btf_field_info *info, int info_cnt) { + int ret, idx = 0, align, sz, field_type; const struct btf_member *member; struct btf_field_info tmp; - int ret, idx = 0; - u32 i, off; + u32 i, off, seen_mask = 0; for_each_member(i, t, member) { const struct btf_type *member_type = btf_type_by_id(btf, member->type); - if (name && strcmp(__btf_name_by_offset(btf, member_type->name_off), name)) + field_type = btf_get_field_type(__btf_name_by_offset(btf, member_type->name_off), + field_mask, &seen_mask, &align, &sz); + if (field_type == 0) continue; + if (field_type < 0) + return field_type; off = __btf_member_bit_offset(t, member); if (off % 8) @@ -3280,17 +3322,18 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t return -EINVAL; off /= 8; if (off % align) - return -EINVAL; + continue; switch (field_type) { - case BTF_FIELD_SPIN_LOCK: - case BTF_FIELD_TIMER: - ret = btf_find_struct(btf, member_type, off, sz, + case BPF_SPIN_LOCK: + case BPF_TIMER: + ret = btf_find_struct(btf, member_type, off, sz, field_type, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) return ret; break; - case BTF_FIELD_KPTR: + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: ret = btf_find_kptr(btf, member_type, off, sz, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3310,37 +3353,41 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t } static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, - const char *name, int sz, int align, - enum btf_field_info_type field_type, - struct btf_field_info *info, int info_cnt) + u32 field_mask, struct btf_field_info *info, + int info_cnt) { + int ret, idx = 0, align, sz, field_type; const struct btf_var_secinfo *vsi; struct btf_field_info tmp; - int ret, idx = 0; - u32 i, off; + u32 i, off, seen_mask = 0; for_each_vsi(i, t, vsi) { const struct btf_type *var = btf_type_by_id(btf, vsi->type); const struct btf_type *var_type = btf_type_by_id(btf, var->type); - off = vsi->offset; - - if (name && strcmp(__btf_name_by_offset(btf, var_type->name_off), name)) + field_type = btf_get_field_type(__btf_name_by_offset(btf, var_type->name_off), + field_mask, &seen_mask, &align, &sz); + if (field_type == 0) continue; + if (field_type < 0) + return field_type; + + off = vsi->offset; if (vsi->size != sz) continue; if (off % align) - return -EINVAL; + continue; switch (field_type) { - case BTF_FIELD_SPIN_LOCK: - case BTF_FIELD_TIMER: - ret = btf_find_struct(btf, var_type, off, sz, + case BPF_SPIN_LOCK: + case BPF_TIMER: + ret = btf_find_struct(btf, var_type, off, sz, field_type, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) return ret; break; - case BTF_FIELD_KPTR: + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: ret = btf_find_kptr(btf, var_type, off, sz, idx < info_cnt ? &info[idx] : &tmp); if (ret < 0) @@ -3360,78 +3407,98 @@ static int btf_find_datasec_var(const struct btf *btf, const struct btf_type *t, } static int btf_find_field(const struct btf *btf, const struct btf_type *t, - enum btf_field_info_type field_type, - struct btf_field_info *info, int info_cnt) + u32 field_mask, struct btf_field_info *info, + int info_cnt) { - const char *name; - int sz, align; - - switch (field_type) { - case BTF_FIELD_SPIN_LOCK: - name = "bpf_spin_lock"; - sz = sizeof(struct bpf_spin_lock); - align = __alignof__(struct bpf_spin_lock); - break; - case BTF_FIELD_TIMER: - name = "bpf_timer"; - sz = sizeof(struct bpf_timer); - align = __alignof__(struct bpf_timer); - break; - case BTF_FIELD_KPTR: - name = NULL; - sz = sizeof(u64); - align = 8; - break; - default: - return -EFAULT; - } - if (__btf_type_is_struct(t)) - return btf_find_struct_field(btf, t, name, sz, align, field_type, info, info_cnt); + return btf_find_struct_field(btf, t, field_mask, info, info_cnt); else if (btf_type_is_datasec(t)) - return btf_find_datasec_var(btf, t, name, sz, align, field_type, info, info_cnt); + return btf_find_datasec_var(btf, t, field_mask, info, info_cnt); return -EINVAL; } -/* find 'struct bpf_spin_lock' in map value. - * return >= 0 offset if found - * and < 0 in case of error - */ -int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) +static int btf_parse_kptr(const struct btf *btf, struct btf_field *field, + struct btf_field_info *info) { - struct btf_field_info info; + struct module *mod = NULL; + const struct btf_type *t; + struct btf *kernel_btf; int ret; + s32 id; - ret = btf_find_field(btf, t, BTF_FIELD_SPIN_LOCK, &info, 1); - if (ret < 0) - return ret; - if (!ret) - return -ENOENT; - return info.off; -} + /* Find type in map BTF, and use it to look up the matching type + * in vmlinux or module BTFs, by name and kind. + */ + t = btf_type_by_id(btf, info->kptr.type_id); + id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info), + &kernel_btf); + if (id < 0) + return id; + + /* Find and stash the function pointer for the destruction function that + * needs to be eventually invoked from the map free path. + */ + if (info->type == BPF_KPTR_REF) { + const struct btf_type *dtor_func; + const char *dtor_func_name; + unsigned long addr; + s32 dtor_btf_id; + + /* This call also serves as a whitelist of allowed objects that + * can be used as a referenced pointer and be stored in a map at + * the same time. + */ + dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id); + if (dtor_btf_id < 0) { + ret = dtor_btf_id; + goto end_btf; + } -int btf_find_timer(const struct btf *btf, const struct btf_type *t) -{ - struct btf_field_info info; - int ret; + dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id); + if (!dtor_func) { + ret = -ENOENT; + goto end_btf; + } - ret = btf_find_field(btf, t, BTF_FIELD_TIMER, &info, 1); - if (ret < 0) - return ret; - if (!ret) - return -ENOENT; - return info.off; + if (btf_is_module(kernel_btf)) { + mod = btf_try_get_module(kernel_btf); + if (!mod) { + ret = -ENXIO; + goto end_btf; + } + } + + /* We already verified dtor_func to be btf_type_is_func + * in register_btf_id_dtor_kfuncs. + */ + dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off); + addr = kallsyms_lookup_name(dtor_func_name); + if (!addr) { + ret = -EINVAL; + goto end_mod; + } + field->kptr.dtor = (void *)addr; + } + + field->kptr.btf_id = id; + field->kptr.btf = kernel_btf; + field->kptr.module = mod; + return 0; +end_mod: + module_put(mod); +end_btf: + btf_put(kernel_btf); + return ret; } -struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t) +struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, + u32 field_mask, u32 value_size) { struct btf_field_info info_arr[BTF_FIELDS_MAX]; - struct btf *kernel_btf = NULL; - struct module *mod = NULL; struct btf_record *rec; int ret, i, cnt; - ret = btf_find_field(btf, t, BTF_FIELD_KPTR, info_arr, ARRAY_SIZE(info_arr)); + ret = btf_find_field(btf, t, field_mask, info_arr, ARRAY_SIZE(info_arr)); if (ret < 0) return ERR_PTR(ret); if (!ret) @@ -3441,80 +3508,44 @@ struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type rec = kzalloc(offsetof(struct btf_record, fields[cnt]), GFP_KERNEL | __GFP_NOWARN); if (!rec) return ERR_PTR(-ENOMEM); - rec->cnt = 0; - for (i = 0; i < cnt; i++) { - const struct btf_type *t; - s32 id; - /* Find type in map BTF, and use it to look up the matching type - * in vmlinux or module BTFs, by name and kind. - */ - t = btf_type_by_id(btf, info_arr[i].type_id); - id = bpf_find_btf_id(__btf_name_by_offset(btf, t->name_off), BTF_INFO_KIND(t->info), - &kernel_btf); - if (id < 0) { - ret = id; + rec->spin_lock_off = -EINVAL; + rec->timer_off = -EINVAL; + for (i = 0; i < cnt; i++) { + if (info_arr[i].off + btf_field_type_size(info_arr[i].type) > value_size) { + WARN_ONCE(1, "verifier bug off %d size %d", info_arr[i].off, value_size); + ret = -EFAULT; goto end; } - /* Find and stash the function pointer for the destruction function that - * needs to be eventually invoked from the map free path. - */ - if (info_arr[i].type == BPF_KPTR_REF) { - const struct btf_type *dtor_func; - const char *dtor_func_name; - unsigned long addr; - s32 dtor_btf_id; - - /* This call also serves as a whitelist of allowed objects that - * can be used as a referenced pointer and be stored in a map at - * the same time. - */ - dtor_btf_id = btf_find_dtor_kfunc(kernel_btf, id); - if (dtor_btf_id < 0) { - ret = dtor_btf_id; - goto end_btf; - } - - dtor_func = btf_type_by_id(kernel_btf, dtor_btf_id); - if (!dtor_func) { - ret = -ENOENT; - goto end_btf; - } - - if (btf_is_module(kernel_btf)) { - mod = btf_try_get_module(kernel_btf); - if (!mod) { - ret = -ENXIO; - goto end_btf; - } - } - - /* We already verified dtor_func to be btf_type_is_func - * in register_btf_id_dtor_kfuncs. - */ - dtor_func_name = __btf_name_by_offset(kernel_btf, dtor_func->name_off); - addr = kallsyms_lookup_name(dtor_func_name); - if (!addr) { - ret = -EINVAL; - goto end_mod; - } - rec->fields[i].kptr.dtor = (void *)addr; - } - rec->field_mask |= info_arr[i].type; rec->fields[i].offset = info_arr[i].off; rec->fields[i].type = info_arr[i].type; - rec->fields[i].kptr.btf_id = id; - rec->fields[i].kptr.btf = kernel_btf; - rec->fields[i].kptr.module = mod; + + switch (info_arr[i].type) { + case BPF_SPIN_LOCK: + WARN_ON_ONCE(rec->spin_lock_off >= 0); + /* Cache offset for faster lookup at runtime */ + rec->spin_lock_off = rec->fields[i].offset; + break; + case BPF_TIMER: + WARN_ON_ONCE(rec->timer_off >= 0); + /* Cache offset for faster lookup at runtime */ + rec->timer_off = rec->fields[i].offset; + break; + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + ret = btf_parse_kptr(btf, &rec->fields[i], &info_arr[i]); + if (ret < 0) + goto end; + break; + default: + ret = -EFAULT; + goto end; + } rec->cnt++; } return rec; -end_mod: - module_put(mod); -end_btf: - btf_put(kernel_btf); end: btf_record_free(rec); return ERR_PTR(ret); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c5ea8f9bb7a9..50d254cd0709 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -222,7 +222,7 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) u32 num_entries = htab->map.max_entries; int i; - if (!map_value_has_timer(&htab->map)) + if (!btf_record_has_field(htab->map.record, BPF_TIMER)) return; if (htab_has_extra_elems(htab)) num_entries += num_possible_cpus(); @@ -231,9 +231,7 @@ static void htab_free_prealloced_timers(struct bpf_htab *htab) struct htab_elem *elem; elem = get_htab_elem(htab, i); - bpf_timer_cancel_and_free(elem->key + - round_up(htab->map.key_size, 8) + - htab->map.timer_off); + bpf_obj_free_timer(htab->map.record, elem->key + round_up(htab->map.key_size, 8)); cond_resched(); } } @@ -763,8 +761,6 @@ static void check_and_free_fields(struct bpf_htab *htab, { void *map_value = elem->key + round_up(htab->map.key_size, 8); - if (map_value_has_timer(&htab->map)) - bpf_timer_cancel_and_free(map_value + htab->map.timer_off); bpf_obj_free_fields(htab->map.record, map_value); } @@ -1089,7 +1085,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, head = &b->head; if (unlikely(map_flags & BPF_F_LOCK)) { - if (unlikely(!map_value_has_spin_lock(map))) + if (unlikely(!btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; /* find an element without taking the bucket lock */ l_old = lookup_nulls_elem_raw(head, hash, key, key_size, @@ -1472,12 +1468,8 @@ static void htab_free_malloced_timers(struct bpf_htab *htab) struct htab_elem *l; hlist_nulls_for_each_entry(l, n, head, hash_node) { - /* We don't reset or free kptr on uref dropping to zero, - * hence just free timer. - */ - bpf_timer_cancel_and_free(l->key + - round_up(htab->map.key_size, 8) + - htab->map.timer_off); + /* We only free timer on uref dropping to zero */ + bpf_obj_free_timer(htab->map.record, l->key + round_up(htab->map.key_size, 8)); } cond_resched_rcu(); } @@ -1488,8 +1480,8 @@ static void htab_map_free_timers(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - /* We don't reset or free kptr on uref dropping to zero. */ - if (!map_value_has_timer(&htab->map)) + /* We only free timer on uref dropping to zero */ + if (!btf_record_has_field(htab->map.record, BPF_TIMER)) return; if (!htab_is_prealloc(htab)) htab_free_malloced_timers(htab); @@ -1673,7 +1665,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, elem_map_flags = attr->batch.elem_flags; if ((elem_map_flags & ~BPF_F_LOCK) || - ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map))) + ((elem_map_flags & BPF_F_LOCK) && !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; map_flags = attr->batch.flags; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 124fd199ce5c..283f55bbeb70 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -366,9 +366,9 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, struct bpf_spin_lock *lock; if (lock_src) - lock = src + map->spin_lock_off; + lock = src + map->record->spin_lock_off; else - lock = dst + map->spin_lock_off; + lock = dst + map->record->spin_lock_off; preempt_disable(); __bpf_spin_lock_irqsave(lock); copy_map_value(map, dst, src); @@ -1169,7 +1169,7 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map ret = -ENOMEM; goto out; } - t->value = (void *)timer - map->timer_off; + t->value = (void *)timer - map->record->timer_off; t->map = map; t->prog = NULL; rcu_assign_pointer(t->callback_fn, NULL); diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 098cf336fae6..e90d9f63edc5 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -151,7 +151,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key, return -EINVAL; if (unlikely((flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map))) + !btf_record_has_field(map->record, BPF_SPIN_LOCK))) return -EINVAL; storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index d6c662183f88..8ca0cca39d49 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -29,7 +29,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) return ERR_PTR(-ENOTSUPP); } - if (map_value_has_spin_lock(inner_map)) { + if (btf_record_has_field(inner_map->record, BPF_SPIN_LOCK)) { fdput(f); return ERR_PTR(-ENOTSUPP); } @@ -50,8 +50,6 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->value_size = inner_map->value_size; inner_map_meta->map_flags = inner_map->map_flags; inner_map_meta->max_entries = inner_map->max_entries; - inner_map_meta->spin_lock_off = inner_map->spin_lock_off; - inner_map_meta->timer_off = inner_map->timer_off; inner_map_meta->record = btf_record_dup(inner_map->record); if (IS_ERR(inner_map_meta->record)) { /* btf_record_dup returns NULL or valid pointer in case of @@ -92,7 +90,6 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0, return meta0->map_type == meta1->map_type && meta0->key_size == meta1->key_size && meta0->value_size == meta1->value_size && - meta0->timer_off == meta1->timer_off && meta0->map_flags == meta1->map_flags && btf_record_equal(meta0->record, meta1->record); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b80c0e2eb73f..53d6dc5cf0e2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -527,6 +527,9 @@ void btf_record_free(struct btf_record *rec) return; for (i = 0; i < rec->cnt; i++) { switch (rec->fields[i].type) { + case BPF_SPIN_LOCK: + case BPF_TIMER: + break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: if (rec->fields[i].kptr.module) @@ -564,6 +567,9 @@ struct btf_record *btf_record_dup(const struct btf_record *rec) new_rec->cnt = 0; for (i = 0; i < rec->cnt; i++) { switch (fields[i].type) { + case BPF_SPIN_LOCK: + case BPF_TIMER: + break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: btf_get(fields[i].kptr.btf); @@ -600,6 +606,13 @@ bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *r return !memcmp(rec_a, rec_b, size); } +void bpf_obj_free_timer(const struct btf_record *rec, void *obj) +{ + if (WARN_ON_ONCE(!btf_record_has_field(rec, BPF_TIMER))) + return; + bpf_timer_cancel_and_free(obj + rec->timer_off); +} + void bpf_obj_free_fields(const struct btf_record *rec, void *obj) { const struct btf_field *fields; @@ -613,6 +626,11 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj) void *field_ptr = obj + field->offset; switch (fields[i].type) { + case BPF_SPIN_LOCK: + break; + case BPF_TIMER: + bpf_timer_cancel_and_free(field_ptr); + break; case BPF_KPTR_UNREF: WRITE_ONCE(*(u64 *)field_ptr, 0); break; @@ -798,8 +816,7 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) struct bpf_map *map = filp->private_data; int err; - if (!map->ops->map_mmap || map_value_has_spin_lock(map) || - map_value_has_timer(map) || !IS_ERR_OR_NULL(map->record)) + if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record)) return -ENOTSUPP; if (!(vma->vm_flags & VM_SHARED)) @@ -954,48 +971,30 @@ static void map_field_offs_swap(void *_a, void *_b, int size, const void *priv) static int bpf_map_alloc_off_arr(struct bpf_map *map) { - bool has_spin_lock = map_value_has_spin_lock(map); - bool has_timer = map_value_has_timer(map); bool has_fields = !IS_ERR_OR_NULL(map->record); struct btf_field_offs *fo; - u32 i; + struct btf_record *rec; + u32 i, *off; + u8 *sz; - if (!has_spin_lock && !has_timer && !has_fields) { + if (!has_fields) { map->field_offs = NULL; return 0; } - fo = kmalloc(sizeof(*map->field_offs), GFP_KERNEL | __GFP_NOWARN); + fo = kzalloc(sizeof(*map->field_offs), GFP_KERNEL | __GFP_NOWARN); if (!fo) return -ENOMEM; map->field_offs = fo; - fo->cnt = 0; - if (has_spin_lock) { - i = fo->cnt; - - fo->field_off[i] = map->spin_lock_off; - fo->field_sz[i] = sizeof(struct bpf_spin_lock); - fo->cnt++; - } - if (has_timer) { - i = fo->cnt; - - fo->field_off[i] = map->timer_off; - fo->field_sz[i] = sizeof(struct bpf_timer); - fo->cnt++; - } - if (has_fields) { - struct btf_record *rec = map->record; - u32 *off = &fo->field_off[fo->cnt]; - u8 *sz = &fo->field_sz[fo->cnt]; - - for (i = 0; i < rec->cnt; i++) { - *off++ = rec->fields[i].offset; - *sz++ = btf_field_type_size(rec->fields[i].type); - } - fo->cnt += rec->cnt; + rec = map->record; + off = fo->field_off; + sz = fo->field_sz; + for (i = 0; i < rec->cnt; i++) { + *off++ = rec->fields[i].offset; + *sz++ = btf_field_type_size(rec->fields[i].type); } + fo->cnt = rec->cnt; if (fo->cnt == 1) return 0; @@ -1026,39 +1025,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, if (!value_type || value_size != map->value_size) return -EINVAL; - map->spin_lock_off = btf_find_spin_lock(btf, value_type); - - if (map_value_has_spin_lock(map)) { - if (map->map_flags & BPF_F_RDONLY_PROG) - return -EACCES; - if (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY && - map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && - map->map_type != BPF_MAP_TYPE_SK_STORAGE && - map->map_type != BPF_MAP_TYPE_INODE_STORAGE && - map->map_type != BPF_MAP_TYPE_TASK_STORAGE && - map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) - return -ENOTSUPP; - if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > - map->value_size) { - WARN_ONCE(1, - "verifier bug spin_lock_off %d value_size %d\n", - map->spin_lock_off, map->value_size); - return -EFAULT; - } - } - - map->timer_off = btf_find_timer(btf, value_type); - if (map_value_has_timer(map)) { - if (map->map_flags & BPF_F_RDONLY_PROG) - return -EACCES; - if (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_LRU_HASH && - map->map_type != BPF_MAP_TYPE_ARRAY) - return -EOPNOTSUPP; - } - - map->record = btf_parse_fields(btf, value_type); + map->record = btf_parse_fields(btf, value_type, BPF_SPIN_LOCK | BPF_TIMER | BPF_KPTR, + map->value_size); if (!IS_ERR_OR_NULL(map->record)) { int i; @@ -1074,6 +1042,26 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, switch (map->record->field_mask & (1 << i)) { case 0: continue; + case BPF_SPIN_LOCK: + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY && + map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && + map->map_type != BPF_MAP_TYPE_SK_STORAGE && + map->map_type != BPF_MAP_TYPE_INODE_STORAGE && + map->map_type != BPF_MAP_TYPE_TASK_STORAGE && + map->map_type != BPF_MAP_TYPE_CGRP_STORAGE) { + ret = -EOPNOTSUPP; + goto free_map_tab; + } + break; + case BPF_TIMER: + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY) { + return -EOPNOTSUPP; + goto free_map_tab; + } + break; case BPF_KPTR_UNREF: case BPF_KPTR_REF: if (map->map_type != BPF_MAP_TYPE_HASH && @@ -1153,8 +1141,6 @@ static int map_create(union bpf_attr *attr) mutex_init(&map->freeze_mutex); spin_lock_init(&map->owner.lock); - map->spin_lock_off = -EINVAL; - map->timer_off = -EINVAL; if (attr->btf_key_type_id || attr->btf_value_type_id || /* Even the map's value is a kernel's struct, * the bpf_prog.o must have BTF to begin with @@ -1368,7 +1354,7 @@ static int map_lookup_elem(union bpf_attr *attr) } if ((attr->flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { err = -EINVAL; goto err_put; } @@ -1441,7 +1427,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr) } if ((attr->flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { err = -EINVAL; goto err_put; } @@ -1604,7 +1590,7 @@ int generic_map_delete_batch(struct bpf_map *map, return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { return -EINVAL; } @@ -1661,7 +1647,7 @@ int generic_map_update_batch(struct bpf_map *map, return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { return -EINVAL; } @@ -1724,7 +1710,7 @@ int generic_map_lookup_batch(struct bpf_map *map, return -EINVAL; if ((attr->batch.elem_flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) return -EINVAL; value_size = bpf_map_value_size(map); @@ -1846,7 +1832,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) } if ((attr->flags & BPF_F_LOCK) && - !map_value_has_spin_lock(map)) { + !btf_record_has_field(map->record, BPF_SPIN_LOCK)) { err = -EINVAL; goto err_put; } @@ -1917,8 +1903,7 @@ static int map_freeze(const union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || - map_value_has_timer(map) || !IS_ERR_OR_NULL(map->record)) { + if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS || !IS_ERR_OR_NULL(map->record)) { fdput(f); return -ENOTSUPP; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5ce5364ce898..73a3516f1a48 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -454,7 +454,7 @@ static bool reg_type_not_null(enum bpf_reg_type type) static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) { return reg->type == PTR_TO_MAP_VALUE && - map_value_has_spin_lock(reg->map_ptr); + btf_record_has_field(reg->map_ptr->record, BPF_SPIN_LOCK); } static bool type_is_rdonly_mem(u32 type) @@ -1388,7 +1388,7 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) /* transfer reg's id which is unique for every map_lookup_elem * as UID of the inner map. */ - if (map_value_has_timer(map->inner_map_meta)) + if (btf_record_has_field(map->inner_map_meta->record, BPF_TIMER)) reg->map_uid = reg->id; } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; @@ -3817,29 +3817,6 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, if (err) return err; - if (map_value_has_spin_lock(map)) { - u32 lock = map->spin_lock_off; - - /* if any part of struct bpf_spin_lock can be touched by - * load/store reject this program. - * To check that [x1, x2) overlaps with [y1, y2) - * it is sufficient to check x1 < y2 && y1 < x2. - */ - if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) && - lock < reg->umax_value + off + size) { - verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n"); - return -EACCES; - } - } - if (map_value_has_timer(map)) { - u32 t = map->timer_off; - - if (reg->smin_value + off < t + sizeof(struct bpf_timer) && - t < reg->umax_value + off + size) { - verbose(env, "bpf_timer cannot be accessed directly by load/store\n"); - return -EACCES; - } - } if (IS_ERR_OR_NULL(map->record)) return 0; rec = map->record; @@ -3847,6 +3824,10 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, struct btf_field *field = &rec->fields[i]; u32 p = field->offset; + /* If any part of a field can be touched by load/store, reject + * this program. To check that [x1, x2) overlaps with [y1, y2), + * it is sufficient to check x1 < y2 && y1 < x2. + */ if (reg->smin_value + off < p + btf_field_type_size(field->type) && p < reg->umax_value + off + size) { switch (field->type) { @@ -3871,7 +3852,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, } break; default: - verbose(env, "field cannot be accessed directly by load/store\n"); + verbose(env, "%s cannot be accessed directly by load/store\n", + btf_field_type_name(field->type)); return -EACCES; } } @@ -5440,24 +5422,13 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno, map->name); return -EINVAL; } - if (!map_value_has_spin_lock(map)) { - if (map->spin_lock_off == -E2BIG) - verbose(env, - "map '%s' has more than one 'struct bpf_spin_lock'\n", - map->name); - else if (map->spin_lock_off == -ENOENT) - verbose(env, - "map '%s' doesn't have 'struct bpf_spin_lock'\n", - map->name); - else - verbose(env, - "map '%s' is not a struct type or bpf_spin_lock is mangled\n", - map->name); + if (!btf_record_has_field(map->record, BPF_SPIN_LOCK)) { + verbose(env, "map '%s' has no valid bpf_spin_lock\n", map->name); return -EINVAL; } - if (map->spin_lock_off != val + reg->off) { - verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n", - val + reg->off); + if (map->record->spin_lock_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock' that is at %d\n", + val + reg->off, map->record->spin_lock_off); return -EINVAL; } if (is_lock) { @@ -5500,24 +5471,13 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno, map->name); return -EINVAL; } - if (!map_value_has_timer(map)) { - if (map->timer_off == -E2BIG) - verbose(env, - "map '%s' has more than one 'struct bpf_timer'\n", - map->name); - else if (map->timer_off == -ENOENT) - verbose(env, - "map '%s' doesn't have 'struct bpf_timer'\n", - map->name); - else - verbose(env, - "map '%s' is not a struct type or bpf_timer is mangled\n", - map->name); + if (!btf_record_has_field(map->record, BPF_TIMER)) { + verbose(env, "map '%s' has no valid bpf_timer\n", map->name); return -EINVAL; } - if (map->timer_off != val + reg->off) { + if (map->record->timer_off != val + reg->off) { verbose(env, "off %lld doesn't point to 'struct bpf_timer' that is at %d\n", - val + reg->off, map->timer_off); + val + reg->off, map->record->timer_off); return -EINVAL; } if (meta->map_ptr) { @@ -7470,7 +7430,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn regs[BPF_REG_0].map_uid = meta.map_uid; regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag; if (!type_may_be_null(ret_type) && - map_value_has_spin_lock(meta.map_ptr)) { + btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) { regs[BPF_REG_0].id = ++env->id_gen; } break; @@ -10381,7 +10341,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) insn->src_reg == BPF_PSEUDO_MAP_IDX_VALUE) { dst_reg->type = PTR_TO_MAP_VALUE; dst_reg->off = aux->map_off; - if (map_value_has_spin_lock(map)) + if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) dst_reg->id = ++env->id_gen; } else if (insn->src_reg == BPF_PSEUDO_MAP_FD || insn->src_reg == BPF_PSEUDO_MAP_IDX) { @@ -12659,7 +12619,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, { enum bpf_prog_type prog_type = resolve_prog_type(prog); - if (map_value_has_spin_lock(map)) { + if (btf_record_has_field(map->record, BPF_SPIN_LOCK)) { if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); return -EINVAL; @@ -12676,7 +12636,7 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } - if (map_value_has_timer(map)) { + if (btf_record_has_field(map->record, BPF_TIMER)) { if (is_tracing_prog_type(prog_type)) { verbose(env, "tracing progs cannot use bpf_timer yet\n"); return -EINVAL; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 49884e7de080..9d2288c0736e 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -147,7 +147,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk, if (!copy_selem) return NULL; - if (map_value_has_spin_lock(&smap->map)) + if (btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)) copy_map_value_locked(&smap->map, SDATA(copy_selem)->data, SDATA(selem)->data, true); else @@ -566,7 +566,7 @@ static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb) if (!nla_value) goto errout; - if (map_value_has_spin_lock(&smap->map)) + if (btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)) copy_map_value_locked(&smap->map, nla_data(nla_value), sdata->data, true); else -- cgit v1.2.3-70-g09d2 From d28c0e73efbee95edf789f07c494592f89862606 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 1 Nov 2022 10:41:04 +0000 Subject: rds: remove redundant variable total_payload_len Variable total_payload_len is being used to accumulate payload lengths however it is never read or used afterwards. It is redundant and can be removed. Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/rds/send.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index 0c5504068e3c..5e57a1581dc6 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1114,7 +1114,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) struct rds_conn_path *cpath; struct in6_addr daddr; __u32 scope_id = 0; - size_t total_payload_len = payload_len, rdma_payload_len = 0; + size_t rdma_payload_len = 0; bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) && sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY)); int num_sgs = DIV_ROUND_UP(payload_len, PAGE_SIZE); @@ -1243,7 +1243,6 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) if (ret) goto out; - total_payload_len += rdma_payload_len; if (max_t(size_t, payload_len, rdma_payload_len) > RDS_MAX_MSG_SIZE) { ret = -EMSGSIZE; goto out; -- cgit v1.2.3-70-g09d2 From 552acbf576fb7cb7ec70e978ca148dfbdae12a0e Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Fri, 4 Nov 2022 10:25:13 +0800 Subject: net: remove redundant check in ip_metrics_convert() Now ip_metrics_convert() is only called by ip_fib_metrics_init(). Before ip_fib_metrics_init() invokes ip_metrics_convert(), it checks whether input parameter fc_mx is NULL. Therefore, ip_metrics_convert() doesn't need to check fc_mx. Signed-off-by: Zhengchao Shao Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20221104022513.168868-1-shaozhengchao@huawei.com Signed-off-by: Jakub Kicinski --- net/ipv4/metrics.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c index 25ea6ac44db9..7fcfdfd8f9de 100644 --- a/net/ipv4/metrics.c +++ b/net/ipv4/metrics.c @@ -14,9 +14,6 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, struct nlattr *nla; int remaining; - if (!fc_mx) - return 0; - nla_for_each_attr(nla, fc_mx, fc_mx_len, remaining) { int type = nla_type(nla); u32 val; -- cgit v1.2.3-70-g09d2 From ff14adbd8779f098576ffdb98381809ed8e40dfe Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Nov 2022 12:13:31 -0700 Subject: genetlink: refactor the cmd <> policy mapping dump The code at the top of ctrl_dumppolicy() dumps mappings between ops and policies. It supports dumping both the entire family and single op if dump is filtered. But both of those cases are handled inside a loop, which makes the logic harder to follow and change. Refactor to split the two cases more clearly. Signed-off-by: Jakub Kicinski Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 3e16527beb91..0a7a856e9ce0 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1319,21 +1319,24 @@ static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb) void *hdr; if (!ctx->policies) { - while (ctx->opidx < genl_get_cmd_cnt(ctx->rt)) { - struct genl_ops op; + struct genl_ops op; - if (ctx->single_op) { - int err; + if (ctx->single_op) { + int err; - err = genl_get_cmd(ctx->op, ctx->rt, &op); - if (WARN_ON(err)) - return skb->len; + err = genl_get_cmd(ctx->op, ctx->rt, &op); + if (WARN_ON(err)) + return err; - /* break out of the loop after this one */ - ctx->opidx = genl_get_cmd_cnt(ctx->rt); - } else { - genl_get_cmd_by_index(ctx->opidx, ctx->rt, &op); - } + if (ctrl_dumppolicy_put_op(skb, cb, &op)) + return skb->len; + + /* don't enter the loop below */ + ctx->opidx = genl_get_cmd_cnt(ctx->rt); + } + + while (ctx->opidx < genl_get_cmd_cnt(ctx->rt)) { + genl_get_cmd_by_index(ctx->opidx, ctx->rt, &op); if (ctrl_dumppolicy_put_op(skb, cb, &op)) return skb->len; -- cgit v1.2.3-70-g09d2 From 20b0b53aca436af9fece9428ca3ab7c7b9cf4583 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Nov 2022 12:13:33 -0700 Subject: genetlink: introduce split op representation We currently have two forms of operations - small ops and "full" ops (or just ops). The former does not have pointers for some of the less commonly used features (namely dump start/done and policy). The "full" ops, however, still don't contain all the necessary information. In particular the policy is per command ID, while do and dump often accept different attributes. It's also not possible to define different pre_doit and post_doit callbacks for different commands within the family. At the same time a lot of commands do not support dumping and therefore all the dump-related information is wasted space. Create a new command representation which can hold info about a do implementation or a dump implementation, but not both at the same time. Use this new representation on the command execution path (genl_family_rcv_msg) as we either run a do or a dump and don't have to create a "full" op there. Signed-off-by: Jakub Kicinski Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- include/net/genetlink.h | 60 ++++++++++++++++++++++++++++++++--- net/batman-adv/netlink.c | 6 ++-- net/core/devlink.c | 4 +-- net/core/drop_monitor.c | 4 +-- net/ieee802154/nl802154.c | 6 ++-- net/netlink/genetlink.c | 79 ++++++++++++++++++++++++++++++++++++++--------- net/wireless/nl80211.c | 6 ++-- 7 files changed, 136 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 81180fc6526a..4be7989c451b 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -18,7 +18,7 @@ struct genl_multicast_group { u8 flags; }; -struct genl_ops; +struct genl_split_ops; struct genl_info; /** @@ -66,10 +66,10 @@ struct genl_family { u8 n_mcgrps; u8 resv_start_op; const struct nla_policy *policy; - int (*pre_doit)(const struct genl_ops *ops, + int (*pre_doit)(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); - void (*post_doit)(const struct genl_ops *ops, + void (*post_doit)(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); const struct genl_ops * ops; @@ -182,6 +182,58 @@ struct genl_ops { u8 validate; }; +/** + * struct genl_split_ops - generic netlink operations (do/dump split version) + * @cmd: command identifier + * @internal_flags: flags used by the family + * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) + * @validate: validation flags from enum genl_validate_flags + * @policy: netlink policy (takes precedence over family policy) + * @maxattr: maximum number of attributes supported + * + * Do callbacks: + * @pre_doit: called before an operation's @doit callback, it may + * do additional, common, filtering and return an error + * @doit: standard command callback + * @post_doit: called after an operation's @doit callback, it may + * undo operations done by pre_doit, for example release locks + * + * Dump callbacks: + * @start: start callback for dumps + * @dumpit: callback for dumpers + * @done: completion callback for dumps + * + * Do callbacks can be used if %GENL_CMD_CAP_DO is set in @flags. + * Dump callbacks can be used if %GENL_CMD_CAP_DUMP is set in @flags. + * Exactly one of those flags must be set. + */ +struct genl_split_ops { + union { + struct { + int (*pre_doit)(const struct genl_split_ops *ops, + struct sk_buff *skb, + struct genl_info *info); + int (*doit)(struct sk_buff *skb, + struct genl_info *info); + void (*post_doit)(const struct genl_split_ops *ops, + struct sk_buff *skb, + struct genl_info *info); + }; + struct { + int (*start)(struct netlink_callback *cb); + int (*dumpit)(struct sk_buff *skb, + struct netlink_callback *cb); + int (*done)(struct netlink_callback *cb); + }; + }; + const struct nla_policy *policy; + unsigned int maxattr; + u8 cmd; + u8 internal_flags; + u8 flags; + u8 validate; +}; + /** * struct genl_dumpit_info - info that is available during dumpit op call * @family: generic netlink family - for internal genl code usage @@ -190,7 +242,7 @@ struct genl_ops { */ struct genl_dumpit_info { const struct genl_family *family; - struct genl_ops op; + struct genl_split_ops op; struct nlattr **attrs; }; diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index a5e4a4e976cf..ad5714f737be 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -1267,7 +1267,8 @@ batadv_get_vlan_from_info(struct batadv_priv *bat_priv, struct net *net, * * Return: 0 on success or negative error number in case of failure */ -static int batadv_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, +static int batadv_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); @@ -1332,7 +1333,8 @@ err_put_softif: * @skb: Netlink message with request data * @info: receiver information */ -static void batadv_post_doit(const struct genl_ops *ops, struct sk_buff *skb, +static void batadv_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) { struct batadv_hard_iface *hard_iface; diff --git a/net/core/devlink.c b/net/core/devlink.c index 2dcf2bcc3527..40fcdded57e6 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -770,7 +770,7 @@ devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) #define DEVLINK_NL_FLAG_NEED_RATE_NODE BIT(3) #define DEVLINK_NL_FLAG_NEED_LINECARD BIT(4) -static int devlink_nl_pre_doit(const struct genl_ops *ops, +static int devlink_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { struct devlink_linecard *linecard; @@ -828,7 +828,7 @@ unlock: return err; } -static void devlink_nl_post_doit(const struct genl_ops *ops, +static void devlink_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { struct devlink_linecard *linecard; diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 11aa6e8a3098..5a782d1d8fd3 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -1620,7 +1620,7 @@ static const struct genl_small_ops dropmon_ops[] = { }, }; -static int net_dm_nl_pre_doit(const struct genl_ops *ops, +static int net_dm_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { mutex_lock(&net_dm_mutex); @@ -1628,7 +1628,7 @@ static int net_dm_nl_pre_doit(const struct genl_ops *ops, return 0; } -static void net_dm_nl_post_doit(const struct genl_ops *ops, +static void net_dm_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { mutex_unlock(&net_dm_mutex); diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 38c4f3cb010e..b33d1b5eda87 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -2157,7 +2157,8 @@ static int nl802154_del_llsec_seclevel(struct sk_buff *skb, #define NL802154_FLAG_CHECK_NETDEV_UP 0x08 #define NL802154_FLAG_NEED_WPAN_DEV 0x10 -static int nl802154_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, +static int nl802154_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev; @@ -2219,7 +2220,8 @@ static int nl802154_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, return 0; } -static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb, +static void nl802154_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) { if (info->user_ptr[1]) { diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 0a7a856e9ce0..c66299740c05 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -189,6 +189,51 @@ static int genl_get_cmd(u32 cmd, const struct genl_family *family, return genl_get_cmd_small(cmd, family, op); } +static void +genl_cmd_full_to_split(struct genl_split_ops *op, + const struct genl_family *family, + const struct genl_ops *full, u8 flags) +{ + if (flags & GENL_CMD_CAP_DUMP) { + op->start = full->start; + op->dumpit = full->dumpit; + op->done = full->done; + } else { + op->pre_doit = family->pre_doit; + op->doit = full->doit; + op->post_doit = family->post_doit; + } + + op->policy = full->policy; + op->maxattr = full->maxattr; + + op->cmd = full->cmd; + op->internal_flags = full->internal_flags; + op->flags = full->flags; + op->validate = full->validate; + + /* Make sure flags include the GENL_CMD_CAP_DO / GENL_CMD_CAP_DUMP */ + op->flags |= flags; +} + +static int +genl_get_cmd_split(u32 cmd, u8 flags, const struct genl_family *family, + struct genl_split_ops *op) +{ + struct genl_ops full; + int err; + + err = genl_get_cmd(cmd, family, &full); + if (err) { + memset(op, 0, sizeof(*op)); + return err; + } + + genl_cmd_full_to_split(op, family, &full, flags); + + return 0; +} + static void genl_get_cmd_by_index(unsigned int i, const struct genl_family *family, struct genl_ops *op) @@ -544,7 +589,7 @@ static struct nlattr ** genl_family_rcv_msg_attrs_parse(const struct genl_family *family, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, - const struct genl_ops *ops, + const struct genl_split_ops *ops, int hdrlen, enum genl_validate_flags no_strict_flag) { @@ -580,18 +625,19 @@ struct genl_start_context { const struct genl_family *family; struct nlmsghdr *nlh; struct netlink_ext_ack *extack; - const struct genl_ops *ops; + const struct genl_split_ops *ops; int hdrlen; }; static int genl_start(struct netlink_callback *cb) { struct genl_start_context *ctx = cb->data; - const struct genl_ops *ops = ctx->ops; + const struct genl_split_ops *ops; struct genl_dumpit_info *info; struct nlattr **attrs = NULL; int rc = 0; + ops = ctx->ops; if (ops->validate & GENL_DONT_VALIDATE_DUMP) goto no_attrs; @@ -633,7 +679,7 @@ no_attrs: static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { - const struct genl_ops *ops = &genl_dumpit_info(cb)->op; + const struct genl_split_ops *ops = &genl_dumpit_info(cb)->op; int rc; genl_lock(); @@ -645,7 +691,7 @@ static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb) static int genl_lock_done(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); - const struct genl_ops *ops = &info->op; + const struct genl_split_ops *ops = &info->op; int rc = 0; if (ops->done) { @@ -661,7 +707,7 @@ static int genl_lock_done(struct netlink_callback *cb) static int genl_parallel_done(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); - const struct genl_ops *ops = &info->op; + const struct genl_split_ops *ops = &info->op; int rc = 0; if (ops->done) @@ -675,7 +721,7 @@ static int genl_family_rcv_msg_dumpit(const struct genl_family *family, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, - const struct genl_ops *ops, + const struct genl_split_ops *ops, int hdrlen, struct net *net) { struct genl_start_context ctx; @@ -721,7 +767,7 @@ static int genl_family_rcv_msg_doit(const struct genl_family *family, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, - const struct genl_ops *ops, + const struct genl_split_ops *ops, int hdrlen, struct net *net) { struct nlattr **attrbuf; @@ -747,16 +793,16 @@ static int genl_family_rcv_msg_doit(const struct genl_family *family, genl_info_net_set(&info, net); memset(&info.user_ptr, 0, sizeof(info.user_ptr)); - if (family->pre_doit) { - err = family->pre_doit(ops, skb, &info); + if (ops->pre_doit) { + err = ops->pre_doit(ops, skb, &info); if (err) goto out; } err = ops->doit(skb, &info); - if (family->post_doit) - family->post_doit(ops, skb, &info); + if (ops->post_doit) + ops->post_doit(ops, skb, &info); out: genl_family_rcv_msg_attrs_free(attrbuf); @@ -801,8 +847,9 @@ static int genl_family_rcv_msg(const struct genl_family *family, { struct net *net = sock_net(skb->sk); struct genlmsghdr *hdr = nlmsg_data(nlh); - struct genl_ops op; + struct genl_split_ops op; int hdrlen; + u8 flags; /* this family doesn't exist in this netns */ if (!family->netnsok && !net_eq(net, &init_net)) @@ -815,7 +862,9 @@ static int genl_family_rcv_msg(const struct genl_family *family, if (genl_header_check(family, nlh, hdr, extack)) return -EINVAL; - if (genl_get_cmd(hdr->cmd, family, &op)) + flags = (nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP ? + GENL_CMD_CAP_DUMP : GENL_CMD_CAP_DO; + if (genl_get_cmd_split(hdr->cmd, flags, family, &op)) return -EOPNOTSUPP; if ((op.flags & GENL_ADMIN_PERM) && @@ -826,7 +875,7 @@ static int genl_family_rcv_msg(const struct genl_family *family, !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; - if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) + if (flags & GENL_CMD_CAP_DUMP) return genl_family_rcv_msg_dumpit(family, skb, nlh, extack, &op, hdrlen, net); else diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 148f66edb015..1ad0326ff4dc 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -16140,7 +16140,8 @@ static u32 nl80211_internal_flags[] = { #undef SELECTOR }; -static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, +static int nl80211_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = NULL; @@ -16241,7 +16242,8 @@ out_unlock: return err; } -static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb, +static void nl80211_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) { u32 internal_flags = nl80211_internal_flags[ops->internal_flags]; -- cgit v1.2.3-70-g09d2 From 7747eb75f6185a92db2e4a2643fdf6e4b6d87153 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Nov 2022 12:13:34 -0700 Subject: genetlink: load policy based on validation flags Set the policy and maxattr pointers based on validation flags. genl_family_rcv_msg_attrs_parse() will do nothing and return NULL if maxattrs is zero, so no behavior change is expected. Signed-off-by: Jakub Kicinski Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index c66299740c05..770726ac491e 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -204,8 +204,14 @@ genl_cmd_full_to_split(struct genl_split_ops *op, op->post_doit = family->post_doit; } - op->policy = full->policy; - op->maxattr = full->maxattr; + if (flags & GENL_CMD_CAP_DUMP && + full->validate & GENL_DONT_VALIDATE_DUMP) { + op->policy = NULL; + op->maxattr = 0; + } else { + op->policy = full->policy; + op->maxattr = full->maxattr; + } op->cmd = full->cmd; op->internal_flags = full->internal_flags; @@ -638,10 +644,8 @@ static int genl_start(struct netlink_callback *cb) int rc = 0; ops = ctx->ops; - if (ops->validate & GENL_DONT_VALIDATE_DUMP) - goto no_attrs; - - if (ctx->nlh->nlmsg_len < nlmsg_msg_size(ctx->hdrlen)) + if (!(ops->validate & GENL_DONT_VALIDATE_DUMP) && + ctx->nlh->nlmsg_len < nlmsg_msg_size(ctx->hdrlen)) return -EINVAL; attrs = genl_family_rcv_msg_attrs_parse(ctx->family, ctx->nlh, ctx->extack, @@ -650,7 +654,6 @@ static int genl_start(struct netlink_callback *cb) if (IS_ERR(attrs)) return PTR_ERR(attrs); -no_attrs: info = genl_dumpit_info_alloc(); if (!info) { genl_family_rcv_msg_attrs_free(attrs); -- cgit v1.2.3-70-g09d2 From e1a248911d0694c8f95510fcaa920c0b8e99c26c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Nov 2022 12:13:35 -0700 Subject: genetlink: check for callback type at op load time Now that genl_get_cmd_split() is informed what type of callback user is trying to access (do or dump) we can check that this callback is indeed available and return an error early. Signed-off-by: Jakub Kicinski Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 770726ac491e..7c04df1bee2b 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -189,11 +189,17 @@ static int genl_get_cmd(u32 cmd, const struct genl_family *family, return genl_get_cmd_small(cmd, family, op); } -static void +static int genl_cmd_full_to_split(struct genl_split_ops *op, const struct genl_family *family, const struct genl_ops *full, u8 flags) { + if ((flags & GENL_CMD_CAP_DO && !full->doit) || + (flags & GENL_CMD_CAP_DUMP && !full->dumpit)) { + memset(op, 0, sizeof(*op)); + return -ENOENT; + } + if (flags & GENL_CMD_CAP_DUMP) { op->start = full->start; op->dumpit = full->dumpit; @@ -220,6 +226,8 @@ genl_cmd_full_to_split(struct genl_split_ops *op, /* Make sure flags include the GENL_CMD_CAP_DO / GENL_CMD_CAP_DUMP */ op->flags |= flags; + + return 0; } static int @@ -235,9 +243,7 @@ genl_get_cmd_split(u32 cmd, u8 flags, const struct genl_family *family, return err; } - genl_cmd_full_to_split(op, family, &full, flags); - - return 0; + return genl_cmd_full_to_split(op, family, &full, flags); } static void genl_get_cmd_by_index(unsigned int i, @@ -730,9 +736,6 @@ static int genl_family_rcv_msg_dumpit(const struct genl_family *family, struct genl_start_context ctx; int err; - if (!ops->dumpit) - return -EOPNOTSUPP; - ctx.family = family; ctx.nlh = nlh; ctx.extack = extack; @@ -777,9 +780,6 @@ static int genl_family_rcv_msg_doit(const struct genl_family *family, struct genl_info info; int err; - if (!ops->doit) - return -EOPNOTSUPP; - attrbuf = genl_family_rcv_msg_attrs_parse(family, nlh, extack, ops, hdrlen, GENL_DONT_VALIDATE_STRICT); -- cgit v1.2.3-70-g09d2 From 92d3d9ba9bb3ab17b5bd4fd46032ced0264872c9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Nov 2022 12:13:36 -0700 Subject: genetlink: add policies for both doit and dumpit in ctrl_dumppolicy_start() Separate adding doit and dumpit policies for CTRL_CMD_GETPOLICY. This has no effect until we actually allow do and dump to come from different sources as netlink_policy_dump_add_policy() does deduplication. Signed-off-by: Jakub Kicinski Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 48 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 7c04df1bee2b..d0c35738839b 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1260,29 +1260,57 @@ static int ctrl_dumppolicy_start(struct netlink_callback *cb) ctx->rt = rt; if (tb[CTRL_ATTR_OP]) { + struct genl_split_ops doit, dump; + ctx->single_op = true; ctx->op = nla_get_u32(tb[CTRL_ATTR_OP]); - err = genl_get_cmd(ctx->op, rt, &op); - if (err) { + if (genl_get_cmd_split(ctx->op, GENL_CMD_CAP_DO, rt, &doit) && + genl_get_cmd_split(ctx->op, GENL_CMD_CAP_DUMP, rt, &dump)) { NL_SET_BAD_ATTR(cb->extack, tb[CTRL_ATTR_OP]); - return err; + return -ENOENT; } - if (!op.policy) - return -ENODATA; + if (doit.policy) { + err = netlink_policy_dump_add_policy(&ctx->state, + doit.policy, + doit.maxattr); + if (err) + goto err_free_state; + } + if (dump.policy) { + err = netlink_policy_dump_add_policy(&ctx->state, + dump.policy, + dump.maxattr); + if (err) + goto err_free_state; + } - return netlink_policy_dump_add_policy(&ctx->state, op.policy, - op.maxattr); + if (!ctx->state) + return -ENODATA; + return 0; } for (i = 0; i < genl_get_cmd_cnt(rt); i++) { + struct genl_split_ops doit, dumpit; + genl_get_cmd_by_index(i, rt, &op); - if (op.policy) { + genl_cmd_full_to_split(&doit, ctx->rt, &op, GENL_CMD_CAP_DO); + genl_cmd_full_to_split(&dumpit, ctx->rt, + &op, GENL_CMD_CAP_DUMP); + + if (doit.policy) { + err = netlink_policy_dump_add_policy(&ctx->state, + doit.policy, + doit.maxattr); + if (err) + goto err_free_state; + } + if (dumpit.policy) { err = netlink_policy_dump_add_policy(&ctx->state, - op.policy, - op.maxattr); + dumpit.policy, + dumpit.maxattr); if (err) goto err_free_state; } -- cgit v1.2.3-70-g09d2 From 26588edbef6094c41734b4443c2d7b4c4bd0085f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Nov 2022 12:13:37 -0700 Subject: genetlink: support split policies in ctrl_dumppolicy_put_op() Pass do and dump versions of the op to ctrl_dumppolicy_put_op() so that it can provide a different policy index for the two. Since we now look at policies, and those are set appropriately there's no need to look at the GENL_DONT_VALIDATE_DUMP flag. Signed-off-by: Jakub Kicinski Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 55 +++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index d0c35738839b..93e33e20a0e8 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1345,7 +1345,8 @@ static void *ctrl_dumppolicy_prep(struct sk_buff *skb, static int ctrl_dumppolicy_put_op(struct sk_buff *skb, struct netlink_callback *cb, - struct genl_ops *op) + struct genl_split_ops *doit, + struct genl_split_ops *dumpit) { struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; struct nlattr *nest_pol, *nest_op; @@ -1353,10 +1354,7 @@ static int ctrl_dumppolicy_put_op(struct sk_buff *skb, int idx; /* skip if we have nothing to show */ - if (!op->policy) - return 0; - if (!op->doit && - (!op->dumpit || op->validate & GENL_DONT_VALIDATE_DUMP)) + if (!doit->policy && !dumpit->policy) return 0; hdr = ctrl_dumppolicy_prep(skb, cb); @@ -1367,21 +1365,26 @@ static int ctrl_dumppolicy_put_op(struct sk_buff *skb, if (!nest_pol) goto err; - nest_op = nla_nest_start(skb, op->cmd); + nest_op = nla_nest_start(skb, doit->cmd); if (!nest_op) goto err; - /* for now both do/dump are always the same */ - idx = netlink_policy_dump_get_policy_idx(ctx->state, - op->policy, - op->maxattr); + if (doit->policy) { + idx = netlink_policy_dump_get_policy_idx(ctx->state, + doit->policy, + doit->maxattr); - if (op->doit && nla_put_u32(skb, CTRL_ATTR_POLICY_DO, idx)) - goto err; + if (nla_put_u32(skb, CTRL_ATTR_POLICY_DO, idx)) + goto err; + } + if (dumpit->policy) { + idx = netlink_policy_dump_get_policy_idx(ctx->state, + dumpit->policy, + dumpit->maxattr); - if (op->dumpit && !(op->validate & GENL_DONT_VALIDATE_DUMP) && - nla_put_u32(skb, CTRL_ATTR_POLICY_DUMP, idx)) - goto err; + if (nla_put_u32(skb, CTRL_ATTR_POLICY_DUMP, idx)) + goto err; + } nla_nest_end(skb, nest_op); nla_nest_end(skb, nest_pol); @@ -1399,16 +1402,19 @@ static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb) void *hdr; if (!ctx->policies) { + struct genl_split_ops doit, dumpit; struct genl_ops op; if (ctx->single_op) { - int err; - - err = genl_get_cmd(ctx->op, ctx->rt, &op); - if (WARN_ON(err)) - return err; + if (genl_get_cmd_split(ctx->op, GENL_CMD_CAP_DO, + ctx->rt, &doit) && + genl_get_cmd_split(ctx->op, GENL_CMD_CAP_DUMP, + ctx->rt, &dumpit)) { + WARN_ON(1); + return -ENOENT; + } - if (ctrl_dumppolicy_put_op(skb, cb, &op)) + if (ctrl_dumppolicy_put_op(skb, cb, &doit, &dumpit)) return skb->len; /* don't enter the loop below */ @@ -1418,7 +1424,12 @@ static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb) while (ctx->opidx < genl_get_cmd_cnt(ctx->rt)) { genl_get_cmd_by_index(ctx->opidx, ctx->rt, &op); - if (ctrl_dumppolicy_put_op(skb, cb, &op)) + genl_cmd_full_to_split(&doit, ctx->rt, + &op, GENL_CMD_CAP_DO); + genl_cmd_full_to_split(&dumpit, ctx->rt, + &op, GENL_CMD_CAP_DUMP); + + if (ctrl_dumppolicy_put_op(skb, cb, &doit, &dumpit)) return skb->len; ctx->opidx++; -- cgit v1.2.3-70-g09d2 From 8d84322ae6d7921c461b6a5ef39be0b48c177b8a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Nov 2022 12:13:38 -0700 Subject: genetlink: inline genl_get_cmd() All callers go via genl_get_cmd_split() now, so rename it to genl_get_cmd() remove the original. Reviewed-by: Jacob Keller Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 93e33e20a0e8..ec32b6063a3f 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -181,14 +181,6 @@ static int genl_get_cmd_small(u32 cmd, const struct genl_family *family, return -ENOENT; } -static int genl_get_cmd(u32 cmd, const struct genl_family *family, - struct genl_ops *op) -{ - if (!genl_get_cmd_full(cmd, family, op)) - return 0; - return genl_get_cmd_small(cmd, family, op); -} - static int genl_cmd_full_to_split(struct genl_split_ops *op, const struct genl_family *family, @@ -231,13 +223,15 @@ genl_cmd_full_to_split(struct genl_split_ops *op, } static int -genl_get_cmd_split(u32 cmd, u8 flags, const struct genl_family *family, - struct genl_split_ops *op) +genl_get_cmd(u32 cmd, u8 flags, const struct genl_family *family, + struct genl_split_ops *op) { struct genl_ops full; int err; - err = genl_get_cmd(cmd, family, &full); + err = genl_get_cmd_full(cmd, family, &full); + if (err == -ENOENT) + err = genl_get_cmd_small(cmd, family, &full); if (err) { memset(op, 0, sizeof(*op)); return err; @@ -867,7 +861,7 @@ static int genl_family_rcv_msg(const struct genl_family *family, flags = (nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP ? GENL_CMD_CAP_DUMP : GENL_CMD_CAP_DO; - if (genl_get_cmd_split(hdr->cmd, flags, family, &op)) + if (genl_get_cmd(hdr->cmd, flags, family, &op)) return -EOPNOTSUPP; if ((op.flags & GENL_ADMIN_PERM) && @@ -1265,8 +1259,8 @@ static int ctrl_dumppolicy_start(struct netlink_callback *cb) ctx->single_op = true; ctx->op = nla_get_u32(tb[CTRL_ATTR_OP]); - if (genl_get_cmd_split(ctx->op, GENL_CMD_CAP_DO, rt, &doit) && - genl_get_cmd_split(ctx->op, GENL_CMD_CAP_DUMP, rt, &dump)) { + if (genl_get_cmd(ctx->op, GENL_CMD_CAP_DO, rt, &doit) && + genl_get_cmd(ctx->op, GENL_CMD_CAP_DUMP, rt, &dump)) { NL_SET_BAD_ATTR(cb->extack, tb[CTRL_ATTR_OP]); return -ENOENT; } @@ -1406,10 +1400,10 @@ static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb) struct genl_ops op; if (ctx->single_op) { - if (genl_get_cmd_split(ctx->op, GENL_CMD_CAP_DO, - ctx->rt, &doit) && - genl_get_cmd_split(ctx->op, GENL_CMD_CAP_DUMP, - ctx->rt, &dumpit)) { + if (genl_get_cmd(ctx->op, GENL_CMD_CAP_DO, + ctx->rt, &doit) && + genl_get_cmd(ctx->op, GENL_CMD_CAP_DUMP, + ctx->rt, &dumpit)) { WARN_ON(1); return -ENOENT; } -- cgit v1.2.3-70-g09d2 From 6557461cd27843213ca19a7918bbaa8c3b7c5f5c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Nov 2022 12:13:39 -0700 Subject: genetlink: add iterator for walking family ops Subsequent changes will expose split op structures to users, so walking the family ops with just an index will get harder. Add a structured iterator, convert the simple cases. Policy dumping needs more careful conversion. Signed-off-by: Jakub Kicinski Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 117 +++++++++++++++++++++++++++++++----------------- 1 file changed, 76 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index ec32b6063a3f..06bf091c1b8a 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -252,6 +252,57 @@ static void genl_get_cmd_by_index(unsigned int i, WARN_ON_ONCE(1); } +struct genl_op_iter { + const struct genl_family *family; + struct genl_split_ops doit; + struct genl_split_ops dumpit; + int i; + u32 cmd; + u8 flags; +}; + +static bool +genl_op_iter_init(const struct genl_family *family, struct genl_op_iter *iter) +{ + iter->family = family; + iter->i = 0; + + iter->flags = 0; + + return genl_get_cmd_cnt(iter->family); +} + +static bool genl_op_iter_next(struct genl_op_iter *iter) +{ + struct genl_ops op; + + if (iter->i >= genl_get_cmd_cnt(iter->family)) + return false; + + genl_get_cmd_by_index(iter->i, iter->family, &op); + iter->i++; + + genl_cmd_full_to_split(&iter->doit, iter->family, &op, GENL_CMD_CAP_DO); + genl_cmd_full_to_split(&iter->dumpit, iter->family, + &op, GENL_CMD_CAP_DUMP); + + iter->cmd = iter->doit.cmd | iter->dumpit.cmd; + iter->flags = iter->doit.flags | iter->dumpit.flags; + + return true; +} + +static void +genl_op_iter_copy(struct genl_op_iter *dst, struct genl_op_iter *src) +{ + *dst = *src; +} + +static unsigned int genl_op_iter_idx(struct genl_op_iter *iter) +{ + return iter->i; +} + static int genl_allocate_reserve_groups(int n_groups, int *first_id) { unsigned long *new_groups; @@ -419,25 +470,23 @@ static void genl_unregister_mc_groups(const struct genl_family *family) static int genl_validate_ops(const struct genl_family *family) { - int i, j; + struct genl_op_iter i, j; if (WARN_ON(family->n_ops && !family->ops) || WARN_ON(family->n_small_ops && !family->small_ops)) return -EINVAL; - for (i = 0; i < genl_get_cmd_cnt(family); i++) { - struct genl_ops op; - - genl_get_cmd_by_index(i, family, &op); - if (op.dumpit == NULL && op.doit == NULL) + for (genl_op_iter_init(family, &i); genl_op_iter_next(&i); ) { + if (!(i.flags & (GENL_CMD_CAP_DO | GENL_CMD_CAP_DUMP))) return -EINVAL; - if (WARN_ON(op.cmd >= family->resv_start_op && op.validate)) + + if (WARN_ON(i.cmd >= family->resv_start_op && + (i.doit.validate || i.dumpit.validate))) return -EINVAL; - for (j = i + 1; j < genl_get_cmd_cnt(family); j++) { - struct genl_ops op2; - genl_get_cmd_by_index(j, family, &op2); - if (op.cmd == op2.cmd) + genl_op_iter_copy(&j, &i); + while (genl_op_iter_next(&j)) { + if (i.cmd == j.cmd) return -EINVAL; } } @@ -917,6 +966,7 @@ static struct genl_family genl_ctrl; static int ctrl_fill_info(const struct genl_family *family, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) { + struct genl_op_iter i; void *hdr; hdr = genlmsg_put(skb, portid, seq, &genl_ctrl, flags, cmd); @@ -930,33 +980,26 @@ static int ctrl_fill_info(const struct genl_family *family, u32 portid, u32 seq, nla_put_u32(skb, CTRL_ATTR_MAXATTR, family->maxattr)) goto nla_put_failure; - if (genl_get_cmd_cnt(family)) { + if (genl_op_iter_init(family, &i)) { struct nlattr *nla_ops; - int i; nla_ops = nla_nest_start_noflag(skb, CTRL_ATTR_OPS); if (nla_ops == NULL) goto nla_put_failure; - for (i = 0; i < genl_get_cmd_cnt(family); i++) { + while (genl_op_iter_next(&i)) { struct nlattr *nest; - struct genl_ops op; u32 op_flags; - genl_get_cmd_by_index(i, family, &op); - op_flags = op.flags; - if (op.dumpit) - op_flags |= GENL_CMD_CAP_DUMP; - if (op.doit) - op_flags |= GENL_CMD_CAP_DO; - if (op.policy) + op_flags = i.flags; + if (i.doit.policy || i.dumpit.policy) op_flags |= GENL_CMD_CAP_HASPOL; - nest = nla_nest_start_noflag(skb, i + 1); + nest = nla_nest_start_noflag(skb, genl_op_iter_idx(&i)); if (nest == NULL) goto nla_put_failure; - if (nla_put_u32(skb, CTRL_ATTR_OP_ID, op.cmd) || + if (nla_put_u32(skb, CTRL_ATTR_OP_ID, i.cmd) || nla_put_u32(skb, CTRL_ATTR_OP_FLAGS, op_flags)) goto nla_put_failure; @@ -1229,8 +1272,8 @@ static int ctrl_dumppolicy_start(struct netlink_callback *cb) struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; struct nlattr **tb = info->attrs; const struct genl_family *rt; - struct genl_ops op; - int err, i; + struct genl_op_iter i; + int err; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); @@ -1285,26 +1328,18 @@ static int ctrl_dumppolicy_start(struct netlink_callback *cb) return 0; } - for (i = 0; i < genl_get_cmd_cnt(rt); i++) { - struct genl_split_ops doit, dumpit; - - genl_get_cmd_by_index(i, rt, &op); - - genl_cmd_full_to_split(&doit, ctx->rt, &op, GENL_CMD_CAP_DO); - genl_cmd_full_to_split(&dumpit, ctx->rt, - &op, GENL_CMD_CAP_DUMP); - - if (doit.policy) { + for (genl_op_iter_init(rt, &i); genl_op_iter_next(&i); ) { + if (i.doit.policy) { err = netlink_policy_dump_add_policy(&ctx->state, - doit.policy, - doit.maxattr); + i.doit.policy, + i.doit.maxattr); if (err) goto err_free_state; } - if (dumpit.policy) { + if (i.dumpit.policy) { err = netlink_policy_dump_add_policy(&ctx->state, - dumpit.policy, - dumpit.maxattr); + i.dumpit.policy, + i.dumpit.maxattr); if (err) goto err_free_state; } -- cgit v1.2.3-70-g09d2 From b502b3185cd68f24510f8a253bf31a1b34605bb6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Nov 2022 12:13:40 -0700 Subject: genetlink: use iterator in the op to policy map dumping We can't put the full iterator in the struct ctrl_dump_policy_ctx because dump context is statically sized by netlink core. Allocate it dynamically. Rename policy to dump_map to make the logic a little easier to follow. Signed-off-by: Jakub Kicinski Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 06bf091c1b8a..4b8c65d9e9d3 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1252,10 +1252,10 @@ static int genl_ctrl_event(int event, const struct genl_family *family, struct ctrl_dump_policy_ctx { struct netlink_policy_dump_state *state; const struct genl_family *rt; - unsigned int opidx; + struct genl_op_iter *op_iter; u32 op; u16 fam_id; - u8 policies:1, + u8 dump_map:1, single_op:1; }; @@ -1325,9 +1325,16 @@ static int ctrl_dumppolicy_start(struct netlink_callback *cb) if (!ctx->state) return -ENODATA; + + ctx->dump_map = 1; return 0; } + ctx->op_iter = kmalloc(sizeof(*ctx->op_iter), GFP_KERNEL); + if (!ctx->op_iter) + return -ENOMEM; + ctx->dump_map = genl_op_iter_init(rt, ctx->op_iter); + for (genl_op_iter_init(rt, &i); genl_op_iter_next(&i); ) { if (i.doit.policy) { err = netlink_policy_dump_add_policy(&ctx->state, @@ -1345,12 +1352,16 @@ static int ctrl_dumppolicy_start(struct netlink_callback *cb) } } - if (!ctx->state) - return -ENODATA; + if (!ctx->state) { + err = -ENODATA; + goto err_free_op_iter; + } return 0; err_free_state: netlink_policy_dump_free(ctx->state); +err_free_op_iter: + kfree(ctx->op_iter); return err; } @@ -1430,11 +1441,10 @@ static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb) struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; void *hdr; - if (!ctx->policies) { - struct genl_split_ops doit, dumpit; - struct genl_ops op; - + if (ctx->dump_map) { if (ctx->single_op) { + struct genl_split_ops doit, dumpit; + if (genl_get_cmd(ctx->op, GENL_CMD_CAP_DO, ctx->rt, &doit) && genl_get_cmd(ctx->op, GENL_CMD_CAP_DUMP, @@ -1446,26 +1456,18 @@ static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb) if (ctrl_dumppolicy_put_op(skb, cb, &doit, &dumpit)) return skb->len; - /* don't enter the loop below */ - ctx->opidx = genl_get_cmd_cnt(ctx->rt); + /* done with the per-op policy index list */ + ctx->dump_map = 0; } - while (ctx->opidx < genl_get_cmd_cnt(ctx->rt)) { - genl_get_cmd_by_index(ctx->opidx, ctx->rt, &op); - - genl_cmd_full_to_split(&doit, ctx->rt, - &op, GENL_CMD_CAP_DO); - genl_cmd_full_to_split(&dumpit, ctx->rt, - &op, GENL_CMD_CAP_DUMP); - - if (ctrl_dumppolicy_put_op(skb, cb, &doit, &dumpit)) + while (ctx->dump_map) { + if (ctrl_dumppolicy_put_op(skb, cb, + &ctx->op_iter->doit, + &ctx->op_iter->dumpit)) return skb->len; - ctx->opidx++; + ctx->dump_map = genl_op_iter_next(ctx->op_iter); } - - /* completed with the per-op policy index list */ - ctx->policies = true; } while (netlink_policy_dump_loop(ctx->state)) { @@ -1498,6 +1500,7 @@ static int ctrl_dumppolicy_done(struct netlink_callback *cb) { struct ctrl_dump_policy_ctx *ctx = (void *)cb->ctx; + kfree(ctx->op_iter); netlink_policy_dump_free(ctx->state); return 0; } -- cgit v1.2.3-70-g09d2 From 7acfbbe17c18554960b812a95c26d7ef9b7b28ac Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Nov 2022 12:13:41 -0700 Subject: genetlink: inline old iteration helpers All dumpers use the iterators now, inline the cmd by index stuff into iterator code. Reviewed-by: Jacob Keller Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 4b8c65d9e9d3..0a4f1470f442 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -118,11 +118,6 @@ static const struct genl_family *genl_family_find_byname(char *name) return NULL; } -static int genl_get_cmd_cnt(const struct genl_family *family) -{ - return family->n_ops + family->n_small_ops; -} - static void genl_op_from_full(const struct genl_family *family, unsigned int i, struct genl_ops *op) { @@ -240,18 +235,6 @@ genl_get_cmd(u32 cmd, u8 flags, const struct genl_family *family, return genl_cmd_full_to_split(op, family, &full, flags); } -static void genl_get_cmd_by_index(unsigned int i, - const struct genl_family *family, - struct genl_ops *op) -{ - if (i < family->n_ops) - genl_op_from_full(family, i, op); - else if (i < family->n_ops + family->n_small_ops) - genl_op_from_small(family, i - family->n_ops, op); - else - WARN_ON_ONCE(1); -} - struct genl_op_iter { const struct genl_family *family; struct genl_split_ops doit; @@ -269,22 +252,25 @@ genl_op_iter_init(const struct genl_family *family, struct genl_op_iter *iter) iter->flags = 0; - return genl_get_cmd_cnt(iter->family); + return iter->family->n_ops + iter->family->n_small_ops; } static bool genl_op_iter_next(struct genl_op_iter *iter) { + const struct genl_family *family = iter->family; struct genl_ops op; - if (iter->i >= genl_get_cmd_cnt(iter->family)) + if (iter->i < family->n_ops) + genl_op_from_full(family, iter->i, &op); + else if (iter->i < family->n_ops + family->n_small_ops) + genl_op_from_small(family, iter->i - family->n_ops, &op); + else return false; - genl_get_cmd_by_index(iter->i, iter->family, &op); iter->i++; - genl_cmd_full_to_split(&iter->doit, iter->family, &op, GENL_CMD_CAP_DO); - genl_cmd_full_to_split(&iter->dumpit, iter->family, - &op, GENL_CMD_CAP_DUMP); + genl_cmd_full_to_split(&iter->doit, family, &op, GENL_CMD_CAP_DO); + genl_cmd_full_to_split(&iter->dumpit, family, &op, GENL_CMD_CAP_DUMP); iter->cmd = iter->doit.cmd | iter->dumpit.cmd; iter->flags = iter->doit.flags | iter->dumpit.flags; -- cgit v1.2.3-70-g09d2 From b8fd60c36a44351f773432e24efd8bb92f8ba0c6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Nov 2022 12:13:42 -0700 Subject: genetlink: allow families to use split ops directly Let families to hook in the new split ops. They are more flexible and should not be much larger than full ops. Each split op is 40B while full op is 48B. Devlink for example has 54 dos and 19 dumps, 2 of the dumps do not have a do -> 56 full commands = 2688B. Split ops would have taken 2920B, so 9% more space while allowing individual per/post doit and per-type policies. Signed-off-by: Jakub Kicinski Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- include/net/genetlink.h | 5 ++ net/netlink/genetlink.c | 170 ++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 149 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 4be7989c451b..d21210709f84 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -46,6 +46,9 @@ struct genl_info; * @n_ops: number of operations supported by this family * @small_ops: the small-struct operations supported by this family * @n_small_ops: number of small-struct operations supported by this family + * @split_ops: the split do/dump form of operation definition + * @n_split_ops: number of entries in @split_ops, not that with split do/dump + * ops the number of entries is not the same as number of commands * * Attribute policies (the combination of @policy and @maxattr fields) * can be attached at the family level or at the operation level. @@ -63,6 +66,7 @@ struct genl_family { u8 parallel_ops:1; u8 n_ops; u8 n_small_ops; + u8 n_split_ops; u8 n_mcgrps; u8 resv_start_op; const struct nla_policy *policy; @@ -74,6 +78,7 @@ struct genl_family { struct genl_info *info); const struct genl_ops * ops; const struct genl_small_ops *small_ops; + const struct genl_split_ops *split_ops; const struct genl_multicast_group *mcgrps; struct module *module; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 0a4f1470f442..90b0feb5eb73 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -101,6 +101,17 @@ genl_op_fill_in_reject_policy(const struct genl_family *family, op->maxattr = 1; } +static void +genl_op_fill_in_reject_policy_split(const struct genl_family *family, + struct genl_split_ops *op) +{ + if (op->policy) + return; + + op->policy = genl_policy_reject_all; + op->maxattr = 1; +} + static const struct genl_family *genl_family_find_byid(unsigned int id) { return idr_find(&genl_fam_idr, id); @@ -118,6 +129,16 @@ static const struct genl_family *genl_family_find_byname(char *name) return NULL; } +struct genl_op_iter { + const struct genl_family *family; + struct genl_split_ops doit; + struct genl_split_ops dumpit; + int cmd_idx; + int entry_idx; + u32 cmd; + u8 flags; +}; + static void genl_op_from_full(const struct genl_family *family, unsigned int i, struct genl_ops *op) { @@ -176,6 +197,50 @@ static int genl_get_cmd_small(u32 cmd, const struct genl_family *family, return -ENOENT; } +static void genl_op_from_split(struct genl_op_iter *iter) +{ + const struct genl_family *family = iter->family; + int i, cnt = 0; + + i = iter->entry_idx - family->n_ops - family->n_small_ops; + + if (family->split_ops[i + cnt].flags & GENL_CMD_CAP_DO) { + iter->doit = family->split_ops[i + cnt]; + genl_op_fill_in_reject_policy_split(family, &iter->doit); + cnt++; + } else { + memset(&iter->doit, 0, sizeof(iter->doit)); + } + + if (i + cnt < family->n_split_ops && + family->split_ops[i + cnt].flags & GENL_CMD_CAP_DUMP) { + iter->dumpit = family->split_ops[i + cnt]; + genl_op_fill_in_reject_policy_split(family, &iter->dumpit); + cnt++; + } else { + memset(&iter->dumpit, 0, sizeof(iter->dumpit)); + } + + WARN_ON(!cnt); + iter->entry_idx += cnt; +} + +static int +genl_get_cmd_split(u32 cmd, u8 flag, const struct genl_family *family, + struct genl_split_ops *op) +{ + int i; + + for (i = 0; i < family->n_split_ops; i++) + if (family->split_ops[i].cmd == cmd && + family->split_ops[i].flags & flag) { + *op = family->split_ops[i]; + return 0; + } + + return -ENOENT; +} + static int genl_cmd_full_to_split(struct genl_split_ops *op, const struct genl_family *family, @@ -227,50 +292,60 @@ genl_get_cmd(u32 cmd, u8 flags, const struct genl_family *family, err = genl_get_cmd_full(cmd, family, &full); if (err == -ENOENT) err = genl_get_cmd_small(cmd, family, &full); - if (err) { - memset(op, 0, sizeof(*op)); - return err; - } + /* Found one of legacy forms */ + if (err == 0) + return genl_cmd_full_to_split(op, family, &full, flags); - return genl_cmd_full_to_split(op, family, &full, flags); + err = genl_get_cmd_split(cmd, flags, family, op); + if (err) + memset(op, 0, sizeof(*op)); + return err; } -struct genl_op_iter { - const struct genl_family *family; - struct genl_split_ops doit; - struct genl_split_ops dumpit; - int i; - u32 cmd; - u8 flags; -}; - static bool genl_op_iter_init(const struct genl_family *family, struct genl_op_iter *iter) { iter->family = family; - iter->i = 0; + iter->cmd_idx = 0; + iter->entry_idx = 0; iter->flags = 0; - return iter->family->n_ops + iter->family->n_small_ops; + return iter->family->n_ops + + iter->family->n_small_ops + + iter->family->n_split_ops; } static bool genl_op_iter_next(struct genl_op_iter *iter) { const struct genl_family *family = iter->family; + bool legacy_op = true; struct genl_ops op; - if (iter->i < family->n_ops) - genl_op_from_full(family, iter->i, &op); - else if (iter->i < family->n_ops + family->n_small_ops) - genl_op_from_small(family, iter->i - family->n_ops, &op); - else + if (iter->entry_idx < family->n_ops) { + genl_op_from_full(family, iter->entry_idx, &op); + } else if (iter->entry_idx < family->n_ops + family->n_small_ops) { + genl_op_from_small(family, iter->entry_idx - family->n_ops, + &op); + } else if (iter->entry_idx < + family->n_ops + family->n_small_ops + family->n_split_ops) { + legacy_op = false; + /* updates entry_idx */ + genl_op_from_split(iter); + } else { return false; + } - iter->i++; + iter->cmd_idx++; - genl_cmd_full_to_split(&iter->doit, family, &op, GENL_CMD_CAP_DO); - genl_cmd_full_to_split(&iter->dumpit, family, &op, GENL_CMD_CAP_DUMP); + if (legacy_op) { + iter->entry_idx++; + + genl_cmd_full_to_split(&iter->doit, family, + &op, GENL_CMD_CAP_DO); + genl_cmd_full_to_split(&iter->dumpit, family, + &op, GENL_CMD_CAP_DUMP); + } iter->cmd = iter->doit.cmd | iter->dumpit.cmd; iter->flags = iter->doit.flags | iter->dumpit.flags; @@ -286,7 +361,7 @@ genl_op_iter_copy(struct genl_op_iter *dst, struct genl_op_iter *src) static unsigned int genl_op_iter_idx(struct genl_op_iter *iter) { - return iter->i; + return iter->cmd_idx; } static int genl_allocate_reserve_groups(int n_groups, int *first_id) @@ -454,12 +529,22 @@ static void genl_unregister_mc_groups(const struct genl_family *family) } } +static bool genl_split_op_check(const struct genl_split_ops *op) +{ + if (WARN_ON(hweight8(op->flags & (GENL_CMD_CAP_DO | + GENL_CMD_CAP_DUMP)) != 1)) + return true; + return false; +} + static int genl_validate_ops(const struct genl_family *family) { struct genl_op_iter i, j; + unsigned int s; if (WARN_ON(family->n_ops && !family->ops) || - WARN_ON(family->n_small_ops && !family->small_ops)) + WARN_ON(family->n_small_ops && !family->small_ops) || + WARN_ON(family->n_split_ops && !family->split_ops)) return -EINVAL; for (genl_op_iter_init(family, &i); genl_op_iter_next(&i); ) { @@ -477,6 +562,39 @@ static int genl_validate_ops(const struct genl_family *family) } } + if (family->n_split_ops) { + if (genl_split_op_check(&family->split_ops[0])) + return -EINVAL; + } + + for (s = 1; s < family->n_split_ops; s++) { + const struct genl_split_ops *a, *b; + + a = &family->split_ops[s - 1]; + b = &family->split_ops[s]; + + if (genl_split_op_check(b)) + return -EINVAL; + + /* Check sort order */ + if (a->cmd < b->cmd) + continue; + + if (a->internal_flags != b->internal_flags || + ((a->flags ^ b->flags) & ~(GENL_CMD_CAP_DO | + GENL_CMD_CAP_DUMP))) { + WARN_ON(1); + return -EINVAL; + } + + if ((a->flags & GENL_CMD_CAP_DO) && + (b->flags & GENL_CMD_CAP_DUMP)) + continue; + + WARN_ON(1); + return -EINVAL; + } + return 0; } -- cgit v1.2.3-70-g09d2 From aba22ca8ccd6954ffbb1a7a671f34300887b4727 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Nov 2022 12:13:43 -0700 Subject: genetlink: convert control family to split ops Prove that the split ops work. Sadly we need to keep bug-wards compatibility and specify the same policy for dump as do, even tho we don't parse inputs for the dump. Signed-off-by: Jakub Kicinski Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- net/netlink/genetlink.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 90b0feb5eb73..362a61179036 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1609,14 +1609,22 @@ static int ctrl_dumppolicy_done(struct netlink_callback *cb) return 0; } -static const struct genl_ops genl_ctrl_ops[] = { +static const struct genl_split_ops genl_ctrl_ops[] = { { .cmd = CTRL_CMD_GETFAMILY, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT, .policy = ctrl_policy_family, .maxattr = ARRAY_SIZE(ctrl_policy_family) - 1, .doit = ctrl_getfamily, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = CTRL_CMD_GETFAMILY, + .validate = GENL_DONT_VALIDATE_DUMP, + .policy = ctrl_policy_family, + .maxattr = ARRAY_SIZE(ctrl_policy_family) - 1, .dumpit = ctrl_dumpfamily, + .flags = GENL_CMD_CAP_DUMP, }, { .cmd = CTRL_CMD_GETPOLICY, @@ -1625,6 +1633,7 @@ static const struct genl_ops genl_ctrl_ops[] = { .start = ctrl_dumppolicy_start, .dumpit = ctrl_dumppolicy, .done = ctrl_dumppolicy_done, + .flags = GENL_CMD_CAP_DUMP, }, }; @@ -1634,8 +1643,8 @@ static const struct genl_multicast_group genl_ctrl_groups[] = { static struct genl_family genl_ctrl __ro_after_init = { .module = THIS_MODULE, - .ops = genl_ctrl_ops, - .n_ops = ARRAY_SIZE(genl_ctrl_ops), + .split_ops = genl_ctrl_ops, + .n_split_ops = ARRAY_SIZE(genl_ctrl_ops), .resv_start_op = CTRL_CMD_GETPOLICY + 1, .mcgrps = genl_ctrl_groups, .n_mcgrps = ARRAY_SIZE(genl_ctrl_groups), -- cgit v1.2.3-70-g09d2 From e69761483361f3df455bc493c99af0ef1744a14f Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Sat, 5 Nov 2022 17:05:04 +0800 Subject: netlink: Fix potential skb memleak in netlink_ack Fix coverity issue 'Resource leak'. We should clean the skb resource if nlmsg_put/append failed. Fixes: 738136a0e375 ("netlink: split up copies in the ack construction") Signed-off-by: Tao Chen Link: https://lore.kernel.org/r/bff442d62c87de6299817fe1897cc5a5694ba9cc.1667638204.git.chentao.kernel@linux.alibaba.com Signed-off-by: Jakub Kicinski --- net/netlink/af_netlink.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index b10d5e50b99d..9ebdf3262015 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2500,7 +2500,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, skb = nlmsg_new(payload + tlvlen, GFP_KERNEL); if (!skb) - goto err_bad_put; + goto err_skb; rep = nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, NLMSG_ERROR, sizeof(*errmsg), flags); @@ -2528,6 +2528,8 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, return; err_bad_put: + nlmsg_free(skb); +err_skb: NETLINK_CB(in_skb).sk->sk_err = ENOBUFS; sk_error_report(NETLINK_CB(in_skb).sk); } -- cgit v1.2.3-70-g09d2 From 9a0f830f80265bd1ef816e1541ac24bee80e9a3c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Nov 2022 12:01:25 -0700 Subject: ethtool: linkstate: add a statistic for PHY down events The previous attempt to augment carrier_down (see Link) was not met with much enthusiasm so let's do the simple thing of exposing what some devices already maintain. Add a common ethtool statistic for link going down. Currently users have to maintain per-driver mapping to extract the right stat from the vendor-specific ethtool -S stats. carrier_down does not fit the bill because it counts a lot of software related false positives. Add the statistic to the extended link state API to steer vendors towards implementing all of it. Implement for bnxt and all Linux-controlled PHYs. mlx5 and (possibly) enic also have a counter for this but I leave the implementation to their maintainers. Link: https://lore.kernel.org/r/20220520004500.2250674-1-kuba@kernel.org Reviewed-by: Florian Fainelli Reviewed-by: Michael Chan Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20221104190125.684910-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- Documentation/networking/ethtool-netlink.rst | 1 + drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 15 ++++++++++++++ drivers/net/phy/phy.c | 1 + include/linux/ethtool.h | 17 ++++++++++++++++ include/linux/phy.h | 3 +++ include/uapi/linux/ethtool_netlink.h | 1 + net/ethtool/linkstate.c | 24 ++++++++++++++++++++++- 7 files changed, 61 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index d578b8bcd8a4..bede24ef44fd 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -491,6 +491,7 @@ Kernel response contents: ``ETHTOOL_A_LINKSTATE_SQI_MAX`` u32 Max support SQI value ``ETHTOOL_A_LINKSTATE_EXT_STATE`` u8 link extended state ``ETHTOOL_A_LINKSTATE_EXT_SUBSTATE`` u8 link extended substate + ``ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT`` u32 count of link down events ==================================== ====== ============================ For most NIC drivers, the value of ``ETHTOOL_A_LINKSTATE_LINK`` returns diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index cc89e5eabcb9..d8f0351df954 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -4112,6 +4112,20 @@ static void bnxt_get_rmon_stats(struct net_device *dev, *ranges = bnxt_rmon_ranges; } +static void bnxt_get_link_ext_stats(struct net_device *dev, + struct ethtool_link_ext_stats *stats) +{ + struct bnxt *bp = netdev_priv(dev); + u64 *rx; + + if (BNXT_VF(bp) || !(bp->flags & BNXT_FLAG_PORT_STATS_EXT)) + return; + + rx = bp->rx_port_stats_ext.sw_stats; + stats->link_down_events = + *(rx + BNXT_RX_STATS_EXT_OFFSET(link_down_events)); +} + void bnxt_ethtool_free(struct bnxt *bp) { kfree(bp->test_info); @@ -4161,6 +4175,7 @@ const struct ethtool_ops bnxt_ethtool_ops = { .get_eeprom = bnxt_get_eeprom, .set_eeprom = bnxt_set_eeprom, .get_link = bnxt_get_link, + .get_link_ext_stats = bnxt_get_link_ext_stats, .get_eee = bnxt_get_eee, .set_eee = bnxt_set_eee, .get_module_info = bnxt_get_module_info, diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index e741d8aebffe..e5b6cb1a77f9 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -67,6 +67,7 @@ static void phy_link_down(struct phy_device *phydev) { phydev->phy_link_change(phydev, false); phy_led_trigger_change_speed(phydev); + WRITE_ONCE(phydev->link_down_events, phydev->link_down_events + 1); } static const char *phy_pause_str(struct phy_device *phydev) diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 99dc7bfbcd3c..5c51c7fda32a 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -125,6 +125,20 @@ struct ethtool_link_ext_state_info { }; }; +struct ethtool_link_ext_stats { + /* Custom Linux statistic for PHY level link down events. + * In a simpler world it should be equal to netdev->carrier_down_count + * unfortunately netdev also counts local reconfigurations which don't + * actually take the physical link down, not to mention NC-SI which, + * if present, keeps the link up regardless of host state. + * This statistic counts when PHY _actually_ went down, or lost link. + * + * Note that we need u64 for ethtool_stats_init() and comparisons + * to ETHTOOL_STAT_NOT_SET, but only u32 is exposed to the user. + */ + u64 link_down_events; +}; + /** * ethtool_rxfh_indir_default - get default value for RX flow hash indirection * @index: Index in RX flow hash indirection table @@ -481,6 +495,7 @@ struct ethtool_module_power_mode_params { * do not attach ext_substate attribute to netlink message). If link_ext_state * and link_ext_substate are unknown, return -ENODATA. If not implemented, * link_ext_state and link_ext_substate will not be sent to userspace. + * @get_link_ext_stats: Read extra link-related counters. * @get_eeprom_len: Read range of EEPROM addresses for validation of * @get_eeprom and @set_eeprom requests. * Returns 0 if device does not support EEPROM access. @@ -652,6 +667,8 @@ struct ethtool_ops { u32 (*get_link)(struct net_device *); int (*get_link_ext_state)(struct net_device *, struct ethtool_link_ext_state_info *); + void (*get_link_ext_stats)(struct net_device *dev, + struct ethtool_link_ext_stats *stats); int (*get_eeprom_len)(struct net_device *); int (*get_eeprom)(struct net_device *, struct ethtool_eeprom *, u8 *); diff --git a/include/linux/phy.h b/include/linux/phy.h index ddf66198f751..9a3752c0c444 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -600,6 +600,7 @@ struct macsec_ops; * @psec: Pointer to Power Sourcing Equipment control struct * @lock: Mutex for serialization access to PHY * @state_queue: Work queue for state machine + * @link_down_events: Number of times link was lost * @shared: Pointer to private data shared by phys in one package * @priv: Pointer to driver private data * @@ -723,6 +724,8 @@ struct phy_device { int pma_extable; + unsigned int link_down_events; + void (*phy_link_change)(struct phy_device *phydev, bool up); void (*adjust_link)(struct net_device *dev); diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index bb57084ac524..aaf7c6963d61 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -262,6 +262,7 @@ enum { ETHTOOL_A_LINKSTATE_SQI_MAX, /* u32 */ ETHTOOL_A_LINKSTATE_EXT_STATE, /* u8 */ ETHTOOL_A_LINKSTATE_EXT_SUBSTATE, /* u8 */ + ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT, /* u32 */ /* add new constants above here */ __ETHTOOL_A_LINKSTATE_CNT, diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c index fb676f349455..2158c17a0b32 100644 --- a/net/ethtool/linkstate.c +++ b/net/ethtool/linkstate.c @@ -13,6 +13,7 @@ struct linkstate_reply_data { int link; int sqi; int sqi_max; + struct ethtool_link_ext_stats link_stats; bool link_ext_state_provided; struct ethtool_link_ext_state_info ethtool_link_ext_state_info; }; @@ -22,7 +23,7 @@ struct linkstate_reply_data { const struct nla_policy ethnl_linkstate_get_policy[] = { [ETHTOOL_A_LINKSTATE_HEADER] = - NLA_POLICY_NESTED(ethnl_header_policy), + NLA_POLICY_NESTED(ethnl_header_policy_stats), }; static int linkstate_get_sqi(struct net_device *dev) @@ -107,6 +108,19 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, goto out; } + ethtool_stats_init((u64 *)&data->link_stats, + sizeof(data->link_stats) / 8); + + if (req_base->flags & ETHTOOL_FLAG_STATS) { + if (dev->phydev) + data->link_stats.link_down_events = + READ_ONCE(dev->phydev->link_down_events); + + if (dev->ethtool_ops->get_link_ext_stats) + dev->ethtool_ops->get_link_ext_stats(dev, + &data->link_stats); + } + ret = 0; out: ethnl_ops_complete(dev); @@ -134,6 +148,9 @@ static int linkstate_reply_size(const struct ethnl_req_info *req_base, if (data->ethtool_link_ext_state_info.__link_ext_substate) len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_SUBSTATE */ + if (data->link_stats.link_down_events != ETHTOOL_STAT_NOT_SET) + len += nla_total_size(sizeof(u32)); + return len; } @@ -166,6 +183,11 @@ static int linkstate_fill_reply(struct sk_buff *skb, return -EMSGSIZE; } + if (data->link_stats.link_down_events != ETHTOOL_STAT_NOT_SET) + if (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT, + data->link_stats.link_down_events)) + return -EMSGSIZE; + return 0; } -- cgit v1.2.3-70-g09d2 From 47f3ecf4763d3fea37d3453c9ee1f9f2169d71b3 Mon Sep 17 00:00:00 2001 From: Gal Pressman Date: Sun, 6 Nov 2022 14:31:27 +0200 Subject: ethtool: Fail number of channels change when it conflicts with rxnfc Similar to what we do with the hash indirection table [1], when network flow classification rules are forwarding traffic to channels greater than the requested number of channels, fail the operation. Without this, traffic could be directed to channels which no longer exist (dropped) after changing number of channels. [1] commit d4ab4286276f ("ethtool: correctly ensure {GS}CHANNELS doesn't conflict with GS{RXFH}") Reviewed-by: Tariq Toukan Signed-off-by: Gal Pressman Link: https://lore.kernel.org/r/20221106123127.522985-1-gal@nvidia.com Signed-off-by: Paolo Abeni --- net/ethtool/channels.c | 19 +++++++++++---- net/ethtool/common.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++ net/ethtool/common.h | 1 + net/ethtool/ioctl.c | 17 ++++++++----- 4 files changed, 92 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index 403158862011..c7e37130647e 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -116,9 +116,10 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) struct ethtool_channels channels = {}; struct ethnl_req_info req_info = {}; struct nlattr **tb = info->attrs; - u32 err_attr, max_rx_in_use = 0; + u32 err_attr, max_rxfh_in_use; const struct ethtool_ops *ops; struct net_device *dev; + u64 max_rxnfc_in_use; int ret; ret = ethnl_parse_header_dev_get(&req_info, @@ -189,15 +190,23 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) } /* ensure the new Rx count fits within the configured Rx flow - * indirection table settings + * indirection table/rxnfc settings */ - if (netif_is_rxfh_configured(dev) && - !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) && - (channels.combined_count + channels.rx_count) <= max_rx_in_use) { + if (ethtool_get_max_rxnfc_channel(dev, &max_rxnfc_in_use)) + max_rxnfc_in_use = 0; + if (!netif_is_rxfh_configured(dev) || + ethtool_get_max_rxfh_channel(dev, &max_rxfh_in_use)) + max_rxfh_in_use = 0; + if (channels.combined_count + channels.rx_count <= max_rxfh_in_use) { ret = -EINVAL; GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing indirection table settings"); goto out_ops; } + if (channels.combined_count + channels.rx_count <= max_rxnfc_in_use) { + ret = -EINVAL; + GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing ntuple filter settings"); + goto out_ops; + } /* Disabling channels, query zero-copy AF_XDP sockets */ from_channel = channels.combined_count + diff --git a/net/ethtool/common.c b/net/ethtool/common.c index ee3e02da0013..21cfe8557205 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -512,6 +512,72 @@ int __ethtool_get_link(struct net_device *dev) return netif_running(dev) && dev->ethtool_ops->get_link(dev); } +static int ethtool_get_rxnfc_rule_count(struct net_device *dev) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rxnfc info = { + .cmd = ETHTOOL_GRXCLSRLCNT, + }; + int err; + + err = ops->get_rxnfc(dev, &info, NULL); + if (err) + return err; + + return info.rule_cnt; +} + +int ethtool_get_max_rxnfc_channel(struct net_device *dev, u64 *max) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rxnfc *info; + int err, i, rule_cnt; + u64 max_ring = 0; + + if (!ops->get_rxnfc) + return -EOPNOTSUPP; + + rule_cnt = ethtool_get_rxnfc_rule_count(dev); + if (rule_cnt <= 0) + return -EINVAL; + + info = kvzalloc(struct_size(info, rule_locs, rule_cnt), GFP_KERNEL); + if (!info) + return -ENOMEM; + + info->cmd = ETHTOOL_GRXCLSRLALL; + info->rule_cnt = rule_cnt; + err = ops->get_rxnfc(dev, info, info->rule_locs); + if (err) + goto err_free_info; + + for (i = 0; i < rule_cnt; i++) { + struct ethtool_rxnfc rule_info = { + .cmd = ETHTOOL_GRXCLSRULE, + .fs.location = info->rule_locs[i], + }; + + err = ops->get_rxnfc(dev, &rule_info, NULL); + if (err) + goto err_free_info; + + if (rule_info.fs.ring_cookie != RX_CLS_FLOW_DISC && + rule_info.fs.ring_cookie != RX_CLS_FLOW_WAKE && + !(rule_info.flow_type & FLOW_RSS) && + !ethtool_get_flow_spec_ring_vf(rule_info.fs.ring_cookie)) + max_ring = + max_t(u64, max_ring, rule_info.fs.ring_cookie); + } + + kvfree(info); + *max = max_ring; + return 0; + +err_free_info: + kvfree(info); + return err; +} + int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max) { u32 dev_size, current_max = 0; diff --git a/net/ethtool/common.h b/net/ethtool/common.h index c1779657e074..b1b9db810eca 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -43,6 +43,7 @@ bool convert_legacy_settings_to_link_ksettings( struct ethtool_link_ksettings *link_ksettings, const struct ethtool_cmd *legacy_settings); int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max); +int ethtool_get_max_rxnfc_channel(struct net_device *dev, u64 *max); int __ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info); extern const struct ethtool_phy_ops *ethtool_phy_ops; diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index b6835136c53f..e40f5e9e109b 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1789,7 +1789,8 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, { struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS }; u16 from_channel, to_channel; - u32 max_rx_in_use = 0; + u64 max_rxnfc_in_use; + u32 max_rxfh_in_use; unsigned int i; int ret; @@ -1820,11 +1821,15 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, return -EINVAL; /* ensure the new Rx count fits within the configured Rx flow - * indirection table settings */ - if (netif_is_rxfh_configured(dev) && - !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) && - (channels.combined_count + channels.rx_count) <= max_rx_in_use) - return -EINVAL; + * indirection table/rxnfc settings */ + if (ethtool_get_max_rxnfc_channel(dev, &max_rxnfc_in_use)) + max_rxnfc_in_use = 0; + if (!netif_is_rxfh_configured(dev) || + ethtool_get_max_rxfh_channel(dev, &max_rxfh_in_use)) + max_rxfh_in_use = 0; + if (channels.combined_count + channels.rx_count <= + max_t(u64, max_rxnfc_in_use, max_rxfh_in_use)) + return -EINVAL; /* Disabling channels, query zero-copy AF_XDP sockets */ from_channel = channels.combined_count + -- cgit v1.2.3-70-g09d2 From ca71277f36e0781db663aedeb5fc1e26e7c144c4 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 6 Nov 2022 15:34:14 -0500 Subject: net: move the ct helper function to nf_conntrack_helper for ovs and tc Move ovs_ct_helper from openvswitch to nf_conntrack_helper and rename as nf_ct_helper so that it can be used in TC act_ct in the next patch. Note that it also adds the checks for the family and proto, as in TC act_ct, the packets with correct family and proto are not guaranteed. Acked-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Signed-off-by: Paolo Abeni --- include/net/netfilter/nf_conntrack_helper.h | 3 ++ net/netfilter/nf_conntrack_helper.c | 69 +++++++++++++++++++++++++++++ net/openvswitch/conntrack.c | 61 +------------------------ 3 files changed, 73 insertions(+), 60 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index 9939c366f720..b6676249eeeb 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -115,6 +115,9 @@ struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp); int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, gfp_t flags); +int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, u16 proto); + void nf_ct_helper_destroy(struct nf_conn *ct); static inline struct nf_conn_help *nfct_help(const struct nf_conn *ct) diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index ff737a76052e..88039eedadea 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -26,7 +26,9 @@ #include #include #include +#include #include +#include static DEFINE_MUTEX(nf_ct_helper_mutex); struct hlist_head *nf_ct_helper_hash __read_mostly; @@ -240,6 +242,73 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, } EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper); +/* 'skb' should already be pulled to nh_ofs. */ +int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, u16 proto) +{ + const struct nf_conntrack_helper *helper; + const struct nf_conn_help *help; + unsigned int protoff; + int err; + + if (ctinfo == IP_CT_RELATED_REPLY) + return NF_ACCEPT; + + help = nfct_help(ct); + if (!help) + return NF_ACCEPT; + + helper = rcu_dereference(help->helper); + if (!helper) + return NF_ACCEPT; + + if (helper->tuple.src.l3num != NFPROTO_UNSPEC && + helper->tuple.src.l3num != proto) + return NF_ACCEPT; + + switch (proto) { + case NFPROTO_IPV4: + protoff = ip_hdrlen(skb); + proto = ip_hdr(skb)->protocol; + break; + case NFPROTO_IPV6: { + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + __be16 frag_off; + int ofs; + + ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (ofs < 0 || (frag_off & htons(~0x7)) != 0) { + pr_debug("proto header not found\n"); + return NF_ACCEPT; + } + protoff = ofs; + proto = nexthdr; + break; + } + default: + WARN_ONCE(1, "helper invoked on non-IP family!"); + return NF_DROP; + } + + if (helper->tuple.dst.protonum != proto) + return NF_ACCEPT; + + err = helper->help(skb, protoff, ct, ctinfo); + if (err != NF_ACCEPT) + return err; + + /* Adjust seqs after helper. This is needed due to some helpers (e.g., + * FTP with NAT) adusting the TCP payload size when mangling IP + * addresses and/or port numbers in the text-based control connection. + */ + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && + !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) + return NF_DROP; + return NF_ACCEPT; +} +EXPORT_SYMBOL_GPL(nf_ct_helper); + /* appropriate ct lock protecting must be taken by caller */ static int unhelp(struct nf_conn *ct, void *me) { diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index c7b10234cf7c..18f54fa38e8f 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -434,65 +434,6 @@ static int ovs_ct_set_labels(struct nf_conn *ct, struct sw_flow_key *key, return 0; } -/* 'skb' should already be pulled to nh_ofs. */ -static int ovs_ct_helper(struct sk_buff *skb, u16 proto) -{ - const struct nf_conntrack_helper *helper; - const struct nf_conn_help *help; - enum ip_conntrack_info ctinfo; - unsigned int protoff; - struct nf_conn *ct; - int err; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return NF_ACCEPT; - - help = nfct_help(ct); - if (!help) - return NF_ACCEPT; - - helper = rcu_dereference(help->helper); - if (!helper) - return NF_ACCEPT; - - switch (proto) { - case NFPROTO_IPV4: - protoff = ip_hdrlen(skb); - break; - case NFPROTO_IPV6: { - u8 nexthdr = ipv6_hdr(skb)->nexthdr; - __be16 frag_off; - int ofs; - - ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, - &frag_off); - if (ofs < 0 || (frag_off & htons(~0x7)) != 0) { - pr_debug("proto header not found\n"); - return NF_ACCEPT; - } - protoff = ofs; - break; - } - default: - WARN_ONCE(1, "helper invoked on non-IP family!"); - return NF_DROP; - } - - err = helper->help(skb, protoff, ct, ctinfo); - if (err != NF_ACCEPT) - return err; - - /* Adjust seqs after helper. This is needed due to some helpers (e.g., - * FTP with NAT) adusting the TCP payload size when mangling IP - * addresses and/or port numbers in the text-based control connection. - */ - if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && - !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) - return NF_DROP; - return NF_ACCEPT; -} - /* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero * value if 'skb' is freed. */ @@ -1038,7 +979,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, */ if ((nf_ct_is_confirmed(ct) ? !cached || add_helper : info->commit) && - ovs_ct_helper(skb, info->family) != NF_ACCEPT) { + nf_ct_helper(skb, ct, ctinfo, info->family) != NF_ACCEPT) { return -EINVAL; } -- cgit v1.2.3-70-g09d2 From f96cba2eb923c025014fe74a50e104b7c5234feb Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 6 Nov 2022 15:34:15 -0500 Subject: net: move add ct helper function to nf_conntrack_helper for ovs and tc Move ovs_ct_add_helper from openvswitch to nf_conntrack_helper and rename as nf_ct_add_helper, so that it can be used in TC act_ct in the next patch. Acked-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Signed-off-by: Paolo Abeni --- include/net/netfilter/nf_conntrack_helper.h | 2 ++ net/netfilter/nf_conntrack_helper.c | 31 ++++++++++++++++++++ net/openvswitch/conntrack.c | 44 ++++------------------------- 3 files changed, 38 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index b6676249eeeb..f30b1694b690 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -117,6 +117,8 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, u16 proto); +int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family, + u8 proto, bool nat, struct nf_conntrack_helper **hp); void nf_ct_helper_destroy(struct nf_conn *ct); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 88039eedadea..48ea6d0264b5 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -309,6 +309,37 @@ int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct, } EXPORT_SYMBOL_GPL(nf_ct_helper); +int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family, + u8 proto, bool nat, struct nf_conntrack_helper **hp) +{ + struct nf_conntrack_helper *helper; + struct nf_conn_help *help; + int ret = 0; + + helper = nf_conntrack_helper_try_module_get(name, family, proto); + if (!helper) + return -EINVAL; + + help = nf_ct_helper_ext_add(ct, GFP_KERNEL); + if (!help) { + nf_conntrack_helper_put(helper); + return -ENOMEM; + } +#if IS_ENABLED(CONFIG_NF_NAT) + if (nat) { + ret = nf_nat_helper_try_module_get(name, family, proto); + if (ret) { + nf_conntrack_helper_put(helper); + return ret; + } + } +#endif + rcu_assign_pointer(help->helper, helper); + *hp = helper; + return ret; +} +EXPORT_SYMBOL_GPL(nf_ct_add_helper); + /* appropriate ct lock protecting must be taken by caller */ static int unhelp(struct nf_conn *ct, void *me) { diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 18f54fa38e8f..4348321856af 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1291,43 +1291,6 @@ int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key) return 0; } -static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, - const struct sw_flow_key *key, bool log) -{ - struct nf_conntrack_helper *helper; - struct nf_conn_help *help; - int ret = 0; - - helper = nf_conntrack_helper_try_module_get(name, info->family, - key->ip.proto); - if (!helper) { - OVS_NLERR(log, "Unknown helper \"%s\"", name); - return -EINVAL; - } - - help = nf_ct_helper_ext_add(info->ct, GFP_KERNEL); - if (!help) { - nf_conntrack_helper_put(helper); - return -ENOMEM; - } - -#if IS_ENABLED(CONFIG_NF_NAT) - if (info->nat) { - ret = nf_nat_helper_try_module_get(name, info->family, - key->ip.proto); - if (ret) { - nf_conntrack_helper_put(helper); - OVS_NLERR(log, "Failed to load \"%s\" NAT helper, error: %d", - name, ret); - return ret; - } - } -#endif - rcu_assign_pointer(help->helper, helper); - info->helper = helper; - return ret; -} - #if IS_ENABLED(CONFIG_NF_NAT) static int parse_nat(const struct nlattr *attr, struct ovs_conntrack_info *info, bool log) @@ -1661,9 +1624,12 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, } if (helper) { - err = ovs_ct_add_helper(&ct_info, helper, key, log); - if (err) + err = nf_ct_add_helper(ct_info.ct, helper, ct_info.family, + key->ip.proto, ct_info.nat, &ct_info.helper); + if (err) { + OVS_NLERR(log, "Failed to add %s helper %d", helper, err); goto err_free_ct; + } } err = ovs_nla_add_action(sfa, OVS_ACTION_ATTR_CT, &ct_info, -- cgit v1.2.3-70-g09d2 From 1913894100ca53205f2d56091cb34b8eba1de217 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 6 Nov 2022 15:34:16 -0500 Subject: net: sched: call tcf_ct_params_free to free params in tcf_ct_init This patch is to make the err path simple by calling tcf_ct_params_free(), so that it won't cause problems when more members are added into param and need freeing on the err path. Acked-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Signed-off-by: Paolo Abeni --- net/sched/act_ct.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index b38d91d6b249..193a460a9d7f 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -345,11 +345,9 @@ static void tcf_ct_flow_table_cleanup_work(struct work_struct *work) module_put(THIS_MODULE); } -static void tcf_ct_flow_table_put(struct tcf_ct_params *params) +static void tcf_ct_flow_table_put(struct tcf_ct_flow_table *ct_ft) { - struct tcf_ct_flow_table *ct_ft = params->ct_ft; - - if (refcount_dec_and_test(¶ms->ct_ft->ref)) { + if (refcount_dec_and_test(&ct_ft->ref)) { rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params); INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work); queue_rcu_work(act_ct_wq, &ct_ft->rwork); @@ -832,18 +830,23 @@ out_free: return err; } -static void tcf_ct_params_free(struct rcu_head *head) +static void tcf_ct_params_free(struct tcf_ct_params *params) { - struct tcf_ct_params *params = container_of(head, - struct tcf_ct_params, rcu); - - tcf_ct_flow_table_put(params); - + if (params->ct_ft) + tcf_ct_flow_table_put(params->ct_ft); if (params->tmpl) nf_ct_put(params->tmpl); kfree(params); } +static void tcf_ct_params_free_rcu(struct rcu_head *head) +{ + struct tcf_ct_params *params; + + params = container_of(head, struct tcf_ct_params, rcu); + tcf_ct_params_free(params); +} + #if IS_ENABLED(CONFIG_NF_NAT) /* Modelled after nf_nat_ipv[46]_fn(). * range is only used for new, uninitialized NAT state. @@ -1390,7 +1393,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, err = tcf_ct_flow_table_get(net, params); if (err) - goto cleanup_params; + goto cleanup; spin_lock_bh(&c->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); @@ -1401,17 +1404,15 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, if (goto_ch) tcf_chain_put_by_act(goto_ch); if (params) - call_rcu(¶ms->rcu, tcf_ct_params_free); + call_rcu(¶ms->rcu, tcf_ct_params_free_rcu); return res; -cleanup_params: - if (params->tmpl) - nf_ct_put(params->tmpl); cleanup: if (goto_ch) tcf_chain_put_by_act(goto_ch); - kfree(params); + if (params) + tcf_ct_params_free(params); tcf_idr_release(*a, bind); return err; } @@ -1423,7 +1424,7 @@ static void tcf_ct_cleanup(struct tc_action *a) params = rcu_dereference_protected(c->params, 1); if (params) - call_rcu(¶ms->rcu, tcf_ct_params_free); + call_rcu(¶ms->rcu, tcf_ct_params_free_rcu); } static int tcf_ct_dump_key_val(struct sk_buff *skb, -- cgit v1.2.3-70-g09d2 From a21b06e7319129994f339ed47f512bbe57b77f5b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 6 Nov 2022 15:34:17 -0500 Subject: net: sched: add helper support in act_ct This patch is to add helper support in act_ct for OVS actions=ct(alg=xxx) offloading, which is corresponding to Commit cae3a2627520 ("openvswitch: Allow attaching helpers to ct action") in OVS kernel part. The difference is when adding TC actions family and proto cannot be got from the filter/match, other than helper name in tb[TCA_CT_HELPER_NAME], we also need to send the family in tb[TCA_CT_HELPER_FAMILY] and the proto in tb[TCA_CT_HELPER_PROTO] to kernel. Acked-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Signed-off-by: Paolo Abeni --- include/net/tc_act/tc_ct.h | 1 + include/uapi/linux/tc_act/tc_ct.h | 3 ++ net/sched/act_ct.c | 89 +++++++++++++++++++++++++++++++++++---- 3 files changed, 85 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h index 8250d6f0a462..b24ea2d9400b 100644 --- a/include/net/tc_act/tc_ct.h +++ b/include/net/tc_act/tc_ct.h @@ -10,6 +10,7 @@ #include struct tcf_ct_params { + struct nf_conntrack_helper *helper; struct nf_conn *tmpl; u16 zone; diff --git a/include/uapi/linux/tc_act/tc_ct.h b/include/uapi/linux/tc_act/tc_ct.h index 5fb1d7ac1027..6c5200f0ed38 100644 --- a/include/uapi/linux/tc_act/tc_ct.h +++ b/include/uapi/linux/tc_act/tc_ct.h @@ -22,6 +22,9 @@ enum { TCA_CT_NAT_PORT_MIN, /* be16 */ TCA_CT_NAT_PORT_MAX, /* be16 */ TCA_CT_PAD, + TCA_CT_HELPER_NAME, /* string */ + TCA_CT_HELPER_FAMILY, /* u8 */ + TCA_CT_HELPER_PROTO, /* u8 */ __TCA_CT_MAX }; diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 193a460a9d7f..da0b7f665277 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -33,6 +33,7 @@ #include #include #include +#include #include static struct workqueue_struct *act_ct_wq; @@ -655,7 +656,7 @@ struct tc_ct_action_net { /* Determine whether skb->_nfct is equal to the result of conntrack lookup. */ static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb, - u16 zone_id, bool force) + struct tcf_ct_params *p) { enum ip_conntrack_info ctinfo; struct nf_conn *ct; @@ -665,11 +666,19 @@ static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb, return false; if (!net_eq(net, read_pnet(&ct->ct_net))) goto drop_ct; - if (nf_ct_zone(ct)->id != zone_id) + if (nf_ct_zone(ct)->id != p->zone) goto drop_ct; + if (p->helper) { + struct nf_conn_help *help; + + help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER); + if (help && rcu_access_pointer(help->helper) != p->helper) + goto drop_ct; + } /* Force conntrack entry direction. */ - if (force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) { + if ((p->ct_action & TCA_CT_ACT_FORCE) && + CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) { if (nf_ct_is_confirmed(ct)) nf_ct_kill(ct); @@ -832,6 +841,13 @@ out_free: static void tcf_ct_params_free(struct tcf_ct_params *params) { + if (params->helper) { +#if IS_ENABLED(CONFIG_NF_NAT) + if (params->ct_action & TCA_CT_ACT_NAT) + nf_nat_helper_put(params->helper); +#endif + nf_conntrack_helper_put(params->helper); + } if (params->ct_ft) tcf_ct_flow_table_put(params->ct_ft); if (params->tmpl) @@ -1026,13 +1042,14 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct net *net = dev_net(skb->dev); - bool cached, commit, clear, force; enum ip_conntrack_info ctinfo; struct tcf_ct *c = to_ct(a); struct nf_conn *tmpl = NULL; struct nf_hook_state state; + bool cached, commit, clear; int nh_ofs, err, retval; struct tcf_ct_params *p; + bool add_helper = false; bool skip_add = false; bool defrag = false; struct nf_conn *ct; @@ -1043,7 +1060,6 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, retval = READ_ONCE(c->tcf_action); commit = p->ct_action & TCA_CT_ACT_COMMIT; clear = p->ct_action & TCA_CT_ACT_CLEAR; - force = p->ct_action & TCA_CT_ACT_FORCE; tmpl = p->tmpl; tcf_lastuse_update(&c->tcf_tm); @@ -1086,7 +1102,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, * actually run the packet through conntrack twice unless it's for a * different zone. */ - cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force); + cached = tcf_ct_skb_nfct_cached(net, skb, p); if (!cached) { if (tcf_ct_flow_table_lookup(p, skb, family)) { skip_add = true; @@ -1119,6 +1135,22 @@ do_nat: if (err != NF_ACCEPT) goto drop; + if (!nf_ct_is_confirmed(ct) && commit && p->helper && !nfct_help(ct)) { + err = __nf_ct_try_assign_helper(ct, p->tmpl, GFP_ATOMIC); + if (err) + goto drop; + add_helper = true; + if (p->ct_action & TCA_CT_ACT_NAT && !nfct_seqadj(ct)) { + if (!nfct_seqadj_ext_add(ct)) + goto drop; + } + } + + if (nf_ct_is_confirmed(ct) ? ((!cached && !skip_add) || add_helper) : commit) { + if (nf_ct_helper(skb, ct, ctinfo, family) != NF_ACCEPT) + goto drop; + } + if (commit) { tcf_ct_act_set_mark(ct, p->mark, p->mark_mask); tcf_ct_act_set_labels(ct, p->labels, p->labels_mask); @@ -1167,6 +1199,9 @@ static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = { [TCA_CT_NAT_IPV6_MAX] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [TCA_CT_NAT_PORT_MIN] = { .type = NLA_U16 }, [TCA_CT_NAT_PORT_MAX] = { .type = NLA_U16 }, + [TCA_CT_HELPER_NAME] = { .type = NLA_STRING, .len = NF_CT_HELPER_NAME_LEN }, + [TCA_CT_HELPER_FAMILY] = { .type = NLA_U8 }, + [TCA_CT_HELPER_PROTO] = { .type = NLA_U8 }, }; static int tcf_ct_fill_params_nat(struct tcf_ct_params *p, @@ -1256,8 +1291,9 @@ static int tcf_ct_fill_params(struct net *net, { struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id); struct nf_conntrack_zone zone; + int err, family, proto, len; struct nf_conn *tmpl; - int err; + char *name; p->zone = NF_CT_DEFAULT_ZONE_ID; @@ -1318,10 +1354,31 @@ static int tcf_ct_fill_params(struct net *net, NL_SET_ERR_MSG_MOD(extack, "Failed to allocate conntrack template"); return -ENOMEM; } - __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); p->tmpl = tmpl; + if (tb[TCA_CT_HELPER_NAME]) { + name = nla_data(tb[TCA_CT_HELPER_NAME]); + len = nla_len(tb[TCA_CT_HELPER_NAME]); + if (len > 16 || name[len - 1] != '\0') { + NL_SET_ERR_MSG_MOD(extack, "Failed to parse helper name."); + err = -EINVAL; + goto err; + } + family = tb[TCA_CT_HELPER_FAMILY] ? nla_get_u8(tb[TCA_CT_HELPER_FAMILY]) : AF_INET; + proto = tb[TCA_CT_HELPER_PROTO] ? nla_get_u8(tb[TCA_CT_HELPER_PROTO]) : IPPROTO_TCP; + err = nf_ct_add_helper(tmpl, name, family, proto, + p->ct_action & TCA_CT_ACT_NAT, &p->helper); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Failed to add helper"); + goto err; + } + } + __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); return 0; +err: + nf_ct_put(p->tmpl); + p->tmpl = NULL; + return err; } static int tcf_ct_init(struct net *net, struct nlattr *nla, @@ -1490,6 +1547,19 @@ static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p) return 0; } +static int tcf_ct_dump_helper(struct sk_buff *skb, struct nf_conntrack_helper *helper) +{ + if (!helper) + return 0; + + if (nla_put_string(skb, TCA_CT_HELPER_NAME, helper->name) || + nla_put_u8(skb, TCA_CT_HELPER_FAMILY, helper->tuple.src.l3num) || + nla_put_u8(skb, TCA_CT_HELPER_PROTO, helper->tuple.dst.protonum)) + return -1; + + return 0; +} + static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { @@ -1542,6 +1612,9 @@ static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a, if (tcf_ct_dump_nat(skb, p)) goto nla_put_failure; + if (tcf_ct_dump_helper(skb, p->helper)) + goto nla_put_failure; + skip_dump: if (nla_put(skb, TCA_CT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; -- cgit v1.2.3-70-g09d2 From 4d843be56ba6a8c0e566afd58775742d9e721505 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 5 Apr 2022 21:48:48 +0100 Subject: rxrpc: Trace setting of the request-ack flag Add a tracepoint to log why the request-ack flag is set on an outgoing DATA packet, allowing debugging as to why. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 36 ++++++++++++++++++++++++++++++++++++ net/rxrpc/output.c | 32 +++++++++++++++++++++++--------- 2 files changed, 59 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index d20bf4aa0204..4c501c660123 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -242,6 +242,16 @@ EM(rxrpc_tx_point_version_keepalive, "VerKeepalive") \ E_(rxrpc_tx_point_version_reply, "VerReply") +#define rxrpc_req_ack_traces \ + EM(rxrpc_reqack_ack_lost, "ACK-LOST ") \ + EM(rxrpc_reqack_already_on, "ALREADY-ON") \ + EM(rxrpc_reqack_more_rtt, "MORE-RTT ") \ + EM(rxrpc_reqack_no_srv_last, "NO-SRVLAST") \ + EM(rxrpc_reqack_old_rtt, "OLD-RTT ") \ + EM(rxrpc_reqack_retrans, "RETRANS ") \ + EM(rxrpc_reqack_slow_start, "SLOW-START") \ + E_(rxrpc_reqack_small_txwin, "SMALL-TXWN") + /* * Generate enums for tracing information. */ @@ -263,6 +273,7 @@ enum rxrpc_propose_ack_outcome { rxrpc_propose_ack_outcomes } __mode(byte); enum rxrpc_propose_ack_trace { rxrpc_propose_ack_traces } __mode(byte); enum rxrpc_receive_trace { rxrpc_receive_traces } __mode(byte); enum rxrpc_recvmsg_trace { rxrpc_recvmsg_traces } __mode(byte); +enum rxrpc_req_ack_trace { rxrpc_req_ack_traces } __mode(byte); enum rxrpc_rtt_rx_trace { rxrpc_rtt_rx_traces } __mode(byte); enum rxrpc_rtt_tx_trace { rxrpc_rtt_tx_traces } __mode(byte); enum rxrpc_skb_trace { rxrpc_skb_traces } __mode(byte); @@ -290,6 +301,7 @@ rxrpc_propose_ack_outcomes; rxrpc_propose_ack_traces; rxrpc_receive_traces; rxrpc_recvmsg_traces; +rxrpc_req_ack_traces; rxrpc_rtt_rx_traces; rxrpc_rtt_tx_traces; rxrpc_skb_traces; @@ -1395,6 +1407,30 @@ TRACE_EVENT(rxrpc_rx_discard_ack, __entry->call_ackr_prev) ); +TRACE_EVENT(rxrpc_req_ack, + TP_PROTO(unsigned int call_debug_id, rxrpc_seq_t seq, + enum rxrpc_req_ack_trace why), + + TP_ARGS(call_debug_id, seq, why), + + TP_STRUCT__entry( + __field(unsigned int, call_debug_id ) + __field(rxrpc_seq_t, seq ) + __field(enum rxrpc_req_ack_trace, why ) + ), + + TP_fast_assign( + __entry->call_debug_id = call_debug_id; + __entry->seq = seq; + __entry->why = why; + ), + + TP_printk("c=%08x q=%08x REQ-%s", + __entry->call_debug_id, + __entry->seq, + __print_symbolic(__entry->why, rxrpc_req_ack_traces)) + ); + #undef EM #undef E_ #endif /* _TRACE_RXRPC_H */ diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 9683617db704..2922c10bd500 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -350,6 +350,7 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, bool retrans) { + enum rxrpc_req_ack_trace why; struct rxrpc_connection *conn = call->conn; struct rxrpc_wire_header whdr; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); @@ -405,16 +406,29 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, * service call, lest OpenAFS incorrectly send us an ACK with some * soft-ACKs in it and then never follow up with a proper hard ACK. */ - if ((!(sp->hdr.flags & RXRPC_LAST_PACKET) || - rxrpc_to_server(sp) - ) && - (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events) || - retrans || - call->cong_mode == RXRPC_CALL_SLOW_START || - (call->peer->rtt_count < 3 && sp->hdr.seq & 1) || - ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), - ktime_get_real()))) + if (whdr.flags & RXRPC_REQUEST_ACK) + why = rxrpc_reqack_already_on; + else if ((whdr.flags & RXRPC_LAST_PACKET) && rxrpc_to_client(sp)) + why = rxrpc_reqack_no_srv_last; + else if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) + why = rxrpc_reqack_ack_lost; + else if (retrans) + why = rxrpc_reqack_retrans; + else if (call->cong_mode == RXRPC_CALL_SLOW_START && call->cong_cwnd <= 2) + why = rxrpc_reqack_slow_start; + else if (call->tx_winsize <= 2) + why = rxrpc_reqack_small_txwin; + else if (call->peer->rtt_count < 3 && sp->hdr.seq & 1) + why = rxrpc_reqack_more_rtt; + else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real())) + why = rxrpc_reqack_old_rtt; + else + goto dont_set_request_ack; + + trace_rxrpc_req_ack(call->debug_id, sp->hdr.seq, why); + if (why != rxrpc_reqack_no_srv_last) whdr.flags |= RXRPC_REQUEST_ACK; +dont_set_request_ack: if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; -- cgit v1.2.3-70-g09d2 From 334dfbfc5a7187c99761df2392dd4cc49c453bea Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 22 Apr 2022 00:20:49 +0100 Subject: rxrpc: Split call timer-expiration from call timer-set tracepoint Split the tracepoint for call timer-set to separate out the call timer-expiration event Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 42 +++++++++++++++++++++++++++++++++++++++++- net/rxrpc/call_object.c | 2 +- 2 files changed, 42 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 4c501c660123..a72f04e3d264 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -133,7 +133,6 @@ #define rxrpc_timer_traces \ EM(rxrpc_timer_begin, "Begin ") \ - EM(rxrpc_timer_expired, "*EXPR*") \ EM(rxrpc_timer_exp_ack, "ExpAck") \ EM(rxrpc_timer_exp_hard, "ExpHrd") \ EM(rxrpc_timer_exp_idle, "ExpIdl") \ @@ -1019,6 +1018,47 @@ TRACE_EVENT(rxrpc_timer, __entry->timer - __entry->now) ); +TRACE_EVENT(rxrpc_timer_expired, + TP_PROTO(struct rxrpc_call *call, unsigned long now), + + TP_ARGS(call, now), + + TP_STRUCT__entry( + __field(unsigned int, call ) + __field(long, now ) + __field(long, ack_at ) + __field(long, ack_lost_at ) + __field(long, resend_at ) + __field(long, ping_at ) + __field(long, expect_rx_by ) + __field(long, expect_req_by ) + __field(long, expect_term_by ) + __field(long, timer ) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->now = now; + __entry->ack_at = call->ack_at; + __entry->ack_lost_at = call->ack_lost_at; + __entry->resend_at = call->resend_at; + __entry->expect_rx_by = call->expect_rx_by; + __entry->expect_req_by = call->expect_req_by; + __entry->expect_term_by = call->expect_term_by; + __entry->timer = call->timer.expires; + ), + + TP_printk("c=%08x EXPIRED a=%ld la=%ld r=%ld xr=%ld xq=%ld xt=%ld t=%ld", + __entry->call, + __entry->ack_at - __entry->now, + __entry->ack_lost_at - __entry->now, + __entry->resend_at - __entry->now, + __entry->expect_rx_by - __entry->now, + __entry->expect_req_by - __entry->now, + __entry->expect_term_by - __entry->now, + __entry->timer - __entry->now) + ); + TRACE_EVENT(rxrpc_rx_lose, TP_PROTO(struct rxrpc_skb_priv *sp), diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 6401cdf7a624..a75861a0c477 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -52,7 +52,7 @@ static void rxrpc_call_timer_expired(struct timer_list *t) _enter("%d", call->debug_id); if (call->state < RXRPC_CALL_COMPLETE) { - trace_rxrpc_timer(call, rxrpc_timer_expired, jiffies); + trace_rxrpc_timer_expired(call, jiffies); __rxrpc_queue_call(call); } else { rxrpc_put_call(call, rxrpc_call_put); -- cgit v1.2.3-70-g09d2 From 589a0c1e0ac31ccba49b214762e444dc015ee1e2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 28 Apr 2022 08:30:47 +0100 Subject: rxrpc: Track highest acked serial Keep track of the highest DATA serial number that has been acked by the peer for future purposes. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 1 + net/rxrpc/input.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 1ad0ec5afb50..6c93a2fa9628 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -690,6 +690,7 @@ struct rxrpc_call { rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ rxrpc_seq_t acks_lost_top; /* tx_top at the time lost-ack ping sent */ rxrpc_serial_t acks_lost_ping; /* Serial number of probe ACK */ + rxrpc_serial_t acks_highest_serial; /* Highest serial number ACK'd */ }; /* diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 721d847ba92b..4ba678f0c384 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -967,6 +967,10 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) call->acks_first_seq = first_soft_ack; call->acks_prev_seq = prev_pkt; + if (buf.ack.reason != RXRPC_ACK_PING && + after(acked_serial, call->acks_highest_serial)) + call->acks_highest_serial = acked_serial; + /* Parse rwind and mtu sizes if provided. */ if (buf.info.rxMTU) rxrpc_input_ackinfo(call, skb, &buf.info); -- cgit v1.2.3-70-g09d2 From b015424695f03a9fa5862d09c267ed458e256300 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 11 May 2022 14:01:25 +0100 Subject: rxrpc: Add stats procfile and DATA packet stats Add a procfile, /proc/net/rxrpc/stats, to display some statistics about what rxrpc has been doing. Writing a blank line to the stats file will clear the increment-only counters. Allocated resource counters don't get cleared. Add some counters to count various things about DATA packets, including the number created, transmitted and retransmitted and the number received, the number of ACK-requests markings and the number of jumbo packets received. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 17 +++++++++++++++++ net/rxrpc/call_event.c | 1 + net/rxrpc/input.c | 6 ++++++ net/rxrpc/net_ns.c | 2 ++ net/rxrpc/output.c | 2 ++ net/rxrpc/proc.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ net/rxrpc/sendmsg.c | 2 ++ 7 files changed, 78 insertions(+) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 6c93a2fa9628..ed406a5f939b 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -93,6 +93,14 @@ struct rxrpc_net { struct list_head peer_keepalive_new; struct timer_list peer_keepalive_timer; struct work_struct peer_keepalive_work; + + atomic_t stat_tx_data; + atomic_t stat_tx_data_retrans; + atomic_t stat_tx_data_send; + atomic_t stat_tx_data_send_frag; + atomic_t stat_rx_data; + atomic_t stat_rx_data_reqack; + atomic_t stat_rx_data_jumbo; }; /* @@ -1092,6 +1100,15 @@ void rxrpc_get_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_free_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_purge_queue(struct sk_buff_head *); +/* + * stats.c + */ +int rxrpc_stats_show(struct seq_file *seq, void *v); +int rxrpc_stats_clear(struct file *file, char *buf, size_t size); + +#define rxrpc_inc_stat(rxnet, s) atomic_inc(&(rxnet)->s) +#define rxrpc_dec_stat(rxnet, s) atomic_dec(&(rxnet)->s) + /* * sysctl.c */ diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 2a93e7b5fbd0..c5b3ae1fe80b 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -261,6 +261,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) rxrpc_get_skb(skb, rxrpc_skb_got); spin_unlock_bh(&call->lock); + rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans); if (rxrpc_send_data_packet(call, skb, true) < 0) { rxrpc_free_skb(skb, rxrpc_skb_freed); return; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 4ba678f0c384..e7586d5ea2c3 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -443,6 +443,12 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) } } + rxrpc_inc_stat(call->rxnet, stat_rx_data); + if (sp->hdr.flags & RXRPC_REQUEST_ACK) + rxrpc_inc_stat(call->rxnet, stat_rx_data_reqack); + if (sp->hdr.flags & RXRPC_JUMBO_PACKET) + rxrpc_inc_stat(call->rxnet, stat_rx_data_jumbo); + spin_lock(&call->input_lock); /* Received data implicitly ACKs all of the request packets we sent diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index bb4c25d6df64..84242c0e467c 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -101,6 +101,8 @@ static __net_init int rxrpc_init_net(struct net *net) proc_create_net("locals", 0444, rxnet->proc_net, &rxrpc_local_seq_ops, sizeof(struct seq_net_private)); + proc_create_net_single_write("stats", S_IFREG | 0644, rxnet->proc_net, + rxrpc_stats_show, rxrpc_stats_clear, NULL); return 0; err_proc: diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 2922c10bd500..8fddad2f63fc 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -462,6 +462,7 @@ dont_set_request_ack: * - in which case, we'll have processed the ICMP error * message and update the peer record */ + rxrpc_inc_stat(call->rxnet, stat_tx_data_send); ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); conn->params.peer->last_tx_at = ktime_get_seconds(); @@ -537,6 +538,7 @@ send_fragmentable: case AF_INET: ip_sock_set_mtu_discover(conn->params.local->socket->sk, IP_PMTUDISC_DONT); + rxrpc_inc_stat(call->rxnet, stat_tx_data_send_frag); ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); conn->params.peer->last_tx_at = ktime_get_seconds(); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 245418943e01..102744411932 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -397,3 +397,51 @@ const struct seq_operations rxrpc_local_seq_ops = { .stop = rxrpc_local_seq_stop, .show = rxrpc_local_seq_show, }; + +/* + * Display stats in /proc/net/rxrpc/stats + */ +int rxrpc_stats_show(struct seq_file *seq, void *v) +{ + struct rxrpc_net *rxnet = rxrpc_net(seq_file_single_net(seq)); + + seq_printf(seq, + "Data : send=%u sendf=%u\n", + atomic_read(&rxnet->stat_tx_data_send), + atomic_read(&rxnet->stat_tx_data_send_frag)); + seq_printf(seq, + "Data-Tx : nr=%u retrans=%u\n", + atomic_read(&rxnet->stat_tx_data), + atomic_read(&rxnet->stat_tx_data_retrans)); + seq_printf(seq, + "Data-Rx : nr=%u reqack=%u jumbo=%u\n", + atomic_read(&rxnet->stat_rx_data), + atomic_read(&rxnet->stat_rx_data_reqack), + atomic_read(&rxnet->stat_rx_data_jumbo)); + seq_printf(seq, + "Buffers : txb=%u rxb=%u\n", + atomic_read(&rxrpc_n_tx_skbs), + atomic_read(&rxrpc_n_rx_skbs)); + return 0; +} + +/* + * Clear stats if /proc/net/rxrpc/stats is written to. + */ +int rxrpc_stats_clear(struct file *file, char *buf, size_t size) +{ + struct seq_file *m = file->private_data; + struct rxrpc_net *rxnet = rxrpc_net(seq_file_single_net(m)); + + if (size > 1 || (size == 1 && buf[0] != '\n')) + return -EINVAL; + + atomic_set(&rxnet->stat_tx_data, 0); + atomic_set(&rxnet->stat_tx_data_retrans, 0); + atomic_set(&rxnet->stat_tx_data_send, 0); + atomic_set(&rxnet->stat_tx_data_send_frag, 0); + atomic_set(&rxnet->stat_rx_data, 0); + atomic_set(&rxnet->stat_rx_data_reqack, 0); + atomic_set(&rxnet->stat_rx_data_jumbo, 0); + return size; +} diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 3c3a626459de..ad6f2cd08916 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -200,6 +200,8 @@ static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, _net("queue skb %p [%d]", skb, seq); + rxrpc_inc_stat(call->rxnet, stat_tx_data); + ASSERTCMP(seq, ==, call->tx_top + 1); if (last) -- cgit v1.2.3-70-g09d2 From f2a676d10038e8f3913dc576397b9c9efb190afd Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 11 May 2022 14:01:25 +0100 Subject: rxrpc: Record statistics about ACK types Record statistics about the different types of ACKs that have been transmitted and received and the number of ACKs that have been filled out and transmitted or that have been skipped. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 6 ++++++ net/rxrpc/call_event.c | 2 ++ net/rxrpc/input.c | 1 + net/rxrpc/output.c | 7 ++++++- net/rxrpc/proc.c | 33 +++++++++++++++++++++++++++++++++ 5 files changed, 48 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index ed406a5f939b..8ed707a11d43 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -101,6 +101,12 @@ struct rxrpc_net { atomic_t stat_rx_data; atomic_t stat_rx_data_reqack; atomic_t stat_rx_data_jumbo; + + atomic_t stat_tx_ack_fill; + atomic_t stat_tx_ack_send; + atomic_t stat_tx_ack_skip; + atomic_t stat_tx_acks[256]; + atomic_t stat_rx_acks[256]; }; /* diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index c5b3ae1fe80b..70f70a0393f7 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -50,6 +50,8 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, unsigned long expiry = rxrpc_soft_ack_delay; s8 prior = rxrpc_ack_priority[ack_reason]; + rxrpc_inc_stat(call->rxnet, stat_tx_acks[ack_reason]); + /* Pings are handled specially because we don't want to accidentally * lose a ping response by subsuming it into a ping. */ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index e7586d5ea2c3..7810b7d4f871 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -888,6 +888,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) trace_rxrpc_rx_ack(call, ack_serial, acked_serial, first_soft_ack, prev_pkt, summary.ack_reason, nr_acks); + rxrpc_inc_stat(call->rxnet, stat_rx_acks[buf.ack.reason]); switch (buf.ack.reason) { case RXRPC_ACK_PING_RESPONSE: diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 8fddad2f63fc..f350d39e3a60 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -83,8 +83,12 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn, tmp = atomic_xchg(&call->ackr_nr_unacked, 0); tmp |= atomic_xchg(&call->ackr_nr_consumed, 0); if (!tmp && (reason == RXRPC_ACK_DELAY || - reason == RXRPC_ACK_IDLE)) + reason == RXRPC_ACK_IDLE)) { + rxrpc_inc_stat(call->rxnet, stat_tx_ack_skip); return 0; + } + + rxrpc_inc_stat(call->rxnet, stat_tx_ack_fill); /* Barrier against rxrpc_input_data(). */ serial = call->ackr_serial; @@ -253,6 +257,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, if (ping) rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_ping); + rxrpc_inc_stat(call->rxnet, stat_tx_ack_send); ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); conn->params.peer->last_tx_at = ktime_get_seconds(); if (ret < 0) diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 102744411932..488c403f1d33 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -418,6 +418,33 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_rx_data), atomic_read(&rxnet->stat_rx_data_reqack), atomic_read(&rxnet->stat_rx_data_jumbo)); + seq_printf(seq, + "Ack : fill=%u send=%u skip=%u\n", + atomic_read(&rxnet->stat_tx_ack_fill), + atomic_read(&rxnet->stat_tx_ack_send), + atomic_read(&rxnet->stat_tx_ack_skip)); + seq_printf(seq, + "Ack-Tx : req=%u dup=%u oos=%u exw=%u nos=%u png=%u prs=%u dly=%u idl=%u\n", + atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_REQUESTED]), + atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_DUPLICATE]), + atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_OUT_OF_SEQUENCE]), + atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_EXCEEDS_WINDOW]), + atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_NOSPACE]), + atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_PING]), + atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_PING_RESPONSE]), + atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_DELAY]), + atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_IDLE])); + seq_printf(seq, + "Ack-Rx : req=%u dup=%u oos=%u exw=%u nos=%u png=%u prs=%u dly=%u idl=%u\n", + atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_REQUESTED]), + atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DUPLICATE]), + atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_OUT_OF_SEQUENCE]), + atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_EXCEEDS_WINDOW]), + atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_NOSPACE]), + atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_PING]), + atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_PING_RESPONSE]), + atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DELAY]), + atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE])); seq_printf(seq, "Buffers : txb=%u rxb=%u\n", atomic_read(&rxrpc_n_tx_skbs), @@ -443,5 +470,11 @@ int rxrpc_stats_clear(struct file *file, char *buf, size_t size) atomic_set(&rxnet->stat_rx_data, 0); atomic_set(&rxnet->stat_rx_data_reqack, 0); atomic_set(&rxnet->stat_rx_data_jumbo, 0); + + atomic_set(&rxnet->stat_tx_ack_fill, 0); + atomic_set(&rxnet->stat_tx_ack_send, 0); + atomic_set(&rxnet->stat_tx_ack_skip, 0); + memset(&rxnet->stat_tx_acks, 0, sizeof(rxnet->stat_tx_acks)); + memset(&rxnet->stat_rx_acks, 0, sizeof(rxnet->stat_rx_acks)); return size; } -- cgit v1.2.3-70-g09d2 From f7fa52421f76309c574f2575701660bc3ea3a705 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 18 Aug 2022 11:52:36 +0100 Subject: rxrpc: Record stats for why the REQUEST-ACK flag is being set Record stats for why the REQUEST-ACK flag is being set. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 1 + net/rxrpc/ar-internal.h | 2 ++ net/rxrpc/output.c | 1 + net/rxrpc/proc.c | 14 ++++++++++++++ 4 files changed, 18 insertions(+) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index a72f04e3d264..794523d15321 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -250,6 +250,7 @@ EM(rxrpc_reqack_retrans, "RETRANS ") \ EM(rxrpc_reqack_slow_start, "SLOW-START") \ E_(rxrpc_reqack_small_txwin, "SMALL-TXWN") +/* ---- Must update size of stat_why_req_ack[] if more are added! */ /* * Generate enums for tracing information. diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 8ed707a11d43..436a1e8d0abd 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -107,6 +107,8 @@ struct rxrpc_net { atomic_t stat_tx_ack_skip; atomic_t stat_tx_acks[256]; atomic_t stat_rx_acks[256]; + + atomic_t stat_why_req_ack[8]; }; /* diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index f350d39e3a60..77ed46ab33c5 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -430,6 +430,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, else goto dont_set_request_ack; + rxrpc_inc_stat(call->rxnet, stat_why_req_ack[why]); trace_rxrpc_req_ack(call->debug_id, sp->hdr.seq, why); if (why != rxrpc_reqack_no_srv_last) whdr.flags |= RXRPC_REQUEST_ACK; diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 488c403f1d33..9bd357f39c39 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -445,6 +445,18 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_PING_RESPONSE]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DELAY]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE])); + seq_printf(seq, + "Why-Req-A: acklost=%u already=%u mrtt=%u ortt=%u\n", + atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_ack_lost]), + atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_already_on]), + atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_more_rtt]), + atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_old_rtt])); + seq_printf(seq, + "Why-Req-A: nolast=%u retx=%u slows=%u smtxw=%u\n", + atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_no_srv_last]), + atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_retrans]), + atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_slow_start]), + atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_small_txwin])); seq_printf(seq, "Buffers : txb=%u rxb=%u\n", atomic_read(&rxrpc_n_tx_skbs), @@ -476,5 +488,7 @@ int rxrpc_stats_clear(struct file *file, char *buf, size_t size) atomic_set(&rxnet->stat_tx_ack_skip, 0); memset(&rxnet->stat_tx_acks, 0, sizeof(rxnet->stat_tx_acks)); memset(&rxnet->stat_rx_acks, 0, sizeof(rxnet->stat_rx_acks)); + + memset(&rxnet->stat_why_req_ack, 0, sizeof(rxnet->stat_why_req_ack)); return size; } -- cgit v1.2.3-70-g09d2 From 8889a711f9b4dcf4dd1330fa493081beebd118c9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 7 Sep 2022 19:17:29 +0100 Subject: rxrpc: Fix ack.bufferSize to be 0 when generating an ack ack.bufferSize should be set to 0 when generating an ack. Fixes: 8d94aa381dab ("rxrpc: Calls shouldn't hold socket refs") Reported-by: Jeffrey Altman Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 77ed46ab33c5..b803bfe146de 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -97,7 +97,7 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn, *_hard_ack = hard_ack; *_top = top; - pkt->ack.bufferSpace = htons(8); + pkt->ack.bufferSpace = htons(0); pkt->ack.maxSkew = htons(0); pkt->ack.firstPacket = htonl(hard_ack + 1); pkt->ack.previousPacket = htonl(call->ackr_highest_seq); -- cgit v1.2.3-70-g09d2 From 42fb06b391ace2aec5cdb1ebb8ff668f0a34332f Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Oct 2022 08:49:29 +0100 Subject: net: Change the udp encap_err_rcv to allow use of {ip,ipv6}_icmp_error() Change the udp encap_err_rcv signature to match ip_icmp_error() and ipv6_icmp_error() so that those can be used from the called function and export them. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org --- include/linux/udp.h | 3 +- include/net/udp_tunnel.h | 4 +-- net/ipv4/ip_sockglue.c | 1 + net/ipv4/udp.c | 3 +- net/ipv6/datagram.c | 1 + net/ipv6/udp.c | 3 +- net/rxrpc/ar-internal.h | 2 +- net/rxrpc/peer_event.c | 71 ++++++++++++------------------------------------ 8 files changed, 28 insertions(+), 60 deletions(-) (limited to 'net') diff --git a/include/linux/udp.h b/include/linux/udp.h index 5cdba00a904a..dea57aa37df6 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -70,7 +70,8 @@ struct udp_sock { * For encapsulation sockets. */ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); - void (*encap_err_rcv)(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset); + void (*encap_err_rcv)(struct sock *sk, struct sk_buff *skb, int err, + __be16 port, u32 info, u8 *payload); int (*encap_err_lookup)(struct sock *sk, struct sk_buff *skb); void (*encap_destroy)(struct sock *sk); diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 72394f441dad..0ca9b7a11baf 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -68,8 +68,8 @@ typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb); typedef int (*udp_tunnel_encap_err_lookup_t)(struct sock *sk, struct sk_buff *skb); typedef void (*udp_tunnel_encap_err_rcv_t)(struct sock *sk, - struct sk_buff *skb, - unsigned int udp_offset); + struct sk_buff *skb, int err, + __be16 port, u32 info, u8 *payload); typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk); typedef struct sk_buff *(*udp_tunnel_gro_receive_t)(struct sock *sk, struct list_head *head, diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 5f16807d3235..9f92ae35bb01 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -433,6 +433,7 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, } kfree_skb(skb); } +EXPORT_SYMBOL_GPL(ip_icmp_error); void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 info) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 89accc3c8bb3..b859d6c8298e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -784,7 +784,8 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) if (tunnel) { /* ...not for tunnels though: we don't have a sending socket */ if (udp_sk(sk)->encap_err_rcv) - udp_sk(sk)->encap_err_rcv(sk, skb, iph->ihl << 2); + udp_sk(sk)->encap_err_rcv(sk, skb, err, uh->dest, info, + (u8 *)(uh+1)); goto out; } if (!inet->recverr) { diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index df7e032ce87d..7c7155b48f17 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -334,6 +334,7 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, if (sock_queue_err_skb(sk, skb)) kfree_skb(skb); } +EXPORT_SYMBOL_GPL(ipv6_icmp_error); void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info) { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 297f7cc06044..4bc3fc27ec78 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -631,7 +631,8 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, /* Tunnels don't have an application socket: don't pass errors back */ if (tunnel) { if (udp_sk(sk)->encap_err_rcv) - udp_sk(sk)->encap_err_rcv(sk, skb, offset); + udp_sk(sk)->encap_err_rcv(sk, skb, err, uh->dest, + ntohl(info), (u8 *)(uh+1)); goto out; } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 436a1e8d0abd..51270b2e49c3 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -998,7 +998,7 @@ void rxrpc_send_keepalive(struct rxrpc_peer *); /* * peer_event.c */ -void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset); +void rxrpc_encap_err_rcv(struct sock *, struct sk_buff *, int, __be16, u32, u8 *); void rxrpc_error_report(struct sock *); void rxrpc_peer_keepalive_worker(struct work_struct *); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 32561e9567fe..d7d6d7aff985 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -29,20 +29,16 @@ static void rxrpc_distribute_error(struct rxrpc_peer *, int, */ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, struct sk_buff *skb, - unsigned int udp_offset, - unsigned int *info, + __be16 udp_port, struct sockaddr_rxrpc *srx) { struct iphdr *ip, *ip0 = ip_hdr(skb); struct icmphdr *icmp = icmp_hdr(skb); - struct udphdr *udp = (struct udphdr *)(skb->data + udp_offset); _enter("%u,%u,%u", ip0->protocol, icmp->type, icmp->code); switch (icmp->type) { case ICMP_DEST_UNREACH: - *info = ntohs(icmp->un.frag.mtu); - fallthrough; case ICMP_TIME_EXCEEDED: case ICMP_PARAMETERPROB: ip = (struct iphdr *)((void *)icmp + 8); @@ -63,7 +59,7 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, case AF_INET: srx->transport_len = sizeof(srx->transport.sin); srx->transport.family = AF_INET; - srx->transport.sin.sin_port = udp->dest; + srx->transport.sin.sin_port = udp_port; memcpy(&srx->transport.sin.sin_addr, &ip->daddr, sizeof(struct in_addr)); break; @@ -72,7 +68,7 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, case AF_INET6: srx->transport_len = sizeof(srx->transport.sin); srx->transport.family = AF_INET; - srx->transport.sin.sin_port = udp->dest; + srx->transport.sin.sin_port = udp_port; memcpy(&srx->transport.sin.sin_addr, &ip->daddr, sizeof(struct in_addr)); break; @@ -93,20 +89,16 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, */ static struct rxrpc_peer *rxrpc_lookup_peer_icmp6_rcu(struct rxrpc_local *local, struct sk_buff *skb, - unsigned int udp_offset, - unsigned int *info, + __be16 udp_port, struct sockaddr_rxrpc *srx) { struct icmp6hdr *icmp = icmp6_hdr(skb); struct ipv6hdr *ip, *ip0 = ipv6_hdr(skb); - struct udphdr *udp = (struct udphdr *)(skb->data + udp_offset); _enter("%u,%u,%u", ip0->nexthdr, icmp->icmp6_type, icmp->icmp6_code); switch (icmp->icmp6_type) { case ICMPV6_DEST_UNREACH: - *info = ntohl(icmp->icmp6_mtu); - fallthrough; case ICMPV6_PKT_TOOBIG: case ICMPV6_TIME_EXCEED: case ICMPV6_PARAMPROB: @@ -129,13 +121,13 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp6_rcu(struct rxrpc_local *local, _net("Rx ICMP6 on v4 sock"); srx->transport_len = sizeof(srx->transport.sin); srx->transport.family = AF_INET; - srx->transport.sin.sin_port = udp->dest; + srx->transport.sin.sin_port = udp_port; memcpy(&srx->transport.sin.sin_addr, &ip->daddr.s6_addr32[3], sizeof(struct in_addr)); break; case AF_INET6: _net("Rx ICMP6"); - srx->transport.sin.sin_port = udp->dest; + srx->transport.sin.sin_port = udp_port; memcpy(&srx->transport.sin6.sin6_addr, &ip->daddr, sizeof(struct in6_addr)); break; @@ -152,15 +144,13 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp6_rcu(struct rxrpc_local *local, /* * Handle an error received on the local endpoint as a tunnel. */ -void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, - unsigned int udp_offset) +void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, int err, + __be16 port, u32 info, u8 *payload) { struct sock_extended_err ee; struct sockaddr_rxrpc srx; struct rxrpc_local *local; struct rxrpc_peer *peer; - unsigned int info = 0; - int err; u8 version = ip_hdr(skb)->version; u8 type = icmp_hdr(skb)->type; u8 code = icmp_hdr(skb)->code; @@ -176,13 +166,11 @@ void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, switch (ip_hdr(skb)->version) { case IPVERSION: - peer = rxrpc_lookup_peer_icmp_rcu(local, skb, udp_offset, - &info, &srx); + peer = rxrpc_lookup_peer_icmp_rcu(local, skb, port, &srx); break; #ifdef CONFIG_AF_RXRPC_IPV6 case 6: - peer = rxrpc_lookup_peer_icmp6_rcu(local, skb, udp_offset, - &info, &srx); + peer = rxrpc_lookup_peer_icmp6_rcu(local, skb, port, &srx); break; #endif default: @@ -201,34 +189,12 @@ void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, switch (version) { case IPVERSION: - switch (type) { - case ICMP_DEST_UNREACH: - switch (code) { - case ICMP_FRAG_NEEDED: - rxrpc_adjust_mtu(peer, info); - rcu_read_unlock(); - rxrpc_put_peer(peer); - return; - default: - break; - } - - err = EHOSTUNREACH; - if (code <= NR_ICMP_UNREACH) { - /* Might want to do something different with - * non-fatal errors - */ - //harderr = icmp_err_convert[code].fatal; - err = icmp_err_convert[code].errno; - } - break; - - case ICMP_TIME_EXCEEDED: - err = EHOSTUNREACH; - break; - default: - err = EPROTO; - break; + if (type == ICMP_DEST_UNREACH && + code == ICMP_FRAG_NEEDED) { + rxrpc_adjust_mtu(peer, info); + rcu_read_unlock(); + rxrpc_put_peer(peer); + return; } ee.ee_origin = SO_EE_ORIGIN_ICMP; @@ -239,16 +205,13 @@ void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, #ifdef CONFIG_AF_RXRPC_IPV6 case 6: - switch (type) { - case ICMPV6_PKT_TOOBIG: + if (type == ICMPV6_PKT_TOOBIG) { rxrpc_adjust_mtu(peer, info); rcu_read_unlock(); rxrpc_put_peer(peer); return; } - icmpv6_err_convert(type, code, &err); - if (err == EACCES) err = EHOSTUNREACH; -- cgit v1.2.3-70-g09d2 From b6c66c4324e7dd66a06a6a034204ae7d4e95c28c Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Oct 2022 09:51:12 +0100 Subject: rxrpc: Use the core ICMP/ICMP6 parsers Make rxrpc_encap_rcv_err() pass the ICMP/ICMP6 skbuff to ip_icmp_error() or ipv6_icmp_error() as appropriate to do the parsing rather than trying to do it in rxrpc. This pushes an error report onto the UDP socket's error queue and calls ->sk_error_report() from which point rxrpc can pick it up. It would be preferable to steal the packet directly from ip*_icmp_error() rather than letting it get queued, but this is probably good enough. Also note that __udp4_lib_err() calls sk_error_report() twice in some cases. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 1 - net/rxrpc/local_object.c | 13 +++ net/rxrpc/peer_event.c | 245 ++++++----------------------------------------- 3 files changed, 42 insertions(+), 217 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 51270b2e49c3..ba0ee5d1c723 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -998,7 +998,6 @@ void rxrpc_send_keepalive(struct rxrpc_peer *); /* * peer_event.c */ -void rxrpc_encap_err_rcv(struct sock *, struct sk_buff *, int, __be16, u32, u8 *); void rxrpc_error_report(struct sock *); void rxrpc_peer_keepalive_worker(struct work_struct *); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 38ea98ff426b..e95e75e785fb 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -23,6 +23,19 @@ static void rxrpc_local_processor(struct work_struct *); static void rxrpc_local_rcu(struct rcu_head *); +/* + * Handle an ICMP/ICMP6 error turning up at the tunnel. Push it through the + * usual mechanism so that it gets parsed and presented through the UDP + * socket's error_report(). + */ +static void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, int err, + __be16 port, u32 info, u8 *payload) +{ + if (ip_hdr(skb)->version == IPVERSION) + return ip_icmp_error(sk, skb, err, port, info, payload); + return ipv6_icmp_error(sk, skb, err, port, info, payload); +} + /* * Compare a local to an address. Return -ve, 0 or +ve to indicate less than, * same or greater than. diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index d7d6d7aff985..cda3890657a9 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -16,220 +16,12 @@ #include #include #include -#include #include "ar-internal.h" -static void rxrpc_adjust_mtu(struct rxrpc_peer *, unsigned int); static void rxrpc_store_error(struct rxrpc_peer *, struct sock_exterr_skb *); static void rxrpc_distribute_error(struct rxrpc_peer *, int, enum rxrpc_call_completion); -/* - * Find the peer associated with an ICMPv4 packet. - */ -static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, - struct sk_buff *skb, - __be16 udp_port, - struct sockaddr_rxrpc *srx) -{ - struct iphdr *ip, *ip0 = ip_hdr(skb); - struct icmphdr *icmp = icmp_hdr(skb); - - _enter("%u,%u,%u", ip0->protocol, icmp->type, icmp->code); - - switch (icmp->type) { - case ICMP_DEST_UNREACH: - case ICMP_TIME_EXCEEDED: - case ICMP_PARAMETERPROB: - ip = (struct iphdr *)((void *)icmp + 8); - break; - default: - return NULL; - } - - memset(srx, 0, sizeof(*srx)); - srx->transport_type = local->srx.transport_type; - srx->transport_len = local->srx.transport_len; - srx->transport.family = local->srx.transport.family; - - /* Can we see an ICMP4 packet on an ICMP6 listening socket? and vice - * versa? - */ - switch (srx->transport.family) { - case AF_INET: - srx->transport_len = sizeof(srx->transport.sin); - srx->transport.family = AF_INET; - srx->transport.sin.sin_port = udp_port; - memcpy(&srx->transport.sin.sin_addr, &ip->daddr, - sizeof(struct in_addr)); - break; - -#ifdef CONFIG_AF_RXRPC_IPV6 - case AF_INET6: - srx->transport_len = sizeof(srx->transport.sin); - srx->transport.family = AF_INET; - srx->transport.sin.sin_port = udp_port; - memcpy(&srx->transport.sin.sin_addr, &ip->daddr, - sizeof(struct in_addr)); - break; -#endif - - default: - WARN_ON_ONCE(1); - return NULL; - } - - _net("ICMP {%pISp}", &srx->transport); - return rxrpc_lookup_peer_rcu(local, srx); -} - -#ifdef CONFIG_AF_RXRPC_IPV6 -/* - * Find the peer associated with an ICMPv6 packet. - */ -static struct rxrpc_peer *rxrpc_lookup_peer_icmp6_rcu(struct rxrpc_local *local, - struct sk_buff *skb, - __be16 udp_port, - struct sockaddr_rxrpc *srx) -{ - struct icmp6hdr *icmp = icmp6_hdr(skb); - struct ipv6hdr *ip, *ip0 = ipv6_hdr(skb); - - _enter("%u,%u,%u", ip0->nexthdr, icmp->icmp6_type, icmp->icmp6_code); - - switch (icmp->icmp6_type) { - case ICMPV6_DEST_UNREACH: - case ICMPV6_PKT_TOOBIG: - case ICMPV6_TIME_EXCEED: - case ICMPV6_PARAMPROB: - ip = (struct ipv6hdr *)((void *)icmp + 8); - break; - default: - return NULL; - } - - memset(srx, 0, sizeof(*srx)); - srx->transport_type = local->srx.transport_type; - srx->transport_len = local->srx.transport_len; - srx->transport.family = local->srx.transport.family; - - /* Can we see an ICMP4 packet on an ICMP6 listening socket? and vice - * versa? - */ - switch (srx->transport.family) { - case AF_INET: - _net("Rx ICMP6 on v4 sock"); - srx->transport_len = sizeof(srx->transport.sin); - srx->transport.family = AF_INET; - srx->transport.sin.sin_port = udp_port; - memcpy(&srx->transport.sin.sin_addr, - &ip->daddr.s6_addr32[3], sizeof(struct in_addr)); - break; - case AF_INET6: - _net("Rx ICMP6"); - srx->transport.sin.sin_port = udp_port; - memcpy(&srx->transport.sin6.sin6_addr, &ip->daddr, - sizeof(struct in6_addr)); - break; - default: - WARN_ON_ONCE(1); - return NULL; - } - - _net("ICMP {%pISp}", &srx->transport); - return rxrpc_lookup_peer_rcu(local, srx); -} -#endif /* CONFIG_AF_RXRPC_IPV6 */ - -/* - * Handle an error received on the local endpoint as a tunnel. - */ -void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, int err, - __be16 port, u32 info, u8 *payload) -{ - struct sock_extended_err ee; - struct sockaddr_rxrpc srx; - struct rxrpc_local *local; - struct rxrpc_peer *peer; - u8 version = ip_hdr(skb)->version; - u8 type = icmp_hdr(skb)->type; - u8 code = icmp_hdr(skb)->code; - - rcu_read_lock(); - local = rcu_dereference_sk_user_data(sk); - if (unlikely(!local)) { - rcu_read_unlock(); - return; - } - - rxrpc_new_skb(skb, rxrpc_skb_received); - - switch (ip_hdr(skb)->version) { - case IPVERSION: - peer = rxrpc_lookup_peer_icmp_rcu(local, skb, port, &srx); - break; -#ifdef CONFIG_AF_RXRPC_IPV6 - case 6: - peer = rxrpc_lookup_peer_icmp6_rcu(local, skb, port, &srx); - break; -#endif - default: - rcu_read_unlock(); - return; - } - - if (peer && !rxrpc_get_peer_maybe(peer)) - peer = NULL; - if (!peer) { - rcu_read_unlock(); - return; - } - - memset(&ee, 0, sizeof(ee)); - - switch (version) { - case IPVERSION: - if (type == ICMP_DEST_UNREACH && - code == ICMP_FRAG_NEEDED) { - rxrpc_adjust_mtu(peer, info); - rcu_read_unlock(); - rxrpc_put_peer(peer); - return; - } - - ee.ee_origin = SO_EE_ORIGIN_ICMP; - ee.ee_type = type; - ee.ee_code = code; - ee.ee_errno = err; - break; - -#ifdef CONFIG_AF_RXRPC_IPV6 - case 6: - if (type == ICMPV6_PKT_TOOBIG) { - rxrpc_adjust_mtu(peer, info); - rcu_read_unlock(); - rxrpc_put_peer(peer); - return; - } - - if (err == EACCES) - err = EHOSTUNREACH; - - ee.ee_origin = SO_EE_ORIGIN_ICMP6; - ee.ee_type = type; - ee.ee_code = code; - ee.ee_errno = err; - break; -#endif - } - - trace_rxrpc_rx_icmp(peer, &ee, &srx); - - rxrpc_distribute_error(peer, err, RXRPC_CALL_NETWORK_ERROR); - rcu_read_unlock(); - rxrpc_put_peer(peer); -} - /* * Find the peer associated with a local error. */ @@ -246,6 +38,9 @@ static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local, srx->transport_len = local->srx.transport_len; srx->transport.family = local->srx.transport.family; + /* Can we see an ICMP4 packet on an ICMP6 listening socket? and vice + * versa? + */ switch (srx->transport.family) { case AF_INET: srx->transport_len = sizeof(srx->transport.sin); @@ -375,20 +170,38 @@ void rxrpc_error_report(struct sock *sk) } rxrpc_new_skb(skb, rxrpc_skb_received); serr = SKB_EXT_ERR(skb); + if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { + _leave("UDP empty message"); + rcu_read_unlock(); + rxrpc_free_skb(skb, rxrpc_skb_freed); + return; + } - if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL) { - peer = rxrpc_lookup_peer_local_rcu(local, skb, &srx); - if (peer && !rxrpc_get_peer_maybe(peer)) - peer = NULL; - if (peer) { - trace_rxrpc_rx_icmp(peer, &serr->ee, &srx); - rxrpc_store_error(peer, serr); - } + peer = rxrpc_lookup_peer_local_rcu(local, skb, &srx); + if (peer && !rxrpc_get_peer_maybe(peer)) + peer = NULL; + if (!peer) { + rcu_read_unlock(); + rxrpc_free_skb(skb, rxrpc_skb_freed); + _leave(" [no peer]"); + return; } + trace_rxrpc_rx_icmp(peer, &serr->ee, &srx); + + if ((serr->ee.ee_origin == SO_EE_ORIGIN_ICMP && + serr->ee.ee_type == ICMP_DEST_UNREACH && + serr->ee.ee_code == ICMP_FRAG_NEEDED)) { + rxrpc_adjust_mtu(peer, serr->ee.ee_info); + goto out; + } + + rxrpc_store_error(peer, serr); +out: rcu_read_unlock(); rxrpc_free_skb(skb, rxrpc_skb_freed); rxrpc_put_peer(peer); + _leave(""); } -- cgit v1.2.3-70-g09d2 From ed472b0c8783e7e3896a8fb4382f2187aae427e1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 22 Mar 2022 11:07:20 +0000 Subject: rxrpc: Call udp_sendmsg() directly Call udp_sendmsg() and udpv6_sendmsg() directly rather than calling kernel_sendmsg() as the latter assumes we want a kvec-class iterator. However, zerocopy explicitly doesn't work with such an iterator. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/ipv6/udp.c | 1 + net/rxrpc/output.c | 37 +++++++++++++++++++++++++++---------- 2 files changed, 28 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 4bc3fc27ec78..331d699355f3 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1639,6 +1639,7 @@ do_confirm: err = 0; goto out; } +EXPORT_SYMBOL(udpv6_sendmsg); void udpv6_destroy_sock(struct sock *sk) { diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index b803bfe146de..71885d741987 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "ar-internal.h" struct rxrpc_ack_buffer { @@ -23,6 +24,19 @@ struct rxrpc_ack_buffer { struct rxrpc_ackinfo ackinfo; }; +extern int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); + +static ssize_t do_udp_sendmsg(struct socket *sk, struct msghdr *msg, size_t len) +{ +#if IS_ENABLED(CONFIG_AF_RXRPC_IPV6) + struct sockaddr *sa = msg->msg_name; + + if (sa->sa_family == AF_INET6) + return udpv6_sendmsg(sk->sk, msg, len); +#endif + return udp_sendmsg(sk->sk, msg, len); +} + struct rxrpc_abort_buffer { struct rxrpc_wire_header whdr; __be32 abort_code; @@ -258,8 +272,10 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_ping); rxrpc_inc_stat(call->rxnet, stat_tx_ack_send); - ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); - conn->params.peer->last_tx_at = ktime_get_seconds(); + + iov_iter_kvec(&msg.msg_iter, WRITE, iov, 2, len); + ret = do_udp_sendmsg(conn->params.local->socket, &msg, len); + call->peer->last_tx_at = ktime_get_seconds(); if (ret < 0) trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_ack); @@ -336,8 +352,8 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) serial = atomic_inc_return(&conn->serial); pkt.whdr.serial = htonl(serial); - ret = kernel_sendmsg(conn->params.local->socket, - &msg, iov, 1, sizeof(pkt)); + iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, sizeof(pkt)); + ret = do_udp_sendmsg(conn->params.local->socket, &msg, sizeof(pkt)); conn->params.peer->last_tx_at = ktime_get_seconds(); if (ret < 0) trace_rxrpc_tx_fail(call->debug_id, serial, ret, @@ -397,6 +413,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, iov[1].iov_base = skb->head; iov[1].iov_len = skb->len; len = iov[0].iov_len + iov[1].iov_len; + iov_iter_kvec(&msg.msg_iter, WRITE, iov, 2, len); msg.msg_name = &call->peer->srx.transport; msg.msg_namelen = call->peer->srx.transport_len; @@ -469,7 +486,7 @@ dont_set_request_ack: * message and update the peer record */ rxrpc_inc_stat(call->rxnet, stat_tx_data_send); - ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); + ret = do_udp_sendmsg(conn->params.local->socket, &msg, len); conn->params.peer->last_tx_at = ktime_get_seconds(); up_read(&conn->params.local->defrag_sem); @@ -545,8 +562,7 @@ send_fragmentable: ip_sock_set_mtu_discover(conn->params.local->socket->sk, IP_PMTUDISC_DONT); rxrpc_inc_stat(call->rxnet, stat_tx_data_send_frag); - ret = kernel_sendmsg(conn->params.local->socket, &msg, - iov, 2, len); + ret = do_udp_sendmsg(conn->params.local->socket, &msg, len); conn->params.peer->last_tx_at = ktime_get_seconds(); ip_sock_set_mtu_discover(conn->params.local->socket->sk, @@ -632,8 +648,8 @@ void rxrpc_reject_packets(struct rxrpc_local *local) whdr.flags ^= RXRPC_CLIENT_INITIATED; whdr.flags &= RXRPC_CLIENT_INITIATED; - ret = kernel_sendmsg(local->socket, &msg, - iov, ioc, size); + iov_iter_kvec(&msg.msg_iter, WRITE, iov, ioc, size); + ret = do_udp_sendmsg(local->socket, &msg, size); if (ret < 0) trace_rxrpc_tx_fail(local->debug_id, 0, ret, rxrpc_tx_point_reject); @@ -688,7 +704,8 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer) _proto("Tx VERSION (keepalive)"); - ret = kernel_sendmsg(peer->local->socket, &msg, iov, 2, len); + iov_iter_kvec(&msg.msg_iter, WRITE, iov, 2, len); + ret = do_udp_sendmsg(peer->local->socket, &msg, len); if (ret < 0) trace_rxrpc_tx_fail(peer->debug_id, 0, ret, rxrpc_tx_point_version_keepalive); -- cgit v1.2.3-70-g09d2 From 23b237f3259299b75dd2ffefc7a4af889ba308c8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 23 Jan 2020 21:36:21 +0000 Subject: rxrpc: Remove unnecessary header inclusions Remove a bunch of unnecessary header inclusions. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/input.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'net') diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 7810b7d4f871..59a0b8aee2a2 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -7,20 +7,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include "ar-internal.h" static void rxrpc_proto_abort(const char *why, -- cgit v1.2.3-70-g09d2 From 27f699ccb89d65165175525254fec3d9d6b8d500 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 7 Oct 2022 13:52:06 +0100 Subject: rxrpc: Remove the flags from the rxrpc_skb tracepoint Remove the flags from the rxrpc_skb tracepoint as we're no longer going to be using this for the transmission buffers and so marking which are transmission buffers isn't going to be necessary. Note that this also remove the rxrpc skb flag that indicates if this is a transmission buffer and so the count is not updated for the moment. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 9 +++------ net/rxrpc/ar-internal.h | 1 - net/rxrpc/sendmsg.c | 1 - net/rxrpc/skbuff.c | 20 +++++++------------- 4 files changed, 10 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 794523d15321..484c8d032ab8 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -461,14 +461,13 @@ TRACE_EVENT(rxrpc_call, TRACE_EVENT(rxrpc_skb, TP_PROTO(struct sk_buff *skb, enum rxrpc_skb_trace op, - int usage, int mod_count, u8 flags, const void *where), + int usage, int mod_count, const void *where), - TP_ARGS(skb, op, usage, mod_count, flags, where), + TP_ARGS(skb, op, usage, mod_count, where), TP_STRUCT__entry( __field(struct sk_buff *, skb ) __field(enum rxrpc_skb_trace, op ) - __field(u8, flags ) __field(int, usage ) __field(int, mod_count ) __field(const void *, where ) @@ -476,16 +475,14 @@ TRACE_EVENT(rxrpc_skb, TP_fast_assign( __entry->skb = skb; - __entry->flags = flags; __entry->op = op; __entry->usage = usage; __entry->mod_count = mod_count; __entry->where = where; ), - TP_printk("s=%p %cx %s u=%d m=%d p=%pSR", + TP_printk("s=%p Rx %s u=%d m=%d p=%pSR", __entry->skb, - __entry->flags & RXRPC_SKB_TX_BUFFER ? 'T' : 'R', __print_symbolic(__entry->op, rxrpc_skb_traces), __entry->usage, __entry->mod_count, diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index ba0ee5d1c723..92f20b6c23ca 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -198,7 +198,6 @@ struct rxrpc_skb_priv { u8 nr_subpackets; /* Number of subpackets */ u8 rx_flags; /* Received packet flags */ #define RXRPC_SKB_INCL_LAST 0x01 /* - Includes last packet */ -#define RXRPC_SKB_TX_BUFFER 0x02 /* - Is transmit buffer */ union { int remain; /* amount of space remaining for next write */ diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index ad6f2cd08916..b2d28aa12e10 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -363,7 +363,6 @@ reload: goto maybe_error; sp = rxrpc_skb(skb); - sp->rx_flags |= RXRPC_SKB_TX_BUFFER; rxrpc_new_skb(skb, rxrpc_skb_new); _debug("ALLOC SEND %p", skb); diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index 580a5acffee7..0c827d5bb2b8 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -14,8 +14,7 @@ #include #include "ar-internal.h" -#define is_tx_skb(skb) (rxrpc_skb(skb)->rx_flags & RXRPC_SKB_TX_BUFFER) -#define select_skb_count(skb) (is_tx_skb(skb) ? &rxrpc_n_tx_skbs : &rxrpc_n_rx_skbs) +#define select_skb_count(skb) (&rxrpc_n_rx_skbs) /* * Note the allocation or reception of a socket buffer. @@ -24,8 +23,7 @@ void rxrpc_new_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { const void *here = __builtin_return_address(0); int n = atomic_inc_return(select_skb_count(skb)); - trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, - rxrpc_skb(skb)->rx_flags, here); + trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here); } /* @@ -36,8 +34,7 @@ void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) const void *here = __builtin_return_address(0); if (skb) { int n = atomic_read(select_skb_count(skb)); - trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, - rxrpc_skb(skb)->rx_flags, here); + trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here); } } @@ -48,8 +45,7 @@ void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { const void *here = __builtin_return_address(0); int n = atomic_inc_return(select_skb_count(skb)); - trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, - rxrpc_skb(skb)->rx_flags, here); + trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here); skb_get(skb); } @@ -60,7 +56,7 @@ void rxrpc_eaten_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) { const void *here = __builtin_return_address(0); int n = atomic_inc_return(&rxrpc_n_rx_skbs); - trace_rxrpc_skb(skb, op, 0, n, 0, here); + trace_rxrpc_skb(skb, op, 0, n, here); } /* @@ -72,8 +68,7 @@ void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) if (skb) { int n; n = atomic_dec_return(select_skb_count(skb)); - trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, - rxrpc_skb(skb)->rx_flags, here); + trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here); kfree_skb(skb); } } @@ -88,8 +83,7 @@ void rxrpc_purge_queue(struct sk_buff_head *list) while ((skb = skb_dequeue((list))) != NULL) { int n = atomic_dec_return(select_skb_count(skb)); trace_rxrpc_skb(skb, rxrpc_skb_purged, - refcount_read(&skb->users), n, - rxrpc_skb(skb)->rx_flags, here); + refcount_read(&skb->users), n, here); kfree_skb(skb); } } -- cgit v1.2.3-70-g09d2 From a11e6ff961a01884482b2a70ced74a5c62d96801 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 7 Oct 2022 14:46:08 +0100 Subject: rxrpc: Remove call->tx_phase Remove call->tx_phase as it's only ever set. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 1 - net/rxrpc/call_object.c | 1 - net/rxrpc/input.c | 5 +---- net/rxrpc/recvmsg.c | 1 - 4 files changed, 1 insertion(+), 7 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 92f20b6c23ca..282cb986f8a7 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -678,7 +678,6 @@ struct rxrpc_call { rxrpc_serial_t rx_serial; /* Highest serial received for this call */ u8 rx_winsize; /* Size of Rx window */ u8 tx_winsize; /* Maximum size of Tx window */ - bool tx_phase; /* T if transmission phase, F if receive phase */ u8 nr_jumbo_bad; /* Number of jumbo dups/exceeds-windows */ spinlock_t input_lock; /* Lock for packet input to this call */ diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index a75861a0c477..8290d94e9233 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -206,7 +206,6 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, return ERR_PTR(-ENOMEM); call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; call->service_id = srx->srx_service; - call->tx_phase = true; now = ktime_get_real(); call->acks_latest_ts = now; call->cong_tstamp = now; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 59a0b8aee2a2..0cb23ba79e3b 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -309,10 +309,7 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call) return false; } } - if (!rxrpc_end_tx_phase(call, true, "ETD")) - return false; - call->tx_phase = false; - return true; + return rxrpc_end_tx_phase(call, true, "ETD"); } /* diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 7e39c262fd79..a378e7a672a8 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -203,7 +203,6 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) break; case RXRPC_CALL_SERVER_RECV_REQUEST: - call->tx_phase = true; call->state = RXRPC_CALL_SERVER_ACK_REQUEST; call->expect_req_by = jiffies + MAX_JIFFY_OFFSET; write_unlock_bh(&call->state_lock); -- cgit v1.2.3-70-g09d2 From 02a1935640f8f8539b8f2dbd6eeb539de93b2ce4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 5 Apr 2022 21:16:32 +0100 Subject: rxrpc: Define rxrpc_txbuf struct to carry data to be transmitted Define a struct, rxrpc_txbuf, to carry data to be transmitted instead of a socket buffer so that it can be placed onto multiple queues at once. This also allows the data buffer to be in the same allocation as the internal data. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 45 +++++++++++++++++++ net/rxrpc/Makefile | 1 + net/rxrpc/ar-internal.h | 53 +++++++++++++++++++++++ net/rxrpc/proc.c | 3 +- net/rxrpc/txbuf.c | 100 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 201 insertions(+), 1 deletion(-) create mode 100644 net/rxrpc/txbuf.c (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 484c8d032ab8..47b157b1d32b 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -252,6 +252,18 @@ E_(rxrpc_reqack_small_txwin, "SMALL-TXWN") /* ---- Must update size of stat_why_req_ack[] if more are added! */ +#define rxrpc_txbuf_traces \ + EM(rxrpc_txbuf_alloc_ack, "ALLOC ACK ") \ + EM(rxrpc_txbuf_alloc_data, "ALLOC DATA ") \ + EM(rxrpc_txbuf_free, "FREE ") \ + EM(rxrpc_txbuf_get_trans, "GET TRANS ") \ + EM(rxrpc_txbuf_get_retrans, "GET RETRANS") \ + EM(rxrpc_txbuf_put_cleaned, "PUT CLEANED") \ + EM(rxrpc_txbuf_put_rotated, "PUT ROTATED") \ + EM(rxrpc_txbuf_put_send_aborted, "PUT SEND-X ") \ + EM(rxrpc_txbuf_see_send_more, "SEE SEND+ ") \ + E_(rxrpc_txbuf_see_unacked, "SEE UNACKED") + /* * Generate enums for tracing information. */ @@ -280,6 +292,7 @@ enum rxrpc_skb_trace { rxrpc_skb_traces } __mode(byte); enum rxrpc_timer_trace { rxrpc_timer_traces } __mode(byte); enum rxrpc_transmit_trace { rxrpc_transmit_traces } __mode(byte); enum rxrpc_tx_point { rxrpc_tx_points } __mode(byte); +enum rxrpc_txbuf_trace { rxrpc_txbuf_traces } __mode(byte); #endif /* end __RXRPC_DECLARE_TRACE_ENUMS_ONCE_ONLY */ @@ -308,6 +321,7 @@ rxrpc_skb_traces; rxrpc_timer_traces; rxrpc_transmit_traces; rxrpc_tx_points; +rxrpc_txbuf_traces; /* * Now redefine the EM() and E_() macros to map the enums to the strings that @@ -1469,6 +1483,37 @@ TRACE_EVENT(rxrpc_req_ack, __print_symbolic(__entry->why, rxrpc_req_ack_traces)) ); +TRACE_EVENT(rxrpc_txbuf, + TP_PROTO(unsigned int debug_id, + unsigned int call_debug_id, rxrpc_seq_t seq, + int ref, enum rxrpc_txbuf_trace what), + + TP_ARGS(debug_id, call_debug_id, seq, ref, what), + + TP_STRUCT__entry( + __field(unsigned int, debug_id ) + __field(unsigned int, call_debug_id ) + __field(rxrpc_seq_t, seq ) + __field(int, ref ) + __field(enum rxrpc_txbuf_trace, what ) + ), + + TP_fast_assign( + __entry->debug_id = debug_id; + __entry->call_debug_id = call_debug_id; + __entry->seq = seq; + __entry->ref = ref; + __entry->what = what; + ), + + TP_printk("B=%08x c=%08x q=%08x %s r=%d", + __entry->debug_id, + __entry->call_debug_id, + __entry->seq, + __print_symbolic(__entry->what, rxrpc_txbuf_traces), + __entry->ref) + ); + #undef EM #undef E_ #endif /* _TRACE_RXRPC_H */ diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index b11281bed2a4..fdeba488fc6e 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -30,6 +30,7 @@ rxrpc-y := \ sendmsg.o \ server_key.o \ skbuff.o \ + txbuf.o \ utils.o rxrpc-$(CONFIG_PROC_FS) += proc.o diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 282cb986f8a7..49e90614d5e5 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -29,6 +29,7 @@ struct rxrpc_crypt { struct key_preparsed_payload; struct rxrpc_connection; +struct rxrpc_txbuf; /* * Mark applied to socket buffers in skb->mark. skb->priority is used @@ -759,6 +760,48 @@ struct rxrpc_send_params { bool upgrade; /* If the connection is upgradeable */ }; +/* + * Buffer of data to be output as a packet. + */ +struct rxrpc_txbuf { + struct rcu_head rcu; + struct list_head call_link; /* Link in call->tx_queue */ + struct list_head tx_link; /* Link in live Enc queue or Tx queue */ + struct rxrpc_call *call; /* Call to which belongs */ + ktime_t last_sent; /* Time at which last transmitted */ + refcount_t ref; + rxrpc_seq_t seq; /* Sequence number of this packet */ + unsigned int call_debug_id; + unsigned int debug_id; + unsigned int len; /* Amount of data in buffer */ + unsigned int space; /* Remaining data space */ + unsigned int offset; /* Offset of fill point */ + unsigned long flags; +#define RXRPC_TXBUF_ACKED 0 /* Set if ACK'd */ +#define RXRPC_TXBUF_NACKED 1 /* Set if NAK'd */ +#define RXRPC_TXBUF_LAST 2 /* Set if last packet in Tx phase */ +#define RXRPC_TXBUF_RESENT 3 /* Set if has been resent */ +#define RXRPC_TXBUF_RETRANS 4 /* Set if should be retransmitted */ + struct { + /* The packet for encrypting and DMA'ing. We align it such + * that data[] aligns correctly for any crypto blocksize. + */ + u8 pad[64 - sizeof(struct rxrpc_wire_header)]; + struct rxrpc_wire_header wire; /* Network-ready header */ + u8 data[RXRPC_JUMBO_DATALEN]; /* Data packet */ + } __aligned(64); +}; + +static inline bool rxrpc_sending_to_server(const struct rxrpc_txbuf *txb) +{ + return txb->wire.flags & RXRPC_CLIENT_INITIATED; +} + +static inline bool rxrpc_sending_to_client(const struct rxrpc_txbuf *txb) +{ + return !rxrpc_sending_to_server(txb); +} + #include /* @@ -1125,6 +1168,16 @@ static inline int __init rxrpc_sysctl_init(void) { return 0; } static inline void rxrpc_sysctl_exit(void) {} #endif +/* + * txbuf.c + */ +extern atomic_t rxrpc_nr_txbuf; +struct rxrpc_txbuf *rxrpc_alloc_txbuf(struct rxrpc_call *call, u8 packet_type, + gfp_t gfp); +void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); +void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); +void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); + /* * utils.c */ diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 9bd357f39c39..3877afd23598 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -458,7 +458,8 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_slow_start]), atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_small_txwin])); seq_printf(seq, - "Buffers : txb=%u rxb=%u\n", + "Buffers : txb=%u,%u rxb=%u\n", + atomic_read(&rxrpc_nr_txbuf), atomic_read(&rxrpc_n_tx_skbs), atomic_read(&rxrpc_n_rx_skbs)); return 0; diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c new file mode 100644 index 000000000000..66d922ad65d0 --- /dev/null +++ b/net/rxrpc/txbuf.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* RxRPC Tx data buffering. + * + * Copyright (C) 2022 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include "ar-internal.h" + +static atomic_t rxrpc_txbuf_debug_ids; +atomic_t rxrpc_nr_txbuf; + +/* + * Allocate and partially initialise an I/O request structure. + */ +struct rxrpc_txbuf *rxrpc_alloc_txbuf(struct rxrpc_call *call, u8 packet_type, + gfp_t gfp) +{ + struct rxrpc_txbuf *txb; + + txb = kmalloc(sizeof(*txb), gfp); + if (txb) { + INIT_LIST_HEAD(&txb->call_link); + INIT_LIST_HEAD(&txb->tx_link); + refcount_set(&txb->ref, 1); + txb->call = call; + txb->call_debug_id = call->debug_id; + txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids); + txb->space = sizeof(txb->data); + txb->len = 0; + txb->offset = 0; + txb->flags = 0; + txb->seq = call->tx_top + 1; + txb->wire.epoch = htonl(call->conn->proto.epoch); + txb->wire.cid = htonl(call->cid); + txb->wire.callNumber = htonl(call->call_id); + txb->wire.seq = htonl(txb->seq); + txb->wire.type = packet_type; + txb->wire.flags = call->conn->out_clientflag; + txb->wire.userStatus = 0; + txb->wire.securityIndex = call->security_ix; + txb->wire._rsvd = 0; + txb->wire.serviceId = htons(call->service_id); + + trace_rxrpc_txbuf(txb->debug_id, + txb->call_debug_id, txb->seq, 1, + packet_type == RXRPC_PACKET_TYPE_DATA ? + rxrpc_txbuf_alloc_data : + rxrpc_txbuf_alloc_ack); + atomic_inc(&rxrpc_nr_txbuf); + } + + return txb; +} + +void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) +{ + int r; + + __refcount_inc(&txb->ref, &r); + trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, r + 1, what); +} + +void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) +{ + int r = refcount_read(&txb->ref); + + trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, r, what); +} + +static void rxrpc_free_txbuf(struct rcu_head *rcu) +{ + struct rxrpc_txbuf *txb = container_of(rcu, struct rxrpc_txbuf, rcu); + + trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 0, + rxrpc_txbuf_free); + kfree(txb); + atomic_dec(&rxrpc_nr_txbuf); +} + +void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) +{ + unsigned int debug_id, call_debug_id; + rxrpc_seq_t seq; + bool dead; + int r; + + if (txb) { + debug_id = txb->debug_id; + call_debug_id = txb->call_debug_id; + seq = txb->seq; + dead = __refcount_dec_and_test(&txb->ref, &r); + trace_rxrpc_txbuf(debug_id, call_debug_id, seq, r - 1, what); + if (dead) + call_rcu(&txb->rcu, rxrpc_free_txbuf); + } +} -- cgit v1.2.3-70-g09d2 From 72f0c6fb057971864fe4d42b289b8e6ede836ef1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 30 Jan 2020 21:48:13 +0000 Subject: rxrpc: Allocate ACK records at proposal and queue for transmission Allocate rxrpc_txbuf records for ACKs and put onto a queue for the transmitter thread to dispatch. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 47 ++++++++--- net/rxrpc/ar-internal.h | 21 +++-- net/rxrpc/call_accept.c | 5 +- net/rxrpc/call_event.c | 189 ++++++++++++++++++++++++------------------- net/rxrpc/input.c | 89 +++++++++----------- net/rxrpc/local_object.c | 7 ++ net/rxrpc/output.c | 157 +++++++++++++++++------------------ net/rxrpc/recvmsg.c | 33 +++----- net/rxrpc/sendmsg.c | 4 +- net/rxrpc/txbuf.c | 1 + 10 files changed, 285 insertions(+), 268 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 47b157b1d32b..1597ff7ad97e 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -34,7 +34,8 @@ EM(rxrpc_local_new, "NEW") \ EM(rxrpc_local_processing, "PRO") \ EM(rxrpc_local_put, "PUT") \ - E_(rxrpc_local_queued, "QUE") + EM(rxrpc_local_queued, "QUE") \ + E_(rxrpc_local_tx_ack, "TAK") #define rxrpc_peer_traces \ EM(rxrpc_peer_got, "GOT") \ @@ -258,7 +259,9 @@ EM(rxrpc_txbuf_free, "FREE ") \ EM(rxrpc_txbuf_get_trans, "GET TRANS ") \ EM(rxrpc_txbuf_get_retrans, "GET RETRANS") \ + EM(rxrpc_txbuf_put_ack_tx, "PUT ACK TX ") \ EM(rxrpc_txbuf_put_cleaned, "PUT CLEANED") \ + EM(rxrpc_txbuf_put_nomem, "PUT NOMEM ") \ EM(rxrpc_txbuf_put_rotated, "PUT ROTATED") \ EM(rxrpc_txbuf_put_send_aborted, "PUT SEND-X ") \ EM(rxrpc_txbuf_see_send_more, "SEE SEND+ ") \ @@ -1095,19 +1098,16 @@ TRACE_EVENT(rxrpc_rx_lose, TRACE_EVENT(rxrpc_propose_ack, TP_PROTO(struct rxrpc_call *call, enum rxrpc_propose_ack_trace why, - u8 ack_reason, rxrpc_serial_t serial, bool immediate, - bool background, enum rxrpc_propose_ack_outcome outcome), + u8 ack_reason, rxrpc_serial_t serial, + enum rxrpc_propose_ack_outcome outcome), - TP_ARGS(call, why, ack_reason, serial, immediate, background, - outcome), + TP_ARGS(call, why, ack_reason, serial, outcome), TP_STRUCT__entry( __field(unsigned int, call ) __field(enum rxrpc_propose_ack_trace, why ) __field(rxrpc_serial_t, serial ) __field(u8, ack_reason ) - __field(bool, immediate ) - __field(bool, background ) __field(enum rxrpc_propose_ack_outcome, outcome ) ), @@ -1116,21 +1116,44 @@ TRACE_EVENT(rxrpc_propose_ack, __entry->why = why; __entry->serial = serial; __entry->ack_reason = ack_reason; - __entry->immediate = immediate; - __entry->background = background; __entry->outcome = outcome; ), - TP_printk("c=%08x %s %s r=%08x i=%u b=%u%s", + TP_printk("c=%08x %s %s r=%08x%s", __entry->call, __print_symbolic(__entry->why, rxrpc_propose_ack_traces), __print_symbolic(__entry->ack_reason, rxrpc_ack_names), __entry->serial, - __entry->immediate, - __entry->background, __print_symbolic(__entry->outcome, rxrpc_propose_ack_outcomes)) ); +TRACE_EVENT(rxrpc_send_ack, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_propose_ack_trace why, + u8 ack_reason, rxrpc_serial_t serial), + + TP_ARGS(call, why, ack_reason, serial), + + TP_STRUCT__entry( + __field(unsigned int, call ) + __field(enum rxrpc_propose_ack_trace, why ) + __field(rxrpc_serial_t, serial ) + __field(u8, ack_reason ) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->why = why; + __entry->serial = serial; + __entry->ack_reason = ack_reason; + ), + + TP_printk("c=%08x %s %s r=%08x", + __entry->call, + __print_symbolic(__entry->why, rxrpc_propose_ack_traces), + __print_symbolic(__entry->ack_reason, rxrpc_ack_names), + __entry->serial) + ); + TRACE_EVENT(rxrpc_retransmit, TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, u8 annotation, s64 expiry), diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 49e90614d5e5..802a8f372af2 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -292,6 +292,8 @@ struct rxrpc_local { struct hlist_node link; struct socket *socket; /* my UDP socket */ struct work_struct processor; + struct list_head ack_tx_queue; /* List of ACKs that need sending */ + spinlock_t ack_tx_lock; /* ACK list lock */ struct rxrpc_sock __rcu *service; /* Service(s) listening on this endpoint */ struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ struct sk_buff_head reject_queue; /* packets awaiting rejection */ @@ -520,10 +522,8 @@ enum rxrpc_call_flag { * Events that can be raised on a call. */ enum rxrpc_call_event { - RXRPC_CALL_EV_ACK, /* need to generate ACK */ RXRPC_CALL_EV_ABORT, /* need to generate abort */ RXRPC_CALL_EV_RESEND, /* Tx resend required */ - RXRPC_CALL_EV_PING, /* Ping send required */ RXRPC_CALL_EV_EXPIRED, /* Expiry occurred */ RXRPC_CALL_EV_ACK_LOST, /* ACK may be lost, send ping */ }; @@ -782,13 +782,20 @@ struct rxrpc_txbuf { #define RXRPC_TXBUF_LAST 2 /* Set if last packet in Tx phase */ #define RXRPC_TXBUF_RESENT 3 /* Set if has been resent */ #define RXRPC_TXBUF_RETRANS 4 /* Set if should be retransmitted */ + u8 /*enum rxrpc_propose_ack_trace*/ ack_why; /* If ack, why */ struct { /* The packet for encrypting and DMA'ing. We align it such * that data[] aligns correctly for any crypto blocksize. */ u8 pad[64 - sizeof(struct rxrpc_wire_header)]; struct rxrpc_wire_header wire; /* Network-ready header */ - u8 data[RXRPC_JUMBO_DATALEN]; /* Data packet */ + union { + u8 data[RXRPC_JUMBO_DATALEN]; /* Data packet */ + struct { + struct rxrpc_ackpacket ack; + u8 acks[0]; + }; + }; } __aligned(64); }; @@ -824,8 +831,10 @@ int rxrpc_user_charge_accept(struct rxrpc_sock *, unsigned long); /* * call_event.c */ -void rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool, bool, - enum rxrpc_propose_ack_trace); +void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial, + enum rxrpc_propose_ack_trace why); +void rxrpc_send_ACK(struct rxrpc_call *, u8, rxrpc_serial_t, enum rxrpc_propose_ack_trace); +void rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, enum rxrpc_propose_ack_trace); void rxrpc_process_call(struct work_struct *); void rxrpc_reduce_call_timer(struct rxrpc_call *call, @@ -1030,7 +1039,7 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net) /* * output.c */ -int rxrpc_send_ack_packet(struct rxrpc_call *, bool, rxrpc_serial_t *); +void rxrpc_transmit_ack_packets(struct rxrpc_local *); int rxrpc_send_abort_packet(struct rxrpc_call *); int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *, bool); void rxrpc_reject_packets(struct rxrpc_local *); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 99e10eea3732..d8db277d5ebe 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -248,9 +248,8 @@ static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb) if (call->peer->rtt_count < 3 || ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now)) - rxrpc_propose_ACK(call, RXRPC_ACK_PING, sp->hdr.serial, - true, true, - rxrpc_propose_ack_ping_for_params); + rxrpc_send_ACK(call, RXRPC_ACK_PING, sp->hdr.serial, + rxrpc_propose_ack_ping_for_params); } /* diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 70f70a0393f7..67b54ad914a1 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -20,46 +20,39 @@ /* * Propose a PING ACK be sent. */ -static void rxrpc_propose_ping(struct rxrpc_call *call, - bool immediate, bool background) +void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial, + enum rxrpc_propose_ack_trace why) { - if (immediate) { - if (background && - !test_and_set_bit(RXRPC_CALL_EV_PING, &call->events)) - rxrpc_queue_call(call); - } else { - unsigned long now = jiffies; - unsigned long ping_at = now + rxrpc_idle_ack_delay; + unsigned long now = jiffies; + unsigned long ping_at = now + rxrpc_idle_ack_delay; - if (time_before(ping_at, call->ping_at)) { - WRITE_ONCE(call->ping_at, ping_at); - rxrpc_reduce_call_timer(call, ping_at, now, - rxrpc_timer_set_for_ping); - } + spin_lock_bh(&call->lock); + + if (time_before(ping_at, call->ping_at)) { + rxrpc_inc_stat(call->rxnet, stat_tx_acks[RXRPC_ACK_PING]); + WRITE_ONCE(call->ping_at, ping_at); + rxrpc_reduce_call_timer(call, ping_at, now, + rxrpc_timer_set_for_ping); + trace_rxrpc_propose_ack(call, why, RXRPC_ACK_PING, serial, + rxrpc_propose_ack_use); } + + spin_unlock_bh(&call->lock); } /* * propose an ACK be sent */ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, - u32 serial, bool immediate, bool background, - enum rxrpc_propose_ack_trace why) + u32 serial, enum rxrpc_propose_ack_trace why) { enum rxrpc_propose_ack_outcome outcome = rxrpc_propose_ack_use; unsigned long expiry = rxrpc_soft_ack_delay; + unsigned long now = jiffies, ack_at; s8 prior = rxrpc_ack_priority[ack_reason]; rxrpc_inc_stat(call->rxnet, stat_tx_acks[ack_reason]); - /* Pings are handled specially because we don't want to accidentally - * lose a ping response by subsuming it into a ping. - */ - if (ack_reason == RXRPC_ACK_PING) { - rxrpc_propose_ping(call, immediate, background); - goto trace; - } - /* Update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial * numbers, but we don't alter the timeout. */ @@ -71,8 +64,6 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, outcome = rxrpc_propose_ack_update; call->ackr_serial = serial; } - if (!immediate) - goto trace; } else if (prior > rxrpc_ack_priority[call->ackr_reason]) { call->ackr_reason = ack_reason; call->ackr_serial = serial; @@ -84,8 +75,6 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, case RXRPC_ACK_REQUESTED: if (rxrpc_requested_ack_delay < expiry) expiry = rxrpc_requested_ack_delay; - if (serial == 1) - immediate = false; break; case RXRPC_ACK_DELAY: @@ -99,52 +88,89 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, break; default: - immediate = true; - break; + WARN_ON(1); + return; } - if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) { - _debug("already scheduled"); - } else if (immediate || expiry == 0) { - _debug("immediate ACK %lx", call->events); - if (!test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events) && - background) - rxrpc_queue_call(call); - } else { - unsigned long now = jiffies, ack_at; - - if (call->peer->srtt_us != 0) - ack_at = usecs_to_jiffies(call->peer->srtt_us >> 3); - else - ack_at = expiry; - - ack_at += READ_ONCE(call->tx_backoff); - ack_at += now; - if (time_before(ack_at, call->ack_at)) { - WRITE_ONCE(call->ack_at, ack_at); - rxrpc_reduce_call_timer(call, ack_at, now, - rxrpc_timer_set_for_ack); - } + + if (call->peer->srtt_us != 0) + ack_at = usecs_to_jiffies(call->peer->srtt_us >> 3); + else + ack_at = expiry; + + ack_at += READ_ONCE(call->tx_backoff); + ack_at += now; + if (time_before(ack_at, call->ack_at)) { + WRITE_ONCE(call->ack_at, ack_at); + rxrpc_reduce_call_timer(call, ack_at, now, + rxrpc_timer_set_for_ack); } -trace: - trace_rxrpc_propose_ack(call, why, ack_reason, serial, immediate, - background, outcome); + trace_rxrpc_propose_ack(call, why, ack_reason, serial, outcome); } /* * propose an ACK be sent, locking the call structure */ -void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, - u32 serial, bool immediate, bool background, +void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, u32 serial, enum rxrpc_propose_ack_trace why) { spin_lock_bh(&call->lock); - __rxrpc_propose_ACK(call, ack_reason, serial, - immediate, background, why); + __rxrpc_propose_ACK(call, ack_reason, serial, why); spin_unlock_bh(&call->lock); } +/* + * Queue an ACK for immediate transmission. + */ +void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, + rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why) +{ + struct rxrpc_local *local = call->conn->params.local; + struct rxrpc_txbuf *txb; + + if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) + return; + + rxrpc_inc_stat(call->rxnet, stat_tx_acks[ack_reason]); + + txb = rxrpc_alloc_txbuf(call, RXRPC_PACKET_TYPE_ACK, + in_softirq() ? GFP_ATOMIC | __GFP_NOWARN : GFP_NOFS); + if (!txb) { + kleave(" = -ENOMEM"); + return; + } + + txb->ack_why = why; + txb->wire.seq = 0; + txb->wire.type = RXRPC_PACKET_TYPE_ACK; + txb->wire.flags |= RXRPC_SLOW_START_OK; + txb->ack.bufferSpace = 0; + txb->ack.maxSkew = 0; + txb->ack.firstPacket = 0; + txb->ack.previousPacket = 0; + txb->ack.serial = htonl(serial); + txb->ack.reason = ack_reason; + txb->ack.nAcks = 0; + + if (!rxrpc_try_get_call(call, rxrpc_call_got)) { + rxrpc_put_txbuf(txb, rxrpc_txbuf_put_nomem); + return; + } + + spin_lock_bh(&local->ack_tx_lock); + list_add_tail(&txb->tx_link, &local->ack_tx_queue); + spin_unlock_bh(&local->ack_tx_lock); + trace_rxrpc_send_ack(call, why, ack_reason, serial); + + if (in_task()) { + rxrpc_transmit_ack_packets(call->peer->local); + } else { + rxrpc_get_local(local); + rxrpc_queue_local(local); + } +} + /* * Handle congestion being detected by the retransmit timeout. */ @@ -230,9 +256,8 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) ack_ts = ktime_sub(now, call->acks_latest_ts); if (ktime_to_us(ack_ts) < (call->peer->srtt_us >> 3)) goto out; - rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, true, false, - rxrpc_propose_ack_ping_for_lost_ack); - rxrpc_send_ack_packet(call, true, NULL); + rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, + rxrpc_propose_ack_ping_for_lost_ack); goto out; } @@ -291,9 +316,10 @@ void rxrpc_process_call(struct work_struct *work) { struct rxrpc_call *call = container_of(work, struct rxrpc_call, processor); - rxrpc_serial_t *send_ack; unsigned long now, next, t; unsigned int iterations = 0; + rxrpc_serial_t ackr_serial; + u8 ackr_reason; rxrpc_see_call(call); @@ -342,7 +368,15 @@ recheck_state: if (time_after_eq(now, t)) { trace_rxrpc_timer(call, rxrpc_timer_exp_ack, now); cmpxchg(&call->ack_at, t, now + MAX_JIFFY_OFFSET); - set_bit(RXRPC_CALL_EV_ACK, &call->events); + spin_lock_bh(&call->lock); + ackr_reason = call->ackr_reason; + ackr_serial = call->ackr_serial; + call->ackr_reason = 0; + call->ackr_serial = 0; + spin_unlock_bh(&call->lock); + if (ackr_reason) + rxrpc_send_ACK(call, ackr_reason, ackr_serial, + rxrpc_propose_ack_ping_for_lost_ack); } t = READ_ONCE(call->ack_lost_at); @@ -356,16 +390,16 @@ recheck_state: if (time_after_eq(now, t)) { trace_rxrpc_timer(call, rxrpc_timer_exp_keepalive, now); cmpxchg(&call->keepalive_at, t, now + MAX_JIFFY_OFFSET); - rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, true, true, - rxrpc_propose_ack_ping_for_keepalive); - set_bit(RXRPC_CALL_EV_PING, &call->events); + rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, + rxrpc_propose_ack_ping_for_keepalive); } t = READ_ONCE(call->ping_at); if (time_after_eq(now, t)) { trace_rxrpc_timer(call, rxrpc_timer_exp_ping, now); cmpxchg(&call->ping_at, t, now + MAX_JIFFY_OFFSET); - set_bit(RXRPC_CALL_EV_PING, &call->events); + rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, + rxrpc_propose_ack_ping_for_keepalive); } t = READ_ONCE(call->resend_at); @@ -388,25 +422,10 @@ recheck_state: goto recheck_state; } - send_ack = NULL; if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) { call->acks_lost_top = call->tx_top; - rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, true, false, - rxrpc_propose_ack_ping_for_lost_ack); - send_ack = &call->acks_lost_ping; - } - - if (test_and_clear_bit(RXRPC_CALL_EV_ACK, &call->events) || - send_ack) { - if (call->ackr_reason) { - rxrpc_send_ack_packet(call, false, send_ack); - goto recheck_state; - } - } - - if (test_and_clear_bit(RXRPC_CALL_EV_PING, &call->events)) { - rxrpc_send_ack_packet(call, true, NULL); - goto recheck_state; + rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, + rxrpc_propose_ack_ping_for_lost_ack); } if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events) && diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 0cb23ba79e3b..e23f66ef8c06 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -398,8 +398,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) unsigned int j, nr_subpackets, nr_unacked = 0; rxrpc_serial_t serial = sp->hdr.serial, ack_serial = serial; rxrpc_seq_t seq0 = sp->hdr.seq, hard_ack; - bool immediate_ack = false, jumbo_bad = false; - u8 ack = 0; + bool jumbo_bad = false; _enter("{%u,%u},{%u,%u}", call->rx_hard_ack, call->rx_top, skb->len, seq0); @@ -447,9 +446,9 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) nr_subpackets = sp->nr_subpackets; if (nr_subpackets > 1) { if (call->nr_jumbo_bad > 3) { - ack = RXRPC_ACK_NOSPACE; - ack_serial = serial; - goto ack; + rxrpc_send_ACK(call, RXRPC_ACK_NOSPACE, serial, + rxrpc_propose_ack_input_data); + goto unlock; } } @@ -459,6 +458,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) unsigned int ix = seq & RXRPC_RXTX_BUFF_MASK; bool terminal = (j == nr_subpackets - 1); bool last = terminal && (sp->rx_flags & RXRPC_SKB_INCL_LAST); + bool acked = false; u8 flags, annotation = j; _proto("Rx DATA+%u %%%u { #%x t=%u l=%u }", @@ -488,25 +488,22 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) trace_rxrpc_rx_data(call->debug_id, seq, serial, flags, annotation); if (before_eq(seq, hard_ack)) { - ack = RXRPC_ACK_DUPLICATE; - ack_serial = serial; + rxrpc_send_ACK(call, RXRPC_ACK_DUPLICATE, serial, + rxrpc_propose_ack_input_data); continue; } if (call->rxtx_buffer[ix]) { rxrpc_input_dup_data(call, seq, nr_subpackets > 1, &jumbo_bad); - if (ack != RXRPC_ACK_DUPLICATE) { - ack = RXRPC_ACK_DUPLICATE; - ack_serial = serial; - } - immediate_ack = true; + rxrpc_send_ACK(call, RXRPC_ACK_DUPLICATE, serial, + rxrpc_propose_ack_input_data); continue; } if (after(seq, hard_ack + call->rx_winsize)) { - ack = RXRPC_ACK_EXCEEDS_WINDOW; - ack_serial = serial; + rxrpc_send_ACK(call, RXRPC_ACK_EXCEEDS_WINDOW, serial, + rxrpc_propose_ack_input_data); if (flags & RXRPC_JUMBO_PACKET) { if (!jumbo_bad) { call->nr_jumbo_bad++; @@ -514,12 +511,13 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) } } - goto ack; + goto unlock; } - if (flags & RXRPC_REQUEST_ACK && !ack) { - ack = RXRPC_ACK_REQUESTED; - ack_serial = serial; + if (flags & RXRPC_REQUEST_ACK) { + rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, serial, + rxrpc_propose_ack_input_data); + acked = true; } if (after(seq0, call->ackr_highest_seq)) @@ -542,11 +540,11 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) smp_store_release(&call->rx_top, seq); } else if (before(seq, call->rx_top)) { /* Send an immediate ACK if we fill in a hole */ - if (!ack) { - ack = RXRPC_ACK_DELAY; - ack_serial = serial; + if (!acked) { + rxrpc_send_ACK(call, RXRPC_ACK_DELAY, serial, + rxrpc_propose_ack_input_data); + acked = true; } - immediate_ack = true; } if (terminal) { @@ -558,14 +556,8 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) sp = NULL; } - nr_unacked++; - if (last) { set_bit(RXRPC_CALL_RX_LAST, &call->flags); - if (!ack) { - ack = RXRPC_ACK_DELAY; - ack_serial = serial; - } trace_rxrpc_receive(call, rxrpc_receive_queue_last, serial, seq); } else { trace_rxrpc_receive(call, rxrpc_receive_queue, serial, seq); @@ -574,32 +566,30 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) if (after_eq(seq, call->rx_expect_next)) { if (after(seq, call->rx_expect_next)) { _net("OOS %u > %u", seq, call->rx_expect_next); - ack = RXRPC_ACK_OUT_OF_SEQUENCE; - ack_serial = serial; + rxrpc_send_ACK(call, RXRPC_ACK_OUT_OF_SEQUENCE, serial, + rxrpc_propose_ack_input_data); + acked = true; } call->rx_expect_next = seq + 1; } - if (!ack) + + if (!acked) { + nr_unacked++; ack_serial = serial; + } } -ack: - if (atomic_add_return(nr_unacked, &call->ackr_nr_unacked) > 2 && !ack) - ack = RXRPC_ACK_IDLE; - - if (ack) - rxrpc_propose_ACK(call, ack, ack_serial, - immediate_ack, true, - rxrpc_propose_ack_input_data); +unlock: + if (atomic_add_return(nr_unacked, &call->ackr_nr_unacked) > 2) + rxrpc_send_ACK(call, RXRPC_ACK_IDLE, ack_serial, + rxrpc_propose_ack_input_data); else - rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, serial, - false, true, + rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, ack_serial, rxrpc_propose_ack_input_data); trace_rxrpc_notify_socket(call->debug_id, serial); rxrpc_notify_socket(call); -unlock: spin_unlock(&call->input_lock); rxrpc_free_skb(skb, rxrpc_skb_freed); _leave(" [queued]"); @@ -893,13 +883,11 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (buf.ack.reason == RXRPC_ACK_PING) { _proto("Rx ACK %%%u PING Request", ack_serial); - rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE, - ack_serial, true, true, - rxrpc_propose_ack_respond_to_ping); + rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, ack_serial, + rxrpc_propose_ack_respond_to_ping); } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) { - rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, - ack_serial, true, true, - rxrpc_propose_ack_respond_to_ack); + rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, ack_serial, + rxrpc_propose_ack_respond_to_ack); } /* If we get an EXCEEDS_WINDOW ACK from the server, it probably @@ -1011,9 +999,8 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) RXRPC_TX_ANNO_LAST && summary.nr_acks == call->tx_top - hard_ack && rxrpc_is_client_call(call)) - rxrpc_propose_ACK(call, RXRPC_ACK_PING, ack_serial, - false, true, - rxrpc_propose_ack_ping_for_lost_reply); + rxrpc_propose_ping(call, ack_serial, + rxrpc_propose_ack_ping_for_lost_reply); rxrpc_congestion_management(call, skb, &summary, acked_serial); out: diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index e95e75e785fb..a178f71e5082 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -97,6 +97,8 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, local->rxnet = rxnet; INIT_HLIST_NODE(&local->link); INIT_WORK(&local->processor, rxrpc_local_processor); + INIT_LIST_HEAD(&local->ack_tx_queue); + spin_lock_init(&local->ack_tx_lock); init_rwsem(&local->defrag_sem); skb_queue_head_init(&local->reject_queue); skb_queue_head_init(&local->event_queue); @@ -432,6 +434,11 @@ static void rxrpc_local_processor(struct work_struct *work) break; } + if (!list_empty(&local->ack_tx_queue)) { + rxrpc_transmit_ack_packets(local); + again = true; + } + if (!skb_queue_empty(&local->reject_queue)) { rxrpc_reject_packets(local); again = true; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 71885d741987..1edbc678e8be 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -16,14 +16,6 @@ #include #include "ar-internal.h" -struct rxrpc_ack_buffer { - struct rxrpc_wire_header whdr; - struct rxrpc_ackpacket ack; - u8 acks[255]; - u8 pad[3]; - struct rxrpc_ackinfo ackinfo; -}; - extern int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); static ssize_t do_udp_sendmsg(struct socket *sk, struct msghdr *msg, size_t len) @@ -82,22 +74,21 @@ static void rxrpc_set_keepalive(struct rxrpc_call *call) */ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn, struct rxrpc_call *call, - struct rxrpc_ack_buffer *pkt, + struct rxrpc_txbuf *txb, rxrpc_seq_t *_hard_ack, - rxrpc_seq_t *_top, - u8 reason) + rxrpc_seq_t *_top) { - rxrpc_serial_t serial; + struct rxrpc_ackinfo ackinfo; unsigned int tmp; rxrpc_seq_t hard_ack, top, seq; int ix; u32 mtu, jmax; - u8 *ackp = pkt->acks; + u8 *ackp = txb->acks; tmp = atomic_xchg(&call->ackr_nr_unacked, 0); tmp |= atomic_xchg(&call->ackr_nr_consumed, 0); - if (!tmp && (reason == RXRPC_ACK_DELAY || - reason == RXRPC_ACK_IDLE)) { + if (!tmp && (txb->ack.reason == RXRPC_ACK_DELAY || + txb->ack.reason == RXRPC_ACK_IDLE)) { rxrpc_inc_stat(call->rxnet, stat_tx_ack_skip); return 0; } @@ -105,24 +96,16 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn, rxrpc_inc_stat(call->rxnet, stat_tx_ack_fill); /* Barrier against rxrpc_input_data(). */ - serial = call->ackr_serial; hard_ack = READ_ONCE(call->rx_hard_ack); top = smp_load_acquire(&call->rx_top); *_hard_ack = hard_ack; *_top = top; - pkt->ack.bufferSpace = htons(0); - pkt->ack.maxSkew = htons(0); - pkt->ack.firstPacket = htonl(hard_ack + 1); - pkt->ack.previousPacket = htonl(call->ackr_highest_seq); - pkt->ack.serial = htonl(serial); - pkt->ack.reason = reason; - pkt->ack.nAcks = top - hard_ack; - - if (reason == RXRPC_ACK_PING) - pkt->whdr.flags |= RXRPC_REQUEST_ACK; + txb->ack.firstPacket = htonl(hard_ack + 1); + txb->ack.previousPacket = htonl(call->ackr_highest_seq); + txb->ack.nAcks = top - hard_ack; - if (after(top, hard_ack)) { + if (txb->ack.nAcks) { seq = hard_ack + 1; do { ix = seq & RXRPC_RXTX_BUFF_MASK; @@ -137,15 +120,16 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn, mtu = conn->params.peer->if_mtu; mtu -= conn->params.peer->hdrsize; jmax = (call->nr_jumbo_bad > 3) ? 1 : rxrpc_rx_jumbo_max; - pkt->ackinfo.rxMTU = htonl(rxrpc_rx_mtu); - pkt->ackinfo.maxMTU = htonl(mtu); - pkt->ackinfo.rwind = htonl(call->rx_winsize); - pkt->ackinfo.jumbo_max = htonl(jmax); + ackinfo.rxMTU = htonl(rxrpc_rx_mtu); + ackinfo.maxMTU = htonl(mtu); + ackinfo.rwind = htonl(call->rx_winsize); + ackinfo.jumbo_max = htonl(jmax); *ackp++ = 0; *ackp++ = 0; *ackp++ = 0; - return top - hard_ack + 3; + memcpy(ackp, &ackinfo, sizeof(ackinfo)); + return top - hard_ack + 3 + sizeof(ackinfo); } /* @@ -194,26 +178,21 @@ static void rxrpc_cancel_rtt_probe(struct rxrpc_call *call, /* * Send an ACK call packet. */ -int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, - rxrpc_serial_t *_serial) +static int rxrpc_send_ack_packet(struct rxrpc_local *local, struct rxrpc_txbuf *txb) { struct rxrpc_connection *conn; struct rxrpc_ack_buffer *pkt; + struct rxrpc_call *call = txb->call; struct msghdr msg; - struct kvec iov[2]; + struct kvec iov[1]; rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top; size_t len, n; int ret, rtt_slot = -1; - u8 reason; if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) return -ECONNRESET; - pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); - if (!pkt) - return -ENOMEM; - conn = call->conn; msg.msg_name = &call->peer->srx.transport; @@ -222,85 +201,93 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, msg.msg_controllen = 0; msg.msg_flags = 0; - pkt->whdr.epoch = htonl(conn->proto.epoch); - pkt->whdr.cid = htonl(call->cid); - pkt->whdr.callNumber = htonl(call->call_id); - pkt->whdr.seq = 0; - pkt->whdr.type = RXRPC_PACKET_TYPE_ACK; - pkt->whdr.flags = RXRPC_SLOW_START_OK | conn->out_clientflag; - pkt->whdr.userStatus = 0; - pkt->whdr.securityIndex = call->security_ix; - pkt->whdr._rsvd = 0; - pkt->whdr.serviceId = htons(call->service_id); + if (txb->ack.reason == RXRPC_ACK_PING) + txb->wire.flags |= RXRPC_REQUEST_ACK; spin_lock_bh(&call->lock); - if (ping) { - reason = RXRPC_ACK_PING; - } else { - reason = call->ackr_reason; - if (!call->ackr_reason) { - spin_unlock_bh(&call->lock); - ret = 0; - goto out; - } - call->ackr_reason = 0; - } - n = rxrpc_fill_out_ack(conn, call, pkt, &hard_ack, &top, reason); - + n = rxrpc_fill_out_ack(conn, call, txb, &hard_ack, &top); spin_unlock_bh(&call->lock); if (n == 0) { kfree(pkt); return 0; } - iov[0].iov_base = pkt; - iov[0].iov_len = sizeof(pkt->whdr) + sizeof(pkt->ack) + n; - iov[1].iov_base = &pkt->ackinfo; - iov[1].iov_len = sizeof(pkt->ackinfo); - len = iov[0].iov_len + iov[1].iov_len; + iov[0].iov_base = &txb->wire; + iov[0].iov_len = sizeof(txb->wire) + sizeof(txb->ack) + n; + len = iov[0].iov_len; serial = atomic_inc_return(&conn->serial); - pkt->whdr.serial = htonl(serial); + txb->wire.serial = htonl(serial); trace_rxrpc_tx_ack(call->debug_id, serial, - ntohl(pkt->ack.firstPacket), - ntohl(pkt->ack.serial), - pkt->ack.reason, pkt->ack.nAcks); - if (_serial) - *_serial = serial; + ntohl(txb->ack.firstPacket), + ntohl(txb->ack.serial), txb->ack.reason, txb->ack.nAcks); + if (txb->ack_why == rxrpc_propose_ack_ping_for_lost_ack) + call->acks_lost_ping = serial; - if (ping) + if (txb->ack.reason == RXRPC_ACK_PING) rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_ping); rxrpc_inc_stat(call->rxnet, stat_tx_ack_send); - iov_iter_kvec(&msg.msg_iter, WRITE, iov, 2, len); + iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len); ret = do_udp_sendmsg(conn->params.local->socket, &msg, len); call->peer->last_tx_at = ktime_get_seconds(); if (ret < 0) trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_ack); else - trace_rxrpc_tx_packet(call->debug_id, &pkt->whdr, + trace_rxrpc_tx_packet(call->debug_id, &txb->wire, rxrpc_tx_point_call_ack); rxrpc_tx_backoff(call, ret); if (call->state < RXRPC_CALL_COMPLETE) { - if (ret < 0) { + if (ret < 0) rxrpc_cancel_rtt_probe(call, serial, rtt_slot); - rxrpc_propose_ACK(call, pkt->ack.reason, - ntohl(pkt->ack.serial), - false, true, - rxrpc_propose_ack_retry_tx); - } - rxrpc_set_keepalive(call); } -out: kfree(pkt); return ret; } +/* + * ACK transmitter for a local endpoint. The UDP socket locks around each + * transmission, so we can only transmit one packet at a time, ACK, DATA or + * otherwise. + */ +void rxrpc_transmit_ack_packets(struct rxrpc_local *local) +{ + LIST_HEAD(queue); + int ret; + + trace_rxrpc_local(local->debug_id, rxrpc_local_tx_ack, + refcount_read(&local->ref), NULL); + + if (list_empty(&local->ack_tx_queue)) + return; + + spin_lock_bh(&local->ack_tx_lock); + list_splice_tail_init(&local->ack_tx_queue, &queue); + spin_unlock_bh(&local->ack_tx_lock); + + while (!list_empty(&queue)) { + struct rxrpc_txbuf *txb = + list_entry(queue.next, struct rxrpc_txbuf, tx_link); + + ret = rxrpc_send_ack_packet(local, txb); + if (ret < 0 && ret != -ECONNRESET) { + spin_lock_bh(&local->ack_tx_lock); + list_splice_init(&queue, &local->ack_tx_queue); + spin_unlock_bh(&local->ack_tx_lock); + break; + } + + list_del_init(&txb->tx_link); + rxrpc_put_call(txb->call, rxrpc_call_put); + rxrpc_put_txbuf(txb, rxrpc_txbuf_put_ack_tx); + } +} + /* * Send an ABORT call packet. */ diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index a378e7a672a8..104dd4a29f05 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -189,7 +189,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) ASSERTCMP(call->rx_hard_ack, ==, call->rx_top); if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { - rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, serial, false, true, + rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, serial, rxrpc_propose_ack_terminal_ack); //rxrpc_send_ack_packet(call, false, NULL); } @@ -206,7 +206,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) call->state = RXRPC_CALL_SERVER_ACK_REQUEST; call->expect_req_by = jiffies + MAX_JIFFY_OFFSET; write_unlock_bh(&call->state_lock); - rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, serial, false, true, + rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, serial, rxrpc_propose_ack_processing_op); break; default: @@ -259,12 +259,11 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) rxrpc_end_rx_phase(call, serial); } else { /* Check to see if there's an ACK that needs sending. */ - if (atomic_inc_return(&call->ackr_nr_consumed) > 2) - rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, serial, - true, false, - rxrpc_propose_ack_rotate_rx); - if (call->ackr_reason && call->ackr_reason != RXRPC_ACK_DELAY) - rxrpc_send_ack_packet(call, false, NULL); + if (atomic_inc_return(&call->ackr_nr_consumed) > 2) { + rxrpc_send_ACK(call, RXRPC_ACK_IDLE, serial, + rxrpc_propose_ack_rotate_rx); + rxrpc_transmit_ack_packets(call->peer->local); + } } } @@ -363,10 +362,6 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, unsigned int rx_pkt_offset, rx_pkt_len; int ix, copy, ret = -EAGAIN, ret2; - if (test_and_clear_bit(RXRPC_CALL_RX_UNDERRUN, &call->flags) && - call->ackr_reason) - rxrpc_send_ack_packet(call, false, NULL); - rx_pkt_offset = call->rx_pkt_offset; rx_pkt_len = call->rx_pkt_len; rx_pkt_last = call->rx_pkt_last; @@ -389,6 +384,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (!skb) { trace_rxrpc_recvmsg(call, rxrpc_recvmsg_hole, seq, rx_pkt_offset, rx_pkt_len, 0); + rxrpc_transmit_ack_packets(call->peer->local); break; } smp_rmb(); @@ -604,6 +600,7 @@ try_again: if (ret == -EAGAIN) ret = 0; + rxrpc_transmit_ack_packets(call->peer->local); if (after(call->rx_top, call->rx_hard_ack) && call->rxtx_buffer[(call->rx_hard_ack + 1) & RXRPC_RXTX_BUFF_MASK]) rxrpc_notify_socket(call); @@ -734,17 +731,7 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, read_phase_complete: ret = 1; out: - switch (call->ackr_reason) { - case RXRPC_ACK_IDLE: - break; - case RXRPC_ACK_DELAY: - if (ret != -EAGAIN) - break; - fallthrough; - default: - rxrpc_send_ack_packet(call, false, NULL); - } - + rxrpc_transmit_ack_packets(call->peer->local); if (_service) *_service = call->service_id; mutex_unlock(&call->user_mutex); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index b2d28aa12e10..ef4949259020 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -332,9 +332,7 @@ reload: rxrpc_see_skb(skb, rxrpc_skb_seen); do { - /* Check to see if there's a ping ACK to reply to. */ - if (call->ackr_reason == RXRPC_ACK_PING_RESPONSE) - rxrpc_send_ack_packet(call, false, NULL); + rxrpc_transmit_ack_packets(call->peer->local); if (!skb) { size_t remain, bufsize, chunk, offset; diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index 66d922ad65d0..45a7b48a5e10 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -33,6 +33,7 @@ struct rxrpc_txbuf *rxrpc_alloc_txbuf(struct rxrpc_call *call, u8 packet_type, txb->len = 0; txb->offset = 0; txb->flags = 0; + txb->ack_why = 0; txb->seq = call->tx_top + 1; txb->wire.epoch = htonl(call->conn->proto.epoch); txb->wire.cid = htonl(call->cid); -- cgit v1.2.3-70-g09d2 From 530403d9ba1c3f51c721a394f642e56309072295 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 30 Jan 2020 21:48:14 +0000 Subject: rxrpc: Clean up ACK handling Clean up the rxrpc_propose_ACK() function. If deferred PING ACK proposal is split out, it's only really needed for deferred DELAY ACKs. All other ACKs, bar terminal IDLE ACK are sent immediately. The deferred IDLE ACK submission can be handled by conversion of a DELAY ACK into an IDLE ACK if there's nothing to be SACK'd. Also, because there's a delay between an ACK being generated and being transmitted, it's possible that other ACKs of the same type will be generated during that interval. Apart from the ACK time and the serial number responded to, most of the ACK body, including window and SACK parameters, are not filled out till the point of transmission - so we can avoid generating a new ACK if there's one pending that will cover the SACK data we need to convey. Therefore, don't propose a new DELAY or IDLE ACK for a call if there's one already pending. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 52 ++++++++++++++++-------- net/rxrpc/ar-internal.h | 10 ++--- net/rxrpc/call_event.c | 95 ++++++++++++-------------------------------- net/rxrpc/call_object.c | 2 +- net/rxrpc/input.c | 8 ++-- net/rxrpc/misc.c | 18 --------- net/rxrpc/output.c | 7 ++++ net/rxrpc/protocol.h | 7 ---- net/rxrpc/recvmsg.c | 14 +++---- net/rxrpc/sendmsg.c | 2 +- net/rxrpc/sysctl.c | 9 ----- 11 files changed, 86 insertions(+), 138 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 1597ff7ad97e..d32e9858c682 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -158,6 +158,7 @@ #define rxrpc_propose_ack_traces \ EM(rxrpc_propose_ack_client_tx_end, "ClTxEnd") \ EM(rxrpc_propose_ack_input_data, "DataIn ") \ + EM(rxrpc_propose_ack_input_data_hole, "DataInH") \ EM(rxrpc_propose_ack_ping_for_check_life, "ChkLife") \ EM(rxrpc_propose_ack_ping_for_keepalive, "KeepAlv") \ EM(rxrpc_propose_ack_ping_for_lost_ack, "LostAck") \ @@ -170,11 +171,6 @@ EM(rxrpc_propose_ack_rotate_rx, "RxAck ") \ E_(rxrpc_propose_ack_terminal_ack, "ClTerm ") -#define rxrpc_propose_ack_outcomes \ - EM(rxrpc_propose_ack_subsume, " Subsume") \ - EM(rxrpc_propose_ack_update, " Update") \ - E_(rxrpc_propose_ack_use, " New") - #define rxrpc_congest_modes \ EM(RXRPC_CALL_CONGEST_AVOIDANCE, "CongAvoid") \ EM(RXRPC_CALL_FAST_RETRANSMIT, "FastReTx ") \ @@ -313,7 +309,6 @@ rxrpc_congest_changes; rxrpc_congest_modes; rxrpc_conn_traces; rxrpc_local_traces; -rxrpc_propose_ack_outcomes; rxrpc_propose_ack_traces; rxrpc_receive_traces; rxrpc_recvmsg_traces; @@ -1012,7 +1007,7 @@ TRACE_EVENT(rxrpc_timer, __entry->call = call->debug_id; __entry->why = why; __entry->now = now; - __entry->ack_at = call->ack_at; + __entry->ack_at = call->delay_ack_at; __entry->ack_lost_at = call->ack_lost_at; __entry->resend_at = call->resend_at; __entry->expect_rx_by = call->expect_rx_by; @@ -1054,7 +1049,7 @@ TRACE_EVENT(rxrpc_timer_expired, TP_fast_assign( __entry->call = call->debug_id; __entry->now = now; - __entry->ack_at = call->ack_at; + __entry->ack_at = call->delay_ack_at; __entry->ack_lost_at = call->ack_lost_at; __entry->resend_at = call->resend_at; __entry->expect_rx_by = call->expect_rx_by; @@ -1098,17 +1093,15 @@ TRACE_EVENT(rxrpc_rx_lose, TRACE_EVENT(rxrpc_propose_ack, TP_PROTO(struct rxrpc_call *call, enum rxrpc_propose_ack_trace why, - u8 ack_reason, rxrpc_serial_t serial, - enum rxrpc_propose_ack_outcome outcome), + u8 ack_reason, rxrpc_serial_t serial), - TP_ARGS(call, why, ack_reason, serial, outcome), + TP_ARGS(call, why, ack_reason, serial), TP_STRUCT__entry( __field(unsigned int, call ) __field(enum rxrpc_propose_ack_trace, why ) __field(rxrpc_serial_t, serial ) __field(u8, ack_reason ) - __field(enum rxrpc_propose_ack_outcome, outcome ) ), TP_fast_assign( @@ -1116,15 +1109,13 @@ TRACE_EVENT(rxrpc_propose_ack, __entry->why = why; __entry->serial = serial; __entry->ack_reason = ack_reason; - __entry->outcome = outcome; ), - TP_printk("c=%08x %s %s r=%08x%s", + TP_printk("c=%08x %s %s r=%08x", __entry->call, __print_symbolic(__entry->why, rxrpc_propose_ack_traces), __print_symbolic(__entry->ack_reason, rxrpc_ack_names), - __entry->serial, - __print_symbolic(__entry->outcome, rxrpc_propose_ack_outcomes)) + __entry->serial) ); TRACE_EVENT(rxrpc_send_ack, @@ -1154,6 +1145,35 @@ TRACE_EVENT(rxrpc_send_ack, __entry->serial) ); +TRACE_EVENT(rxrpc_drop_ack, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_propose_ack_trace why, + u8 ack_reason, rxrpc_serial_t serial, bool nobuf), + + TP_ARGS(call, why, ack_reason, serial, nobuf), + + TP_STRUCT__entry( + __field(unsigned int, call ) + __field(enum rxrpc_propose_ack_trace, why ) + __field(rxrpc_serial_t, serial ) + __field(u8, ack_reason ) + __field(bool, nobuf ) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->why = why; + __entry->serial = serial; + __entry->ack_reason = ack_reason; + __entry->nobuf = nobuf; + ), + + TP_printk("c=%08x %s %s r=%08x nbf=%u", + __entry->call, + __print_symbolic(__entry->why, rxrpc_propose_ack_traces), + __print_symbolic(__entry->ack_reason, rxrpc_ack_names), + __entry->serial, __entry->nobuf) + ); + TRACE_EVENT(rxrpc_retransmit, TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, u8 annotation, s64 expiry), diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 802a8f372af2..5a81545a58d5 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -516,6 +516,8 @@ enum rxrpc_call_flag { RXRPC_CALL_DISCONNECTED, /* The call has been disconnected */ RXRPC_CALL_KERNEL, /* The call was made by the kernel */ RXRPC_CALL_UPGRADE, /* Service upgrade was requested for the call */ + RXRPC_CALL_DELAY_ACK_PENDING, /* DELAY ACK generation is pending */ + RXRPC_CALL_IDLE_ACK_PENDING, /* IDLE ACK generation is pending */ }; /* @@ -582,7 +584,7 @@ struct rxrpc_call { struct rxrpc_net *rxnet; /* Network namespace to which call belongs */ const struct rxrpc_security *security; /* applied security module */ struct mutex user_mutex; /* User access mutex */ - unsigned long ack_at; /* When deferred ACK needs to happen */ + unsigned long delay_ack_at; /* When DELAY ACK needs to happen */ unsigned long ack_lost_at; /* When ACK is figured as lost */ unsigned long resend_at; /* When next resend needs to happen */ unsigned long ping_at; /* When next to send a ping */ @@ -834,7 +836,8 @@ int rxrpc_user_charge_accept(struct rxrpc_sock *, unsigned long); void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial, enum rxrpc_propose_ack_trace why); void rxrpc_send_ACK(struct rxrpc_call *, u8, rxrpc_serial_t, enum rxrpc_propose_ack_trace); -void rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, enum rxrpc_propose_ack_trace); +void rxrpc_propose_delay_ACK(struct rxrpc_call *, rxrpc_serial_t, + enum rxrpc_propose_ack_trace); void rxrpc_process_call(struct work_struct *); void rxrpc_reduce_call_timer(struct rxrpc_call *call, @@ -1016,15 +1019,12 @@ static inline bool __rxrpc_use_local(struct rxrpc_local *local) * misc.c */ extern unsigned int rxrpc_max_backlog __read_mostly; -extern unsigned long rxrpc_requested_ack_delay; extern unsigned long rxrpc_soft_ack_delay; extern unsigned long rxrpc_idle_ack_delay; extern unsigned int rxrpc_rx_window_size; extern unsigned int rxrpc_rx_mtu; extern unsigned int rxrpc_rx_jumbo_max; -extern const s8 rxrpc_ack_priority[]; - /* * net_ns.c */ diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 67b54ad914a1..36f60ac1d95d 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -29,70 +29,29 @@ void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial, spin_lock_bh(&call->lock); if (time_before(ping_at, call->ping_at)) { - rxrpc_inc_stat(call->rxnet, stat_tx_acks[RXRPC_ACK_PING]); WRITE_ONCE(call->ping_at, ping_at); rxrpc_reduce_call_timer(call, ping_at, now, rxrpc_timer_set_for_ping); - trace_rxrpc_propose_ack(call, why, RXRPC_ACK_PING, serial, - rxrpc_propose_ack_use); + trace_rxrpc_propose_ack(call, why, RXRPC_ACK_PING, serial); } spin_unlock_bh(&call->lock); } /* - * propose an ACK be sent + * Propose a DELAY ACK be sent in the future. */ -static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, - u32 serial, enum rxrpc_propose_ack_trace why) +static void __rxrpc_propose_delay_ACK(struct rxrpc_call *call, + rxrpc_serial_t serial, + enum rxrpc_propose_ack_trace why) { - enum rxrpc_propose_ack_outcome outcome = rxrpc_propose_ack_use; unsigned long expiry = rxrpc_soft_ack_delay; unsigned long now = jiffies, ack_at; - s8 prior = rxrpc_ack_priority[ack_reason]; - - rxrpc_inc_stat(call->rxnet, stat_tx_acks[ack_reason]); - - /* Update DELAY, IDLE, REQUESTED and PING_RESPONSE ACK serial - * numbers, but we don't alter the timeout. - */ - _debug("prior %u %u vs %u %u", - ack_reason, prior, - call->ackr_reason, rxrpc_ack_priority[call->ackr_reason]); - if (ack_reason == call->ackr_reason) { - if (RXRPC_ACK_UPDATEABLE & (1 << ack_reason)) { - outcome = rxrpc_propose_ack_update; - call->ackr_serial = serial; - } - } else if (prior > rxrpc_ack_priority[call->ackr_reason]) { - call->ackr_reason = ack_reason; - call->ackr_serial = serial; - } else { - outcome = rxrpc_propose_ack_subsume; - } - - switch (ack_reason) { - case RXRPC_ACK_REQUESTED: - if (rxrpc_requested_ack_delay < expiry) - expiry = rxrpc_requested_ack_delay; - break; - - case RXRPC_ACK_DELAY: - if (rxrpc_soft_ack_delay < expiry) - expiry = rxrpc_soft_ack_delay; - break; - - case RXRPC_ACK_IDLE: - if (rxrpc_idle_ack_delay < expiry) - expiry = rxrpc_idle_ack_delay; - break; - - default: - WARN_ON(1); - return; - } + call->ackr_serial = serial; + if (rxrpc_soft_ack_delay < expiry) + expiry = rxrpc_soft_ack_delay; if (call->peer->srtt_us != 0) ack_at = usecs_to_jiffies(call->peer->srtt_us >> 3); else @@ -100,23 +59,23 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, ack_at += READ_ONCE(call->tx_backoff); ack_at += now; - if (time_before(ack_at, call->ack_at)) { - WRITE_ONCE(call->ack_at, ack_at); + if (time_before(ack_at, call->delay_ack_at)) { + WRITE_ONCE(call->delay_ack_at, ack_at); rxrpc_reduce_call_timer(call, ack_at, now, rxrpc_timer_set_for_ack); } - trace_rxrpc_propose_ack(call, why, ack_reason, serial, outcome); + trace_rxrpc_propose_ack(call, why, RXRPC_ACK_DELAY, serial); } /* - * propose an ACK be sent, locking the call structure + * Propose a DELAY ACK be sent, locking the call structure */ -void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, u32 serial, - enum rxrpc_propose_ack_trace why) +void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial, + enum rxrpc_propose_ack_trace why) { spin_lock_bh(&call->lock); - __rxrpc_propose_ACK(call, ack_reason, serial, why); + __rxrpc_propose_delay_ACK(call, serial, why); spin_unlock_bh(&call->lock); } @@ -131,6 +90,11 @@ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) return; + if (ack_reason == RXRPC_ACK_DELAY && + test_and_set_bit(RXRPC_CALL_DELAY_ACK_PENDING, &call->flags)) { + trace_rxrpc_drop_ack(call, why, ack_reason, serial, false); + return; + } rxrpc_inc_stat(call->rxnet, stat_tx_acks[ack_reason]); @@ -319,7 +283,6 @@ void rxrpc_process_call(struct work_struct *work) unsigned long now, next, t; unsigned int iterations = 0; rxrpc_serial_t ackr_serial; - u8 ackr_reason; rxrpc_see_call(call); @@ -364,19 +327,13 @@ recheck_state: set_bit(RXRPC_CALL_EV_EXPIRED, &call->events); } - t = READ_ONCE(call->ack_at); + t = READ_ONCE(call->delay_ack_at); if (time_after_eq(now, t)) { trace_rxrpc_timer(call, rxrpc_timer_exp_ack, now); - cmpxchg(&call->ack_at, t, now + MAX_JIFFY_OFFSET); - spin_lock_bh(&call->lock); - ackr_reason = call->ackr_reason; - ackr_serial = call->ackr_serial; - call->ackr_reason = 0; - call->ackr_serial = 0; - spin_unlock_bh(&call->lock); - if (ackr_reason) - rxrpc_send_ACK(call, ackr_reason, ackr_serial, - rxrpc_propose_ack_ping_for_lost_ack); + cmpxchg(&call->delay_ack_at, t, now + MAX_JIFFY_OFFSET); + ackr_serial = xchg(&call->ackr_serial, 0); + rxrpc_send_ACK(call, RXRPC_ACK_DELAY, ackr_serial, + rxrpc_propose_ack_ping_for_lost_ack); } t = READ_ONCE(call->ack_lost_at); @@ -441,7 +398,7 @@ recheck_state: set(call->expect_req_by); set(call->expect_term_by); - set(call->ack_at); + set(call->delay_ack_at); set(call->ack_lost_at); set(call->resend_at); set(call->keepalive_at); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 8290d94e9233..8f9e88897197 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -222,7 +222,7 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call) unsigned long now = jiffies; unsigned long j = now + MAX_JIFFY_OFFSET; - call->ack_at = j; + call->delay_ack_at = j; call->ack_lost_at = j; call->resend_at = j; call->ping_at = j; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index e23f66ef8c06..4df1bdc4de6c 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -299,7 +299,7 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call) now = jiffies; timo = now + MAX_JIFFY_OFFSET; WRITE_ONCE(call->resend_at, timo); - WRITE_ONCE(call->ack_at, timo); + WRITE_ONCE(call->delay_ack_at, timo); trace_rxrpc_timer(call, rxrpc_timer_init_for_reply, now); } @@ -542,7 +542,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) /* Send an immediate ACK if we fill in a hole */ if (!acked) { rxrpc_send_ACK(call, RXRPC_ACK_DELAY, serial, - rxrpc_propose_ack_input_data); + rxrpc_propose_ack_input_data_hole); acked = true; } } @@ -584,8 +584,8 @@ unlock: rxrpc_send_ACK(call, RXRPC_ACK_IDLE, ack_serial, rxrpc_propose_ack_input_data); else - rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, ack_serial, - rxrpc_propose_ack_input_data); + rxrpc_propose_delay_ACK(call, ack_serial, + rxrpc_propose_ack_input_data); trace_rxrpc_notify_socket(call->debug_id, serial); rxrpc_notify_socket(call); diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index d4144fd86f84..f5f03f27bd6c 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -16,12 +16,6 @@ */ unsigned int rxrpc_max_backlog __read_mostly = 10; -/* - * How long to wait before scheduling ACK generation after seeing a - * packet with RXRPC_REQUEST_ACK set (in jiffies). - */ -unsigned long rxrpc_requested_ack_delay = 1; - /* * How long to wait before scheduling an ACK with subtype DELAY (in jiffies). * @@ -62,15 +56,3 @@ unsigned int rxrpc_rx_mtu = 5692; * sender that we're willing to handle. */ unsigned int rxrpc_rx_jumbo_max = 4; - -const s8 rxrpc_ack_priority[] = { - [0] = 0, - [RXRPC_ACK_DELAY] = 1, - [RXRPC_ACK_REQUESTED] = 2, - [RXRPC_ACK_IDLE] = 3, - [RXRPC_ACK_DUPLICATE] = 4, - [RXRPC_ACK_OUT_OF_SEQUENCE] = 5, - [RXRPC_ACK_EXCEEDS_WINDOW] = 6, - [RXRPC_ACK_NOSPACE] = 7, - [RXRPC_ACK_PING_RESPONSE] = 8, -}; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 1edbc678e8be..d35657b659ad 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -115,6 +115,8 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn, *ackp++ = RXRPC_ACK_TYPE_NACK; seq++; } while (before_eq(seq, top)); + } else if (txb->ack.reason == RXRPC_ACK_DELAY) { + txb->ack.reason = RXRPC_ACK_IDLE; } mtu = conn->params.peer->if_mtu; @@ -204,6 +206,11 @@ static int rxrpc_send_ack_packet(struct rxrpc_local *local, struct rxrpc_txbuf * if (txb->ack.reason == RXRPC_ACK_PING) txb->wire.flags |= RXRPC_REQUEST_ACK; + if (txb->ack.reason == RXRPC_ACK_DELAY) + clear_bit(RXRPC_CALL_DELAY_ACK_PENDING, &call->flags); + if (txb->ack.reason == RXRPC_ACK_IDLE) + clear_bit(RXRPC_CALL_IDLE_ACK_PENDING, &call->flags); + spin_lock_bh(&call->lock); n = rxrpc_fill_out_ack(conn, call, txb, &hard_ack, &top); spin_unlock_bh(&call->lock); diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h index d2cf8e1d218f..f3d4e2700901 100644 --- a/net/rxrpc/protocol.h +++ b/net/rxrpc/protocol.h @@ -132,13 +132,6 @@ struct rxrpc_ackpacket { } __packed; -/* Some ACKs refer to specific packets and some are general and can be updated. */ -#define RXRPC_ACK_UPDATEABLE ((1 << RXRPC_ACK_REQUESTED) | \ - (1 << RXRPC_ACK_PING_RESPONSE) | \ - (1 << RXRPC_ACK_DELAY) | \ - (1 << RXRPC_ACK_IDLE)) - - /* * ACK packets can have a further piece of information tagged on the end */ diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 104dd4a29f05..46e784edc0f4 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -188,11 +188,8 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) trace_rxrpc_receive(call, rxrpc_receive_end, 0, call->rx_top); ASSERTCMP(call->rx_hard_ack, ==, call->rx_top); - if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { - rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, serial, - rxrpc_propose_ack_terminal_ack); - //rxrpc_send_ack_packet(call, false, NULL); - } + if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) + rxrpc_propose_delay_ACK(call, serial, rxrpc_propose_ack_terminal_ack); write_lock_bh(&call->state_lock); @@ -206,8 +203,8 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) call->state = RXRPC_CALL_SERVER_ACK_REQUEST; call->expect_req_by = jiffies + MAX_JIFFY_OFFSET; write_unlock_bh(&call->state_lock); - rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, serial, - rxrpc_propose_ack_processing_op); + rxrpc_propose_delay_ACK(call, serial, + rxrpc_propose_ack_processing_op); break; default: write_unlock_bh(&call->state_lock); @@ -259,7 +256,8 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) rxrpc_end_rx_phase(call, serial); } else { /* Check to see if there's an ACK that needs sending. */ - if (atomic_inc_return(&call->ackr_nr_consumed) > 2) { + if (atomic_inc_return(&call->ackr_nr_consumed) > 2 && + !test_and_set_bit(RXRPC_CALL_IDLE_ACK_PENDING, &call->flags)) { rxrpc_send_ACK(call, RXRPC_ACK_IDLE, serial, rxrpc_propose_ack_rotate_rx); rxrpc_transmit_ack_packets(call->peer->local); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index ef4949259020..e32805a49324 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -234,7 +234,7 @@ static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, case RXRPC_CALL_SERVER_ACK_REQUEST: call->state = RXRPC_CALL_SERVER_SEND_REPLY; now = jiffies; - WRITE_ONCE(call->ack_at, now + MAX_JIFFY_OFFSET); + WRITE_ONCE(call->delay_ack_at, now + MAX_JIFFY_OFFSET); if (call->ackr_reason == RXRPC_ACK_DELAY) call->ackr_reason = 0; trace_rxrpc_timer(call, rxrpc_timer_init_for_send_reply, now); diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index 555e0910786b..2bd987364e44 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -26,15 +26,6 @@ static const unsigned long max_jiffies = MAX_JIFFY_OFFSET; */ static struct ctl_table rxrpc_sysctl_table[] = { /* Values measured in milliseconds but used in jiffies */ - { - .procname = "req_ack_delay", - .data = &rxrpc_requested_ack_delay, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_ms_jiffies_minmax, - .extra1 = (void *)&one_jiffy, - .extra2 = (void *)&max_jiffies, - }, { .procname = "soft_ack_delay", .data = &rxrpc_soft_ack_delay, -- cgit v1.2.3-70-g09d2 From faf92e8d53f5f03842da25af971a3f0ef88ffba2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 7 Oct 2022 17:22:40 +0100 Subject: rxrpc: Split the rxrpc_recvmsg tracepoint Split the rxrpc_recvmsg tracepoint so that the tracepoints that are about data packet processing (and which have extra pieces of information) are separate from the tracepoint that shows the general flow of recvmsg(). Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 24 ++++++++++++++++++++++++ net/rxrpc/recvmsg.c | 37 ++++++++++++++++++------------------- 2 files changed, 42 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index d32e9858c682..84464b29e54a 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -885,6 +885,30 @@ TRACE_EVENT(rxrpc_receive, ); TRACE_EVENT(rxrpc_recvmsg, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_recvmsg_trace why, + int ret), + + TP_ARGS(call, why, ret), + + TP_STRUCT__entry( + __field(unsigned int, call ) + __field(enum rxrpc_recvmsg_trace, why ) + __field(int, ret ) + ), + + TP_fast_assign( + __entry->call = call ? call->debug_id : 0; + __entry->why = why; + __entry->ret = ret; + ), + + TP_printk("c=%08x %s ret=%d", + __entry->call, + __print_symbolic(__entry->why, rxrpc_recvmsg_traces), + __entry->ret) + ); + +TRACE_EVENT(rxrpc_recvdata, TP_PROTO(struct rxrpc_call *call, enum rxrpc_recvmsg_trace why, rxrpc_seq_t seq, unsigned int offset, unsigned int len, int ret), diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 46e784edc0f4..77cde6311559 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -173,8 +173,8 @@ static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg) break; } - trace_rxrpc_recvmsg(call, rxrpc_recvmsg_terminal, call->rx_hard_ack, - call->rx_pkt_offset, call->rx_pkt_len, ret); + trace_rxrpc_recvdata(call, rxrpc_recvmsg_terminal, call->rx_hard_ack, + call->rx_pkt_offset, call->rx_pkt_len, ret); return ret; } @@ -380,8 +380,8 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, ix = seq & RXRPC_RXTX_BUFF_MASK; skb = call->rxtx_buffer[ix]; if (!skb) { - trace_rxrpc_recvmsg(call, rxrpc_recvmsg_hole, seq, - rx_pkt_offset, rx_pkt_len, 0); + trace_rxrpc_recvdata(call, rxrpc_recvmsg_hole, seq, + rx_pkt_offset, rx_pkt_len, 0); rxrpc_transmit_ack_packets(call->peer->local); break; } @@ -404,15 +404,15 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, &call->rxtx_annotations[ix], &rx_pkt_offset, &rx_pkt_len, &rx_pkt_last); - trace_rxrpc_recvmsg(call, rxrpc_recvmsg_next, seq, - rx_pkt_offset, rx_pkt_len, ret2); + trace_rxrpc_recvdata(call, rxrpc_recvmsg_next, seq, + rx_pkt_offset, rx_pkt_len, ret2); if (ret2 < 0) { ret = ret2; goto out; } } else { - trace_rxrpc_recvmsg(call, rxrpc_recvmsg_cont, seq, - rx_pkt_offset, rx_pkt_len, 0); + trace_rxrpc_recvdata(call, rxrpc_recvmsg_cont, seq, + rx_pkt_offset, rx_pkt_len, 0); } /* We have to handle short, empty and used-up DATA packets. */ @@ -435,8 +435,8 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, } if (rx_pkt_len > 0) { - trace_rxrpc_recvmsg(call, rxrpc_recvmsg_full, seq, - rx_pkt_offset, rx_pkt_len, 0); + trace_rxrpc_recvdata(call, rxrpc_recvmsg_full, seq, + rx_pkt_offset, rx_pkt_len, 0); ASSERTCMP(*_offset, ==, len); ret = 0; break; @@ -464,8 +464,8 @@ out: call->rx_pkt_last = rx_pkt_last; } done: - trace_rxrpc_recvmsg(call, rxrpc_recvmsg_data_return, seq, - rx_pkt_offset, rx_pkt_len, ret); + trace_rxrpc_recvdata(call, rxrpc_recvmsg_data_return, seq, + rx_pkt_offset, rx_pkt_len, ret); if (ret == -EAGAIN) set_bit(RXRPC_CALL_RX_UNDERRUN, &call->flags); return ret; @@ -488,7 +488,7 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, DEFINE_WAIT(wait); - trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_enter, 0, 0, 0, 0); + trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_enter, 0); if (flags & (MSG_OOB | MSG_TRUNC)) return -EOPNOTSUPP; @@ -525,8 +525,7 @@ try_again: if (list_empty(&rx->recvmsg_q)) { if (signal_pending(current)) goto wait_interrupted; - trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_wait, - 0, 0, 0, 0); + trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_wait, 0); timeo = schedule_timeout(timeo); } finish_wait(sk_sleep(&rx->sk), &wait); @@ -545,7 +544,7 @@ try_again: rxrpc_get_call(call, rxrpc_call_got); write_unlock_bh(&rx->recvmsg_lock); - trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0, 0, 0, 0); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0); /* We're going to drop the socket lock, so we need to lock the call * against interference by sendmsg. @@ -630,7 +629,7 @@ try_again: error_unlock_call: mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put); - trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, ret); return ret; error_requeue_call: @@ -638,14 +637,14 @@ error_requeue_call: write_lock_bh(&rx->recvmsg_lock); list_add(&call->recvmsg_link, &rx->recvmsg_q); write_unlock_bh(&rx->recvmsg_lock); - trace_rxrpc_recvmsg(call, rxrpc_recvmsg_requeue, 0, 0, 0, 0); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_requeue, 0); } else { rxrpc_put_call(call, rxrpc_call_put); } error_no_call: release_sock(&rx->sk); error_trace: - trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret); + trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, ret); return ret; wait_interrupted: -- cgit v1.2.3-70-g09d2 From d4d02d8bb5c412d977af7ea7c7ea91977a6a64dc Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 7 Oct 2022 17:44:39 +0100 Subject: rxrpc: Clone received jumbo subpackets and queue separately Split up received jumbo packets into separate skbuffs by cloning the original skbuff for each subpacket and setting the offset and length of the data in that subpacket in the skbuff's private data. The subpackets are then placed on the recvmsg queue separately. The security class then gets to revise the offset and length to remove its metadata. If we fail to clone a packet, we just drop it and let the peer resend it. The original packet gets used for the final subpacket. This should make it easier to handle parallel decryption of the subpackets. It also simplifies the handling of lost or misordered packets in the queuing/buffering loop as the possibility of overlapping jumbo packets no longer needs to be considered. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 12 +- net/rxrpc/ar-internal.h | 32 +--- net/rxrpc/input.c | 401 ++++++++++++++++++++----------------------- net/rxrpc/insecure.c | 13 +- net/rxrpc/output.c | 2 +- net/rxrpc/protocol.h | 2 +- net/rxrpc/recvmsg.c | 104 ++--------- net/rxrpc/rxkad.c | 96 +++-------- 8 files changed, 245 insertions(+), 417 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 84464b29e54a..03a984e661bc 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -18,6 +18,7 @@ */ #define rxrpc_skb_traces \ EM(rxrpc_skb_cleaned, "CLN") \ + EM(rxrpc_skb_cloned_jumbo, "CLJ") \ EM(rxrpc_skb_freed, "FRE") \ EM(rxrpc_skb_got, "GOT") \ EM(rxrpc_skb_lost, "*L*") \ @@ -630,16 +631,15 @@ TRACE_EVENT(rxrpc_transmit, TRACE_EVENT(rxrpc_rx_data, TP_PROTO(unsigned int call, rxrpc_seq_t seq, - rxrpc_serial_t serial, u8 flags, u8 anno), + rxrpc_serial_t serial, u8 flags), - TP_ARGS(call, seq, serial, flags, anno), + TP_ARGS(call, seq, serial, flags), TP_STRUCT__entry( __field(unsigned int, call ) __field(rxrpc_seq_t, seq ) __field(rxrpc_serial_t, serial ) __field(u8, flags ) - __field(u8, anno ) ), TP_fast_assign( @@ -647,15 +647,13 @@ TRACE_EVENT(rxrpc_rx_data, __entry->seq = seq; __entry->serial = serial; __entry->flags = flags; - __entry->anno = anno; ), - TP_printk("c=%08x DATA %08x q=%08x fl=%02x a=%02x", + TP_printk("c=%08x DATA %08x q=%08x fl=%02x", __entry->call, __entry->serial, __entry->seq, - __entry->flags, - __entry->anno) + __entry->flags) ); TRACE_EVENT(rxrpc_rx_ack, diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 5a81545a58d5..e93ed18816b1 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -195,19 +195,14 @@ struct rxrpc_host_header { * - max 48 bytes (struct sk_buff::cb) */ struct rxrpc_skb_priv { - atomic_t nr_ring_pins; /* Number of rxtx ring pins */ - u8 nr_subpackets; /* Number of subpackets */ - u8 rx_flags; /* Received packet flags */ -#define RXRPC_SKB_INCL_LAST 0x01 /* - Includes last packet */ - union { - int remain; /* amount of space remaining for next write */ - - /* List of requested ACKs on subpackets */ - unsigned long rx_req_ack[(RXRPC_MAX_NR_JUMBO + BITS_PER_LONG - 1) / - BITS_PER_LONG]; - }; - - struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */ + u16 remain; + u16 offset; /* Offset of data */ + u16 len; /* Length of data */ + u8 rx_flags; /* Received packet flags */ + u8 flags; +#define RXRPC_RX_VERIFIED 0x01 + + struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */ }; #define rxrpc_skb(__skb) ((struct rxrpc_skb_priv *) &(__skb)->cb) @@ -252,16 +247,11 @@ struct rxrpc_security { int (*secure_packet)(struct rxrpc_call *, struct sk_buff *, size_t); /* verify the security on a received packet */ - int (*verify_packet)(struct rxrpc_call *, struct sk_buff *, - unsigned int, unsigned int, rxrpc_seq_t, u16); + int (*verify_packet)(struct rxrpc_call *, struct sk_buff *); /* Free crypto request on a call */ void (*free_call_crypto)(struct rxrpc_call *); - /* Locate the data in a received packet that has been verified. */ - void (*locate_data)(struct rxrpc_call *, struct sk_buff *, - unsigned int *, unsigned int *); - /* issue a challenge */ int (*issue_challenge)(struct rxrpc_connection *); @@ -628,7 +618,6 @@ struct rxrpc_call { int debug_id; /* debug ID for printks */ unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ unsigned short rx_pkt_len; /* Current recvmsg packet len */ - bool rx_pkt_last; /* Current recvmsg packet is last */ /* Rx/Tx circular buffer, depending on phase. * @@ -652,8 +641,6 @@ struct rxrpc_call { #define RXRPC_TX_ANNO_LAST 0x04 #define RXRPC_TX_ANNO_RESENT 0x08 -#define RXRPC_RX_ANNO_SUBPACKET 0x3f /* Subpacket number in jumbogram */ -#define RXRPC_RX_ANNO_VERIFIED 0x80 /* Set if verified and decrypted */ rxrpc_seq_t tx_hard_ack; /* Dead slot in buffer; the first transmitted but * not hard-ACK'd packet follows this. */ @@ -681,7 +668,6 @@ struct rxrpc_call { rxrpc_serial_t rx_serial; /* Highest serial received for this call */ u8 rx_winsize; /* Size of Rx window */ u8 tx_winsize; /* Maximum size of Tx window */ - u8 nr_jumbo_bad; /* Number of jumbo dups/exceeds-windows */ spinlock_t input_lock; /* Lock for packet input to this call */ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 4df1bdc4de6c..0e7545ed0128 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -313,78 +313,178 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call) } /* - * Scan a data packet to validate its structure and to work out how many - * subpackets it contains. - * - * A jumbo packet is a collection of consecutive packets glued together with - * little headers between that indicate how to change the initial header for - * each subpacket. - * - * RXRPC_JUMBO_PACKET must be set on all but the last subpacket - and all but - * the last are RXRPC_JUMBO_DATALEN in size. The last subpacket may be of any - * size. + * Process a DATA packet, adding the packet to the Rx ring. The caller's + * packet ref must be passed on or discarded. */ -static bool rxrpc_validate_data(struct sk_buff *skb) +static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - unsigned int offset = sizeof(struct rxrpc_wire_header); - unsigned int len = skb->len; - u8 flags = sp->hdr.flags; + rxrpc_serial_t serial = sp->hdr.serial; + rxrpc_seq_t seq = sp->hdr.seq, hard_ack; + unsigned int ix = seq & RXRPC_RXTX_BUFF_MASK; + bool last = sp->hdr.flags & RXRPC_LAST_PACKET; + bool acked = false; - for (;;) { - if (flags & RXRPC_REQUEST_ACK) - __set_bit(sp->nr_subpackets, sp->rx_req_ack); - sp->nr_subpackets++; + rxrpc_inc_stat(call->rxnet, stat_rx_data); + if (sp->hdr.flags & RXRPC_REQUEST_ACK) + rxrpc_inc_stat(call->rxnet, stat_rx_data_reqack); + if (sp->hdr.flags & RXRPC_JUMBO_PACKET) + rxrpc_inc_stat(call->rxnet, stat_rx_data_jumbo); - if (!(flags & RXRPC_JUMBO_PACKET)) - break; + hard_ack = READ_ONCE(call->rx_hard_ack); - if (len - offset < RXRPC_JUMBO_SUBPKTLEN) - goto protocol_error; - if (flags & RXRPC_LAST_PACKET) - goto protocol_error; - offset += RXRPC_JUMBO_DATALEN; - if (skb_copy_bits(skb, offset, &flags, 1) < 0) - goto protocol_error; - offset += sizeof(struct rxrpc_jumbo_header); + _proto("Rx DATA %%%u { #%x l=%u }", serial, seq, last); + + if (last) { + if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) && + seq != call->rx_top) { + rxrpc_proto_abort("LSN", call, seq); + goto out; + } + } else { + if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) && + after_eq(seq, call->rx_top)) { + rxrpc_proto_abort("LSA", call, seq); + goto out; + } } - if (flags & RXRPC_LAST_PACKET) - sp->rx_flags |= RXRPC_SKB_INCL_LAST; - return true; + trace_rxrpc_rx_data(call->debug_id, seq, serial, sp->hdr.flags); -protocol_error: - return false; + if (before_eq(seq, hard_ack)) { + rxrpc_send_ACK(call, RXRPC_ACK_DUPLICATE, serial, + rxrpc_propose_ack_input_data); + goto out; + } + + if (call->rxtx_buffer[ix]) { + rxrpc_send_ACK(call, RXRPC_ACK_DUPLICATE, serial, + rxrpc_propose_ack_input_data); + goto out; + } + + if (after(seq, hard_ack + call->rx_winsize)) { + rxrpc_send_ACK(call, RXRPC_ACK_EXCEEDS_WINDOW, serial, + rxrpc_propose_ack_input_data); + goto out; + } + + if (sp->hdr.flags & RXRPC_REQUEST_ACK) { + rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, serial, + rxrpc_propose_ack_input_data); + acked = true; + } + + if (after(seq, call->ackr_highest_seq)) + call->ackr_highest_seq = seq; + + /* Queue the packet. We use a couple of memory barriers here as need + * to make sure that rx_top is perceived to be set after the buffer + * pointer and that the buffer pointer is set after the annotation and + * the skb data. + * + * Barriers against rxrpc_recvmsg_data() and rxrpc_rotate_rx_window() + * and also rxrpc_fill_out_ack(). + */ + call->rxtx_annotations[ix] = 1; + smp_wmb(); + call->rxtx_buffer[ix] = skb; + if (after(seq, call->rx_top)) { + smp_store_release(&call->rx_top, seq); + } else if (before(seq, call->rx_top)) { + /* Send an immediate ACK if we fill in a hole */ + if (!acked) { + rxrpc_send_ACK(call, RXRPC_ACK_DELAY, serial, + rxrpc_propose_ack_input_data_hole); + acked = true; + } + } + + /* From this point on, we're not allowed to touch the packet any longer + * as its ref now belongs to the Rx ring. + */ + skb = NULL; + sp = NULL; + + if (last) { + set_bit(RXRPC_CALL_RX_LAST, &call->flags); + trace_rxrpc_receive(call, rxrpc_receive_queue_last, serial, seq); + } else { + trace_rxrpc_receive(call, rxrpc_receive_queue, serial, seq); + } + + if (after_eq(seq, call->rx_expect_next)) { + if (after(seq, call->rx_expect_next)) { + _net("OOS %u > %u", seq, call->rx_expect_next); + rxrpc_send_ACK(call, RXRPC_ACK_OUT_OF_SEQUENCE, serial, + rxrpc_propose_ack_input_data); + acked = true; + } + call->rx_expect_next = seq + 1; + } + +out: + if (!acked && + atomic_inc_return(&call->ackr_nr_unacked) > 2) + rxrpc_send_ACK(call, RXRPC_ACK_IDLE, serial, + rxrpc_propose_ack_input_data); + else + rxrpc_propose_delay_ACK(call, serial, + rxrpc_propose_ack_input_data); + + trace_rxrpc_notify_socket(call->debug_id, serial); + rxrpc_notify_socket(call); + + rxrpc_free_skb(skb, rxrpc_skb_freed); + _leave(" [queued]"); } /* - * Handle reception of a duplicate packet. - * - * We have to take care to avoid an attack here whereby we're given a series of - * jumbograms, each with a sequence number one before the preceding one and - * filled up to maximum UDP size. If they never send us the first packet in - * the sequence, they can cause us to have to hold on to around 2MiB of kernel - * space until the call times out. - * - * We limit the space usage by only accepting three duplicate jumbo packets per - * call. After that, we tell the other side we're no longer accepting jumbos - * (that information is encoded in the ACK packet). + * Split a jumbo packet and file the bits separately. */ -static void rxrpc_input_dup_data(struct rxrpc_call *call, rxrpc_seq_t seq, - bool is_jumbo, bool *_jumbo_bad) +static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb) { - /* Discard normal packets that are duplicates. */ - if (is_jumbo) - return; + struct rxrpc_jumbo_header jhdr; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb), *jsp; + struct sk_buff *jskb; + unsigned int offset = sizeof(struct rxrpc_wire_header); + unsigned int len = skb->len - offset; - /* Skip jumbo subpackets that are duplicates. When we've had three or - * more partially duplicate jumbo packets, we refuse to take any more - * jumbos for this call. - */ - if (!*_jumbo_bad) { - call->nr_jumbo_bad++; - *_jumbo_bad = true; + while (sp->hdr.flags & RXRPC_JUMBO_PACKET) { + if (len < RXRPC_JUMBO_SUBPKTLEN) + goto protocol_error; + if (sp->hdr.flags & RXRPC_LAST_PACKET) + goto protocol_error; + if (skb_copy_bits(skb, offset + RXRPC_JUMBO_DATALEN, + &jhdr, sizeof(jhdr)) < 0) + goto protocol_error; + + jskb = skb_clone(skb, GFP_ATOMIC); + if (!jskb) { + kdebug("couldn't clone"); + return false; + } + rxrpc_new_skb(jskb, rxrpc_skb_cloned_jumbo); + jsp = rxrpc_skb(jskb); + jsp->offset = offset; + jsp->len = RXRPC_JUMBO_DATALEN; + rxrpc_input_data_one(call, jskb); + + sp->hdr.flags = jhdr.flags; + sp->hdr._rsvd = ntohs(jhdr._rsvd); + sp->hdr.seq++; + sp->hdr.serial++; + offset += RXRPC_JUMBO_SUBPKTLEN; + len -= RXRPC_JUMBO_SUBPKTLEN; } + + sp->offset = offset; + sp->len = len; + rxrpc_input_data_one(call, skb); + return true; + +protocol_error: + return false; } /* @@ -395,16 +495,14 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); enum rxrpc_call_state state; - unsigned int j, nr_subpackets, nr_unacked = 0; - rxrpc_serial_t serial = sp->hdr.serial, ack_serial = serial; - rxrpc_seq_t seq0 = sp->hdr.seq, hard_ack; - bool jumbo_bad = false; + rxrpc_serial_t serial = sp->hdr.serial; + rxrpc_seq_t seq0 = sp->hdr.seq; _enter("{%u,%u},{%u,%u}", call->rx_hard_ack, call->rx_top, skb->len, seq0); - _proto("Rx DATA %%%u { #%u f=%02x n=%u }", - sp->hdr.serial, seq0, sp->hdr.flags, sp->nr_subpackets); + _proto("Rx DATA %%%u { #%u f=%02x }", + sp->hdr.serial, seq0, sp->hdr.flags); state = READ_ONCE(call->state); if (state >= RXRPC_CALL_COMPLETE) { @@ -412,6 +510,24 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) return; } + /* Unshare the packet so that it can be modified for in-place + * decryption. + */ + if (sp->hdr.securityIndex != 0) { + struct sk_buff *nskb = skb_unshare(skb, GFP_ATOMIC); + if (!nskb) { + rxrpc_eaten_skb(skb, rxrpc_skb_unshared_nomem); + return; + } + + if (nskb != skb) { + rxrpc_eaten_skb(skb, rxrpc_skb_received); + skb = nskb; + rxrpc_new_skb(skb, rxrpc_skb_unshared); + sp = rxrpc_skb(skb); + } + } + if (state == RXRPC_CALL_SERVER_RECV_REQUEST) { unsigned long timo = READ_ONCE(call->next_req_timo); unsigned long now, expect_req_by; @@ -425,12 +541,6 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) } } - rxrpc_inc_stat(call->rxnet, stat_rx_data); - if (sp->hdr.flags & RXRPC_REQUEST_ACK) - rxrpc_inc_stat(call->rxnet, stat_rx_data_reqack); - if (sp->hdr.flags & RXRPC_JUMBO_PACKET) - rxrpc_inc_stat(call->rxnet, stat_rx_data_jumbo); - spin_lock(&call->input_lock); /* Received data implicitly ACKs all of the request packets we sent @@ -439,154 +549,15 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) if ((state == RXRPC_CALL_CLIENT_SEND_REQUEST || state == RXRPC_CALL_CLIENT_AWAIT_REPLY) && !rxrpc_receiving_reply(call)) - goto unlock; - - hard_ack = READ_ONCE(call->rx_hard_ack); - - nr_subpackets = sp->nr_subpackets; - if (nr_subpackets > 1) { - if (call->nr_jumbo_bad > 3) { - rxrpc_send_ACK(call, RXRPC_ACK_NOSPACE, serial, - rxrpc_propose_ack_input_data); - goto unlock; - } - } - - for (j = 0; j < nr_subpackets; j++) { - rxrpc_serial_t serial = sp->hdr.serial + j; - rxrpc_seq_t seq = seq0 + j; - unsigned int ix = seq & RXRPC_RXTX_BUFF_MASK; - bool terminal = (j == nr_subpackets - 1); - bool last = terminal && (sp->rx_flags & RXRPC_SKB_INCL_LAST); - bool acked = false; - u8 flags, annotation = j; - - _proto("Rx DATA+%u %%%u { #%x t=%u l=%u }", - j, serial, seq, terminal, last); - - if (last) { - if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) && - seq != call->rx_top) { - rxrpc_proto_abort("LSN", call, seq); - goto unlock; - } - } else { - if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) && - after_eq(seq, call->rx_top)) { - rxrpc_proto_abort("LSA", call, seq); - goto unlock; - } - } - - flags = 0; - if (last) - flags |= RXRPC_LAST_PACKET; - if (!terminal) - flags |= RXRPC_JUMBO_PACKET; - if (test_bit(j, sp->rx_req_ack)) - flags |= RXRPC_REQUEST_ACK; - trace_rxrpc_rx_data(call->debug_id, seq, serial, flags, annotation); - - if (before_eq(seq, hard_ack)) { - rxrpc_send_ACK(call, RXRPC_ACK_DUPLICATE, serial, - rxrpc_propose_ack_input_data); - continue; - } - - if (call->rxtx_buffer[ix]) { - rxrpc_input_dup_data(call, seq, nr_subpackets > 1, - &jumbo_bad); - rxrpc_send_ACK(call, RXRPC_ACK_DUPLICATE, serial, - rxrpc_propose_ack_input_data); - continue; - } - - if (after(seq, hard_ack + call->rx_winsize)) { - rxrpc_send_ACK(call, RXRPC_ACK_EXCEEDS_WINDOW, serial, - rxrpc_propose_ack_input_data); - if (flags & RXRPC_JUMBO_PACKET) { - if (!jumbo_bad) { - call->nr_jumbo_bad++; - jumbo_bad = true; - } - } - - goto unlock; - } - - if (flags & RXRPC_REQUEST_ACK) { - rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, serial, - rxrpc_propose_ack_input_data); - acked = true; - } - - if (after(seq0, call->ackr_highest_seq)) - call->ackr_highest_seq = seq0; - - /* Queue the packet. We use a couple of memory barriers here as need - * to make sure that rx_top is perceived to be set after the buffer - * pointer and that the buffer pointer is set after the annotation and - * the skb data. - * - * Barriers against rxrpc_recvmsg_data() and rxrpc_rotate_rx_window() - * and also rxrpc_fill_out_ack(). - */ - if (!terminal) - rxrpc_get_skb(skb, rxrpc_skb_got); - call->rxtx_annotations[ix] = annotation; - smp_wmb(); - call->rxtx_buffer[ix] = skb; - if (after(seq, call->rx_top)) { - smp_store_release(&call->rx_top, seq); - } else if (before(seq, call->rx_top)) { - /* Send an immediate ACK if we fill in a hole */ - if (!acked) { - rxrpc_send_ACK(call, RXRPC_ACK_DELAY, serial, - rxrpc_propose_ack_input_data_hole); - acked = true; - } - } - - if (terminal) { - /* From this point on, we're not allowed to touch the - * packet any longer as its ref now belongs to the Rx - * ring. - */ - skb = NULL; - sp = NULL; - } - - if (last) { - set_bit(RXRPC_CALL_RX_LAST, &call->flags); - trace_rxrpc_receive(call, rxrpc_receive_queue_last, serial, seq); - } else { - trace_rxrpc_receive(call, rxrpc_receive_queue, serial, seq); - } - - if (after_eq(seq, call->rx_expect_next)) { - if (after(seq, call->rx_expect_next)) { - _net("OOS %u > %u", seq, call->rx_expect_next); - rxrpc_send_ACK(call, RXRPC_ACK_OUT_OF_SEQUENCE, serial, - rxrpc_propose_ack_input_data); - acked = true; - } - call->rx_expect_next = seq + 1; - } + goto out; - if (!acked) { - nr_unacked++; - ack_serial = serial; - } + if (!rxrpc_input_split_jumbo(call, skb)) { + rxrpc_proto_abort("VLD", call, sp->hdr.seq); + goto out; } + skb = NULL; -unlock: - if (atomic_add_return(nr_unacked, &call->ackr_nr_unacked) > 2) - rxrpc_send_ACK(call, RXRPC_ACK_IDLE, ack_serial, - rxrpc_propose_ack_input_data); - else - rxrpc_propose_delay_ACK(call, ack_serial, - rxrpc_propose_ack_input_data); - +out: trace_rxrpc_notify_socket(call->debug_id, serial); rxrpc_notify_socket(call); @@ -1288,8 +1259,6 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) if (sp->hdr.callNumber == 0 || sp->hdr.seq == 0) goto bad_message; - if (!rxrpc_validate_data(skb)) - goto bad_message; /* Unshare the packet so that it can be modified for in-place * decryption. @@ -1403,7 +1372,7 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) trace_rxrpc_rx_data(chan->call_debug_id, sp->hdr.seq, sp->hdr.serial, - sp->hdr.flags, 0); + sp->hdr.flags); rxrpc_post_packet_to_conn(conn, skb); goto out; } diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index 9aae99d67833..fd68f0e3af27 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -31,10 +31,11 @@ static int none_secure_packet(struct rxrpc_call *call, struct sk_buff *skb, return 0; } -static int none_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, - unsigned int offset, unsigned int len, - rxrpc_seq_t seq, u16 expected_cksum) +static int none_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + sp->flags |= RXRPC_RX_VERIFIED; return 0; } @@ -42,11 +43,6 @@ static void none_free_call_crypto(struct rxrpc_call *call) { } -static void none_locate_data(struct rxrpc_call *call, struct sk_buff *skb, - unsigned int *_offset, unsigned int *_len) -{ -} - static int none_respond_to_challenge(struct rxrpc_connection *conn, struct sk_buff *skb, u32 *_abort_code) @@ -95,7 +91,6 @@ const struct rxrpc_security rxrpc_no_security = { .how_much_data = none_how_much_data, .secure_packet = none_secure_packet, .verify_packet = none_verify_packet, - .locate_data = none_locate_data, .respond_to_challenge = none_respond_to_challenge, .verify_response = none_verify_response, .clear = none_clear, diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index d35657b659ad..f7bb792c8aa1 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -121,7 +121,7 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn, mtu = conn->params.peer->if_mtu; mtu -= conn->params.peer->hdrsize; - jmax = (call->nr_jumbo_bad > 3) ? 1 : rxrpc_rx_jumbo_max; + jmax = rxrpc_rx_jumbo_max; ackinfo.rxMTU = htonl(rxrpc_rx_mtu); ackinfo.maxMTU = htonl(mtu); ackinfo.rwind = htonl(call->rx_winsize); diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h index f3d4e2700901..6760cb99c6d6 100644 --- a/net/rxrpc/protocol.h +++ b/net/rxrpc/protocol.h @@ -84,7 +84,7 @@ struct rxrpc_jumbo_header { __be16 _rsvd; /* reserved */ __be16 cksum; /* kerberos security checksum */ }; -}; +} __packed; #define RXRPC_JUMBO_DATALEN 1412 /* non-terminal jumbo packet data length */ #define RXRPC_JUMBO_SUBPKTLEN (RXRPC_JUMBO_DATALEN + sizeof(struct rxrpc_jumbo_header)) diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 77cde6311559..401aae687830 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -222,7 +222,6 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top; bool last = false; - u8 subpacket; int ix; _enter("%d", call->debug_id); @@ -237,11 +236,9 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) rxrpc_see_skb(skb, rxrpc_skb_rotated); sp = rxrpc_skb(skb); - subpacket = call->rxtx_annotations[ix] & RXRPC_RX_ANNO_SUBPACKET; - serial = sp->hdr.serial + subpacket; + serial = sp->hdr.serial; - if (subpacket == sp->nr_subpackets - 1 && - sp->rx_flags & RXRPC_SKB_INCL_LAST) + if (sp->hdr.flags & RXRPC_LAST_PACKET) last = true; call->rxtx_buffer[ix] = NULL; @@ -266,80 +263,15 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) } /* - * Decrypt and verify a (sub)packet. The packet's length may be changed due to - * padding, but if this is the case, the packet length will be resident in the - * socket buffer. Note that we can't modify the master skb info as the skb may - * be the home to multiple subpackets. + * Decrypt and verify a DATA packet. */ -static int rxrpc_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, - u8 annotation, - unsigned int offset, unsigned int len) +static int rxrpc_verify_data(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - rxrpc_seq_t seq = sp->hdr.seq; - u16 cksum = sp->hdr.cksum; - u8 subpacket = annotation & RXRPC_RX_ANNO_SUBPACKET; - _enter(""); - - /* For all but the head jumbo subpacket, the security checksum is in a - * jumbo header immediately prior to the data. - */ - if (subpacket > 0) { - __be16 tmp; - if (skb_copy_bits(skb, offset - 2, &tmp, 2) < 0) - BUG(); - cksum = ntohs(tmp); - seq += subpacket; - } - - return call->security->verify_packet(call, skb, offset, len, - seq, cksum); -} - -/* - * Locate the data within a packet. This is complicated by: - * - * (1) An skb may contain a jumbo packet - so we have to find the appropriate - * subpacket. - * - * (2) The (sub)packets may be encrypted and, if so, the encrypted portion - * contains an extra header which includes the true length of the data, - * excluding any encrypted padding. - */ -static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb, - u8 *_annotation, - unsigned int *_offset, unsigned int *_len, - bool *_last) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - unsigned int offset = sizeof(struct rxrpc_wire_header); - unsigned int len; - bool last = false; - int ret; - u8 annotation = *_annotation; - u8 subpacket = annotation & RXRPC_RX_ANNO_SUBPACKET; - - /* Locate the subpacket */ - offset += subpacket * RXRPC_JUMBO_SUBPKTLEN; - len = skb->len - offset; - if (subpacket < sp->nr_subpackets - 1) - len = RXRPC_JUMBO_DATALEN; - else if (sp->rx_flags & RXRPC_SKB_INCL_LAST) - last = true; - - if (!(annotation & RXRPC_RX_ANNO_VERIFIED)) { - ret = rxrpc_verify_packet(call, skb, annotation, offset, len); - if (ret < 0) - return ret; - *_annotation |= RXRPC_RX_ANNO_VERIFIED; - } - - *_offset = offset; - *_len = len; - *_last = last; - call->security->locate_data(call, skb, _offset, _len); - return 0; + if (sp->flags & RXRPC_RX_VERIFIED) + return 0; + return call->security->verify_packet(call, skb); } /* @@ -356,13 +288,11 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top, seq; size_t remain; - bool rx_pkt_last; unsigned int rx_pkt_offset, rx_pkt_len; int ix, copy, ret = -EAGAIN, ret2; rx_pkt_offset = call->rx_pkt_offset; rx_pkt_len = call->rx_pkt_len; - rx_pkt_last = call->rx_pkt_last; if (call->state >= RXRPC_CALL_SERVER_ACK_REQUEST) { seq = call->rx_hard_ack; @@ -391,7 +321,6 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (!(flags & MSG_PEEK)) { serial = sp->hdr.serial; - serial += call->rxtx_annotations[ix] & RXRPC_RX_ANNO_SUBPACKET; trace_rxrpc_receive(call, rxrpc_receive_front, serial, seq); } @@ -400,10 +329,9 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, sock_recv_timestamp(msg, sock->sk, skb); if (rx_pkt_offset == 0) { - ret2 = rxrpc_locate_data(call, skb, - &call->rxtx_annotations[ix], - &rx_pkt_offset, &rx_pkt_len, - &rx_pkt_last); + ret2 = rxrpc_verify_data(call, skb); + rx_pkt_offset = sp->offset; + rx_pkt_len = sp->len; trace_rxrpc_recvdata(call, rxrpc_recvmsg_next, seq, rx_pkt_offset, rx_pkt_len, ret2); if (ret2 < 0) { @@ -443,16 +371,13 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, } /* The whole packet has been transferred. */ - if (!(flags & MSG_PEEK)) - rxrpc_rotate_rx_window(call); + if (sp->hdr.flags & RXRPC_LAST_PACKET) + ret = 1; rx_pkt_offset = 0; rx_pkt_len = 0; - if (rx_pkt_last) { - ASSERTCMP(seq, ==, READ_ONCE(call->rx_top)); - ret = 1; - goto out; - } + if (!(flags & MSG_PEEK)) + rxrpc_rotate_rx_window(call); seq++; } @@ -461,7 +386,6 @@ out: if (!(flags & MSG_PEEK)) { call->rx_pkt_offset = rx_pkt_offset; call->rx_pkt_len = rx_pkt_len; - call->rx_pkt_last = rx_pkt_last; } done: trace_rxrpc_recvdata(call, rxrpc_recvmsg_data_return, seq, diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 78fa0524156f..b19937776aa9 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -439,11 +439,11 @@ static int rxkad_secure_packet(struct rxrpc_call *call, * decrypt partial encryption on a packet (level 1 security) */ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, - unsigned int offset, unsigned int len, rxrpc_seq_t seq, struct skcipher_request *req) { struct rxkad_level1_hdr sechdr; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt iv; struct scatterlist sg[16]; bool aborted; @@ -453,9 +453,9 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, _enter(""); - if (len < 8) { + if (sp->len < 8) { aborted = rxrpc_abort_eproto(call, skb, "rxkad_1_hdr", "V1H", - RXKADSEALEDINCON); + RXKADSEALEDINCON); goto protocol_error; } @@ -463,7 +463,7 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, * directly into the target buffer. */ sg_init_table(sg, ARRAY_SIZE(sg)); - ret = skb_to_sgvec(skb, sg, offset, 8); + ret = skb_to_sgvec(skb, sg, sp->offset, 8); if (unlikely(ret < 0)) return ret; @@ -477,12 +477,13 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, skcipher_request_zero(req); /* Extract the decrypted packet length */ - if (skb_copy_bits(skb, offset, &sechdr, sizeof(sechdr)) < 0) { + if (skb_copy_bits(skb, sp->offset, &sechdr, sizeof(sechdr)) < 0) { aborted = rxrpc_abort_eproto(call, skb, "rxkad_1_len", "XV1", RXKADDATALEN); goto protocol_error; } - len -= sizeof(sechdr); + sp->offset += sizeof(sechdr); + sp->len -= sizeof(sechdr); buf = ntohl(sechdr.data_size); data_size = buf & 0xffff; @@ -496,11 +497,12 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, goto protocol_error; } - if (data_size > len) { + if (data_size > sp->len) { aborted = rxrpc_abort_eproto(call, skb, "rxkad_1_datalen", "V1L", RXKADDATALEN); goto protocol_error; } + sp->len = data_size; _leave(" = 0 [dlen=%x]", data_size); return 0; @@ -515,12 +517,12 @@ protocol_error: * wholly decrypt a packet (level 2 security) */ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, - unsigned int offset, unsigned int len, rxrpc_seq_t seq, struct skcipher_request *req) { const struct rxrpc_key_token *token; struct rxkad_level2_hdr sechdr; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt iv; struct scatterlist _sg[4], *sg; bool aborted; @@ -528,9 +530,9 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, u16 check; int nsg, ret; - _enter(",{%d}", skb->len); + _enter(",{%d}", sp->len); - if (len < 8) { + if (sp->len < 8) { aborted = rxrpc_abort_eproto(call, skb, "rxkad_2_hdr", "V2H", RXKADSEALEDINCON); goto protocol_error; @@ -550,7 +552,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, } sg_init_table(sg, nsg); - ret = skb_to_sgvec(skb, sg, offset, len); + ret = skb_to_sgvec(skb, sg, sp->offset, sp->len); if (unlikely(ret < 0)) { if (sg != _sg) kfree(sg); @@ -563,19 +565,20 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, sg, sg, len, iv.x); + skcipher_request_set_crypt(req, sg, sg, sp->len, iv.x); crypto_skcipher_decrypt(req); skcipher_request_zero(req); if (sg != _sg) kfree(sg); /* Extract the decrypted packet length */ - if (skb_copy_bits(skb, offset, &sechdr, sizeof(sechdr)) < 0) { + if (skb_copy_bits(skb, sp->offset, &sechdr, sizeof(sechdr)) < 0) { aborted = rxrpc_abort_eproto(call, skb, "rxkad_2_len", "XV2", RXKADDATALEN); goto protocol_error; } - len -= sizeof(sechdr); + sp->offset += sizeof(sechdr); + sp->len -= sizeof(sechdr); buf = ntohl(sechdr.data_size); data_size = buf & 0xffff; @@ -589,12 +592,13 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, goto protocol_error; } - if (data_size > len) { + if (data_size > sp->len) { aborted = rxrpc_abort_eproto(call, skb, "rxkad_2_datalen", "V2L", RXKADDATALEN); goto protocol_error; } + sp->len = data_size; _leave(" = 0 [dlen=%x]", data_size); return 0; @@ -609,16 +613,15 @@ nomem: } /* - * Verify the security on a received packet or subpacket (if part of a - * jumbo packet). + * Verify the security on a received packet and the subpackets therein. */ -static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, - unsigned int offset, unsigned int len, - rxrpc_seq_t seq, u16 expected_cksum) +static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct skcipher_request *req; struct rxrpc_crypt iv; struct scatterlist sg; + rxrpc_seq_t seq = sp->hdr.seq; bool aborted; u16 cksum; u32 x, y; @@ -654,7 +657,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, if (cksum == 0) cksum = 1; /* zero checksums are not permitted */ - if (cksum != expected_cksum) { + if (cksum != sp->hdr.cksum) { aborted = rxrpc_abort_eproto(call, skb, "rxkad_csum", "VCK", RXKADSEALEDINCON); goto protocol_error; @@ -664,9 +667,9 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, case RXRPC_SECURITY_PLAIN: return 0; case RXRPC_SECURITY_AUTH: - return rxkad_verify_packet_1(call, skb, offset, len, seq, req); + return rxkad_verify_packet_1(call, skb, seq, req); case RXRPC_SECURITY_ENCRYPT: - return rxkad_verify_packet_2(call, skb, offset, len, seq, req); + return rxkad_verify_packet_2(call, skb, seq, req); default: return -ENOANO; } @@ -677,52 +680,6 @@ protocol_error: return -EPROTO; } -/* - * Locate the data contained in a packet that was partially encrypted. - */ -static void rxkad_locate_data_1(struct rxrpc_call *call, struct sk_buff *skb, - unsigned int *_offset, unsigned int *_len) -{ - struct rxkad_level1_hdr sechdr; - - if (skb_copy_bits(skb, *_offset, &sechdr, sizeof(sechdr)) < 0) - BUG(); - *_offset += sizeof(sechdr); - *_len = ntohl(sechdr.data_size) & 0xffff; -} - -/* - * Locate the data contained in a packet that was completely encrypted. - */ -static void rxkad_locate_data_2(struct rxrpc_call *call, struct sk_buff *skb, - unsigned int *_offset, unsigned int *_len) -{ - struct rxkad_level2_hdr sechdr; - - if (skb_copy_bits(skb, *_offset, &sechdr, sizeof(sechdr)) < 0) - BUG(); - *_offset += sizeof(sechdr); - *_len = ntohl(sechdr.data_size) & 0xffff; -} - -/* - * Locate the data contained in an already decrypted packet. - */ -static void rxkad_locate_data(struct rxrpc_call *call, struct sk_buff *skb, - unsigned int *_offset, unsigned int *_len) -{ - switch (call->conn->params.security_level) { - case RXRPC_SECURITY_AUTH: - rxkad_locate_data_1(call, skb, _offset, _len); - return; - case RXRPC_SECURITY_ENCRYPT: - rxkad_locate_data_2(call, skb, _offset, _len); - return; - default: - return; - } -} - /* * issue a challenge */ @@ -1397,7 +1354,6 @@ const struct rxrpc_security rxkad = { .secure_packet = rxkad_secure_packet, .verify_packet = rxkad_verify_packet, .free_call_crypto = rxkad_free_call_crypto, - .locate_data = rxkad_locate_data, .issue_challenge = rxkad_issue_challenge, .respond_to_challenge = rxkad_respond_to_challenge, .verify_response = rxkad_verify_response, -- cgit v1.2.3-70-g09d2 From 5d7edbc9231ec6b60f9c5b7e7980e9a1cd92e6bb Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 27 Aug 2022 14:27:56 +0100 Subject: rxrpc: Get rid of the Rx ring Get rid of the Rx ring and replace it with a pair of queues instead. One queue gets the packets that are in-sequence and are ready for processing by recvmsg(); the other queue gets the out-of-sequence packets for addition to the first queue as the holes get filled. The annotation ring is removed and replaced with a SACK table. The SACK table has the bits set that correspond exactly to the sequence number of the packet being acked. The SACK ring is copied when an ACK packet is being assembled and rotated so that the first ACK is in byte 0. Flow control handling is altered so that packets that are moved to the in-sequence queue are hard-ACK'd even before they're consumed - and then the Rx window size in the ACK packet (rsize) is shrunk down to compensate (even going to 0 if the window is full). Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 19 ++-- net/rxrpc/ar-internal.h | 29 +++--- net/rxrpc/call_object.c | 7 +- net/rxrpc/conn_object.c | 2 +- net/rxrpc/input.c | 213 ++++++++++++++++++++++++++----------------- net/rxrpc/misc.c | 5 +- net/rxrpc/output.c | 95 +++++++++++-------- net/rxrpc/proc.c | 7 +- net/rxrpc/recvmsg.c | 117 ++++++++++++------------ net/rxrpc/rxkad.c | 1 - net/rxrpc/sysctl.c | 2 +- 11 files changed, 290 insertions(+), 207 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 03a984e661bc..284a1560b0a8 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -104,7 +104,12 @@ EM(rxrpc_receive_incoming, "INC") \ EM(rxrpc_receive_queue, "QUE") \ EM(rxrpc_receive_queue_last, "QLS") \ - E_(rxrpc_receive_rotate, "ROT") + EM(rxrpc_receive_queue_oos, "QUO") \ + EM(rxrpc_receive_queue_oos_last, "QOL") \ + EM(rxrpc_receive_oos, "OOS") \ + EM(rxrpc_receive_oos_last, "OSL") \ + EM(rxrpc_receive_rotate, "ROT") \ + E_(rxrpc_receive_rotate_last, "RLS") #define rxrpc_recvmsg_traces \ EM(rxrpc_recvmsg_cont, "CONT") \ @@ -860,8 +865,7 @@ TRACE_EVENT(rxrpc_receive, __field(enum rxrpc_receive_trace, why ) __field(rxrpc_serial_t, serial ) __field(rxrpc_seq_t, seq ) - __field(rxrpc_seq_t, hard_ack ) - __field(rxrpc_seq_t, top ) + __field(u64, window ) ), TP_fast_assign( @@ -869,8 +873,7 @@ TRACE_EVENT(rxrpc_receive, __entry->why = why; __entry->serial = serial; __entry->seq = seq; - __entry->hard_ack = call->rx_hard_ack; - __entry->top = call->rx_top; + __entry->window = atomic64_read(&call->ackr_window); ), TP_printk("c=%08x %s r=%08x q=%08x w=%08x-%08x", @@ -878,8 +881,8 @@ TRACE_EVENT(rxrpc_receive, __print_symbolic(__entry->why, rxrpc_receive_traces), __entry->serial, __entry->seq, - __entry->hard_ack, - __entry->top) + lower_32_bits(__entry->window), + upper_32_bits(__entry->window)) ); TRACE_EVENT(rxrpc_recvmsg, @@ -1459,7 +1462,7 @@ TRACE_EVENT(rxrpc_call_reset, __entry->call_serial = call->rx_serial; __entry->conn_serial = call->conn->hi_serial; __entry->tx_seq = call->tx_hard_ack; - __entry->rx_seq = call->rx_hard_ack; + __entry->rx_seq = call->rx_highest_seq; ), TP_printk("c=%08x %08x:%08x r=%08x/%08x tx=%08x rx=%08x", diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index e93ed18816b1..c6c5bb3d3688 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -198,7 +198,6 @@ struct rxrpc_skb_priv { u16 remain; u16 offset; /* Offset of data */ u16 len; /* Length of data */ - u8 rx_flags; /* Received packet flags */ u8 flags; #define RXRPC_RX_VERIFIED 0x01 @@ -644,8 +643,20 @@ struct rxrpc_call { rxrpc_seq_t tx_hard_ack; /* Dead slot in buffer; the first transmitted but * not hard-ACK'd packet follows this. */ + + /* Transmitted data tracking. */ rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */ u16 tx_backoff; /* Delay to insert due to Tx failure */ + u8 tx_winsize; /* Maximum size of Tx window */ + + /* Received data tracking */ + struct sk_buff_head recvmsg_queue; /* Queue of packets ready for recvmsg() */ + struct sk_buff_head rx_oos_queue; /* Queue of out of sequence packets */ + + rxrpc_seq_t rx_highest_seq; /* Higest sequence number received */ + rxrpc_seq_t rx_consumed; /* Highest packet consumed */ + rxrpc_serial_t rx_serial; /* Highest serial received for this call */ + u8 rx_winsize; /* Size of Rx window */ /* TCP-style slow-start congestion control [RFC5681]. Since the SMSS * is fixed, we keep these numbers in terms of segments (ie. DATA @@ -660,23 +671,19 @@ struct rxrpc_call { u8 cong_cumul_acks; /* Cumulative ACK count */ ktime_t cong_tstamp; /* Last time cwnd was changed */ - rxrpc_seq_t rx_hard_ack; /* Dead slot in buffer; the first received but not - * consumed packet follows this. - */ - rxrpc_seq_t rx_top; /* Highest Rx slot allocated. */ - rxrpc_seq_t rx_expect_next; /* Expected next packet sequence number */ - rxrpc_serial_t rx_serial; /* Highest serial received for this call */ - u8 rx_winsize; /* Size of Rx window */ - u8 tx_winsize; /* Maximum size of Tx window */ - spinlock_t input_lock; /* Lock for packet input to this call */ /* Receive-phase ACK management (ACKs we send). */ u8 ackr_reason; /* reason to ACK */ rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */ - rxrpc_seq_t ackr_highest_seq; /* Higest sequence number received */ + atomic64_t ackr_window; /* Base (in LSW) and top (in MSW) of SACK window */ atomic_t ackr_nr_unacked; /* Number of unacked packets */ atomic_t ackr_nr_consumed; /* Number of packets needing hard ACK */ + struct { +#define RXRPC_SACK_SIZE 256 + /* SACK table for soft-acked packets */ + u8 ackr_sack_table[RXRPC_SACK_SIZE]; + } __aligned(8); /* RTT management */ rxrpc_serial_t rtt_serial[4]; /* Serial number of DATA or PING sent */ diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 8f9e88897197..4d8601c6a32d 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -155,6 +155,8 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, INIT_LIST_HEAD(&call->accept_link); INIT_LIST_HEAD(&call->recvmsg_link); INIT_LIST_HEAD(&call->sock_link); + skb_queue_head_init(&call->recvmsg_queue); + skb_queue_head_init(&call->rx_oos_queue); init_waitqueue_head(&call->waitq); spin_lock_init(&call->lock); spin_lock_init(&call->notify_lock); @@ -165,13 +167,12 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, call->tx_total_len = -1; call->next_rx_timo = 20 * HZ; call->next_req_timo = 1 * HZ; + atomic64_set(&call->ackr_window, 0x100000001ULL); memset(&call->sock_node, 0xed, sizeof(call->sock_node)); - /* Leave space in the ring to handle a maxed-out jumbo packet */ call->rx_winsize = rxrpc_rx_window_size; call->tx_winsize = 16; - call->rx_expect_next = 1; call->cong_cwnd = 2; call->cong_ssthresh = RXRPC_RXTX_BUFF_SIZE - 1; @@ -519,6 +520,8 @@ static void rxrpc_cleanup_ring(struct rxrpc_call *call) rxrpc_free_skb(call->rxtx_buffer[i], rxrpc_skb_cleaned); call->rxtx_buffer[i] = NULL; } + skb_queue_purge(&call->recvmsg_queue); + skb_queue_purge(&call->rx_oos_queue); } /* diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 22089e37e97f..f7ea71ae6159 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -175,7 +175,7 @@ void __rxrpc_disconnect_call(struct rxrpc_connection *conn, trace_rxrpc_disconnect_call(call); switch (call->completion) { case RXRPC_CALL_SUCCEEDED: - chan->last_seq = call->rx_hard_ack; + chan->last_seq = call->rx_highest_seq; chan->last_type = RXRPC_PACKET_TYPE_ACK; break; case RXRPC_CALL_LOCALLY_ABORTED: diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 0e7545ed0128..947e7196e2be 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -312,18 +312,43 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call) return rxrpc_end_tx_phase(call, true, "ETD"); } +static void rxrpc_input_update_ack_window(struct rxrpc_call *call, + rxrpc_seq_t window, rxrpc_seq_t wtop) +{ + atomic64_set_release(&call->ackr_window, ((u64)wtop) << 32 | window); +} + /* - * Process a DATA packet, adding the packet to the Rx ring. The caller's - * packet ref must be passed on or discarded. + * Push a DATA packet onto the Rx queue. + */ +static void rxrpc_input_queue_data(struct rxrpc_call *call, struct sk_buff *skb, + rxrpc_seq_t window, rxrpc_seq_t wtop, + enum rxrpc_receive_trace why) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + bool last = sp->hdr.flags & RXRPC_LAST_PACKET; + + __skb_queue_tail(&call->recvmsg_queue, skb); + rxrpc_input_update_ack_window(call, window, wtop); + + trace_rxrpc_receive(call, last ? why + 1 : why, sp->hdr.serial, sp->hdr.seq); +} + +/* + * Process a DATA packet. */ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct sk_buff *oos; rxrpc_serial_t serial = sp->hdr.serial; - rxrpc_seq_t seq = sp->hdr.seq, hard_ack; - unsigned int ix = seq & RXRPC_RXTX_BUFF_MASK; + u64 win = atomic64_read(&call->ackr_window); + rxrpc_seq_t window = lower_32_bits(win); + rxrpc_seq_t wtop = upper_32_bits(win); + rxrpc_seq_t wlimit = window + call->rx_winsize - 1; + rxrpc_seq_t seq = sp->hdr.seq; bool last = sp->hdr.flags & RXRPC_LAST_PACKET; - bool acked = false; + int ack_reason = -1; rxrpc_inc_stat(call->rxnet, stat_rx_data); if (sp->hdr.flags & RXRPC_REQUEST_ACK) @@ -331,112 +356,135 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb) if (sp->hdr.flags & RXRPC_JUMBO_PACKET) rxrpc_inc_stat(call->rxnet, stat_rx_data_jumbo); - hard_ack = READ_ONCE(call->rx_hard_ack); - - _proto("Rx DATA %%%u { #%x l=%u }", serial, seq, last); - if (last) { - if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) && - seq != call->rx_top) { + if (test_and_set_bit(RXRPC_CALL_RX_LAST, &call->flags) && + seq + 1 != wtop) { rxrpc_proto_abort("LSN", call, seq); - goto out; + goto err_free; } } else { if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) && - after_eq(seq, call->rx_top)) { + after_eq(seq, wtop)) { + pr_warn("Packet beyond last: c=%x q=%x window=%x-%x wlimit=%x\n", + call->debug_id, seq, window, wtop, wlimit); rxrpc_proto_abort("LSA", call, seq); - goto out; + goto err_free; } } + if (after(seq, call->rx_highest_seq)) + call->rx_highest_seq = seq; + trace_rxrpc_rx_data(call->debug_id, seq, serial, sp->hdr.flags); - if (before_eq(seq, hard_ack)) { - rxrpc_send_ACK(call, RXRPC_ACK_DUPLICATE, serial, - rxrpc_propose_ack_input_data); - goto out; + if (before(seq, window)) { + ack_reason = RXRPC_ACK_DUPLICATE; + goto send_ack; } - - if (call->rxtx_buffer[ix]) { - rxrpc_send_ACK(call, RXRPC_ACK_DUPLICATE, serial, - rxrpc_propose_ack_input_data); - goto out; + if (after(seq, wlimit)) { + ack_reason = RXRPC_ACK_EXCEEDS_WINDOW; + goto send_ack; } - if (after(seq, hard_ack + call->rx_winsize)) { - rxrpc_send_ACK(call, RXRPC_ACK_EXCEEDS_WINDOW, serial, - rxrpc_propose_ack_input_data); - goto out; - } + /* Queue the packet. */ + if (seq == window) { + rxrpc_seq_t reset_from; + bool reset_sack = false; - if (sp->hdr.flags & RXRPC_REQUEST_ACK) { - rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, serial, - rxrpc_propose_ack_input_data); - acked = true; - } + if (sp->hdr.flags & RXRPC_REQUEST_ACK) + ack_reason = RXRPC_ACK_REQUESTED; + /* Send an immediate ACK if we fill in a hole */ + else if (!skb_queue_empty(&call->rx_oos_queue)) + ack_reason = RXRPC_ACK_DELAY; - if (after(seq, call->ackr_highest_seq)) - call->ackr_highest_seq = seq; + window++; + if (after(window, wtop)) + wtop = window; - /* Queue the packet. We use a couple of memory barriers here as need - * to make sure that rx_top is perceived to be set after the buffer - * pointer and that the buffer pointer is set after the annotation and - * the skb data. - * - * Barriers against rxrpc_recvmsg_data() and rxrpc_rotate_rx_window() - * and also rxrpc_fill_out_ack(). - */ - call->rxtx_annotations[ix] = 1; - smp_wmb(); - call->rxtx_buffer[ix] = skb; - if (after(seq, call->rx_top)) { - smp_store_release(&call->rx_top, seq); - } else if (before(seq, call->rx_top)) { - /* Send an immediate ACK if we fill in a hole */ - if (!acked) { - rxrpc_send_ACK(call, RXRPC_ACK_DELAY, serial, - rxrpc_propose_ack_input_data_hole); - acked = true; + spin_lock(&call->recvmsg_queue.lock); + rxrpc_input_queue_data(call, skb, window, wtop, rxrpc_receive_queue); + skb = NULL; + + while ((oos = skb_peek(&call->rx_oos_queue))) { + struct rxrpc_skb_priv *osp = rxrpc_skb(oos); + + if (after(osp->hdr.seq, window)) + break; + + __skb_unlink(oos, &call->rx_oos_queue); + last = osp->hdr.flags & RXRPC_LAST_PACKET; + seq = osp->hdr.seq; + if (!reset_sack) { + reset_from = seq; + reset_sack = true; + } + + window++; + rxrpc_input_queue_data(call, oos, window, wtop, + rxrpc_receive_queue_oos); } - } - /* From this point on, we're not allowed to touch the packet any longer - * as its ref now belongs to the Rx ring. - */ - skb = NULL; - sp = NULL; + spin_unlock(&call->recvmsg_queue.lock); - if (last) { - set_bit(RXRPC_CALL_RX_LAST, &call->flags); - trace_rxrpc_receive(call, rxrpc_receive_queue_last, serial, seq); + if (reset_sack) { + do { + call->ackr_sack_table[reset_from % RXRPC_SACK_SIZE] = 0; + } while (reset_from++, before(reset_from, window)); + } } else { - trace_rxrpc_receive(call, rxrpc_receive_queue, serial, seq); - } + bool keep = false; + + ack_reason = RXRPC_ACK_OUT_OF_SEQUENCE; + + if (!call->ackr_sack_table[seq % RXRPC_SACK_SIZE]) { + call->ackr_sack_table[seq % RXRPC_SACK_SIZE] = 1; + keep = 1; + } + + if (after(seq + 1, wtop)) { + wtop = seq + 1; + rxrpc_input_update_ack_window(call, window, wtop); + } + + if (!keep) { + ack_reason = RXRPC_ACK_DUPLICATE; + goto send_ack; + } + + skb_queue_walk(&call->rx_oos_queue, oos) { + struct rxrpc_skb_priv *osp = rxrpc_skb(oos); - if (after_eq(seq, call->rx_expect_next)) { - if (after(seq, call->rx_expect_next)) { - _net("OOS %u > %u", seq, call->rx_expect_next); - rxrpc_send_ACK(call, RXRPC_ACK_OUT_OF_SEQUENCE, serial, - rxrpc_propose_ack_input_data); - acked = true; + if (after(osp->hdr.seq, seq)) { + __skb_queue_before(&call->rx_oos_queue, oos, skb); + goto oos_queued; + } } - call->rx_expect_next = seq + 1; + + __skb_queue_tail(&call->rx_oos_queue, skb); + oos_queued: + trace_rxrpc_receive(call, last ? rxrpc_receive_oos_last : rxrpc_receive_oos, + sp->hdr.serial, sp->hdr.seq); + skb = NULL; } -out: - if (!acked && - atomic_inc_return(&call->ackr_nr_unacked) > 2) - rxrpc_send_ACK(call, RXRPC_ACK_IDLE, serial, +send_ack: + if (ack_reason < 0 && + atomic_inc_return(&call->ackr_nr_unacked) > 2 && + test_and_set_bit(RXRPC_CALL_IDLE_ACK_PENDING, &call->flags)) { + ack_reason = RXRPC_ACK_IDLE; + } else if (ack_reason >= 0) { + set_bit(RXRPC_CALL_IDLE_ACK_PENDING, &call->flags); + } + + if (ack_reason >= 0) + rxrpc_send_ACK(call, ack_reason, serial, rxrpc_propose_ack_input_data); else rxrpc_propose_delay_ACK(call, serial, rxrpc_propose_ack_input_data); - trace_rxrpc_notify_socket(call->debug_id, serial); - rxrpc_notify_socket(call); - +err_free: rxrpc_free_skb(skb, rxrpc_skb_freed); - _leave(" [queued]"); } /* @@ -498,8 +546,9 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_serial_t serial = sp->hdr.serial; rxrpc_seq_t seq0 = sp->hdr.seq; - _enter("{%u,%u},{%u,%u}", - call->rx_hard_ack, call->rx_top, skb->len, seq0); + _enter("{%llx,%x},{%u,%x}", + atomic64_read(&call->ackr_window), call->rx_highest_seq, + skb->len, seq0); _proto("Rx DATA %%%u { #%u f=%02x }", sp->hdr.serial, seq0, sp->hdr.flags); diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index f5f03f27bd6c..056c428d8bf3 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -40,10 +40,7 @@ unsigned long rxrpc_idle_ack_delay = HZ / 2; * limit is hit, we should generate an EXCEEDS_WINDOW ACK and discard further * packets. */ -unsigned int rxrpc_rx_window_size = RXRPC_INIT_RX_WINDOW_SIZE; -#if (RXRPC_RXTX_BUFF_SIZE - 1) < RXRPC_INIT_RX_WINDOW_SIZE -#error Need to reduce RXRPC_INIT_RX_WINDOW_SIZE -#endif +unsigned int rxrpc_rx_window_size = 255; /* * Maximum Rx MTU size. This indicates to the sender the size of jumbo packet diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index f7bb792c8aa1..0a4f37d7b6b5 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -74,47 +74,64 @@ static void rxrpc_set_keepalive(struct rxrpc_call *call) */ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn, struct rxrpc_call *call, - struct rxrpc_txbuf *txb, - rxrpc_seq_t *_hard_ack, - rxrpc_seq_t *_top) + struct rxrpc_txbuf *txb) { struct rxrpc_ackinfo ackinfo; - unsigned int tmp; - rxrpc_seq_t hard_ack, top, seq; - int ix; + unsigned int qsize; + rxrpc_seq_t window, wtop, wrap_point, ix, first; + int rsize; + u64 wtmp; u32 mtu, jmax; u8 *ackp = txb->acks; + u8 sack_buffer[sizeof(call->ackr_sack_table)] __aligned(8); - tmp = atomic_xchg(&call->ackr_nr_unacked, 0); - tmp |= atomic_xchg(&call->ackr_nr_consumed, 0); - if (!tmp && (txb->ack.reason == RXRPC_ACK_DELAY || - txb->ack.reason == RXRPC_ACK_IDLE)) { - rxrpc_inc_stat(call->rxnet, stat_tx_ack_skip); - return 0; - } - + atomic_set(&call->ackr_nr_unacked, 0); + atomic_set(&call->ackr_nr_consumed, 0); rxrpc_inc_stat(call->rxnet, stat_tx_ack_fill); /* Barrier against rxrpc_input_data(). */ - hard_ack = READ_ONCE(call->rx_hard_ack); - top = smp_load_acquire(&call->rx_top); - *_hard_ack = hard_ack; - *_top = top; - - txb->ack.firstPacket = htonl(hard_ack + 1); - txb->ack.previousPacket = htonl(call->ackr_highest_seq); - txb->ack.nAcks = top - hard_ack; - - if (txb->ack.nAcks) { - seq = hard_ack + 1; - do { - ix = seq & RXRPC_RXTX_BUFF_MASK; - if (call->rxtx_buffer[ix]) - *ackp++ = RXRPC_ACK_TYPE_ACK; - else - *ackp++ = RXRPC_ACK_TYPE_NACK; - seq++; - } while (before_eq(seq, top)); +retry: + wtmp = atomic64_read_acquire(&call->ackr_window); + window = lower_32_bits(wtmp); + wtop = upper_32_bits(wtmp); + txb->ack.firstPacket = htonl(window); + txb->ack.nAcks = 0; + + if (after(wtop, window)) { + /* Try to copy the SACK ring locklessly. We can use the copy, + * only if the now-current top of the window didn't go past the + * previously read base - otherwise we can't know whether we + * have old data or new data. + */ + memcpy(sack_buffer, call->ackr_sack_table, sizeof(sack_buffer)); + wrap_point = window + RXRPC_SACK_SIZE - 1; + wtmp = atomic64_read_acquire(&call->ackr_window); + window = lower_32_bits(wtmp); + wtop = upper_32_bits(wtmp); + if (after(wtop, wrap_point)) { + cond_resched(); + goto retry; + } + + /* The buffer is maintained as a ring with an invariant mapping + * between bit position and sequence number, so we'll probably + * need to rotate it. + */ + txb->ack.nAcks = wtop - window; + ix = window % RXRPC_SACK_SIZE; + first = sizeof(sack_buffer) - ix; + + if (ix + txb->ack.nAcks <= RXRPC_SACK_SIZE) { + memcpy(txb->acks, sack_buffer + ix, txb->ack.nAcks); + } else { + memcpy(txb->acks, sack_buffer + ix, first); + memcpy(txb->acks + first, sack_buffer, + txb->ack.nAcks - first); + } + + ackp += txb->ack.nAcks; + } else if (before(wtop, window)) { + pr_warn("ack window backward %x %x", window, wtop); } else if (txb->ack.reason == RXRPC_ACK_DELAY) { txb->ack.reason = RXRPC_ACK_IDLE; } @@ -122,16 +139,18 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn, mtu = conn->params.peer->if_mtu; mtu -= conn->params.peer->hdrsize; jmax = rxrpc_rx_jumbo_max; + qsize = (window - 1) - call->rx_consumed; + rsize = max_t(int, call->rx_winsize - qsize, 0); ackinfo.rxMTU = htonl(rxrpc_rx_mtu); ackinfo.maxMTU = htonl(mtu); - ackinfo.rwind = htonl(call->rx_winsize); + ackinfo.rwind = htonl(rsize); ackinfo.jumbo_max = htonl(jmax); *ackp++ = 0; *ackp++ = 0; *ackp++ = 0; memcpy(ackp, &ackinfo, sizeof(ackinfo)); - return top - hard_ack + 3 + sizeof(ackinfo); + return txb->ack.nAcks + 3 + sizeof(ackinfo); } /* @@ -188,7 +207,6 @@ static int rxrpc_send_ack_packet(struct rxrpc_local *local, struct rxrpc_txbuf * struct msghdr msg; struct kvec iov[1]; rxrpc_serial_t serial; - rxrpc_seq_t hard_ack, top; size_t len, n; int ret, rtt_slot = -1; @@ -212,7 +230,7 @@ static int rxrpc_send_ack_packet(struct rxrpc_local *local, struct rxrpc_txbuf * clear_bit(RXRPC_CALL_IDLE_ACK_PENDING, &call->flags); spin_lock_bh(&call->lock); - n = rxrpc_fill_out_ack(conn, call, txb, &hard_ack, &top); + n = rxrpc_fill_out_ack(conn, call, txb); spin_unlock_bh(&call->lock); if (n == 0) { kfree(pkt); @@ -236,6 +254,9 @@ static int rxrpc_send_ack_packet(struct rxrpc_local *local, struct rxrpc_txbuf * rxrpc_inc_stat(call->rxnet, stat_tx_ack_send); + /* Grab the highest received seq as late as possible */ + txb->ack.previousPacket = htonl(call->rx_highest_seq); + iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len); ret = do_udp_sendmsg(conn->params.local->socket, &msg, len); call->peer->last_tx_at = ktime_get_seconds(); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 3877afd23598..d48af0178866 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -54,8 +54,9 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) struct rxrpc_call *call; struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); unsigned long timeout = 0; - rxrpc_seq_t tx_hard_ack, rx_hard_ack; + rxrpc_seq_t tx_hard_ack; char lbuff[50], rbuff[50]; + u64 wtmp; if (v == &rxnet->calls) { seq_puts(seq, @@ -91,7 +92,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) } tx_hard_ack = READ_ONCE(call->tx_hard_ack); - rx_hard_ack = READ_ONCE(call->rx_hard_ack); + wtmp = atomic64_read_acquire(&call->ackr_window); seq_printf(seq, "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u" " %-8.8s %08x %08x %08x %02x %08x %02x %08x %06lx\n", @@ -106,7 +107,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) call->abort_code, call->debug_id, tx_hard_ack, READ_ONCE(call->tx_top) - tx_hard_ack, - rx_hard_ack, READ_ONCE(call->rx_top) - rx_hard_ack, + lower_32_bits(wtmp), upper_32_bits(wtmp) - lower_32_bits(wtmp), call->rx_serial, timeout); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 401aae687830..efb85f983657 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -173,7 +173,8 @@ static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg) break; } - trace_rxrpc_recvdata(call, rxrpc_recvmsg_terminal, call->rx_hard_ack, + trace_rxrpc_recvdata(call, rxrpc_recvmsg_terminal, + lower_32_bits(atomic64_read(&call->ackr_window)) - 1, call->rx_pkt_offset, call->rx_pkt_len, ret); return ret; } @@ -183,10 +184,11 @@ static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg) */ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) { + rxrpc_seq_t whigh = READ_ONCE(call->rx_highest_seq); + _enter("%d,%s", call->debug_id, rxrpc_call_states[call->state]); - trace_rxrpc_receive(call, rxrpc_receive_end, 0, call->rx_top); - ASSERTCMP(call->rx_hard_ack, ==, call->rx_top); + trace_rxrpc_receive(call, rxrpc_receive_end, 0, whigh); if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) rxrpc_propose_delay_ACK(call, serial, rxrpc_propose_ack_terminal_ack); @@ -220,45 +222,53 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) struct rxrpc_skb_priv *sp; struct sk_buff *skb; rxrpc_serial_t serial; - rxrpc_seq_t hard_ack, top; - bool last = false; - int ix; + rxrpc_seq_t old_consumed = call->rx_consumed, tseq; + bool last; + int acked; _enter("%d", call->debug_id); - hard_ack = call->rx_hard_ack; - top = smp_load_acquire(&call->rx_top); - ASSERT(before(hard_ack, top)); - - hard_ack++; - ix = hard_ack & RXRPC_RXTX_BUFF_MASK; - skb = call->rxtx_buffer[ix]; +further_rotation: + skb = skb_dequeue(&call->recvmsg_queue); rxrpc_see_skb(skb, rxrpc_skb_rotated); - sp = rxrpc_skb(skb); + sp = rxrpc_skb(skb); + tseq = sp->hdr.seq; serial = sp->hdr.serial; + last = sp->hdr.flags & RXRPC_LAST_PACKET; - if (sp->hdr.flags & RXRPC_LAST_PACKET) - last = true; - - call->rxtx_buffer[ix] = NULL; - call->rxtx_annotations[ix] = 0; /* Barrier against rxrpc_input_data(). */ - smp_store_release(&call->rx_hard_ack, hard_ack); + if (after(tseq, call->rx_consumed)) + smp_store_release(&call->rx_consumed, tseq); rxrpc_free_skb(skb, rxrpc_skb_freed); - trace_rxrpc_receive(call, rxrpc_receive_rotate, serial, hard_ack); + trace_rxrpc_receive(call, last ? rxrpc_receive_rotate_last : rxrpc_receive_rotate, + serial, call->rx_consumed); if (last) { rxrpc_end_rx_phase(call, serial); - } else { - /* Check to see if there's an ACK that needs sending. */ - if (atomic_inc_return(&call->ackr_nr_consumed) > 2 && - !test_and_set_bit(RXRPC_CALL_IDLE_ACK_PENDING, &call->flags)) { - rxrpc_send_ACK(call, RXRPC_ACK_IDLE, serial, - rxrpc_propose_ack_rotate_rx); - rxrpc_transmit_ack_packets(call->peer->local); - } + return; + } + + /* The next packet on the queue might entirely overlap with the one we + * just consumed; if so, rotate that away also. + */ + skb = skb_peek(&call->recvmsg_queue); + if (skb) { + sp = rxrpc_skb(skb); + if (sp->hdr.seq != call->rx_consumed && + after_eq(call->rx_consumed, sp->hdr.seq)) + goto further_rotation; + } + + /* Check to see if there's an ACK that needs sending. */ + acked = atomic_add_return(call->rx_consumed - old_consumed, + &call->ackr_nr_consumed); + if (acked > 2 && + !test_and_set_bit(RXRPC_CALL_IDLE_ACK_PENDING, &call->flags)) { + rxrpc_send_ACK(call, RXRPC_ACK_IDLE, serial, + rxrpc_propose_ack_rotate_rx); + rxrpc_transmit_ack_packets(call->peer->local); } } @@ -285,46 +295,38 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, { struct rxrpc_skb_priv *sp; struct sk_buff *skb; - rxrpc_serial_t serial; - rxrpc_seq_t hard_ack, top, seq; + rxrpc_seq_t seq = 0; size_t remain; unsigned int rx_pkt_offset, rx_pkt_len; - int ix, copy, ret = -EAGAIN, ret2; + int copy, ret = -EAGAIN, ret2; rx_pkt_offset = call->rx_pkt_offset; rx_pkt_len = call->rx_pkt_len; if (call->state >= RXRPC_CALL_SERVER_ACK_REQUEST) { - seq = call->rx_hard_ack; + seq = lower_32_bits(atomic64_read(&call->ackr_window)) - 1; ret = 1; goto done; } - /* Barriers against rxrpc_input_data(). */ - hard_ack = call->rx_hard_ack; - seq = hard_ack + 1; - - while (top = smp_load_acquire(&call->rx_top), - before_eq(seq, top) - ) { - ix = seq & RXRPC_RXTX_BUFF_MASK; - skb = call->rxtx_buffer[ix]; - if (!skb) { - trace_rxrpc_recvdata(call, rxrpc_recvmsg_hole, seq, - rx_pkt_offset, rx_pkt_len, 0); - rxrpc_transmit_ack_packets(call->peer->local); - break; - } - smp_rmb(); + /* No one else can be removing stuff from the queue, so we shouldn't + * need the Rx lock to walk it. + */ + skb = skb_peek(&call->recvmsg_queue); + while (skb) { rxrpc_see_skb(skb, rxrpc_skb_seen); sp = rxrpc_skb(skb); + seq = sp->hdr.seq; - if (!(flags & MSG_PEEK)) { - serial = sp->hdr.serial; - trace_rxrpc_receive(call, rxrpc_receive_front, - serial, seq); + if (after_eq(call->rx_consumed, seq)) { + kdebug("obsolete %x %x", call->rx_consumed, seq); + goto skip_obsolete; } + if (!(flags & MSG_PEEK)) + trace_rxrpc_receive(call, rxrpc_receive_front, + sp->hdr.serial, seq); + if (msg) sock_recv_timestamp(msg, sock->sk, skb); @@ -338,6 +340,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, ret = ret2; goto out; } + rxrpc_transmit_ack_packets(call->peer->local); } else { trace_rxrpc_recvdata(call, rxrpc_recvmsg_cont, seq, rx_pkt_offset, rx_pkt_len, 0); @@ -370,16 +373,17 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, break; } + skip_obsolete: /* The whole packet has been transferred. */ if (sp->hdr.flags & RXRPC_LAST_PACKET) ret = 1; rx_pkt_offset = 0; rx_pkt_len = 0; + skb = skb_peek_next(skb, &call->recvmsg_queue); + if (!(flags & MSG_PEEK)) rxrpc_rotate_rx_window(call); - - seq++; } out: @@ -522,8 +526,7 @@ try_again: ret = 0; rxrpc_transmit_ack_packets(call->peer->local); - if (after(call->rx_top, call->rx_hard_ack) && - call->rxtx_buffer[(call->rx_hard_ack + 1) & RXRPC_RXTX_BUFF_MASK]) + if (!skb_queue_empty(&call->recvmsg_queue)) rxrpc_notify_socket(call); break; default: diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index b19937776aa9..d87c99b36e01 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -1191,7 +1191,6 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, abort_code = RXKADPACKETSHORT; if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header) + sizeof(*response), ticket, ticket_len) < 0) - goto protocol_error_free; ret = rxkad_decrypt_ticket(conn, server_key, skb, ticket, ticket_len, &session_key, &expiry, _abort_code); diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index 2bd987364e44..cde3224a5cd2 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -14,7 +14,7 @@ static struct ctl_table_header *rxrpc_sysctl_reg_table; static const unsigned int four = 4; static const unsigned int max_backlog = RXRPC_BACKLOG_MAX - 1; static const unsigned int n_65535 = 65535; -static const unsigned int n_max_acks = RXRPC_RXTX_BUFF_SIZE - 1; +static const unsigned int n_max_acks = 255; static const unsigned long one_jiffy = 1; static const unsigned long max_jiffies = MAX_JIFFY_OFFSET; -- cgit v1.2.3-70-g09d2 From a4ea4c47761943d90cd5d1688b3c3c65922ff2b1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 31 Mar 2022 23:55:08 +0100 Subject: rxrpc: Don't use a ring buffer for call Tx queue Change the way the Tx queueing works to make the following ends easier to achieve: (1) The filling of packets, the encryption of packets and the transmission of packets can be handled in parallel by separate threads, rather than rxrpc_sendmsg() allocating, filling, encrypting and transmitting each packet before moving onto the next one. (2) Get rid of the fixed-size ring which sets a hard limit on the number of packets that can be retained in the ring. This allows the number of packets to increase without having to allocate a very large ring or having variable-sized rings. [Note: the downside of this is that it's then less efficient to locate a packet for retransmission as we then have to step through a list and examine each buffer in the list.] (3) Allow the filler/encrypter to run ahead of the transmission window. (4) Make it easier to do zero copy UDP from the packet buffers. (5) Make it easier to do zero copy from userspace to the packet buffers - and thence to UDP (only if for unauthenticated connections). To that end, the following changes are made: (1) Use the new rxrpc_txbuf struct instead of sk_buff for keeping packets to be transmitted in. This allows them to be placed on multiple queues simultaneously. An sk_buff isn't really necessary as it's never passed on to lower-level networking code. (2) Keep the transmissable packets in a linked list on the call struct rather than in a ring. As a consequence, the annotation buffer isn't used either; rather a flag is set on the packet to indicate ackedness. (3) Use the RXRPC_CALL_TX_LAST flag to indicate that the last packet to be transmitted has been queued. Add RXRPC_CALL_TX_ALL_ACKED to indicate that all packets up to and including the last got hard acked. (4) Wire headers are now stored in the txbuf rather than being concocted on the stack and they're stored immediately before the data, thereby allowing zerocopy of a single span. (5) Don't bother with instant-resend on transmission failure; rather, leave it for a timer or an ACK packet to trigger. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 78 +++++++++--------- net/rxrpc/af_rxrpc.c | 5 +- net/rxrpc/ar-internal.h | 31 +++---- net/rxrpc/call_event.c | 111 +++++++++---------------- net/rxrpc/call_object.c | 15 +++- net/rxrpc/input.c | 145 ++++++++++++++------------------- net/rxrpc/insecure.c | 3 +- net/rxrpc/output.c | 89 +++++++++----------- net/rxrpc/proc.c | 9 +-- net/rxrpc/rxkad.c | 102 +++++++++-------------- net/rxrpc/sendmsg.c | 188 +++++++++++++++++-------------------------- net/rxrpc/txbuf.c | 34 ++++++++ 12 files changed, 349 insertions(+), 461 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 284a1560b0a8..71ca74e40ec8 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -75,6 +75,7 @@ EM(rxrpc_call_got, "GOT") \ EM(rxrpc_call_got_kernel, "Gke") \ EM(rxrpc_call_got_timer, "GTM") \ + EM(rxrpc_call_got_tx, "Gtx") \ EM(rxrpc_call_got_userid, "Gus") \ EM(rxrpc_call_new_client, "NWc") \ EM(rxrpc_call_new_service, "NWs") \ @@ -83,20 +84,22 @@ EM(rxrpc_call_put_noqueue, "PnQ") \ EM(rxrpc_call_put_notimer, "PnT") \ EM(rxrpc_call_put_timer, "PTM") \ + EM(rxrpc_call_put_tx, "Ptx") \ EM(rxrpc_call_put_userid, "Pus") \ EM(rxrpc_call_queued, "QUE") \ EM(rxrpc_call_queued_ref, "QUR") \ EM(rxrpc_call_release, "RLS") \ E_(rxrpc_call_seen, "SEE") -#define rxrpc_transmit_traces \ - EM(rxrpc_transmit_await_reply, "AWR") \ - EM(rxrpc_transmit_end, "END") \ - EM(rxrpc_transmit_queue, "QUE") \ - EM(rxrpc_transmit_queue_last, "QLS") \ - EM(rxrpc_transmit_rotate, "ROT") \ - EM(rxrpc_transmit_rotate_last, "RLS") \ - E_(rxrpc_transmit_wait, "WAI") +#define rxrpc_txqueue_traces \ + EM(rxrpc_txqueue_await_reply, "AWR") \ + EM(rxrpc_txqueue_dequeue, "DEQ") \ + EM(rxrpc_txqueue_end, "END") \ + EM(rxrpc_txqueue_queue, "QUE") \ + EM(rxrpc_txqueue_queue_last, "QLS") \ + EM(rxrpc_txqueue_rotate, "ROT") \ + EM(rxrpc_txqueue_rotate_last, "RLS") \ + E_(rxrpc_txqueue_wait, "WAI") #define rxrpc_receive_traces \ EM(rxrpc_receive_end, "END") \ @@ -259,6 +262,7 @@ EM(rxrpc_txbuf_alloc_ack, "ALLOC ACK ") \ EM(rxrpc_txbuf_alloc_data, "ALLOC DATA ") \ EM(rxrpc_txbuf_free, "FREE ") \ + EM(rxrpc_txbuf_get_buffer, "GET BUFFER ") \ EM(rxrpc_txbuf_get_trans, "GET TRANS ") \ EM(rxrpc_txbuf_get_retrans, "GET RETRANS") \ EM(rxrpc_txbuf_put_ack_tx, "PUT ACK TX ") \ @@ -266,6 +270,7 @@ EM(rxrpc_txbuf_put_nomem, "PUT NOMEM ") \ EM(rxrpc_txbuf_put_rotated, "PUT ROTATED") \ EM(rxrpc_txbuf_put_send_aborted, "PUT SEND-X ") \ + EM(rxrpc_txbuf_put_trans, "PUT TRANS ") \ EM(rxrpc_txbuf_see_send_more, "SEE SEND+ ") \ E_(rxrpc_txbuf_see_unacked, "SEE UNACKED") @@ -295,9 +300,9 @@ enum rxrpc_rtt_rx_trace { rxrpc_rtt_rx_traces } __mode(byte); enum rxrpc_rtt_tx_trace { rxrpc_rtt_tx_traces } __mode(byte); enum rxrpc_skb_trace { rxrpc_skb_traces } __mode(byte); enum rxrpc_timer_trace { rxrpc_timer_traces } __mode(byte); -enum rxrpc_transmit_trace { rxrpc_transmit_traces } __mode(byte); enum rxrpc_tx_point { rxrpc_tx_points } __mode(byte); enum rxrpc_txbuf_trace { rxrpc_txbuf_traces } __mode(byte); +enum rxrpc_txqueue_trace { rxrpc_txqueue_traces } __mode(byte); #endif /* end __RXRPC_DECLARE_TRACE_ENUMS_ONCE_ONLY */ @@ -323,9 +328,9 @@ rxrpc_rtt_rx_traces; rxrpc_rtt_tx_traces; rxrpc_skb_traces; rxrpc_timer_traces; -rxrpc_transmit_traces; rxrpc_tx_points; rxrpc_txbuf_traces; +rxrpc_txqueue_traces; /* * Now redefine the EM() and E_() macros to map the enums to the strings that @@ -605,15 +610,16 @@ TRACE_EVENT(rxrpc_call_complete, __entry->abort_code) ); -TRACE_EVENT(rxrpc_transmit, - TP_PROTO(struct rxrpc_call *call, enum rxrpc_transmit_trace why), +TRACE_EVENT(rxrpc_txqueue, + TP_PROTO(struct rxrpc_call *call, enum rxrpc_txqueue_trace why), TP_ARGS(call, why), TP_STRUCT__entry( __field(unsigned int, call ) - __field(enum rxrpc_transmit_trace, why ) - __field(rxrpc_seq_t, tx_hard_ack ) + __field(enum rxrpc_txqueue_trace, why ) + __field(rxrpc_seq_t, acks_hard_ack ) + __field(rxrpc_seq_t, tx_bottom ) __field(rxrpc_seq_t, tx_top ) __field(int, tx_winsize ) ), @@ -621,16 +627,19 @@ TRACE_EVENT(rxrpc_transmit, TP_fast_assign( __entry->call = call->debug_id; __entry->why = why; - __entry->tx_hard_ack = call->tx_hard_ack; + __entry->acks_hard_ack = call->acks_hard_ack; + __entry->tx_bottom = call->tx_bottom; __entry->tx_top = call->tx_top; __entry->tx_winsize = call->tx_winsize; ), - TP_printk("c=%08x %s f=%08x n=%u/%u", + TP_printk("c=%08x %s f=%08x h=%08x n=%u/%u/%u", __entry->call, - __print_symbolic(__entry->why, rxrpc_transmit_traces), - __entry->tx_hard_ack + 1, - __entry->tx_top - __entry->tx_hard_ack, + __print_symbolic(__entry->why, rxrpc_txqueue_traces), + __entry->tx_bottom, + __entry->acks_hard_ack, + __entry->tx_top - __entry->tx_bottom, + __entry->tx_top - __entry->acks_hard_ack, __entry->tx_winsize) ); @@ -1200,29 +1209,25 @@ TRACE_EVENT(rxrpc_drop_ack, ); TRACE_EVENT(rxrpc_retransmit, - TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, u8 annotation, - s64 expiry), + TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, s64 expiry), - TP_ARGS(call, seq, annotation, expiry), + TP_ARGS(call, seq, expiry), TP_STRUCT__entry( __field(unsigned int, call ) __field(rxrpc_seq_t, seq ) - __field(u8, annotation ) __field(s64, expiry ) ), TP_fast_assign( __entry->call = call->debug_id; __entry->seq = seq; - __entry->annotation = annotation; __entry->expiry = expiry; ), - TP_printk("c=%08x q=%x a=%02x xp=%lld", + TP_printk("c=%08x q=%x xp=%lld", __entry->call, __entry->seq, - __entry->annotation, __entry->expiry) ); @@ -1245,14 +1250,14 @@ TRACE_EVENT(rxrpc_congest, TP_fast_assign( __entry->call = call->debug_id; __entry->change = change; - __entry->hard_ack = call->tx_hard_ack; + __entry->hard_ack = call->acks_hard_ack; __entry->top = call->tx_top; __entry->lowest_nak = call->acks_lowest_nak; __entry->ack_serial = ack_serial; memcpy(&__entry->sum, summary, sizeof(__entry->sum)); ), - TP_printk("c=%08x r=%08x %s q=%08x %s cw=%u ss=%u nr=%u,%u nw=%u,%u r=%u b=%u u=%u d=%u l=%x%s%s%s", + TP_printk("c=%08x r=%08x %s q=%08x %s cw=%u ss=%u nA=%u,%u+%u,%u r=%u b=%u u=%u d=%u l=%x%s%s%s", __entry->call, __entry->ack_serial, __print_symbolic(__entry->sum.ack_reason, rxrpc_ack_names), @@ -1362,26 +1367,23 @@ TRACE_EVENT(rxrpc_connect_call, ); TRACE_EVENT(rxrpc_resend, - TP_PROTO(struct rxrpc_call *call, int ix), + TP_PROTO(struct rxrpc_call *call), - TP_ARGS(call, ix), + TP_ARGS(call), TP_STRUCT__entry( __field(unsigned int, call ) - __field(int, ix ) - __array(u8, anno, 64 ) + __field(rxrpc_seq_t, seq ) ), TP_fast_assign( __entry->call = call->debug_id; - __entry->ix = ix; - memcpy(__entry->anno, call->rxtx_annotations, 64); + __entry->seq = call->acks_hard_ack; ), - TP_printk("c=%08x ix=%u a=%64phN", + TP_printk("c=%08x q=%x", __entry->call, - __entry->ix, - __entry->anno) + __entry->seq) ); TRACE_EVENT(rxrpc_rx_icmp, @@ -1461,7 +1463,7 @@ TRACE_EVENT(rxrpc_call_reset, __entry->call_id = call->call_id; __entry->call_serial = call->rx_serial; __entry->conn_serial = call->conn->hi_serial; - __entry->tx_seq = call->tx_hard_ack; + __entry->tx_seq = call->acks_hard_ack; __entry->rx_seq = call->rx_highest_seq; ), diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index ceba28e9dce6..2f3991cf8715 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -39,7 +39,7 @@ atomic_t rxrpc_debug_id; EXPORT_SYMBOL(rxrpc_debug_id); /* count of skbs currently in use */ -atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs; +atomic_t rxrpc_n_rx_skbs; struct workqueue_struct *rxrpc_workqueue; @@ -979,7 +979,7 @@ static int __init af_rxrpc_init(void) goto error_call_jar; } - rxrpc_workqueue = alloc_workqueue("krxrpcd", 0, 1); + rxrpc_workqueue = alloc_workqueue("krxrpcd", WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_UNBOUND, 1); if (!rxrpc_workqueue) { pr_notice("Failed to allocate work queue\n"); goto error_work_queue; @@ -1059,7 +1059,6 @@ static void __exit af_rxrpc_exit(void) sock_unregister(PF_RXRPC); proto_unregister(&rxrpc_proto); unregister_pernet_device(&rxrpc_net_ops); - ASSERTCMP(atomic_read(&rxrpc_n_tx_skbs), ==, 0); ASSERTCMP(atomic_read(&rxrpc_n_rx_skbs), ==, 0); /* Make sure the local and peer records pinned by any dying connections diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index c6c5bb3d3688..d7aebe82e17c 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -195,7 +195,6 @@ struct rxrpc_host_header { * - max 48 bytes (struct sk_buff::cb) */ struct rxrpc_skb_priv { - u16 remain; u16 offset; /* Offset of data */ u16 len; /* Length of data */ u8 flags; @@ -243,7 +242,7 @@ struct rxrpc_security { size_t *, size_t *, size_t *); /* impose security on a packet */ - int (*secure_packet)(struct rxrpc_call *, struct sk_buff *, size_t); + int (*secure_packet)(struct rxrpc_call *, struct rxrpc_txbuf *); /* verify the security on a received packet */ int (*verify_packet)(struct rxrpc_call *, struct sk_buff *); @@ -497,6 +496,7 @@ enum rxrpc_call_flag { RXRPC_CALL_EXPOSED, /* The call was exposed to the world */ RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */ RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */ + RXRPC_CALL_TX_ALL_ACKED, /* Last packet has been hard-acked */ RXRPC_CALL_SEND_PING, /* A ping will need to be sent */ RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */ @@ -594,7 +594,7 @@ struct rxrpc_call { struct list_head recvmsg_link; /* Link in rx->recvmsg_q */ struct list_head sock_link; /* Link in rx->sock_calls */ struct rb_node sock_node; /* Node in rx->calls */ - struct sk_buff *tx_pending; /* Tx socket buffer being filled */ + struct rxrpc_txbuf *tx_pending; /* Tx buffer being filled */ wait_queue_head_t waitq; /* Wait queue for channel or Tx */ s64 tx_total_len; /* Total length left to be transmitted (or -1) */ __be32 crypto_buf[2]; /* Temporary packet crypto buffer */ @@ -632,22 +632,16 @@ struct rxrpc_call { #define RXRPC_INIT_RX_WINDOW_SIZE 63 struct sk_buff **rxtx_buffer; u8 *rxtx_annotations; -#define RXRPC_TX_ANNO_ACK 0 -#define RXRPC_TX_ANNO_UNACK 1 -#define RXRPC_TX_ANNO_NAK 2 -#define RXRPC_TX_ANNO_RETRANS 3 -#define RXRPC_TX_ANNO_MASK 0x03 -#define RXRPC_TX_ANNO_LAST 0x04 -#define RXRPC_TX_ANNO_RESENT 0x08 - - rxrpc_seq_t tx_hard_ack; /* Dead slot in buffer; the first transmitted but - * not hard-ACK'd packet follows this. - */ /* Transmitted data tracking. */ + spinlock_t tx_lock; /* Transmit queue lock */ + struct list_head tx_buffer; /* Buffer of transmissible packets */ + rxrpc_seq_t tx_bottom; /* First packet in buffer */ + rxrpc_seq_t tx_transmitted; /* Highest packet transmitted */ rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */ u16 tx_backoff; /* Delay to insert due to Tx failure */ u8 tx_winsize; /* Maximum size of Tx window */ +#define RXRPC_TX_MAX_WINDOW 128 /* Received data tracking */ struct sk_buff_head recvmsg_queue; /* Queue of packets ready for recvmsg() */ @@ -657,6 +651,7 @@ struct rxrpc_call { rxrpc_seq_t rx_consumed; /* Highest packet consumed */ rxrpc_serial_t rx_serial; /* Highest serial received for this call */ u8 rx_winsize; /* Size of Rx window */ + spinlock_t input_lock; /* Lock for packet input to this call */ /* TCP-style slow-start congestion control [RFC5681]. Since the SMSS * is fixed, we keep these numbers in terms of segments (ie. DATA @@ -671,8 +666,6 @@ struct rxrpc_call { u8 cong_cumul_acks; /* Cumulative ACK count */ ktime_t cong_tstamp; /* Last time cwnd was changed */ - spinlock_t input_lock; /* Lock for packet input to this call */ - /* Receive-phase ACK management (ACKs we send). */ u8 ackr_reason; /* reason to ACK */ rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */ @@ -697,6 +690,7 @@ struct rxrpc_call { ktime_t acks_latest_ts; /* Timestamp of latest ACK received */ rxrpc_seq_t acks_first_seq; /* first sequence number received */ rxrpc_seq_t acks_prev_seq; /* Highest previousPacket received */ + rxrpc_seq_t acks_hard_ack; /* Latest hard-ack point */ rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ rxrpc_seq_t acks_lost_top; /* tx_top at the time lost-ack ping sent */ rxrpc_serial_t acks_lost_ping; /* Serial number of probe ACK */ @@ -809,7 +803,7 @@ static inline bool rxrpc_sending_to_client(const struct rxrpc_txbuf *txb) /* * af_rxrpc.c */ -extern atomic_t rxrpc_n_tx_skbs, rxrpc_n_rx_skbs; +extern atomic_t rxrpc_n_rx_skbs; extern struct workqueue_struct *rxrpc_workqueue; /* @@ -831,6 +825,7 @@ void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial, void rxrpc_send_ACK(struct rxrpc_call *, u8, rxrpc_serial_t, enum rxrpc_propose_ack_trace); void rxrpc_propose_delay_ACK(struct rxrpc_call *, rxrpc_serial_t, enum rxrpc_propose_ack_trace); +void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *); void rxrpc_process_call(struct work_struct *); void rxrpc_reduce_call_timer(struct rxrpc_call *call, @@ -1034,7 +1029,7 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net) */ void rxrpc_transmit_ack_packets(struct rxrpc_local *); int rxrpc_send_abort_packet(struct rxrpc_call *); -int rxrpc_send_data_packet(struct rxrpc_call *, struct sk_buff *, bool); +int rxrpc_send_data_packet(struct rxrpc_call *, struct rxrpc_txbuf *); void rxrpc_reject_packets(struct rxrpc_local *); void rxrpc_send_keepalive(struct rxrpc_peer *); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 36f60ac1d95d..3c37b280eb20 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -148,62 +148,52 @@ static void rxrpc_congestion_timeout(struct rxrpc_call *call) */ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) { - struct sk_buff *skb; + struct rxrpc_txbuf *txb; unsigned long resend_at; - rxrpc_seq_t cursor, seq, top; + rxrpc_seq_t transmitted = READ_ONCE(call->tx_transmitted); ktime_t now, max_age, oldest, ack_ts; - int ix; - u8 annotation, anno_type, retrans = 0, unacked = 0; + bool unacked = false; + LIST_HEAD(retrans_queue); - _enter("{%d,%d}", call->tx_hard_ack, call->tx_top); + _enter("{%d,%d}", call->acks_hard_ack, call->tx_top); now = ktime_get_real(); max_age = ktime_sub_us(now, jiffies_to_usecs(call->peer->rto_j)); - spin_lock_bh(&call->lock); - - cursor = call->tx_hard_ack; - top = call->tx_top; - ASSERT(before_eq(cursor, top)); - if (cursor == top) - goto out_unlock; + spin_lock(&call->tx_lock); /* Scan the packet list without dropping the lock and decide which of * the packets in the Tx buffer we're going to resend and what the new * resend timeout will be. */ - trace_rxrpc_resend(call, (cursor + 1) & RXRPC_RXTX_BUFF_MASK); + trace_rxrpc_resend(call); oldest = now; - for (seq = cursor + 1; before_eq(seq, top); seq++) { - ix = seq & RXRPC_RXTX_BUFF_MASK; - annotation = call->rxtx_annotations[ix]; - anno_type = annotation & RXRPC_TX_ANNO_MASK; - annotation &= ~RXRPC_TX_ANNO_MASK; - if (anno_type == RXRPC_TX_ANNO_ACK) + list_for_each_entry(txb, &call->tx_buffer, call_link) { + if (test_bit(RXRPC_TXBUF_ACKED, &txb->flags)) continue; + if (after(txb->seq, transmitted)) + break; - skb = call->rxtx_buffer[ix]; - rxrpc_see_skb(skb, rxrpc_skb_seen); + rxrpc_see_txbuf(txb, rxrpc_txbuf_see_unacked); - if (anno_type == RXRPC_TX_ANNO_UNACK) { - if (ktime_after(skb->tstamp, max_age)) { - if (ktime_before(skb->tstamp, oldest)) - oldest = skb->tstamp; + if (test_bit(RXRPC_TXBUF_RESENT, &txb->flags)) { + if (ktime_after(txb->last_sent, max_age)) { + if (ktime_before(txb->last_sent, oldest)) + oldest = txb->last_sent; continue; } - if (!(annotation & RXRPC_TX_ANNO_RESENT)) - unacked++; + unacked = true; } - /* Okay, we need to retransmit a packet. */ - call->rxtx_annotations[ix] = RXRPC_TX_ANNO_RETRANS | annotation; - retrans++; - trace_rxrpc_retransmit(call, seq, annotation | anno_type, - ktime_to_ns(ktime_sub(skb->tstamp, max_age))); + rxrpc_get_txbuf(txb, rxrpc_txbuf_get_retrans); + list_move_tail(&txb->tx_link, &retrans_queue); } + spin_unlock(&call->tx_lock); + resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(now, oldest))); - resend_at += jiffies + rxrpc_get_rto_backoff(call->peer, retrans); + resend_at += jiffies + rxrpc_get_rto_backoff(call->peer, + !list_empty(&retrans_queue)); WRITE_ONCE(call->resend_at, resend_at); if (unacked) @@ -213,7 +203,8 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) * that an ACK got lost somewhere. Send a ping to find out instead of * retransmitting data. */ - if (!retrans) { + if (list_empty(&retrans_queue)) { + spin_lock_bh(&call->lock); rxrpc_reduce_call_timer(call, resend_at, now_j, rxrpc_timer_set_for_resend); spin_unlock_bh(&call->lock); @@ -225,50 +216,19 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) goto out; } - /* Now go through the Tx window and perform the retransmissions. We - * have to drop the lock for each send. If an ACK comes in whilst the - * lock is dropped, it may clear some of the retransmission markers for - * packets that it soft-ACKs. - */ - for (seq = cursor + 1; before_eq(seq, top); seq++) { - ix = seq & RXRPC_RXTX_BUFF_MASK; - annotation = call->rxtx_annotations[ix]; - anno_type = annotation & RXRPC_TX_ANNO_MASK; - if (anno_type != RXRPC_TX_ANNO_RETRANS) - continue; - - /* We need to reset the retransmission state, but we need to do - * so before we drop the lock as a new ACK/NAK may come in and - * confuse things - */ - annotation &= ~RXRPC_TX_ANNO_MASK; - annotation |= RXRPC_TX_ANNO_UNACK | RXRPC_TX_ANNO_RESENT; - call->rxtx_annotations[ix] = annotation; - - skb = call->rxtx_buffer[ix]; - if (!skb) - continue; - - rxrpc_get_skb(skb, rxrpc_skb_got); - spin_unlock_bh(&call->lock); - + while ((txb = list_first_entry_or_null(&retrans_queue, + struct rxrpc_txbuf, tx_link))) { + list_del_init(&txb->tx_link); + set_bit(RXRPC_TXBUF_RESENT, &txb->flags); rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans); - if (rxrpc_send_data_packet(call, skb, true) < 0) { - rxrpc_free_skb(skb, rxrpc_skb_freed); - return; - } + rxrpc_send_data_packet(call, txb); + rxrpc_put_txbuf(txb, rxrpc_txbuf_put_trans); - if (rxrpc_is_client_call(call)) - rxrpc_expose_client_call(call); - - rxrpc_free_skb(skb, rxrpc_skb_freed); - spin_lock_bh(&call->lock); - if (after(call->tx_hard_ack, seq)) - seq = call->tx_hard_ack; + trace_rxrpc_retransmit(call, txb->seq, + ktime_to_ns(ktime_sub(txb->last_sent, + max_age))); } -out_unlock: - spin_unlock_bh(&call->lock); out: _leave(""); } @@ -301,6 +261,9 @@ recheck_state: goto recheck_state; } + if (READ_ONCE(call->acks_hard_ack) != call->tx_bottom) + rxrpc_shrink_call_tx_buffer(call); + if (call->state == RXRPC_CALL_COMPLETE) { rxrpc_delete_call_timer(call); goto out_put; diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 4d8601c6a32d..ed5f6f0e4286 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -155,11 +155,13 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, INIT_LIST_HEAD(&call->accept_link); INIT_LIST_HEAD(&call->recvmsg_link); INIT_LIST_HEAD(&call->sock_link); + INIT_LIST_HEAD(&call->tx_buffer); skb_queue_head_init(&call->recvmsg_queue); skb_queue_head_init(&call->rx_oos_queue); init_waitqueue_head(&call->waitq); spin_lock_init(&call->lock); spin_lock_init(&call->notify_lock); + spin_lock_init(&call->tx_lock); spin_lock_init(&call->input_lock); rwlock_init(&call->state_lock); refcount_set(&call->ref, 1); @@ -175,7 +177,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, call->tx_winsize = 16; call->cong_cwnd = 2; - call->cong_ssthresh = RXRPC_RXTX_BUFF_SIZE - 1; + call->cong_ssthresh = RXRPC_TX_MAX_WINDOW; call->rxnet = rxnet; call->rtt_avail = RXRPC_CALL_RTT_AVAIL_MASK; @@ -510,7 +512,7 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op) } /* - * Clean up the RxTx skb ring. + * Clean up the Rx skb ring. */ static void rxrpc_cleanup_ring(struct rxrpc_call *call) { @@ -686,6 +688,8 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) */ void rxrpc_cleanup_call(struct rxrpc_call *call) { + struct rxrpc_txbuf *txb; + _net("DESTROY CALL %d", call->debug_id); memset(&call->sock_node, 0xcd, sizeof(call->sock_node)); @@ -694,7 +698,12 @@ void rxrpc_cleanup_call(struct rxrpc_call *call) ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags)); rxrpc_cleanup_ring(call); - rxrpc_free_skb(call->tx_pending, rxrpc_skb_cleaned); + while ((txb = list_first_entry_or_null(&call->tx_buffer, + struct rxrpc_txbuf, call_link))) { + list_del(&txb->call_link); + rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned); + } + rxrpc_put_txbuf(call->tx_pending, rxrpc_txbuf_put_cleaned); call_rcu(&call->rcu, rxrpc_rcu_destroy_call); } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 947e7196e2be..b1f7debd4f3e 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -32,7 +32,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, bool resend = false; summary->flight_size = - (call->tx_top - call->tx_hard_ack) - summary->nr_acks; + (call->tx_top - call->acks_hard_ack) - summary->nr_acks; if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) { summary->retrans_timeo = true; @@ -150,8 +150,8 @@ resume_normality: out: cumulative_acks = 0; out_no_clear_ca: - if (cwnd >= RXRPC_RXTX_BUFF_SIZE - 1) - cwnd = RXRPC_RXTX_BUFF_SIZE - 1; + if (cwnd >= RXRPC_TX_MAX_WINDOW) + cwnd = RXRPC_TX_MAX_WINDOW; call->cong_cwnd = cwnd; call->cong_cumul_acks = cumulative_acks; trace_rxrpc_congest(call, summary, acked_serial, change); @@ -169,9 +169,8 @@ send_extra_data: /* Send some previously unsent DATA if we have some to advance the ACK * state. */ - if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] & - RXRPC_TX_ANNO_LAST || - summary->nr_acks != call->tx_top - call->tx_hard_ack) { + if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) || + summary->nr_acks != call->tx_top - call->acks_hard_ack) { call->cong_extra++; wake_up(&call->waitq); } @@ -184,53 +183,40 @@ send_extra_data: static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, struct rxrpc_ack_summary *summary) { - struct sk_buff *skb, *list = NULL; + struct rxrpc_txbuf *txb; bool rot_last = false; - int ix; - u8 annotation; - if (call->acks_lowest_nak == call->tx_hard_ack) { - call->acks_lowest_nak = to; - } else if (before_eq(call->acks_lowest_nak, to)) { - summary->new_low_nack = true; - call->acks_lowest_nak = to; - } - - spin_lock(&call->lock); - - while (before(call->tx_hard_ack, to)) { - call->tx_hard_ack++; - ix = call->tx_hard_ack & RXRPC_RXTX_BUFF_MASK; - skb = call->rxtx_buffer[ix]; - annotation = call->rxtx_annotations[ix]; - rxrpc_see_skb(skb, rxrpc_skb_rotated); - call->rxtx_buffer[ix] = NULL; - call->rxtx_annotations[ix] = 0; - skb->next = list; - list = skb; - - if (annotation & RXRPC_TX_ANNO_LAST) { + list_for_each_entry_rcu(txb, &call->tx_buffer, call_link, false) { + if (before_eq(txb->seq, call->acks_hard_ack)) + continue; + if (!test_bit(RXRPC_TXBUF_ACKED, &txb->flags)) + summary->nr_rot_new_acks++; + if (test_bit(RXRPC_TXBUF_LAST, &txb->flags)) { set_bit(RXRPC_CALL_TX_LAST, &call->flags); rot_last = true; } - if ((annotation & RXRPC_TX_ANNO_MASK) != RXRPC_TX_ANNO_ACK) - summary->nr_rot_new_acks++; + if (txb->seq == to) + break; } - spin_unlock(&call->lock); + if (rot_last) + set_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags); - trace_rxrpc_transmit(call, (rot_last ? - rxrpc_transmit_rotate_last : - rxrpc_transmit_rotate)); - wake_up(&call->waitq); + _enter("%x,%x,%x,%d", to, call->acks_hard_ack, call->tx_top, rot_last); - while (list) { - skb = list; - list = skb->next; - skb_mark_not_on_list(skb); - rxrpc_free_skb(skb, rxrpc_skb_freed); + if (call->acks_lowest_nak == call->acks_hard_ack) { + call->acks_lowest_nak = to; + } else if (before_eq(call->acks_lowest_nak, to)) { + summary->new_low_nack = true; + call->acks_lowest_nak = to; } + smp_store_release(&call->acks_hard_ack, to); + + trace_rxrpc_txqueue(call, (rot_last ? + rxrpc_txqueue_rotate_last : + rxrpc_txqueue_rotate)); + wake_up(&call->waitq); return rot_last; } @@ -270,9 +256,9 @@ static bool rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, write_unlock(&call->state_lock); if (state == RXRPC_CALL_CLIENT_AWAIT_REPLY) - trace_rxrpc_transmit(call, rxrpc_transmit_await_reply); + trace_rxrpc_txqueue(call, rxrpc_txqueue_await_reply); else - trace_rxrpc_transmit(call, rxrpc_transmit_end); + trace_rxrpc_txqueue(call, rxrpc_txqueue_end); _leave(" = ok"); return true; @@ -678,30 +664,21 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call, */ static void rxrpc_input_check_for_lost_ack(struct rxrpc_call *call) { - rxrpc_seq_t top, bottom, seq; + struct rxrpc_txbuf *txb; + rxrpc_seq_t top, bottom; bool resend = false; - spin_lock_bh(&call->lock); - - bottom = call->tx_hard_ack + 1; - top = call->acks_lost_top; + bottom = READ_ONCE(call->acks_hard_ack) + 1; + top = READ_ONCE(call->acks_lost_top); if (before(bottom, top)) { - for (seq = bottom; before_eq(seq, top); seq++) { - int ix = seq & RXRPC_RXTX_BUFF_MASK; - u8 annotation = call->rxtx_annotations[ix]; - u8 anno_type = annotation & RXRPC_TX_ANNO_MASK; - - if (anno_type != RXRPC_TX_ANNO_UNACK) + list_for_each_entry_rcu(txb, &call->tx_buffer, call_link, false) { + if (test_bit(RXRPC_TXBUF_ACKED, &txb->flags)) continue; - annotation &= ~RXRPC_TX_ANNO_MASK; - annotation |= RXRPC_TX_ANNO_RETRANS; - call->rxtx_annotations[ix] = annotation; + set_bit(RXRPC_TXBUF_RETRANS, &txb->flags); resend = true; } } - spin_unlock_bh(&call->lock); - if (resend && !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) rxrpc_queue_call(call); } @@ -735,8 +712,8 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU), rwind, ntohl(ackinfo->jumbo_max)); - if (rwind > RXRPC_RXTX_BUFF_SIZE - 1) - rwind = RXRPC_RXTX_BUFF_SIZE - 1; + if (rwind > RXRPC_TX_MAX_WINDOW) + rwind = RXRPC_TX_MAX_WINDOW; if (call->tx_winsize != rwind) { if (rwind > call->tx_winsize) wake = true; @@ -775,22 +752,24 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, rxrpc_seq_t seq, int nr_acks, struct rxrpc_ack_summary *summary) { - int ix; - u8 annotation, anno_type; - - for (; nr_acks > 0; nr_acks--, seq++) { - ix = seq & RXRPC_RXTX_BUFF_MASK; - annotation = call->rxtx_annotations[ix]; - anno_type = annotation & RXRPC_TX_ANNO_MASK; - annotation &= ~RXRPC_TX_ANNO_MASK; - switch (*acks++) { + struct rxrpc_txbuf *txb; + + list_for_each_entry_rcu(txb, &call->tx_buffer, call_link, false) { + if (before(txb->seq, seq)) + continue; + if (after_eq(txb->seq, seq + nr_acks)) + break; + switch (acks[txb->seq - seq]) { case RXRPC_ACK_TYPE_ACK: summary->nr_acks++; - if (anno_type == RXRPC_TX_ANNO_ACK) + if (test_bit(RXRPC_TXBUF_ACKED, &txb->flags)) continue; + /* A lot of the time the packet is going to + * have been ACK.'d already. + */ + clear_bit(RXRPC_TXBUF_NACKED, &txb->flags); + set_bit(RXRPC_TXBUF_ACKED, &txb->flags); summary->nr_new_acks++; - call->rxtx_annotations[ix] = - RXRPC_TX_ANNO_ACK | annotation; break; case RXRPC_ACK_TYPE_NACK: if (!summary->nr_nacks && @@ -799,13 +778,12 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, summary->new_low_nack = true; } summary->nr_nacks++; - if (anno_type == RXRPC_TX_ANNO_NAK) + if (test_bit(RXRPC_TXBUF_NACKED, &txb->flags)) continue; summary->nr_new_nacks++; - if (anno_type == RXRPC_TX_ANNO_RETRANS) - continue; - call->rxtx_annotations[ix] = - RXRPC_TX_ANNO_NAK | annotation; + clear_bit(RXRPC_TXBUF_ACKED, &txb->flags); + set_bit(RXRPC_TXBUF_NACKED, &txb->flags); + set_bit(RXRPC_TXBUF_RETRANS, &txb->flags); break; default: return rxrpc_proto_abort("SFT", call, 0); @@ -930,7 +908,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (unlikely(buf.ack.reason == RXRPC_ACK_OUT_OF_SEQUENCE) && first_soft_ack == 1 && prev_pkt == 0 && - call->tx_hard_ack == 0 && + call->acks_hard_ack == 0 && rxrpc_is_client_call(call)) { rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, 0, -ENETRESET); @@ -989,7 +967,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) goto out; } - if (before(hard_ack, call->tx_hard_ack) || + if (before(hard_ack, call->acks_hard_ack) || after(hard_ack, call->tx_top)) { rxrpc_proto_abort("AKW", call, 0); goto out; @@ -999,7 +977,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) goto out; } - if (after(hard_ack, call->tx_hard_ack)) { + if (after(hard_ack, call->acks_hard_ack)) { if (rxrpc_rotate_tx_window(call, hard_ack, &summary)) { rxrpc_end_tx_phase(call, false, "ETA"); goto out; @@ -1015,8 +993,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) &summary); } - if (call->rxtx_annotations[call->tx_top & RXRPC_RXTX_BUFF_MASK] & - RXRPC_TX_ANNO_LAST && + if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) && summary.nr_acks == call->tx_top - hard_ack && rxrpc_is_client_call(call)) rxrpc_propose_ping(call, ack_serial, diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index fd68f0e3af27..0eb8471bfc53 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -25,8 +25,7 @@ static int none_how_much_data(struct rxrpc_call *call, size_t remain, return 0; } -static int none_secure_packet(struct rxrpc_call *call, struct sk_buff *skb, - size_t data_size) +static int none_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) { return 0; } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 0a4f37d7b6b5..e2f501cef040 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -335,7 +335,7 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) * channel instead, thereby closing off this call. */ if (rxrpc_is_client_call(call) && - test_bit(RXRPC_CALL_TX_LAST, &call->flags)) + test_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags)) return 0; if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) @@ -383,20 +383,17 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) /* * send a packet through the transport endpoint */ -int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, - bool retrans) +int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) { enum rxrpc_req_ack_trace why; struct rxrpc_connection *conn = call->conn; - struct rxrpc_wire_header whdr; - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct msghdr msg; - struct kvec iov[2]; + struct kvec iov[1]; rxrpc_serial_t serial; size_t len; int ret, rtt_slot = -1; - _enter(",{%d}", skb->len); + _enter("%x,{%d}", txb->seq, txb->len); if (hlist_unhashed(&call->error_link)) { spin_lock_bh(&call->peer->lock); @@ -406,29 +403,16 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, /* Each transmission of a Tx packet needs a new serial number */ serial = atomic_inc_return(&conn->serial); - - whdr.epoch = htonl(conn->proto.epoch); - whdr.cid = htonl(call->cid); - whdr.callNumber = htonl(call->call_id); - whdr.seq = htonl(sp->hdr.seq); - whdr.serial = htonl(serial); - whdr.type = RXRPC_PACKET_TYPE_DATA; - whdr.flags = sp->hdr.flags; - whdr.userStatus = 0; - whdr.securityIndex = call->security_ix; - whdr._rsvd = htons(sp->hdr._rsvd); - whdr.serviceId = htons(call->service_id); + txb->wire.serial = htonl(serial); if (test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags) && - sp->hdr.seq == 1) - whdr.userStatus = RXRPC_USERSTATUS_SERVICE_UPGRADE; + txb->seq == 1) + txb->wire.userStatus = RXRPC_USERSTATUS_SERVICE_UPGRADE; - iov[0].iov_base = &whdr; - iov[0].iov_len = sizeof(whdr); - iov[1].iov_base = skb->head; - iov[1].iov_len = skb->len; - len = iov[0].iov_len + iov[1].iov_len; - iov_iter_kvec(&msg.msg_iter, WRITE, iov, 2, len); + iov[0].iov_base = &txb->wire; + iov[0].iov_len = sizeof(txb->wire) + txb->len; + len = iov[0].iov_len; + iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len); msg.msg_name = &call->peer->srx.transport; msg.msg_namelen = call->peer->srx.transport_len; @@ -443,19 +427,19 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, * service call, lest OpenAFS incorrectly send us an ACK with some * soft-ACKs in it and then never follow up with a proper hard ACK. */ - if (whdr.flags & RXRPC_REQUEST_ACK) + if (txb->wire.flags & RXRPC_REQUEST_ACK) why = rxrpc_reqack_already_on; - else if ((whdr.flags & RXRPC_LAST_PACKET) && rxrpc_to_client(sp)) + else if (test_bit(RXRPC_TXBUF_LAST, &txb->flags) && rxrpc_sending_to_client(txb)) why = rxrpc_reqack_no_srv_last; else if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) why = rxrpc_reqack_ack_lost; - else if (retrans) + else if (test_bit(RXRPC_TXBUF_RESENT, &txb->flags)) why = rxrpc_reqack_retrans; else if (call->cong_mode == RXRPC_CALL_SLOW_START && call->cong_cwnd <= 2) why = rxrpc_reqack_slow_start; else if (call->tx_winsize <= 2) why = rxrpc_reqack_small_txwin; - else if (call->peer->rtt_count < 3 && sp->hdr.seq & 1) + else if (call->peer->rtt_count < 3 && txb->seq & 1) why = rxrpc_reqack_more_rtt; else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real())) why = rxrpc_reqack_old_rtt; @@ -463,35 +447,36 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, goto dont_set_request_ack; rxrpc_inc_stat(call->rxnet, stat_why_req_ack[why]); - trace_rxrpc_req_ack(call->debug_id, sp->hdr.seq, why); + trace_rxrpc_req_ack(call->debug_id, txb->seq, why); if (why != rxrpc_reqack_no_srv_last) - whdr.flags |= RXRPC_REQUEST_ACK; + txb->wire.flags |= RXRPC_REQUEST_ACK; dont_set_request_ack: if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; if ((lose++ & 7) == 7) { ret = 0; - trace_rxrpc_tx_data(call, sp->hdr.seq, serial, - whdr.flags, retrans, true); + trace_rxrpc_tx_data(call, txb->seq, serial, + txb->wire.flags, + test_bit(RXRPC_TXBUF_RESENT, &txb->flags), + true); goto done; } } - trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags, retrans, - false); + trace_rxrpc_tx_data(call, txb->seq, serial, txb->wire.flags, + test_bit(RXRPC_TXBUF_RESENT, &txb->flags), false); + cmpxchg(&call->tx_transmitted, txb->seq - 1, txb->seq); /* send the packet with the don't fragment bit set if we currently * think it's small enough */ - if (iov[1].iov_len >= call->peer->maxdata) + if (txb->len >= call->peer->maxdata) goto send_fragmentable; down_read(&conn->params.local->defrag_sem); - sp->hdr.serial = serial; - smp_wmb(); /* Set serial before timestamp */ - skb->tstamp = ktime_get_real(); - if (whdr.flags & RXRPC_REQUEST_ACK) + txb->last_sent = ktime_get_real(); + if (txb->wire.flags & RXRPC_REQUEST_ACK) rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_data); /* send the packet by UDP @@ -510,7 +495,7 @@ dont_set_request_ack: trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_data_nofrag); } else { - trace_rxrpc_tx_packet(call->debug_id, &whdr, + trace_rxrpc_tx_packet(call->debug_id, &txb->wire, rxrpc_tx_point_call_data_nofrag); } @@ -520,8 +505,8 @@ dont_set_request_ack: done: if (ret >= 0) { - if (whdr.flags & RXRPC_REQUEST_ACK) { - call->peer->rtt_last_req = skb->tstamp; + if (txb->wire.flags & RXRPC_REQUEST_ACK) { + call->peer->rtt_last_req = txb->last_sent; if (call->peer->rtt_count > 1) { unsigned long nowj = jiffies, ack_lost_at; @@ -533,7 +518,7 @@ done: } } - if (sp->hdr.seq == 1 && + if (txb->seq == 1 && !test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) { unsigned long nowj = jiffies, expect_rx_by; @@ -565,23 +550,21 @@ send_fragmentable: down_write(&conn->params.local->defrag_sem); - sp->hdr.serial = serial; - smp_wmb(); /* Set serial before timestamp */ - skb->tstamp = ktime_get_real(); - if (whdr.flags & RXRPC_REQUEST_ACK) + txb->last_sent = ktime_get_real(); + if (txb->wire.flags & RXRPC_REQUEST_ACK) rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_data); switch (conn->params.local->srx.transport.family) { case AF_INET6: case AF_INET: ip_sock_set_mtu_discover(conn->params.local->socket->sk, - IP_PMTUDISC_DONT); + IP_PMTUDISC_DONT); rxrpc_inc_stat(call->rxnet, stat_tx_data_send_frag); ret = do_udp_sendmsg(conn->params.local->socket, &msg, len); conn->params.peer->last_tx_at = ktime_get_seconds(); ip_sock_set_mtu_discover(conn->params.local->socket->sk, - IP_PMTUDISC_DO); + IP_PMTUDISC_DO); break; default: @@ -593,7 +576,7 @@ send_fragmentable: trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_data_frag); } else { - trace_rxrpc_tx_packet(call->debug_id, &whdr, + trace_rxrpc_tx_packet(call->debug_id, &txb->wire, rxrpc_tx_point_call_data_frag); } rxrpc_tx_backoff(call, ret); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index d48af0178866..0807753ec2dc 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -54,7 +54,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) struct rxrpc_call *call; struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); unsigned long timeout = 0; - rxrpc_seq_t tx_hard_ack; + rxrpc_seq_t acks_hard_ack; char lbuff[50], rbuff[50]; u64 wtmp; @@ -91,7 +91,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) timeout -= jiffies; } - tx_hard_ack = READ_ONCE(call->tx_hard_ack); + acks_hard_ack = READ_ONCE(call->acks_hard_ack); wtmp = atomic64_read_acquire(&call->ackr_window); seq_printf(seq, "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u" @@ -106,7 +106,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) rxrpc_call_states[call->state], call->abort_code, call->debug_id, - tx_hard_ack, READ_ONCE(call->tx_top) - tx_hard_ack, + acks_hard_ack, READ_ONCE(call->tx_top) - acks_hard_ack, lower_32_bits(wtmp), upper_32_bits(wtmp) - lower_32_bits(wtmp), call->rx_serial, timeout); @@ -459,9 +459,8 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_slow_start]), atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_small_txwin])); seq_printf(seq, - "Buffers : txb=%u,%u rxb=%u\n", + "Buffers : txb=%u rxb=%u\n", atomic_read(&rxrpc_nr_txbuf), - atomic_read(&rxrpc_n_tx_skbs), atomic_read(&rxrpc_n_rx_skbs)); return 0; } diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index d87c99b36e01..8fc055587f0e 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -259,11 +259,10 @@ static void rxkad_free_call_crypto(struct rxrpc_call *call) * partially encrypt a packet (level 1 security) */ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, - struct sk_buff *skb, u32 data_size, + struct rxrpc_txbuf *txb, struct skcipher_request *req) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxkad_level1_hdr hdr; + struct rxkad_level1_hdr *hdr = (void *)txb->data; struct rxrpc_crypt iv; struct scatterlist sg; size_t pad; @@ -271,22 +270,22 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, _enter(""); - check = sp->hdr.seq ^ call->call_id; - data_size |= (u32)check << 16; - - hdr.data_size = htonl(data_size); - memcpy(skb->head, &hdr, sizeof(hdr)); + check = txb->seq ^ ntohl(txb->wire.callNumber); + hdr->data_size = htonl((u32)check << 16 | txb->len); - pad = sizeof(struct rxkad_level1_hdr) + data_size; + txb->len += sizeof(struct rxkad_level1_hdr); + pad = txb->len; pad = RXKAD_ALIGN - pad; pad &= RXKAD_ALIGN - 1; - if (pad) - skb_put_zero(skb, pad); + if (pad) { + memset(txb->data + txb->offset, 0, pad); + txb->len += pad; + } /* start the encryption afresh */ memset(&iv, 0, sizeof(iv)); - sg_init_one(&sg, skb->head, 8); + sg_init_one(&sg, txb->data, 8); skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); @@ -301,87 +300,60 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, * wholly encrypt a packet (level 2 security) */ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, - struct sk_buff *skb, - u32 data_size, + struct rxrpc_txbuf *txb, struct skcipher_request *req) { const struct rxrpc_key_token *token; - struct rxkad_level2_hdr rxkhdr; - struct rxrpc_skb_priv *sp; + struct rxkad_level2_hdr *rxkhdr = (void *)txb->data; struct rxrpc_crypt iv; - struct scatterlist sg[16]; - unsigned int len; + struct scatterlist sg; size_t pad; u16 check; - int err; - - sp = rxrpc_skb(skb); + int ret; _enter(""); - check = sp->hdr.seq ^ call->call_id; + check = txb->seq ^ ntohl(txb->wire.callNumber); - rxkhdr.data_size = htonl(data_size | (u32)check << 16); - rxkhdr.checksum = 0; - memcpy(skb->head, &rxkhdr, sizeof(rxkhdr)); + rxkhdr->data_size = htonl(txb->len | (u32)check << 16); + rxkhdr->checksum = 0; - pad = sizeof(struct rxkad_level2_hdr) + data_size; + txb->len += sizeof(struct rxkad_level2_hdr); + pad = txb->len; pad = RXKAD_ALIGN - pad; pad &= RXKAD_ALIGN - 1; - if (pad) - skb_put_zero(skb, pad); + if (pad) { + memset(txb->data + txb->offset, 0, pad); + txb->len += pad; + } /* encrypt from the session key */ token = call->conn->params.key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); - sg_init_one(&sg[0], skb->head, sizeof(rxkhdr)); + sg_init_one(&sg, txb->data, txb->len); skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sg[0], &sg[0], sizeof(rxkhdr), iv.x); - crypto_skcipher_encrypt(req); - - /* we want to encrypt the skbuff in-place */ - err = -EMSGSIZE; - if (skb_shinfo(skb)->nr_frags > 16) - goto out; - - len = round_up(data_size, RXKAD_ALIGN); - - sg_init_table(sg, ARRAY_SIZE(sg)); - err = skb_to_sgvec(skb, sg, 8, len); - if (unlikely(err < 0)) - goto out; - skcipher_request_set_crypt(req, sg, sg, len, iv.x); - crypto_skcipher_encrypt(req); - - _leave(" = 0"); - err = 0; - -out: + skcipher_request_set_crypt(req, &sg, &sg, txb->len, iv.x); + ret = crypto_skcipher_encrypt(req); skcipher_request_zero(req); - return err; + return ret; } /* * checksum an RxRPC packet header */ -static int rxkad_secure_packet(struct rxrpc_call *call, - struct sk_buff *skb, - size_t data_size) +static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) { - struct rxrpc_skb_priv *sp; struct skcipher_request *req; struct rxrpc_crypt iv; struct scatterlist sg; u32 x, y; int ret; - sp = rxrpc_skb(skb); - - _enter("{%d{%x}},{#%u},%zu,", + _enter("{%d{%x}},{#%u},%u,", call->debug_id, key_serial(call->conn->params.key), - sp->hdr.seq, data_size); + txb->seq, txb->len); if (!call->conn->rxkad.cipher) return 0; @@ -398,9 +370,9 @@ static int rxkad_secure_packet(struct rxrpc_call *call, memcpy(&iv, call->conn->rxkad.csum_iv.x, sizeof(iv)); /* calculate the security checksum */ - x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT); - x |= sp->hdr.seq & 0x3fffffff; - call->crypto_buf[0] = htonl(call->call_id); + x = (ntohl(txb->wire.cid) & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT); + x |= txb->seq & 0x3fffffff; + call->crypto_buf[0] = txb->wire.callNumber; call->crypto_buf[1] = htonl(x); sg_init_one(&sg, call->crypto_buf, 8); @@ -414,17 +386,17 @@ static int rxkad_secure_packet(struct rxrpc_call *call, y = (y >> 16) & 0xffff; if (y == 0) y = 1; /* zero checksums are not permitted */ - sp->hdr.cksum = y; + txb->wire.cksum = htons(y); switch (call->conn->params.security_level) { case RXRPC_SECURITY_PLAIN: ret = 0; break; case RXRPC_SECURITY_AUTH: - ret = rxkad_secure_packet_auth(call, skb, data_size, req); + ret = rxkad_secure_packet_auth(call, txb, req); break; case RXRPC_SECURITY_ENCRYPT: - ret = rxkad_secure_packet_encrypt(call, skb, data_size, req); + ret = rxkad_secure_packet_encrypt(call, txb, req); break; default: ret = -EPERM; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index e32805a49324..a96ae7f58148 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -25,7 +25,7 @@ static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win) unsigned int win_size = min_t(unsigned int, call->tx_winsize, call->cong_cwnd + call->cong_extra); - rxrpc_seq_t tx_win = READ_ONCE(call->tx_hard_ack); + rxrpc_seq_t tx_win = smp_load_acquire(&call->acks_hard_ack); if (_tx_win) *_tx_win = tx_win; @@ -50,7 +50,12 @@ static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx, if (signal_pending(current)) return sock_intr_errno(*timeo); - trace_rxrpc_transmit(call, rxrpc_transmit_wait); + if (READ_ONCE(call->acks_hard_ack) != call->tx_bottom) { + rxrpc_shrink_call_tx_buffer(call); + continue; + } + + trace_rxrpc_txqueue(call, rxrpc_txqueue_wait); *timeo = schedule_timeout(*timeo); } } @@ -71,12 +76,11 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx, rtt = 2; timeout = rtt; - tx_start = READ_ONCE(call->tx_hard_ack); + tx_start = smp_load_acquire(&call->acks_hard_ack); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); - tx_win = READ_ONCE(call->tx_hard_ack); if (rxrpc_check_tx_space(call, &tx_win)) return 0; @@ -87,12 +91,17 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx, tx_win == tx_start && signal_pending(current)) return -EINTR; + if (READ_ONCE(call->acks_hard_ack) != call->tx_bottom) { + rxrpc_shrink_call_tx_buffer(call); + continue; + } + if (tx_win != tx_start) { timeout = rtt; tx_start = tx_win; } - trace_rxrpc_transmit(call, rxrpc_transmit_wait); + trace_rxrpc_txqueue(call, rxrpc_txqueue_wait); timeout = schedule_timeout(timeout); } } @@ -112,7 +121,12 @@ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx, if (call->state >= RXRPC_CALL_COMPLETE) return call->error; - trace_rxrpc_transmit(call, rxrpc_transmit_wait); + if (READ_ONCE(call->acks_hard_ack) != call->tx_bottom) { + rxrpc_shrink_call_tx_buffer(call); + continue; + } + + trace_rxrpc_txqueue(call, rxrpc_txqueue_wait); *timeo = schedule_timeout(*timeo); } } @@ -129,8 +143,8 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, DECLARE_WAITQUEUE(myself, current); int ret; - _enter(",{%u,%u,%u}", - call->tx_hard_ack, call->tx_top, call->tx_winsize); + _enter(",{%u,%u,%u,%u}", + call->tx_bottom, call->acks_hard_ack, call->tx_top, call->tx_winsize); add_wait_queue(&call->waitq, &myself); @@ -154,24 +168,6 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, return ret; } -/* - * Schedule an instant Tx resend. - */ -static inline void rxrpc_instant_resend(struct rxrpc_call *call, int ix) -{ - spin_lock_bh(&call->lock); - - if (call->state < RXRPC_CALL_COMPLETE) { - call->rxtx_annotations[ix] = - (call->rxtx_annotations[ix] & RXRPC_TX_ANNO_LAST) | - RXRPC_TX_ANNO_RETRANS; - if (!test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) - rxrpc_queue_call(call); - } - - spin_unlock_bh(&call->lock); -} - /* * Notify the owner of the call that the transmit phase is ended and the last * packet has been queued. @@ -188,40 +184,35 @@ static void rxrpc_notify_end_tx(struct rxrpc_sock *rx, struct rxrpc_call *call, * the packet immediately. Returns the error from rxrpc_send_data_packet() * in case the caller wants to do something with it. */ -static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, - struct sk_buff *skb, bool last, - rxrpc_notify_end_tx_t notify_end_tx) +static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, + struct rxrpc_txbuf *txb, + rxrpc_notify_end_tx_t notify_end_tx) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); unsigned long now; - rxrpc_seq_t seq = sp->hdr.seq; - int ret, ix; - u8 annotation = RXRPC_TX_ANNO_UNACK; - - _net("queue skb %p [%d]", skb, seq); + rxrpc_seq_t seq = txb->seq; + bool last = test_bit(RXRPC_TXBUF_LAST, &txb->flags); + int ret; rxrpc_inc_stat(call->rxnet, stat_tx_data); ASSERTCMP(seq, ==, call->tx_top + 1); - if (last) - annotation |= RXRPC_TX_ANNO_LAST; - /* We have to set the timestamp before queueing as the retransmit * algorithm can see the packet as soon as we queue it. */ - skb->tstamp = ktime_get_real(); + txb->last_sent = ktime_get_real(); - ix = seq & RXRPC_RXTX_BUFF_MASK; - rxrpc_get_skb(skb, rxrpc_skb_got); - call->rxtx_annotations[ix] = annotation; - smp_wmb(); - call->rxtx_buffer[ix] = skb; + /* Add the packet to the call's output buffer */ + rxrpc_get_txbuf(txb, rxrpc_txbuf_get_buffer); + spin_lock(&call->tx_lock); + list_add_tail(&txb->call_link, &call->tx_buffer); call->tx_top = seq; + spin_unlock(&call->tx_lock); + if (last) - trace_rxrpc_transmit(call, rxrpc_transmit_queue_last); + trace_rxrpc_txqueue(call, rxrpc_txqueue_queue_last); else - trace_rxrpc_transmit(call, rxrpc_transmit_queue); + trace_rxrpc_txqueue(call, rxrpc_txqueue_queue); if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) { _debug("________awaiting reply/ACK__________"); @@ -254,7 +245,7 @@ static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, if (seq == 1 && rxrpc_is_client_call(call)) rxrpc_expose_client_call(call); - ret = rxrpc_send_data_packet(call, skb, false); + ret = rxrpc_send_data_packet(call, txb); if (ret < 0) { switch (ret) { case -ENETUNREACH: @@ -264,8 +255,6 @@ static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, 0, ret); goto out; } - _debug("need instant resend %d", ret); - rxrpc_instant_resend(call, ix); } else { unsigned long now = jiffies; unsigned long resend_at = now + call->peer->rto_j; @@ -276,9 +265,7 @@ static int rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, } out: - rxrpc_free_skb(skb, rxrpc_skb_freed); - _leave(" = %d", ret); - return ret; + rxrpc_put_txbuf(txb, rxrpc_txbuf_put_trans); } /* @@ -292,8 +279,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, rxrpc_notify_end_tx_t notify_end_tx, bool *_dropped_lock) { - struct rxrpc_skb_priv *sp; - struct sk_buff *skb; + struct rxrpc_txbuf *txb; struct sock *sk = &rx->sk; enum rxrpc_call_state state; long timeo; @@ -327,14 +313,15 @@ reload: goto maybe_error; } - skb = call->tx_pending; + txb = call->tx_pending; call->tx_pending = NULL; - rxrpc_see_skb(skb, rxrpc_skb_seen); + if (txb) + rxrpc_see_txbuf(txb, rxrpc_txbuf_see_send_more); do { rxrpc_transmit_ack_packets(call->peer->local); - if (!skb) { + if (!txb) { size_t remain, bufsize, chunk, offset; _debug("alloc"); @@ -355,52 +342,31 @@ reload: _debug("SIZE: %zu/%zu @%zu", chunk, bufsize, offset); /* create a buffer that we can retain until it's ACK'd */ - skb = sock_alloc_send_skb( - sk, bufsize, msg->msg_flags & MSG_DONTWAIT, &ret); - if (!skb) + ret = -ENOMEM; + txb = rxrpc_alloc_txbuf(call, RXRPC_PACKET_TYPE_DATA, + GFP_KERNEL); + if (!txb) goto maybe_error; - sp = rxrpc_skb(skb); - rxrpc_new_skb(skb, rxrpc_skb_new); - - _debug("ALLOC SEND %p", skb); - - ASSERTCMP(skb->mark, ==, 0); - - __skb_put(skb, offset); - - sp->remain = chunk; - if (sp->remain > skb_tailroom(skb)) - sp->remain = skb_tailroom(skb); - - _net("skb: hr %d, tr %d, hl %d, rm %d", - skb_headroom(skb), - skb_tailroom(skb), - skb_headlen(skb), - sp->remain); - - skb->ip_summed = CHECKSUM_UNNECESSARY; + txb->offset = offset; + txb->space -= offset; + txb->space = min_t(size_t, chunk, txb->space); } _debug("append"); - sp = rxrpc_skb(skb); /* append next segment of data to the current buffer */ if (msg_data_left(msg) > 0) { - int copy = skb_tailroom(skb); - ASSERTCMP(copy, >, 0); - if (copy > msg_data_left(msg)) - copy = msg_data_left(msg); - if (copy > sp->remain) - copy = sp->remain; - - _debug("add"); - ret = skb_add_data(skb, &msg->msg_iter, copy); - _debug("added"); - if (ret < 0) + size_t copy = min_t(size_t, txb->space, msg_data_left(msg)); + + _debug("add %zu", copy); + if (!copy_from_iter_full(txb->data + txb->offset, copy, + &msg->msg_iter)) goto efault; - sp->remain -= copy; - skb->mark += copy; + _debug("added"); + txb->space -= copy; + txb->len += copy; + txb->offset += copy; copied += copy; if (call->tx_total_len != -1) call->tx_total_len -= copy; @@ -412,32 +378,22 @@ reload: goto call_terminated; /* add the packet to the send queue if it's now full */ - if (sp->remain <= 0 || + if (!txb->space || (msg_data_left(msg) == 0 && !more)) { - struct rxrpc_connection *conn = call->conn; - uint32_t seq; - - seq = call->tx_top + 1; - - sp->hdr.seq = seq; - sp->hdr._rsvd = 0; - sp->hdr.flags = conn->out_clientflag; - - if (msg_data_left(msg) == 0 && !more) - sp->hdr.flags |= RXRPC_LAST_PACKET; - else if (call->tx_top - call->tx_hard_ack < + if (msg_data_left(msg) == 0 && !more) { + txb->wire.flags |= RXRPC_LAST_PACKET; + __set_bit(RXRPC_TXBUF_LAST, &txb->flags); + } + else if (call->tx_top - call->acks_hard_ack < call->tx_winsize) - sp->hdr.flags |= RXRPC_MORE_PACKETS; + txb->wire.flags |= RXRPC_MORE_PACKETS; - ret = call->security->secure_packet(call, skb, skb->mark); + ret = call->security->secure_packet(call, txb); if (ret < 0) goto out; - ret = rxrpc_queue_packet(rx, call, skb, - !msg_data_left(msg) && !more, - notify_end_tx); - /* Should check for failure here */ - skb = NULL; + rxrpc_queue_packet(rx, call, txb, notify_end_tx); + txb = NULL; } } while (msg_data_left(msg) > 0); @@ -450,12 +406,12 @@ success: read_unlock_bh(&call->state_lock); } out: - call->tx_pending = skb; + call->tx_pending = txb; _leave(" = %d", ret); return ret; call_terminated: - rxrpc_free_skb(skb, rxrpc_skb_freed); + rxrpc_put_txbuf(txb, rxrpc_txbuf_put_send_aborted); _leave(" = %d", call->error); return call->error; diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index 45a7b48a5e10..96bfee89927b 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -99,3 +99,37 @@ void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) call_rcu(&txb->rcu, rxrpc_free_txbuf); } } + +/* + * Shrink the transmit buffer. + */ +void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *call) +{ + struct rxrpc_txbuf *txb; + rxrpc_seq_t hard_ack = smp_load_acquire(&call->acks_hard_ack); + + _enter("%x/%x/%x", call->tx_bottom, call->acks_hard_ack, call->tx_top); + + for (;;) { + spin_lock(&call->tx_lock); + txb = list_first_entry_or_null(&call->tx_buffer, + struct rxrpc_txbuf, call_link); + if (!txb) + break; + hard_ack = smp_load_acquire(&call->acks_hard_ack); + if (before(hard_ack, txb->seq)) + break; + + ASSERTCMP(txb->seq, ==, call->tx_bottom + 1); + call->tx_bottom++; + list_del_rcu(&txb->call_link); + + trace_rxrpc_txqueue(call, rxrpc_txqueue_dequeue); + + spin_unlock(&call->tx_lock); + + rxrpc_put_txbuf(txb, rxrpc_txbuf_put_rotated); + } + + spin_unlock(&call->tx_lock); +} -- cgit v1.2.3-70-g09d2 From 4e76bd406d6e9208ea558953862a47524829688c Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 6 May 2022 16:13:13 +0100 Subject: rxrpc: Remove call->lock call->lock is no longer necessary, so remove it. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 1 - net/rxrpc/call_event.c | 22 ++-------------------- net/rxrpc/call_object.c | 3 --- net/rxrpc/input.c | 3 --- net/rxrpc/output.c | 6 +----- 5 files changed, 3 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index d7aebe82e17c..5ec30e84360b 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -601,7 +601,6 @@ struct rxrpc_call { unsigned long user_call_ID; /* user-defined call ID */ unsigned long flags; unsigned long events; - spinlock_t lock; spinlock_t notify_lock; /* Kernel notification lock */ rwlock_t state_lock; /* lock for state transition */ u32 abort_code; /* Local/remote abort code */ diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 3c37b280eb20..dbfaf8170929 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -26,24 +26,19 @@ void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial, unsigned long now = jiffies; unsigned long ping_at = now + rxrpc_idle_ack_delay; - spin_lock_bh(&call->lock); - if (time_before(ping_at, call->ping_at)) { WRITE_ONCE(call->ping_at, ping_at); rxrpc_reduce_call_timer(call, ping_at, now, rxrpc_timer_set_for_ping); trace_rxrpc_propose_ack(call, why, RXRPC_ACK_PING, serial); } - - spin_unlock_bh(&call->lock); } /* * Propose a DELAY ACK be sent in the future. */ -static void __rxrpc_propose_delay_ACK(struct rxrpc_call *call, - rxrpc_serial_t serial, - enum rxrpc_propose_ack_trace why) +void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial, + enum rxrpc_propose_ack_trace why) { unsigned long expiry = rxrpc_soft_ack_delay; unsigned long now = jiffies, ack_at; @@ -68,17 +63,6 @@ static void __rxrpc_propose_delay_ACK(struct rxrpc_call *call, trace_rxrpc_propose_ack(call, why, RXRPC_ACK_DELAY, serial); } -/* - * Propose a DELAY ACK be sent, locking the call structure - */ -void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial, - enum rxrpc_propose_ack_trace why) -{ - spin_lock_bh(&call->lock); - __rxrpc_propose_delay_ACK(call, serial, why); - spin_unlock_bh(&call->lock); -} - /* * Queue an ACK for immediate transmission. */ @@ -204,10 +188,8 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) * retransmitting data. */ if (list_empty(&retrans_queue)) { - spin_lock_bh(&call->lock); rxrpc_reduce_call_timer(call, resend_at, now_j, rxrpc_timer_set_for_resend); - spin_unlock_bh(&call->lock); ack_ts = ktime_sub(now, call->acks_latest_ts); if (ktime_to_us(ack_ts) < (call->peer->srtt_us >> 3)) goto out; diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index ed5f6f0e4286..a3ae2ab45f9e 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -159,7 +159,6 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, skb_queue_head_init(&call->recvmsg_queue); skb_queue_head_init(&call->rx_oos_queue); init_waitqueue_head(&call->waitq); - spin_lock_init(&call->lock); spin_lock_init(&call->notify_lock); spin_lock_init(&call->tx_lock); spin_lock_init(&call->input_lock); @@ -543,10 +542,8 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); - spin_lock_bh(&call->lock); if (test_and_set_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); - spin_unlock_bh(&call->lock); rxrpc_put_call_slot(call); rxrpc_delete_call_timer(call); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index b1f7debd4f3e..e6e1267915de 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -279,9 +279,6 @@ static bool rxrpc_receiving_reply(struct rxrpc_call *call) rxrpc_seq_t top = READ_ONCE(call->tx_top); if (call->ackr_reason) { - spin_lock_bh(&call->lock); - call->ackr_reason = 0; - spin_unlock_bh(&call->lock); now = jiffies; timo = now + MAX_JIFFY_OFFSET; WRITE_ONCE(call->resend_at, timo); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index e2f501cef040..2c3f7e4e30d7 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -229,13 +229,9 @@ static int rxrpc_send_ack_packet(struct rxrpc_local *local, struct rxrpc_txbuf * if (txb->ack.reason == RXRPC_ACK_IDLE) clear_bit(RXRPC_CALL_IDLE_ACK_PENDING, &call->flags); - spin_lock_bh(&call->lock); n = rxrpc_fill_out_ack(conn, call, txb); - spin_unlock_bh(&call->lock); - if (n == 0) { - kfree(pkt); + if (n == 0) return 0; - } iov[0].iov_base = &txb->wire; iov[0].iov_len = sizeof(txb->wire) + sizeof(txb->ack) + n; -- cgit v1.2.3-70-g09d2 From d57a3a151660902091491ac2633134e1be92557f Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 7 May 2022 10:06:13 +0100 Subject: rxrpc: Save last ACK's SACK table rather than marking txbufs Improve the tracking of which packets need to be transmitted by saving the last ACK packet that we receive that has a populated soft-ACK table rather than marking packets. Then we can step through the soft-ACK table and look at the packets we've transmitted beyond that to determine which packets we might want to retransmit. We also look at the highest serial number that has been acked to try and guess which packets we've transmitted the peer is likely to have seen. If necessary, we send a ping to retrieve that number. One downside that might be a problem is that we can't then compare the previous acked/unacked state so easily in rxrpc_input_soft_acks() - which is a potential problem for the slow-start algorithm. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 7 +- net/core/skbuff.c | 1 + net/rxrpc/ar-internal.h | 12 ++-- net/rxrpc/call_event.c | 117 ++++++++++++++++++++++++------ net/rxrpc/call_object.c | 2 + net/rxrpc/input.c | 164 ++++++++++++++++++++----------------------- net/rxrpc/sendmsg.c | 1 - 7 files changed, 185 insertions(+), 119 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 71ca74e40ec8..a11de55c3c14 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -17,6 +17,7 @@ * Declare tracing information enums and their string mappings for display. */ #define rxrpc_skb_traces \ + EM(rxrpc_skb_ack, "ACK") \ EM(rxrpc_skb_cleaned, "CLN") \ EM(rxrpc_skb_cloned_jumbo, "CLJ") \ EM(rxrpc_skb_freed, "FRE") \ @@ -1257,7 +1258,7 @@ TRACE_EVENT(rxrpc_congest, memcpy(&__entry->sum, summary, sizeof(__entry->sum)); ), - TP_printk("c=%08x r=%08x %s q=%08x %s cw=%u ss=%u nA=%u,%u+%u,%u r=%u b=%u u=%u d=%u l=%x%s%s%s", + TP_printk("c=%08x r=%08x %s q=%08x %s cw=%u ss=%u nA=%u,%u+%u r=%u b=%u u=%u d=%u l=%x%s%s%s", __entry->call, __entry->ack_serial, __print_symbolic(__entry->sum.ack_reason, rxrpc_ack_names), @@ -1265,8 +1266,8 @@ TRACE_EVENT(rxrpc_congest, __print_symbolic(__entry->sum.mode, rxrpc_congest_modes), __entry->sum.cwnd, __entry->sum.ssthresh, - __entry->sum.nr_acks, __entry->sum.nr_nacks, - __entry->sum.nr_new_acks, __entry->sum.nr_new_nacks, + __entry->sum.nr_acks, __entry->sum.saw_nacks, + __entry->sum.nr_new_acks, __entry->sum.nr_rot_new_acks, __entry->top - __entry->hard_ack, __entry->sum.cumulative_acks, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1d84a17eada5..2143244e8ec3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -6431,6 +6431,7 @@ void skb_condense(struct sk_buff *skb) */ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); } +EXPORT_SYMBOL(skb_condense); #ifdef CONFIG_SKB_EXTENSIONS static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 5ec30e84360b..168d03b56ada 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -694,6 +694,8 @@ struct rxrpc_call { rxrpc_seq_t acks_lost_top; /* tx_top at the time lost-ack ping sent */ rxrpc_serial_t acks_lost_ping; /* Serial number of probe ACK */ rxrpc_serial_t acks_highest_serial; /* Highest serial number ACK'd */ + struct sk_buff *acks_soft_tbl; /* The last ACK packet with NAKs in it */ + spinlock_t acks_ack_lock; /* Access to ->acks_last_ack */ }; /* @@ -702,10 +704,9 @@ struct rxrpc_call { struct rxrpc_ack_summary { u8 ack_reason; u8 nr_acks; /* Number of ACKs in packet */ - u8 nr_nacks; /* Number of NACKs in packet */ u8 nr_new_acks; /* Number of new ACKs in packet */ - u8 nr_new_nacks; /* Number of new NACKs in packet */ u8 nr_rot_new_acks; /* Number of rotated new ACKs */ + bool saw_nacks; /* Saw NACKs in packet */ bool new_low_nack; /* T if new low NACK found */ bool retrans_timeo; /* T if reTx due to timeout happened */ u8 flight_size; /* Number of unreceived transmissions */ @@ -765,11 +766,8 @@ struct rxrpc_txbuf { unsigned int space; /* Remaining data space */ unsigned int offset; /* Offset of fill point */ unsigned long flags; -#define RXRPC_TXBUF_ACKED 0 /* Set if ACK'd */ -#define RXRPC_TXBUF_NACKED 1 /* Set if NAK'd */ -#define RXRPC_TXBUF_LAST 2 /* Set if last packet in Tx phase */ -#define RXRPC_TXBUF_RESENT 3 /* Set if has been resent */ -#define RXRPC_TXBUF_RETRANS 4 /* Set if should be retransmitted */ +#define RXRPC_TXBUF_LAST 0 /* Set if last packet in Tx phase */ +#define RXRPC_TXBUF_RESENT 1 /* Set if has been resent */ u8 /*enum rxrpc_propose_ack_trace*/ ack_why; /* If ack, why */ struct { /* The packet for encrypting and DMA'ing. We align it such diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index dbfaf8170929..1e21a708390e 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -132,48 +132,127 @@ static void rxrpc_congestion_timeout(struct rxrpc_call *call) */ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) { + struct rxrpc_ackpacket *ack = NULL; struct rxrpc_txbuf *txb; + struct sk_buff *ack_skb = NULL; unsigned long resend_at; rxrpc_seq_t transmitted = READ_ONCE(call->tx_transmitted); ktime_t now, max_age, oldest, ack_ts; bool unacked = false; + unsigned int i; LIST_HEAD(retrans_queue); _enter("{%d,%d}", call->acks_hard_ack, call->tx_top); now = ktime_get_real(); max_age = ktime_sub_us(now, jiffies_to_usecs(call->peer->rto_j)); + oldest = now; + + /* See if there's an ACK saved with a soft-ACK table in it. */ + if (call->acks_soft_tbl) { + spin_lock_bh(&call->acks_ack_lock); + ack_skb = call->acks_soft_tbl; + if (ack_skb) { + rxrpc_get_skb(ack_skb, rxrpc_skb_ack); + ack = (void *)ack_skb->data + sizeof(struct rxrpc_wire_header); + } + spin_unlock_bh(&call->acks_ack_lock); + } + + if (list_empty(&call->tx_buffer)) + goto no_resend; spin_lock(&call->tx_lock); - /* Scan the packet list without dropping the lock and decide which of - * the packets in the Tx buffer we're going to resend and what the new - * resend timeout will be. - */ + if (list_empty(&call->tx_buffer)) + goto no_further_resend; + trace_rxrpc_resend(call); - oldest = now; - list_for_each_entry(txb, &call->tx_buffer, call_link) { - if (test_bit(RXRPC_TXBUF_ACKED, &txb->flags)) - continue; - if (after(txb->seq, transmitted)) - break; + txb = list_first_entry(&call->tx_buffer, struct rxrpc_txbuf, call_link); - rxrpc_see_txbuf(txb, rxrpc_txbuf_see_unacked); + /* Scan the soft ACK table without dropping the lock and resend any + * explicitly NAK'd packets. + */ + if (ack) { + for (i = 0; i < ack->nAcks; i++) { + rxrpc_seq_t seq; - if (test_bit(RXRPC_TXBUF_RESENT, &txb->flags)) { - if (ktime_after(txb->last_sent, max_age)) { - if (ktime_before(txb->last_sent, oldest)) - oldest = txb->last_sent; + if (ack->acks[i] & 1) continue; + seq = ntohl(ack->firstPacket) + i; + if (after(txb->seq, transmitted)) + break; + if (after(txb->seq, seq)) + continue; /* A new hard ACK probably came in */ + list_for_each_entry_from(txb, &call->tx_buffer, call_link) { + if (txb->seq == seq) + goto found_txb; + } + goto no_further_resend; + + found_txb: + if (after(ntohl(txb->wire.serial), call->acks_highest_serial)) + continue; /* Ack point not yet reached */ + + rxrpc_see_txbuf(txb, rxrpc_txbuf_see_unacked); + + if (list_empty(&txb->tx_link)) { + rxrpc_get_txbuf(txb, rxrpc_txbuf_get_retrans); + rxrpc_get_call(call, rxrpc_call_got_tx); + list_add_tail(&txb->tx_link, &retrans_queue); + set_bit(RXRPC_TXBUF_RESENT, &txb->flags); } - unacked = true; + + trace_rxrpc_retransmit(call, txb->seq, + ktime_to_ns(ktime_sub(txb->last_sent, + max_age))); + + if (list_is_last(&txb->call_link, &call->tx_buffer)) + goto no_further_resend; + txb = list_next_entry(txb, call_link); } + } - rxrpc_get_txbuf(txb, rxrpc_txbuf_get_retrans); - list_move_tail(&txb->tx_link, &retrans_queue); + /* Fast-forward through the Tx queue to the point the peer says it has + * seen. Anything between the soft-ACK table and that point will get + * ACK'd or NACK'd in due course, so don't worry about it here; here we + * need to consider retransmitting anything beyond that point. + * + * Note that ACK for a packet can beat the update of tx_transmitted. + */ + if (after_eq(READ_ONCE(call->acks_prev_seq), READ_ONCE(call->tx_transmitted))) + goto no_further_resend; + + list_for_each_entry_from(txb, &call->tx_buffer, call_link) { + if (before_eq(txb->seq, READ_ONCE(call->acks_prev_seq))) + continue; + if (after(txb->seq, READ_ONCE(call->tx_transmitted))) + break; /* Not transmitted yet */ + + if (ack && ack->reason == RXRPC_ACK_PING_RESPONSE && + before(ntohl(txb->wire.serial), ntohl(ack->serial))) + goto do_resend; /* Wasn't accounted for by a more recent ping. */ + + if (ktime_after(txb->last_sent, max_age)) { + if (ktime_before(txb->last_sent, oldest)) + oldest = txb->last_sent; + continue; + } + + do_resend: + unacked = true; + if (list_empty(&txb->tx_link)) { + rxrpc_get_txbuf(txb, rxrpc_txbuf_get_retrans); + list_add_tail(&txb->tx_link, &retrans_queue); + set_bit(RXRPC_TXBUF_RESENT, &txb->flags); + rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans); + } } +no_further_resend: spin_unlock(&call->tx_lock); +no_resend: + rxrpc_free_skb(ack_skb, rxrpc_skb_freed); resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(now, oldest))); resend_at += jiffies + rxrpc_get_rto_backoff(call->peer, @@ -201,8 +280,6 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) while ((txb = list_first_entry_or_null(&retrans_queue, struct rxrpc_txbuf, tx_link))) { list_del_init(&txb->tx_link); - set_bit(RXRPC_TXBUF_RESENT, &txb->flags); - rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans); rxrpc_send_data_packet(call, txb); rxrpc_put_txbuf(txb, rxrpc_txbuf_put_trans); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index a3ae2ab45f9e..91771031ad3c 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -162,6 +162,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, spin_lock_init(&call->notify_lock); spin_lock_init(&call->tx_lock); spin_lock_init(&call->input_lock); + spin_lock_init(&call->acks_ack_lock); rwlock_init(&call->state_lock); refcount_set(&call->ref, 1); call->debug_id = debug_id; @@ -701,6 +702,7 @@ void rxrpc_cleanup_call(struct rxrpc_call *call) rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned); } rxrpc_put_txbuf(call->tx_pending, rxrpc_txbuf_put_cleaned); + rxrpc_free_skb(call->acks_soft_tbl, rxrpc_skb_cleaned); call_rcu(&call->rcu, rxrpc_rcu_destroy_call); } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index e6e1267915de..5c17fed4b60f 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -60,7 +60,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, switch (call->cong_mode) { case RXRPC_CALL_SLOW_START: - if (summary->nr_nacks > 0) + if (summary->saw_nacks) goto packet_loss_detected; if (summary->cumulative_acks > 0) cwnd += 1; @@ -71,7 +71,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, goto out; case RXRPC_CALL_CONGEST_AVOIDANCE: - if (summary->nr_nacks > 0) + if (summary->saw_nacks) goto packet_loss_detected; /* We analyse the number of packets that get ACK'd per RTT @@ -90,7 +90,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, goto out; case RXRPC_CALL_PACKET_LOSS: - if (summary->nr_nacks == 0) + if (!summary->saw_nacks) goto resume_normality; if (summary->new_low_nack) { @@ -128,7 +128,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, } else { change = rxrpc_cong_progress; cwnd = call->cong_ssthresh; - if (summary->nr_nacks == 0) + if (!summary->saw_nacks) goto resume_normality; } goto out; @@ -189,8 +189,7 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, list_for_each_entry_rcu(txb, &call->tx_buffer, call_link, false) { if (before_eq(txb->seq, call->acks_hard_ack)) continue; - if (!test_bit(RXRPC_TXBUF_ACKED, &txb->flags)) - summary->nr_rot_new_acks++; + summary->nr_rot_new_acks++; if (test_bit(RXRPC_TXBUF_LAST, &txb->flags)) { set_bit(RXRPC_CALL_TX_LAST, &call->flags); rot_last = true; @@ -661,22 +660,8 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call, */ static void rxrpc_input_check_for_lost_ack(struct rxrpc_call *call) { - struct rxrpc_txbuf *txb; - rxrpc_seq_t top, bottom; - bool resend = false; - - bottom = READ_ONCE(call->acks_hard_ack) + 1; - top = READ_ONCE(call->acks_lost_top); - if (before(bottom, top)) { - list_for_each_entry_rcu(txb, &call->tx_buffer, call_link, false) { - if (test_bit(RXRPC_TXBUF_ACKED, &txb->flags)) - continue; - set_bit(RXRPC_TXBUF_RETRANS, &txb->flags); - resend = true; - } - } - - if (resend && !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) + if (after(call->acks_lost_top, call->acks_prev_seq) && + !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) rxrpc_queue_call(call); } @@ -749,41 +734,19 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, rxrpc_seq_t seq, int nr_acks, struct rxrpc_ack_summary *summary) { - struct rxrpc_txbuf *txb; + unsigned int i; - list_for_each_entry_rcu(txb, &call->tx_buffer, call_link, false) { - if (before(txb->seq, seq)) - continue; - if (after_eq(txb->seq, seq + nr_acks)) - break; - switch (acks[txb->seq - seq]) { - case RXRPC_ACK_TYPE_ACK: + for (i = 0; i < nr_acks; i++) { + if (acks[i] == RXRPC_ACK_TYPE_ACK) { summary->nr_acks++; - if (test_bit(RXRPC_TXBUF_ACKED, &txb->flags)) - continue; - /* A lot of the time the packet is going to - * have been ACK.'d already. - */ - clear_bit(RXRPC_TXBUF_NACKED, &txb->flags); - set_bit(RXRPC_TXBUF_ACKED, &txb->flags); summary->nr_new_acks++; - break; - case RXRPC_ACK_TYPE_NACK: - if (!summary->nr_nacks && - call->acks_lowest_nak != seq) { - call->acks_lowest_nak = seq; + } else { + if (!summary->saw_nacks && + call->acks_lowest_nak != seq + i) { + call->acks_lowest_nak = seq + i; summary->new_low_nack = true; } - summary->nr_nacks++; - if (test_bit(RXRPC_TXBUF_NACKED, &txb->flags)) - continue; - summary->nr_new_nacks++; - clear_bit(RXRPC_TXBUF_ACKED, &txb->flags); - set_bit(RXRPC_TXBUF_NACKED, &txb->flags); - set_bit(RXRPC_TXBUF_RETRANS, &txb->flags); - break; - default: - return rxrpc_proto_abort("SFT", call, 0); + summary->saw_nacks = true; } } } @@ -825,12 +788,10 @@ static bool rxrpc_is_ack_valid(struct rxrpc_call *call, static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_ack_summary summary = { 0 }; + struct rxrpc_ackpacket ack; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - union { - struct rxrpc_ackpacket ack; - struct rxrpc_ackinfo info; - u8 acks[RXRPC_MAXACKS]; - } buf; + struct rxrpc_ackinfo info; + struct sk_buff *skb_old = NULL, *skb_put = skb; rxrpc_serial_t ack_serial, acked_serial; rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt; int nr_acks, offset, ioffset; @@ -838,30 +799,28 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) _enter(""); offset = sizeof(struct rxrpc_wire_header); - if (skb_copy_bits(skb, offset, &buf.ack, sizeof(buf.ack)) < 0) { - _debug("extraction failure"); - return rxrpc_proto_abort("XAK", call, 0); + if (skb_copy_bits(skb, offset, &ack, sizeof(ack)) < 0) { + rxrpc_proto_abort("XAK", call, 0); + goto out_not_locked; } - offset += sizeof(buf.ack); + offset += sizeof(ack); ack_serial = sp->hdr.serial; - acked_serial = ntohl(buf.ack.serial); - first_soft_ack = ntohl(buf.ack.firstPacket); - prev_pkt = ntohl(buf.ack.previousPacket); + acked_serial = ntohl(ack.serial); + first_soft_ack = ntohl(ack.firstPacket); + prev_pkt = ntohl(ack.previousPacket); hard_ack = first_soft_ack - 1; - nr_acks = buf.ack.nAcks; - summary.ack_reason = (buf.ack.reason < RXRPC_ACK__INVALID ? - buf.ack.reason : RXRPC_ACK__INVALID); + nr_acks = ack.nAcks; + summary.ack_reason = (ack.reason < RXRPC_ACK__INVALID ? + ack.reason : RXRPC_ACK__INVALID); trace_rxrpc_rx_ack(call, ack_serial, acked_serial, first_soft_ack, prev_pkt, summary.ack_reason, nr_acks); - rxrpc_inc_stat(call->rxnet, stat_rx_acks[buf.ack.reason]); + rxrpc_inc_stat(call->rxnet, stat_rx_acks[ack.reason]); - switch (buf.ack.reason) { + switch (ack.reason) { case RXRPC_ACK_PING_RESPONSE: - rxrpc_input_ping_response(call, skb->tstamp, acked_serial, - ack_serial); rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, rxrpc_rtt_rx_ping_response); break; @@ -876,7 +835,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) break; } - if (buf.ack.reason == RXRPC_ACK_PING) { + if (ack.reason == RXRPC_ACK_PING) { _proto("Rx ACK %%%u PING Request", ack_serial); rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, ack_serial, rxrpc_propose_ack_respond_to_ping); @@ -889,7 +848,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) * indicates that the client address changed due to NAT. The server * lost the call because it switched to a different peer. */ - if (unlikely(buf.ack.reason == RXRPC_ACK_EXCEEDS_WINDOW) && + if (unlikely(ack.reason == RXRPC_ACK_EXCEEDS_WINDOW) && first_soft_ack == 1 && prev_pkt == 0 && rxrpc_is_client_call(call)) { @@ -902,7 +861,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) * indicate a change of address. However, we can retransmit the call * if we still have it buffered to the beginning. */ - if (unlikely(buf.ack.reason == RXRPC_ACK_OUT_OF_SEQUENCE) && + if (unlikely(ack.reason == RXRPC_ACK_OUT_OF_SEQUENCE) && first_soft_ack == 1 && prev_pkt == 0 && call->acks_hard_ack == 0 && @@ -917,14 +876,19 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) trace_rxrpc_rx_discard_ack(call->debug_id, ack_serial, first_soft_ack, call->acks_first_seq, prev_pkt, call->acks_prev_seq); - return; + goto out_not_locked; } - buf.info.rxMTU = 0; + info.rxMTU = 0; ioffset = offset + nr_acks + 3; - if (skb->len >= ioffset + sizeof(buf.info) && - skb_copy_bits(skb, ioffset, &buf.info, sizeof(buf.info)) < 0) - return rxrpc_proto_abort("XAI", call, 0); + if (skb->len >= ioffset + sizeof(info) && + skb_copy_bits(skb, ioffset, &info, sizeof(info)) < 0) { + rxrpc_proto_abort("XAI", call, 0); + goto out_not_locked; + } + + if (nr_acks > 0) + skb_condense(skb); spin_lock(&call->input_lock); @@ -940,13 +904,22 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) call->acks_first_seq = first_soft_ack; call->acks_prev_seq = prev_pkt; - if (buf.ack.reason != RXRPC_ACK_PING && - after(acked_serial, call->acks_highest_serial)) - call->acks_highest_serial = acked_serial; + switch (ack.reason) { + case RXRPC_ACK_PING: + break; + case RXRPC_ACK_PING_RESPONSE: + rxrpc_input_ping_response(call, skb->tstamp, acked_serial, + ack_serial); + fallthrough; + default: + if (after(acked_serial, call->acks_highest_serial)) + call->acks_highest_serial = acked_serial; + break; + } /* Parse rwind and mtu sizes if provided. */ - if (buf.info.rxMTU) - rxrpc_input_ackinfo(call, skb, &buf.info); + if (info.rxMTU) + rxrpc_input_ackinfo(call, skb, &info); if (first_soft_ack == 0) { rxrpc_proto_abort("AK0", call, 0); @@ -982,12 +955,24 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) } if (nr_acks > 0) { - if (skb_copy_bits(skb, offset, buf.acks, nr_acks) < 0) { + if (offset > (int)skb->len - nr_acks) { rxrpc_proto_abort("XSA", call, 0); goto out; } - rxrpc_input_soft_acks(call, buf.acks, first_soft_ack, nr_acks, - &summary); + + spin_lock(&call->acks_ack_lock); + skb_old = call->acks_soft_tbl; + call->acks_soft_tbl = skb; + spin_unlock(&call->acks_ack_lock); + + rxrpc_input_soft_acks(call, skb->data + offset, first_soft_ack, + nr_acks, &summary); + skb_put = NULL; + } else if (call->acks_soft_tbl) { + spin_lock(&call->acks_ack_lock); + skb_old = call->acks_soft_tbl; + call->acks_soft_tbl = NULL; + spin_unlock(&call->acks_ack_lock); } if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) && @@ -999,6 +984,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_congestion_management(call, skb, &summary, acked_serial); out: spin_unlock(&call->input_lock); +out_not_locked: + rxrpc_free_skb(skb_put, rxrpc_skb_freed); + rxrpc_free_skb(skb_old, rxrpc_skb_freed); } /* @@ -1071,7 +1059,7 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call, case RXRPC_PACKET_TYPE_ACK: rxrpc_input_ack(call, skb); - break; + goto no_free; case RXRPC_PACKET_TYPE_BUSY: _proto("Rx BUSY %%%u", sp->hdr.serial); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index a96ae7f58148..9b567aff3e84 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -600,7 +600,6 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, */ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) __releases(&rx->sk.sk_lock.slock) - __releases(&call->user_mutex) { enum rxrpc_call_state state; struct rxrpc_call *call; -- cgit v1.2.3-70-g09d2 From 6869ddb87d475bde2da0dbd4d71270996d65cd47 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 15 Jun 2022 14:49:26 +0100 Subject: rxrpc: Remove the rxtx ring The Rx/Tx ring is no longer used, so remove it. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 15 --------------- net/rxrpc/call_object.c | 24 ------------------------ 2 files changed, 39 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 168d03b56ada..775eb91aabb2 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -617,21 +617,6 @@ struct rxrpc_call { unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ unsigned short rx_pkt_len; /* Current recvmsg packet len */ - /* Rx/Tx circular buffer, depending on phase. - * - * In the Rx phase, packets are annotated with 0 or the number of the - * segment of a jumbo packet each buffer refers to. There can be up to - * 47 segments in a maximum-size UDP packet. - * - * In the Tx phase, packets are annotated with which buffers have been - * acked. - */ -#define RXRPC_RXTX_BUFF_SIZE 64 -#define RXRPC_RXTX_BUFF_MASK (RXRPC_RXTX_BUFF_SIZE - 1) -#define RXRPC_INIT_RX_WINDOW_SIZE 63 - struct sk_buff **rxtx_buffer; - u8 *rxtx_annotations; - /* Transmitted data tracking. */ spinlock_t tx_lock; /* Transmit queue lock */ struct list_head tx_buffer; /* Buffer of transmissible packets */ diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 91771031ad3c..aa19daaa487b 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -129,16 +129,6 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, if (!call) return NULL; - call->rxtx_buffer = kcalloc(RXRPC_RXTX_BUFF_SIZE, - sizeof(struct sk_buff *), - gfp); - if (!call->rxtx_buffer) - goto nomem; - - call->rxtx_annotations = kcalloc(RXRPC_RXTX_BUFF_SIZE, sizeof(u8), gfp); - if (!call->rxtx_annotations) - goto nomem_2; - mutex_init(&call->user_mutex); /* Prevent lockdep reporting a deadlock false positive between the afs @@ -183,12 +173,6 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, call->rtt_avail = RXRPC_CALL_RTT_AVAIL_MASK; atomic_inc(&rxnet->nr_calls); return call; - -nomem_2: - kfree(call->rxtx_buffer); -nomem: - kmem_cache_free(rxrpc_call_jar, call); - return NULL; } /* @@ -516,12 +500,6 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op) */ static void rxrpc_cleanup_ring(struct rxrpc_call *call) { - int i; - - for (i = 0; i < RXRPC_RXTX_BUFF_SIZE; i++) { - rxrpc_free_skb(call->rxtx_buffer[i], rxrpc_skb_cleaned); - call->rxtx_buffer[i] = NULL; - } skb_queue_purge(&call->recvmsg_queue); skb_queue_purge(&call->rx_oos_queue); } @@ -658,8 +636,6 @@ static void rxrpc_destroy_call(struct work_struct *work) rxrpc_put_connection(call->conn); rxrpc_put_peer(call->peer); - kfree(call->rxtx_buffer); - kfree(call->rxtx_annotations); kmem_cache_free(rxrpc_call_jar, call); if (atomic_dec_and_test(&rxnet->nr_calls)) wake_up_var(&rxnet->nr_calls); -- cgit v1.2.3-70-g09d2 From 1fc4fa2ac93dcf3542f2dc6f7ff88fb022da5116 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 3 Oct 2022 18:49:11 +0100 Subject: rxrpc: Fix congestion management rxrpc has a problem in its congestion management in that it saves the congestion window size (cwnd) from one call to another, but if this is 0 at the time is saved, then the next call may not actually manage to ever transmit anything. To this end: (1) Don't save cwnd between calls, but rather reset back down to the initial cwnd and re-enter slow-start if data transmission is idle for more than an RTT. (2) Preserve ssthresh instead, as that is a handy estimate of pipe capacity. Knowing roughly when to stop slow start and enter congestion avoidance can reduce the tendency to overshoot and drop larger amounts of packets when probing. In future, cwind growth also needs to be constrained when the window isn't being filled due to being application limited. Reported-by: Simon Wilkinson cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 1 + net/rxrpc/ar-internal.h | 9 +++++---- net/rxrpc/call_accept.c | 3 ++- net/rxrpc/call_object.c | 7 ++++++- net/rxrpc/conn_client.c | 3 ++- net/rxrpc/conn_object.c | 2 +- net/rxrpc/input.c | 21 ++++++++++++++++++++- net/rxrpc/output.c | 1 + net/rxrpc/peer_object.c | 7 +------ net/rxrpc/proc.c | 4 ++-- net/rxrpc/sendmsg.c | 22 +++++++++++++++++++--- 11 files changed, 60 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index a11de55c3c14..b9886d1df825 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -193,6 +193,7 @@ EM(rxrpc_cong_new_low_nack, " NewLowN") \ EM(rxrpc_cong_no_change, " -") \ EM(rxrpc_cong_progress, " Progres") \ + EM(rxrpc_cong_idle_reset, " IdleRes") \ EM(rxrpc_cong_retransmit_again, " ReTxAgn") \ EM(rxrpc_cong_rtt_window_end, " RttWinE") \ E_(rxrpc_cong_saw_nack, " SawNack") diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 775eb91aabb2..6bbe28ecf583 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -332,7 +332,7 @@ struct rxrpc_peer { u32 rto_j; /* Retransmission timeout in jiffies */ u8 backoff; /* Backoff timeout */ - u8 cong_cwnd; /* Congestion window size */ + u8 cong_ssthresh; /* Congestion slow-start threshold */ }; /* @@ -626,6 +626,7 @@ struct rxrpc_call { u16 tx_backoff; /* Delay to insert due to Tx failure */ u8 tx_winsize; /* Maximum size of Tx window */ #define RXRPC_TX_MAX_WINDOW 128 + ktime_t tx_last_sent; /* Last time a transmission occurred */ /* Received data tracking */ struct sk_buff_head recvmsg_queue; /* Queue of packets ready for recvmsg() */ @@ -687,10 +688,10 @@ struct rxrpc_call { * Summary of a new ACK and the changes it made to the Tx buffer packet states. */ struct rxrpc_ack_summary { + u16 nr_acks; /* Number of ACKs in packet */ + u16 nr_new_acks; /* Number of new ACKs in packet */ + u16 nr_rot_new_acks; /* Number of rotated new ACKs */ u8 ack_reason; - u8 nr_acks; /* Number of ACKs in packet */ - u8 nr_new_acks; /* Number of new ACKs in packet */ - u8 nr_rot_new_acks; /* Number of rotated new ACKs */ bool saw_nacks; /* Saw NACKs in packet */ bool new_low_nack; /* T if new low NACK found */ bool retrans_timeo; /* T if reTx due to timeout happened */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index d8db277d5ebe..48790ee77019 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -324,7 +324,8 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, call->security = conn->security; call->security_ix = conn->security_ix; call->peer = rxrpc_get_peer(conn->params.peer); - call->cong_cwnd = call->peer->cong_cwnd; + call->cong_ssthresh = call->peer->cong_ssthresh; + call->tx_last_sent = ktime_get_real(); return call; } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index aa19daaa487b..1befe22cd301 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -166,7 +166,12 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, call->rx_winsize = rxrpc_rx_window_size; call->tx_winsize = 16; - call->cong_cwnd = 2; + if (RXRPC_TX_SMSS > 2190) + call->cong_cwnd = 2; + else if (RXRPC_TX_SMSS > 1095) + call->cong_cwnd = 3; + else + call->cong_cwnd = 4; call->cong_ssthresh = RXRPC_TX_MAX_WINDOW; call->rxnet = rxnet; diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 3c9eeb5b750c..f020f308ed9e 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -363,7 +363,8 @@ static struct rxrpc_bundle *rxrpc_prep_call(struct rxrpc_sock *rx, if (!cp->peer) goto error; - call->cong_cwnd = cp->peer->cong_cwnd; + call->tx_last_sent = ktime_get_real(); + call->cong_ssthresh = cp->peer->cong_ssthresh; if (call->cong_cwnd >= call->cong_ssthresh) call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; else diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index f7ea71ae6159..156bd26daf74 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -207,7 +207,7 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) { struct rxrpc_connection *conn = call->conn; - call->peer->cong_cwnd = call->cong_cwnd; + call->peer->cong_ssthresh = call->cong_ssthresh; if (!hlist_unhashed(&call->error_link)) { spin_lock_bh(&call->peer->lock); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 5c17fed4b60f..bdf70b81addc 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -58,6 +58,25 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, summary->cumulative_acks = cumulative_acks; summary->dup_acks = call->cong_dup_acks; + /* If we haven't transmitted anything for >1RTT, we should reset the + * congestion management state. + */ + if ((call->cong_mode == RXRPC_CALL_SLOW_START || + call->cong_mode == RXRPC_CALL_CONGEST_AVOIDANCE) && + ktime_before(ktime_add_us(call->tx_last_sent, + call->peer->srtt_us >> 3), + ktime_get_real()) + ) { + change = rxrpc_cong_idle_reset; + summary->mode = RXRPC_CALL_SLOW_START; + if (RXRPC_TX_SMSS > 2190) + summary->cwnd = 2; + else if (RXRPC_TX_SMSS > 1095) + summary->cwnd = 3; + else + summary->cwnd = 4; + } + switch (call->cong_mode) { case RXRPC_CALL_SLOW_START: if (summary->saw_nacks) @@ -205,7 +224,7 @@ static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, if (call->acks_lowest_nak == call->acks_hard_ack) { call->acks_lowest_nak = to; - } else if (before_eq(call->acks_lowest_nak, to)) { + } else if (after(to, call->acks_lowest_nak)) { summary->new_low_nack = true; call->acks_lowest_nak = to; } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 2c3f7e4e30d7..46432e70a16b 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -501,6 +501,7 @@ dont_set_request_ack: done: if (ret >= 0) { + call->tx_last_sent = txb->last_sent; if (txb->wire.flags & RXRPC_REQUEST_ACK) { call->peer->rtt_last_req = txb->last_sent; if (call->peer->rtt_count > 1) { diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 26d2ae9baaf2..041a51225c5f 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -227,12 +227,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) rxrpc_peer_init_rtt(peer); - if (RXRPC_TX_SMSS > 2190) - peer->cong_cwnd = 2; - else if (RXRPC_TX_SMSS > 1095) - peer->cong_cwnd = 3; - else - peer->cong_cwnd = 4; + peer->cong_ssthresh = RXRPC_TX_MAX_WINDOW; trace_rxrpc_peer(peer->debug_id, rxrpc_peer_new, 1, here); } diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 0807753ec2dc..fae22a8b38d6 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -217,7 +217,7 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "Proto Local " " Remote " - " Use CW MTU LastUse RTT RTO\n" + " Use SST MTU LastUse RTT RTO\n" ); return 0; } @@ -235,7 +235,7 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v) lbuff, rbuff, refcount_read(&peer->ref), - peer->cong_cwnd, + peer->cong_ssthresh, peer->mtu, now - peer->last_tx_at, peer->srtt_us >> 3, diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 9b567aff3e84..e5fd8a95bf71 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -22,11 +22,27 @@ */ static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win) { - unsigned int win_size = - min_t(unsigned int, call->tx_winsize, - call->cong_cwnd + call->cong_extra); + unsigned int win_size; rxrpc_seq_t tx_win = smp_load_acquire(&call->acks_hard_ack); + /* If we haven't transmitted anything for >1RTT, we should reset the + * congestion management state. + */ + if (ktime_before(ktime_add_us(call->tx_last_sent, + call->peer->srtt_us >> 3), + ktime_get_real())) { + if (RXRPC_TX_SMSS > 2190) + win_size = 2; + else if (RXRPC_TX_SMSS > 1095) + win_size = 3; + else + win_size = 4; + win_size += call->cong_extra; + } else { + win_size = min_t(unsigned int, call->tx_winsize, + call->cong_cwnd + call->cong_extra); + } + if (_tx_win) *_tx_win = tx_win; return call->tx_top - tx_win < win_size; -- cgit v1.2.3-70-g09d2 From 30d95efe06e18bd55691902bb4ec873e4b21a754 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 4 Nov 2022 23:13:40 +0000 Subject: rxrpc: Allocate an skcipher each time needed rather than reusing In the rxkad security class, allocate the skcipher used to do packet encryption and decription rather than allocating one up front and reusing it for each packet. Reusing the skcipher precludes doing crypto in parallel. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 2 -- net/rxrpc/rxkad.c | 52 ++++++++++++++++++++++++++----------------------- 2 files changed, 28 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 6bbe28ecf583..0273a9029229 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -583,7 +583,6 @@ struct rxrpc_call { unsigned long expect_term_by; /* When we expect call termination by */ u32 next_rx_timo; /* Timeout for next Rx packet (jif) */ u32 next_req_timo; /* Timeout for next Rx request packet (jif) */ - struct skcipher_request *cipher_req; /* Packet cipher request buffer */ struct timer_list timer; /* Combined event timer */ struct work_struct processor; /* Event processor */ rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */ @@ -597,7 +596,6 @@ struct rxrpc_call { struct rxrpc_txbuf *tx_pending; /* Tx buffer being filled */ wait_queue_head_t waitq; /* Wait queue for channel or Tx */ s64 tx_total_len; /* Total length left to be transmitted (or -1) */ - __be32 crypto_buf[2]; /* Temporary packet crypto buffer */ unsigned long user_call_ID; /* user-defined call ID */ unsigned long flags; unsigned long events; diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 8fc055587f0e..2706e59bf992 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -233,16 +233,8 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn, static struct skcipher_request *rxkad_get_call_crypto(struct rxrpc_call *call) { struct crypto_skcipher *tfm = &call->conn->rxkad.cipher->base; - struct skcipher_request *cipher_req = call->cipher_req; - if (!cipher_req) { - cipher_req = skcipher_request_alloc(tfm, GFP_NOFS); - if (!cipher_req) - return NULL; - call->cipher_req = cipher_req; - } - - return cipher_req; + return skcipher_request_alloc(tfm, GFP_NOFS); } /* @@ -250,9 +242,6 @@ static struct skcipher_request *rxkad_get_call_crypto(struct rxrpc_call *call) */ static void rxkad_free_call_crypto(struct rxrpc_call *call) { - if (call->cipher_req) - skcipher_request_free(call->cipher_req); - call->cipher_req = NULL; } /* @@ -348,6 +337,9 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) struct skcipher_request *req; struct rxrpc_crypt iv; struct scatterlist sg; + union { + __be32 buf[2]; + } crypto __aligned(8); u32 x, y; int ret; @@ -372,17 +364,17 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) /* calculate the security checksum */ x = (ntohl(txb->wire.cid) & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT); x |= txb->seq & 0x3fffffff; - call->crypto_buf[0] = txb->wire.callNumber; - call->crypto_buf[1] = htonl(x); + crypto.buf[0] = txb->wire.callNumber; + crypto.buf[1] = htonl(x); - sg_init_one(&sg, call->crypto_buf, 8); + sg_init_one(&sg, crypto.buf, 8); skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); crypto_skcipher_encrypt(req); skcipher_request_zero(req); - y = ntohl(call->crypto_buf[1]); + y = ntohl(crypto.buf[1]); y = (y >> 16) & 0xffff; if (y == 0) y = 1; /* zero checksums are not permitted */ @@ -403,6 +395,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) break; } + skcipher_request_free(req); _leave(" = %d [set %x]", ret, y); return ret; } @@ -593,8 +586,12 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) struct skcipher_request *req; struct rxrpc_crypt iv; struct scatterlist sg; + union { + __be32 buf[2]; + } crypto __aligned(8); rxrpc_seq_t seq = sp->hdr.seq; bool aborted; + int ret; u16 cksum; u32 x, y; @@ -614,17 +611,17 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) /* validate the security checksum */ x = (call->cid & RXRPC_CHANNELMASK) << (32 - RXRPC_CIDSHIFT); x |= seq & 0x3fffffff; - call->crypto_buf[0] = htonl(call->call_id); - call->crypto_buf[1] = htonl(x); + crypto.buf[0] = htonl(call->call_id); + crypto.buf[1] = htonl(x); - sg_init_one(&sg, call->crypto_buf, 8); + sg_init_one(&sg, crypto.buf, 8); skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); skcipher_request_set_crypt(req, &sg, &sg, 8, iv.x); crypto_skcipher_encrypt(req); skcipher_request_zero(req); - y = ntohl(call->crypto_buf[1]); + y = ntohl(crypto.buf[1]); cksum = (y >> 16) & 0xffff; if (cksum == 0) cksum = 1; /* zero checksums are not permitted */ @@ -637,15 +634,22 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) switch (call->conn->params.security_level) { case RXRPC_SECURITY_PLAIN: - return 0; + ret = 0; + break; case RXRPC_SECURITY_AUTH: - return rxkad_verify_packet_1(call, skb, seq, req); + ret = rxkad_verify_packet_1(call, skb, seq, req); + break; case RXRPC_SECURITY_ENCRYPT: - return rxkad_verify_packet_2(call, skb, seq, req); + ret = rxkad_verify_packet_2(call, skb, seq, req); + break; default: - return -ENOANO; + ret = -ENOANO; + break; } + skcipher_request_free(req); + return ret; + protocol_error: if (aborted) rxrpc_send_abort_packet(call); -- cgit v1.2.3-70-g09d2 From bd039b5ea2a91ea707ee8539df26456bd5be80af Mon Sep 17 00:00:00 2001 From: Andy Ren Date: Mon, 7 Nov 2022 09:42:42 -0800 Subject: net/core: Allow live renaming when an interface is up Allow a network interface to be renamed when the interface is up. As described in the netconsole documentation [1], when netconsole is used as a built-in, it will bring up the specified interface as soon as possible. As a result, user space will not be able to rename the interface since the kernel disallows renaming of interfaces that are administratively up unless the 'IFF_LIVE_RENAME_OK' private flag was set by the kernel. The original solution [2] to this problem was to add a new parameter to the netconsole configuration parameters that allows renaming of the interface used by netconsole while it is administratively up. However, during the discussion that followed, it became apparent that we have no reason to keep the current restriction and instead we should allow user space to rename interfaces regardless of their administrative state: 1. The restriction was put in place over 20 years ago when renaming was only possible via IOCTL and before rtnetlink started notifying user space about such changes like it does today. 2. The 'IFF_LIVE_RENAME_OK' flag was added over 3 years ago in version 5.2 and no regressions were reported. 3. In-kernel listeners to 'NETDEV_CHANGENAME' do not seem to care about the administrative state of interface. Therefore, allow user space to rename running interfaces by removing the restriction and the associated 'IFF_LIVE_RENAME_OK' flag. Help in possible triage by emitting a message to the kernel log that an interface was renamed while UP. [1] https://www.kernel.org/doc/Documentation/networking/netconsole.rst [2] https://lore.kernel.org/netdev/20221102002420.2613004-1-andy.ren@getcruise.com/ Signed-off-by: Andy Ren Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/netdevice.h | 4 +--- net/core/dev.c | 19 ++----------------- net/core/failover.c | 6 +++--- 3 files changed, 6 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d45713a06568..4be87b89e481 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1650,7 +1650,6 @@ struct net_device_ops { * @IFF_FAILOVER: device is a failover master device * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device - * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with * skb_headlen(skb) == 0 (data starts from frag0) * @IFF_CHANGE_PROTO_DOWN: device supports setting carrier via IFLA_PROTO_DOWN @@ -1686,7 +1685,7 @@ enum netdev_priv_flags { IFF_FAILOVER = 1<<27, IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, - IFF_LIVE_RENAME_OK = 1<<30, + /* was IFF_LIVE_RENAME_OK */ IFF_TX_SKB_NO_LINEAR = BIT_ULL(31), IFF_CHANGE_PROTO_DOWN = BIT_ULL(32), }; @@ -1721,7 +1720,6 @@ enum netdev_priv_flags { #define IFF_FAILOVER IFF_FAILOVER #define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE #define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER -#define IFF_LIVE_RENAME_OK IFF_LIVE_RENAME_OK #define IFF_TX_SKB_NO_LINEAR IFF_TX_SKB_NO_LINEAR /* Specifies the type of the struct net_device::ml_priv pointer */ diff --git a/net/core/dev.c b/net/core/dev.c index 3bacee3bee78..707de6b841d0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1163,22 +1163,6 @@ int dev_change_name(struct net_device *dev, const char *newname) net = dev_net(dev); - /* Some auto-enslaved devices e.g. failover slaves are - * special, as userspace might rename the device after - * the interface had been brought up and running since - * the point kernel initiated auto-enslavement. Allow - * live name change even when these slave devices are - * up and running. - * - * Typically, users of these auto-enslaving devices - * don't actually care about slave name change, as - * they are supposed to operate on master interface - * directly. - */ - if (dev->flags & IFF_UP && - likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK))) - return -EBUSY; - down_write(&devnet_rename_sem); if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { @@ -1195,7 +1179,8 @@ int dev_change_name(struct net_device *dev, const char *newname) } if (oldname[0] && !strchr(oldname, '%')) - netdev_info(dev, "renamed from %s\n", oldname); + netdev_info(dev, "renamed from %s%s\n", oldname, + dev->flags & IFF_UP ? " (while UP)" : ""); old_assign_type = dev->name_assign_type; dev->name_assign_type = NET_NAME_RENAMED; diff --git a/net/core/failover.c b/net/core/failover.c index 864d2d83eff4..655411c4ca51 100644 --- a/net/core/failover.c +++ b/net/core/failover.c @@ -80,14 +80,14 @@ static int failover_slave_register(struct net_device *slave_dev) goto err_upper_link; } - slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK); + slave_dev->priv_flags |= IFF_FAILOVER_SLAVE; if (fops && fops->slave_register && !fops->slave_register(slave_dev, failover_dev)) return NOTIFY_OK; netdev_upper_dev_unlink(slave_dev, failover_dev); - slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK); + slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE; err_upper_link: netdev_rx_handler_unregister(slave_dev); done: @@ -121,7 +121,7 @@ int failover_slave_unregister(struct net_device *slave_dev) netdev_rx_handler_unregister(slave_dev); netdev_upper_dev_unlink(slave_dev, failover_dev); - slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_LIVE_RENAME_OK); + slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE; if (fops && fops->slave_unregister && !fops->slave_unregister(slave_dev, failover_dev)) -- cgit v1.2.3-70-g09d2 From 154ba79c9f160e652a2c9c46435b928b3bfae11f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Nov 2022 12:41:28 -0800 Subject: genetlink: correctly begin the iteration over policies The return value from genl_op_iter_init() only tells us if there are any policies but to begin the iteration (and therefore load the first entry) we need to call genl_op_iter_next(). Note that it's safe to call genl_op_iter_next() on a family with no ops, it will just return false. This may lead to various crashes, a warning in netlink_policy_dump_get_policy_idx() when policy is not found or.. no problem at all if the kmalloc'ed memory happens to be zeroed. Fixes: b502b3185cd6 ("genetlink: use iterator in the op to policy map dumping") Link: https://lore.kernel.org/r/20221108204128.330287-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/netlink/genetlink.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 362a61179036..9b7dfc45dd67 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1437,7 +1437,9 @@ static int ctrl_dumppolicy_start(struct netlink_callback *cb) ctx->op_iter = kmalloc(sizeof(*ctx->op_iter), GFP_KERNEL); if (!ctx->op_iter) return -ENOMEM; - ctx->dump_map = genl_op_iter_init(rt, ctx->op_iter); + + genl_op_iter_init(rt, ctx->op_iter); + ctx->dump_map = genl_op_iter_next(ctx->op_iter); for (genl_op_iter_init(rt, &i); genl_op_iter_next(&i); ) { if (i.doit.policy) { -- cgit v1.2.3-70-g09d2 From 3e52fba03a20234abc65a656cef063a1045d9723 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 8 Nov 2022 14:22:06 +0100 Subject: net: introduce a helper to move notifier block to different namespace Currently, net_dev() netdev notifier variant follows the netdev with per-net notifier from namespace to namespace. This is implemented by move_netdevice_notifiers_dev_net() helper. For devlink it is needed to re-register per-net notifier during devlink reload. Introduce a new helper called move_netdevice_notifier_net() and share the unregister/register code with existing move_netdevice_notifiers_dev_net() helper. Signed-off-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 22 ++++++++++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4be87b89e481..02a2318da7c7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2826,6 +2826,8 @@ int unregister_netdevice_notifier(struct notifier_block *nb); int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb); int unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb); +void move_netdevice_notifier_net(struct net *src_net, struct net *dst_net, + struct notifier_block *nb); int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn); diff --git a/net/core/dev.c b/net/core/dev.c index 707de6b841d0..117e830cabb0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1861,6 +1861,22 @@ int unregister_netdevice_notifier_net(struct net *net, } EXPORT_SYMBOL(unregister_netdevice_notifier_net); +static void __move_netdevice_notifier_net(struct net *src_net, + struct net *dst_net, + struct notifier_block *nb) +{ + __unregister_netdevice_notifier_net(src_net, nb); + __register_netdevice_notifier_net(dst_net, nb, true); +} + +void move_netdevice_notifier_net(struct net *src_net, struct net *dst_net, + struct notifier_block *nb) +{ + rtnl_lock(); + __move_netdevice_notifier_net(src_net, dst_net, nb); + rtnl_unlock(); +} + int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) @@ -1897,10 +1913,8 @@ static void move_netdevice_notifiers_dev_net(struct net_device *dev, { struct netdev_net_notifier *nn; - list_for_each_entry(nn, &dev->net_notifier_list, list) { - __unregister_netdevice_notifier_net(dev_net(dev), nn->nb); - __register_netdevice_notifier_net(net, nn->nb, true); - } + list_for_each_entry(nn, &dev->net_notifier_list, list) + __move_netdevice_notifier_net(dev_net(dev), net, nn->nb); } /** -- cgit v1.2.3-70-g09d2 From 15feb56e30efea95992f5c572cee753db205eb7b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 8 Nov 2022 14:22:07 +0100 Subject: net: devlink: move netdev notifier block to dest namespace during reload The notifier block tracking netdev changes in devlink is registered during devlink_alloc() per-net, it is then unregistered in devlink_free(). When devlink moves from net namespace to another one, the notifier block needs to move along. Fix this by adding forgotten call to move the block. Reported-by: Ido Schimmel Fixes: 02a68a47eade ("net: devlink: track netdev with devlink_port assigned") Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 40fcdded57e6..ea0b319385fc 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4502,8 +4502,11 @@ static int devlink_reload(struct devlink *devlink, struct net *dest_net, if (err) return err; - if (dest_net && !net_eq(dest_net, curr_net)) + if (dest_net && !net_eq(dest_net, curr_net)) { + move_netdevice_notifier_net(curr_net, dest_net, + &devlink->netdevice_nb); write_pnet(&devlink->_net, dest_net); + } err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack); devlink_reload_failed_set(devlink, !!err); -- cgit v1.2.3-70-g09d2 From 9baedc3c8780256bfc68d318c31d0081cefe92a3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Nov 2022 11:47:07 +0100 Subject: bridge: switchdev: Let device drivers determine FDB offload indication Currently, FDB entries that are notified to the bridge via 'SWITCHDEV_FDB_ADD_TO_BRIDGE' are always marked as offloaded. With MAB enabled, this will no longer be universally true. Device drivers will report locked FDB entries to the bridge to let it know that the corresponding hosts required authorization, but it does not mean that these entries are necessarily programmed in the underlying hardware. Solve this by determining the offload indication based of the 'offloaded' bit in the FDB notification. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Petr Machata Reviewed-by: Vladimir Oltean Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- net/bridge/br.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br.c b/net/bridge/br.c index 96e91d69a9a8..145999b8c355 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -172,7 +172,7 @@ static int br_switchdev_event(struct notifier_block *unused, break; } br_fdb_offloaded_set(br, p, fdb_info->addr, - fdb_info->vid, true); + fdb_info->vid, fdb_info->offloaded); break; case SWITCHDEV_FDB_DEL_TO_BRIDGE: fdb_info = ptr; -- cgit v1.2.3-70-g09d2 From 27fabd02abf30a9df9899f92d467591c7eabb1ba Mon Sep 17 00:00:00 2001 From: "Hans J. Schultz" Date: Tue, 8 Nov 2022 11:47:08 +0100 Subject: bridge: switchdev: Allow device drivers to install locked FDB entries When the bridge is offloaded to hardware, FDB entries are learned and aged-out by the hardware. Some device drivers synchronize the hardware and software FDBs by generating switchdev events towards the bridge. When a port is locked, the hardware must not learn autonomously, as otherwise any host will blindly gain authorization. Instead, the hardware should generate events regarding hosts that are trying to gain authorization and their MAC addresses should be notified by the device driver as locked FDB entries towards the bridge driver. Allow device drivers to notify the bridge driver about such entries by extending the 'switchdev_notifier_fdb_info' structure with the 'locked' bit. The bit can only be set by device drivers and not by the bridge driver. Prevent a locked entry from being installed if MAB is not enabled on the bridge port. If an entry already exists in the bridge driver, reject the locked entry if the current entry does not have the "locked" flag set or if it points to a different port. The same semantics are implemented in the software data path. Signed-off-by: Hans J. Schultz Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Petr Machata Reviewed-by: Vladimir Oltean Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/net/switchdev.h | 1 + net/bridge/br.c | 3 ++- net/bridge/br_fdb.c | 22 ++++++++++++++++++++-- net/bridge/br_private.h | 2 +- net/bridge/br_switchdev.c | 4 ++++ 5 files changed, 28 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 7dcdc97c0bc3..ca0312b78294 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -248,6 +248,7 @@ struct switchdev_notifier_fdb_info { u16 vid; u8 added_by_user:1, is_local:1, + locked:1, offloaded:1; }; diff --git a/net/bridge/br.c b/net/bridge/br.c index 145999b8c355..4f5098d33a46 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -166,7 +166,8 @@ static int br_switchdev_event(struct notifier_block *unused, case SWITCHDEV_FDB_ADD_TO_BRIDGE: fdb_info = ptr; err = br_fdb_external_learn_add(br, p, fdb_info->addr, - fdb_info->vid, false); + fdb_info->vid, + fdb_info->locked, false); if (err) { err = notifier_from_errno(err); break; diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 3b83af4458b8..e69a872bfc1d 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -1139,7 +1139,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, "FDB entry towards bridge must be permanent"); return -EINVAL; } - err = br_fdb_external_learn_add(br, p, addr, vid, true); + err = br_fdb_external_learn_add(br, p, addr, vid, false, true); } else { spin_lock_bh(&br->hash_lock); err = fdb_add_entry(br, p, addr, ndm, nlh_flags, vid, nfea_tb); @@ -1377,7 +1377,7 @@ void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p) } int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, - const unsigned char *addr, u16 vid, + const unsigned char *addr, u16 vid, bool locked, bool swdev_notify) { struct net_bridge_fdb_entry *fdb; @@ -1386,6 +1386,9 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, trace_br_fdb_external_learn_add(br, p, addr, vid); + if (locked && (!p || !(p->flags & BR_PORT_MAB))) + return -EINVAL; + spin_lock_bh(&br->hash_lock); fdb = br_fdb_find(br, addr, vid); @@ -1398,6 +1401,9 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, if (!p) flags |= BIT(BR_FDB_LOCAL); + if (locked) + flags |= BIT(BR_FDB_LOCKED); + fdb = fdb_create(br, p, addr, vid, flags); if (!fdb) { err = -ENOMEM; @@ -1405,6 +1411,13 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, } fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify); } else { + if (locked && + (!test_bit(BR_FDB_LOCKED, &fdb->flags) || + READ_ONCE(fdb->dst) != p)) { + err = -EINVAL; + goto err_unlock; + } + fdb->updated = jiffies; if (READ_ONCE(fdb->dst) != p) { @@ -1421,6 +1434,11 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, modified = true; } + if (locked != test_bit(BR_FDB_LOCKED, &fdb->flags)) { + change_bit(BR_FDB_LOCKED, &fdb->flags); + modified = true; + } + if (swdev_notify) set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 4ce8b8e5ae0b..4c4fda930068 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -811,7 +811,7 @@ int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p); void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p); int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid, - bool swdev_notify); + bool locked, bool swdev_notify); int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid, bool swdev_notify); diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 8f3d76c751dd..8a0abe35137d 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -136,6 +136,7 @@ static void br_switchdev_fdb_populate(struct net_bridge *br, item->added_by_user = test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); item->offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags); item->is_local = test_bit(BR_FDB_LOCAL, &fdb->flags); + item->locked = false; item->info.dev = (!p || item->is_local) ? br->dev : p->dev; item->info.ctx = ctx; } @@ -146,6 +147,9 @@ br_switchdev_fdb_notify(struct net_bridge *br, { struct switchdev_notifier_fdb_info item; + if (test_bit(BR_FDB_LOCKED, &fdb->flags)) + return; + br_switchdev_fdb_populate(br, &item, fdb, NULL); switch (type) { -- cgit v1.2.3-70-g09d2 From 9c0ca02bace4837d123e1a5a30f6f44dbdc5fb92 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Nov 2022 11:47:09 +0100 Subject: bridge: switchdev: Reflect MAB bridge port flag to device drivers Reflect the 'BR_PORT_MAB' flag to device drivers so that: * Drivers that support MAB could act upon the flag being toggled. * Drivers that do not support MAB will prevent MAB from being enabled. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Petr Machata Reviewed-by: Vladimir Oltean Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- net/bridge/br_switchdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 8a0abe35137d..7eb6fd5bb917 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -71,7 +71,7 @@ bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, } /* Flags that can be offloaded to hardware */ -#define BR_PORT_FLAGS_HW_OFFLOAD (BR_LEARNING | BR_FLOOD | \ +#define BR_PORT_FLAGS_HW_OFFLOAD (BR_LEARNING | BR_FLOOD | BR_PORT_MAB | \ BR_MCAST_FLOOD | BR_BCAST_FLOOD | BR_PORT_LOCKED | \ BR_HAIRPIN_MODE | BR_ISOLATED | BR_MULTICAST_TO_UNICAST) -- cgit v1.2.3-70-g09d2 From 2640a82bbc08393c846c7b55178079bb8ca31a8c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Nov 2022 11:47:10 +0100 Subject: devlink: Add packet traps for 802.1X operation Add packet traps for 802.1X operation. The "eapol" control trap is used to trap EAPOL packets and is required for the correct operation of the control plane. The "locked_port" drop trap can be enabled to gain visibility into packets that were dropped by the device due to the locked bridge port check. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Petr Machata Signed-off-by: Jakub Kicinski --- Documentation/networking/devlink/devlink-trap.rst | 13 +++++++++++++ include/net/devlink.h | 9 +++++++++ net/core/devlink.c | 3 +++ 3 files changed, 25 insertions(+) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index 90d1381b88de..2c14dfe69b3a 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -485,6 +485,16 @@ be added to the following table: - Traps incoming packets that the device decided to drop because the destination MAC is not configured in the MAC table and the interface is not in promiscuous mode + * - ``eapol`` + - ``control`` + - Traps "Extensible Authentication Protocol over LAN" (EAPOL) packets + specified in IEEE 802.1X + * - ``locked_port`` + - ``drop`` + - Traps packets that the device decided to drop because they failed the + locked bridge port check. That is, packets that were received via a + locked port and whose {SMAC, VID} does not correspond to an FDB entry + pointing to the port Driver-specific Packet Traps ============================ @@ -589,6 +599,9 @@ narrow. The description of these groups must be added to the following table: * - ``parser_error_drops`` - Contains packet traps for packets that were marked by the device during parsing as erroneous + * - ``eapol`` + - Contains packet traps for "Extensible Authentication Protocol over LAN" + (EAPOL) packets specified in IEEE 802.1X Packet Trap Policers ==================== diff --git a/include/net/devlink.h b/include/net/devlink.h index fa6e936af1a5..611a23a3deb2 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -894,6 +894,8 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_ESP_PARSING, DEVLINK_TRAP_GENERIC_ID_BLACKHOLE_NEXTHOP, DEVLINK_TRAP_GENERIC_ID_DMAC_FILTER, + DEVLINK_TRAP_GENERIC_ID_EAPOL, + DEVLINK_TRAP_GENERIC_ID_LOCKED_PORT, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -930,6 +932,7 @@ enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_SAMPLE, DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_TRAP, DEVLINK_TRAP_GROUP_GENERIC_ID_PARSER_ERROR_DROPS, + DEVLINK_TRAP_GROUP_GENERIC_ID_EAPOL, /* Add new generic trap group IDs above */ __DEVLINK_TRAP_GROUP_GENERIC_ID_MAX, @@ -1121,6 +1124,10 @@ enum devlink_trap_group_generic_id { "blackhole_nexthop" #define DEVLINK_TRAP_GENERIC_NAME_DMAC_FILTER \ "dmac_filter" +#define DEVLINK_TRAP_GENERIC_NAME_EAPOL \ + "eapol" +#define DEVLINK_TRAP_GENERIC_NAME_LOCKED_PORT \ + "locked_port" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" @@ -1174,6 +1181,8 @@ enum devlink_trap_group_generic_id { "acl_trap" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_PARSER_ERROR_DROPS \ "parser_error_drops" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_EAPOL \ + "eapol" #define DEVLINK_TRAP_GENERIC(_type, _init_action, _id, _group_id, \ _metadata_cap) \ diff --git a/net/core/devlink.c b/net/core/devlink.c index ea0b319385fc..6bbe230c4ec5 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -11734,6 +11734,8 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(ESP_PARSING, DROP), DEVLINK_TRAP(BLACKHOLE_NEXTHOP, DROP), DEVLINK_TRAP(DMAC_FILTER, DROP), + DEVLINK_TRAP(EAPOL, CONTROL), + DEVLINK_TRAP(LOCKED_PORT, DROP), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -11769,6 +11771,7 @@ static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(ACL_SAMPLE), DEVLINK_TRAP_GROUP(ACL_TRAP), DEVLINK_TRAP_GROUP(PARSER_ERROR_DROPS), + DEVLINK_TRAP_GROUP(EAPOL), }; static int devlink_trap_generic_verify(const struct devlink_trap *trap) -- cgit v1.2.3-70-g09d2 From e081ecf084d31809242fb0b9f35484d5fb3a161a Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Tue, 8 Nov 2022 13:33:28 +0100 Subject: gro: avoid checking for a failed search After searching for a protocol handler in dev_gro_receive, checking for failure is redundant. Skip the failure code after finding the corresponding handler. Suggested-by: Eric Dumazet Signed-off-by: Richard Gobert Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20221108123320.GA59373@debian Signed-off-by: Paolo Abeni --- net/core/gro.c | 70 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 35 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/net/core/gro.c b/net/core/gro.c index bc9451743307..8e0fe85a647d 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -489,45 +489,45 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { - if (ptype->type != type || !ptype->callbacks.gro_receive) - continue; - - skb_set_network_header(skb, skb_gro_offset(skb)); - skb_reset_mac_len(skb); - BUILD_BUG_ON(sizeof_field(struct napi_gro_cb, zeroed) != sizeof(u32)); - BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed), - sizeof(u32))); /* Avoid slow unaligned acc */ - *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; - NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); - NAPI_GRO_CB(skb)->is_atomic = 1; - NAPI_GRO_CB(skb)->count = 1; - if (unlikely(skb_is_gso(skb))) { - NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs; - /* Only support TCP at the moment. */ - if (!skb_is_gso_tcp(skb)) - NAPI_GRO_CB(skb)->flush = 1; - } - - /* Setup for GRO checksum validation */ - switch (skb->ip_summed) { - case CHECKSUM_COMPLETE: - NAPI_GRO_CB(skb)->csum = skb->csum; - NAPI_GRO_CB(skb)->csum_valid = 1; - break; - case CHECKSUM_UNNECESSARY: - NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; - break; - } + if (ptype->type == type && ptype->callbacks.gro_receive) + goto found_ptype; + } + rcu_read_unlock(); + goto normal; + +found_ptype: + skb_set_network_header(skb, skb_gro_offset(skb)); + skb_reset_mac_len(skb); + BUILD_BUG_ON(sizeof_field(struct napi_gro_cb, zeroed) != sizeof(u32)); + BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed), + sizeof(u32))); /* Avoid slow unaligned acc */ + *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; + NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); + NAPI_GRO_CB(skb)->is_atomic = 1; + NAPI_GRO_CB(skb)->count = 1; + if (unlikely(skb_is_gso(skb))) { + NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs; + /* Only support TCP at the moment. */ + if (!skb_is_gso_tcp(skb)) + NAPI_GRO_CB(skb)->flush = 1; + } - pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, - ipv6_gro_receive, inet_gro_receive, - &gro_list->list, skb); + /* Setup for GRO checksum validation */ + switch (skb->ip_summed) { + case CHECKSUM_COMPLETE: + NAPI_GRO_CB(skb)->csum = skb->csum; + NAPI_GRO_CB(skb)->csum_valid = 1; + break; + case CHECKSUM_UNNECESSARY: + NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; break; } - rcu_read_unlock(); - if (&ptype->list == head) - goto normal; + pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, + ipv6_gro_receive, inet_gro_receive, + &gro_list->list, skb); + + rcu_read_unlock(); if (PTR_ERR(pp) == -EINPROGRESS) { ret = GRO_CONSUMED; -- cgit v1.2.3-70-g09d2 From edaf5df22cb8e7e849773ce69fcc9bc20ca92160 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 8 Nov 2022 12:57:54 +0900 Subject: ethtool: ethtool_get_drvinfo: populate drvinfo fields even if callback exits If ethtool_ops::get_drvinfo() callback isn't set, ethtool_get_drvinfo() will fill the ethtool_drvinfo::name and ethtool_drvinfo::bus_info fields. However, if the driver provides the callback function, those two fields are not touched. This means that the driver has to fill these itself. Allow the driver to leave those two fields empty and populate them in such case. This way, the driver can rely on the default values for the name and the bus_info. If the driver provides values, do nothing. Signed-off-by: Vincent Mailhol Link: https://lore.kernel.org/r/20221108035754.2143-1-mailhol.vincent@wanadoo.fr Signed-off-by: Jakub Kicinski --- net/ethtool/ioctl.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index e40f5e9e109b..99272a67525c 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -706,15 +706,22 @@ static int ethtool_get_drvinfo(struct net_device *dev, struct ethtool_devlink_compat *rsp) { const struct ethtool_ops *ops = dev->ethtool_ops; + struct device *parent = dev->dev.parent; rsp->info.cmd = ETHTOOL_GDRVINFO; strscpy(rsp->info.version, UTS_RELEASE, sizeof(rsp->info.version)); if (ops->get_drvinfo) { ops->get_drvinfo(dev, &rsp->info); - } else if (dev->dev.parent && dev->dev.parent->driver) { - strscpy(rsp->info.bus_info, dev_name(dev->dev.parent), + if (!rsp->info.bus_info[0] && parent) + strscpy(rsp->info.bus_info, dev_name(parent), + sizeof(rsp->info.bus_info)); + if (!rsp->info.driver[0] && parent && parent->driver) + strscpy(rsp->info.driver, parent->driver->name, + sizeof(rsp->info.driver)); + } else if (parent && parent->driver) { + strscpy(rsp->info.bus_info, dev_name(parent), sizeof(rsp->info.bus_info)); - strscpy(rsp->info.driver, dev->dev.parent->driver->name, + strscpy(rsp->info.driver, parent->driver->name, sizeof(rsp->info.driver)); } else if (dev->rtnl_link_ops) { strscpy(rsp->info.driver, dev->rtnl_link_ops->kind, -- cgit v1.2.3-70-g09d2 From 1fb22ed671950bbc0c402034d206fcb334d1fd3a Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 10 Nov 2022 10:51:50 +0200 Subject: devlink: Fix warning when unregistering a port When a devlink port is unregistered, its type is expected to be unset or otherwise a WARNING is generated [1]. This was supposed to be handled by cited commit by clearing the type upon 'NETDEV_PRE_UNINIT'. The assumption was that no other events can be generated for the netdev after this event, but this proved to be wrong. After the event is generated, netdev_wait_allrefs_any() will rebroadcast a 'NETDEV_UNREGISTER' until the netdev's reference count drops to 1. This causes devlink to set the port type back to Ethernet. Fix by only setting and clearing the port type upon 'NETDEV_POST_INIT' and 'NETDEV_PRE_UNINIT', respectively. For all other events, preserve the port type. [1] WARNING: CPU: 0 PID: 11 at net/core/devlink.c:9998 devl_port_unregister+0x2f6/0x390 net/core/devlink.c:9998 Modules linked in: CPU: 1 PID: 11 Comm: kworker/u4:1 Not tainted 6.1.0-rc3-next-20221107-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 Workqueue: netns cleanup_net RIP: 0010:devl_port_unregister+0x2f6/0x390 net/core/devlink.c:9998 [...] Call Trace: __nsim_dev_port_del+0x1bb/0x240 drivers/net/netdevsim/dev.c:1433 nsim_dev_port_del_all drivers/net/netdevsim/dev.c:1443 [inline] nsim_dev_reload_destroy+0x171/0x510 drivers/net/netdevsim/dev.c:1660 nsim_dev_reload_down+0x6b/0xd0 drivers/net/netdevsim/dev.c:968 devlink_reload+0x1c2/0x6b0 net/core/devlink.c:4501 devlink_pernet_pre_exit+0x104/0x1c0 net/core/devlink.c:12609 ops_pre_exit_list net/core/net_namespace.c:159 [inline] cleanup_net+0x451/0xb10 net/core/net_namespace.c:594 process_one_work+0x9bf/0x1710 kernel/workqueue.c:2289 worker_thread+0x665/0x1080 kernel/workqueue.c:2436 kthread+0x2e4/0x3a0 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 Fixes: 02a68a47eade ("net: devlink: track netdev with devlink_port assigned") Reported-by: syzbot+85e47e1a08b3e159b159@syzkaller.appspotmail.com Reported-by: syzbot+c2ca18f0fccdd1f09c66@syzkaller.appspotmail.com Reviewed-by: Jiri Pirko Signed-off-by: Ido Schimmel Link: https://lore.kernel.org/r/20221110085150.520800-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 6bbe230c4ec5..7f789bbcbbd7 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -10177,7 +10177,7 @@ static int devlink_netdevice_event(struct notifier_block *nb, * we take into account netdev pointer appearing in this * namespace. */ - __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, + __devlink_port_type_set(devlink_port, devlink_port->type, netdev); break; case NETDEV_UNREGISTER: @@ -10185,7 +10185,7 @@ static int devlink_netdevice_event(struct notifier_block *nb, * also during net namespace change so we need to clear * pointer to netdev that is going to another net namespace. */ - __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, + __devlink_port_type_set(devlink_port, devlink_port->type, NULL); break; case NETDEV_PRE_UNINIT: -- cgit v1.2.3-70-g09d2 From c1b05105573b2cd5845921eb0d2caa26e2144a34 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 9 Nov 2022 10:32:54 -0800 Subject: genetlink: fix single op policy dump when do is present Jonathan reports crashes when running net-next in Meta's fleet. Stats collection uses ethtool -I which does a per-op policy dump to check if stats are supported. We don't initialize the dumpit information if doit succeeds due to evaluation short-circuiting. The crash may look like this: BUG: kernel NULL pointer dereference, address: 0000000000000cc0 RIP: 0010:netlink_policy_dump_add_policy+0x174/0x2a0 ctrl_dumppolicy_start+0x19f/0x2f0 genl_start+0xe7/0x140 Or we may trigger a warning: WARNING: CPU: 1 PID: 785 at net/netlink/policy.c:87 netlink_policy_dump_get_policy_idx+0x79/0x80 RIP: 0010:netlink_policy_dump_get_policy_idx+0x79/0x80 ctrl_dumppolicy_put_op+0x214/0x360 depending on what garbage we pick up from the stack. Reported-by: Jonathan Lemon Fixes: 26588edbef60 ("genetlink: support split policies in ctrl_dumppolicy_put_op()") Reviewed-by: Jacob Keller Tested-by: Leon Romanovsky Link: https://lore.kernel.org/r/20221109183254.554051-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/netlink/genetlink.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 9b7dfc45dd67..600993c80050 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -282,6 +282,7 @@ genl_cmd_full_to_split(struct genl_split_ops *op, return 0; } +/* Must make sure that op is initialized to 0 on failure */ static int genl_get_cmd(u32 cmd, u8 flags, const struct genl_family *family, struct genl_split_ops *op) @@ -302,6 +303,21 @@ genl_get_cmd(u32 cmd, u8 flags, const struct genl_family *family, return err; } +/* For policy dumping only, get ops of both do and dump. + * Fail if both are missing, genl_get_cmd() will zero-init in case of failure. + */ +static int +genl_get_cmd_both(u32 cmd, const struct genl_family *family, + struct genl_split_ops *doit, struct genl_split_ops *dumpit) +{ + int err1, err2; + + err1 = genl_get_cmd(cmd, GENL_CMD_CAP_DO, family, doit); + err2 = genl_get_cmd(cmd, GENL_CMD_CAP_DUMP, family, dumpit); + + return err1 && err2 ? -ENOENT : 0; +} + static bool genl_op_iter_init(const struct genl_family *family, struct genl_op_iter *iter) { @@ -1406,10 +1422,10 @@ static int ctrl_dumppolicy_start(struct netlink_callback *cb) ctx->single_op = true; ctx->op = nla_get_u32(tb[CTRL_ATTR_OP]); - if (genl_get_cmd(ctx->op, GENL_CMD_CAP_DO, rt, &doit) && - genl_get_cmd(ctx->op, GENL_CMD_CAP_DUMP, rt, &dump)) { + err = genl_get_cmd_both(ctx->op, rt, &doit, &dump); + if (err) { NL_SET_BAD_ATTR(cb->extack, tb[CTRL_ATTR_OP]); - return -ENOENT; + return err; } if (doit.policy) { @@ -1551,13 +1567,9 @@ static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb) if (ctx->single_op) { struct genl_split_ops doit, dumpit; - if (genl_get_cmd(ctx->op, GENL_CMD_CAP_DO, - ctx->rt, &doit) && - genl_get_cmd(ctx->op, GENL_CMD_CAP_DUMP, - ctx->rt, &dumpit)) { - WARN_ON(1); + if (WARN_ON(genl_get_cmd_both(ctx->op, ctx->rt, + &doit, &dumpit))) return -ENOENT; - } if (ctrl_dumppolicy_put_op(skb, cb, &doit, &dumpit)) return skb->len; -- cgit v1.2.3-70-g09d2 From 9bb053490f1a5a0914eb9f7b4116a0e4a95d4f8e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Mon, 7 Nov 2022 15:04:18 -0800 Subject: bpf: Add hwtstamp field for the sockops prog The bpf-tc prog has already been able to access the skb_hwtstamps(skb)->hwtstamp. This patch extends the same hwtstamp access to the sockops prog. In sockops, the skb is also available to the bpf prog during the BPF_SOCK_OPS_PARSE_HDR_OPT_CB event. There is a use case that the hwtstamp will be useful to the sockops prog to better measure the one-way-delay when the sender has put the tx timestamp in the tcp header option. Signed-off-by: Martin KaFai Lau Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20221107230420.4192307-2-martin.lau@linux.dev --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 39 +++++++++++++++++++++++++++++++-------- tools/include/uapi/linux/bpf.h | 1 + 3 files changed, 33 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 94659f6b3395..fb4c911d2a03 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6445,6 +6445,7 @@ struct bpf_sock_ops { * the outgoing header has not * been written yet. */ + __u64 skb_hwtstamp; }; /* Definitions for bpf_sock_ops_cb_flags */ diff --git a/net/core/filter.c b/net/core/filter.c index cb3b635e35be..cd667cdbdb26 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8925,6 +8925,10 @@ static bool sock_ops_is_valid_access(int off, int size, bpf_ctx_record_field_size(info, size_default); return bpf_ctx_narrow_access_ok(off, size, size_default); + case offsetof(struct bpf_sock_ops, skb_hwtstamp): + if (size != sizeof(__u64)) + return false; + break; default: if (size != size_default) return false; @@ -9108,21 +9112,21 @@ static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si, return insn; } -static struct bpf_insn *bpf_convert_shinfo_access(const struct bpf_insn *si, +static struct bpf_insn *bpf_convert_shinfo_access(__u8 dst_reg, __u8 skb_reg, struct bpf_insn *insn) { /* si->dst_reg = skb_shinfo(SKB); */ #ifdef NET_SKBUFF_DATA_USES_OFFSET *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), - BPF_REG_AX, si->src_reg, + BPF_REG_AX, skb_reg, offsetof(struct sk_buff, end)); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), - si->dst_reg, si->src_reg, + dst_reg, skb_reg, offsetof(struct sk_buff, head)); - *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); + *insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, BPF_REG_AX); #else *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), - si->dst_reg, si->src_reg, + dst_reg, skb_reg, offsetof(struct sk_buff, end)); #endif @@ -9515,7 +9519,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct __sk_buff, gso_segs): - insn = bpf_convert_shinfo_access(si, insn); + insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs), si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, @@ -9523,7 +9527,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, target_size)); break; case offsetof(struct __sk_buff, gso_size): - insn = bpf_convert_shinfo_access(si, insn); + insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size), si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, @@ -9550,7 +9554,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, BUILD_BUG_ON(sizeof_field(struct skb_shared_hwtstamps, hwtstamp) != 8); BUILD_BUG_ON(offsetof(struct skb_shared_hwtstamps, hwtstamp) != 0); - insn = bpf_convert_shinfo_access(si, insn); + insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn); *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, bpf_target_off(struct skb_shared_info, @@ -10400,6 +10404,25 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, tcp_flags), si->dst_reg, si->dst_reg, off); break; + case offsetof(struct bpf_sock_ops, skb_hwtstamp): { + struct bpf_insn *jmp_on_null_skb; + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern, + skb), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + skb)); + /* Reserve one insn to test skb == NULL */ + jmp_on_null_skb = insn++; + insn = bpf_convert_shinfo_access(si->dst_reg, si->dst_reg, insn); + *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, + bpf_target_off(struct skb_shared_info, + hwtstamps, 8, + target_size)); + *jmp_on_null_skb = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, + insn - jmp_on_null_skb - 1); + break; + } } return insn - insn_buf; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 94659f6b3395..fb4c911d2a03 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -6445,6 +6445,7 @@ struct bpf_sock_ops { * the outgoing header has not * been written yet. */ + __u64 skb_hwtstamp; }; /* Definitions for bpf_sock_ops_cb_flags */ -- cgit v1.2.3-70-g09d2 From 354259fa73e2aac92ae5e19522adb69a92c15b49 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Nov 2022 09:57:58 +0000 Subject: net: remove skb->vlan_present skb->vlan_present seems redundant. We can instead derive it from this boolean expression: vlan_present = skb->vlan_proto != 0 || skb->vlan_tci != 0 Add a new union, to access both fields in a single load/store when possible. union { u32 vlan_all; struct { __be16 vlan_proto; __u16 vlan_tci; }; }; This allows following patch to remove a conditional test in GRO stack. Note: We move remcsum_offload to keep TC_AT_INGRESS_MASK and SKB_MONO_DELIVERY_TIME_MASK unchanged. Signed-off-by: Eric Dumazet Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Signed-off-by: Jakub Kicinski --- arch/sparc/net/bpf_jit_comp_32.c | 10 +++++----- .../net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 2 +- include/linux/if_vlan.h | 9 +++------ include/linux/skbuff.h | 18 ++++++++++-------- lib/test_bpf.c | 1 - net/core/filter.c | 22 ++++++++++------------ 6 files changed, 29 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/arch/sparc/net/bpf_jit_comp_32.c b/arch/sparc/net/bpf_jit_comp_32.c index b1dbf2fa8c0a..a74e5004c6c8 100644 --- a/arch/sparc/net/bpf_jit_comp_32.c +++ b/arch/sparc/net/bpf_jit_comp_32.c @@ -555,11 +555,11 @@ void bpf_jit_compile(struct bpf_prog *fp) emit_skb_load16(vlan_tci, r_A); break; case BPF_ANC | SKF_AD_VLAN_TAG_PRESENT: - __emit_skb_load8(__pkt_vlan_present_offset, r_A); - if (PKT_VLAN_PRESENT_BIT) - emit_alu_K(SRL, PKT_VLAN_PRESENT_BIT); - if (PKT_VLAN_PRESENT_BIT < 7) - emit_andi(r_A, 1, r_A); + emit_skb_load32(vlan_all, r_A); + emit_cmpi(r_A, 0); + emit_branch_off(BE, 12); + emit_nop(); + emit_loadimm(1, r_A); break; case BPF_LD | BPF_W | BPF_LEN: emit_skb_load32(len, r_A); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index 303930499a4c..c1ea60bc2630 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -1973,7 +1973,7 @@ static u16 otx2_select_queue(struct net_device *netdev, struct sk_buff *skb, #endif #ifdef CONFIG_DCB - if (!skb->vlan_present) + if (!skb_vlan_tag_present(skb)) goto pick_tx; vlan_prio = skb->vlan_tci >> 13; diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index e00c4ee81ff7..6864b89ef868 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -76,7 +76,7 @@ static inline bool is_vlan_dev(const struct net_device *dev) return dev->priv_flags & IFF_802_1Q_VLAN; } -#define skb_vlan_tag_present(__skb) ((__skb)->vlan_present) +#define skb_vlan_tag_present(__skb) (!!(__skb)->vlan_all) #define skb_vlan_tag_get(__skb) ((__skb)->vlan_tci) #define skb_vlan_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) #define skb_vlan_tag_get_cfi(__skb) (!!((__skb)->vlan_tci & VLAN_CFI_MASK)) @@ -471,7 +471,7 @@ static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb, */ static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb) { - skb->vlan_present = 0; + skb->vlan_all = 0; } /** @@ -483,9 +483,7 @@ static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb) */ static inline void __vlan_hwaccel_copy_tag(struct sk_buff *dst, const struct sk_buff *src) { - dst->vlan_present = src->vlan_present; - dst->vlan_proto = src->vlan_proto; - dst->vlan_tci = src->vlan_tci; + dst->vlan_all = src->vlan_all; } /* @@ -519,7 +517,6 @@ static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb, { skb->vlan_proto = vlan_proto; skb->vlan_tci = vlan_tci; - skb->vlan_present = 1; } /** diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 59c9fd55699d..4e464a27adaf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -818,7 +818,7 @@ typedef unsigned char *sk_buff_data_t; * @mark: Generic packet mark * @reserved_tailroom: (aka @mark) number of bytes of free space available * at the tail of an sk_buff - * @vlan_present: VLAN tag is present + * @vlan_all: vlan fields (proto & tci) * @vlan_proto: vlan encapsulation protocol * @vlan_tci: vlan tag control information * @inner_protocol: Protocol (encapsulation) @@ -951,7 +951,7 @@ struct sk_buff { /* private: */ __u8 __pkt_vlan_present_offset[0]; /* public: */ - __u8 vlan_present:1; /* See PKT_VLAN_PRESENT_BIT */ + __u8 remcsum_offload:1; __u8 csum_complete_sw:1; __u8 csum_level:2; __u8 dst_pending_confirm:1; @@ -966,7 +966,6 @@ struct sk_buff { __u8 ipvs_property:1; __u8 inner_protocol_type:1; - __u8 remcsum_offload:1; #ifdef CONFIG_NET_SWITCHDEV __u8 offload_fwd_mark:1; __u8 offload_l3_fwd_mark:1; @@ -999,8 +998,13 @@ struct sk_buff { __u32 priority; int skb_iif; __u32 hash; - __be16 vlan_proto; - __u16 vlan_tci; + union { + u32 vlan_all; + struct { + __be16 vlan_proto; + __u16 vlan_tci; + }; + }; #if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS) union { unsigned int napi_id; @@ -1059,15 +1063,13 @@ struct sk_buff { #endif #define PKT_TYPE_OFFSET offsetof(struct sk_buff, __pkt_type_offset) -/* if you move pkt_vlan_present, tc_at_ingress, or mono_delivery_time +/* if you move tc_at_ingress or mono_delivery_time * around, you also must adapt these constants. */ #ifdef __BIG_ENDIAN_BITFIELD -#define PKT_VLAN_PRESENT_BIT 7 #define TC_AT_INGRESS_MASK (1 << 0) #define SKB_MONO_DELIVERY_TIME_MASK (1 << 2) #else -#define PKT_VLAN_PRESENT_BIT 0 #define TC_AT_INGRESS_MASK (1 << 7) #define SKB_MONO_DELIVERY_TIME_MASK (1 << 5) #endif diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 5820704165a6..ade9ac672adb 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -14346,7 +14346,6 @@ static struct sk_buff *populate_skb(char *buf, int size) skb->hash = SKB_HASH; skb->queue_mapping = SKB_QUEUE_MAP; skb->vlan_tci = SKB_VLAN_TCI; - skb->vlan_present = SKB_VLAN_PRESENT; skb->vlan_proto = htons(ETH_P_IP); dev_net_set(&dev, &init_net); skb->dev = &dev; diff --git a/net/core/filter.c b/net/core/filter.c index bb0136e7a8e4..358d5e70671a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -325,11 +325,11 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, offsetof(struct sk_buff, vlan_tci)); break; case SKF_AD_VLAN_TAG_PRESENT: - *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET); - if (PKT_VLAN_PRESENT_BIT) - *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, PKT_VLAN_PRESENT_BIT); - if (PKT_VLAN_PRESENT_BIT < 7) - *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1); + BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_all) != 4); + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, vlan_all)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); + *insn++ = BPF_ALU32_IMM(BPF_MOV, dst_reg, 1); break; } @@ -9290,13 +9290,11 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct __sk_buff, vlan_present): - *target_size = 1; - *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, - PKT_VLAN_PRESENT_OFFSET); - if (PKT_VLAN_PRESENT_BIT) - *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, PKT_VLAN_PRESENT_BIT); - if (PKT_VLAN_PRESENT_BIT < 7) - *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + bpf_target_off(struct sk_buff, + vlan_all, 4, target_size)); + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); + *insn++ = BPF_ALU32_IMM(BPF_MOV, si->dst_reg, 1); break; case offsetof(struct __sk_buff, vlan_tci): -- cgit v1.2.3-70-g09d2 From be3ed48683f0d7ed808783fd7d919459b58b5b6b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Nov 2022 09:57:59 +0000 Subject: net: gro: no longer use skb_vlan_tag_present() We can remove a conditional test in gro_list_prepare() by comparing vlan_all fields of the two skbs. Notes: While comparing the vlan_proto is not strictly needed, because part of the following compare_ether_header() call, using 32bit word is actually faster than using 16bit values. napi_reuse_skb() makes sure to clear skb->vlan_all, as it already calls __vlan_hwaccel_clear_tag() Signed-off-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: Jakub Kicinski --- net/core/gro.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/gro.c b/net/core/gro.c index 8e0fe85a647d..fd8c6a7e8d3e 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -370,9 +370,7 @@ static void gro_list_prepare(const struct list_head *head, } diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; - diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); - if (skb_vlan_tag_present(p)) - diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb); + diffs |= p->vlan_all ^ skb->vlan_all; diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), -- cgit v1.2.3-70-g09d2 From 00df24f19179c8e8c81fffa95c91d5c2eab8964e Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 10 Nov 2022 15:23:18 -0800 Subject: mptcp: use msk instead of mptcp_sk Use msk instead of mptcp_sk(sk) in the functions where the variable "msk = mptcp_sk(sk)" has been defined. Reviewed-by: Mat Martineau Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 109eea2c65ff..64d7070de901 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1625,7 +1625,7 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk) * check for a different subflow usage only after * spooling the first chunk of data */ - xmit_ssk = first ? ssk : mptcp_subflow_get_send(mptcp_sk(sk)); + xmit_ssk = first ? ssk : mptcp_subflow_get_send(msk); if (!xmit_ssk) goto out; if (xmit_ssk != ssk) { @@ -2275,7 +2275,7 @@ bool __mptcp_retransmit_pending_data(struct sock *sk) struct mptcp_data_frag *cur, *rtx_head; struct mptcp_sock *msk = mptcp_sk(sk); - if (__mptcp_check_fallback(mptcp_sk(sk))) + if (__mptcp_check_fallback(msk)) return false; if (tcp_rtx_and_write_queues_empty(sk)) @@ -2949,7 +2949,7 @@ cleanup: sock_hold(sk); pr_debug("msk=%p state=%d", sk, sk->sk_state); - if (mptcp_sk(sk)->token) + if (msk->token) mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL); if (sk->sk_state == TCP_CLOSE) { @@ -3008,8 +3008,8 @@ static int mptcp_disconnect(struct sock *sk, int flags) mptcp_stop_timer(sk); sk_stop_timer(sk, &sk->sk_timer); - if (mptcp_sk(sk)->token) - mptcp_event(MPTCP_EVENT_CLOSED, mptcp_sk(sk), NULL, GFP_KERNEL); + if (msk->token) + mptcp_event(MPTCP_EVENT_CLOSED, msk, NULL, GFP_KERNEL); /* msk->subflow is still intact, the following will not free the first * subflow -- cgit v1.2.3-70-g09d2 From 73a0052a61f98354f39f461e03f1a7e513b84578 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 10 Nov 2022 15:23:19 -0800 Subject: mptcp: change 'first' as a parameter The function mptcp_subflow_process_delegated() uses the input ssk first, while __mptcp_check_push() invokes the packet scheduler first. So this patch adds a new parameter named 'first' for the function __mptcp_subflow_push_pending() to deal with these two cases separately. With this change, the code that invokes the packet scheduler in the function __mptcp_check_push() can be removed, and replaced by invoking __mptcp_subflow_push_pending() directly. Reviewed-by: Mat Martineau Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 64d7070de901..5a344788f843 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1602,7 +1602,7 @@ out: __mptcp_check_send_data_fin(sk); } -static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk) +static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool first) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_sendmsg_info info = { @@ -1611,7 +1611,6 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk) struct mptcp_data_frag *dfrag; struct sock *xmit_ssk; int len, copied = 0; - bool first = true; info.flags = 0; while ((dfrag = mptcp_send_head(sk))) { @@ -1621,8 +1620,7 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk) while (len > 0) { int ret = 0; - /* the caller already invoked the packet scheduler, - * check for a different subflow usage only after + /* check for a different subflow usage only after * spooling the first chunk of data */ xmit_ssk = first ? ssk : mptcp_subflow_get_send(msk); @@ -3220,16 +3218,10 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk) if (!mptcp_send_head(sk)) return; - if (!sock_owned_by_user(sk)) { - struct sock *xmit_ssk = mptcp_subflow_get_send(mptcp_sk(sk)); - - if (xmit_ssk == ssk) - __mptcp_subflow_push_pending(sk, ssk); - else if (xmit_ssk) - mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk), MPTCP_DELEGATE_SEND); - } else { + if (!sock_owned_by_user(sk)) + __mptcp_subflow_push_pending(sk, ssk, false); + else __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags); - } } #define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \ @@ -3320,7 +3312,7 @@ void mptcp_subflow_process_delegated(struct sock *ssk) if (test_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status)) { mptcp_data_lock(sk); if (!sock_owned_by_user(sk)) - __mptcp_subflow_push_pending(sk, ssk); + __mptcp_subflow_push_pending(sk, ssk, true); else __set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); -- cgit v1.2.3-70-g09d2 From 80638684e840684042325fa5a3fa1eb59fd123cc Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 10 Nov 2022 15:23:20 -0800 Subject: mptcp: get sk from msk directly Use '(struct sock *)msk' to get 'sk' from 'msk' in a more direct way instead of using '&msk->sk.icsk_inet.sk'. Reviewed-by: Mat Martineau Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/pm_userspace.c | 4 ++-- net/mptcp/protocol.c | 4 ++-- net/mptcp/sockopt.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 9e82250cbb70..5cb65f0928f4 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -291,7 +291,7 @@ int mptcp_nl_cmd_sf_create(struct sk_buff *skb, struct genl_info *info) goto create_err; } - sk = &msk->sk.icsk_inet.sk; + sk = (struct sock *)msk; lock_sock(sk); err = __mptcp_subflow_connect(sk, &addr_l, &addr_r); @@ -403,7 +403,7 @@ int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info) goto destroy_err; } - sk = &msk->sk.icsk_inet.sk; + sk = (struct sock *)msk; lock_sock(sk); ssk = mptcp_nl_find_ssk(msk, &addr_l, &addr_r); if (ssk) { diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5a344788f843..3796d1bfef6b 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2454,7 +2454,7 @@ static bool mptcp_check_close_timeout(const struct sock *sk) static void mptcp_check_fastclose(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow, *tmp; - struct sock *sk = &msk->sk.icsk_inet.sk; + struct sock *sk = (struct sock *)msk; if (likely(!READ_ONCE(msk->rcv_fastclose))) return; @@ -2616,7 +2616,7 @@ static void mptcp_do_fastclose(struct sock *sk) static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); - struct sock *sk = &msk->sk.icsk_inet.sk; + struct sock *sk = (struct sock *)msk; unsigned long fail_tout; int state; diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index f85e9bbfe86f..f62f6483ef77 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -987,7 +987,7 @@ static int mptcp_getsockopt_tcpinfo(struct mptcp_sock *msk, char __user *optval, int __user *optlen) { struct mptcp_subflow_context *subflow; - struct sock *sk = &msk->sk.icsk_inet.sk; + struct sock *sk = (struct sock *)msk; unsigned int sfcount = 0, copied = 0; struct mptcp_subflow_data sfd; char __user *infoptr; @@ -1078,8 +1078,8 @@ static void mptcp_get_sub_addrs(const struct sock *sk, struct mptcp_subflow_addr static int mptcp_getsockopt_subflow_addrs(struct mptcp_sock *msk, char __user *optval, int __user *optlen) { - struct sock *sk = &msk->sk.icsk_inet.sk; struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; unsigned int sfcount = 0, copied = 0; struct mptcp_subflow_data sfd; char __user *addrptr; -- cgit v1.2.3-70-g09d2 From 4373bf4b72f93af9d045450a44d294953c590e2d Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Thu, 10 Nov 2022 15:23:22 -0800 Subject: mptcp: Fix grammar in a comment We kept getting initial patches from new contributors to remove a duplicate 'the' (since grammar checking scripts flag it), but submitters never followed up after code review. Reviewed-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/token.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mptcp/token.c b/net/mptcp/token.c index f52ee7b26aed..65430f314a68 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -287,8 +287,8 @@ EXPORT_SYMBOL_GPL(mptcp_token_get_sock); * This function returns the first mptcp connection structure found inside the * token container starting from the specified position, or NULL. * - * On successful iteration, the iterator is move to the next position and the - * the acquires a reference to the returned socket. + * On successful iteration, the iterator is moved to the next position and + * a reference to the returned socket is acquired. */ struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot, long *s_num) -- cgit v1.2.3-70-g09d2 From 3e35f26d339718e155fa7cf5993a99924124a041 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 10 Nov 2022 10:54:22 +0200 Subject: bridge: Add missing parentheses No changes in generated code. Reported-by: Petr Machata Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Link: https://lore.kernel.org/r/20221110085422.521059-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- net/bridge/br_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index d04d2205ad4e..3027e8f6be15 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -121,7 +121,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb test_bit(BR_FDB_LOCAL, &fdb_src->flags)) { /* FDB mismatch. Drop the packet without roaming. */ goto drop; - } else if test_bit(BR_FDB_LOCKED, &fdb_src->flags) { + } else if (test_bit(BR_FDB_LOCKED, &fdb_src->flags)) { /* FDB match, but entry is locked. Refresh it and drop * the packet. */ -- cgit v1.2.3-70-g09d2 From fac30731b9b8a0b7580bf73ddd25035ef0a733a5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Nov 2022 17:48:29 +0000 Subject: tcp: adopt try_cmpxchg() in tcp_release_cb() try_cmpxchg() is slighly more efficient (at least on x86), and smp_load_acquire(&sk->sk_tsq_flags) could avoid a KCSAN report. Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20221110174829.3403442-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_output.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c69f4d966024..d1cb1ecf8f21 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1077,15 +1077,15 @@ static void tcp_tasklet_func(struct tasklet_struct *t) */ void tcp_release_cb(struct sock *sk) { - unsigned long flags, nflags; + unsigned long flags = smp_load_acquire(&sk->sk_tsq_flags); + unsigned long nflags; /* perform an atomic operation only if at least one flag is set */ do { - flags = sk->sk_tsq_flags; if (!(flags & TCP_DEFERRED_ALL)) return; nflags = flags & ~TCP_DEFERRED_ALL; - } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags); + } while (!try_cmpxchg(&sk->sk_tsq_flags, &flags, nflags)); if (flags & TCPF_TSQ_DEFERRED) { tcp_tsq_write(sk); -- cgit v1.2.3-70-g09d2 From b548b17a93fd18357a5a6f535c10c1e68719ad32 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 10 Nov 2022 19:02:39 +0000 Subject: tcp: tcp_wfree() refactoring Use try_cmpxchg() (instead of cmpxchg()) in a more readable way. oval = smp_load_acquire(&sk->sk_tsq_flags); do { ... } while (!try_cmpxchg(&sk->sk_tsq_flags, &oval, nval)); Reduce indentation level. Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20221110190239.3531280-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_output.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d1cb1ecf8f21..894410dc9293 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1139,6 +1139,8 @@ void tcp_wfree(struct sk_buff *skb) struct sock *sk = skb->sk; struct tcp_sock *tp = tcp_sk(sk); unsigned long flags, nval, oval; + struct tsq_tasklet *tsq; + bool empty; /* Keep one reference on sk_wmem_alloc. * Will be released by sk_free() from here or tcp_tasklet_func() @@ -1155,28 +1157,23 @@ void tcp_wfree(struct sk_buff *skb) if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current) goto out; - for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) { - struct tsq_tasklet *tsq; - bool empty; - + oval = smp_load_acquire(&sk->sk_tsq_flags); + do { if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED)) goto out; nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED; - nval = cmpxchg(&sk->sk_tsq_flags, oval, nval); - if (nval != oval) - continue; + } while (!try_cmpxchg(&sk->sk_tsq_flags, &oval, nval)); - /* queue this socket to tasklet queue */ - local_irq_save(flags); - tsq = this_cpu_ptr(&tsq_tasklet); - empty = list_empty(&tsq->head); - list_add(&tp->tsq_node, &tsq->head); - if (empty) - tasklet_schedule(&tsq->tasklet); - local_irq_restore(flags); - return; - } + /* queue this socket to tasklet queue */ + local_irq_save(flags); + tsq = this_cpu_ptr(&tsq_tasklet); + empty = list_empty(&tsq->head); + list_add(&tp->tsq_node, &tsq->head); + if (empty) + tasklet_schedule(&tsq->tasklet); + local_irq_restore(flags); + return; out: sk_free(sk); } -- cgit v1.2.3-70-g09d2 From 41cf3a9156ba8e13e557e7908f9e22563b1f2828 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 10 Nov 2022 09:39:39 +0000 Subject: rxrpc: Fix missing IPV6 #ifdef Fix rxrpc_encap_err_rcv() to make the call to ipv6_icmp_error conditional on IPV6 support being enabled. Fixes: b6c66c4324e7 ("rxrpc: Use the core ICMP/ICMP6 parsers") Reported-by: kernel test robot Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org --- net/rxrpc/local_object.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index a178f71e5082..a943fdf91e24 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -33,7 +33,8 @@ static void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, int err, { if (ip_hdr(skb)->version == IPVERSION) return ip_icmp_error(sk, skb, err, port, info, payload); - return ipv6_icmp_error(sk, skb, err, port, info, payload); + if (IS_ENABLED(CONFIG_AF_RXRPC_IPV6)) + return ipv6_icmp_error(sk, skb, err, port, info, payload); } /* -- cgit v1.2.3-70-g09d2 From 2fd450cd83e3c978a4902aceb4aac36fa341067c Mon Sep 17 00:00:00 2001 From: xu xin Date: Fri, 11 Nov 2022 09:04:20 +0000 Subject: ipasdv4/tcp_ipv4: remove redundant assignment The value of 'st->state' has been verified as "TCP_SEQ_STATE_LISTENING", it's unnecessary to assign TCP_SEQ_STATE_LISTENING to it, so we can remove it. Signed-off-by: xu xin Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ebab9e8b184c..f0343538d1f8 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2480,7 +2480,6 @@ static void *tcp_seek_last_pos(struct seq_file *seq) case TCP_SEQ_STATE_LISTENING: if (st->bucket > hinfo->lhash2_mask) break; - st->state = TCP_SEQ_STATE_LISTENING; rc = listening_get_first(seq); while (offset-- && rc && bucket == st->bucket) rc = listening_get_next(seq, rc); -- cgit v1.2.3-70-g09d2 From 70ea86a0dfed10e00ee2666dadeb563bab00efea Mon Sep 17 00:00:00 2001 From: Steen Hegelund Date: Fri, 11 Nov 2022 14:05:14 +0100 Subject: net: flow_offload: add support for ARP frame matching This adds a new flow_rule_match_arp function that allows drivers to be able to dissect ARP frames. Signed-off-by: Steen Hegelund Signed-off-by: David S. Miller --- include/net/flow_offload.h | 6 ++++++ net/core/flow_offload.c | 7 +++++++ 2 files changed, 13 insertions(+) (limited to 'net') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 7a60bc6d72c9..0400a0ac8a29 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -32,6 +32,10 @@ struct flow_match_vlan { struct flow_dissector_key_vlan *key, *mask; }; +struct flow_match_arp { + struct flow_dissector_key_arp *key, *mask; +}; + struct flow_match_ipv4_addrs { struct flow_dissector_key_ipv4_addrs *key, *mask; }; @@ -98,6 +102,8 @@ void flow_rule_match_vlan(const struct flow_rule *rule, struct flow_match_vlan *out); void flow_rule_match_cvlan(const struct flow_rule *rule, struct flow_match_vlan *out); +void flow_rule_match_arp(const struct flow_rule *rule, + struct flow_match_arp *out); void flow_rule_match_ipv4_addrs(const struct flow_rule *rule, struct flow_match_ipv4_addrs *out); void flow_rule_match_ipv6_addrs(const struct flow_rule *rule, diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index abe423fd5736..acfc1f88ea79 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -97,6 +97,13 @@ void flow_rule_match_cvlan(const struct flow_rule *rule, } EXPORT_SYMBOL(flow_rule_match_cvlan); +void flow_rule_match_arp(const struct flow_rule *rule, + struct flow_match_arp *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ARP, out); +} +EXPORT_SYMBOL(flow_rule_match_arp); + void flow_rule_match_ipv4_addrs(const struct flow_rule *rule, struct flow_match_ipv4_addrs *out) { -- cgit v1.2.3-70-g09d2 From d9282e48c6088105a98b98153a707fdbcdbf75b1 Mon Sep 17 00:00:00 2001 From: Jamie Bainbridge Date: Mon, 14 Nov 2022 12:00:08 +1100 Subject: tcp: Add listening address to SYN flood message The SYN flood message prints the listening port number, but with many processes bound to the same port on different IPs, it's impossible to tell which socket is the problem. Add the listen IP address to the SYN flood message. For IPv6 use "[IP]:port" as per RFC-5952 and to provide ease of copy-paste to "ss" filters. For IPv4 use "IP:port" to match. Each protcol's "any" address and a host address now look like: Possible SYN flooding on port 0.0.0.0:9001. Possible SYN flooding on port 127.0.0.1:9001. Possible SYN flooding on port [::]:9001. Possible SYN flooding on port [fc00::1]:9001. Signed-off-by: Jamie Bainbridge Reviewed-by: Eric Dumazet Reviewed-by: Stephen Hemminger Link: https://lore.kernel.org/r/4fedab7ce54a389aeadbdc639f6b4f4988e9d2d7.1668386107.git.jamie.bainbridge@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d764b5854dfc..94024fdc2da1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6842,9 +6842,17 @@ static bool tcp_syn_flood_action(const struct sock *sk, const char *proto) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); if (!queue->synflood_warned && syncookies != 2 && - xchg(&queue->synflood_warned, 1) == 0) - net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", - proto, sk->sk_num, msg); + xchg(&queue->synflood_warned, 1) == 0) { + if (IS_ENABLED(CONFIG_IPV6) && sk->sk_family == AF_INET6) { + net_info_ratelimited("%s: Possible SYN flooding on port [%pI6c]:%u. %s.\n", + proto, &sk->sk_v6_rcv_saddr, + sk->sk_num, msg); + } else { + net_info_ratelimited("%s: Possible SYN flooding on port %pI4:%u. %s.\n", + proto, &sk->sk_rcv_saddr, + sk->sk_num, msg); + } + } return want_cookie; } -- cgit v1.2.3-70-g09d2 From 6728aea7216c0c06c98e2e58d753a5e8b2ae1c6f Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:28 +0530 Subject: bpf: Refactor btf_struct_access Instead of having to pass multiple arguments that describe the register, pass the bpf_reg_state into the btf_struct_access callback. Currently, all call sites simply reuse the btf and btf_id of the reg they want to check the access of. The only exception to this pattern is the callsite in check_ptr_to_map_access, hence for that case create a dummy reg to simulate PTR_TO_BTF_ID access. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-8-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 17 ++++++++--------- include/linux/filter.h | 8 ++++---- kernel/bpf/btf.c | 11 +++++++---- kernel/bpf/verifier.c | 12 +++++++----- net/bpf/bpf_dummy_struct_ops.c | 14 +++++++------- net/core/filter.c | 34 ++++++++++++++-------------------- net/ipv4/bpf_tcp_ca.c | 13 ++++++------- net/netfilter/nf_conntrack_bpf.c | 17 +++++++---------- 8 files changed, 60 insertions(+), 66 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index afc1c51b59ff..49f9d2bec401 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -771,6 +771,7 @@ struct bpf_prog_ops { union bpf_attr __user *uattr); }; +struct bpf_reg_state; struct bpf_verifier_ops { /* return eBPF function prototype for verification */ const struct bpf_func_proto * @@ -792,9 +793,8 @@ struct bpf_verifier_ops { struct bpf_insn *dst, struct bpf_prog *prog, u32 *target_size); int (*btf_struct_access)(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag); }; @@ -2080,9 +2080,9 @@ static inline bool bpf_tracing_btf_ctx_access(int off, int size, return btf_ctx_access(off, size, type, prog, info); } -int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, +int btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag); bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, @@ -2333,9 +2333,8 @@ static inline struct bpf_prog *bpf_prog_by_id(u32 id) } static inline int btf_struct_access(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag) { return -EACCES; diff --git a/include/linux/filter.h b/include/linux/filter.h index efc42a6e3aed..787d35dbf5b0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -568,10 +568,10 @@ struct sk_filter { DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); extern struct mutex nf_conn_btf_access_lock; -extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, u32 *next_btf_id, - enum bpf_type_flag *flag); +extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, + u32 *next_btf_id, enum bpf_type_flag *flag); typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx, const struct bpf_insn *insnsi, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index c0d73d71c539..875355ff3718 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6017,15 +6017,18 @@ error: return -EINVAL; } -int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype __maybe_unused, +int btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype __maybe_unused, u32 *next_btf_id, enum bpf_type_flag *flag) { + const struct btf *btf = reg->btf; enum bpf_type_flag tmp_flag = 0; + const struct btf_type *t; + u32 id = reg->btf_id; int err; - u32 id; + t = btf_type_by_id(btf, id); do { err = btf_struct_walk(log, btf, t, off, size, &id, &tmp_flag); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c588e5483540..5e74f460dfd0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4688,16 +4688,14 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } if (env->ops->btf_struct_access) { - ret = env->ops->btf_struct_access(&env->log, reg->btf, t, - off, size, atype, &btf_id, &flag); + ret = env->ops->btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag); } else { if (atype != BPF_READ) { verbose(env, "only read is supported\n"); return -EACCES; } - ret = btf_struct_access(&env->log, reg->btf, t, off, size, - atype, &btf_id, &flag); + ret = btf_struct_access(&env->log, reg, off, size, atype, &btf_id, &flag); } if (ret < 0) @@ -4723,6 +4721,7 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, { struct bpf_reg_state *reg = regs + regno; struct bpf_map *map = reg->map_ptr; + struct bpf_reg_state map_reg; enum bpf_type_flag flag = 0; const struct btf_type *t; const char *tname; @@ -4761,7 +4760,10 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, return -EACCES; } - ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id, &flag); + /* Simulate access to a PTR_TO_BTF_ID */ + memset(&map_reg, 0, sizeof(map_reg)); + mark_btf_ld_reg(env, &map_reg, 0, PTR_TO_BTF_ID, btf_vmlinux, *map->ops->map_btf_id, 0); + ret = btf_struct_access(&env->log, &map_reg, off, size, atype, &btf_id, &flag); if (ret < 0) return ret; diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index e78dadfc5829..2d434c1f4617 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -156,29 +156,29 @@ static bool bpf_dummy_ops_is_valid_access(int off, int size, } static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int off, - int size, enum bpf_access_type atype, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag) { const struct btf_type *state; + const struct btf_type *t; s32 type_id; int err; - type_id = btf_find_by_name_kind(btf, "bpf_dummy_ops_state", + type_id = btf_find_by_name_kind(reg->btf, "bpf_dummy_ops_state", BTF_KIND_STRUCT); if (type_id < 0) return -EINVAL; - state = btf_type_by_id(btf, type_id); + t = btf_type_by_id(reg->btf, reg->btf_id); + state = btf_type_by_id(reg->btf, type_id); if (t != state) { bpf_log(log, "only access to bpf_dummy_ops_state is supported\n"); return -EACCES; } - err = btf_struct_access(log, btf, t, off, size, atype, next_btf_id, - flag); + err = btf_struct_access(log, reg, off, size, atype, next_btf_id, flag); if (err < 0) return err; diff --git a/net/core/filter.c b/net/core/filter.c index 6dd2baf5eeb2..37fad5a9b752 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8651,28 +8651,25 @@ static bool tc_cls_act_is_valid_access(int off, int size, DEFINE_MUTEX(nf_conn_btf_access_lock); EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock); -int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, u32 *next_btf_id, - enum bpf_type_flag *flag); +int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, + u32 *next_btf_id, enum bpf_type_flag *flag); EXPORT_SYMBOL_GPL(nfct_btf_struct_access); static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int off, - int size, enum bpf_access_type atype, - u32 *next_btf_id, - enum bpf_type_flag *flag) + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, + u32 *next_btf_id, enum bpf_type_flag *flag) { int ret = -EACCES; if (atype == BPF_READ) - return btf_struct_access(log, btf, t, off, size, atype, next_btf_id, - flag); + return btf_struct_access(log, reg, off, size, atype, next_btf_id, flag); mutex_lock(&nf_conn_btf_access_lock); if (nfct_btf_struct_access) - ret = nfct_btf_struct_access(log, btf, t, off, size, atype, next_btf_id, flag); + ret = nfct_btf_struct_access(log, reg, off, size, atype, next_btf_id, flag); mutex_unlock(&nf_conn_btf_access_lock); return ret; @@ -8738,21 +8735,18 @@ void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); static int xdp_btf_struct_access(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int off, - int size, enum bpf_access_type atype, - u32 *next_btf_id, - enum bpf_type_flag *flag) + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, + u32 *next_btf_id, enum bpf_type_flag *flag) { int ret = -EACCES; if (atype == BPF_READ) - return btf_struct_access(log, btf, t, off, size, atype, next_btf_id, - flag); + return btf_struct_access(log, reg, off, size, atype, next_btf_id, flag); mutex_lock(&nf_conn_btf_access_lock); if (nfct_btf_struct_access) - ret = nfct_btf_struct_access(log, btf, t, off, size, atype, next_btf_id, flag); + ret = nfct_btf_struct_access(log, reg, off, size, atype, next_btf_id, flag); mutex_unlock(&nf_conn_btf_access_lock); return ret; diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 6da16ae6a962..d15c91de995f 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -69,18 +69,17 @@ static bool bpf_tcp_ca_is_valid_access(int off, int size, } static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int off, - int size, enum bpf_access_type atype, - u32 *next_btf_id, - enum bpf_type_flag *flag) + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, + u32 *next_btf_id, enum bpf_type_flag *flag) { + const struct btf_type *t; size_t end; if (atype == BPF_READ) - return btf_struct_access(log, btf, t, off, size, atype, next_btf_id, - flag); + return btf_struct_access(log, reg, off, size, atype, next_btf_id, flag); + t = btf_type_by_id(reg->btf, reg->btf_id); if (t != tcp_sock_type) { bpf_log(log, "only read is supported\n"); return -EACCES; diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 8639e7efd0e2..24002bc61e07 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -191,19 +191,16 @@ BTF_ID(struct, nf_conn___init) /* Check writes into `struct nf_conn` */ static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int off, - int size, enum bpf_access_type atype, - u32 *next_btf_id, - enum bpf_type_flag *flag) + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, + u32 *next_btf_id, enum bpf_type_flag *flag) { - const struct btf_type *ncit; - const struct btf_type *nct; + const struct btf_type *ncit, *nct, *t; size_t end; - ncit = btf_type_by_id(btf, btf_nf_conn_ids[1]); - nct = btf_type_by_id(btf, btf_nf_conn_ids[0]); - + ncit = btf_type_by_id(reg->btf, btf_nf_conn_ids[1]); + nct = btf_type_by_id(reg->btf, btf_nf_conn_ids[0]); + t = btf_type_by_id(reg->btf, reg->btf_id); if (t != nct && t != ncit) { bpf_log(log, "only read is supported\n"); return -EACCES; -- cgit v1.2.3-70-g09d2 From 7d34aa3e03b6a56306296bd98b26c6a1710cd57b Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 14 Oct 2022 23:45:58 +0200 Subject: netfilter: nf_tables: Extend nft_expr_ops::dump callback parameters Add a 'reset' flag just like with nft_object_ops::dump. This will be useful to reset "anonymous stateful objects", e.g. simple rule counters. No functional change intended. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 ++- include/net/netfilter/nft_fib.h | 2 +- include/net/netfilter/nft_meta.h | 4 ++-- include/net/netfilter/nft_reject.h | 3 ++- net/ipv4/netfilter/nft_dup_ipv4.c | 3 ++- net/ipv6/netfilter/nft_dup_ipv6.c | 3 ++- net/netfilter/nf_tables_api.c | 2 +- net/netfilter/nft_bitwise.c | 6 ++++-- net/netfilter/nft_byteorder.c | 3 ++- net/netfilter/nft_cmp.c | 9 ++++++--- net/netfilter/nft_compat.c | 9 ++++++--- net/netfilter/nft_connlimit.c | 3 ++- net/netfilter/nft_counter.c | 3 ++- net/netfilter/nft_ct.c | 6 ++++-- net/netfilter/nft_dup_netdev.c | 3 ++- net/netfilter/nft_dynset.c | 3 ++- net/netfilter/nft_exthdr.c | 9 ++++++--- net/netfilter/nft_fib.c | 2 +- net/netfilter/nft_flow_offload.c | 3 ++- net/netfilter/nft_fwd_netdev.c | 6 ++++-- net/netfilter/nft_hash.c | 4 ++-- net/netfilter/nft_immediate.c | 3 ++- net/netfilter/nft_inner.c | 3 ++- net/netfilter/nft_last.c | 3 ++- net/netfilter/nft_limit.c | 5 +++-- net/netfilter/nft_log.c | 3 ++- net/netfilter/nft_lookup.c | 3 ++- net/netfilter/nft_masq.c | 3 ++- net/netfilter/nft_meta.c | 5 +++-- net/netfilter/nft_nat.c | 3 ++- net/netfilter/nft_numgen.c | 6 ++++-- net/netfilter/nft_objref.c | 6 ++++-- net/netfilter/nft_osf.c | 3 ++- net/netfilter/nft_payload.c | 6 ++++-- net/netfilter/nft_queue.c | 6 ++++-- net/netfilter/nft_quota.c | 3 ++- net/netfilter/nft_range.c | 3 ++- net/netfilter/nft_redir.c | 3 ++- net/netfilter/nft_reject.c | 3 ++- net/netfilter/nft_rt.c | 2 +- net/netfilter/nft_socket.c | 2 +- net/netfilter/nft_synproxy.c | 3 ++- net/netfilter/nft_tproxy.c | 2 +- net/netfilter/nft_tunnel.c | 2 +- net/netfilter/nft_xfrm.c | 2 +- 45 files changed, 110 insertions(+), 62 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 38e2b396e38a..c557a57fb0f1 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -927,7 +927,8 @@ struct nft_expr_ops { void (*destroy_clone)(const struct nft_ctx *ctx, const struct nft_expr *expr); int (*dump)(struct sk_buff *skb, - const struct nft_expr *expr); + const struct nft_expr *expr, + bool reset); int (*validate)(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data); diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h index eed099eae672..167640b843ef 100644 --- a/include/net/netfilter/nft_fib.h +++ b/include/net/netfilter/nft_fib.h @@ -18,7 +18,7 @@ nft_fib_is_loopback(const struct sk_buff *skb, const struct net_device *in) return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK; } -int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr); +int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset); int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index f3a5285a511c..ba1238f12a48 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -24,10 +24,10 @@ int nft_meta_set_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]); int nft_meta_get_dump(struct sk_buff *skb, - const struct nft_expr *expr); + const struct nft_expr *expr, bool reset); int nft_meta_set_dump(struct sk_buff *skb, - const struct nft_expr *expr); + const struct nft_expr *expr, bool reset); void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, diff --git a/include/net/netfilter/nft_reject.h b/include/net/netfilter/nft_reject.h index 56b123a42220..6d9ba62efd75 100644 --- a/include/net/netfilter/nft_reject.h +++ b/include/net/netfilter/nft_reject.h @@ -22,7 +22,8 @@ int nft_reject_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); -int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr); +int nft_reject_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset); int nft_reject_icmp_code(u8 code); int nft_reject_icmpv6_code(u8 code); diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c index 0bcd6aee6000..a522c3a3be52 100644 --- a/net/ipv4/netfilter/nft_dup_ipv4.c +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -52,7 +52,8 @@ static int nft_dup_ipv4_init(const struct nft_ctx *ctx, return err; } -static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_dup_ipv4_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_dup_ipv4 *priv = nft_expr_priv(expr); diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c index 70a405b4006f..c82f3fdd4a65 100644 --- a/net/ipv6/netfilter/nft_dup_ipv6.c +++ b/net/ipv6/netfilter/nft_dup_ipv6.c @@ -50,7 +50,8 @@ static int nft_dup_ipv6_init(const struct nft_ctx *ctx, return err; } -static int nft_dup_ipv6_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_dup_ipv6_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_dup_ipv6 *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 62da204eed41..741a0e386406 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2769,7 +2769,7 @@ static int nf_tables_fill_expr_info(struct sk_buff *skb, NFTA_EXPR_DATA); if (data == NULL) goto nla_put_failure; - if (expr->ops->dump(skb, expr) < 0) + if (expr->ops->dump(skb, expr, false) < 0) goto nla_put_failure; nla_nest_end(skb, data); } diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index e6e402b247d0..84eae7cabc67 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -232,7 +232,8 @@ static int nft_bitwise_dump_shift(struct sk_buff *skb, return 0; } -static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_bitwise_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_bitwise *priv = nft_expr_priv(expr); int err = 0; @@ -393,7 +394,8 @@ static int nft_bitwise_fast_init(const struct nft_ctx *ctx, } static int -nft_bitwise_fast_dump(struct sk_buff *skb, const struct nft_expr *expr) +nft_bitwise_fast_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr); struct nft_data data; diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index f952a80275a8..b66647a5a171 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -148,7 +148,8 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, priv->len); } -static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_byteorder_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_byteorder *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_cmp.c b/net/netfilter/nft_cmp.c index 963cf831799c..6eb21a4f5698 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -92,7 +92,8 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return 0; } -static int nft_cmp_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_cmp_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_cmp_expr *priv = nft_expr_priv(expr); @@ -253,7 +254,8 @@ static int nft_cmp_fast_offload(struct nft_offload_ctx *ctx, return __nft_cmp_offload(ctx, flow, &cmp); } -static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_cmp_fast_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); enum nft_cmp_ops op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ; @@ -347,7 +349,8 @@ static int nft_cmp16_fast_offload(struct nft_offload_ctx *ctx, return __nft_cmp_offload(ctx, flow, &cmp); } -static int nft_cmp16_fast_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_cmp16_fast_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr); enum nft_cmp_ops op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ; diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index c16172427622..5284cd2ad532 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -324,7 +324,8 @@ static int nft_extension_dump_info(struct sk_buff *skb, int attr, return 0; } -static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_target_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct xt_target *target = expr->ops->data; void *info = nft_expr_priv(expr); @@ -572,12 +573,14 @@ nla_put_failure: return -1; } -static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_match_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { return __nft_match_dump(skb, expr, nft_expr_priv(expr)); } -static int nft_match_large_dump(struct sk_buff *skb, const struct nft_expr *e) +static int nft_match_large_dump(struct sk_buff *skb, + const struct nft_expr *e, bool reset) { struct nft_xt_match_priv *priv = nft_expr_priv(e); diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index d657f999a11b..de9d1980df69 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -185,7 +185,8 @@ static void nft_connlimit_eval(const struct nft_expr *expr, nft_connlimit_do_eval(priv, regs, pkt, NULL); } -static int nft_connlimit_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_connlimit_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_connlimit *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index f4d3573e8782..06482fb9c145 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -201,7 +201,8 @@ void nft_counter_eval(const struct nft_expr *expr, struct nft_regs *regs, nft_counter_do_eval(priv, regs, pkt); } -static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_counter_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index a3f01f209a53..a0696d7ea10c 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -641,7 +641,8 @@ static void nft_ct_set_destroy(const struct nft_ctx *ctx, nf_ct_netns_put(ctx->net, ctx->family); } -static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_ct_get_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_ct *priv = nft_expr_priv(expr); @@ -703,7 +704,8 @@ static bool nft_ct_get_reduce(struct nft_regs_track *track, return nft_expr_reduce_bitwise(track, expr); } -static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_ct_set_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_ct *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c index 63507402716d..e5739a59ebf1 100644 --- a/net/netfilter/nft_dup_netdev.c +++ b/net/netfilter/nft_dup_netdev.c @@ -44,7 +44,8 @@ static int nft_dup_netdev_init(const struct nft_ctx *ctx, sizeof(int)); } -static int nft_dup_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_dup_netdev_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_dup_netdev *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 6983e6ddeef9..01c61e090639 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -357,7 +357,8 @@ static void nft_dynset_destroy(const struct nft_ctx *ctx, nf_tables_destroy_set(ctx, priv->set); } -static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_dynset_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_dynset *priv = nft_expr_priv(expr); u32 flags = priv->invert ? NFT_DYNSET_F_INV : 0; diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index a67ea9c3ae57..ed929d0d37ce 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -576,7 +576,8 @@ nla_put_failure: return -1; } -static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_exthdr_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_exthdr *priv = nft_expr_priv(expr); @@ -586,7 +587,8 @@ static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr) return nft_exthdr_dump_common(skb, priv); } -static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_exthdr_dump_set(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_exthdr *priv = nft_expr_priv(expr); @@ -596,7 +598,8 @@ static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr) return nft_exthdr_dump_common(skb, priv); } -static int nft_exthdr_dump_strip(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_exthdr_dump_strip(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_exthdr *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index 1f12d7ade606..6e049fd48760 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -118,7 +118,7 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, } EXPORT_SYMBOL_GPL(nft_fib_init); -int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr) +int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_fib *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index a25c88bc8b75..e860d8fe0e5e 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -433,7 +433,8 @@ static void nft_flow_offload_destroy(const struct nft_ctx *ctx, nf_ct_netns_put(ctx->net, ctx->family); } -static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_flow_offload_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_flow_offload *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 7c5876dc9ff2..7b9d4d1bd17c 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -56,7 +56,8 @@ static int nft_fwd_netdev_init(const struct nft_ctx *ctx, sizeof(int)); } -static int nft_fwd_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_fwd_netdev_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_fwd_netdev *priv = nft_expr_priv(expr); @@ -186,7 +187,8 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx, addr_len); } -static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_fwd_neigh_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_fwd_neigh *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index e5631e88b285..ee8d487b69c0 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -139,7 +139,7 @@ static int nft_symhash_init(const struct nft_ctx *ctx, } static int nft_jhash_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_jhash *priv = nft_expr_priv(expr); @@ -176,7 +176,7 @@ static bool nft_jhash_reduce(struct nft_regs_track *track, } static int nft_symhash_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_symhash *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 5f28b21abc7d..c9d2f7c29f53 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -147,7 +147,8 @@ static void nft_immediate_destroy(const struct nft_ctx *ctx, } } -static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_immediate_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c index 809f0d0787ec..6d96b826db4e 100644 --- a/net/netfilter/nft_inner.c +++ b/net/netfilter/nft_inner.c @@ -347,7 +347,8 @@ static int nft_inner_init(const struct nft_ctx *ctx, return 0; } -static int nft_inner_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_inner_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_inner *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c index bb15a55dad5c..7f2bda6641bd 100644 --- a/net/netfilter/nft_last.c +++ b/net/netfilter/nft_last.c @@ -65,7 +65,8 @@ static void nft_last_eval(const struct nft_expr *expr, WRITE_ONCE(last->set, 1); } -static int nft_last_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_last_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_last_priv *priv = nft_expr_priv(expr); struct nft_last *last = priv->last; diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index 981addb2d051..145dc62c6247 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -193,7 +193,8 @@ static int nft_limit_pkts_init(const struct nft_ctx *ctx, return 0; } -static int nft_limit_pkts_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_limit_pkts_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_limit_priv_pkts *priv = nft_expr_priv(expr); @@ -251,7 +252,7 @@ static int nft_limit_bytes_init(const struct nft_ctx *ctx, } static int nft_limit_bytes_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_limit_priv *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 0e13c003f0c1..5defe6e4fd98 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -241,7 +241,8 @@ static void nft_log_destroy(const struct nft_ctx *ctx, nf_logger_put(ctx->family, li->type); } -static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_log_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_log *priv = nft_expr_priv(expr); const struct nf_loginfo *li = &priv->loginfo; diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index dfae12759c7c..cae5a6724163 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -178,7 +178,8 @@ static void nft_lookup_destroy(const struct nft_ctx *ctx, nf_tables_destroy_set(ctx, priv->set); } -static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_lookup_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_lookup *priv = nft_expr_priv(expr); u32 flags = priv->invert ? NFT_LOOKUP_F_INV : 0; diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index 2a0adc497bbb..e55e455275c4 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -73,7 +73,8 @@ static int nft_masq_init(const struct nft_ctx *ctx, return nf_ct_netns_get(ctx->net, ctx->family); } -static int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_masq_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_masq *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 8c39adeebb5c..e384e0de7a54 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -669,7 +669,7 @@ int nft_meta_set_init(const struct nft_ctx *ctx, EXPORT_SYMBOL_GPL(nft_meta_set_init); int nft_meta_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -684,7 +684,8 @@ nla_put_failure: } EXPORT_SYMBOL_GPL(nft_meta_get_dump); -int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr) +int nft_meta_set_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_meta *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index e5fd6995e4bf..047999150390 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -255,7 +255,8 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return nf_ct_netns_get(ctx->net, family); } -static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_nat_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_nat *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index 45d3dc9e96f2..7d29db7c2ac0 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -112,7 +112,8 @@ nla_put_failure: return -1; } -static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_ng_inc_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_ng_inc *priv = nft_expr_priv(expr); @@ -168,7 +169,8 @@ static int nft_ng_random_init(const struct nft_ctx *ctx, NULL, NFT_DATA_VALUE, sizeof(u32)); } -static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_ng_random_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_ng_random *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 74e0eea4abac..7b01aa2ef653 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -47,7 +47,8 @@ static int nft_objref_init(const struct nft_ctx *ctx, return 0; } -static int nft_objref_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_objref_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_object *obj = nft_objref_priv(expr); @@ -155,7 +156,8 @@ static int nft_objref_map_init(const struct nft_ctx *ctx, return 0; } -static int nft_objref_map_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_objref_map_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_objref_map *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index adacf95b6e2b..70820c66b591 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -92,7 +92,8 @@ static int nft_osf_init(const struct nft_ctx *ctx, return 0; } -static int nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_osf_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_osf *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 53e64d8aa01f..336ac668cae3 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -231,7 +231,8 @@ static int nft_payload_init(const struct nft_ctx *ctx, priv->len); } -static int nft_payload_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_payload_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_payload *priv = nft_expr_priv(expr); @@ -919,7 +920,8 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, priv->len); } -static int nft_payload_set_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_payload_set_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_payload_set *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c index da29e92c03e2..b2b8127c8d43 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -152,7 +152,8 @@ static int nft_queue_sreg_init(const struct nft_ctx *ctx, return 0; } -static int nft_queue_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_queue_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_queue *priv = nft_expr_priv(expr); @@ -168,7 +169,8 @@ nla_put_failure: } static int -nft_queue_sreg_dump(struct sk_buff *skb, const struct nft_expr *expr) +nft_queue_sreg_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_queue *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index e6b0df68feea..b1a1217bca4c 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -217,7 +217,8 @@ static int nft_quota_init(const struct nft_ctx *ctx, return nft_quota_do_init(tb, priv); } -static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_quota_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_quota *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c index 832f0d725a9e..0566d6aaf1e5 100644 --- a/net/netfilter/nft_range.c +++ b/net/netfilter/nft_range.c @@ -111,7 +111,8 @@ err1: return err; } -static int nft_range_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_range_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_range_expr *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index 5086adfe731c..5f7739987559 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -75,7 +75,8 @@ static int nft_redir_init(const struct nft_ctx *ctx, return nf_ct_netns_get(ctx->net, ctx->family); } -static int nft_redir_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_redir_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_redir *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index 927ff8459bd9..f2addc844dd2 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -69,7 +69,8 @@ int nft_reject_init(const struct nft_ctx *ctx, } EXPORT_SYMBOL_GPL(nft_reject_init); -int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) +int nft_reject_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_reject *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index 71931ec91721..5990fdd7b3cc 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -146,7 +146,7 @@ static int nft_rt_get_init(const struct nft_ctx *ctx, } static int nft_rt_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_rt *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 49a5348a6a14..85f8df87efda 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -199,7 +199,7 @@ static int nft_socket_init(const struct nft_ctx *ctx, } static int nft_socket_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_socket *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c index 6cf9a04fbfe2..13da882669a4 100644 --- a/net/netfilter/nft_synproxy.c +++ b/net/netfilter/nft_synproxy.c @@ -272,7 +272,8 @@ static void nft_synproxy_destroy(const struct nft_ctx *ctx, nft_synproxy_do_destroy(ctx); } -static int nft_synproxy_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_synproxy_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_synproxy *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index 62da25ad264b..ea83f661417e 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -294,7 +294,7 @@ static void nft_tproxy_destroy(const struct nft_ctx *ctx, } static int nft_tproxy_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_tproxy *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 983ade4be3b3..b059aa541798 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -108,7 +108,7 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx, } static int nft_tunnel_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_tunnel *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c index 1c5343c936a8..c88fd078a9ae 100644 --- a/net/netfilter/nft_xfrm.c +++ b/net/netfilter/nft_xfrm.c @@ -212,7 +212,7 @@ static void nft_xfrm_get_eval(const struct nft_expr *expr, } static int nft_xfrm_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_xfrm *priv = nft_expr_priv(expr); -- cgit v1.2.3-70-g09d2 From 8daa8fde3fc3f069ff0b5c87079a5c1df7743113 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 14 Oct 2022 23:45:59 +0200 Subject: netfilter: nf_tables: Introduce NFT_MSG_GETRULE_RESET Analogous to NFT_MSG_GETOBJ_RESET, but for rules: Reset stateful expressions like counters or quotas. The latter two are the only consumers, adjust their 'dump' callbacks to respect the parameter introduced earlier. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 49 +++++++++++++++++++++----------- net/netfilter/nft_counter.c | 2 +- net/netfilter/nft_dynset.c | 4 +-- net/netfilter/nft_inner.c | 2 +- net/netfilter/nft_quota.c | 2 +- 7 files changed, 41 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index c557a57fb0f1..e69ce23566ea 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -383,7 +383,7 @@ int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src); void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, - const struct nft_expr *expr); + const struct nft_expr *expr, bool reset); bool nft_expr_reduce_bitwise(struct nft_regs_track *track, const struct nft_expr *expr); diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index e4b739d57480..cfa844da1ce6 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -97,6 +97,7 @@ enum nft_verdicts { * @NFT_MSG_NEWFLOWTABLE: add new flow table (enum nft_flowtable_attributes) * @NFT_MSG_GETFLOWTABLE: get flow table (enum nft_flowtable_attributes) * @NFT_MSG_DELFLOWTABLE: delete flow table (enum nft_flowtable_attributes) + * @NFT_MSG_GETRULE_RESET: get rules and reset stateful expressions (enum nft_obj_attributes) */ enum nf_tables_msg_types { NFT_MSG_NEWTABLE, @@ -124,6 +125,7 @@ enum nf_tables_msg_types { NFT_MSG_NEWFLOWTABLE, NFT_MSG_GETFLOWTABLE, NFT_MSG_DELFLOWTABLE, + NFT_MSG_GETRULE_RESET, NFT_MSG_MAX, }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 741a0e386406..80e613405f6f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2759,7 +2759,7 @@ static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = { }; static int nf_tables_fill_expr_info(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->type->name)) goto nla_put_failure; @@ -2769,7 +2769,7 @@ static int nf_tables_fill_expr_info(struct sk_buff *skb, NFTA_EXPR_DATA); if (data == NULL) goto nla_put_failure; - if (expr->ops->dump(skb, expr, false) < 0) + if (expr->ops->dump(skb, expr, reset) < 0) goto nla_put_failure; nla_nest_end(skb, data); } @@ -2781,14 +2781,14 @@ nla_put_failure: }; int nft_expr_dump(struct sk_buff *skb, unsigned int attr, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { struct nlattr *nest; nest = nla_nest_start_noflag(skb, attr); if (!nest) goto nla_put_failure; - if (nf_tables_fill_expr_info(skb, expr) < 0) + if (nf_tables_fill_expr_info(skb, expr, reset) < 0) goto nla_put_failure; nla_nest_end(skb, nest); return 0; @@ -3034,7 +3034,8 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, u32 flags, int family, const struct nft_table *table, const struct nft_chain *chain, - const struct nft_rule *rule, u64 handle) + const struct nft_rule *rule, u64 handle, + bool reset) { struct nlmsghdr *nlh; const struct nft_expr *expr, *next; @@ -3067,7 +3068,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, if (list == NULL) goto nla_put_failure; nft_rule_for_each_expr(expr, next, rule) { - if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr) < 0) + if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0) goto nla_put_failure; } nla_nest_end(skb, list); @@ -3118,7 +3119,7 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx, err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq, event, flags, ctx->family, ctx->table, - ctx->chain, rule, handle); + ctx->chain, rule, handle, false); if (err < 0) { kfree_skb(skb); goto err; @@ -3139,7 +3140,8 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, unsigned int *idx, struct netlink_callback *cb, const struct nft_table *table, - const struct nft_chain *chain) + const struct nft_chain *chain, + bool reset) { struct net *net = sock_net(skb->sk); const struct nft_rule *rule, *prule; @@ -3166,7 +3168,7 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, NFT_MSG_NEWRULE, NLM_F_MULTI | NLM_F_APPEND, table->family, - table, chain, rule, handle) < 0) + table, chain, rule, handle, reset) < 0) return 1; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -3189,6 +3191,10 @@ static int nf_tables_dump_rules(struct sk_buff *skb, struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nftables_pernet *nft_net; + bool reset = false; + + if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET) + reset = true; rcu_read_lock(); nft_net = nft_pernet(net); @@ -3213,14 +3219,15 @@ static int nf_tables_dump_rules(struct sk_buff *skb, if (!nft_is_active(net, chain)) continue; __nf_tables_dump_rules(skb, &idx, - cb, table, chain); + cb, table, chain, reset); break; } goto done; } list_for_each_entry_rcu(chain, &table->chains, list) { - if (__nf_tables_dump_rules(skb, &idx, cb, table, chain)) + if (__nf_tables_dump_rules(skb, &idx, + cb, table, chain, reset)) goto done; } @@ -3291,6 +3298,7 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, struct net *net = info->net; struct nft_table *table; struct sk_buff *skb2; + bool reset = false; int err; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { @@ -3327,9 +3335,12 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, if (!skb2) return -ENOMEM; + if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET) + reset = true; + err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, - family, table, chain, rule, 0); + family, table, chain, rule, 0, reset); if (err < 0) goto err_fill_rule_info; @@ -4104,7 +4115,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, if (set->num_exprs == 1) { nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR); - if (nf_tables_fill_expr_info(skb, set->exprs[0]) < 0) + if (nf_tables_fill_expr_info(skb, set->exprs[0], false) < 0) goto nla_put_failure; nla_nest_end(skb, nest); @@ -4115,7 +4126,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, for (i = 0; i < set->num_exprs; i++) { if (nft_expr_dump(skb, NFTA_LIST_ELEM, - set->exprs[i]) < 0) + set->exprs[i], false) < 0) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -4946,7 +4957,7 @@ static int nft_set_elem_expr_dump(struct sk_buff *skb, if (num_exprs == 1) { expr = nft_setelem_expr_at(elem_expr, 0); - if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr) < 0) + if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, false) < 0) return -1; return 0; @@ -4957,7 +4968,7 @@ static int nft_set_elem_expr_dump(struct sk_buff *skb, nft_setelem_expr_foreach(expr, elem_expr, size) { expr = nft_setelem_expr_at(elem_expr, size); - if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr) < 0) + if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, false) < 0) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -8311,6 +8322,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, + [NFT_MSG_GETRULE_RESET] = { + .call = nf_tables_getrule, + .type = NFNL_CB_RCU, + .attr_count = NFTA_RULE_MAX, + .policy = nft_rule_policy, + }, [NFT_MSG_DELRULE] = { .call = nf_tables_delrule, .type = NFNL_CB_BATCH, diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index 06482fb9c145..dccc68a5135a 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -206,7 +206,7 @@ static int nft_counter_dump(struct sk_buff *skb, { struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); - return nft_counter_do_dump(skb, priv, false); + return nft_counter_do_dump(skb, priv, reset); } static int nft_counter_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 01c61e090639..274579b1696e 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -380,7 +380,7 @@ static int nft_dynset_dump(struct sk_buff *skb, if (priv->set->num_exprs == 0) { if (priv->num_exprs == 1) { if (nft_expr_dump(skb, NFTA_DYNSET_EXPR, - priv->expr_array[0])) + priv->expr_array[0], reset)) goto nla_put_failure; } else if (priv->num_exprs > 1) { struct nlattr *nest; @@ -391,7 +391,7 @@ static int nft_dynset_dump(struct sk_buff *skb, for (i = 0; i < priv->num_exprs; i++) { if (nft_expr_dump(skb, NFTA_LIST_ELEM, - priv->expr_array[i])) + priv->expr_array[i], reset)) goto nla_put_failure; } nla_nest_end(skb, nest); diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c index 6d96b826db4e..28e2873ba24e 100644 --- a/net/netfilter/nft_inner.c +++ b/net/netfilter/nft_inner.c @@ -359,7 +359,7 @@ static int nft_inner_dump(struct sk_buff *skb, goto nla_put_failure; if (nft_expr_dump(skb, NFTA_INNER_EXPR, - (struct nft_expr *)&priv->expr) < 0) + (struct nft_expr *)&priv->expr, reset) < 0) goto nla_put_failure; return 0; diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index b1a1217bca4c..123578e28917 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -222,7 +222,7 @@ static int nft_quota_dump(struct sk_buff *skb, { struct nft_quota *priv = nft_expr_priv(expr); - return nft_quota_do_dump(skb, priv, false); + return nft_quota_do_dump(skb, priv, reset); } static void nft_quota_destroy(const struct nft_ctx *ctx, -- cgit v1.2.3-70-g09d2 From 971095c6fa4aebc0af923c32a28124871fe35136 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Fri, 4 Nov 2022 11:55:04 +0800 Subject: netfilter: rpfilter/fib: clean up some inconsistent indenting No functional modification involved. net/ipv4/netfilter/nft_fib_ipv4.c:141 nft_fib4_eval() warn: inconsistent indenting. Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2733 Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nft_fib_ipv4.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index fc65d69f23e1..9eee535c64dd 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -138,12 +138,11 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, break; } - if (!oif) { - found = FIB_RES_DEV(res); + if (!oif) { + found = FIB_RES_DEV(res); } else { if (!fib_info_nh_uses_dev(res.fi, oif)) return; - found = oif; } -- cgit v1.2.3-70-g09d2 From d2c806abcf0b582131e1f93589d628dac0c07bf4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 2 Nov 2022 13:46:33 +0100 Subject: netfilter: conntrack: use siphash_4u64 This function is used for every packet, siphash_4u64 is noticeably faster than using local buffer + siphash: Before: 1.23% kpktgend_0 [kernel.vmlinux] [k] __siphash_unaligned 0.14% kpktgend_0 [nf_conntrack] [k] hash_conntrack_raw After: 0.79% kpktgend_0 [kernel.vmlinux] [k] siphash_4u64 0.15% kpktgend_0 [nf_conntrack] [k] hash_conntrack_raw In the pktgen test this gives about ~2.4% performance improvement. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f97bda06d2a9..057ebdcc25d7 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -211,28 +211,24 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, unsigned int zoneid, const struct net *net) { - struct { - struct nf_conntrack_man src; - union nf_inet_addr dst_addr; - unsigned int zone; - u32 net_mix; - u16 dport; - u16 proto; - } __aligned(SIPHASH_ALIGNMENT) combined; + u64 a, b, c, d; get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd)); - memset(&combined, 0, sizeof(combined)); + /* The direction must be ignored, handle usable tuplehash members manually */ + a = (u64)tuple->src.u3.all[0] << 32 | tuple->src.u3.all[3]; + b = (u64)tuple->dst.u3.all[0] << 32 | tuple->dst.u3.all[3]; - /* The direction must be ignored, so handle usable members manually. */ - combined.src = tuple->src; - combined.dst_addr = tuple->dst.u3; - combined.zone = zoneid; - combined.net_mix = net_hash_mix(net); - combined.dport = (__force __u16)tuple->dst.u.all; - combined.proto = tuple->dst.protonum; + c = (__force u64)tuple->src.u.all << 32 | (__force u64)tuple->dst.u.all << 16; + c |= tuple->dst.protonum; - return (u32)siphash(&combined, sizeof(combined), &nf_conntrack_hash_rnd); + d = (u64)zoneid << 32 | net_hash_mix(net); + + /* IPv4: u3.all[1,2,3] == 0 */ + c ^= (u64)tuple->src.u3.all[1] << 32 | tuple->src.u3.all[2]; + d += (u64)tuple->dst.u3.all[1] << 32 | tuple->dst.u3.all[2]; + + return (u32)siphash_4u64(a, b, c, d, &nf_conntrack_hash_rnd); } static u32 scale_hash(u32 hash) -- cgit v1.2.3-70-g09d2 From 7eba4505394e21df44dcace6b5d741a8e2deea3a Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Mon, 14 Nov 2022 10:29:50 +0100 Subject: net: dcb: move getapptrust to separate function This patch fixes a frame size warning, reported by kernel test robot. >> net/dcb/dcbnl.c:1230:1: warning: the frame size of 1244 bytes is >> larger than 1024 bytes [-Wframe-larger-than=] The getapptrust part of dcbnl_ieee_fill is moved to a separate function, and the selector array is now dynamically allocated, instead of stack allocated. Tested on microchip sparx5 driver. Fixes: 6182d5875c33 ("net: dcb: add new apptrust attribute") Reported-by: kernel test robot Signed-off-by: Daniel Machon Link: https://lore.kernel.org/r/20221114092950.2490451-1-daniel.machon@microchip.com Signed-off-by: Paolo Abeni --- net/dcb/dcbnl.c | 65 ++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index cec0632f96db..f9949e051f49 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -1060,11 +1060,50 @@ nla_put_failure: return err; } +static int dcbnl_getapptrust(struct net_device *netdev, struct sk_buff *skb) +{ + const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; + enum ieee_attrs_app type; + struct nlattr *apptrust; + int nselectors, err, i; + u8 *selectors; + + selectors = kzalloc(IEEE_8021QAZ_APP_SEL_MAX + 1, GFP_KERNEL); + if (!selectors) + return -ENOMEM; + + err = ops->dcbnl_getapptrust(netdev, selectors, &nselectors); + if (err) { + err = 0; + goto out; + } + + apptrust = nla_nest_start(skb, DCB_ATTR_DCB_APP_TRUST_TABLE); + if (!apptrust) { + err = -EMSGSIZE; + goto out; + } + + for (i = 0; i < nselectors; i++) { + type = dcbnl_app_attr_type_get(selectors[i]); + err = nla_put_u8(skb, type, selectors[i]); + if (err) { + nla_nest_cancel(skb, apptrust); + goto out; + } + } + nla_nest_end(skb, apptrust); + +out: + kfree(selectors); + return err; +} + /* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb GET commands. */ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; - struct nlattr *ieee, *app, *apptrust; + struct nlattr *ieee, *app; struct dcb_app_type *itr; int dcbx; int err; @@ -1168,27 +1207,9 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) nla_nest_end(skb, app); if (ops->dcbnl_getapptrust) { - u8 selectors[IEEE_8021QAZ_APP_SEL_MAX + 1] = {0}; - int nselectors, i; - - apptrust = nla_nest_start(skb, DCB_ATTR_DCB_APP_TRUST_TABLE); - if (!apptrust) - return -EMSGSIZE; - - err = ops->dcbnl_getapptrust(netdev, selectors, &nselectors); - if (!err) { - for (i = 0; i < nselectors; i++) { - enum ieee_attrs_app type = - dcbnl_app_attr_type_get(selectors[i]); - err = nla_put_u8(skb, type, selectors[i]); - if (err) { - nla_nest_cancel(skb, apptrust); - return err; - } - } - } - - nla_nest_end(skb, apptrust); + err = dcbnl_getapptrust(netdev, skb); + if (err) + return err; } /* get peer info if available */ -- cgit v1.2.3-70-g09d2 From 32637e33003f36e75e9147788cc0e2f21706ef99 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 8 Nov 2022 15:06:00 +0100 Subject: bpf: Expand map key argument of bpf_redirect_map to u64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For queueing packets in XDP we want to add a new redirect map type with support for 64-bit indexes. To prepare fore this, expand the width of the 'key' argument to the bpf_redirect_map() helper. Since BPF registers are always 64-bit, this should be safe to do after the fact. Acked-by: Song Liu Reviewed-by: Stanislav Fomichev Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20221108140601.149971-3-toke@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- include/linux/filter.h | 12 ++++++------ include/uapi/linux/bpf.h | 2 +- kernel/bpf/cpumap.c | 4 ++-- kernel/bpf/devmap.c | 4 ++-- kernel/bpf/verifier.c | 2 +- net/core/filter.c | 4 ++-- net/xdp/xskmap.c | 4 ++-- 8 files changed, 17 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 49f9d2bec401..54462dd28824 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -135,7 +135,7 @@ struct bpf_map_ops { struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); /* Misc helpers.*/ - int (*map_redirect)(struct bpf_map *map, u32 ifindex, u64 flags); + int (*map_redirect)(struct bpf_map *map, u64 key, u64 flags); /* map_meta_equal must be implemented for maps that can be * used as an inner map. It is a runtime check to ensure diff --git a/include/linux/filter.h b/include/linux/filter.h index 787d35dbf5b0..bf701976056e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -643,13 +643,13 @@ struct bpf_nh_params { }; struct bpf_redirect_info { - u32 flags; - u32 tgt_index; + u64 tgt_index; void *tgt_value; struct bpf_map *map; + u32 flags; + u32 kern_flags; u32 map_id; enum bpf_map_type map_type; - u32 kern_flags; struct bpf_nh_params nh; }; @@ -1504,7 +1504,7 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, } #endif /* IS_ENABLED(CONFIG_IPV6) */ -static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, +static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index, u64 flags, const u64 flag_mask, void *lookup_elem(struct bpf_map *map, u32 key)) { @@ -1515,7 +1515,7 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind if (unlikely(flags & ~(action_mask | flag_mask))) return XDP_ABORTED; - ri->tgt_value = lookup_elem(map, ifindex); + ri->tgt_value = lookup_elem(map, index); if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) { /* If the lookup fails we want to clear out the state in the * redirect_info struct completely, so that if an eBPF program @@ -1527,7 +1527,7 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind return flags & action_mask; } - ri->tgt_index = ifindex; + ri->tgt_index = index; ri->map_id = map->id; ri->map_type = map->map_type; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6580448e9f77..ab86145df760 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2647,7 +2647,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * long bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) + * long bpf_redirect_map(struct bpf_map *map, u64 key, u64 flags) * Description * Redirect the packet to the endpoint referenced by *map* at * index *key*. Depending on its type, this *map* can contain diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 6b6a78c04b90..e0b2d016f0bf 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -667,9 +667,9 @@ static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } -static int cpu_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +static int cpu_map_redirect(struct bpf_map *map, u64 index, u64 flags) { - return __bpf_xdp_redirect_map(map, ifindex, flags, 0, + return __bpf_xdp_redirect_map(map, index, flags, 0, __cpu_map_lookup_elem); } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index f9a87dcc5535..d01e4c55b376 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -992,14 +992,14 @@ static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, map, key, value, map_flags); } -static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +static int dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, __dev_map_lookup_elem); } -static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +static int dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags) { return __bpf_xdp_redirect_map(map, ifindex, flags, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5e74f460dfd0..be24774961ab 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -14384,7 +14384,7 @@ static int do_misc_fixups(struct bpf_verifier_env *env) BUILD_BUG_ON(!__same_type(ops->map_peek_elem, (int (*)(struct bpf_map *map, void *value))NULL)); BUILD_BUG_ON(!__same_type(ops->map_redirect, - (int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL)); + (int (*)(struct bpf_map *map, u64 index, u64 flags))NULL)); BUILD_BUG_ON(!__same_type(ops->map_for_each_callback, (int (*)(struct bpf_map *map, bpf_callback_t callback_fn, diff --git a/net/core/filter.c b/net/core/filter.c index 37fad5a9b752..754dd01354d8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4414,10 +4414,10 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { .arg2_type = ARG_ANYTHING, }; -BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, +BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u64, key, u64, flags) { - return map->ops->map_redirect(map, ifindex, flags); + return map->ops->map_redirect(map, key, flags); } static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c index acc8e52a4f5f..771d0fa90ef5 100644 --- a/net/xdp/xskmap.c +++ b/net/xdp/xskmap.c @@ -231,9 +231,9 @@ static int xsk_map_delete_elem(struct bpf_map *map, void *key) return 0; } -static int xsk_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags) +static int xsk_map_redirect(struct bpf_map *map, u64 index, u64 flags) { - return __bpf_xdp_redirect_map(map, ifindex, flags, 0, + return __bpf_xdp_redirect_map(map, index, flags, 0, __xsk_map_lookup_elem); } -- cgit v1.2.3-70-g09d2 From 570d0a588dfbb11f9ef80ed933d0403a0e995688 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 14 Nov 2022 13:42:11 +0100 Subject: net: dsa: add support for DSA rx offloading via metadata dst If a metadata dst is present with the type METADATA_HW_PORT_MUX on a dsa cpu port netdev, assume that it carries the port number and that there is no DSA tag present in the skb data. Signed-off-by: Felix Fietkau Reviewed-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- net/core/flow_dissector.c | 4 +++- net/dsa/dsa.c | 19 ++++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 25cd35f5922e..3e81798ed3e0 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -971,12 +971,14 @@ bool __skb_flow_dissect(const struct net *net, #if IS_ENABLED(CONFIG_NET_DSA) if (unlikely(skb->dev && netdev_uses_dsa(skb->dev) && proto == htons(ETH_P_XDSA))) { + struct metadata_dst *md_dst = skb_metadata_dst(skb); const struct dsa_device_ops *ops; int offset = 0; ops = skb->dev->dsa_ptr->tag_ops; /* Only DSA header taggers break flow dissection */ - if (ops->needed_headroom) { + if (ops->needed_headroom && + (!md_dst || md_dst->type != METADATA_HW_PORT_MUX)) { if (ops->flow_dissect) ops->flow_dissect(skb, &proto, &offset); else diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 64b14f655b23..6caf2ec648fd 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "dsa_priv.h" @@ -216,6 +217,7 @@ static bool dsa_skb_defer_rx_timestamp(struct dsa_slave_priv *p, static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *unused) { + struct metadata_dst *md_dst = skb_metadata_dst(skb); struct dsa_port *cpu_dp = dev->dsa_ptr; struct sk_buff *nskb = NULL; struct dsa_slave_priv *p; @@ -229,7 +231,22 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, if (!skb) return 0; - nskb = cpu_dp->rcv(skb, dev); + if (md_dst && md_dst->type == METADATA_HW_PORT_MUX) { + unsigned int port = md_dst->u.port_info.port_id; + + skb_dst_drop(skb); + if (!skb_has_extensions(skb)) + skb->slow_gro = 0; + + skb->dev = dsa_master_find_slave(dev, 0, port); + if (likely(skb->dev)) { + dsa_default_offload_fwd_mark(skb); + nskb = skb; + } + } else { + nskb = cpu_dp->rcv(skb, dev); + } + if (!nskb) { kfree_skb(skb); return 0; -- cgit v1.2.3-70-g09d2 From 53d04b9811107633f25be02a5d981a6070d09e6e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 14 Nov 2022 19:07:30 +0200 Subject: net: dsa: remove phylink_validate() method As of now, no DSA driver uses a custom link mode validation procedure anymore. So remove this DSA operation and let phylink determine what is supported based on config->mac_capabilities (if provided by the driver). Leave a comment why we left the code that we did, and that there is more work to do. Signed-off-by: Vladimir Oltean Reviewed-by: Russell King (Oracle) Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 3 --- net/dsa/port.c | 18 ++++++++---------- 2 files changed, 8 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index ee369670e20e..dde364688739 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -880,9 +880,6 @@ struct dsa_switch_ops { */ void (*phylink_get_caps)(struct dsa_switch *ds, int port, struct phylink_config *config); - void (*phylink_validate)(struct dsa_switch *ds, int port, - unsigned long *supported, - struct phylink_link_state *state); struct phylink_pcs *(*phylink_mac_select_pcs)(struct dsa_switch *ds, int port, phy_interface_t iface); diff --git a/net/dsa/port.c b/net/dsa/port.c index 208168276995..48c9eaa74aee 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -1536,16 +1536,14 @@ static void dsa_port_phylink_validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state) { - struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_validate) { - if (config->mac_capabilities) - phylink_generic_validate(config, supported, state); - return; - } - - ds->ops->phylink_validate(ds, dp->index, supported, state); + /* Skip call for drivers which don't yet set mac_capabilities, + * since validating in that case would mean their PHY will advertise + * nothing. In turn, skipping validation makes them advertise + * everything that the PHY supports, so those drivers should be + * converted ASAP. + */ + if (config->mac_capabilities) + phylink_generic_validate(config, supported, state); } static void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config, -- cgit v1.2.3-70-g09d2 From 6423ac2eb31ec33f8526dc48f1e541b665333970 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Nov 2022 16:00:21 +0000 Subject: rxrpc: Fix oops from calling udpv6_sendmsg() on AF_INET socket If rxrpc sees an IPv6 address, it assumes it can call udpv6_sendmsg() on it - even if it got it on an IPv4 socket. Fix do_udp_sendmsg() to give an error in such a case. general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] ... RIP: 0010:ipv6_addr_v4mapped include/net/ipv6.h:749 [inline] RIP: 0010:udpv6_sendmsg+0xd0a/0x2c70 net/ipv6/udp.c:1361 ... Call Trace: do_udp_sendmsg net/rxrpc/output.c:27 [inline] do_udp_sendmsg net/rxrpc/output.c:21 [inline] rxrpc_send_abort_packet+0x73b/0x860 net/rxrpc/output.c:367 rxrpc_release_calls_on_socket+0x211/0x300 net/rxrpc/call_object.c:595 rxrpc_release_sock net/rxrpc/af_rxrpc.c:886 [inline] rxrpc_release+0x263/0x5a0 net/rxrpc/af_rxrpc.c:917 __sock_release+0xcd/0x280 net/socket.c:650 sock_close+0x18/0x20 net/socket.c:1365 __fput+0x27c/0xa90 fs/file_table.c:320 task_work_run+0x16b/0x270 kernel/task_work.c:179 exit_task_work include/linux/task_work.h:38 [inline] do_exit+0xb35/0x2a20 kernel/exit.c:820 do_group_exit+0xd0/0x2a0 kernel/exit.c:950 __do_sys_exit_group kernel/exit.c:961 [inline] __se_sys_exit_group kernel/exit.c:959 [inline] __x64_sys_exit_group+0x3a/0x50 kernel/exit.c:959 Fixes: ed472b0c8783 ("rxrpc: Call udp_sendmsg() directly") Reported-by: Eric Dumazet Suggested-by: Eric Dumazet Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/output.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 46432e70a16b..a2fe1a262f8a 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -18,15 +18,21 @@ extern int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); -static ssize_t do_udp_sendmsg(struct socket *sk, struct msghdr *msg, size_t len) +static ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len) { -#if IS_ENABLED(CONFIG_AF_RXRPC_IPV6) struct sockaddr *sa = msg->msg_name; + struct sock *sk = socket->sk; - if (sa->sa_family == AF_INET6) - return udpv6_sendmsg(sk->sk, msg, len); -#endif - return udp_sendmsg(sk->sk, msg, len); + if (IS_ENABLED(CONFIG_AF_RXRPC_IPV6)) { + if (sa->sa_family == AF_INET6) { + if (sk->sk_family != AF_INET6) { + pr_warn("AF_INET6 address on AF_INET socket\n"); + return -ENOPROTOOPT; + } + return udpv6_sendmsg(sk, msg, len); + } + } + return udp_sendmsg(sk, msg, len); } struct rxrpc_abort_buffer { -- cgit v1.2.3-70-g09d2 From 66f6fd278c6780ea8c8bb7dac839132d8e76dd53 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 14 Nov 2022 11:24:09 +0000 Subject: rxrpc: Fix network address validation Fix network address validation on entry to uapi functions such as connect() for AF_RXRPC. The check for address compatibility with the transport socket isn't correct and allows an AF_INET6 address to be given to an AF_INET socket, resulting in an oops now that rxrpc is calling udp_sendmsg() directly. Sample program: #define _GNU_SOURCE #include #include #include #include #include static unsigned char ctrl[256] = "\x18\x00\x00\x00\x00\x00\x00\x00\x10\x01\x00\x00\x01"; int main(void) { struct sockaddr_rxrpc srx = { .srx_family = AF_RXRPC, .transport_type = SOCK_DGRAM, .transport_len = 28, .transport.sin6.sin6_family = AF_INET6, }; struct mmsghdr vec = { .msg_hdr.msg_control = ctrl, .msg_hdr.msg_controllen = 0x18, }; int s; s = socket(AF_RXRPC, SOCK_DGRAM, AF_INET); if (s < 0) { perror("socket"); exit(1); } if (connect(s, (struct sockaddr *)&srx, sizeof(srx)) < 0) { perror("connect"); exit(1); } if (sendmmsg(s, &vec, 1, MSG_NOSIGNAL | MSG_MORE) < 0) { perror("sendmmsg"); exit(1); } return 0; } If working properly, connect() should fail with EAFNOSUPPORT. Fixes: ed472b0c8783 ("rxrpc: Call udp_sendmsg() directly") Reported-by: Eric Dumazet Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/af_rxrpc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 2f3991cf8715..aacdd96a9886 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -93,12 +93,11 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx, srx->transport_len > len) return -EINVAL; - if (srx->transport.family != rx->family && - srx->transport.family == AF_INET && rx->family != AF_INET6) - return -EAFNOSUPPORT; - switch (srx->transport.family) { case AF_INET: + if (rx->family != AF_INET && + rx->family != AF_INET6) + return -EAFNOSUPPORT; if (srx->transport_len < sizeof(struct sockaddr_in)) return -EINVAL; tail = offsetof(struct sockaddr_rxrpc, transport.sin.__pad); @@ -106,6 +105,8 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx, #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: + if (rx->family != AF_INET6) + return -EAFNOSUPPORT; if (srx->transport_len < sizeof(struct sockaddr_in6)) return -EINVAL; tail = offsetof(struct sockaddr_rxrpc, transport) + -- cgit v1.2.3-70-g09d2 From 02ae6a7034d7b2e3d89e33d73da10a1f156789a0 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 8 Nov 2022 14:23:55 -0600 Subject: wifi: cfg80211: Avoid clashing function prototypes When built with Control Flow Integrity, function prototypes between caller and function declaration must match. These mismatches are visible at compile time with the new -Wcast-function-type-strict in Clang[1]. Fix a total of 73 warnings like these: drivers/net/wireless/intersil/orinoco/wext.c:1379:27: warning: cast from 'int (*)(struct net_device *, struct iw_request_info *, struct iw_param *, char *)' to 'iw_handler' (aka 'int (*)(struct net_device *, struct iw_request_info *, union iwreq_data *, char *)') converts to incompatible function type [-Wcast-function-type-strict] IW_HANDLER(SIOCGIWPOWER, (iw_handler)orinoco_ioctl_getpower), ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../net/wireless/wext-compat.c:1607:33: warning: cast from 'int (*)(struct net_device *, struct iw_request_info *, struct iw_point *, char *)' to 'iw_handler' (aka 'int (*)(struct net_device *, struct iw_request_info *, union iwreq_data *, char *)') converts to incompatible function type [-Wcast-function-type-strict] [IW_IOCTL_IDX(SIOCSIWGENIE)] = (iw_handler) cfg80211_wext_siwgenie, ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/net/wireless/intersil/orinoco/wext.c:1390:27: error: incompatible function pointer types initializing 'const iw_handler' (aka 'int (*const)(struct net_device *, struct iw_request_info *, union iwreq_data *, char *)') with an expression of type 'int (struct net_device *, struct iw_request_info *, struct iw_param *, char *)' [-Wincompatible-function-pointer-types] IW_HANDLER(SIOCGIWRETRY, cfg80211_wext_giwretry), ^~~~~~~~~~~~~~~~~~~~~~ The cfg80211 Wireless Extension handler callbacks (iw_handler) use a union for the data argument. Actually use the union and perform explicit member selection in the function body instead of having a function prototype mismatch. There are no resulting binary differences before/after changes. These changes were made partly manually and partly with the help of Coccinelle. Link: https://github.com/KSPP/linux/issues/234 Link: https://reviews.llvm.org/D134831 [1] Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/a68822bf8dd587988131bb6a295280cb4293f05d.1667934775.git.gustavoars@kernel.org --- drivers/net/wireless/intel/ipw2x00/ipw2200.c | 2 +- drivers/net/wireless/intersil/orinoco/wext.c | 22 ++-- include/net/cfg80211-wext.h | 20 +-- net/wireless/scan.c | 3 +- net/wireless/wext-compat.c | 180 ++++++++++++--------------- net/wireless/wext-compat.h | 8 +- net/wireless/wext-sme.c | 5 +- 7 files changed, 113 insertions(+), 127 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2200.c b/drivers/net/wireless/intel/ipw2x00/ipw2200.c index 79d5c09757d4..ca802af8cddc 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2200.c +++ b/drivers/net/wireless/intel/ipw2x00/ipw2200.c @@ -9856,7 +9856,7 @@ static int ipw_wx_sw_reset(struct net_device *dev, /* Rebase the WE IOCTLs to zero for the handler array */ static iw_handler ipw_wx_handlers[] = { - IW_HANDLER(SIOCGIWNAME, (iw_handler)cfg80211_wext_giwname), + IW_HANDLER(SIOCGIWNAME, cfg80211_wext_giwname), IW_HANDLER(SIOCSIWFREQ, ipw_wx_set_freq), IW_HANDLER(SIOCGIWFREQ, ipw_wx_get_freq), IW_HANDLER(SIOCSIWMODE, ipw_wx_set_mode), diff --git a/drivers/net/wireless/intersil/orinoco/wext.c b/drivers/net/wireless/intersil/orinoco/wext.c index b8eb5d60192f..dea1ff044342 100644 --- a/drivers/net/wireless/intersil/orinoco/wext.c +++ b/drivers/net/wireless/intersil/orinoco/wext.c @@ -1363,31 +1363,31 @@ static const struct iw_priv_args orinoco_privtab[] = { static const iw_handler orinoco_handler[] = { IW_HANDLER(SIOCSIWCOMMIT, orinoco_ioctl_commit), - IW_HANDLER(SIOCGIWNAME, (iw_handler)cfg80211_wext_giwname), + IW_HANDLER(SIOCGIWNAME, cfg80211_wext_giwname), IW_HANDLER(SIOCSIWFREQ, orinoco_ioctl_setfreq), IW_HANDLER(SIOCGIWFREQ, orinoco_ioctl_getfreq), - IW_HANDLER(SIOCSIWMODE, (iw_handler)cfg80211_wext_siwmode), - IW_HANDLER(SIOCGIWMODE, (iw_handler)cfg80211_wext_giwmode), + IW_HANDLER(SIOCSIWMODE, cfg80211_wext_siwmode), + IW_HANDLER(SIOCGIWMODE, cfg80211_wext_giwmode), IW_HANDLER(SIOCSIWSENS, orinoco_ioctl_setsens), IW_HANDLER(SIOCGIWSENS, orinoco_ioctl_getsens), - IW_HANDLER(SIOCGIWRANGE, (iw_handler)cfg80211_wext_giwrange), + IW_HANDLER(SIOCGIWRANGE, cfg80211_wext_giwrange), IW_HANDLER(SIOCSIWSPY, iw_handler_set_spy), IW_HANDLER(SIOCGIWSPY, iw_handler_get_spy), IW_HANDLER(SIOCSIWTHRSPY, iw_handler_set_thrspy), IW_HANDLER(SIOCGIWTHRSPY, iw_handler_get_thrspy), IW_HANDLER(SIOCSIWAP, orinoco_ioctl_setwap), IW_HANDLER(SIOCGIWAP, orinoco_ioctl_getwap), - IW_HANDLER(SIOCSIWSCAN, (iw_handler)cfg80211_wext_siwscan), - IW_HANDLER(SIOCGIWSCAN, (iw_handler)cfg80211_wext_giwscan), + IW_HANDLER(SIOCSIWSCAN, cfg80211_wext_siwscan), + IW_HANDLER(SIOCGIWSCAN, cfg80211_wext_giwscan), IW_HANDLER(SIOCSIWESSID, orinoco_ioctl_setessid), IW_HANDLER(SIOCGIWESSID, orinoco_ioctl_getessid), IW_HANDLER(SIOCSIWRATE, orinoco_ioctl_setrate), IW_HANDLER(SIOCGIWRATE, orinoco_ioctl_getrate), - IW_HANDLER(SIOCSIWRTS, (iw_handler)cfg80211_wext_siwrts), - IW_HANDLER(SIOCGIWRTS, (iw_handler)cfg80211_wext_giwrts), - IW_HANDLER(SIOCSIWFRAG, (iw_handler)cfg80211_wext_siwfrag), - IW_HANDLER(SIOCGIWFRAG, (iw_handler)cfg80211_wext_giwfrag), - IW_HANDLER(SIOCGIWRETRY, (iw_handler)cfg80211_wext_giwretry), + IW_HANDLER(SIOCSIWRTS, cfg80211_wext_siwrts), + IW_HANDLER(SIOCGIWRTS, cfg80211_wext_giwrts), + IW_HANDLER(SIOCSIWFRAG, cfg80211_wext_siwfrag), + IW_HANDLER(SIOCGIWFRAG, cfg80211_wext_giwfrag), + IW_HANDLER(SIOCGIWRETRY, cfg80211_wext_giwretry), IW_HANDLER(SIOCSIWENCODE, orinoco_ioctl_setiwencode), IW_HANDLER(SIOCGIWENCODE, orinoco_ioctl_getiwencode), IW_HANDLER(SIOCSIWPOWER, orinoco_ioctl_setpower), diff --git a/include/net/cfg80211-wext.h b/include/net/cfg80211-wext.h index ad77caf2ffde..0ee36d97e068 100644 --- a/include/net/cfg80211-wext.h +++ b/include/net/cfg80211-wext.h @@ -19,34 +19,34 @@ */ int cfg80211_wext_giwname(struct net_device *dev, struct iw_request_info *info, - char *name, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, - u32 *mode, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info, - u32 *mode, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_siwscan(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra); int cfg80211_wext_giwscan(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_giwrange(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_siwrts(struct net_device *dev, struct iw_request_info *info, - struct iw_param *rts, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_giwrts(struct net_device *dev, struct iw_request_info *info, - struct iw_param *rts, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_siwfrag(struct net_device *dev, struct iw_request_info *info, - struct iw_param *frag, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_giwfrag(struct net_device *dev, struct iw_request_info *info, - struct iw_param *frag, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_giwretry(struct net_device *dev, struct iw_request_info *info, - struct iw_param *retry, char *extra); + union iwreq_data *wrqu, char *extra); #endif /* __NET_CFG80211_WEXT_H */ diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 806a5f1330ff..853619bc0f1a 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -3229,8 +3229,9 @@ static int ieee80211_scan_results(struct cfg80211_registered_device *rdev, int cfg80211_wext_giwscan(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_point *data = &wrqu->data; struct cfg80211_registered_device *rdev; int res; diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index ddf340bfa07a..8a24dfca75af 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -25,16 +25,17 @@ int cfg80211_wext_giwname(struct net_device *dev, struct iw_request_info *info, - char *name, char *extra) + union iwreq_data *wrqu, char *extra) { - strcpy(name, "IEEE 802.11"); + strcpy(wrqu->name, "IEEE 802.11"); return 0; } EXPORT_WEXT_HANDLER(cfg80211_wext_giwname); int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, - u32 *mode, char *extra) + union iwreq_data *wrqu, char *extra) { + __u32 *mode = &wrqu->mode; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev; struct vif_params vifparams; @@ -71,8 +72,9 @@ int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, EXPORT_WEXT_HANDLER(cfg80211_wext_siwmode); int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info, - u32 *mode, char *extra) + union iwreq_data *wrqu, char *extra) { + __u32 *mode = &wrqu->mode; struct wireless_dev *wdev = dev->ieee80211_ptr; if (!wdev) @@ -108,8 +110,9 @@ EXPORT_WEXT_HANDLER(cfg80211_wext_giwmode); int cfg80211_wext_giwrange(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct iw_range *range = (struct iw_range *) extra; enum nl80211_band band; @@ -251,8 +254,9 @@ int cfg80211_wext_freq(struct iw_freq *freq) int cfg80211_wext_siwrts(struct net_device *dev, struct iw_request_info *info, - struct iw_param *rts, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_param *rts = &wrqu->rts; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u32 orts = wdev->wiphy->rts_threshold; @@ -281,8 +285,9 @@ EXPORT_WEXT_HANDLER(cfg80211_wext_siwrts); int cfg80211_wext_giwrts(struct net_device *dev, struct iw_request_info *info, - struct iw_param *rts, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_param *rts = &wrqu->rts; struct wireless_dev *wdev = dev->ieee80211_ptr; rts->value = wdev->wiphy->rts_threshold; @@ -295,8 +300,9 @@ EXPORT_WEXT_HANDLER(cfg80211_wext_giwrts); int cfg80211_wext_siwfrag(struct net_device *dev, struct iw_request_info *info, - struct iw_param *frag, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_param *frag = &wrqu->frag; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u32 ofrag = wdev->wiphy->frag_threshold; @@ -325,8 +331,9 @@ EXPORT_WEXT_HANDLER(cfg80211_wext_siwfrag); int cfg80211_wext_giwfrag(struct net_device *dev, struct iw_request_info *info, - struct iw_param *frag, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_param *frag = &wrqu->frag; struct wireless_dev *wdev = dev->ieee80211_ptr; frag->value = wdev->wiphy->frag_threshold; @@ -339,8 +346,9 @@ EXPORT_WEXT_HANDLER(cfg80211_wext_giwfrag); static int cfg80211_wext_siwretry(struct net_device *dev, struct iw_request_info *info, - struct iw_param *retry, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_param *retry = &wrqu->retry; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u32 changed = 0; @@ -378,8 +386,9 @@ static int cfg80211_wext_siwretry(struct net_device *dev, int cfg80211_wext_giwretry(struct net_device *dev, struct iw_request_info *info, - struct iw_param *retry, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_param *retry = &wrqu->retry; struct wireless_dev *wdev = dev->ieee80211_ptr; retry->disabled = 0; @@ -588,8 +597,9 @@ static int cfg80211_set_encryption(struct cfg80211_registered_device *rdev, static int cfg80211_wext_siwencode(struct net_device *dev, struct iw_request_info *info, - struct iw_point *erq, char *keybuf) + union iwreq_data *wrqu, char *keybuf) { + struct iw_point *erq = &wrqu->encoding; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int idx, err; @@ -664,8 +674,9 @@ out: static int cfg80211_wext_siwencodeext(struct net_device *dev, struct iw_request_info *info, - struct iw_point *erq, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_point *erq = &wrqu->encoding; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct iw_encode_ext *ext = (struct iw_encode_ext *) extra; @@ -767,8 +778,9 @@ static int cfg80211_wext_siwencodeext(struct net_device *dev, static int cfg80211_wext_giwencode(struct net_device *dev, struct iw_request_info *info, - struct iw_point *erq, char *keybuf) + union iwreq_data *wrqu, char *keybuf) { + struct iw_point *erq = &wrqu->encoding; struct wireless_dev *wdev = dev->ieee80211_ptr; int idx; @@ -804,8 +816,9 @@ static int cfg80211_wext_giwencode(struct net_device *dev, static int cfg80211_wext_siwfreq(struct net_device *dev, struct iw_request_info *info, - struct iw_freq *wextfreq, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_freq *wextfreq = &wrqu->freq; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_chan_def chandef = { @@ -870,8 +883,9 @@ static int cfg80211_wext_siwfreq(struct net_device *dev, static int cfg80211_wext_giwfreq(struct net_device *dev, struct iw_request_info *info, - struct iw_freq *freq, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_freq *freq = &wrqu->freq; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_chan_def chandef = {}; @@ -1147,8 +1161,9 @@ static int cfg80211_set_key_mgt(struct wireless_dev *wdev, u32 key_mgt) static int cfg80211_wext_siwauth(struct net_device *dev, struct iw_request_info *info, - struct iw_param *data, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_param *data = &wrqu->param; struct wireless_dev *wdev = dev->ieee80211_ptr; if (wdev->iftype != NL80211_IFTYPE_STATION) @@ -1180,7 +1195,7 @@ static int cfg80211_wext_siwauth(struct net_device *dev, static int cfg80211_wext_giwauth(struct net_device *dev, struct iw_request_info *info, - struct iw_param *data, char *extra) + union iwreq_data *wrqu, char *extra) { /* XXX: what do we need? */ @@ -1189,8 +1204,9 @@ static int cfg80211_wext_giwauth(struct net_device *dev, static int cfg80211_wext_siwpower(struct net_device *dev, struct iw_request_info *info, - struct iw_param *wrq, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_param *wrq = &wrqu->power; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); bool ps; @@ -1238,8 +1254,9 @@ static int cfg80211_wext_siwpower(struct net_device *dev, static int cfg80211_wext_giwpower(struct net_device *dev, struct iw_request_info *info, - struct iw_param *wrq, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_param *wrq = &wrqu->power; struct wireless_dev *wdev = dev->ieee80211_ptr; wrq->disabled = !wdev->ps; @@ -1249,8 +1266,9 @@ static int cfg80211_wext_giwpower(struct net_device *dev, static int cfg80211_wext_siwrate(struct net_device *dev, struct iw_request_info *info, - struct iw_param *rate, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_param *rate = &wrqu->bitrate; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_bitrate_mask mask; @@ -1307,8 +1325,9 @@ static int cfg80211_wext_siwrate(struct net_device *dev, static int cfg80211_wext_giwrate(struct net_device *dev, struct iw_request_info *info, - struct iw_param *rate, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_param *rate = &wrqu->bitrate; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct station_info sinfo = {}; @@ -1430,8 +1449,9 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) static int cfg80211_wext_siwap(struct net_device *dev, struct iw_request_info *info, - struct sockaddr *ap_addr, char *extra) + union iwreq_data *wrqu, char *extra) { + struct sockaddr *ap_addr = &wrqu->ap_addr; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int ret; @@ -1455,8 +1475,9 @@ static int cfg80211_wext_siwap(struct net_device *dev, static int cfg80211_wext_giwap(struct net_device *dev, struct iw_request_info *info, - struct sockaddr *ap_addr, char *extra) + union iwreq_data *wrqu, char *extra) { + struct sockaddr *ap_addr = &wrqu->ap_addr; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int ret; @@ -1480,8 +1501,9 @@ static int cfg80211_wext_giwap(struct net_device *dev, static int cfg80211_wext_siwessid(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char *ssid) + union iwreq_data *wrqu, char *ssid) { + struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int ret; @@ -1505,8 +1527,9 @@ static int cfg80211_wext_siwessid(struct net_device *dev, static int cfg80211_wext_giwessid(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char *ssid) + union iwreq_data *wrqu, char *ssid) { + struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int ret; @@ -1533,7 +1556,7 @@ static int cfg80211_wext_giwessid(struct net_device *dev, static int cfg80211_wext_siwpmksa(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char *extra) + union iwreq_data *wrqu, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); @@ -1584,78 +1607,39 @@ static int cfg80211_wext_siwpmksa(struct net_device *dev, return ret; } -#define DEFINE_WEXT_COMPAT_STUB(func, type) \ - static int __ ## func(struct net_device *dev, \ - struct iw_request_info *info, \ - union iwreq_data *wrqu, \ - char *extra) \ - { \ - return func(dev, info, (type *)wrqu, extra); \ - } - -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwname, char) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwfreq, struct iw_freq) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwfreq, struct iw_freq) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwmode, u32) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwmode, u32) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwrange, struct iw_point) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwap, struct sockaddr) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwap, struct sockaddr) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwmlme, struct iw_point) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwscan, struct iw_point) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwessid, struct iw_point) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwessid, struct iw_point) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwrate, struct iw_param) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwrate, struct iw_param) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwrts, struct iw_param) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwrts, struct iw_param) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwfrag, struct iw_param) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwfrag, struct iw_param) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwretry, struct iw_param) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwretry, struct iw_param) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwencode, struct iw_point) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwencode, struct iw_point) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwpower, struct iw_param) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwpower, struct iw_param) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwgenie, struct iw_point) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwauth, struct iw_param) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwauth, struct iw_param) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwencodeext, struct iw_point) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwpmksa, struct iw_point) - static const iw_handler cfg80211_handlers[] = { - [IW_IOCTL_IDX(SIOCGIWNAME)] = __cfg80211_wext_giwname, - [IW_IOCTL_IDX(SIOCSIWFREQ)] = __cfg80211_wext_siwfreq, - [IW_IOCTL_IDX(SIOCGIWFREQ)] = __cfg80211_wext_giwfreq, - [IW_IOCTL_IDX(SIOCSIWMODE)] = __cfg80211_wext_siwmode, - [IW_IOCTL_IDX(SIOCGIWMODE)] = __cfg80211_wext_giwmode, - [IW_IOCTL_IDX(SIOCGIWRANGE)] = __cfg80211_wext_giwrange, - [IW_IOCTL_IDX(SIOCSIWAP)] = __cfg80211_wext_siwap, - [IW_IOCTL_IDX(SIOCGIWAP)] = __cfg80211_wext_giwap, - [IW_IOCTL_IDX(SIOCSIWMLME)] = __cfg80211_wext_siwmlme, - [IW_IOCTL_IDX(SIOCSIWSCAN)] = cfg80211_wext_siwscan, - [IW_IOCTL_IDX(SIOCGIWSCAN)] = __cfg80211_wext_giwscan, - [IW_IOCTL_IDX(SIOCSIWESSID)] = __cfg80211_wext_siwessid, - [IW_IOCTL_IDX(SIOCGIWESSID)] = __cfg80211_wext_giwessid, - [IW_IOCTL_IDX(SIOCSIWRATE)] = __cfg80211_wext_siwrate, - [IW_IOCTL_IDX(SIOCGIWRATE)] = __cfg80211_wext_giwrate, - [IW_IOCTL_IDX(SIOCSIWRTS)] = __cfg80211_wext_siwrts, - [IW_IOCTL_IDX(SIOCGIWRTS)] = __cfg80211_wext_giwrts, - [IW_IOCTL_IDX(SIOCSIWFRAG)] = __cfg80211_wext_siwfrag, - [IW_IOCTL_IDX(SIOCGIWFRAG)] = __cfg80211_wext_giwfrag, - [IW_IOCTL_IDX(SIOCSIWTXPOW)] = cfg80211_wext_siwtxpower, - [IW_IOCTL_IDX(SIOCGIWTXPOW)] = cfg80211_wext_giwtxpower, - [IW_IOCTL_IDX(SIOCSIWRETRY)] = __cfg80211_wext_siwretry, - [IW_IOCTL_IDX(SIOCGIWRETRY)] = __cfg80211_wext_giwretry, - [IW_IOCTL_IDX(SIOCSIWENCODE)] = __cfg80211_wext_siwencode, - [IW_IOCTL_IDX(SIOCGIWENCODE)] = __cfg80211_wext_giwencode, - [IW_IOCTL_IDX(SIOCSIWPOWER)] = __cfg80211_wext_siwpower, - [IW_IOCTL_IDX(SIOCGIWPOWER)] = __cfg80211_wext_giwpower, - [IW_IOCTL_IDX(SIOCSIWGENIE)] = __cfg80211_wext_siwgenie, - [IW_IOCTL_IDX(SIOCSIWAUTH)] = __cfg80211_wext_siwauth, - [IW_IOCTL_IDX(SIOCGIWAUTH)] = __cfg80211_wext_giwauth, - [IW_IOCTL_IDX(SIOCSIWENCODEEXT)]= __cfg80211_wext_siwencodeext, - [IW_IOCTL_IDX(SIOCSIWPMKSA)] = __cfg80211_wext_siwpmksa, + IW_HANDLER(SIOCGIWNAME, cfg80211_wext_giwname), + IW_HANDLER(SIOCSIWFREQ, cfg80211_wext_siwfreq), + IW_HANDLER(SIOCGIWFREQ, cfg80211_wext_giwfreq), + IW_HANDLER(SIOCSIWMODE, cfg80211_wext_siwmode), + IW_HANDLER(SIOCGIWMODE, cfg80211_wext_giwmode), + IW_HANDLER(SIOCGIWRANGE, cfg80211_wext_giwrange), + IW_HANDLER(SIOCSIWAP, cfg80211_wext_siwap), + IW_HANDLER(SIOCGIWAP, cfg80211_wext_giwap), + IW_HANDLER(SIOCSIWMLME, cfg80211_wext_siwmlme), + IW_HANDLER(SIOCSIWSCAN, cfg80211_wext_siwscan), + IW_HANDLER(SIOCGIWSCAN, cfg80211_wext_giwscan), + IW_HANDLER(SIOCSIWESSID, cfg80211_wext_siwessid), + IW_HANDLER(SIOCGIWESSID, cfg80211_wext_giwessid), + IW_HANDLER(SIOCSIWRATE, cfg80211_wext_siwrate), + IW_HANDLER(SIOCGIWRATE, cfg80211_wext_giwrate), + IW_HANDLER(SIOCSIWRTS, cfg80211_wext_siwrts), + IW_HANDLER(SIOCGIWRTS, cfg80211_wext_giwrts), + IW_HANDLER(SIOCSIWFRAG, cfg80211_wext_siwfrag), + IW_HANDLER(SIOCGIWFRAG, cfg80211_wext_giwfrag), + IW_HANDLER(SIOCSIWTXPOW, cfg80211_wext_siwtxpower), + IW_HANDLER(SIOCGIWTXPOW, cfg80211_wext_giwtxpower), + IW_HANDLER(SIOCSIWRETRY, cfg80211_wext_siwretry), + IW_HANDLER(SIOCGIWRETRY, cfg80211_wext_giwretry), + IW_HANDLER(SIOCSIWENCODE, cfg80211_wext_siwencode), + IW_HANDLER(SIOCGIWENCODE, cfg80211_wext_giwencode), + IW_HANDLER(SIOCSIWPOWER, cfg80211_wext_siwpower), + IW_HANDLER(SIOCGIWPOWER, cfg80211_wext_giwpower), + IW_HANDLER(SIOCSIWGENIE, cfg80211_wext_siwgenie), + IW_HANDLER(SIOCSIWAUTH, cfg80211_wext_siwauth), + IW_HANDLER(SIOCGIWAUTH, cfg80211_wext_giwauth), + IW_HANDLER(SIOCSIWENCODEEXT, cfg80211_wext_siwencodeext), + IW_HANDLER(SIOCSIWPMKSA, cfg80211_wext_siwpmksa), }; const struct iw_handler_def cfg80211_wext_handler = { diff --git a/net/wireless/wext-compat.h b/net/wireless/wext-compat.h index 8d3cc1552e2f..c02eb789e676 100644 --- a/net/wireless/wext-compat.h +++ b/net/wireless/wext-compat.h @@ -13,7 +13,7 @@ int cfg80211_ibss_wext_siwfreq(struct net_device *dev, struct iw_request_info *info, - struct iw_freq *freq, char *extra); + struct iw_freq *wextfreq, char *extra); int cfg80211_ibss_wext_giwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *freq, char *extra); @@ -32,7 +32,7 @@ int cfg80211_ibss_wext_giwessid(struct net_device *dev, int cfg80211_mgd_wext_siwfreq(struct net_device *dev, struct iw_request_info *info, - struct iw_freq *freq, char *extra); + struct iw_freq *wextfreq, char *extra); int cfg80211_mgd_wext_giwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *freq, char *extra); @@ -51,10 +51,10 @@ int cfg80211_mgd_wext_giwessid(struct net_device *dev, int cfg80211_wext_siwmlme(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_siwgenie(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_freq(struct iw_freq *freq); diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index 68f45afc352d..191c6d98c700 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -324,8 +324,9 @@ int cfg80211_mgd_wext_giwap(struct net_device *dev, int cfg80211_wext_siwgenie(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u8 *ie = extra; @@ -374,7 +375,7 @@ int cfg80211_wext_siwgenie(struct net_device *dev, int cfg80211_wext_siwmlme(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char *extra) + union iwreq_data *wrqu, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct iw_mlme *mlme = (struct iw_mlme *)extra; -- cgit v1.2.3-70-g09d2 From 919dfa0b20ae56060dce0436eb710717f8987d18 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Nov 2022 13:57:53 -0800 Subject: udp: Clean up some functions. This patch adds no functional change and cleans up some functions that the following patches touch around so that we make them tidy and easy to review/revert. The change is mainly to keep reverse christmas tree order. Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/udp.c | 39 +++++++++++++++++++++++---------------- net/ipv6/udp.c | 12 ++++++++---- 2 files changed, 31 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index b859d6c8298e..a34de263e9ce 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -232,16 +232,16 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) int udp_lib_get_port(struct sock *sk, unsigned short snum, unsigned int hash2_nulladdr) { - struct udp_hslot *hslot, *hslot2; struct udp_table *udptable = sk->sk_prot->h.udp_table; - int error = 1; + struct udp_hslot *hslot, *hslot2; struct net *net = sock_net(sk); + int error = 1; if (!snum) { + DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); + unsigned short first, last; int low, high, remaining; unsigned int rand; - unsigned short first, last; - DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; @@ -2519,10 +2519,13 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, __be16 rmt_port, __be32 rmt_addr, int dif, int sdif) { - struct sock *sk, *result; unsigned short hnum = ntohs(loc_port); - unsigned int slot = udp_hashfn(net, hnum, udp_table.mask); - struct udp_hslot *hslot = &udp_table.hash[slot]; + struct sock *sk, *result; + struct udp_hslot *hslot; + unsigned int slot; + + slot = udp_hashfn(net, hnum, udp_table.mask); + hslot = &udp_table.hash[slot]; /* Do not bother scanning a too big list */ if (hslot->count > 10) @@ -2550,14 +2553,18 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, __be16 rmt_port, __be32 rmt_addr, int dif, int sdif) { - unsigned short hnum = ntohs(loc_port); - unsigned int hash2 = ipv4_portaddr_hash(net, loc_addr, hnum); - unsigned int slot2 = hash2 & udp_table.mask; - struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); - const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); + unsigned short hnum = ntohs(loc_port); + unsigned int hash2, slot2; + struct udp_hslot *hslot2; + __portpair ports; struct sock *sk; + hash2 = ipv4_portaddr_hash(net, loc_addr, hnum); + slot2 = hash2 & udp_table.mask; + hslot2 = &udp_table.hash2[slot2]; + ports = INET_COMBINED_PORTS(rmt_port, hnum); + udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { if (inet_match(net, sk, acookie, ports, dif, sdif)) return sk; @@ -2970,10 +2977,10 @@ EXPORT_SYMBOL(udp_prot); static struct sock *udp_get_first(struct seq_file *seq, int start) { - struct sock *sk; - struct udp_seq_afinfo *afinfo; struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); + struct udp_seq_afinfo *afinfo; + struct sock *sk; if (state->bpf_seq_afinfo) afinfo = state->bpf_seq_afinfo; @@ -3004,9 +3011,9 @@ found: static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) { - struct udp_seq_afinfo *afinfo; struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); + struct udp_seq_afinfo *afinfo; if (state->bpf_seq_afinfo) afinfo = state->bpf_seq_afinfo; @@ -3062,8 +3069,8 @@ EXPORT_SYMBOL(udp_seq_next); void udp_seq_stop(struct seq_file *seq, void *v) { - struct udp_seq_afinfo *afinfo; struct udp_iter_state *state = seq->private; + struct udp_seq_afinfo *afinfo; if (state->bpf_seq_afinfo) afinfo = state->bpf_seq_afinfo; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e2de3d906c82..727de67e4c90 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1064,12 +1064,16 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, int dif, int sdif) { unsigned short hnum = ntohs(loc_port); - unsigned int hash2 = ipv6_portaddr_hash(net, loc_addr, hnum); - unsigned int slot2 = hash2 & udp_table.mask; - struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; - const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); + unsigned int hash2, slot2; + struct udp_hslot *hslot2; + __portpair ports; struct sock *sk; + hash2 = ipv6_portaddr_hash(net, loc_addr, hnum); + slot2 = hash2 & udp_table.mask; + hslot2 = &udp_table.hash2[slot2]; + ports = INET_COMBINED_PORTS(rmt_port, hnum); + udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { if (sk->sk_state == TCP_ESTABLISHED && inet6_match(net, sk, rmt_addr, loc_addr, ports, dif, sdif)) -- cgit v1.2.3-70-g09d2 From 67fb43308f4b354f13aabcc66dd5d99bfbb7e838 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Nov 2022 13:57:54 -0800 Subject: udp: Set NULL to sk->sk_prot->h.udp_table. We will soon introduce an optional per-netns hash table for UDP. This means we cannot use the global sk->sk_prot->h.udp_table to fetch a UDP hash table. Instead, set NULL to sk->sk_prot->h.udp_table for UDP and get a proper table from net->ipv4.udp_table. Note that we still need sk->sk_prot->h.udp_table for UDP LITE. Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + net/ipv4/udp.c | 15 +++++++++++---- net/ipv6/udp.c | 2 +- 3 files changed, 13 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 25f90bba4889..e4cc4d3cacc4 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -43,6 +43,7 @@ struct tcp_fastopen_context; struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; + struct udp_table *udp_table; #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index a34de263e9ce..6206c27a1659 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -131,6 +131,11 @@ EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc); #define MAX_UDP_PORTS 65536 #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN) +static struct udp_table *udp_get_table_prot(struct sock *sk) +{ + return sk->sk_prot->h.udp_table ? : sock_net(sk)->ipv4.udp_table; +} + static int udp_lib_lport_inuse(struct net *net, __u16 num, const struct udp_hslot *hslot, unsigned long *bitmap, @@ -232,7 +237,7 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) int udp_lib_get_port(struct sock *sk, unsigned short snum, unsigned int hash2_nulladdr) { - struct udp_table *udptable = sk->sk_prot->h.udp_table; + struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2; struct net *net = sock_net(sk); int error = 1; @@ -1999,7 +2004,7 @@ EXPORT_SYMBOL(udp_disconnect); void udp_lib_unhash(struct sock *sk) { if (sk_hashed(sk)) { - struct udp_table *udptable = sk->sk_prot->h.udp_table; + struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2; hslot = udp_hashslot(udptable, sock_net(sk), @@ -2030,7 +2035,7 @@ EXPORT_SYMBOL(udp_lib_unhash); void udp_lib_rehash(struct sock *sk, u16 newhash) { if (sk_hashed(sk)) { - struct udp_table *udptable = sk->sk_prot->h.udp_table; + struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2, *nhslot2; hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); @@ -2967,7 +2972,7 @@ struct proto udp_prot = { .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp_sock), - .h.udp_table = &udp_table, + .h.udp_table = NULL, .diag_destroy = udp_abort, }; EXPORT_SYMBOL(udp_prot); @@ -3280,6 +3285,8 @@ EXPORT_SYMBOL(udp_flow_hashrnd); static int __net_init udp_sysctl_init(struct net *net) { + net->ipv4.udp_table = &udp_table; + net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE; net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 727de67e4c90..bbd6dc398f3b 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1774,7 +1774,7 @@ struct proto udpv6_prot = { .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp6_sock), - .h.udp_table = &udp_table, + .h.udp_table = NULL, .diag_destroy = udp_abort, }; -- cgit v1.2.3-70-g09d2 From 478aee5d6bf617c932f4e9c2981f17e86e093fc5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Nov 2022 13:57:55 -0800 Subject: udp: Set NULL to udp_seq_afinfo.udp_table. We will soon introduce an optional per-netns hash table for UDP. This means we cannot use the global udp_seq_afinfo.udp_table to fetch a UDP hash table. Instead, set NULL to udp_seq_afinfo.udp_table for UDP and get a proper table from net->ipv4.udp_table. Note that we still need udp_seq_afinfo.udp_table for UDP LITE. Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/udp.c | 32 ++++++++++++++++++++++++-------- net/ipv6/udp.c | 2 +- 2 files changed, 25 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6206c27a1659..a1a15eb76304 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2980,11 +2980,18 @@ EXPORT_SYMBOL(udp_prot); /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS +static struct udp_table *udp_get_table_afinfo(struct udp_seq_afinfo *afinfo, + struct net *net) +{ + return afinfo->udp_table ? : net->ipv4.udp_table; +} + static struct sock *udp_get_first(struct seq_file *seq, int start) { struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); struct udp_seq_afinfo *afinfo; + struct udp_table *udptable; struct sock *sk; if (state->bpf_seq_afinfo) @@ -2992,9 +2999,11 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) else afinfo = pde_data(file_inode(seq->file)); - for (state->bucket = start; state->bucket <= afinfo->udp_table->mask; + udptable = udp_get_table_afinfo(afinfo, net); + + for (state->bucket = start; state->bucket <= udptable->mask; ++state->bucket) { - struct udp_hslot *hslot = &afinfo->udp_table->hash[state->bucket]; + struct udp_hslot *hslot = &udptable->hash[state->bucket]; if (hlist_empty(&hslot->head)) continue; @@ -3019,6 +3028,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); struct udp_seq_afinfo *afinfo; + struct udp_table *udptable; if (state->bpf_seq_afinfo) afinfo = state->bpf_seq_afinfo; @@ -3032,8 +3042,11 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) sk->sk_family != afinfo->family))); if (!sk) { - if (state->bucket <= afinfo->udp_table->mask) - spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock); + udptable = udp_get_table_afinfo(afinfo, net); + + if (state->bucket <= udptable->mask) + spin_unlock_bh(&udptable->hash[state->bucket].lock); + return udp_get_first(seq, state->bucket + 1); } return sk; @@ -3076,14 +3089,17 @@ void udp_seq_stop(struct seq_file *seq, void *v) { struct udp_iter_state *state = seq->private; struct udp_seq_afinfo *afinfo; + struct udp_table *udptable; if (state->bpf_seq_afinfo) afinfo = state->bpf_seq_afinfo; else afinfo = pde_data(file_inode(seq->file)); - if (state->bucket <= afinfo->udp_table->mask) - spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock); + udptable = udp_get_table_afinfo(afinfo, seq_file_net(seq)); + + if (state->bucket <= udptable->mask) + spin_unlock_bh(&udptable->hash[state->bucket].lock); } EXPORT_SYMBOL(udp_seq_stop); @@ -3196,7 +3212,7 @@ EXPORT_SYMBOL(udp_seq_ops); static struct udp_seq_afinfo udp4_seq_afinfo = { .family = AF_INET, - .udp_table = &udp_table, + .udp_table = NULL, }; static int __net_init udp4_proc_init_net(struct net *net) @@ -3316,7 +3332,7 @@ static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux) return -ENOMEM; afinfo->family = AF_UNSPEC; - afinfo->udp_table = &udp_table; + afinfo->udp_table = NULL; st->bpf_seq_afinfo = afinfo; ret = bpf_iter_init_seq_net(priv_data, aux); if (ret) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index bbd6dc398f3b..c3dee1f8d3bd 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1724,7 +1724,7 @@ EXPORT_SYMBOL(udp6_seq_ops); static struct udp_seq_afinfo udp6_seq_afinfo = { .family = AF_INET6, - .udp_table = &udp_table, + .udp_table = NULL, }; int __net_init udp6_proc_init(struct net *net) -- cgit v1.2.3-70-g09d2 From ba6aac1516779dd0ced22c136a2c2c4a9c70cf29 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Nov 2022 13:57:56 -0800 Subject: udp: Access &udp_table via net. We will soon introduce an optional per-netns hash table for UDP. This means we cannot use udp_table directly in most places. Instead, access it via net->ipv4.udp_table. The access will be valid only while initialising udp_table itself and creating/destroying each netns. Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/core/filter.c | 4 ++-- net/ipv4/udp.c | 23 +++++++++++++---------- net/ipv4/udp_diag.c | 6 +++--- net/ipv4/udp_offload.c | 5 +++-- net/ipv6/udp.c | 19 +++++++++++-------- net/ipv6/udp_offload.c | 5 +++-- 6 files changed, 35 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 6dd2baf5eeb2..da4f697e4fa4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6432,7 +6432,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, else sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, - dif, sdif, &udp_table, NULL); + dif, sdif, net->ipv4.udp_table, NULL); #if IS_ENABLED(CONFIG_IPV6) } else { struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; @@ -6448,7 +6448,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, src6, tuple->ipv6.sport, dst6, tuple->ipv6.dport, dif, sdif, - &udp_table, NULL); + net->ipv4.udp_table, NULL); #endif } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index a1a15eb76304..37e79158d145 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -472,7 +472,7 @@ static struct sock *udp4_lookup_run_bpf(struct net *net, struct sock *sk, *reuse_sk; bool no_reuseport; - if (udptable != &udp_table) + if (udptable != net->ipv4.udp_table) return NULL; /* only UDP is supported */ no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_UDP, saddr, sport, @@ -553,10 +553,11 @@ struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport) { const struct iphdr *iph = ip_hdr(skb); + struct net *net = dev_net(skb->dev); - return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport, + return __udp4_lib_lookup(net, iph->saddr, sport, iph->daddr, dport, inet_iif(skb), - inet_sdif(skb), &udp_table, NULL); + inet_sdif(skb), net->ipv4.udp_table, NULL); } /* Must be called under rcu_read_lock(). @@ -569,7 +570,7 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, struct sock *sk; sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport, - dif, 0, &udp_table, NULL); + dif, 0, net->ipv4.udp_table, NULL); if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; @@ -807,7 +808,7 @@ out: int udp_err(struct sk_buff *skb, u32 info) { - return __udp4_lib_err(skb, info, &udp_table); + return __udp4_lib_err(skb, info, dev_net(skb->dev)->ipv4.udp_table); } /* @@ -2524,13 +2525,14 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, __be16 rmt_port, __be32 rmt_addr, int dif, int sdif) { + struct udp_table *udptable = net->ipv4.udp_table; unsigned short hnum = ntohs(loc_port); struct sock *sk, *result; struct udp_hslot *hslot; unsigned int slot; - slot = udp_hashfn(net, hnum, udp_table.mask); - hslot = &udp_table.hash[slot]; + slot = udp_hashfn(net, hnum, udptable->mask); + hslot = &udptable->hash[slot]; /* Do not bother scanning a too big list */ if (hslot->count > 10) @@ -2558,6 +2560,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, __be16 rmt_port, __be32 rmt_addr, int dif, int sdif) { + struct udp_table *udptable = net->ipv4.udp_table; INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); unsigned short hnum = ntohs(loc_port); unsigned int hash2, slot2; @@ -2566,8 +2569,8 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, struct sock *sk; hash2 = ipv4_portaddr_hash(net, loc_addr, hnum); - slot2 = hash2 & udp_table.mask; - hslot2 = &udp_table.hash2[slot2]; + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; ports = INET_COMBINED_PORTS(rmt_port, hnum); udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { @@ -2649,7 +2652,7 @@ int udp_v4_early_demux(struct sk_buff *skb) int udp_rcv(struct sk_buff *skb) { - return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP); + return __udp4_lib_rcv(skb, dev_net(skb->dev)->ipv4.udp_table, IPPROTO_UDP); } void udp_destroy_sock(struct sock *sk) diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 1ed8c4d78e5c..de3f2d31f510 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -147,13 +147,13 @@ done: static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { - udp_dump(&udp_table, skb, cb, r); + udp_dump(sock_net(cb->skb->sk)->ipv4.udp_table, skb, cb, r); } static int udp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { - return udp_dump_one(&udp_table, cb, req); + return udp_dump_one(sock_net(cb->skb->sk)->ipv4.udp_table, cb, req); } static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, @@ -225,7 +225,7 @@ static int __udp_diag_destroy(struct sk_buff *in_skb, static int udp_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { - return __udp_diag_destroy(in_skb, req, &udp_table); + return __udp_diag_destroy(in_skb, req, sock_net(in_skb->sk)->ipv4.udp_table); } static int udplite_diag_destroy(struct sk_buff *in_skb, diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 6d1a4bec2614..aedde65e2268 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -600,10 +600,11 @@ static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport) { const struct iphdr *iph = skb_gro_network_header(skb); + struct net *net = dev_net(skb->dev); - return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport, + return __udp4_lib_lookup(net, iph->saddr, sport, iph->daddr, dport, inet_iif(skb), - inet_sdif(skb), &udp_table, NULL); + inet_sdif(skb), net->ipv4.udp_table, NULL); } INDIRECT_CALLABLE_SCOPE diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c3dee1f8d3bd..9fb2f33ee3a7 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -217,7 +217,7 @@ static inline struct sock *udp6_lookup_run_bpf(struct net *net, struct sock *sk, *reuse_sk; bool no_reuseport; - if (udptable != &udp_table) + if (udptable != net->ipv4.udp_table) return NULL; /* only UDP is supported */ no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_UDP, saddr, sport, @@ -298,10 +298,11 @@ struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport) { const struct ipv6hdr *iph = ipv6_hdr(skb); + struct net *net = dev_net(skb->dev); - return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport, + return __udp6_lib_lookup(net, &iph->saddr, sport, &iph->daddr, dport, inet6_iif(skb), - inet6_sdif(skb), &udp_table, NULL); + inet6_sdif(skb), net->ipv4.udp_table, NULL); } /* Must be called under rcu_read_lock(). @@ -314,7 +315,7 @@ struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be struct sock *sk; sk = __udp6_lib_lookup(net, saddr, sport, daddr, dport, - dif, 0, &udp_table, NULL); + dif, 0, net->ipv4.udp_table, NULL); if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; @@ -689,7 +690,8 @@ static __inline__ int udpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { - return __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table); + return __udp6_lib_err(skb, opt, type, code, offset, info, + dev_net(skb->dev)->ipv4.udp_table); } static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) @@ -1063,6 +1065,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, __be16 rmt_port, const struct in6_addr *rmt_addr, int dif, int sdif) { + struct udp_table *udptable = net->ipv4.udp_table; unsigned short hnum = ntohs(loc_port); unsigned int hash2, slot2; struct udp_hslot *hslot2; @@ -1070,8 +1073,8 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net, struct sock *sk; hash2 = ipv6_portaddr_hash(net, loc_addr, hnum); - slot2 = hash2 & udp_table.mask; - hslot2 = &udp_table.hash2[slot2]; + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; ports = INET_COMBINED_PORTS(rmt_port, hnum); udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { @@ -1127,7 +1130,7 @@ void udp_v6_early_demux(struct sk_buff *skb) INDIRECT_CALLABLE_SCOPE int udpv6_rcv(struct sk_buff *skb) { - return __udp6_lib_rcv(skb, &udp_table, IPPROTO_UDP); + return __udp6_lib_rcv(skb, dev_net(skb->dev)->ipv4.udp_table, IPPROTO_UDP); } /* diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 7720d04ed396..e0e10f6bcdc1 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -116,10 +116,11 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport) { const struct ipv6hdr *iph = skb_gro_network_header(skb); + struct net *net = dev_net(skb->dev); - return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport, + return __udp6_lib_lookup(net, &iph->saddr, sport, &iph->daddr, dport, inet6_iif(skb), - inet6_sdif(skb), &udp_table, NULL); + inet6_sdif(skb), net->ipv4.udp_table, NULL); } INDIRECT_CALLABLE_SCOPE -- cgit v1.2.3-70-g09d2 From 9804985bf27f8fbcf0d96c7435b5ad94a2a6ea20 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Nov 2022 13:57:57 -0800 Subject: udp: Introduce optional per-netns hash table. The maximum hash table size is 64K due to the nature of the protocol. [0] It's smaller than TCP, and fewer sockets can cause a performance drop. On an EC2 c5.24xlarge instance (192 GiB memory), after running iperf3 in different netns, creating 32Mi sockets without data transfer in the root netns causes regression for the iperf3's connection. uhash_entries sockets length Gbps 64K 1 1 5.69 1Mi 16 5.27 2Mi 32 4.90 4Mi 64 4.09 8Mi 128 2.96 16Mi 256 2.06 32Mi 512 1.12 The per-netns hash table breaks the lengthy lists into shorter ones. It is useful on a multi-tenant system with thousands of netns. With smaller hash tables, we can look up sockets faster, isolate noisy neighbours, and reduce lock contention. The max size of the per-netns table is 64K as well. This is because the possible hash range by udp_hashfn() always fits in 64K within the same netns and we cannot make full use of the whole buckets larger than 64K. /* 0 < num < 64K -> X < hash < X + 64K */ (num + net_hash_mix(net)) & mask; Also, the min size is 128. We use a bitmap to search for an available port in udp_lib_get_port(). To keep the bitmap on the stack and not fire the CONFIG_FRAME_WARN error at build time, we round up the table size to 128. The sysctl usage is the same with TCP: $ dmesg | cut -d ' ' -f 6- | grep "UDP hash" UDP hash table entries: 65536 (order: 9, 2097152 bytes, vmalloc) # sysctl net.ipv4.udp_hash_entries net.ipv4.udp_hash_entries = 65536 # can be changed by uhash_entries # sysctl net.ipv4.udp_child_hash_entries net.ipv4.udp_child_hash_entries = 0 # disabled by default # ip netns add test1 # ip netns exec test1 sysctl net.ipv4.udp_hash_entries net.ipv4.udp_hash_entries = -65536 # share the global table # sysctl -w net.ipv4.udp_child_hash_entries=100 net.ipv4.udp_child_hash_entries = 100 # ip netns add test2 # ip netns exec test2 sysctl net.ipv4.udp_hash_entries net.ipv4.udp_hash_entries = 128 # own a per-netns table with 2^n buckets We could optimise the hash table lookup/iteration further by removing the netns comparison for the per-netns one in the future. Also, we could optimise the sparse udp_hslot layout by putting it in udp_table. [0]: https://lore.kernel.org/netdev/4ACC2815.7010101@gmail.com/ Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.rst | 27 +++++++++ include/linux/udp.h | 2 + include/net/netns/ipv4.h | 2 + net/ipv4/sysctl_net_ipv4.c | 40 +++++++++++++ net/ipv4/udp.c | 101 +++++++++++++++++++++++++++++++-- 5 files changed, 166 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 815efc89ad73..727b25cc7ec4 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1177,6 +1177,33 @@ udp_rmem_min - INTEGER udp_wmem_min - INTEGER UDP does not have tx memory accounting and this tunable has no effect. +udp_hash_entries - INTEGER + Show the number of hash buckets for UDP sockets in the current + networking namespace. + + A negative value means the networking namespace does not own its + hash buckets and shares the initial networking namespace's one. + +udp_child_ehash_entries - INTEGER + Control the number of hash buckets for UDP sockets in the child + networking namespace, which must be set before clone() or unshare(). + + If the value is not 0, the kernel uses a value rounded up to 2^n + as the actual hash bucket size. 0 is a special value, meaning + the child networking namespace will share the initial networking + namespace's hash buckets. + + Note that the child will use the global one in case the kernel + fails to allocate enough memory. In addition, the global hash + buckets are spread over available NUMA nodes, but the allocation + of the child hash table depends on the current process's NUMA + policy, which could result in performance differences. + + Possible values: 0, 2^n (n: 7 (128) - 16 (64K)) + + Default: 0 + + RAW variables ============= diff --git a/include/linux/udp.h b/include/linux/udp.h index dea57aa37df6..a2892e151644 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -23,7 +23,9 @@ static inline struct udphdr *udp_hdr(const struct sk_buff *skb) return (struct udphdr *)skb_transport_header(skb); } +#define UDP_HTABLE_SIZE_MIN_PERNET 128 #define UDP_HTABLE_SIZE_MIN (CONFIG_BASE_SMALL ? 128 : 256) +#define UDP_HTABLE_SIZE_MAX 65536 static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask) { diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e4cc4d3cacc4..db762e35aca9 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -208,6 +208,8 @@ struct netns_ipv4 { atomic_t dev_addr_genid; + unsigned int sysctl_udp_child_hash_entries; + #ifdef CONFIG_SYSCTL unsigned long *sysctl_local_reserved_ports; int sysctl_ip_prot_sock; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0af28cedd071..0d0cc4ef2b85 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -40,6 +40,7 @@ static int one_day_secs = 24 * 3600; static u32 fib_multipath_hash_fields_all_mask __maybe_unused = FIB_MULTIPATH_HASH_FIELD_ALL_MASK; static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024; +static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX; static int tcp_plb_max_rounds = 31; static int tcp_plb_max_cong_thresh = 256; @@ -402,12 +403,36 @@ static int proc_tcp_ehash_entries(struct ctl_table *table, int write, if (!net_eq(net, &init_net) && !hinfo->pernet) tcp_ehash_entries *= -1; + memset(&tbl, 0, sizeof(tbl)); tbl.data = &tcp_ehash_entries; tbl.maxlen = sizeof(int); return proc_dointvec(&tbl, write, buffer, lenp, ppos); } +static int proc_udp_hash_entries(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_udp_child_hash_entries); + int udp_hash_entries; + struct ctl_table tbl; + + udp_hash_entries = net->ipv4.udp_table->mask + 1; + + /* A negative number indicates that the child netns + * shares the global udp_table. + */ + if (!net_eq(net, &init_net) && net->ipv4.udp_table == &udp_table) + udp_hash_entries *= -1; + + memset(&tbl, 0, sizeof(tbl)); + tbl.data = &udp_hash_entries; + tbl.maxlen = sizeof(int); + + return proc_dointvec(&tbl, write, buffer, lenp, ppos); +} + #ifdef CONFIG_IP_ROUTE_MULTIPATH static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, void *buffer, size_t *lenp, @@ -1361,6 +1386,21 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &tcp_child_ehash_entries_max, }, + { + .procname = "udp_hash_entries", + .data = &init_net.ipv4.sysctl_udp_child_hash_entries, + .mode = 0444, + .proc_handler = proc_udp_hash_entries, + }, + { + .procname = "udp_child_hash_entries", + .data = &init_net.ipv4.sysctl_udp_child_hash_entries, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &udp_child_hash_entries_max, + }, { .procname = "udp_rmem_min", .data = &init_net.ipv4.sysctl_udp_rmem_min, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 37e79158d145..1fb7d1ed1cb1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -129,7 +129,7 @@ DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc); EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc); #define MAX_UDP_PORTS 65536 -#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN) +#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN_PERNET) static struct udp_table *udp_get_table_prot(struct sock *sk) { @@ -3277,7 +3277,7 @@ void __init udp_table_init(struct udp_table *table, const char *name) &table->log, &table->mask, UDP_HTABLE_SIZE_MIN, - 64 * 1024); + UDP_HTABLE_SIZE_MAX); table->hash2 = table->hash + (table->mask + 1); for (i = 0; i <= table->mask; i++) { @@ -3302,22 +3302,111 @@ u32 udp_flow_hashrnd(void) } EXPORT_SYMBOL(udp_flow_hashrnd); -static int __net_init udp_sysctl_init(struct net *net) +static void __net_init udp_sysctl_init(struct net *net) { - net->ipv4.udp_table = &udp_table; - net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE; net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE; #ifdef CONFIG_NET_L3_MASTER_DEV net->ipv4.sysctl_udp_l3mdev_accept = 0; #endif +} + +static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_entries) +{ + struct udp_table *udptable; + int i; + + udptable = kmalloc(sizeof(*udptable), GFP_KERNEL); + if (!udptable) + goto out; + + udptable->hash = vmalloc_huge(hash_entries * 2 * sizeof(struct udp_hslot), + GFP_KERNEL_ACCOUNT); + if (!udptable->hash) + goto free_table; + + udptable->hash2 = udptable->hash + hash_entries; + udptable->mask = hash_entries - 1; + udptable->log = ilog2(hash_entries); + + for (i = 0; i < hash_entries; i++) { + INIT_HLIST_HEAD(&udptable->hash[i].head); + udptable->hash[i].count = 0; + spin_lock_init(&udptable->hash[i].lock); + + INIT_HLIST_HEAD(&udptable->hash2[i].head); + udptable->hash2[i].count = 0; + spin_lock_init(&udptable->hash2[i].lock); + } + + return udptable; + +free_table: + kfree(udptable); +out: + return NULL; +} + +static void __net_exit udp_pernet_table_free(struct net *net) +{ + struct udp_table *udptable = net->ipv4.udp_table; + + if (udptable == &udp_table) + return; + + kvfree(udptable->hash); + kfree(udptable); +} + +static void __net_init udp_set_table(struct net *net) +{ + struct udp_table *udptable; + unsigned int hash_entries; + struct net *old_net; + + if (net_eq(net, &init_net)) + goto fallback; + + old_net = current->nsproxy->net_ns; + hash_entries = READ_ONCE(old_net->ipv4.sysctl_udp_child_hash_entries); + if (!hash_entries) + goto fallback; + + /* Set min to keep the bitmap on stack in udp_lib_get_port() */ + if (hash_entries < UDP_HTABLE_SIZE_MIN_PERNET) + hash_entries = UDP_HTABLE_SIZE_MIN_PERNET; + else + hash_entries = roundup_pow_of_two(hash_entries); + + udptable = udp_pernet_table_alloc(hash_entries); + if (udptable) { + net->ipv4.udp_table = udptable; + } else { + pr_warn("Failed to allocate UDP hash table (entries: %u) " + "for a netns, fallback to the global one\n", + hash_entries); +fallback: + net->ipv4.udp_table = &udp_table; + } +} + +static int __net_init udp_pernet_init(struct net *net) +{ + udp_sysctl_init(net); + udp_set_table(net); return 0; } +static void __net_exit udp_pernet_exit(struct net *net) +{ + udp_pernet_table_free(net); +} + static struct pernet_operations __net_initdata udp_sysctl_ops = { - .init = udp_sysctl_init, + .init = udp_pernet_init, + .exit = udp_pernet_exit, }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) -- cgit v1.2.3-70-g09d2 From 8c55facecd7ade835287298ce325f930d888d8ec Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 14 Nov 2022 16:42:56 +0200 Subject: net: linkwatch: only report IF_OPER_LOWERLAYERDOWN if iflink is actually down RFC 2863 says: The lowerLayerDown state is also a refinement on the down state. This new state indicates that this interface runs "on top of" one or more other interfaces (see ifStackTable) and that this interface is down specifically because one or more of these lower-layer interfaces are down. DSA interfaces are virtual network devices, stacked on top of the DSA master, but they have a physical MAC, with a PHY that reports a real link status. But since DSA (perhaps improperly) uses an iflink to describe the relationship to its master since commit c084080151e1 ("dsa: set ->iflink on slave interfaces to the ifindex of the parent"), default_operstate() will misinterpret this to mean that every time the carrier of a DSA interface is not ok, it is because of the master being not ok. In fact, since commit c0a8a9c27493 ("net: dsa: automatically bring user ports down when master goes down"), DSA cannot even in theory be in the lowerLayerDown state, because it just calls dev_close_many(), thereby going down, when the master goes down. We could revert the commit that creates an iflink between a DSA user port and its master, especially since now we have an alternative IFLA_DSA_MASTER which has less side effects. But there may be tooling in use which relies on the iflink, which has existed since 2009. We could also probably do something local within DSA to overwrite what rfc2863_policy() did, in a way similar to hsr_set_operstate(), but this seems like a hack. What seems appropriate is to follow the iflink, and check the carrier status of that interface as well. If that's down too, yes, keep reporting lowerLayerDown, otherwise just down. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/core/link_watch.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/link_watch.c b/net/core/link_watch.c index aa6cb1f90966..c469d1c4db5d 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -38,9 +38,23 @@ static unsigned char default_operstate(const struct net_device *dev) if (netif_testing(dev)) return IF_OPER_TESTING; - if (!netif_carrier_ok(dev)) - return (dev->ifindex != dev_get_iflink(dev) ? - IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN); + /* Some uppers (DSA) have additional sources for being down, so + * first check whether lower is indeed the source of its down state. + */ + if (!netif_carrier_ok(dev)) { + int iflink = dev_get_iflink(dev); + struct net_device *peer; + + if (iflink == dev->ifindex) + return IF_OPER_DOWN; + + peer = __dev_get_by_index(dev_net(dev), iflink); + if (!peer) + return IF_OPER_DOWN; + + return netif_carrier_ok(peer) ? IF_OPER_DOWN : + IF_OPER_LOWERLAYERDOWN; + } if (netif_dormant(dev)) return IF_OPER_DORMANT; -- cgit v1.2.3-70-g09d2 From 57fc05e8e82d015d5d58572e146ac8579a66efea Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2022 09:10:56 +0000 Subject: net: mm_account_pinned_pages() optimization Adopt atomic_long_try_cmpxchg() in mm_account_pinned_pages() as it is slightly more efficient. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/skbuff.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 90d085290d49..4bf95e36ed16 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1267,13 +1267,12 @@ int mm_account_pinned_pages(struct mmpin *mmp, size_t size) max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; user = mmp->user ? : current_user(); + old_pg = atomic_long_read(&user->locked_vm); do { - old_pg = atomic_long_read(&user->locked_vm); new_pg = old_pg + num_pg; if (new_pg > max_pg) return -ENOBUFS; - } while (atomic_long_cmpxchg(&user->locked_vm, old_pg, new_pg) != - old_pg); + } while (!atomic_long_try_cmpxchg(&user->locked_vm, &old_pg, new_pg)); if (!mmp->user) { mmp->user = get_uid(user); -- cgit v1.2.3-70-g09d2 From 30189806fbb945c4cb753878aab0aa5af07bc921 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2022 09:10:57 +0000 Subject: ipv6: fib6_new_sernum() optimization Adopt atomic_try_cmpxchg() which is slightly more efficient. Signed-off-by: Eric Dumazet Cc: David Ahern Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 413f66781e50..2438da5ff6da 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -91,13 +91,12 @@ static void fib6_walker_unlink(struct net *net, struct fib6_walker *w) static int fib6_new_sernum(struct net *net) { - int new, old; + int new, old = atomic_read(&net->ipv6.fib6_sernum); do { - old = atomic_read(&net->ipv6.fib6_sernum); new = old < INT_MAX ? old + 1 : 1; - } while (atomic_cmpxchg(&net->ipv6.fib6_sernum, - old, new) != old); + } while (!atomic_try_cmpxchg(&net->ipv6.fib6_sernum, &old, new)); + return new; } -- cgit v1.2.3-70-g09d2 From 6af645a5b2da7898b2c37e8f39f1bc572c4ab85a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2022 09:10:58 +0000 Subject: net: net_{enable|disable}_timestamp() optimizations Adopting atomic_try_cmpxchg() makes the code cleaner. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 117e830cabb0..10b56648a9d4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2073,13 +2073,10 @@ static DECLARE_WORK(netstamp_work, netstamp_clear); void net_enable_timestamp(void) { #ifdef CONFIG_JUMP_LABEL - int wanted; + int wanted = atomic_read(&netstamp_wanted); - while (1) { - wanted = atomic_read(&netstamp_wanted); - if (wanted <= 0) - break; - if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted) + while (wanted > 0) { + if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1)) return; } atomic_inc(&netstamp_needed_deferred); @@ -2093,13 +2090,10 @@ EXPORT_SYMBOL(net_enable_timestamp); void net_disable_timestamp(void) { #ifdef CONFIG_JUMP_LABEL - int wanted; + int wanted = atomic_read(&netstamp_wanted); - while (1) { - wanted = atomic_read(&netstamp_wanted); - if (wanted <= 1) - break; - if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted) + while (wanted > 1) { + if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1)) return; } atomic_dec(&netstamp_needed_deferred); -- cgit v1.2.3-70-g09d2 From 1462160c7455cfc495693e47d0eabb371ec4f7ff Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2022 09:10:59 +0000 Subject: net: adopt try_cmpxchg() in napi_schedule_prep() and napi_complete_done() This makes the code slightly more efficient. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 10b56648a9d4..0c12c7ad04f2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5979,10 +5979,9 @@ EXPORT_SYMBOL(__napi_schedule); */ bool napi_schedule_prep(struct napi_struct *n) { - unsigned long val, new; + unsigned long new, val = READ_ONCE(n->state); do { - val = READ_ONCE(n->state); if (unlikely(val & NAPIF_STATE_DISABLE)) return false; new = val | NAPIF_STATE_SCHED; @@ -5995,7 +5994,7 @@ bool napi_schedule_prep(struct napi_struct *n) */ new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED * NAPIF_STATE_MISSED; - } while (cmpxchg(&n->state, val, new) != val); + } while (!try_cmpxchg(&n->state, &val, new)); return !(val & NAPIF_STATE_SCHED); } @@ -6063,9 +6062,8 @@ bool napi_complete_done(struct napi_struct *n, int work_done) local_irq_restore(flags); } + val = READ_ONCE(n->state); do { - val = READ_ONCE(n->state); - WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED)); new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED | @@ -6078,7 +6076,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) */ new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED * NAPIF_STATE_SCHED; - } while (cmpxchg(&n->state, val, new) != val); + } while (!try_cmpxchg(&n->state, &val, new)); if (unlikely(val & NAPIF_STATE_MISSED)) { __napi_schedule(n); -- cgit v1.2.3-70-g09d2 From 4ffa1d1c6842a97e84cfbe56bfcf70edb23608e2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2022 09:11:00 +0000 Subject: net: adopt try_cmpxchg() in napi_{enable|disable}() This makes code a bit cleaner. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 0c12c7ad04f2..fb943dad9651 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6397,8 +6397,8 @@ void napi_disable(struct napi_struct *n) might_sleep(); set_bit(NAPI_STATE_DISABLE, &n->state); - for ( ; ; ) { - val = READ_ONCE(n->state); + val = READ_ONCE(n->state); + do { if (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { usleep_range(20, 200); continue; @@ -6406,10 +6406,7 @@ void napi_disable(struct napi_struct *n) new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); - - if (cmpxchg(&n->state, val, new) == val) - break; - } + } while (!try_cmpxchg(&n->state, &val, new)); hrtimer_cancel(&n->timer); @@ -6426,16 +6423,15 @@ EXPORT_SYMBOL(napi_disable); */ void napi_enable(struct napi_struct *n) { - unsigned long val, new; + unsigned long new, val = READ_ONCE(n->state); do { - val = READ_ONCE(n->state); BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC); if (n->dev->threaded && n->thread) new |= NAPIF_STATE_THREADED; - } while (cmpxchg(&n->state, val, new) != val); + } while (!try_cmpxchg(&n->state, &val, new)); } EXPORT_SYMBOL(napi_enable); -- cgit v1.2.3-70-g09d2 From 4ebf802cf1c6a87fd14e7936728c99e18d7ba794 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2022 09:11:01 +0000 Subject: net: __sock_gen_cookie() cleanup Adopt atomic64_try_cmpxchg() and remove the loop, to make the intent more obvious. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock_diag.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index f7cf74cdd3db..b11593cae5a0 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -25,14 +25,14 @@ DEFINE_COOKIE(sock_cookie); u64 __sock_gen_cookie(struct sock *sk) { - while (1) { - u64 res = atomic64_read(&sk->sk_cookie); + u64 res = atomic64_read(&sk->sk_cookie); - if (res) - return res; - res = gen_cookie_next(&sock_cookie); - atomic64_cmpxchg(&sk->sk_cookie, 0, res); + if (!res) { + u64 new = gen_cookie_next(&sock_cookie); + + atomic64_try_cmpxchg(&sk->sk_cookie, &res, new); } + return res; } int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie) -- cgit v1.2.3-70-g09d2 From 6c1c5097781f563b70a81683ea6fdac21637573b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2022 08:53:55 +0000 Subject: net: add atomic_long_t to net_device_stats fields Long standing KCSAN issues are caused by data-race around some dev->stats changes. Most performance critical paths already use per-cpu variables, or per-queue ones. It is reasonable (and more correct) to use atomic operations for the slow paths. This patch adds an union for each field of net_device_stats, so that we can convert paths that are not yet protected by a spinlock or a mutex. netdev_stats_to_stats64() no longer has an #if BITS_PER_LONG==64 Note that the memcpy() we were using on 64bit arches had no provision to avoid load-tearing, while atomic_long_read() is providing the needed protection at no cost. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 58 ++++++++++++++++++++++++++++------------------- include/net/dst.h | 5 ++-- net/core/dev.c | 14 +++--------- 3 files changed, 40 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 02a2318da7c7..23b3903b0678 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -171,31 +171,38 @@ static inline bool dev_xmit_complete(int rc) * (unsigned long) so they can be read and written atomically. */ +#define NET_DEV_STAT(FIELD) \ + union { \ + unsigned long FIELD; \ + atomic_long_t __##FIELD; \ + } + struct net_device_stats { - unsigned long rx_packets; - unsigned long tx_packets; - unsigned long rx_bytes; - unsigned long tx_bytes; - unsigned long rx_errors; - unsigned long tx_errors; - unsigned long rx_dropped; - unsigned long tx_dropped; - unsigned long multicast; - unsigned long collisions; - unsigned long rx_length_errors; - unsigned long rx_over_errors; - unsigned long rx_crc_errors; - unsigned long rx_frame_errors; - unsigned long rx_fifo_errors; - unsigned long rx_missed_errors; - unsigned long tx_aborted_errors; - unsigned long tx_carrier_errors; - unsigned long tx_fifo_errors; - unsigned long tx_heartbeat_errors; - unsigned long tx_window_errors; - unsigned long rx_compressed; - unsigned long tx_compressed; + NET_DEV_STAT(rx_packets); + NET_DEV_STAT(tx_packets); + NET_DEV_STAT(rx_bytes); + NET_DEV_STAT(tx_bytes); + NET_DEV_STAT(rx_errors); + NET_DEV_STAT(tx_errors); + NET_DEV_STAT(rx_dropped); + NET_DEV_STAT(tx_dropped); + NET_DEV_STAT(multicast); + NET_DEV_STAT(collisions); + NET_DEV_STAT(rx_length_errors); + NET_DEV_STAT(rx_over_errors); + NET_DEV_STAT(rx_crc_errors); + NET_DEV_STAT(rx_frame_errors); + NET_DEV_STAT(rx_fifo_errors); + NET_DEV_STAT(rx_missed_errors); + NET_DEV_STAT(tx_aborted_errors); + NET_DEV_STAT(tx_carrier_errors); + NET_DEV_STAT(tx_fifo_errors); + NET_DEV_STAT(tx_heartbeat_errors); + NET_DEV_STAT(tx_window_errors); + NET_DEV_STAT(rx_compressed); + NET_DEV_STAT(tx_compressed); }; +#undef NET_DEV_STAT /* per-cpu stats, allocated on demand. * Try to fit them in a single cache line, for dev_get_stats() sake. @@ -5171,4 +5178,9 @@ extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; extern struct net_device *blackhole_netdev; +/* Note: Avoid these macros in fast path, prefer per-cpu or per-queue counters. */ +#define DEV_STATS_INC(DEV, FIELD) atomic_long_inc(&(DEV)->stats.__##FIELD) +#define DEV_STATS_ADD(DEV, FIELD, VAL) \ + atomic_long_add((VAL), &(DEV)->stats.__##FIELD) + #endif /* _LINUX_NETDEVICE_H */ diff --git a/include/net/dst.h b/include/net/dst.h index 00b479ce6b99..d67fda89cd0f 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -356,9 +356,8 @@ static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, struct net *net) { - /* TODO : stats should be SMP safe */ - dev->stats.rx_packets++; - dev->stats.rx_bytes += skb->len; + DEV_STATS_INC(dev, rx_packets); + DEV_STATS_ADD(dev, rx_bytes, skb->len); __skb_tunnel_rx(skb, dev, net); } diff --git a/net/core/dev.c b/net/core/dev.c index fb943dad9651..d0fb4af9a126 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10369,24 +10369,16 @@ void netdev_run_todo(void) void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, const struct net_device_stats *netdev_stats) { -#if BITS_PER_LONG == 64 - BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats)); - memcpy(stats64, netdev_stats, sizeof(*netdev_stats)); - /* zero out counters that only exist in rtnl_link_stats64 */ - memset((char *)stats64 + sizeof(*netdev_stats), 0, - sizeof(*stats64) - sizeof(*netdev_stats)); -#else - size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long); - const unsigned long *src = (const unsigned long *)netdev_stats; + size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t); + const atomic_long_t *src = (atomic_long_t *)netdev_stats; u64 *dst = (u64 *)stats64; BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); for (i = 0; i < n; i++) - dst[i] = src[i]; + dst[i] = atomic_long_read(&src[i]); /* zero out counters that only exist in rtnl_link_stats64 */ memset((char *)stats64 + n * sizeof(u64), 0, sizeof(*stats64) - n * sizeof(u64)); -#endif } EXPORT_SYMBOL(netdev_stats_to_stats64); -- cgit v1.2.3-70-g09d2 From cb34b7cf17ecf33499c9298943f85af247abc1e9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2022 08:53:56 +0000 Subject: ipv6/sit: use DEV_STATS_INC() to avoid data-races syzbot/KCSAN reported that multiple cpus are updating dev->stats.tx_error concurrently. This is because sit tunnels are NETIF_F_LLTX, meaning their ndo_start_xmit() is not protected by a spinlock. While original KCSAN report was about tx path, rx path has the same issue. Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/sit.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 5703d3cbea9b..70d81bba5093 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -694,7 +694,7 @@ static int ipip6_rcv(struct sk_buff *skb) skb->dev = tunnel->dev; if (packet_is_spoofed(skb, iph, tunnel)) { - tunnel->dev->stats.rx_errors++; + DEV_STATS_INC(tunnel->dev, rx_errors); goto out; } @@ -714,8 +714,8 @@ static int ipip6_rcv(struct sk_buff *skb) net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", &iph->saddr, iph->tos); if (err > 1) { - ++tunnel->dev->stats.rx_frame_errors; - ++tunnel->dev->stats.rx_errors; + DEV_STATS_INC(tunnel->dev, rx_frame_errors); + DEV_STATS_INC(tunnel->dev, rx_errors); goto out; } } @@ -942,7 +942,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, if (!rt) { rt = ip_route_output_flow(tunnel->net, &fl4, NULL); if (IS_ERR(rt)) { - dev->stats.tx_carrier_errors++; + DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, fl4.saddr); @@ -950,14 +950,14 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { ip_rt_put(rt); - dev->stats.tx_carrier_errors++; + DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } tdev = rt->dst.dev; if (tdev == dev) { ip_rt_put(rt); - dev->stats.collisions++; + DEV_STATS_INC(dev, collisions); goto tx_error; } @@ -970,7 +970,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, mtu = dst_mtu(&rt->dst) - t_hlen; if (mtu < IPV4_MIN_MTU) { - dev->stats.collisions++; + DEV_STATS_INC(dev, collisions); ip_rt_put(rt); goto tx_error; } @@ -1009,7 +1009,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { ip_rt_put(rt); - dev->stats.tx_dropped++; + DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } @@ -1039,7 +1039,7 @@ tx_error_icmp: dst_link_failure(skb); tx_error: kfree_skb(skb); - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); return NETDEV_TX_OK; } @@ -1058,7 +1058,7 @@ static netdev_tx_t sit_tunnel_xmit__(struct sk_buff *skb, return NETDEV_TX_OK; tx_error: kfree_skb(skb); - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); return NETDEV_TX_OK; } @@ -1087,7 +1087,7 @@ static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb, return NETDEV_TX_OK; tx_err: - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return NETDEV_TX_OK; -- cgit v1.2.3-70-g09d2 From 2fad1ba354d4a82b1b635a78f25d9682d4febb5e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2022 08:53:57 +0000 Subject: ipv6: tunnels: use DEV_STATS_INC() Most of code paths in tunnels are lockless (eg NETIF_F_LLTX in tx). Adopt SMP safe DEV_STATS_{INC|ADD}() to update dev->stats fields. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 11 ++++------- net/ipv6/ip6_tunnel.c | 26 ++++++++++++-------------- net/ipv6/ip6_vti.c | 16 +++++++--------- net/ipv6/ip6mr.c | 10 +++++----- 4 files changed, 28 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 7673e1dd1147..89f5f0f3f5d6 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -895,7 +895,6 @@ static netdev_tx_t ip6gre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - struct net_device_stats *stats = &t->dev->stats; __be16 payload_protocol; int ret; @@ -925,8 +924,8 @@ static netdev_tx_t ip6gre_tunnel_xmit(struct sk_buff *skb, tx_err: if (!t->parms.collect_md || !IS_ERR(skb_tunnel_info_txcheck(skb))) - stats->tx_errors++; - stats->tx_dropped++; + DEV_STATS_INC(dev, tx_errors); + DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } @@ -937,7 +936,6 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, struct ip_tunnel_info *tun_info = NULL; struct ip6_tnl *t = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); - struct net_device_stats *stats; bool truncate = false; int encap_limit = -1; __u8 dsfield = false; @@ -1086,10 +1084,9 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, return NETDEV_TX_OK; tx_err: - stats = &t->dev->stats; if (!IS_ERR(tun_info)) - stats->tx_errors++; - stats->tx_dropped++; + DEV_STATS_INC(dev, tx_errors); + DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 2fb4c6ad7243..47b6607a1370 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -803,8 +803,8 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, (tunnel->parms.i_flags & TUNNEL_CSUM)) || ((tpi->flags & TUNNEL_CSUM) && !(tunnel->parms.i_flags & TUNNEL_CSUM))) { - tunnel->dev->stats.rx_crc_errors++; - tunnel->dev->stats.rx_errors++; + DEV_STATS_INC(tunnel->dev, rx_crc_errors); + DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } @@ -812,8 +812,8 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, if (!(tpi->flags & TUNNEL_SEQ) || (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { - tunnel->dev->stats.rx_fifo_errors++; - tunnel->dev->stats.rx_errors++; + DEV_STATS_INC(tunnel->dev, rx_fifo_errors); + DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } tunnel->i_seqno = ntohl(tpi->seq) + 1; @@ -824,8 +824,8 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, /* Warning: All skb pointers will be invalidated! */ if (tunnel->dev->type == ARPHRD_ETHER) { if (!pskb_may_pull(skb, ETH_HLEN)) { - tunnel->dev->stats.rx_length_errors++; - tunnel->dev->stats.rx_errors++; + DEV_STATS_INC(tunnel->dev, rx_length_errors); + DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } @@ -849,8 +849,8 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, &ipv6h->saddr, ipv6_get_dsfield(ipv6h)); if (err > 1) { - ++tunnel->dev->stats.rx_frame_errors; - ++tunnel->dev->stats.rx_errors; + DEV_STATS_INC(tunnel->dev, rx_frame_errors); + DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } } @@ -1071,7 +1071,6 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, { struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; - struct net_device_stats *stats = &t->dev->stats; struct ipv6hdr *ipv6h; struct ipv6_tel_txoption opt; struct dst_entry *dst = NULL, *ndst = NULL; @@ -1166,7 +1165,7 @@ route_lookup: tdev = dst->dev; if (tdev == dev) { - stats->collisions++; + DEV_STATS_INC(dev, collisions); net_warn_ratelimited("%s: Local routing loop detected!\n", t->parms.name); goto tx_err_dst_release; @@ -1265,7 +1264,7 @@ route_lookup: ip6tunnel_xmit(NULL, skb, dev); return 0; tx_err_link_failure: - stats->tx_carrier_errors++; + DEV_STATS_INC(dev, tx_carrier_errors); dst_link_failure(skb); tx_err_dst_release: dst_release(dst); @@ -1408,7 +1407,6 @@ static netdev_tx_t ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - struct net_device_stats *stats = &t->dev->stats; u8 ipproto; int ret; @@ -1438,8 +1436,8 @@ ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; tx_err: - stats->tx_errors++; - stats->tx_dropped++; + DEV_STATS_INC(dev, tx_errors); + DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 151337d7f67b..10b222865d46 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -317,7 +317,7 @@ static int vti6_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi, ipv6h = ipv6_hdr(skb); if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) { - t->dev->stats.rx_dropped++; + DEV_STATS_INC(t->dev, rx_dropped); rcu_read_unlock(); goto discard; } @@ -359,8 +359,8 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err) dev = t->dev; if (err) { - dev->stats.rx_errors++; - dev->stats.rx_dropped++; + DEV_STATS_INC(dev, rx_errors); + DEV_STATS_INC(dev, rx_dropped); return 0; } @@ -446,7 +446,6 @@ static int vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) { struct ip6_tnl *t = netdev_priv(dev); - struct net_device_stats *stats = &t->dev->stats; struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; struct xfrm_state *x; @@ -506,7 +505,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) tdev = dst->dev; if (tdev == dev) { - stats->collisions++; + DEV_STATS_INC(dev, collisions); net_warn_ratelimited("%s: Local routing loop detected!\n", t->parms.name); goto tx_err_dst_release; @@ -544,7 +543,7 @@ xmit: return 0; tx_err_link_failure: - stats->tx_carrier_errors++; + DEV_STATS_INC(dev, tx_carrier_errors); dst_link_failure(skb); tx_err_dst_release: dst_release(dst); @@ -555,7 +554,6 @@ static netdev_tx_t vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - struct net_device_stats *stats = &t->dev->stats; struct flowi fl; int ret; @@ -591,8 +589,8 @@ vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; tx_err: - stats->tx_errors++; - stats->tx_dropped++; + DEV_STATS_INC(dev, tx_errors); + DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index facdc78a43e5..23e766597f36 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -608,8 +608,8 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) goto tx_err; - dev->stats.tx_bytes += skb->len; - dev->stats.tx_packets++; + DEV_STATS_ADD(dev, tx_bytes, skb->len); + DEV_STATS_INC(dev, tx_packets); rcu_read_lock(); ip6mr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num), MRT6MSG_WHOLEPKT); @@ -618,7 +618,7 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, return NETDEV_TX_OK; tx_err: - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return NETDEV_TX_OK; } @@ -2044,8 +2044,8 @@ static int ip6mr_forward2(struct net *net, struct mr_table *mrt, if (vif->flags & MIFF_REGISTER) { WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); - vif_dev->stats.tx_bytes += skb->len; - vif_dev->stats.tx_packets++; + DEV_STATS_ADD(vif_dev, tx_bytes, skb->len); + DEV_STATS_INC(vif_dev, tx_packets); ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT); goto out_free; } -- cgit v1.2.3-70-g09d2 From c4794d22251b979b12a6c8e2d3848b662a44fdb6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2022 08:53:58 +0000 Subject: ipv4: tunnels: use DEV_STATS_INC() Most of code paths in tunnels are lockless (eg NETIF_F_LLTX in tx). Adopt SMP safe DEV_STATS_INC() to update dev->stats fields. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 10 +++++----- net/ipv4/ip_tunnel.c | 32 ++++++++++++++++---------------- net/ipv4/ip_vti.c | 20 ++++++++++---------- net/ipv4/ipip.c | 2 +- net/ipv4/ipmr.c | 12 ++++++------ 5 files changed, 38 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index d8ee5238c395..a4ccef3e6935 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -510,7 +510,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev, err_free_skb: kfree_skb(skb); - dev->stats.tx_dropped++; + DEV_STATS_INC(dev, tx_dropped); } static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) @@ -592,7 +592,7 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) err_free_skb: kfree_skb(skb); - dev->stats.tx_dropped++; + DEV_STATS_INC(dev, tx_dropped); } static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) @@ -663,7 +663,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, free_skb: kfree_skb(skb); - dev->stats.tx_dropped++; + DEV_STATS_INC(dev, tx_dropped); return NETDEV_TX_OK; } @@ -717,7 +717,7 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, free_skb: kfree_skb(skb); - dev->stats.tx_dropped++; + DEV_STATS_INC(dev, tx_dropped); return NETDEV_TX_OK; } @@ -745,7 +745,7 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, free_skb: kfree_skb(skb); - dev->stats.tx_dropped++; + DEV_STATS_INC(dev, tx_dropped); return NETDEV_TX_OK; } diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 019f3b0839c5..de90b09dfe78 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -368,23 +368,23 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, #ifdef CONFIG_NET_IPGRE_BROADCAST if (ipv4_is_multicast(iph->daddr)) { - tunnel->dev->stats.multicast++; + DEV_STATS_INC(tunnel->dev, multicast); skb->pkt_type = PACKET_BROADCAST; } #endif if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) || ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) { - tunnel->dev->stats.rx_crc_errors++; - tunnel->dev->stats.rx_errors++; + DEV_STATS_INC(tunnel->dev, rx_crc_errors); + DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } if (tunnel->parms.i_flags&TUNNEL_SEQ) { if (!(tpi->flags&TUNNEL_SEQ) || (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { - tunnel->dev->stats.rx_fifo_errors++; - tunnel->dev->stats.rx_errors++; + DEV_STATS_INC(tunnel->dev, rx_fifo_errors); + DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } tunnel->i_seqno = ntohl(tpi->seq) + 1; @@ -398,8 +398,8 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", &iph->saddr, iph->tos); if (err > 1) { - ++tunnel->dev->stats.rx_frame_errors; - ++tunnel->dev->stats.rx_errors; + DEV_STATS_INC(tunnel->dev, rx_frame_errors); + DEV_STATS_INC(tunnel->dev, rx_errors); goto drop; } } @@ -581,7 +581,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, if (!rt) { rt = ip_route_output_key(tunnel->net, &fl4); if (IS_ERR(rt)) { - dev->stats.tx_carrier_errors++; + DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error; } if (use_cache) @@ -590,7 +590,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } if (rt->dst.dev == dev) { ip_rt_put(rt); - dev->stats.collisions++; + DEV_STATS_INC(dev, collisions); goto tx_error; } @@ -625,10 +625,10 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, df, !net_eq(tunnel->net, dev_net(dev))); return; tx_error: - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); goto kfree; tx_dropped: - dev->stats.tx_dropped++; + DEV_STATS_INC(dev, tx_dropped); kfree: kfree_skb(skb); } @@ -662,7 +662,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, /* NBMA tunnel */ if (!skb_dst(skb)) { - dev->stats.tx_fifo_errors++; + DEV_STATS_INC(dev, tx_fifo_errors); goto tx_error; } @@ -749,7 +749,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, rt = ip_route_output_key(tunnel->net, &fl4); if (IS_ERR(rt)) { - dev->stats.tx_carrier_errors++; + DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error; } if (use_cache) @@ -762,7 +762,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, if (rt->dst.dev == dev) { ip_rt_put(rt); - dev->stats.collisions++; + DEV_STATS_INC(dev, collisions); goto tx_error; } @@ -805,7 +805,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, if (skb_cow_head(skb, dev->needed_headroom)) { ip_rt_put(rt); - dev->stats.tx_dropped++; + DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return; } @@ -819,7 +819,7 @@ tx_error_icmp: dst_link_failure(skb); #endif tx_error: - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); } EXPORT_SYMBOL_GPL(ip_tunnel_xmit); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 8c2bd1d9ddce..53bfd8af6920 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -107,8 +107,8 @@ static int vti_rcv_cb(struct sk_buff *skb, int err) dev = tunnel->dev; if (err) { - dev->stats.rx_errors++; - dev->stats.rx_dropped++; + DEV_STATS_INC(dev, rx_errors); + DEV_STATS_INC(dev, rx_dropped); return 0; } @@ -183,7 +183,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4); if (IS_ERR(rt)) { - dev->stats.tx_carrier_errors++; + DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } dst = &rt->dst; @@ -198,14 +198,14 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, if (dst->error) { dst_release(dst); dst = NULL; - dev->stats.tx_carrier_errors++; + DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } skb_dst_set(skb, dst); break; #endif default: - dev->stats.tx_carrier_errors++; + DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } } @@ -213,7 +213,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, dst_hold(dst); dst = xfrm_lookup_route(tunnel->net, dst, fl, NULL, 0); if (IS_ERR(dst)) { - dev->stats.tx_carrier_errors++; + DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } @@ -221,7 +221,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, goto xmit; if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) { - dev->stats.tx_carrier_errors++; + DEV_STATS_INC(dev, tx_carrier_errors); dst_release(dst); goto tx_error_icmp; } @@ -230,7 +230,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, if (tdev == dev) { dst_release(dst); - dev->stats.collisions++; + DEV_STATS_INC(dev, collisions); goto tx_error; } @@ -267,7 +267,7 @@ xmit: tx_error_icmp: dst_link_failure(skb); tx_error: - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return NETDEV_TX_OK; } @@ -304,7 +304,7 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) return vti_xmit(skb, dev, &fl); tx_err: - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return NETDEV_TX_OK; } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 180f9daf5bec..abea77759b7e 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -310,7 +310,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, tx_error: kfree_skb(skb); - dev->stats.tx_errors++; + DEV_STATS_INC(dev, tx_errors); return NETDEV_TX_OK; } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index e04544ac4b45..b58df3c1bf7d 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -506,8 +506,8 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) return err; } - dev->stats.tx_bytes += skb->len; - dev->stats.tx_packets++; + DEV_STATS_ADD(dev, tx_bytes, skb->len); + DEV_STATS_INC(dev, tx_packets); rcu_read_lock(); /* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */ @@ -1839,8 +1839,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, if (vif->flags & VIFF_REGISTER) { WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); - vif_dev->stats.tx_bytes += skb->len; - vif_dev->stats.tx_packets++; + DEV_STATS_ADD(vif_dev, tx_bytes, skb->len); + DEV_STATS_INC(vif_dev, tx_packets); ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT); goto out_free; } @@ -1898,8 +1898,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, if (vif->flags & VIFF_TUNNEL) { ip_encap(net, skb, vif->local, vif->remote); /* FIXME: extra output firewall step used to be here. --RR */ - vif_dev->stats.tx_packets++; - vif_dev->stats.tx_bytes += skb->len; + DEV_STATS_INC(vif_dev, tx_packets); + DEV_STATS_ADD(vif_dev, tx_bytes, skb->len); } IPCB(skb)->flags |= IPSKB_FORWARDED; -- cgit v1.2.3-70-g09d2 From 1d7322f28fde3f90f2a2ed0b62fafc004b3f822a Mon Sep 17 00:00:00 2001 From: Li zeming Date: Tue, 15 Nov 2022 10:14:24 +0800 Subject: ax25: af_ax25: Remove unnecessary (void*) conversions The valptr pointer is of (void *) type, so other pointers need not be forced to assign values to it. Signed-off-by: Li zeming Signed-off-by: David S. Miller --- net/ax25/af_ax25.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 6b4c25a92377..d8da400cb4de 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -723,7 +723,7 @@ static int ax25_getsockopt(struct socket *sock, int level, int optname, if (maxlen < 1) return -EFAULT; - valptr = (void *) &val; + valptr = &val; length = min_t(unsigned int, maxlen, sizeof(int)); lock_sock(sk); @@ -785,7 +785,7 @@ static int ax25_getsockopt(struct socket *sock, int level, int optname, length = 1; } - valptr = (void *) devname; + valptr = devname; break; default: -- cgit v1.2.3-70-g09d2 From bf36267e3ad3df80a3a18eb0422723069a434934 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2022 09:18:51 +0000 Subject: tcp: annotate data-race around queue->synflood_warned Annotate the lockless read of queue->synflood_warned. Following xchg() has the needed data-race resolution. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 94024fdc2da1..0ae291e53eab 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6841,7 +6841,7 @@ static bool tcp_syn_flood_action(const struct sock *sk, const char *proto) #endif __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); - if (!queue->synflood_warned && syncookies != 2 && + if (!READ_ONCE(queue->synflood_warned) && syncookies != 2 && xchg(&queue->synflood_warned, 1) == 0) { if (IS_ENABLED(CONFIG_IPV6) && sk->sk_family == AF_INET6) { net_info_ratelimited("%s: Possible SYN flooding on port [%pI6c]:%u. %s.\n", -- cgit v1.2.3-70-g09d2 From b0798310f84c97d91e02c950d54677cad91ec5dd Mon Sep 17 00:00:00 2001 From: Li zeming Date: Tue, 15 Nov 2022 10:07:05 +0800 Subject: sctp: sm_statefuns: Remove pointer casts of the same type The subh.addip_hdr pointer is also of type (struct sctp_addiphdr *), so it does not require a cast. Signed-off-by: Li zeming Link: https://lore.kernel.org/r/20221115020705.3220-1-zeming@nfschina.com Signed-off-by: Paolo Abeni --- net/sctp/sm_statefuns.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index f6ee7f4040c1..ce5426171206 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -4044,7 +4044,7 @@ enum sctp_disposition sctp_sf_do_asconf_ack(struct net *net, (void *)err_param, commands); if (last_asconf) { - addip_hdr = (struct sctp_addiphdr *)last_asconf->subh.addip_hdr; + addip_hdr = last_asconf->subh.addip_hdr; sent_serial = ntohl(addip_hdr->serial); } else { sent_serial = asoc->addip_serial - 1; -- cgit v1.2.3-70-g09d2 From 0171a1d22bb99174671484f409f66f5b96c073b4 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 16 Nov 2022 11:52:02 +0100 Subject: net: dsa: refactor name assignment for user ports The following two patches each have a (small) chance of causing regressions for userspace and will in that case of course need to be reverted. In order to prepare for that and make those two patches independent and individually revertable, refactor the code which sets the names for user ports by moving the "fall back to eth%d if no label is given in device tree" to dsa_slave_create(). No functional change (at least none intended). Signed-off-by: Rasmus Villemoes Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- net/dsa/dsa2.c | 3 --- net/dsa/slave.c | 13 +++++++++++-- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 12145c852902..d3055b49080f 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -1365,9 +1365,6 @@ static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index) static int dsa_port_parse_user(struct dsa_port *dp, const char *name) { - if (!name) - name = "eth%d"; - dp->type = DSA_PORT_TYPE_USER; dp->name = name; diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 4176482cd03f..2ae75d4d8389 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -2366,16 +2366,25 @@ int dsa_slave_create(struct dsa_port *port) { struct net_device *master = dsa_port_to_master(port); struct dsa_switch *ds = port->ds; - const char *name = port->name; struct net_device *slave_dev; struct dsa_slave_priv *p; + const char *name; + int assign_type; int ret; if (!ds->num_tx_queues) ds->num_tx_queues = 1; + if (port->name) { + name = port->name; + assign_type = NET_NAME_UNKNOWN; + } else { + name = "eth%d"; + assign_type = NET_NAME_UNKNOWN; + } + slave_dev = alloc_netdev_mqs(sizeof(struct dsa_slave_priv), name, - NET_NAME_UNKNOWN, ether_setup, + assign_type, ether_setup, ds->num_tx_queues, 1); if (slave_dev == NULL) return -ENOMEM; -- cgit v1.2.3-70-g09d2 From 6fdb038420407d9781b2ba2429f836434a25e91a Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 16 Nov 2022 11:52:03 +0100 Subject: net: dsa: use NET_NAME_PREDICTABLE for user ports with name given in DT When a user port has a label in device tree, the corresponding netdevice is, to quote include/uapi/linux/netdevice.h, "predictably named by the kernel". This is also explicitly one of the intended use cases for NET_NAME_PREDICTABLE, quoting 685343fc3ba6 ("net: add name_assign_type netdev attribute"): NET_NAME_PREDICTABLE: The ifname has been assigned by the kernel in a predictable way [...] Examples include [...] and names deduced from hardware properties (including being given explicitly by the firmware). Expose that information properly for the benefit of userspace tools that make decisions based on the name_assign_type attribute, e.g. a systemd-udev rule with "kernel" in NamePolicy. Signed-off-by: Rasmus Villemoes Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- net/dsa/slave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 2ae75d4d8389..5405b730eb92 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -2377,7 +2377,7 @@ int dsa_slave_create(struct dsa_port *port) if (port->name) { name = port->name; - assign_type = NET_NAME_UNKNOWN; + assign_type = NET_NAME_PREDICTABLE; } else { name = "eth%d"; assign_type = NET_NAME_UNKNOWN; -- cgit v1.2.3-70-g09d2 From b8790661d90d7bbfb745df40262d13a0e8ae6400 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 16 Nov 2022 11:52:04 +0100 Subject: net: dsa: set name_assign_type to NET_NAME_ENUM for enumerated user ports When a user port does not have a label in device tree, and we thus fall back to the eth%d scheme, the proper constant to use is NET_NAME_ENUM. See also commit e9f656b7a214 ("net: ethernet: set default assignment identifier to NET_NAME_ENUM"), which in turn quoted commit 685343fc3ba6 ("net: add name_assign_type netdev attribute"): ... when the kernel has given the interface a name using global device enumeration based on order of discovery (ethX, wlanY, etc) ... are labelled NET_NAME_ENUM. Signed-off-by: Rasmus Villemoes Reviewed-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- net/dsa/slave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 5405b730eb92..24d8ad36fc8b 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -2380,7 +2380,7 @@ int dsa_slave_create(struct dsa_port *port) assign_type = NET_NAME_PREDICTABLE; } else { name = "eth%d"; - assign_type = NET_NAME_UNKNOWN; + assign_type = NET_NAME_ENUM; } slave_dev = alloc_netdev_mqs(sizeof(struct dsa_slave_priv), name, -- cgit v1.2.3-70-g09d2 From 9999f85ba34651726018e0f50d4afdf6c8cc8096 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 15 Nov 2022 03:18:42 +0200 Subject: net: dsa: stop exposing tag proto module helpers to the world The DSA tagging protocol driver macros are in the public include/net/dsa.h probably because that's also where the DSA_TAG_PROTO_*_VALUE macros are (MODULE_ALIAS_DSA_TAG_DRIVER hinges on those macro definitions). But there is no reason to expose these helpers to . That header is shared between switch drivers (drivers/net/dsa/), tagging protocol drivers (net/dsa/tag_*.c), the DSA core (net/dsa/ sans tag_*.c), and the rest of the world (DSA master drivers, network stack, etc). Too much exposure. On the other hand, net/dsa/dsa_priv.h is included only by the DSA core and by DSA tagging protocol drivers (or IOW, "friend" modules). Also a bit too much exposure - I've contemplated creating a new header which is only included by tagging protocol drivers, but completely separating a new dsa_tag_proto.h from dsa_priv.h is not immediately trivial - for example dsa_slave_to_port() is used both from the fast path and from the control path. So for now, move these definitions to dsa_priv.h which at least hides them from the world. Signed-off-by: Vladimir Oltean Tested-by: Michael Walle Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 70 ------------------------------------------------------ net/dsa/dsa_priv.h | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 70 deletions(-) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index dde364688739..82da44561f4c 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -118,10 +118,6 @@ struct dsa_netdevice_ops { int cmd); }; -#define DSA_TAG_DRIVER_ALIAS "dsa_tag-" -#define MODULE_ALIAS_DSA_TAG_DRIVER(__proto) \ - MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __stringify(__proto##_VALUE)) - struct dsa_lag { struct net_device *dev; unsigned int id; @@ -1400,70 +1396,4 @@ static inline bool dsa_slave_dev_check(const struct net_device *dev) netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev); void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up); -struct dsa_tag_driver { - const struct dsa_device_ops *ops; - struct list_head list; - struct module *owner; -}; - -void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[], - unsigned int count, - struct module *owner); -void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[], - unsigned int count); - -#define dsa_tag_driver_module_drivers(__dsa_tag_drivers_array, __count) \ -static int __init dsa_tag_driver_module_init(void) \ -{ \ - dsa_tag_drivers_register(__dsa_tag_drivers_array, __count, \ - THIS_MODULE); \ - return 0; \ -} \ -module_init(dsa_tag_driver_module_init); \ - \ -static void __exit dsa_tag_driver_module_exit(void) \ -{ \ - dsa_tag_drivers_unregister(__dsa_tag_drivers_array, __count); \ -} \ -module_exit(dsa_tag_driver_module_exit) - -/** - * module_dsa_tag_drivers() - Helper macro for registering DSA tag - * drivers - * @__ops_array: Array of tag driver structures - * - * Helper macro for DSA tag drivers which do not do anything special - * in module init/exit. Each module may only use this macro once, and - * calling it replaces module_init() and module_exit(). - */ -#define module_dsa_tag_drivers(__ops_array) \ -dsa_tag_driver_module_drivers(__ops_array, ARRAY_SIZE(__ops_array)) - -#define DSA_TAG_DRIVER_NAME(__ops) dsa_tag_driver ## _ ## __ops - -/* Create a static structure we can build a linked list of dsa_tag - * drivers - */ -#define DSA_TAG_DRIVER(__ops) \ -static struct dsa_tag_driver DSA_TAG_DRIVER_NAME(__ops) = { \ - .ops = &__ops, \ -} - -/** - * module_dsa_tag_driver() - Helper macro for registering a single DSA tag - * driver - * @__ops: Single tag driver structures - * - * Helper macro for DSA tag drivers which do not do anything special - * in module init/exit. Each module may only use this macro once, and - * calling it replaces module_init() and module_exit(). - */ -#define module_dsa_tag_driver(__ops) \ -DSA_TAG_DRIVER(__ops); \ - \ -static struct dsa_tag_driver *dsa_tag_driver_array[] = { \ - &DSA_TAG_DRIVER_NAME(__ops) \ -}; \ -module_dsa_tag_drivers(dsa_tag_driver_array) #endif - diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 71e9707d11d4..23d2dfdbc1ab 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -17,6 +17,76 @@ #define DSA_MAX_NUM_OFFLOADING_BRIDGES BITS_PER_LONG +#define DSA_TAG_DRIVER_ALIAS "dsa_tag-" +#define MODULE_ALIAS_DSA_TAG_DRIVER(__proto) \ + MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __stringify(__proto##_VALUE)) + +struct dsa_tag_driver { + const struct dsa_device_ops *ops; + struct list_head list; + struct module *owner; +}; + +void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[], + unsigned int count, + struct module *owner); +void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[], + unsigned int count); + +#define dsa_tag_driver_module_drivers(__dsa_tag_drivers_array, __count) \ +static int __init dsa_tag_driver_module_init(void) \ +{ \ + dsa_tag_drivers_register(__dsa_tag_drivers_array, __count, \ + THIS_MODULE); \ + return 0; \ +} \ +module_init(dsa_tag_driver_module_init); \ + \ +static void __exit dsa_tag_driver_module_exit(void) \ +{ \ + dsa_tag_drivers_unregister(__dsa_tag_drivers_array, __count); \ +} \ +module_exit(dsa_tag_driver_module_exit) + +/** + * module_dsa_tag_drivers() - Helper macro for registering DSA tag + * drivers + * @__ops_array: Array of tag driver structures + * + * Helper macro for DSA tag drivers which do not do anything special + * in module init/exit. Each module may only use this macro once, and + * calling it replaces module_init() and module_exit(). + */ +#define module_dsa_tag_drivers(__ops_array) \ +dsa_tag_driver_module_drivers(__ops_array, ARRAY_SIZE(__ops_array)) + +#define DSA_TAG_DRIVER_NAME(__ops) dsa_tag_driver ## _ ## __ops + +/* Create a static structure we can build a linked list of dsa_tag + * drivers + */ +#define DSA_TAG_DRIVER(__ops) \ +static struct dsa_tag_driver DSA_TAG_DRIVER_NAME(__ops) = { \ + .ops = &__ops, \ +} + +/** + * module_dsa_tag_driver() - Helper macro for registering a single DSA tag + * driver + * @__ops: Single tag driver structures + * + * Helper macro for DSA tag drivers which do not do anything special + * in module init/exit. Each module may only use this macro once, and + * calling it replaces module_init() and module_exit(). + */ +#define module_dsa_tag_driver(__ops) \ +DSA_TAG_DRIVER(__ops); \ + \ +static struct dsa_tag_driver *dsa_tag_driver_array[] = { \ + &DSA_TAG_DRIVER_NAME(__ops) \ +}; \ +module_dsa_tag_drivers(dsa_tag_driver_array) + enum { DSA_NOTIFIER_AGEING_TIME, DSA_NOTIFIER_BRIDGE_JOIN, -- cgit v1.2.3-70-g09d2 From 2610937d7e95d010e7301277408457cd385a9288 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 15 Nov 2022 03:18:43 +0200 Subject: net: dsa: rename tagging protocol driver modalias It's autumn cleanup time, and today's target are modaliases. Michael says that for users of modinfo, "dsa_tag-20" is not the most suggestive name, and recommends a change to "dsa_tag-id-20". Andrew points out that other modaliases have a prefix delimited by colons, so he recommends "dsa_tag:20" instead of "dsa_tag-20". To satisfy both proposals, Florian recommends "dsa_tag:id-20". The modaliases are not stable ABI, and the essential information (protocol ID) is still conveyed in the new string, which request_module() must be adapted to form. Link: 20221027210830.3577793-1-vladimir.oltean@nxp.com Suggested-by: Andrew Lunn Suggested-by: Michael Walle Suggested-by: Florian Fainelli Signed-off-by: Vladimir Oltean Tested-by: Michael Walle Signed-off-by: Jakub Kicinski --- net/dsa/dsa.c | 2 +- net/dsa/dsa_priv.h | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 6caf2ec648fd..cc6a497a0e50 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -108,7 +108,7 @@ const struct dsa_device_ops *dsa_tag_driver_get(int tag_protocol) const struct dsa_device_ops *ops; bool found = false; - request_module("%s%d", DSA_TAG_DRIVER_ALIAS, tag_protocol); + request_module("%sid-%d", DSA_TAG_DRIVER_ALIAS, tag_protocol); mutex_lock(&dsa_tag_drivers_lock); list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) { diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 23d2dfdbc1ab..383d224c8143 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -17,9 +17,10 @@ #define DSA_MAX_NUM_OFFLOADING_BRIDGES BITS_PER_LONG -#define DSA_TAG_DRIVER_ALIAS "dsa_tag-" -#define MODULE_ALIAS_DSA_TAG_DRIVER(__proto) \ - MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __stringify(__proto##_VALUE)) +#define DSA_TAG_DRIVER_ALIAS "dsa_tag:" +#define MODULE_ALIAS_DSA_TAG_DRIVER(__proto) \ + MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS "id-" \ + __stringify(__proto##_VALUE)) struct dsa_tag_driver { const struct dsa_device_ops *ops; -- cgit v1.2.3-70-g09d2 From 94793a56b3df0ff2b8c5680f926c19effd8b9ccc Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 15 Nov 2022 03:18:44 +0200 Subject: net: dsa: provide a second modalias to tag proto drivers based on their name Currently, tagging protocol drivers have a modalias of "dsa_tag:id-", where the number is one of DSA_TAG_PROTO_*_VALUE. This modalias makes it possible for the request_module() call in dsa_tag_driver_get() to work, given the input it has - an integer returned by ds->ops->get_tag_protocol(). It is also possible to change tagging protocols at (pseudo-)runtime, via sysfs or via device tree, and this works via the name string of the tagging protocol rather than via its id (DSA_TAG_PROTO_*_VALUE). In the latter case, there is no request_module() call, because there is no association that the DSA core has between the string name and the ID, to construct the modalias. The module is simply assumed to have been inserted. This is actually slightly problematic when the tagging protocol change should take place at probe time, since it's expected that the dependency module should get autoloaded. For this purpose, let's introduce a second modalias, so that the DSA core can call request_module() by name. There is no reason to make the modalias by name optional, so just modify the MODULE_ALIAS_DSA_TAG_DRIVER() macro to take both the ID and the name as arguments, and generate two modaliases behind the scenes. Suggested-by: Michael Walle Signed-off-by: Vladimir Oltean Tested-by: Michael Walle # on kontron-sl28 w/ ocelot_8021q Tested-by: Michael Walle Signed-off-by: Jakub Kicinski --- net/dsa/dsa_priv.h | 6 +++++- net/dsa/tag_ar9331.c | 6 ++++-- net/dsa/tag_brcm.c | 16 ++++++++++------ net/dsa/tag_dsa.c | 11 +++++++---- net/dsa/tag_gswip.c | 6 ++++-- net/dsa/tag_hellcreek.c | 6 ++++-- net/dsa/tag_ksz.c | 21 +++++++++++++-------- net/dsa/tag_lan9303.c | 6 ++++-- net/dsa/tag_mtk.c | 6 ++++-- net/dsa/tag_ocelot.c | 11 +++++++---- net/dsa/tag_ocelot_8021q.c | 6 ++++-- net/dsa/tag_qca.c | 6 ++++-- net/dsa/tag_rtl4_a.c | 6 ++++-- net/dsa/tag_rtl8_4.c | 7 +++++-- net/dsa/tag_rzn1_a5psw.c | 6 ++++-- net/dsa/tag_sja1105.c | 11 +++++++---- net/dsa/tag_trailer.c | 6 ++++-- net/dsa/tag_xrs700x.c | 6 ++++-- 18 files changed, 98 insertions(+), 51 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 383d224c8143..020386ff0db9 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -17,8 +17,12 @@ #define DSA_MAX_NUM_OFFLOADING_BRIDGES BITS_PER_LONG +/* Create 2 modaliases per tagging protocol, one to auto-load the module + * given the ID reported by get_tag_protocol(), and the other by name. + */ #define DSA_TAG_DRIVER_ALIAS "dsa_tag:" -#define MODULE_ALIAS_DSA_TAG_DRIVER(__proto) \ +#define MODULE_ALIAS_DSA_TAG_DRIVER(__proto, __name) \ + MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __name); \ MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS "id-" \ __stringify(__proto##_VALUE)) diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c index 8a02ac44282f..bfa161a4f502 100644 --- a/net/dsa/tag_ar9331.c +++ b/net/dsa/tag_ar9331.c @@ -9,6 +9,8 @@ #include "dsa_priv.h" +#define AR9331_NAME "ar9331" + #define AR9331_HDR_LEN 2 #define AR9331_HDR_VERSION 1 @@ -80,7 +82,7 @@ static struct sk_buff *ar9331_tag_rcv(struct sk_buff *skb, } static const struct dsa_device_ops ar9331_netdev_ops = { - .name = "ar9331", + .name = AR9331_NAME, .proto = DSA_TAG_PROTO_AR9331, .xmit = ar9331_tag_xmit, .rcv = ar9331_tag_rcv, @@ -88,5 +90,5 @@ static const struct dsa_device_ops ar9331_netdev_ops = { }; MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_AR9331); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_AR9331, AR9331_NAME); module_dsa_tag_driver(ar9331_netdev_ops); diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 16889ea3e0a7..9e7477ed70f1 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -12,6 +12,10 @@ #include "dsa_priv.h" +#define BRCM_NAME "brcm" +#define BRCM_LEGACY_NAME "brcm-legacy" +#define BRCM_PREPEND_NAME "brcm-prepend" + /* Legacy Broadcom tag (6 bytes) */ #define BRCM_LEG_TAG_LEN 6 @@ -196,7 +200,7 @@ static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev) } static const struct dsa_device_ops brcm_netdev_ops = { - .name = "brcm", + .name = BRCM_NAME, .proto = DSA_TAG_PROTO_BRCM, .xmit = brcm_tag_xmit, .rcv = brcm_tag_rcv, @@ -204,7 +208,7 @@ static const struct dsa_device_ops brcm_netdev_ops = { }; DSA_TAG_DRIVER(brcm_netdev_ops); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM, BRCM_NAME); #endif #if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY) @@ -273,7 +277,7 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb, } static const struct dsa_device_ops brcm_legacy_netdev_ops = { - .name = "brcm-legacy", + .name = BRCM_LEGACY_NAME, .proto = DSA_TAG_PROTO_BRCM_LEGACY, .xmit = brcm_leg_tag_xmit, .rcv = brcm_leg_tag_rcv, @@ -281,7 +285,7 @@ static const struct dsa_device_ops brcm_legacy_netdev_ops = { }; DSA_TAG_DRIVER(brcm_legacy_netdev_ops); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_LEGACY); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_LEGACY, BRCM_LEGACY_NAME); #endif /* CONFIG_NET_DSA_TAG_BRCM_LEGACY */ #if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_PREPEND) @@ -300,7 +304,7 @@ static struct sk_buff *brcm_tag_rcv_prepend(struct sk_buff *skb, } static const struct dsa_device_ops brcm_prepend_netdev_ops = { - .name = "brcm-prepend", + .name = BRCM_PREPEND_NAME, .proto = DSA_TAG_PROTO_BRCM_PREPEND, .xmit = brcm_tag_xmit_prepend, .rcv = brcm_tag_rcv_prepend, @@ -308,7 +312,7 @@ static const struct dsa_device_ops brcm_prepend_netdev_ops = { }; DSA_TAG_DRIVER(brcm_prepend_netdev_ops); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_PREPEND); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_PREPEND, BRCM_PREPEND_NAME); #endif static struct dsa_tag_driver *dsa_tag_driver_array[] = { diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index e4b6e3f2a3db..9fe77f5cc759 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -52,6 +52,9 @@ #include "dsa_priv.h" +#define DSA_NAME "dsa" +#define EDSA_NAME "edsa" + #define DSA_HLEN 4 /** @@ -339,7 +342,7 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev) } static const struct dsa_device_ops dsa_netdev_ops = { - .name = "dsa", + .name = DSA_NAME, .proto = DSA_TAG_PROTO_DSA, .xmit = dsa_xmit, .rcv = dsa_rcv, @@ -347,7 +350,7 @@ static const struct dsa_device_ops dsa_netdev_ops = { }; DSA_TAG_DRIVER(dsa_netdev_ops); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_DSA); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_DSA, DSA_NAME); #endif /* CONFIG_NET_DSA_TAG_DSA */ #if IS_ENABLED(CONFIG_NET_DSA_TAG_EDSA) @@ -381,7 +384,7 @@ static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev) } static const struct dsa_device_ops edsa_netdev_ops = { - .name = "edsa", + .name = EDSA_NAME, .proto = DSA_TAG_PROTO_EDSA, .xmit = edsa_xmit, .rcv = edsa_rcv, @@ -389,7 +392,7 @@ static const struct dsa_device_ops edsa_netdev_ops = { }; DSA_TAG_DRIVER(edsa_netdev_ops); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_EDSA); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_EDSA, EDSA_NAME); #endif /* CONFIG_NET_DSA_TAG_EDSA */ static struct dsa_tag_driver *dsa_tag_drivers[] = { diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c index df7140984da3..020050dff3e4 100644 --- a/net/dsa/tag_gswip.c +++ b/net/dsa/tag_gswip.c @@ -12,6 +12,8 @@ #include "dsa_priv.h" +#define GSWIP_NAME "gswip" + #define GSWIP_TX_HEADER_LEN 4 /* special tag in TX path header */ @@ -98,7 +100,7 @@ static struct sk_buff *gswip_tag_rcv(struct sk_buff *skb, } static const struct dsa_device_ops gswip_netdev_ops = { - .name = "gswip", + .name = GSWIP_NAME, .proto = DSA_TAG_PROTO_GSWIP, .xmit = gswip_tag_xmit, .rcv = gswip_tag_rcv, @@ -106,6 +108,6 @@ static const struct dsa_device_ops gswip_netdev_ops = { }; MODULE_LICENSE("GPL"); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_GSWIP); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_GSWIP, GSWIP_NAME); module_dsa_tag_driver(gswip_netdev_ops); diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c index 846588c0070a..03fd5f2877c8 100644 --- a/net/dsa/tag_hellcreek.c +++ b/net/dsa/tag_hellcreek.c @@ -13,6 +13,8 @@ #include "dsa_priv.h" +#define HELLCREEK_NAME "hellcreek" + #define HELLCREEK_TAG_LEN 1 static struct sk_buff *hellcreek_xmit(struct sk_buff *skb, @@ -57,7 +59,7 @@ static struct sk_buff *hellcreek_rcv(struct sk_buff *skb, } static const struct dsa_device_ops hellcreek_netdev_ops = { - .name = "hellcreek", + .name = HELLCREEK_NAME, .proto = DSA_TAG_PROTO_HELLCREEK, .xmit = hellcreek_xmit, .rcv = hellcreek_rcv, @@ -65,6 +67,6 @@ static const struct dsa_device_ops hellcreek_netdev_ops = { }; MODULE_LICENSE("Dual MIT/GPL"); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_HELLCREEK); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_HELLCREEK, HELLCREEK_NAME); module_dsa_tag_driver(hellcreek_netdev_ops); diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 38fa19c1e2d5..37db5156f9a3 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -9,6 +9,11 @@ #include #include "dsa_priv.h" +#define KSZ8795_NAME "ksz8795" +#define KSZ9477_NAME "ksz9477" +#define KSZ9893_NAME "ksz9893" +#define LAN937X_NAME "lan937x" + /* Typically only one byte is used for tail tag. */ #define KSZ_EGRESS_TAG_LEN 1 #define KSZ_INGRESS_TAG_LEN 1 @@ -74,7 +79,7 @@ static struct sk_buff *ksz8795_rcv(struct sk_buff *skb, struct net_device *dev) } static const struct dsa_device_ops ksz8795_netdev_ops = { - .name = "ksz8795", + .name = KSZ8795_NAME, .proto = DSA_TAG_PROTO_KSZ8795, .xmit = ksz8795_xmit, .rcv = ksz8795_rcv, @@ -82,7 +87,7 @@ static const struct dsa_device_ops ksz8795_netdev_ops = { }; DSA_TAG_DRIVER(ksz8795_netdev_ops); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ8795); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ8795, KSZ8795_NAME); /* * For Ingress (Host -> KSZ9477), 2 bytes are added before FCS. @@ -147,7 +152,7 @@ static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev) } static const struct dsa_device_ops ksz9477_netdev_ops = { - .name = "ksz9477", + .name = KSZ9477_NAME, .proto = DSA_TAG_PROTO_KSZ9477, .xmit = ksz9477_xmit, .rcv = ksz9477_rcv, @@ -155,7 +160,7 @@ static const struct dsa_device_ops ksz9477_netdev_ops = { }; DSA_TAG_DRIVER(ksz9477_netdev_ops); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9477); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9477, KSZ9477_NAME); #define KSZ9893_TAIL_TAG_OVERRIDE BIT(5) #define KSZ9893_TAIL_TAG_LOOKUP BIT(6) @@ -183,7 +188,7 @@ static struct sk_buff *ksz9893_xmit(struct sk_buff *skb, } static const struct dsa_device_ops ksz9893_netdev_ops = { - .name = "ksz9893", + .name = KSZ9893_NAME, .proto = DSA_TAG_PROTO_KSZ9893, .xmit = ksz9893_xmit, .rcv = ksz9477_rcv, @@ -191,7 +196,7 @@ static const struct dsa_device_ops ksz9893_netdev_ops = { }; DSA_TAG_DRIVER(ksz9893_netdev_ops); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9893); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9893, KSZ9893_NAME); /* For xmit, 2 bytes are added before FCS. * --------------------------------------------------------------------------- @@ -241,7 +246,7 @@ static struct sk_buff *lan937x_xmit(struct sk_buff *skb, } static const struct dsa_device_ops lan937x_netdev_ops = { - .name = "lan937x", + .name = LAN937X_NAME, .proto = DSA_TAG_PROTO_LAN937X, .xmit = lan937x_xmit, .rcv = ksz9477_rcv, @@ -249,7 +254,7 @@ static const struct dsa_device_ops lan937x_netdev_ops = { }; DSA_TAG_DRIVER(lan937x_netdev_ops); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_LAN937X); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_LAN937X, LAN937X_NAME); static struct dsa_tag_driver *dsa_tag_driver_array[] = { &DSA_TAG_DRIVER_NAME(ksz8795_netdev_ops), diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index 98d7d7120bab..4118292ed218 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -30,6 +30,8 @@ * Required when no forwarding between the external ports should happen. */ +#define LAN9303_NAME "lan9303" + #define LAN9303_TAG_LEN 4 # define LAN9303_TAG_TX_USE_ALR BIT(3) # define LAN9303_TAG_TX_STP_OVERRIDE BIT(4) @@ -110,7 +112,7 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev) } static const struct dsa_device_ops lan9303_netdev_ops = { - .name = "lan9303", + .name = LAN9303_NAME, .proto = DSA_TAG_PROTO_LAN9303, .xmit = lan9303_xmit, .rcv = lan9303_rcv, @@ -118,6 +120,6 @@ static const struct dsa_device_ops lan9303_netdev_ops = { }; MODULE_LICENSE("GPL"); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_LAN9303); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_LAN9303, LAN9303_NAME); module_dsa_tag_driver(lan9303_netdev_ops); diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index 415d8ece242a..ba37495ab5f4 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -10,6 +10,8 @@ #include "dsa_priv.h" +#define MTK_NAME "mtk" + #define MTK_HDR_LEN 4 #define MTK_HDR_XMIT_UNTAGGED 0 #define MTK_HDR_XMIT_TAGGED_TPID_8100 1 @@ -91,7 +93,7 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev) } static const struct dsa_device_ops mtk_netdev_ops = { - .name = "mtk", + .name = MTK_NAME, .proto = DSA_TAG_PROTO_MTK, .xmit = mtk_tag_xmit, .rcv = mtk_tag_rcv, @@ -99,6 +101,6 @@ static const struct dsa_device_ops mtk_netdev_ops = { }; MODULE_LICENSE("GPL"); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_MTK); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_MTK, MTK_NAME); module_dsa_tag_driver(mtk_netdev_ops); diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index 0d81f172b7a6..8cc31ab47e28 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -4,6 +4,9 @@ #include #include "dsa_priv.h" +#define OCELOT_NAME "ocelot" +#define SEVILLE_NAME "seville" + /* If the port is under a VLAN-aware bridge, remove the VLAN header from the * payload and move it into the DSA tag, which will make the switch classify * the packet to the bridge VLAN. Otherwise, leave the classified VLAN at zero, @@ -183,7 +186,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, } static const struct dsa_device_ops ocelot_netdev_ops = { - .name = "ocelot", + .name = OCELOT_NAME, .proto = DSA_TAG_PROTO_OCELOT, .xmit = ocelot_xmit, .rcv = ocelot_rcv, @@ -192,10 +195,10 @@ static const struct dsa_device_ops ocelot_netdev_ops = { }; DSA_TAG_DRIVER(ocelot_netdev_ops); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT, OCELOT_NAME); static const struct dsa_device_ops seville_netdev_ops = { - .name = "seville", + .name = SEVILLE_NAME, .proto = DSA_TAG_PROTO_SEVILLE, .xmit = seville_xmit, .rcv = ocelot_rcv, @@ -204,7 +207,7 @@ static const struct dsa_device_ops seville_netdev_ops = { }; DSA_TAG_DRIVER(seville_netdev_ops); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SEVILLE); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SEVILLE, SEVILLE_NAME); static struct dsa_tag_driver *ocelot_tag_driver_array[] = { &DSA_TAG_DRIVER_NAME(ocelot_netdev_ops), diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 37ccf00404ea..d1ec68001487 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -12,6 +12,8 @@ #include #include "dsa_priv.h" +#define OCELOT_8021Q_NAME "ocelot-8021q" + struct ocelot_8021q_tagger_private { struct ocelot_8021q_tagger_data data; /* Must be first */ struct kthread_worker *xmit_worker; @@ -119,7 +121,7 @@ static int ocelot_connect(struct dsa_switch *ds) } static const struct dsa_device_ops ocelot_8021q_netdev_ops = { - .name = "ocelot-8021q", + .name = OCELOT_8021Q_NAME, .proto = DSA_TAG_PROTO_OCELOT_8021Q, .xmit = ocelot_xmit, .rcv = ocelot_rcv, @@ -130,6 +132,6 @@ static const struct dsa_device_ops ocelot_8021q_netdev_ops = { }; MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT_8021Q); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT_8021Q, OCELOT_8021Q_NAME); module_dsa_tag_driver(ocelot_8021q_netdev_ops); diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 57d2e00f1e5d..73d6e111228d 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -10,6 +10,8 @@ #include "dsa_priv.h" +#define QCA_NAME "qca" + static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); @@ -107,7 +109,7 @@ static void qca_tag_disconnect(struct dsa_switch *ds) } static const struct dsa_device_ops qca_netdev_ops = { - .name = "qca", + .name = QCA_NAME, .proto = DSA_TAG_PROTO_QCA, .connect = qca_tag_connect, .disconnect = qca_tag_disconnect, @@ -118,6 +120,6 @@ static const struct dsa_device_ops qca_netdev_ops = { }; MODULE_LICENSE("GPL"); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_QCA); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_QCA, QCA_NAME); module_dsa_tag_driver(qca_netdev_ops); diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c index 6d928ee3ef7a..18b52d77d200 100644 --- a/net/dsa/tag_rtl4_a.c +++ b/net/dsa/tag_rtl4_a.c @@ -20,6 +20,8 @@ #include "dsa_priv.h" +#define RTL4_A_NAME "rtl4a" + #define RTL4_A_HDR_LEN 4 #define RTL4_A_ETHERTYPE 0x8899 #define RTL4_A_PROTOCOL_SHIFT 12 @@ -112,7 +114,7 @@ static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb, } static const struct dsa_device_ops rtl4a_netdev_ops = { - .name = "rtl4a", + .name = RTL4_A_NAME, .proto = DSA_TAG_PROTO_RTL4_A, .xmit = rtl4a_tag_xmit, .rcv = rtl4a_tag_rcv, @@ -121,4 +123,4 @@ static const struct dsa_device_ops rtl4a_netdev_ops = { module_dsa_tag_driver(rtl4a_netdev_ops); MODULE_LICENSE("GPL"); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL4_A); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL4_A, RTL4_A_NAME); diff --git a/net/dsa/tag_rtl8_4.c b/net/dsa/tag_rtl8_4.c index a593ead7ff26..030a8cf0ad48 100644 --- a/net/dsa/tag_rtl8_4.c +++ b/net/dsa/tag_rtl8_4.c @@ -84,6 +84,9 @@ * 0x04 = RTL8365MB DSA protocol */ +#define RTL8_4_NAME "rtl8_4" +#define RTL8_4T_NAME "rtl8_4t" + #define RTL8_4_TAG_LEN 8 #define RTL8_4_PROTOCOL GENMASK(15, 8) @@ -234,7 +237,7 @@ static const struct dsa_device_ops rtl8_4_netdev_ops = { DSA_TAG_DRIVER(rtl8_4_netdev_ops); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL8_4); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL8_4, RTL8_4_NAME); /* Tail version */ static const struct dsa_device_ops rtl8_4t_netdev_ops = { @@ -247,7 +250,7 @@ static const struct dsa_device_ops rtl8_4t_netdev_ops = { DSA_TAG_DRIVER(rtl8_4t_netdev_ops); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL8_4T); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL8_4T, RTL8_4T_NAME); static struct dsa_tag_driver *dsa_tag_drivers[] = { &DSA_TAG_DRIVER_NAME(rtl8_4_netdev_ops), diff --git a/net/dsa/tag_rzn1_a5psw.c b/net/dsa/tag_rzn1_a5psw.c index e2a5ee6ae688..b9135069f9fc 100644 --- a/net/dsa/tag_rzn1_a5psw.c +++ b/net/dsa/tag_rzn1_a5psw.c @@ -22,6 +22,8 @@ * See struct a5psw_tag for layout */ +#define A5PSW_NAME "a5psw" + #define ETH_P_DSA_A5PSW 0xE001 #define A5PSW_TAG_LEN 8 #define A5PSW_CTRL_DATA_FORCE_FORWARD BIT(0) @@ -101,7 +103,7 @@ static struct sk_buff *a5psw_tag_rcv(struct sk_buff *skb, } static const struct dsa_device_ops a5psw_netdev_ops = { - .name = "a5psw", + .name = A5PSW_NAME, .proto = DSA_TAG_PROTO_RZN1_A5PSW, .xmit = a5psw_tag_xmit, .rcv = a5psw_tag_rcv, @@ -109,5 +111,5 @@ static const struct dsa_device_ops a5psw_netdev_ops = { }; MODULE_LICENSE("GPL v2"); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_A5PSW); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_A5PSW, A5PSW_NAME); module_dsa_tag_driver(a5psw_netdev_ops); diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 83e4136516b0..3b6e642a90e9 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -7,6 +7,9 @@ #include #include "dsa_priv.h" +#define SJA1105_NAME "sja1105" +#define SJA1110_NAME "sja1110" + /* Is this a TX or an RX header? */ #define SJA1110_HEADER_HOST_TO_SWITCH BIT(15) @@ -786,7 +789,7 @@ static int sja1105_connect(struct dsa_switch *ds) } static const struct dsa_device_ops sja1105_netdev_ops = { - .name = "sja1105", + .name = SJA1105_NAME, .proto = DSA_TAG_PROTO_SJA1105, .xmit = sja1105_xmit, .rcv = sja1105_rcv, @@ -798,10 +801,10 @@ static const struct dsa_device_ops sja1105_netdev_ops = { }; DSA_TAG_DRIVER(sja1105_netdev_ops); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SJA1105); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SJA1105, SJA1105_NAME); static const struct dsa_device_ops sja1110_netdev_ops = { - .name = "sja1110", + .name = SJA1110_NAME, .proto = DSA_TAG_PROTO_SJA1110, .xmit = sja1110_xmit, .rcv = sja1110_rcv, @@ -813,7 +816,7 @@ static const struct dsa_device_ops sja1110_netdev_ops = { }; DSA_TAG_DRIVER(sja1110_netdev_ops); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SJA1110); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SJA1110, SJA1110_NAME); static struct dsa_tag_driver *sja1105_tag_driver_array[] = { &DSA_TAG_DRIVER_NAME(sja1105_netdev_ops), diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 5749ba85c2b8..8754dfe680f6 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -10,6 +10,8 @@ #include "dsa_priv.h" +#define TRAILER_NAME "trailer" + static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); @@ -50,7 +52,7 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev) } static const struct dsa_device_ops trailer_netdev_ops = { - .name = "trailer", + .name = TRAILER_NAME, .proto = DSA_TAG_PROTO_TRAILER, .xmit = trailer_xmit, .rcv = trailer_rcv, @@ -58,6 +60,6 @@ static const struct dsa_device_ops trailer_netdev_ops = { }; MODULE_LICENSE("GPL"); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_TRAILER); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_TRAILER, TRAILER_NAME); module_dsa_tag_driver(trailer_netdev_ops); diff --git a/net/dsa/tag_xrs700x.c b/net/dsa/tag_xrs700x.c index ff442b8af636..dc935dd90f98 100644 --- a/net/dsa/tag_xrs700x.c +++ b/net/dsa/tag_xrs700x.c @@ -9,6 +9,8 @@ #include "dsa_priv.h" +#define XRS700X_NAME "xrs700x" + static struct sk_buff *xrs700x_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_port *partner, *dp = dsa_slave_to_port(dev); @@ -51,7 +53,7 @@ static struct sk_buff *xrs700x_rcv(struct sk_buff *skb, struct net_device *dev) } static const struct dsa_device_ops xrs700x_netdev_ops = { - .name = "xrs700x", + .name = XRS700X_NAME, .proto = DSA_TAG_PROTO_XRS700X, .xmit = xrs700x_xmit, .rcv = xrs700x_rcv, @@ -59,6 +61,6 @@ static const struct dsa_device_ops xrs700x_netdev_ops = { }; MODULE_LICENSE("GPL"); -MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_XRS700X); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_XRS700X, XRS700X_NAME); module_dsa_tag_driver(xrs700x_netdev_ops); -- cgit v1.2.3-70-g09d2 From e8666130b995a6a1a99c319d33fae2046213c39b Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 15 Nov 2022 03:18:45 +0200 Subject: net: dsa: strip sysfs "tagging" string of trailing newline Currently, dsa_find_tagger_by_name() uses sysfs_streq() which works both with strings that contain \n at the end (echo ocelot > .../dsa/tagging) and with strings that don't (printf ocelot > .../dsa/tagging). There will be a problem once we'll want to construct the modalias string based on which we auto-load the protocol kernel module. If the sysfs buffer ends in a newline, we need to strip it first. This is a preparatory patch specifically for that. Signed-off-by: Vladimir Oltean Tested-by: Michael Walle Signed-off-by: Jakub Kicinski --- net/dsa/dsa.c | 4 ++-- net/dsa/dsa_priv.h | 2 +- net/dsa/master.c | 13 ++++++++++++- 3 files changed, 15 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index cc6a497a0e50..f74ddda670a8 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -79,7 +79,7 @@ const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops) /* Function takes a reference on the module owning the tagger, * so dsa_tag_driver_put must be called afterwards. */ -const struct dsa_device_ops *dsa_find_tagger_by_name(const char *buf) +const struct dsa_device_ops *dsa_find_tagger_by_name(const char *name) { const struct dsa_device_ops *ops = ERR_PTR(-ENOPROTOOPT); struct dsa_tag_driver *dsa_tag_driver; @@ -88,7 +88,7 @@ const struct dsa_device_ops *dsa_find_tagger_by_name(const char *buf) list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) { const struct dsa_device_ops *tmp = dsa_tag_driver->ops; - if (!sysfs_streq(buf, tmp->name)) + if (strcmp(name, tmp->name)) continue; if (!try_module_get(dsa_tag_driver->owner)) diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 020386ff0db9..4559c0ee39d0 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -245,7 +245,7 @@ struct dsa_slave_priv { /* dsa.c */ const struct dsa_device_ops *dsa_tag_driver_get(int tag_protocol); void dsa_tag_driver_put(const struct dsa_device_ops *ops); -const struct dsa_device_ops *dsa_find_tagger_by_name(const char *buf); +const struct dsa_device_ops *dsa_find_tagger_by_name(const char *name); bool dsa_db_equal(const struct dsa_db *a, const struct dsa_db *b); diff --git a/net/dsa/master.c b/net/dsa/master.c index 421de166515f..f443bf4a3c8c 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -299,12 +299,23 @@ static ssize_t tagging_store(struct device *d, struct device_attribute *attr, const char *buf, size_t count) { const struct dsa_device_ops *new_tag_ops, *old_tag_ops; + const char *end = strchrnul(buf, '\n'), *name; struct net_device *dev = to_net_dev(d); struct dsa_port *cpu_dp = dev->dsa_ptr; + size_t len = end - buf; int err; + /* Empty string passed */ + if (!len) + return -ENOPROTOOPT; + + name = kstrndup(buf, len, GFP_KERNEL); + if (!name) + return -ENOMEM; + old_tag_ops = cpu_dp->tag_ops; - new_tag_ops = dsa_find_tagger_by_name(buf); + new_tag_ops = dsa_find_tagger_by_name(name); + kfree(name); /* Bad tagger name, or module is not loaded? */ if (IS_ERR(new_tag_ops)) return PTR_ERR(new_tag_ops); -- cgit v1.2.3-70-g09d2 From 54c087e83945eda014064f78c4807c5da1e09842 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 15 Nov 2022 03:18:46 +0200 Subject: net: dsa: rename dsa_tag_driver_get() to dsa_tag_driver_get_by_id() A future patch will introduce one more way of getting a reference on a tagging protocl driver (by name). Rename the current method to "by_id". Signed-off-by: Vladimir Oltean Tested-by: Michael Walle Signed-off-by: Jakub Kicinski --- net/dsa/dsa.c | 2 +- net/dsa/dsa2.c | 2 +- net/dsa/dsa_priv.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index f74ddda670a8..577765cfa986 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -102,7 +102,7 @@ const struct dsa_device_ops *dsa_find_tagger_by_name(const char *name) return ops; } -const struct dsa_device_ops *dsa_tag_driver_get(int tag_protocol) +const struct dsa_device_ops *dsa_tag_driver_get_by_id(int tag_protocol) { struct dsa_tag_driver *dsa_tag_driver; const struct dsa_device_ops *ops; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index d3055b49080f..4b31b0258870 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -1441,7 +1441,7 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master, } if (!tag_ops) - tag_ops = dsa_tag_driver_get(default_proto); + tag_ops = dsa_tag_driver_get_by_id(default_proto); if (IS_ERR(tag_ops)) { if (PTR_ERR(tag_ops) == -ENOPROTOOPT) diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 4559c0ee39d0..0d33a22e3087 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -243,7 +243,7 @@ struct dsa_slave_priv { }; /* dsa.c */ -const struct dsa_device_ops *dsa_tag_driver_get(int tag_protocol); +const struct dsa_device_ops *dsa_tag_driver_get_by_id(int tag_protocol); void dsa_tag_driver_put(const struct dsa_device_ops *ops); const struct dsa_device_ops *dsa_find_tagger_by_name(const char *name); -- cgit v1.2.3-70-g09d2 From 0184c07a11a243569cb90104980237c8e101f363 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 15 Nov 2022 03:18:47 +0200 Subject: net: dsa: autoload tag driver module on tagging protocol change Issue a request_module() call when an attempt to change the tagging protocol is made, either by sysfs or by device tree. In the case of ocelot (the only driver for which the default and the alternative tagging protocol are compiled as different modules), the user is now no longer required to insert tag_ocelot_8021q.ko manually. In the particular case of ocelot, this solves a problem where tag_ocelot_8021q.ko is built as module, and this is present in the device tree: &mscc_felix_port4 { dsa-tag-protocol = "ocelot-8021q"; }; &mscc_felix_port5 { dsa-tag-protocol = "ocelot-8021q"; }; Because no one attempts to load the module into the kernel at boot time, the switch driver will fail to probe (actually forever defer) until someone manually inserts tag_ocelot_8021q.ko. This is now no longer necessary and happens automatically. Rename dsa_find_tagger_by_name() to denote the change in functionality: there is now feature parity with dsa_tag_driver_get_by_id(), i.o.w. we also load the module if it's missing. Link: https://lore.kernel.org/lkml/20221027113248.420216-1-michael@walle.cc/ Suggested-by: Michael Walle Signed-off-by: Vladimir Oltean Tested-by: Michael Walle # on kontron-sl28 w/ ocelot_8021q Tested-by: Michael Walle Signed-off-by: Jakub Kicinski --- net/dsa/dsa.c | 4 +++- net/dsa/dsa2.c | 2 +- net/dsa/dsa_priv.h | 2 +- net/dsa/master.c | 4 ++-- 4 files changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 577765cfa986..4afd3edbd64d 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -79,11 +79,13 @@ const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops) /* Function takes a reference on the module owning the tagger, * so dsa_tag_driver_put must be called afterwards. */ -const struct dsa_device_ops *dsa_find_tagger_by_name(const char *name) +const struct dsa_device_ops *dsa_tag_driver_get_by_name(const char *name) { const struct dsa_device_ops *ops = ERR_PTR(-ENOPROTOOPT); struct dsa_tag_driver *dsa_tag_driver; + request_module("%s%s", DSA_TAG_DRIVER_ALIAS, name); + mutex_lock(&dsa_tag_drivers_lock); list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) { const struct dsa_device_ops *tmp = dsa_tag_driver->ops; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 4b31b0258870..f8df55e2e23a 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -1431,7 +1431,7 @@ static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master, return -EINVAL; } - tag_ops = dsa_find_tagger_by_name(user_protocol); + tag_ops = dsa_tag_driver_get_by_name(user_protocol); if (IS_ERR(tag_ops)) { dev_warn(ds->dev, "Failed to find a tagging driver for protocol %s, using default\n", diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 0d33a22e3087..24e0ea218a35 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -244,8 +244,8 @@ struct dsa_slave_priv { /* dsa.c */ const struct dsa_device_ops *dsa_tag_driver_get_by_id(int tag_protocol); +const struct dsa_device_ops *dsa_tag_driver_get_by_name(const char *name); void dsa_tag_driver_put(const struct dsa_device_ops *ops); -const struct dsa_device_ops *dsa_find_tagger_by_name(const char *name); bool dsa_db_equal(const struct dsa_db *a, const struct dsa_db *b); diff --git a/net/dsa/master.c b/net/dsa/master.c index f443bf4a3c8c..e24f02743c21 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -314,9 +314,9 @@ static ssize_t tagging_store(struct device *d, struct device_attribute *attr, return -ENOMEM; old_tag_ops = cpu_dp->tag_ops; - new_tag_ops = dsa_find_tagger_by_name(name); + new_tag_ops = dsa_tag_driver_get_by_name(name); kfree(name); - /* Bad tagger name, or module is not loaded? */ + /* Bad tagger name? */ if (IS_ERR(new_tag_ops)) return PTR_ERR(new_tag_ops); -- cgit v1.2.3-70-g09d2 From cd502236835b678738810ecd501c85a3a7a11150 Mon Sep 17 00:00:00 2001 From: Michal Wilczynski Date: Tue, 15 Nov 2022 11:48:15 +0100 Subject: devlink: Introduce new attribute 'tx_priority' to devlink-rate To fully utilize offload capabilities of Intel 100G card QoS capabilities new attribute 'tx_priority' needs to be introduced. This attribute allows for usage of strict priority arbiter among siblings. This arbitration scheme attempts to schedule nodes based on their priority as long as the nodes remain within their bandwidth limit. Introduce new attribute in devlink-rate that will allow for configuration of strict priority. New attribute is optional. Signed-off-by: Michal Wilczynski Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 6 ++++++ include/uapi/linux/devlink.h | 1 + net/core/devlink.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 611a23a3deb2..90d59d673cb1 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -114,6 +114,8 @@ struct devlink_rate { refcount_t refcnt; }; }; + + u32 tx_priority; }; struct devlink_port { @@ -1511,10 +1513,14 @@ struct devlink_ops { u64 tx_share, struct netlink_ext_ack *extack); int (*rate_leaf_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_max, struct netlink_ext_ack *extack); + int (*rate_leaf_tx_priority_set)(struct devlink_rate *devlink_rate, void *priv, + u32 tx_priority, struct netlink_ext_ack *extack); int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_share, struct netlink_ext_ack *extack); int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_max, struct netlink_ext_ack *extack); + int (*rate_node_tx_priority_set)(struct devlink_rate *devlink_rate, void *priv, + u32 tx_priority, struct netlink_ext_ack *extack); int (*rate_node_new)(struct devlink_rate *rate_node, void **priv, struct netlink_ext_ack *extack); int (*rate_node_del)(struct devlink_rate *rate_node, void *priv, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 2f24b53a87a5..1a9214d35ef5 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -607,6 +607,7 @@ enum devlink_attr { DEVLINK_ATTR_SELFTESTS, /* nested */ + DEVLINK_ATTR_RATE_TX_PRIORITY, /* u32 */ /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 7f789bbcbbd7..bf6d3a3c28bb 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1203,6 +1203,9 @@ static int devlink_nl_rate_fill(struct sk_buff *msg, devlink_rate->tx_max, DEVLINK_ATTR_PAD)) goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_PRIORITY, + devlink_rate->tx_priority)) + goto nla_put_failure; if (devlink_rate->parent) if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME, devlink_rate->parent->name)) @@ -1936,6 +1939,7 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, { struct nlattr *nla_parent, **attrs = info->attrs; int err = -EOPNOTSUPP; + u32 priority; u64 rate; if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) { @@ -1964,6 +1968,20 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, devlink_rate->tx_max = rate; } + if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]) { + priority = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]); + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_tx_priority_set(devlink_rate, devlink_rate->priv, + priority, info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_tx_priority_set(devlink_rate, devlink_rate->priv, + priority, info->extack); + + if (err) + return err; + devlink_rate->tx_priority = priority; + } + nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME]; if (nla_parent) { err = devlink_nl_rate_parent_node_set(devlink_rate, info, @@ -1995,6 +2013,12 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the leafs"); return false; } + if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_leaf_tx_priority_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TX_PRIORITY], + "TX priority set isn't supported for the leafs"); + return false; + } } else if (type == DEVLINK_RATE_TYPE_NODE) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the nodes"); @@ -2009,6 +2033,12 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the nodes"); return false; } + if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_node_tx_priority_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TX_PRIORITY], + "TX priority set isn't supported for the nodes"); + return false; + } } else { WARN(1, "Unknown type of rate object"); return false; @@ -9187,6 +9217,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32 }, [DEVLINK_ATTR_LINECARD_TYPE] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_SELFTESTS] = { .type = NLA_NESTED }, + [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32 }, }; static const struct genl_small_ops devlink_nl_ops[] = { -- cgit v1.2.3-70-g09d2 From 6e2d7e84fcfee62d70e62aa9e469d10b8b6a7dc7 Mon Sep 17 00:00:00 2001 From: Michal Wilczynski Date: Tue, 15 Nov 2022 11:48:16 +0100 Subject: devlink: Introduce new attribute 'tx_weight' to devlink-rate To fully utilize offload capabilities of Intel 100G card QoS capabilities new attribute 'tx_weight' needs to be introduced. This attribute allows for usage of Weighted Fair Queuing arbitration scheme among siblings. This arbitration scheme can be used simultaneously with the strict priority. Introduce new attribute in devlink-rate that will allow for configuration of Weighted Fair Queueing. New attribute is optional. Signed-off-by: Michal Wilczynski Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 5 +++++ include/uapi/linux/devlink.h | 2 ++ net/core/devlink.c | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 90d59d673cb1..366b23d3f973 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -116,6 +116,7 @@ struct devlink_rate { }; u32 tx_priority; + u32 tx_weight; }; struct devlink_port { @@ -1515,12 +1516,16 @@ struct devlink_ops { u64 tx_max, struct netlink_ext_ack *extack); int (*rate_leaf_tx_priority_set)(struct devlink_rate *devlink_rate, void *priv, u32 tx_priority, struct netlink_ext_ack *extack); + int (*rate_leaf_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, + u32 tx_weight, struct netlink_ext_ack *extack); int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_share, struct netlink_ext_ack *extack); int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_max, struct netlink_ext_ack *extack); int (*rate_node_tx_priority_set)(struct devlink_rate *devlink_rate, void *priv, u32 tx_priority, struct netlink_ext_ack *extack); + int (*rate_node_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, + u32 tx_weight, struct netlink_ext_ack *extack); int (*rate_node_new)(struct devlink_rate *rate_node, void **priv, struct netlink_ext_ack *extack); int (*rate_node_del)(struct devlink_rate *rate_node, void *priv, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 1a9214d35ef5..498d0d5d0957 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -608,6 +608,8 @@ enum devlink_attr { DEVLINK_ATTR_SELFTESTS, /* nested */ DEVLINK_ATTR_RATE_TX_PRIORITY, /* u32 */ + DEVLINK_ATTR_RATE_TX_WEIGHT, /* u32 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index bf6d3a3c28bb..525bdf426163 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1206,6 +1206,11 @@ static int devlink_nl_rate_fill(struct sk_buff *msg, if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_PRIORITY, devlink_rate->tx_priority)) goto nla_put_failure; + + if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_WEIGHT, + devlink_rate->tx_weight)) + goto nla_put_failure; + if (devlink_rate->parent) if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME, devlink_rate->parent->name)) @@ -1940,6 +1945,7 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, struct nlattr *nla_parent, **attrs = info->attrs; int err = -EOPNOTSUPP; u32 priority; + u32 weight; u64 rate; if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) { @@ -1982,6 +1988,20 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, devlink_rate->tx_priority = priority; } + if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]) { + weight = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]); + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_tx_weight_set(devlink_rate, devlink_rate->priv, + weight, info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_tx_weight_set(devlink_rate, devlink_rate->priv, + weight, info->extack); + + if (err) + return err; + devlink_rate->tx_weight = weight; + } + nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME]; if (nla_parent) { err = devlink_nl_rate_parent_node_set(devlink_rate, info, @@ -2019,6 +2039,12 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, "TX priority set isn't supported for the leafs"); return false; } + if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_leaf_tx_weight_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], + "TX weight set isn't supported for the leafs"); + return false; + } } else if (type == DEVLINK_RATE_TYPE_NODE) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the nodes"); @@ -2039,6 +2065,12 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, "TX priority set isn't supported for the nodes"); return false; } + if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_node_tx_weight_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], + "TX weight set isn't supported for the nodes"); + return false; + } } else { WARN(1, "Unknown type of rate object"); return false; @@ -9218,6 +9250,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_LINECARD_TYPE] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_SELFTESTS] = { .type = NLA_NESTED }, [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32 }, + [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32 }, }; static const struct genl_small_ops devlink_nl_ops[] = { -- cgit v1.2.3-70-g09d2 From caba177d7f4d7693a9157ece8c9a30944c949e34 Mon Sep 17 00:00:00 2001 From: Michal Wilczynski Date: Tue, 15 Nov 2022 11:48:17 +0100 Subject: devlink: Enable creation of the devlink-rate nodes from the driver Intel 100G card internal firmware hierarchy for Hierarchicial QoS is very rigid and can't be easily removed. This requires an ability to export default hierarchy to allow user to modify it. Currently the driver is only able to create the 'leaf' nodes, which usually represent the vport. This is not enough for HQoS implemented in Intel hardware. Introduce new function devl_rate_node_create() that allows for creation of the devlink-rate nodes from the driver. Signed-off-by: Michal Wilczynski Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 3 +++ net/core/devlink.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) (limited to 'net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 366b23d3f973..339a2ed02d36 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1618,6 +1618,9 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u32 sf, bool external); int devl_rate_leaf_create(struct devlink_port *port, void *priv); +struct devlink_rate * +devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name, + struct devlink_rate *parent); void devl_rate_leaf_destroy(struct devlink_port *devlink_port); void devl_rate_nodes_destroy(struct devlink *devlink); void devlink_port_linecard_set(struct devlink_port *devlink_port, diff --git a/net/core/devlink.c b/net/core/devlink.c index 525bdf426163..3dfee7cd9929 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -10384,6 +10384,51 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); +/** + * devl_rate_node_create - create devlink rate node + * @devlink: devlink instance + * @priv: driver private data + * @node_name: name of the resulting node + * @parent: parent devlink_rate struct + * + * Create devlink rate object of type node + */ +struct devlink_rate * +devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name, + struct devlink_rate *parent) +{ + struct devlink_rate *rate_node; + + rate_node = devlink_rate_node_get_by_name(devlink, node_name); + if (!IS_ERR(rate_node)) + return ERR_PTR(-EEXIST); + + rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL); + if (!rate_node) + return ERR_PTR(-ENOMEM); + + if (parent) { + rate_node->parent = parent; + refcount_inc(&rate_node->parent->refcnt); + } + + rate_node->type = DEVLINK_RATE_TYPE_NODE; + rate_node->devlink = devlink; + rate_node->priv = priv; + + rate_node->name = kstrdup(node_name, GFP_KERNEL); + if (!rate_node->name) { + kfree(rate_node); + return ERR_PTR(-ENOMEM); + } + + refcount_set(&rate_node->refcnt, 1); + list_add(&rate_node->list, &devlink->rate_list); + devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); + return rate_node; +} +EXPORT_SYMBOL_GPL(devl_rate_node_create); + /** * devl_rate_leaf_create - create devlink rate leaf * @devlink_port: devlink port object to create rate object on -- cgit v1.2.3-70-g09d2 From 04d674f04e32d4109f0ee505a8855a384481212d Mon Sep 17 00:00:00 2001 From: Michal Wilczynski Date: Tue, 15 Nov 2022 11:48:18 +0100 Subject: devlink: Allow for devlink-rate nodes parent reassignment Currently it's not possible to reassign the parent of the node using one command. As the previous commit introduced a way to export entire hierarchy from the driver, being able to modify and reassign parents become important. This way user might easily change QoS settings without interrupting traffic. Example command: devlink port function rate set pci/0000:4b:00.0/1 parent node_custom_1 This reassigns leaf node parent to node_custom_1. Signed-off-by: Michal Wilczynski Reviewed-by: Przemek Kitszel Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 3dfee7cd9929..61d431578f5f 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1887,10 +1887,8 @@ devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate, int err = -EOPNOTSUPP; parent = devlink_rate->parent; - if (parent && len) { - NL_SET_ERR_MSG_MOD(info->extack, "Rate object already has parent."); - return -EBUSY; - } else if (parent && !len) { + + if (parent && !len) { if (devlink_rate_is_leaf(devlink_rate)) err = ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, @@ -1904,7 +1902,7 @@ devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate, refcount_dec(&parent->refcnt); devlink_rate->parent = NULL; - } else if (!parent && len) { + } else if (len) { parent = devlink_rate_node_get_by_name(devlink, parent_name); if (IS_ERR(parent)) return -ENODEV; @@ -1931,6 +1929,10 @@ devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate, if (err) return err; + if (devlink_rate->parent) + /* we're reassigning to other parent in this case */ + refcount_dec(&devlink_rate->parent->refcnt); + refcount_inc(&parent->refcnt); devlink_rate->parent = parent; } -- cgit v1.2.3-70-g09d2 From f2fc15e271f2d17f2bee2c5a3b3e50252a7ba91f Mon Sep 17 00:00:00 2001 From: Michal Wilczynski Date: Tue, 15 Nov 2022 11:48:19 +0100 Subject: devlink: Allow to set up parent in devl_rate_leaf_create() Currently the driver is able to create leaf nodes for the devlink-rate, but is unable to set parent for them. This wasn't as issue before the possibility to export hierarchy from the driver. After adding the export feature, in order for the driver to supply correct hierarchy, it's necessary for it to be able to supply a parent name to devl_rate_leaf_create(). Introduce a new parameter 'parent_name' in devl_rate_leaf_create(). Signed-off-by: Michal Wilczynski Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c | 4 ++-- drivers/net/netdevsim/dev.c | 2 +- include/net/devlink.h | 4 +++- net/core/devlink.c | 9 ++++++++- 4 files changed, 14 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c index 9bc7be95db54..084a910bb4e7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c @@ -91,7 +91,7 @@ int mlx5_esw_offloads_devlink_port_register(struct mlx5_eswitch *esw, u16 vport_ if (err) goto reg_err; - err = devl_rate_leaf_create(dl_port, vport); + err = devl_rate_leaf_create(dl_port, vport, NULL); if (err) goto rate_err; @@ -160,7 +160,7 @@ int mlx5_esw_devlink_sf_port_register(struct mlx5_eswitch *esw, struct devlink_p if (err) return err; - err = devl_rate_leaf_create(dl_port, vport); + err = devl_rate_leaf_create(dl_port, vport, NULL); if (err) goto rate_err; diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index 705872eb7564..e14686594a71 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -1401,7 +1401,7 @@ static int __nsim_dev_port_add(struct nsim_dev *nsim_dev, enum nsim_dev_port_typ if (nsim_dev_port_is_vf(nsim_dev_port)) { err = devl_rate_leaf_create(&nsim_dev_port->devlink_port, - nsim_dev_port); + nsim_dev_port, NULL); if (err) goto err_nsim_destroy; } diff --git a/include/net/devlink.h b/include/net/devlink.h index 339a2ed02d36..074a79b8933f 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1617,10 +1617,12 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u32 sf, bool external); -int devl_rate_leaf_create(struct devlink_port *port, void *priv); struct devlink_rate * devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name, struct devlink_rate *parent); +int +devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv, + struct devlink_rate *parent); void devl_rate_leaf_destroy(struct devlink_port *devlink_port); void devl_rate_nodes_destroy(struct devlink *devlink); void devlink_port_linecard_set(struct devlink_port *devlink_port, diff --git a/net/core/devlink.c b/net/core/devlink.c index 61d431578f5f..d93bc95cd7cb 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -10435,10 +10435,12 @@ EXPORT_SYMBOL_GPL(devl_rate_node_create); * devl_rate_leaf_create - create devlink rate leaf * @devlink_port: devlink port object to create rate object on * @priv: driver private data + * @parent: parent devlink_rate struct * * Create devlink rate object of type leaf on provided @devlink_port. */ -int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv) +int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv, + struct devlink_rate *parent) { struct devlink *devlink = devlink_port->devlink; struct devlink_rate *devlink_rate; @@ -10452,6 +10454,11 @@ int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv) if (!devlink_rate) return -ENOMEM; + if (parent) { + devlink_rate->parent = parent; + refcount_inc(&devlink_rate->parent->refcnt); + } + devlink_rate->type = DEVLINK_RATE_TYPE_LEAF; devlink_rate->devlink = devlink; devlink_rate->devlink_port = devlink_port; -- cgit v1.2.3-70-g09d2 From 647541ea06a7bbbcf941e501c726b3e26328c102 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 15 Nov 2022 10:40:21 -0500 Subject: sctp: move SCTP_PAD4 and SCTP_TRUNC4 to linux/sctp.h Move these two macros from net/sctp/sctp.h to linux/sctp.h, so that it will be enough to include only linux/sctp.h in nft_exthdr.c and xt_sctp.c. It should not include "net/sctp/sctp.h" if a module does not have a dependence on SCTP module. Signed-off-by: Xin Long Reviewed-by: Saeed Mahameed Link: https://lore.kernel.org/r/ef6468a687f36da06f575c2131cd4612f6b7be88.1668526821.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/sctp.h | 5 +++++ include/net/sctp/sctp.h | 5 ----- net/netfilter/nft_exthdr.c | 1 - net/netfilter/xt_sctp.c | 1 - 4 files changed, 5 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index a86e852507b3..358dc08e0831 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -820,4 +820,9 @@ struct sctp_new_encap_port_hdr { __be16 new_port; }; +/* Round an int up to the next multiple of 4. */ +#define SCTP_PAD4(s) (((s)+3)&~3) +/* Truncate to the previous multiple of 4. */ +#define SCTP_TRUNC4(s) ((s)&~3) + #endif /* __LINUX_SCTP_H__ */ diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index a04999ee99b0..01d904b34cf0 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -67,11 +67,6 @@ #define SCTP_PROTOSW_FLAG INET_PROTOSW_PERMANENT #endif -/* Round an int up to the next multiple of 4. */ -#define SCTP_PAD4(s) (((s)+3)&~3) -/* Truncate to the previous multiple of 4. */ -#define SCTP_TRUNC4(s) ((s)&~3) - /* * Function declarations. */ diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index ed929d0d37ce..a54a7f772cec 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -13,7 +13,6 @@ #include #include #include -#include #include struct nft_exthdr { diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index 680015ba7cb6..e8961094a282 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3-70-g09d2 From 26943aefa8704ca7871c34a2d1b2b2a418372666 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 16 Nov 2022 15:01:16 -0500 Subject: sctp: verify the bind address with the tb_id from l3mdev After binding to a l3mdev, it should use the route table from the corresponding VRF to verify the addr when binding to an address. Note ipv6 doesn't need it, as binding to ipv6 address does not verify the addr with route lookup. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/protocol.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index bcd3384ab07a..dbfe7d1000c2 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -351,10 +351,13 @@ static int sctp_v4_addr_valid(union sctp_addr *addr, /* Should this be available for binding? */ static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp) { - struct net *net = sock_net(&sp->inet.sk); - int ret = inet_addr_type(net, addr->v4.sin_addr.s_addr); - + struct sock *sk = &sp->inet.sk; + struct net *net = sock_net(sk); + int tb_id = RT_TABLE_LOCAL; + int ret; + tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ?: tb_id; + ret = inet_addr_type_table(net, addr->v4.sin_addr.s_addr, tb_id); if (addr->v4.sin_addr.s_addr != htonl(INADDR_ANY) && ret != RTN_LOCAL && !sp->inet.freebind && -- cgit v1.2.3-70-g09d2 From 6fe1e52490a91cb23f6b3aafc93e7c5beb99f862 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 16 Nov 2022 15:01:17 -0500 Subject: sctp: check ipv6 addr with sk_bound_dev if set When binding to an ipv6 address, it calls ipv6_chk_addr() to check if this address is on any dev. If a socket binds to a l3mdev but no dev is passed to do this check, all l3mdev and slaves will be skipped and the check will fail. This patch is to pass the bound_dev to make sure the devices under the same l3mdev can be returned in ipv6_chk_addr(). When the bound_dev is not a l3mdev or l3slave, l3mdev_master_dev_rcu() will return NULL in __ipv6_chk_addr_and_flags(), it will keep compitable with before when NULL dev was passed. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/ipv6.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index d081858c2d07..e6274cdbdf6c 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -680,9 +680,11 @@ static int sctp_v6_is_any(const union sctp_addr *addr) /* Should this be available for binding? */ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp) { - int type; - struct net *net = sock_net(&sp->inet.sk); const struct in6_addr *in6 = (const struct in6_addr *)&addr->v6.sin6_addr; + struct sock *sk = &sp->inet.sk; + struct net *net = sock_net(sk); + struct net_device *dev = NULL; + int type; type = ipv6_addr_type(in6); if (IPV6_ADDR_ANY == type) @@ -696,8 +698,14 @@ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp) if (!(type & IPV6_ADDR_UNICAST)) return 0; + if (sk->sk_bound_dev_if) { + dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); + if (!dev) + return 0; + } + return ipv6_can_nonlocal_bind(net, &sp->inet) || - ipv6_chk_addr(net, in6, NULL, 0); + ipv6_chk_addr(net, in6, dev, 0); } /* This function checks if the address is a valid address to be used for -- cgit v1.2.3-70-g09d2 From f87b1ac06c887210782eab9f105ffd9045be74cc Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 16 Nov 2022 15:01:18 -0500 Subject: sctp: check sk_bound_dev_if when matching ep in get_port In sctp_get_port_local(), when binding to IP and PORT, it should also check sk_bound_dev_if to match listening sk if it's set by SO_BINDTOIFINDEX, so that multiple sockets with the same IP and PORT, but different sk_bound_dev_if can be listened at the same time. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/socket.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 3e83963d1b8a..4306164238ef 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -8398,6 +8398,7 @@ pp_found: * in an endpoint. */ sk_for_each_bound(sk2, &pp->owner) { + int bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if); struct sctp_sock *sp2 = sctp_sk(sk2); struct sctp_endpoint *ep2 = sp2->ep; @@ -8408,7 +8409,9 @@ pp_found: uid_eq(uid, sock_i_uid(sk2)))) continue; - if (sctp_bind_addr_conflict(&ep2->base.bind_addr, + if ((!sk->sk_bound_dev_if || !bound_dev_if2 || + sk->sk_bound_dev_if == bound_dev_if2) && + sctp_bind_addr_conflict(&ep2->base.bind_addr, addr, sp2, sp)) { ret = 1; goto fail_unlock; -- cgit v1.2.3-70-g09d2 From 33e93ed2209d5971043bed41dd194bc583b57ef3 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 16 Nov 2022 15:01:19 -0500 Subject: sctp: add skb_sdif in struct sctp_af Add skb_sdif function in struct sctp_af to get the enslaved device for both ipv4 and ipv6 when adding SCTP VRF support in sctp_rcv in the next patch. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 1 + net/sctp/ipv6.c | 8 +++++++- net/sctp/protocol.c | 6 ++++++ 3 files changed, 14 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 350f250b0dc7..7b4884c63b26 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -477,6 +477,7 @@ struct sctp_af { int (*available) (union sctp_addr *, struct sctp_sock *); int (*skb_iif) (const struct sk_buff *sk); + int (*skb_sdif)(const struct sk_buff *sk); int (*is_ce) (const struct sk_buff *sk); void (*seq_dump_addr)(struct seq_file *seq, union sctp_addr *addr); diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index e6274cdbdf6c..097bd60ce964 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -842,7 +842,12 @@ static int sctp_v6_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr) /* Where did this skb come from? */ static int sctp_v6_skb_iif(const struct sk_buff *skb) { - return IP6CB(skb)->iif; + return inet6_iif(skb); +} + +static int sctp_v6_skb_sdif(const struct sk_buff *skb) +{ + return inet6_sdif(skb); } /* Was this packet marked by Explicit Congestion Notification? */ @@ -1142,6 +1147,7 @@ static struct sctp_af sctp_af_inet6 = { .is_any = sctp_v6_is_any, .available = sctp_v6_available, .skb_iif = sctp_v6_skb_iif, + .skb_sdif = sctp_v6_skb_sdif, .is_ce = sctp_v6_is_ce, .seq_dump_addr = sctp_v6_seq_dump_addr, .ecn_capable = sctp_v6_ecn_capable, diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index dbfe7d1000c2..a18cf0471a8d 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -567,6 +567,11 @@ static int sctp_v4_skb_iif(const struct sk_buff *skb) return inet_iif(skb); } +static int sctp_v4_skb_sdif(const struct sk_buff *skb) +{ + return inet_sdif(skb); +} + /* Was this packet marked by Explicit Congestion Notification? */ static int sctp_v4_is_ce(const struct sk_buff *skb) { @@ -1185,6 +1190,7 @@ static struct sctp_af sctp_af_inet = { .available = sctp_v4_available, .scope = sctp_v4_scope, .skb_iif = sctp_v4_skb_iif, + .skb_sdif = sctp_v4_skb_sdif, .is_ce = sctp_v4_is_ce, .seq_dump_addr = sctp_v4_seq_dump_addr, .ecn_capable = sctp_v4_ecn_capable, -- cgit v1.2.3-70-g09d2 From 0af03170637f47fb5cc6501d4b2dcbf1c14772a9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 16 Nov 2022 15:01:20 -0500 Subject: sctp: add dif and sdif check in asoc and ep lookup This patch at first adds a pernet global l3mdev_accept to decide if it accepts the packets from a l3mdev when a SCTP socket doesn't bind to any interface. It's set to 1 to avoid any possible incompatible issue, and in next patch, a sysctl will be introduced to allow to change it. Then similar to inet/udp_sk_bound_dev_eq(), sctp_sk_bound_dev_eq() is added to check either dif or sdif is equal to sk_bound_dev_if, and to check sid is 0 or l3mdev_accept is 1 if sk_bound_dev_if is not set. This function is used to match a association or a endpoint, namely called by sctp_addrs_lookup_transport() and sctp_endpoint_is_match(). All functions that needs updating are: sctp_rcv(): asoc: __sctp_rcv_lookup() __sctp_lookup_association() -> sctp_addrs_lookup_transport() __sctp_rcv_lookup_harder() __sctp_rcv_init_lookup() __sctp_lookup_association() -> sctp_addrs_lookup_transport() __sctp_rcv_walk_lookup() __sctp_rcv_asconf_lookup() __sctp_lookup_association() -> sctp_addrs_lookup_transport() ep: __sctp_rcv_lookup_endpoint() -> sctp_endpoint_is_match() sctp_connect(): sctp_endpoint_is_peeled_off() __sctp_lookup_association() sctp_has_association() sctp_lookup_association() __sctp_lookup_association() -> sctp_addrs_lookup_transport() sctp_diag_dump_one(): sctp_transport_lookup_process() -> sctp_addrs_lookup_transport() Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/netns/sctp.h | 4 ++ include/net/sctp/sctp.h | 6 ++- include/net/sctp/structs.h | 8 ++-- net/sctp/diag.c | 3 +- net/sctp/endpointola.c | 13 ++++-- net/sctp/input.c | 108 +++++++++++++++++++++++++-------------------- net/sctp/protocol.c | 4 ++ net/sctp/socket.c | 4 +- 8 files changed, 89 insertions(+), 61 deletions(-) (limited to 'net') diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h index a681147aecd8..7eff3d981b89 100644 --- a/include/net/netns/sctp.h +++ b/include/net/netns/sctp.h @@ -175,6 +175,10 @@ struct netns_sctp { /* Threshold for autoclose timeout, in seconds. */ unsigned long max_autoclose; + +#ifdef CONFIG_NET_L3_MASTER_DEV + int l3mdev_accept; +#endif }; #endif /* __NETNS_SCTP_H__ */ diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 01d904b34cf0..c335dd01a597 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -109,7 +109,7 @@ struct sctp_transport *sctp_transport_get_idx(struct net *net, struct rhashtable_iter *iter, int pos); int sctp_transport_lookup_process(sctp_callback_t cb, struct net *net, const union sctp_addr *laddr, - const union sctp_addr *paddr, void *p); + const union sctp_addr *paddr, void *p, int dif); int sctp_transport_traverse_process(sctp_callback_t cb, sctp_callback_t cb_done, struct net *net, int *pos, void *p); int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *), void *p); @@ -157,10 +157,12 @@ void sctp_unhash_transport(struct sctp_transport *t); struct sctp_transport *sctp_addrs_lookup_transport( struct net *net, const union sctp_addr *laddr, - const union sctp_addr *paddr); + const union sctp_addr *paddr, + int dif, int sdif); struct sctp_transport *sctp_epaddr_lookup_transport( const struct sctp_endpoint *ep, const union sctp_addr *paddr); +bool sctp_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif); /* * sctp/proc.c diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 7b4884c63b26..afa3781e3ca2 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1379,10 +1379,12 @@ struct sctp_association *sctp_endpoint_lookup_assoc( struct sctp_transport **); bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, const union sctp_addr *paddr); -struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *, - struct net *, const union sctp_addr *); +struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep, + struct net *net, + const union sctp_addr *laddr, + int dif, int sdif); bool sctp_has_association(struct net *net, const union sctp_addr *laddr, - const union sctp_addr *paddr); + const union sctp_addr *paddr, int dif, int sdif); int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, diff --git a/net/sctp/diag.c b/net/sctp/diag.c index d9c6d8f30f09..a557009e9832 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -426,6 +426,7 @@ static int sctp_diag_dump_one(struct netlink_callback *cb, struct net *net = sock_net(skb->sk); const struct nlmsghdr *nlh = cb->nlh; union sctp_addr laddr, paddr; + int dif = req->id.idiag_if; struct sctp_comm_param commp = { .skb = skb, .r = req, @@ -454,7 +455,7 @@ static int sctp_diag_dump_one(struct netlink_callback *cb, } return sctp_transport_lookup_process(sctp_sock_dump_one, - net, &laddr, &paddr, &commp); + net, &laddr, &paddr, &commp, dif); } static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index efffde7f2328..7e77b450697c 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -246,12 +246,15 @@ void sctp_endpoint_put(struct sctp_endpoint *ep) /* Is this the endpoint we are looking for? */ struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep, struct net *net, - const union sctp_addr *laddr) + const union sctp_addr *laddr, + int dif, int sdif) { + int bound_dev_if = READ_ONCE(ep->base.sk->sk_bound_dev_if); struct sctp_endpoint *retval = NULL; - if ((htons(ep->base.bind_addr.port) == laddr->v4.sin_port) && - net_eq(ep->base.net, net)) { + if (net_eq(ep->base.net, net) && + sctp_sk_bound_dev_eq(net, bound_dev_if, dif, sdif) && + (htons(ep->base.bind_addr.port) == laddr->v4.sin_port)) { if (sctp_bind_addr_match(&ep->base.bind_addr, laddr, sctp_sk(ep->base.sk))) retval = ep; @@ -298,6 +301,7 @@ out: bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, const union sctp_addr *paddr) { + int bound_dev_if = READ_ONCE(ep->base.sk->sk_bound_dev_if); struct sctp_sockaddr_entry *addr; struct net *net = ep->base.net; struct sctp_bind_addr *bp; @@ -307,7 +311,8 @@ bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, * so the address_list can not change. */ list_for_each_entry(addr, &bp->address_list, list) { - if (sctp_has_association(net, &addr->a, paddr)) + if (sctp_has_association(net, &addr->a, paddr, + bound_dev_if, bound_dev_if)) return true; } diff --git a/net/sctp/input.c b/net/sctp/input.c index 4f43afa8678f..bf70371301ff 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -50,16 +50,19 @@ static struct sctp_association *__sctp_rcv_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *paddr, const union sctp_addr *laddr, - struct sctp_transport **transportp); + struct sctp_transport **transportp, + int dif, int sdif); static struct sctp_endpoint *__sctp_rcv_lookup_endpoint( struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, - const union sctp_addr *daddr); + const union sctp_addr *daddr, + int dif, int sdif); static struct sctp_association *__sctp_lookup_association( struct net *net, const union sctp_addr *local, const union sctp_addr *peer, - struct sctp_transport **pt); + struct sctp_transport **pt, + int dif, int sdif); static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb); @@ -92,11 +95,11 @@ int sctp_rcv(struct sk_buff *skb) struct sctp_chunk *chunk; union sctp_addr src; union sctp_addr dest; - int bound_dev_if; int family; struct sctp_af *af; struct net *net = dev_net(skb->dev); bool is_gso = skb_is_gso(skb) && skb_is_gso_sctp(skb); + int dif, sdif; if (skb->pkt_type != PACKET_HOST) goto discard_it; @@ -141,6 +144,8 @@ int sctp_rcv(struct sk_buff *skb) /* Initialize local addresses for lookups. */ af->from_skb(&src, skb, 1); af->from_skb(&dest, skb, 0); + dif = af->skb_iif(skb); + sdif = af->skb_sdif(skb); /* If the packet is to or from a non-unicast address, * silently discard the packet. @@ -157,35 +162,15 @@ int sctp_rcv(struct sk_buff *skb) !af->addr_valid(&dest, NULL, skb)) goto discard_it; - asoc = __sctp_rcv_lookup(net, skb, &src, &dest, &transport); + asoc = __sctp_rcv_lookup(net, skb, &src, &dest, &transport, dif, sdif); if (!asoc) - ep = __sctp_rcv_lookup_endpoint(net, skb, &dest, &src); + ep = __sctp_rcv_lookup_endpoint(net, skb, &dest, &src, dif, sdif); /* Retrieve the common input handling substructure. */ rcvr = asoc ? &asoc->base : &ep->base; sk = rcvr->sk; - /* - * If a frame arrives on an interface and the receiving socket is - * bound to another interface, via SO_BINDTODEVICE, treat it as OOTB - */ - bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); - if (bound_dev_if && (bound_dev_if != af->skb_iif(skb))) { - if (transport) { - sctp_transport_put(transport); - asoc = NULL; - transport = NULL; - } else { - sctp_endpoint_put(ep); - ep = NULL; - } - sk = net->sctp.ctl_sock; - ep = sctp_sk(sk)->ep; - sctp_endpoint_hold(ep); - rcvr = &ep->base; - } - /* * RFC 2960, 8.4 - Handle "Out of the blue" Packets. * An SCTP packet is called an "out of the blue" (OOTB) @@ -485,6 +470,8 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb, struct sctp_association *asoc; struct sctp_transport *transport = NULL; __u32 vtag = ntohl(sctphdr->vtag); + int sdif = inet_sdif(skb); + int dif = inet_iif(skb); *app = NULL; *tpp = NULL; @@ -500,7 +487,7 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb, /* Look for an association that matches the incoming ICMP error * packet. */ - asoc = __sctp_lookup_association(net, &saddr, &daddr, &transport); + asoc = __sctp_lookup_association(net, &saddr, &daddr, &transport, dif, sdif); if (!asoc) return NULL; @@ -850,7 +837,8 @@ static inline __u32 sctp_hashfn(const struct net *net, __be16 lport, static struct sctp_endpoint *__sctp_rcv_lookup_endpoint( struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, - const union sctp_addr *paddr) + const union sctp_addr *paddr, + int dif, int sdif) { struct sctp_hashbucket *head; struct sctp_endpoint *ep; @@ -863,7 +851,7 @@ static struct sctp_endpoint *__sctp_rcv_lookup_endpoint( head = &sctp_ep_hashtable[hash]; read_lock(&head->lock); sctp_for_each_hentry(ep, &head->chain) { - if (sctp_endpoint_is_match(ep, net, laddr)) + if (sctp_endpoint_is_match(ep, net, laddr, dif, sdif)) goto hit; } @@ -990,14 +978,26 @@ void sctp_unhash_transport(struct sctp_transport *t) sctp_hash_params); } +bool sctp_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif) +{ + bool l3mdev_accept = true; + +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + l3mdev_accept = !!READ_ONCE(net->sctp.l3mdev_accept); +#endif + return inet_bound_dev_eq(l3mdev_accept, bound_dev_if, dif, sdif); +} + /* return a transport with holding it */ struct sctp_transport *sctp_addrs_lookup_transport( struct net *net, const union sctp_addr *laddr, - const union sctp_addr *paddr) + const union sctp_addr *paddr, + int dif, int sdif) { struct rhlist_head *tmp, *list; struct sctp_transport *t; + int bound_dev_if; struct sctp_hash_cmp_arg arg = { .paddr = paddr, .net = net, @@ -1011,7 +1011,9 @@ struct sctp_transport *sctp_addrs_lookup_transport( if (!sctp_transport_hold(t)) continue; - if (sctp_bind_addr_match(&t->asoc->base.bind_addr, + bound_dev_if = READ_ONCE(t->asoc->base.sk->sk_bound_dev_if); + if (sctp_sk_bound_dev_eq(net, bound_dev_if, dif, sdif) && + sctp_bind_addr_match(&t->asoc->base.bind_addr, laddr, sctp_sk(t->asoc->base.sk))) return t; sctp_transport_put(t); @@ -1048,12 +1050,13 @@ static struct sctp_association *__sctp_lookup_association( struct net *net, const union sctp_addr *local, const union sctp_addr *peer, - struct sctp_transport **pt) + struct sctp_transport **pt, + int dif, int sdif) { struct sctp_transport *t; struct sctp_association *asoc = NULL; - t = sctp_addrs_lookup_transport(net, local, peer); + t = sctp_addrs_lookup_transport(net, local, peer, dif, sdif); if (!t) goto out; @@ -1069,12 +1072,13 @@ static struct sctp_association *sctp_lookup_association(struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, - struct sctp_transport **transportp) + struct sctp_transport **transportp, + int dif, int sdif) { struct sctp_association *asoc; rcu_read_lock(); - asoc = __sctp_lookup_association(net, laddr, paddr, transportp); + asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif); rcu_read_unlock(); return asoc; @@ -1083,11 +1087,12 @@ struct sctp_association *sctp_lookup_association(struct net *net, /* Is there an association matching the given local and peer addresses? */ bool sctp_has_association(struct net *net, const union sctp_addr *laddr, - const union sctp_addr *paddr) + const union sctp_addr *paddr, + int dif, int sdif) { struct sctp_transport *transport; - if (sctp_lookup_association(net, laddr, paddr, &transport)) { + if (sctp_lookup_association(net, laddr, paddr, &transport, dif, sdif)) { sctp_transport_put(transport); return true; } @@ -1115,7 +1120,8 @@ bool sctp_has_association(struct net *net, */ static struct sctp_association *__sctp_rcv_init_lookup(struct net *net, struct sk_buff *skb, - const union sctp_addr *laddr, struct sctp_transport **transportp) + const union sctp_addr *laddr, struct sctp_transport **transportp, + int dif, int sdif) { struct sctp_association *asoc; union sctp_addr addr; @@ -1154,7 +1160,7 @@ static struct sctp_association *__sctp_rcv_init_lookup(struct net *net, if (!af->from_addr_param(paddr, params.addr, sh->source, 0)) continue; - asoc = __sctp_lookup_association(net, laddr, paddr, transportp); + asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif); if (asoc) return asoc; } @@ -1181,7 +1187,8 @@ static struct sctp_association *__sctp_rcv_asconf_lookup( struct sctp_chunkhdr *ch, const union sctp_addr *laddr, __be16 peer_port, - struct sctp_transport **transportp) + struct sctp_transport **transportp, + int dif, int sdif) { struct sctp_addip_chunk *asconf = (struct sctp_addip_chunk *)ch; struct sctp_af *af; @@ -1201,7 +1208,7 @@ static struct sctp_association *__sctp_rcv_asconf_lookup( if (!af->from_addr_param(&paddr, param, peer_port, 0)) return NULL; - return __sctp_lookup_association(net, laddr, &paddr, transportp); + return __sctp_lookup_association(net, laddr, &paddr, transportp, dif, sdif); } @@ -1217,7 +1224,8 @@ static struct sctp_association *__sctp_rcv_asconf_lookup( static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, - struct sctp_transport **transportp) + struct sctp_transport **transportp, + int dif, int sdif) { struct sctp_association *asoc = NULL; struct sctp_chunkhdr *ch; @@ -1260,7 +1268,7 @@ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net, asoc = __sctp_rcv_asconf_lookup( net, ch, laddr, sctp_hdr(skb)->source, - transportp); + transportp, dif, sdif); break; default: break; @@ -1285,7 +1293,8 @@ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net, static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, - struct sctp_transport **transportp) + struct sctp_transport **transportp, + int dif, int sdif) { struct sctp_chunkhdr *ch; @@ -1309,9 +1318,9 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net, /* If this is INIT/INIT-ACK look inside the chunk too. */ if (ch->type == SCTP_CID_INIT || ch->type == SCTP_CID_INIT_ACK) - return __sctp_rcv_init_lookup(net, skb, laddr, transportp); + return __sctp_rcv_init_lookup(net, skb, laddr, transportp, dif, sdif); - return __sctp_rcv_walk_lookup(net, skb, laddr, transportp); + return __sctp_rcv_walk_lookup(net, skb, laddr, transportp, dif, sdif); } /* Lookup an association for an inbound skb. */ @@ -1319,11 +1328,12 @@ static struct sctp_association *__sctp_rcv_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *paddr, const union sctp_addr *laddr, - struct sctp_transport **transportp) + struct sctp_transport **transportp, + int dif, int sdif) { struct sctp_association *asoc; - asoc = __sctp_lookup_association(net, laddr, paddr, transportp); + asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif); if (asoc) goto out; @@ -1331,7 +1341,7 @@ static struct sctp_association *__sctp_rcv_lookup(struct net *net, * SCTP Implementors Guide, 2.18 Handling of address * parameters within the INIT or INIT-ACK. */ - asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp); + asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp, dif, sdif); if (asoc) goto out; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index a18cf0471a8d..909a89a1cff4 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1394,6 +1394,10 @@ static int __net_init sctp_defaults_init(struct net *net) /* Initialize maximum autoclose timeout. */ net->sctp.max_autoclose = INT_MAX / HZ; +#ifdef CONFIG_NET_L3_MASTER_DEV + net->sctp.l3mdev_accept = 1; +#endif + status = sctp_sysctl_net_register(net); if (status) goto err_sysctl_register; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 4306164238ef..5acbdf0d38f3 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5315,14 +5315,14 @@ EXPORT_SYMBOL_GPL(sctp_for_each_endpoint); int sctp_transport_lookup_process(sctp_callback_t cb, struct net *net, const union sctp_addr *laddr, - const union sctp_addr *paddr, void *p) + const union sctp_addr *paddr, void *p, int dif) { struct sctp_transport *transport; struct sctp_endpoint *ep; int err = -ENOENT; rcu_read_lock(); - transport = sctp_addrs_lookup_transport(net, laddr, paddr); + transport = sctp_addrs_lookup_transport(net, laddr, paddr, dif, dif); if (!transport) { rcu_read_unlock(); return err; -- cgit v1.2.3-70-g09d2 From b712d0328c2c3ab456847f29f711e785f70cd8a5 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 16 Nov 2022 15:01:21 -0500 Subject: sctp: add sysctl net.sctp.l3mdev_accept This patch is to add sysctl net.sctp.l3mdev_accept to allow users to change the pernet global l3mdev_accept. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.rst | 9 +++++++++ net/sctp/sysctl.c | 11 +++++++++++ 2 files changed, 20 insertions(+) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 727b25cc7ec4..7fbd060d6047 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -3127,6 +3127,15 @@ ecn_enable - BOOLEAN Default: 1 +l3mdev_accept - BOOLEAN + Enabling this option allows a "global" bound socket to work + across L3 master domains (e.g., VRFs) with packets capable of + being received regardless of the L3 domain in which they + originated. Only valid when the kernel was compiled with + CONFIG_NET_L3_MASTER_DEV. + + Default: 1 (enabled) + ``/proc/sys/net/core/*`` ======================== diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index b46a416787ec..7f40ed117fc7 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -347,6 +347,17 @@ static struct ctl_table sctp_net_table[] = { .extra1 = &max_autoclose_min, .extra2 = &max_autoclose_max, }, +#ifdef CONFIG_NET_L3_MASTER_DEV + { + .procname = "l3mdev_accept", + .data = &init_net.sctp.l3mdev_accept, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif { .procname = "pf_enable", .data = &init_net.sctp.pf_enable, -- cgit v1.2.3-70-g09d2 From 101c1bb6c55691d01c73915c118828f7ca17a049 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 17 Nov 2022 10:43:38 +0300 Subject: rxrpc: fix rxkad_verify_response() The error handling for if skb_copy_bits() fails was accidentally deleted so the rxkad_decrypt_ticket() function is not called. Fixes: 5d7edbc9231e ("rxrpc: Get rid of the Rx ring") Signed-off-by: Dan Carpenter Acked-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/rxkad.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 2706e59bf992..110a5550c0a6 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -1165,8 +1165,10 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, eproto = tracepoint_string("rxkad_tkt_short"); abort_code = RXKADPACKETSHORT; - if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header) + sizeof(*response), - ticket, ticket_len) < 0) + ret = skb_copy_bits(skb, sizeof(struct rxrpc_wire_header) + sizeof(*response), + ticket, ticket_len); + if (ret < 0) + goto temporary_error_free_ticket; ret = rxkad_decrypt_ticket(conn, server_key, skb, ticket, ticket_len, &session_key, &expiry, _abort_code); -- cgit v1.2.3-70-g09d2 From 38461894838bbbebab54cbd5a5459cc8d1b6dd9b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 17 Nov 2022 10:44:02 +0300 Subject: rxrpc: uninitialized variable in rxrpc_send_ack_packet() The "pkt" was supposed to have been deleted in a previous patch. It leads to an uninitialized variable bug. Fixes: 72f0c6fb0579 ("rxrpc: Allocate ACK records at proposal and queue for transmission") Signed-off-by: Dan Carpenter Acked-by: David Howells Signed-off-by: David S. Miller --- net/rxrpc/output.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 46432e70a16b..04f945e042ab 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -202,7 +202,6 @@ static void rxrpc_cancel_rtt_probe(struct rxrpc_call *call, static int rxrpc_send_ack_packet(struct rxrpc_local *local, struct rxrpc_txbuf *txb) { struct rxrpc_connection *conn; - struct rxrpc_ack_buffer *pkt; struct rxrpc_call *call = txb->call; struct msghdr msg; struct kvec iov[1]; @@ -270,7 +269,6 @@ static int rxrpc_send_ack_packet(struct rxrpc_local *local, struct rxrpc_txbuf * rxrpc_set_keepalive(call); } - kfree(pkt); return ret; } -- cgit v1.2.3-70-g09d2 From fd896e38e5df2c5b68c78eee2fc425c4dcd3b4dd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 17 Nov 2022 09:26:41 +0000 Subject: net: fix napi_disable() logic error Dan reported a new warning after my recent patch: New smatch warnings: net/core/dev.c:6409 napi_disable() error: uninitialized symbol 'new'. Indeed, we must first wait for STATE_SCHED and STATE_NPSVC to be cleared, to make sure @new variable has been initialized properly. Fixes: 4ffa1d1c6842 ("net: adopt try_cmpxchg() in napi_{enable|disable}()") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index d0fb4af9a126..7627c475d991 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6399,9 +6399,9 @@ void napi_disable(struct napi_struct *n) val = READ_ONCE(n->state); do { - if (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { + while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { usleep_range(20, 200); - continue; + val = READ_ONCE(n->state); } new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; -- cgit v1.2.3-70-g09d2 From ab0377803dafc58f1e22296708c1c28e309414d6 Mon Sep 17 00:00:00 2001 From: Schspa Shi Date: Wed, 16 Nov 2022 19:45:11 +0800 Subject: mrp: introduce active flags to prevent UAF when applicant uninit The caller of del_timer_sync must prevent restarting of the timer, If we have no this synchronization, there is a small probability that the cancellation will not be successful. And syzbot report the fellowing crash: ================================================================== BUG: KASAN: use-after-free in hlist_add_head include/linux/list.h:929 [inline] BUG: KASAN: use-after-free in enqueue_timer+0x18/0xa4 kernel/time/timer.c:605 Write at addr f9ff000024df6058 by task syz-fuzzer/2256 Pointer tag: [f9], memory tag: [fe] CPU: 1 PID: 2256 Comm: syz-fuzzer Not tainted 6.1.0-rc5-syzkaller-00008- ge01d50cbd6ee #0 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace.part.0+0xe0/0xf0 arch/arm64/kernel/stacktrace.c:156 dump_backtrace arch/arm64/kernel/stacktrace.c:162 [inline] show_stack+0x18/0x40 arch/arm64/kernel/stacktrace.c:163 __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x68/0x84 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:284 [inline] print_report+0x1a8/0x4a0 mm/kasan/report.c:395 kasan_report+0x94/0xb4 mm/kasan/report.c:495 __do_kernel_fault+0x164/0x1e0 arch/arm64/mm/fault.c:320 do_bad_area arch/arm64/mm/fault.c:473 [inline] do_tag_check_fault+0x78/0x8c arch/arm64/mm/fault.c:749 do_mem_abort+0x44/0x94 arch/arm64/mm/fault.c:825 el1_abort+0x40/0x60 arch/arm64/kernel/entry-common.c:367 el1h_64_sync_handler+0xd8/0xe4 arch/arm64/kernel/entry-common.c:427 el1h_64_sync+0x64/0x68 arch/arm64/kernel/entry.S:576 hlist_add_head include/linux/list.h:929 [inline] enqueue_timer+0x18/0xa4 kernel/time/timer.c:605 mod_timer+0x14/0x20 kernel/time/timer.c:1161 mrp_periodic_timer_arm net/802/mrp.c:614 [inline] mrp_periodic_timer+0xa0/0xc0 net/802/mrp.c:627 call_timer_fn.constprop.0+0x24/0x80 kernel/time/timer.c:1474 expire_timers+0x98/0xc4 kernel/time/timer.c:1519 To fix it, we can introduce a new active flags to make sure the timer will not restart. Reported-by: syzbot+6fd64001c20aa99e34a4@syzkaller.appspotmail.com Signed-off-by: Schspa Shi Signed-off-by: David S. Miller --- include/net/mrp.h | 1 + net/802/mrp.c | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/mrp.h b/include/net/mrp.h index 92cd3fb6cf9d..b28915ffea28 100644 --- a/include/net/mrp.h +++ b/include/net/mrp.h @@ -124,6 +124,7 @@ struct mrp_applicant { struct sk_buff *pdu; struct rb_root mad; struct rcu_head rcu; + bool active; }; struct mrp_port { diff --git a/net/802/mrp.c b/net/802/mrp.c index 155f74d8b14f..6c927d4b35f0 100644 --- a/net/802/mrp.c +++ b/net/802/mrp.c @@ -606,7 +606,10 @@ static void mrp_join_timer(struct timer_list *t) spin_unlock(&app->lock); mrp_queue_xmit(app); - mrp_join_timer_arm(app); + spin_lock(&app->lock); + if (likely(app->active)) + mrp_join_timer_arm(app); + spin_unlock(&app->lock); } static void mrp_periodic_timer_arm(struct mrp_applicant *app) @@ -620,11 +623,12 @@ static void mrp_periodic_timer(struct timer_list *t) struct mrp_applicant *app = from_timer(app, t, periodic_timer); spin_lock(&app->lock); - mrp_mad_event(app, MRP_EVENT_PERIODIC); - mrp_pdu_queue(app); + if (likely(app->active)) { + mrp_mad_event(app, MRP_EVENT_PERIODIC); + mrp_pdu_queue(app); + mrp_periodic_timer_arm(app); + } spin_unlock(&app->lock); - - mrp_periodic_timer_arm(app); } static int mrp_pdu_parse_end_mark(struct sk_buff *skb, int *offset) @@ -872,6 +876,7 @@ int mrp_init_applicant(struct net_device *dev, struct mrp_application *appl) app->dev = dev; app->app = appl; app->mad = RB_ROOT; + app->active = true; spin_lock_init(&app->lock); skb_queue_head_init(&app->queue); rcu_assign_pointer(dev->mrp_port->applicants[appl->type], app); @@ -900,6 +905,9 @@ void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl) RCU_INIT_POINTER(port->applicants[appl->type], NULL); + spin_lock_bh(&app->lock); + app->active = false; + spin_unlock_bh(&app->lock); /* Delete timer and generate a final TX event to flush out * all pending messages before the applicant is gone. */ -- cgit v1.2.3-70-g09d2 From d1e91173cd29ba13c5953d1820a7543140feec93 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Tue, 15 Nov 2022 09:49:21 -0500 Subject: bpf, docs: DEVMAPs and XDP_REDIRECT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add documentation for BPF_MAP_TYPE_DEVMAP and BPF_MAP_TYPE_DEVMAP_HASH including kernel version introduced, usage and examples. Add documentation that describes XDP_REDIRECT. Signed-off-by: Maryam Tahhan Signed-off-by: Daniel Borkmann Reviewed-by: Toke Høiland-Jørgensen Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20221115144921.165483-1-mtahhan@redhat.com --- Documentation/bpf/index.rst | 1 + Documentation/bpf/map_devmap.rst | 222 +++++++++++++++++++++++++++++++++++++++ Documentation/bpf/redirect.rst | 81 ++++++++++++++ net/core/filter.c | 8 +- 4 files changed, 310 insertions(+), 2 deletions(-) create mode 100644 Documentation/bpf/map_devmap.rst create mode 100644 Documentation/bpf/redirect.rst (limited to 'net') diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst index 1b50de1983ee..1088d44634d6 100644 --- a/Documentation/bpf/index.rst +++ b/Documentation/bpf/index.rst @@ -29,6 +29,7 @@ that goes into great technical depth about the BPF Architecture. clang-notes linux-notes other + redirect .. only:: subproject and html diff --git a/Documentation/bpf/map_devmap.rst b/Documentation/bpf/map_devmap.rst new file mode 100644 index 000000000000..f64da348dbfe --- /dev/null +++ b/Documentation/bpf/map_devmap.rst @@ -0,0 +1,222 @@ +.. SPDX-License-Identifier: GPL-2.0-only +.. Copyright (C) 2022 Red Hat, Inc. + +================================================= +BPF_MAP_TYPE_DEVMAP and BPF_MAP_TYPE_DEVMAP_HASH +================================================= + +.. note:: + - ``BPF_MAP_TYPE_DEVMAP`` was introduced in kernel version 4.14 + - ``BPF_MAP_TYPE_DEVMAP_HASH`` was introduced in kernel version 5.4 + +``BPF_MAP_TYPE_DEVMAP`` and ``BPF_MAP_TYPE_DEVMAP_HASH`` are BPF maps primarily +used as backend maps for the XDP BPF helper call ``bpf_redirect_map()``. +``BPF_MAP_TYPE_DEVMAP`` is backed by an array that uses the key as +the index to lookup a reference to a net device. While ``BPF_MAP_TYPE_DEVMAP_HASH`` +is backed by a hash table that uses a key to lookup a reference to a net device. +The user provides either <``key``/ ``ifindex``> or <``key``/ ``struct bpf_devmap_val``> +pairs to update the maps with new net devices. + +.. note:: + - The key to a hash map doesn't have to be an ``ifindex``. + - While ``BPF_MAP_TYPE_DEVMAP_HASH`` allows for densely packing the net devices + it comes at the cost of a hash of the key when performing a look up. + +The setup and packet enqueue/send code is shared between the two types of +devmap; only the lookup and insertion is different. + +Usage +===== +Kernel BPF +---------- +.. c:function:: + long bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) + +Redirect the packet to the endpoint referenced by ``map`` at index ``key``. +For ``BPF_MAP_TYPE_DEVMAP`` and ``BPF_MAP_TYPE_DEVMAP_HASH`` this map contains +references to net devices (for forwarding packets through other ports). + +The lower two bits of *flags* are used as the return code if the map lookup +fails. This is so that the return value can be one of the XDP program return +codes up to ``XDP_TX``, as chosen by the caller. The higher bits of ``flags`` +can be set to ``BPF_F_BROADCAST`` or ``BPF_F_EXCLUDE_INGRESS`` as defined +below. + +With ``BPF_F_BROADCAST`` the packet will be broadcast to all the interfaces +in the map, with ``BPF_F_EXCLUDE_INGRESS`` the ingress interface will be excluded +from the broadcast. + +.. note:: + - The key is ignored if BPF_F_BROADCAST is set. + - The broadcast feature can also be used to implement multicast forwarding: + simply create multiple DEVMAPs, each one corresponding to a single multicast group. + +This helper will return ``XDP_REDIRECT`` on success, or the value of the two +lower bits of the ``flags`` argument if the map lookup fails. + +More information about redirection can be found :doc:`redirect` + +.. c:function:: + void *bpf_map_lookup_elem(struct bpf_map *map, const void *key) + +Net device entries can be retrieved using the ``bpf_map_lookup_elem()`` +helper. + +Userspace +--------- +.. note:: + DEVMAP entries can only be updated/deleted from user space and not + from an eBPF program. Trying to call these functions from a kernel eBPF + program will result in the program failing to load and a verifier warning. + +.. c:function:: + int bpf_map_update_elem(int fd, const void *key, const void *value, __u64 flags); + + Net device entries can be added or updated using the ``bpf_map_update_elem()`` + helper. This helper replaces existing elements atomically. The ``value`` parameter + can be ``struct bpf_devmap_val`` or a simple ``int ifindex`` for backwards + compatibility. + + .. code-block:: c + + struct bpf_devmap_val { + __u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; + }; + + The ``flags`` argument can be one of the following: + + - ``BPF_ANY``: Create a new element or update an existing element. + - ``BPF_NOEXIST``: Create a new element only if it did not exist. + - ``BPF_EXIST``: Update an existing element. + + DEVMAPs can associate a program with a device entry by adding a ``bpf_prog.fd`` + to ``struct bpf_devmap_val``. Programs are run after ``XDP_REDIRECT`` and have + access to both Rx device and Tx device. The program associated with the ``fd`` + must have type XDP with expected attach type ``xdp_devmap``. + When a program is associated with a device index, the program is run on an + ``XDP_REDIRECT`` and before the buffer is added to the per-cpu queue. Examples + of how to attach/use xdp_devmap progs can be found in the kernel selftests: + + - ``tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c`` + - ``tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c`` + +.. c:function:: + int bpf_map_lookup_elem(int fd, const void *key, void *value); + + Net device entries can be retrieved using the ``bpf_map_lookup_elem()`` + helper. + +.. c:function:: + int bpf_map_delete_elem(int fd, const void *key); + + Net device entries can be deleted using the ``bpf_map_delete_elem()`` + helper. This helper will return 0 on success, or negative error in case of + failure. + +Examples +======== + +Kernel BPF +---------- + +The following code snippet shows how to declare a ``BPF_MAP_TYPE_DEVMAP`` +called tx_port. + +.. code-block:: c + + struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __type(key, __u32); + __type(value, __u32); + __uint(max_entries, 256); + } tx_port SEC(".maps"); + +The following code snippet shows how to declare a ``BPF_MAP_TYPE_DEVMAP_HASH`` +called forward_map. + +.. code-block:: c + + struct { + __uint(type, BPF_MAP_TYPE_DEVMAP_HASH); + __type(key, __u32); + __type(value, struct bpf_devmap_val); + __uint(max_entries, 32); + } forward_map SEC(".maps"); + +.. note:: + + The value type in the DEVMAP above is a ``struct bpf_devmap_val`` + +The following code snippet shows a simple xdp_redirect_map program. This program +would work with a user space program that populates the devmap ``forward_map`` based +on ingress ifindexes. The BPF program (below) is redirecting packets using the +ingress ``ifindex`` as the ``key``. + +.. code-block:: c + + SEC("xdp") + int xdp_redirect_map_func(struct xdp_md *ctx) + { + int index = ctx->ingress_ifindex; + + return bpf_redirect_map(&forward_map, index, 0); + } + +The following code snippet shows a BPF program that is broadcasting packets to +all the interfaces in the ``tx_port`` devmap. + +.. code-block:: c + + SEC("xdp") + int xdp_redirect_map_func(struct xdp_md *ctx) + { + return bpf_redirect_map(&tx_port, 0, BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS); + } + +User space +---------- + +The following code snippet shows how to update a devmap called ``tx_port``. + +.. code-block:: c + + int update_devmap(int ifindex, int redirect_ifindex) + { + int ret; + + ret = bpf_map_update_elem(bpf_map__fd(tx_port), &ifindex, &redirect_ifindex, 0); + if (ret < 0) { + fprintf(stderr, "Failed to update devmap_ value: %s\n", + strerror(errno)); + } + + return ret; + } + +The following code snippet shows how to update a hash_devmap called ``forward_map``. + +.. code-block:: c + + int update_devmap(int ifindex, int redirect_ifindex) + { + struct bpf_devmap_val devmap_val = { .ifindex = redirect_ifindex }; + int ret; + + ret = bpf_map_update_elem(bpf_map__fd(forward_map), &ifindex, &devmap_val, 0); + if (ret < 0) { + fprintf(stderr, "Failed to update devmap_ value: %s\n", + strerror(errno)); + } + return ret; + } + +References +=========== + +- https://lwn.net/Articles/728146/ +- https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/commit/?id=6f9d451ab1a33728adb72d7ff66a7b374d665176 +- https://elixir.bootlin.com/linux/latest/source/net/core/filter.c#L4106 diff --git a/Documentation/bpf/redirect.rst b/Documentation/bpf/redirect.rst new file mode 100644 index 000000000000..2fa2b0b05004 --- /dev/null +++ b/Documentation/bpf/redirect.rst @@ -0,0 +1,81 @@ +.. SPDX-License-Identifier: GPL-2.0-only +.. Copyright (C) 2022 Red Hat, Inc. + +======== +Redirect +======== +XDP_REDIRECT +############ +Supported maps +-------------- + +XDP_REDIRECT works with the following map types: + +- ``BPF_MAP_TYPE_DEVMAP`` +- ``BPF_MAP_TYPE_DEVMAP_HASH`` +- ``BPF_MAP_TYPE_CPUMAP`` +- ``BPF_MAP_TYPE_XSKMAP`` + +For more information on these maps, please see the specific map documentation. + +Process +------- + +.. kernel-doc:: net/core/filter.c + :doc: xdp redirect + +.. note:: + Not all drivers support transmitting frames after a redirect, and for + those that do, not all of them support non-linear frames. Non-linear xdp + bufs/frames are bufs/frames that contain more than one fragment. + +Debugging packet drops +---------------------- +Silent packet drops for XDP_REDIRECT can be debugged using: + +- bpf_trace +- perf_record + +bpf_trace +^^^^^^^^^ +The following bpftrace command can be used to capture and count all XDP tracepoints: + +.. code-block:: none + + sudo bpftrace -e 'tracepoint:xdp:* { @cnt[probe] = count(); }' + Attaching 12 probes... + ^C + + @cnt[tracepoint:xdp:mem_connect]: 18 + @cnt[tracepoint:xdp:mem_disconnect]: 18 + @cnt[tracepoint:xdp:xdp_exception]: 19605 + @cnt[tracepoint:xdp:xdp_devmap_xmit]: 1393604 + @cnt[tracepoint:xdp:xdp_redirect]: 22292200 + +.. note:: + The various xdp tracepoints can be found in ``source/include/trace/events/xdp.h`` + +The following bpftrace command can be used to extract the ``ERRNO`` being returned as +part of the err parameter: + +.. code-block:: none + + sudo bpftrace -e \ + 'tracepoint:xdp:xdp_redirect*_err {@redir_errno[-args->err] = count();} + tracepoint:xdp:xdp_devmap_xmit {@devmap_errno[-args->err] = count();}' + +perf record +^^^^^^^^^^^ +The perf tool also supports recording tracepoints: + +.. code-block:: none + + perf record -a -e xdp:xdp_redirect_err \ + -e xdp:xdp_redirect_map_err \ + -e xdp:xdp_exception \ + -e xdp:xdp_devmap_xmit + +References +=========== + +- https://github.com/xdp-project/xdp-tutorial/tree/master/tracing02-xdp-monitor diff --git a/net/core/filter.c b/net/core/filter.c index 754dd01354d8..2a105fb1ceb2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4108,7 +4108,10 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { .arg2_type = ARG_ANYTHING, }; -/* XDP_REDIRECT works by a three-step process, implemented in the functions +/** + * DOC: xdp redirect + * + * XDP_REDIRECT works by a three-step process, implemented in the functions * below: * * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target @@ -4123,7 +4126,8 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { * 3. Before exiting its NAPI poll loop, the driver will call xdp_do_flush(), * which will flush all the different bulk queues, thus completing the * redirect. - * + */ +/* * Pointers to the map entries will be kept around for this whole sequence of * steps, protected by RCU. However, there is no top-level rcu_read_lock() in * the core code; instead, the RCU protection relies on everything happening -- cgit v1.2.3-70-g09d2 From c73a72f4cbb47672c8cc7f7d7aba52f1cb15baca Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 17 Nov 2022 19:39:03 -0800 Subject: netlink: remove the flex array from struct nlmsghdr I've added a flex array to struct nlmsghdr in commit 738136a0e375 ("netlink: split up copies in the ack construction") to allow accessing the data easily. It leads to warnings with clang, if user space wraps this structure into another struct and the flex array is not at the end of the container. Reviewed-by: Kees Cook Reviewed-by: David Ahern Link: https://lore.kernel.org/all/20221114023927.GA685@u2004-local/ Link: https://lore.kernel.org/r/20221118033903.1651026-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/netlink.h | 2 -- net/netlink/af_netlink.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 5da0da59bf01..e2ae82e3f9f7 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -48,7 +48,6 @@ struct sockaddr_nl { * @nlmsg_flags: Additional flags * @nlmsg_seq: Sequence number * @nlmsg_pid: Sending process port ID - * @nlmsg_data: Message payload */ struct nlmsghdr { __u32 nlmsg_len; @@ -56,7 +55,6 @@ struct nlmsghdr { __u16 nlmsg_flags; __u32 nlmsg_seq; __u32 nlmsg_pid; - __u8 nlmsg_data[]; }; /* Flags values */ diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 9ebdf3262015..d73091f6bb0f 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2514,7 +2514,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, if (!nlmsg_append(skb, nlmsg_len(nlh))) goto err_bad_put; - memcpy(errmsg->msg.nlmsg_data, nlh->nlmsg_data, + memcpy(nlmsg_data(&errmsg->msg), nlmsg_data(nlh), nlmsg_len(nlh)); } -- cgit v1.2.3-70-g09d2 From d169ecb536e46985e3e2acf5f2b39efae3953c2d Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 16 Nov 2022 09:07:33 +0100 Subject: net: dsa: tag_mtk: assign per-port queues Keeps traffic sent to the switch within link speed limits Signed-off-by: Felix Fietkau Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20221116080734.44013-6-nbd@nbd.name Signed-off-by: Jakub Kicinski --- net/dsa/tag_mtk.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index ba37495ab5f4..8948c4f99f8e 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -27,6 +27,8 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, u8 xmit_tpid; u8 *mtk_tag; + skb_set_queue_mapping(skb, dp->index); + /* Build the special tag after the MAC Source Address. If VLAN header * is present, it's required that VLAN header and special tag is * being combined. Only in this way we can allow the switch can parse -- cgit v1.2.3-70-g09d2 From 3f00c52393445ed49aadc1a567aa502c6333b1a1 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Sat, 19 Nov 2022 23:10:02 -0600 Subject: bpf: Allow trusted pointers to be passed to KF_TRUSTED_ARGS kfuncs Kfuncs currently support specifying the KF_TRUSTED_ARGS flag to signal to the verifier that it should enforce that a BPF program passes it a "safe", trusted pointer. Currently, "safe" means that the pointer is either PTR_TO_CTX, or is refcounted. There may be cases, however, where the kernel passes a BPF program a safe / trusted pointer to an object that the BPF program wishes to use as a kptr, but because the object does not yet have a ref_obj_id from the perspective of the verifier, the program would be unable to pass it to a KF_ACQUIRE | KF_TRUSTED_ARGS kfunc. The solution is to expand the set of pointers that are considered trusted according to KF_TRUSTED_ARGS, so that programs can invoke kfuncs with these pointers without getting rejected by the verifier. There is already a PTR_UNTRUSTED flag that is set in some scenarios, such as when a BPF program reads a kptr directly from a map without performing a bpf_kptr_xchg() call. These pointers of course can and should be rejected by the verifier. Unfortunately, however, PTR_UNTRUSTED does not cover all the cases for safety that need to be addressed to adequately protect kfuncs. Specifically, pointers obtained by a BPF program "walking" a struct are _not_ considered PTR_UNTRUSTED according to BPF. For example, say that we were to add a kfunc called bpf_task_acquire(), with KF_ACQUIRE | KF_TRUSTED_ARGS, to acquire a struct task_struct *. If we only used PTR_UNTRUSTED to signal that a task was unsafe to pass to a kfunc, the verifier would mistakenly allow the following unsafe BPF program to be loaded: SEC("tp_btf/task_newtask") int BPF_PROG(unsafe_acquire_task, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired, *nested; nested = task->last_wakee; /* Would not be rejected by the verifier. */ acquired = bpf_task_acquire(nested); if (!acquired) return 0; bpf_task_release(acquired); return 0; } To address this, this patch defines a new type flag called PTR_TRUSTED which tracks whether a PTR_TO_BTF_ID pointer is safe to pass to a KF_TRUSTED_ARGS kfunc or a BPF helper function. PTR_TRUSTED pointers are passed directly from the kernel as a tracepoint or struct_ops callback argument. Any nested pointer that is obtained from walking a PTR_TRUSTED pointer is no longer PTR_TRUSTED. From the example above, the struct task_struct *task argument is PTR_TRUSTED, but the 'nested' pointer obtained from 'task->last_wakee' is not PTR_TRUSTED. A subsequent patch will add kfuncs for storing a task kfunc as a kptr, and then another patch will add selftests to validate. Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20221120051004.3605026-3-void@manifault.com Signed-off-by: Alexei Starovoitov --- Documentation/bpf/kfuncs.rst | 30 +++++----- include/linux/bpf.h | 30 ++++++++++ include/linux/bpf_verifier.h | 7 +++ include/linux/btf.h | 65 ++++++++++++-------- kernel/bpf/btf.c | 8 +++ kernel/bpf/verifier.c | 69 ++++++++++++++++++---- kernel/trace/bpf_trace.c | 2 +- net/ipv4/bpf_tcp_ca.c | 4 +- tools/testing/selftests/bpf/verifier/calls.c | 2 +- .../testing/selftests/bpf/verifier/ref_tracking.c | 4 +- 10 files changed, 164 insertions(+), 57 deletions(-) (limited to 'net') diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst index 3b1501c3b6cd..90774479ab7a 100644 --- a/Documentation/bpf/kfuncs.rst +++ b/Documentation/bpf/kfuncs.rst @@ -161,22 +161,20 @@ KF_ACQUIRE and KF_RET_NULL flags. -------------------------- The KF_TRUSTED_ARGS flag is used for kfuncs taking pointer arguments. It -indicates that the all pointer arguments will always have a guaranteed lifetime, -and pointers to kernel objects are always passed to helpers in their unmodified -form (as obtained from acquire kfuncs). - -It can be used to enforce that a pointer to a refcounted object acquired from a -kfunc or BPF helper is passed as an argument to this kfunc without any -modifications (e.g. pointer arithmetic) such that it is trusted and points to -the original object. - -Meanwhile, it is also allowed pass pointers to normal memory to such kfuncs, -but those can have a non-zero offset. - -This flag is often used for kfuncs that operate (change some property, perform -some operation) on an object that was obtained using an acquire kfunc. Such -kfuncs need an unchanged pointer to ensure the integrity of the operation being -performed on the expected object. +indicates that the all pointer arguments are valid, and that all pointers to +BTF objects have been passed in their unmodified form (that is, at a zero +offset, and without having been obtained from walking another pointer). + +There are two types of pointers to kernel objects which are considered "valid": + +1. Pointers which are passed as tracepoint or struct_ops callback arguments. +2. Pointers which were returned from a KF_ACQUIRE or KF_KPTR_GET kfunc. + +Pointers to non-BTF objects (e.g. scalar pointers) may also be passed to +KF_TRUSTED_ARGS kfuncs, and may have a non-zero offset. + +The definition of "valid" pointers is subject to change at any time, and has +absolutely no ABI stability guarantees. 2.4.6 KF_SLEEPABLE flag ----------------------- diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8b32376ce746..c9eafa67f2a2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -543,6 +543,35 @@ enum bpf_type_flag { */ MEM_ALLOC = BIT(11 + BPF_BASE_TYPE_BITS), + /* PTR was passed from the kernel in a trusted context, and may be + * passed to KF_TRUSTED_ARGS kfuncs or BPF helper functions. + * Confusingly, this is _not_ the opposite of PTR_UNTRUSTED above. + * PTR_UNTRUSTED refers to a kptr that was read directly from a map + * without invoking bpf_kptr_xchg(). What we really need to know is + * whether a pointer is safe to pass to a kfunc or BPF helper function. + * While PTR_UNTRUSTED pointers are unsafe to pass to kfuncs and BPF + * helpers, they do not cover all possible instances of unsafe + * pointers. For example, a pointer that was obtained from walking a + * struct will _not_ get the PTR_UNTRUSTED type modifier, despite the + * fact that it may be NULL, invalid, etc. This is due to backwards + * compatibility requirements, as this was the behavior that was first + * introduced when kptrs were added. The behavior is now considered + * deprecated, and PTR_UNTRUSTED will eventually be removed. + * + * PTR_TRUSTED, on the other hand, is a pointer that the kernel + * guarantees to be valid and safe to pass to kfuncs and BPF helpers. + * For example, pointers passed to tracepoint arguments are considered + * PTR_TRUSTED, as are pointers that are passed to struct_ops + * callbacks. As alluded to above, pointers that are obtained from + * walking PTR_TRUSTED pointers are _not_ trusted. For example, if a + * struct task_struct *task is PTR_TRUSTED, then accessing + * task->last_wakee will lose the PTR_TRUSTED modifier when it's stored + * in a BPF register. Similarly, pointers passed to certain programs + * types such as kretprobes are not guaranteed to be valid, as they may + * for example contain an object that was recently freed. + */ + PTR_TRUSTED = BIT(12 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; @@ -636,6 +665,7 @@ enum bpf_return_type { RET_PTR_TO_RINGBUF_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_RINGBUF | RET_PTR_TO_MEM, RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, + RET_PTR_TO_BTF_ID_TRUSTED = PTR_TRUSTED | RET_PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 608dde740fef..545152ac136c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -680,4 +680,11 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) } } +#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED) + +static inline bool bpf_type_has_unsafe_modifiers(u32 type) +{ + return type_flag(type) & ~BPF_REG_TRUSTED_MODIFIERS; +} + #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/include/linux/btf.h b/include/linux/btf.h index d5b26380a60f..d38aa4251c28 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -19,36 +19,53 @@ #define KF_RELEASE (1 << 1) /* kfunc is a release function */ #define KF_RET_NULL (1 << 2) /* kfunc returns a pointer that may be NULL */ #define KF_KPTR_GET (1 << 3) /* kfunc returns reference to a kptr */ -/* Trusted arguments are those which are meant to be referenced arguments with - * unchanged offset. It is used to enforce that pointers obtained from acquire - * kfuncs remain unmodified when being passed to helpers taking trusted args. +/* Trusted arguments are those which are guaranteed to be valid when passed to + * the kfunc. It is used to enforce that pointers obtained from either acquire + * kfuncs, or from the main kernel on a tracepoint or struct_ops callback + * invocation, remain unmodified when being passed to helpers taking trusted + * args. * - * Consider - * struct foo { - * int data; - * struct foo *next; - * }; + * Consider, for example, the following new task tracepoint: * - * struct bar { - * int data; - * struct foo f; - * }; + * SEC("tp_btf/task_newtask") + * int BPF_PROG(new_task_tp, struct task_struct *task, u64 clone_flags) + * { + * ... + * } * - * struct foo *f = alloc_foo(); // Acquire kfunc - * struct bar *b = alloc_bar(); // Acquire kfunc + * And the following kfunc: * - * If a kfunc set_foo_data() wants to operate only on the allocated object, it - * will set the KF_TRUSTED_ARGS flag, which will prevent unsafe usage like: + * BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) * - * set_foo_data(f, 42); // Allowed - * set_foo_data(f->next, 42); // Rejected, non-referenced pointer - * set_foo_data(&f->next, 42);// Rejected, referenced, but wrong type - * set_foo_data(&b->f, 42); // Rejected, referenced, but bad offset + * All invocations to the kfunc must pass the unmodified, unwalked task: * - * In the final case, usually for the purposes of type matching, it is deduced - * by looking at the type of the member at the offset, but due to the - * requirement of trusted argument, this deduction will be strict and not done - * for this case. + * bpf_task_acquire(task); // Allowed + * bpf_task_acquire(task->last_wakee); // Rejected, walked task + * + * Programs may also pass referenced tasks directly to the kfunc: + * + * struct task_struct *acquired; + * + * acquired = bpf_task_acquire(task); // Allowed, same as above + * bpf_task_acquire(acquired); // Allowed + * bpf_task_acquire(task); // Allowed + * bpf_task_acquire(acquired->last_wakee); // Rejected, walked task + * + * Programs may _not_, however, pass a task from an arbitrary fentry/fexit, or + * kprobe/kretprobe to the kfunc, as BPF cannot guarantee that all of these + * pointers are guaranteed to be safe. For example, the following BPF program + * would be rejected: + * + * SEC("kretprobe/free_task") + * int BPF_PROG(free_task_probe, struct task_struct *tsk) + * { + * struct task_struct *acquired; + * + * acquired = bpf_task_acquire(acquired); // Rejected, not a trusted pointer + * bpf_task_release(acquired); + * + * return 0; + * } */ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index f7d5fab61535..d52054ec69c9 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5799,6 +5799,11 @@ static u32 get_ctx_arg_idx(struct btf *btf, const struct btf_type *func_proto, return nr_args + 1; } +static bool prog_type_args_trusted(enum bpf_prog_type prog_type) +{ + return prog_type == BPF_PROG_TYPE_TRACING || prog_type == BPF_PROG_TYPE_STRUCT_OPS; +} + bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -5942,6 +5947,9 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, } info->reg_type = PTR_TO_BTF_ID; + if (prog_type_args_trusted(prog->type)) + info->reg_type |= PTR_TRUSTED; + if (tgt_prog) { enum bpf_prog_type tgt_type; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 67a6f11d953c..5bc9d84d7924 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -589,12 +589,13 @@ static const char *reg_type_str(struct bpf_verifier_env *env, strncpy(postfix, "_or_null", 16); } - snprintf(prefix, sizeof(prefix), "%s%s%s%s%s", + snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s", type & MEM_RDONLY ? "rdonly_" : "", type & MEM_RINGBUF ? "ringbuf_" : "", type & MEM_USER ? "user_" : "", type & MEM_PERCPU ? "percpu_" : "", - type & PTR_UNTRUSTED ? "untrusted_" : "" + type & PTR_UNTRUSTED ? "untrusted_" : "", + type & PTR_TRUSTED ? "trusted_" : "" ); snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s", @@ -3856,7 +3857,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno) { const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id); - int perm_flags = PTR_MAYBE_NULL; + int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED; const char *reg_name = ""; /* Only unreferenced case accepts untrusted pointers */ @@ -4732,6 +4733,9 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (type_flag(reg->type) & PTR_UNTRUSTED) flag |= PTR_UNTRUSTED; + /* Any pointer obtained from walking a trusted pointer is no longer trusted. */ + flag &= ~PTR_TRUSTED; + if (atype == BPF_READ && value_regno >= 0) mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); @@ -5844,6 +5848,7 @@ static const struct bpf_reg_types btf_id_sock_common_types = { PTR_TO_TCP_SOCK, PTR_TO_XDP_SOCK, PTR_TO_BTF_ID, + PTR_TO_BTF_ID | PTR_TRUSTED, }, .btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], }; @@ -5884,8 +5889,18 @@ static const struct bpf_reg_types scalar_types = { .types = { SCALAR_VALUE } }; static const struct bpf_reg_types context_types = { .types = { PTR_TO_CTX } }; static const struct bpf_reg_types ringbuf_mem_types = { .types = { PTR_TO_MEM | MEM_RINGBUF } }; static const struct bpf_reg_types const_map_ptr_types = { .types = { CONST_PTR_TO_MAP } }; -static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } }; -static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_BTF_ID | MEM_PERCPU } }; +static const struct bpf_reg_types btf_ptr_types = { + .types = { + PTR_TO_BTF_ID, + PTR_TO_BTF_ID | PTR_TRUSTED, + }, +}; +static const struct bpf_reg_types percpu_btf_ptr_types = { + .types = { + PTR_TO_BTF_ID | MEM_PERCPU, + PTR_TO_BTF_ID | MEM_PERCPU | PTR_TRUSTED, + } +}; static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } }; static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } }; static const struct bpf_reg_types const_str_ptr_types = { .types = { PTR_TO_MAP_VALUE } }; @@ -5973,7 +5988,7 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno, return -EACCES; found: - if (reg->type == PTR_TO_BTF_ID) { + if (reg->type == PTR_TO_BTF_ID || reg->type & PTR_TRUSTED) { /* For bpf_sk_release, it needs to match against first member * 'struct sock_common', hence make an exception for it. This * allows bpf_sk_release to work for multiple socket types. @@ -6055,6 +6070,8 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, */ case PTR_TO_BTF_ID: case PTR_TO_BTF_ID | MEM_ALLOC: + case PTR_TO_BTF_ID | PTR_TRUSTED: + case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED: /* When referenced PTR_TO_BTF_ID is passed to release function, * it's fixed offset must be 0. In the other cases, fixed offset * can be non-zero. @@ -7939,6 +7956,25 @@ static bool is_kfunc_arg_kptr_get(struct bpf_kfunc_call_arg_meta *meta, int arg) return arg == 0 && (meta->kfunc_flags & KF_KPTR_GET); } +static bool is_trusted_reg(const struct bpf_reg_state *reg) +{ + /* A referenced register is always trusted. */ + if (reg->ref_obj_id) + return true; + + /* If a register is not referenced, it is trusted if it has either the + * MEM_ALLOC or PTR_TRUSTED type modifiers, and no others. Some of the + * other type modifiers may be safe, but we elect to take an opt-in + * approach here as some (e.g. PTR_UNTRUSTED and PTR_MAYBE_NULL) are + * not. + * + * Eventually, we should make PTR_TRUSTED the single source of truth + * for whether a register is trusted. + */ + return type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS && + !bpf_type_has_unsafe_modifiers(reg->type); +} + static bool __kfunc_param_match_suffix(const struct btf *btf, const struct btf_param *arg, const char *suffix) @@ -8220,7 +8256,7 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env, const char *reg_ref_tname; u32 reg_ref_id; - if (reg->type == PTR_TO_BTF_ID) { + if (base_type(reg->type) == PTR_TO_BTF_ID) { reg_btf = reg->btf; reg_ref_id = reg->btf_id; } else { @@ -8366,6 +8402,7 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_ ptr = reg->map_ptr; break; case PTR_TO_BTF_ID | MEM_ALLOC: + case PTR_TO_BTF_ID | MEM_ALLOC | PTR_TRUSTED: ptr = reg->btf; break; default: @@ -8596,8 +8633,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ case KF_ARG_PTR_TO_BTF_ID: if (!is_kfunc_trusted_args(meta)) break; - if (!reg->ref_obj_id) { - verbose(env, "R%d must be referenced\n", regno); + + if (!is_trusted_reg(reg)) { + verbose(env, "R%d must be referenced or trusted\n", regno); return -EINVAL; } fallthrough; @@ -8702,9 +8740,13 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ break; case KF_ARG_PTR_TO_BTF_ID: /* Only base_type is checked, further checks are done here */ - if (reg->type != PTR_TO_BTF_ID && - (!reg2btf_ids[base_type(reg->type)] || type_flag(reg->type))) { - verbose(env, "arg#%d expected pointer to btf or socket\n", i); + if ((base_type(reg->type) != PTR_TO_BTF_ID || + bpf_type_has_unsafe_modifiers(reg->type)) && + !reg2btf_ids[base_type(reg->type)]) { + verbose(env, "arg#%d is %s ", i, reg_type_str(env, reg->type)); + verbose(env, "expected %s or socket\n", + reg_type_str(env, base_type(reg->type) | + (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS))); return -EINVAL; } ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i); @@ -14713,6 +14755,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) break; case PTR_TO_BTF_ID: case PTR_TO_BTF_ID | PTR_UNTRUSTED: + case PTR_TO_BTF_ID | PTR_TRUSTED: /* PTR_TO_BTF_ID | MEM_ALLOC always has a valid lifetime, unlike * PTR_TO_BTF_ID, and an active ref_obj_id, but the same cannot * be said once it is marked PTR_UNTRUSTED, hence we must handle @@ -14720,6 +14763,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) * for this case. */ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED: + case PTR_TO_BTF_ID | PTR_UNTRUSTED | PTR_TRUSTED: + case PTR_TO_BTF_ID | PTR_UNTRUSTED | MEM_ALLOC | PTR_TRUSTED: if (type == BPF_READ) { insn->code = BPF_LDX | BPF_PROBE_MEM | BPF_SIZE((insn)->code); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f2d8d070d024..5b9008bc597b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -774,7 +774,7 @@ BPF_CALL_0(bpf_get_current_task_btf) const struct bpf_func_proto bpf_get_current_task_btf_proto = { .func = bpf_get_current_task_btf, .gpl_only = true, - .ret_type = RET_PTR_TO_BTF_ID, + .ret_type = RET_PTR_TO_BTF_ID_TRUSTED, .ret_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], }; diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index d15c91de995f..4517d2bd186a 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -61,7 +61,9 @@ static bool bpf_tcp_ca_is_valid_access(int off, int size, if (!bpf_tracing_btf_ctx_access(off, size, type, prog, info)) return false; - if (info->reg_type == PTR_TO_BTF_ID && info->btf_id == sock_id) + if (base_type(info->reg_type) == PTR_TO_BTF_ID && + !bpf_type_has_unsafe_modifiers(info->reg_type) && + info->btf_id == sock_id) /* promote it to tcp_sock */ info->btf_id = tcp_sock_id; diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c index 86d6fef2e3b4..3193915c5ee6 100644 --- a/tools/testing/selftests/bpf/verifier/calls.c +++ b/tools/testing/selftests/bpf/verifier/calls.c @@ -109,7 +109,7 @@ }, .prog_type = BPF_PROG_TYPE_SCHED_CLS, .result = REJECT, - .errstr = "arg#0 expected pointer to btf or socket", + .errstr = "arg#0 is ptr_or_null_ expected ptr_ or socket", .fixup_kfunc_btf_id = { { "bpf_kfunc_call_test_acquire", 3 }, { "bpf_kfunc_call_test_release", 5 }, diff --git a/tools/testing/selftests/bpf/verifier/ref_tracking.c b/tools/testing/selftests/bpf/verifier/ref_tracking.c index 55cba01c99d5..9540164712b7 100644 --- a/tools/testing/selftests/bpf/verifier/ref_tracking.c +++ b/tools/testing/selftests/bpf/verifier/ref_tracking.c @@ -142,7 +142,7 @@ .kfunc = "bpf", .expected_attach_type = BPF_LSM_MAC, .flags = BPF_F_SLEEPABLE, - .errstr = "arg#0 expected pointer to btf or socket", + .errstr = "arg#0 is ptr_or_null_ expected ptr_ or socket", .fixup_kfunc_btf_id = { { "bpf_lookup_user_key", 2 }, { "bpf_key_put", 4 }, @@ -163,7 +163,7 @@ .kfunc = "bpf", .expected_attach_type = BPF_LSM_MAC, .flags = BPF_F_SLEEPABLE, - .errstr = "arg#0 expected pointer to btf or socket", + .errstr = "arg#0 is ptr_or_null_ expected ptr_ or socket", .fixup_kfunc_btf_id = { { "bpf_lookup_system_key", 1 }, { "bpf_key_put", 3 }, -- cgit v1.2.3-70-g09d2 From 7a7160edf1bfde25422262fb26851cef65f695d3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 18 Nov 2022 10:25:06 -0800 Subject: net: Return errno in sk->sk_prot->get_port(). We assume the correct errno is -EADDRINUSE when sk->sk_prot->get_port() fails, so some ->get_port() functions return just 1 on failure and the callers return -EADDRINUSE instead. However, mptcp_get_port() can return -EINVAL. Let's not ignore the error. Note the only exception is inet_autobind(), all of whose callers return -EAGAIN instead. Fixes: cec37a6e41aa ("mptcp: Handle MP_CAPABLE options for outgoing connections") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 4 ++-- net/ipv4/inet_connection_sock.c | 7 ++++--- net/ipv4/ping.c | 2 +- net/ipv4/udp.c | 2 +- net/ipv6/af_inet6.c | 4 ++-- 5 files changed, 10 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 378bcd777514..5b4d86701822 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -522,9 +522,9 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, /* Make sure we are allowed to bind here. */ if (snum || !(inet->bind_address_no_port || (flags & BIND_FORCE_ADDRESS_NO_PORT))) { - if (sk->sk_prot->get_port(sk, snum)) { + err = sk->sk_prot->get_port(sk, snum); + if (err) { inet->inet_saddr = inet->inet_rcv_saddr = 0; - err = -EADDRINUSE; goto out_release_sock; } if (!(flags & BIND_FROM_BPF)) { diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 4e84ed21d16f..4a34bc7cb15e 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -471,11 +471,11 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; bool found_port = false, check_bind_conflict = true; bool bhash_created = false, bhash2_created = false; + int ret = -EADDRINUSE, port = snum, l3mdev; struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2 = NULL; struct inet_bind_bucket *tb = NULL; bool head2_lock_acquired = false; - int ret = 1, port = snum, l3mdev; struct net *net = sock_net(sk); l3mdev = inet_sk_bound_l3mdev(sk); @@ -1186,7 +1186,7 @@ int inet_csk_listen_start(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); - int err = -EADDRINUSE; + int err; reqsk_queue_alloc(&icsk->icsk_accept_queue); @@ -1202,7 +1202,8 @@ int inet_csk_listen_start(struct sock *sk) * after validation is complete. */ inet_sk_state_store(sk, TCP_LISTEN); - if (!sk->sk_prot->get_port(sk, inet->inet_num)) { + err = sk->sk_prot->get_port(sk, inet->inet_num); + if (!err) { inet->inet_sport = htons(inet->inet_num); sk_dst_reset(sk); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index bde333b24837..bb9854c2b7a1 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -138,7 +138,7 @@ next_port: fail: spin_unlock(&ping_table.lock); - return 1; + return -EADDRINUSE; } EXPORT_SYMBOL_GPL(ping_get_port); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1fb7d1ed1cb1..9592fe3e444a 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -240,7 +240,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2; struct net *net = sock_net(sk); - int error = 1; + int error = -EADDRINUSE; if (!snum) { DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 68075295d587..fee9163382c2 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -410,10 +410,10 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, /* Make sure we are allowed to bind here. */ if (snum || !(inet->bind_address_no_port || (flags & BIND_FORCE_ADDRESS_NO_PORT))) { - if (sk->sk_prot->get_port(sk, snum)) { + err = sk->sk_prot->get_port(sk, snum); + if (err) { sk->sk_ipv6only = saved_ipv6only; inet_reset_saddr(sk); - err = -EADDRINUSE; goto out; } if (!(flags & BIND_FROM_BPF)) { -- cgit v1.2.3-70-g09d2 From 976d302fb6165ad620778d7ba834cde6e3fe9f9f Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Nov 2022 10:46:07 -0800 Subject: mptcp: deduplicate error paths on endpoint creation When endpoint creation fails, we need to free the newly allocated entry and eventually destroy the paired mptcp listener socket. Consolidate such action in a single point let all the errors path reach it. Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- net/mptcp/pm_netlink.c | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 9813ed0fde9b..fdf2ee29f762 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1003,16 +1003,12 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, return err; msk = mptcp_sk(entry->lsk->sk); - if (!msk) { - err = -EINVAL; - goto out; - } + if (!msk) + return -EINVAL; ssock = __mptcp_nmpc_socket(msk); - if (!ssock) { - err = -EINVAL; - goto out; - } + if (!ssock) + return -EINVAL; mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family); #if IS_ENABLED(CONFIG_MPTCP_IPV6) @@ -1022,20 +1018,16 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, err = kernel_bind(ssock, (struct sockaddr *)&addr, addrlen); if (err) { pr_warn("kernel_bind error, err=%d", err); - goto out; + return err; } err = kernel_listen(ssock, backlog); if (err) { pr_warn("kernel_listen error, err=%d", err); - goto out; + return err; } return 0; - -out: - sock_release(entry->lsk); - return err; } int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) @@ -1327,7 +1319,7 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } - entry = kmalloc(sizeof(*entry), GFP_KERNEL_ACCOUNT); + entry = kzalloc(sizeof(*entry), GFP_KERNEL_ACCOUNT); if (!entry) { GENL_SET_ERR_MSG(info, "can't allocate addr"); return -ENOMEM; @@ -1338,22 +1330,21 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info) ret = mptcp_pm_nl_create_listen_socket(skb->sk, entry); if (ret) { GENL_SET_ERR_MSG(info, "create listen socket error"); - kfree(entry); - return ret; + goto out_free; } } ret = mptcp_pm_nl_append_new_local_addr(pernet, entry); if (ret < 0) { GENL_SET_ERR_MSG(info, "too many addresses or duplicate one"); - if (entry->lsk) - sock_release(entry->lsk); - kfree(entry); - return ret; + goto out_free; } mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk)); - return 0; + +out_free: + __mptcp_pm_release_addr_entry(entry); + return ret; } int mptcp_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, -- cgit v1.2.3-70-g09d2 From a3400e8746b626531099e4d9fd8eac41be066683 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Nov 2022 10:46:08 -0800 Subject: mptcp: more detailed error reporting on endpoint creation Endpoint creation can fail for a number of reasons; in case of failure append the error number to the extended ack message, using a newly introduced generic helper. Additionally let mptcp_pm_nl_append_new_local_addr() report different error reasons. Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/genetlink.h | 3 +++ net/mptcp/pm_netlink.c | 24 +++++++++++++----------- 2 files changed, 16 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index d21210709f84..ed4622dd4828 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -125,6 +125,9 @@ static inline void genl_info_net_set(struct genl_info *info, struct net *net) #define GENL_SET_ERR_MSG(info, msg) NL_SET_ERR_MSG((info)->extack, msg) +#define GENL_SET_ERR_MSG_FMT(info, msg, args...) \ + NL_SET_ERR_MSG_FMT((info)->extack, msg, ##args) + /* Report that a root attribute is missing */ #define GENL_REQ_ATTR_CHECK(info, attr) ({ \ struct genl_info *__info = (info); \ diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index fdf2ee29f762..d66fbd558263 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -912,10 +912,14 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, */ if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID) pernet->next_id = 1; - if (pernet->addrs >= MPTCP_PM_ADDR_MAX) + if (pernet->addrs >= MPTCP_PM_ADDR_MAX) { + ret = -ERANGE; goto out; - if (test_bit(entry->addr.id, pernet->id_bitmap)) + } + if (test_bit(entry->addr.id, pernet->id_bitmap)) { + ret = -EBUSY; goto out; + } /* do not insert duplicate address, differentiate on port only * singled addresses @@ -929,8 +933,10 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, * endpoint is an implicit one and the user-space * did not provide an endpoint id */ - if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)) + if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)) { + ret = -EEXIST; goto out; + } if (entry->addr.id) goto out; @@ -1016,16 +1022,12 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, addrlen = sizeof(struct sockaddr_in6); #endif err = kernel_bind(ssock, (struct sockaddr *)&addr, addrlen); - if (err) { - pr_warn("kernel_bind error, err=%d", err); + if (err) return err; - } err = kernel_listen(ssock, backlog); - if (err) { - pr_warn("kernel_listen error, err=%d", err); + if (err) return err; - } return 0; } @@ -1329,13 +1331,13 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info) if (entry->addr.port) { ret = mptcp_pm_nl_create_listen_socket(skb->sk, entry); if (ret) { - GENL_SET_ERR_MSG(info, "create listen socket error"); + GENL_SET_ERR_MSG_FMT(info, "create listen socket error: %d", ret); goto out_free; } } ret = mptcp_pm_nl_append_new_local_addr(pernet, entry); if (ret < 0) { - GENL_SET_ERR_MSG(info, "too many addresses or duplicate one"); + GENL_SET_ERR_MSG_FMT(info, "too many addresses or duplicate one: %d", ret); goto out_free; } -- cgit v1.2.3-70-g09d2 From 114039b342014680911c35bd6b72624180fd669a Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 21 Nov 2022 10:03:39 -0800 Subject: bpf: Move skb->len == 0 checks into __bpf_redirect To avoid potentially breaking existing users. Both mac/no-mac cases have to be amended; mac_header >= network_header is not enough (verified with a new test, see next patch). Fixes: fd1894224407 ("bpf: Don't redirect packets with invalid pkt_len") Signed-off-by: Stanislav Fomichev Link: https://lore.kernel.org/r/20221121180340.1983627-1-sdf@google.com Signed-off-by: Martin KaFai Lau --- net/bpf/test_run.c | 3 --- net/core/filter.c | 11 ++++++----- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 13d578ce2a09..6fba440efc40 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -979,9 +979,6 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) { struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; - if (!skb->len) - return -EINVAL; - if (!__skb) return 0; diff --git a/net/core/filter.c b/net/core/filter.c index 2a105fb1ceb2..b6e1b81cdfae 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2124,12 +2124,13 @@ static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, { unsigned int mlen = skb_network_offset(skb); + if (unlikely(skb->len <= mlen)) { + kfree_skb(skb); + return -ERANGE; + } + if (mlen) { __skb_pull(skb, mlen); - if (unlikely(!skb->len)) { - kfree_skb(skb); - return -ERANGE; - } /* At ingress, the mac header has already been pulled once. * At egress, skb_pospull_rcsum has to be done in case that @@ -2149,7 +2150,7 @@ static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, u32 flags) { /* Verify that a link layer header is carried */ - if (unlikely(skb->mac_header >= skb->network_header)) { + if (unlikely(skb->mac_header >= skb->network_header || skb->len == 0)) { kfree_skb(skb); return -ERANGE; } -- cgit v1.2.3-70-g09d2 From 32634819ad37290b5d5a84bf8b71ef5e972c4a20 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 18 Nov 2022 04:38:43 +0000 Subject: net: fix __sock_gen_cookie() I was mistaken how atomic64_try_cmpxchg(&sk_cookie, &res, new) is working. I was assuming @res would contain the final sk_cookie value, regardless of the success of our cmpxchg() We could do something like: if (atomic64_try_cmpxchg(&sk_cookie, &res, new) res = new; But we can avoid a conditional and read sk_cookie again. atomic64_cmpxchg(&sk_cookie, res, new); res = atomic64_read(&sk_cookie); Reported-by: coverity-bot Addresses-Coverity-ID: 1527347 ("Error handling issues") Fixes: 4ebf802cf1c6 ("net: __sock_gen_cookie() cleanup") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20221118043843.3703186-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/sock_diag.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index b11593cae5a0..b1e29e18d1d6 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -30,7 +30,10 @@ u64 __sock_gen_cookie(struct sock *sk) if (!res) { u64 new = gen_cookie_next(&sock_cookie); - atomic64_try_cmpxchg(&sk->sk_cookie, &res, new); + atomic64_cmpxchg(&sk->sk_cookie, res, new); + + /* Another thread might have changed sk_cookie before us. */ + res = atomic64_read(&sk->sk_cookie); } return res; } -- cgit v1.2.3-70-g09d2 From 98cbc40e4f7d15bc21a314a151071566e14ca39c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 15 Nov 2022 16:26:07 +0300 Subject: netfilter: nft_inner: fix IS_ERR() vs NULL check The __nft_expr_type_get() function returns NULL on error. It never returns error pointers. Fixes: 3a07327d10a0 ("netfilter: nft_inner: support for inner tunnel header matching") Signed-off-by: Dan Carpenter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2fa52b8d5ce1..833850a4780f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2873,8 +2873,8 @@ int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, return -EINVAL; type = __nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]); - if (IS_ERR(type)) - return PTR_ERR(type); + if (!type) + return -ENOENT; if (!type->inner_ops) return -EOPNOTSUPP; -- cgit v1.2.3-70-g09d2 From c90b6b1005ec7423c7d0063eb27a9728498f6ec8 Mon Sep 17 00:00:00 2001 From: Saeed Mahameed Date: Tue, 22 Nov 2022 10:41:58 -0800 Subject: tcp: Fix build break when CONFIG_IPV6=n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cited commit caused the following build break when CONFIG_IPV6 was disabled net/ipv4/tcp_input.c: In function ‘tcp_syn_flood_action’: include/net/sock.h:387:37: error: ‘const struct sock_common’ has no member named ‘skc_v6_rcv_saddr’; did you mean ‘skc_rcv_saddr’? Fix by using inet6_rcv_saddr() macro which handles this situation nicely. Fixes: d9282e48c608 ("tcp: Add listening address to SYN flood message") Signed-off-by: Saeed Mahameed Reported-by: Geert Uytterhoeven CC: Matthieu Baerts CC: Jamie Bainbridge Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20221122184158.170798-1-saeed@kernel.org Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0ae291e53eab..1efacbe948da 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6845,7 +6845,7 @@ static bool tcp_syn_flood_action(const struct sock *sk, const char *proto) xchg(&queue->synflood_warned, 1) == 0) { if (IS_ENABLED(CONFIG_IPV6) && sk->sk_family == AF_INET6) { net_info_ratelimited("%s: Possible SYN flooding on port [%pI6c]:%u. %s.\n", - proto, &sk->sk_v6_rcv_saddr, + proto, inet6_rcv_saddr(sk), sk->sk_num, msg); } else { net_info_ratelimited("%s: Possible SYN flooding on port %pI4:%u. %s.\n", -- cgit v1.2.3-70-g09d2 From 815bc3ac75e9e34727f8ca78380266f34a3b6c66 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Sun, 20 Nov 2022 10:36:52 +0200 Subject: devlink: remove redundant health state set to error Reporter health_state is set twice to error in devlink_health_report(). Remove second time as it is redundant. Signed-off-by: Moshe Shemesh Reviewed-by: Eran Ben Elisha Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/1668933412-5498-1-git-send-email-moshe@nvidia.com Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index d93bc95cd7cb..cea154ddce7a 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -7846,8 +7846,6 @@ int devlink_health_report(struct devlink_health_reporter *reporter, return -ECANCELED; } - reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR; - if (reporter->auto_dump) { mutex_lock(&reporter->dump_lock); /* store current dump of current error, for later analysis */ -- cgit v1.2.3-70-g09d2 From c5fb8ead3283955dc68671f853017b181f96fdc1 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Nov 2022 15:55:39 +0200 Subject: net: dsa: unexport dsa_dev_to_net_device() dsa.o and dsa2.o are linked into the same dsa_core.o, there is no reason to export this symbol when its only caller is local. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 2 -- net/dsa/dsa.c | 1 - net/dsa/dsa_priv.h | 2 ++ 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 82da44561f4c..d5bfcb63d4c2 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1285,8 +1285,6 @@ struct dsa_switch_driver { const struct dsa_switch_ops *ops; }; -struct net_device *dsa_dev_to_net_device(struct device *dev); - bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 4afd3edbd64d..07158c7560b5 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -182,7 +182,6 @@ struct net_device *dsa_dev_to_net_device(struct device *dev) return NULL; } -EXPORT_SYMBOL_GPL(dsa_dev_to_net_device); /* Determine if we should defer delivery of skb until we have a rx timestamp. * diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 24e0ea218a35..b60987e8d931 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -247,6 +247,8 @@ const struct dsa_device_ops *dsa_tag_driver_get_by_id(int tag_protocol); const struct dsa_device_ops *dsa_tag_driver_get_by_name(const char *name); void dsa_tag_driver_put(const struct dsa_device_ops *ops); +struct net_device *dsa_dev_to_net_device(struct device *dev); + bool dsa_db_equal(const struct dsa_db *a, const struct dsa_db *b); bool dsa_schedule_work(struct work_struct *work); -- cgit v1.2.3-70-g09d2 From d2be320495b93ffb469e53100ba1668d0cd7eedc Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Nov 2022 15:55:40 +0200 Subject: net: dsa: modularize DSA_TAG_PROTO_NONE There is no reason that I can see why the no-op tagging protocol should be registered manually, so make it a module and make all drivers which have any sort of reference to DSA_TAG_PROTO_NONE select it. Note that I don't know if ksz_get_tag_protocol() really needs this, or if it's just the logic which is poorly written. All switches seem to have their own tagging protocol, and DSA_TAG_PROTO_NONE is just a fallback that never gets used. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/Kconfig | 2 ++ drivers/net/dsa/b53/Kconfig | 1 + drivers/net/dsa/microchip/Kconfig | 1 + net/dsa/Kconfig | 6 ++++++ net/dsa/Makefile | 1 + net/dsa/dsa.c | 21 --------------------- net/dsa/dsa_priv.h | 1 - net/dsa/tag_none.c | 30 ++++++++++++++++++++++++++++++ 8 files changed, 41 insertions(+), 22 deletions(-) create mode 100644 net/dsa/tag_none.c (limited to 'net') diff --git a/drivers/net/dsa/Kconfig b/drivers/net/dsa/Kconfig index 07507b4820d7..c26755f662c1 100644 --- a/drivers/net/dsa/Kconfig +++ b/drivers/net/dsa/Kconfig @@ -18,6 +18,7 @@ config NET_DSA_BCM_SF2 config NET_DSA_LOOP tristate "DSA mock-up Ethernet switch chip support" + select NET_DSA_TAG_NONE select FIXED_PHY help This enables support for a fake mock-up switch chip which @@ -99,6 +100,7 @@ config NET_DSA_SMSC_LAN9303_MDIO config NET_DSA_VITESSE_VSC73XX tristate + select NET_DSA_TAG_NONE select FIXED_PHY select VITESSE_PHY select GPIOLIB diff --git a/drivers/net/dsa/b53/Kconfig b/drivers/net/dsa/b53/Kconfig index 90b525160b71..ebaa4a80d544 100644 --- a/drivers/net/dsa/b53/Kconfig +++ b/drivers/net/dsa/b53/Kconfig @@ -2,6 +2,7 @@ menuconfig B53 tristate "Broadcom BCM53xx managed switch support" depends on NET_DSA + select NET_DSA_TAG_NONE select NET_DSA_TAG_BRCM select NET_DSA_TAG_BRCM_LEGACY select NET_DSA_TAG_BRCM_PREPEND diff --git a/drivers/net/dsa/microchip/Kconfig b/drivers/net/dsa/microchip/Kconfig index 06b1efdb5e7d..913f83ef013c 100644 --- a/drivers/net/dsa/microchip/Kconfig +++ b/drivers/net/dsa/microchip/Kconfig @@ -3,6 +3,7 @@ menuconfig NET_DSA_MICROCHIP_KSZ_COMMON tristate "Microchip KSZ8795/KSZ9477/LAN937x series switch support" depends on NET_DSA select NET_DSA_TAG_KSZ + select NET_DSA_TAG_NONE help This driver adds support for Microchip KSZ9477 series switch and KSZ8795/KSZ88x3 switch chips. diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 3eef72ce99a4..8e698bea99a3 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -18,6 +18,12 @@ if NET_DSA # Drivers must select the appropriate tagging format(s) +config NET_DSA_TAG_NONE + tristate "No-op tag driver" + help + Say Y or M if you want to enable support for switches which don't tag + frames over the CPU port. + config NET_DSA_TAG_AR9331 tristate "Tag driver for Atheros AR9331 SoC with built-in switch" help diff --git a/net/dsa/Makefile b/net/dsa/Makefile index bf57ef3bce2a..14e05ab64135 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_NET_DSA_TAG_HELLCREEK) += tag_hellcreek.o obj-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o obj-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o +obj-$(CONFIG_NET_DSA_TAG_NONE) += tag_none.o obj-$(CONFIG_NET_DSA_TAG_OCELOT) += tag_ocelot.o obj-$(CONFIG_NET_DSA_TAG_OCELOT_8021Q) += tag_ocelot_8021q.o obj-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 07158c7560b5..e609d64a2216 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -18,22 +18,6 @@ static LIST_HEAD(dsa_tag_drivers_list); static DEFINE_MUTEX(dsa_tag_drivers_lock); -static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb, - struct net_device *dev) -{ - /* Just return the original SKB */ - return skb; -} - -static const struct dsa_device_ops none_ops = { - .name = "none", - .proto = DSA_TAG_PROTO_NONE, - .xmit = dsa_slave_notag_xmit, - .rcv = NULL, -}; - -DSA_TAG_DRIVER(none_ops); - static void dsa_tag_driver_register(struct dsa_tag_driver *dsa_tag_driver, struct module *owner) { @@ -551,9 +535,6 @@ static int __init dsa_init_module(void) dev_add_pack(&dsa_pack_type); - dsa_tag_driver_register(&DSA_TAG_DRIVER_NAME(none_ops), - THIS_MODULE); - rc = rtnl_link_register(&dsa_link_ops); if (rc) goto netlink_register_fail; @@ -561,7 +542,6 @@ static int __init dsa_init_module(void) return 0; netlink_register_fail: - dsa_tag_driver_unregister(&DSA_TAG_DRIVER_NAME(none_ops)); dsa_slave_unregister_notifier(); dev_remove_pack(&dsa_pack_type); register_notifier_fail: @@ -574,7 +554,6 @@ module_init(dsa_init_module); static void __exit dsa_cleanup_module(void) { rtnl_link_unregister(&dsa_link_ops); - dsa_tag_driver_unregister(&DSA_TAG_DRIVER_NAME(none_ops)); dsa_slave_unregister_notifier(); dev_remove_pack(&dsa_pack_type); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index b60987e8d931..c4ea5fda8f14 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -384,7 +384,6 @@ int dsa_port_change_master(struct dsa_port *dp, struct net_device *master, struct netlink_ext_ack *extack); /* slave.c */ -extern const struct dsa_device_ops notag_netdev_ops; extern struct notifier_block dsa_slave_switchdev_notifier; extern struct notifier_block dsa_slave_switchdev_blocking_notifier; diff --git a/net/dsa/tag_none.c b/net/dsa/tag_none.c new file mode 100644 index 000000000000..34a13c50d245 --- /dev/null +++ b/net/dsa/tag_none.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * net/dsa/tag_none.c - Traffic handling for switches with no tag + * Copyright (c) 2008-2009 Marvell Semiconductor + * Copyright (c) 2013 Florian Fainelli + * + * WARNING: do not use this for new switches. In case of no hardware + * tagging support, look at tag_8021q.c instead. + */ + +#include "dsa_priv.h" + +#define NONE_NAME "none" + +static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + /* Just return the original SKB */ + return skb; +} + +static const struct dsa_device_ops none_ops = { + .name = NONE_NAME, + .proto = DSA_TAG_PROTO_NONE, + .xmit = dsa_slave_notag_xmit, +}; + +module_dsa_tag_driver(none_ops); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_NONE, NONE_NAME); +MODULE_LICENSE("GPL"); -- cgit v1.2.3-70-g09d2 From 5cf2c75b5b91bcf81d61b2d2ea1c71363bcacf89 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Nov 2022 15:55:41 +0200 Subject: net: dsa: move bulk of devlink code to devlink.{c,h} dsa.c and dsa2.c are bloated with too much off-topic code. Identify all code related to devlink and move it to a new devlink.c file. Steer clear of the dsa_priv.h dumping ground antipattern and create a dedicated devlink.h for it, which will be included only by the C files which need it. Usage of dsa_priv.h will be minimized in later patches. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- net/dsa/Makefile | 1 + net/dsa/devlink.c | 355 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ net/dsa/devlink.h | 13 ++ net/dsa/dsa.c | 107 ---------------- net/dsa/dsa2.c | 240 +----------------------------------- 5 files changed, 370 insertions(+), 346 deletions(-) create mode 100644 net/dsa/devlink.c create mode 100644 net/dsa/devlink.h (limited to 'net') diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 14e05ab64135..bc872c0d7011 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -2,6 +2,7 @@ # the core obj-$(CONFIG_NET_DSA) += dsa_core.o dsa_core-y += \ + devlink.o \ dsa.o \ dsa2.o \ master.o \ diff --git a/net/dsa/devlink.c b/net/dsa/devlink.c new file mode 100644 index 000000000000..eff440b2b3c5 --- /dev/null +++ b/net/dsa/devlink.c @@ -0,0 +1,355 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * DSA devlink handling + */ + +#include +#include + +#include "devlink.h" + +static int dsa_devlink_info_get(struct devlink *dl, + struct devlink_info_req *req, + struct netlink_ext_ack *extack) +{ + struct dsa_switch *ds = dsa_devlink_to_ds(dl); + + if (ds->ops->devlink_info_get) + return ds->ops->devlink_info_get(ds, req, extack); + + return -EOPNOTSUPP; +} + +static int dsa_devlink_sb_pool_get(struct devlink *dl, + unsigned int sb_index, u16 pool_index, + struct devlink_sb_pool_info *pool_info) +{ + struct dsa_switch *ds = dsa_devlink_to_ds(dl); + + if (!ds->ops->devlink_sb_pool_get) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_pool_get(ds, sb_index, pool_index, + pool_info); +} + +static int dsa_devlink_sb_pool_set(struct devlink *dl, unsigned int sb_index, + u16 pool_index, u32 size, + enum devlink_sb_threshold_type threshold_type, + struct netlink_ext_ack *extack) +{ + struct dsa_switch *ds = dsa_devlink_to_ds(dl); + + if (!ds->ops->devlink_sb_pool_set) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_pool_set(ds, sb_index, pool_index, size, + threshold_type, extack); +} + +static int dsa_devlink_sb_port_pool_get(struct devlink_port *dlp, + unsigned int sb_index, u16 pool_index, + u32 *p_threshold) +{ + struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp); + int port = dsa_devlink_port_to_port(dlp); + + if (!ds->ops->devlink_sb_port_pool_get) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_port_pool_get(ds, port, sb_index, + pool_index, p_threshold); +} + +static int dsa_devlink_sb_port_pool_set(struct devlink_port *dlp, + unsigned int sb_index, u16 pool_index, + u32 threshold, + struct netlink_ext_ack *extack) +{ + struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp); + int port = dsa_devlink_port_to_port(dlp); + + if (!ds->ops->devlink_sb_port_pool_set) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_port_pool_set(ds, port, sb_index, + pool_index, threshold, extack); +} + +static int +dsa_devlink_sb_tc_pool_bind_get(struct devlink_port *dlp, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 *p_pool_index, u32 *p_threshold) +{ + struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp); + int port = dsa_devlink_port_to_port(dlp); + + if (!ds->ops->devlink_sb_tc_pool_bind_get) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_tc_pool_bind_get(ds, port, sb_index, + tc_index, pool_type, + p_pool_index, p_threshold); +} + +static int +dsa_devlink_sb_tc_pool_bind_set(struct devlink_port *dlp, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u16 pool_index, u32 threshold, + struct netlink_ext_ack *extack) +{ + struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp); + int port = dsa_devlink_port_to_port(dlp); + + if (!ds->ops->devlink_sb_tc_pool_bind_set) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_tc_pool_bind_set(ds, port, sb_index, + tc_index, pool_type, + pool_index, threshold, + extack); +} + +static int dsa_devlink_sb_occ_snapshot(struct devlink *dl, + unsigned int sb_index) +{ + struct dsa_switch *ds = dsa_devlink_to_ds(dl); + + if (!ds->ops->devlink_sb_occ_snapshot) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_occ_snapshot(ds, sb_index); +} + +static int dsa_devlink_sb_occ_max_clear(struct devlink *dl, + unsigned int sb_index) +{ + struct dsa_switch *ds = dsa_devlink_to_ds(dl); + + if (!ds->ops->devlink_sb_occ_max_clear) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_occ_max_clear(ds, sb_index); +} + +static int dsa_devlink_sb_occ_port_pool_get(struct devlink_port *dlp, + unsigned int sb_index, + u16 pool_index, u32 *p_cur, + u32 *p_max) +{ + struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp); + int port = dsa_devlink_port_to_port(dlp); + + if (!ds->ops->devlink_sb_occ_port_pool_get) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_occ_port_pool_get(ds, port, sb_index, + pool_index, p_cur, p_max); +} + +static int +dsa_devlink_sb_occ_tc_port_bind_get(struct devlink_port *dlp, + unsigned int sb_index, u16 tc_index, + enum devlink_sb_pool_type pool_type, + u32 *p_cur, u32 *p_max) +{ + struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp); + int port = dsa_devlink_port_to_port(dlp); + + if (!ds->ops->devlink_sb_occ_tc_port_bind_get) + return -EOPNOTSUPP; + + return ds->ops->devlink_sb_occ_tc_port_bind_get(ds, port, + sb_index, tc_index, + pool_type, p_cur, + p_max); +} + +const struct devlink_ops dsa_devlink_ops = { + .info_get = dsa_devlink_info_get, + .sb_pool_get = dsa_devlink_sb_pool_get, + .sb_pool_set = dsa_devlink_sb_pool_set, + .sb_port_pool_get = dsa_devlink_sb_port_pool_get, + .sb_port_pool_set = dsa_devlink_sb_port_pool_set, + .sb_tc_pool_bind_get = dsa_devlink_sb_tc_pool_bind_get, + .sb_tc_pool_bind_set = dsa_devlink_sb_tc_pool_bind_set, + .sb_occ_snapshot = dsa_devlink_sb_occ_snapshot, + .sb_occ_max_clear = dsa_devlink_sb_occ_max_clear, + .sb_occ_port_pool_get = dsa_devlink_sb_occ_port_pool_get, + .sb_occ_tc_port_bind_get = dsa_devlink_sb_occ_tc_port_bind_get, +}; + +int dsa_devlink_param_get(struct devlink *dl, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct dsa_switch *ds = dsa_devlink_to_ds(dl); + + if (!ds->ops->devlink_param_get) + return -EOPNOTSUPP; + + return ds->ops->devlink_param_get(ds, id, ctx); +} +EXPORT_SYMBOL_GPL(dsa_devlink_param_get); + +int dsa_devlink_param_set(struct devlink *dl, u32 id, + struct devlink_param_gset_ctx *ctx) +{ + struct dsa_switch *ds = dsa_devlink_to_ds(dl); + + if (!ds->ops->devlink_param_set) + return -EOPNOTSUPP; + + return ds->ops->devlink_param_set(ds, id, ctx); +} +EXPORT_SYMBOL_GPL(dsa_devlink_param_set); + +int dsa_devlink_params_register(struct dsa_switch *ds, + const struct devlink_param *params, + size_t params_count) +{ + return devlink_params_register(ds->devlink, params, params_count); +} +EXPORT_SYMBOL_GPL(dsa_devlink_params_register); + +void dsa_devlink_params_unregister(struct dsa_switch *ds, + const struct devlink_param *params, + size_t params_count) +{ + devlink_params_unregister(ds->devlink, params, params_count); +} +EXPORT_SYMBOL_GPL(dsa_devlink_params_unregister); + +int dsa_devlink_resource_register(struct dsa_switch *ds, + const char *resource_name, + u64 resource_size, + u64 resource_id, + u64 parent_resource_id, + const struct devlink_resource_size_params *size_params) +{ + return devlink_resource_register(ds->devlink, resource_name, + resource_size, resource_id, + parent_resource_id, + size_params); +} +EXPORT_SYMBOL_GPL(dsa_devlink_resource_register); + +void dsa_devlink_resources_unregister(struct dsa_switch *ds) +{ + devlink_resources_unregister(ds->devlink); +} +EXPORT_SYMBOL_GPL(dsa_devlink_resources_unregister); + +void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds, + u64 resource_id, + devlink_resource_occ_get_t *occ_get, + void *occ_get_priv) +{ + return devlink_resource_occ_get_register(ds->devlink, resource_id, + occ_get, occ_get_priv); +} +EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_register); + +void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds, + u64 resource_id) +{ + devlink_resource_occ_get_unregister(ds->devlink, resource_id); +} +EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_unregister); + +struct devlink_region * +dsa_devlink_region_create(struct dsa_switch *ds, + const struct devlink_region_ops *ops, + u32 region_max_snapshots, u64 region_size) +{ + return devlink_region_create(ds->devlink, ops, region_max_snapshots, + region_size); +} +EXPORT_SYMBOL_GPL(dsa_devlink_region_create); + +struct devlink_region * +dsa_devlink_port_region_create(struct dsa_switch *ds, + int port, + const struct devlink_port_region_ops *ops, + u32 region_max_snapshots, u64 region_size) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + + return devlink_port_region_create(&dp->devlink_port, ops, + region_max_snapshots, + region_size); +} +EXPORT_SYMBOL_GPL(dsa_devlink_port_region_create); + +void dsa_devlink_region_destroy(struct devlink_region *region) +{ + devlink_region_destroy(region); +} +EXPORT_SYMBOL_GPL(dsa_devlink_region_destroy); + +int dsa_port_devlink_setup(struct dsa_port *dp) +{ + struct devlink_port *dlp = &dp->devlink_port; + struct dsa_switch_tree *dst = dp->ds->dst; + struct devlink_port_attrs attrs = {}; + struct devlink *dl = dp->ds->devlink; + struct dsa_switch *ds = dp->ds; + const unsigned char *id; + unsigned char len; + int err; + + memset(dlp, 0, sizeof(*dlp)); + devlink_port_init(dl, dlp); + + if (ds->ops->port_setup) { + err = ds->ops->port_setup(ds, dp->index); + if (err) + return err; + } + + id = (const unsigned char *)&dst->index; + len = sizeof(dst->index); + + attrs.phys.port_number = dp->index; + memcpy(attrs.switch_id.id, id, len); + attrs.switch_id.id_len = len; + + switch (dp->type) { + case DSA_PORT_TYPE_UNUSED: + attrs.flavour = DEVLINK_PORT_FLAVOUR_UNUSED; + break; + case DSA_PORT_TYPE_CPU: + attrs.flavour = DEVLINK_PORT_FLAVOUR_CPU; + break; + case DSA_PORT_TYPE_DSA: + attrs.flavour = DEVLINK_PORT_FLAVOUR_DSA; + break; + case DSA_PORT_TYPE_USER: + attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; + break; + } + + devlink_port_attrs_set(dlp, &attrs); + err = devlink_port_register(dl, dlp, dp->index); + if (err) { + if (ds->ops->port_teardown) + ds->ops->port_teardown(ds, dp->index); + return err; + } + + return 0; +} + +void dsa_port_devlink_teardown(struct dsa_port *dp) +{ + struct devlink_port *dlp = &dp->devlink_port; + struct dsa_switch *ds = dp->ds; + + devlink_port_unregister(dlp); + + if (ds->ops->port_teardown) + ds->ops->port_teardown(ds, dp->index); + + devlink_port_fini(dlp); +} diff --git a/net/dsa/devlink.h b/net/dsa/devlink.h new file mode 100644 index 000000000000..d077c7f336da --- /dev/null +++ b/net/dsa/devlink.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef __DSA_DEVLINK_H +#define __DSA_DEVLINK_H + +struct dsa_port; + +extern const struct devlink_ops dsa_devlink_ops; + +int dsa_port_devlink_setup(struct dsa_port *dp); +void dsa_port_devlink_teardown(struct dsa_port *dp); + +#endif diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index e609d64a2216..842a1f2488b2 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -344,113 +344,6 @@ void dsa_flush_workqueue(void) } EXPORT_SYMBOL_GPL(dsa_flush_workqueue); -int dsa_devlink_param_get(struct devlink *dl, u32 id, - struct devlink_param_gset_ctx *ctx) -{ - struct dsa_switch *ds = dsa_devlink_to_ds(dl); - - if (!ds->ops->devlink_param_get) - return -EOPNOTSUPP; - - return ds->ops->devlink_param_get(ds, id, ctx); -} -EXPORT_SYMBOL_GPL(dsa_devlink_param_get); - -int dsa_devlink_param_set(struct devlink *dl, u32 id, - struct devlink_param_gset_ctx *ctx) -{ - struct dsa_switch *ds = dsa_devlink_to_ds(dl); - - if (!ds->ops->devlink_param_set) - return -EOPNOTSUPP; - - return ds->ops->devlink_param_set(ds, id, ctx); -} -EXPORT_SYMBOL_GPL(dsa_devlink_param_set); - -int dsa_devlink_params_register(struct dsa_switch *ds, - const struct devlink_param *params, - size_t params_count) -{ - return devlink_params_register(ds->devlink, params, params_count); -} -EXPORT_SYMBOL_GPL(dsa_devlink_params_register); - -void dsa_devlink_params_unregister(struct dsa_switch *ds, - const struct devlink_param *params, - size_t params_count) -{ - devlink_params_unregister(ds->devlink, params, params_count); -} -EXPORT_SYMBOL_GPL(dsa_devlink_params_unregister); - -int dsa_devlink_resource_register(struct dsa_switch *ds, - const char *resource_name, - u64 resource_size, - u64 resource_id, - u64 parent_resource_id, - const struct devlink_resource_size_params *size_params) -{ - return devlink_resource_register(ds->devlink, resource_name, - resource_size, resource_id, - parent_resource_id, - size_params); -} -EXPORT_SYMBOL_GPL(dsa_devlink_resource_register); - -void dsa_devlink_resources_unregister(struct dsa_switch *ds) -{ - devlink_resources_unregister(ds->devlink); -} -EXPORT_SYMBOL_GPL(dsa_devlink_resources_unregister); - -void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds, - u64 resource_id, - devlink_resource_occ_get_t *occ_get, - void *occ_get_priv) -{ - return devlink_resource_occ_get_register(ds->devlink, resource_id, - occ_get, occ_get_priv); -} -EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_register); - -void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds, - u64 resource_id) -{ - devlink_resource_occ_get_unregister(ds->devlink, resource_id); -} -EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_unregister); - -struct devlink_region * -dsa_devlink_region_create(struct dsa_switch *ds, - const struct devlink_region_ops *ops, - u32 region_max_snapshots, u64 region_size) -{ - return devlink_region_create(ds->devlink, ops, region_max_snapshots, - region_size); -} -EXPORT_SYMBOL_GPL(dsa_devlink_region_create); - -struct devlink_region * -dsa_devlink_port_region_create(struct dsa_switch *ds, - int port, - const struct devlink_port_region_ops *ops, - u32 region_max_snapshots, u64 region_size) -{ - struct dsa_port *dp = dsa_to_port(ds, port); - - return devlink_port_region_create(&dp->devlink_port, ops, - region_max_snapshots, - region_size); -} -EXPORT_SYMBOL_GPL(dsa_devlink_port_region_create); - -void dsa_devlink_region_destroy(struct devlink_region *region) -{ - devlink_region_destroy(region); -} -EXPORT_SYMBOL_GPL(dsa_devlink_region_destroy); - struct dsa_port *dsa_port_from_netdev(struct net_device *netdev) { if (!netdev || !dsa_slave_dev_check(netdev)) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index f8df55e2e23a..05e682c25590 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -18,6 +18,7 @@ #include #include +#include "devlink.h" #include "dsa_priv.h" static DEFINE_MUTEX(dsa2_mutex); @@ -461,72 +462,6 @@ static void dsa_tree_teardown_cpu_ports(struct dsa_switch_tree *dst) dp->cpu_dp = NULL; } -static int dsa_port_devlink_setup(struct dsa_port *dp) -{ - struct devlink_port *dlp = &dp->devlink_port; - struct dsa_switch_tree *dst = dp->ds->dst; - struct devlink_port_attrs attrs = {}; - struct devlink *dl = dp->ds->devlink; - struct dsa_switch *ds = dp->ds; - const unsigned char *id; - unsigned char len; - int err; - - memset(dlp, 0, sizeof(*dlp)); - devlink_port_init(dl, dlp); - - if (ds->ops->port_setup) { - err = ds->ops->port_setup(ds, dp->index); - if (err) - return err; - } - - id = (const unsigned char *)&dst->index; - len = sizeof(dst->index); - - attrs.phys.port_number = dp->index; - memcpy(attrs.switch_id.id, id, len); - attrs.switch_id.id_len = len; - - switch (dp->type) { - case DSA_PORT_TYPE_UNUSED: - attrs.flavour = DEVLINK_PORT_FLAVOUR_UNUSED; - break; - case DSA_PORT_TYPE_CPU: - attrs.flavour = DEVLINK_PORT_FLAVOUR_CPU; - break; - case DSA_PORT_TYPE_DSA: - attrs.flavour = DEVLINK_PORT_FLAVOUR_DSA; - break; - case DSA_PORT_TYPE_USER: - attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; - break; - } - - devlink_port_attrs_set(dlp, &attrs); - err = devlink_port_register(dl, dlp, dp->index); - if (err) { - if (ds->ops->port_teardown) - ds->ops->port_teardown(ds, dp->index); - return err; - } - - return 0; -} - -static void dsa_port_devlink_teardown(struct dsa_port *dp) -{ - struct devlink_port *dlp = &dp->devlink_port; - struct dsa_switch *ds = dp->ds; - - devlink_port_unregister(dlp); - - if (ds->ops->port_teardown) - ds->ops->port_teardown(ds, dp->index); - - devlink_port_fini(dlp); -} - static int dsa_port_setup(struct dsa_port *dp) { bool dsa_port_link_registered = false; @@ -638,179 +573,6 @@ static int dsa_port_setup_as_unused(struct dsa_port *dp) return dsa_port_setup(dp); } -static int dsa_devlink_info_get(struct devlink *dl, - struct devlink_info_req *req, - struct netlink_ext_ack *extack) -{ - struct dsa_switch *ds = dsa_devlink_to_ds(dl); - - if (ds->ops->devlink_info_get) - return ds->ops->devlink_info_get(ds, req, extack); - - return -EOPNOTSUPP; -} - -static int dsa_devlink_sb_pool_get(struct devlink *dl, - unsigned int sb_index, u16 pool_index, - struct devlink_sb_pool_info *pool_info) -{ - struct dsa_switch *ds = dsa_devlink_to_ds(dl); - - if (!ds->ops->devlink_sb_pool_get) - return -EOPNOTSUPP; - - return ds->ops->devlink_sb_pool_get(ds, sb_index, pool_index, - pool_info); -} - -static int dsa_devlink_sb_pool_set(struct devlink *dl, unsigned int sb_index, - u16 pool_index, u32 size, - enum devlink_sb_threshold_type threshold_type, - struct netlink_ext_ack *extack) -{ - struct dsa_switch *ds = dsa_devlink_to_ds(dl); - - if (!ds->ops->devlink_sb_pool_set) - return -EOPNOTSUPP; - - return ds->ops->devlink_sb_pool_set(ds, sb_index, pool_index, size, - threshold_type, extack); -} - -static int dsa_devlink_sb_port_pool_get(struct devlink_port *dlp, - unsigned int sb_index, u16 pool_index, - u32 *p_threshold) -{ - struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp); - int port = dsa_devlink_port_to_port(dlp); - - if (!ds->ops->devlink_sb_port_pool_get) - return -EOPNOTSUPP; - - return ds->ops->devlink_sb_port_pool_get(ds, port, sb_index, - pool_index, p_threshold); -} - -static int dsa_devlink_sb_port_pool_set(struct devlink_port *dlp, - unsigned int sb_index, u16 pool_index, - u32 threshold, - struct netlink_ext_ack *extack) -{ - struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp); - int port = dsa_devlink_port_to_port(dlp); - - if (!ds->ops->devlink_sb_port_pool_set) - return -EOPNOTSUPP; - - return ds->ops->devlink_sb_port_pool_set(ds, port, sb_index, - pool_index, threshold, extack); -} - -static int -dsa_devlink_sb_tc_pool_bind_get(struct devlink_port *dlp, - unsigned int sb_index, u16 tc_index, - enum devlink_sb_pool_type pool_type, - u16 *p_pool_index, u32 *p_threshold) -{ - struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp); - int port = dsa_devlink_port_to_port(dlp); - - if (!ds->ops->devlink_sb_tc_pool_bind_get) - return -EOPNOTSUPP; - - return ds->ops->devlink_sb_tc_pool_bind_get(ds, port, sb_index, - tc_index, pool_type, - p_pool_index, p_threshold); -} - -static int -dsa_devlink_sb_tc_pool_bind_set(struct devlink_port *dlp, - unsigned int sb_index, u16 tc_index, - enum devlink_sb_pool_type pool_type, - u16 pool_index, u32 threshold, - struct netlink_ext_ack *extack) -{ - struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp); - int port = dsa_devlink_port_to_port(dlp); - - if (!ds->ops->devlink_sb_tc_pool_bind_set) - return -EOPNOTSUPP; - - return ds->ops->devlink_sb_tc_pool_bind_set(ds, port, sb_index, - tc_index, pool_type, - pool_index, threshold, - extack); -} - -static int dsa_devlink_sb_occ_snapshot(struct devlink *dl, - unsigned int sb_index) -{ - struct dsa_switch *ds = dsa_devlink_to_ds(dl); - - if (!ds->ops->devlink_sb_occ_snapshot) - return -EOPNOTSUPP; - - return ds->ops->devlink_sb_occ_snapshot(ds, sb_index); -} - -static int dsa_devlink_sb_occ_max_clear(struct devlink *dl, - unsigned int sb_index) -{ - struct dsa_switch *ds = dsa_devlink_to_ds(dl); - - if (!ds->ops->devlink_sb_occ_max_clear) - return -EOPNOTSUPP; - - return ds->ops->devlink_sb_occ_max_clear(ds, sb_index); -} - -static int dsa_devlink_sb_occ_port_pool_get(struct devlink_port *dlp, - unsigned int sb_index, - u16 pool_index, u32 *p_cur, - u32 *p_max) -{ - struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp); - int port = dsa_devlink_port_to_port(dlp); - - if (!ds->ops->devlink_sb_occ_port_pool_get) - return -EOPNOTSUPP; - - return ds->ops->devlink_sb_occ_port_pool_get(ds, port, sb_index, - pool_index, p_cur, p_max); -} - -static int -dsa_devlink_sb_occ_tc_port_bind_get(struct devlink_port *dlp, - unsigned int sb_index, u16 tc_index, - enum devlink_sb_pool_type pool_type, - u32 *p_cur, u32 *p_max) -{ - struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp); - int port = dsa_devlink_port_to_port(dlp); - - if (!ds->ops->devlink_sb_occ_tc_port_bind_get) - return -EOPNOTSUPP; - - return ds->ops->devlink_sb_occ_tc_port_bind_get(ds, port, - sb_index, tc_index, - pool_type, p_cur, - p_max); -} - -static const struct devlink_ops dsa_devlink_ops = { - .info_get = dsa_devlink_info_get, - .sb_pool_get = dsa_devlink_sb_pool_get, - .sb_pool_set = dsa_devlink_sb_pool_set, - .sb_port_pool_get = dsa_devlink_sb_port_pool_get, - .sb_port_pool_set = dsa_devlink_sb_port_pool_set, - .sb_tc_pool_bind_get = dsa_devlink_sb_tc_pool_bind_get, - .sb_tc_pool_bind_set = dsa_devlink_sb_tc_pool_bind_set, - .sb_occ_snapshot = dsa_devlink_sb_occ_snapshot, - .sb_occ_max_clear = dsa_devlink_sb_occ_max_clear, - .sb_occ_port_pool_get = dsa_devlink_sb_occ_port_pool_get, - .sb_occ_tc_port_bind_get = dsa_devlink_sb_occ_tc_port_bind_get, -}; - static int dsa_switch_setup_tag_protocol(struct dsa_switch *ds) { const struct dsa_device_ops *tag_ops = ds->dst->tag_ops; -- cgit v1.2.3-70-g09d2 From d95fa75061fbd09a7ae78a76bc8902a84b8538a5 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Nov 2022 15:55:42 +0200 Subject: net: dsa: if ds->setup is true, ds->devlink is always non-NULL Simplify dsa_switch_teardown() to remove the NULL checking for ds->devlink. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- net/dsa/dsa2.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 05e682c25590..f890dfcbf412 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -682,8 +682,9 @@ static int dsa_switch_setup(struct dsa_switch *ds) goto free_slave_mii_bus; } - ds->setup = true; devlink_register(ds->devlink); + + ds->setup = true; return 0; free_slave_mii_bus: @@ -705,8 +706,7 @@ static void dsa_switch_teardown(struct dsa_switch *ds) if (!ds->setup) return; - if (ds->devlink) - devlink_unregister(ds->devlink); + devlink_unregister(ds->devlink); if (ds->slave_mii_bus && ds->ops->phy_read) { mdiobus_unregister(ds->slave_mii_bus); @@ -721,10 +721,8 @@ static void dsa_switch_teardown(struct dsa_switch *ds) dsa_switch_unregister_notifier(ds); - if (ds->devlink) { - devlink_free(ds->devlink); - ds->devlink = NULL; - } + devlink_free(ds->devlink); + ds->devlink = NULL; ds->setup = false; } -- cgit v1.2.3-70-g09d2 From 7aea535d40eaba97cb116ba2b30ecd897eee0f43 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Nov 2022 15:55:43 +0200 Subject: net: dsa: move rest of devlink setup/teardown to devlink.c The code that needed further refactoring into dedicated functions in dsa2.c was left aside. Move it now to devlink.c, and make dsa2.c stop including net/devlink.h. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- net/dsa/devlink.c | 38 +++++++++++++++++++++++++++++++++++++- net/dsa/devlink.h | 7 +++++-- net/dsa/dsa2.c | 24 +++++++----------------- 3 files changed, 49 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/dsa/devlink.c b/net/dsa/devlink.c index eff440b2b3c5..431bf52290a1 100644 --- a/net/dsa/devlink.c +++ b/net/dsa/devlink.c @@ -167,7 +167,7 @@ dsa_devlink_sb_occ_tc_port_bind_get(struct devlink_port *dlp, p_max); } -const struct devlink_ops dsa_devlink_ops = { +static const struct devlink_ops dsa_devlink_ops = { .info_get = dsa_devlink_info_get, .sb_pool_get = dsa_devlink_sb_pool_get, .sb_pool_set = dsa_devlink_sb_pool_set, @@ -353,3 +353,39 @@ void dsa_port_devlink_teardown(struct dsa_port *dp) devlink_port_fini(dlp); } + +void dsa_switch_devlink_register(struct dsa_switch *ds) +{ + devlink_register(ds->devlink); +} + +void dsa_switch_devlink_unregister(struct dsa_switch *ds) +{ + devlink_unregister(ds->devlink); +} + +int dsa_switch_devlink_alloc(struct dsa_switch *ds) +{ + struct dsa_devlink_priv *dl_priv; + struct devlink *dl; + + /* Add the switch to devlink before calling setup, so that setup can + * add dpipe tables + */ + dl = devlink_alloc(&dsa_devlink_ops, sizeof(*dl_priv), ds->dev); + if (!dl) + return -ENOMEM; + + ds->devlink = dl; + + dl_priv = devlink_priv(ds->devlink); + dl_priv->ds = ds; + + return 0; +} + +void dsa_switch_devlink_free(struct dsa_switch *ds) +{ + devlink_free(ds->devlink); + ds->devlink = NULL; +} diff --git a/net/dsa/devlink.h b/net/dsa/devlink.h index d077c7f336da..4d9f4f23705b 100644 --- a/net/dsa/devlink.h +++ b/net/dsa/devlink.h @@ -4,10 +4,13 @@ #define __DSA_DEVLINK_H struct dsa_port; - -extern const struct devlink_ops dsa_devlink_ops; +struct dsa_switch; int dsa_port_devlink_setup(struct dsa_port *dp); void dsa_port_devlink_teardown(struct dsa_port *dp); +void dsa_switch_devlink_register(struct dsa_switch *ds); +void dsa_switch_devlink_unregister(struct dsa_switch *ds); +int dsa_switch_devlink_alloc(struct dsa_switch *ds); +void dsa_switch_devlink_free(struct dsa_switch *ds); #endif diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index f890dfcbf412..c0ef49d86381 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include "devlink.h" @@ -627,7 +626,6 @@ static void dsa_switch_teardown_tag_protocol(struct dsa_switch *ds) static int dsa_switch_setup(struct dsa_switch *ds) { - struct dsa_devlink_priv *dl_priv; struct device_node *dn; int err; @@ -641,15 +639,9 @@ static int dsa_switch_setup(struct dsa_switch *ds) */ ds->phys_mii_mask |= dsa_user_ports(ds); - /* Add the switch to devlink before calling setup, so that setup can - * add dpipe tables - */ - ds->devlink = - devlink_alloc(&dsa_devlink_ops, sizeof(*dl_priv), ds->dev); - if (!ds->devlink) - return -ENOMEM; - dl_priv = devlink_priv(ds->devlink); - dl_priv->ds = ds; + err = dsa_switch_devlink_alloc(ds); + if (err) + return err; err = dsa_switch_register_notifier(ds); if (err) @@ -682,7 +674,7 @@ static int dsa_switch_setup(struct dsa_switch *ds) goto free_slave_mii_bus; } - devlink_register(ds->devlink); + dsa_switch_devlink_register(ds); ds->setup = true; return 0; @@ -696,8 +688,7 @@ teardown: unregister_notifier: dsa_switch_unregister_notifier(ds); devlink_free: - devlink_free(ds->devlink); - ds->devlink = NULL; + dsa_switch_devlink_free(ds); return err; } @@ -706,7 +697,7 @@ static void dsa_switch_teardown(struct dsa_switch *ds) if (!ds->setup) return; - devlink_unregister(ds->devlink); + dsa_switch_devlink_unregister(ds); if (ds->slave_mii_bus && ds->ops->phy_read) { mdiobus_unregister(ds->slave_mii_bus); @@ -721,8 +712,7 @@ static void dsa_switch_teardown(struct dsa_switch *ds) dsa_switch_unregister_notifier(ds); - devlink_free(ds->devlink); - ds->devlink = NULL; + dsa_switch_devlink_free(ds); ds->setup = false; } -- cgit v1.2.3-70-g09d2 From 022bba63c3ca02fc074c68b4e7b949bddcf320d6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Nov 2022 15:55:44 +0200 Subject: net: dsa: move headers exported by port.c to port.h Minimize the use of the bloated dsa_priv.h by moving the prototypes exported by port.c to their own header file. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- net/dsa/dsa2.c | 1 + net/dsa/dsa_priv.h | 97 -------------------------------------------- net/dsa/master.c | 1 + net/dsa/port.c | 1 + net/dsa/port.h | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++ net/dsa/slave.c | 1 + net/dsa/switch.c | 1 + net/dsa/tag_8021q.c | 1 + 8 files changed, 120 insertions(+), 97 deletions(-) create mode 100644 net/dsa/port.h (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index c0ef49d86381..5a9cf74a0166 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -19,6 +19,7 @@ #include "devlink.h" #include "dsa_priv.h" +#include "port.h" static DEFINE_MUTEX(dsa2_mutex); LIST_HEAD(dsa_tree_list); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index c4ea5fda8f14..81ddc52feb94 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -286,103 +286,6 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev, /* netlink.c */ extern struct rtnl_link_ops dsa_link_ops __read_mostly; -/* port.c */ -bool dsa_port_supports_hwtstamp(struct dsa_port *dp, struct ifreq *ifr); -void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, - const struct dsa_device_ops *tag_ops); -int dsa_port_set_state(struct dsa_port *dp, u8 state, bool do_fast_age); -int dsa_port_set_mst_state(struct dsa_port *dp, - const struct switchdev_mst_state *state, - struct netlink_ext_ack *extack); -int dsa_port_enable_rt(struct dsa_port *dp, struct phy_device *phy); -int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy); -void dsa_port_disable_rt(struct dsa_port *dp); -void dsa_port_disable(struct dsa_port *dp); -int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br, - struct netlink_ext_ack *extack); -void dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br); -void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br); -int dsa_port_lag_change(struct dsa_port *dp, - struct netdev_lag_lower_state_info *linfo); -int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag_dev, - struct netdev_lag_upper_info *uinfo, - struct netlink_ext_ack *extack); -void dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag_dev); -void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag_dev); -int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, - struct netlink_ext_ack *extack); -bool dsa_port_skip_vlan_configuration(struct dsa_port *dp); -int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock); -int dsa_port_mst_enable(struct dsa_port *dp, bool on, - struct netlink_ext_ack *extack); -int dsa_port_vlan_msti(struct dsa_port *dp, - const struct switchdev_vlan_msti *msti); -int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu); -int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr, - u16 vid); -int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr, - u16 vid); -int dsa_port_standalone_host_fdb_add(struct dsa_port *dp, - const unsigned char *addr, u16 vid); -int dsa_port_standalone_host_fdb_del(struct dsa_port *dp, - const unsigned char *addr, u16 vid); -int dsa_port_bridge_host_fdb_add(struct dsa_port *dp, const unsigned char *addr, - u16 vid); -int dsa_port_bridge_host_fdb_del(struct dsa_port *dp, const unsigned char *addr, - u16 vid); -int dsa_port_lag_fdb_add(struct dsa_port *dp, const unsigned char *addr, - u16 vid); -int dsa_port_lag_fdb_del(struct dsa_port *dp, const unsigned char *addr, - u16 vid); -int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data); -int dsa_port_mdb_add(const struct dsa_port *dp, - const struct switchdev_obj_port_mdb *mdb); -int dsa_port_mdb_del(const struct dsa_port *dp, - const struct switchdev_obj_port_mdb *mdb); -int dsa_port_standalone_host_mdb_add(const struct dsa_port *dp, - const struct switchdev_obj_port_mdb *mdb); -int dsa_port_standalone_host_mdb_del(const struct dsa_port *dp, - const struct switchdev_obj_port_mdb *mdb); -int dsa_port_bridge_host_mdb_add(const struct dsa_port *dp, - const struct switchdev_obj_port_mdb *mdb); -int dsa_port_bridge_host_mdb_del(const struct dsa_port *dp, - const struct switchdev_obj_port_mdb *mdb); -int dsa_port_pre_bridge_flags(const struct dsa_port *dp, - struct switchdev_brport_flags flags, - struct netlink_ext_ack *extack); -int dsa_port_bridge_flags(struct dsa_port *dp, - struct switchdev_brport_flags flags, - struct netlink_ext_ack *extack); -int dsa_port_vlan_add(struct dsa_port *dp, - const struct switchdev_obj_port_vlan *vlan, - struct netlink_ext_ack *extack); -int dsa_port_vlan_del(struct dsa_port *dp, - const struct switchdev_obj_port_vlan *vlan); -int dsa_port_host_vlan_add(struct dsa_port *dp, - const struct switchdev_obj_port_vlan *vlan, - struct netlink_ext_ack *extack); -int dsa_port_host_vlan_del(struct dsa_port *dp, - const struct switchdev_obj_port_vlan *vlan); -int dsa_port_mrp_add(const struct dsa_port *dp, - const struct switchdev_obj_mrp *mrp); -int dsa_port_mrp_del(const struct dsa_port *dp, - const struct switchdev_obj_mrp *mrp); -int dsa_port_mrp_add_ring_role(const struct dsa_port *dp, - const struct switchdev_obj_ring_role_mrp *mrp); -int dsa_port_mrp_del_ring_role(const struct dsa_port *dp, - const struct switchdev_obj_ring_role_mrp *mrp); -int dsa_port_phylink_create(struct dsa_port *dp); -void dsa_port_phylink_destroy(struct dsa_port *dp); -int dsa_shared_port_link_register_of(struct dsa_port *dp); -void dsa_shared_port_link_unregister_of(struct dsa_port *dp); -int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr); -void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr); -int dsa_port_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid, bool broadcast); -void dsa_port_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid, bool broadcast); -void dsa_port_set_host_flood(struct dsa_port *dp, bool uc, bool mc); -int dsa_port_change_master(struct dsa_port *dp, struct net_device *master, - struct netlink_ext_ack *extack); - /* slave.c */ extern struct notifier_block dsa_slave_switchdev_notifier; extern struct notifier_block dsa_slave_switchdev_blocking_notifier; diff --git a/net/dsa/master.c b/net/dsa/master.c index e24f02743c21..0d3ef591b3b4 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -7,6 +7,7 @@ */ #include "dsa_priv.h" +#include "port.h" static int dsa_master_get_regs_len(struct net_device *dev) { diff --git a/net/dsa/port.c b/net/dsa/port.c index 707bd854cea2..0708fe8d4736 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -13,6 +13,7 @@ #include #include "dsa_priv.h" +#include "port.h" /** * dsa_port_notify - Notify the switching fabric of changes to a port diff --git a/net/dsa/port.h b/net/dsa/port.h new file mode 100644 index 000000000000..9c218660d223 --- /dev/null +++ b/net/dsa/port.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef __DSA_PORT_H +#define __DSA_PORT_H + +#include +#include + +struct ifreq; +struct netdev_lag_lower_state_info; +struct netdev_lag_upper_info; +struct netlink_ext_ack; +struct switchdev_mst_state; +struct switchdev_obj_port_mdb; +struct switchdev_vlan_msti; +struct phy_device; + +bool dsa_port_supports_hwtstamp(struct dsa_port *dp, struct ifreq *ifr); +void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, + const struct dsa_device_ops *tag_ops); +int dsa_port_set_state(struct dsa_port *dp, u8 state, bool do_fast_age); +int dsa_port_set_mst_state(struct dsa_port *dp, + const struct switchdev_mst_state *state, + struct netlink_ext_ack *extack); +int dsa_port_enable_rt(struct dsa_port *dp, struct phy_device *phy); +int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy); +void dsa_port_disable_rt(struct dsa_port *dp); +void dsa_port_disable(struct dsa_port *dp); +int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br, + struct netlink_ext_ack *extack); +void dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br); +void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br); +int dsa_port_lag_change(struct dsa_port *dp, + struct netdev_lag_lower_state_info *linfo); +int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag_dev, + struct netdev_lag_upper_info *uinfo, + struct netlink_ext_ack *extack); +void dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag_dev); +void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag_dev); +int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, + struct netlink_ext_ack *extack); +bool dsa_port_skip_vlan_configuration(struct dsa_port *dp); +int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock); +int dsa_port_mst_enable(struct dsa_port *dp, bool on, + struct netlink_ext_ack *extack); +int dsa_port_vlan_msti(struct dsa_port *dp, + const struct switchdev_vlan_msti *msti); +int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu); +int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr, + u16 vid); +int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr, + u16 vid); +int dsa_port_standalone_host_fdb_add(struct dsa_port *dp, + const unsigned char *addr, u16 vid); +int dsa_port_standalone_host_fdb_del(struct dsa_port *dp, + const unsigned char *addr, u16 vid); +int dsa_port_bridge_host_fdb_add(struct dsa_port *dp, const unsigned char *addr, + u16 vid); +int dsa_port_bridge_host_fdb_del(struct dsa_port *dp, const unsigned char *addr, + u16 vid); +int dsa_port_lag_fdb_add(struct dsa_port *dp, const unsigned char *addr, + u16 vid); +int dsa_port_lag_fdb_del(struct dsa_port *dp, const unsigned char *addr, + u16 vid); +int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data); +int dsa_port_mdb_add(const struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb); +int dsa_port_mdb_del(const struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb); +int dsa_port_standalone_host_mdb_add(const struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb); +int dsa_port_standalone_host_mdb_del(const struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb); +int dsa_port_bridge_host_mdb_add(const struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb); +int dsa_port_bridge_host_mdb_del(const struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb); +int dsa_port_pre_bridge_flags(const struct dsa_port *dp, + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack); +int dsa_port_bridge_flags(struct dsa_port *dp, + struct switchdev_brport_flags flags, + struct netlink_ext_ack *extack); +int dsa_port_vlan_add(struct dsa_port *dp, + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack); +int dsa_port_vlan_del(struct dsa_port *dp, + const struct switchdev_obj_port_vlan *vlan); +int dsa_port_host_vlan_add(struct dsa_port *dp, + const struct switchdev_obj_port_vlan *vlan, + struct netlink_ext_ack *extack); +int dsa_port_host_vlan_del(struct dsa_port *dp, + const struct switchdev_obj_port_vlan *vlan); +int dsa_port_mrp_add(const struct dsa_port *dp, + const struct switchdev_obj_mrp *mrp); +int dsa_port_mrp_del(const struct dsa_port *dp, + const struct switchdev_obj_mrp *mrp); +int dsa_port_mrp_add_ring_role(const struct dsa_port *dp, + const struct switchdev_obj_ring_role_mrp *mrp); +int dsa_port_mrp_del_ring_role(const struct dsa_port *dp, + const struct switchdev_obj_ring_role_mrp *mrp); +int dsa_port_phylink_create(struct dsa_port *dp); +void dsa_port_phylink_destroy(struct dsa_port *dp); +int dsa_shared_port_link_register_of(struct dsa_port *dp); +void dsa_shared_port_link_unregister_of(struct dsa_port *dp); +int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr); +void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr); +int dsa_port_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid, bool broadcast); +void dsa_port_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid, bool broadcast); +void dsa_port_set_host_flood(struct dsa_port *dp, bool uc, bool mc); +int dsa_port_change_master(struct dsa_port *dp, struct net_device *master, + struct netlink_ext_ack *extack); + +#endif diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 24d8ad36fc8b..b782a1788f5a 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -23,6 +23,7 @@ #include #include "dsa_priv.h" +#include "port.h" static void dsa_slave_standalone_event_work(struct work_struct *work) { diff --git a/net/dsa/switch.c b/net/dsa/switch.c index ce56acdba203..5ece5c5c2acf 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -13,6 +13,7 @@ #include #include "dsa_priv.h" +#include "port.h" static unsigned int dsa_switch_fastest_ageing_time(struct dsa_switch *ds, unsigned int ageing_time) diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 34e5ec5d3e23..a6617d7b692a 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -8,6 +8,7 @@ #include #include "dsa_priv.h" +#include "port.h" /* Binary structure of the fake 12-bit VID field (when the TPID is * ETH_P_DSA_8021Q): -- cgit v1.2.3-70-g09d2 From 94ef6fad3bf317b43cdc59ba171dff2486e59975 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Nov 2022 15:55:45 +0200 Subject: net: dsa: move headers exported by master.c to master.h Minimize the use of the bloated dsa_priv.h by moving the prototypes exported by master.c to their own header file. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- net/dsa/dsa2.c | 1 + net/dsa/dsa_priv.h | 9 --------- net/dsa/master.c | 6 ++++++ net/dsa/master.h | 19 +++++++++++++++++++ net/dsa/slave.c | 1 + 5 files changed, 27 insertions(+), 9 deletions(-) create mode 100644 net/dsa/master.h (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 5a9cf74a0166..10cd4ea9afe1 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -19,6 +19,7 @@ #include "devlink.h" #include "dsa_priv.h" +#include "master.h" #include "port.h" static DEFINE_MUTEX(dsa2_mutex); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 81ddc52feb94..94e385ec6da5 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -259,15 +259,6 @@ static inline int dsa_tag_protocol_overhead(const struct dsa_device_ops *ops) return ops->needed_headroom + ops->needed_tailroom; } -/* master.c */ -int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp); -void dsa_master_teardown(struct net_device *dev); -int dsa_master_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp, - struct netdev_lag_upper_info *uinfo, - struct netlink_ext_ack *extack); -void dsa_master_lag_teardown(struct net_device *lag_dev, - struct dsa_port *cpu_dp); - static inline struct net_device *dsa_master_find_slave(struct net_device *dev, int device, int port) { diff --git a/net/dsa/master.c b/net/dsa/master.c index 0d3ef591b3b4..6105821834a2 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -6,7 +6,13 @@ * Vivien Didelot */ +#include +#include +#include +#include + #include "dsa_priv.h" +#include "master.h" #include "port.h" static int dsa_master_get_regs_len(struct net_device *dev) diff --git a/net/dsa/master.h b/net/dsa/master.h new file mode 100644 index 000000000000..3fc0e610b5b5 --- /dev/null +++ b/net/dsa/master.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef __DSA_MASTER_H +#define __DSA_MASTER_H + +struct dsa_port; +struct net_device; +struct netdev_lag_upper_info; +struct netlink_ext_ack; + +int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp); +void dsa_master_teardown(struct net_device *dev); +int dsa_master_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp, + struct netdev_lag_upper_info *uinfo, + struct netlink_ext_ack *extack); +void dsa_master_lag_teardown(struct net_device *lag_dev, + struct dsa_port *cpu_dp); + +#endif diff --git a/net/dsa/slave.c b/net/dsa/slave.c index b782a1788f5a..523f9ebeb45b 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -24,6 +24,7 @@ #include "dsa_priv.h" #include "port.h" +#include "master.h" static void dsa_slave_standalone_event_work(struct work_struct *work) { -- cgit v1.2.3-70-g09d2 From 09f92341681a23346c456938bcb2670de2cd99d4 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Nov 2022 15:55:46 +0200 Subject: net: dsa: move headers exported by slave.c to slave.h Minimize the use of the bloated dsa_priv.h by moving the prototypes exported by slave.c to their own header file. This is just approximate to get the code structure right. There are some interdependencies with static inline code left in dsa_priv.h, so leave slave.h included from there for now. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- net/dsa/dsa.c | 1 + net/dsa/dsa2.c | 1 + net/dsa/dsa_priv.h | 59 ++-------------------------------------------- net/dsa/netlink.c | 1 + net/dsa/port.c | 1 + net/dsa/slave.c | 1 + net/dsa/slave.h | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ net/dsa/switch.c | 1 + 8 files changed, 77 insertions(+), 57 deletions(-) create mode 100644 net/dsa/slave.h (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 842a1f2488b2..422f8853d1c4 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -14,6 +14,7 @@ #include #include "dsa_priv.h" +#include "slave.h" static LIST_HEAD(dsa_tag_drivers_list); static DEFINE_MUTEX(dsa_tag_drivers_lock); diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 10cd4ea9afe1..f917e695d38c 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -21,6 +21,7 @@ #include "dsa_priv.h" #include "master.h" #include "port.h" +#include "slave.h" static DEFINE_MUTEX(dsa2_mutex); LIST_HEAD(dsa_tree_list); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 94e385ec6da5..fcff35b15dd4 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -7,13 +7,11 @@ #ifndef __DSA_PRIV_H #define __DSA_PRIV_H -#include -#include #include #include -#include #include -#include + +#include "slave.h" #define DSA_MAX_NUM_OFFLOADING_BRIDGES BITS_PER_LONG @@ -224,24 +222,6 @@ struct dsa_standalone_event_work { u16 vid; }; -struct dsa_slave_priv { - /* Copy of CPU port xmit for faster access in slave transmit hot path */ - struct sk_buff * (*xmit)(struct sk_buff *skb, - struct net_device *dev); - - struct gro_cells gcells; - - /* DSA port data, such as switch, port index, etc. */ - struct dsa_port *dp; - -#ifdef CONFIG_NET_POLL_CONTROLLER - struct netpoll *netpoll; -#endif - - /* TC context */ - struct list_head mall_tc_list; -}; - /* dsa.c */ const struct dsa_device_ops *dsa_tag_driver_get_by_id(int tag_protocol); const struct dsa_device_ops *dsa_tag_driver_get_by_name(const char *name); @@ -277,41 +257,6 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev, /* netlink.c */ extern struct rtnl_link_ops dsa_link_ops __read_mostly; -/* slave.c */ -extern struct notifier_block dsa_slave_switchdev_notifier; -extern struct notifier_block dsa_slave_switchdev_blocking_notifier; - -void dsa_slave_mii_bus_init(struct dsa_switch *ds); -int dsa_slave_create(struct dsa_port *dp); -void dsa_slave_destroy(struct net_device *slave_dev); -int dsa_slave_suspend(struct net_device *slave_dev); -int dsa_slave_resume(struct net_device *slave_dev); -int dsa_slave_register_notifier(void); -void dsa_slave_unregister_notifier(void); -void dsa_slave_sync_ha(struct net_device *dev); -void dsa_slave_unsync_ha(struct net_device *dev); -void dsa_slave_setup_tagger(struct net_device *slave); -int dsa_slave_change_mtu(struct net_device *dev, int new_mtu); -int dsa_slave_change_master(struct net_device *dev, struct net_device *master, - struct netlink_ext_ack *extack); -int dsa_slave_manage_vlan_filtering(struct net_device *dev, - bool vlan_filtering); - -static inline struct dsa_port *dsa_slave_to_port(const struct net_device *dev) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - - return p->dp; -} - -static inline struct net_device * -dsa_slave_to_master(const struct net_device *dev) -{ - struct dsa_port *dp = dsa_slave_to_port(dev); - - return dsa_port_to_master(dp); -} - /* If under a bridge with vlan_filtering=0, make sure to send pvid-tagged * frames as untagged, since the bridge will not untag them. */ diff --git a/net/dsa/netlink.c b/net/dsa/netlink.c index ecf9ed1de185..824b09d904cc 100644 --- a/net/dsa/netlink.c +++ b/net/dsa/netlink.c @@ -5,6 +5,7 @@ #include #include "dsa_priv.h" +#include "slave.h" static const struct nla_policy dsa_policy[IFLA_DSA_MAX + 1] = { [IFLA_DSA_MASTER] = { .type = NLA_U32 }, diff --git a/net/dsa/port.c b/net/dsa/port.c index 0708fe8d4736..56728242f079 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -14,6 +14,7 @@ #include "dsa_priv.h" #include "port.h" +#include "slave.h" /** * dsa_port_notify - Notify the switching fabric of changes to a port diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 523f9ebeb45b..2cf83892072f 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -25,6 +25,7 @@ #include "dsa_priv.h" #include "port.h" #include "master.h" +#include "slave.h" static void dsa_slave_standalone_event_work(struct work_struct *work) { diff --git a/net/dsa/slave.h b/net/dsa/slave.h new file mode 100644 index 000000000000..d0abe609e00d --- /dev/null +++ b/net/dsa/slave.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef __DSA_SLAVE_H +#define __DSA_SLAVE_H + +#include +#include +#include +#include +#include +#include +#include + +struct net_device; +struct netlink_ext_ack; + +extern struct notifier_block dsa_slave_switchdev_notifier; +extern struct notifier_block dsa_slave_switchdev_blocking_notifier; + +struct dsa_slave_priv { + /* Copy of CPU port xmit for faster access in slave transmit hot path */ + struct sk_buff * (*xmit)(struct sk_buff *skb, + struct net_device *dev); + + struct gro_cells gcells; + + /* DSA port data, such as switch, port index, etc. */ + struct dsa_port *dp; + +#ifdef CONFIG_NET_POLL_CONTROLLER + struct netpoll *netpoll; +#endif + + /* TC context */ + struct list_head mall_tc_list; +}; + +void dsa_slave_mii_bus_init(struct dsa_switch *ds); +int dsa_slave_create(struct dsa_port *dp); +void dsa_slave_destroy(struct net_device *slave_dev); +int dsa_slave_suspend(struct net_device *slave_dev); +int dsa_slave_resume(struct net_device *slave_dev); +int dsa_slave_register_notifier(void); +void dsa_slave_unregister_notifier(void); +void dsa_slave_sync_ha(struct net_device *dev); +void dsa_slave_unsync_ha(struct net_device *dev); +void dsa_slave_setup_tagger(struct net_device *slave); +int dsa_slave_change_mtu(struct net_device *dev, int new_mtu); +int dsa_slave_change_master(struct net_device *dev, struct net_device *master, + struct netlink_ext_ack *extack); +int dsa_slave_manage_vlan_filtering(struct net_device *dev, + bool vlan_filtering); + +static inline struct dsa_port *dsa_slave_to_port(const struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + + return p->dp; +} + +static inline struct net_device * +dsa_slave_to_master(const struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + + return dsa_port_to_master(dp); +} + +#endif diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 5ece5c5c2acf..d0d5a1c7e6f6 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -14,6 +14,7 @@ #include "dsa_priv.h" #include "port.h" +#include "slave.h" static unsigned int dsa_switch_fastest_ageing_time(struct dsa_switch *ds, unsigned int ageing_time) -- cgit v1.2.3-70-g09d2 From bd954b826032e7bd6be8a53e30eb81c1b348aef6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Nov 2022 15:55:47 +0200 Subject: net: dsa: move tagging protocol code to tag.{c,h} It would be nice if tagging protocol drivers could include just the header they need, since they are (mostly) data path and isolated from most of the other DSA core code does. Create a tag.c and a tag.h file which are meant to support tagging protocol drivers. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- net/dsa/Makefile | 1 + net/dsa/dsa.c | 229 +-------------------------------- net/dsa/dsa2.c | 1 + net/dsa/dsa_priv.h | 296 ------------------------------------------- net/dsa/master.c | 1 + net/dsa/slave.c | 1 + net/dsa/tag.c | 243 +++++++++++++++++++++++++++++++++++ net/dsa/tag.h | 310 +++++++++++++++++++++++++++++++++++++++++++++ net/dsa/tag_8021q.c | 1 + net/dsa/tag_ar9331.c | 2 +- net/dsa/tag_brcm.c | 2 +- net/dsa/tag_dsa.c | 2 +- net/dsa/tag_gswip.c | 2 +- net/dsa/tag_hellcreek.c | 1 + net/dsa/tag_ksz.c | 3 +- net/dsa/tag_lan9303.c | 2 +- net/dsa/tag_mtk.c | 2 +- net/dsa/tag_none.c | 2 +- net/dsa/tag_ocelot.c | 3 +- net/dsa/tag_ocelot_8021q.c | 3 +- net/dsa/tag_qca.c | 2 +- net/dsa/tag_rtl4_a.c | 2 +- net/dsa/tag_rtl8_4.c | 2 +- net/dsa/tag_rzn1_a5psw.c | 2 +- net/dsa/tag_sja1105.c | 3 +- net/dsa/tag_trailer.c | 2 +- net/dsa/tag_xrs700x.c | 2 +- 27 files changed, 581 insertions(+), 541 deletions(-) create mode 100644 net/dsa/tag.c create mode 100644 net/dsa/tag.h (limited to 'net') diff --git a/net/dsa/Makefile b/net/dsa/Makefile index bc872c0d7011..93f5d5f1e495 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -10,6 +10,7 @@ dsa_core-y += \ port.o \ slave.o \ switch.o \ + tag.o \ tag_8021q.o # tagging formats diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 422f8853d1c4..6f87dd1ee6bf 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -10,127 +10,10 @@ #include #include #include -#include -#include #include "dsa_priv.h" #include "slave.h" - -static LIST_HEAD(dsa_tag_drivers_list); -static DEFINE_MUTEX(dsa_tag_drivers_lock); - -static void dsa_tag_driver_register(struct dsa_tag_driver *dsa_tag_driver, - struct module *owner) -{ - dsa_tag_driver->owner = owner; - - mutex_lock(&dsa_tag_drivers_lock); - list_add_tail(&dsa_tag_driver->list, &dsa_tag_drivers_list); - mutex_unlock(&dsa_tag_drivers_lock); -} - -void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[], - unsigned int count, struct module *owner) -{ - unsigned int i; - - for (i = 0; i < count; i++) - dsa_tag_driver_register(dsa_tag_driver_array[i], owner); -} - -static void dsa_tag_driver_unregister(struct dsa_tag_driver *dsa_tag_driver) -{ - mutex_lock(&dsa_tag_drivers_lock); - list_del(&dsa_tag_driver->list); - mutex_unlock(&dsa_tag_drivers_lock); -} -EXPORT_SYMBOL_GPL(dsa_tag_drivers_register); - -void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[], - unsigned int count) -{ - unsigned int i; - - for (i = 0; i < count; i++) - dsa_tag_driver_unregister(dsa_tag_driver_array[i]); -} -EXPORT_SYMBOL_GPL(dsa_tag_drivers_unregister); - -const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops) -{ - return ops->name; -}; - -/* Function takes a reference on the module owning the tagger, - * so dsa_tag_driver_put must be called afterwards. - */ -const struct dsa_device_ops *dsa_tag_driver_get_by_name(const char *name) -{ - const struct dsa_device_ops *ops = ERR_PTR(-ENOPROTOOPT); - struct dsa_tag_driver *dsa_tag_driver; - - request_module("%s%s", DSA_TAG_DRIVER_ALIAS, name); - - mutex_lock(&dsa_tag_drivers_lock); - list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) { - const struct dsa_device_ops *tmp = dsa_tag_driver->ops; - - if (strcmp(name, tmp->name)) - continue; - - if (!try_module_get(dsa_tag_driver->owner)) - break; - - ops = tmp; - break; - } - mutex_unlock(&dsa_tag_drivers_lock); - - return ops; -} - -const struct dsa_device_ops *dsa_tag_driver_get_by_id(int tag_protocol) -{ - struct dsa_tag_driver *dsa_tag_driver; - const struct dsa_device_ops *ops; - bool found = false; - - request_module("%sid-%d", DSA_TAG_DRIVER_ALIAS, tag_protocol); - - mutex_lock(&dsa_tag_drivers_lock); - list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) { - ops = dsa_tag_driver->ops; - if (ops->proto == tag_protocol) { - found = true; - break; - } - } - - if (found) { - if (!try_module_get(dsa_tag_driver->owner)) - ops = ERR_PTR(-ENOPROTOOPT); - } else { - ops = ERR_PTR(-ENOPROTOOPT); - } - - mutex_unlock(&dsa_tag_drivers_lock); - - return ops; -} - -void dsa_tag_driver_put(const struct dsa_device_ops *ops) -{ - struct dsa_tag_driver *dsa_tag_driver; - - mutex_lock(&dsa_tag_drivers_lock); - list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) { - if (dsa_tag_driver->ops == ops) { - module_put(dsa_tag_driver->owner); - break; - } - } - mutex_unlock(&dsa_tag_drivers_lock); -} +#include "tag.h" static int dev_is_class(struct device *dev, void *class) { @@ -168,111 +51,6 @@ struct net_device *dsa_dev_to_net_device(struct device *dev) return NULL; } -/* Determine if we should defer delivery of skb until we have a rx timestamp. - * - * Called from dsa_switch_rcv. For now, this will only work if tagging is - * enabled on the switch. Normally the MAC driver would retrieve the hardware - * timestamp when it reads the packet out of the hardware. However in a DSA - * switch, the DSA driver owning the interface to which the packet is - * delivered is never notified unless we do so here. - */ -static bool dsa_skb_defer_rx_timestamp(struct dsa_slave_priv *p, - struct sk_buff *skb) -{ - struct dsa_switch *ds = p->dp->ds; - unsigned int type; - - if (skb_headroom(skb) < ETH_HLEN) - return false; - - __skb_push(skb, ETH_HLEN); - - type = ptp_classify_raw(skb); - - __skb_pull(skb, ETH_HLEN); - - if (type == PTP_CLASS_NONE) - return false; - - if (likely(ds->ops->port_rxtstamp)) - return ds->ops->port_rxtstamp(ds, p->dp->index, skb, type); - - return false; -} - -static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *unused) -{ - struct metadata_dst *md_dst = skb_metadata_dst(skb); - struct dsa_port *cpu_dp = dev->dsa_ptr; - struct sk_buff *nskb = NULL; - struct dsa_slave_priv *p; - - if (unlikely(!cpu_dp)) { - kfree_skb(skb); - return 0; - } - - skb = skb_unshare(skb, GFP_ATOMIC); - if (!skb) - return 0; - - if (md_dst && md_dst->type == METADATA_HW_PORT_MUX) { - unsigned int port = md_dst->u.port_info.port_id; - - skb_dst_drop(skb); - if (!skb_has_extensions(skb)) - skb->slow_gro = 0; - - skb->dev = dsa_master_find_slave(dev, 0, port); - if (likely(skb->dev)) { - dsa_default_offload_fwd_mark(skb); - nskb = skb; - } - } else { - nskb = cpu_dp->rcv(skb, dev); - } - - if (!nskb) { - kfree_skb(skb); - return 0; - } - - skb = nskb; - skb_push(skb, ETH_HLEN); - skb->pkt_type = PACKET_HOST; - skb->protocol = eth_type_trans(skb, skb->dev); - - if (unlikely(!dsa_slave_dev_check(skb->dev))) { - /* Packet is to be injected directly on an upper - * device, e.g. a team/bond, so skip all DSA-port - * specific actions. - */ - netif_rx(skb); - return 0; - } - - p = netdev_priv(skb->dev); - - if (unlikely(cpu_dp->ds->untag_bridge_pvid)) { - nskb = dsa_untag_bridge_pvid(skb); - if (!nskb) { - kfree_skb(skb); - return 0; - } - skb = nskb; - } - - dev_sw_netstats_rx_add(skb->dev, skb->len); - - if (dsa_skb_defer_rx_timestamp(p, skb)) - return 0; - - gro_cells_receive(&p->gcells, skb); - - return 0; -} - #ifdef CONFIG_PM_SLEEP static bool dsa_port_is_initialized(const struct dsa_port *dp) { @@ -327,11 +105,6 @@ int dsa_switch_resume(struct dsa_switch *ds) EXPORT_SYMBOL_GPL(dsa_switch_resume); #endif -static struct packet_type dsa_pack_type __read_mostly = { - .type = cpu_to_be16(ETH_P_XDSA), - .func = dsa_switch_rcv, -}; - static struct workqueue_struct *dsa_owq; bool dsa_schedule_work(struct work_struct *work) diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index f917e695d38c..5373edf45a62 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -22,6 +22,7 @@ #include "master.h" #include "port.h" #include "slave.h" +#include "tag.h" static DEFINE_MUTEX(dsa2_mutex); LIST_HEAD(dsa_tree_list); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index fcff35b15dd4..eee2c9729e32 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -11,85 +11,8 @@ #include #include -#include "slave.h" - #define DSA_MAX_NUM_OFFLOADING_BRIDGES BITS_PER_LONG -/* Create 2 modaliases per tagging protocol, one to auto-load the module - * given the ID reported by get_tag_protocol(), and the other by name. - */ -#define DSA_TAG_DRIVER_ALIAS "dsa_tag:" -#define MODULE_ALIAS_DSA_TAG_DRIVER(__proto, __name) \ - MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __name); \ - MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS "id-" \ - __stringify(__proto##_VALUE)) - -struct dsa_tag_driver { - const struct dsa_device_ops *ops; - struct list_head list; - struct module *owner; -}; - -void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[], - unsigned int count, - struct module *owner); -void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[], - unsigned int count); - -#define dsa_tag_driver_module_drivers(__dsa_tag_drivers_array, __count) \ -static int __init dsa_tag_driver_module_init(void) \ -{ \ - dsa_tag_drivers_register(__dsa_tag_drivers_array, __count, \ - THIS_MODULE); \ - return 0; \ -} \ -module_init(dsa_tag_driver_module_init); \ - \ -static void __exit dsa_tag_driver_module_exit(void) \ -{ \ - dsa_tag_drivers_unregister(__dsa_tag_drivers_array, __count); \ -} \ -module_exit(dsa_tag_driver_module_exit) - -/** - * module_dsa_tag_drivers() - Helper macro for registering DSA tag - * drivers - * @__ops_array: Array of tag driver structures - * - * Helper macro for DSA tag drivers which do not do anything special - * in module init/exit. Each module may only use this macro once, and - * calling it replaces module_init() and module_exit(). - */ -#define module_dsa_tag_drivers(__ops_array) \ -dsa_tag_driver_module_drivers(__ops_array, ARRAY_SIZE(__ops_array)) - -#define DSA_TAG_DRIVER_NAME(__ops) dsa_tag_driver ## _ ## __ops - -/* Create a static structure we can build a linked list of dsa_tag - * drivers - */ -#define DSA_TAG_DRIVER(__ops) \ -static struct dsa_tag_driver DSA_TAG_DRIVER_NAME(__ops) = { \ - .ops = &__ops, \ -} - -/** - * module_dsa_tag_driver() - Helper macro for registering a single DSA tag - * driver - * @__ops: Single tag driver structures - * - * Helper macro for DSA tag drivers which do not do anything special - * in module init/exit. Each module may only use this macro once, and - * calling it replaces module_init() and module_exit(). - */ -#define module_dsa_tag_driver(__ops) \ -DSA_TAG_DRIVER(__ops); \ - \ -static struct dsa_tag_driver *dsa_tag_driver_array[] = { \ - &DSA_TAG_DRIVER_NAME(__ops) \ -}; \ -module_dsa_tag_drivers(dsa_tag_driver_array) - enum { DSA_NOTIFIER_AGEING_TIME, DSA_NOTIFIER_BRIDGE_JOIN, @@ -223,234 +146,15 @@ struct dsa_standalone_event_work { }; /* dsa.c */ -const struct dsa_device_ops *dsa_tag_driver_get_by_id(int tag_protocol); -const struct dsa_device_ops *dsa_tag_driver_get_by_name(const char *name); -void dsa_tag_driver_put(const struct dsa_device_ops *ops); - struct net_device *dsa_dev_to_net_device(struct device *dev); bool dsa_db_equal(const struct dsa_db *a, const struct dsa_db *b); bool dsa_schedule_work(struct work_struct *work); -const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops); - -static inline int dsa_tag_protocol_overhead(const struct dsa_device_ops *ops) -{ - return ops->needed_headroom + ops->needed_tailroom; -} - -static inline struct net_device *dsa_master_find_slave(struct net_device *dev, - int device, int port) -{ - struct dsa_port *cpu_dp = dev->dsa_ptr; - struct dsa_switch_tree *dst = cpu_dp->dst; - struct dsa_port *dp; - - list_for_each_entry(dp, &dst->ports, list) - if (dp->ds->index == device && dp->index == port && - dp->type == DSA_PORT_TYPE_USER) - return dp->slave; - - return NULL; -} /* netlink.c */ extern struct rtnl_link_ops dsa_link_ops __read_mostly; -/* If under a bridge with vlan_filtering=0, make sure to send pvid-tagged - * frames as untagged, since the bridge will not untag them. - */ -static inline struct sk_buff *dsa_untag_bridge_pvid(struct sk_buff *skb) -{ - struct dsa_port *dp = dsa_slave_to_port(skb->dev); - struct net_device *br = dsa_port_bridge_dev_get(dp); - struct net_device *dev = skb->dev; - struct net_device *upper_dev; - u16 vid, pvid, proto; - int err; - - if (!br || br_vlan_enabled(br)) - return skb; - - err = br_vlan_get_proto(br, &proto); - if (err) - return skb; - - /* Move VLAN tag from data to hwaccel */ - if (!skb_vlan_tag_present(skb) && skb->protocol == htons(proto)) { - skb = skb_vlan_untag(skb); - if (!skb) - return NULL; - } - - if (!skb_vlan_tag_present(skb)) - return skb; - - vid = skb_vlan_tag_get_id(skb); - - /* We already run under an RCU read-side critical section since - * we are called from netif_receive_skb_list_internal(). - */ - err = br_vlan_get_pvid_rcu(dev, &pvid); - if (err) - return skb; - - if (vid != pvid) - return skb; - - /* The sad part about attempting to untag from DSA is that we - * don't know, unless we check, if the skb will end up in - * the bridge's data path - br_allowed_ingress() - or not. - * For example, there might be an 8021q upper for the - * default_pvid of the bridge, which will steal VLAN-tagged traffic - * from the bridge's data path. This is a configuration that DSA - * supports because vlan_filtering is 0. In that case, we should - * definitely keep the tag, to make sure it keeps working. - */ - upper_dev = __vlan_find_dev_deep_rcu(br, htons(proto), vid); - if (upper_dev) - return skb; - - __vlan_hwaccel_clear_tag(skb); - - return skb; -} - -/* For switches without hardware support for DSA tagging to be able - * to support termination through the bridge. - */ -static inline struct net_device * -dsa_find_designated_bridge_port_by_vid(struct net_device *master, u16 vid) -{ - struct dsa_port *cpu_dp = master->dsa_ptr; - struct dsa_switch_tree *dst = cpu_dp->dst; - struct bridge_vlan_info vinfo; - struct net_device *slave; - struct dsa_port *dp; - int err; - - list_for_each_entry(dp, &dst->ports, list) { - if (dp->type != DSA_PORT_TYPE_USER) - continue; - - if (!dp->bridge) - continue; - - if (dp->stp_state != BR_STATE_LEARNING && - dp->stp_state != BR_STATE_FORWARDING) - continue; - - /* Since the bridge might learn this packet, keep the CPU port - * affinity with the port that will be used for the reply on - * xmit. - */ - if (dp->cpu_dp != cpu_dp) - continue; - - slave = dp->slave; - - err = br_vlan_get_info_rcu(slave, vid, &vinfo); - if (err) - continue; - - return slave; - } - - return NULL; -} - -/* If the ingress port offloads the bridge, we mark the frame as autonomously - * forwarded by hardware, so the software bridge doesn't forward in twice, back - * to us, because we already did. However, if we're in fallback mode and we do - * software bridging, we are not offloading it, therefore the dp->bridge - * pointer is not populated, and flooding needs to be done by software (we are - * effectively operating in standalone ports mode). - */ -static inline void dsa_default_offload_fwd_mark(struct sk_buff *skb) -{ - struct dsa_port *dp = dsa_slave_to_port(skb->dev); - - skb->offload_fwd_mark = !!(dp->bridge); -} - -/* Helper for removing DSA header tags from packets in the RX path. - * Must not be called before skb_pull(len). - * skb->data - * | - * v - * | | | | | | | | | | | | | | | | | | | - * +-----------------------+-----------------------+---------------+-------+ - * | Destination MAC | Source MAC | DSA header | EType | - * +-----------------------+-----------------------+---------------+-------+ - * | | - * <----- len -----> <----- len -----> - * | - * >>>>>>> v - * >>>>>>> | | | | | | | | | | | | | | | - * >>>>>>> +-----------------------+-----------------------+-------+ - * >>>>>>> | Destination MAC | Source MAC | EType | - * +-----------------------+-----------------------+-------+ - * ^ - * | - * skb->data - */ -static inline void dsa_strip_etype_header(struct sk_buff *skb, int len) -{ - memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - len, 2 * ETH_ALEN); -} - -/* Helper for creating space for DSA header tags in TX path packets. - * Must not be called before skb_push(len). - * - * Before: - * - * <<<<<<< | | | | | | | | | | | | | | | - * ^ <<<<<<< +-----------------------+-----------------------+-------+ - * | <<<<<<< | Destination MAC | Source MAC | EType | - * | +-----------------------+-----------------------+-------+ - * <----- len -----> - * | - * | - * skb->data - * - * After: - * - * | | | | | | | | | | | | | | | | | | | - * +-----------------------+-----------------------+---------------+-------+ - * | Destination MAC | Source MAC | DSA header | EType | - * +-----------------------+-----------------------+---------------+-------+ - * ^ | | - * | <----- len -----> - * skb->data - */ -static inline void dsa_alloc_etype_header(struct sk_buff *skb, int len) -{ - memmove(skb->data, skb->data + len, 2 * ETH_ALEN); -} - -/* On RX, eth_type_trans() on the DSA master pulls ETH_HLEN bytes starting from - * skb_mac_header(skb), which leaves skb->data pointing at the first byte after - * what the DSA master perceives as the EtherType (the beginning of the L3 - * protocol). Since DSA EtherType header taggers treat the EtherType as part of - * the DSA tag itself, and the EtherType is 2 bytes in length, the DSA header - * is located 2 bytes behind skb->data. Note that EtherType in this context - * means the first 2 bytes of the DSA header, not the encapsulated EtherType - * that will become visible after the DSA header is stripped. - */ -static inline void *dsa_etype_header_pos_rx(struct sk_buff *skb) -{ - return skb->data - 2; -} - -/* On TX, skb->data points to skb_mac_header(skb), which means that EtherType - * header taggers start exactly where the EtherType is (the EtherType is - * treated as part of the DSA header). - */ -static inline void *dsa_etype_header_pos_tx(struct sk_buff *skb) -{ - return skb->data + 2 * ETH_ALEN; -} - /* switch.c */ int dsa_switch_register_notifier(struct dsa_switch *ds); void dsa_switch_unregister_notifier(struct dsa_switch *ds); diff --git a/net/dsa/master.c b/net/dsa/master.c index 6105821834a2..e38b349ca7f8 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -14,6 +14,7 @@ #include "dsa_priv.h" #include "master.h" #include "port.h" +#include "tag.h" static int dsa_master_get_regs_len(struct net_device *dev) { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 2cf83892072f..a928aaf68804 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -26,6 +26,7 @@ #include "port.h" #include "master.h" #include "slave.h" +#include "tag.h" static void dsa_slave_standalone_event_work(struct work_struct *work) { diff --git a/net/dsa/tag.c b/net/dsa/tag.c new file mode 100644 index 000000000000..383721e167d6 --- /dev/null +++ b/net/dsa/tag.c @@ -0,0 +1,243 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * DSA tagging protocol handling + * + * Copyright (c) 2008-2009 Marvell Semiconductor + * Copyright (c) 2013 Florian Fainelli + * Copyright (c) 2016 Andrew Lunn + */ + +#include +#include +#include +#include +#include + +#include "slave.h" +#include "tag.h" + +static LIST_HEAD(dsa_tag_drivers_list); +static DEFINE_MUTEX(dsa_tag_drivers_lock); + +/* Determine if we should defer delivery of skb until we have a rx timestamp. + * + * Called from dsa_switch_rcv. For now, this will only work if tagging is + * enabled on the switch. Normally the MAC driver would retrieve the hardware + * timestamp when it reads the packet out of the hardware. However in a DSA + * switch, the DSA driver owning the interface to which the packet is + * delivered is never notified unless we do so here. + */ +static bool dsa_skb_defer_rx_timestamp(struct dsa_slave_priv *p, + struct sk_buff *skb) +{ + struct dsa_switch *ds = p->dp->ds; + unsigned int type; + + if (skb_headroom(skb) < ETH_HLEN) + return false; + + __skb_push(skb, ETH_HLEN); + + type = ptp_classify_raw(skb); + + __skb_pull(skb, ETH_HLEN); + + if (type == PTP_CLASS_NONE) + return false; + + if (likely(ds->ops->port_rxtstamp)) + return ds->ops->port_rxtstamp(ds, p->dp->index, skb, type); + + return false; +} + +static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *unused) +{ + struct metadata_dst *md_dst = skb_metadata_dst(skb); + struct dsa_port *cpu_dp = dev->dsa_ptr; + struct sk_buff *nskb = NULL; + struct dsa_slave_priv *p; + + if (unlikely(!cpu_dp)) { + kfree_skb(skb); + return 0; + } + + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) + return 0; + + if (md_dst && md_dst->type == METADATA_HW_PORT_MUX) { + unsigned int port = md_dst->u.port_info.port_id; + + skb_dst_drop(skb); + if (!skb_has_extensions(skb)) + skb->slow_gro = 0; + + skb->dev = dsa_master_find_slave(dev, 0, port); + if (likely(skb->dev)) { + dsa_default_offload_fwd_mark(skb); + nskb = skb; + } + } else { + nskb = cpu_dp->rcv(skb, dev); + } + + if (!nskb) { + kfree_skb(skb); + return 0; + } + + skb = nskb; + skb_push(skb, ETH_HLEN); + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, skb->dev); + + if (unlikely(!dsa_slave_dev_check(skb->dev))) { + /* Packet is to be injected directly on an upper + * device, e.g. a team/bond, so skip all DSA-port + * specific actions. + */ + netif_rx(skb); + return 0; + } + + p = netdev_priv(skb->dev); + + if (unlikely(cpu_dp->ds->untag_bridge_pvid)) { + nskb = dsa_untag_bridge_pvid(skb); + if (!nskb) { + kfree_skb(skb); + return 0; + } + skb = nskb; + } + + dev_sw_netstats_rx_add(skb->dev, skb->len); + + if (dsa_skb_defer_rx_timestamp(p, skb)) + return 0; + + gro_cells_receive(&p->gcells, skb); + + return 0; +} + +struct packet_type dsa_pack_type __read_mostly = { + .type = cpu_to_be16(ETH_P_XDSA), + .func = dsa_switch_rcv, +}; + +static void dsa_tag_driver_register(struct dsa_tag_driver *dsa_tag_driver, + struct module *owner) +{ + dsa_tag_driver->owner = owner; + + mutex_lock(&dsa_tag_drivers_lock); + list_add_tail(&dsa_tag_driver->list, &dsa_tag_drivers_list); + mutex_unlock(&dsa_tag_drivers_lock); +} + +void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[], + unsigned int count, struct module *owner) +{ + unsigned int i; + + for (i = 0; i < count; i++) + dsa_tag_driver_register(dsa_tag_driver_array[i], owner); +} + +static void dsa_tag_driver_unregister(struct dsa_tag_driver *dsa_tag_driver) +{ + mutex_lock(&dsa_tag_drivers_lock); + list_del(&dsa_tag_driver->list); + mutex_unlock(&dsa_tag_drivers_lock); +} +EXPORT_SYMBOL_GPL(dsa_tag_drivers_register); + +void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[], + unsigned int count) +{ + unsigned int i; + + for (i = 0; i < count; i++) + dsa_tag_driver_unregister(dsa_tag_driver_array[i]); +} +EXPORT_SYMBOL_GPL(dsa_tag_drivers_unregister); + +const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops) +{ + return ops->name; +}; + +/* Function takes a reference on the module owning the tagger, + * so dsa_tag_driver_put must be called afterwards. + */ +const struct dsa_device_ops *dsa_tag_driver_get_by_name(const char *name) +{ + const struct dsa_device_ops *ops = ERR_PTR(-ENOPROTOOPT); + struct dsa_tag_driver *dsa_tag_driver; + + request_module("%s%s", DSA_TAG_DRIVER_ALIAS, name); + + mutex_lock(&dsa_tag_drivers_lock); + list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) { + const struct dsa_device_ops *tmp = dsa_tag_driver->ops; + + if (strcmp(name, tmp->name)) + continue; + + if (!try_module_get(dsa_tag_driver->owner)) + break; + + ops = tmp; + break; + } + mutex_unlock(&dsa_tag_drivers_lock); + + return ops; +} + +const struct dsa_device_ops *dsa_tag_driver_get_by_id(int tag_protocol) +{ + struct dsa_tag_driver *dsa_tag_driver; + const struct dsa_device_ops *ops; + bool found = false; + + request_module("%sid-%d", DSA_TAG_DRIVER_ALIAS, tag_protocol); + + mutex_lock(&dsa_tag_drivers_lock); + list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) { + ops = dsa_tag_driver->ops; + if (ops->proto == tag_protocol) { + found = true; + break; + } + } + + if (found) { + if (!try_module_get(dsa_tag_driver->owner)) + ops = ERR_PTR(-ENOPROTOOPT); + } else { + ops = ERR_PTR(-ENOPROTOOPT); + } + + mutex_unlock(&dsa_tag_drivers_lock); + + return ops; +} + +void dsa_tag_driver_put(const struct dsa_device_ops *ops) +{ + struct dsa_tag_driver *dsa_tag_driver; + + mutex_lock(&dsa_tag_drivers_lock); + list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) { + if (dsa_tag_driver->ops == ops) { + module_put(dsa_tag_driver->owner); + break; + } + } + mutex_unlock(&dsa_tag_drivers_lock); +} diff --git a/net/dsa/tag.h b/net/dsa/tag.h new file mode 100644 index 000000000000..7cfbca824f1c --- /dev/null +++ b/net/dsa/tag.h @@ -0,0 +1,310 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef __DSA_TAG_H +#define __DSA_TAG_H + +#include +#include +#include +#include + +#include "port.h" +#include "slave.h" + +struct dsa_tag_driver { + const struct dsa_device_ops *ops; + struct list_head list; + struct module *owner; +}; + +extern struct packet_type dsa_pack_type; + +const struct dsa_device_ops *dsa_tag_driver_get_by_id(int tag_protocol); +const struct dsa_device_ops *dsa_tag_driver_get_by_name(const char *name); +void dsa_tag_driver_put(const struct dsa_device_ops *ops); +const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops); + +static inline int dsa_tag_protocol_overhead(const struct dsa_device_ops *ops) +{ + return ops->needed_headroom + ops->needed_tailroom; +} + +static inline struct net_device *dsa_master_find_slave(struct net_device *dev, + int device, int port) +{ + struct dsa_port *cpu_dp = dev->dsa_ptr; + struct dsa_switch_tree *dst = cpu_dp->dst; + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) + if (dp->ds->index == device && dp->index == port && + dp->type == DSA_PORT_TYPE_USER) + return dp->slave; + + return NULL; +} + +/* If under a bridge with vlan_filtering=0, make sure to send pvid-tagged + * frames as untagged, since the bridge will not untag them. + */ +static inline struct sk_buff *dsa_untag_bridge_pvid(struct sk_buff *skb) +{ + struct dsa_port *dp = dsa_slave_to_port(skb->dev); + struct net_device *br = dsa_port_bridge_dev_get(dp); + struct net_device *dev = skb->dev; + struct net_device *upper_dev; + u16 vid, pvid, proto; + int err; + + if (!br || br_vlan_enabled(br)) + return skb; + + err = br_vlan_get_proto(br, &proto); + if (err) + return skb; + + /* Move VLAN tag from data to hwaccel */ + if (!skb_vlan_tag_present(skb) && skb->protocol == htons(proto)) { + skb = skb_vlan_untag(skb); + if (!skb) + return NULL; + } + + if (!skb_vlan_tag_present(skb)) + return skb; + + vid = skb_vlan_tag_get_id(skb); + + /* We already run under an RCU read-side critical section since + * we are called from netif_receive_skb_list_internal(). + */ + err = br_vlan_get_pvid_rcu(dev, &pvid); + if (err) + return skb; + + if (vid != pvid) + return skb; + + /* The sad part about attempting to untag from DSA is that we + * don't know, unless we check, if the skb will end up in + * the bridge's data path - br_allowed_ingress() - or not. + * For example, there might be an 8021q upper for the + * default_pvid of the bridge, which will steal VLAN-tagged traffic + * from the bridge's data path. This is a configuration that DSA + * supports because vlan_filtering is 0. In that case, we should + * definitely keep the tag, to make sure it keeps working. + */ + upper_dev = __vlan_find_dev_deep_rcu(br, htons(proto), vid); + if (upper_dev) + return skb; + + __vlan_hwaccel_clear_tag(skb); + + return skb; +} + +/* For switches without hardware support for DSA tagging to be able + * to support termination through the bridge. + */ +static inline struct net_device * +dsa_find_designated_bridge_port_by_vid(struct net_device *master, u16 vid) +{ + struct dsa_port *cpu_dp = master->dsa_ptr; + struct dsa_switch_tree *dst = cpu_dp->dst; + struct bridge_vlan_info vinfo; + struct net_device *slave; + struct dsa_port *dp; + int err; + + list_for_each_entry(dp, &dst->ports, list) { + if (dp->type != DSA_PORT_TYPE_USER) + continue; + + if (!dp->bridge) + continue; + + if (dp->stp_state != BR_STATE_LEARNING && + dp->stp_state != BR_STATE_FORWARDING) + continue; + + /* Since the bridge might learn this packet, keep the CPU port + * affinity with the port that will be used for the reply on + * xmit. + */ + if (dp->cpu_dp != cpu_dp) + continue; + + slave = dp->slave; + + err = br_vlan_get_info_rcu(slave, vid, &vinfo); + if (err) + continue; + + return slave; + } + + return NULL; +} + +/* If the ingress port offloads the bridge, we mark the frame as autonomously + * forwarded by hardware, so the software bridge doesn't forward in twice, back + * to us, because we already did. However, if we're in fallback mode and we do + * software bridging, we are not offloading it, therefore the dp->bridge + * pointer is not populated, and flooding needs to be done by software (we are + * effectively operating in standalone ports mode). + */ +static inline void dsa_default_offload_fwd_mark(struct sk_buff *skb) +{ + struct dsa_port *dp = dsa_slave_to_port(skb->dev); + + skb->offload_fwd_mark = !!(dp->bridge); +} + +/* Helper for removing DSA header tags from packets in the RX path. + * Must not be called before skb_pull(len). + * skb->data + * | + * v + * | | | | | | | | | | | | | | | | | | | + * +-----------------------+-----------------------+---------------+-------+ + * | Destination MAC | Source MAC | DSA header | EType | + * +-----------------------+-----------------------+---------------+-------+ + * | | + * <----- len -----> <----- len -----> + * | + * >>>>>>> v + * >>>>>>> | | | | | | | | | | | | | | | + * >>>>>>> +-----------------------+-----------------------+-------+ + * >>>>>>> | Destination MAC | Source MAC | EType | + * +-----------------------+-----------------------+-------+ + * ^ + * | + * skb->data + */ +static inline void dsa_strip_etype_header(struct sk_buff *skb, int len) +{ + memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - len, 2 * ETH_ALEN); +} + +/* Helper for creating space for DSA header tags in TX path packets. + * Must not be called before skb_push(len). + * + * Before: + * + * <<<<<<< | | | | | | | | | | | | | | | + * ^ <<<<<<< +-----------------------+-----------------------+-------+ + * | <<<<<<< | Destination MAC | Source MAC | EType | + * | +-----------------------+-----------------------+-------+ + * <----- len -----> + * | + * | + * skb->data + * + * After: + * + * | | | | | | | | | | | | | | | | | | | + * +-----------------------+-----------------------+---------------+-------+ + * | Destination MAC | Source MAC | DSA header | EType | + * +-----------------------+-----------------------+---------------+-------+ + * ^ | | + * | <----- len -----> + * skb->data + */ +static inline void dsa_alloc_etype_header(struct sk_buff *skb, int len) +{ + memmove(skb->data, skb->data + len, 2 * ETH_ALEN); +} + +/* On RX, eth_type_trans() on the DSA master pulls ETH_HLEN bytes starting from + * skb_mac_header(skb), which leaves skb->data pointing at the first byte after + * what the DSA master perceives as the EtherType (the beginning of the L3 + * protocol). Since DSA EtherType header taggers treat the EtherType as part of + * the DSA tag itself, and the EtherType is 2 bytes in length, the DSA header + * is located 2 bytes behind skb->data. Note that EtherType in this context + * means the first 2 bytes of the DSA header, not the encapsulated EtherType + * that will become visible after the DSA header is stripped. + */ +static inline void *dsa_etype_header_pos_rx(struct sk_buff *skb) +{ + return skb->data - 2; +} + +/* On TX, skb->data points to skb_mac_header(skb), which means that EtherType + * header taggers start exactly where the EtherType is (the EtherType is + * treated as part of the DSA header). + */ +static inline void *dsa_etype_header_pos_tx(struct sk_buff *skb) +{ + return skb->data + 2 * ETH_ALEN; +} + +/* Create 2 modaliases per tagging protocol, one to auto-load the module + * given the ID reported by get_tag_protocol(), and the other by name. + */ +#define DSA_TAG_DRIVER_ALIAS "dsa_tag:" +#define MODULE_ALIAS_DSA_TAG_DRIVER(__proto, __name) \ + MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __name); \ + MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS "id-" \ + __stringify(__proto##_VALUE)) + +void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[], + unsigned int count, + struct module *owner); +void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[], + unsigned int count); + +#define dsa_tag_driver_module_drivers(__dsa_tag_drivers_array, __count) \ +static int __init dsa_tag_driver_module_init(void) \ +{ \ + dsa_tag_drivers_register(__dsa_tag_drivers_array, __count, \ + THIS_MODULE); \ + return 0; \ +} \ +module_init(dsa_tag_driver_module_init); \ + \ +static void __exit dsa_tag_driver_module_exit(void) \ +{ \ + dsa_tag_drivers_unregister(__dsa_tag_drivers_array, __count); \ +} \ +module_exit(dsa_tag_driver_module_exit) + +/** + * module_dsa_tag_drivers() - Helper macro for registering DSA tag + * drivers + * @__ops_array: Array of tag driver structures + * + * Helper macro for DSA tag drivers which do not do anything special + * in module init/exit. Each module may only use this macro once, and + * calling it replaces module_init() and module_exit(). + */ +#define module_dsa_tag_drivers(__ops_array) \ +dsa_tag_driver_module_drivers(__ops_array, ARRAY_SIZE(__ops_array)) + +#define DSA_TAG_DRIVER_NAME(__ops) dsa_tag_driver ## _ ## __ops + +/* Create a static structure we can build a linked list of dsa_tag + * drivers + */ +#define DSA_TAG_DRIVER(__ops) \ +static struct dsa_tag_driver DSA_TAG_DRIVER_NAME(__ops) = { \ + .ops = &__ops, \ +} + +/** + * module_dsa_tag_driver() - Helper macro for registering a single DSA tag + * driver + * @__ops: Single tag driver structures + * + * Helper macro for DSA tag drivers which do not do anything special + * in module init/exit. Each module may only use this macro once, and + * calling it replaces module_init() and module_exit(). + */ +#define module_dsa_tag_driver(__ops) \ +DSA_TAG_DRIVER(__ops); \ + \ +static struct dsa_tag_driver *dsa_tag_driver_array[] = { \ + &DSA_TAG_DRIVER_NAME(__ops) \ +}; \ +module_dsa_tag_drivers(dsa_tag_driver_array) + +#endif diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index a6617d7b692a..ee5dd1a54b51 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -9,6 +9,7 @@ #include "dsa_priv.h" #include "port.h" +#include "tag.h" /* Binary structure of the fake 12-bit VID field (when the TPID is * ETH_P_DSA_8021Q): diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c index bfa161a4f502..7f3b7d730b85 100644 --- a/net/dsa/tag_ar9331.c +++ b/net/dsa/tag_ar9331.c @@ -7,7 +7,7 @@ #include #include -#include "dsa_priv.h" +#include "tag.h" #define AR9331_NAME "ar9331" diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 9e7477ed70f1..10239daa5745 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -10,7 +10,7 @@ #include #include -#include "dsa_priv.h" +#include "tag.h" #define BRCM_NAME "brcm" #define BRCM_LEGACY_NAME "brcm-legacy" diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index 9fe77f5cc759..1fd7fa26db64 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -50,7 +50,7 @@ #include #include -#include "dsa_priv.h" +#include "tag.h" #define DSA_NAME "dsa" #define EDSA_NAME "edsa" diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c index 020050dff3e4..e279cd9057b0 100644 --- a/net/dsa/tag_gswip.c +++ b/net/dsa/tag_gswip.c @@ -10,7 +10,7 @@ #include #include -#include "dsa_priv.h" +#include "tag.h" #define GSWIP_NAME "gswip" diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c index 03fd5f2877c8..a047041e7686 100644 --- a/net/dsa/tag_hellcreek.c +++ b/net/dsa/tag_hellcreek.c @@ -12,6 +12,7 @@ #include #include "dsa_priv.h" +#include "tag.h" #define HELLCREEK_NAME "hellcreek" diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 37db5156f9a3..0f6ae143afc9 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -7,7 +7,8 @@ #include #include #include -#include "dsa_priv.h" + +#include "tag.h" #define KSZ8795_NAME "ksz8795" #define KSZ9477_NAME "ksz9477" diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index 4118292ed218..c25f5536706b 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -7,7 +7,7 @@ #include #include -#include "dsa_priv.h" +#include "tag.h" /* To define the outgoing port and to discover the incoming port a regular * VLAN tag is used by the LAN9303. But its VID meaning is 'special': diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index 8948c4f99f8e..40af80452747 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -8,7 +8,7 @@ #include #include -#include "dsa_priv.h" +#include "tag.h" #define MTK_NAME "mtk" diff --git a/net/dsa/tag_none.c b/net/dsa/tag_none.c index 34a13c50d245..d2fd179c4227 100644 --- a/net/dsa/tag_none.c +++ b/net/dsa/tag_none.c @@ -8,7 +8,7 @@ * tagging support, look at tag_8021q.c instead. */ -#include "dsa_priv.h" +#include "tag.h" #define NONE_NAME "none" diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index 8cc31ab47e28..28ebecafdd24 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -2,7 +2,8 @@ /* Copyright 2019 NXP */ #include -#include "dsa_priv.h" + +#include "tag.h" #define OCELOT_NAME "ocelot" #define SEVILLE_NAME "seville" diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index d1ec68001487..7f0c2d71e89b 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -10,7 +10,8 @@ */ #include #include -#include "dsa_priv.h" + +#include "tag.h" #define OCELOT_8021Q_NAME "ocelot-8021q" diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 73d6e111228d..e757c8de06f1 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -8,7 +8,7 @@ #include #include -#include "dsa_priv.h" +#include "tag.h" #define QCA_NAME "qca" diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c index 18b52d77d200..c327314b95e3 100644 --- a/net/dsa/tag_rtl4_a.c +++ b/net/dsa/tag_rtl4_a.c @@ -18,7 +18,7 @@ #include #include -#include "dsa_priv.h" +#include "tag.h" #define RTL4_A_NAME "rtl4a" diff --git a/net/dsa/tag_rtl8_4.c b/net/dsa/tag_rtl8_4.c index 030a8cf0ad48..4f67834fd121 100644 --- a/net/dsa/tag_rtl8_4.c +++ b/net/dsa/tag_rtl8_4.c @@ -77,7 +77,7 @@ #include #include -#include "dsa_priv.h" +#include "tag.h" /* Protocols supported: * diff --git a/net/dsa/tag_rzn1_a5psw.c b/net/dsa/tag_rzn1_a5psw.c index b9135069f9fc..437a6820ac42 100644 --- a/net/dsa/tag_rzn1_a5psw.c +++ b/net/dsa/tag_rzn1_a5psw.c @@ -10,7 +10,7 @@ #include #include -#include "dsa_priv.h" +#include "tag.h" /* To define the outgoing port and to discover the incoming port a TAG is * inserted after Src MAC : diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 3b6e642a90e9..8f581617e15c 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -5,7 +5,8 @@ #include #include #include -#include "dsa_priv.h" + +#include "tag.h" #define SJA1105_NAME "sja1105" #define SJA1110_NAME "sja1110" diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 8754dfe680f6..7361b9106382 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -8,7 +8,7 @@ #include #include -#include "dsa_priv.h" +#include "tag.h" #define TRAILER_NAME "trailer" diff --git a/net/dsa/tag_xrs700x.c b/net/dsa/tag_xrs700x.c index dc935dd90f98..af19969f9bc4 100644 --- a/net/dsa/tag_xrs700x.c +++ b/net/dsa/tag_xrs700x.c @@ -7,7 +7,7 @@ #include -#include "dsa_priv.h" +#include "tag.h" #define XRS700X_NAME "xrs700x" -- cgit v1.2.3-70-g09d2 From 0c603136e1e0868fb5325c3b2addc669a10ed384 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Nov 2022 15:55:48 +0200 Subject: net: dsa: move headers exported by switch.c to switch.h Reduce code bloat in dsa_priv.h by moving the prototypes exported by switch.h into their own header file. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- net/dsa/dsa2.c | 1 + net/dsa/dsa_priv.h | 4 ---- net/dsa/port.c | 1 + net/dsa/switch.c | 1 + net/dsa/switch.h | 11 +++++++++++ net/dsa/tag_8021q.c | 1 + 6 files changed, 15 insertions(+), 4 deletions(-) create mode 100644 net/dsa/switch.h (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 5373edf45a62..7c6b689267d0 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -22,6 +22,7 @@ #include "master.h" #include "port.h" #include "slave.h" +#include "switch.h" #include "tag.h" static DEFINE_MUTEX(dsa2_mutex); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index eee2c9729e32..4f21228c6f52 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -155,10 +155,6 @@ bool dsa_schedule_work(struct work_struct *work); /* netlink.c */ extern struct rtnl_link_ops dsa_link_ops __read_mostly; -/* switch.c */ -int dsa_switch_register_notifier(struct dsa_switch *ds); -void dsa_switch_unregister_notifier(struct dsa_switch *ds); - static inline bool dsa_switch_supports_uc_filtering(struct dsa_switch *ds) { return ds->ops->port_fdb_add && ds->ops->port_fdb_del && diff --git a/net/dsa/port.c b/net/dsa/port.c index 56728242f079..bf2c98215021 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -15,6 +15,7 @@ #include "dsa_priv.h" #include "port.h" #include "slave.h" +#include "switch.h" /** * dsa_port_notify - Notify the switching fabric of changes to a port diff --git a/net/dsa/switch.c b/net/dsa/switch.c index d0d5a1c7e6f6..6a1c84c5ec8b 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -15,6 +15,7 @@ #include "dsa_priv.h" #include "port.h" #include "slave.h" +#include "switch.h" static unsigned int dsa_switch_fastest_ageing_time(struct dsa_switch *ds, unsigned int ageing_time) diff --git a/net/dsa/switch.h b/net/dsa/switch.h new file mode 100644 index 000000000000..b831b6fb45e9 --- /dev/null +++ b/net/dsa/switch.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef __DSA_SWITCH_H +#define __DSA_SWITCH_H + +struct dsa_switch; + +int dsa_switch_register_notifier(struct dsa_switch *ds); +void dsa_switch_unregister_notifier(struct dsa_switch *ds); + +#endif diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index ee5dd1a54b51..abd994dc76d5 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -9,6 +9,7 @@ #include "dsa_priv.h" #include "port.h" +#include "switch.h" #include "tag.h" /* Binary structure of the fake 12-bit VID field (when the TPID is -- cgit v1.2.3-70-g09d2 From 6dbdfce7735786f9f2dd3af615c8a03ffa1246f5 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Nov 2022 15:55:49 +0200 Subject: net: dsa: move dsa_tree_notify() and dsa_broadcast() to switch.c There isn't an intuitive place for these 2 cross-chip notifier functions according to the function-to-file classification based on names (dsa_switch_*() goes to switch.c), but I consider these to be part of the cross-chip notifier handling, therefore part of switch.c. Move them there to reduce bloat in dsa2.c (the place where all code with no better place to go goes). Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- net/dsa/dsa2.c | 46 ---------------------------------------------- net/dsa/dsa_priv.h | 2 -- net/dsa/switch.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ net/dsa/switch.h | 4 ++++ 4 files changed, 50 insertions(+), 48 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 7c6b689267d0..7a314c8b3aaa 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -31,52 +31,6 @@ LIST_HEAD(dsa_tree_list); /* Track the bridges with forwarding offload enabled */ static unsigned long dsa_fwd_offloading_bridges; -/** - * dsa_tree_notify - Execute code for all switches in a DSA switch tree. - * @dst: collection of struct dsa_switch devices to notify. - * @e: event, must be of type DSA_NOTIFIER_* - * @v: event-specific value. - * - * Given a struct dsa_switch_tree, this can be used to run a function once for - * each member DSA switch. The other alternative of traversing the tree is only - * through its ports list, which does not uniquely list the switches. - */ -int dsa_tree_notify(struct dsa_switch_tree *dst, unsigned long e, void *v) -{ - struct raw_notifier_head *nh = &dst->nh; - int err; - - err = raw_notifier_call_chain(nh, e, v); - - return notifier_to_errno(err); -} - -/** - * dsa_broadcast - Notify all DSA trees in the system. - * @e: event, must be of type DSA_NOTIFIER_* - * @v: event-specific value. - * - * Can be used to notify the switching fabric of events such as cross-chip - * bridging between disjoint trees (such as islands of tagger-compatible - * switches bridged by an incompatible middle switch). - * - * WARNING: this function is not reliable during probe time, because probing - * between trees is asynchronous and not all DSA trees might have probed. - */ -int dsa_broadcast(unsigned long e, void *v) -{ - struct dsa_switch_tree *dst; - int err = 0; - - list_for_each_entry(dst, &dsa_tree_list, list) { - err = dsa_tree_notify(dst, e, v); - if (err) - break; - } - - return err; -} - /** * dsa_lag_map() - Map LAG structure to a linear LAG array * @dst: Tree in which to record the mapping. diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 4f21228c6f52..8cf8608344f5 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -175,8 +175,6 @@ void dsa_lag_unmap(struct dsa_switch_tree *dst, struct dsa_lag *lag); struct dsa_lag *dsa_tree_lag_find(struct dsa_switch_tree *dst, const struct net_device *lag_dev); struct net_device *dsa_tree_find_first_master(struct dsa_switch_tree *dst); -int dsa_tree_notify(struct dsa_switch_tree *dst, unsigned long e, void *v); -int dsa_broadcast(unsigned long e, void *v); int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, const struct dsa_device_ops *tag_ops, const struct dsa_device_ops *old_tag_ops); diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 6a1c84c5ec8b..b534116dc519 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -1016,6 +1016,52 @@ static int dsa_switch_event(struct notifier_block *nb, return notifier_from_errno(err); } +/** + * dsa_tree_notify - Execute code for all switches in a DSA switch tree. + * @dst: collection of struct dsa_switch devices to notify. + * @e: event, must be of type DSA_NOTIFIER_* + * @v: event-specific value. + * + * Given a struct dsa_switch_tree, this can be used to run a function once for + * each member DSA switch. The other alternative of traversing the tree is only + * through its ports list, which does not uniquely list the switches. + */ +int dsa_tree_notify(struct dsa_switch_tree *dst, unsigned long e, void *v) +{ + struct raw_notifier_head *nh = &dst->nh; + int err; + + err = raw_notifier_call_chain(nh, e, v); + + return notifier_to_errno(err); +} + +/** + * dsa_broadcast - Notify all DSA trees in the system. + * @e: event, must be of type DSA_NOTIFIER_* + * @v: event-specific value. + * + * Can be used to notify the switching fabric of events such as cross-chip + * bridging between disjoint trees (such as islands of tagger-compatible + * switches bridged by an incompatible middle switch). + * + * WARNING: this function is not reliable during probe time, because probing + * between trees is asynchronous and not all DSA trees might have probed. + */ +int dsa_broadcast(unsigned long e, void *v) +{ + struct dsa_switch_tree *dst; + int err = 0; + + list_for_each_entry(dst, &dsa_tree_list, list) { + err = dsa_tree_notify(dst, e, v); + if (err) + break; + } + + return err; +} + int dsa_switch_register_notifier(struct dsa_switch *ds) { ds->nb.notifier_call = dsa_switch_event; diff --git a/net/dsa/switch.h b/net/dsa/switch.h index b831b6fb45e9..b2fd496bc56f 100644 --- a/net/dsa/switch.h +++ b/net/dsa/switch.h @@ -3,8 +3,12 @@ #ifndef __DSA_SWITCH_H #define __DSA_SWITCH_H +struct dsa_switch_tree; struct dsa_switch; +int dsa_tree_notify(struct dsa_switch_tree *dst, unsigned long e, void *v); +int dsa_broadcast(unsigned long e, void *v); + int dsa_switch_register_notifier(struct dsa_switch *ds); void dsa_switch_unregister_notifier(struct dsa_switch *ds); -- cgit v1.2.3-70-g09d2 From 495550a4844bb4b4d10e26a7d22e9e231adb1b84 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Nov 2022 15:55:50 +0200 Subject: net: dsa: move notifier definitions to switch.h Reduce bloat in dsa_priv.h by moving the cross-chip notifier data structures to switch.h. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- net/dsa/dsa_priv.h | 104 +------------------------------------------------- net/dsa/switch.h | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 108 insertions(+), 105 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 8cf8608344f5..e5d421cdaa8f 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -13,109 +13,7 @@ #define DSA_MAX_NUM_OFFLOADING_BRIDGES BITS_PER_LONG -enum { - DSA_NOTIFIER_AGEING_TIME, - DSA_NOTIFIER_BRIDGE_JOIN, - DSA_NOTIFIER_BRIDGE_LEAVE, - DSA_NOTIFIER_FDB_ADD, - DSA_NOTIFIER_FDB_DEL, - DSA_NOTIFIER_HOST_FDB_ADD, - DSA_NOTIFIER_HOST_FDB_DEL, - DSA_NOTIFIER_LAG_FDB_ADD, - DSA_NOTIFIER_LAG_FDB_DEL, - DSA_NOTIFIER_LAG_CHANGE, - DSA_NOTIFIER_LAG_JOIN, - DSA_NOTIFIER_LAG_LEAVE, - DSA_NOTIFIER_MDB_ADD, - DSA_NOTIFIER_MDB_DEL, - DSA_NOTIFIER_HOST_MDB_ADD, - DSA_NOTIFIER_HOST_MDB_DEL, - DSA_NOTIFIER_VLAN_ADD, - DSA_NOTIFIER_VLAN_DEL, - DSA_NOTIFIER_HOST_VLAN_ADD, - DSA_NOTIFIER_HOST_VLAN_DEL, - DSA_NOTIFIER_MTU, - DSA_NOTIFIER_TAG_PROTO, - DSA_NOTIFIER_TAG_PROTO_CONNECT, - DSA_NOTIFIER_TAG_PROTO_DISCONNECT, - DSA_NOTIFIER_TAG_8021Q_VLAN_ADD, - DSA_NOTIFIER_TAG_8021Q_VLAN_DEL, - DSA_NOTIFIER_MASTER_STATE_CHANGE, -}; - -/* DSA_NOTIFIER_AGEING_TIME */ -struct dsa_notifier_ageing_time_info { - unsigned int ageing_time; -}; - -/* DSA_NOTIFIER_BRIDGE_* */ -struct dsa_notifier_bridge_info { - const struct dsa_port *dp; - struct dsa_bridge bridge; - bool tx_fwd_offload; - struct netlink_ext_ack *extack; -}; - -/* DSA_NOTIFIER_FDB_* */ -struct dsa_notifier_fdb_info { - const struct dsa_port *dp; - const unsigned char *addr; - u16 vid; - struct dsa_db db; -}; - -/* DSA_NOTIFIER_LAG_FDB_* */ -struct dsa_notifier_lag_fdb_info { - struct dsa_lag *lag; - const unsigned char *addr; - u16 vid; - struct dsa_db db; -}; - -/* DSA_NOTIFIER_MDB_* */ -struct dsa_notifier_mdb_info { - const struct dsa_port *dp; - const struct switchdev_obj_port_mdb *mdb; - struct dsa_db db; -}; - -/* DSA_NOTIFIER_LAG_* */ -struct dsa_notifier_lag_info { - const struct dsa_port *dp; - struct dsa_lag lag; - struct netdev_lag_upper_info *info; - struct netlink_ext_ack *extack; -}; - -/* DSA_NOTIFIER_VLAN_* */ -struct dsa_notifier_vlan_info { - const struct dsa_port *dp; - const struct switchdev_obj_port_vlan *vlan; - struct netlink_ext_ack *extack; -}; - -/* DSA_NOTIFIER_MTU */ -struct dsa_notifier_mtu_info { - const struct dsa_port *dp; - int mtu; -}; - -/* DSA_NOTIFIER_TAG_PROTO_* */ -struct dsa_notifier_tag_proto_info { - const struct dsa_device_ops *tag_ops; -}; - -/* DSA_NOTIFIER_TAG_8021Q_VLAN_* */ -struct dsa_notifier_tag_8021q_vlan_info { - const struct dsa_port *dp; - u16 vid; -}; - -/* DSA_NOTIFIER_MASTER_STATE_CHANGE */ -struct dsa_notifier_master_state_info { - const struct net_device *master; - bool operational; -}; +struct dsa_notifier_tag_8021q_vlan_info; struct dsa_switchdev_event_work { struct net_device *dev; diff --git a/net/dsa/switch.h b/net/dsa/switch.h index b2fd496bc56f..15e67b95eb6e 100644 --- a/net/dsa/switch.h +++ b/net/dsa/switch.h @@ -3,8 +3,113 @@ #ifndef __DSA_SWITCH_H #define __DSA_SWITCH_H -struct dsa_switch_tree; -struct dsa_switch; +#include + +struct netlink_ext_ack; + +enum { + DSA_NOTIFIER_AGEING_TIME, + DSA_NOTIFIER_BRIDGE_JOIN, + DSA_NOTIFIER_BRIDGE_LEAVE, + DSA_NOTIFIER_FDB_ADD, + DSA_NOTIFIER_FDB_DEL, + DSA_NOTIFIER_HOST_FDB_ADD, + DSA_NOTIFIER_HOST_FDB_DEL, + DSA_NOTIFIER_LAG_FDB_ADD, + DSA_NOTIFIER_LAG_FDB_DEL, + DSA_NOTIFIER_LAG_CHANGE, + DSA_NOTIFIER_LAG_JOIN, + DSA_NOTIFIER_LAG_LEAVE, + DSA_NOTIFIER_MDB_ADD, + DSA_NOTIFIER_MDB_DEL, + DSA_NOTIFIER_HOST_MDB_ADD, + DSA_NOTIFIER_HOST_MDB_DEL, + DSA_NOTIFIER_VLAN_ADD, + DSA_NOTIFIER_VLAN_DEL, + DSA_NOTIFIER_HOST_VLAN_ADD, + DSA_NOTIFIER_HOST_VLAN_DEL, + DSA_NOTIFIER_MTU, + DSA_NOTIFIER_TAG_PROTO, + DSA_NOTIFIER_TAG_PROTO_CONNECT, + DSA_NOTIFIER_TAG_PROTO_DISCONNECT, + DSA_NOTIFIER_TAG_8021Q_VLAN_ADD, + DSA_NOTIFIER_TAG_8021Q_VLAN_DEL, + DSA_NOTIFIER_MASTER_STATE_CHANGE, +}; + +/* DSA_NOTIFIER_AGEING_TIME */ +struct dsa_notifier_ageing_time_info { + unsigned int ageing_time; +}; + +/* DSA_NOTIFIER_BRIDGE_* */ +struct dsa_notifier_bridge_info { + const struct dsa_port *dp; + struct dsa_bridge bridge; + bool tx_fwd_offload; + struct netlink_ext_ack *extack; +}; + +/* DSA_NOTIFIER_FDB_* */ +struct dsa_notifier_fdb_info { + const struct dsa_port *dp; + const unsigned char *addr; + u16 vid; + struct dsa_db db; +}; + +/* DSA_NOTIFIER_LAG_FDB_* */ +struct dsa_notifier_lag_fdb_info { + struct dsa_lag *lag; + const unsigned char *addr; + u16 vid; + struct dsa_db db; +}; + +/* DSA_NOTIFIER_MDB_* */ +struct dsa_notifier_mdb_info { + const struct dsa_port *dp; + const struct switchdev_obj_port_mdb *mdb; + struct dsa_db db; +}; + +/* DSA_NOTIFIER_LAG_* */ +struct dsa_notifier_lag_info { + const struct dsa_port *dp; + struct dsa_lag lag; + struct netdev_lag_upper_info *info; + struct netlink_ext_ack *extack; +}; + +/* DSA_NOTIFIER_VLAN_* */ +struct dsa_notifier_vlan_info { + const struct dsa_port *dp; + const struct switchdev_obj_port_vlan *vlan; + struct netlink_ext_ack *extack; +}; + +/* DSA_NOTIFIER_MTU */ +struct dsa_notifier_mtu_info { + const struct dsa_port *dp; + int mtu; +}; + +/* DSA_NOTIFIER_TAG_PROTO_* */ +struct dsa_notifier_tag_proto_info { + const struct dsa_device_ops *tag_ops; +}; + +/* DSA_NOTIFIER_TAG_8021Q_VLAN_* */ +struct dsa_notifier_tag_8021q_vlan_info { + const struct dsa_port *dp; + u16 vid; +}; + +/* DSA_NOTIFIER_MASTER_STATE_CHANGE */ +struct dsa_notifier_master_state_info { + const struct net_device *master; + bool operational; +}; int dsa_tree_notify(struct dsa_switch_tree *dst, unsigned long e, void *v); int dsa_broadcast(unsigned long e, void *v); -- cgit v1.2.3-70-g09d2 From 165c2fb93bed2e73c63d064b315a9da15a3e4694 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Nov 2022 15:55:51 +0200 Subject: net: dsa: merge dsa.c into dsa2.c There is no longer a meaningful distinction between what goes into dsa2.c and what goes into dsa.c. Merge the 2 into a single file. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- net/dsa/Makefile | 1 - net/dsa/dsa.c | 234 ----------------------------------------------------- net/dsa/dsa2.c | 222 +++++++++++++++++++++++++++++++++++++++++++++++++- net/dsa/dsa_priv.h | 2 - 4 files changed, 221 insertions(+), 238 deletions(-) delete mode 100644 net/dsa/dsa.c (limited to 'net') diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 93f5d5f1e495..f38d0f4bf76c 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -3,7 +3,6 @@ obj-$(CONFIG_NET_DSA) += dsa_core.o dsa_core-y += \ devlink.o \ - dsa.o \ dsa2.o \ master.o \ netlink.o \ diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c deleted file mode 100644 index 6f87dd1ee6bf..000000000000 --- a/net/dsa/dsa.c +++ /dev/null @@ -1,234 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dsa/dsa.c - Hardware switch handling - * Copyright (c) 2008-2009 Marvell Semiconductor - * Copyright (c) 2013 Florian Fainelli - */ - -#include -#include -#include -#include -#include - -#include "dsa_priv.h" -#include "slave.h" -#include "tag.h" - -static int dev_is_class(struct device *dev, void *class) -{ - if (dev->class != NULL && !strcmp(dev->class->name, class)) - return 1; - - return 0; -} - -static struct device *dev_find_class(struct device *parent, char *class) -{ - if (dev_is_class(parent, class)) { - get_device(parent); - return parent; - } - - return device_find_child(parent, class, dev_is_class); -} - -struct net_device *dsa_dev_to_net_device(struct device *dev) -{ - struct device *d; - - d = dev_find_class(dev, "net"); - if (d != NULL) { - struct net_device *nd; - - nd = to_net_dev(d); - dev_hold(nd); - put_device(d); - - return nd; - } - - return NULL; -} - -#ifdef CONFIG_PM_SLEEP -static bool dsa_port_is_initialized(const struct dsa_port *dp) -{ - return dp->type == DSA_PORT_TYPE_USER && dp->slave; -} - -int dsa_switch_suspend(struct dsa_switch *ds) -{ - struct dsa_port *dp; - int ret = 0; - - /* Suspend slave network devices */ - dsa_switch_for_each_port(dp, ds) { - if (!dsa_port_is_initialized(dp)) - continue; - - ret = dsa_slave_suspend(dp->slave); - if (ret) - return ret; - } - - if (ds->ops->suspend) - ret = ds->ops->suspend(ds); - - return ret; -} -EXPORT_SYMBOL_GPL(dsa_switch_suspend); - -int dsa_switch_resume(struct dsa_switch *ds) -{ - struct dsa_port *dp; - int ret = 0; - - if (ds->ops->resume) - ret = ds->ops->resume(ds); - - if (ret) - return ret; - - /* Resume slave network devices */ - dsa_switch_for_each_port(dp, ds) { - if (!dsa_port_is_initialized(dp)) - continue; - - ret = dsa_slave_resume(dp->slave); - if (ret) - return ret; - } - - return 0; -} -EXPORT_SYMBOL_GPL(dsa_switch_resume); -#endif - -static struct workqueue_struct *dsa_owq; - -bool dsa_schedule_work(struct work_struct *work) -{ - return queue_work(dsa_owq, work); -} - -void dsa_flush_workqueue(void) -{ - flush_workqueue(dsa_owq); -} -EXPORT_SYMBOL_GPL(dsa_flush_workqueue); - -struct dsa_port *dsa_port_from_netdev(struct net_device *netdev) -{ - if (!netdev || !dsa_slave_dev_check(netdev)) - return ERR_PTR(-ENODEV); - - return dsa_slave_to_port(netdev); -} -EXPORT_SYMBOL_GPL(dsa_port_from_netdev); - -bool dsa_db_equal(const struct dsa_db *a, const struct dsa_db *b) -{ - if (a->type != b->type) - return false; - - switch (a->type) { - case DSA_DB_PORT: - return a->dp == b->dp; - case DSA_DB_LAG: - return a->lag.dev == b->lag.dev; - case DSA_DB_BRIDGE: - return a->bridge.num == b->bridge.num; - default: - WARN_ON(1); - return false; - } -} - -bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid, - struct dsa_db db) -{ - struct dsa_port *dp = dsa_to_port(ds, port); - struct dsa_mac_addr *a; - - lockdep_assert_held(&dp->addr_lists_lock); - - list_for_each_entry(a, &dp->fdbs, list) { - if (!ether_addr_equal(a->addr, addr) || a->vid != vid) - continue; - - if (a->db.type == db.type && !dsa_db_equal(&a->db, &db)) - return true; - } - - return false; -} -EXPORT_SYMBOL_GPL(dsa_fdb_present_in_other_db); - -bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb, - struct dsa_db db) -{ - struct dsa_port *dp = dsa_to_port(ds, port); - struct dsa_mac_addr *a; - - lockdep_assert_held(&dp->addr_lists_lock); - - list_for_each_entry(a, &dp->mdbs, list) { - if (!ether_addr_equal(a->addr, mdb->addr) || a->vid != mdb->vid) - continue; - - if (a->db.type == db.type && !dsa_db_equal(&a->db, &db)) - return true; - } - - return false; -} -EXPORT_SYMBOL_GPL(dsa_mdb_present_in_other_db); - -static int __init dsa_init_module(void) -{ - int rc; - - dsa_owq = alloc_ordered_workqueue("dsa_ordered", - WQ_MEM_RECLAIM); - if (!dsa_owq) - return -ENOMEM; - - rc = dsa_slave_register_notifier(); - if (rc) - goto register_notifier_fail; - - dev_add_pack(&dsa_pack_type); - - rc = rtnl_link_register(&dsa_link_ops); - if (rc) - goto netlink_register_fail; - - return 0; - -netlink_register_fail: - dsa_slave_unregister_notifier(); - dev_remove_pack(&dsa_pack_type); -register_notifier_fail: - destroy_workqueue(dsa_owq); - - return rc; -} -module_init(dsa_init_module); - -static void __exit dsa_cleanup_module(void) -{ - rtnl_link_unregister(&dsa_link_ops); - - dsa_slave_unregister_notifier(); - dev_remove_pack(&dsa_pack_type); - destroy_workqueue(dsa_owq); -} -module_exit(dsa_cleanup_module); - -MODULE_AUTHOR("Lennert Buytenhek "); -MODULE_DESCRIPTION("Driver for Distributed Switch Architecture switch chips"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform:dsa"); diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 7a314c8b3aaa..7a75b0767dd1 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * net/dsa/dsa2.c - Hardware switch handling, binding version 2 + * DSA topology and switch handling + * * Copyright (c) 2008-2009 Marvell Semiconductor * Copyright (c) 2013 Florian Fainelli * Copyright (c) 2016 Andrew Lunn @@ -9,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -28,9 +30,22 @@ static DEFINE_MUTEX(dsa2_mutex); LIST_HEAD(dsa_tree_list); +static struct workqueue_struct *dsa_owq; + /* Track the bridges with forwarding offload enabled */ static unsigned long dsa_fwd_offloading_bridges; +bool dsa_schedule_work(struct work_struct *work) +{ + return queue_work(dsa_owq, work); +} + +void dsa_flush_workqueue(void) +{ + flush_workqueue(dsa_owq); +} +EXPORT_SYMBOL_GPL(dsa_flush_workqueue); + /** * dsa_lag_map() - Map LAG structure to a linear LAG array * @dst: Tree in which to record the mapping. @@ -1331,6 +1346,42 @@ static int dsa_switch_parse_of(struct dsa_switch *ds, struct device_node *dn) return dsa_switch_parse_ports_of(ds, dn); } +static int dev_is_class(struct device *dev, void *class) +{ + if (dev->class != NULL && !strcmp(dev->class->name, class)) + return 1; + + return 0; +} + +static struct device *dev_find_class(struct device *parent, char *class) +{ + if (dev_is_class(parent, class)) { + get_device(parent); + return parent; + } + + return device_find_child(parent, class, dev_is_class); +} + +static struct net_device *dsa_dev_to_net_device(struct device *dev) +{ + struct device *d; + + d = dev_find_class(dev, "net"); + if (d != NULL) { + struct net_device *nd; + + nd = to_net_dev(d); + dev_hold(nd); + put_device(d); + + return nd; + } + + return NULL; +} + static int dsa_port_parse(struct dsa_port *dp, const char *name, struct device *dev) { @@ -1524,3 +1575,172 @@ out: mutex_unlock(&dsa2_mutex); } EXPORT_SYMBOL_GPL(dsa_switch_shutdown); + +#ifdef CONFIG_PM_SLEEP +static bool dsa_port_is_initialized(const struct dsa_port *dp) +{ + return dp->type == DSA_PORT_TYPE_USER && dp->slave; +} + +int dsa_switch_suspend(struct dsa_switch *ds) +{ + struct dsa_port *dp; + int ret = 0; + + /* Suspend slave network devices */ + dsa_switch_for_each_port(dp, ds) { + if (!dsa_port_is_initialized(dp)) + continue; + + ret = dsa_slave_suspend(dp->slave); + if (ret) + return ret; + } + + if (ds->ops->suspend) + ret = ds->ops->suspend(ds); + + return ret; +} +EXPORT_SYMBOL_GPL(dsa_switch_suspend); + +int dsa_switch_resume(struct dsa_switch *ds) +{ + struct dsa_port *dp; + int ret = 0; + + if (ds->ops->resume) + ret = ds->ops->resume(ds); + + if (ret) + return ret; + + /* Resume slave network devices */ + dsa_switch_for_each_port(dp, ds) { + if (!dsa_port_is_initialized(dp)) + continue; + + ret = dsa_slave_resume(dp->slave); + if (ret) + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(dsa_switch_resume); +#endif + +struct dsa_port *dsa_port_from_netdev(struct net_device *netdev) +{ + if (!netdev || !dsa_slave_dev_check(netdev)) + return ERR_PTR(-ENODEV); + + return dsa_slave_to_port(netdev); +} +EXPORT_SYMBOL_GPL(dsa_port_from_netdev); + +bool dsa_db_equal(const struct dsa_db *a, const struct dsa_db *b) +{ + if (a->type != b->type) + return false; + + switch (a->type) { + case DSA_DB_PORT: + return a->dp == b->dp; + case DSA_DB_LAG: + return a->lag.dev == b->lag.dev; + case DSA_DB_BRIDGE: + return a->bridge.num == b->bridge.num; + default: + WARN_ON(1); + return false; + } +} + +bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid, + struct dsa_db db) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_mac_addr *a; + + lockdep_assert_held(&dp->addr_lists_lock); + + list_for_each_entry(a, &dp->fdbs, list) { + if (!ether_addr_equal(a->addr, addr) || a->vid != vid) + continue; + + if (a->db.type == db.type && !dsa_db_equal(&a->db, &db)) + return true; + } + + return false; +} +EXPORT_SYMBOL_GPL(dsa_fdb_present_in_other_db); + +bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb, + struct dsa_db db) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_mac_addr *a; + + lockdep_assert_held(&dp->addr_lists_lock); + + list_for_each_entry(a, &dp->mdbs, list) { + if (!ether_addr_equal(a->addr, mdb->addr) || a->vid != mdb->vid) + continue; + + if (a->db.type == db.type && !dsa_db_equal(&a->db, &db)) + return true; + } + + return false; +} +EXPORT_SYMBOL_GPL(dsa_mdb_present_in_other_db); + +static int __init dsa_init_module(void) +{ + int rc; + + dsa_owq = alloc_ordered_workqueue("dsa_ordered", + WQ_MEM_RECLAIM); + if (!dsa_owq) + return -ENOMEM; + + rc = dsa_slave_register_notifier(); + if (rc) + goto register_notifier_fail; + + dev_add_pack(&dsa_pack_type); + + rc = rtnl_link_register(&dsa_link_ops); + if (rc) + goto netlink_register_fail; + + return 0; + +netlink_register_fail: + dsa_slave_unregister_notifier(); + dev_remove_pack(&dsa_pack_type); +register_notifier_fail: + destroy_workqueue(dsa_owq); + + return rc; +} +module_init(dsa_init_module); + +static void __exit dsa_cleanup_module(void) +{ + rtnl_link_unregister(&dsa_link_ops); + + dsa_slave_unregister_notifier(); + dev_remove_pack(&dsa_pack_type); + destroy_workqueue(dsa_owq); +} +module_exit(dsa_cleanup_module); + +MODULE_AUTHOR("Lennert Buytenhek "); +MODULE_DESCRIPTION("Driver for Distributed Switch Architecture switch chips"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:dsa"); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index e5d421cdaa8f..3f6f84150592 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -44,8 +44,6 @@ struct dsa_standalone_event_work { }; /* dsa.c */ -struct net_device *dsa_dev_to_net_device(struct device *dev); - bool dsa_db_equal(const struct dsa_db *a, const struct dsa_db *b); bool dsa_schedule_work(struct work_struct *work); -- cgit v1.2.3-70-g09d2 From 47d2ce03dcfb6b7f0373aac6c667715d94caba78 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Nov 2022 15:55:52 +0200 Subject: net: dsa: rename dsa2.c back into dsa.c and create its header The previous change moved the code into the larger file (dsa2.c) to minimize the delta. Rename that now to dsa.c, and create dsa.h, where all related definitions from dsa_priv.h go. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- net/dsa/Makefile | 2 +- net/dsa/dsa.c | 1747 ++++++++++++++++++++++++++++++++++++++++++++++++++++ net/dsa/dsa.h | 40 ++ net/dsa/dsa2.c | 1746 --------------------------------------------------- net/dsa/dsa_priv.h | 28 - net/dsa/master.c | 2 +- net/dsa/port.c | 2 +- net/dsa/slave.c | 1 + net/dsa/switch.c | 1 + 9 files changed, 1792 insertions(+), 1777 deletions(-) create mode 100644 net/dsa/dsa.c create mode 100644 net/dsa/dsa.h delete mode 100644 net/dsa/dsa2.c (limited to 'net') diff --git a/net/dsa/Makefile b/net/dsa/Makefile index f38d0f4bf76c..cc7e93a562fe 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -3,7 +3,7 @@ obj-$(CONFIG_NET_DSA) += dsa_core.o dsa_core-y += \ devlink.o \ - dsa2.o \ + dsa.o \ master.o \ netlink.o \ port.o \ diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c new file mode 100644 index 000000000000..fee4d28b7304 --- /dev/null +++ b/net/dsa/dsa.c @@ -0,0 +1,1747 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * DSA topology and switch handling + * + * Copyright (c) 2008-2009 Marvell Semiconductor + * Copyright (c) 2013 Florian Fainelli + * Copyright (c) 2016 Andrew Lunn + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "devlink.h" +#include "dsa.h" +#include "dsa_priv.h" +#include "master.h" +#include "port.h" +#include "slave.h" +#include "switch.h" +#include "tag.h" + +static DEFINE_MUTEX(dsa2_mutex); +LIST_HEAD(dsa_tree_list); + +static struct workqueue_struct *dsa_owq; + +/* Track the bridges with forwarding offload enabled */ +static unsigned long dsa_fwd_offloading_bridges; + +bool dsa_schedule_work(struct work_struct *work) +{ + return queue_work(dsa_owq, work); +} + +void dsa_flush_workqueue(void) +{ + flush_workqueue(dsa_owq); +} +EXPORT_SYMBOL_GPL(dsa_flush_workqueue); + +/** + * dsa_lag_map() - Map LAG structure to a linear LAG array + * @dst: Tree in which to record the mapping. + * @lag: LAG structure that is to be mapped to the tree's array. + * + * dsa_lag_id/dsa_lag_by_id can then be used to translate between the + * two spaces. The size of the mapping space is determined by the + * driver by setting ds->num_lag_ids. It is perfectly legal to leave + * it unset if it is not needed, in which case these functions become + * no-ops. + */ +void dsa_lag_map(struct dsa_switch_tree *dst, struct dsa_lag *lag) +{ + unsigned int id; + + for (id = 1; id <= dst->lags_len; id++) { + if (!dsa_lag_by_id(dst, id)) { + dst->lags[id - 1] = lag; + lag->id = id; + return; + } + } + + /* No IDs left, which is OK. Some drivers do not need it. The + * ones that do, e.g. mv88e6xxx, will discover that dsa_lag_id + * returns an error for this device when joining the LAG. The + * driver can then return -EOPNOTSUPP back to DSA, which will + * fall back to a software LAG. + */ +} + +/** + * dsa_lag_unmap() - Remove a LAG ID mapping + * @dst: Tree in which the mapping is recorded. + * @lag: LAG structure that was mapped. + * + * As there may be multiple users of the mapping, it is only removed + * if there are no other references to it. + */ +void dsa_lag_unmap(struct dsa_switch_tree *dst, struct dsa_lag *lag) +{ + unsigned int id; + + dsa_lags_foreach_id(id, dst) { + if (dsa_lag_by_id(dst, id) == lag) { + dst->lags[id - 1] = NULL; + lag->id = 0; + break; + } + } +} + +struct dsa_lag *dsa_tree_lag_find(struct dsa_switch_tree *dst, + const struct net_device *lag_dev) +{ + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_lag_dev_get(dp) == lag_dev) + return dp->lag; + + return NULL; +} + +struct dsa_bridge *dsa_tree_bridge_find(struct dsa_switch_tree *dst, + const struct net_device *br) +{ + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_bridge_dev_get(dp) == br) + return dp->bridge; + + return NULL; +} + +static int dsa_bridge_num_find(const struct net_device *bridge_dev) +{ + struct dsa_switch_tree *dst; + + list_for_each_entry(dst, &dsa_tree_list, list) { + struct dsa_bridge *bridge; + + bridge = dsa_tree_bridge_find(dst, bridge_dev); + if (bridge) + return bridge->num; + } + + return 0; +} + +unsigned int dsa_bridge_num_get(const struct net_device *bridge_dev, int max) +{ + unsigned int bridge_num = dsa_bridge_num_find(bridge_dev); + + /* Switches without FDB isolation support don't get unique + * bridge numbering + */ + if (!max) + return 0; + + if (!bridge_num) { + /* First port that requests FDB isolation or TX forwarding + * offload for this bridge + */ + bridge_num = find_next_zero_bit(&dsa_fwd_offloading_bridges, + DSA_MAX_NUM_OFFLOADING_BRIDGES, + 1); + if (bridge_num >= max) + return 0; + + set_bit(bridge_num, &dsa_fwd_offloading_bridges); + } + + return bridge_num; +} + +void dsa_bridge_num_put(const struct net_device *bridge_dev, + unsigned int bridge_num) +{ + /* Since we refcount bridges, we know that when we call this function + * it is no longer in use, so we can just go ahead and remove it from + * the bit mask. + */ + clear_bit(bridge_num, &dsa_fwd_offloading_bridges); +} + +struct dsa_switch *dsa_switch_find(int tree_index, int sw_index) +{ + struct dsa_switch_tree *dst; + struct dsa_port *dp; + + list_for_each_entry(dst, &dsa_tree_list, list) { + if (dst->index != tree_index) + continue; + + list_for_each_entry(dp, &dst->ports, list) { + if (dp->ds->index != sw_index) + continue; + + return dp->ds; + } + } + + return NULL; +} +EXPORT_SYMBOL_GPL(dsa_switch_find); + +static struct dsa_switch_tree *dsa_tree_find(int index) +{ + struct dsa_switch_tree *dst; + + list_for_each_entry(dst, &dsa_tree_list, list) + if (dst->index == index) + return dst; + + return NULL; +} + +static struct dsa_switch_tree *dsa_tree_alloc(int index) +{ + struct dsa_switch_tree *dst; + + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (!dst) + return NULL; + + dst->index = index; + + INIT_LIST_HEAD(&dst->rtable); + + INIT_LIST_HEAD(&dst->ports); + + INIT_LIST_HEAD(&dst->list); + list_add_tail(&dst->list, &dsa_tree_list); + + kref_init(&dst->refcount); + + return dst; +} + +static void dsa_tree_free(struct dsa_switch_tree *dst) +{ + if (dst->tag_ops) + dsa_tag_driver_put(dst->tag_ops); + list_del(&dst->list); + kfree(dst); +} + +static struct dsa_switch_tree *dsa_tree_get(struct dsa_switch_tree *dst) +{ + if (dst) + kref_get(&dst->refcount); + + return dst; +} + +static struct dsa_switch_tree *dsa_tree_touch(int index) +{ + struct dsa_switch_tree *dst; + + dst = dsa_tree_find(index); + if (dst) + return dsa_tree_get(dst); + else + return dsa_tree_alloc(index); +} + +static void dsa_tree_release(struct kref *ref) +{ + struct dsa_switch_tree *dst; + + dst = container_of(ref, struct dsa_switch_tree, refcount); + + dsa_tree_free(dst); +} + +static void dsa_tree_put(struct dsa_switch_tree *dst) +{ + if (dst) + kref_put(&dst->refcount, dsa_tree_release); +} + +static struct dsa_port *dsa_tree_find_port_by_node(struct dsa_switch_tree *dst, + struct device_node *dn) +{ + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) + if (dp->dn == dn) + return dp; + + return NULL; +} + +static struct dsa_link *dsa_link_touch(struct dsa_port *dp, + struct dsa_port *link_dp) +{ + struct dsa_switch *ds = dp->ds; + struct dsa_switch_tree *dst; + struct dsa_link *dl; + + dst = ds->dst; + + list_for_each_entry(dl, &dst->rtable, list) + if (dl->dp == dp && dl->link_dp == link_dp) + return dl; + + dl = kzalloc(sizeof(*dl), GFP_KERNEL); + if (!dl) + return NULL; + + dl->dp = dp; + dl->link_dp = link_dp; + + INIT_LIST_HEAD(&dl->list); + list_add_tail(&dl->list, &dst->rtable); + + return dl; +} + +static bool dsa_port_setup_routing_table(struct dsa_port *dp) +{ + struct dsa_switch *ds = dp->ds; + struct dsa_switch_tree *dst = ds->dst; + struct device_node *dn = dp->dn; + struct of_phandle_iterator it; + struct dsa_port *link_dp; + struct dsa_link *dl; + int err; + + of_for_each_phandle(&it, err, dn, "link", NULL, 0) { + link_dp = dsa_tree_find_port_by_node(dst, it.node); + if (!link_dp) { + of_node_put(it.node); + return false; + } + + dl = dsa_link_touch(dp, link_dp); + if (!dl) { + of_node_put(it.node); + return false; + } + } + + return true; +} + +static bool dsa_tree_setup_routing_table(struct dsa_switch_tree *dst) +{ + bool complete = true; + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) { + if (dsa_port_is_dsa(dp)) { + complete = dsa_port_setup_routing_table(dp); + if (!complete) + break; + } + } + + return complete; +} + +static struct dsa_port *dsa_tree_find_first_cpu(struct dsa_switch_tree *dst) +{ + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_is_cpu(dp)) + return dp; + + return NULL; +} + +struct net_device *dsa_tree_find_first_master(struct dsa_switch_tree *dst) +{ + struct device_node *ethernet; + struct net_device *master; + struct dsa_port *cpu_dp; + + cpu_dp = dsa_tree_find_first_cpu(dst); + ethernet = of_parse_phandle(cpu_dp->dn, "ethernet", 0); + master = of_find_net_device_by_node(ethernet); + of_node_put(ethernet); + + return master; +} + +/* Assign the default CPU port (the first one in the tree) to all ports of the + * fabric which don't already have one as part of their own switch. + */ +static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst) +{ + struct dsa_port *cpu_dp, *dp; + + cpu_dp = dsa_tree_find_first_cpu(dst); + if (!cpu_dp) { + pr_err("DSA: tree %d has no CPU port\n", dst->index); + return -EINVAL; + } + + list_for_each_entry(dp, &dst->ports, list) { + if (dp->cpu_dp) + continue; + + if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp)) + dp->cpu_dp = cpu_dp; + } + + return 0; +} + +/* Perform initial assignment of CPU ports to user ports and DSA links in the + * fabric, giving preference to CPU ports local to each switch. Default to + * using the first CPU port in the switch tree if the port does not have a CPU + * port local to this switch. + */ +static int dsa_tree_setup_cpu_ports(struct dsa_switch_tree *dst) +{ + struct dsa_port *cpu_dp, *dp; + + list_for_each_entry(cpu_dp, &dst->ports, list) { + if (!dsa_port_is_cpu(cpu_dp)) + continue; + + /* Prefer a local CPU port */ + dsa_switch_for_each_port(dp, cpu_dp->ds) { + /* Prefer the first local CPU port found */ + if (dp->cpu_dp) + continue; + + if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp)) + dp->cpu_dp = cpu_dp; + } + } + + return dsa_tree_setup_default_cpu(dst); +} + +static void dsa_tree_teardown_cpu_ports(struct dsa_switch_tree *dst) +{ + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp)) + dp->cpu_dp = NULL; +} + +static int dsa_port_setup(struct dsa_port *dp) +{ + bool dsa_port_link_registered = false; + struct dsa_switch *ds = dp->ds; + bool dsa_port_enabled = false; + int err = 0; + + if (dp->setup) + return 0; + + err = dsa_port_devlink_setup(dp); + if (err) + return err; + + switch (dp->type) { + case DSA_PORT_TYPE_UNUSED: + dsa_port_disable(dp); + break; + case DSA_PORT_TYPE_CPU: + if (dp->dn) { + err = dsa_shared_port_link_register_of(dp); + if (err) + break; + dsa_port_link_registered = true; + } else { + dev_warn(ds->dev, + "skipping link registration for CPU port %d\n", + dp->index); + } + + err = dsa_port_enable(dp, NULL); + if (err) + break; + dsa_port_enabled = true; + + break; + case DSA_PORT_TYPE_DSA: + if (dp->dn) { + err = dsa_shared_port_link_register_of(dp); + if (err) + break; + dsa_port_link_registered = true; + } else { + dev_warn(ds->dev, + "skipping link registration for DSA port %d\n", + dp->index); + } + + err = dsa_port_enable(dp, NULL); + if (err) + break; + dsa_port_enabled = true; + + break; + case DSA_PORT_TYPE_USER: + of_get_mac_address(dp->dn, dp->mac); + err = dsa_slave_create(dp); + break; + } + + if (err && dsa_port_enabled) + dsa_port_disable(dp); + if (err && dsa_port_link_registered) + dsa_shared_port_link_unregister_of(dp); + if (err) { + dsa_port_devlink_teardown(dp); + return err; + } + + dp->setup = true; + + return 0; +} + +static void dsa_port_teardown(struct dsa_port *dp) +{ + if (!dp->setup) + return; + + switch (dp->type) { + case DSA_PORT_TYPE_UNUSED: + break; + case DSA_PORT_TYPE_CPU: + dsa_port_disable(dp); + if (dp->dn) + dsa_shared_port_link_unregister_of(dp); + break; + case DSA_PORT_TYPE_DSA: + dsa_port_disable(dp); + if (dp->dn) + dsa_shared_port_link_unregister_of(dp); + break; + case DSA_PORT_TYPE_USER: + if (dp->slave) { + dsa_slave_destroy(dp->slave); + dp->slave = NULL; + } + break; + } + + dsa_port_devlink_teardown(dp); + + dp->setup = false; +} + +static int dsa_port_setup_as_unused(struct dsa_port *dp) +{ + dp->type = DSA_PORT_TYPE_UNUSED; + return dsa_port_setup(dp); +} + +static int dsa_switch_setup_tag_protocol(struct dsa_switch *ds) +{ + const struct dsa_device_ops *tag_ops = ds->dst->tag_ops; + struct dsa_switch_tree *dst = ds->dst; + int err; + + if (tag_ops->proto == dst->default_proto) + goto connect; + + rtnl_lock(); + err = ds->ops->change_tag_protocol(ds, tag_ops->proto); + rtnl_unlock(); + if (err) { + dev_err(ds->dev, "Unable to use tag protocol \"%s\": %pe\n", + tag_ops->name, ERR_PTR(err)); + return err; + } + +connect: + if (tag_ops->connect) { + err = tag_ops->connect(ds); + if (err) + return err; + } + + if (ds->ops->connect_tag_protocol) { + err = ds->ops->connect_tag_protocol(ds, tag_ops->proto); + if (err) { + dev_err(ds->dev, + "Unable to connect to tag protocol \"%s\": %pe\n", + tag_ops->name, ERR_PTR(err)); + goto disconnect; + } + } + + return 0; + +disconnect: + if (tag_ops->disconnect) + tag_ops->disconnect(ds); + + return err; +} + +static void dsa_switch_teardown_tag_protocol(struct dsa_switch *ds) +{ + const struct dsa_device_ops *tag_ops = ds->dst->tag_ops; + + if (tag_ops->disconnect) + tag_ops->disconnect(ds); +} + +static int dsa_switch_setup(struct dsa_switch *ds) +{ + struct device_node *dn; + int err; + + if (ds->setup) + return 0; + + /* Initialize ds->phys_mii_mask before registering the slave MDIO bus + * driver and before ops->setup() has run, since the switch drivers and + * the slave MDIO bus driver rely on these values for probing PHY + * devices or not + */ + ds->phys_mii_mask |= dsa_user_ports(ds); + + err = dsa_switch_devlink_alloc(ds); + if (err) + return err; + + err = dsa_switch_register_notifier(ds); + if (err) + goto devlink_free; + + ds->configure_vlan_while_not_filtering = true; + + err = ds->ops->setup(ds); + if (err < 0) + goto unregister_notifier; + + err = dsa_switch_setup_tag_protocol(ds); + if (err) + goto teardown; + + if (!ds->slave_mii_bus && ds->ops->phy_read) { + ds->slave_mii_bus = mdiobus_alloc(); + if (!ds->slave_mii_bus) { + err = -ENOMEM; + goto teardown; + } + + dsa_slave_mii_bus_init(ds); + + dn = of_get_child_by_name(ds->dev->of_node, "mdio"); + + err = of_mdiobus_register(ds->slave_mii_bus, dn); + of_node_put(dn); + if (err < 0) + goto free_slave_mii_bus; + } + + dsa_switch_devlink_register(ds); + + ds->setup = true; + return 0; + +free_slave_mii_bus: + if (ds->slave_mii_bus && ds->ops->phy_read) + mdiobus_free(ds->slave_mii_bus); +teardown: + if (ds->ops->teardown) + ds->ops->teardown(ds); +unregister_notifier: + dsa_switch_unregister_notifier(ds); +devlink_free: + dsa_switch_devlink_free(ds); + return err; +} + +static void dsa_switch_teardown(struct dsa_switch *ds) +{ + if (!ds->setup) + return; + + dsa_switch_devlink_unregister(ds); + + if (ds->slave_mii_bus && ds->ops->phy_read) { + mdiobus_unregister(ds->slave_mii_bus); + mdiobus_free(ds->slave_mii_bus); + ds->slave_mii_bus = NULL; + } + + dsa_switch_teardown_tag_protocol(ds); + + if (ds->ops->teardown) + ds->ops->teardown(ds); + + dsa_switch_unregister_notifier(ds); + + dsa_switch_devlink_free(ds); + + ds->setup = false; +} + +/* First tear down the non-shared, then the shared ports. This ensures that + * all work items scheduled by our switchdev handlers for user ports have + * completed before we destroy the refcounting kept on the shared ports. + */ +static void dsa_tree_teardown_ports(struct dsa_switch_tree *dst) +{ + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_is_user(dp) || dsa_port_is_unused(dp)) + dsa_port_teardown(dp); + + dsa_flush_workqueue(); + + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_is_dsa(dp) || dsa_port_is_cpu(dp)) + dsa_port_teardown(dp); +} + +static void dsa_tree_teardown_switches(struct dsa_switch_tree *dst) +{ + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) + dsa_switch_teardown(dp->ds); +} + +/* Bring shared ports up first, then non-shared ports */ +static int dsa_tree_setup_ports(struct dsa_switch_tree *dst) +{ + struct dsa_port *dp; + int err = 0; + + list_for_each_entry(dp, &dst->ports, list) { + if (dsa_port_is_dsa(dp) || dsa_port_is_cpu(dp)) { + err = dsa_port_setup(dp); + if (err) + goto teardown; + } + } + + list_for_each_entry(dp, &dst->ports, list) { + if (dsa_port_is_user(dp) || dsa_port_is_unused(dp)) { + err = dsa_port_setup(dp); + if (err) { + err = dsa_port_setup_as_unused(dp); + if (err) + goto teardown; + } + } + } + + return 0; + +teardown: + dsa_tree_teardown_ports(dst); + + return err; +} + +static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) +{ + struct dsa_port *dp; + int err = 0; + + list_for_each_entry(dp, &dst->ports, list) { + err = dsa_switch_setup(dp->ds); + if (err) { + dsa_tree_teardown_switches(dst); + break; + } + } + + return err; +} + +static int dsa_tree_setup_master(struct dsa_switch_tree *dst) +{ + struct dsa_port *cpu_dp; + int err = 0; + + rtnl_lock(); + + dsa_tree_for_each_cpu_port(cpu_dp, dst) { + struct net_device *master = cpu_dp->master; + bool admin_up = (master->flags & IFF_UP) && + !qdisc_tx_is_noop(master); + + err = dsa_master_setup(master, cpu_dp); + if (err) + break; + + /* Replay master state event */ + dsa_tree_master_admin_state_change(dst, master, admin_up); + dsa_tree_master_oper_state_change(dst, master, + netif_oper_up(master)); + } + + rtnl_unlock(); + + return err; +} + +static void dsa_tree_teardown_master(struct dsa_switch_tree *dst) +{ + struct dsa_port *cpu_dp; + + rtnl_lock(); + + dsa_tree_for_each_cpu_port(cpu_dp, dst) { + struct net_device *master = cpu_dp->master; + + /* Synthesizing an "admin down" state is sufficient for + * the switches to get a notification if the master is + * currently up and running. + */ + dsa_tree_master_admin_state_change(dst, master, false); + + dsa_master_teardown(master); + } + + rtnl_unlock(); +} + +static int dsa_tree_setup_lags(struct dsa_switch_tree *dst) +{ + unsigned int len = 0; + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) { + if (dp->ds->num_lag_ids > len) + len = dp->ds->num_lag_ids; + } + + if (!len) + return 0; + + dst->lags = kcalloc(len, sizeof(*dst->lags), GFP_KERNEL); + if (!dst->lags) + return -ENOMEM; + + dst->lags_len = len; + return 0; +} + +static void dsa_tree_teardown_lags(struct dsa_switch_tree *dst) +{ + kfree(dst->lags); +} + +static int dsa_tree_setup(struct dsa_switch_tree *dst) +{ + bool complete; + int err; + + if (dst->setup) { + pr_err("DSA: tree %d already setup! Disjoint trees?\n", + dst->index); + return -EEXIST; + } + + complete = dsa_tree_setup_routing_table(dst); + if (!complete) + return 0; + + err = dsa_tree_setup_cpu_ports(dst); + if (err) + return err; + + err = dsa_tree_setup_switches(dst); + if (err) + goto teardown_cpu_ports; + + err = dsa_tree_setup_ports(dst); + if (err) + goto teardown_switches; + + err = dsa_tree_setup_master(dst); + if (err) + goto teardown_ports; + + err = dsa_tree_setup_lags(dst); + if (err) + goto teardown_master; + + dst->setup = true; + + pr_info("DSA: tree %d setup\n", dst->index); + + return 0; + +teardown_master: + dsa_tree_teardown_master(dst); +teardown_ports: + dsa_tree_teardown_ports(dst); +teardown_switches: + dsa_tree_teardown_switches(dst); +teardown_cpu_ports: + dsa_tree_teardown_cpu_ports(dst); + + return err; +} + +static void dsa_tree_teardown(struct dsa_switch_tree *dst) +{ + struct dsa_link *dl, *next; + + if (!dst->setup) + return; + + dsa_tree_teardown_lags(dst); + + dsa_tree_teardown_master(dst); + + dsa_tree_teardown_ports(dst); + + dsa_tree_teardown_switches(dst); + + dsa_tree_teardown_cpu_ports(dst); + + list_for_each_entry_safe(dl, next, &dst->rtable, list) { + list_del(&dl->list); + kfree(dl); + } + + pr_info("DSA: tree %d torn down\n", dst->index); + + dst->setup = false; +} + +static int dsa_tree_bind_tag_proto(struct dsa_switch_tree *dst, + const struct dsa_device_ops *tag_ops) +{ + const struct dsa_device_ops *old_tag_ops = dst->tag_ops; + struct dsa_notifier_tag_proto_info info; + int err; + + dst->tag_ops = tag_ops; + + /* Notify the switches from this tree about the connection + * to the new tagger + */ + info.tag_ops = tag_ops; + err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_CONNECT, &info); + if (err && err != -EOPNOTSUPP) + goto out_disconnect; + + /* Notify the old tagger about the disconnection from this tree */ + info.tag_ops = old_tag_ops; + dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_DISCONNECT, &info); + + return 0; + +out_disconnect: + info.tag_ops = tag_ops; + dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_DISCONNECT, &info); + dst->tag_ops = old_tag_ops; + + return err; +} + +/* Since the dsa/tagging sysfs device attribute is per master, the assumption + * is that all DSA switches within a tree share the same tagger, otherwise + * they would have formed disjoint trees (different "dsa,member" values). + */ +int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, + const struct dsa_device_ops *tag_ops, + const struct dsa_device_ops *old_tag_ops) +{ + struct dsa_notifier_tag_proto_info info; + struct dsa_port *dp; + int err = -EBUSY; + + if (!rtnl_trylock()) + return restart_syscall(); + + /* At the moment we don't allow changing the tag protocol under + * traffic. The rtnl_mutex also happens to serialize concurrent + * attempts to change the tagging protocol. If we ever lift the IFF_UP + * restriction, there needs to be another mutex which serializes this. + */ + dsa_tree_for_each_user_port(dp, dst) { + if (dsa_port_to_master(dp)->flags & IFF_UP) + goto out_unlock; + + if (dp->slave->flags & IFF_UP) + goto out_unlock; + } + + /* Notify the tag protocol change */ + info.tag_ops = tag_ops; + err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info); + if (err) + goto out_unwind_tagger; + + err = dsa_tree_bind_tag_proto(dst, tag_ops); + if (err) + goto out_unwind_tagger; + + rtnl_unlock(); + + return 0; + +out_unwind_tagger: + info.tag_ops = old_tag_ops; + dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info); +out_unlock: + rtnl_unlock(); + return err; +} + +static void dsa_tree_master_state_change(struct dsa_switch_tree *dst, + struct net_device *master) +{ + struct dsa_notifier_master_state_info info; + struct dsa_port *cpu_dp = master->dsa_ptr; + + info.master = master; + info.operational = dsa_port_master_is_operational(cpu_dp); + + dsa_tree_notify(dst, DSA_NOTIFIER_MASTER_STATE_CHANGE, &info); +} + +void dsa_tree_master_admin_state_change(struct dsa_switch_tree *dst, + struct net_device *master, + bool up) +{ + struct dsa_port *cpu_dp = master->dsa_ptr; + bool notify = false; + + /* Don't keep track of admin state on LAG DSA masters, + * but rather just of physical DSA masters + */ + if (netif_is_lag_master(master)) + return; + + if ((dsa_port_master_is_operational(cpu_dp)) != + (up && cpu_dp->master_oper_up)) + notify = true; + + cpu_dp->master_admin_up = up; + + if (notify) + dsa_tree_master_state_change(dst, master); +} + +void dsa_tree_master_oper_state_change(struct dsa_switch_tree *dst, + struct net_device *master, + bool up) +{ + struct dsa_port *cpu_dp = master->dsa_ptr; + bool notify = false; + + /* Don't keep track of oper state on LAG DSA masters, + * but rather just of physical DSA masters + */ + if (netif_is_lag_master(master)) + return; + + if ((dsa_port_master_is_operational(cpu_dp)) != + (cpu_dp->master_admin_up && up)) + notify = true; + + cpu_dp->master_oper_up = up; + + if (notify) + dsa_tree_master_state_change(dst, master); +} + +static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index) +{ + struct dsa_switch_tree *dst = ds->dst; + struct dsa_port *dp; + + dsa_switch_for_each_port(dp, ds) + if (dp->index == index) + return dp; + + dp = kzalloc(sizeof(*dp), GFP_KERNEL); + if (!dp) + return NULL; + + dp->ds = ds; + dp->index = index; + + mutex_init(&dp->addr_lists_lock); + mutex_init(&dp->vlans_lock); + INIT_LIST_HEAD(&dp->fdbs); + INIT_LIST_HEAD(&dp->mdbs); + INIT_LIST_HEAD(&dp->vlans); + INIT_LIST_HEAD(&dp->list); + list_add_tail(&dp->list, &dst->ports); + + return dp; +} + +static int dsa_port_parse_user(struct dsa_port *dp, const char *name) +{ + dp->type = DSA_PORT_TYPE_USER; + dp->name = name; + + return 0; +} + +static int dsa_port_parse_dsa(struct dsa_port *dp) +{ + dp->type = DSA_PORT_TYPE_DSA; + + return 0; +} + +static enum dsa_tag_protocol dsa_get_tag_protocol(struct dsa_port *dp, + struct net_device *master) +{ + enum dsa_tag_protocol tag_protocol = DSA_TAG_PROTO_NONE; + struct dsa_switch *mds, *ds = dp->ds; + unsigned int mdp_upstream; + struct dsa_port *mdp; + + /* It is possible to stack DSA switches onto one another when that + * happens the switch driver may want to know if its tagging protocol + * is going to work in such a configuration. + */ + if (dsa_slave_dev_check(master)) { + mdp = dsa_slave_to_port(master); + mds = mdp->ds; + mdp_upstream = dsa_upstream_port(mds, mdp->index); + tag_protocol = mds->ops->get_tag_protocol(mds, mdp_upstream, + DSA_TAG_PROTO_NONE); + } + + /* If the master device is not itself a DSA slave in a disjoint DSA + * tree, then return immediately. + */ + return ds->ops->get_tag_protocol(ds, dp->index, tag_protocol); +} + +static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master, + const char *user_protocol) +{ + const struct dsa_device_ops *tag_ops = NULL; + struct dsa_switch *ds = dp->ds; + struct dsa_switch_tree *dst = ds->dst; + enum dsa_tag_protocol default_proto; + + /* Find out which protocol the switch would prefer. */ + default_proto = dsa_get_tag_protocol(dp, master); + if (dst->default_proto) { + if (dst->default_proto != default_proto) { + dev_err(ds->dev, + "A DSA switch tree can have only one tagging protocol\n"); + return -EINVAL; + } + } else { + dst->default_proto = default_proto; + } + + /* See if the user wants to override that preference. */ + if (user_protocol) { + if (!ds->ops->change_tag_protocol) { + dev_err(ds->dev, "Tag protocol cannot be modified\n"); + return -EINVAL; + } + + tag_ops = dsa_tag_driver_get_by_name(user_protocol); + if (IS_ERR(tag_ops)) { + dev_warn(ds->dev, + "Failed to find a tagging driver for protocol %s, using default\n", + user_protocol); + tag_ops = NULL; + } + } + + if (!tag_ops) + tag_ops = dsa_tag_driver_get_by_id(default_proto); + + if (IS_ERR(tag_ops)) { + if (PTR_ERR(tag_ops) == -ENOPROTOOPT) + return -EPROBE_DEFER; + + dev_warn(ds->dev, "No tagger for this switch\n"); + return PTR_ERR(tag_ops); + } + + if (dst->tag_ops) { + if (dst->tag_ops != tag_ops) { + dev_err(ds->dev, + "A DSA switch tree can have only one tagging protocol\n"); + + dsa_tag_driver_put(tag_ops); + return -EINVAL; + } + + /* In the case of multiple CPU ports per switch, the tagging + * protocol is still reference-counted only per switch tree. + */ + dsa_tag_driver_put(tag_ops); + } else { + dst->tag_ops = tag_ops; + } + + dp->master = master; + dp->type = DSA_PORT_TYPE_CPU; + dsa_port_set_tag_protocol(dp, dst->tag_ops); + dp->dst = dst; + + /* At this point, the tree may be configured to use a different + * tagger than the one chosen by the switch driver during + * .setup, in the case when a user selects a custom protocol + * through the DT. + * + * This is resolved by syncing the driver with the tree in + * dsa_switch_setup_tag_protocol once .setup has run and the + * driver is ready to accept calls to .change_tag_protocol. If + * the driver does not support the custom protocol at that + * point, the tree is wholly rejected, thereby ensuring that the + * tree and driver are always in agreement on the protocol to + * use. + */ + return 0; +} + +static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn) +{ + struct device_node *ethernet = of_parse_phandle(dn, "ethernet", 0); + const char *name = of_get_property(dn, "label", NULL); + bool link = of_property_read_bool(dn, "link"); + + dp->dn = dn; + + if (ethernet) { + struct net_device *master; + const char *user_protocol; + + master = of_find_net_device_by_node(ethernet); + of_node_put(ethernet); + if (!master) + return -EPROBE_DEFER; + + user_protocol = of_get_property(dn, "dsa-tag-protocol", NULL); + return dsa_port_parse_cpu(dp, master, user_protocol); + } + + if (link) + return dsa_port_parse_dsa(dp); + + return dsa_port_parse_user(dp, name); +} + +static int dsa_switch_parse_ports_of(struct dsa_switch *ds, + struct device_node *dn) +{ + struct device_node *ports, *port; + struct dsa_port *dp; + int err = 0; + u32 reg; + + ports = of_get_child_by_name(dn, "ports"); + if (!ports) { + /* The second possibility is "ethernet-ports" */ + ports = of_get_child_by_name(dn, "ethernet-ports"); + if (!ports) { + dev_err(ds->dev, "no ports child node found\n"); + return -EINVAL; + } + } + + for_each_available_child_of_node(ports, port) { + err = of_property_read_u32(port, "reg", ®); + if (err) { + of_node_put(port); + goto out_put_node; + } + + if (reg >= ds->num_ports) { + dev_err(ds->dev, "port %pOF index %u exceeds num_ports (%u)\n", + port, reg, ds->num_ports); + of_node_put(port); + err = -EINVAL; + goto out_put_node; + } + + dp = dsa_to_port(ds, reg); + + err = dsa_port_parse_of(dp, port); + if (err) { + of_node_put(port); + goto out_put_node; + } + } + +out_put_node: + of_node_put(ports); + return err; +} + +static int dsa_switch_parse_member_of(struct dsa_switch *ds, + struct device_node *dn) +{ + u32 m[2] = { 0, 0 }; + int sz; + + /* Don't error out if this optional property isn't found */ + sz = of_property_read_variable_u32_array(dn, "dsa,member", m, 2, 2); + if (sz < 0 && sz != -EINVAL) + return sz; + + ds->index = m[1]; + + ds->dst = dsa_tree_touch(m[0]); + if (!ds->dst) + return -ENOMEM; + + if (dsa_switch_find(ds->dst->index, ds->index)) { + dev_err(ds->dev, + "A DSA switch with index %d already exists in tree %d\n", + ds->index, ds->dst->index); + return -EEXIST; + } + + if (ds->dst->last_switch < ds->index) + ds->dst->last_switch = ds->index; + + return 0; +} + +static int dsa_switch_touch_ports(struct dsa_switch *ds) +{ + struct dsa_port *dp; + int port; + + for (port = 0; port < ds->num_ports; port++) { + dp = dsa_port_touch(ds, port); + if (!dp) + return -ENOMEM; + } + + return 0; +} + +static int dsa_switch_parse_of(struct dsa_switch *ds, struct device_node *dn) +{ + int err; + + err = dsa_switch_parse_member_of(ds, dn); + if (err) + return err; + + err = dsa_switch_touch_ports(ds); + if (err) + return err; + + return dsa_switch_parse_ports_of(ds, dn); +} + +static int dev_is_class(struct device *dev, void *class) +{ + if (dev->class != NULL && !strcmp(dev->class->name, class)) + return 1; + + return 0; +} + +static struct device *dev_find_class(struct device *parent, char *class) +{ + if (dev_is_class(parent, class)) { + get_device(parent); + return parent; + } + + return device_find_child(parent, class, dev_is_class); +} + +static struct net_device *dsa_dev_to_net_device(struct device *dev) +{ + struct device *d; + + d = dev_find_class(dev, "net"); + if (d != NULL) { + struct net_device *nd; + + nd = to_net_dev(d); + dev_hold(nd); + put_device(d); + + return nd; + } + + return NULL; +} + +static int dsa_port_parse(struct dsa_port *dp, const char *name, + struct device *dev) +{ + if (!strcmp(name, "cpu")) { + struct net_device *master; + + master = dsa_dev_to_net_device(dev); + if (!master) + return -EPROBE_DEFER; + + dev_put(master); + + return dsa_port_parse_cpu(dp, master, NULL); + } + + if (!strcmp(name, "dsa")) + return dsa_port_parse_dsa(dp); + + return dsa_port_parse_user(dp, name); +} + +static int dsa_switch_parse_ports(struct dsa_switch *ds, + struct dsa_chip_data *cd) +{ + bool valid_name_found = false; + struct dsa_port *dp; + struct device *dev; + const char *name; + unsigned int i; + int err; + + for (i = 0; i < DSA_MAX_PORTS; i++) { + name = cd->port_names[i]; + dev = cd->netdev[i]; + dp = dsa_to_port(ds, i); + + if (!name) + continue; + + err = dsa_port_parse(dp, name, dev); + if (err) + return err; + + valid_name_found = true; + } + + if (!valid_name_found && i == DSA_MAX_PORTS) + return -EINVAL; + + return 0; +} + +static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd) +{ + int err; + + ds->cd = cd; + + /* We don't support interconnected switches nor multiple trees via + * platform data, so this is the unique switch of the tree. + */ + ds->index = 0; + ds->dst = dsa_tree_touch(0); + if (!ds->dst) + return -ENOMEM; + + err = dsa_switch_touch_ports(ds); + if (err) + return err; + + return dsa_switch_parse_ports(ds, cd); +} + +static void dsa_switch_release_ports(struct dsa_switch *ds) +{ + struct dsa_port *dp, *next; + + dsa_switch_for_each_port_safe(dp, next, ds) { + WARN_ON(!list_empty(&dp->fdbs)); + WARN_ON(!list_empty(&dp->mdbs)); + WARN_ON(!list_empty(&dp->vlans)); + list_del(&dp->list); + kfree(dp); + } +} + +static int dsa_switch_probe(struct dsa_switch *ds) +{ + struct dsa_switch_tree *dst; + struct dsa_chip_data *pdata; + struct device_node *np; + int err; + + if (!ds->dev) + return -ENODEV; + + pdata = ds->dev->platform_data; + np = ds->dev->of_node; + + if (!ds->num_ports) + return -EINVAL; + + if (np) { + err = dsa_switch_parse_of(ds, np); + if (err) + dsa_switch_release_ports(ds); + } else if (pdata) { + err = dsa_switch_parse(ds, pdata); + if (err) + dsa_switch_release_ports(ds); + } else { + err = -ENODEV; + } + + if (err) + return err; + + dst = ds->dst; + dsa_tree_get(dst); + err = dsa_tree_setup(dst); + if (err) { + dsa_switch_release_ports(ds); + dsa_tree_put(dst); + } + + return err; +} + +int dsa_register_switch(struct dsa_switch *ds) +{ + int err; + + mutex_lock(&dsa2_mutex); + err = dsa_switch_probe(ds); + dsa_tree_put(ds->dst); + mutex_unlock(&dsa2_mutex); + + return err; +} +EXPORT_SYMBOL_GPL(dsa_register_switch); + +static void dsa_switch_remove(struct dsa_switch *ds) +{ + struct dsa_switch_tree *dst = ds->dst; + + dsa_tree_teardown(dst); + dsa_switch_release_ports(ds); + dsa_tree_put(dst); +} + +void dsa_unregister_switch(struct dsa_switch *ds) +{ + mutex_lock(&dsa2_mutex); + dsa_switch_remove(ds); + mutex_unlock(&dsa2_mutex); +} +EXPORT_SYMBOL_GPL(dsa_unregister_switch); + +/* If the DSA master chooses to unregister its net_device on .shutdown, DSA is + * blocking that operation from completion, due to the dev_hold taken inside + * netdev_upper_dev_link. Unlink the DSA slave interfaces from being uppers of + * the DSA master, so that the system can reboot successfully. + */ +void dsa_switch_shutdown(struct dsa_switch *ds) +{ + struct net_device *master, *slave_dev; + struct dsa_port *dp; + + mutex_lock(&dsa2_mutex); + + if (!ds->setup) + goto out; + + rtnl_lock(); + + dsa_switch_for_each_user_port(dp, ds) { + master = dsa_port_to_master(dp); + slave_dev = dp->slave; + + netdev_upper_dev_unlink(master, slave_dev); + } + + /* Disconnect from further netdevice notifiers on the master, + * since netdev_uses_dsa() will now return false. + */ + dsa_switch_for_each_cpu_port(dp, ds) + dp->master->dsa_ptr = NULL; + + rtnl_unlock(); +out: + mutex_unlock(&dsa2_mutex); +} +EXPORT_SYMBOL_GPL(dsa_switch_shutdown); + +#ifdef CONFIG_PM_SLEEP +static bool dsa_port_is_initialized(const struct dsa_port *dp) +{ + return dp->type == DSA_PORT_TYPE_USER && dp->slave; +} + +int dsa_switch_suspend(struct dsa_switch *ds) +{ + struct dsa_port *dp; + int ret = 0; + + /* Suspend slave network devices */ + dsa_switch_for_each_port(dp, ds) { + if (!dsa_port_is_initialized(dp)) + continue; + + ret = dsa_slave_suspend(dp->slave); + if (ret) + return ret; + } + + if (ds->ops->suspend) + ret = ds->ops->suspend(ds); + + return ret; +} +EXPORT_SYMBOL_GPL(dsa_switch_suspend); + +int dsa_switch_resume(struct dsa_switch *ds) +{ + struct dsa_port *dp; + int ret = 0; + + if (ds->ops->resume) + ret = ds->ops->resume(ds); + + if (ret) + return ret; + + /* Resume slave network devices */ + dsa_switch_for_each_port(dp, ds) { + if (!dsa_port_is_initialized(dp)) + continue; + + ret = dsa_slave_resume(dp->slave); + if (ret) + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(dsa_switch_resume); +#endif + +struct dsa_port *dsa_port_from_netdev(struct net_device *netdev) +{ + if (!netdev || !dsa_slave_dev_check(netdev)) + return ERR_PTR(-ENODEV); + + return dsa_slave_to_port(netdev); +} +EXPORT_SYMBOL_GPL(dsa_port_from_netdev); + +bool dsa_db_equal(const struct dsa_db *a, const struct dsa_db *b) +{ + if (a->type != b->type) + return false; + + switch (a->type) { + case DSA_DB_PORT: + return a->dp == b->dp; + case DSA_DB_LAG: + return a->lag.dev == b->lag.dev; + case DSA_DB_BRIDGE: + return a->bridge.num == b->bridge.num; + default: + WARN_ON(1); + return false; + } +} + +bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid, + struct dsa_db db) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_mac_addr *a; + + lockdep_assert_held(&dp->addr_lists_lock); + + list_for_each_entry(a, &dp->fdbs, list) { + if (!ether_addr_equal(a->addr, addr) || a->vid != vid) + continue; + + if (a->db.type == db.type && !dsa_db_equal(&a->db, &db)) + return true; + } + + return false; +} +EXPORT_SYMBOL_GPL(dsa_fdb_present_in_other_db); + +bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port, + const struct switchdev_obj_port_mdb *mdb, + struct dsa_db db) +{ + struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_mac_addr *a; + + lockdep_assert_held(&dp->addr_lists_lock); + + list_for_each_entry(a, &dp->mdbs, list) { + if (!ether_addr_equal(a->addr, mdb->addr) || a->vid != mdb->vid) + continue; + + if (a->db.type == db.type && !dsa_db_equal(&a->db, &db)) + return true; + } + + return false; +} +EXPORT_SYMBOL_GPL(dsa_mdb_present_in_other_db); + +static int __init dsa_init_module(void) +{ + int rc; + + dsa_owq = alloc_ordered_workqueue("dsa_ordered", + WQ_MEM_RECLAIM); + if (!dsa_owq) + return -ENOMEM; + + rc = dsa_slave_register_notifier(); + if (rc) + goto register_notifier_fail; + + dev_add_pack(&dsa_pack_type); + + rc = rtnl_link_register(&dsa_link_ops); + if (rc) + goto netlink_register_fail; + + return 0; + +netlink_register_fail: + dsa_slave_unregister_notifier(); + dev_remove_pack(&dsa_pack_type); +register_notifier_fail: + destroy_workqueue(dsa_owq); + + return rc; +} +module_init(dsa_init_module); + +static void __exit dsa_cleanup_module(void) +{ + rtnl_link_unregister(&dsa_link_ops); + + dsa_slave_unregister_notifier(); + dev_remove_pack(&dsa_pack_type); + destroy_workqueue(dsa_owq); +} +module_exit(dsa_cleanup_module); + +MODULE_AUTHOR("Lennert Buytenhek "); +MODULE_DESCRIPTION("Driver for Distributed Switch Architecture switch chips"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:dsa"); diff --git a/net/dsa/dsa.h b/net/dsa/dsa.h new file mode 100644 index 000000000000..b7e17ae1094d --- /dev/null +++ b/net/dsa/dsa.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef __DSA_H +#define __DSA_H + +#include +#include + +struct dsa_db; +struct dsa_device_ops; +struct dsa_lag; +struct dsa_switch_tree; +struct net_device; +struct work_struct; + +extern struct list_head dsa_tree_list; + +bool dsa_db_equal(const struct dsa_db *a, const struct dsa_db *b); +bool dsa_schedule_work(struct work_struct *work); +void dsa_lag_map(struct dsa_switch_tree *dst, struct dsa_lag *lag); +void dsa_lag_unmap(struct dsa_switch_tree *dst, struct dsa_lag *lag); +struct dsa_lag *dsa_tree_lag_find(struct dsa_switch_tree *dst, + const struct net_device *lag_dev); +struct net_device *dsa_tree_find_first_master(struct dsa_switch_tree *dst); +int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, + const struct dsa_device_ops *tag_ops, + const struct dsa_device_ops *old_tag_ops); +void dsa_tree_master_admin_state_change(struct dsa_switch_tree *dst, + struct net_device *master, + bool up); +void dsa_tree_master_oper_state_change(struct dsa_switch_tree *dst, + struct net_device *master, + bool up); +unsigned int dsa_bridge_num_get(const struct net_device *bridge_dev, int max); +void dsa_bridge_num_put(const struct net_device *bridge_dev, + unsigned int bridge_num); +struct dsa_bridge *dsa_tree_bridge_find(struct dsa_switch_tree *dst, + const struct net_device *br); + +#endif diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c deleted file mode 100644 index 7a75b0767dd1..000000000000 --- a/net/dsa/dsa2.c +++ /dev/null @@ -1,1746 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * DSA topology and switch handling - * - * Copyright (c) 2008-2009 Marvell Semiconductor - * Copyright (c) 2013 Florian Fainelli - * Copyright (c) 2016 Andrew Lunn - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "devlink.h" -#include "dsa_priv.h" -#include "master.h" -#include "port.h" -#include "slave.h" -#include "switch.h" -#include "tag.h" - -static DEFINE_MUTEX(dsa2_mutex); -LIST_HEAD(dsa_tree_list); - -static struct workqueue_struct *dsa_owq; - -/* Track the bridges with forwarding offload enabled */ -static unsigned long dsa_fwd_offloading_bridges; - -bool dsa_schedule_work(struct work_struct *work) -{ - return queue_work(dsa_owq, work); -} - -void dsa_flush_workqueue(void) -{ - flush_workqueue(dsa_owq); -} -EXPORT_SYMBOL_GPL(dsa_flush_workqueue); - -/** - * dsa_lag_map() - Map LAG structure to a linear LAG array - * @dst: Tree in which to record the mapping. - * @lag: LAG structure that is to be mapped to the tree's array. - * - * dsa_lag_id/dsa_lag_by_id can then be used to translate between the - * two spaces. The size of the mapping space is determined by the - * driver by setting ds->num_lag_ids. It is perfectly legal to leave - * it unset if it is not needed, in which case these functions become - * no-ops. - */ -void dsa_lag_map(struct dsa_switch_tree *dst, struct dsa_lag *lag) -{ - unsigned int id; - - for (id = 1; id <= dst->lags_len; id++) { - if (!dsa_lag_by_id(dst, id)) { - dst->lags[id - 1] = lag; - lag->id = id; - return; - } - } - - /* No IDs left, which is OK. Some drivers do not need it. The - * ones that do, e.g. mv88e6xxx, will discover that dsa_lag_id - * returns an error for this device when joining the LAG. The - * driver can then return -EOPNOTSUPP back to DSA, which will - * fall back to a software LAG. - */ -} - -/** - * dsa_lag_unmap() - Remove a LAG ID mapping - * @dst: Tree in which the mapping is recorded. - * @lag: LAG structure that was mapped. - * - * As there may be multiple users of the mapping, it is only removed - * if there are no other references to it. - */ -void dsa_lag_unmap(struct dsa_switch_tree *dst, struct dsa_lag *lag) -{ - unsigned int id; - - dsa_lags_foreach_id(id, dst) { - if (dsa_lag_by_id(dst, id) == lag) { - dst->lags[id - 1] = NULL; - lag->id = 0; - break; - } - } -} - -struct dsa_lag *dsa_tree_lag_find(struct dsa_switch_tree *dst, - const struct net_device *lag_dev) -{ - struct dsa_port *dp; - - list_for_each_entry(dp, &dst->ports, list) - if (dsa_port_lag_dev_get(dp) == lag_dev) - return dp->lag; - - return NULL; -} - -struct dsa_bridge *dsa_tree_bridge_find(struct dsa_switch_tree *dst, - const struct net_device *br) -{ - struct dsa_port *dp; - - list_for_each_entry(dp, &dst->ports, list) - if (dsa_port_bridge_dev_get(dp) == br) - return dp->bridge; - - return NULL; -} - -static int dsa_bridge_num_find(const struct net_device *bridge_dev) -{ - struct dsa_switch_tree *dst; - - list_for_each_entry(dst, &dsa_tree_list, list) { - struct dsa_bridge *bridge; - - bridge = dsa_tree_bridge_find(dst, bridge_dev); - if (bridge) - return bridge->num; - } - - return 0; -} - -unsigned int dsa_bridge_num_get(const struct net_device *bridge_dev, int max) -{ - unsigned int bridge_num = dsa_bridge_num_find(bridge_dev); - - /* Switches without FDB isolation support don't get unique - * bridge numbering - */ - if (!max) - return 0; - - if (!bridge_num) { - /* First port that requests FDB isolation or TX forwarding - * offload for this bridge - */ - bridge_num = find_next_zero_bit(&dsa_fwd_offloading_bridges, - DSA_MAX_NUM_OFFLOADING_BRIDGES, - 1); - if (bridge_num >= max) - return 0; - - set_bit(bridge_num, &dsa_fwd_offloading_bridges); - } - - return bridge_num; -} - -void dsa_bridge_num_put(const struct net_device *bridge_dev, - unsigned int bridge_num) -{ - /* Since we refcount bridges, we know that when we call this function - * it is no longer in use, so we can just go ahead and remove it from - * the bit mask. - */ - clear_bit(bridge_num, &dsa_fwd_offloading_bridges); -} - -struct dsa_switch *dsa_switch_find(int tree_index, int sw_index) -{ - struct dsa_switch_tree *dst; - struct dsa_port *dp; - - list_for_each_entry(dst, &dsa_tree_list, list) { - if (dst->index != tree_index) - continue; - - list_for_each_entry(dp, &dst->ports, list) { - if (dp->ds->index != sw_index) - continue; - - return dp->ds; - } - } - - return NULL; -} -EXPORT_SYMBOL_GPL(dsa_switch_find); - -static struct dsa_switch_tree *dsa_tree_find(int index) -{ - struct dsa_switch_tree *dst; - - list_for_each_entry(dst, &dsa_tree_list, list) - if (dst->index == index) - return dst; - - return NULL; -} - -static struct dsa_switch_tree *dsa_tree_alloc(int index) -{ - struct dsa_switch_tree *dst; - - dst = kzalloc(sizeof(*dst), GFP_KERNEL); - if (!dst) - return NULL; - - dst->index = index; - - INIT_LIST_HEAD(&dst->rtable); - - INIT_LIST_HEAD(&dst->ports); - - INIT_LIST_HEAD(&dst->list); - list_add_tail(&dst->list, &dsa_tree_list); - - kref_init(&dst->refcount); - - return dst; -} - -static void dsa_tree_free(struct dsa_switch_tree *dst) -{ - if (dst->tag_ops) - dsa_tag_driver_put(dst->tag_ops); - list_del(&dst->list); - kfree(dst); -} - -static struct dsa_switch_tree *dsa_tree_get(struct dsa_switch_tree *dst) -{ - if (dst) - kref_get(&dst->refcount); - - return dst; -} - -static struct dsa_switch_tree *dsa_tree_touch(int index) -{ - struct dsa_switch_tree *dst; - - dst = dsa_tree_find(index); - if (dst) - return dsa_tree_get(dst); - else - return dsa_tree_alloc(index); -} - -static void dsa_tree_release(struct kref *ref) -{ - struct dsa_switch_tree *dst; - - dst = container_of(ref, struct dsa_switch_tree, refcount); - - dsa_tree_free(dst); -} - -static void dsa_tree_put(struct dsa_switch_tree *dst) -{ - if (dst) - kref_put(&dst->refcount, dsa_tree_release); -} - -static struct dsa_port *dsa_tree_find_port_by_node(struct dsa_switch_tree *dst, - struct device_node *dn) -{ - struct dsa_port *dp; - - list_for_each_entry(dp, &dst->ports, list) - if (dp->dn == dn) - return dp; - - return NULL; -} - -static struct dsa_link *dsa_link_touch(struct dsa_port *dp, - struct dsa_port *link_dp) -{ - struct dsa_switch *ds = dp->ds; - struct dsa_switch_tree *dst; - struct dsa_link *dl; - - dst = ds->dst; - - list_for_each_entry(dl, &dst->rtable, list) - if (dl->dp == dp && dl->link_dp == link_dp) - return dl; - - dl = kzalloc(sizeof(*dl), GFP_KERNEL); - if (!dl) - return NULL; - - dl->dp = dp; - dl->link_dp = link_dp; - - INIT_LIST_HEAD(&dl->list); - list_add_tail(&dl->list, &dst->rtable); - - return dl; -} - -static bool dsa_port_setup_routing_table(struct dsa_port *dp) -{ - struct dsa_switch *ds = dp->ds; - struct dsa_switch_tree *dst = ds->dst; - struct device_node *dn = dp->dn; - struct of_phandle_iterator it; - struct dsa_port *link_dp; - struct dsa_link *dl; - int err; - - of_for_each_phandle(&it, err, dn, "link", NULL, 0) { - link_dp = dsa_tree_find_port_by_node(dst, it.node); - if (!link_dp) { - of_node_put(it.node); - return false; - } - - dl = dsa_link_touch(dp, link_dp); - if (!dl) { - of_node_put(it.node); - return false; - } - } - - return true; -} - -static bool dsa_tree_setup_routing_table(struct dsa_switch_tree *dst) -{ - bool complete = true; - struct dsa_port *dp; - - list_for_each_entry(dp, &dst->ports, list) { - if (dsa_port_is_dsa(dp)) { - complete = dsa_port_setup_routing_table(dp); - if (!complete) - break; - } - } - - return complete; -} - -static struct dsa_port *dsa_tree_find_first_cpu(struct dsa_switch_tree *dst) -{ - struct dsa_port *dp; - - list_for_each_entry(dp, &dst->ports, list) - if (dsa_port_is_cpu(dp)) - return dp; - - return NULL; -} - -struct net_device *dsa_tree_find_first_master(struct dsa_switch_tree *dst) -{ - struct device_node *ethernet; - struct net_device *master; - struct dsa_port *cpu_dp; - - cpu_dp = dsa_tree_find_first_cpu(dst); - ethernet = of_parse_phandle(cpu_dp->dn, "ethernet", 0); - master = of_find_net_device_by_node(ethernet); - of_node_put(ethernet); - - return master; -} - -/* Assign the default CPU port (the first one in the tree) to all ports of the - * fabric which don't already have one as part of their own switch. - */ -static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst) -{ - struct dsa_port *cpu_dp, *dp; - - cpu_dp = dsa_tree_find_first_cpu(dst); - if (!cpu_dp) { - pr_err("DSA: tree %d has no CPU port\n", dst->index); - return -EINVAL; - } - - list_for_each_entry(dp, &dst->ports, list) { - if (dp->cpu_dp) - continue; - - if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp)) - dp->cpu_dp = cpu_dp; - } - - return 0; -} - -/* Perform initial assignment of CPU ports to user ports and DSA links in the - * fabric, giving preference to CPU ports local to each switch. Default to - * using the first CPU port in the switch tree if the port does not have a CPU - * port local to this switch. - */ -static int dsa_tree_setup_cpu_ports(struct dsa_switch_tree *dst) -{ - struct dsa_port *cpu_dp, *dp; - - list_for_each_entry(cpu_dp, &dst->ports, list) { - if (!dsa_port_is_cpu(cpu_dp)) - continue; - - /* Prefer a local CPU port */ - dsa_switch_for_each_port(dp, cpu_dp->ds) { - /* Prefer the first local CPU port found */ - if (dp->cpu_dp) - continue; - - if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp)) - dp->cpu_dp = cpu_dp; - } - } - - return dsa_tree_setup_default_cpu(dst); -} - -static void dsa_tree_teardown_cpu_ports(struct dsa_switch_tree *dst) -{ - struct dsa_port *dp; - - list_for_each_entry(dp, &dst->ports, list) - if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp)) - dp->cpu_dp = NULL; -} - -static int dsa_port_setup(struct dsa_port *dp) -{ - bool dsa_port_link_registered = false; - struct dsa_switch *ds = dp->ds; - bool dsa_port_enabled = false; - int err = 0; - - if (dp->setup) - return 0; - - err = dsa_port_devlink_setup(dp); - if (err) - return err; - - switch (dp->type) { - case DSA_PORT_TYPE_UNUSED: - dsa_port_disable(dp); - break; - case DSA_PORT_TYPE_CPU: - if (dp->dn) { - err = dsa_shared_port_link_register_of(dp); - if (err) - break; - dsa_port_link_registered = true; - } else { - dev_warn(ds->dev, - "skipping link registration for CPU port %d\n", - dp->index); - } - - err = dsa_port_enable(dp, NULL); - if (err) - break; - dsa_port_enabled = true; - - break; - case DSA_PORT_TYPE_DSA: - if (dp->dn) { - err = dsa_shared_port_link_register_of(dp); - if (err) - break; - dsa_port_link_registered = true; - } else { - dev_warn(ds->dev, - "skipping link registration for DSA port %d\n", - dp->index); - } - - err = dsa_port_enable(dp, NULL); - if (err) - break; - dsa_port_enabled = true; - - break; - case DSA_PORT_TYPE_USER: - of_get_mac_address(dp->dn, dp->mac); - err = dsa_slave_create(dp); - break; - } - - if (err && dsa_port_enabled) - dsa_port_disable(dp); - if (err && dsa_port_link_registered) - dsa_shared_port_link_unregister_of(dp); - if (err) { - dsa_port_devlink_teardown(dp); - return err; - } - - dp->setup = true; - - return 0; -} - -static void dsa_port_teardown(struct dsa_port *dp) -{ - if (!dp->setup) - return; - - switch (dp->type) { - case DSA_PORT_TYPE_UNUSED: - break; - case DSA_PORT_TYPE_CPU: - dsa_port_disable(dp); - if (dp->dn) - dsa_shared_port_link_unregister_of(dp); - break; - case DSA_PORT_TYPE_DSA: - dsa_port_disable(dp); - if (dp->dn) - dsa_shared_port_link_unregister_of(dp); - break; - case DSA_PORT_TYPE_USER: - if (dp->slave) { - dsa_slave_destroy(dp->slave); - dp->slave = NULL; - } - break; - } - - dsa_port_devlink_teardown(dp); - - dp->setup = false; -} - -static int dsa_port_setup_as_unused(struct dsa_port *dp) -{ - dp->type = DSA_PORT_TYPE_UNUSED; - return dsa_port_setup(dp); -} - -static int dsa_switch_setup_tag_protocol(struct dsa_switch *ds) -{ - const struct dsa_device_ops *tag_ops = ds->dst->tag_ops; - struct dsa_switch_tree *dst = ds->dst; - int err; - - if (tag_ops->proto == dst->default_proto) - goto connect; - - rtnl_lock(); - err = ds->ops->change_tag_protocol(ds, tag_ops->proto); - rtnl_unlock(); - if (err) { - dev_err(ds->dev, "Unable to use tag protocol \"%s\": %pe\n", - tag_ops->name, ERR_PTR(err)); - return err; - } - -connect: - if (tag_ops->connect) { - err = tag_ops->connect(ds); - if (err) - return err; - } - - if (ds->ops->connect_tag_protocol) { - err = ds->ops->connect_tag_protocol(ds, tag_ops->proto); - if (err) { - dev_err(ds->dev, - "Unable to connect to tag protocol \"%s\": %pe\n", - tag_ops->name, ERR_PTR(err)); - goto disconnect; - } - } - - return 0; - -disconnect: - if (tag_ops->disconnect) - tag_ops->disconnect(ds); - - return err; -} - -static void dsa_switch_teardown_tag_protocol(struct dsa_switch *ds) -{ - const struct dsa_device_ops *tag_ops = ds->dst->tag_ops; - - if (tag_ops->disconnect) - tag_ops->disconnect(ds); -} - -static int dsa_switch_setup(struct dsa_switch *ds) -{ - struct device_node *dn; - int err; - - if (ds->setup) - return 0; - - /* Initialize ds->phys_mii_mask before registering the slave MDIO bus - * driver and before ops->setup() has run, since the switch drivers and - * the slave MDIO bus driver rely on these values for probing PHY - * devices or not - */ - ds->phys_mii_mask |= dsa_user_ports(ds); - - err = dsa_switch_devlink_alloc(ds); - if (err) - return err; - - err = dsa_switch_register_notifier(ds); - if (err) - goto devlink_free; - - ds->configure_vlan_while_not_filtering = true; - - err = ds->ops->setup(ds); - if (err < 0) - goto unregister_notifier; - - err = dsa_switch_setup_tag_protocol(ds); - if (err) - goto teardown; - - if (!ds->slave_mii_bus && ds->ops->phy_read) { - ds->slave_mii_bus = mdiobus_alloc(); - if (!ds->slave_mii_bus) { - err = -ENOMEM; - goto teardown; - } - - dsa_slave_mii_bus_init(ds); - - dn = of_get_child_by_name(ds->dev->of_node, "mdio"); - - err = of_mdiobus_register(ds->slave_mii_bus, dn); - of_node_put(dn); - if (err < 0) - goto free_slave_mii_bus; - } - - dsa_switch_devlink_register(ds); - - ds->setup = true; - return 0; - -free_slave_mii_bus: - if (ds->slave_mii_bus && ds->ops->phy_read) - mdiobus_free(ds->slave_mii_bus); -teardown: - if (ds->ops->teardown) - ds->ops->teardown(ds); -unregister_notifier: - dsa_switch_unregister_notifier(ds); -devlink_free: - dsa_switch_devlink_free(ds); - return err; -} - -static void dsa_switch_teardown(struct dsa_switch *ds) -{ - if (!ds->setup) - return; - - dsa_switch_devlink_unregister(ds); - - if (ds->slave_mii_bus && ds->ops->phy_read) { - mdiobus_unregister(ds->slave_mii_bus); - mdiobus_free(ds->slave_mii_bus); - ds->slave_mii_bus = NULL; - } - - dsa_switch_teardown_tag_protocol(ds); - - if (ds->ops->teardown) - ds->ops->teardown(ds); - - dsa_switch_unregister_notifier(ds); - - dsa_switch_devlink_free(ds); - - ds->setup = false; -} - -/* First tear down the non-shared, then the shared ports. This ensures that - * all work items scheduled by our switchdev handlers for user ports have - * completed before we destroy the refcounting kept on the shared ports. - */ -static void dsa_tree_teardown_ports(struct dsa_switch_tree *dst) -{ - struct dsa_port *dp; - - list_for_each_entry(dp, &dst->ports, list) - if (dsa_port_is_user(dp) || dsa_port_is_unused(dp)) - dsa_port_teardown(dp); - - dsa_flush_workqueue(); - - list_for_each_entry(dp, &dst->ports, list) - if (dsa_port_is_dsa(dp) || dsa_port_is_cpu(dp)) - dsa_port_teardown(dp); -} - -static void dsa_tree_teardown_switches(struct dsa_switch_tree *dst) -{ - struct dsa_port *dp; - - list_for_each_entry(dp, &dst->ports, list) - dsa_switch_teardown(dp->ds); -} - -/* Bring shared ports up first, then non-shared ports */ -static int dsa_tree_setup_ports(struct dsa_switch_tree *dst) -{ - struct dsa_port *dp; - int err = 0; - - list_for_each_entry(dp, &dst->ports, list) { - if (dsa_port_is_dsa(dp) || dsa_port_is_cpu(dp)) { - err = dsa_port_setup(dp); - if (err) - goto teardown; - } - } - - list_for_each_entry(dp, &dst->ports, list) { - if (dsa_port_is_user(dp) || dsa_port_is_unused(dp)) { - err = dsa_port_setup(dp); - if (err) { - err = dsa_port_setup_as_unused(dp); - if (err) - goto teardown; - } - } - } - - return 0; - -teardown: - dsa_tree_teardown_ports(dst); - - return err; -} - -static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) -{ - struct dsa_port *dp; - int err = 0; - - list_for_each_entry(dp, &dst->ports, list) { - err = dsa_switch_setup(dp->ds); - if (err) { - dsa_tree_teardown_switches(dst); - break; - } - } - - return err; -} - -static int dsa_tree_setup_master(struct dsa_switch_tree *dst) -{ - struct dsa_port *cpu_dp; - int err = 0; - - rtnl_lock(); - - dsa_tree_for_each_cpu_port(cpu_dp, dst) { - struct net_device *master = cpu_dp->master; - bool admin_up = (master->flags & IFF_UP) && - !qdisc_tx_is_noop(master); - - err = dsa_master_setup(master, cpu_dp); - if (err) - break; - - /* Replay master state event */ - dsa_tree_master_admin_state_change(dst, master, admin_up); - dsa_tree_master_oper_state_change(dst, master, - netif_oper_up(master)); - } - - rtnl_unlock(); - - return err; -} - -static void dsa_tree_teardown_master(struct dsa_switch_tree *dst) -{ - struct dsa_port *cpu_dp; - - rtnl_lock(); - - dsa_tree_for_each_cpu_port(cpu_dp, dst) { - struct net_device *master = cpu_dp->master; - - /* Synthesizing an "admin down" state is sufficient for - * the switches to get a notification if the master is - * currently up and running. - */ - dsa_tree_master_admin_state_change(dst, master, false); - - dsa_master_teardown(master); - } - - rtnl_unlock(); -} - -static int dsa_tree_setup_lags(struct dsa_switch_tree *dst) -{ - unsigned int len = 0; - struct dsa_port *dp; - - list_for_each_entry(dp, &dst->ports, list) { - if (dp->ds->num_lag_ids > len) - len = dp->ds->num_lag_ids; - } - - if (!len) - return 0; - - dst->lags = kcalloc(len, sizeof(*dst->lags), GFP_KERNEL); - if (!dst->lags) - return -ENOMEM; - - dst->lags_len = len; - return 0; -} - -static void dsa_tree_teardown_lags(struct dsa_switch_tree *dst) -{ - kfree(dst->lags); -} - -static int dsa_tree_setup(struct dsa_switch_tree *dst) -{ - bool complete; - int err; - - if (dst->setup) { - pr_err("DSA: tree %d already setup! Disjoint trees?\n", - dst->index); - return -EEXIST; - } - - complete = dsa_tree_setup_routing_table(dst); - if (!complete) - return 0; - - err = dsa_tree_setup_cpu_ports(dst); - if (err) - return err; - - err = dsa_tree_setup_switches(dst); - if (err) - goto teardown_cpu_ports; - - err = dsa_tree_setup_ports(dst); - if (err) - goto teardown_switches; - - err = dsa_tree_setup_master(dst); - if (err) - goto teardown_ports; - - err = dsa_tree_setup_lags(dst); - if (err) - goto teardown_master; - - dst->setup = true; - - pr_info("DSA: tree %d setup\n", dst->index); - - return 0; - -teardown_master: - dsa_tree_teardown_master(dst); -teardown_ports: - dsa_tree_teardown_ports(dst); -teardown_switches: - dsa_tree_teardown_switches(dst); -teardown_cpu_ports: - dsa_tree_teardown_cpu_ports(dst); - - return err; -} - -static void dsa_tree_teardown(struct dsa_switch_tree *dst) -{ - struct dsa_link *dl, *next; - - if (!dst->setup) - return; - - dsa_tree_teardown_lags(dst); - - dsa_tree_teardown_master(dst); - - dsa_tree_teardown_ports(dst); - - dsa_tree_teardown_switches(dst); - - dsa_tree_teardown_cpu_ports(dst); - - list_for_each_entry_safe(dl, next, &dst->rtable, list) { - list_del(&dl->list); - kfree(dl); - } - - pr_info("DSA: tree %d torn down\n", dst->index); - - dst->setup = false; -} - -static int dsa_tree_bind_tag_proto(struct dsa_switch_tree *dst, - const struct dsa_device_ops *tag_ops) -{ - const struct dsa_device_ops *old_tag_ops = dst->tag_ops; - struct dsa_notifier_tag_proto_info info; - int err; - - dst->tag_ops = tag_ops; - - /* Notify the switches from this tree about the connection - * to the new tagger - */ - info.tag_ops = tag_ops; - err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_CONNECT, &info); - if (err && err != -EOPNOTSUPP) - goto out_disconnect; - - /* Notify the old tagger about the disconnection from this tree */ - info.tag_ops = old_tag_ops; - dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_DISCONNECT, &info); - - return 0; - -out_disconnect: - info.tag_ops = tag_ops; - dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_DISCONNECT, &info); - dst->tag_ops = old_tag_ops; - - return err; -} - -/* Since the dsa/tagging sysfs device attribute is per master, the assumption - * is that all DSA switches within a tree share the same tagger, otherwise - * they would have formed disjoint trees (different "dsa,member" values). - */ -int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, - const struct dsa_device_ops *tag_ops, - const struct dsa_device_ops *old_tag_ops) -{ - struct dsa_notifier_tag_proto_info info; - struct dsa_port *dp; - int err = -EBUSY; - - if (!rtnl_trylock()) - return restart_syscall(); - - /* At the moment we don't allow changing the tag protocol under - * traffic. The rtnl_mutex also happens to serialize concurrent - * attempts to change the tagging protocol. If we ever lift the IFF_UP - * restriction, there needs to be another mutex which serializes this. - */ - dsa_tree_for_each_user_port(dp, dst) { - if (dsa_port_to_master(dp)->flags & IFF_UP) - goto out_unlock; - - if (dp->slave->flags & IFF_UP) - goto out_unlock; - } - - /* Notify the tag protocol change */ - info.tag_ops = tag_ops; - err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info); - if (err) - goto out_unwind_tagger; - - err = dsa_tree_bind_tag_proto(dst, tag_ops); - if (err) - goto out_unwind_tagger; - - rtnl_unlock(); - - return 0; - -out_unwind_tagger: - info.tag_ops = old_tag_ops; - dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info); -out_unlock: - rtnl_unlock(); - return err; -} - -static void dsa_tree_master_state_change(struct dsa_switch_tree *dst, - struct net_device *master) -{ - struct dsa_notifier_master_state_info info; - struct dsa_port *cpu_dp = master->dsa_ptr; - - info.master = master; - info.operational = dsa_port_master_is_operational(cpu_dp); - - dsa_tree_notify(dst, DSA_NOTIFIER_MASTER_STATE_CHANGE, &info); -} - -void dsa_tree_master_admin_state_change(struct dsa_switch_tree *dst, - struct net_device *master, - bool up) -{ - struct dsa_port *cpu_dp = master->dsa_ptr; - bool notify = false; - - /* Don't keep track of admin state on LAG DSA masters, - * but rather just of physical DSA masters - */ - if (netif_is_lag_master(master)) - return; - - if ((dsa_port_master_is_operational(cpu_dp)) != - (up && cpu_dp->master_oper_up)) - notify = true; - - cpu_dp->master_admin_up = up; - - if (notify) - dsa_tree_master_state_change(dst, master); -} - -void dsa_tree_master_oper_state_change(struct dsa_switch_tree *dst, - struct net_device *master, - bool up) -{ - struct dsa_port *cpu_dp = master->dsa_ptr; - bool notify = false; - - /* Don't keep track of oper state on LAG DSA masters, - * but rather just of physical DSA masters - */ - if (netif_is_lag_master(master)) - return; - - if ((dsa_port_master_is_operational(cpu_dp)) != - (cpu_dp->master_admin_up && up)) - notify = true; - - cpu_dp->master_oper_up = up; - - if (notify) - dsa_tree_master_state_change(dst, master); -} - -static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index) -{ - struct dsa_switch_tree *dst = ds->dst; - struct dsa_port *dp; - - dsa_switch_for_each_port(dp, ds) - if (dp->index == index) - return dp; - - dp = kzalloc(sizeof(*dp), GFP_KERNEL); - if (!dp) - return NULL; - - dp->ds = ds; - dp->index = index; - - mutex_init(&dp->addr_lists_lock); - mutex_init(&dp->vlans_lock); - INIT_LIST_HEAD(&dp->fdbs); - INIT_LIST_HEAD(&dp->mdbs); - INIT_LIST_HEAD(&dp->vlans); - INIT_LIST_HEAD(&dp->list); - list_add_tail(&dp->list, &dst->ports); - - return dp; -} - -static int dsa_port_parse_user(struct dsa_port *dp, const char *name) -{ - dp->type = DSA_PORT_TYPE_USER; - dp->name = name; - - return 0; -} - -static int dsa_port_parse_dsa(struct dsa_port *dp) -{ - dp->type = DSA_PORT_TYPE_DSA; - - return 0; -} - -static enum dsa_tag_protocol dsa_get_tag_protocol(struct dsa_port *dp, - struct net_device *master) -{ - enum dsa_tag_protocol tag_protocol = DSA_TAG_PROTO_NONE; - struct dsa_switch *mds, *ds = dp->ds; - unsigned int mdp_upstream; - struct dsa_port *mdp; - - /* It is possible to stack DSA switches onto one another when that - * happens the switch driver may want to know if its tagging protocol - * is going to work in such a configuration. - */ - if (dsa_slave_dev_check(master)) { - mdp = dsa_slave_to_port(master); - mds = mdp->ds; - mdp_upstream = dsa_upstream_port(mds, mdp->index); - tag_protocol = mds->ops->get_tag_protocol(mds, mdp_upstream, - DSA_TAG_PROTO_NONE); - } - - /* If the master device is not itself a DSA slave in a disjoint DSA - * tree, then return immediately. - */ - return ds->ops->get_tag_protocol(ds, dp->index, tag_protocol); -} - -static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master, - const char *user_protocol) -{ - const struct dsa_device_ops *tag_ops = NULL; - struct dsa_switch *ds = dp->ds; - struct dsa_switch_tree *dst = ds->dst; - enum dsa_tag_protocol default_proto; - - /* Find out which protocol the switch would prefer. */ - default_proto = dsa_get_tag_protocol(dp, master); - if (dst->default_proto) { - if (dst->default_proto != default_proto) { - dev_err(ds->dev, - "A DSA switch tree can have only one tagging protocol\n"); - return -EINVAL; - } - } else { - dst->default_proto = default_proto; - } - - /* See if the user wants to override that preference. */ - if (user_protocol) { - if (!ds->ops->change_tag_protocol) { - dev_err(ds->dev, "Tag protocol cannot be modified\n"); - return -EINVAL; - } - - tag_ops = dsa_tag_driver_get_by_name(user_protocol); - if (IS_ERR(tag_ops)) { - dev_warn(ds->dev, - "Failed to find a tagging driver for protocol %s, using default\n", - user_protocol); - tag_ops = NULL; - } - } - - if (!tag_ops) - tag_ops = dsa_tag_driver_get_by_id(default_proto); - - if (IS_ERR(tag_ops)) { - if (PTR_ERR(tag_ops) == -ENOPROTOOPT) - return -EPROBE_DEFER; - - dev_warn(ds->dev, "No tagger for this switch\n"); - return PTR_ERR(tag_ops); - } - - if (dst->tag_ops) { - if (dst->tag_ops != tag_ops) { - dev_err(ds->dev, - "A DSA switch tree can have only one tagging protocol\n"); - - dsa_tag_driver_put(tag_ops); - return -EINVAL; - } - - /* In the case of multiple CPU ports per switch, the tagging - * protocol is still reference-counted only per switch tree. - */ - dsa_tag_driver_put(tag_ops); - } else { - dst->tag_ops = tag_ops; - } - - dp->master = master; - dp->type = DSA_PORT_TYPE_CPU; - dsa_port_set_tag_protocol(dp, dst->tag_ops); - dp->dst = dst; - - /* At this point, the tree may be configured to use a different - * tagger than the one chosen by the switch driver during - * .setup, in the case when a user selects a custom protocol - * through the DT. - * - * This is resolved by syncing the driver with the tree in - * dsa_switch_setup_tag_protocol once .setup has run and the - * driver is ready to accept calls to .change_tag_protocol. If - * the driver does not support the custom protocol at that - * point, the tree is wholly rejected, thereby ensuring that the - * tree and driver are always in agreement on the protocol to - * use. - */ - return 0; -} - -static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn) -{ - struct device_node *ethernet = of_parse_phandle(dn, "ethernet", 0); - const char *name = of_get_property(dn, "label", NULL); - bool link = of_property_read_bool(dn, "link"); - - dp->dn = dn; - - if (ethernet) { - struct net_device *master; - const char *user_protocol; - - master = of_find_net_device_by_node(ethernet); - of_node_put(ethernet); - if (!master) - return -EPROBE_DEFER; - - user_protocol = of_get_property(dn, "dsa-tag-protocol", NULL); - return dsa_port_parse_cpu(dp, master, user_protocol); - } - - if (link) - return dsa_port_parse_dsa(dp); - - return dsa_port_parse_user(dp, name); -} - -static int dsa_switch_parse_ports_of(struct dsa_switch *ds, - struct device_node *dn) -{ - struct device_node *ports, *port; - struct dsa_port *dp; - int err = 0; - u32 reg; - - ports = of_get_child_by_name(dn, "ports"); - if (!ports) { - /* The second possibility is "ethernet-ports" */ - ports = of_get_child_by_name(dn, "ethernet-ports"); - if (!ports) { - dev_err(ds->dev, "no ports child node found\n"); - return -EINVAL; - } - } - - for_each_available_child_of_node(ports, port) { - err = of_property_read_u32(port, "reg", ®); - if (err) { - of_node_put(port); - goto out_put_node; - } - - if (reg >= ds->num_ports) { - dev_err(ds->dev, "port %pOF index %u exceeds num_ports (%u)\n", - port, reg, ds->num_ports); - of_node_put(port); - err = -EINVAL; - goto out_put_node; - } - - dp = dsa_to_port(ds, reg); - - err = dsa_port_parse_of(dp, port); - if (err) { - of_node_put(port); - goto out_put_node; - } - } - -out_put_node: - of_node_put(ports); - return err; -} - -static int dsa_switch_parse_member_of(struct dsa_switch *ds, - struct device_node *dn) -{ - u32 m[2] = { 0, 0 }; - int sz; - - /* Don't error out if this optional property isn't found */ - sz = of_property_read_variable_u32_array(dn, "dsa,member", m, 2, 2); - if (sz < 0 && sz != -EINVAL) - return sz; - - ds->index = m[1]; - - ds->dst = dsa_tree_touch(m[0]); - if (!ds->dst) - return -ENOMEM; - - if (dsa_switch_find(ds->dst->index, ds->index)) { - dev_err(ds->dev, - "A DSA switch with index %d already exists in tree %d\n", - ds->index, ds->dst->index); - return -EEXIST; - } - - if (ds->dst->last_switch < ds->index) - ds->dst->last_switch = ds->index; - - return 0; -} - -static int dsa_switch_touch_ports(struct dsa_switch *ds) -{ - struct dsa_port *dp; - int port; - - for (port = 0; port < ds->num_ports; port++) { - dp = dsa_port_touch(ds, port); - if (!dp) - return -ENOMEM; - } - - return 0; -} - -static int dsa_switch_parse_of(struct dsa_switch *ds, struct device_node *dn) -{ - int err; - - err = dsa_switch_parse_member_of(ds, dn); - if (err) - return err; - - err = dsa_switch_touch_ports(ds); - if (err) - return err; - - return dsa_switch_parse_ports_of(ds, dn); -} - -static int dev_is_class(struct device *dev, void *class) -{ - if (dev->class != NULL && !strcmp(dev->class->name, class)) - return 1; - - return 0; -} - -static struct device *dev_find_class(struct device *parent, char *class) -{ - if (dev_is_class(parent, class)) { - get_device(parent); - return parent; - } - - return device_find_child(parent, class, dev_is_class); -} - -static struct net_device *dsa_dev_to_net_device(struct device *dev) -{ - struct device *d; - - d = dev_find_class(dev, "net"); - if (d != NULL) { - struct net_device *nd; - - nd = to_net_dev(d); - dev_hold(nd); - put_device(d); - - return nd; - } - - return NULL; -} - -static int dsa_port_parse(struct dsa_port *dp, const char *name, - struct device *dev) -{ - if (!strcmp(name, "cpu")) { - struct net_device *master; - - master = dsa_dev_to_net_device(dev); - if (!master) - return -EPROBE_DEFER; - - dev_put(master); - - return dsa_port_parse_cpu(dp, master, NULL); - } - - if (!strcmp(name, "dsa")) - return dsa_port_parse_dsa(dp); - - return dsa_port_parse_user(dp, name); -} - -static int dsa_switch_parse_ports(struct dsa_switch *ds, - struct dsa_chip_data *cd) -{ - bool valid_name_found = false; - struct dsa_port *dp; - struct device *dev; - const char *name; - unsigned int i; - int err; - - for (i = 0; i < DSA_MAX_PORTS; i++) { - name = cd->port_names[i]; - dev = cd->netdev[i]; - dp = dsa_to_port(ds, i); - - if (!name) - continue; - - err = dsa_port_parse(dp, name, dev); - if (err) - return err; - - valid_name_found = true; - } - - if (!valid_name_found && i == DSA_MAX_PORTS) - return -EINVAL; - - return 0; -} - -static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd) -{ - int err; - - ds->cd = cd; - - /* We don't support interconnected switches nor multiple trees via - * platform data, so this is the unique switch of the tree. - */ - ds->index = 0; - ds->dst = dsa_tree_touch(0); - if (!ds->dst) - return -ENOMEM; - - err = dsa_switch_touch_ports(ds); - if (err) - return err; - - return dsa_switch_parse_ports(ds, cd); -} - -static void dsa_switch_release_ports(struct dsa_switch *ds) -{ - struct dsa_port *dp, *next; - - dsa_switch_for_each_port_safe(dp, next, ds) { - WARN_ON(!list_empty(&dp->fdbs)); - WARN_ON(!list_empty(&dp->mdbs)); - WARN_ON(!list_empty(&dp->vlans)); - list_del(&dp->list); - kfree(dp); - } -} - -static int dsa_switch_probe(struct dsa_switch *ds) -{ - struct dsa_switch_tree *dst; - struct dsa_chip_data *pdata; - struct device_node *np; - int err; - - if (!ds->dev) - return -ENODEV; - - pdata = ds->dev->platform_data; - np = ds->dev->of_node; - - if (!ds->num_ports) - return -EINVAL; - - if (np) { - err = dsa_switch_parse_of(ds, np); - if (err) - dsa_switch_release_ports(ds); - } else if (pdata) { - err = dsa_switch_parse(ds, pdata); - if (err) - dsa_switch_release_ports(ds); - } else { - err = -ENODEV; - } - - if (err) - return err; - - dst = ds->dst; - dsa_tree_get(dst); - err = dsa_tree_setup(dst); - if (err) { - dsa_switch_release_ports(ds); - dsa_tree_put(dst); - } - - return err; -} - -int dsa_register_switch(struct dsa_switch *ds) -{ - int err; - - mutex_lock(&dsa2_mutex); - err = dsa_switch_probe(ds); - dsa_tree_put(ds->dst); - mutex_unlock(&dsa2_mutex); - - return err; -} -EXPORT_SYMBOL_GPL(dsa_register_switch); - -static void dsa_switch_remove(struct dsa_switch *ds) -{ - struct dsa_switch_tree *dst = ds->dst; - - dsa_tree_teardown(dst); - dsa_switch_release_ports(ds); - dsa_tree_put(dst); -} - -void dsa_unregister_switch(struct dsa_switch *ds) -{ - mutex_lock(&dsa2_mutex); - dsa_switch_remove(ds); - mutex_unlock(&dsa2_mutex); -} -EXPORT_SYMBOL_GPL(dsa_unregister_switch); - -/* If the DSA master chooses to unregister its net_device on .shutdown, DSA is - * blocking that operation from completion, due to the dev_hold taken inside - * netdev_upper_dev_link. Unlink the DSA slave interfaces from being uppers of - * the DSA master, so that the system can reboot successfully. - */ -void dsa_switch_shutdown(struct dsa_switch *ds) -{ - struct net_device *master, *slave_dev; - struct dsa_port *dp; - - mutex_lock(&dsa2_mutex); - - if (!ds->setup) - goto out; - - rtnl_lock(); - - dsa_switch_for_each_user_port(dp, ds) { - master = dsa_port_to_master(dp); - slave_dev = dp->slave; - - netdev_upper_dev_unlink(master, slave_dev); - } - - /* Disconnect from further netdevice notifiers on the master, - * since netdev_uses_dsa() will now return false. - */ - dsa_switch_for_each_cpu_port(dp, ds) - dp->master->dsa_ptr = NULL; - - rtnl_unlock(); -out: - mutex_unlock(&dsa2_mutex); -} -EXPORT_SYMBOL_GPL(dsa_switch_shutdown); - -#ifdef CONFIG_PM_SLEEP -static bool dsa_port_is_initialized(const struct dsa_port *dp) -{ - return dp->type == DSA_PORT_TYPE_USER && dp->slave; -} - -int dsa_switch_suspend(struct dsa_switch *ds) -{ - struct dsa_port *dp; - int ret = 0; - - /* Suspend slave network devices */ - dsa_switch_for_each_port(dp, ds) { - if (!dsa_port_is_initialized(dp)) - continue; - - ret = dsa_slave_suspend(dp->slave); - if (ret) - return ret; - } - - if (ds->ops->suspend) - ret = ds->ops->suspend(ds); - - return ret; -} -EXPORT_SYMBOL_GPL(dsa_switch_suspend); - -int dsa_switch_resume(struct dsa_switch *ds) -{ - struct dsa_port *dp; - int ret = 0; - - if (ds->ops->resume) - ret = ds->ops->resume(ds); - - if (ret) - return ret; - - /* Resume slave network devices */ - dsa_switch_for_each_port(dp, ds) { - if (!dsa_port_is_initialized(dp)) - continue; - - ret = dsa_slave_resume(dp->slave); - if (ret) - return ret; - } - - return 0; -} -EXPORT_SYMBOL_GPL(dsa_switch_resume); -#endif - -struct dsa_port *dsa_port_from_netdev(struct net_device *netdev) -{ - if (!netdev || !dsa_slave_dev_check(netdev)) - return ERR_PTR(-ENODEV); - - return dsa_slave_to_port(netdev); -} -EXPORT_SYMBOL_GPL(dsa_port_from_netdev); - -bool dsa_db_equal(const struct dsa_db *a, const struct dsa_db *b) -{ - if (a->type != b->type) - return false; - - switch (a->type) { - case DSA_DB_PORT: - return a->dp == b->dp; - case DSA_DB_LAG: - return a->lag.dev == b->lag.dev; - case DSA_DB_BRIDGE: - return a->bridge.num == b->bridge.num; - default: - WARN_ON(1); - return false; - } -} - -bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid, - struct dsa_db db) -{ - struct dsa_port *dp = dsa_to_port(ds, port); - struct dsa_mac_addr *a; - - lockdep_assert_held(&dp->addr_lists_lock); - - list_for_each_entry(a, &dp->fdbs, list) { - if (!ether_addr_equal(a->addr, addr) || a->vid != vid) - continue; - - if (a->db.type == db.type && !dsa_db_equal(&a->db, &db)) - return true; - } - - return false; -} -EXPORT_SYMBOL_GPL(dsa_fdb_present_in_other_db); - -bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb, - struct dsa_db db) -{ - struct dsa_port *dp = dsa_to_port(ds, port); - struct dsa_mac_addr *a; - - lockdep_assert_held(&dp->addr_lists_lock); - - list_for_each_entry(a, &dp->mdbs, list) { - if (!ether_addr_equal(a->addr, mdb->addr) || a->vid != mdb->vid) - continue; - - if (a->db.type == db.type && !dsa_db_equal(&a->db, &db)) - return true; - } - - return false; -} -EXPORT_SYMBOL_GPL(dsa_mdb_present_in_other_db); - -static int __init dsa_init_module(void) -{ - int rc; - - dsa_owq = alloc_ordered_workqueue("dsa_ordered", - WQ_MEM_RECLAIM); - if (!dsa_owq) - return -ENOMEM; - - rc = dsa_slave_register_notifier(); - if (rc) - goto register_notifier_fail; - - dev_add_pack(&dsa_pack_type); - - rc = rtnl_link_register(&dsa_link_ops); - if (rc) - goto netlink_register_fail; - - return 0; - -netlink_register_fail: - dsa_slave_unregister_notifier(); - dev_remove_pack(&dsa_pack_type); -register_notifier_fail: - destroy_workqueue(dsa_owq); - - return rc; -} -module_init(dsa_init_module); - -static void __exit dsa_cleanup_module(void) -{ - rtnl_link_unregister(&dsa_link_ops); - - dsa_slave_unregister_notifier(); - dev_remove_pack(&dsa_pack_type); - destroy_workqueue(dsa_owq); -} -module_exit(dsa_cleanup_module); - -MODULE_AUTHOR("Lennert Buytenhek "); -MODULE_DESCRIPTION("Driver for Distributed Switch Architecture switch chips"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform:dsa"); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 3f6f84150592..b7ec6efe8b74 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -43,11 +43,6 @@ struct dsa_standalone_event_work { u16 vid; }; -/* dsa.c */ -bool dsa_db_equal(const struct dsa_db *a, const struct dsa_db *b); - -bool dsa_schedule_work(struct work_struct *work); - /* netlink.c */ extern struct rtnl_link_ops dsa_link_ops __read_mostly; @@ -65,33 +60,10 @@ static inline bool dsa_switch_supports_mc_filtering(struct dsa_switch *ds) !ds->needs_standalone_vlan_filtering; } -/* dsa2.c */ -void dsa_lag_map(struct dsa_switch_tree *dst, struct dsa_lag *lag); -void dsa_lag_unmap(struct dsa_switch_tree *dst, struct dsa_lag *lag); -struct dsa_lag *dsa_tree_lag_find(struct dsa_switch_tree *dst, - const struct net_device *lag_dev); -struct net_device *dsa_tree_find_first_master(struct dsa_switch_tree *dst); -int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, - const struct dsa_device_ops *tag_ops, - const struct dsa_device_ops *old_tag_ops); -void dsa_tree_master_admin_state_change(struct dsa_switch_tree *dst, - struct net_device *master, - bool up); -void dsa_tree_master_oper_state_change(struct dsa_switch_tree *dst, - struct net_device *master, - bool up); -unsigned int dsa_bridge_num_get(const struct net_device *bridge_dev, int max); -void dsa_bridge_num_put(const struct net_device *bridge_dev, - unsigned int bridge_num); -struct dsa_bridge *dsa_tree_bridge_find(struct dsa_switch_tree *dst, - const struct net_device *br); - /* tag_8021q.c */ int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds, struct dsa_notifier_tag_8021q_vlan_info *info); int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds, struct dsa_notifier_tag_8021q_vlan_info *info); -extern struct list_head dsa_tree_list; - #endif diff --git a/net/dsa/master.c b/net/dsa/master.c index e38b349ca7f8..26d90140d271 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -11,7 +11,7 @@ #include #include -#include "dsa_priv.h" +#include "dsa.h" #include "master.h" #include "port.h" #include "tag.h" diff --git a/net/dsa/port.c b/net/dsa/port.c index bf2c98215021..e6d5c05b41b4 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -12,7 +12,7 @@ #include #include -#include "dsa_priv.h" +#include "dsa.h" #include "port.h" #include "slave.h" #include "switch.h" diff --git a/net/dsa/slave.c b/net/dsa/slave.c index a928aaf68804..d4c436930a04 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -22,6 +22,7 @@ #include #include +#include "dsa.h" #include "dsa_priv.h" #include "port.h" #include "master.h" diff --git a/net/dsa/switch.c b/net/dsa/switch.c index b534116dc519..4420af0081af 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -12,6 +12,7 @@ #include #include +#include "dsa.h" #include "dsa_priv.h" #include "port.h" #include "slave.h" -- cgit v1.2.3-70-g09d2 From 8e396fec21469610e6f1efb6ae8324e72ae8e135 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Nov 2022 15:55:53 +0200 Subject: net: dsa: move definitions from dsa_priv.h to slave.c There are some definitions in dsa_priv.h which are only used from slave.c. So move them to slave.c. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- net/dsa/dsa_priv.h | 42 ------------------------------------------ net/dsa/slave.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index b7ec6efe8b74..aa685d2309e0 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -15,51 +15,9 @@ struct dsa_notifier_tag_8021q_vlan_info; -struct dsa_switchdev_event_work { - struct net_device *dev; - struct net_device *orig_dev; - struct work_struct work; - unsigned long event; - /* Specific for SWITCHDEV_FDB_ADD_TO_DEVICE and - * SWITCHDEV_FDB_DEL_TO_DEVICE - */ - unsigned char addr[ETH_ALEN]; - u16 vid; - bool host_addr; -}; - -enum dsa_standalone_event { - DSA_UC_ADD, - DSA_UC_DEL, - DSA_MC_ADD, - DSA_MC_DEL, -}; - -struct dsa_standalone_event_work { - struct work_struct work; - struct net_device *dev; - enum dsa_standalone_event event; - unsigned char addr[ETH_ALEN]; - u16 vid; -}; - /* netlink.c */ extern struct rtnl_link_ops dsa_link_ops __read_mostly; -static inline bool dsa_switch_supports_uc_filtering(struct dsa_switch *ds) -{ - return ds->ops->port_fdb_add && ds->ops->port_fdb_del && - ds->fdb_isolation && !ds->vlan_filtering_is_global && - !ds->needs_standalone_vlan_filtering; -} - -static inline bool dsa_switch_supports_mc_filtering(struct dsa_switch *ds) -{ - return ds->ops->port_mdb_add && ds->ops->port_mdb_del && - ds->fdb_isolation && !ds->vlan_filtering_is_global && - !ds->needs_standalone_vlan_filtering; -} - /* tag_8021q.c */ int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds, struct dsa_notifier_tag_8021q_vlan_info *info); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index d4c436930a04..337cbd80633a 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -29,6 +29,48 @@ #include "slave.h" #include "tag.h" +struct dsa_switchdev_event_work { + struct net_device *dev; + struct net_device *orig_dev; + struct work_struct work; + unsigned long event; + /* Specific for SWITCHDEV_FDB_ADD_TO_DEVICE and + * SWITCHDEV_FDB_DEL_TO_DEVICE + */ + unsigned char addr[ETH_ALEN]; + u16 vid; + bool host_addr; +}; + +enum dsa_standalone_event { + DSA_UC_ADD, + DSA_UC_DEL, + DSA_MC_ADD, + DSA_MC_DEL, +}; + +struct dsa_standalone_event_work { + struct work_struct work; + struct net_device *dev; + enum dsa_standalone_event event; + unsigned char addr[ETH_ALEN]; + u16 vid; +}; + +static bool dsa_switch_supports_uc_filtering(struct dsa_switch *ds) +{ + return ds->ops->port_fdb_add && ds->ops->port_fdb_del && + ds->fdb_isolation && !ds->vlan_filtering_is_global && + !ds->needs_standalone_vlan_filtering; +} + +static bool dsa_switch_supports_mc_filtering(struct dsa_switch *ds) +{ + return ds->ops->port_mdb_add && ds->ops->port_mdb_del && + ds->fdb_isolation && !ds->vlan_filtering_is_global && + !ds->needs_standalone_vlan_filtering; +} + static void dsa_slave_standalone_event_work(struct work_struct *work) { struct dsa_standalone_event_work *standalone_work = -- cgit v1.2.3-70-g09d2 From 19d05ea712ecbbb67d302664da5ec58b37b9aece Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Nov 2022 15:55:54 +0200 Subject: net: dsa: move tag_8021q headers to their proper place tag_8021q definitions are all over the place. Some are exported to linux/dsa/8021q.h (visible by DSA core, taggers, switch drivers and everyone else), and some are in dsa_priv.h. Move the structures that don't need external visibility into tag_8021q.c, and the ones which don't need the world or switch drivers to see them into tag_8021q.h. We also have the tag_8021q.h inclusion from switch.c, which is basically the entire reason why tag_8021q.c was built into DSA in commit 8b6e638b4be2 ("net: dsa: build tag_8021q.c as part of DSA core"). I still don't know how to better deal with that, so leave it alone. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/linux/dsa/8021q.h | 31 +------------------------------ include/net/dsa.h | 1 + net/dsa/dsa_priv.h | 8 -------- net/dsa/port.c | 1 + net/dsa/switch.c | 1 + net/dsa/tag_8021q.c | 15 +++++++++++++++ net/dsa/tag_8021q.h | 27 +++++++++++++++++++++++++++ net/dsa/tag_ocelot_8021q.c | 1 + net/dsa/tag_sja1105.c | 1 + 9 files changed, 48 insertions(+), 38 deletions(-) create mode 100644 net/dsa/tag_8021q.h (limited to 'net') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 3ed117e299ec..f3664ee12170 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -5,28 +5,8 @@ #ifndef _NET_DSA_8021Q_H #define _NET_DSA_8021Q_H -#include -#include #include - -struct dsa_switch; -struct dsa_port; -struct sk_buff; -struct net_device; - -struct dsa_tag_8021q_vlan { - struct list_head list; - int port; - u16 vid; - refcount_t refcount; -}; - -struct dsa_8021q_context { - struct dsa_switch *ds; - struct list_head vlans; - /* EtherType of RX VID, used for filtering on master interface */ - __be16 proto; -}; +#include int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto); @@ -38,15 +18,6 @@ int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, int port, void dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, int port, struct dsa_bridge bridge); -struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, - u16 tpid, u16 tci); - -void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id, - int *vbid); - -struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *master, - int vbid); - u16 dsa_tag_8021q_bridge_vid(unsigned int bridge_num); u16 dsa_tag_8021q_standalone_vid(const struct dsa_port *dp); diff --git a/include/net/dsa.h b/include/net/dsa.h index d5bfcb63d4c2..96086289aa9b 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -22,6 +22,7 @@ #include #include +struct dsa_8021q_context; struct tc_action; struct phy_device; struct fixed_phy_status; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index aa685d2309e0..265659954ffd 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -13,15 +13,7 @@ #define DSA_MAX_NUM_OFFLOADING_BRIDGES BITS_PER_LONG -struct dsa_notifier_tag_8021q_vlan_info; - /* netlink.c */ extern struct rtnl_link_ops dsa_link_ops __read_mostly; -/* tag_8021q.c */ -int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds, - struct dsa_notifier_tag_8021q_vlan_info *info); -int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds, - struct dsa_notifier_tag_8021q_vlan_info *info); - #endif diff --git a/net/dsa/port.c b/net/dsa/port.c index e6d5c05b41b4..67ad1adec2a2 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -16,6 +16,7 @@ #include "port.h" #include "slave.h" #include "switch.h" +#include "tag_8021q.h" /** * dsa_port_notify - Notify the switching fabric of changes to a port diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 4420af0081af..e53cc0c3c933 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -17,6 +17,7 @@ #include "port.h" #include "slave.h" #include "switch.h" +#include "tag_8021q.h" static unsigned int dsa_switch_fastest_ageing_time(struct dsa_switch *ds, unsigned int ageing_time) diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index abd994dc76d5..ac2eb933106e 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -11,6 +11,7 @@ #include "port.h" #include "switch.h" #include "tag.h" +#include "tag_8021q.h" /* Binary structure of the fake 12-bit VID field (when the TPID is * ETH_P_DSA_8021Q): @@ -63,6 +64,20 @@ #define DSA_8021Q_PORT(x) (((x) << DSA_8021Q_PORT_SHIFT) & \ DSA_8021Q_PORT_MASK) +struct dsa_tag_8021q_vlan { + struct list_head list; + int port; + u16 vid; + refcount_t refcount; +}; + +struct dsa_8021q_context { + struct dsa_switch *ds; + struct list_head vlans; + /* EtherType of RX VID, used for filtering on master interface */ + __be16 proto; +}; + u16 dsa_tag_8021q_bridge_vid(unsigned int bridge_num) { /* The VBID value of 0 is reserved for precise TX, but it is also diff --git a/net/dsa/tag_8021q.h b/net/dsa/tag_8021q.h new file mode 100644 index 000000000000..b75cbaa028ef --- /dev/null +++ b/net/dsa/tag_8021q.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef __DSA_TAG_8021Q_H +#define __DSA_TAG_8021Q_H + +#include + +#include "switch.h" + +struct sk_buff; +struct net_device; + +struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, + u16 tpid, u16 tci); + +void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id, + int *vbid); + +struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *master, + int vbid); + +int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds, + struct dsa_notifier_tag_8021q_vlan_info *info); +int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds, + struct dsa_notifier_tag_8021q_vlan_info *info); + +#endif diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 7f0c2d71e89b..1f0b8c20eba5 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -12,6 +12,7 @@ #include #include "tag.h" +#include "tag_8021q.h" #define OCELOT_8021Q_NAME "ocelot-8021q" diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 8f581617e15c..f14f51b41491 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -7,6 +7,7 @@ #include #include "tag.h" +#include "tag_8021q.h" #define SJA1105_NAME "sja1105" #define SJA1110_NAME "sja1110" -- cgit v1.2.3-70-g09d2 From 5917bfe688672a6afc816ad472a274eb16c9bb7a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Nov 2022 15:55:55 +0200 Subject: net: dsa: kill off dsa_priv.h The last remnants in dsa_priv.h are a netlink-related definition for which we create a new header, and DSA_MAX_NUM_OFFLOADING_BRIDGES which is only used from dsa.c, so move it there. Some inclusions need to be adjusted now that we no longer have headers included transitively from dsa_priv.h. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- net/dsa/dsa.c | 4 +++- net/dsa/dsa_priv.h | 19 ------------------- net/dsa/netlink.c | 2 +- net/dsa/netlink.h | 8 ++++++++ net/dsa/slave.c | 2 +- net/dsa/switch.c | 2 +- net/dsa/tag_8021q.c | 1 - net/dsa/tag_hellcreek.c | 1 - 8 files changed, 14 insertions(+), 25 deletions(-) delete mode 100644 net/dsa/dsa_priv.h create mode 100644 net/dsa/netlink.h (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index fee4d28b7304..e5f156940c67 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -21,13 +21,15 @@ #include "devlink.h" #include "dsa.h" -#include "dsa_priv.h" #include "master.h" +#include "netlink.h" #include "port.h" #include "slave.h" #include "switch.h" #include "tag.h" +#define DSA_MAX_NUM_OFFLOADING_BRIDGES BITS_PER_LONG + static DEFINE_MUTEX(dsa2_mutex); LIST_HEAD(dsa_tree_list); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h deleted file mode 100644 index 265659954ffd..000000000000 --- a/net/dsa/dsa_priv.h +++ /dev/null @@ -1,19 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * net/dsa/dsa_priv.h - Hardware switch handling - * Copyright (c) 2008-2009 Marvell Semiconductor - */ - -#ifndef __DSA_PRIV_H -#define __DSA_PRIV_H - -#include -#include -#include - -#define DSA_MAX_NUM_OFFLOADING_BRIDGES BITS_PER_LONG - -/* netlink.c */ -extern struct rtnl_link_ops dsa_link_ops __read_mostly; - -#endif diff --git a/net/dsa/netlink.c b/net/dsa/netlink.c index 824b09d904cc..bd4bbaf851de 100644 --- a/net/dsa/netlink.c +++ b/net/dsa/netlink.c @@ -4,7 +4,7 @@ #include #include -#include "dsa_priv.h" +#include "netlink.h" #include "slave.h" static const struct nla_policy dsa_policy[IFLA_DSA_MAX + 1] = { diff --git a/net/dsa/netlink.h b/net/dsa/netlink.h new file mode 100644 index 000000000000..7eda2fa15722 --- /dev/null +++ b/net/dsa/netlink.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef __DSA_NETLINK_H +#define __DSA_NETLINK_H + +extern struct rtnl_link_ops dsa_link_ops __read_mostly; + +#endif diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 337cbd80633a..aab79c355224 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -23,9 +23,9 @@ #include #include "dsa.h" -#include "dsa_priv.h" #include "port.h" #include "master.h" +#include "netlink.h" #include "slave.h" #include "tag.h" diff --git a/net/dsa/switch.c b/net/dsa/switch.c index e53cc0c3c933..d5bc4bb7310d 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -13,7 +13,7 @@ #include #include "dsa.h" -#include "dsa_priv.h" +#include "netlink.h" #include "port.h" #include "slave.h" #include "switch.h" diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index ac2eb933106e..b1263917fcb2 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -7,7 +7,6 @@ #include #include -#include "dsa_priv.h" #include "port.h" #include "switch.h" #include "tag.h" diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c index a047041e7686..71884296fc70 100644 --- a/net/dsa/tag_hellcreek.c +++ b/net/dsa/tag_hellcreek.c @@ -11,7 +11,6 @@ #include #include -#include "dsa_priv.h" #include "tag.h" #define HELLCREEK_NAME "hellcreek" -- cgit v1.2.3-70-g09d2 From 64a8f8f7127da228d59a39e2c5e75f86590f90b4 Mon Sep 17 00:00:00 2001 From: Maxim Korotkov Date: Tue, 22 Nov 2022 15:29:01 +0300 Subject: ethtool: avoiding integer overflow in ethtool_phys_id() The value of an arithmetic expression "n * id.data" is subject to possible overflow due to a failure to cast operands to a larger data type before performing arithmetic. Used macro for multiplication instead operator for avoiding overflow. Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Maxim Korotkov Reviewed-by: Alexander Lobakin Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20221122122901.22294-1-korotkov.maxim.s@gmail.com Signed-off-by: Jakub Kicinski --- net/ethtool/ioctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 99272a67525c..c2f1a542e6fa 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -2013,7 +2013,8 @@ static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) } else { /* Driver expects to be called at twice the frequency in rc */ int n = rc * 2, interval = HZ / n; - u64 count = n * id.data, i = 0; + u64 count = mul_u32_u32(n, id.data); + u64 i = 0; do { rtnl_lock(); -- cgit v1.2.3-70-g09d2 From f157c416c51ab8b7f5e833dfde8ace3a6325c19a Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 24 Nov 2022 15:43:38 +0100 Subject: xfrm: a few coding style clean ups Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 9 ++++++--- net/xfrm/xfrm_user.c | 6 +++--- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index d80519c4e389..a049f91d4446 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -4414,7 +4414,8 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *mp; /* Stage 0 - sanity checks */ - if ((err = xfrm_migrate_check(m, num_migrate)) < 0) + err = xfrm_migrate_check(m, num_migrate); + if (err < 0) goto out; if (dir >= XFRM_POLICY_MAX) { @@ -4423,7 +4424,8 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, } /* Stage 1 - find policy */ - if ((pol = xfrm_migrate_policy_find(sel, dir, type, net, if_id)) == NULL) { + pol = xfrm_migrate_policy_find(sel, dir, type, net, if_id); + if (!pol) { err = -ENOENT; goto out; } @@ -4445,7 +4447,8 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, } /* Stage 3 - update policy */ - if ((err = xfrm_policy_migrate(pol, m, num_migrate)) < 0) + err = xfrm_policy_migrate(pol, m, num_migrate); + if (err < 0) goto restore_state; /* Stage 4 - delete old state(s) */ diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index e73f9efc54c1..25de6e8faf8d 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1538,7 +1538,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, &p->info.saddr, 1, family); err = -ENOENT; - if (x == NULL) + if (!x) goto out_noput; err = xfrm_alloc_spi(x, p->min, p->max); @@ -2718,7 +2718,7 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, struct xfrm_encap_tmpl *encap = NULL; u32 if_id = 0; - if (attrs[XFRMA_MIGRATE] == NULL) + if (!attrs[XFRMA_MIGRATE]) return -EINVAL; kmp = attrs[XFRMA_KMADDRESS] ? &km : NULL; @@ -2727,7 +2727,7 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) return err; - err = copy_from_user_migrate((struct xfrm_migrate *)m, kmp, attrs, &n); + err = copy_from_user_migrate(m, kmp, attrs, &n); if (err) return err; -- cgit v1.2.3-70-g09d2 From a25b19f36f921b90bcb826c80b568266b8ad40a4 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 24 Nov 2022 15:43:39 +0100 Subject: xfrm: add extack to xfrm_add_sa_expire Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 25de6e8faf8d..1664baefae80 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2584,8 +2584,11 @@ static int xfrm_add_sa_expire(struct sk_buff *skb, struct nlmsghdr *nlh, spin_lock_bh(&x->lock); err = -EINVAL; - if (x->km.state != XFRM_STATE_VALID) + if (x->km.state != XFRM_STATE_VALID) { + NL_SET_ERR_MSG(extack, "SA must be in VALID state"); goto out; + } + km_state_expired(x, ue->hard, nlh->nlmsg_pid); if (ue->hard) { -- cgit v1.2.3-70-g09d2 From 880e475d2b0b1131a6e91464b2145820893e4ddf Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 24 Nov 2022 15:43:40 +0100 Subject: xfrm: add extack to xfrm_del_sa Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 1664baefae80..06a379d35ebb 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -862,12 +862,12 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; if (xfrm_state_kern(x)) { + NL_SET_ERR_MSG(extack, "SA is in use by tunnels"); err = -EPERM; goto out; } err = xfrm_state_delete(x); - if (err < 0) goto out; -- cgit v1.2.3-70-g09d2 From 643bc1a2ee30efc9ab832401e89c9400cd9f52ac Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 24 Nov 2022 15:43:41 +0100 Subject: xfrm: add extack to xfrm_new_ae and xfrm_replay_verify_len Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 06a379d35ebb..13607df4f30d 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -515,7 +515,8 @@ static int attach_aead(struct xfrm_state *x, struct nlattr *rta, } static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_esn, - struct nlattr *rp) + struct nlattr *rp, + struct netlink_ext_ack *extack) { struct xfrm_replay_state_esn *up; unsigned int ulen; @@ -528,13 +529,25 @@ static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_es /* Check the overall length and the internal bitmap length to avoid * potential overflow. */ - if (nla_len(rp) < (int)ulen || - xfrm_replay_state_esn_len(replay_esn) != ulen || - replay_esn->bmp_len != up->bmp_len) + if (nla_len(rp) < (int)ulen) { + NL_SET_ERR_MSG(extack, "ESN attribute is too short"); return -EINVAL; + } - if (up->replay_window > up->bmp_len * sizeof(__u32) * 8) + if (xfrm_replay_state_esn_len(replay_esn) != ulen) { + NL_SET_ERR_MSG(extack, "New ESN size doesn't match the existing SA's ESN size"); return -EINVAL; + } + + if (replay_esn->bmp_len != up->bmp_len) { + NL_SET_ERR_MSG(extack, "New ESN bitmap size doesn't match the existing SA's ESN bitmap"); + return -EINVAL; + } + + if (up->replay_window > up->bmp_len * sizeof(__u32) * 8) { + NL_SET_ERR_MSG(extack, "ESN replay window is longer than the bitmap"); + return -EINVAL; + } return 0; } @@ -2433,12 +2446,16 @@ static int xfrm_new_ae(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr *et = attrs[XFRMA_ETIMER_THRESH]; struct nlattr *rt = attrs[XFRMA_REPLAY_THRESH]; - if (!lt && !rp && !re && !et && !rt) + if (!lt && !rp && !re && !et && !rt) { + NL_SET_ERR_MSG(extack, "Missing required attribute for AE"); return err; + } /* pedantic mode - thou shalt sayeth replaceth */ - if (!(nlh->nlmsg_flags&NLM_F_REPLACE)) + if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) { + NL_SET_ERR_MSG(extack, "NLM_F_REPLACE flag is required"); return err; + } mark = xfrm_mark_get(attrs, &m); @@ -2446,10 +2463,12 @@ static int xfrm_new_ae(struct sk_buff *skb, struct nlmsghdr *nlh, if (x == NULL) return -ESRCH; - if (x->km.state != XFRM_STATE_VALID) + if (x->km.state != XFRM_STATE_VALID) { + NL_SET_ERR_MSG(extack, "SA must be in VALID state"); goto out; + } - err = xfrm_replay_verify_len(x->replay_esn, re); + err = xfrm_replay_verify_len(x->replay_esn, re, extack); if (err) goto out; -- cgit v1.2.3-70-g09d2 From bd12240337f43522b99c43f8976af34c712b5f57 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 24 Nov 2022 15:43:42 +0100 Subject: xfrm: add extack to xfrm_do_migrate Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 ++- net/key/af_key.c | 2 +- net/xfrm/xfrm_policy.c | 28 ++++++++++++++++++++-------- net/xfrm/xfrm_user.c | 16 +++++++++++----- 4 files changed, 34 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index dbc81f5eb553..576566bd0be9 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1703,7 +1703,8 @@ struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_bundles, struct xfrm_kmaddress *k, struct net *net, - struct xfrm_encap_tmpl *encap, u32 if_id); + struct xfrm_encap_tmpl *encap, u32 if_id, + struct netlink_ext_ack *extack); #endif int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport); diff --git a/net/key/af_key.c b/net/key/af_key.c index c85df5b958d2..7f4ff5fe2257 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2626,7 +2626,7 @@ static int pfkey_migrate(struct sock *sk, struct sk_buff *skb, } return xfrm_migrate(&sel, dir, XFRM_POLICY_TYPE_MAIN, m, i, - kma ? &k : NULL, net, NULL, 0); + kma ? &k : NULL, net, NULL, 0, NULL); out: return err; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index a049f91d4446..9b9e2765363d 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -4333,7 +4333,8 @@ static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tm /* update endpoint address(es) of template(s) */ static int xfrm_policy_migrate(struct xfrm_policy *pol, - struct xfrm_migrate *m, int num_migrate) + struct xfrm_migrate *m, int num_migrate, + struct netlink_ext_ack *extack) { struct xfrm_migrate *mp; int i, j, n = 0; @@ -4341,6 +4342,7 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol, write_lock_bh(&pol->lock); if (unlikely(pol->walk.dead)) { /* target policy has been deleted */ + NL_SET_ERR_MSG(extack, "Target policy not found"); write_unlock_bh(&pol->lock); return -ENOENT; } @@ -4372,17 +4374,22 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol, return 0; } -static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate) +static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate, + struct netlink_ext_ack *extack) { int i, j; - if (num_migrate < 1 || num_migrate > XFRM_MAX_DEPTH) + if (num_migrate < 1 || num_migrate > XFRM_MAX_DEPTH) { + NL_SET_ERR_MSG(extack, "Invalid number of SAs to migrate, must be 0 < num <= XFRM_MAX_DEPTH (6)"); return -EINVAL; + } for (i = 0; i < num_migrate; i++) { if (xfrm_addr_any(&m[i].new_daddr, m[i].new_family) || - xfrm_addr_any(&m[i].new_saddr, m[i].new_family)) + xfrm_addr_any(&m[i].new_saddr, m[i].new_family)) { + NL_SET_ERR_MSG(extack, "Addresses in the MIGRATE attribute's list cannot be null"); return -EINVAL; + } /* check if there is any duplicated entry */ for (j = i + 1; j < num_migrate; j++) { @@ -4393,8 +4400,10 @@ static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate) m[i].proto == m[j].proto && m[i].mode == m[j].mode && m[i].reqid == m[j].reqid && - m[i].old_family == m[j].old_family) + m[i].old_family == m[j].old_family) { + NL_SET_ERR_MSG(extack, "Entries in the MIGRATE attribute's list must be unique"); return -EINVAL; + } } } @@ -4404,7 +4413,8 @@ static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate) int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_migrate, struct xfrm_kmaddress *k, struct net *net, - struct xfrm_encap_tmpl *encap, u32 if_id) + struct xfrm_encap_tmpl *encap, u32 if_id, + struct netlink_ext_ack *extack) { int i, err, nx_cur = 0, nx_new = 0; struct xfrm_policy *pol = NULL; @@ -4414,11 +4424,12 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *mp; /* Stage 0 - sanity checks */ - err = xfrm_migrate_check(m, num_migrate); + err = xfrm_migrate_check(m, num_migrate, extack); if (err < 0) goto out; if (dir >= XFRM_POLICY_MAX) { + NL_SET_ERR_MSG(extack, "Invalid policy direction"); err = -EINVAL; goto out; } @@ -4426,6 +4437,7 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, /* Stage 1 - find policy */ pol = xfrm_migrate_policy_find(sel, dir, type, net, if_id); if (!pol) { + NL_SET_ERR_MSG(extack, "Target policy not found"); err = -ENOENT; goto out; } @@ -4447,7 +4459,7 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, } /* Stage 3 - update policy */ - err = xfrm_policy_migrate(pol, m, num_migrate); + err = xfrm_policy_migrate(pol, m, num_migrate, extack); if (err < 0) goto restore_state; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 13607df4f30d..c5d6a92d73cb 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2687,7 +2687,8 @@ nomem: #ifdef CONFIG_XFRM_MIGRATE static int copy_from_user_migrate(struct xfrm_migrate *ma, struct xfrm_kmaddress *k, - struct nlattr **attrs, int *num) + struct nlattr **attrs, int *num, + struct netlink_ext_ack *extack) { struct nlattr *rt = attrs[XFRMA_MIGRATE]; struct xfrm_user_migrate *um; @@ -2706,8 +2707,10 @@ static int copy_from_user_migrate(struct xfrm_migrate *ma, um = nla_data(rt); num_migrate = nla_len(rt) / sizeof(*um); - if (num_migrate <= 0 || num_migrate > XFRM_MAX_DEPTH) + if (num_migrate <= 0 || num_migrate > XFRM_MAX_DEPTH) { + NL_SET_ERR_MSG(extack, "Invalid number of SAs to migrate, must be 0 < num <= XFRM_MAX_DEPTH (6)"); return -EINVAL; + } for (i = 0; i < num_migrate; i++, um++, ma++) { memcpy(&ma->old_daddr, &um->old_daddr, sizeof(ma->old_daddr)); @@ -2740,8 +2743,10 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, struct xfrm_encap_tmpl *encap = NULL; u32 if_id = 0; - if (!attrs[XFRMA_MIGRATE]) + if (!attrs[XFRMA_MIGRATE]) { + NL_SET_ERR_MSG(extack, "Missing required MIGRATE attribute"); return -EINVAL; + } kmp = attrs[XFRMA_KMADDRESS] ? &km : NULL; @@ -2749,7 +2754,7 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) return err; - err = copy_from_user_migrate(m, kmp, attrs, &n); + err = copy_from_user_migrate(m, kmp, attrs, &n, extack); if (err) return err; @@ -2766,7 +2771,8 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, if (attrs[XFRMA_IF_ID]) if_id = nla_get_u32(attrs[XFRMA_IF_ID]); - err = xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp, net, encap, if_id); + err = xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp, net, encap, + if_id, extack); kfree(encap); -- cgit v1.2.3-70-g09d2 From c2dad11e0466a27d40041845cf63cdfb4fbd991f Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 24 Nov 2022 15:43:43 +0100 Subject: xfrm: add extack to xfrm_alloc_userspi Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 5 +++-- net/key/af_key.c | 4 ++-- net/xfrm/xfrm_state.c | 21 ++++++++++++++++----- net/xfrm/xfrm_user.c | 8 +++++--- 4 files changed, 26 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 576566bd0be9..e0cc6791c001 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1681,8 +1681,9 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, int xfrm_policy_flush(struct net *net, u8 type, bool task_valid); void xfrm_policy_hash_rebuild(struct net *net); u32 xfrm_get_acqseq(void); -int verify_spi_info(u8 proto, u32 min, u32 max); -int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi); +int verify_spi_info(u8 proto, u32 min, u32 max, struct netlink_ext_ack *extack); +int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi, + struct netlink_ext_ack *extack); struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid, u32 if_id, u8 proto, const xfrm_address_t *daddr, diff --git a/net/key/af_key.c b/net/key/af_key.c index 7f4ff5fe2257..e1d2155605aa 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1377,13 +1377,13 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_ max_spi = range->sadb_spirange_max; } - err = verify_spi_info(x->id.proto, min_spi, max_spi); + err = verify_spi_info(x->id.proto, min_spi, max_spi, NULL); if (err) { xfrm_state_put(x); return err; } - err = xfrm_alloc_spi(x, min_spi, max_spi); + err = xfrm_alloc_spi(x, min_spi, max_spi, NULL); resp_skb = err ? ERR_PTR(err) : pfkey_xfrm_state2msg(x); if (IS_ERR(resp_skb)) { diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 81df34b3da6e..d0ae17e3bb38 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2017,7 +2017,7 @@ u32 xfrm_get_acqseq(void) } EXPORT_SYMBOL(xfrm_get_acqseq); -int verify_spi_info(u8 proto, u32 min, u32 max) +int verify_spi_info(u8 proto, u32 min, u32 max, struct netlink_ext_ack *extack) { switch (proto) { case IPPROTO_AH: @@ -2026,22 +2026,28 @@ int verify_spi_info(u8 proto, u32 min, u32 max) case IPPROTO_COMP: /* IPCOMP spi is 16-bits. */ - if (max >= 0x10000) + if (max >= 0x10000) { + NL_SET_ERR_MSG(extack, "IPCOMP SPI must be <= 65535"); return -EINVAL; + } break; default: + NL_SET_ERR_MSG(extack, "Invalid protocol, must be one of AH, ESP, IPCOMP"); return -EINVAL; } - if (min > max) + if (min > max) { + NL_SET_ERR_MSG(extack, "Invalid SPI range: min > max"); return -EINVAL; + } return 0; } EXPORT_SYMBOL(verify_spi_info); -int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high) +int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high, + struct netlink_ext_ack *extack) { struct net *net = xs_net(x); unsigned int h; @@ -2053,8 +2059,10 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high) u32 mark = x->mark.v & x->mark.m; spin_lock_bh(&x->lock); - if (x->km.state == XFRM_STATE_DEAD) + if (x->km.state == XFRM_STATE_DEAD) { + NL_SET_ERR_MSG(extack, "Target ACQUIRE is in DEAD state"); goto unlock; + } err = 0; if (x->id.spi) @@ -2065,6 +2073,7 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high) if (minspi == maxspi) { x0 = xfrm_state_lookup(net, mark, &x->id.daddr, minspi, x->id.proto, x->props.family); if (x0) { + NL_SET_ERR_MSG(extack, "Requested SPI is already in use"); xfrm_state_put(x0); goto unlock; } @@ -2089,6 +2098,8 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high) spin_unlock_bh(&net->xfrm.xfrm_state_lock); err = 0; + } else { + NL_SET_ERR_MSG(extack, "No SPI available in the requested range"); } unlock: diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index c5d6a92d73cb..5c280e04e02c 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1523,7 +1523,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, u32 if_id = 0; p = nlmsg_data(nlh); - err = verify_spi_info(p->info.id.proto, p->min, p->max); + err = verify_spi_info(p->info.id.proto, p->min, p->max, extack); if (err) goto out_noput; @@ -1551,10 +1551,12 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, &p->info.saddr, 1, family); err = -ENOENT; - if (!x) + if (!x) { + NL_SET_ERR_MSG(extack, "Target ACQUIRE not found"); goto out_noput; + } - err = xfrm_alloc_spi(x, p->min, p->max); + err = xfrm_alloc_spi(x, p->min, p->max, extack); if (err) goto out; -- cgit v1.2.3-70-g09d2 From a741721680092b64ff71fc1f1c790123c6d40a02 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 24 Nov 2022 15:43:44 +0100 Subject: xfrm: add extack to xfrm_set_spdinfo Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 5c280e04e02c..0eb4696661c8 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1367,20 +1367,28 @@ static int xfrm_set_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, if (attrs[XFRMA_SPD_IPV4_HTHRESH]) { struct nlattr *rta = attrs[XFRMA_SPD_IPV4_HTHRESH]; - if (nla_len(rta) < sizeof(*thresh4)) + if (nla_len(rta) < sizeof(*thresh4)) { + NL_SET_ERR_MSG(extack, "Invalid SPD_IPV4_HTHRESH attribute length"); return -EINVAL; + } thresh4 = nla_data(rta); - if (thresh4->lbits > 32 || thresh4->rbits > 32) + if (thresh4->lbits > 32 || thresh4->rbits > 32) { + NL_SET_ERR_MSG(extack, "Invalid hash threshold (must be <= 32 for IPv4)"); return -EINVAL; + } } if (attrs[XFRMA_SPD_IPV6_HTHRESH]) { struct nlattr *rta = attrs[XFRMA_SPD_IPV6_HTHRESH]; - if (nla_len(rta) < sizeof(*thresh6)) + if (nla_len(rta) < sizeof(*thresh6)) { + NL_SET_ERR_MSG(extack, "Invalid SPD_IPV6_HTHRESH attribute length"); return -EINVAL; + } thresh6 = nla_data(rta); - if (thresh6->lbits > 128 || thresh6->rbits > 128) + if (thresh6->lbits > 128 || thresh6->rbits > 128) { + NL_SET_ERR_MSG(extack, "Invalid hash threshold (must be <= 128 for IPv6)"); return -EINVAL; + } } if (thresh4 || thresh6) { -- cgit v1.2.3-70-g09d2 From 4c47867bc789bdc722f3bb760355c2c246fbe9af Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 24 Nov 2022 12:15:54 +0100 Subject: of: net: export of_get_mac_address_nvmem() Export of_get_mac_addr_nvmem() and rename it to of_get_mac_address_nvmem() in order to fit the convention followed by the existing exported helpers of the same kind. This way, OF compatible drivers using eg. fwnode_get_mac_address() can do a direct call to it instead of calling of_get_mac_address() just for the nvmem step, avoiding to repeat an expensive DT lookup which has already been done once. Eventually, fwnode_get_mac_address() should probably be updated to perform the nvmem lookup directly, but as of today, nvmem cells seem not to be supported by ACPI yet which would defeat this kind of extension. Suggested-by: Marcin Wojtas Signed-off-by: Miquel Raynal Signed-off-by: Paolo Abeni --- include/linux/of_net.h | 6 ++++++ net/core/of_net.c | 5 +++-- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/of_net.h b/include/linux/of_net.h index 0484b613ca64..d88715a0b3a5 100644 --- a/include/linux/of_net.h +++ b/include/linux/of_net.h @@ -14,6 +14,7 @@ struct net_device; extern int of_get_phy_mode(struct device_node *np, phy_interface_t *interface); extern int of_get_mac_address(struct device_node *np, u8 *mac); +extern int of_get_mac_address_nvmem(struct device_node *np, u8 *mac); int of_get_ethdev_address(struct device_node *np, struct net_device *dev); extern struct net_device *of_find_net_device_by_node(struct device_node *np); #else @@ -28,6 +29,11 @@ static inline int of_get_mac_address(struct device_node *np, u8 *mac) return -ENODEV; } +static inline int of_get_mac_address_nvmem(struct device_node *np, u8 *mac) +{ + return -ENODEV; +} + static inline int of_get_ethdev_address(struct device_node *np, struct net_device *dev) { return -ENODEV; diff --git a/net/core/of_net.c b/net/core/of_net.c index f1a9bf7578e7..55d3fe229269 100644 --- a/net/core/of_net.c +++ b/net/core/of_net.c @@ -57,7 +57,7 @@ static int of_get_mac_addr(struct device_node *np, const char *name, u8 *addr) return -ENODEV; } -static int of_get_mac_addr_nvmem(struct device_node *np, u8 *addr) +int of_get_mac_address_nvmem(struct device_node *np, u8 *addr) { struct platform_device *pdev = of_find_device_by_node(np); struct nvmem_cell *cell; @@ -94,6 +94,7 @@ static int of_get_mac_addr_nvmem(struct device_node *np, u8 *addr) return 0; } +EXPORT_SYMBOL(of_get_mac_address_nvmem); /** * of_get_mac_address() @@ -140,7 +141,7 @@ int of_get_mac_address(struct device_node *np, u8 *addr) if (!ret) return 0; - return of_get_mac_addr_nvmem(np, addr); + return of_get_mac_address_nvmem(np, addr); } EXPORT_SYMBOL(of_get_mac_address); -- cgit v1.2.3-70-g09d2 From 7666dbec7268458505d44ef3ae22fd9181c09b01 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Fri, 25 Nov 2022 11:02:55 +0100 Subject: net: devlink: add WARN_ON_ONCE to check return value of unregister_netdevice_notifier_net() call As the return value is not 0 only in case there is no such notifier block registered, add a WARN_ON_ONCE() to yell about it. Suggested-by: Ido Schimmel Signed-off-by: Jiri Pirko Reviewed-by: Ido Schimmel Link: https://lore.kernel.org/r/20221125100255.1786741-1-jiri@resnulli.us Signed-off-by: Paolo Abeni --- net/core/devlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index cea154ddce7a..0e10a8a68c5e 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -9907,8 +9907,8 @@ void devlink_free(struct devlink *devlink) xa_destroy(&devlink->snapshot_ids); - unregister_netdevice_notifier_net(devlink_net(devlink), - &devlink->netdevice_nb); + WARN_ON_ONCE(unregister_netdevice_notifier_net(devlink_net(devlink), + &devlink->netdevice_nb)); xa_erase(&devlinks, devlink->index); -- cgit v1.2.3-70-g09d2 From 51147284eb7d685a689a5d1b7772faec278a2338 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 29 Nov 2022 14:55:34 +0100 Subject: ieee802154: Advertize coordinators discovery Let's introduce the basics for advertizing discovered PANs and coordinators, which is: - A new "scan" netlink message group. - A couple of netlink command/attribute. - The main netlink helper to send a netlink message with all the necessary information to forward the main information to the user. Two netlink attributes are proactively added to support future UWB complex channels, but are not actually used yet. Co-developed-by: David Girault Signed-off-by: David Girault Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20221129135535.532513-2-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/net/cfg802154.h | 18 ++++++++ include/net/nl802154.h | 43 +++++++++++++++++++ net/ieee802154/nl802154.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++ net/ieee802154/nl802154.h | 2 + 4 files changed, 166 insertions(+) (limited to 'net') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index e1481f9cf049..d09c393d229f 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -260,6 +260,24 @@ struct ieee802154_addr { }; }; +/** + * struct ieee802154_coord_desc - Coordinator descriptor + * @addr: PAN ID and coordinator address + * @page: page this coordinator is using + * @channel: channel this coordinator is using + * @superframe_spec: SuperFrame specification as received + * @link_quality: link quality indicator at which the beacon was received + * @gts_permit: the coordinator accepts GTS requests + */ +struct ieee802154_coord_desc { + struct ieee802154_addr addr; + u8 page; + u8 channel; + u16 superframe_spec; + u8 link_quality; + bool gts_permit; +}; + struct ieee802154_llsec_key_id { u8 mode; u8 id; diff --git a/include/net/nl802154.h b/include/net/nl802154.h index f5850b569c52..b79a89d5207c 100644 --- a/include/net/nl802154.h +++ b/include/net/nl802154.h @@ -72,6 +72,8 @@ enum nl802154_commands { NL802154_CMD_NEW_SEC_LEVEL, NL802154_CMD_DEL_SEC_LEVEL, + NL802154_CMD_SCAN_EVENT, + /* add new commands above here */ /* used to define NL802154_CMD_MAX below */ @@ -131,6 +133,8 @@ enum nl802154_attrs { NL802154_ATTR_PID, NL802154_ATTR_NETNS_FD, + NL802154_ATTR_COORDINATOR, + /* add attributes here, update the policy in nl802154.c */ #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL @@ -216,6 +220,45 @@ enum nl802154_wpan_phy_capability_attr { NL802154_CAP_ATTR_MAX = __NL802154_CAP_ATTR_AFTER_LAST - 1 }; +/** + * enum nl802154_coord - Netlink attributes for a coord + * + * @__NL802154_COORD_INVALID: invalid + * @NL802154_COORD_PANID: PANID of the coordinator (2 bytes) + * @NL802154_COORD_ADDR: coordinator address, (8 bytes or 2 bytes) + * @NL802154_COORD_CHANNEL: channel number, related to @NL802154_COORD_PAGE (u8) + * @NL802154_COORD_PAGE: channel page, related to @NL802154_COORD_CHANNEL (u8) + * @NL802154_COORD_PREAMBLE_CODE: Preamble code used when the beacon was received, + * this is PHY dependent and optional (u8) + * @NL802154_COORD_MEAN_PRF: Mean PRF used when the beacon was received, + * this is PHY dependent and optional (u8) + * @NL802154_COORD_SUPERFRAME_SPEC: superframe specification of the PAN (u16) + * @NL802154_COORD_LINK_QUALITY: signal quality of beacon in unspecified units, + * scaled to 0..255 (u8) + * @NL802154_COORD_GTS_PERMIT: set to true if GTS is permitted on this PAN + * @NL802154_COORD_PAYLOAD_DATA: binary data containing the raw data from the + * frame payload, (only if beacon or probe response had data) + * @NL802154_COORD_PAD: attribute used for padding for 64-bit alignment + * @NL802154_COORD_MAX: highest coordinator attribute + */ +enum nl802154_coord { + __NL802154_COORD_INVALID, + NL802154_COORD_PANID, + NL802154_COORD_ADDR, + NL802154_COORD_CHANNEL, + NL802154_COORD_PAGE, + NL802154_COORD_PREAMBLE_CODE, + NL802154_COORD_MEAN_PRF, + NL802154_COORD_SUPERFRAME_SPEC, + NL802154_COORD_LINK_QUALITY, + NL802154_COORD_GTS_PERMIT, + NL802154_COORD_PAYLOAD_DATA, + NL802154_COORD_PAD, + + /* keep last */ + NL802154_COORD_MAX, +}; + /** * enum nl802154_cca_modes - cca modes * diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 38c4f3cb010e..80dc73182785 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -26,10 +26,12 @@ static struct genl_family nl802154_fam; /* multicast groups */ enum nl802154_multicast_groups { NL802154_MCGRP_CONFIG, + NL802154_MCGRP_SCAN, }; static const struct genl_multicast_group nl802154_mcgrps[] = { [NL802154_MCGRP_CONFIG] = { .name = "config", }, + [NL802154_MCGRP_SCAN] = { .name = "scan", }, }; /* returns ERR_PTR values */ @@ -216,6 +218,9 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = { [NL802154_ATTR_PID] = { .type = NLA_U32 }, [NL802154_ATTR_NETNS_FD] = { .type = NLA_U32 }, + + [NL802154_ATTR_COORDINATOR] = { .type = NLA_NESTED }, + #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL [NL802154_ATTR_SEC_ENABLED] = { .type = NLA_U8, }, [NL802154_ATTR_SEC_OUT_LEVEL] = { .type = NLA_U32, }, @@ -1281,6 +1286,104 @@ static int nl802154_wpan_phy_netns(struct sk_buff *skb, struct genl_info *info) return err; } +static int nl802154_prep_scan_event_msg(struct sk_buff *msg, + struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + u32 portid, u32 seq, int flags, u8 cmd, + struct ieee802154_coord_desc *desc) +{ + struct nlattr *nla; + void *hdr; + + hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); + if (!hdr) + return -ENOBUFS; + + if (nla_put_u32(msg, NL802154_ATTR_WPAN_PHY, rdev->wpan_phy_idx)) + goto nla_put_failure; + + if (wpan_dev->netdev && + nla_put_u32(msg, NL802154_ATTR_IFINDEX, wpan_dev->netdev->ifindex)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, NL802154_ATTR_WPAN_DEV, + wpan_dev_id(wpan_dev), NL802154_ATTR_PAD)) + goto nla_put_failure; + + nla = nla_nest_start_noflag(msg, NL802154_ATTR_COORDINATOR); + if (!nla) + goto nla_put_failure; + + if (nla_put(msg, NL802154_COORD_PANID, IEEE802154_PAN_ID_LEN, + &desc->addr.pan_id)) + goto nla_put_failure; + + if (desc->addr.mode == IEEE802154_ADDR_SHORT) { + if (nla_put(msg, NL802154_COORD_ADDR, + IEEE802154_SHORT_ADDR_LEN, + &desc->addr.short_addr)) + goto nla_put_failure; + } else { + if (nla_put(msg, NL802154_COORD_ADDR, + IEEE802154_EXTENDED_ADDR_LEN, + &desc->addr.extended_addr)) + goto nla_put_failure; + } + + if (nla_put_u8(msg, NL802154_COORD_CHANNEL, desc->channel)) + goto nla_put_failure; + + if (nla_put_u8(msg, NL802154_COORD_PAGE, desc->page)) + goto nla_put_failure; + + if (nla_put_u16(msg, NL802154_COORD_SUPERFRAME_SPEC, + desc->superframe_spec)) + goto nla_put_failure; + + if (nla_put_u8(msg, NL802154_COORD_LINK_QUALITY, desc->link_quality)) + goto nla_put_failure; + + if (desc->gts_permit && nla_put_flag(msg, NL802154_COORD_GTS_PERMIT)) + goto nla_put_failure; + + /* TODO: NL802154_COORD_PAYLOAD_DATA if any */ + + nla_nest_end(msg, nla); + + genlmsg_end(msg, hdr); + + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + + return -EMSGSIZE; +} + +int nl802154_scan_event(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + struct ieee802154_coord_desc *desc) +{ + struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(wpan_phy); + struct sk_buff *msg; + int ret; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!msg) + return -ENOMEM; + + ret = nl802154_prep_scan_event_msg(msg, rdev, wpan_dev, 0, 0, 0, + NL802154_CMD_SCAN_EVENT, + desc); + if (ret < 0) { + nlmsg_free(msg); + return ret; + } + + return genlmsg_multicast_netns(&nl802154_fam, wpan_phy_net(wpan_phy), + msg, 0, NL802154_MCGRP_SCAN, GFP_ATOMIC); +} +EXPORT_SYMBOL_GPL(nl802154_scan_event); + #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL static const struct nla_policy nl802154_dev_addr_policy[NL802154_DEV_ADDR_ATTR_MAX + 1] = { [NL802154_DEV_ADDR_ATTR_PAN_ID] = { .type = NLA_U16 }, diff --git a/net/ieee802154/nl802154.h b/net/ieee802154/nl802154.h index 8c4b6d08954c..89b805500032 100644 --- a/net/ieee802154/nl802154.h +++ b/net/ieee802154/nl802154.h @@ -4,5 +4,7 @@ int nl802154_init(void); void nl802154_exit(void); +int nl802154_scan_event(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + struct ieee802154_coord_desc *desc); #endif /* __IEEE802154_NL802154_H */ -- cgit v1.2.3-70-g09d2 From e29e3c7ce6d4b2f164ebd717e4794c626fc1c954 Mon Sep 17 00:00:00 2001 From: David Girault Date: Tue, 29 Nov 2022 14:55:35 +0100 Subject: mac802154: Trace the registration of new PANs Add an internal trace when valid beacons are received. Signed-off-by: David Girault Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20221129135535.532513-3-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- net/mac802154/trace.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'net') diff --git a/net/mac802154/trace.h b/net/mac802154/trace.h index df855c33daf2..689396d6c76a 100644 --- a/net/mac802154/trace.h +++ b/net/mac802154/trace.h @@ -264,6 +264,31 @@ TRACE_EVENT(802154_drv_set_promiscuous_mode, BOOL_TO_STR(__entry->on)) ); +TRACE_EVENT(802154_new_scan_event, + TP_PROTO(struct ieee802154_coord_desc *desc), + TP_ARGS(desc), + TP_STRUCT__entry( + __field(__le16, pan_id) + __field(__le64, addr) + __field(u8, channel) + __field(u8, page) + ), + TP_fast_assign( + __entry->page = desc->page; + __entry->channel = desc->channel; + __entry->pan_id = desc->addr.pan_id; + __entry->addr = desc->addr.extended_addr; + ), + TP_printk("panid: %u, coord_addr: 0x%llx, page: %u, channel: %u", + __le16_to_cpu(__entry->pan_id), __le64_to_cpu(__entry->addr), + __entry->page, __entry->channel) +); + +DEFINE_EVENT(802154_new_scan_event, 802154_scan_event, + TP_PROTO(struct ieee802154_coord_desc *desc), + TP_ARGS(desc) +); + #endif /* !__MAC802154_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3-70-g09d2 From 7a945ce0c19bbdf821d5f7ce1515e7fb8e444465 Mon Sep 17 00:00:00 2001 From: Yuan Can Date: Tue, 29 Nov 2022 01:39:34 +0000 Subject: udp_tunnel: Add checks for nla_nest_start() in __udp_tunnel_nic_dump_write() As the nla_nest_start() may fail with NULL returned, the return value should be checked. Note that this is not a real bug, nothing will break here. The next nla_put() will fail as well and we'll bail (and nla_nest_cancel() can handle NULL). But we keep getting those "fixes" so whatever. Signed-off-by: Yuan Can Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20221129013934.55184-1-yuancan@huawei.com Signed-off-by: Jakub Kicinski --- net/ipv4/udp_tunnel_nic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c index bc3a043a5d5c..029219749785 100644 --- a/net/ipv4/udp_tunnel_nic.c +++ b/net/ipv4/udp_tunnel_nic.c @@ -624,6 +624,8 @@ __udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table, continue; nest = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY); + if (!nest) + return -EMSGSIZE; if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, utn->entries[table][j].port) || -- cgit v1.2.3-70-g09d2 From 1e777f39b4d75e599a3aac8e0f67d739474f198c Mon Sep 17 00:00:00 2001 From: Dmytro Shytyi Date: Fri, 25 Nov 2022 23:29:47 +0100 Subject: mptcp: add MSG_FASTOPEN sendmsg flag support Since commit 54f1944ed6d2 ("mptcp: factor out mptcp_connect()"), all the infrastructure is now in place to support the MSG_FASTOPEN flag, we just need to call into the fastopen path in mptcp_sendmsg(). Co-developed-by: Benjamin Hesmans Signed-off-by: Benjamin Hesmans Acked-by: Paolo Abeni Signed-off-by: Dmytro Shytyi Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 3722a8580b61..efc84816a692 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1711,17 +1711,14 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int ret = 0; long timeo; - /* we don't support FASTOPEN yet */ - if (msg->msg_flags & MSG_FASTOPEN) - return -EOPNOTSUPP; - /* silently ignore everything else */ - msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL; + msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL | MSG_FASTOPEN; lock_sock(sk); ssock = __mptcp_nmpc_socket(msk); - if (unlikely(ssock && inet_sk(ssock->sk)->defer_connect)) { + if (unlikely(ssock && (inet_sk(ssock->sk)->defer_connect || + msg->msg_flags & MSG_FASTOPEN))) { int copied_syn = 0; ret = mptcp_sendmsg_fastopen(sk, ssock->sk, msg, len, &copied_syn); -- cgit v1.2.3-70-g09d2 From fe33d38626779ffcc1c88204b1931774dc204cb5 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 25 Nov 2022 23:29:48 +0100 Subject: mptcp: track accurately the incoming MPC suboption type Currently in the receive path we don't need to discriminate between MPC SYN, MPC SYN-ACK and MPC ACK, but soon the fastopen code will need that info to properly track the fully established status. Track the exact MPC suboption type into the receive opt bitmap. No functional change intended. Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 30d289044e71..784a205e80da 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -26,6 +26,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, { u8 subtype = *ptr >> 4; int expected_opsize; + u16 subopt; u8 version; u8 flags; u8 i; @@ -38,11 +39,15 @@ static void mptcp_parse_option(const struct sk_buff *skb, expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA; else expected_opsize = TCPOLEN_MPTCP_MPC_ACK; + subopt = OPTION_MPTCP_MPC_ACK; } else { - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) { expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK; - else + subopt = OPTION_MPTCP_MPC_SYNACK; + } else { expected_opsize = TCPOLEN_MPTCP_MPC_SYN; + subopt = OPTION_MPTCP_MPC_SYN; + } } /* Cfr RFC 8684 Section 3.3.0: @@ -85,7 +90,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->deny_join_id0 = !!(flags & MPTCP_CAP_DENY_JOIN_ID0); - mp_opt->suboptions |= OPTIONS_MPTCP_MPC; + mp_opt->suboptions |= subopt; if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) { mp_opt->sndr_key = get_unaligned_be64(ptr); ptr += 8; -- cgit v1.2.3-70-g09d2 From b3ea6b272d79a43baaaa9af871ee66f6fda4688f Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 25 Nov 2022 23:29:49 +0100 Subject: mptcp: consolidate initial ack seq generation Currently the initial ack sequence is generated on demand whenever it's requested and the remote key is handy. The relevant code is scattered in different places and can lead to multiple, unneeded, crypto operations. This change consolidates the ack sequence generation code in a single helper, storing the sequence number at the subflow level. The above additionally saves a few conditional in fast-path and will simplify the upcoming fast-open implementation. Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 5 +++-- net/mptcp/protocol.c | 19 +----------------- net/mptcp/protocol.h | 9 ++++++--- net/mptcp/subflow.c | 57 ++++++++++++++++++++++++++++++++-------------------- 4 files changed, 45 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 784a205e80da..ae076468fcb9 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -953,8 +953,9 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, return subflow->mp_capable; } - if (((mp_opt->suboptions & OPTION_MPTCP_DSS) && mp_opt->use_ack) || - ((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) && !mp_opt->echo)) { + if (subflow->remote_key_valid && + (((mp_opt->suboptions & OPTION_MPTCP_DSS) && mp_opt->use_ack) || + ((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) && !mp_opt->echo))) { /* subflows are fully established as soon as we get any * additional ack, including ADD_ADDR. */ diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index efc84816a692..66a599d2ee19 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3045,7 +3045,6 @@ struct sock *mptcp_sk_clone(const struct sock *sk, struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC); struct mptcp_sock *msk; - u64 ack_seq; if (!nsk) return NULL; @@ -3071,15 +3070,6 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd; msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; - if (mp_opt->suboptions & OPTIONS_MPTCP_MPC) { - msk->can_ack = true; - msk->remote_key = mp_opt->sndr_key; - mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq); - ack_seq++; - WRITE_ONCE(msk->ack_seq, ack_seq); - atomic64_set(&msk->rcv_wnd_sent, ack_seq); - } - sock_reset_flag(nsk, SOCK_RCU_FREE); /* will be fully established after successful MPC subflow creation */ inet_sk_state_store(nsk, TCP_SYN_RECV); @@ -3352,7 +3342,6 @@ void mptcp_finish_connect(struct sock *ssk) struct mptcp_subflow_context *subflow; struct mptcp_sock *msk; struct sock *sk; - u64 ack_seq; subflow = mptcp_subflow_ctx(ssk); sk = subflow->conn; @@ -3360,22 +3349,16 @@ void mptcp_finish_connect(struct sock *ssk) pr_debug("msk=%p, token=%u", sk, subflow->token); - mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq); - ack_seq++; - subflow->map_seq = ack_seq; + subflow->map_seq = subflow->iasn; subflow->map_subflow_seq = 1; /* the socket is not connected yet, no msk/subflow ops can access/race * accessing the field below */ - WRITE_ONCE(msk->remote_key, subflow->remote_key); WRITE_ONCE(msk->local_key, subflow->local_key); WRITE_ONCE(msk->write_seq, subflow->idsn + 1); WRITE_ONCE(msk->snd_nxt, msk->write_seq); - WRITE_ONCE(msk->ack_seq, ack_seq); - WRITE_ONCE(msk->can_ack, 1); WRITE_ONCE(msk->snd_una, msk->write_seq); - atomic64_set(&msk->rcv_wnd_sent, ack_seq); mptcp_pm_new_connection(msk, ssk, 0); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 6a09ab99a12d..b5abea3d1a9c 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -467,7 +467,7 @@ struct mptcp_subflow_context { send_fastclose : 1, send_infinite_map : 1, rx_eof : 1, - can_ack : 1, /* only after processing the remote a key */ + remote_key_valid : 1, /* received the peer key from */ disposable : 1, /* ctx can be free at ulp release time */ stale : 1, /* unable to snd/rcv data, do not use for xmit */ local_id_valid : 1, /* local_id is correctly initialized */ @@ -477,7 +477,10 @@ struct mptcp_subflow_context { u64 thmac; u32 local_nonce; u32 remote_token; - u8 hmac[MPTCPOPT_HMAC_LEN]; + union { + u8 hmac[MPTCPOPT_HMAC_LEN]; /* MPJ subflow only */ + u64 iasn; /* initial ack sequence number, MPC subflows only */ + }; u8 local_id; u8 remote_id; u8 reset_seen:1; @@ -603,7 +606,7 @@ unsigned int mptcp_stale_loss_cnt(const struct net *net); int mptcp_get_pm_type(const struct net *net); void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, - struct mptcp_options_received *mp_opt); + const struct mptcp_options_received *mp_opt); bool __mptcp_retransmit_pending_data(struct sock *sk); void mptcp_check_and_set_pending(struct sock *sk); void __mptcp_push_pending(struct sock *sk, unsigned int flags); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index f3c336872475..eca4de08ca9c 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -392,11 +392,33 @@ static void mptcp_set_connected(struct sock *sk) mptcp_data_unlock(sk); } +static void subflow_set_remote_key(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow, + const struct mptcp_options_received *mp_opt) +{ + /* active MPC subflow will reach here multiple times: + * at subflow_finish_connect() time and at 4th ack time + */ + if (subflow->remote_key_valid) + return; + + subflow->remote_key_valid = 1; + subflow->remote_key = mp_opt->sndr_key; + mptcp_crypto_key_sha(subflow->remote_key, NULL, &subflow->iasn); + subflow->iasn++; + + WRITE_ONCE(msk->remote_key, subflow->remote_key); + WRITE_ONCE(msk->ack_seq, subflow->iasn); + WRITE_ONCE(msk->can_ack, true); + atomic64_set(&msk->rcv_wnd_sent, subflow->iasn); +} + static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct mptcp_options_received mp_opt; struct sock *parent = subflow->conn; + struct mptcp_sock *msk; subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); @@ -404,6 +426,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) if (subflow->conn_finished) return; + msk = mptcp_sk(parent); mptcp_propagate_sndbuf(parent, sk); subflow->rel_write_seq = 1; subflow->conn_finished = 1; @@ -416,19 +439,16 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); mptcp_do_fallback(sk); - pr_fallback(mptcp_sk(subflow->conn)); + pr_fallback(msk); goto fallback; } if (mp_opt.suboptions & OPTION_MPTCP_CSUMREQD) - WRITE_ONCE(mptcp_sk(parent)->csum_enabled, true); + WRITE_ONCE(msk->csum_enabled, true); if (mp_opt.deny_join_id0) - WRITE_ONCE(mptcp_sk(parent)->pm.remote_deny_join_id0, true); + WRITE_ONCE(msk->pm.remote_deny_join_id0, true); subflow->mp_capable = 1; - subflow->can_ack = 1; - subflow->remote_key = mp_opt.sndr_key; - pr_debug("subflow=%p, remote_key=%llu", subflow, - subflow->remote_key); + subflow_set_remote_key(msk, subflow, &mp_opt); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK); mptcp_finish_connect(sk); mptcp_set_connected(parent); @@ -466,7 +486,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->mp_join = 1; MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); - if (subflow_use_different_dport(mptcp_sk(parent), sk)) { + if (subflow_use_different_dport(msk, sk)) { pr_debug("synack inet_dport=%d %d", ntohs(inet_sk(sk)->inet_dport), ntohs(inet_sk(parent)->inet_dport)); @@ -474,7 +494,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) } } else if (mptcp_check_fallback(sk)) { fallback: - mptcp_rcv_space_init(mptcp_sk(parent), sk); + mptcp_rcv_space_init(msk, sk); mptcp_set_connected(parent); } return; @@ -637,13 +657,12 @@ static void subflow_drop_ctx(struct sock *ssk) } void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, - struct mptcp_options_received *mp_opt) + const struct mptcp_options_received *mp_opt) { struct mptcp_sock *msk = mptcp_sk(subflow->conn); - subflow->remote_key = mp_opt->sndr_key; + subflow_set_remote_key(msk, subflow, mp_opt); subflow->fully_established = 1; - subflow->can_ack = 1; WRITE_ONCE(msk->fully_established, true); } @@ -1198,16 +1217,8 @@ static bool subflow_check_data_avail(struct sock *ssk) if (WARN_ON_ONCE(!skb)) goto no_data; - /* if msk lacks the remote key, this subflow must provide an - * MP_CAPABLE-based mapping - */ - if (unlikely(!READ_ONCE(msk->can_ack))) { - if (!subflow->mpc_map) - goto fallback; - WRITE_ONCE(msk->remote_key, subflow->remote_key); - WRITE_ONCE(msk->ack_seq, subflow->map_seq); - WRITE_ONCE(msk->can_ack, true); - } + if (unlikely(!READ_ONCE(msk->can_ack))) + goto fallback; old_ack = READ_ONCE(msk->ack_seq); ack_seq = mptcp_subflow_get_mapped_dsn(subflow); @@ -1480,6 +1491,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, mptcp_pm_get_flags_and_ifindex_by_id(msk, local_id, &flags, &ifindex); + subflow->remote_key_valid = 1; subflow->remote_key = msk->remote_key; subflow->local_key = msk->local_key; subflow->token = msk->token; @@ -1873,6 +1885,7 @@ static void subflow_ulp_clone(const struct request_sock *req, new_ctx->ssn_offset = subflow_req->ssn_offset; new_ctx->mp_join = 1; new_ctx->fully_established = 1; + new_ctx->remote_key_valid = 1; new_ctx->backup = subflow_req->backup; new_ctx->remote_id = subflow_req->remote_id; new_ctx->token = subflow_req->token; -- cgit v1.2.3-70-g09d2 From dfc8d06030335a816d81aa92fe5d1f84d06998ad Mon Sep 17 00:00:00 2001 From: Dmytro Shytyi Date: Fri, 25 Nov 2022 23:29:50 +0100 Subject: mptcp: implement delayed seq generation for passive fastopen With fastopen in place, the first subflow socket is created before the MPC handshake completes, and we need to properly initialize the sequence numbers at MPC ACK reception. Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Dmytro Shytyi Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/Makefile | 2 +- net/mptcp/fastopen.c | 28 ++++++++++++++++++++++++++++ net/mptcp/options.c | 9 ++++++--- net/mptcp/protocol.c | 9 --------- net/mptcp/protocol.h | 16 +++++++++++++++- net/mptcp/subflow.c | 5 ++++- 6 files changed, 54 insertions(+), 15 deletions(-) create mode 100644 net/mptcp/fastopen.c (limited to 'net') diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index 6e7df47c9584..a3829ce548f9 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ - mib.o pm_netlink.o sockopt.o pm_userspace.o + mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c new file mode 100644 index 000000000000..19c332af0834 --- /dev/null +++ b/net/mptcp/fastopen.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +/* MPTCP Fast Open Mechanism + * + * Copyright (c) 2021-2022, Dmytro SHYTYI + */ + +#include "protocol.h" + +void mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, + const struct mptcp_options_received *mp_opt) +{ + struct sock *sk = (struct sock *)msk; + struct sk_buff *skb; + + mptcp_data_lock(sk); + skb = skb_peek_tail(&sk->sk_receive_queue); + if (skb) { + WARN_ON_ONCE(MPTCP_SKB_CB(skb)->end_seq); + pr_debug("msk %p moving seq %llx -> %llx end_seq %llx -> %llx", sk, + MPTCP_SKB_CB(skb)->map_seq, MPTCP_SKB_CB(skb)->map_seq + msk->ack_seq, + MPTCP_SKB_CB(skb)->end_seq, MPTCP_SKB_CB(skb)->end_seq + msk->ack_seq); + MPTCP_SKB_CB(skb)->map_seq += msk->ack_seq; + MPTCP_SKB_CB(skb)->end_seq += msk->ack_seq; + } + + pr_debug("msk=%p ack_seq=%llx", msk, msk->ack_seq); + mptcp_data_unlock(sk); +} diff --git a/net/mptcp/options.c b/net/mptcp/options.c index ae076468fcb9..5ded85e2c374 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -939,7 +939,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, subflow->mp_join && (mp_opt->suboptions & OPTIONS_MPTCP_MPJ) && !subflow->request_join) tcp_send_ack(ssk); - goto fully_established; + goto check_notify; } /* we must process OoO packets before the first subflow is fully @@ -950,6 +950,8 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) { if (subflow->mp_join) goto reset; + if (subflow->is_mptfo && mp_opt->suboptions & OPTION_MPTCP_MPC_ACK) + goto set_fully_established; return subflow->mp_capable; } @@ -961,7 +963,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, */ subflow->fully_established = 1; WRITE_ONCE(msk->fully_established, true); - goto fully_established; + goto check_notify; } /* If the first established packet does not contain MP_CAPABLE + data @@ -980,11 +982,12 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, if (mp_opt->deny_join_id0) WRITE_ONCE(msk->pm.remote_deny_join_id0, true); +set_fully_established: if (unlikely(!READ_ONCE(msk->pm.server_side))) pr_warn_once("bogus mpc option on established client sk"); mptcp_subflow_fully_established(subflow, mp_opt); -fully_established: +check_notify: /* if the subflow is not already linked into the conn_list, we can't * notify the PM: this subflow is still on the listener queue * and the PM possibly acquiring the subflow lock could race with diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 66a599d2ee19..3fa0aa44bf5b 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -36,15 +36,6 @@ struct mptcp6_sock { }; #endif -struct mptcp_skb_cb { - u64 map_seq; - u64 end_seq; - u32 offset; - u8 has_rxtstamp:1; -}; - -#define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0])) - enum { MPTCP_CMSG_TS = BIT(0), MPTCP_CMSG_INQ = BIT(1), diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index b5abea3d1a9c..618ac85abaaf 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -126,6 +126,15 @@ #define MPTCP_CONNECTED 6 #define MPTCP_RESET_SCHEDULER 7 +struct mptcp_skb_cb { + u64 map_seq; + u64 end_seq; + u32 offset; + u8 has_rxtstamp:1; +}; + +#define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0])) + static inline bool before64(__u64 seq1, __u64 seq2) { return (__s64)(seq1 - seq2) < 0; @@ -471,7 +480,9 @@ struct mptcp_subflow_context { disposable : 1, /* ctx can be free at ulp release time */ stale : 1, /* unable to snd/rcv data, do not use for xmit */ local_id_valid : 1, /* local_id is correctly initialized */ - valid_csum_seen : 1; /* at least one csum validated */ + valid_csum_seen : 1, /* at least one csum validated */ + is_mptfo : 1, /* subflow is doing TFO */ + __unused : 8; enum mptcp_data_avail data_avail; u32 remote_nonce; u64 thmac; @@ -829,6 +840,9 @@ void mptcp_event_addr_announced(const struct sock *ssk, const struct mptcp_addr_ void mptcp_event_addr_removed(const struct mptcp_sock *msk, u8 id); bool mptcp_userspace_pm_active(const struct mptcp_sock *msk); +void mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, + const struct mptcp_options_received *mp_opt); + static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk) { return READ_ONCE(msk->pm.addr_signal) & diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index eca4de08ca9c..cbeb7718f092 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -664,6 +664,9 @@ void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, subflow_set_remote_key(msk, subflow, mp_opt); subflow->fully_established = 1; WRITE_ONCE(msk->fully_established, true); + + if (subflow->is_mptfo) + mptcp_fastopen_gen_msk_ackseq(msk, subflow, mp_opt); } static struct sock *subflow_syn_recv_sock(const struct sock *sk, @@ -779,7 +782,7 @@ create_child: /* with OoO packets we can reach here without ingress * mpc option */ - if (mp_opt.suboptions & OPTIONS_MPTCP_MPC) + if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK) mptcp_subflow_fully_established(ctx, &mp_opt); } else if (ctx->mp_join) { struct mptcp_sock *owner; -- cgit v1.2.3-70-g09d2 From 36b122baf6a8bd46b4a591f12f4ed17b22257408 Mon Sep 17 00:00:00 2001 From: Dmytro Shytyi Date: Fri, 25 Nov 2022 23:29:51 +0100 Subject: mptcp: add subflow_v(4,6)_send_synack() The send_synack() needs to be overridden for MPTCP to support TFO for two reasons: - There is not be enough space in the TCP options if the TFO cookie has to be added in the SYN+ACK with other options: MSS (4), SACK OK (2), Timestamps (10), Window Scale (3+1), TFO (10+2), MP_CAPABLE (12). MPTCPv1 specs -- RFC 8684, section B.1 [1] -- suggest to drop the TCP timestamps option in this case. - The data received in the SYN has to be handled: the SKB can be dequeued from the subflow sk and transferred to the MPTCP sk. Counters need to be updated accordingly and the application can be notified at the end because some bytes have been received. [1] https://www.rfc-editor.org/rfc/rfc8684.html#section-b.1 Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Co-developed-by: Matthieu Baerts Signed-off-by: Matthieu Baerts Signed-off-by: Dmytro Shytyi Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/fastopen.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ net/mptcp/protocol.c | 2 +- net/mptcp/protocol.h | 3 +++ net/mptcp/subflow.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 92 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c index 19c332af0834..d237d142171c 100644 --- a/net/mptcp/fastopen.c +++ b/net/mptcp/fastopen.c @@ -6,6 +6,51 @@ #include "protocol.h" +void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow, + struct request_sock *req) +{ + struct sock *ssk = subflow->tcp_sock; + struct sock *sk = subflow->conn; + struct sk_buff *skb; + struct tcp_sock *tp; + + tp = tcp_sk(ssk); + + subflow->is_mptfo = 1; + + skb = skb_peek(&ssk->sk_receive_queue); + if (WARN_ON_ONCE(!skb)) + return; + + /* dequeue the skb from sk receive queue */ + __skb_unlink(skb, &ssk->sk_receive_queue); + skb_ext_reset(skb); + skb_orphan(skb); + + /* We copy the fastopen data, but that don't belong to the mptcp sequence + * space, need to offset it in the subflow sequence, see mptcp_subflow_get_map_offset() + */ + tp->copied_seq += skb->len; + subflow->ssn_offset += skb->len; + + /* initialize a dummy sequence number, we will update it at MPC + * completion, if needed + */ + MPTCP_SKB_CB(skb)->map_seq = -skb->len; + MPTCP_SKB_CB(skb)->end_seq = 0; + MPTCP_SKB_CB(skb)->offset = 0; + MPTCP_SKB_CB(skb)->has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; + + mptcp_data_lock(sk); + + mptcp_set_owner_r(skb, sk); + __skb_queue_tail(&sk->sk_receive_queue, skb); + + sk->sk_data_ready(sk); + + mptcp_data_unlock(sk); +} + void mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt) { diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 3fa0aa44bf5b..b0d387be500a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -191,7 +191,7 @@ static void mptcp_rfree(struct sk_buff *skb) mptcp_rmem_uncharge(sk, len); } -static void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk) +void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); skb->sk = sk; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 618ac85abaaf..8b4379a2cd85 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -633,6 +633,7 @@ void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); bool __mptcp_close(struct sock *sk, long timeout); void mptcp_cancel_work(struct sock *sk); +void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk); bool mptcp_addresses_equal(const struct mptcp_addr_info *a, const struct mptcp_addr_info *b, bool use_port); @@ -842,6 +843,8 @@ bool mptcp_userspace_pm_active(const struct mptcp_sock *msk); void mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt); +void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow, + struct request_sock *req); static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk) { diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index cbeb7718f092..29904303f5c2 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -307,7 +307,48 @@ static struct dst_entry *subflow_v4_route_req(const struct sock *sk, return NULL; } +static void subflow_prep_synack(const struct sock *sk, struct request_sock *req, + struct tcp_fastopen_cookie *foc, + enum tcp_synack_type synack_type) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct inet_request_sock *ireq = inet_rsk(req); + + /* clear tstamp_ok, as needed depending on cookie */ + if (foc && foc->len > -1) + ireq->tstamp_ok = 0; + + if (synack_type == TCP_SYNACK_FASTOPEN) + mptcp_fastopen_subflow_synack_set_params(subflow, req); +} + +static int subflow_v4_send_synack(const struct sock *sk, struct dst_entry *dst, + struct flowi *fl, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) +{ + subflow_prep_synack(sk, req, foc, synack_type); + + return tcp_request_sock_ipv4_ops.send_synack(sk, dst, fl, req, foc, + synack_type, syn_skb); +} + #if IS_ENABLED(CONFIG_MPTCP_IPV6) +static int subflow_v6_send_synack(const struct sock *sk, struct dst_entry *dst, + struct flowi *fl, + struct request_sock *req, + struct tcp_fastopen_cookie *foc, + enum tcp_synack_type synack_type, + struct sk_buff *syn_skb) +{ + subflow_prep_synack(sk, req, foc, synack_type); + + return tcp_request_sock_ipv6_ops.send_synack(sk, dst, fl, req, foc, + synack_type, syn_skb); +} + static struct dst_entry *subflow_v6_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, @@ -1945,6 +1986,7 @@ void __init mptcp_subflow_init(void) subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req; + subflow_request_sock_ipv4_ops.send_synack = subflow_v4_send_synack; subflow_specific = ipv4_specific; subflow_specific.conn_request = subflow_v4_conn_request; @@ -1958,6 +2000,7 @@ void __init mptcp_subflow_init(void) #if IS_ENABLED(CONFIG_MPTCP_IPV6) subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req; + subflow_request_sock_ipv6_ops.send_synack = subflow_v6_send_synack; subflow_v6_specific = ipv6_specific; subflow_v6_specific.conn_request = subflow_v6_conn_request; -- cgit v1.2.3-70-g09d2 From 4ffb0a02346c14f5e83711668f19b6b551cd32ed Mon Sep 17 00:00:00 2001 From: Dmytro Shytyi Date: Fri, 25 Nov 2022 23:29:52 +0100 Subject: mptcp: add TCP_FASTOPEN sock option The TCP_FASTOPEN socket option is one way for the application to tell the kernel TFO support has to be enabled for the listener socket. The only thing to do here with MPTCP is to relay the request to the first subflow like it is already done for the other TCP_FASTOPEN* socket options. Acked-by: Paolo Abeni Signed-off-by: Dmytro Shytyi Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/sockopt.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index f62f6483ef77..c1bca711c35c 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -559,6 +559,7 @@ static bool mptcp_supported_sockopt(int level, int optname) case TCP_NOTSENT_LOWAT: case TCP_TX_DELAY: case TCP_INQ: + case TCP_FASTOPEN: case TCP_FASTOPEN_CONNECT: case TCP_FASTOPEN_NO_COOKIE: return true; @@ -569,7 +570,7 @@ static bool mptcp_supported_sockopt(int level, int optname) /* TCP_REPAIR, TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ, TCP_REPAIR_OPTIONS, * TCP_REPAIR_WINDOW are not supported, better avoid this mess */ - /* TCP_FASTOPEN_KEY, TCP_FASTOPEN are not supported because + /* TCP_FASTOPEN_KEY is not supported because * fastopen for the listener side is currently unsupported */ } @@ -801,6 +802,7 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, /* See tcp.c: TCP_DEFER_ACCEPT does not fail */ mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); return 0; + case TCP_FASTOPEN: case TCP_FASTOPEN_CONNECT: case TCP_FASTOPEN_NO_COOKIE: return mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, @@ -1166,6 +1168,7 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, case TCP_INFO: case TCP_CC_INFO: case TCP_DEFER_ACCEPT: + case TCP_FASTOPEN: case TCP_FASTOPEN_CONNECT: case TCP_FASTOPEN_NO_COOKIE: return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname, -- cgit v1.2.3-70-g09d2 From cb99816cb59d4253758827186d5a5616f9825b6c Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Fri, 25 Nov 2022 23:29:53 +0100 Subject: mptcp: add support for TCP_FASTOPEN_KEY sockopt The goal of this socket option is to set different keys per listener, see commit 1fba70e5b6be ("tcp: socket option to set TCP fast open key") for more details about this socket option. The only thing to do here with MPTCP is to relay the request to the first subflow like it is already done for the other TCP_FASTOPEN* socket options. Acked-by: Paolo Abeni Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- net/mptcp/sockopt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index c1bca711c35c..a47423ebb33a 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -561,6 +561,7 @@ static bool mptcp_supported_sockopt(int level, int optname) case TCP_INQ: case TCP_FASTOPEN: case TCP_FASTOPEN_CONNECT: + case TCP_FASTOPEN_KEY: case TCP_FASTOPEN_NO_COOKIE: return true; } @@ -570,9 +571,6 @@ static bool mptcp_supported_sockopt(int level, int optname) /* TCP_REPAIR, TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ, TCP_REPAIR_OPTIONS, * TCP_REPAIR_WINDOW are not supported, better avoid this mess */ - /* TCP_FASTOPEN_KEY is not supported because - * fastopen for the listener side is currently unsupported - */ } return false; } @@ -804,6 +802,7 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, return 0; case TCP_FASTOPEN: case TCP_FASTOPEN_CONNECT: + case TCP_FASTOPEN_KEY: case TCP_FASTOPEN_NO_COOKIE: return mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); @@ -1170,6 +1169,7 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, case TCP_DEFER_ACCEPT: case TCP_FASTOPEN: case TCP_FASTOPEN_CONNECT: + case TCP_FASTOPEN_KEY: case TCP_FASTOPEN_NO_COOKIE: return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); -- cgit v1.2.3-70-g09d2 From bff3d0534804452e19c097ae6b4eb4b4d846d67f Mon Sep 17 00:00:00 2001 From: Sriram Yagnaraman Date: Fri, 4 Nov 2022 18:18:35 +0100 Subject: netfilter: conntrack: add sctp DATA_SENT state SCTP conntrack currently assumes that the SCTP endpoints will probe secondary paths using HEARTBEAT before sending traffic. But, according to RFC 9260, SCTP endpoints can send any traffic on any of the confirmed paths after SCTP association is up. SCTP endpoints that sends INIT will confirm all peer addresses that upper layer configures, and the SCTP endpoint that receives COOKIE_ECHO will only confirm the address it sent the INIT_ACK to. So, we can have a situation where the INIT sender can start to use secondary paths without the need to send HEARTBEAT. This patch allows DATA/SACK packets to create new connection tracking entry. A new state has been added to indicate that a DATA/SACK chunk has been seen in the original direction - SCTP_CONNTRACK_DATA_SENT. State transitions mostly follows the HEARTBEAT_SENT, except on receiving HEARTBEAT/HEARTBEAT_ACK/DATA/SACK in the reply direction. State transitions in original direction: - DATA_SENT behaves similar to HEARTBEAT_SENT for all chunks, except that it remains in DATA_SENT on receving HEARTBEAT, HEARTBEAT_ACK/DATA/SACK chunks State transitions in reply direction: - DATA_SENT behaves similar to HEARTBEAT_SENT for all chunks, except that it moves to HEARTBEAT_ACKED on receiving HEARTBEAT/HEARTBEAT_ACK/DATA/SACK chunks Note: This patch still doesn't solve the problem when the SCTP endpoint decides to use primary paths for association establishment but uses a secondary path for association shutdown. We still have to depend on timeout for connections to expire in such a case. Signed-off-by: Sriram Yagnaraman Reviewed-by: Marcelo Ricardo Leitner Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_sctp.h | 1 + include/uapi/linux/netfilter/nfnetlink_cttimeout.h | 1 + net/netfilter/nf_conntrack_proto_sctp.c | 104 ++++++++++++--------- net/netfilter/nf_conntrack_standalone.c | 8 ++ 4 files changed, 71 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_conntrack_sctp.h b/include/uapi/linux/netfilter/nf_conntrack_sctp.h index edc6ddab0de6..c742469afe21 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_sctp.h +++ b/include/uapi/linux/netfilter/nf_conntrack_sctp.h @@ -16,6 +16,7 @@ enum sctp_conntrack { SCTP_CONNTRACK_SHUTDOWN_ACK_SENT, SCTP_CONNTRACK_HEARTBEAT_SENT, SCTP_CONNTRACK_HEARTBEAT_ACKED, + SCTP_CONNTRACK_DATA_SENT, SCTP_CONNTRACK_MAX }; diff --git a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h index 6b20fb22717b..94e74034706d 100644 --- a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h +++ b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h @@ -95,6 +95,7 @@ enum ctattr_timeout_sctp { CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT, CTA_TIMEOUT_SCTP_HEARTBEAT_SENT, CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED, + CTA_TIMEOUT_SCTP_DATA_SENT, __CTA_TIMEOUT_SCTP_MAX }; #define CTA_TIMEOUT_SCTP_MAX (__CTA_TIMEOUT_SCTP_MAX - 1) diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 5a936334b517..d88b92a8ffca 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -60,6 +60,7 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS, [SCTP_CONNTRACK_HEARTBEAT_ACKED] = 210 SECS, + [SCTP_CONNTRACK_DATA_SENT] = 30 SECS, }; #define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1 @@ -74,6 +75,7 @@ static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { #define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT #define sHS SCTP_CONNTRACK_HEARTBEAT_SENT #define sHA SCTP_CONNTRACK_HEARTBEAT_ACKED +#define sDS SCTP_CONNTRACK_DATA_SENT #define sIV SCTP_CONNTRACK_MAX /* @@ -90,15 +92,16 @@ COOKIE WAIT - We have seen an INIT chunk in the original direction, or als COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction. ESTABLISHED - We have seen a COOKIE_ACK in the reply direction. SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction. -SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply directoin. +SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply direction. SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite to that of the SHUTDOWN chunk. CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of the SHUTDOWN chunk. Connection is closed. HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow. -HEARTBEAT_ACKED - We have seen a HEARTBEAT-ACK in the direction opposite to - that of the HEARTBEAT chunk. Secondary connection is - established. +HEARTBEAT_ACKED - We have seen a HEARTBEAT-ACK/DATA/SACK in the direction + opposite to that of the HEARTBEAT/DATA chunk. Secondary connection + is established. +DATA_SENT - We have seen a DATA/SACK in a new flow. */ /* TODO @@ -112,36 +115,38 @@ cookie echoed to closed. */ /* SCTP conntrack state transitions */ -static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { +static const u8 sctp_conntracks[2][12][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ -/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA}, -/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA}, -/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, -/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS}, -/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA}, -/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't have Stale cookie*/ -/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* 5.2.4 - Big TODO */ -/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't come in orig dir */ -/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA}, -/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}, -/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA} +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS */ +/* init */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA, sCW}, +/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL}, +/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS, sCL}, +/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA, sSA}, +/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},/* Can't have Stale cookie*/ +/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},/* 5.2.4 - Big TODO */ +/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA, sCL},/* Can't come in orig dir */ +/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA, sCL}, +/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS}, +/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS}, +/* data/sack */ {sDS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS} }, { /* REPLY */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ -/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* INIT in sCL Big TODO */ -/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA}, -/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL}, -/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR}, -/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA}, -/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA}, -/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* Can't come in reply dir */ -/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA}, -/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA}, -/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}, -/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA} +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sDS */ +/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA, sIV},/* INIT in sCL Big TODO */ +/* init_ack */ {sIV, sCW, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA, sIV}, +/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL, sIV}, +/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR, sIV}, +/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA, sIV}, +/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA, sIV}, +/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA, sIV},/* Can't come in reply dir */ +/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA, sIV}, +/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA, sIV}, +/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA, sHA}, +/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA, sHA}, +/* data/sack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA, sHA}, } }; @@ -253,6 +258,11 @@ static int sctp_new_state(enum ip_conntrack_dir dir, pr_debug("SCTP_CID_HEARTBEAT_ACK"); i = 10; break; + case SCTP_CID_DATA: + case SCTP_CID_SACK: + pr_debug("SCTP_CID_DATA/SACK"); + i = 11; + break; default: /* Other chunks like DATA or SACK do not change the state */ pr_debug("Unknown chunk type, Will stay in %s\n", @@ -306,7 +316,9 @@ sctp_new(struct nf_conn *ct, const struct sk_buff *skb, ih->init_tag); ct->proto.sctp.vtag[IP_CT_DIR_REPLY] = ih->init_tag; - } else if (sch->type == SCTP_CID_HEARTBEAT) { + } else if (sch->type == SCTP_CID_HEARTBEAT || + sch->type == SCTP_CID_DATA || + sch->type == SCTP_CID_SACK) { pr_debug("Setting vtag %x for secondary conntrack\n", sh->vtag); ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag; @@ -392,19 +404,19 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, if (!sctp_new(ct, skb, sh, dataoff)) return -NF_ACCEPT; - } - - /* Check the verification tag (Sec 8.5) */ - if (!test_bit(SCTP_CID_INIT, map) && - !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) && - !test_bit(SCTP_CID_COOKIE_ECHO, map) && - !test_bit(SCTP_CID_ABORT, map) && - !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && - !test_bit(SCTP_CID_HEARTBEAT, map) && - !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && - sh->vtag != ct->proto.sctp.vtag[dir]) { - pr_debug("Verification tag check failed\n"); - goto out; + } else { + /* Check the verification tag (Sec 8.5) */ + if (!test_bit(SCTP_CID_INIT, map) && + !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, map) && + !test_bit(SCTP_CID_COOKIE_ECHO, map) && + !test_bit(SCTP_CID_ABORT, map) && + !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && + !test_bit(SCTP_CID_HEARTBEAT, map) && + !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && + sh->vtag != ct->proto.sctp.vtag[dir]) { + pr_debug("Verification tag check failed\n"); + goto out; + } } old_state = new_state = SCTP_CONNTRACK_NONE; @@ -464,6 +476,11 @@ int nf_conntrack_sctp_packet(struct nf_conn *ct, } else if (ct->proto.sctp.flags & SCTP_FLAG_HEARTBEAT_VTAG_FAILED) { ct->proto.sctp.flags &= ~SCTP_FLAG_HEARTBEAT_VTAG_FAILED; } + } else if (sch->type == SCTP_CID_DATA || sch->type == SCTP_CID_SACK) { + if (ct->proto.sctp.vtag[dir] == 0) { + pr_debug("Setting vtag %x for dir %d\n", sh->vtag, dir); + ct->proto.sctp.vtag[dir] = sh->vtag; + } } old_state = ct->proto.sctp.state; @@ -684,6 +701,7 @@ sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = { [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_DATA_SENT] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 4ffe84c5a82c..a884c06abf09 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -602,6 +602,7 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_ACKED, + NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_DATA_SENT, #endif #ifdef CONFIG_NF_CT_PROTO_DCCP NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST, @@ -892,6 +893,12 @@ static struct ctl_table nf_ct_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + [NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_DATA_SENT] = { + .procname = "nf_conntrack_sctp_timeout_data_sent", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, #endif #ifdef CONFIG_NF_CT_PROTO_DCCP [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST] = { @@ -1036,6 +1043,7 @@ static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net, XASSIGN(SHUTDOWN_ACK_SENT, sn); XASSIGN(HEARTBEAT_SENT, sn); XASSIGN(HEARTBEAT_ACKED, sn); + XASSIGN(DATA_SENT, sn); #undef XASSIGN #endif } -- cgit v1.2.3-70-g09d2 From a70e483460d58e64504dd679fd127e9549385c86 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 9 Nov 2022 12:21:58 +0100 Subject: netfilter: conntrack: merge ipv4+ipv6 confirm functions No need to have distinct functions. After merge, ipv6 can avoid protooff computation if the connection neither needs sequence adjustment nor helper invocation -- this is the normal case. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_core.h | 3 +- net/bridge/netfilter/nf_conntrack_bridge.c | 32 +------- net/netfilter/nf_conntrack_proto.c | 124 +++++++++++++---------------- 3 files changed, 57 insertions(+), 102 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index b2b9de70d9f4..71d1269fe4d4 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -71,8 +71,7 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb) return ret; } -unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, - struct nf_conn *ct, enum ip_conntrack_info ctinfo); +unsigned int nf_confirm(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); void print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l4proto *proto); diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index 73242962be5d..5c5dd437f1c2 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -366,42 +366,12 @@ static int nf_ct_bridge_refrag_post(struct net *net, struct sock *sk, return br_dev_queue_push_xmit(net, sk, skb); } -static unsigned int nf_ct_bridge_confirm(struct sk_buff *skb) -{ - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - int protoff; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return nf_conntrack_confirm(skb); - - switch (skb->protocol) { - case htons(ETH_P_IP): - protoff = skb_network_offset(skb) + ip_hdrlen(skb); - break; - case htons(ETH_P_IPV6): { - unsigned char pnum = ipv6_hdr(skb)->nexthdr; - __be16 frag_off; - - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, - &frag_off); - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) - return nf_conntrack_confirm(skb); - } - break; - default: - return NF_ACCEPT; - } - return nf_confirm(skb, protoff, ct, ctinfo); -} - static unsigned int nf_ct_bridge_post(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { int ret; - ret = nf_ct_bridge_confirm(skb); + ret = nf_confirm(priv, skb, state); if (ret != NF_ACCEPT) return ret; diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 895b09cbd7cf..99323fb12d0f 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -121,17 +121,61 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto) }; EXPORT_SYMBOL_GPL(nf_ct_l4proto_find); -unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, - struct nf_conn *ct, enum ip_conntrack_info ctinfo) +static bool in_vrf_postrouting(const struct nf_hook_state *state) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (state->hook == NF_INET_POST_ROUTING && + netif_is_l3_master(state->out)) + return true; +#endif + return false; +} + +unsigned int nf_confirm(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) { const struct nf_conn_help *help; + enum ip_conntrack_info ctinfo; + unsigned int protoff; + struct nf_conn *ct; + bool seqadj_needed; + __be16 frag_off; + u8 pnum; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || in_vrf_postrouting(state)) + return NF_ACCEPT; help = nfct_help(ct); + + seqadj_needed = test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_is_loopback_packet(skb); + if (!help && !seqadj_needed) + return nf_conntrack_confirm(skb); + + /* helper->help() do not expect ICMP packets */ + if (ctinfo == IP_CT_RELATED_REPLY) + return nf_conntrack_confirm(skb); + + switch (nf_ct_l3num(ct)) { + case NFPROTO_IPV4: + protoff = skb_network_offset(skb) + ip_hdrlen(skb); + break; + case NFPROTO_IPV6: + pnum = ipv6_hdr(skb)->nexthdr; + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, &frag_off); + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) + return nf_conntrack_confirm(skb); + break; + default: + return nf_conntrack_confirm(skb); + } + if (help) { const struct nf_conntrack_helper *helper; int ret; - /* rcu_read_lock()ed by nf_hook_thresh */ + /* rcu_read_lock()ed by nf_hook */ helper = rcu_dereference(help->helper); if (helper) { ret = helper->help(skb, @@ -142,12 +186,10 @@ unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, } } - if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && - !nf_is_loopback_packet(skb)) { - if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { - NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); - return NF_DROP; - } + if (seqadj_needed && + !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { + NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); + return NF_DROP; } /* We've seen it coming out the other side: confirm it */ @@ -155,35 +197,6 @@ unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, } EXPORT_SYMBOL_GPL(nf_confirm); -static bool in_vrf_postrouting(const struct nf_hook_state *state) -{ -#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) - if (state->hook == NF_INET_POST_ROUTING && - netif_is_l3_master(state->out)) - return true; -#endif - return false; -} - -static unsigned int ipv4_confirm(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return nf_conntrack_confirm(skb); - - if (in_vrf_postrouting(state)) - return NF_ACCEPT; - - return nf_confirm(skb, - skb_network_offset(skb) + ip_hdrlen(skb), - ct, ctinfo); -} - static unsigned int ipv4_conntrack_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -230,13 +243,13 @@ static const struct nf_hook_ops ipv4_conntrack_ops[] = { .priority = NF_IP_PRI_CONNTRACK, }, { - .hook = ipv4_confirm, + .hook = nf_confirm, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, }, { - .hook = ipv4_confirm, + .hook = nf_confirm, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, @@ -373,33 +386,6 @@ static struct nf_sockopt_ops so_getorigdst6 = { .owner = THIS_MODULE, }; -static unsigned int ipv6_confirm(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - unsigned char pnum = ipv6_hdr(skb)->nexthdr; - __be16 frag_off; - int protoff; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return nf_conntrack_confirm(skb); - - if (in_vrf_postrouting(state)) - return NF_ACCEPT; - - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, - &frag_off); - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { - pr_debug("proto header not found\n"); - return nf_conntrack_confirm(skb); - } - - return nf_confirm(skb, protoff, ct, ctinfo); -} - static unsigned int ipv6_conntrack_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -428,13 +414,13 @@ static const struct nf_hook_ops ipv6_conntrack_ops[] = { .priority = NF_IP6_PRI_CONNTRACK, }, { - .hook = ipv6_confirm, + .hook = nf_confirm, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_LAST, }, { - .hook = ipv6_confirm, + .hook = nf_confirm, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_LAST - 1, -- cgit v1.2.3-70-g09d2 From e9374524950512a1769f610a868fcdf89ea59b8e Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Tue, 22 Nov 2022 20:30:57 +0100 Subject: netfilter: ipset: Add support for new bitmask parameter Add a new parameter to complement the existing 'netmask' option. The main difference between netmask and bitmask is that bitmask takes any arbitrary ip address as input, it does not have to be a valid netmask. The name of the new parameter is 'bitmask'. This lets us mask out arbitrary bits in the ip address, for example: ipset create set1 hash:ip bitmask 255.128.255.0 ipset create set2 hash:ip,port family inet6 bitmask ffff::ff80 Signed-off-by: Vishwanath Pai Signed-off-by: Joshua Hunt Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 10 ++++ include/uapi/linux/netfilter/ipset/ip_set.h | 2 + net/netfilter/ipset/ip_set_hash_gen.h | 71 +++++++++++++++++++++++++---- net/netfilter/ipset/ip_set_hash_ip.c | 19 ++++---- net/netfilter/ipset/ip_set_hash_ipport.c | 24 +++++++++- net/netfilter/ipset/ip_set_hash_netnet.c | 26 +++++++++-- 6 files changed, 126 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index ada1296c87d5..ab934ad951a8 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -515,6 +515,16 @@ ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo, *skbinfo = ext->skbinfo; } +static inline void +nf_inet_addr_mask_inplace(union nf_inet_addr *a1, + const union nf_inet_addr *mask) +{ + a1->all[0] &= mask->all[0]; + a1->all[1] &= mask->all[1]; + a1->all[2] &= mask->all[2]; + a1->all[3] &= mask->all[3]; +} + #define IP_SET_INIT_KEXT(skb, opt, set) \ { .bytes = (skb)->len, .packets = 1, .target = true,\ .timeout = ip_set_adt_opt_timeout(opt, set) } diff --git a/include/uapi/linux/netfilter/ipset/ip_set.h b/include/uapi/linux/netfilter/ipset/ip_set.h index 79e5d68b87af..333807efd32b 100644 --- a/include/uapi/linux/netfilter/ipset/ip_set.h +++ b/include/uapi/linux/netfilter/ipset/ip_set.h @@ -85,6 +85,7 @@ enum { IPSET_ATTR_CADT_LINENO = IPSET_ATTR_LINENO, /* 9 */ IPSET_ATTR_MARK, /* 10 */ IPSET_ATTR_MARKMASK, /* 11 */ + IPSET_ATTR_BITMASK, /* 12 */ /* Reserve empty slots */ IPSET_ATTR_CADT_MAX = 16, /* Create-only specific attributes */ @@ -153,6 +154,7 @@ enum ipset_errno { IPSET_ERR_COMMENT, IPSET_ERR_INVALID_MARKMASK, IPSET_ERR_SKBINFO, + IPSET_ERR_BITMASK_NETMASK_EXCL, /* Type specific error codes */ IPSET_ERR_TYPE_SPECIFIC = 4352, diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 3adc291d9ce1..fdb5225a3e5c 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -159,6 +159,17 @@ htable_size(u8 hbits) (SET_WITH_TIMEOUT(set) && \ ip_set_timeout_expired(ext_timeout(d, set))) +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) +static const union nf_inet_addr onesmask = { + .all[0] = 0xffffffff, + .all[1] = 0xffffffff, + .all[2] = 0xffffffff, + .all[3] = 0xffffffff +}; + +static const union nf_inet_addr zeromask = {}; +#endif + #endif /* _IP_SET_HASH_GEN_H */ #ifndef MTYPE @@ -283,8 +294,9 @@ struct htype { u32 markmask; /* markmask value for mark mask to store */ #endif u8 bucketsize; /* max elements in an array block */ -#ifdef IP_SET_HASH_WITH_NETMASK +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) u8 netmask; /* netmask value for subnets to store */ + union nf_inet_addr bitmask; /* stores bitmask */ #endif struct list_head ad; /* Resize add|del backlist */ struct mtype_elem next; /* temporary storage for uadd */ @@ -459,8 +471,8 @@ mtype_same_set(const struct ip_set *a, const struct ip_set *b) /* Resizing changes htable_bits, so we ignore it */ return x->maxelem == y->maxelem && a->timeout == b->timeout && -#ifdef IP_SET_HASH_WITH_NETMASK - x->netmask == y->netmask && +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) + nf_inet_addr_cmp(&x->bitmask, &y->bitmask) && #endif #ifdef IP_SET_HASH_WITH_MARKMASK x->markmask == y->markmask && @@ -1264,9 +1276,21 @@ mtype_head(struct ip_set *set, struct sk_buff *skb) htonl(jhash_size(htable_bits))) || nla_put_net32(skb, IPSET_ATTR_MAXELEM, htonl(h->maxelem))) goto nla_put_failure; +#ifdef IP_SET_HASH_WITH_BITMASK + /* if netmask is set to anything other than HOST_MASK we know that the user supplied netmask + * and not bitmask. These two are mutually exclusive. */ + if (h->netmask == HOST_MASK && !nf_inet_addr_cmp(&onesmask, &h->bitmask)) { + if (set->family == NFPROTO_IPV4) { + if (nla_put_ipaddr4(skb, IPSET_ATTR_BITMASK, h->bitmask.ip)) + goto nla_put_failure; + } else if (set->family == NFPROTO_IPV6) { + if (nla_put_ipaddr6(skb, IPSET_ATTR_BITMASK, &h->bitmask.in6)) + goto nla_put_failure; + } + } +#endif #ifdef IP_SET_HASH_WITH_NETMASK - if (h->netmask != HOST_MASK && - nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask)) + if (h->netmask != HOST_MASK && nla_put_u8(skb, IPSET_ATTR_NETMASK, h->netmask)) goto nla_put_failure; #endif #ifdef IP_SET_HASH_WITH_MARKMASK @@ -1429,8 +1453,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, u32 markmask; #endif u8 hbits; -#ifdef IP_SET_HASH_WITH_NETMASK - u8 netmask; +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) + int ret __attribute__((unused)) = 0; + u8 netmask = set->family == NFPROTO_IPV4 ? 32 : 128; + union nf_inet_addr bitmask = onesmask; #endif size_t hsize; struct htype *h; @@ -1468,7 +1494,6 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, #endif #ifdef IP_SET_HASH_WITH_NETMASK - netmask = set->family == NFPROTO_IPV4 ? 32 : 128; if (tb[IPSET_ATTR_NETMASK]) { netmask = nla_get_u8(tb[IPSET_ATTR_NETMASK]); @@ -1476,6 +1501,33 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, (set->family == NFPROTO_IPV6 && netmask > 128) || netmask == 0) return -IPSET_ERR_INVALID_NETMASK; + + /* we convert netmask to bitmask and store it */ + if (set->family == NFPROTO_IPV4) + bitmask.ip = ip_set_netmask(netmask); + else + ip6_netmask(&bitmask, netmask); + } +#endif + +#ifdef IP_SET_HASH_WITH_BITMASK + if (tb[IPSET_ATTR_BITMASK]) { + /* bitmask and netmask do the same thing, allow only one of these options */ + if (tb[IPSET_ATTR_NETMASK]) + return -IPSET_ERR_BITMASK_NETMASK_EXCL; + + if (set->family == NFPROTO_IPV4) { + ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_BITMASK], &bitmask.ip); + if (ret || !bitmask.ip) + return -IPSET_ERR_INVALID_NETMASK; + } else if (set->family == NFPROTO_IPV6) { + ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_BITMASK], &bitmask); + if (ret || ipv6_addr_any(&bitmask.in6)) + return -IPSET_ERR_INVALID_NETMASK; + } + + if (nf_inet_addr_cmp(&bitmask, &zeromask)) + return -IPSET_ERR_INVALID_NETMASK; } #endif @@ -1518,7 +1570,8 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set, for (i = 0; i < ahash_numof_locks(hbits); i++) spin_lock_init(&t->hregion[i].lock); h->maxelem = maxelem; -#ifdef IP_SET_HASH_WITH_NETMASK +#if defined(IP_SET_HASH_WITH_NETMASK) || defined(IP_SET_HASH_WITH_BITMASK) + h->bitmask = bitmask; h->netmask = netmask; #endif #ifdef IP_SET_HASH_WITH_MARKMASK diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index dd30c03d5a23..556f74268a28 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -24,7 +24,8 @@ /* 2 Comments support */ /* 3 Forceadd support */ /* 4 skbinfo support */ -#define IPSET_TYPE_REV_MAX 5 /* bucketsize, initval support */ +/* 5 bucketsize, initval support */ +#define IPSET_TYPE_REV_MAX 6 /* bitmask support */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -34,6 +35,7 @@ MODULE_ALIAS("ip_set_hash:ip"); /* Type specific function prefix */ #define HTYPE hash_ip #define IP_SET_HASH_WITH_NETMASK +#define IP_SET_HASH_WITH_BITMASK /* IPv4 variant */ @@ -86,7 +88,7 @@ hash_ip4_kadt(struct ip_set *set, const struct sk_buff *skb, __be32 ip; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &ip); - ip &= ip_set_netmask(h->netmask); + ip &= h->bitmask.ip; if (ip == 0) return -EINVAL; @@ -119,7 +121,7 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - ip &= ip_set_hostmask(h->netmask); + ip &= ntohl(h->bitmask.ip); e.ip = htonl(ip); if (e.ip == 0) return -IPSET_ERR_HASH_ELEM; @@ -187,12 +189,6 @@ hash_ip6_data_equal(const struct hash_ip6_elem *ip1, return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6); } -static void -hash_ip6_netmask(union nf_inet_addr *ip, u8 prefix) -{ - ip6_netmask(ip, prefix); -} - static bool hash_ip6_data_list(struct sk_buff *skb, const struct hash_ip6_elem *e) { @@ -229,7 +225,7 @@ hash_ip6_kadt(struct ip_set *set, const struct sk_buff *skb, struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); - hash_ip6_netmask(&e.ip, h->netmask); + nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); if (ipv6_addr_any(&e.ip.in6)) return -EINVAL; @@ -268,7 +264,7 @@ hash_ip6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; - hash_ip6_netmask(&e.ip, h->netmask); + nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); if (ipv6_addr_any(&e.ip.in6)) return -IPSET_ERR_HASH_ELEM; @@ -295,6 +291,7 @@ static struct ip_set_type hash_ip_type __read_mostly = { [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index 7303138e46be..2ffbd0b78a8c 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -26,7 +26,8 @@ /* 3 Comments support added */ /* 4 Forceadd support added */ /* 5 skbinfo support added */ -#define IPSET_TYPE_REV_MAX 6 /* bucketsize, initval support added */ +/* 6 bucketsize, initval support added */ +#define IPSET_TYPE_REV_MAX 7 /* bitmask support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik "); @@ -35,6 +36,8 @@ MODULE_ALIAS("ip_set_hash:ip,port"); /* Type specific function prefix */ #define HTYPE hash_ipport +#define IP_SET_HASH_WITH_NETMASK +#define IP_SET_HASH_WITH_BITMASK /* IPv4 variant */ @@ -92,12 +95,16 @@ hash_ipport4_kadt(struct ip_set *set, const struct sk_buff *skb, ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + const struct MTYPE *h = set->data; if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); + e.ip &= h->bitmask.ip; + if (e.ip == 0) + return -EINVAL; return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -129,6 +136,10 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; + e.ip &= h->bitmask.ip; + if (e.ip == 0) + return -EINVAL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { @@ -253,12 +264,17 @@ hash_ipport6_kadt(struct ip_set *set, const struct sk_buff *skb, ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipport6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); + const struct MTYPE *h = set->data; if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); + nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); + if (ipv6_addr_any(&e.ip.in6)) + return -EINVAL; + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -298,6 +314,10 @@ hash_ipport6_uadt(struct ip_set *set, struct nlattr *tb[], if (ret) return ret; + nf_inet_addr_mask_inplace(&e.ip, &h->bitmask); + if (ipv6_addr_any(&e.ip.in6)) + return -EINVAL; + e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { @@ -356,6 +376,8 @@ static struct ip_set_type hash_ipport_type __read_mostly = { [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 3d09eefe998a..cdfb78c6e0d3 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -23,7 +23,8 @@ #define IPSET_TYPE_REV_MIN 0 /* 1 Forceadd support added */ /* 2 skbinfo support added */ -#define IPSET_TYPE_REV_MAX 3 /* bucketsize, initval support added */ +/* 3 bucketsize, initval support added */ +#define IPSET_TYPE_REV_MAX 4 /* bitmask support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Oliver Smith "); @@ -33,6 +34,8 @@ MODULE_ALIAS("ip_set_hash:net,net"); /* Type specific function prefix */ #define HTYPE hash_netnet #define IP_SET_HASH_WITH_NETS +#define IP_SET_HASH_WITH_NETMASK +#define IP_SET_HASH_WITH_BITMASK #define IPSET_NET_COUNT 2 /* IPv4 variants */ @@ -153,8 +156,8 @@ hash_netnet4_kadt(struct ip_set *set, const struct sk_buff *skb, ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]); ip4addrptr(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.ip[1]); - e.ip[0] &= ip_set_netmask(e.cidr[0]); - e.ip[1] &= ip_set_netmask(e.cidr[1]); + e.ip[0] &= (ip_set_netmask(e.cidr[0]) & h->bitmask.ip); + e.ip[1] &= (ip_set_netmask(e.cidr[1]) & h->bitmask.ip); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -213,8 +216,8 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) { - e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0])); - e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1])); + e.ip[0] = htonl(ip & ntohl(h->bitmask.ip) & ip_set_hostmask(e.cidr[0])); + e.ip[1] = htonl(ip2_from & ntohl(h->bitmask.ip) & ip_set_hostmask(e.cidr[1])); ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; @@ -404,6 +407,11 @@ hash_netnet6_kadt(struct ip_set *set, const struct sk_buff *skb, ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); + nf_inet_addr_mask_inplace(&e.ip[0], &h->bitmask); + nf_inet_addr_mask_inplace(&e.ip[1], &h->bitmask); + if (e.cidr[0] == HOST_MASK && ipv6_addr_any(&e.ip[0].in6)) + return -EINVAL; + return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } @@ -414,6 +422,7 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); + const struct hash_netnet6 *h = set->data; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -453,6 +462,11 @@ hash_netnet6_uadt(struct ip_set *set, struct nlattr *tb[], ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); + nf_inet_addr_mask_inplace(&e.ip[0], &h->bitmask); + nf_inet_addr_mask_inplace(&e.ip[1], &h->bitmask); + if (e.cidr[0] == HOST_MASK && ipv6_addr_any(&e.ip[0].in6)) + return -IPSET_ERR_HASH_ELEM; + if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); @@ -484,6 +498,8 @@ static struct ip_set_type hash_netnet_type __read_mostly = { [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, + [IPSET_ATTR_NETMASK] = { .type = NLA_U8 }, + [IPSET_ATTR_BITMASK] = { .type = NLA_NESTED }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, -- cgit v1.2.3-70-g09d2 From 7d7cfb48d81353e826493d24c7cec7360950968f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 22 Nov 2022 16:00:09 +0100 Subject: netfilter: conntrack: set icmpv6 redirects as RELATED icmp conntrack will set icmp redirects as RELATED, but icmpv6 will not do this. For icmpv6, only icmp errors (code <= 128) are examined for RELATED state. ICMPV6 Redirects are part of neighbour discovery mechanism, those are handled by marking a selected subset (e.g. neighbour solicitations) as UNTRACKED, but not REDIRECT -- they will thus be flagged as INVALID. Add minimal support for REDIRECTs. No parsing of neighbour options is added for simplicity, so this will only check that we have the embeeded original header (ND_OPT_REDIRECT_HDR), and then attempt to do a flow lookup for this tuple. Also extend the existing test case to cover redirects. Fixes: 9fb9cbb1082d ("[NETFILTER]: Add nf_conntrack subsystem.") Reported-by: Eric Garver Link: https://github.com/firewalld/firewalld/issues/1046 Signed-off-by: Florian Westphal Acked-by: Eric Garver Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_proto_icmpv6.c | 53 ++++++++++++++++++++++ .../selftests/netfilter/conntrack_icmp_related.sh | 36 ++++++++++++++- 2 files changed, 87 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index 61e3b05cf02c..1020d67600a9 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -129,6 +129,56 @@ static void icmpv6_error_log(const struct sk_buff *skb, nf_l4proto_log_invalid(skb, state, IPPROTO_ICMPV6, "%s", msg); } +static noinline_for_stack int +nf_conntrack_icmpv6_redirect(struct nf_conn *tmpl, struct sk_buff *skb, + unsigned int dataoff, + const struct nf_hook_state *state) +{ + u8 hl = ipv6_hdr(skb)->hop_limit; + union nf_inet_addr outer_daddr; + union { + struct nd_opt_hdr nd_opt; + struct rd_msg rd_msg; + } tmp; + const struct nd_opt_hdr *nd_opt; + const struct rd_msg *rd_msg; + + rd_msg = skb_header_pointer(skb, dataoff, sizeof(*rd_msg), &tmp.rd_msg); + if (!rd_msg) { + icmpv6_error_log(skb, state, "short redirect"); + return -NF_ACCEPT; + } + + if (rd_msg->icmph.icmp6_code != 0) + return NF_ACCEPT; + + if (hl != 255 || !(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { + icmpv6_error_log(skb, state, "invalid saddr or hoplimit for redirect"); + return -NF_ACCEPT; + } + + dataoff += sizeof(*rd_msg); + + /* warning: rd_msg no longer usable after this call */ + nd_opt = skb_header_pointer(skb, dataoff, sizeof(*nd_opt), &tmp.nd_opt); + if (!nd_opt || nd_opt->nd_opt_len == 0) { + icmpv6_error_log(skb, state, "redirect without options"); + return -NF_ACCEPT; + } + + /* We could call ndisc_parse_options(), but it would need + * skb_linearize() and a bit more work. + */ + if (nd_opt->nd_opt_type != ND_OPT_REDIRECT_HDR) + return NF_ACCEPT; + + memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr, + sizeof(outer_daddr.ip6)); + dataoff += 8; + return nf_conntrack_inet_error(tmpl, skb, dataoff, state, + IPPROTO_ICMPV6, &outer_daddr); +} + int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, @@ -159,6 +209,9 @@ int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, return NF_ACCEPT; } + if (icmp6h->icmp6_type == NDISC_REDIRECT) + return nf_conntrack_icmpv6_redirect(tmpl, skb, dataoff, state); + /* is not error message ? */ if (icmp6h->icmp6_type >= 128) return NF_ACCEPT; diff --git a/tools/testing/selftests/netfilter/conntrack_icmp_related.sh b/tools/testing/selftests/netfilter/conntrack_icmp_related.sh index b48e1833bc89..76645aaf2b58 100755 --- a/tools/testing/selftests/netfilter/conntrack_icmp_related.sh +++ b/tools/testing/selftests/netfilter/conntrack_icmp_related.sh @@ -35,6 +35,8 @@ cleanup() { for i in 1 2;do ip netns del nsrouter$i;done } +trap cleanup EXIT + ipv4() { echo -n 192.168.$1.2 } @@ -146,11 +148,17 @@ ip netns exec nsclient1 nft -f - < /dev/null + +expect="packets 1 bytes 112" +check_counter nsclient1 "redir4" "$expect" +if [ $? -ne 0 ];then + ret=1 +fi + +ip netns exec "nsclient1" ping -c 1 dead:1::42 > /dev/null +expect="packets 1 bytes 192" +check_counter nsclient1 "redir6" "$expect" +if [ $? -ne 0 ];then + ret=1 +fi + +if [ $ret -eq 0 ];then + echo "PASS: icmp redirects had RELATED state" +else + echo "ERROR: icmp redirect RELATED state test has failed" +fi + exit $ret -- cgit v1.2.3-70-g09d2 From 7a9841ca025275b5b0edfb0b618934abb6ceec15 Mon Sep 17 00:00:00 2001 From: Pengcheng Yang Date: Tue, 29 Nov 2022 18:40:38 +0800 Subject: bpf, sockmap: Fix repeated calls to sock_put() when msg has more_data In tcp_bpf_send_verdict() redirection, the eval variable is assigned to __SK_REDIRECT after the apply_bytes data is sent, if msg has more_data, sock_put() will be called multiple times. We should reset the eval variable to __SK_NONE every time more_data starts. This causes: IPv4: Attempt to release TCP socket in state 1 00000000b4c925d7 ------------[ cut here ]------------ refcount_t: addition on 0; use-after-free. WARNING: CPU: 5 PID: 4482 at lib/refcount.c:25 refcount_warn_saturate+0x7d/0x110 Modules linked in: CPU: 5 PID: 4482 Comm: sockhash_bypass Kdump: loaded Not tainted 6.0.0 #1 Hardware name: Red Hat KVM, BIOS 1.11.0-2.el7 04/01/2014 Call Trace: __tcp_transmit_skb+0xa1b/0xb90 ? __alloc_skb+0x8c/0x1a0 ? __kmalloc_node_track_caller+0x184/0x320 tcp_write_xmit+0x22a/0x1110 __tcp_push_pending_frames+0x32/0xf0 do_tcp_sendpages+0x62d/0x640 tcp_bpf_push+0xae/0x2c0 tcp_bpf_sendmsg_redir+0x260/0x410 ? preempt_count_add+0x70/0xa0 tcp_bpf_send_verdict+0x386/0x4b0 tcp_bpf_sendmsg+0x21b/0x3b0 sock_sendmsg+0x58/0x70 __sys_sendto+0xfa/0x170 ? xfd_validate_state+0x1d/0x80 ? switch_fpu_return+0x59/0xe0 __x64_sys_sendto+0x24/0x30 do_syscall_64+0x37/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: cd9733f5d75c ("tcp_bpf: Fix one concurrency problem in the tcp_bpf_send_verdict function") Signed-off-by: Pengcheng Yang Signed-off-by: Daniel Borkmann Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/1669718441-2654-2-git-send-email-yangpc@wangsu.com --- net/ipv4/tcp_bpf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index cf9c3e8f7ccb..f3e868f4cd9e 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -279,7 +279,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, bool cork = false, enospc = sk_msg_full(msg); struct sock *sk_redir; u32 tosend, origsize, sent, delta = 0; - u32 eval = __SK_NONE; + u32 eval; int ret; more_data: @@ -310,6 +310,7 @@ more_data: tosend = msg->sg.size; if (psock->apply_bytes && psock->apply_bytes < tosend) tosend = psock->apply_bytes; + eval = __SK_NONE; switch (psock->eval) { case __SK_PASS: -- cgit v1.2.3-70-g09d2 From a351d6087bf7d3d8440d58d3bf244ec64b89394a Mon Sep 17 00:00:00 2001 From: Pengcheng Yang Date: Tue, 29 Nov 2022 18:40:39 +0800 Subject: bpf, sockmap: Fix missing BPF_F_INGRESS flag when using apply_bytes When redirecting, we use sk_msg_to_ingress() to get the BPF_F_INGRESS flag from the msg->flags. If apply_bytes is used and it is larger than the current data being processed, sk_psock_msg_verdict() will not be called when sendmsg() is called again. At this time, the msg->flags is 0, and we lost the BPF_F_INGRESS flag. So we need to save the BPF_F_INGRESS flag in sk_psock and use it when redirection. Fixes: 8934ce2fd081 ("bpf: sockmap redirect ingress support") Signed-off-by: Pengcheng Yang Signed-off-by: Daniel Borkmann Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/1669718441-2654-3-git-send-email-yangpc@wangsu.com --- include/linux/skmsg.h | 1 + include/net/tcp.h | 4 ++-- net/core/skmsg.c | 9 ++++++--- net/ipv4/tcp_bpf.c | 11 ++++++----- net/tls/tls_sw.c | 6 ++++-- 5 files changed, 19 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 70d6cb94e580..84f787416a54 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -82,6 +82,7 @@ struct sk_psock { u32 apply_bytes; u32 cork_bytes; u32 eval; + bool redir_ingress; /* undefined if sk_redir is null */ struct sk_msg *cork; struct sk_psock_progs progs; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) diff --git a/include/net/tcp.h b/include/net/tcp.h index 6b814e788f00..b87e7381bddf 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2319,8 +2319,8 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); #endif /* CONFIG_BPF_SYSCALL */ -int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, - int flags); +int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, + struct sk_msg *msg, u32 bytes, int flags); #endif /* CONFIG_NET_SOCK_MSG */ #if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index e6b9ced3eda8..53d0251788aa 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -886,13 +886,16 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, ret = sk_psock_map_verd(ret, msg->sk_redir); psock->apply_bytes = msg->apply_bytes; if (ret == __SK_REDIRECT) { - if (psock->sk_redir) + if (psock->sk_redir) { sock_put(psock->sk_redir); - psock->sk_redir = msg->sk_redir; - if (!psock->sk_redir) { + psock->sk_redir = NULL; + } + if (!msg->sk_redir) { ret = __SK_DROP; goto out; } + psock->redir_ingress = sk_msg_to_ingress(msg); + psock->sk_redir = msg->sk_redir; sock_hold(psock->sk_redir); } out: diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index f3e868f4cd9e..275c5ca9e04d 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -131,10 +131,9 @@ static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg, return ret; } -int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, - u32 bytes, int flags) +int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, + struct sk_msg *msg, u32 bytes, int flags) { - bool ingress = sk_msg_to_ingress(msg); struct sk_psock *psock = sk_psock_get(sk); int ret; @@ -276,7 +275,7 @@ msg_bytes_ready: static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, int *copied, int flags) { - bool cork = false, enospc = sk_msg_full(msg); + bool cork = false, enospc = sk_msg_full(msg), redir_ingress; struct sock *sk_redir; u32 tosend, origsize, sent, delta = 0; u32 eval; @@ -322,6 +321,7 @@ more_data: sk_msg_apply_bytes(psock, tosend); break; case __SK_REDIRECT: + redir_ingress = psock->redir_ingress; sk_redir = psock->sk_redir; sk_msg_apply_bytes(psock, tosend); if (!psock->apply_bytes) { @@ -338,7 +338,8 @@ more_data: release_sock(sk); origsize = msg->sg.size; - ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags); + ret = tcp_bpf_sendmsg_redir(sk_redir, redir_ingress, + msg, tosend, flags); sent = origsize - msg->sg.size; if (eval == __SK_REDIRECT) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 264cf367e265..9ed978634125 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -792,7 +792,7 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, struct sk_psock *psock; struct sock *sk_redir; struct tls_rec *rec; - bool enospc, policy; + bool enospc, policy, redir_ingress; int err = 0, send; u32 delta = 0; @@ -837,6 +837,7 @@ more_data: } break; case __SK_REDIRECT: + redir_ingress = psock->redir_ingress; sk_redir = psock->sk_redir; memcpy(&msg_redir, msg, sizeof(*msg)); if (msg->apply_bytes < send) @@ -846,7 +847,8 @@ more_data: sk_msg_return_zero(sk, msg, send); msg->sg.size -= send; release_sock(sk); - err = tcp_bpf_sendmsg_redir(sk_redir, &msg_redir, send, flags); + err = tcp_bpf_sendmsg_redir(sk_redir, redir_ingress, + &msg_redir, send, flags); lock_sock(sk); if (err < 0) { *copied -= sk_msg_free_nocharge(sk, &msg_redir); -- cgit v1.2.3-70-g09d2 From 9072931f020bfd907d6d89ee21ff1481cd78b407 Mon Sep 17 00:00:00 2001 From: Pengcheng Yang Date: Tue, 29 Nov 2022 18:40:40 +0800 Subject: bpf, sockmap: Fix data loss caused by using apply_bytes on ingress redirect Use apply_bytes on ingress redirect, when apply_bytes is less than the length of msg data, some data may be skipped and lost in bpf_tcp_ingress(). If there is still data in the scatterlist that has not been consumed, we cannot move the msg iter. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Pengcheng Yang Signed-off-by: Daniel Borkmann Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/1669718441-2654-4-git-send-email-yangpc@wangsu.com --- net/ipv4/tcp_bpf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 275c5ca9e04d..94aad3870c5f 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -45,8 +45,11 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, tmp->sg.end = i; if (apply) { apply_bytes -= size; - if (!apply_bytes) + if (!apply_bytes) { + if (sge->length) + sk_msg_iter_var_prev(i); break; + } } } while (i != msg->sg.end); -- cgit v1.2.3-70-g09d2 From 3144bfa5078e0df7507a4de72061501e6a0e56be Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 29 Nov 2022 21:21:47 -0800 Subject: bpf: Fix a compilation failure with clang lto build When building the kernel with clang lto (CONFIG_LTO_CLANG_FULL=y), the following compilation error will appear: $ make LLVM=1 LLVM_IAS=1 -j ... ld.lld: error: ld-temp.o :26889:1: symbol 'cgroup_storage_map_btf_ids' is already defined cgroup_storage_map_btf_ids:; ^ make[1]: *** [/.../bpf-next/scripts/Makefile.vmlinux_o:61: vmlinux.o] Error 1 In local_storage.c, we have BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct, bpf_local_storage_map) Commit c4bcfb38a95e ("bpf: Implement cgroup storage available to non-cgroup-attached bpf progs") added the above identical BTF_ID_LIST_SINGLE definition in bpf_cgrp_storage.c. With duplicated definitions, llvm linker complains with lto build. Also, extracting btf_id of 'struct bpf_local_storage_map' is defined four times for sk, inode, task and cgrp local storages. Let us define a single global one with a different name than cgroup_storage_map_btf_ids, which also fixed the lto compilation error. Fixes: c4bcfb38a95e ("bpf: Implement cgroup storage available to non-cgroup-attached bpf progs") Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20221130052147.1591625-1-yhs@fb.com --- include/linux/btf_ids.h | 1 + kernel/bpf/bpf_cgrp_storage.c | 3 +-- kernel/bpf/bpf_inode_storage.c | 4 +--- kernel/bpf/bpf_task_storage.c | 4 ++-- net/core/bpf_sk_storage.c | 3 +-- 5 files changed, 6 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 93397711a68c..3a4f7cd882ca 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -266,5 +266,6 @@ MAX_BTF_TRACING_TYPE, extern u32 btf_tracing_ids[]; extern u32 bpf_cgroup_btf_id[]; +extern u32 bpf_local_storage_map_btf_id[]; #endif diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c index 309403800f82..6cdf6d9ed91d 100644 --- a/kernel/bpf/bpf_cgrp_storage.c +++ b/kernel/bpf/bpf_cgrp_storage.c @@ -211,7 +211,6 @@ BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgro return ret; } -BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct, bpf_local_storage_map) const struct bpf_map_ops cgrp_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -222,7 +221,7 @@ const struct bpf_map_ops cgrp_storage_map_ops = { .map_update_elem = bpf_cgrp_storage_update_elem, .map_delete_elem = bpf_cgrp_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_id = &cgroup_storage_map_btf_ids[0], + .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = cgroup_storage_ptr, }; diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 6a1d4d22816a..05f4c66c9089 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -213,8 +213,6 @@ static void inode_storage_map_free(struct bpf_map *map) bpf_local_storage_map_free(map, &inode_cache, NULL); } -BTF_ID_LIST_SINGLE(inode_storage_map_btf_ids, struct, - bpf_local_storage_map) const struct bpf_map_ops inode_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -225,7 +223,7 @@ const struct bpf_map_ops inode_storage_map_ops = { .map_update_elem = bpf_fd_inode_storage_update_elem, .map_delete_elem = bpf_fd_inode_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_id = &inode_storage_map_btf_ids[0], + .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = inode_storage_ptr, }; diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 8e832db8151a..1e486055a523 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -324,7 +324,7 @@ static void task_storage_map_free(struct bpf_map *map) bpf_local_storage_map_free(map, &task_cache, &bpf_task_storage_busy); } -BTF_ID_LIST_SINGLE(task_storage_map_btf_ids, struct, bpf_local_storage_map) +BTF_ID_LIST_GLOBAL_SINGLE(bpf_local_storage_map_btf_id, struct, bpf_local_storage_map) const struct bpf_map_ops task_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -335,7 +335,7 @@ const struct bpf_map_ops task_storage_map_ops = { .map_update_elem = bpf_pid_task_storage_update_elem, .map_delete_elem = bpf_pid_task_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_id = &task_storage_map_btf_ids[0], + .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = task_storage_ptr, }; diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 9d2288c0736e..bb378c33f542 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -310,7 +310,6 @@ bpf_sk_storage_ptr(void *owner) return &sk->sk_bpf_storage; } -BTF_ID_LIST_SINGLE(sk_storage_map_btf_ids, struct, bpf_local_storage_map) const struct bpf_map_ops sk_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, @@ -321,7 +320,7 @@ const struct bpf_map_ops sk_storage_map_ops = { .map_update_elem = bpf_fd_sk_storage_update_elem, .map_delete_elem = bpf_fd_sk_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, - .map_btf_id = &sk_storage_map_btf_ids[0], + .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_local_storage_charge = bpf_sk_storage_charge, .map_local_storage_uncharge = bpf_sk_storage_uncharge, .map_owner_storage_ptr = bpf_sk_storage_ptr, -- cgit v1.2.3-70-g09d2 From 28e0c250f17ab3b6eb4dcee1a8208125bc77c61e Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 28 Nov 2022 12:36:39 -0800 Subject: devlink: use min_t to calculate data_size The calculation for the data_size in the devlink_nl_read_snapshot_fill function uses an if statement that is better expressed using the min_t macro. Signed-off-by: Jacob Keller Acked-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 0e10a8a68c5e..6ef9974f5357 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -6485,10 +6485,8 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, u32 data_size; u8 *data; - if (end_offset - curr_offset < DEVLINK_REGION_READ_CHUNK_SIZE) - data_size = end_offset - curr_offset; - else - data_size = DEVLINK_REGION_READ_CHUNK_SIZE; + data_size = min_t(u32, end_offset - curr_offset, + DEVLINK_REGION_READ_CHUNK_SIZE); data = &snapshot->data[curr_offset]; err = devlink_nl_cmd_region_read_chunk_fill(skb, devlink, -- cgit v1.2.3-70-g09d2 From 611fd12ce0fbc809d5e54febb1f39e93532c9deb Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 28 Nov 2022 12:36:40 -0800 Subject: devlink: report extended error message in region_read_dumpit() Report extended error details in the devlink_nl_cmd_region_read_dumpit() function, by using the extack structure from the netlink_callback. Signed-off-by: Jacob Keller Acked-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 6ef9974f5357..fee69280deb4 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -6507,10 +6507,10 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, { const struct genl_dumpit_info *info = genl_dumpit_info(cb); u64 ret_offset, start_offset, end_offset = U64_MAX; + struct nlattr *chunks_attr, *region_attr; struct nlattr **attrs = info->attrs; struct devlink_port *port = NULL; struct devlink_region *region; - struct nlattr *chunks_attr; const char *region_name; struct devlink *devlink; unsigned int index; @@ -6525,8 +6525,14 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, devl_lock(devlink); - if (!attrs[DEVLINK_ATTR_REGION_NAME] || - !attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) { + if (!attrs[DEVLINK_ATTR_REGION_NAME]) { + NL_SET_ERR_MSG(cb->extack, "No region name provided"); + err = -EINVAL; + goto out_unlock; + } + + if (!attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) { + NL_SET_ERR_MSG(cb->extack, "No snapshot id provided"); err = -EINVAL; goto out_unlock; } @@ -6541,7 +6547,8 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, } } - region_name = nla_data(attrs[DEVLINK_ATTR_REGION_NAME]); + region_attr = attrs[DEVLINK_ATTR_REGION_NAME]; + region_name = nla_data(region_attr); if (port) region = devlink_port_region_get_by_name(port, region_name); @@ -6549,6 +6556,7 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, region = devlink_region_get_by_name(devlink, region_name); if (!region) { + NL_SET_ERR_MSG_ATTR(cb->extack, region_attr, "Requested region does not exist"); err = -EINVAL; goto out_unlock; } -- cgit v1.2.3-70-g09d2 From e004ea10599d1a8279e959c86e63fc9d0bf2032b Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 28 Nov 2022 12:36:41 -0800 Subject: devlink: find snapshot in devlink_nl_cmd_region_read_dumpit The snapshot pointer is obtained inside of the function devlink_nl_region_read_snapshot_fill. Simplify this function by locating the snapshot upfront in devlink_nl_cmd_region_read_dumpit instead. This aligns with how other netlink attributes are handled, and allows us to exit slightly earlier if an invalid snapshot ID is provided. It also allows us to pass the snapshot pointer directly to the devlink_nl_region_read_snapshot_fill, and remove the now unused attrs parameter. Signed-off-by: Jacob Keller Acked-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index fee69280deb4..e3166c9b5954 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -6463,24 +6463,16 @@ nla_put_failure: static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, struct devlink *devlink, - struct devlink_region *region, - struct nlattr **attrs, + struct devlink_snapshot *snapshot, u64 start_offset, u64 end_offset, u64 *new_offset) { - struct devlink_snapshot *snapshot; u64 curr_offset = start_offset; - u32 snapshot_id; int err = 0; *new_offset = start_offset; - snapshot_id = nla_get_u32(attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]); - snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); - if (!snapshot) - return -EINVAL; - while (curr_offset < end_offset) { u32 data_size; u8 *data; @@ -6506,14 +6498,16 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct nlattr *chunks_attr, *region_attr, *snapshot_attr; u64 ret_offset, start_offset, end_offset = U64_MAX; - struct nlattr *chunks_attr, *region_attr; struct nlattr **attrs = info->attrs; struct devlink_port *port = NULL; + struct devlink_snapshot *snapshot; struct devlink_region *region; const char *region_name; struct devlink *devlink; unsigned int index; + u32 snapshot_id; void *hdr; int err; @@ -6561,6 +6555,15 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto out_unlock; } + snapshot_attr = attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]; + snapshot_id = nla_get_u32(snapshot_attr); + snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); + if (!snapshot) { + NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Requested snapshot does not exist"); + err = -EINVAL; + goto out_unlock; + } + if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) { if (!start_offset) @@ -6610,7 +6613,7 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, } err = devlink_nl_region_read_snapshot_fill(skb, devlink, - region, attrs, + snapshot, start_offset, end_offset, &ret_offset); -- cgit v1.2.3-70-g09d2 From 284e9d1ebbe2944d95120f85054d161a6fccbf7f Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 28 Nov 2022 12:36:42 -0800 Subject: devlink: remove unnecessary parameter from chunk_fill function The devlink parameter of the devlink_nl_cmd_region_read_chunk_fill function is not used. Remove it, to simplify the function signature. Once removed, it is also obvious that the devlink parameter is not necessary for the devlink_nl_region_read_snapshot_fill either. Signed-off-by: Jacob Keller Acked-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index e3166c9b5954..5b8a9bdbc13d 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -6431,7 +6431,6 @@ unlock: } static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg, - struct devlink *devlink, u8 *chunk, u32 chunk_size, u64 addr) { @@ -6462,7 +6461,6 @@ nla_put_failure: #define DEVLINK_REGION_READ_CHUNK_SIZE 256 static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, - struct devlink *devlink, struct devlink_snapshot *snapshot, u64 start_offset, u64 end_offset, @@ -6481,9 +6479,7 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, DEVLINK_REGION_READ_CHUNK_SIZE); data = &snapshot->data[curr_offset]; - err = devlink_nl_cmd_region_read_chunk_fill(skb, devlink, - data, data_size, - curr_offset); + err = devlink_nl_cmd_region_read_chunk_fill(skb, data, data_size, curr_offset); if (err) break; @@ -6612,9 +6608,7 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto nla_put_failure; } - err = devlink_nl_region_read_snapshot_fill(skb, devlink, - snapshot, - start_offset, + err = devlink_nl_region_read_snapshot_fill(skb, snapshot, start_offset, end_offset, &ret_offset); if (err && err != -EMSGSIZE) -- cgit v1.2.3-70-g09d2 From 2d4caf0988bd3e9663c300e95cead91f2f954fae Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 28 Nov 2022 12:36:43 -0800 Subject: devlink: refactor region_read_snapshot_fill to use a callback function The devlink_nl_region_read_snapshot_fill is used to copy the contents of a snapshot into a message for reporting to userspace via the DEVLINK_CMG_REGION_READ netlink message. A future change is going to add support for directly reading from a region. Almost all of the logic for this new capability is identical. To help reduce code duplication and make this logic more generic, refactor the function to take a cb and cb_priv pointer for doing the actual copy. Add a devlink_region_snapshot_fill implementation that will simply copy the relevant chunk of the region. This does require allocating some storage for the chunk as opposed to simply passing the correct address forward to the devlink_nl_cmg_region_read_chunk_fill function. A future change to implement support for directly reading from a region without a snapshot will provide a separate implementation that calls the newly added devlink region operation. Signed-off-by: Jacob Keller Acked-by: Jakub Kicinski Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 44 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 35 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 5b8a9bdbc13d..6c05cfaa571d 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -6460,25 +6460,36 @@ nla_put_failure: #define DEVLINK_REGION_READ_CHUNK_SIZE 256 -static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, - struct devlink_snapshot *snapshot, - u64 start_offset, - u64 end_offset, - u64 *new_offset) +typedef int devlink_chunk_fill_t(void *cb_priv, u8 *chunk, u32 chunk_size, + u64 curr_offset, + struct netlink_ext_ack *extack); + +static int +devlink_nl_region_read_fill(struct sk_buff *skb, devlink_chunk_fill_t *cb, + void *cb_priv, u64 start_offset, u64 end_offset, + u64 *new_offset, struct netlink_ext_ack *extack) { u64 curr_offset = start_offset; int err = 0; + u8 *data; + + /* Allocate and re-use a single buffer */ + data = kmalloc(DEVLINK_REGION_READ_CHUNK_SIZE, GFP_KERNEL); + if (!data) + return -ENOMEM; *new_offset = start_offset; while (curr_offset < end_offset) { u32 data_size; - u8 *data; data_size = min_t(u32, end_offset - curr_offset, DEVLINK_REGION_READ_CHUNK_SIZE); - data = &snapshot->data[curr_offset]; + err = cb(cb_priv, data, data_size, curr_offset, extack); + if (err) + break; + err = devlink_nl_cmd_region_read_chunk_fill(skb, data, data_size, curr_offset); if (err) break; @@ -6487,9 +6498,23 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, } *new_offset = curr_offset; + kfree(data); + return err; } +static int +devlink_region_snapshot_fill(void *cb_priv, u8 *chunk, u32 chunk_size, + u64 curr_offset, + struct netlink_ext_ack __always_unused *extack) +{ + struct devlink_snapshot *snapshot = cb_priv; + + memcpy(chunk, &snapshot->data[curr_offset], chunk_size); + + return 0; +} + static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { @@ -6608,8 +6633,9 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto nla_put_failure; } - err = devlink_nl_region_read_snapshot_fill(skb, snapshot, start_offset, - end_offset, &ret_offset); + err = devlink_nl_region_read_fill(skb, &devlink_region_snapshot_fill, + snapshot, start_offset, end_offset, + &ret_offset, cb->extack); if (err && err != -EMSGSIZE) goto nla_put_failure; -- cgit v1.2.3-70-g09d2 From af6397c9ee2b42988c912dcad2fca1f43d5c1c99 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 28 Nov 2022 12:36:44 -0800 Subject: devlink: support directly reading from region memory To read from a region, user space must currently request a new snapshot of the region and then read from that snapshot. This can sometimes be overkill if user space only reads a tiny portion. They first create the snapshot, then request a read, then destroy the snapshot. For regions which have a single underlying "contents", it makes sense to allow supporting direct reading of the region data. Extend the DEVLINK_CMD_REGION_READ to allow direct reading from a region if requested via the new DEVLINK_ATTR_REGION_DIRECT. If this attribute is set, then perform a direct read instead of using a snapshot. Direct read is mutually exclusive with DEVLINK_ATTR_REGION_SNAPSHOT_ID, and care is taken to ensure that we reject commands which provide incorrect attributes. Regions must enable support for direct read by implementing the .read() callback function. If a region does not support such direct reads, a suitable extended error message is reported. Signed-off-by: Jacob Keller Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- .../networking/devlink/devlink-region.rst | 13 ++++ include/net/devlink.h | 16 +++++ include/uapi/linux/devlink.h | 2 + net/core/devlink.c | 80 +++++++++++++++++----- 4 files changed, 94 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-region.rst b/Documentation/networking/devlink/devlink-region.rst index f06dca9a1eb6..9232cd7da301 100644 --- a/Documentation/networking/devlink/devlink-region.rst +++ b/Documentation/networking/devlink/devlink-region.rst @@ -31,6 +31,15 @@ in its ``devlink_region_ops`` structure. If snapshot id is not set in the ``DEVLINK_CMD_REGION_NEW`` request kernel will allocate one and send the snapshot information to user space. +Regions may optionally allow directly reading from their contents without a +snapshot. Direct read requests are not atomic. In particular a read request +of size 256 bytes or larger will be split into multiple chunks. If atomic +access is required, use a snapshot. A driver wishing to enable this for a +region should implement the ``.read`` callback in the ``devlink_region_ops`` +structure. User space can request a direct read by using the +``DEVLINK_ATTR_REGION_DIRECT`` attribute instead of specifying a snapshot +id. + example usage ------------- @@ -65,6 +74,10 @@ example usage $ devlink region read pci/0000:00:05.0/fw-health snapshot 1 address 0 length 16 0000000000000000 0014 95dc 0014 9514 0035 1670 0034 db30 + # Read from the region without a snapshot + $ devlink region read pci/0000:00:05.0/fw-health address 16 length 16 + 0000000000000010 0000 0000 ffff ff04 0029 8c00 0028 8cc8 + As regions are likely very device or driver specific, no generic regions are defined. See the driver-specific documentation files for information on the specific regions a driver supports. diff --git a/include/net/devlink.h b/include/net/devlink.h index 074a79b8933f..02528f736f65 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -650,6 +650,10 @@ struct devlink_info_req; * the data variable must be updated to point to the snapshot data. * The function will be called while the devlink instance lock is * held. + * @read: callback to directly read a portion of the region. On success, + * the data pointer will be updated with the contents of the + * requested portion of the region. The function will be called + * while the devlink instance lock is held. * @priv: Pointer to driver private data for the region operation */ struct devlink_region_ops { @@ -659,6 +663,10 @@ struct devlink_region_ops { const struct devlink_region_ops *ops, struct netlink_ext_ack *extack, u8 **data); + int (*read)(struct devlink *devlink, + const struct devlink_region_ops *ops, + struct netlink_ext_ack *extack, + u64 offset, u32 size, u8 *data); void *priv; }; @@ -670,6 +678,10 @@ struct devlink_region_ops { * the data variable must be updated to point to the snapshot data. * The function will be called while the devlink instance lock is * held. + * @read: callback to directly read a portion of the region. On success, + * the data pointer will be updated with the contents of the + * requested portion of the region. The function will be called + * while the devlink instance lock is held. * @priv: Pointer to driver private data for the region operation */ struct devlink_port_region_ops { @@ -679,6 +691,10 @@ struct devlink_port_region_ops { const struct devlink_port_region_ops *ops, struct netlink_ext_ack *extack, u8 **data); + int (*read)(struct devlink_port *port, + const struct devlink_port_region_ops *ops, + struct netlink_ext_ack *extack, + u64 offset, u32 size, u8 *data); void *priv; }; diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 498d0d5d0957..70191d96af89 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -610,6 +610,8 @@ enum devlink_attr { DEVLINK_ATTR_RATE_TX_PRIORITY, /* u32 */ DEVLINK_ATTR_RATE_TX_WEIGHT, /* u32 */ + DEVLINK_ATTR_REGION_DIRECT, /* flag */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 6c05cfaa571d..298041a44aa8 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -6515,6 +6515,26 @@ devlink_region_snapshot_fill(void *cb_priv, u8 *chunk, u32 chunk_size, return 0; } +static int +devlink_region_port_direct_fill(void *cb_priv, u8 *chunk, u32 chunk_size, + u64 curr_offset, struct netlink_ext_ack *extack) +{ + struct devlink_region *region = cb_priv; + + return region->port_ops->read(region->port, region->port_ops, extack, + curr_offset, chunk_size, chunk); +} + +static int +devlink_region_direct_fill(void *cb_priv, u8 *chunk, u32 chunk_size, + u64 curr_offset, struct netlink_ext_ack *extack) +{ + struct devlink_region *region = cb_priv; + + return region->ops->read(region->devlink, region->ops, extack, + curr_offset, chunk_size, chunk); +} + static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { @@ -6523,12 +6543,12 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, u64 ret_offset, start_offset, end_offset = U64_MAX; struct nlattr **attrs = info->attrs; struct devlink_port *port = NULL; - struct devlink_snapshot *snapshot; + devlink_chunk_fill_t *region_cb; struct devlink_region *region; const char *region_name; struct devlink *devlink; unsigned int index; - u32 snapshot_id; + void *region_cb_priv; void *hdr; int err; @@ -6546,12 +6566,6 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto out_unlock; } - if (!attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) { - NL_SET_ERR_MSG(cb->extack, "No snapshot id provided"); - err = -EINVAL; - goto out_unlock; - } - if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); @@ -6577,12 +6591,43 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, } snapshot_attr = attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]; - snapshot_id = nla_get_u32(snapshot_attr); - snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); - if (!snapshot) { - NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Requested snapshot does not exist"); - err = -EINVAL; - goto out_unlock; + if (!snapshot_attr) { + if (!nla_get_flag(attrs[DEVLINK_ATTR_REGION_DIRECT])) { + NL_SET_ERR_MSG(cb->extack, "No snapshot id provided"); + err = -EINVAL; + goto out_unlock; + } + + if (!region->ops->read) { + NL_SET_ERR_MSG(cb->extack, "Requested region does not support direct read"); + err = -EOPNOTSUPP; + goto out_unlock; + } + + if (port) + region_cb = &devlink_region_port_direct_fill; + else + region_cb = &devlink_region_direct_fill; + region_cb_priv = region; + } else { + struct devlink_snapshot *snapshot; + u32 snapshot_id; + + if (nla_get_flag(attrs[DEVLINK_ATTR_REGION_DIRECT])) { + NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Direct region read does not use snapshot"); + err = -EINVAL; + goto out_unlock; + } + + snapshot_id = nla_get_u32(snapshot_attr); + snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); + if (!snapshot) { + NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Requested snapshot does not exist"); + err = -EINVAL; + goto out_unlock; + } + region_cb = &devlink_region_snapshot_fill; + region_cb_priv = snapshot; } if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && @@ -6633,9 +6678,9 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto nla_put_failure; } - err = devlink_nl_region_read_fill(skb, &devlink_region_snapshot_fill, - snapshot, start_offset, end_offset, - &ret_offset, cb->extack); + err = devlink_nl_region_read_fill(skb, region_cb, region_cb_priv, + start_offset, end_offset, &ret_offset, + cb->extack); if (err && err != -EMSGSIZE) goto nla_put_failure; @@ -9280,6 +9325,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_SELFTESTS] = { .type = NLA_NESTED }, [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32 }, [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32 }, + [DEVLINK_ATTR_REGION_DIRECT] = { .type = NLA_FLAG }, }; static const struct genl_small_ops devlink_nl_ops[] = { -- cgit v1.2.3-70-g09d2 From 226bf980550627c88549b112ac6c8fb40873afb4 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 29 Nov 2022 18:51:38 +0900 Subject: net: devlink: let the core report the driver name instead of the drivers The driver name is available in device_driver::name. Right now, drivers still have to report this piece of information themselves in their devlink_ops::info_get callback function. In order to factorize code, make devlink_nl_info_fill() add the driver name attribute. Now that the core sets the driver name attribute, drivers are not supposed to call devlink_info_driver_name_put() anymore. Remove devlink_info_driver_name_put() and clean-up all the drivers using this function in their callback. Signed-off-by: Vincent Mailhol Tested-by: Ido Schimmel # mlxsw Reviewed-by: Jacob Keller Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- .../crypto/marvell/octeontx2/otx2_cpt_devlink.c | 4 ---- drivers/net/dsa/hirschmann/hellcreek.c | 5 ----- drivers/net/dsa/mv88e6xxx/devlink.c | 5 ----- drivers/net/dsa/sja1105/sja1105_devlink.c | 12 +++------- drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 4 ---- .../ethernet/freescale/dpaa2/dpaa2-eth-devlink.c | 11 +-------- .../net/ethernet/fungible/funeth/funeth_devlink.c | 2 +- .../ethernet/hisilicon/hns3/hns3pf/hclge_devlink.c | 5 ----- .../hisilicon/hns3/hns3vf/hclgevf_devlink.c | 5 ----- drivers/net/ethernet/intel/ice/ice_devlink.c | 6 ----- .../ethernet/marvell/octeontx2/af/rvu_devlink.c | 2 +- .../ethernet/marvell/octeontx2/nic/otx2_devlink.c | 9 +------- .../ethernet/marvell/prestera/prestera_devlink.c | 5 ----- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 4 ---- drivers/net/ethernet/mellanox/mlxsw/core.c | 5 ----- drivers/net/ethernet/netronome/nfp/nfp_devlink.c | 4 ---- .../net/ethernet/pensando/ionic/ionic_devlink.c | 4 ---- drivers/net/ethernet/qlogic/qed/qed_devlink.c | 4 ---- drivers/net/netdevsim/dev.c | 3 --- drivers/ptp/ptp_ocp.c | 4 ---- include/net/devlink.h | 2 -- net/core/devlink.c | 26 +++++++++++++++------- 22 files changed, 25 insertions(+), 106 deletions(-) (limited to 'net') diff --git a/drivers/crypto/marvell/octeontx2/otx2_cpt_devlink.c b/drivers/crypto/marvell/octeontx2/otx2_cpt_devlink.c index 7503f6b18ac5..a2aba0b0d68a 100644 --- a/drivers/crypto/marvell/octeontx2/otx2_cpt_devlink.c +++ b/drivers/crypto/marvell/octeontx2/otx2_cpt_devlink.c @@ -76,10 +76,6 @@ static int otx2_cpt_devlink_info_get(struct devlink *dl, struct otx2_cptpf_dev *cptpf = cpt_dl->cptpf; int err; - err = devlink_info_driver_name_put(req, "rvu_cptpf"); - if (err) - return err; - err = otx2_cpt_dl_info_firmware_version_put(req, cptpf->eng_grps.grp, "fw.ae", OTX2_CPT_AE_TYPES); if (err) diff --git a/drivers/net/dsa/hirschmann/hellcreek.c b/drivers/net/dsa/hirschmann/hellcreek.c index 951f7935c872..595a548bb0a8 100644 --- a/drivers/net/dsa/hirschmann/hellcreek.c +++ b/drivers/net/dsa/hirschmann/hellcreek.c @@ -1176,11 +1176,6 @@ static int hellcreek_devlink_info_get(struct dsa_switch *ds, struct netlink_ext_ack *extack) { struct hellcreek *hellcreek = ds->priv; - int ret; - - ret = devlink_info_driver_name_put(req, "hellcreek"); - if (ret) - return ret; return devlink_info_version_fixed_put(req, DEVLINK_INFO_VERSION_GENERIC_ASIC_ID, diff --git a/drivers/net/dsa/mv88e6xxx/devlink.c b/drivers/net/dsa/mv88e6xxx/devlink.c index 1266eabee086..a08dab75e0c0 100644 --- a/drivers/net/dsa/mv88e6xxx/devlink.c +++ b/drivers/net/dsa/mv88e6xxx/devlink.c @@ -821,11 +821,6 @@ int mv88e6xxx_devlink_info_get(struct dsa_switch *ds, struct netlink_ext_ack *extack) { struct mv88e6xxx_chip *chip = ds->priv; - int err; - - err = devlink_info_driver_name_put(req, "mv88e6xxx"); - if (err) - return err; return devlink_info_version_fixed_put(req, DEVLINK_INFO_VERSION_GENERIC_ASIC_ID, diff --git a/drivers/net/dsa/sja1105/sja1105_devlink.c b/drivers/net/dsa/sja1105/sja1105_devlink.c index 10c6fea1227f..da532614f34a 100644 --- a/drivers/net/dsa/sja1105/sja1105_devlink.c +++ b/drivers/net/dsa/sja1105/sja1105_devlink.c @@ -120,16 +120,10 @@ int sja1105_devlink_info_get(struct dsa_switch *ds, struct netlink_ext_ack *extack) { struct sja1105_private *priv = ds->priv; - int rc; - - rc = devlink_info_driver_name_put(req, "sja1105"); - if (rc) - return rc; - rc = devlink_info_version_fixed_put(req, - DEVLINK_INFO_VERSION_GENERIC_ASIC_ID, - priv->info->name); - return rc; + return devlink_info_version_fixed_put(req, + DEVLINK_INFO_VERSION_GENERIC_ASIC_ID, + priv->info->name); } int sja1105_devlink_setup(struct dsa_switch *ds) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index 8a6f788f6294..26913dc816d3 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -892,10 +892,6 @@ static int bnxt_dl_info_get(struct devlink *dl, struct devlink_info_req *req, u32 ver = 0; int rc; - rc = devlink_info_driver_name_put(req, DRV_MODULE_NAME); - if (rc) - return rc; - if (BNXT_PF(bp) && (bp->flags & BNXT_FLAG_DSN_VALID)) { sprintf(buf, "%02X-%02X-%02X-%02X-%02X-%02X-%02X-%02X", bp->dsn[7], bp->dsn[6], bp->dsn[5], bp->dsn[4], diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-devlink.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-devlink.c index 5c6dd3029e2f..76f808d38066 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-devlink.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-devlink.c @@ -37,18 +37,9 @@ static int dpaa2_eth_dl_info_get(struct devlink *devlink, struct dpaa2_eth_devlink_priv *dl_priv = devlink_priv(devlink); struct dpaa2_eth_priv *priv = dl_priv->dpaa2_priv; char buf[10]; - int err; - - err = devlink_info_driver_name_put(req, KBUILD_MODNAME); - if (err) - return err; scnprintf(buf, 10, "%d.%d", priv->dpni_ver_major, priv->dpni_ver_minor); - err = devlink_info_version_running_put(req, "dpni", buf); - if (err) - return err; - - return 0; + return devlink_info_version_running_put(req, "dpni", buf); } static struct dpaa2_eth_trap_item * diff --git a/drivers/net/ethernet/fungible/funeth/funeth_devlink.c b/drivers/net/ethernet/fungible/funeth/funeth_devlink.c index d50c222948b4..6668375edff6 100644 --- a/drivers/net/ethernet/fungible/funeth/funeth_devlink.c +++ b/drivers/net/ethernet/fungible/funeth/funeth_devlink.c @@ -6,7 +6,7 @@ static int fun_dl_info_get(struct devlink *dl, struct devlink_info_req *req, struct netlink_ext_ack *extack) { - return devlink_info_driver_name_put(req, KBUILD_MODNAME); + return 0; } static const struct devlink_ops fun_dl_ops = { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_devlink.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_devlink.c index 4c441e6a5082..3d3b69605423 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_devlink.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_devlink.c @@ -13,11 +13,6 @@ static int hclge_devlink_info_get(struct devlink *devlink, struct hclge_devlink_priv *priv = devlink_priv(devlink); char version_str[HCLGE_DEVLINK_FW_STRING_LEN]; struct hclge_dev *hdev = priv->hdev; - int ret; - - ret = devlink_info_driver_name_put(req, KBUILD_MODNAME); - if (ret) - return ret; snprintf(version_str, sizeof(version_str), "%lu.%lu.%lu.%lu", hnae3_get_field(hdev->fw_version, HNAE3_FW_VERSION_BYTE3_MASK, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_devlink.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_devlink.c index fdc19868b818..a6c3c5e8f0ab 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_devlink.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_devlink.c @@ -13,11 +13,6 @@ static int hclgevf_devlink_info_get(struct devlink *devlink, struct hclgevf_devlink_priv *priv = devlink_priv(devlink); char version_str[HCLGEVF_DEVLINK_FW_STRING_LEN]; struct hclgevf_dev *hdev = priv->hdev; - int ret; - - ret = devlink_info_driver_name_put(req, KBUILD_MODNAME); - if (ret) - return ret; snprintf(version_str, sizeof(version_str), "%lu.%lu.%lu.%lu", hnae3_get_field(hdev->fw_version, HNAE3_FW_VERSION_BYTE3_MASK, diff --git a/drivers/net/ethernet/intel/ice/ice_devlink.c b/drivers/net/ethernet/intel/ice/ice_devlink.c index 946d64e577c9..8286e47b4bae 100644 --- a/drivers/net/ethernet/intel/ice/ice_devlink.c +++ b/drivers/net/ethernet/intel/ice/ice_devlink.c @@ -311,12 +311,6 @@ static int ice_devlink_info_get(struct devlink *devlink, } } - err = devlink_info_driver_name_put(req, KBUILD_MODNAME); - if (err) { - NL_SET_ERR_MSG_MOD(extack, "Unable to set driver name"); - goto out_free_ctx; - } - ice_info_get_dsn(pf, ctx); err = devlink_info_serial_number_put(req, ctx->buf); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c index 88dee589cb21..f15439d26d21 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c @@ -1550,7 +1550,7 @@ static int rvu_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, static int rvu_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, struct netlink_ext_ack *extack) { - return devlink_info_driver_name_put(req, DRV_NAME); + return 0; } static const struct devlink_ops rvu_devlink_ops = { diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c index 777a27047c8e..5cc6416cf1a6 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c @@ -77,18 +77,11 @@ static const struct devlink_param otx2_dl_params[] = { otx2_dl_mcam_count_validate), }; -/* Devlink OPs */ static int otx2_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, struct netlink_ext_ack *extack) { - struct otx2_devlink *otx2_dl = devlink_priv(devlink); - struct otx2_nic *pfvf = otx2_dl->pfvf; - - if (is_otx2_vf(pfvf->pcifunc)) - return devlink_info_driver_name_put(req, "rvu_nicvf"); - - return devlink_info_driver_name_put(req, "rvu_nicpf"); + return 0; } static const struct devlink_ops otx2_devlink_ops = { diff --git a/drivers/net/ethernet/marvell/prestera/prestera_devlink.c b/drivers/net/ethernet/marvell/prestera/prestera_devlink.c index 84ad05c9f12d..2a4c9df4eb79 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_devlink.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_devlink.c @@ -355,11 +355,6 @@ static int prestera_dl_info_get(struct devlink *dl, { struct prestera_switch *sw = devlink_priv(dl); char buf[16]; - int err; - - err = devlink_info_driver_name_put(req, PRESTERA_DRV_NAME); - if (err) - return err; snprintf(buf, sizeof(buf), "%d.%d.%d", sw->dev->fw_rev.maj, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index cc2ae427dcb0..751bc4a9edcf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -46,10 +46,6 @@ mlx5_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, u32 running_fw, stored_fw; int err; - err = devlink_info_driver_name_put(req, KBUILD_MODNAME); - if (err) - return err; - err = devlink_info_version_fixed_put(req, "fw.psid", dev->board_id); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index a83f6bc30072..a0a06e2eff82 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -1459,11 +1459,6 @@ mlxsw_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, char buf[32]; int err; - err = devlink_info_driver_name_put(req, - mlxsw_core->bus_info->device_kind); - if (err) - return err; - mlxsw_reg_mgir_pack(mgir_pl); err = mlxsw_reg_query(mlxsw_core, MLXSW_REG(mgir), mgir_pl); if (err) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c index 784f23602a8a..bf6bae557158 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c @@ -239,10 +239,6 @@ nfp_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, char *buf = NULL; int err; - err = devlink_info_driver_name_put(req, "nfp"); - if (err) - return err; - vendor = nfp_hwinfo_lookup(pf->hwinfo, "assembly.vendor"); part = nfp_hwinfo_lookup(pf->hwinfo, "assembly.partno"); sn = nfp_hwinfo_lookup(pf->hwinfo, "assembly.serial"); diff --git a/drivers/net/ethernet/pensando/ionic/ionic_devlink.c b/drivers/net/ethernet/pensando/ionic/ionic_devlink.c index 567f778433e2..e6ff757895ab 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_devlink.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_devlink.c @@ -26,10 +26,6 @@ static int ionic_dl_info_get(struct devlink *dl, struct devlink_info_req *req, char buf[16]; int err = 0; - err = devlink_info_driver_name_put(req, IONIC_DRV_NAME); - if (err) - return err; - err = devlink_info_version_running_put(req, DEVLINK_INFO_VERSION_GENERIC_FW, idev->dev_info.fw_version); diff --git a/drivers/net/ethernet/qlogic/qed/qed_devlink.c b/drivers/net/ethernet/qlogic/qed/qed_devlink.c index 6bb4e165b592..922c47797af6 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_devlink.c +++ b/drivers/net/ethernet/qlogic/qed/qed_devlink.c @@ -162,10 +162,6 @@ static int qed_devlink_info_get(struct devlink *devlink, dev_info = &cdev->common_dev_info; - err = devlink_info_driver_name_put(req, KBUILD_MODNAME); - if (err) - return err; - memcpy(buf, cdev->hwfns[0].hw_info.part_num, sizeof(cdev->hwfns[0].hw_info.part_num)); buf[sizeof(cdev->hwfns[0].hw_info.part_num)] = 0; diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index e14686594a71..b962fc8e1397 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -994,9 +994,6 @@ static int nsim_dev_info_get(struct devlink *devlink, { int err; - err = devlink_info_driver_name_put(req, DRV_NAME); - if (err) - return err; err = devlink_info_version_stored_put_ext(req, "fw.mgmt", "10.20.30", DEVLINK_INFO_VERSION_TYPE_COMPONENT); if (err) diff --git a/drivers/ptp/ptp_ocp.c b/drivers/ptp/ptp_ocp.c index 154d58cbd9ce..4bbaccd543ad 100644 --- a/drivers/ptp/ptp_ocp.c +++ b/drivers/ptp/ptp_ocp.c @@ -1647,10 +1647,6 @@ ptp_ocp_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, char buf[32]; int err; - err = devlink_info_driver_name_put(req, KBUILD_MODNAME); - if (err) - return err; - fw_image = bp->fw_loader ? "loader" : "fw"; sprintf(buf, "%d.%d", bp->fw_tag, bp->fw_version); err = devlink_info_version_running_put(req, fw_image, buf); diff --git a/include/net/devlink.h b/include/net/devlink.h index 02528f736f65..5f6eca5e4a40 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1762,8 +1762,6 @@ int devlink_region_snapshot_create(struct devlink_region *region, u8 *data, u32 snapshot_id); int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn); -int devlink_info_driver_name_put(struct devlink_info_req *req, - const char *name); int devlink_info_board_serial_number_put(struct devlink_info_req *req, const char *bsn); diff --git a/net/core/devlink.c b/net/core/devlink.c index 298041a44aa8..60eb0f46520f 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -6707,14 +6707,6 @@ out_unlock: return err; } -int devlink_info_driver_name_put(struct devlink_info_req *req, const char *name) -{ - if (!req->msg) - return 0; - return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME, name); -} -EXPORT_SYMBOL_GPL(devlink_info_driver_name_put); - int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn) { if (!req->msg) @@ -6823,11 +6815,25 @@ int devlink_info_version_running_put_ext(struct devlink_info_req *req, } EXPORT_SYMBOL_GPL(devlink_info_version_running_put_ext); +static int devlink_nl_driver_info_get(struct device_driver *drv, + struct devlink_info_req *req) +{ + if (!drv) + return 0; + + if (drv->name[0]) + return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME, + drv->name); + + return 0; +} + static int devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { + struct device *dev = devlink_to_dev(devlink); struct devlink_info_req req = {}; void *hdr; int err; @@ -6845,6 +6851,10 @@ devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink, if (err) goto err_cancel_msg; + err = devlink_nl_driver_info_get(dev->driver, &req); + if (err) + goto err_cancel_msg; + genlmsg_end(msg, hdr); return 0; -- cgit v1.2.3-70-g09d2 From c5cd7c86847cda0fdd44956561c0f3d9adea032b Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 29 Nov 2022 18:51:39 +0900 Subject: net: devlink: make the devlink_ops::info_get() callback optional Some drivers only reported the driver name in their devlink_ops::info_get() callback. Now that the core provides this information, the callback became empty. For such drivers, just removing the callback would prevent the core from executing devlink_nl_info_fill() meaning that "devlink dev info" would not return anything. Make the callback function optional by executing devlink_nl_info_fill() even if devlink_ops::info_get() is NULL. N.B.: the drivers with devlink support which previously did not implement devlink_ops::info_get() will now also be able to report the driver name. Signed-off-by: Vincent Mailhol Reviewed-by: Jacob Keller Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 60eb0f46520f..fca3ebee97b0 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -6847,9 +6847,11 @@ devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink, goto err_cancel_msg; req.msg = msg; - err = devlink->ops->info_get(devlink, &req, extack); - if (err) - goto err_cancel_msg; + if (devlink->ops->info_get) { + err = devlink->ops->info_get(devlink, &req, extack); + if (err) + goto err_cancel_msg; + } err = devlink_nl_driver_info_get(dev->driver, &req); if (err) @@ -6870,9 +6872,6 @@ static int devlink_nl_cmd_info_get_doit(struct sk_buff *skb, struct sk_buff *msg; int err; - if (!devlink->ops->info_get) - return -EOPNOTSUPP; - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; @@ -6898,7 +6897,7 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, int err = 0; devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { - if (idx < start || !devlink->ops->info_get) + if (idx < start) goto inc; devl_lock(devlink); -- cgit v1.2.3-70-g09d2 From 5cc58b376675981386c6192405fe887cd29c527a Mon Sep 17 00:00:00 2001 From: Yuan Can Date: Tue, 29 Nov 2022 01:42:11 +0000 Subject: wifi: nl80211: Add checks for nla_nest_start() in nl80211_send_iface() As the nla_nest_start() may fail with NULL returned, the return value needs to be checked. Fixes: ce08cd344a00 ("wifi: nl80211: expose link information for interfaces") Signed-off-by: Yuan Can Link: https://lore.kernel.org/r/20221129014211.56558-1-yuancan@huawei.com Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 1ad0326ff4dc..33a82ecab9d5 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3868,6 +3868,9 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag struct cfg80211_chan_def chandef = {}; int ret; + if (!link) + goto nla_put_failure; + if (nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link_id)) goto nla_put_failure; if (nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, -- cgit v1.2.3-70-g09d2 From 13e5afd3d773c6fc6ca2b89027befaaaa1ea7293 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Thu, 17 Nov 2022 14:45:00 +0800 Subject: wifi: mac80211: fix memory leak in ieee80211_if_add() When register_netdevice() failed in ieee80211_if_add(), ndev->tstats isn't released. Fix it. Fixes: 5a490510ba5f ("mac80211: use per-CPU TX/RX statistics") Signed-off-by: Zhengchao Shao Link: https://lore.kernel.org/r/20221117064500.319983-1-shaozhengchao@huawei.com Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 7c4ce716c939..a5782927a93b 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -2195,6 +2195,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, ret = cfg80211_register_netdevice(ndev); if (ret) { + ieee80211_if_free(ndev); free_netdev(ndev); return ret; } -- cgit v1.2.3-70-g09d2 From 84924aac08a43169811b4814c67994a9154a6a82 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 1 Dec 2022 12:24:36 +0000 Subject: rxrpc: Fix checker warning Fix the following checker warning: ../net/rxrpc/key.c:692:9: error: subtraction of different types can't work (different address spaces) Checker is wrong in this case, but cast the pointers to unsigned long to avoid the warning. Whilst we're at it, reduce the assertions to WARN_ON() and return an error. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/key.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 8d2073e0e3da..830eeffe2d5b 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -602,7 +602,8 @@ static long rxrpc_read(const struct key *key, } _debug("token[%u]: toksize=%u", ntoks, toksize); - ASSERTCMP(toksize, <=, AFSTOKEN_LENGTH_MAX); + if (WARN_ON(toksize > AFSTOKEN_LENGTH_MAX)) + return -EIO; toksizes[ntoks++] = toksize; size += toksize + 4; /* each token has a length word */ @@ -679,8 +680,9 @@ static long rxrpc_read(const struct key *key, return -ENOPKG; } - ASSERTCMP((unsigned long)xdr - (unsigned long)oldxdr, ==, - toksize); + if (WARN_ON((unsigned long)xdr - (unsigned long)oldxdr == + toksize)) + return -EIO; } #undef ENCODE_STR @@ -688,8 +690,10 @@ static long rxrpc_read(const struct key *key, #undef ENCODE64 #undef ENCODE - ASSERTCMP(tok, ==, ntoks); - ASSERTCMP((char __user *) xdr - buffer, ==, size); + if (WARN_ON(tok != ntoks)) + return -EIO; + if (WARN_ON((unsigned long)xdr - (unsigned long)buffer != size)) + return -EIO; _leave(" = %zu", size); return size; } -- cgit v1.2.3-70-g09d2 From 75bfdbf2fca372e2709bcaa43e8cf1147766ae96 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 3 Nov 2022 22:27:52 +0000 Subject: rxrpc: Implement an in-kernel rxperf server for testing purposes Implement an in-kernel rxperf server to allow kernel-based rxrpc services to be tested directly, unlike with AFS where they're accessed by the fileserver when the latter decides it wants to. This is implemented as a module that, if loaded, opens UDP port 7009 (afs3-rmtsys) and listens on it for incoming calls. Calls can be generated using the rxperf command shipped with OpenAFS, for example. Changes ======= ver #2) - Use min_t() instead of min(). Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: Jakub Kicinski --- include/net/af_rxrpc.h | 1 + net/rxrpc/Kconfig | 7 + net/rxrpc/Makefile | 3 + net/rxrpc/rxperf.c | 619 +++++++++++++++++++++++++++++++++++++++++++++++++ net/rxrpc/server_key.c | 25 ++ 5 files changed, 655 insertions(+) create mode 100644 net/rxrpc/rxperf.c (limited to 'net') diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index b69ca695935c..dc033f08191e 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -71,5 +71,6 @@ void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *, unsigned long); int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val); +int rxrpc_sock_set_security_keyring(struct sock *, struct key *); #endif /* _NET_RXRPC_H */ diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig index accd35c05577..7ae023b37a83 100644 --- a/net/rxrpc/Kconfig +++ b/net/rxrpc/Kconfig @@ -58,4 +58,11 @@ config RXKAD See Documentation/networking/rxrpc.rst. +config RXPERF + tristate "RxRPC test service" + help + Provide an rxperf service tester. This listens on UDP port 7009 for + incoming calls from the rxperf program (an example of which can be + found in OpenAFS). + endif diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index fdeba488fc6e..79687477d93c 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -36,3 +36,6 @@ rxrpc-y := \ rxrpc-$(CONFIG_PROC_FS) += proc.o rxrpc-$(CONFIG_RXKAD) += rxkad.o rxrpc-$(CONFIG_SYSCTL) += sysctl.o + + +obj-$(CONFIG_RXPERF) += rxperf.o diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c new file mode 100644 index 000000000000..66f5eea291ff --- /dev/null +++ b/net/rxrpc/rxperf.c @@ -0,0 +1,619 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* In-kernel rxperf server for testing purposes. + * + * Copyright (C) 2022 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) "rxperf: " fmt +#include +#include +#include +#include + +MODULE_DESCRIPTION("rxperf test server (afs)"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +#define RXPERF_PORT 7009 +#define RX_PERF_SERVICE 147 +#define RX_PERF_VERSION 3 +#define RX_PERF_SEND 0 +#define RX_PERF_RECV 1 +#define RX_PERF_RPC 3 +#define RX_PERF_FILE 4 +#define RX_PERF_MAGIC_COOKIE 0x4711 + +struct rxperf_proto_params { + __be32 version; + __be32 type; + __be32 rsize; + __be32 wsize; +} __packed; + +static const u8 rxperf_magic_cookie[] = { 0x00, 0x00, 0x47, 0x11 }; +static const u8 secret[8] = { 0xa7, 0x83, 0x8a, 0xcb, 0xc7, 0x83, 0xec, 0x94 }; + +enum rxperf_call_state { + RXPERF_CALL_SV_AWAIT_PARAMS, /* Server: Awaiting parameter block */ + RXPERF_CALL_SV_AWAIT_REQUEST, /* Server: Awaiting request data */ + RXPERF_CALL_SV_REPLYING, /* Server: Replying */ + RXPERF_CALL_SV_AWAIT_ACK, /* Server: Awaiting final ACK */ + RXPERF_CALL_COMPLETE, /* Completed or failed */ +}; + +struct rxperf_call { + struct rxrpc_call *rxcall; + struct iov_iter iter; + struct kvec kvec[1]; + struct work_struct work; + const char *type; + size_t iov_len; + size_t req_len; /* Size of request blob */ + size_t reply_len; /* Size of reply blob */ + unsigned int debug_id; + unsigned int operation_id; + struct rxperf_proto_params params; + __be32 tmp[2]; + s32 abort_code; + enum rxperf_call_state state; + short error; + unsigned short unmarshal; + u16 service_id; + int (*deliver)(struct rxperf_call *call); + void (*processor)(struct work_struct *work); +}; + +static struct socket *rxperf_socket; +static struct key *rxperf_sec_keyring; /* Ring of security/crypto keys */ +static struct workqueue_struct *rxperf_workqueue; + +static void rxperf_deliver_to_call(struct work_struct *work); +static int rxperf_deliver_param_block(struct rxperf_call *call); +static int rxperf_deliver_request(struct rxperf_call *call); +static int rxperf_process_call(struct rxperf_call *call); +static void rxperf_charge_preallocation(struct work_struct *work); + +static DECLARE_WORK(rxperf_charge_preallocation_work, + rxperf_charge_preallocation); + +static inline void rxperf_set_call_state(struct rxperf_call *call, + enum rxperf_call_state to) +{ + call->state = to; +} + +static inline void rxperf_set_call_complete(struct rxperf_call *call, + int error, s32 remote_abort) +{ + if (call->state != RXPERF_CALL_COMPLETE) { + call->abort_code = remote_abort; + call->error = error; + call->state = RXPERF_CALL_COMPLETE; + } +} + +static void rxperf_rx_discard_new_call(struct rxrpc_call *rxcall, + unsigned long user_call_ID) +{ + kfree((struct rxperf_call *)user_call_ID); +} + +static void rxperf_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long user_call_ID) +{ + queue_work(rxperf_workqueue, &rxperf_charge_preallocation_work); +} + +static void rxperf_queue_call_work(struct rxperf_call *call) +{ + queue_work(rxperf_workqueue, &call->work); +} + +static void rxperf_notify_rx(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long call_user_ID) +{ + struct rxperf_call *call = (struct rxperf_call *)call_user_ID; + + if (call->state != RXPERF_CALL_COMPLETE) + rxperf_queue_call_work(call); +} + +static void rxperf_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID) +{ + struct rxperf_call *call = (struct rxperf_call *)user_call_ID; + + call->rxcall = rxcall; +} + +static void rxperf_notify_end_reply_tx(struct sock *sock, + struct rxrpc_call *rxcall, + unsigned long call_user_ID) +{ + rxperf_set_call_state((struct rxperf_call *)call_user_ID, + RXPERF_CALL_SV_AWAIT_ACK); +} + +/* + * Charge the incoming call preallocation. + */ +static void rxperf_charge_preallocation(struct work_struct *work) +{ + struct rxperf_call *call; + + for (;;) { + call = kzalloc(sizeof(*call), GFP_KERNEL); + if (!call) + break; + + call->type = "unset"; + call->debug_id = atomic_inc_return(&rxrpc_debug_id); + call->deliver = rxperf_deliver_param_block; + call->state = RXPERF_CALL_SV_AWAIT_PARAMS; + call->service_id = RX_PERF_SERVICE; + call->iov_len = sizeof(call->params); + call->kvec[0].iov_len = sizeof(call->params); + call->kvec[0].iov_base = &call->params; + iov_iter_kvec(&call->iter, READ, call->kvec, 1, call->iov_len); + INIT_WORK(&call->work, rxperf_deliver_to_call); + + if (rxrpc_kernel_charge_accept(rxperf_socket, + rxperf_notify_rx, + rxperf_rx_attach, + (unsigned long)call, + GFP_KERNEL, + call->debug_id) < 0) + break; + call = NULL; + } + + kfree(call); +} + +/* + * Open an rxrpc socket and bind it to be a server for callback notifications + * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT + */ +static int rxperf_open_socket(void) +{ + struct sockaddr_rxrpc srx; + struct socket *socket; + int ret; + + ret = sock_create_kern(&init_net, AF_RXRPC, SOCK_DGRAM, PF_INET6, + &socket); + if (ret < 0) + goto error_1; + + socket->sk->sk_allocation = GFP_NOFS; + + /* bind the callback manager's address to make this a server socket */ + memset(&srx, 0, sizeof(srx)); + srx.srx_family = AF_RXRPC; + srx.srx_service = RX_PERF_SERVICE; + srx.transport_type = SOCK_DGRAM; + srx.transport_len = sizeof(srx.transport.sin6); + srx.transport.sin6.sin6_family = AF_INET6; + srx.transport.sin6.sin6_port = htons(RXPERF_PORT); + + ret = rxrpc_sock_set_min_security_level(socket->sk, + RXRPC_SECURITY_ENCRYPT); + if (ret < 0) + goto error_2; + + ret = rxrpc_sock_set_security_keyring(socket->sk, rxperf_sec_keyring); + + ret = kernel_bind(socket, (struct sockaddr *)&srx, sizeof(srx)); + if (ret < 0) + goto error_2; + + rxrpc_kernel_new_call_notification(socket, rxperf_rx_new_call, + rxperf_rx_discard_new_call); + + ret = kernel_listen(socket, INT_MAX); + if (ret < 0) + goto error_2; + + rxperf_socket = socket; + rxperf_charge_preallocation(&rxperf_charge_preallocation_work); + return 0; + +error_2: + sock_release(socket); +error_1: + pr_err("Can't set up rxperf socket: %d\n", ret); + return ret; +} + +/* + * close the rxrpc socket rxperf was using + */ +static void rxperf_close_socket(void) +{ + kernel_listen(rxperf_socket, 0); + kernel_sock_shutdown(rxperf_socket, SHUT_RDWR); + flush_workqueue(rxperf_workqueue); + sock_release(rxperf_socket); +} + +/* + * Log remote abort codes that indicate that we have a protocol disagreement + * with the server. + */ +static void rxperf_log_error(struct rxperf_call *call, s32 remote_abort) +{ + static int max = 0; + const char *msg; + int m; + + switch (remote_abort) { + case RX_EOF: msg = "unexpected EOF"; break; + case RXGEN_CC_MARSHAL: msg = "client marshalling"; break; + case RXGEN_CC_UNMARSHAL: msg = "client unmarshalling"; break; + case RXGEN_SS_MARSHAL: msg = "server marshalling"; break; + case RXGEN_SS_UNMARSHAL: msg = "server unmarshalling"; break; + case RXGEN_DECODE: msg = "opcode decode"; break; + case RXGEN_SS_XDRFREE: msg = "server XDR cleanup"; break; + case RXGEN_CC_XDRFREE: msg = "client XDR cleanup"; break; + case -32: msg = "insufficient data"; break; + default: + return; + } + + m = max; + if (m < 3) { + max = m + 1; + pr_info("Peer reported %s failure on %s\n", msg, call->type); + } +} + +/* + * deliver messages to a call + */ +static void rxperf_deliver_to_call(struct work_struct *work) +{ + struct rxperf_call *call = container_of(work, struct rxperf_call, work); + enum rxperf_call_state state; + u32 abort_code, remote_abort = 0; + int ret; + + if (call->state == RXPERF_CALL_COMPLETE) + return; + + while (state = call->state, + state == RXPERF_CALL_SV_AWAIT_PARAMS || + state == RXPERF_CALL_SV_AWAIT_REQUEST || + state == RXPERF_CALL_SV_AWAIT_ACK + ) { + if (state == RXPERF_CALL_SV_AWAIT_ACK) { + if (!rxrpc_kernel_check_life(rxperf_socket, call->rxcall)) + goto call_complete; + return; + } + + ret = call->deliver(call); + if (ret == 0) + ret = rxperf_process_call(call); + + switch (ret) { + case 0: + continue; + case -EINPROGRESS: + case -EAGAIN: + return; + case -ECONNABORTED: + rxperf_log_error(call, call->abort_code); + goto call_complete; + case -EOPNOTSUPP: + abort_code = RXGEN_OPCODE; + rxrpc_kernel_abort_call(rxperf_socket, call->rxcall, + abort_code, ret, "GOP"); + goto call_complete; + case -ENOTSUPP: + abort_code = RX_USER_ABORT; + rxrpc_kernel_abort_call(rxperf_socket, call->rxcall, + abort_code, ret, "GUA"); + goto call_complete; + case -EIO: + pr_err("Call %u in bad state %u\n", + call->debug_id, call->state); + fallthrough; + case -ENODATA: + case -EBADMSG: + case -EMSGSIZE: + case -ENOMEM: + case -EFAULT: + rxrpc_kernel_abort_call(rxperf_socket, call->rxcall, + RXGEN_SS_UNMARSHAL, ret, "GUM"); + goto call_complete; + default: + rxrpc_kernel_abort_call(rxperf_socket, call->rxcall, + RX_CALL_DEAD, ret, "GER"); + goto call_complete; + } + } + +call_complete: + rxperf_set_call_complete(call, ret, remote_abort); + /* The call may have been requeued */ + rxrpc_kernel_end_call(rxperf_socket, call->rxcall); + cancel_work(&call->work); + kfree(call); +} + +/* + * Extract a piece of data from the received data socket buffers. + */ +static int rxperf_extract_data(struct rxperf_call *call, bool want_more) +{ + u32 remote_abort = 0; + int ret; + + ret = rxrpc_kernel_recv_data(rxperf_socket, call->rxcall, &call->iter, + &call->iov_len, want_more, &remote_abort, + &call->service_id); + pr_debug("Extract i=%zu l=%zu m=%u ret=%d\n", + iov_iter_count(&call->iter), call->iov_len, want_more, ret); + if (ret == 0 || ret == -EAGAIN) + return ret; + + if (ret == 1) { + switch (call->state) { + case RXPERF_CALL_SV_AWAIT_REQUEST: + rxperf_set_call_state(call, RXPERF_CALL_SV_REPLYING); + break; + case RXPERF_CALL_COMPLETE: + pr_debug("premature completion %d", call->error); + return call->error; + default: + break; + } + return 0; + } + + rxperf_set_call_complete(call, ret, remote_abort); + return ret; +} + +/* + * Grab the operation ID from an incoming manager call. + */ +static int rxperf_deliver_param_block(struct rxperf_call *call) +{ + u32 version; + int ret; + + /* Extract the parameter block */ + ret = rxperf_extract_data(call, true); + if (ret < 0) + return ret; + + version = ntohl(call->params.version); + call->operation_id = ntohl(call->params.type); + call->deliver = rxperf_deliver_request; + + if (version != RX_PERF_VERSION) { + pr_info("Version mismatch %x\n", version); + return -ENOTSUPP; + } + + switch (call->operation_id) { + case RX_PERF_SEND: + call->type = "send"; + call->reply_len = 0; + call->iov_len = 4; /* Expect req size */ + break; + case RX_PERF_RECV: + call->type = "recv"; + call->req_len = 0; + call->iov_len = 4; /* Expect reply size */ + break; + case RX_PERF_RPC: + call->type = "rpc"; + call->iov_len = 8; /* Expect req size and reply size */ + break; + case RX_PERF_FILE: + call->type = "file"; + fallthrough; + default: + return -EOPNOTSUPP; + } + + rxperf_set_call_state(call, RXPERF_CALL_SV_AWAIT_REQUEST); + return call->deliver(call); +} + +/* + * Deliver the request data. + */ +static int rxperf_deliver_request(struct rxperf_call *call) +{ + int ret; + + switch (call->unmarshal) { + case 0: + call->kvec[0].iov_len = call->iov_len; + call->kvec[0].iov_base = call->tmp; + iov_iter_kvec(&call->iter, READ, call->kvec, 1, call->iov_len); + call->unmarshal++; + fallthrough; + case 1: + ret = rxperf_extract_data(call, true); + if (ret < 0) + return ret; + + switch (call->operation_id) { + case RX_PERF_SEND: + call->type = "send"; + call->req_len = ntohl(call->tmp[0]); + call->reply_len = 0; + break; + case RX_PERF_RECV: + call->type = "recv"; + call->req_len = 0; + call->reply_len = ntohl(call->tmp[0]); + break; + case RX_PERF_RPC: + call->type = "rpc"; + call->req_len = ntohl(call->tmp[0]); + call->reply_len = ntohl(call->tmp[1]); + break; + default: + pr_info("Can't parse extra params\n"); + return -EIO; + } + + pr_debug("CALL op=%s rq=%zx rp=%zx\n", + call->type, call->req_len, call->reply_len); + + call->iov_len = call->req_len; + iov_iter_discard(&call->iter, READ, call->req_len); + call->unmarshal++; + fallthrough; + case 2: + ret = rxperf_extract_data(call, false); + if (ret < 0) + return ret; + call->unmarshal++; + fallthrough; + default: + return 0; + } +} + +/* + * Process a call for which we've received the request. + */ +static int rxperf_process_call(struct rxperf_call *call) +{ + struct msghdr msg = {}; + struct bio_vec bv[1]; + struct kvec iov[1]; + ssize_t n; + size_t reply_len = call->reply_len, len; + + rxrpc_kernel_set_tx_length(rxperf_socket, call->rxcall, + reply_len + sizeof(rxperf_magic_cookie)); + + while (reply_len > 0) { + len = min_t(size_t, reply_len, PAGE_SIZE); + bv[0].bv_page = ZERO_PAGE(0); + bv[0].bv_offset = 0; + bv[0].bv_len = len; + iov_iter_bvec(&msg.msg_iter, WRITE, bv, 1, len); + msg.msg_flags = MSG_MORE; + n = rxrpc_kernel_send_data(rxperf_socket, call->rxcall, &msg, + len, rxperf_notify_end_reply_tx); + if (n < 0) + return n; + if (n == 0) + return -EIO; + reply_len -= n; + } + + len = sizeof(rxperf_magic_cookie); + iov[0].iov_base = (void *)rxperf_magic_cookie; + iov[0].iov_len = len; + iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len); + msg.msg_flags = 0; + n = rxrpc_kernel_send_data(rxperf_socket, call->rxcall, &msg, len, + rxperf_notify_end_reply_tx); + if (n >= 0) + return 0; /* Success */ + + if (n == -ENOMEM) + rxrpc_kernel_abort_call(rxperf_socket, call->rxcall, + RXGEN_SS_MARSHAL, -ENOMEM, "GOM"); + return n; +} + +/* + * Add a key to the security keyring. + */ +static int rxperf_add_key(struct key *keyring) +{ + key_ref_t kref; + int ret; + + kref = key_create_or_update(make_key_ref(keyring, true), + "rxrpc_s", + __stringify(RX_PERF_SERVICE) ":2", + secret, + sizeof(secret), + KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH + | KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA); + + if (IS_ERR(kref)) { + pr_err("Can't allocate rxperf server key: %ld\n", PTR_ERR(kref)); + return PTR_ERR(kref); + } + + ret = key_link(keyring, key_ref_to_ptr(kref)); + if (ret < 0) + pr_err("Can't link rxperf server key: %d\n", ret); + key_ref_put(kref); + return ret; +} + +/* + * Initialise the rxperf server. + */ +static int __init rxperf_init(void) +{ + struct key *keyring; + int ret = -ENOMEM; + + pr_info("Server registering\n"); + + rxperf_workqueue = alloc_workqueue("rxperf", 0, 0); + if (!rxperf_workqueue) + goto error_workqueue; + + keyring = keyring_alloc("rxperf_server", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), + KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | + KEY_POS_WRITE | + KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH | + KEY_USR_WRITE | + KEY_OTH_VIEW | KEY_OTH_READ | KEY_OTH_SEARCH, + KEY_ALLOC_NOT_IN_QUOTA, + NULL, NULL); + if (IS_ERR(keyring)) { + pr_err("Can't allocate rxperf server keyring: %ld\n", + PTR_ERR(keyring)); + goto error_keyring; + } + rxperf_sec_keyring = keyring; + ret = rxperf_add_key(keyring); + if (ret < 0) + goto error_key; + + ret = rxperf_open_socket(); + if (ret < 0) + goto error_socket; + return 0; + +error_socket: +error_key: + key_put(rxperf_sec_keyring); +error_keyring: + destroy_workqueue(rxperf_workqueue); + rcu_barrier(); +error_workqueue: + pr_err("Failed to register: %d\n", ret); + return ret; +} +late_initcall(rxperf_init); /* Must be called after net/ to create socket */ + +static void __exit rxperf_exit(void) +{ + pr_info("Server unregistering.\n"); + + rxperf_close_socket(); + key_put(rxperf_sec_keyring); + destroy_workqueue(rxperf_workqueue); + rcu_barrier(); +} +module_exit(rxperf_exit); + diff --git a/net/rxrpc/server_key.c b/net/rxrpc/server_key.c index ee269e0e6ee8..e51940589ee5 100644 --- a/net/rxrpc/server_key.c +++ b/net/rxrpc/server_key.c @@ -144,3 +144,28 @@ int rxrpc_server_keyring(struct rxrpc_sock *rx, sockptr_t optval, int optlen) _leave(" = 0 [key %x]", key->serial); return 0; } + +/** + * rxrpc_sock_set_security_keyring - Set the security keyring for a kernel service + * @sk: The socket to set the keyring on + * @keyring: The keyring to set + * + * Set the server security keyring on an rxrpc socket. This is used to provide + * the encryption keys for a kernel service. + */ +int rxrpc_sock_set_security_keyring(struct sock *sk, struct key *keyring) +{ + struct rxrpc_sock *rx = rxrpc_sk(sk); + int ret = 0; + + lock_sock(sk); + if (rx->securities) + ret = -EINVAL; + else if (rx->sk.sk_state != RXRPC_UNBOUND) + ret = -EISCONN; + else + rx->securities = key_get(keyring); + release_sock(sk); + return ret; +} +EXPORT_SYMBOL(rxrpc_sock_set_security_keyring); -- cgit v1.2.3-70-g09d2 From 49df54a6b2953195243d037682cffb9038f9456a Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 28 Nov 2022 09:24:06 +0000 Subject: rxrpc: Fix call leak When retransmitting a packet, rxrpc_resend() shouldn't be attaching a ref to the call to the txbuf as that pins the call and prevents the call from clearing the packet buffer. Signed-off-by: David Howells Fixes: d57a3a151660 ("rxrpc: Save last ACK's SACK table rather than marking txbufs") cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/call_event.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 1e21a708390e..349f3df569ba 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -198,7 +198,6 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) if (list_empty(&txb->tx_link)) { rxrpc_get_txbuf(txb, rxrpc_txbuf_get_retrans); - rxrpc_get_call(call, rxrpc_call_got_tx); list_add_tail(&txb->tx_link, &retrans_queue); set_bit(RXRPC_TXBUF_RESENT, &txb->flags); } -- cgit v1.2.3-70-g09d2 From 30efa3ce109d9e852a1a7bb9be19a414e633b1f0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Nov 2022 09:41:34 +0000 Subject: rxrpc: Remove handling of duplicate packets in recvmsg_queue We should not now see duplicate packets in the recvmsg_queue. At one point, jumbo packets that overlapped with already queued data would be added to the queue and dealt with in recvmsg rather than in the softirq input code, but now jumbo packets are split/cloned before being processed by the input code and the subpackets can be discarded individually. So remove the recvmsg-side code for handling this. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/recvmsg.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'net') diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index efb85f983657..134122f5961a 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -228,7 +228,6 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) _enter("%d", call->debug_id); -further_rotation: skb = skb_dequeue(&call->recvmsg_queue); rxrpc_see_skb(skb, rxrpc_skb_rotated); @@ -250,17 +249,6 @@ further_rotation: return; } - /* The next packet on the queue might entirely overlap with the one we - * just consumed; if so, rotate that away also. - */ - skb = skb_peek(&call->recvmsg_queue); - if (skb) { - sp = rxrpc_skb(skb); - if (sp->hdr.seq != call->rx_consumed && - after_eq(call->rx_consumed, sp->hdr.seq)) - goto further_rotation; - } - /* Check to see if there's an ACK that needs sending. */ acked = atomic_add_return(call->rx_consumed - old_consumed, &call->ackr_nr_consumed); @@ -318,11 +306,6 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, sp = rxrpc_skb(skb); seq = sp->hdr.seq; - if (after_eq(call->rx_consumed, seq)) { - kdebug("obsolete %x %x", call->rx_consumed, seq); - goto skip_obsolete; - } - if (!(flags & MSG_PEEK)) trace_rxrpc_receive(call, rxrpc_receive_front, sp->hdr.serial, seq); @@ -373,7 +356,6 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, break; } - skip_obsolete: /* The whole packet has been transferred. */ if (sp->hdr.flags & RXRPC_LAST_PACKET) ret = 1; -- cgit v1.2.3-70-g09d2 From 2ebdb26e6abd2a773ab5f009ac38a6de973a2bcf Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Oct 2022 11:51:06 +0100 Subject: rxrpc: Remove the [k_]proto() debugging macros Remove the kproto() and _proto() debugging macros in preference to using tracepoints for this. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 60 ++++++++++++++++++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 10 -------- net/rxrpc/conn_event.c | 4 --- net/rxrpc/input.c | 17 ------------- net/rxrpc/local_event.c | 3 --- net/rxrpc/output.c | 2 -- net/rxrpc/peer_event.c | 4 --- net/rxrpc/rxkad.c | 9 +++---- 8 files changed, 63 insertions(+), 46 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index b9886d1df825..2b77f9a75bf7 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -733,6 +733,66 @@ TRACE_EVENT(rxrpc_rx_abort, __entry->abort_code) ); +TRACE_EVENT(rxrpc_rx_challenge, + TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, + u32 version, u32 nonce, u32 min_level), + + TP_ARGS(conn, serial, version, nonce, min_level), + + TP_STRUCT__entry( + __field(unsigned int, conn ) + __field(rxrpc_serial_t, serial ) + __field(u32, version ) + __field(u32, nonce ) + __field(u32, min_level ) + ), + + TP_fast_assign( + __entry->conn = conn->debug_id; + __entry->serial = serial; + __entry->version = version; + __entry->nonce = nonce; + __entry->min_level = min_level; + ), + + TP_printk("C=%08x CHALLENGE %08x v=%x n=%x ml=%x", + __entry->conn, + __entry->serial, + __entry->version, + __entry->nonce, + __entry->min_level) + ); + +TRACE_EVENT(rxrpc_rx_response, + TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t serial, + u32 version, u32 kvno, u32 ticket_len), + + TP_ARGS(conn, serial, version, kvno, ticket_len), + + TP_STRUCT__entry( + __field(unsigned int, conn ) + __field(rxrpc_serial_t, serial ) + __field(u32, version ) + __field(u32, kvno ) + __field(u32, ticket_len ) + ), + + TP_fast_assign( + __entry->conn = conn->debug_id; + __entry->serial = serial; + __entry->version = version; + __entry->kvno = kvno; + __entry->ticket_len = ticket_len; + ), + + TP_printk("C=%08x RESPONSE %08x v=%x kvno=%x tl=%x", + __entry->conn, + __entry->serial, + __entry->version, + __entry->kvno, + __entry->ticket_len) + ); + TRACE_EVENT(rxrpc_rx_rwind_change, TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t serial, u32 rwind, bool wake), diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index f5c538ce3e23..a3a29390e12b 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -1190,7 +1190,6 @@ extern unsigned int rxrpc_debug; #define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) #define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) #define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__) -#define kproto(FMT,...) dbgprintk("### "FMT ,##__VA_ARGS__) #define knet(FMT,...) dbgprintk("@@@ "FMT ,##__VA_ARGS__) @@ -1198,14 +1197,12 @@ extern unsigned int rxrpc_debug; #define _enter(FMT,...) kenter(FMT,##__VA_ARGS__) #define _leave(FMT,...) kleave(FMT,##__VA_ARGS__) #define _debug(FMT,...) kdebug(FMT,##__VA_ARGS__) -#define _proto(FMT,...) kproto(FMT,##__VA_ARGS__) #define _net(FMT,...) knet(FMT,##__VA_ARGS__) #elif defined(CONFIG_AF_RXRPC_DEBUG) #define RXRPC_DEBUG_KENTER 0x01 #define RXRPC_DEBUG_KLEAVE 0x02 #define RXRPC_DEBUG_KDEBUG 0x04 -#define RXRPC_DEBUG_KPROTO 0x08 #define RXRPC_DEBUG_KNET 0x10 #define _enter(FMT,...) \ @@ -1226,12 +1223,6 @@ do { \ kdebug(FMT,##__VA_ARGS__); \ } while (0) -#define _proto(FMT,...) \ -do { \ - if (unlikely(rxrpc_debug & RXRPC_DEBUG_KPROTO)) \ - kproto(FMT,##__VA_ARGS__); \ -} while (0) - #define _net(FMT,...) \ do { \ if (unlikely(rxrpc_debug & RXRPC_DEBUG_KNET)) \ @@ -1242,7 +1233,6 @@ do { \ #define _enter(FMT,...) no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__) #define _leave(FMT,...) no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) #define _debug(FMT,...) no_printk(" "FMT ,##__VA_ARGS__) -#define _proto(FMT,...) no_printk("### "FMT ,##__VA_ARGS__) #define _net(FMT,...) no_printk("@@@ "FMT ,##__VA_ARGS__) #endif diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index aab069701398..d5549cbfc71b 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -122,14 +122,12 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, switch (chan->last_type) { case RXRPC_PACKET_TYPE_ABORT: - _proto("Tx ABORT %%%u { %d } [re]", serial, conn->abort_code); break; case RXRPC_PACKET_TYPE_ACK: trace_rxrpc_tx_ack(chan->call_debug_id, serial, ntohl(pkt.ack.firstPacket), ntohl(pkt.ack.serial), pkt.ack.reason, 0); - _proto("Tx ACK %%%u [re]", serial); break; } @@ -242,7 +240,6 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, serial = atomic_inc_return(&conn->serial); rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, serial); whdr.serial = htonl(serial); - _proto("Tx CONN ABORT %%%u { %d }", serial, conn->abort_code); ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); if (ret < 0) { @@ -315,7 +312,6 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, return -EPROTO; } abort_code = ntohl(wtmp); - _proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code); conn->error = -ECONNABORTED; conn->abort_code = abort_code; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index bdf70b81addc..646ee61af40e 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -551,9 +551,6 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) atomic64_read(&call->ackr_window), call->rx_highest_seq, skb->len, seq0); - _proto("Rx DATA %%%u { #%u f=%02x }", - sp->hdr.serial, seq0, sp->hdr.flags); - state = READ_ONCE(call->state); if (state >= RXRPC_CALL_COMPLETE) { rxrpc_free_skb(skb, rxrpc_skb_freed); @@ -708,11 +705,6 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, bool wake = false; u32 rwind = ntohl(ackinfo->rwind); - _proto("Rx ACK %%%u Info { rx=%u max=%u rwin=%u jm=%u }", - sp->hdr.serial, - ntohl(ackinfo->rxMTU), ntohl(ackinfo->maxMTU), - rwind, ntohl(ackinfo->jumbo_max)); - if (rwind > RXRPC_TX_MAX_WINDOW) rwind = RXRPC_TX_MAX_WINDOW; if (call->tx_winsize != rwind) { @@ -855,7 +847,6 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) } if (ack.reason == RXRPC_ACK_PING) { - _proto("Rx ACK %%%u PING Request", ack_serial); rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, ack_serial, rxrpc_propose_ack_respond_to_ping); } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) { @@ -1014,9 +1005,6 @@ out_not_locked: static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_ack_summary summary = { 0 }; - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - _proto("Rx ACKALL %%%u", sp->hdr.serial); spin_lock(&call->input_lock); @@ -1044,8 +1032,6 @@ static void rxrpc_input_abort(struct rxrpc_call *call, struct sk_buff *skb) trace_rxrpc_rx_abort(call, sp->hdr.serial, abort_code); - _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code); - rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, abort_code, -ECONNABORTED); } @@ -1081,8 +1067,6 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call, goto no_free; case RXRPC_PACKET_TYPE_BUSY: - _proto("Rx BUSY %%%u", sp->hdr.serial); - /* Just ignore BUSY packets from the server; the retry and * lifespan timers will take care of business. BUSY packets * from the client don't make sense. @@ -1325,7 +1309,6 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) goto discard; default: - _proto("Rx Bad Packet Type %u", sp->hdr.type); goto bad_message; } diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index 19e929c7c38b..f23a3fbabbda 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -63,8 +63,6 @@ static void rxrpc_send_version_request(struct rxrpc_local *local, len = iov[0].iov_len + iov[1].iov_len; - _proto("Tx VERSION (reply)"); - ret = kernel_sendmsg(local->socket, &msg, iov, 2, len); if (ret < 0) trace_rxrpc_tx_fail(local->debug_id, 0, ret, @@ -98,7 +96,6 @@ void rxrpc_process_local_events(struct rxrpc_local *local) if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), &v, 1) < 0) return; - _proto("Rx VERSION { %02x }", v); if (v == 0) rxrpc_send_version_request(local, &sp->hdr, skb); break; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index c5eed0e83e47..635acf3dbd77 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -701,8 +701,6 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer) len = iov[0].iov_len + iov[1].iov_len; - _proto("Tx VERSION (keepalive)"); - iov_iter_kvec(&msg.msg_iter, WRITE, iov, 2, len); ret = do_udp_sendmsg(peer->local->socket, &msg, len); if (ret < 0) diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index cda3890657a9..be781c156e89 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -253,15 +253,12 @@ static void rxrpc_store_error(struct rxrpc_peer *peer, break; default: - _proto("Rx Received ICMP error { type=%u code=%u }", - ee->ee_type, ee->ee_code); break; } break; case SO_EE_ORIGIN_NONE: case SO_EE_ORIGIN_LOCAL: - _proto("Rx Received local error { error=%d }", err); compl = RXRPC_CALL_LOCAL_ERROR; break; @@ -270,7 +267,6 @@ static void rxrpc_store_error(struct rxrpc_peer *peer, err = EHOSTUNREACH; fallthrough; default: - _proto("Rx Received error report { orig=%u }", ee->ee_origin); break; } diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 110a5550c0a6..36cf40442a7e 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -704,7 +704,6 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) serial = atomic_inc_return(&conn->serial); whdr.serial = htonl(serial); - _proto("Tx CHALLENGE %%%u", serial); ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); if (ret < 0) { @@ -762,7 +761,6 @@ static int rxkad_send_response(struct rxrpc_connection *conn, serial = atomic_inc_return(&conn->serial); whdr.serial = htonl(serial); - _proto("Tx RESPONSE %%%u", serial); ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 3, len); if (ret < 0) { @@ -856,8 +854,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, nonce = ntohl(challenge.nonce); min_level = ntohl(challenge.min_level); - _proto("Rx CHALLENGE %%%u { v=%u n=%u ml=%u }", - sp->hdr.serial, version, nonce, min_level); + trace_rxrpc_rx_challenge(conn, sp->hdr.serial, version, nonce, min_level); eproto = tracepoint_string("chall_ver"); abort_code = RXKADINCONSISTENCY; @@ -1139,8 +1136,8 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, version = ntohl(response->version); ticket_len = ntohl(response->ticket_len); kvno = ntohl(response->kvno); - _proto("Rx RESPONSE %%%u { v=%u kv=%u tl=%u }", - sp->hdr.serial, version, kvno, ticket_len); + + trace_rxrpc_rx_response(conn, sp->hdr.serial, version, kvno, ticket_len); eproto = tracepoint_string("rxkad_rsp_ver"); abort_code = RXKADINCONSISTENCY; -- cgit v1.2.3-70-g09d2 From e969c92ce597baf6aeff3f619d6c082d736575e0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Oct 2022 12:04:20 +0100 Subject: rxrpc: Remove the [_k]net() debugging macros Remove the _net() and knet() debugging macros in favour of tracepoints. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 10 ---------- net/rxrpc/call_object.c | 6 ------ net/rxrpc/conn_client.c | 2 -- net/rxrpc/conn_object.c | 2 -- net/rxrpc/conn_service.c | 2 -- net/rxrpc/input.c | 1 - net/rxrpc/local_object.c | 8 -------- net/rxrpc/peer_event.c | 48 ++---------------------------------------------- net/rxrpc/peer_object.c | 6 +----- 9 files changed, 3 insertions(+), 82 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index a3a29390e12b..36ece1efb1d4 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -1190,20 +1190,17 @@ extern unsigned int rxrpc_debug; #define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) #define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) #define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__) -#define knet(FMT,...) dbgprintk("@@@ "FMT ,##__VA_ARGS__) #if defined(__KDEBUG) #define _enter(FMT,...) kenter(FMT,##__VA_ARGS__) #define _leave(FMT,...) kleave(FMT,##__VA_ARGS__) #define _debug(FMT,...) kdebug(FMT,##__VA_ARGS__) -#define _net(FMT,...) knet(FMT,##__VA_ARGS__) #elif defined(CONFIG_AF_RXRPC_DEBUG) #define RXRPC_DEBUG_KENTER 0x01 #define RXRPC_DEBUG_KLEAVE 0x02 #define RXRPC_DEBUG_KDEBUG 0x04 -#define RXRPC_DEBUG_KNET 0x10 #define _enter(FMT,...) \ do { \ @@ -1223,17 +1220,10 @@ do { \ kdebug(FMT,##__VA_ARGS__); \ } while (0) -#define _net(FMT,...) \ -do { \ - if (unlikely(rxrpc_debug & RXRPC_DEBUG_KNET)) \ - knet(FMT,##__VA_ARGS__); \ -} while (0) - #else #define _enter(FMT,...) no_printk("==> %s("FMT")",__func__ ,##__VA_ARGS__) #define _leave(FMT,...) no_printk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) #define _debug(FMT,...) no_printk(" "FMT ,##__VA_ARGS__) -#define _net(FMT,...) no_printk("@@@ "FMT ,##__VA_ARGS__) #endif /* diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 1befe22cd301..e36a317b2e9a 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -349,8 +349,6 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, rxrpc_start_call_timer(call); - _net("CALL new %d on CONN %d", call->debug_id, call->conn->debug_id); - _leave(" = %p [new]", call); return call; @@ -423,8 +421,6 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, hlist_add_head_rcu(&call->error_link, &conn->params.peer->error_targets); spin_unlock(&conn->params.peer->lock); - _net("CALL incoming %d on CONN %d", call->debug_id, call->conn->debug_id); - rxrpc_start_call_timer(call); _leave(""); } @@ -669,8 +665,6 @@ void rxrpc_cleanup_call(struct rxrpc_call *call) { struct rxrpc_txbuf *txb; - _net("DESTROY CALL %d", call->debug_id); - memset(&call->sock_node, 0xcd, sizeof(call->sock_node)); ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index f11c97e28d2a..2b76fbffd4dd 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -541,8 +541,6 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, call->service_id = conn->service_id; trace_rxrpc_connect_call(call); - _net("CONNECT call %08x:%08x as call %d on conn %d", - call->cid, call->call_id, call->debug_id, conn->debug_id); write_lock_bh(&call->state_lock); call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 156bd26daf74..d5d15389406f 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -356,8 +356,6 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu) ASSERTCMP(refcount_read(&conn->ref), ==, 0); - _net("DESTROY CONN %d", conn->debug_id); - del_timer_sync(&conn->timer); rxrpc_purge_queue(&conn->rx_queue); diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index 6e6aa02c6f9e..75f903099eb0 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -184,8 +184,6 @@ void rxrpc_new_incoming_connection(struct rxrpc_sock *rx, /* Make the connection a target for incoming packets. */ rxrpc_publish_service_conn(conn->params.peer, conn); - - _net("CONNECTION new %d {%x}", conn->debug_id, conn->proto.cid); } /* diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 646ee61af40e..e2461f29d765 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -725,7 +725,6 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, peer->maxdata = mtu; peer->mtu = mtu + peer->hdrsize; spin_unlock_bh(&peer->lock); - _net("Net MTU %u (maxdata %u)", peer->mtu, peer->maxdata); } if (wake) diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index a943fdf91e24..11080c335d42 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -198,7 +198,6 @@ struct rxrpc_local *rxrpc_lookup_local(struct net *net, struct rxrpc_local *local; struct rxrpc_net *rxnet = rxrpc_net(net); struct hlist_node *cursor; - const char *age; long diff; int ret; @@ -232,7 +231,6 @@ struct rxrpc_local *rxrpc_lookup_local(struct net *net, if (!rxrpc_use_local(local)) break; - age = "old"; goto found; } @@ -250,14 +248,9 @@ struct rxrpc_local *rxrpc_lookup_local(struct net *net, } else { hlist_add_head_rcu(&local->link, &rxnet->local_endpoints); } - age = "new"; found: mutex_unlock(&rxnet->local_mutex); - - _net("LOCAL %s %d {%pISp}", - age, local->debug_id, &local->srx.transport); - _leave(" = %p", local); return local; @@ -467,7 +460,6 @@ static void rxrpc_local_rcu(struct rcu_head *rcu) ASSERT(!work_pending(&local->processor)); - _net("DESTROY LOCAL %d", local->debug_id); kfree(local); _leave(""); } diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index be781c156e89..ad4d1769e02b 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -48,13 +48,11 @@ static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local, srx->transport.sin.sin_port = serr->port; switch (serr->ee.ee_origin) { case SO_EE_ORIGIN_ICMP: - _net("Rx ICMP"); memcpy(&srx->transport.sin.sin_addr, skb_network_header(skb) + serr->addr_offset, sizeof(struct in_addr)); break; case SO_EE_ORIGIN_ICMP6: - _net("Rx ICMP6 on v4 sock"); memcpy(&srx->transport.sin.sin_addr, skb_network_header(skb) + serr->addr_offset + 12, sizeof(struct in_addr)); @@ -70,14 +68,12 @@ static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local, case AF_INET6: switch (serr->ee.ee_origin) { case SO_EE_ORIGIN_ICMP6: - _net("Rx ICMP6"); srx->transport.sin6.sin6_port = serr->port; memcpy(&srx->transport.sin6.sin6_addr, skb_network_header(skb) + serr->addr_offset, sizeof(struct in6_addr)); break; case SO_EE_ORIGIN_ICMP: - _net("Rx ICMP on v6 sock"); srx->transport_len = sizeof(srx->transport.sin); srx->transport.family = AF_INET; srx->transport.sin.sin_port = serr->port; @@ -106,13 +102,9 @@ static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local, */ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) { - _net("Rx ICMP Fragmentation Needed (%d)", mtu); - /* wind down the local interface MTU */ - if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu) { + if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu) peer->if_mtu = mtu; - _net("I/F MTU %u", mtu); - } if (mtu == 0) { /* they didn't give us a size, estimate one */ @@ -133,8 +125,6 @@ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) peer->mtu = mtu; peer->maxdata = peer->mtu - peer->hdrsize; spin_unlock_bh(&peer->lock); - _net("Net MTU %u (maxdata %u)", - peer->mtu, peer->maxdata); } } @@ -222,41 +212,6 @@ static void rxrpc_store_error(struct rxrpc_peer *peer, err = ee->ee_errno; switch (ee->ee_origin) { - case SO_EE_ORIGIN_ICMP: - switch (ee->ee_type) { - case ICMP_DEST_UNREACH: - switch (ee->ee_code) { - case ICMP_NET_UNREACH: - _net("Rx Received ICMP Network Unreachable"); - break; - case ICMP_HOST_UNREACH: - _net("Rx Received ICMP Host Unreachable"); - break; - case ICMP_PORT_UNREACH: - _net("Rx Received ICMP Port Unreachable"); - break; - case ICMP_NET_UNKNOWN: - _net("Rx Received ICMP Unknown Network"); - break; - case ICMP_HOST_UNKNOWN: - _net("Rx Received ICMP Unknown Host"); - break; - default: - _net("Rx Received ICMP DestUnreach code=%u", - ee->ee_code); - break; - } - break; - - case ICMP_TIME_EXCEEDED: - _net("Rx Received ICMP TTL Exceeded"); - break; - - default: - break; - } - break; - case SO_EE_ORIGIN_NONE: case SO_EE_ORIGIN_LOCAL: compl = RXRPC_CALL_LOCAL_ERROR; @@ -266,6 +221,7 @@ static void rxrpc_store_error(struct rxrpc_peer *peer, if (err == EACCES) err = EHOSTUNREACH; fallthrough; + case SO_EE_ORIGIN_ICMP: default: break; } diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 041a51225c5f..b3c3c1c344fc 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -138,10 +138,8 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *local, unsigned long hash_key = rxrpc_peer_hash_key(local, srx); peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); - if (peer) { - _net("PEER %d {%pISp}", peer->debug_id, &peer->srx.transport); + if (peer) _leave(" = %p {u=%d}", peer, refcount_read(&peer->ref)); - } return peer; } @@ -371,8 +369,6 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *rx, peer = candidate; } - _net("PEER %d {%pISp}", peer->debug_id, &peer->srx.transport); - _leave(" = %p {u=%d}", peer, refcount_read(&peer->ref)); return peer; } -- cgit v1.2.3-70-g09d2 From 2cc800863c49a1f4be1b10b756c09a878d3a3f00 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 19 Oct 2022 13:49:02 +0100 Subject: rxrpc: Drop rxrpc_conn_parameters from rxrpc_connection and rxrpc_bundle Remove the rxrpc_conn_parameters struct from the rxrpc_connection and rxrpc_bundle structs and emplace the members directly. These are going to get filled in from the rxrpc_call struct in future. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 16 ++++++++++++-- net/rxrpc/call_accept.c | 6 +++--- net/rxrpc/call_event.c | 2 +- net/rxrpc/call_object.c | 6 +++--- net/rxrpc/conn_client.c | 53 ++++++++++++++++++++++++++++------------------- net/rxrpc/conn_event.c | 26 +++++++++++------------ net/rxrpc/conn_object.c | 22 ++++++++++---------- net/rxrpc/conn_service.c | 6 +++--- net/rxrpc/input.c | 4 ++-- net/rxrpc/key.c | 2 +- net/rxrpc/output.c | 32 ++++++++++++++-------------- net/rxrpc/proc.c | 6 +++--- net/rxrpc/rxkad.c | 54 ++++++++++++++++++++++++------------------------ net/rxrpc/security.c | 4 ++-- 14 files changed, 131 insertions(+), 108 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 36ece1efb1d4..7c48b0163032 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -403,12 +403,18 @@ enum rxrpc_conn_proto_state { * RxRPC client connection bundle. */ struct rxrpc_bundle { - struct rxrpc_conn_parameters params; + struct rxrpc_local *local; /* Representation of local endpoint */ + struct rxrpc_peer *peer; /* Remote endpoint */ + struct key *key; /* Security details */ refcount_t ref; atomic_t active; /* Number of active users */ unsigned int debug_id; + u32 security_level; /* Security level selected */ + u16 service_id; /* Service ID for this connection */ bool try_upgrade; /* True if the bundle is attempting upgrade */ bool alloc_conn; /* True if someone's getting a conn */ + bool exclusive; /* T if conn is exclusive */ + bool upgrade; /* T if service ID can be upgraded */ short alloc_error; /* Error from last conn allocation */ spinlock_t channel_lock; struct rb_node local_node; /* Node in local->client_conns */ @@ -424,7 +430,9 @@ struct rxrpc_bundle { */ struct rxrpc_connection { struct rxrpc_conn_proto proto; - struct rxrpc_conn_parameters params; + struct rxrpc_local *local; /* Representation of local endpoint */ + struct rxrpc_peer *peer; /* Remote endpoint */ + struct key *key; /* Security details */ refcount_t ref; struct rcu_head rcu; @@ -471,9 +479,13 @@ struct rxrpc_connection { atomic_t serial; /* packet serial number counter */ unsigned int hi_serial; /* highest serial number received */ u32 service_id; /* Service ID, possibly upgraded */ + u32 security_level; /* Security level selected */ u8 security_ix; /* security type */ u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */ u8 bundle_shift; /* Index into bundle->avail_chans */ + bool exclusive; /* T if conn is exclusive */ + bool upgrade; /* T if service ID can be upgraded */ + u16 orig_service_id; /* Originally requested service ID */ short error; /* Local error code */ }; diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 48790ee77019..4888959e4727 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -305,8 +305,8 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, b->conn_backlog[conn_tail] = NULL; smp_store_release(&b->conn_backlog_tail, (conn_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); - conn->params.local = rxrpc_get_local(local); - conn->params.peer = peer; + conn->local = rxrpc_get_local(local); + conn->peer = peer; rxrpc_see_connection(conn); rxrpc_new_incoming_connection(rx, conn, sec, skb); } else { @@ -323,7 +323,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, call->conn = conn; call->security = conn->security; call->security_ix = conn->security_ix; - call->peer = rxrpc_get_peer(conn->params.peer); + call->peer = rxrpc_get_peer(conn->peer); call->cong_ssthresh = call->peer->cong_ssthresh; call->tx_last_sent = ktime_get_real(); return call; diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 349f3df569ba..b17ed37434bd 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -69,7 +69,7 @@ void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial, void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why) { - struct rxrpc_local *local = call->conn->params.local; + struct rxrpc_local *local = call->conn->local; struct rxrpc_txbuf *txb; if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index e36a317b2e9a..59928f0a8fe1 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -417,9 +417,9 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, conn->channels[chan].call_id = call->call_id; rcu_assign_pointer(conn->channels[chan].call, call); - spin_lock(&conn->params.peer->lock); - hlist_add_head_rcu(&call->error_link, &conn->params.peer->error_targets); - spin_unlock(&conn->params.peer->lock); + spin_lock(&conn->peer->lock); + hlist_add_head_rcu(&call->error_link, &conn->peer->error_targets); + spin_unlock(&conn->peer->lock); rxrpc_start_call_timer(call); _leave(""); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 2b76fbffd4dd..71404b33623f 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -51,7 +51,7 @@ static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle); static int rxrpc_get_client_connection_id(struct rxrpc_connection *conn, gfp_t gfp) { - struct rxrpc_net *rxnet = conn->params.local->rxnet; + struct rxrpc_net *rxnet = conn->local->rxnet; int id; _enter(""); @@ -122,8 +122,13 @@ static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_conn_parameters *cp, bundle = kzalloc(sizeof(*bundle), gfp); if (bundle) { - bundle->params = *cp; - rxrpc_get_peer(bundle->params.peer); + bundle->local = cp->local; + bundle->peer = rxrpc_get_peer(cp->peer); + bundle->key = cp->key; + bundle->exclusive = cp->exclusive; + bundle->upgrade = cp->upgrade; + bundle->service_id = cp->service_id; + bundle->security_level = cp->security_level; refcount_set(&bundle->ref, 1); atomic_set(&bundle->active, 1); spin_lock_init(&bundle->channel_lock); @@ -140,7 +145,7 @@ struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *bundle) static void rxrpc_free_bundle(struct rxrpc_bundle *bundle) { - rxrpc_put_peer(bundle->params.peer); + rxrpc_put_peer(bundle->peer); kfree(bundle); } @@ -164,7 +169,7 @@ static struct rxrpc_connection * rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle, gfp_t gfp) { struct rxrpc_connection *conn; - struct rxrpc_net *rxnet = bundle->params.local->rxnet; + struct rxrpc_net *rxnet = bundle->local->rxnet; int ret; _enter(""); @@ -177,10 +182,16 @@ rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle, gfp_t gfp) refcount_set(&conn->ref, 1); conn->bundle = bundle; - conn->params = bundle->params; + conn->local = bundle->local; + conn->peer = bundle->peer; + conn->key = bundle->key; + conn->exclusive = bundle->exclusive; + conn->upgrade = bundle->upgrade; + conn->orig_service_id = bundle->service_id; + conn->security_level = bundle->security_level; conn->out_clientflag = RXRPC_CLIENT_INITIATED; conn->state = RXRPC_CONN_CLIENT; - conn->service_id = conn->params.service_id; + conn->service_id = conn->orig_service_id; ret = rxrpc_get_client_connection_id(conn, gfp); if (ret < 0) @@ -196,9 +207,9 @@ rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle, gfp_t gfp) write_unlock(&rxnet->conn_lock); rxrpc_get_bundle(bundle); - rxrpc_get_peer(conn->params.peer); - rxrpc_get_local(conn->params.local); - key_get(conn->params.key); + rxrpc_get_peer(conn->peer); + rxrpc_get_local(conn->local); + key_get(conn->key); trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_client, refcount_read(&conn->ref), @@ -228,7 +239,7 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn) if (!conn) goto dont_reuse; - rxnet = conn->params.local->rxnet; + rxnet = conn->local->rxnet; if (test_bit(RXRPC_CONN_DONT_REUSE, &conn->flags)) goto dont_reuse; @@ -285,7 +296,7 @@ static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *c while (p) { bundle = rb_entry(p, struct rxrpc_bundle, local_node); -#define cmp(X) ((long)bundle->params.X - (long)cp->X) +#define cmp(X) ((long)bundle->X - (long)cp->X) diff = (cmp(peer) ?: cmp(key) ?: cmp(security_level) ?: @@ -314,7 +325,7 @@ static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *c parent = *pp; bundle = rb_entry(parent, struct rxrpc_bundle, local_node); -#define cmp(X) ((long)bundle->params.X - (long)cp->X) +#define cmp(X) ((long)bundle->X - (long)cp->X) diff = (cmp(peer) ?: cmp(key) ?: cmp(security_level) ?: @@ -532,7 +543,7 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, rxrpc_see_call(call); list_del_init(&call->chan_wait_link); - call->peer = rxrpc_get_peer(conn->params.peer); + call->peer = rxrpc_get_peer(conn->peer); call->conn = rxrpc_get_connection(conn); call->cid = conn->proto.cid | channel; call->call_id = call_id; @@ -569,7 +580,7 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, */ static void rxrpc_unidle_conn(struct rxrpc_bundle *bundle, struct rxrpc_connection *conn) { - struct rxrpc_net *rxnet = bundle->params.local->rxnet; + struct rxrpc_net *rxnet = bundle->local->rxnet; bool drop_ref; if (!list_empty(&conn->cache_link)) { @@ -795,7 +806,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call { struct rxrpc_connection *conn; struct rxrpc_channel *chan = NULL; - struct rxrpc_net *rxnet = bundle->params.local->rxnet; + struct rxrpc_net *rxnet = bundle->local->rxnet; unsigned int channel; bool may_reuse; u32 cid; @@ -936,11 +947,11 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) */ static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle) { - struct rxrpc_local *local = bundle->params.local; + struct rxrpc_local *local = bundle->local; bool need_put = false; if (atomic_dec_and_lock(&bundle->active, &local->client_bundles_lock)) { - if (!bundle->params.exclusive) { + if (!bundle->exclusive) { _debug("erase bundle"); rb_erase(&bundle->local_node, &local->client_bundles); need_put = true; @@ -957,7 +968,7 @@ static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle) */ static void rxrpc_kill_client_conn(struct rxrpc_connection *conn) { - struct rxrpc_local *local = conn->params.local; + struct rxrpc_local *local = conn->local; struct rxrpc_net *rxnet = local->rxnet; _enter("C=%x", conn->debug_id); @@ -1036,7 +1047,7 @@ next: expiry = rxrpc_conn_idle_client_expiry; if (nr_conns > rxrpc_reap_client_connections) expiry = rxrpc_conn_idle_client_fast_expiry; - if (conn->params.local->service_closed) + if (conn->local->service_closed) expiry = rxrpc_closed_conn_expiry * HZ; conn_expires_at = conn->idle_timestamp + expiry; @@ -1110,7 +1121,7 @@ void rxrpc_clean_up_local_conns(struct rxrpc_local *local) list_for_each_entry_safe(conn, tmp, &rxnet->idle_client_conns, cache_link) { - if (conn->params.local == local) { + if (conn->local == local) { trace_rxrpc_client(conn, -1, rxrpc_client_discard); list_move(&conn->cache_link, &graveyard); } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index d5549cbfc71b..71ed6b9dc63a 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -52,8 +52,8 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, if (skb && call_id != sp->hdr.callNumber) return; - msg.msg_name = &conn->params.peer->srx.transport; - msg.msg_namelen = conn->params.peer->srx.transport_len; + msg.msg_name = &conn->peer->srx.transport; + msg.msg_namelen = conn->peer->srx.transport_len; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -86,8 +86,8 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, break; case RXRPC_PACKET_TYPE_ACK: - mtu = conn->params.peer->if_mtu; - mtu -= conn->params.peer->hdrsize; + mtu = conn->peer->if_mtu; + mtu -= conn->peer->hdrsize; pkt.ack.bufferSpace = 0; pkt.ack.maxSkew = htons(skb ? skb->priority : 0); pkt.ack.firstPacket = htonl(chan->last_seq + 1); @@ -131,8 +131,8 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, break; } - ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, ioc, len); - conn->params.peer->last_tx_at = ktime_get_seconds(); + ret = kernel_sendmsg(conn->local->socket, &msg, iov, ioc, len); + conn->peer->last_tx_at = ktime_get_seconds(); if (ret < 0) trace_rxrpc_tx_fail(chan->call_debug_id, serial, ret, rxrpc_tx_point_call_final_resend); @@ -211,8 +211,8 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); spin_unlock_bh(&conn->state_lock); - msg.msg_name = &conn->params.peer->srx.transport; - msg.msg_namelen = conn->params.peer->srx.transport_len; + msg.msg_name = &conn->peer->srx.transport; + msg.msg_namelen = conn->peer->srx.transport_len; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -241,7 +241,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, rxrpc_abort_calls(conn, RXRPC_CALL_LOCALLY_ABORTED, serial); whdr.serial = htonl(serial); - ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); + ret = kernel_sendmsg(conn->local->socket, &msg, iov, 2, len); if (ret < 0) { trace_rxrpc_tx_fail(conn->debug_id, serial, ret, rxrpc_tx_point_conn_abort); @@ -251,7 +251,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, trace_rxrpc_tx_packet(conn->debug_id, &whdr, rxrpc_tx_point_conn_abort); - conn->params.peer->last_tx_at = ktime_get_seconds(); + conn->peer->last_tx_at = ktime_get_seconds(); _leave(" = 0"); return 0; @@ -330,7 +330,7 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, return ret; ret = conn->security->init_connection_security( - conn, conn->params.key->payload.data[0]); + conn, conn->key->payload.data[0]); if (ret < 0) return ret; @@ -484,9 +484,9 @@ void rxrpc_process_connection(struct work_struct *work) rxrpc_see_connection(conn); - if (__rxrpc_use_local(conn->params.local)) { + if (__rxrpc_use_local(conn->local)) { rxrpc_do_process_connection(conn); - rxrpc_unuse_local(conn->params.local); + rxrpc_unuse_local(conn->local); } rxrpc_put_connection(conn); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index d5d15389406f..ad6e5ee1f069 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -120,10 +120,10 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, } if (conn->proto.epoch != k.epoch || - conn->params.local != local) + conn->local != local) goto not_found; - peer = conn->params.peer; + peer = conn->peer; switch (srx.transport.family) { case AF_INET: if (peer->srx.transport.sin.sin_port != @@ -231,7 +231,7 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) */ void rxrpc_kill_connection(struct rxrpc_connection *conn) { - struct rxrpc_net *rxnet = conn->params.local->rxnet; + struct rxrpc_net *rxnet = conn->local->rxnet; ASSERT(!rcu_access_pointer(conn->channels[0].call) && !rcu_access_pointer(conn->channels[1].call) && @@ -340,7 +340,7 @@ void rxrpc_put_service_conn(struct rxrpc_connection *conn) __refcount_dec(&conn->ref, &r); trace_rxrpc_conn(debug_id, rxrpc_conn_put_service, r - 1, here); if (r - 1 == 1) - rxrpc_set_service_reap_timer(conn->params.local->rxnet, + rxrpc_set_service_reap_timer(conn->local->rxnet, jiffies + rxrpc_connection_expiry); } @@ -360,13 +360,13 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu) rxrpc_purge_queue(&conn->rx_queue); conn->security->clear(conn); - key_put(conn->params.key); + key_put(conn->key); rxrpc_put_bundle(conn->bundle); - rxrpc_put_peer(conn->params.peer); + rxrpc_put_peer(conn->peer); - if (atomic_dec_and_test(&conn->params.local->rxnet->nr_conns)) - wake_up_var(&conn->params.local->rxnet->nr_conns); - rxrpc_put_local(conn->params.local); + if (atomic_dec_and_test(&conn->local->rxnet->nr_conns)) + wake_up_var(&conn->local->rxnet->nr_conns); + rxrpc_put_local(conn->local); kfree(conn); _leave(""); @@ -397,10 +397,10 @@ void rxrpc_service_connection_reaper(struct work_struct *work) if (conn->state == RXRPC_CONN_SERVICE_PREALLOC) continue; - if (rxnet->live && !conn->params.local->dead) { + if (rxnet->live && !conn->local->dead) { idle_timestamp = READ_ONCE(conn->idle_timestamp); expire_at = idle_timestamp + rxrpc_connection_expiry * HZ; - if (conn->params.local->service_closed) + if (conn->local->service_closed) expire_at = idle_timestamp + rxrpc_closed_conn_expiry * HZ; _debug("reap CONN %d { u=%d,t=%ld }", diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index 75f903099eb0..a3b91864ef21 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -164,7 +164,7 @@ void rxrpc_new_incoming_connection(struct rxrpc_sock *rx, conn->proto.epoch = sp->hdr.epoch; conn->proto.cid = sp->hdr.cid & RXRPC_CIDMASK; - conn->params.service_id = sp->hdr.serviceId; + conn->orig_service_id = sp->hdr.serviceId; conn->service_id = sp->hdr.serviceId; conn->security_ix = sp->hdr.securityIndex; conn->out_clientflag = 0; @@ -183,7 +183,7 @@ void rxrpc_new_incoming_connection(struct rxrpc_sock *rx, conn->service_id = rx->service_upgrade.to; /* Make the connection a target for incoming packets. */ - rxrpc_publish_service_conn(conn->params.peer, conn); + rxrpc_publish_service_conn(conn->peer, conn); } /* @@ -192,7 +192,7 @@ void rxrpc_new_incoming_connection(struct rxrpc_sock *rx, */ void rxrpc_unpublish_service_conn(struct rxrpc_connection *conn) { - struct rxrpc_peer *peer = conn->params.peer; + struct rxrpc_peer *peer = conn->peer; write_seqlock_bh(&peer->service_conn_lock); if (test_and_clear_bit(RXRPC_CONN_IN_SERVICE_CONNS, &conn->flags)) diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index e2461f29d765..44caf88e04b8 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1339,10 +1339,10 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) if (!test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags)) goto reupgrade; - old_id = cmpxchg(&conn->service_id, conn->params.service_id, + old_id = cmpxchg(&conn->service_id, conn->orig_service_id, sp->hdr.serviceId); - if (old_id != conn->params.service_id && + if (old_id != conn->orig_service_id && old_id != sp->hdr.serviceId) goto reupgrade; } diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 830eeffe2d5b..8d53aded09c4 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -513,7 +513,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *conn, if (ret < 0) goto error; - conn->params.key = key; + conn->key = key; _leave(" = 0 [%d]", key_serial(key)); return 0; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 635acf3dbd77..b5d8eac8c49c 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -142,8 +142,8 @@ retry: txb->ack.reason = RXRPC_ACK_IDLE; } - mtu = conn->params.peer->if_mtu; - mtu -= conn->params.peer->hdrsize; + mtu = conn->peer->if_mtu; + mtu -= conn->peer->hdrsize; jmax = rxrpc_rx_jumbo_max; qsize = (window - 1) - call->rx_consumed; rsize = max_t(int, call->rx_winsize - qsize, 0); @@ -259,7 +259,7 @@ static int rxrpc_send_ack_packet(struct rxrpc_local *local, struct rxrpc_txbuf * txb->ack.previousPacket = htonl(call->rx_highest_seq); iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len); - ret = do_udp_sendmsg(conn->params.local->socket, &msg, len); + ret = do_udp_sendmsg(conn->local->socket, &msg, len); call->peer->last_tx_at = ktime_get_seconds(); if (ret < 0) trace_rxrpc_tx_fail(call->debug_id, serial, ret, @@ -368,8 +368,8 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) pkt.whdr.serial = htonl(serial); iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, sizeof(pkt)); - ret = do_udp_sendmsg(conn->params.local->socket, &msg, sizeof(pkt)); - conn->params.peer->last_tx_at = ktime_get_seconds(); + ret = do_udp_sendmsg(conn->local->socket, &msg, sizeof(pkt)); + conn->peer->last_tx_at = ktime_get_seconds(); if (ret < 0) trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_abort); @@ -473,7 +473,7 @@ dont_set_request_ack: if (txb->len >= call->peer->maxdata) goto send_fragmentable; - down_read(&conn->params.local->defrag_sem); + down_read(&conn->local->defrag_sem); txb->last_sent = ktime_get_real(); if (txb->wire.flags & RXRPC_REQUEST_ACK) @@ -486,10 +486,10 @@ dont_set_request_ack: * message and update the peer record */ rxrpc_inc_stat(call->rxnet, stat_tx_data_send); - ret = do_udp_sendmsg(conn->params.local->socket, &msg, len); - conn->params.peer->last_tx_at = ktime_get_seconds(); + ret = do_udp_sendmsg(conn->local->socket, &msg, len); + conn->peer->last_tx_at = ktime_get_seconds(); - up_read(&conn->params.local->defrag_sem); + up_read(&conn->local->defrag_sem); if (ret < 0) { rxrpc_cancel_rtt_probe(call, serial, rtt_slot); trace_rxrpc_tx_fail(call->debug_id, serial, ret, @@ -549,22 +549,22 @@ send_fragmentable: /* attempt to send this message with fragmentation enabled */ _debug("send fragment"); - down_write(&conn->params.local->defrag_sem); + down_write(&conn->local->defrag_sem); txb->last_sent = ktime_get_real(); if (txb->wire.flags & RXRPC_REQUEST_ACK) rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_data); - switch (conn->params.local->srx.transport.family) { + switch (conn->local->srx.transport.family) { case AF_INET6: case AF_INET: - ip_sock_set_mtu_discover(conn->params.local->socket->sk, + ip_sock_set_mtu_discover(conn->local->socket->sk, IP_PMTUDISC_DONT); rxrpc_inc_stat(call->rxnet, stat_tx_data_send_frag); - ret = do_udp_sendmsg(conn->params.local->socket, &msg, len); - conn->params.peer->last_tx_at = ktime_get_seconds(); + ret = do_udp_sendmsg(conn->local->socket, &msg, len); + conn->peer->last_tx_at = ktime_get_seconds(); - ip_sock_set_mtu_discover(conn->params.local->socket->sk, + ip_sock_set_mtu_discover(conn->local->socket->sk, IP_PMTUDISC_DO); break; @@ -582,7 +582,7 @@ send_fragmentable: } rxrpc_tx_backoff(call, ret); - up_write(&conn->params.local->defrag_sem); + up_write(&conn->local->defrag_sem); goto done; } diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index fae22a8b38d6..bb2edf6db896 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -172,9 +172,9 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) goto print; } - sprintf(lbuff, "%pISpc", &conn->params.local->srx.transport); + sprintf(lbuff, "%pISpc", &conn->local->srx.transport); - sprintf(rbuff, "%pISpc", &conn->params.peer->srx.transport); + sprintf(rbuff, "%pISpc", &conn->peer->srx.transport); print: seq_printf(seq, "UDP %-47.47s %-47.47s %4x %08x %s %3u" @@ -186,7 +186,7 @@ print: rxrpc_conn_is_service(conn) ? "Svc" : "Clt", refcount_read(&conn->ref), rxrpc_conn_states[conn->state], - key_serial(conn->params.key), + key_serial(conn->key), atomic_read(&conn->serial), conn->hi_serial, conn->channels[0].call_id, diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 36cf40442a7e..d1233720e05f 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -103,7 +103,7 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn, struct crypto_sync_skcipher *ci; int ret; - _enter("{%d},{%x}", conn->debug_id, key_serial(conn->params.key)); + _enter("{%d},{%x}", conn->debug_id, key_serial(conn->key)); conn->security_ix = token->security_index; @@ -118,7 +118,7 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn, sizeof(token->kad->session_key)) < 0) BUG(); - switch (conn->params.security_level) { + switch (conn->security_level) { case RXRPC_SECURITY_PLAIN: case RXRPC_SECURITY_AUTH: case RXRPC_SECURITY_ENCRYPT: @@ -150,7 +150,7 @@ static int rxkad_how_much_data(struct rxrpc_call *call, size_t remain, { size_t shdr, buf_size, chunk; - switch (call->conn->params.security_level) { + switch (call->conn->security_level) { default: buf_size = chunk = min_t(size_t, remain, RXRPC_JUMBO_DATALEN); shdr = 0; @@ -192,7 +192,7 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn, _enter(""); - if (!conn->params.key) + if (!conn->key) return 0; tmpbuf = kmalloc(tmpsize, GFP_KERNEL); @@ -205,7 +205,7 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn, return -ENOMEM; } - token = conn->params.key->payload.data[0]; + token = conn->key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); tmpbuf[0] = htonl(conn->proto.epoch); @@ -317,7 +317,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, } /* encrypt from the session key */ - token = call->conn->params.key->payload.data[0]; + token = call->conn->key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); sg_init_one(&sg, txb->data, txb->len); @@ -344,13 +344,13 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) int ret; _enter("{%d{%x}},{#%u},%u,", - call->debug_id, key_serial(call->conn->params.key), + call->debug_id, key_serial(call->conn->key), txb->seq, txb->len); if (!call->conn->rxkad.cipher) return 0; - ret = key_validate(call->conn->params.key); + ret = key_validate(call->conn->key); if (ret < 0) return ret; @@ -380,7 +380,7 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) y = 1; /* zero checksums are not permitted */ txb->wire.cksum = htons(y); - switch (call->conn->params.security_level) { + switch (call->conn->security_level) { case RXRPC_SECURITY_PLAIN: ret = 0; break; @@ -525,7 +525,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, } /* decrypt from the session key */ - token = call->conn->params.key->payload.data[0]; + token = call->conn->key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); @@ -596,7 +596,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) u32 x, y; _enter("{%d{%x}},{#%u}", - call->debug_id, key_serial(call->conn->params.key), seq); + call->debug_id, key_serial(call->conn->key), seq); if (!call->conn->rxkad.cipher) return 0; @@ -632,7 +632,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) goto protocol_error; } - switch (call->conn->params.security_level) { + switch (call->conn->security_level) { case RXRPC_SECURITY_PLAIN: ret = 0; break; @@ -678,8 +678,8 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) challenge.min_level = htonl(0); challenge.__padding = 0; - msg.msg_name = &conn->params.peer->srx.transport; - msg.msg_namelen = conn->params.peer->srx.transport_len; + msg.msg_name = &conn->peer->srx.transport; + msg.msg_namelen = conn->peer->srx.transport_len; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -705,14 +705,14 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) serial = atomic_inc_return(&conn->serial); whdr.serial = htonl(serial); - ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); + ret = kernel_sendmsg(conn->local->socket, &msg, iov, 2, len); if (ret < 0) { trace_rxrpc_tx_fail(conn->debug_id, serial, ret, rxrpc_tx_point_rxkad_challenge); return -EAGAIN; } - conn->params.peer->last_tx_at = ktime_get_seconds(); + conn->peer->last_tx_at = ktime_get_seconds(); trace_rxrpc_tx_packet(conn->debug_id, &whdr, rxrpc_tx_point_rxkad_challenge); _leave(" = 0"); @@ -736,8 +736,8 @@ static int rxkad_send_response(struct rxrpc_connection *conn, _enter(""); - msg.msg_name = &conn->params.peer->srx.transport; - msg.msg_namelen = conn->params.peer->srx.transport_len; + msg.msg_name = &conn->peer->srx.transport; + msg.msg_namelen = conn->peer->srx.transport_len; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; @@ -762,14 +762,14 @@ static int rxkad_send_response(struct rxrpc_connection *conn, serial = atomic_inc_return(&conn->serial); whdr.serial = htonl(serial); - ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 3, len); + ret = kernel_sendmsg(conn->local->socket, &msg, iov, 3, len); if (ret < 0) { trace_rxrpc_tx_fail(conn->debug_id, serial, ret, rxrpc_tx_point_rxkad_response); return -EAGAIN; } - conn->params.peer->last_tx_at = ktime_get_seconds(); + conn->peer->last_tx_at = ktime_get_seconds(); _leave(" = 0"); return 0; } @@ -832,15 +832,15 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, u32 version, nonce, min_level, abort_code; int ret; - _enter("{%d,%x}", conn->debug_id, key_serial(conn->params.key)); + _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); eproto = tracepoint_string("chall_no_key"); abort_code = RX_PROTOCOL_ERROR; - if (!conn->params.key) + if (!conn->key) goto protocol_error; abort_code = RXKADEXPIRED; - ret = key_validate(conn->params.key); + ret = key_validate(conn->key); if (ret < 0) goto other_error; @@ -863,10 +863,10 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, abort_code = RXKADLEVELFAIL; ret = -EACCES; - if (conn->params.security_level < min_level) + if (conn->security_level < min_level) goto other_error; - token = conn->params.key->payload.data[0]; + token = conn->key->payload.data[0]; /* build the response packet */ resp = kzalloc(sizeof(struct rxkad_response), GFP_NOFS); @@ -878,7 +878,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, resp->encrypted.cid = htonl(conn->proto.cid); resp->encrypted.securityIndex = htonl(conn->security_ix); resp->encrypted.inc_nonce = htonl(nonce + 1); - resp->encrypted.level = htonl(conn->params.security_level); + resp->encrypted.level = htonl(conn->security_level); resp->kvno = htonl(token->kad->kvno); resp->ticket_len = htonl(token->kad->ticket_len); resp->encrypted.call_id[0] = htonl(conn->channels[0].call_counter); @@ -1226,7 +1226,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, level = ntohl(response->encrypted.level); if (level > RXRPC_SECURITY_ENCRYPT) goto protocol_error_free; - conn->params.security_level = level; + conn->security_level = level; /* create a key to hold the security data and expiration time - after * this the connection security can be handled in exactly the same way diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index 50cb5f1ee0c0..e6ddac9b3732 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -69,7 +69,7 @@ int rxrpc_init_client_conn_security(struct rxrpc_connection *conn) { const struct rxrpc_security *sec; struct rxrpc_key_token *token; - struct key *key = conn->params.key; + struct key *key = conn->key; int ret; _enter("{%d},{%x}", conn->debug_id, key_serial(key)); @@ -163,7 +163,7 @@ struct key *rxrpc_look_up_server_security(struct rxrpc_connection *conn, rcu_read_lock(); - rx = rcu_dereference(conn->params.local->service); + rx = rcu_dereference(conn->local->service); if (!rx) goto out; -- cgit v1.2.3-70-g09d2 From f14febd8df5a490acc40b919808f163e997d7f03 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 14 Nov 2022 12:21:32 +0000 Subject: rxrpc: Extract the code from a received ABORT packet much earlier Extract the code from a received rx ABORT packet much earlier and in a single place and harmonise the responses to malformed ABORT packets. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/conn_event.c | 12 +----------- net/rxrpc/input.c | 31 +++++++++++++++++++------------ 2 files changed, 20 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 71ed6b9dc63a..f890a30c4df6 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -282,8 +282,6 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, u32 *_abort_code) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - __be32 wtmp; - u32 abort_code; int loop, ret; if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { @@ -305,16 +303,8 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, return 0; case RXRPC_PACKET_TYPE_ABORT: - if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), - &wtmp, sizeof(wtmp)) < 0) { - trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, - tracepoint_string("bad_abort")); - return -EPROTO; - } - abort_code = ntohl(wtmp); - conn->error = -ECONNABORTED; - conn->abort_code = abort_code; + conn->abort_code = skb->priority; conn->state = RXRPC_CONN_REMOTELY_ABORTED; set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, sp->hdr.serial); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 44caf88e04b8..42c8257158f7 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1019,20 +1019,11 @@ static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb) static void rxrpc_input_abort(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - __be32 wtmp; - u32 abort_code = RX_CALL_DEAD; - - _enter(""); - - if (skb->len >= 4 && - skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), - &wtmp, sizeof(wtmp)) >= 0) - abort_code = ntohl(wtmp); - trace_rxrpc_rx_abort(call, sp->hdr.serial, abort_code); + trace_rxrpc_rx_abort(call, sp->hdr.serial, skb->priority); rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, - abort_code, -ECONNABORTED); + skb->priority, -ECONNABORTED); } /* @@ -1193,6 +1184,20 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) return 0; } +/* + * Extract the abort code from an ABORT packet and stash it in skb->priority. + */ +static bool rxrpc_extract_abort(struct sk_buff *skb) +{ + __be32 wtmp; + + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), + &wtmp, sizeof(wtmp)) < 0) + return false; + skb->priority = ntohl(wtmp); + return true; +} + /* * handle data received on the local endpoint * - may be called in interrupt context @@ -1264,8 +1269,10 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) case RXRPC_PACKET_TYPE_ACKALL: if (sp->hdr.callNumber == 0) goto bad_message; - fallthrough; + break; case RXRPC_PACKET_TYPE_ABORT: + if (!rxrpc_extract_abort(skb)) + return true; /* Just discard if malformed */ break; case RXRPC_PACKET_TYPE_DATA: -- cgit v1.2.3-70-g09d2 From 0fde882fc9ee9cc2e66e8c5a5a93c83932d7ca95 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 Oct 2022 13:00:34 +0100 Subject: rxrpc: trace: Don't use __builtin_return_address for rxrpc_local tracing In rxrpc tracing, use enums to generate lists of points of interest rather than __builtin_return_address() for the rxrpc_local tracepoint Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 49 ++++++++++++++++++++-------- net/rxrpc/af_rxrpc.c | 8 ++--- net/rxrpc/ar-internal.h | 41 ++++++++++++++++++----- net/rxrpc/call_accept.c | 4 +-- net/rxrpc/call_event.c | 2 +- net/rxrpc/conn_client.c | 2 +- net/rxrpc/conn_event.c | 4 +-- net/rxrpc/conn_object.c | 2 +- net/rxrpc/input.c | 4 +-- net/rxrpc/local_object.c | 78 ++++++++++++++++++++++++-------------------- net/rxrpc/output.c | 3 +- net/rxrpc/peer_event.c | 4 +-- net/rxrpc/peer_object.c | 4 +-- 13 files changed, 129 insertions(+), 76 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 2b77f9a75bf7..015569845b1d 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -32,12 +32,35 @@ E_(rxrpc_skb_unshared_nomem, "US0") #define rxrpc_local_traces \ - EM(rxrpc_local_got, "GOT") \ - EM(rxrpc_local_new, "NEW") \ - EM(rxrpc_local_processing, "PRO") \ - EM(rxrpc_local_put, "PUT") \ - EM(rxrpc_local_queued, "QUE") \ - E_(rxrpc_local_tx_ack, "TAK") + EM(rxrpc_local_free, "FREE ") \ + EM(rxrpc_local_get_client_conn, "GET conn-cln") \ + EM(rxrpc_local_get_for_use, "GET for-use ") \ + EM(rxrpc_local_get_peer, "GET peer ") \ + EM(rxrpc_local_get_prealloc_conn, "GET conn-pre") \ + EM(rxrpc_local_get_queue, "GET queue ") \ + EM(rxrpc_local_new, "NEW ") \ + EM(rxrpc_local_processing, "PROCESSING ") \ + EM(rxrpc_local_put_already_queued, "PUT alreadyq") \ + EM(rxrpc_local_put_bind, "PUT bind ") \ + EM(rxrpc_local_put_for_use, "PUT for-use ") \ + EM(rxrpc_local_put_kill_conn, "PUT conn-kil") \ + EM(rxrpc_local_put_peer, "PUT peer ") \ + EM(rxrpc_local_put_prealloc_conn, "PUT conn-pre") \ + EM(rxrpc_local_put_release_sock, "PUT rel-sock") \ + EM(rxrpc_local_put_queue, "PUT queue ") \ + EM(rxrpc_local_queued, "QUEUED ") \ + EM(rxrpc_local_see_tx_ack, "SEE tx-ack ") \ + EM(rxrpc_local_stop, "STOP ") \ + EM(rxrpc_local_stopped, "STOPPED ") \ + EM(rxrpc_local_unuse_bind, "UNU bind ") \ + EM(rxrpc_local_unuse_conn_work, "UNU conn-wrk") \ + EM(rxrpc_local_unuse_peer_keepalive, "UNU peer-kpa") \ + EM(rxrpc_local_unuse_release_sock, "UNU rel-sock") \ + EM(rxrpc_local_unuse_work, "UNU work ") \ + EM(rxrpc_local_use_conn_work, "USE conn-wrk") \ + EM(rxrpc_local_use_lookup, "USE lookup ") \ + EM(rxrpc_local_use_peer_keepalive, "USE peer-kpa") \ + E_(rxrpc_local_use_work, "USE work ") #define rxrpc_peer_traces \ EM(rxrpc_peer_got, "GOT") \ @@ -345,29 +368,29 @@ rxrpc_txqueue_traces; TRACE_EVENT(rxrpc_local, TP_PROTO(unsigned int local_debug_id, enum rxrpc_local_trace op, - int usage, const void *where), + int ref, int usage), - TP_ARGS(local_debug_id, op, usage, where), + TP_ARGS(local_debug_id, op, ref, usage), TP_STRUCT__entry( __field(unsigned int, local ) __field(int, op ) + __field(int, ref ) __field(int, usage ) - __field(const void *, where ) ), TP_fast_assign( __entry->local = local_debug_id; __entry->op = op; + __entry->ref = ref; __entry->usage = usage; - __entry->where = where; ), - TP_printk("L=%08x %s u=%d sp=%pSR", + TP_printk("L=%08x %s r=%d u=%d", __entry->local, __print_symbolic(__entry->op, rxrpc_local_traces), - __entry->usage, - __entry->where) + __entry->ref, + __entry->usage) ); TRACE_EVENT(rxrpc_peer, diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index aacdd96a9886..989ebca899f3 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -194,8 +194,8 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) service_in_use: write_unlock(&local->services_lock); - rxrpc_unuse_local(local); - rxrpc_put_local(local); + rxrpc_unuse_local(local, rxrpc_local_unuse_bind); + rxrpc_put_local(local, rxrpc_local_put_bind); ret = -EADDRINUSE; error_unlock: release_sock(&rx->sk); @@ -888,8 +888,8 @@ static int rxrpc_release_sock(struct sock *sk) flush_workqueue(rxrpc_workqueue); rxrpc_purge_queue(&sk->sk_receive_queue); - rxrpc_unuse_local(rx->local); - rxrpc_put_local(rx->local); + rxrpc_unuse_local(rx->local, rxrpc_local_unuse_release_sock); + rxrpc_put_local(rx->local, rxrpc_local_put_release_sock); rx->local = NULL; key_put(rx->key); rx->key = NULL; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 7c48b0163032..dde9ce21ef48 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -979,22 +979,45 @@ extern void rxrpc_process_local_events(struct rxrpc_local *); * local_object.c */ struct rxrpc_local *rxrpc_lookup_local(struct net *, const struct sockaddr_rxrpc *); -struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *); -struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *); -void rxrpc_put_local(struct rxrpc_local *); -struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *); -void rxrpc_unuse_local(struct rxrpc_local *); +struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *, enum rxrpc_local_trace); +struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *, enum rxrpc_local_trace); +void rxrpc_put_local(struct rxrpc_local *, enum rxrpc_local_trace); +struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *, enum rxrpc_local_trace); +void rxrpc_unuse_local(struct rxrpc_local *, enum rxrpc_local_trace); void rxrpc_queue_local(struct rxrpc_local *); void rxrpc_destroy_all_locals(struct rxrpc_net *); -static inline bool __rxrpc_unuse_local(struct rxrpc_local *local) +static inline bool __rxrpc_unuse_local(struct rxrpc_local *local, + enum rxrpc_local_trace why) { - return atomic_dec_return(&local->active_users) == 0; + unsigned int debug_id = local->debug_id; + int r, u; + + r = refcount_read(&local->ref); + u = atomic_dec_return(&local->active_users); + trace_rxrpc_local(debug_id, why, r, u); + return u == 0; +} + +static inline bool __rxrpc_use_local(struct rxrpc_local *local, + enum rxrpc_local_trace why) +{ + int r, u; + + r = refcount_read(&local->ref); + u = atomic_fetch_add_unless(&local->active_users, 1, 0); + trace_rxrpc_local(local->debug_id, why, r, u); + return u != 0; } -static inline bool __rxrpc_use_local(struct rxrpc_local *local) +static inline void rxrpc_see_local(struct rxrpc_local *local, + enum rxrpc_local_trace why) { - return atomic_fetch_add_unless(&local->active_users, 1, 0) != 0; + int r, u; + + r = refcount_read(&local->ref); + u = atomic_read(&local->active_users); + trace_rxrpc_local(local->debug_id, why, r, u); } /* diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 4888959e4727..1b12d4e28373 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -197,7 +197,7 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) tail = b->peer_backlog_tail; while (CIRC_CNT(head, tail, size) > 0) { struct rxrpc_peer *peer = b->peer_backlog[tail]; - rxrpc_put_local(peer->local); + rxrpc_put_local(peer->local, rxrpc_local_put_prealloc_conn); kfree(peer); tail = (tail + 1) & (size - 1); } @@ -305,7 +305,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, b->conn_backlog[conn_tail] = NULL; smp_store_release(&b->conn_backlog_tail, (conn_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); - conn->local = rxrpc_get_local(local); + conn->local = rxrpc_get_local(local, rxrpc_local_get_prealloc_conn); conn->peer = peer; rxrpc_see_connection(conn); rxrpc_new_incoming_connection(rx, conn, sec, skb); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index b17ed37434bd..591af8e2e3d0 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -114,7 +114,7 @@ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, if (in_task()) { rxrpc_transmit_ack_packets(call->peer->local); } else { - rxrpc_get_local(local); + rxrpc_get_local(local, rxrpc_local_get_queue); rxrpc_queue_local(local); } } diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 71404b33623f..9a69b4c1b182 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -208,7 +208,7 @@ rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle, gfp_t gfp) rxrpc_get_bundle(bundle); rxrpc_get_peer(conn->peer); - rxrpc_get_local(conn->local); + rxrpc_get_local(conn->local, rxrpc_local_get_client_conn); key_get(conn->key); trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_client, diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index f890a30c4df6..225edaf019f1 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -474,9 +474,9 @@ void rxrpc_process_connection(struct work_struct *work) rxrpc_see_connection(conn); - if (__rxrpc_use_local(conn->local)) { + if (__rxrpc_use_local(conn->local, rxrpc_local_use_conn_work)) { rxrpc_do_process_connection(conn); - rxrpc_unuse_local(conn->local); + rxrpc_unuse_local(conn->local, rxrpc_local_unuse_conn_work); } rxrpc_put_connection(conn); diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index ad6e5ee1f069..725359afeac0 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -366,7 +366,7 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu) if (atomic_dec_and_test(&conn->local->rxnet->nr_conns)) wake_up_var(&conn->local->rxnet->nr_conns); - rxrpc_put_local(conn->local); + rxrpc_put_local(conn->local, rxrpc_local_put_kill_conn); kfree(conn); _leave(""); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 42c8257158f7..cecfd201d832 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1133,7 +1133,7 @@ static void rxrpc_post_packet_to_local(struct rxrpc_local *local, { _enter("%p,%p", local, skb); - if (rxrpc_get_local_maybe(local)) { + if (rxrpc_get_local_maybe(local, rxrpc_local_get_queue)) { skb_queue_tail(&local->event_queue, skb); rxrpc_queue_local(local); } else { @@ -1146,7 +1146,7 @@ static void rxrpc_post_packet_to_local(struct rxrpc_local *local, */ static void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) { - if (rxrpc_get_local_maybe(local)) { + if (rxrpc_get_local_maybe(local, rxrpc_local_get_queue)) { skb_queue_tail(&local->reject_queue, skb); rxrpc_queue_local(local); } else { diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 11080c335d42..1617ce651b9b 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -110,7 +110,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, local->debug_id = atomic_inc_return(&rxrpc_debug_id); memcpy(&local->srx, srx, sizeof(*srx)); local->srx.srx_service = 0; - trace_rxrpc_local(local->debug_id, rxrpc_local_new, 1, NULL); + trace_rxrpc_local(local->debug_id, rxrpc_local_new, 1, 1); } _leave(" = %p", local); @@ -228,7 +228,7 @@ struct rxrpc_local *rxrpc_lookup_local(struct net *net, * we're attempting to use a local address that the dying * object is still using. */ - if (!rxrpc_use_local(local)) + if (!rxrpc_use_local(local, rxrpc_local_use_lookup)) break; goto found; @@ -272,32 +272,32 @@ addr_in_use: /* * Get a ref on a local endpoint. */ -struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *local) +struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *local, + enum rxrpc_local_trace why) { - const void *here = __builtin_return_address(0); - int r; + int r, u; + u = atomic_read(&local->active_users); __refcount_inc(&local->ref, &r); - trace_rxrpc_local(local->debug_id, rxrpc_local_got, r + 1, here); + trace_rxrpc_local(local->debug_id, why, r + 1, u); return local; } /* * Get a ref on a local endpoint unless its usage has already reached 0. */ -struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local) +struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local, + enum rxrpc_local_trace why) { - const void *here = __builtin_return_address(0); - int r; + int r, u; - if (local) { - if (__refcount_inc_not_zero(&local->ref, &r)) - trace_rxrpc_local(local->debug_id, rxrpc_local_got, - r + 1, here); - else - local = NULL; + if (local && __refcount_inc_not_zero(&local->ref, &r)) { + u = atomic_read(&local->active_users); + trace_rxrpc_local(local->debug_id, why, r + 1, u); + return local; } - return local; + + return NULL; } /* @@ -305,31 +305,31 @@ struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local) */ void rxrpc_queue_local(struct rxrpc_local *local) { - const void *here = __builtin_return_address(0); unsigned int debug_id = local->debug_id; int r = refcount_read(&local->ref); + int u = atomic_read(&local->active_users); if (rxrpc_queue_work(&local->processor)) - trace_rxrpc_local(debug_id, rxrpc_local_queued, r + 1, here); + trace_rxrpc_local(debug_id, rxrpc_local_queued, r, u); else - rxrpc_put_local(local); + rxrpc_put_local(local, rxrpc_local_put_already_queued); } /* * Drop a ref on a local endpoint. */ -void rxrpc_put_local(struct rxrpc_local *local) +void rxrpc_put_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { - const void *here = __builtin_return_address(0); unsigned int debug_id; bool dead; - int r; + int r, u; if (local) { debug_id = local->debug_id; + u = atomic_read(&local->active_users); dead = __refcount_dec_and_test(&local->ref, &r); - trace_rxrpc_local(debug_id, rxrpc_local_put, r, here); + trace_rxrpc_local(debug_id, why, r, u); if (dead) call_rcu(&local->rcu, rxrpc_local_rcu); @@ -339,14 +339,15 @@ void rxrpc_put_local(struct rxrpc_local *local) /* * Start using a local endpoint. */ -struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *local) +struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *local, + enum rxrpc_local_trace why) { - local = rxrpc_get_local_maybe(local); + local = rxrpc_get_local_maybe(local, rxrpc_local_get_for_use); if (!local) return NULL; - if (!__rxrpc_use_local(local)) { - rxrpc_put_local(local); + if (!__rxrpc_use_local(local, why)) { + rxrpc_put_local(local, rxrpc_local_put_for_use); return NULL; } @@ -357,11 +358,18 @@ struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *local) * Cease using a local endpoint. Once the number of active users reaches 0, we * start the closure of the transport in the work processor. */ -void rxrpc_unuse_local(struct rxrpc_local *local) +void rxrpc_unuse_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { + unsigned int debug_id; + int r, u; + if (local) { - if (__rxrpc_unuse_local(local)) { - rxrpc_get_local(local); + debug_id = local->debug_id; + r = refcount_read(&local->ref); + u = atomic_dec_return(&local->active_users); + trace_rxrpc_local(debug_id, why, r, u); + if (u == 0) { + rxrpc_get_local(local, rxrpc_local_get_queue); rxrpc_queue_local(local); } } @@ -418,12 +426,11 @@ static void rxrpc_local_processor(struct work_struct *work) if (local->dead) return; - trace_rxrpc_local(local->debug_id, rxrpc_local_processing, - refcount_read(&local->ref), NULL); + rxrpc_see_local(local, rxrpc_local_processing); do { again = false; - if (!__rxrpc_use_local(local)) { + if (!__rxrpc_use_local(local, rxrpc_local_use_work)) { rxrpc_local_destroyer(local); break; } @@ -443,10 +450,10 @@ static void rxrpc_local_processor(struct work_struct *work) again = true; } - __rxrpc_unuse_local(local); + __rxrpc_unuse_local(local, rxrpc_local_unuse_work); } while (again); - rxrpc_put_local(local); + rxrpc_put_local(local, rxrpc_local_put_queue); } /* @@ -460,6 +467,7 @@ static void rxrpc_local_rcu(struct rcu_head *rcu) ASSERT(!work_pending(&local->processor)); + rxrpc_see_local(local, rxrpc_local_free); kfree(local); _leave(""); } diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index b5d8eac8c49c..2762b7ada9ae 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -288,8 +288,7 @@ void rxrpc_transmit_ack_packets(struct rxrpc_local *local) LIST_HEAD(queue); int ret; - trace_rxrpc_local(local->debug_id, rxrpc_local_tx_ack, - refcount_read(&local->ref), NULL); + rxrpc_see_local(local, rxrpc_local_see_tx_ack); if (list_empty(&local->ack_tx_queue)) return; diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index ad4d1769e02b..3f8d104ecaa7 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -266,7 +266,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, if (!rxrpc_get_peer_maybe(peer)) continue; - if (__rxrpc_use_local(peer->local)) { + if (__rxrpc_use_local(peer->local, rxrpc_local_use_peer_keepalive)) { spin_unlock_bh(&rxnet->peer_hash_lock); keepalive_at = peer->last_tx_at + RXRPC_KEEPALIVE_TIME; @@ -289,7 +289,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, spin_lock_bh(&rxnet->peer_hash_lock); list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive[slot & mask]); - rxrpc_unuse_local(peer->local); + rxrpc_unuse_local(peer->local, rxrpc_local_unuse_peer_keepalive); } rxrpc_put_peer_locked(peer); } diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index b3c3c1c344fc..bcef897560e7 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -215,7 +215,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) peer = kzalloc(sizeof(struct rxrpc_peer), gfp); if (peer) { refcount_set(&peer->ref, 1); - peer->local = rxrpc_get_local(local); + peer->local = rxrpc_get_local(local, rxrpc_local_get_peer); INIT_HLIST_HEAD(&peer->error_targets); peer->service_conns = RB_ROOT; seqlock_init(&peer->service_conn_lock); @@ -294,7 +294,7 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_sock *rx, static void rxrpc_free_peer(struct rxrpc_peer *peer) { - rxrpc_put_local(peer->local); + rxrpc_put_local(peer->local, rxrpc_local_put_peer); kfree_rcu(peer, rcu); } -- cgit v1.2.3-70-g09d2 From 47c810a79844462d3468d831edc00971757693e0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 Oct 2022 13:39:34 +0100 Subject: rxrpc: trace: Don't use __builtin_return_address for rxrpc_peer tracing In rxrpc tracing, use enums to generate lists of points of interest rather than __builtin_return_address() for the rxrpc_peer tracepoint Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 43 ++++++++++++++++++++++++++----------------- net/rxrpc/af_rxrpc.c | 2 +- net/rxrpc/ar-internal.h | 11 ++++++----- net/rxrpc/call_accept.c | 8 +++++--- net/rxrpc/call_object.c | 2 +- net/rxrpc/conn_client.c | 8 ++++---- net/rxrpc/conn_object.c | 2 +- net/rxrpc/peer_event.c | 8 ++++---- net/rxrpc/peer_object.c | 34 ++++++++++++++++------------------ net/rxrpc/sendmsg.c | 2 +- 10 files changed, 65 insertions(+), 55 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 015569845b1d..1c74143a51c1 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -63,10 +63,23 @@ E_(rxrpc_local_use_work, "USE work ") #define rxrpc_peer_traces \ - EM(rxrpc_peer_got, "GOT") \ - EM(rxrpc_peer_new, "NEW") \ - EM(rxrpc_peer_processing, "PRO") \ - E_(rxrpc_peer_put, "PUT") + EM(rxrpc_peer_free, "FREE ") \ + EM(rxrpc_peer_get_accept, "GET accept ") \ + EM(rxrpc_peer_get_activate_call, "GET act-call") \ + EM(rxrpc_peer_get_bundle, "GET bundle ") \ + EM(rxrpc_peer_get_client_conn, "GET cln-conn") \ + EM(rxrpc_peer_get_input_error, "GET inpt-err") \ + EM(rxrpc_peer_get_keepalive, "GET keepaliv") \ + EM(rxrpc_peer_get_lookup_client, "GET look-cln") \ + EM(rxrpc_peer_get_service_conn, "GET srv-conn") \ + EM(rxrpc_peer_new_client, "NEW client ") \ + EM(rxrpc_peer_new_prealloc, "NEW prealloc") \ + EM(rxrpc_peer_put_bundle, "PUT bundle ") \ + EM(rxrpc_peer_put_call, "PUT call ") \ + EM(rxrpc_peer_put_conn, "PUT conn ") \ + EM(rxrpc_peer_put_discard_tmp, "PUT disc-tmp") \ + EM(rxrpc_peer_put_input_error, "PUT inpt-err") \ + E_(rxrpc_peer_put_keepalive, "PUT keepaliv") #define rxrpc_conn_traces \ EM(rxrpc_conn_got, "GOT") \ @@ -394,30 +407,26 @@ TRACE_EVENT(rxrpc_local, ); TRACE_EVENT(rxrpc_peer, - TP_PROTO(unsigned int peer_debug_id, enum rxrpc_peer_trace op, - int usage, const void *where), + TP_PROTO(unsigned int peer_debug_id, int ref, enum rxrpc_peer_trace why), - TP_ARGS(peer_debug_id, op, usage, where), + TP_ARGS(peer_debug_id, ref, why), TP_STRUCT__entry( __field(unsigned int, peer ) - __field(int, op ) - __field(int, usage ) - __field(const void *, where ) + __field(int, ref ) + __field(int, why ) ), TP_fast_assign( __entry->peer = peer_debug_id; - __entry->op = op; - __entry->usage = usage; - __entry->where = where; + __entry->ref = ref; + __entry->why = why; ), - TP_printk("P=%08x %s u=%d sp=%pSR", + TP_printk("P=%08x %s r=%d", __entry->peer, - __print_symbolic(__entry->op, rxrpc_peer_traces), - __entry->usage, - __entry->where) + __print_symbolic(__entry->why, rxrpc_peer_traces), + __entry->ref) ); TRACE_EVENT(rxrpc_conn, diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 989ebca899f3..7a0dc01741e7 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -328,7 +328,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, mutex_unlock(&call->user_mutex); } - rxrpc_put_peer(cp.peer); + rxrpc_put_peer(cp.peer, rxrpc_peer_put_discard_tmp); _leave(" = %p", call); return call; } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index dde9ce21ef48..6cb111e9761c 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -1063,14 +1063,15 @@ struct rxrpc_peer *rxrpc_lookup_peer_rcu(struct rxrpc_local *, const struct sockaddr_rxrpc *); struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *, struct rxrpc_local *, struct sockaddr_rxrpc *, gfp_t); -struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t); +struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *, gfp_t, + enum rxrpc_peer_trace); void rxrpc_new_incoming_peer(struct rxrpc_sock *, struct rxrpc_local *, struct rxrpc_peer *); void rxrpc_destroy_all_peers(struct rxrpc_net *); -struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *); -struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *); -void rxrpc_put_peer(struct rxrpc_peer *); -void rxrpc_put_peer_locked(struct rxrpc_peer *); +struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *, enum rxrpc_peer_trace); +struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *, enum rxrpc_peer_trace); +void rxrpc_put_peer(struct rxrpc_peer *, enum rxrpc_peer_trace); +void rxrpc_put_peer_locked(struct rxrpc_peer *, enum rxrpc_peer_trace); /* * proc.c diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 1b12d4e28373..f6bc3b07c3e5 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -70,7 +70,9 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, head = b->peer_backlog_head; tail = READ_ONCE(b->peer_backlog_tail); if (CIRC_CNT(head, tail, size) < max) { - struct rxrpc_peer *peer = rxrpc_alloc_peer(rx->local, gfp); + struct rxrpc_peer *peer; + + peer = rxrpc_alloc_peer(rx->local, gfp, rxrpc_peer_new_prealloc); if (!peer) return -ENOMEM; b->peer_backlog[head] = peer; @@ -286,7 +288,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, return NULL; if (!conn) { - if (peer && !rxrpc_get_peer_maybe(peer)) + if (peer && !rxrpc_get_peer_maybe(peer, rxrpc_peer_get_service_conn)) peer = NULL; if (!peer) { peer = b->peer_backlog[peer_tail]; @@ -323,7 +325,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, call->conn = conn; call->security = conn->security; call->security_ix = conn->security_ix; - call->peer = rxrpc_get_peer(conn->peer); + call->peer = rxrpc_get_peer(conn->peer, rxrpc_peer_get_accept); call->cong_ssthresh = call->peer->cong_ssthresh; call->tx_last_sent = ktime_get_real(); return call; diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 59928f0a8fe1..1b725afd6e2c 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -636,7 +636,7 @@ static void rxrpc_destroy_call(struct work_struct *work) rxrpc_delete_call_timer(call); rxrpc_put_connection(call->conn); - rxrpc_put_peer(call->peer); + rxrpc_put_peer(call->peer, rxrpc_peer_put_call); kmem_cache_free(rxrpc_call_jar, call); if (atomic_dec_and_test(&rxnet->nr_calls)) wake_up_var(&rxnet->nr_calls); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 9a69b4c1b182..9444da235a48 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -123,7 +123,7 @@ static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_conn_parameters *cp, bundle = kzalloc(sizeof(*bundle), gfp); if (bundle) { bundle->local = cp->local; - bundle->peer = rxrpc_get_peer(cp->peer); + bundle->peer = rxrpc_get_peer(cp->peer, rxrpc_peer_get_bundle); bundle->key = cp->key; bundle->exclusive = cp->exclusive; bundle->upgrade = cp->upgrade; @@ -145,7 +145,7 @@ struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *bundle) static void rxrpc_free_bundle(struct rxrpc_bundle *bundle) { - rxrpc_put_peer(bundle->peer); + rxrpc_put_peer(bundle->peer, rxrpc_peer_put_bundle); kfree(bundle); } @@ -207,7 +207,7 @@ rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle, gfp_t gfp) write_unlock(&rxnet->conn_lock); rxrpc_get_bundle(bundle); - rxrpc_get_peer(conn->peer); + rxrpc_get_peer(conn->peer, rxrpc_peer_get_client_conn); rxrpc_get_local(conn->local, rxrpc_local_get_client_conn); key_get(conn->key); @@ -543,7 +543,7 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, rxrpc_see_call(call); list_del_init(&call->chan_wait_link); - call->peer = rxrpc_get_peer(conn->peer); + call->peer = rxrpc_get_peer(conn->peer, rxrpc_peer_get_activate_call); call->conn = rxrpc_get_connection(conn); call->cid = conn->proto.cid | channel; call->call_id = call_id; diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 725359afeac0..554ee5dd3325 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -362,7 +362,7 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu) conn->security->clear(conn); key_put(conn->key); rxrpc_put_bundle(conn->bundle); - rxrpc_put_peer(conn->peer); + rxrpc_put_peer(conn->peer, rxrpc_peer_put_conn); if (atomic_dec_and_test(&conn->local->rxnet->nr_conns)) wake_up_var(&conn->local->rxnet->nr_conns); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 3f8d104ecaa7..5e97d321ac38 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -168,7 +168,7 @@ void rxrpc_error_report(struct sock *sk) } peer = rxrpc_lookup_peer_local_rcu(local, skb, &srx); - if (peer && !rxrpc_get_peer_maybe(peer)) + if (peer && !rxrpc_get_peer_maybe(peer, rxrpc_peer_get_input_error)) peer = NULL; if (!peer) { rcu_read_unlock(); @@ -190,7 +190,7 @@ void rxrpc_error_report(struct sock *sk) out: rcu_read_unlock(); rxrpc_free_skb(skb, rxrpc_skb_freed); - rxrpc_put_peer(peer); + rxrpc_put_peer(peer, rxrpc_peer_put_input_error); _leave(""); } @@ -263,7 +263,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, struct rxrpc_peer, keepalive_link); list_del_init(&peer->keepalive_link); - if (!rxrpc_get_peer_maybe(peer)) + if (!rxrpc_get_peer_maybe(peer, rxrpc_peer_get_keepalive)) continue; if (__rxrpc_use_local(peer->local, rxrpc_local_use_peer_keepalive)) { @@ -291,7 +291,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, &rxnet->peer_keepalive[slot & mask]); rxrpc_unuse_local(peer->local, rxrpc_local_unuse_peer_keepalive); } - rxrpc_put_peer_locked(peer); + rxrpc_put_peer_locked(peer, rxrpc_peer_put_keepalive); } spin_unlock_bh(&rxnet->peer_hash_lock); diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index bcef897560e7..9e682a60a800 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -205,9 +205,9 @@ static void rxrpc_assess_MTU_size(struct rxrpc_sock *rx, /* * Allocate a peer. */ -struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) +struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp, + enum rxrpc_peer_trace why) { - const void *here = __builtin_return_address(0); struct rxrpc_peer *peer; _enter(""); @@ -226,7 +226,7 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp) rxrpc_peer_init_rtt(peer); peer->cong_ssthresh = RXRPC_TX_MAX_WINDOW; - trace_rxrpc_peer(peer->debug_id, rxrpc_peer_new, 1, here); + trace_rxrpc_peer(peer->debug_id, why, 1); } _leave(" = %p", peer); @@ -282,7 +282,7 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_sock *rx, _enter(""); - peer = rxrpc_alloc_peer(local, gfp); + peer = rxrpc_alloc_peer(local, gfp, rxrpc_peer_new_client); if (peer) { memcpy(&peer->srx, srx, sizeof(*srx)); rxrpc_init_peer(rx, peer, hash_key); @@ -294,6 +294,7 @@ static struct rxrpc_peer *rxrpc_create_peer(struct rxrpc_sock *rx, static void rxrpc_free_peer(struct rxrpc_peer *peer) { + trace_rxrpc_peer(peer->debug_id, 0, rxrpc_peer_free); rxrpc_put_local(peer->local, rxrpc_local_put_peer); kfree_rcu(peer, rcu); } @@ -334,7 +335,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *rx, /* search the peer list first */ rcu_read_lock(); peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); - if (peer && !rxrpc_get_peer_maybe(peer)) + if (peer && !rxrpc_get_peer_maybe(peer, rxrpc_peer_get_lookup_client)) peer = NULL; rcu_read_unlock(); @@ -352,7 +353,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *rx, /* Need to check that we aren't racing with someone else */ peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); - if (peer && !rxrpc_get_peer_maybe(peer)) + if (peer && !rxrpc_get_peer_maybe(peer, rxrpc_peer_get_lookup_client)) peer = NULL; if (!peer) { hash_add_rcu(rxnet->peer_hash, @@ -376,27 +377,26 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *rx, /* * Get a ref on a peer record. */ -struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer) +struct rxrpc_peer *rxrpc_get_peer(struct rxrpc_peer *peer, enum rxrpc_peer_trace why) { - const void *here = __builtin_return_address(0); int r; __refcount_inc(&peer->ref, &r); - trace_rxrpc_peer(peer->debug_id, rxrpc_peer_got, r + 1, here); + trace_rxrpc_peer(peer->debug_id, why, r + 1); return peer; } /* * Get a ref on a peer record unless its usage has already reached 0. */ -struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer) +struct rxrpc_peer *rxrpc_get_peer_maybe(struct rxrpc_peer *peer, + enum rxrpc_peer_trace why) { - const void *here = __builtin_return_address(0); int r; if (peer) { if (__refcount_inc_not_zero(&peer->ref, &r)) - trace_rxrpc_peer(peer->debug_id, rxrpc_peer_got, r + 1, here); + trace_rxrpc_peer(peer->debug_id, r + 1, why); else peer = NULL; } @@ -423,9 +423,8 @@ static void __rxrpc_put_peer(struct rxrpc_peer *peer) /* * Drop a ref on a peer record. */ -void rxrpc_put_peer(struct rxrpc_peer *peer) +void rxrpc_put_peer(struct rxrpc_peer *peer, enum rxrpc_peer_trace why) { - const void *here = __builtin_return_address(0); unsigned int debug_id; bool dead; int r; @@ -433,7 +432,7 @@ void rxrpc_put_peer(struct rxrpc_peer *peer) if (peer) { debug_id = peer->debug_id; dead = __refcount_dec_and_test(&peer->ref, &r); - trace_rxrpc_peer(debug_id, rxrpc_peer_put, r - 1, here); + trace_rxrpc_peer(debug_id, r - 1, why); if (dead) __rxrpc_put_peer(peer); } @@ -443,15 +442,14 @@ void rxrpc_put_peer(struct rxrpc_peer *peer) * Drop a ref on a peer record where the caller already holds the * peer_hash_lock. */ -void rxrpc_put_peer_locked(struct rxrpc_peer *peer) +void rxrpc_put_peer_locked(struct rxrpc_peer *peer, enum rxrpc_peer_trace why) { - const void *here = __builtin_return_address(0); unsigned int debug_id = peer->debug_id; bool dead; int r; dead = __refcount_dec_and_test(&peer->ref, &r); - trace_rxrpc_peer(debug_id, rxrpc_peer_put, r - 1, here); + trace_rxrpc_peer(debug_id, r - 1, why); if (dead) { hash_del_rcu(&peer->hash_link); list_del_init(&peer->keepalive_link); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index e5fd8a95bf71..cfe0badba0b3 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -604,7 +604,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, atomic_inc_return(&rxrpc_debug_id)); /* The socket is now unlocked */ - rxrpc_put_peer(cp.peer); + rxrpc_put_peer(cp.peer, rxrpc_peer_put_discard_tmp); _leave(" = %p\n", call); return call; } -- cgit v1.2.3-70-g09d2 From 7fa25105b2d32fcb0f38668bc20d0adf6508322f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 Oct 2022 14:06:16 +0100 Subject: rxrpc: trace: Don't use __builtin_return_address for rxrpc_conn tracing In rxrpc tracing, use enums to generate lists of points of interest rather than __builtin_return_address() for the rxrpc_conn tracepoint Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 58 ++++++++++++++++++++++++++++---------------- net/rxrpc/ar-internal.h | 21 +++++++++------- net/rxrpc/call_accept.c | 9 +++---- net/rxrpc/call_object.c | 2 +- net/rxrpc/conn_client.c | 28 ++++++++++----------- net/rxrpc/conn_event.c | 4 +-- net/rxrpc/conn_object.c | 40 +++++++++++++++--------------- net/rxrpc/conn_service.c | 4 +-- net/rxrpc/input.c | 2 +- 9 files changed, 92 insertions(+), 76 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 1c74143a51c1..e09568a8c173 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -82,14 +82,34 @@ E_(rxrpc_peer_put_keepalive, "PUT keepaliv") #define rxrpc_conn_traces \ - EM(rxrpc_conn_got, "GOT") \ - EM(rxrpc_conn_new_client, "NWc") \ - EM(rxrpc_conn_new_service, "NWs") \ - EM(rxrpc_conn_put_client, "PTc") \ - EM(rxrpc_conn_put_service, "PTs") \ - EM(rxrpc_conn_queued, "QUE") \ - EM(rxrpc_conn_reap_service, "RPs") \ - E_(rxrpc_conn_seen, "SEE") + EM(rxrpc_conn_free, "FREE ") \ + EM(rxrpc_conn_get_activate_call, "GET act-call") \ + EM(rxrpc_conn_get_call_input, "GET inp-call") \ + EM(rxrpc_conn_get_conn_input, "GET inp-conn") \ + EM(rxrpc_conn_get_idle, "GET idle ") \ + EM(rxrpc_conn_get_poke, "GET poke ") \ + EM(rxrpc_conn_get_service_conn, "GET svc-conn") \ + EM(rxrpc_conn_new_client, "NEW client ") \ + EM(rxrpc_conn_new_service, "NEW service ") \ + EM(rxrpc_conn_put_already_queued, "PUT alreadyq") \ + EM(rxrpc_conn_put_call, "PUT call ") \ + EM(rxrpc_conn_put_call_input, "PUT inp-call") \ + EM(rxrpc_conn_put_conn_input, "PUT inp-conn") \ + EM(rxrpc_conn_put_discard, "PUT discard ") \ + EM(rxrpc_conn_put_discard_idle, "PUT disc-idl") \ + EM(rxrpc_conn_put_local_dead, "PUT loc-dead") \ + EM(rxrpc_conn_put_noreuse, "PUT noreuse ") \ + EM(rxrpc_conn_put_poke, "PUT poke ") \ + EM(rxrpc_conn_put_unbundle, "PUT unbundle") \ + EM(rxrpc_conn_put_unidle, "PUT unidle ") \ + EM(rxrpc_conn_put_work, "PUT work ") \ + EM(rxrpc_conn_queue_challenge, "GQ chall ") \ + EM(rxrpc_conn_queue_retry_work, "GQ retry-wk") \ + EM(rxrpc_conn_queue_rx_work, "GQ rx-work ") \ + EM(rxrpc_conn_queue_timer, "GQ timer ") \ + EM(rxrpc_conn_see_new_service_conn, "SEE new-svc ") \ + EM(rxrpc_conn_see_reap_service, "SEE reap-svc") \ + E_(rxrpc_conn_see_work, "SEE work ") #define rxrpc_client_traces \ EM(rxrpc_client_activate_chans, "Activa") \ @@ -430,30 +450,26 @@ TRACE_EVENT(rxrpc_peer, ); TRACE_EVENT(rxrpc_conn, - TP_PROTO(unsigned int conn_debug_id, enum rxrpc_conn_trace op, - int usage, const void *where), + TP_PROTO(unsigned int conn_debug_id, int ref, enum rxrpc_conn_trace why), - TP_ARGS(conn_debug_id, op, usage, where), + TP_ARGS(conn_debug_id, ref, why), TP_STRUCT__entry( __field(unsigned int, conn ) - __field(int, op ) - __field(int, usage ) - __field(const void *, where ) + __field(int, ref ) + __field(int, why ) ), TP_fast_assign( __entry->conn = conn_debug_id; - __entry->op = op; - __entry->usage = usage; - __entry->where = where; + __entry->ref = ref; + __entry->why = why; ), - TP_printk("C=%08x %s u=%d sp=%pSR", + TP_printk("C=%08x %s r=%d", __entry->conn, - __print_symbolic(__entry->op, rxrpc_conn_traces), - __entry->usage, - __entry->where) + __print_symbolic(__entry->why, rxrpc_conn_traces), + __entry->ref) ); TRACE_EVENT(rxrpc_client, diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 6cb111e9761c..bc8281c410c5 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -882,7 +882,7 @@ int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_call *, gfp_t); void rxrpc_expose_client_call(struct rxrpc_call *); void rxrpc_disconnect_client_call(struct rxrpc_bundle *, struct rxrpc_call *); -void rxrpc_put_client_conn(struct rxrpc_connection *); +void rxrpc_put_client_conn(struct rxrpc_connection *, enum rxrpc_conn_trace); void rxrpc_discard_expired_client_conns(struct work_struct *); void rxrpc_destroy_all_client_connections(struct rxrpc_net *); void rxrpc_clean_up_local_conns(struct rxrpc_local *); @@ -906,11 +906,13 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *, void __rxrpc_disconnect_call(struct rxrpc_connection *, struct rxrpc_call *); void rxrpc_disconnect_call(struct rxrpc_call *); void rxrpc_kill_connection(struct rxrpc_connection *); -bool rxrpc_queue_conn(struct rxrpc_connection *); -void rxrpc_see_connection(struct rxrpc_connection *); -struct rxrpc_connection *rxrpc_get_connection(struct rxrpc_connection *); -struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *); -void rxrpc_put_service_conn(struct rxrpc_connection *); +bool rxrpc_queue_conn(struct rxrpc_connection *, enum rxrpc_conn_trace); +void rxrpc_see_connection(struct rxrpc_connection *, enum rxrpc_conn_trace); +struct rxrpc_connection *rxrpc_get_connection(struct rxrpc_connection *, + enum rxrpc_conn_trace); +struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *, + enum rxrpc_conn_trace); +void rxrpc_put_service_conn(struct rxrpc_connection *, enum rxrpc_conn_trace); void rxrpc_service_connection_reaper(struct work_struct *); void rxrpc_destroy_all_connections(struct rxrpc_net *); @@ -924,15 +926,16 @@ static inline bool rxrpc_conn_is_service(const struct rxrpc_connection *conn) return !rxrpc_conn_is_client(conn); } -static inline void rxrpc_put_connection(struct rxrpc_connection *conn) +static inline void rxrpc_put_connection(struct rxrpc_connection *conn, + enum rxrpc_conn_trace why) { if (!conn) return; if (rxrpc_conn_is_client(conn)) - rxrpc_put_client_conn(conn); + rxrpc_put_client_conn(conn, why); else - rxrpc_put_service_conn(conn); + rxrpc_put_service_conn(conn, why); } static inline void rxrpc_reduce_conn_timer(struct rxrpc_connection *conn, diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index f6bc3b07c3e5..04b52e28e0cc 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -91,9 +91,6 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, b->conn_backlog[head] = conn; smp_store_release(&b->conn_backlog_head, (head + 1) & (size - 1)); - - trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_service, - refcount_read(&conn->ref), here); } /* Now it gets complicated, because calls get registered with the @@ -309,10 +306,10 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, (conn_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); conn->local = rxrpc_get_local(local, rxrpc_local_get_prealloc_conn); conn->peer = peer; - rxrpc_see_connection(conn); + rxrpc_see_connection(conn, rxrpc_conn_see_new_service_conn); rxrpc_new_incoming_connection(rx, conn, sec, skb); } else { - rxrpc_get_connection(conn); + rxrpc_get_connection(conn, rxrpc_conn_get_service_conn); } /* And now we can allocate and set up a new call */ @@ -402,7 +399,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, case RXRPC_CONN_SERVICE_UNSECURED: conn->state = RXRPC_CONN_SERVICE_CHALLENGING; set_bit(RXRPC_CONN_EV_CHALLENGE, &call->conn->events); - rxrpc_queue_conn(call->conn); + rxrpc_queue_conn(call->conn, rxrpc_conn_queue_challenge); break; case RXRPC_CONN_SERVICE: diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 1b725afd6e2c..29ec4013aa0b 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -635,7 +635,7 @@ static void rxrpc_destroy_call(struct work_struct *work) rxrpc_delete_call_timer(call); - rxrpc_put_connection(call->conn); + rxrpc_put_connection(call->conn, rxrpc_conn_put_call); rxrpc_put_peer(call->peer, rxrpc_peer_put_call); kmem_cache_free(rxrpc_call_jar, call); if (atomic_dec_and_test(&rxnet->nr_calls)) diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 9444da235a48..dcfec6a45255 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -211,9 +211,8 @@ rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle, gfp_t gfp) rxrpc_get_local(conn->local, rxrpc_local_get_client_conn); key_get(conn->key); - trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_client, - refcount_read(&conn->ref), - __builtin_return_address(0)); + trace_rxrpc_conn(conn->debug_id, refcount_read(&conn->ref), + rxrpc_conn_new_client); atomic_inc(&rxnet->nr_client_conns); trace_rxrpc_client(conn, -1, rxrpc_client_alloc); @@ -467,10 +466,10 @@ static void rxrpc_add_conn_to_bundle(struct rxrpc_bundle *bundle, gfp_t gfp) if (candidate) { _debug("discard C=%x", candidate->debug_id); trace_rxrpc_client(candidate, -1, rxrpc_client_duplicate); - rxrpc_put_connection(candidate); + rxrpc_put_connection(candidate, rxrpc_conn_put_discard); } - rxrpc_put_connection(old); + rxrpc_put_connection(old, rxrpc_conn_put_noreuse); _leave(""); } @@ -544,7 +543,7 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, rxrpc_see_call(call); list_del_init(&call->chan_wait_link); call->peer = rxrpc_get_peer(conn->peer, rxrpc_peer_get_activate_call); - call->conn = rxrpc_get_connection(conn); + call->conn = rxrpc_get_connection(conn, rxrpc_conn_get_activate_call); call->cid = conn->proto.cid | channel; call->call_id = call_id; call->security = conn->security; @@ -592,7 +591,7 @@ static void rxrpc_unidle_conn(struct rxrpc_bundle *bundle, struct rxrpc_connecti } spin_unlock(&rxnet->client_conn_cache_lock); if (drop_ref) - rxrpc_put_connection(conn); + rxrpc_put_connection(conn, rxrpc_conn_put_unidle); } } @@ -896,7 +895,7 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call trace_rxrpc_client(conn, channel, rxrpc_client_to_idle); conn->idle_timestamp = jiffies; - rxrpc_get_connection(conn); + rxrpc_get_connection(conn, rxrpc_conn_get_idle); spin_lock(&rxnet->client_conn_cache_lock); list_move_tail(&conn->cache_link, &rxnet->idle_client_conns); spin_unlock(&rxnet->client_conn_cache_lock); @@ -938,7 +937,7 @@ static void rxrpc_unbundle_conn(struct rxrpc_connection *conn) if (need_drop) { rxrpc_deactivate_bundle(bundle); - rxrpc_put_connection(conn); + rxrpc_put_connection(conn, rxrpc_conn_put_unbundle); } } @@ -983,15 +982,15 @@ static void rxrpc_kill_client_conn(struct rxrpc_connection *conn) /* * Clean up a dead client connections. */ -void rxrpc_put_client_conn(struct rxrpc_connection *conn) +void rxrpc_put_client_conn(struct rxrpc_connection *conn, + enum rxrpc_conn_trace why) { - const void *here = __builtin_return_address(0); unsigned int debug_id = conn->debug_id; bool dead; int r; dead = __refcount_dec_and_test(&conn->ref, &r); - trace_rxrpc_conn(debug_id, rxrpc_conn_put_client, r - 1, here); + trace_rxrpc_conn(debug_id, r - 1, why); if (dead) rxrpc_kill_client_conn(conn); } @@ -1063,7 +1062,8 @@ next: spin_unlock(&rxnet->client_conn_cache_lock); rxrpc_unbundle_conn(conn); - rxrpc_put_connection(conn); /* Drop the ->cache_link ref */ + /* Drop the ->cache_link ref */ + rxrpc_put_connection(conn, rxrpc_conn_put_discard_idle); nr_conns--; goto next; @@ -1134,7 +1134,7 @@ void rxrpc_clean_up_local_conns(struct rxrpc_local *local) struct rxrpc_connection, cache_link); list_del_init(&conn->cache_link); rxrpc_unbundle_conn(conn); - rxrpc_put_connection(conn); + rxrpc_put_connection(conn, rxrpc_conn_put_local_dead); } _leave(" [culled]"); diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 225edaf019f1..817f895c77ca 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -472,14 +472,14 @@ void rxrpc_process_connection(struct work_struct *work) struct rxrpc_connection *conn = container_of(work, struct rxrpc_connection, processor); - rxrpc_see_connection(conn); + rxrpc_see_connection(conn, rxrpc_conn_see_work); if (__rxrpc_use_local(conn->local, rxrpc_local_use_conn_work)) { rxrpc_do_process_connection(conn); rxrpc_unuse_local(conn->local, rxrpc_local_unuse_conn_work); } - rxrpc_put_connection(conn); + rxrpc_put_connection(conn, rxrpc_conn_put_work); _leave(""); return; } diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 554ee5dd3325..bbace8d9953d 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -26,7 +26,7 @@ static void rxrpc_connection_timer(struct timer_list *timer) struct rxrpc_connection *conn = container_of(timer, struct rxrpc_connection, timer); - rxrpc_queue_conn(conn); + rxrpc_queue_conn(conn, rxrpc_conn_queue_timer); } /* @@ -260,43 +260,42 @@ void rxrpc_kill_connection(struct rxrpc_connection *conn) * Queue a connection's work processor, getting a ref to pass to the work * queue. */ -bool rxrpc_queue_conn(struct rxrpc_connection *conn) +bool rxrpc_queue_conn(struct rxrpc_connection *conn, enum rxrpc_conn_trace why) { - const void *here = __builtin_return_address(0); int r; if (!__refcount_inc_not_zero(&conn->ref, &r)) return false; if (rxrpc_queue_work(&conn->processor)) - trace_rxrpc_conn(conn->debug_id, rxrpc_conn_queued, r + 1, here); + trace_rxrpc_conn(conn->debug_id, why, r + 1); else - rxrpc_put_connection(conn); + rxrpc_put_connection(conn, rxrpc_conn_put_already_queued); return true; } /* * Note the re-emergence of a connection. */ -void rxrpc_see_connection(struct rxrpc_connection *conn) +void rxrpc_see_connection(struct rxrpc_connection *conn, + enum rxrpc_conn_trace why) { - const void *here = __builtin_return_address(0); if (conn) { - int n = refcount_read(&conn->ref); + int r = refcount_read(&conn->ref); - trace_rxrpc_conn(conn->debug_id, rxrpc_conn_seen, n, here); + trace_rxrpc_conn(conn->debug_id, r, why); } } /* * Get a ref on a connection. */ -struct rxrpc_connection *rxrpc_get_connection(struct rxrpc_connection *conn) +struct rxrpc_connection *rxrpc_get_connection(struct rxrpc_connection *conn, + enum rxrpc_conn_trace why) { - const void *here = __builtin_return_address(0); int r; __refcount_inc(&conn->ref, &r); - trace_rxrpc_conn(conn->debug_id, rxrpc_conn_got, r, here); + trace_rxrpc_conn(conn->debug_id, r + 1, why); return conn; } @@ -304,14 +303,14 @@ struct rxrpc_connection *rxrpc_get_connection(struct rxrpc_connection *conn) * Try to get a ref on a connection. */ struct rxrpc_connection * -rxrpc_get_connection_maybe(struct rxrpc_connection *conn) +rxrpc_get_connection_maybe(struct rxrpc_connection *conn, + enum rxrpc_conn_trace why) { - const void *here = __builtin_return_address(0); int r; if (conn) { if (__refcount_inc_not_zero(&conn->ref, &r)) - trace_rxrpc_conn(conn->debug_id, rxrpc_conn_got, r + 1, here); + trace_rxrpc_conn(conn->debug_id, r + 1, why); else conn = NULL; } @@ -331,14 +330,14 @@ static void rxrpc_set_service_reap_timer(struct rxrpc_net *rxnet, /* * Release a service connection */ -void rxrpc_put_service_conn(struct rxrpc_connection *conn) +void rxrpc_put_service_conn(struct rxrpc_connection *conn, + enum rxrpc_conn_trace why) { - const void *here = __builtin_return_address(0); unsigned int debug_id = conn->debug_id; int r; __refcount_dec(&conn->ref, &r); - trace_rxrpc_conn(debug_id, rxrpc_conn_put_service, r - 1, here); + trace_rxrpc_conn(debug_id, r - 1, why); if (r - 1 == 1) rxrpc_set_service_reap_timer(conn->local->rxnet, jiffies + rxrpc_connection_expiry); @@ -354,6 +353,9 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu) _enter("{%d,u=%d}", conn->debug_id, refcount_read(&conn->ref)); + trace_rxrpc_conn(conn->debug_id, refcount_read(&conn->ref), + rxrpc_conn_free); + ASSERTCMP(refcount_read(&conn->ref), ==, 0); del_timer_sync(&conn->timer); @@ -419,7 +421,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work) */ if (!refcount_dec_if_one(&conn->ref)) continue; - trace_rxrpc_conn(conn->debug_id, rxrpc_conn_reap_service, 0, NULL); + rxrpc_see_connection(conn, rxrpc_conn_see_reap_service); if (rxrpc_conn_is_client(conn)) BUG(); diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index a3b91864ef21..bf087213bd4d 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -141,9 +141,7 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn list_add_tail(&conn->proc_link, &rxnet->conn_proc_list); write_unlock(&rxnet->conn_lock); - trace_rxrpc_conn(conn->debug_id, rxrpc_conn_new_service, - refcount_read(&conn->ref), - __builtin_return_address(0)); + rxrpc_see_connection(conn, rxrpc_conn_new_service); } return conn; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index cecfd201d832..c8ff7489b412 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1121,7 +1121,7 @@ static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, _enter("%p,%p", conn, skb); skb_queue_tail(&conn->rx_queue, skb); - rxrpc_queue_conn(conn); + rxrpc_queue_conn(conn, rxrpc_conn_queue_rx_work); } /* -- cgit v1.2.3-70-g09d2 From cb0fc0c9722c0c001510e2a6d9b0a78b80421487 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 Oct 2022 14:39:26 +0100 Subject: rxrpc: trace: Don't use __builtin_return_address for rxrpc_call tracing In rxrpc tracing, use enums to generate lists of points of interest rather than __builtin_return_address() for the rxrpc_call tracepoint Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 83 ++++++++++++++++++++--------------- net/rxrpc/ar-internal.h | 8 ++-- net/rxrpc/call_accept.c | 16 +++---- net/rxrpc/call_event.c | 8 ++-- net/rxrpc/call_object.c | 102 +++++++++++++++++++------------------------ net/rxrpc/conn_client.c | 2 +- net/rxrpc/input.c | 8 ++-- net/rxrpc/output.c | 2 +- net/rxrpc/peer_event.c | 2 +- net/rxrpc/recvmsg.c | 8 ++-- net/rxrpc/sendmsg.c | 4 +- 11 files changed, 121 insertions(+), 122 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index e09568a8c173..3f6de4294148 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -127,26 +127,44 @@ E_(rxrpc_client_to_idle, "->Idle") #define rxrpc_call_traces \ - EM(rxrpc_call_connected, "CON") \ - EM(rxrpc_call_error, "*E*") \ - EM(rxrpc_call_got, "GOT") \ - EM(rxrpc_call_got_kernel, "Gke") \ - EM(rxrpc_call_got_timer, "GTM") \ - EM(rxrpc_call_got_tx, "Gtx") \ - EM(rxrpc_call_got_userid, "Gus") \ - EM(rxrpc_call_new_client, "NWc") \ - EM(rxrpc_call_new_service, "NWs") \ - EM(rxrpc_call_put, "PUT") \ - EM(rxrpc_call_put_kernel, "Pke") \ - EM(rxrpc_call_put_noqueue, "PnQ") \ - EM(rxrpc_call_put_notimer, "PnT") \ - EM(rxrpc_call_put_timer, "PTM") \ - EM(rxrpc_call_put_tx, "Ptx") \ - EM(rxrpc_call_put_userid, "Pus") \ - EM(rxrpc_call_queued, "QUE") \ - EM(rxrpc_call_queued_ref, "QUR") \ - EM(rxrpc_call_release, "RLS") \ - E_(rxrpc_call_seen, "SEE") + EM(rxrpc_call_get_input, "GET input ") \ + EM(rxrpc_call_get_kernel_service, "GET krnl-srv") \ + EM(rxrpc_call_get_notify_socket, "GET notify ") \ + EM(rxrpc_call_get_recvmsg, "GET recvmsg ") \ + EM(rxrpc_call_get_release_sock, "GET rel-sock") \ + EM(rxrpc_call_get_sendmsg, "GET sendmsg ") \ + EM(rxrpc_call_get_send_ack, "GET send-ack") \ + EM(rxrpc_call_get_timer, "GET timer ") \ + EM(rxrpc_call_get_userid, "GET user-id ") \ + EM(rxrpc_call_new_client, "NEW client ") \ + EM(rxrpc_call_new_prealloc_service, "NEW prealloc") \ + EM(rxrpc_call_put_already_queued, "PUT alreadyq") \ + EM(rxrpc_call_put_discard_prealloc, "PUT disc-pre") \ + EM(rxrpc_call_put_input, "PUT input ") \ + EM(rxrpc_call_put_kernel, "PUT kernel ") \ + EM(rxrpc_call_put_recvmsg, "PUT recvmsg ") \ + EM(rxrpc_call_put_release_sock, "PUT rls-sock") \ + EM(rxrpc_call_put_release_sock_tba, "PUT rls-sk-a") \ + EM(rxrpc_call_put_send_ack, "PUT send-ack") \ + EM(rxrpc_call_put_sendmsg, "PUT sendmsg ") \ + EM(rxrpc_call_put_timer, "PUT timer ") \ + EM(rxrpc_call_put_timer_already, "PUT timer-al") \ + EM(rxrpc_call_put_unnotify, "PUT unnotify") \ + EM(rxrpc_call_put_userid_exists, "PUT u-exists") \ + EM(rxrpc_call_put_work, "PUT work ") \ + EM(rxrpc_call_queue_abort, "QUE abort ") \ + EM(rxrpc_call_queue_requeue, "QUE requeue ") \ + EM(rxrpc_call_queue_resend, "QUE resend ") \ + EM(rxrpc_call_queue_timer, "QUE timer ") \ + EM(rxrpc_call_see_accept, "SEE accept ") \ + EM(rxrpc_call_see_activate_client, "SEE act-clnt") \ + EM(rxrpc_call_see_connect_failed, "SEE con-fail") \ + EM(rxrpc_call_see_connected, "SEE connect ") \ + EM(rxrpc_call_see_distribute_error, "SEE dist-err") \ + EM(rxrpc_call_see_input, "SEE input ") \ + EM(rxrpc_call_see_release, "SEE release ") \ + EM(rxrpc_call_see_userid_exists, "SEE u-exists") \ + E_(rxrpc_call_see_zap, "SEE zap ") #define rxrpc_txqueue_traces \ EM(rxrpc_txqueue_await_reply, "AWR") \ @@ -503,32 +521,29 @@ TRACE_EVENT(rxrpc_client, ); TRACE_EVENT(rxrpc_call, - TP_PROTO(unsigned int call_debug_id, enum rxrpc_call_trace op, - int usage, const void *where, const void *aux), + TP_PROTO(unsigned int call_debug_id, int ref, unsigned long aux, + enum rxrpc_call_trace why), - TP_ARGS(call_debug_id, op, usage, where, aux), + TP_ARGS(call_debug_id, ref, aux, why), TP_STRUCT__entry( __field(unsigned int, call ) - __field(int, op ) - __field(int, usage ) - __field(const void *, where ) - __field(const void *, aux ) + __field(int, ref ) + __field(int, why ) + __field(unsigned long, aux ) ), TP_fast_assign( __entry->call = call_debug_id; - __entry->op = op; - __entry->usage = usage; - __entry->where = where; + __entry->ref = ref; + __entry->why = why; __entry->aux = aux; ), - TP_printk("c=%08x %s u=%d sp=%pSR a=%p", + TP_printk("c=%08x %s r=%d a=%lx", __entry->call, - __print_symbolic(__entry->op, rxrpc_call_traces), - __entry->usage, - __entry->where, + __print_symbolic(__entry->why, rxrpc_call_traces), + __entry->ref, __entry->aux) ); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index bc8281c410c5..82eb09b961a0 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -847,10 +847,10 @@ void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *, struct sk_buff *); void rxrpc_release_call(struct rxrpc_sock *, struct rxrpc_call *); void rxrpc_release_calls_on_socket(struct rxrpc_sock *); -bool __rxrpc_queue_call(struct rxrpc_call *); -bool rxrpc_queue_call(struct rxrpc_call *); -void rxrpc_see_call(struct rxrpc_call *); -bool rxrpc_try_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op); +bool __rxrpc_queue_call(struct rxrpc_call *, enum rxrpc_call_trace); +bool rxrpc_queue_call(struct rxrpc_call *, enum rxrpc_call_trace); +void rxrpc_see_call(struct rxrpc_call *, enum rxrpc_call_trace); +bool rxrpc_try_get_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_put_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_cleanup_call(struct rxrpc_call *); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 04b52e28e0cc..dd4ca4bee77f 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -38,7 +38,6 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, unsigned long user_call_ID, gfp_t gfp, unsigned int debug_id) { - const void *here = __builtin_return_address(0); struct rxrpc_call *call, *xcall; struct rxrpc_net *rxnet = rxrpc_net(sock_net(&rx->sk)); struct rb_node *parent, **pp; @@ -102,9 +101,8 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, call->flags |= (1 << RXRPC_CALL_IS_SERVICE); call->state = RXRPC_CALL_SERVER_PREALLOC; - trace_rxrpc_call(call->debug_id, rxrpc_call_new_service, - refcount_read(&call->ref), - here, (const void *)user_call_ID); + trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), + user_call_ID, rxrpc_call_new_prealloc_service); write_lock(&rx->call_lock); @@ -125,11 +123,11 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, call->user_call_ID = user_call_ID; call->notify_rx = notify_rx; if (user_attach_call) { - rxrpc_get_call(call, rxrpc_call_got_kernel); + rxrpc_get_call(call, rxrpc_call_get_kernel_service); user_attach_call(call, user_call_ID); } - rxrpc_get_call(call, rxrpc_call_got_userid); + rxrpc_get_call(call, rxrpc_call_get_userid); rb_link_node(&call->sock_node, parent, pp); rb_insert_color(&call->sock_node, &rx->calls); set_bit(RXRPC_CALL_HAS_USERID, &call->flags); @@ -229,7 +227,7 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) } rxrpc_call_completed(call); rxrpc_release_call(rx, call); - rxrpc_put_call(call, rxrpc_call_put); + rxrpc_put_call(call, rxrpc_call_put_discard_prealloc); tail = (tail + 1) & (size - 1); } @@ -318,7 +316,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, smp_store_release(&b->call_backlog_tail, (call_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); - rxrpc_see_call(call); + rxrpc_see_call(call, rxrpc_call_see_accept); call->conn = conn; call->security = conn->security; call->security_ix = conn->security_ix; @@ -430,7 +428,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, * (recvmsg queue, to-be-accepted queue or user ID tree) or the kernel * service to prevent the call from being deallocated too early. */ - rxrpc_put_call(call, rxrpc_call_put); + rxrpc_put_call(call, rxrpc_call_put_discard_prealloc); _leave(" = %p{%d}", call, call->debug_id); return call; diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 591af8e2e3d0..0c8d2186cda8 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -101,7 +101,7 @@ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, txb->ack.reason = ack_reason; txb->ack.nAcks = 0; - if (!rxrpc_try_get_call(call, rxrpc_call_got)) { + if (!rxrpc_try_get_call(call, rxrpc_call_get_send_ack)) { rxrpc_put_txbuf(txb, rxrpc_txbuf_put_nomem); return; } @@ -302,7 +302,7 @@ void rxrpc_process_call(struct work_struct *work) unsigned int iterations = 0; rxrpc_serial_t ackr_serial; - rxrpc_see_call(call); + rxrpc_see_call(call, rxrpc_call_see_input); //printk("\n--------------------\n"); _enter("{%d,%s,%lx}", @@ -436,12 +436,12 @@ recheck_state: goto requeue; out_put: - rxrpc_put_call(call, rxrpc_call_put); + rxrpc_put_call(call, rxrpc_call_put_work); out: _leave(""); return; requeue: - __rxrpc_queue_call(call); + __rxrpc_queue_call(call, rxrpc_call_queue_requeue); goto out; } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 29ec4013aa0b..afd957f6dc1c 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -53,9 +53,9 @@ static void rxrpc_call_timer_expired(struct timer_list *t) if (call->state < RXRPC_CALL_COMPLETE) { trace_rxrpc_timer_expired(call, jiffies); - __rxrpc_queue_call(call); + __rxrpc_queue_call(call, rxrpc_call_queue_timer); } else { - rxrpc_put_call(call, rxrpc_call_put); + rxrpc_put_call(call, rxrpc_call_put_already_queued); } } @@ -64,10 +64,10 @@ void rxrpc_reduce_call_timer(struct rxrpc_call *call, unsigned long now, enum rxrpc_timer_trace why) { - if (rxrpc_try_get_call(call, rxrpc_call_got_timer)) { + if (rxrpc_try_get_call(call, rxrpc_call_get_timer)) { trace_rxrpc_timer(call, why, now); if (timer_reduce(&call->timer, expire_at)) - rxrpc_put_call(call, rxrpc_call_put_notimer); + rxrpc_put_call(call, rxrpc_call_put_timer_already); } } @@ -110,7 +110,7 @@ struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *rx, return NULL; found_extant_call: - rxrpc_get_call(call, rxrpc_call_got); + rxrpc_get_call(call, rxrpc_call_get_sendmsg); read_unlock(&rx->call_lock); _leave(" = %p [%d]", call, refcount_read(&call->ref)); return call; @@ -270,7 +270,6 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, struct rxrpc_net *rxnet; struct semaphore *limiter; struct rb_node *parent, **pp; - const void *here = __builtin_return_address(0); int ret; _enter("%p,%lx", rx, p->user_call_ID); @@ -291,9 +290,8 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, call->interruptibility = p->interruptibility; call->tx_total_len = p->tx_total_len; - trace_rxrpc_call(call->debug_id, rxrpc_call_new_client, - refcount_read(&call->ref), - here, (const void *)p->user_call_ID); + trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), + p->user_call_ID, rxrpc_call_new_client); if (p->kernel) __set_bit(RXRPC_CALL_KERNEL, &call->flags); @@ -322,7 +320,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, rcu_assign_pointer(call->socket, rx); call->user_call_ID = p->user_call_ID; __set_bit(RXRPC_CALL_HAS_USERID, &call->flags); - rxrpc_get_call(call, rxrpc_call_got_userid); + rxrpc_get_call(call, rxrpc_call_get_userid); rb_link_node(&call->sock_node, parent, pp); rb_insert_color(&call->sock_node, &rx->calls); list_add(&call->sock_link, &rx->sock_calls); @@ -344,8 +342,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, if (ret < 0) goto error_attached_to_socket; - trace_rxrpc_call(call->debug_id, rxrpc_call_connected, - refcount_read(&call->ref), here, NULL); + rxrpc_see_call(call, rxrpc_call_see_connected); rxrpc_start_call_timer(call); @@ -362,11 +359,11 @@ error_dup_user_ID: release_sock(&rx->sk); __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, RX_CALL_DEAD, -EEXIST); - trace_rxrpc_call(call->debug_id, rxrpc_call_error, - refcount_read(&call->ref), here, ERR_PTR(-EEXIST)); + trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), 0, + rxrpc_call_see_userid_exists); rxrpc_release_call(rx, call); mutex_unlock(&call->user_mutex); - rxrpc_put_call(call, rxrpc_call_put); + rxrpc_put_call(call, rxrpc_call_put_userid_exists); _leave(" = -EEXIST"); return ERR_PTR(-EEXIST); @@ -376,8 +373,8 @@ error_dup_user_ID: * leave the error to recvmsg() to deal with. */ error_attached_to_socket: - trace_rxrpc_call(call->debug_id, rxrpc_call_error, - refcount_read(&call->ref), here, ERR_PTR(ret)); + trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), ret, + rxrpc_call_see_connect_failed); set_bit(RXRPC_CALL_DISCONNECTED, &call->flags); __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, RX_CALL_DEAD, ret); @@ -428,72 +425,65 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, /* * Queue a call's work processor, getting a ref to pass to the work queue. */ -bool rxrpc_queue_call(struct rxrpc_call *call) +bool rxrpc_queue_call(struct rxrpc_call *call, enum rxrpc_call_trace why) { - const void *here = __builtin_return_address(0); int n; if (!__refcount_inc_not_zero(&call->ref, &n)) return false; if (rxrpc_queue_work(&call->processor)) - trace_rxrpc_call(call->debug_id, rxrpc_call_queued, n + 1, - here, NULL); + trace_rxrpc_call(call->debug_id, n + 1, 0, why); else - rxrpc_put_call(call, rxrpc_call_put_noqueue); + rxrpc_put_call(call, rxrpc_call_put_already_queued); return true; } /* * Queue a call's work processor, passing the callers ref to the work queue. */ -bool __rxrpc_queue_call(struct rxrpc_call *call) +bool __rxrpc_queue_call(struct rxrpc_call *call, enum rxrpc_call_trace why) { - const void *here = __builtin_return_address(0); int n = refcount_read(&call->ref); + ASSERTCMP(n, >=, 1); if (rxrpc_queue_work(&call->processor)) - trace_rxrpc_call(call->debug_id, rxrpc_call_queued_ref, n, - here, NULL); + trace_rxrpc_call(call->debug_id, n, 0, why); else - rxrpc_put_call(call, rxrpc_call_put_noqueue); + rxrpc_put_call(call, rxrpc_call_put_already_queued); return true; } /* * Note the re-emergence of a call. */ -void rxrpc_see_call(struct rxrpc_call *call) +void rxrpc_see_call(struct rxrpc_call *call, enum rxrpc_call_trace why) { - const void *here = __builtin_return_address(0); if (call) { - int n = refcount_read(&call->ref); + int r = refcount_read(&call->ref); - trace_rxrpc_call(call->debug_id, rxrpc_call_seen, n, - here, NULL); + trace_rxrpc_call(call->debug_id, r, 0, why); } } -bool rxrpc_try_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op) +bool rxrpc_try_get_call(struct rxrpc_call *call, enum rxrpc_call_trace why) { - const void *here = __builtin_return_address(0); - int n; + int r; - if (!__refcount_inc_not_zero(&call->ref, &n)) + if (!__refcount_inc_not_zero(&call->ref, &r)) return false; - trace_rxrpc_call(call->debug_id, op, n + 1, here, NULL); + trace_rxrpc_call(call->debug_id, r + 1, 0, why); return true; } /* * Note the addition of a ref on a call. */ -void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace op) +void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace why) { - const void *here = __builtin_return_address(0); - int n; + int r; - __refcount_inc(&call->ref, &n); - trace_rxrpc_call(call->debug_id, op, n + 1, here, NULL); + __refcount_inc(&call->ref, &r); + trace_rxrpc_call(call->debug_id, r + 1, 0, why); } /* @@ -510,15 +500,13 @@ static void rxrpc_cleanup_ring(struct rxrpc_call *call) */ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) { - const void *here = __builtin_return_address(0); struct rxrpc_connection *conn = call->conn; bool put = false; _enter("{%d,%d}", call->debug_id, refcount_read(&call->ref)); - trace_rxrpc_call(call->debug_id, rxrpc_call_release, - refcount_read(&call->ref), - here, (const void *)call->flags); + trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), + call->flags, rxrpc_call_see_release); ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); @@ -544,14 +532,14 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) write_unlock_bh(&rx->recvmsg_lock); if (put) - rxrpc_put_call(call, rxrpc_call_put); + rxrpc_put_call(call, rxrpc_call_put_unnotify); write_lock(&rx->call_lock); if (test_and_clear_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { rb_erase(&call->sock_node, &rx->calls); memset(&call->sock_node, 0xdd, sizeof(call->sock_node)); - rxrpc_put_call(call, rxrpc_call_put_userid); + rxrpc_put_call(call, rxrpc_call_put_userid_exists); } list_del(&call->sock_link); @@ -580,17 +568,17 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) struct rxrpc_call, accept_link); list_del(&call->accept_link); rxrpc_abort_call("SKR", call, 0, RX_CALL_DEAD, -ECONNRESET); - rxrpc_put_call(call, rxrpc_call_put); + rxrpc_put_call(call, rxrpc_call_put_release_sock_tba); } while (!list_empty(&rx->sock_calls)) { call = list_entry(rx->sock_calls.next, struct rxrpc_call, sock_link); - rxrpc_get_call(call, rxrpc_call_got); + rxrpc_get_call(call, rxrpc_call_get_release_sock); rxrpc_abort_call("SKT", call, 0, RX_CALL_DEAD, -ECONNRESET); rxrpc_send_abort_packet(call); rxrpc_release_call(rx, call); - rxrpc_put_call(call, rxrpc_call_put); + rxrpc_put_call(call, rxrpc_call_put_release_sock); } _leave(""); @@ -599,20 +587,18 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) /* * release a call */ -void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace op) +void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace why) { struct rxrpc_net *rxnet = call->rxnet; - const void *here = __builtin_return_address(0); unsigned int debug_id = call->debug_id; bool dead; - int n; + int r; ASSERT(call != NULL); - dead = __refcount_dec_and_test(&call->ref, &n); - trace_rxrpc_call(debug_id, op, n, here, NULL); + dead = __refcount_dec_and_test(&call->ref, &r); + trace_rxrpc_call(debug_id, r - 1, 0, why); if (dead) { - _debug("call %d dead", call->debug_id); ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); if (!list_empty(&call->link)) { @@ -701,7 +687,7 @@ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet) struct rxrpc_call, link); _debug("Zapping call %p", call); - rxrpc_see_call(call); + rxrpc_see_call(call, rxrpc_call_see_zap); list_del_init(&call->link); pr_err("Call %p still in use (%d,%s,%lx,%lx)!\n", diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index dcfec6a45255..4352e777aa2a 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -540,7 +540,7 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, clear_bit(RXRPC_CONN_FINAL_ACK_0 + channel, &conn->flags); clear_bit(conn->bundle_shift + channel, &bundle->avail_chans); - rxrpc_see_call(call); + rxrpc_see_call(call, rxrpc_call_see_activate_client); list_del_init(&call->chan_wait_link); call->peer = rxrpc_get_peer(conn->peer, rxrpc_peer_get_activate_call); call->conn = rxrpc_get_connection(conn, rxrpc_conn_get_activate_call); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index c8ff7489b412..09b44cd11c9b 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -14,7 +14,7 @@ static void rxrpc_proto_abort(const char *why, { if (rxrpc_abort_call(why, call, seq, RX_PROTOCOL_ERROR, -EBADMSG)) { set_bit(RXRPC_CALL_EV_ABORT, &call->events); - rxrpc_queue_call(call); + rxrpc_queue_call(call, rxrpc_call_queue_abort); } } @@ -175,7 +175,7 @@ out_no_clear_ca: call->cong_cumul_acks = cumulative_acks; trace_rxrpc_congest(call, summary, acked_serial, change); if (resend && !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) - rxrpc_queue_call(call); + rxrpc_queue_call(call, rxrpc_call_queue_resend); return; packet_loss_detected: @@ -678,7 +678,7 @@ static void rxrpc_input_check_for_lost_ack(struct rxrpc_call *call) { if (after(call->acks_lost_top, call->acks_prev_seq) && !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) - rxrpc_queue_call(call); + rxrpc_queue_call(call, rxrpc_call_queue_resend); } /* @@ -1099,7 +1099,7 @@ static void rxrpc_input_implicit_end_call(struct rxrpc_sock *rx, default: if (rxrpc_abort_call("IMP", call, 0, RX_CALL_DEAD, -ESHUTDOWN)) { set_bit(RXRPC_CALL_EV_ABORT, &call->events); - rxrpc_queue_call(call); + rxrpc_queue_call(call, rxrpc_call_queue_abort); } trace_rxrpc_improper_term(call); break; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 2762b7ada9ae..d324e88f7642 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -310,7 +310,7 @@ void rxrpc_transmit_ack_packets(struct rxrpc_local *local) } list_del_init(&txb->tx_link); - rxrpc_put_call(txb->call, rxrpc_call_put); + rxrpc_put_call(txb->call, rxrpc_call_put_send_ack); rxrpc_put_txbuf(txb, rxrpc_txbuf_put_ack_tx); } } diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 5e97d321ac38..b28739d10927 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -238,7 +238,7 @@ static void rxrpc_distribute_error(struct rxrpc_peer *peer, int error, struct rxrpc_call *call; hlist_for_each_entry_rcu(call, &peer->error_targets, error_link) { - rxrpc_see_call(call); + rxrpc_see_call(call, rxrpc_call_see_distribute_error); rxrpc_set_call_completion(call, compl, 0, -error); } } diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 134122f5961a..c84d2b620396 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -42,7 +42,7 @@ void rxrpc_notify_socket(struct rxrpc_call *call) } else { write_lock_bh(&rx->recvmsg_lock); if (list_empty(&call->recvmsg_link)) { - rxrpc_get_call(call, rxrpc_call_got); + rxrpc_get_call(call, rxrpc_call_get_notify_socket); list_add_tail(&call->recvmsg_link, &rx->recvmsg_q); } write_unlock_bh(&rx->recvmsg_lock); @@ -451,7 +451,7 @@ try_again: if (!(flags & MSG_PEEK)) list_del_init(&call->recvmsg_link); else - rxrpc_get_call(call, rxrpc_call_got); + rxrpc_get_call(call, rxrpc_call_get_recvmsg); write_unlock_bh(&rx->recvmsg_lock); trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0); @@ -537,7 +537,7 @@ try_again: error_unlock_call: mutex_unlock(&call->user_mutex); - rxrpc_put_call(call, rxrpc_call_put); + rxrpc_put_call(call, rxrpc_call_put_recvmsg); trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, ret); return ret; @@ -548,7 +548,7 @@ error_requeue_call: write_unlock_bh(&rx->recvmsg_lock); trace_rxrpc_recvmsg(call, rxrpc_recvmsg_requeue, 0); } else { - rxrpc_put_call(call, rxrpc_call_put); + rxrpc_put_call(call, rxrpc_call_put_recvmsg); } error_no_call: release_sock(&rx->sk); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index cfe0badba0b3..76b1e2e89c1e 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -667,7 +667,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) case RXRPC_CALL_CLIENT_AWAIT_CONN: case RXRPC_CALL_SERVER_PREALLOC: case RXRPC_CALL_SERVER_SECURING: - rxrpc_put_call(call, rxrpc_call_put); + rxrpc_put_call(call, rxrpc_call_put_sendmsg); ret = -EBUSY; goto error_release_sock; default: @@ -737,7 +737,7 @@ out_put_unlock: if (!dropped_lock) mutex_unlock(&call->user_mutex); error_put: - rxrpc_put_call(call, rxrpc_call_put); + rxrpc_put_call(call, rxrpc_call_put_sendmsg); _leave(" = %d", ret); return ret; -- cgit v1.2.3-70-g09d2 From fa3492abb64b93b2b5d6fdf7a5b687a1fa810d74 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 2 Nov 2022 21:54:46 +0000 Subject: rxrpc: Trace rxrpc_bundle refcount Add a tracepoint for the rxrpc_bundle refcounting. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 34 ++++++++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 4 ++-- net/rxrpc/conn_client.c | 27 ++++++++++++++++----------- net/rxrpc/conn_object.c | 2 +- net/rxrpc/conn_service.c | 3 ++- 5 files changed, 55 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 3f6de4294148..6f5be7ac7f6b 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -81,6 +81,15 @@ EM(rxrpc_peer_put_input_error, "PUT inpt-err") \ E_(rxrpc_peer_put_keepalive, "PUT keepaliv") +#define rxrpc_bundle_traces \ + EM(rxrpc_bundle_free, "FREE ") \ + EM(rxrpc_bundle_get_client_call, "GET clt-call") \ + EM(rxrpc_bundle_get_client_conn, "GET clt-conn") \ + EM(rxrpc_bundle_get_service_conn, "GET svc-conn") \ + EM(rxrpc_bundle_put_conn, "PUT conn ") \ + EM(rxrpc_bundle_put_discard, "PUT discard ") \ + E_(rxrpc_bundle_new, "NEW ") + #define rxrpc_conn_traces \ EM(rxrpc_conn_free, "FREE ") \ EM(rxrpc_conn_get_activate_call, "GET act-call") \ @@ -361,6 +370,7 @@ #define EM(a, b) a, #define E_(a, b) a +enum rxrpc_bundle_trace { rxrpc_bundle_traces } __mode(byte); enum rxrpc_call_trace { rxrpc_call_traces } __mode(byte); enum rxrpc_client_trace { rxrpc_client_traces } __mode(byte); enum rxrpc_congest_change { rxrpc_congest_changes } __mode(byte); @@ -390,6 +400,7 @@ enum rxrpc_txqueue_trace { rxrpc_txqueue_traces } __mode(byte); #define EM(a, b) TRACE_DEFINE_ENUM(a); #define E_(a, b) TRACE_DEFINE_ENUM(a); +rxrpc_bundle_traces; rxrpc_call_traces; rxrpc_client_traces; rxrpc_congest_changes; @@ -467,6 +478,29 @@ TRACE_EVENT(rxrpc_peer, __entry->ref) ); +TRACE_EVENT(rxrpc_bundle, + TP_PROTO(unsigned int bundle_debug_id, int ref, enum rxrpc_bundle_trace why), + + TP_ARGS(bundle_debug_id, ref, why), + + TP_STRUCT__entry( + __field(unsigned int, bundle ) + __field(int, ref ) + __field(int, why ) + ), + + TP_fast_assign( + __entry->bundle = bundle_debug_id; + __entry->ref = ref; + __entry->why = why; + ), + + TP_printk("CB=%08x %s r=%d", + __entry->bundle, + __print_symbolic(__entry->why, rxrpc_bundle_traces), + __entry->ref) + ); + TRACE_EVENT(rxrpc_conn, TP_PROTO(unsigned int conn_debug_id, int ref, enum rxrpc_conn_trace why), diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 82eb09b961a0..c588c0e81f63 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -875,8 +875,8 @@ extern unsigned long rxrpc_conn_idle_client_fast_expiry; extern struct idr rxrpc_client_conn_ids; void rxrpc_destroy_client_conn_ids(void); -struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *); -void rxrpc_put_bundle(struct rxrpc_bundle *); +struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *, enum rxrpc_bundle_trace); +void rxrpc_put_bundle(struct rxrpc_bundle *, enum rxrpc_bundle_trace); int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_call *, struct rxrpc_conn_parameters *, struct sockaddr_rxrpc *, gfp_t); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 4352e777aa2a..34ff6fa85c32 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -133,31 +133,36 @@ static struct rxrpc_bundle *rxrpc_alloc_bundle(struct rxrpc_conn_parameters *cp, atomic_set(&bundle->active, 1); spin_lock_init(&bundle->channel_lock); INIT_LIST_HEAD(&bundle->waiting_calls); + trace_rxrpc_bundle(bundle->debug_id, 1, rxrpc_bundle_new); } return bundle; } -struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *bundle) +struct rxrpc_bundle *rxrpc_get_bundle(struct rxrpc_bundle *bundle, + enum rxrpc_bundle_trace why) { - refcount_inc(&bundle->ref); + int r; + + __refcount_inc(&bundle->ref, &r); + trace_rxrpc_bundle(bundle->debug_id, r + 1, why); return bundle; } static void rxrpc_free_bundle(struct rxrpc_bundle *bundle) { + trace_rxrpc_bundle(bundle->debug_id, 1, rxrpc_bundle_free); rxrpc_put_peer(bundle->peer, rxrpc_peer_put_bundle); kfree(bundle); } -void rxrpc_put_bundle(struct rxrpc_bundle *bundle) +void rxrpc_put_bundle(struct rxrpc_bundle *bundle, enum rxrpc_bundle_trace why) { - unsigned int d = bundle->debug_id; + unsigned int id = bundle->debug_id; bool dead; int r; dead = __refcount_dec_and_test(&bundle->ref, &r); - - _debug("PUT B=%x %d", d, r - 1); + trace_rxrpc_bundle(id, r - 1, why); if (dead) rxrpc_free_bundle(bundle); } @@ -206,7 +211,7 @@ rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle, gfp_t gfp) list_add_tail(&conn->proc_link, &rxnet->conn_proc_list); write_unlock(&rxnet->conn_lock); - rxrpc_get_bundle(bundle); + rxrpc_get_bundle(bundle, rxrpc_bundle_get_client_conn); rxrpc_get_peer(conn->peer, rxrpc_peer_get_client_conn); rxrpc_get_local(conn->local, rxrpc_local_get_client_conn); key_get(conn->key); @@ -342,7 +347,7 @@ static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *c candidate->debug_id = atomic_inc_return(&rxrpc_bundle_id); rb_link_node(&candidate->local_node, parent, pp); rb_insert_color(&candidate->local_node, &local->client_bundles); - rxrpc_get_bundle(candidate); + rxrpc_get_bundle(candidate, rxrpc_bundle_get_client_call); spin_unlock(&local->client_bundles_lock); _leave(" = %u [new]", candidate->debug_id); return candidate; @@ -350,7 +355,7 @@ static struct rxrpc_bundle *rxrpc_look_up_bundle(struct rxrpc_conn_parameters *c found_bundle_free: rxrpc_free_bundle(candidate); found_bundle: - rxrpc_get_bundle(bundle); + rxrpc_get_bundle(bundle, rxrpc_bundle_get_client_call); atomic_inc(&bundle->active); spin_unlock(&local->client_bundles_lock); _leave(" = %u [found]", bundle->debug_id); @@ -740,7 +745,7 @@ granted_channel: out_put_bundle: rxrpc_deactivate_bundle(bundle); - rxrpc_put_bundle(bundle); + rxrpc_put_bundle(bundle, rxrpc_bundle_get_client_call); out: _leave(" = %d", ret); return ret; @@ -958,7 +963,7 @@ static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle) spin_unlock(&local->client_bundles_lock); if (need_put) - rxrpc_put_bundle(bundle); + rxrpc_put_bundle(bundle, rxrpc_bundle_put_discard); } } diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index bbace8d9953d..f7c271a740ed 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -363,7 +363,7 @@ static void rxrpc_destroy_connection(struct rcu_head *rcu) conn->security->clear(conn); key_put(conn->key); - rxrpc_put_bundle(conn->bundle); + rxrpc_put_bundle(conn->bundle, rxrpc_bundle_put_conn); rxrpc_put_peer(conn->peer, rxrpc_peer_put_conn); if (atomic_dec_and_test(&conn->local->rxnet->nr_conns)) diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index bf087213bd4d..2c44d67b43dc 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -133,7 +133,8 @@ struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxn */ conn->state = RXRPC_CONN_SERVICE_PREALLOC; refcount_set(&conn->ref, 2); - conn->bundle = rxrpc_get_bundle(&rxrpc_service_dummy_bundle); + conn->bundle = rxrpc_get_bundle(&rxrpc_service_dummy_bundle, + rxrpc_bundle_get_service_conn); atomic_inc(&rxnet->nr_conns); write_lock(&rxnet->conn_lock); -- cgit v1.2.3-70-g09d2 From 9a36a6bc22ca1c0a9d82228171e05dc785fa1154 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 Oct 2022 15:31:21 +0100 Subject: rxrpc: trace: Don't use __builtin_return_address for sk_buff tracing In rxrpc tracing, use enums to generate lists of points of interest rather than __builtin_return_address() for the sk_buff tracepoint. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 57 +++++++++++++++++++++++++------------------- net/rxrpc/call_event.c | 4 ++-- net/rxrpc/call_object.c | 2 +- net/rxrpc/conn_event.c | 6 ++--- net/rxrpc/input.c | 36 ++++++++++++++-------------- net/rxrpc/local_event.c | 4 ++-- net/rxrpc/output.c | 6 ++--- net/rxrpc/peer_event.c | 8 +++---- net/rxrpc/recvmsg.c | 6 ++--- net/rxrpc/skbuff.c | 36 ++++++++++++---------------- 10 files changed, 84 insertions(+), 81 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 6f5be7ac7f6b..5a2292baffc8 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -17,19 +17,31 @@ * Declare tracing information enums and their string mappings for display. */ #define rxrpc_skb_traces \ - EM(rxrpc_skb_ack, "ACK") \ - EM(rxrpc_skb_cleaned, "CLN") \ - EM(rxrpc_skb_cloned_jumbo, "CLJ") \ - EM(rxrpc_skb_freed, "FRE") \ - EM(rxrpc_skb_got, "GOT") \ - EM(rxrpc_skb_lost, "*L*") \ - EM(rxrpc_skb_new, "NEW") \ - EM(rxrpc_skb_purged, "PUR") \ - EM(rxrpc_skb_received, "RCV") \ - EM(rxrpc_skb_rotated, "ROT") \ - EM(rxrpc_skb_seen, "SEE") \ - EM(rxrpc_skb_unshared, "UNS") \ - E_(rxrpc_skb_unshared_nomem, "US0") + EM(rxrpc_skb_eaten_by_unshare, "ETN unshare ") \ + EM(rxrpc_skb_eaten_by_unshare_nomem, "ETN unshar-nm") \ + EM(rxrpc_skb_get_ack, "GET ack ") \ + EM(rxrpc_skb_get_conn_work, "GET conn-work") \ + EM(rxrpc_skb_get_to_recvmsg, "GET to-recv ") \ + EM(rxrpc_skb_get_to_recvmsg_oos, "GET to-recv-o") \ + EM(rxrpc_skb_new_encap_rcv, "NEW encap-rcv") \ + EM(rxrpc_skb_new_error_report, "NEW error-rpt") \ + EM(rxrpc_skb_new_jumbo_subpacket, "NEW jumbo-sub") \ + EM(rxrpc_skb_new_unshared, "NEW unshared ") \ + EM(rxrpc_skb_put_ack, "PUT ack ") \ + EM(rxrpc_skb_put_conn_work, "PUT conn-work") \ + EM(rxrpc_skb_put_error_report, "PUT error-rep") \ + EM(rxrpc_skb_put_input, "PUT input ") \ + EM(rxrpc_skb_put_jumbo_subpacket, "PUT jumbo-sub") \ + EM(rxrpc_skb_put_lose, "PUT lose ") \ + EM(rxrpc_skb_put_purge, "PUT purge ") \ + EM(rxrpc_skb_put_rotate, "PUT rotate ") \ + EM(rxrpc_skb_put_unknown, "PUT unknown ") \ + EM(rxrpc_skb_see_conn_work, "SEE conn-work") \ + EM(rxrpc_skb_see_local_work, "SEE locl-work") \ + EM(rxrpc_skb_see_recvmsg, "SEE recvmsg ") \ + EM(rxrpc_skb_see_reject, "SEE reject ") \ + EM(rxrpc_skb_see_rotate, "SEE rotate ") \ + E_(rxrpc_skb_see_version, "SEE version ") #define rxrpc_local_traces \ EM(rxrpc_local_free, "FREE ") \ @@ -582,33 +594,30 @@ TRACE_EVENT(rxrpc_call, ); TRACE_EVENT(rxrpc_skb, - TP_PROTO(struct sk_buff *skb, enum rxrpc_skb_trace op, - int usage, int mod_count, const void *where), + TP_PROTO(struct sk_buff *skb, int usage, int mod_count, + enum rxrpc_skb_trace why), - TP_ARGS(skb, op, usage, mod_count, where), + TP_ARGS(skb, usage, mod_count, why), TP_STRUCT__entry( __field(struct sk_buff *, skb ) - __field(enum rxrpc_skb_trace, op ) __field(int, usage ) __field(int, mod_count ) - __field(const void *, where ) + __field(enum rxrpc_skb_trace, why ) ), TP_fast_assign( __entry->skb = skb; - __entry->op = op; __entry->usage = usage; __entry->mod_count = mod_count; - __entry->where = where; + __entry->why = why; ), - TP_printk("s=%p Rx %s u=%d m=%d p=%pSR", + TP_printk("s=%p Rx %s u=%d m=%d", __entry->skb, - __print_symbolic(__entry->op, rxrpc_skb_traces), + __print_symbolic(__entry->why, rxrpc_skb_traces), __entry->usage, - __entry->mod_count, - __entry->where) + __entry->mod_count) ); TRACE_EVENT(rxrpc_rx_packet, diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 0c8d2186cda8..29ca02e53c47 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -153,7 +153,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) spin_lock_bh(&call->acks_ack_lock); ack_skb = call->acks_soft_tbl; if (ack_skb) { - rxrpc_get_skb(ack_skb, rxrpc_skb_ack); + rxrpc_get_skb(ack_skb, rxrpc_skb_get_ack); ack = (void *)ack_skb->data + sizeof(struct rxrpc_wire_header); } spin_unlock_bh(&call->acks_ack_lock); @@ -251,7 +251,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) no_further_resend: spin_unlock(&call->tx_lock); no_resend: - rxrpc_free_skb(ack_skb, rxrpc_skb_freed); + rxrpc_free_skb(ack_skb, rxrpc_skb_put_ack); resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(now, oldest))); resend_at += jiffies + rxrpc_get_rto_backoff(call->peer, diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index afd957f6dc1c..815209673115 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -663,7 +663,7 @@ void rxrpc_cleanup_call(struct rxrpc_call *call) rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned); } rxrpc_put_txbuf(call->tx_pending, rxrpc_txbuf_put_cleaned); - rxrpc_free_skb(call->acks_soft_tbl, rxrpc_skb_cleaned); + rxrpc_free_skb(call->acks_soft_tbl, rxrpc_skb_put_ack); call_rcu(&call->rcu, rxrpc_rcu_destroy_call); } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 817f895c77ca..49d885f73fa5 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -437,7 +437,7 @@ static void rxrpc_do_process_connection(struct rxrpc_connection *conn) /* go through the conn-level event packets, releasing the ref on this * connection that each one has when we've finished with it */ while ((skb = skb_dequeue(&conn->rx_queue))) { - rxrpc_see_skb(skb, rxrpc_skb_seen); + rxrpc_see_skb(skb, rxrpc_skb_see_conn_work); ret = rxrpc_process_event(conn, skb, &abort_code); switch (ret) { case -EPROTO: @@ -449,7 +449,7 @@ static void rxrpc_do_process_connection(struct rxrpc_connection *conn) goto requeue_and_leave; case -ECONNABORTED: default: - rxrpc_free_skb(skb, rxrpc_skb_freed); + rxrpc_free_skb(skb, rxrpc_skb_put_conn_work); break; } } @@ -463,7 +463,7 @@ requeue_and_leave: protocol_error: if (rxrpc_abort_connection(conn, ret, abort_code) < 0) goto requeue_and_leave; - rxrpc_free_skb(skb, rxrpc_skb_freed); + rxrpc_free_skb(skb, rxrpc_skb_put_conn_work); return; } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 09b44cd11c9b..ab8b7a1be935 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -485,7 +485,7 @@ send_ack: rxrpc_propose_ack_input_data); err_free: - rxrpc_free_skb(skb, rxrpc_skb_freed); + rxrpc_free_skb(skb, rxrpc_skb_put_input); } /* @@ -513,7 +513,7 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb kdebug("couldn't clone"); return false; } - rxrpc_new_skb(jskb, rxrpc_skb_cloned_jumbo); + rxrpc_new_skb(jskb, rxrpc_skb_new_jumbo_subpacket); jsp = rxrpc_skb(jskb); jsp->offset = offset; jsp->len = RXRPC_JUMBO_DATALEN; @@ -553,7 +553,7 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) state = READ_ONCE(call->state); if (state >= RXRPC_CALL_COMPLETE) { - rxrpc_free_skb(skb, rxrpc_skb_freed); + rxrpc_free_skb(skb, rxrpc_skb_put_input); return; } @@ -563,14 +563,14 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) if (sp->hdr.securityIndex != 0) { struct sk_buff *nskb = skb_unshare(skb, GFP_ATOMIC); if (!nskb) { - rxrpc_eaten_skb(skb, rxrpc_skb_unshared_nomem); + rxrpc_eaten_skb(skb, rxrpc_skb_eaten_by_unshare_nomem); return; } if (nskb != skb) { - rxrpc_eaten_skb(skb, rxrpc_skb_received); + rxrpc_eaten_skb(skb, rxrpc_skb_eaten_by_unshare); skb = nskb; - rxrpc_new_skb(skb, rxrpc_skb_unshared); + rxrpc_new_skb(skb, rxrpc_skb_new_unshared); sp = rxrpc_skb(skb); } } @@ -609,7 +609,7 @@ out: rxrpc_notify_socket(call); spin_unlock(&call->input_lock); - rxrpc_free_skb(skb, rxrpc_skb_freed); + rxrpc_free_skb(skb, rxrpc_skb_put_input); _leave(" [queued]"); } @@ -994,8 +994,8 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) out: spin_unlock(&call->input_lock); out_not_locked: - rxrpc_free_skb(skb_put, rxrpc_skb_freed); - rxrpc_free_skb(skb_old, rxrpc_skb_freed); + rxrpc_free_skb(skb_put, rxrpc_skb_put_input); + rxrpc_free_skb(skb_old, rxrpc_skb_put_ack); } /* @@ -1075,7 +1075,7 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call, break; } - rxrpc_free_skb(skb, rxrpc_skb_freed); + rxrpc_free_skb(skb, rxrpc_skb_put_input); no_free: _leave(""); } @@ -1137,7 +1137,7 @@ static void rxrpc_post_packet_to_local(struct rxrpc_local *local, skb_queue_tail(&local->event_queue, skb); rxrpc_queue_local(local); } else { - rxrpc_free_skb(skb, rxrpc_skb_freed); + rxrpc_free_skb(skb, rxrpc_skb_put_input); } } @@ -1150,7 +1150,7 @@ static void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) skb_queue_tail(&local->reject_queue, skb); rxrpc_queue_local(local); } else { - rxrpc_free_skb(skb, rxrpc_skb_freed); + rxrpc_free_skb(skb, rxrpc_skb_put_input); } } @@ -1228,7 +1228,7 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) if (skb->tstamp == 0) skb->tstamp = ktime_get_real(); - rxrpc_new_skb(skb, rxrpc_skb_received); + rxrpc_new_skb(skb, rxrpc_skb_new_encap_rcv); skb_pull(skb, sizeof(struct udphdr)); @@ -1245,7 +1245,7 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) static int lose; if ((lose++ & 7) == 7) { trace_rxrpc_rx_lose(sp); - rxrpc_free_skb(skb, rxrpc_skb_lost); + rxrpc_free_skb(skb, rxrpc_skb_put_lose); return 0; } } @@ -1286,14 +1286,14 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) if (sp->hdr.securityIndex != 0) { struct sk_buff *nskb = skb_unshare(skb, GFP_ATOMIC); if (!nskb) { - rxrpc_eaten_skb(skb, rxrpc_skb_unshared_nomem); + rxrpc_eaten_skb(skb, rxrpc_skb_eaten_by_unshare_nomem); goto out; } if (nskb != skb) { - rxrpc_eaten_skb(skb, rxrpc_skb_received); + rxrpc_eaten_skb(skb, rxrpc_skb_eaten_by_unshare); skb = nskb; - rxrpc_new_skb(skb, rxrpc_skb_unshared); + rxrpc_new_skb(skb, rxrpc_skb_new_unshared); sp = rxrpc_skb(skb); } } @@ -1434,7 +1434,7 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) goto out; discard: - rxrpc_free_skb(skb, rxrpc_skb_freed); + rxrpc_free_skb(skb, rxrpc_skb_put_input); out: trace_rxrpc_rx_done(0, 0); return 0; diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index f23a3fbabbda..c344383a20b2 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -88,7 +88,7 @@ void rxrpc_process_local_events(struct rxrpc_local *local) if (skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - rxrpc_see_skb(skb, rxrpc_skb_seen); + rxrpc_see_skb(skb, rxrpc_skb_see_local_work); _debug("{%d},{%u}", local->debug_id, sp->hdr.type); switch (sp->hdr.type) { @@ -105,7 +105,7 @@ void rxrpc_process_local_events(struct rxrpc_local *local) break; } - rxrpc_free_skb(skb, rxrpc_skb_freed); + rxrpc_free_skb(skb, rxrpc_skb_put_input); } _leave(""); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index d324e88f7642..131c7a76fb06 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -615,7 +615,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local) memset(&whdr, 0, sizeof(whdr)); while ((skb = skb_dequeue(&local->reject_queue))) { - rxrpc_see_skb(skb, rxrpc_skb_seen); + rxrpc_see_skb(skb, rxrpc_skb_see_reject); sp = rxrpc_skb(skb); switch (skb->mark) { @@ -631,7 +631,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local) ioc = 2; break; default: - rxrpc_free_skb(skb, rxrpc_skb_freed); + rxrpc_free_skb(skb, rxrpc_skb_put_input); continue; } @@ -656,7 +656,7 @@ void rxrpc_reject_packets(struct rxrpc_local *local) rxrpc_tx_point_reject); } - rxrpc_free_skb(skb, rxrpc_skb_freed); + rxrpc_free_skb(skb, rxrpc_skb_put_input); } _leave(""); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index b28739d10927..f35cfc458dcf 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -158,12 +158,12 @@ void rxrpc_error_report(struct sock *sk) _leave("UDP socket errqueue empty"); return; } - rxrpc_new_skb(skb, rxrpc_skb_received); + rxrpc_new_skb(skb, rxrpc_skb_new_error_report); serr = SKB_EXT_ERR(skb); if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { _leave("UDP empty message"); rcu_read_unlock(); - rxrpc_free_skb(skb, rxrpc_skb_freed); + rxrpc_free_skb(skb, rxrpc_skb_put_error_report); return; } @@ -172,7 +172,7 @@ void rxrpc_error_report(struct sock *sk) peer = NULL; if (!peer) { rcu_read_unlock(); - rxrpc_free_skb(skb, rxrpc_skb_freed); + rxrpc_free_skb(skb, rxrpc_skb_put_error_report); _leave(" [no peer]"); return; } @@ -189,7 +189,7 @@ void rxrpc_error_report(struct sock *sk) rxrpc_store_error(peer, serr); out: rcu_read_unlock(); - rxrpc_free_skb(skb, rxrpc_skb_freed); + rxrpc_free_skb(skb, rxrpc_skb_put_error_report); rxrpc_put_peer(peer, rxrpc_peer_put_input_error); _leave(""); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index c84d2b620396..bfac9e09347e 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -229,7 +229,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) _enter("%d", call->debug_id); skb = skb_dequeue(&call->recvmsg_queue); - rxrpc_see_skb(skb, rxrpc_skb_rotated); + rxrpc_see_skb(skb, rxrpc_skb_see_rotate); sp = rxrpc_skb(skb); tseq = sp->hdr.seq; @@ -240,7 +240,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) if (after(tseq, call->rx_consumed)) smp_store_release(&call->rx_consumed, tseq); - rxrpc_free_skb(skb, rxrpc_skb_freed); + rxrpc_free_skb(skb, rxrpc_skb_put_rotate); trace_rxrpc_receive(call, last ? rxrpc_receive_rotate_last : rxrpc_receive_rotate, serial, call->rx_consumed); @@ -302,7 +302,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, */ skb = skb_peek(&call->recvmsg_queue); while (skb) { - rxrpc_see_skb(skb, rxrpc_skb_seen); + rxrpc_see_skb(skb, rxrpc_skb_see_recvmsg); sp = rxrpc_skb(skb); seq = sp->hdr.seq; diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c index 0c827d5bb2b8..ebe0c75e7b07 100644 --- a/net/rxrpc/skbuff.c +++ b/net/rxrpc/skbuff.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* ar-skbuff.c: socket buffer destruction handling +/* Socket buffer accounting * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) @@ -19,56 +19,50 @@ /* * Note the allocation or reception of a socket buffer. */ -void rxrpc_new_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) +void rxrpc_new_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { - const void *here = __builtin_return_address(0); int n = atomic_inc_return(select_skb_count(skb)); - trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here); + trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); } /* * Note the re-emergence of a socket buffer from a queue or buffer. */ -void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) +void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { - const void *here = __builtin_return_address(0); if (skb) { int n = atomic_read(select_skb_count(skb)); - trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here); + trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); } } /* * Note the addition of a ref on a socket buffer. */ -void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) +void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { - const void *here = __builtin_return_address(0); int n = atomic_inc_return(select_skb_count(skb)); - trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here); + trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); skb_get(skb); } /* * Note the dropping of a ref on a socket buffer by the core. */ -void rxrpc_eaten_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) +void rxrpc_eaten_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { - const void *here = __builtin_return_address(0); int n = atomic_inc_return(&rxrpc_n_rx_skbs); - trace_rxrpc_skb(skb, op, 0, n, here); + trace_rxrpc_skb(skb, 0, n, why); } /* * Note the destruction of a socket buffer. */ -void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) +void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace why) { - const void *here = __builtin_return_address(0); if (skb) { - int n; - n = atomic_dec_return(select_skb_count(skb)); - trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n, here); + int n = atomic_dec_return(select_skb_count(skb)); + trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why); kfree_skb(skb); } } @@ -78,12 +72,12 @@ void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace op) */ void rxrpc_purge_queue(struct sk_buff_head *list) { - const void *here = __builtin_return_address(0); struct sk_buff *skb; + while ((skb = skb_dequeue((list))) != NULL) { int n = atomic_dec_return(select_skb_count(skb)); - trace_rxrpc_skb(skb, rxrpc_skb_purged, - refcount_read(&skb->users), n, here); + trace_rxrpc_skb(skb, refcount_read(&skb->users), n, + rxrpc_skb_put_purge); kfree_skb(skb); } } -- cgit v1.2.3-70-g09d2 From 3feda9d69c83983b530cea6287ba4fea0e5c3b87 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 25 Nov 2022 09:00:55 +0000 Subject: rxrpc: Don't hold a ref for call timer or workqueue Currently, rxrpc gives the call timer a ref on the call when it starts it and this is passed along to the workqueue by the timer expiration function. The problem comes when queue_work() fails (ie. the work item is already queued): the timer routine must put the ref - but this may cause the cleanup code to run. This has the unfortunate effect that the cleanup code may then be run in softirq context - which means that any spinlocks it might need to touch have to be guarded to disable softirqs (ie. they need a "_bh" suffix). Fix this by: (1) Don't give a ref to the timer. (2) Making the expiration function not do anything if the refcount is 0. Note that this is more of an optimisation. (3) Make sure that the cleanup routine waits for timer to complete. However, this has a consequence that timer cannot give a ref to the work item. Therefore the following fixes are also necessary: (4) Don't give a ref to the work item. (5) Make the work item return asap if it sees the ref count is 0. (6) Make sure that the cleanup routine waits for the work item to complete. Unfortunately, neither the timer nor the work item can simply get around the problem by just using refcount_inc_not_zero() as the waits would still have to be done, and there would still be the possibility of having to put the ref in the expiration function. Note the call work item is going to go away with the work being transferred to the I/O thread, so the wait in (6) will become obsolete. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 6 +-- net/rxrpc/ar-internal.h | 6 +-- net/rxrpc/call_event.c | 11 ++--- net/rxrpc/call_object.c | 111 ++++++++++++++++--------------------------- net/rxrpc/txbuf.c | 2 + 5 files changed, 52 insertions(+), 84 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 5a2292baffc8..4538de0079a5 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -155,11 +155,9 @@ EM(rxrpc_call_get_release_sock, "GET rel-sock") \ EM(rxrpc_call_get_sendmsg, "GET sendmsg ") \ EM(rxrpc_call_get_send_ack, "GET send-ack") \ - EM(rxrpc_call_get_timer, "GET timer ") \ EM(rxrpc_call_get_userid, "GET user-id ") \ EM(rxrpc_call_new_client, "NEW client ") \ EM(rxrpc_call_new_prealloc_service, "NEW prealloc") \ - EM(rxrpc_call_put_already_queued, "PUT alreadyq") \ EM(rxrpc_call_put_discard_prealloc, "PUT disc-pre") \ EM(rxrpc_call_put_input, "PUT input ") \ EM(rxrpc_call_put_kernel, "PUT kernel ") \ @@ -168,11 +166,8 @@ EM(rxrpc_call_put_release_sock_tba, "PUT rls-sk-a") \ EM(rxrpc_call_put_send_ack, "PUT send-ack") \ EM(rxrpc_call_put_sendmsg, "PUT sendmsg ") \ - EM(rxrpc_call_put_timer, "PUT timer ") \ - EM(rxrpc_call_put_timer_already, "PUT timer-al") \ EM(rxrpc_call_put_unnotify, "PUT unnotify") \ EM(rxrpc_call_put_userid_exists, "PUT u-exists") \ - EM(rxrpc_call_put_work, "PUT work ") \ EM(rxrpc_call_queue_abort, "QUE abort ") \ EM(rxrpc_call_queue_requeue, "QUE requeue ") \ EM(rxrpc_call_queue_resend, "QUE resend ") \ @@ -368,6 +363,7 @@ EM(rxrpc_txbuf_put_rotated, "PUT ROTATED") \ EM(rxrpc_txbuf_put_send_aborted, "PUT SEND-X ") \ EM(rxrpc_txbuf_put_trans, "PUT TRANS ") \ + EM(rxrpc_txbuf_see_out_of_step, "OUT-OF-STEP") \ EM(rxrpc_txbuf_see_send_more, "SEE SEND+ ") \ E_(rxrpc_txbuf_see_unacked, "SEE UNACKED") diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index c588c0e81f63..03523a864c11 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -598,6 +598,7 @@ struct rxrpc_call { u32 next_req_timo; /* Timeout for next Rx request packet (jif) */ struct timer_list timer; /* Combined event timer */ struct work_struct processor; /* Event processor */ + struct work_struct destroyer; /* In-process-context destroyer */ rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */ struct list_head link; /* link in master call list */ struct list_head chan_wait_link; /* Link in conn->bundle->waiting_calls */ @@ -827,8 +828,6 @@ void rxrpc_reduce_call_timer(struct rxrpc_call *call, unsigned long now, enum rxrpc_timer_trace why); -void rxrpc_delete_call_timer(struct rxrpc_call *call); - /* * call_object.c */ @@ -847,8 +846,7 @@ void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *, struct sk_buff *); void rxrpc_release_call(struct rxrpc_sock *, struct rxrpc_call *); void rxrpc_release_calls_on_socket(struct rxrpc_sock *); -bool __rxrpc_queue_call(struct rxrpc_call *, enum rxrpc_call_trace); -bool rxrpc_queue_call(struct rxrpc_call *, enum rxrpc_call_trace); +void rxrpc_queue_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_see_call(struct rxrpc_call *, enum rxrpc_call_trace); bool rxrpc_try_get_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 29ca02e53c47..049b92b1c040 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -323,8 +323,8 @@ recheck_state: rxrpc_shrink_call_tx_buffer(call); if (call->state == RXRPC_CALL_COMPLETE) { - rxrpc_delete_call_timer(call); - goto out_put; + del_timer_sync(&call->timer); + goto out; } /* Work out if any timeouts tripped */ @@ -432,16 +432,15 @@ recheck_state: rxrpc_reduce_call_timer(call, next, now, rxrpc_timer_restart); /* other events may have been raised since we started checking */ - if (call->events && call->state < RXRPC_CALL_COMPLETE) + if (call->events) goto requeue; -out_put: - rxrpc_put_call(call, rxrpc_call_put_work); out: _leave(""); return; requeue: - __rxrpc_queue_call(call, rxrpc_call_queue_requeue); + if (call->state < RXRPC_CALL_COMPLETE) + rxrpc_queue_call(call, rxrpc_call_queue_requeue); goto out; } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 815209673115..9cd7e0190ef4 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -53,9 +53,7 @@ static void rxrpc_call_timer_expired(struct timer_list *t) if (call->state < RXRPC_CALL_COMPLETE) { trace_rxrpc_timer_expired(call, jiffies); - __rxrpc_queue_call(call, rxrpc_call_queue_timer); - } else { - rxrpc_put_call(call, rxrpc_call_put_already_queued); + rxrpc_queue_call(call, rxrpc_call_queue_timer); } } @@ -64,21 +62,14 @@ void rxrpc_reduce_call_timer(struct rxrpc_call *call, unsigned long now, enum rxrpc_timer_trace why) { - if (rxrpc_try_get_call(call, rxrpc_call_get_timer)) { - trace_rxrpc_timer(call, why, now); - if (timer_reduce(&call->timer, expire_at)) - rxrpc_put_call(call, rxrpc_call_put_timer_already); - } -} - -void rxrpc_delete_call_timer(struct rxrpc_call *call) -{ - if (del_timer_sync(&call->timer)) - rxrpc_put_call(call, rxrpc_call_put_timer); + trace_rxrpc_timer(call, why, now); + timer_reduce(&call->timer, expire_at); } static struct lock_class_key rxrpc_call_user_mutex_lock_class_key; +static void rxrpc_destroy_call(struct work_struct *); + /* * find an extant server call * - called in process context with IRQs enabled @@ -139,7 +130,8 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, &rxrpc_call_user_mutex_lock_class_key); timer_setup(&call->timer, rxrpc_call_timer_expired, 0); - INIT_WORK(&call->processor, &rxrpc_process_call); + INIT_WORK(&call->processor, rxrpc_process_call); + INIT_WORK(&call->destroyer, rxrpc_destroy_call); INIT_LIST_HEAD(&call->link); INIT_LIST_HEAD(&call->chan_wait_link); INIT_LIST_HEAD(&call->accept_link); @@ -423,34 +415,12 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, } /* - * Queue a call's work processor, getting a ref to pass to the work queue. + * Queue a call's work processor. */ -bool rxrpc_queue_call(struct rxrpc_call *call, enum rxrpc_call_trace why) +void rxrpc_queue_call(struct rxrpc_call *call, enum rxrpc_call_trace why) { - int n; - - if (!__refcount_inc_not_zero(&call->ref, &n)) - return false; if (rxrpc_queue_work(&call->processor)) - trace_rxrpc_call(call->debug_id, n + 1, 0, why); - else - rxrpc_put_call(call, rxrpc_call_put_already_queued); - return true; -} - -/* - * Queue a call's work processor, passing the callers ref to the work queue. - */ -bool __rxrpc_queue_call(struct rxrpc_call *call, enum rxrpc_call_trace why) -{ - int n = refcount_read(&call->ref); - - ASSERTCMP(n, >=, 1); - if (rxrpc_queue_work(&call->processor)) - trace_rxrpc_call(call->debug_id, n, 0, why); - else - rxrpc_put_call(call, rxrpc_call_put_already_queued); - return true; + trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), 0, why); } /* @@ -514,7 +484,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) BUG(); rxrpc_put_call_slot(call); - rxrpc_delete_call_timer(call); + del_timer_sync(&call->timer); /* Make sure we don't get any more notifications */ write_lock_bh(&rx->recvmsg_lock); @@ -612,36 +582,41 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace why) } /* - * Final call destruction - but must be done in process context. + * Free up the call under RCU. */ -static void rxrpc_destroy_call(struct work_struct *work) +static void rxrpc_rcu_free_call(struct rcu_head *rcu) { - struct rxrpc_call *call = container_of(work, struct rxrpc_call, processor); - struct rxrpc_net *rxnet = call->rxnet; - - rxrpc_delete_call_timer(call); + struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu); + struct rxrpc_net *rxnet = READ_ONCE(call->rxnet); - rxrpc_put_connection(call->conn, rxrpc_conn_put_call); - rxrpc_put_peer(call->peer, rxrpc_peer_put_call); kmem_cache_free(rxrpc_call_jar, call); if (atomic_dec_and_test(&rxnet->nr_calls)) wake_up_var(&rxnet->nr_calls); } /* - * Final call destruction under RCU. + * Final call destruction - but must be done in process context. */ -static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) +static void rxrpc_destroy_call(struct work_struct *work) { - struct rxrpc_call *call = container_of(rcu, struct rxrpc_call, rcu); + struct rxrpc_call *call = container_of(work, struct rxrpc_call, destroyer); + struct rxrpc_txbuf *txb; - if (in_softirq()) { - INIT_WORK(&call->processor, rxrpc_destroy_call); - if (!rxrpc_queue_work(&call->processor)) - BUG(); - } else { - rxrpc_destroy_call(&call->processor); + del_timer_sync(&call->timer); + cancel_work_sync(&call->processor); /* The processor may restart the timer */ + del_timer_sync(&call->timer); + + rxrpc_cleanup_ring(call); + while ((txb = list_first_entry_or_null(&call->tx_buffer, + struct rxrpc_txbuf, call_link))) { + list_del(&txb->call_link); + rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned); } + rxrpc_put_txbuf(call->tx_pending, rxrpc_txbuf_put_cleaned); + rxrpc_free_skb(call->acks_soft_tbl, rxrpc_skb_put_ack); + rxrpc_put_connection(call->conn, rxrpc_conn_put_call); + rxrpc_put_peer(call->peer, rxrpc_peer_put_call); + call_rcu(&call->rcu, rxrpc_rcu_free_call); } /* @@ -649,23 +624,21 @@ static void rxrpc_rcu_destroy_call(struct rcu_head *rcu) */ void rxrpc_cleanup_call(struct rxrpc_call *call) { - struct rxrpc_txbuf *txb; - memset(&call->sock_node, 0xcd, sizeof(call->sock_node)); ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags)); - rxrpc_cleanup_ring(call); - while ((txb = list_first_entry_or_null(&call->tx_buffer, - struct rxrpc_txbuf, call_link))) { - list_del(&txb->call_link); - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned); - } - rxrpc_put_txbuf(call->tx_pending, rxrpc_txbuf_put_cleaned); - rxrpc_free_skb(call->acks_soft_tbl, rxrpc_skb_put_ack); + del_timer_sync(&call->timer); + cancel_work(&call->processor); - call_rcu(&call->rcu, rxrpc_rcu_destroy_call); + if (in_softirq() || work_busy(&call->processor)) + /* Can't use the rxrpc workqueue as we need to cancel/flush + * something that may be running/waiting there. + */ + schedule_work(&call->destroyer); + else + rxrpc_destroy_call(&call->destroyer); } /* diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index 96bfee89927b..f93dc666a3a0 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -120,6 +120,8 @@ void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *call) if (before(hard_ack, txb->seq)) break; + if (txb->seq != call->tx_bottom + 1) + rxrpc_see_txbuf(txb, rxrpc_txbuf_see_out_of_step); ASSERTCMP(txb->seq, ==, call->tx_bottom + 1); call->tx_bottom++; list_del_rcu(&txb->call_link); -- cgit v1.2.3-70-g09d2 From 3cec055c56958c5498eeb3ed9fb2aef2d28c030f Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 25 Nov 2022 12:43:50 +0000 Subject: rxrpc: Don't hold a ref for connection workqueue Currently, rxrpc gives the connection's work item a ref on the connection when it queues it - and this is called from the timer expiration function. The problem comes when queue_work() fails (ie. the work item is already queued): the timer routine must put the ref - but this may cause the cleanup code to run. This has the unfortunate effect that the cleanup code may then be run in softirq context - which means that any spinlocks it might need to touch have to be guarded to disable softirqs (ie. they need a "_bh" suffix). (1) Don't give a ref to the work item. (2) Simplify handling of service connections by adding a separate active count so that the refcount isn't also used for this. (3) Connection destruction for both client and service connections can then be cleaned up by putting rxrpc_put_connection() out of line and making a tidy progression through the destruction code (offloaded to a workqueue if put from softirq or processor function context). The RCU part of the cleanup then only deals with the freeing at the end. (4) Make rxrpc_queue_conn() return immediately if it sees the active count is -1 rather then queuing the connection. (5) Make sure that the cleanup routine waits for the work item to complete. (6) Stash the rxrpc_net pointer in the conn struct so that the rcu free routine can use it, even if the local endpoint has been freed. Unfortunately, neither the timer nor the work item can simply get around the problem by just using refcount_inc_not_zero() as the waits would still have to be done, and there would still be the possibility of having to put the ref in the expiration function. Note the connection work item is mostly going to go away with the main event work being transferred to the I/O thread, so the wait in (6) will become obsolete. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 11 ++- net/rxrpc/ar-internal.h | 25 ++----- net/rxrpc/call_accept.c | 1 + net/rxrpc/conn_client.c | 31 ++------ net/rxrpc/conn_event.c | 4 - net/rxrpc/conn_object.c | 169 ++++++++++++++++++++++++------------------- net/rxrpc/conn_service.c | 4 +- net/rxrpc/net_ns.c | 2 +- net/rxrpc/proc.c | 5 +- 9 files changed, 123 insertions(+), 129 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 4538de0079a5..44a9be9836f9 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -112,7 +112,6 @@ EM(rxrpc_conn_get_service_conn, "GET svc-conn") \ EM(rxrpc_conn_new_client, "NEW client ") \ EM(rxrpc_conn_new_service, "NEW service ") \ - EM(rxrpc_conn_put_already_queued, "PUT alreadyq") \ EM(rxrpc_conn_put_call, "PUT call ") \ EM(rxrpc_conn_put_call_input, "PUT inp-call") \ EM(rxrpc_conn_put_conn_input, "PUT inp-conn") \ @@ -121,13 +120,13 @@ EM(rxrpc_conn_put_local_dead, "PUT loc-dead") \ EM(rxrpc_conn_put_noreuse, "PUT noreuse ") \ EM(rxrpc_conn_put_poke, "PUT poke ") \ + EM(rxrpc_conn_put_service_reaped, "PUT svc-reap") \ EM(rxrpc_conn_put_unbundle, "PUT unbundle") \ EM(rxrpc_conn_put_unidle, "PUT unidle ") \ - EM(rxrpc_conn_put_work, "PUT work ") \ - EM(rxrpc_conn_queue_challenge, "GQ chall ") \ - EM(rxrpc_conn_queue_retry_work, "GQ retry-wk") \ - EM(rxrpc_conn_queue_rx_work, "GQ rx-work ") \ - EM(rxrpc_conn_queue_timer, "GQ timer ") \ + EM(rxrpc_conn_queue_challenge, "QUE chall ") \ + EM(rxrpc_conn_queue_retry_work, "QUE retry-wk") \ + EM(rxrpc_conn_queue_rx_work, "QUE rx-work ") \ + EM(rxrpc_conn_queue_timer, "QUE timer ") \ EM(rxrpc_conn_see_new_service_conn, "SEE new-svc ") \ EM(rxrpc_conn_see_reap_service, "SEE reap-svc") \ E_(rxrpc_conn_see_work, "SEE work ") diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 03523a864c11..41a57c145f2b 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -76,7 +76,7 @@ struct rxrpc_net { bool kill_all_client_conns; atomic_t nr_client_conns; spinlock_t client_conn_cache_lock; /* Lock for ->*_client_conns */ - spinlock_t client_conn_discard_lock; /* Prevent multiple discarders */ + struct mutex client_conn_discard_lock; /* Prevent multiple discarders */ struct list_head idle_client_conns; struct work_struct client_conn_reaper; struct timer_list client_conn_reap_timer; @@ -432,9 +432,11 @@ struct rxrpc_connection { struct rxrpc_conn_proto proto; struct rxrpc_local *local; /* Representation of local endpoint */ struct rxrpc_peer *peer; /* Remote endpoint */ + struct rxrpc_net *rxnet; /* Network namespace to which call belongs */ struct key *key; /* Security details */ refcount_t ref; + atomic_t active; /* Active count for service conns */ struct rcu_head rcu; struct list_head cache_link; @@ -455,6 +457,7 @@ struct rxrpc_connection { struct timer_list timer; /* Conn event timer */ struct work_struct processor; /* connection event processor */ + struct work_struct destructor; /* In-process-context destroyer */ struct rxrpc_bundle *bundle; /* Client connection bundle */ struct rb_node service_node; /* Node in peer->service_conns */ struct list_head proc_link; /* link in procfs list */ @@ -897,20 +900,20 @@ void rxrpc_process_delayed_final_acks(struct rxrpc_connection *, bool); extern unsigned int rxrpc_connection_expiry; extern unsigned int rxrpc_closed_conn_expiry; -struct rxrpc_connection *rxrpc_alloc_connection(gfp_t); +struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *, gfp_t); struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *, struct sk_buff *, struct rxrpc_peer **); void __rxrpc_disconnect_call(struct rxrpc_connection *, struct rxrpc_call *); void rxrpc_disconnect_call(struct rxrpc_call *); -void rxrpc_kill_connection(struct rxrpc_connection *); -bool rxrpc_queue_conn(struct rxrpc_connection *, enum rxrpc_conn_trace); +void rxrpc_kill_client_conn(struct rxrpc_connection *); +void rxrpc_queue_conn(struct rxrpc_connection *, enum rxrpc_conn_trace); void rxrpc_see_connection(struct rxrpc_connection *, enum rxrpc_conn_trace); struct rxrpc_connection *rxrpc_get_connection(struct rxrpc_connection *, enum rxrpc_conn_trace); struct rxrpc_connection *rxrpc_get_connection_maybe(struct rxrpc_connection *, enum rxrpc_conn_trace); -void rxrpc_put_service_conn(struct rxrpc_connection *, enum rxrpc_conn_trace); +void rxrpc_put_connection(struct rxrpc_connection *, enum rxrpc_conn_trace); void rxrpc_service_connection_reaper(struct work_struct *); void rxrpc_destroy_all_connections(struct rxrpc_net *); @@ -924,18 +927,6 @@ static inline bool rxrpc_conn_is_service(const struct rxrpc_connection *conn) return !rxrpc_conn_is_client(conn); } -static inline void rxrpc_put_connection(struct rxrpc_connection *conn, - enum rxrpc_conn_trace why) -{ - if (!conn) - return; - - if (rxrpc_conn_is_client(conn)) - rxrpc_put_client_conn(conn, why); - else - rxrpc_put_service_conn(conn, why); -} - static inline void rxrpc_reduce_conn_timer(struct rxrpc_connection *conn, unsigned long expire_at) { diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index dd4ca4bee77f..8d106b626aa3 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -308,6 +308,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, rxrpc_new_incoming_connection(rx, conn, sec, skb); } else { rxrpc_get_connection(conn, rxrpc_conn_get_service_conn); + atomic_inc(&conn->active); } /* And now we can allocate and set up a new call */ diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 34ff6fa85c32..9485a3d18f29 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -51,7 +51,7 @@ static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle); static int rxrpc_get_client_connection_id(struct rxrpc_connection *conn, gfp_t gfp) { - struct rxrpc_net *rxnet = conn->local->rxnet; + struct rxrpc_net *rxnet = conn->rxnet; int id; _enter(""); @@ -179,7 +179,7 @@ rxrpc_alloc_client_connection(struct rxrpc_bundle *bundle, gfp_t gfp) _enter(""); - conn = rxrpc_alloc_connection(gfp); + conn = rxrpc_alloc_connection(rxnet, gfp); if (!conn) { _leave(" = -ENOMEM"); return ERR_PTR(-ENOMEM); @@ -243,7 +243,7 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn) if (!conn) goto dont_reuse; - rxnet = conn->local->rxnet; + rxnet = conn->rxnet; if (test_bit(RXRPC_CONN_DONT_REUSE, &conn->flags)) goto dont_reuse; @@ -970,7 +970,7 @@ static void rxrpc_deactivate_bundle(struct rxrpc_bundle *bundle) /* * Clean up a dead client connection. */ -static void rxrpc_kill_client_conn(struct rxrpc_connection *conn) +void rxrpc_kill_client_conn(struct rxrpc_connection *conn) { struct rxrpc_local *local = conn->local; struct rxrpc_net *rxnet = local->rxnet; @@ -981,23 +981,6 @@ static void rxrpc_kill_client_conn(struct rxrpc_connection *conn) atomic_dec(&rxnet->nr_client_conns); rxrpc_put_client_connection_id(conn); - rxrpc_kill_connection(conn); -} - -/* - * Clean up a dead client connections. - */ -void rxrpc_put_client_conn(struct rxrpc_connection *conn, - enum rxrpc_conn_trace why) -{ - unsigned int debug_id = conn->debug_id; - bool dead; - int r; - - dead = __refcount_dec_and_test(&conn->ref, &r); - trace_rxrpc_conn(debug_id, r - 1, why); - if (dead) - rxrpc_kill_client_conn(conn); } /* @@ -1023,7 +1006,7 @@ void rxrpc_discard_expired_client_conns(struct work_struct *work) } /* Don't double up on the discarding */ - if (!spin_trylock(&rxnet->client_conn_discard_lock)) { + if (!mutex_trylock(&rxnet->client_conn_discard_lock)) { _leave(" [already]"); return; } @@ -1061,6 +1044,7 @@ next: goto not_yet_expired; } + atomic_dec(&conn->active); trace_rxrpc_client(conn, -1, rxrpc_client_discard); list_del_init(&conn->cache_link); @@ -1087,7 +1071,7 @@ not_yet_expired: out: spin_unlock(&rxnet->client_conn_cache_lock); - spin_unlock(&rxnet->client_conn_discard_lock); + mutex_unlock(&rxnet->client_conn_discard_lock); _leave(""); } @@ -1127,6 +1111,7 @@ void rxrpc_clean_up_local_conns(struct rxrpc_local *local) list_for_each_entry_safe(conn, tmp, &rxnet->idle_client_conns, cache_link) { if (conn->local == local) { + atomic_dec(&conn->active); trace_rxrpc_client(conn, -1, rxrpc_client_discard); list_move(&conn->cache_link, &graveyard); } diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 49d885f73fa5..23a74e35052d 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -478,8 +478,4 @@ void rxrpc_process_connection(struct work_struct *work) rxrpc_do_process_connection(conn); rxrpc_unuse_local(conn->local, rxrpc_local_unuse_conn_work); } - - rxrpc_put_connection(conn, rxrpc_conn_put_work); - _leave(""); - return; } diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index f7c271a740ed..c2e05ea29f12 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -19,7 +19,9 @@ unsigned int __read_mostly rxrpc_connection_expiry = 10 * 60; unsigned int __read_mostly rxrpc_closed_conn_expiry = 10; -static void rxrpc_destroy_connection(struct rcu_head *); +static void rxrpc_clean_up_connection(struct work_struct *work); +static void rxrpc_set_service_reap_timer(struct rxrpc_net *rxnet, + unsigned long reap_at); static void rxrpc_connection_timer(struct timer_list *timer) { @@ -32,7 +34,8 @@ static void rxrpc_connection_timer(struct timer_list *timer) /* * allocate a new connection */ -struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) +struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *rxnet, + gfp_t gfp) { struct rxrpc_connection *conn; @@ -42,10 +45,12 @@ struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp) if (conn) { INIT_LIST_HEAD(&conn->cache_link); timer_setup(&conn->timer, &rxrpc_connection_timer, 0); - INIT_WORK(&conn->processor, &rxrpc_process_connection); + INIT_WORK(&conn->processor, rxrpc_process_connection); + INIT_WORK(&conn->destructor, rxrpc_clean_up_connection); INIT_LIST_HEAD(&conn->proc_link); INIT_LIST_HEAD(&conn->link); skb_queue_head_init(&conn->rx_queue); + conn->rxnet = rxnet; conn->security = &rxrpc_no_security; spin_lock_init(&conn->state_lock); conn->debug_id = atomic_inc_return(&rxrpc_debug_id); @@ -224,53 +229,20 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) set_bit(RXRPC_CALL_DISCONNECTED, &call->flags); conn->idle_timestamp = jiffies; -} - -/* - * Kill off a connection. - */ -void rxrpc_kill_connection(struct rxrpc_connection *conn) -{ - struct rxrpc_net *rxnet = conn->local->rxnet; - - ASSERT(!rcu_access_pointer(conn->channels[0].call) && - !rcu_access_pointer(conn->channels[1].call) && - !rcu_access_pointer(conn->channels[2].call) && - !rcu_access_pointer(conn->channels[3].call)); - ASSERT(list_empty(&conn->cache_link)); - - write_lock(&rxnet->conn_lock); - list_del_init(&conn->proc_link); - write_unlock(&rxnet->conn_lock); - - /* Drain the Rx queue. Note that even though we've unpublished, an - * incoming packet could still be being added to our Rx queue, so we - * will need to drain it again in the RCU cleanup handler. - */ - rxrpc_purge_queue(&conn->rx_queue); - - /* Leave final destruction to RCU. The connection processor work item - * must carry a ref on the connection to prevent us getting here whilst - * it is queued or running. - */ - call_rcu(&conn->rcu, rxrpc_destroy_connection); + if (atomic_dec_and_test(&conn->active)) + rxrpc_set_service_reap_timer(conn->rxnet, + jiffies + rxrpc_connection_expiry); } /* * Queue a connection's work processor, getting a ref to pass to the work * queue. */ -bool rxrpc_queue_conn(struct rxrpc_connection *conn, enum rxrpc_conn_trace why) +void rxrpc_queue_conn(struct rxrpc_connection *conn, enum rxrpc_conn_trace why) { - int r; - - if (!__refcount_inc_not_zero(&conn->ref, &r)) - return false; - if (rxrpc_queue_work(&conn->processor)) - trace_rxrpc_conn(conn->debug_id, why, r + 1); - else - rxrpc_put_connection(conn, rxrpc_conn_put_already_queued); - return true; + if (atomic_read(&conn->active) >= 0 && + rxrpc_queue_work(&conn->processor)) + rxrpc_see_connection(conn, why); } /* @@ -327,51 +299,96 @@ static void rxrpc_set_service_reap_timer(struct rxrpc_net *rxnet, timer_reduce(&rxnet->service_conn_reap_timer, reap_at); } -/* - * Release a service connection - */ -void rxrpc_put_service_conn(struct rxrpc_connection *conn, - enum rxrpc_conn_trace why) -{ - unsigned int debug_id = conn->debug_id; - int r; - - __refcount_dec(&conn->ref, &r); - trace_rxrpc_conn(debug_id, r - 1, why); - if (r - 1 == 1) - rxrpc_set_service_reap_timer(conn->local->rxnet, - jiffies + rxrpc_connection_expiry); -} - /* * destroy a virtual connection */ -static void rxrpc_destroy_connection(struct rcu_head *rcu) +static void rxrpc_rcu_free_connection(struct rcu_head *rcu) { struct rxrpc_connection *conn = container_of(rcu, struct rxrpc_connection, rcu); + struct rxrpc_net *rxnet = conn->rxnet; _enter("{%d,u=%d}", conn->debug_id, refcount_read(&conn->ref)); trace_rxrpc_conn(conn->debug_id, refcount_read(&conn->ref), rxrpc_conn_free); + kfree(conn); - ASSERTCMP(refcount_read(&conn->ref), ==, 0); + if (atomic_dec_and_test(&rxnet->nr_conns)) + wake_up_var(&rxnet->nr_conns); +} + +/* + * Clean up a dead connection. + */ +static void rxrpc_clean_up_connection(struct work_struct *work) +{ + struct rxrpc_connection *conn = + container_of(work, struct rxrpc_connection, destructor); + struct rxrpc_net *rxnet = conn->rxnet; + + ASSERT(!rcu_access_pointer(conn->channels[0].call) && + !rcu_access_pointer(conn->channels[1].call) && + !rcu_access_pointer(conn->channels[2].call) && + !rcu_access_pointer(conn->channels[3].call)); + ASSERT(list_empty(&conn->cache_link)); del_timer_sync(&conn->timer); + cancel_work_sync(&conn->processor); /* Processing may restart the timer */ + del_timer_sync(&conn->timer); + + write_lock(&rxnet->conn_lock); + list_del_init(&conn->proc_link); + write_unlock(&rxnet->conn_lock); + rxrpc_purge_queue(&conn->rx_queue); + rxrpc_kill_client_conn(conn); + conn->security->clear(conn); key_put(conn->key); rxrpc_put_bundle(conn->bundle, rxrpc_bundle_put_conn); rxrpc_put_peer(conn->peer, rxrpc_peer_put_conn); - - if (atomic_dec_and_test(&conn->local->rxnet->nr_conns)) - wake_up_var(&conn->local->rxnet->nr_conns); rxrpc_put_local(conn->local, rxrpc_local_put_kill_conn); - kfree(conn); - _leave(""); + /* Drain the Rx queue. Note that even though we've unpublished, an + * incoming packet could still be being added to our Rx queue, so we + * will need to drain it again in the RCU cleanup handler. + */ + rxrpc_purge_queue(&conn->rx_queue); + + call_rcu(&conn->rcu, rxrpc_rcu_free_connection); +} + +/* + * Drop a ref on a connection. + */ +void rxrpc_put_connection(struct rxrpc_connection *conn, + enum rxrpc_conn_trace why) +{ + unsigned int debug_id; + bool dead; + int r; + + if (!conn) + return; + + debug_id = conn->debug_id; + dead = __refcount_dec_and_test(&conn->ref, &r); + trace_rxrpc_conn(debug_id, r - 1, why); + if (dead) { + del_timer(&conn->timer); + cancel_work(&conn->processor); + + if (in_softirq() || work_busy(&conn->processor) || + timer_pending(&conn->timer)) + /* Can't use the rxrpc workqueue as we need to cancel/flush + * something that may be running/waiting there. + */ + schedule_work(&conn->destructor); + else + rxrpc_clean_up_connection(&conn->destructor); + } } /* @@ -383,6 +400,7 @@ void rxrpc_service_connection_reaper(struct work_struct *work) struct rxrpc_net *rxnet = container_of(work, struct rxrpc_net, service_conn_reaper); unsigned long expire_at, earliest, idle_timestamp, now; + int active; LIST_HEAD(graveyard); @@ -393,8 +411,8 @@ void rxrpc_service_connection_reaper(struct work_struct *work) write_lock(&rxnet->conn_lock); list_for_each_entry_safe(conn, _p, &rxnet->service_conns, link) { - ASSERTCMP(refcount_read(&conn->ref), >, 0); - if (likely(refcount_read(&conn->ref) > 1)) + ASSERTCMP(atomic_read(&conn->active), >=, 0); + if (likely(atomic_read(&conn->active) > 0)) continue; if (conn->state == RXRPC_CONN_SERVICE_PREALLOC) continue; @@ -405,8 +423,8 @@ void rxrpc_service_connection_reaper(struct work_struct *work) if (conn->local->service_closed) expire_at = idle_timestamp + rxrpc_closed_conn_expiry * HZ; - _debug("reap CONN %d { u=%d,t=%ld }", - conn->debug_id, refcount_read(&conn->ref), + _debug("reap CONN %d { a=%d,t=%ld }", + conn->debug_id, atomic_read(&conn->active), (long)expire_at - (long)now); if (time_before(now, expire_at)) { @@ -416,10 +434,11 @@ void rxrpc_service_connection_reaper(struct work_struct *work) } } - /* The usage count sits at 1 whilst the object is unused on the - * list; we reduce that to 0 to make the object unavailable. + /* The activity count sits at 0 whilst the conn is unused on + * the list; we reduce that to -1 to make the conn unavailable. */ - if (!refcount_dec_if_one(&conn->ref)) + active = 0; + if (!atomic_try_cmpxchg(&conn->active, &active, -1)) continue; rxrpc_see_connection(conn, rxrpc_conn_see_reap_service); @@ -443,8 +462,8 @@ void rxrpc_service_connection_reaper(struct work_struct *work) link); list_del_init(&conn->link); - ASSERTCMP(refcount_read(&conn->ref), ==, 0); - rxrpc_kill_connection(conn); + ASSERTCMP(atomic_read(&conn->active), ==, -1); + rxrpc_put_connection(conn, rxrpc_conn_put_service_reaped); } _leave(""); diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index 2c44d67b43dc..b5ae7c753fc3 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -125,7 +125,7 @@ replace_old_connection: struct rxrpc_connection *rxrpc_prealloc_service_connection(struct rxrpc_net *rxnet, gfp_t gfp) { - struct rxrpc_connection *conn = rxrpc_alloc_connection(gfp); + struct rxrpc_connection *conn = rxrpc_alloc_connection(rxnet, gfp); if (conn) { /* We maintain an extra ref on the connection whilst it is on @@ -181,6 +181,8 @@ void rxrpc_new_incoming_connection(struct rxrpc_sock *rx, conn->service_id == rx->service_upgrade.from) conn->service_id = rx->service_upgrade.to; + atomic_set(&conn->active, 1); + /* Make the connection a target for incoming packets. */ rxrpc_publish_service_conn(conn->peer, conn); } diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index 84242c0e467c..5905530e2f33 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -65,7 +65,7 @@ static __net_init int rxrpc_init_net(struct net *net) atomic_set(&rxnet->nr_client_conns, 0); rxnet->kill_all_client_conns = false; spin_lock_init(&rxnet->client_conn_cache_lock); - spin_lock_init(&rxnet->client_conn_discard_lock); + mutex_init(&rxnet->client_conn_discard_lock); INIT_LIST_HEAD(&rxnet->idle_client_conns); INIT_WORK(&rxnet->client_conn_reaper, rxrpc_discard_expired_client_conns); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index bb2edf6db896..d3a6d24cf871 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -159,7 +159,7 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "Proto Local " " Remote " - " SvID ConnID End Use State Key " + " SvID ConnID End Ref Act State Key " " Serial ISerial CallId0 CallId1 CallId2 CallId3\n" ); return 0; @@ -177,7 +177,7 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v) sprintf(rbuff, "%pISpc", &conn->peer->srx.transport); print: seq_printf(seq, - "UDP %-47.47s %-47.47s %4x %08x %s %3u" + "UDP %-47.47s %-47.47s %4x %08x %s %3u %3d" " %s %08x %08x %08x %08x %08x %08x %08x\n", lbuff, rbuff, @@ -185,6 +185,7 @@ print: conn->proto.cid, rxrpc_conn_is_service(conn) ? "Svc" : "Clt", refcount_read(&conn->ref), + atomic_read(&conn->active), rxrpc_conn_states[conn->state], key_serial(conn->key), atomic_read(&conn->serial), -- cgit v1.2.3-70-g09d2 From 96b2d69b43a075a38df600597133f17d28525f24 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 23 Jan 2020 13:01:33 +0000 Subject: rxrpc: Split the receive code Split the code that handles packet reception in softirq mode as a prelude to moving all the packet processing beyond routing to the appropriate call and setting up of a new call out into process context. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/Makefile | 1 + net/rxrpc/ar-internal.h | 7 + net/rxrpc/input.c | 372 +----------------------------------------------- net/rxrpc/io_thread.c | 370 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 384 insertions(+), 366 deletions(-) create mode 100644 net/rxrpc/io_thread.c (limited to 'net') diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index 79687477d93c..e76d3459d78e 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -16,6 +16,7 @@ rxrpc-y := \ conn_service.o \ input.o \ insecure.o \ + io_thread.o \ key.o \ local_event.o \ local_object.o \ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 41a57c145f2b..523cc9c5ab12 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -946,6 +946,13 @@ void rxrpc_unpublish_service_conn(struct rxrpc_connection *); /* * input.c */ +void rxrpc_input_call_packet(struct rxrpc_call *, struct sk_buff *); +void rxrpc_input_implicit_end_call(struct rxrpc_sock *, struct rxrpc_connection *, + struct rxrpc_call *); + +/* + * io_thread.c + */ int rxrpc_input_packet(struct sock *, struct sk_buff *); /* diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index ab8b7a1be935..f4f6f3c62d03 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* RxRPC packet reception +/* Processing of received RxRPC packets * - * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ @@ -1029,7 +1029,7 @@ static void rxrpc_input_abort(struct rxrpc_call *call, struct sk_buff *skb) /* * Process an incoming call packet. */ -static void rxrpc_input_call_packet(struct rxrpc_call *call, +void rxrpc_input_call_packet(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); @@ -1086,9 +1086,9 @@ no_free: * * TODO: If callNumber > call_id + 1, renegotiate security. */ -static void rxrpc_input_implicit_end_call(struct rxrpc_sock *rx, - struct rxrpc_connection *conn, - struct rxrpc_call *call) +void rxrpc_input_implicit_end_call(struct rxrpc_sock *rx, + struct rxrpc_connection *conn, + struct rxrpc_call *call) { switch (READ_ONCE(call->state)) { case RXRPC_CALL_SERVER_AWAIT_ACK: @@ -1109,363 +1109,3 @@ static void rxrpc_input_implicit_end_call(struct rxrpc_sock *rx, __rxrpc_disconnect_call(conn, call); spin_unlock(&rx->incoming_lock); } - -/* - * post connection-level events to the connection - * - this includes challenges, responses, some aborts and call terminal packet - * retransmission. - */ -static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, - struct sk_buff *skb) -{ - _enter("%p,%p", conn, skb); - - skb_queue_tail(&conn->rx_queue, skb); - rxrpc_queue_conn(conn, rxrpc_conn_queue_rx_work); -} - -/* - * post endpoint-level events to the local endpoint - * - this includes debug and version messages - */ -static void rxrpc_post_packet_to_local(struct rxrpc_local *local, - struct sk_buff *skb) -{ - _enter("%p,%p", local, skb); - - if (rxrpc_get_local_maybe(local, rxrpc_local_get_queue)) { - skb_queue_tail(&local->event_queue, skb); - rxrpc_queue_local(local); - } else { - rxrpc_free_skb(skb, rxrpc_skb_put_input); - } -} - -/* - * put a packet up for transport-level abort - */ -static void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) -{ - if (rxrpc_get_local_maybe(local, rxrpc_local_get_queue)) { - skb_queue_tail(&local->reject_queue, skb); - rxrpc_queue_local(local); - } else { - rxrpc_free_skb(skb, rxrpc_skb_put_input); - } -} - -/* - * Extract the wire header from a packet and translate the byte order. - */ -static noinline -int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) -{ - struct rxrpc_wire_header whdr; - - /* dig out the RxRPC connection details */ - if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0) { - trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, - tracepoint_string("bad_hdr")); - return -EBADMSG; - } - - memset(sp, 0, sizeof(*sp)); - sp->hdr.epoch = ntohl(whdr.epoch); - sp->hdr.cid = ntohl(whdr.cid); - sp->hdr.callNumber = ntohl(whdr.callNumber); - sp->hdr.seq = ntohl(whdr.seq); - sp->hdr.serial = ntohl(whdr.serial); - sp->hdr.flags = whdr.flags; - sp->hdr.type = whdr.type; - sp->hdr.userStatus = whdr.userStatus; - sp->hdr.securityIndex = whdr.securityIndex; - sp->hdr._rsvd = ntohs(whdr._rsvd); - sp->hdr.serviceId = ntohs(whdr.serviceId); - return 0; -} - -/* - * Extract the abort code from an ABORT packet and stash it in skb->priority. - */ -static bool rxrpc_extract_abort(struct sk_buff *skb) -{ - __be32 wtmp; - - if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), - &wtmp, sizeof(wtmp)) < 0) - return false; - skb->priority = ntohl(wtmp); - return true; -} - -/* - * handle data received on the local endpoint - * - may be called in interrupt context - * - * [!] Note that as this is called from the encap_rcv hook, the socket is not - * held locked by the caller and nothing prevents sk_user_data on the UDP from - * being cleared in the middle of processing this function. - * - * Called with the RCU read lock held from the IP layer via UDP. - */ -int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) -{ - struct rxrpc_local *local = rcu_dereference_sk_user_data(udp_sk); - struct rxrpc_connection *conn; - struct rxrpc_channel *chan; - struct rxrpc_call *call = NULL; - struct rxrpc_skb_priv *sp; - struct rxrpc_peer *peer = NULL; - struct rxrpc_sock *rx = NULL; - unsigned int channel; - - _enter("%p", udp_sk); - - if (unlikely(!local)) { - kfree_skb(skb); - return 0; - } - if (skb->tstamp == 0) - skb->tstamp = ktime_get_real(); - - rxrpc_new_skb(skb, rxrpc_skb_new_encap_rcv); - - skb_pull(skb, sizeof(struct udphdr)); - - /* The UDP protocol already released all skb resources; - * we are free to add our own data there. - */ - sp = rxrpc_skb(skb); - - /* dig out the RxRPC connection details */ - if (rxrpc_extract_header(sp, skb) < 0) - goto bad_message; - - if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { - static int lose; - if ((lose++ & 7) == 7) { - trace_rxrpc_rx_lose(sp); - rxrpc_free_skb(skb, rxrpc_skb_put_lose); - return 0; - } - } - - if (skb->tstamp == 0) - skb->tstamp = ktime_get_real(); - trace_rxrpc_rx_packet(sp); - - switch (sp->hdr.type) { - case RXRPC_PACKET_TYPE_VERSION: - if (rxrpc_to_client(sp)) - goto discard; - rxrpc_post_packet_to_local(local, skb); - goto out; - - case RXRPC_PACKET_TYPE_BUSY: - if (rxrpc_to_server(sp)) - goto discard; - fallthrough; - case RXRPC_PACKET_TYPE_ACK: - case RXRPC_PACKET_TYPE_ACKALL: - if (sp->hdr.callNumber == 0) - goto bad_message; - break; - case RXRPC_PACKET_TYPE_ABORT: - if (!rxrpc_extract_abort(skb)) - return true; /* Just discard if malformed */ - break; - - case RXRPC_PACKET_TYPE_DATA: - if (sp->hdr.callNumber == 0 || - sp->hdr.seq == 0) - goto bad_message; - - /* Unshare the packet so that it can be modified for in-place - * decryption. - */ - if (sp->hdr.securityIndex != 0) { - struct sk_buff *nskb = skb_unshare(skb, GFP_ATOMIC); - if (!nskb) { - rxrpc_eaten_skb(skb, rxrpc_skb_eaten_by_unshare_nomem); - goto out; - } - - if (nskb != skb) { - rxrpc_eaten_skb(skb, rxrpc_skb_eaten_by_unshare); - skb = nskb; - rxrpc_new_skb(skb, rxrpc_skb_new_unshared); - sp = rxrpc_skb(skb); - } - } - break; - - case RXRPC_PACKET_TYPE_CHALLENGE: - if (rxrpc_to_server(sp)) - goto discard; - break; - case RXRPC_PACKET_TYPE_RESPONSE: - if (rxrpc_to_client(sp)) - goto discard; - break; - - /* Packet types 9-11 should just be ignored. */ - case RXRPC_PACKET_TYPE_PARAMS: - case RXRPC_PACKET_TYPE_10: - case RXRPC_PACKET_TYPE_11: - goto discard; - - default: - goto bad_message; - } - - if (sp->hdr.serviceId == 0) - goto bad_message; - - if (rxrpc_to_server(sp)) { - /* Weed out packets to services we're not offering. Packets - * that would begin a call are explicitly rejected and the rest - * are just discarded. - */ - rx = rcu_dereference(local->service); - if (!rx || (sp->hdr.serviceId != rx->srx.srx_service && - sp->hdr.serviceId != rx->second_service)) { - if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && - sp->hdr.seq == 1) - goto unsupported_service; - goto discard; - } - } - - conn = rxrpc_find_connection_rcu(local, skb, &peer); - if (conn) { - if (sp->hdr.securityIndex != conn->security_ix) - goto wrong_security; - - if (sp->hdr.serviceId != conn->service_id) { - int old_id; - - if (!test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags)) - goto reupgrade; - old_id = cmpxchg(&conn->service_id, conn->orig_service_id, - sp->hdr.serviceId); - - if (old_id != conn->orig_service_id && - old_id != sp->hdr.serviceId) - goto reupgrade; - } - - if (sp->hdr.callNumber == 0) { - /* Connection-level packet */ - _debug("CONN %p {%d}", conn, conn->debug_id); - rxrpc_post_packet_to_conn(conn, skb); - goto out; - } - - if ((int)sp->hdr.serial - (int)conn->hi_serial > 0) - conn->hi_serial = sp->hdr.serial; - - /* Call-bound packets are routed by connection channel. */ - channel = sp->hdr.cid & RXRPC_CHANNELMASK; - chan = &conn->channels[channel]; - - /* Ignore really old calls */ - if (sp->hdr.callNumber < chan->last_call) - goto discard; - - if (sp->hdr.callNumber == chan->last_call) { - if (chan->call || - sp->hdr.type == RXRPC_PACKET_TYPE_ABORT) - goto discard; - - /* For the previous service call, if completed - * successfully, we discard all further packets. - */ - if (rxrpc_conn_is_service(conn) && - chan->last_type == RXRPC_PACKET_TYPE_ACK) - goto discard; - - /* But otherwise we need to retransmit the final packet - * from data cached in the connection record. - */ - if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) - trace_rxrpc_rx_data(chan->call_debug_id, - sp->hdr.seq, - sp->hdr.serial, - sp->hdr.flags); - rxrpc_post_packet_to_conn(conn, skb); - goto out; - } - - call = rcu_dereference(chan->call); - - if (sp->hdr.callNumber > chan->call_id) { - if (rxrpc_to_client(sp)) - goto reject_packet; - if (call) - rxrpc_input_implicit_end_call(rx, conn, call); - call = NULL; - } - - if (call) { - if (sp->hdr.serviceId != call->service_id) - call->service_id = sp->hdr.serviceId; - if ((int)sp->hdr.serial - (int)call->rx_serial > 0) - call->rx_serial = sp->hdr.serial; - if (!test_bit(RXRPC_CALL_RX_HEARD, &call->flags)) - set_bit(RXRPC_CALL_RX_HEARD, &call->flags); - } - } - - if (!call || refcount_read(&call->ref) == 0) { - if (rxrpc_to_client(sp) || - sp->hdr.type != RXRPC_PACKET_TYPE_DATA) - goto bad_message; - if (sp->hdr.seq != 1) - goto discard; - call = rxrpc_new_incoming_call(local, rx, skb); - if (!call) - goto reject_packet; - } - - /* Process a call packet; this either discards or passes on the ref - * elsewhere. - */ - rxrpc_input_call_packet(call, skb); - goto out; - -discard: - rxrpc_free_skb(skb, rxrpc_skb_put_input); -out: - trace_rxrpc_rx_done(0, 0); - return 0; - -wrong_security: - trace_rxrpc_abort(0, "SEC", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - RXKADINCONSISTENCY, EBADMSG); - skb->priority = RXKADINCONSISTENCY; - goto post_abort; - -unsupported_service: - trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - RX_INVALID_OPERATION, EOPNOTSUPP); - skb->priority = RX_INVALID_OPERATION; - goto post_abort; - -reupgrade: - trace_rxrpc_abort(0, "UPG", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - RX_PROTOCOL_ERROR, EBADMSG); - goto protocol_error; - -bad_message: - trace_rxrpc_abort(0, "BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - RX_PROTOCOL_ERROR, EBADMSG); -protocol_error: - skb->priority = RX_PROTOCOL_ERROR; -post_abort: - skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; -reject_packet: - trace_rxrpc_rx_done(skb->mark, skb->priority); - rxrpc_reject_packet(local, skb); - _leave(" [badmsg]"); - return 0; -} diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c new file mode 100644 index 000000000000..d2aaad5afa1d --- /dev/null +++ b/net/rxrpc/io_thread.c @@ -0,0 +1,370 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* RxRPC packet reception + * + * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "ar-internal.h" + +/* + * post connection-level events to the connection + * - this includes challenges, responses, some aborts and call terminal packet + * retransmission. + */ +static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + _enter("%p,%p", conn, skb); + + skb_queue_tail(&conn->rx_queue, skb); + rxrpc_queue_conn(conn, rxrpc_conn_queue_rx_work); +} + +/* + * post endpoint-level events to the local endpoint + * - this includes debug and version messages + */ +static void rxrpc_post_packet_to_local(struct rxrpc_local *local, + struct sk_buff *skb) +{ + _enter("%p,%p", local, skb); + + if (rxrpc_get_local_maybe(local, rxrpc_local_get_queue)) { + skb_queue_tail(&local->event_queue, skb); + rxrpc_queue_local(local); + } else { + rxrpc_free_skb(skb, rxrpc_skb_put_input); + } +} + +/* + * put a packet up for transport-level abort + */ +static void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) +{ + if (rxrpc_get_local_maybe(local, rxrpc_local_get_queue)) { + skb_queue_tail(&local->reject_queue, skb); + rxrpc_queue_local(local); + } else { + rxrpc_free_skb(skb, rxrpc_skb_put_input); + } +} + +/* + * Extract the wire header from a packet and translate the byte order. + */ +static noinline +int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) +{ + struct rxrpc_wire_header whdr; + + /* dig out the RxRPC connection details */ + if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0) { + trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, + tracepoint_string("bad_hdr")); + return -EBADMSG; + } + + memset(sp, 0, sizeof(*sp)); + sp->hdr.epoch = ntohl(whdr.epoch); + sp->hdr.cid = ntohl(whdr.cid); + sp->hdr.callNumber = ntohl(whdr.callNumber); + sp->hdr.seq = ntohl(whdr.seq); + sp->hdr.serial = ntohl(whdr.serial); + sp->hdr.flags = whdr.flags; + sp->hdr.type = whdr.type; + sp->hdr.userStatus = whdr.userStatus; + sp->hdr.securityIndex = whdr.securityIndex; + sp->hdr._rsvd = ntohs(whdr._rsvd); + sp->hdr.serviceId = ntohs(whdr.serviceId); + return 0; +} + +/* + * Extract the abort code from an ABORT packet and stash it in skb->priority. + */ +static bool rxrpc_extract_abort(struct sk_buff *skb) +{ + __be32 wtmp; + + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), + &wtmp, sizeof(wtmp)) < 0) + return false; + skb->priority = ntohl(wtmp); + return true; +} + +/* + * handle data received on the local endpoint + * - may be called in interrupt context + * + * [!] Note that as this is called from the encap_rcv hook, the socket is not + * held locked by the caller and nothing prevents sk_user_data on the UDP from + * being cleared in the middle of processing this function. + * + * Called with the RCU read lock held from the IP layer via UDP. + */ +int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) +{ + struct rxrpc_local *local = rcu_dereference_sk_user_data(udp_sk); + struct rxrpc_connection *conn; + struct rxrpc_channel *chan; + struct rxrpc_call *call = NULL; + struct rxrpc_skb_priv *sp; + struct rxrpc_peer *peer = NULL; + struct rxrpc_sock *rx = NULL; + unsigned int channel; + + _enter("%p", udp_sk); + + if (unlikely(!local)) { + kfree_skb(skb); + return 0; + } + if (skb->tstamp == 0) + skb->tstamp = ktime_get_real(); + + rxrpc_new_skb(skb, rxrpc_skb_new_encap_rcv); + + skb_pull(skb, sizeof(struct udphdr)); + + /* The UDP protocol already released all skb resources; + * we are free to add our own data there. + */ + sp = rxrpc_skb(skb); + + /* dig out the RxRPC connection details */ + if (rxrpc_extract_header(sp, skb) < 0) + goto bad_message; + + if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { + static int lose; + if ((lose++ & 7) == 7) { + trace_rxrpc_rx_lose(sp); + rxrpc_free_skb(skb, rxrpc_skb_put_lose); + return 0; + } + } + + if (skb->tstamp == 0) + skb->tstamp = ktime_get_real(); + trace_rxrpc_rx_packet(sp); + + switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_VERSION: + if (rxrpc_to_client(sp)) + goto discard; + rxrpc_post_packet_to_local(local, skb); + goto out; + + case RXRPC_PACKET_TYPE_BUSY: + if (rxrpc_to_server(sp)) + goto discard; + fallthrough; + case RXRPC_PACKET_TYPE_ACK: + case RXRPC_PACKET_TYPE_ACKALL: + if (sp->hdr.callNumber == 0) + goto bad_message; + break; + case RXRPC_PACKET_TYPE_ABORT: + if (!rxrpc_extract_abort(skb)) + return true; /* Just discard if malformed */ + break; + + case RXRPC_PACKET_TYPE_DATA: + if (sp->hdr.callNumber == 0 || + sp->hdr.seq == 0) + goto bad_message; + + /* Unshare the packet so that it can be modified for in-place + * decryption. + */ + if (sp->hdr.securityIndex != 0) { + struct sk_buff *nskb = skb_unshare(skb, GFP_ATOMIC); + if (!nskb) { + rxrpc_eaten_skb(skb, rxrpc_skb_eaten_by_unshare_nomem); + goto out; + } + + if (nskb != skb) { + rxrpc_eaten_skb(skb, rxrpc_skb_eaten_by_unshare); + skb = nskb; + rxrpc_new_skb(skb, rxrpc_skb_new_unshared); + sp = rxrpc_skb(skb); + } + } + break; + + case RXRPC_PACKET_TYPE_CHALLENGE: + if (rxrpc_to_server(sp)) + goto discard; + break; + case RXRPC_PACKET_TYPE_RESPONSE: + if (rxrpc_to_client(sp)) + goto discard; + break; + + /* Packet types 9-11 should just be ignored. */ + case RXRPC_PACKET_TYPE_PARAMS: + case RXRPC_PACKET_TYPE_10: + case RXRPC_PACKET_TYPE_11: + goto discard; + + default: + goto bad_message; + } + + if (sp->hdr.serviceId == 0) + goto bad_message; + + if (rxrpc_to_server(sp)) { + /* Weed out packets to services we're not offering. Packets + * that would begin a call are explicitly rejected and the rest + * are just discarded. + */ + rx = rcu_dereference(local->service); + if (!rx || (sp->hdr.serviceId != rx->srx.srx_service && + sp->hdr.serviceId != rx->second_service)) { + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && + sp->hdr.seq == 1) + goto unsupported_service; + goto discard; + } + } + + conn = rxrpc_find_connection_rcu(local, skb, &peer); + if (conn) { + if (sp->hdr.securityIndex != conn->security_ix) + goto wrong_security; + + if (sp->hdr.serviceId != conn->service_id) { + int old_id; + + if (!test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags)) + goto reupgrade; + old_id = cmpxchg(&conn->service_id, conn->orig_service_id, + sp->hdr.serviceId); + + if (old_id != conn->orig_service_id && + old_id != sp->hdr.serviceId) + goto reupgrade; + } + + if (sp->hdr.callNumber == 0) { + /* Connection-level packet */ + _debug("CONN %p {%d}", conn, conn->debug_id); + rxrpc_post_packet_to_conn(conn, skb); + goto out; + } + + if ((int)sp->hdr.serial - (int)conn->hi_serial > 0) + conn->hi_serial = sp->hdr.serial; + + /* Call-bound packets are routed by connection channel. */ + channel = sp->hdr.cid & RXRPC_CHANNELMASK; + chan = &conn->channels[channel]; + + /* Ignore really old calls */ + if (sp->hdr.callNumber < chan->last_call) + goto discard; + + if (sp->hdr.callNumber == chan->last_call) { + if (chan->call || + sp->hdr.type == RXRPC_PACKET_TYPE_ABORT) + goto discard; + + /* For the previous service call, if completed + * successfully, we discard all further packets. + */ + if (rxrpc_conn_is_service(conn) && + chan->last_type == RXRPC_PACKET_TYPE_ACK) + goto discard; + + /* But otherwise we need to retransmit the final packet + * from data cached in the connection record. + */ + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) + trace_rxrpc_rx_data(chan->call_debug_id, + sp->hdr.seq, + sp->hdr.serial, + sp->hdr.flags); + rxrpc_post_packet_to_conn(conn, skb); + goto out; + } + + call = rcu_dereference(chan->call); + + if (sp->hdr.callNumber > chan->call_id) { + if (rxrpc_to_client(sp)) + goto reject_packet; + if (call) + rxrpc_input_implicit_end_call(rx, conn, call); + call = NULL; + } + + if (call) { + if (sp->hdr.serviceId != call->service_id) + call->service_id = sp->hdr.serviceId; + if ((int)sp->hdr.serial - (int)call->rx_serial > 0) + call->rx_serial = sp->hdr.serial; + if (!test_bit(RXRPC_CALL_RX_HEARD, &call->flags)) + set_bit(RXRPC_CALL_RX_HEARD, &call->flags); + } + } + + if (!call || refcount_read(&call->ref) == 0) { + if (rxrpc_to_client(sp) || + sp->hdr.type != RXRPC_PACKET_TYPE_DATA) + goto bad_message; + if (sp->hdr.seq != 1) + goto discard; + call = rxrpc_new_incoming_call(local, rx, skb); + if (!call) + goto reject_packet; + } + + /* Process a call packet; this either discards or passes on the ref + * elsewhere. + */ + rxrpc_input_call_packet(call, skb); + goto out; + +discard: + rxrpc_free_skb(skb, rxrpc_skb_put_input); +out: + trace_rxrpc_rx_done(0, 0); + return 0; + +wrong_security: + trace_rxrpc_abort(0, "SEC", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RXKADINCONSISTENCY, EBADMSG); + skb->priority = RXKADINCONSISTENCY; + goto post_abort; + +unsupported_service: + trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RX_INVALID_OPERATION, EOPNOTSUPP); + skb->priority = RX_INVALID_OPERATION; + goto post_abort; + +reupgrade: + trace_rxrpc_abort(0, "UPG", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RX_PROTOCOL_ERROR, EBADMSG); + goto protocol_error; + +bad_message: + trace_rxrpc_abort(0, "BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RX_PROTOCOL_ERROR, EBADMSG); +protocol_error: + skb->priority = RX_PROTOCOL_ERROR; +post_abort: + skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; +reject_packet: + trace_rxrpc_rx_done(skb->mark, skb->priority); + rxrpc_reject_packet(local, skb); + _leave(" [badmsg]"); + return 0; +} -- cgit v1.2.3-70-g09d2 From a275da62e8c111b897b9cb73eb91df2f4e475ca5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Oct 2022 08:45:20 +0100 Subject: rxrpc: Create a per-local endpoint receive queue and I/O thread Create a per-local receive queue to which, in a future patch, all incoming packets will be directed and an I/O thread that will process those packets and perform all transmission of packets. Destruction of the local endpoint is also moved from the local processor work item (which will be absorbed) to the thread. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 10 ++++++++++ net/rxrpc/io_thread.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++- net/rxrpc/local_object.c | 39 ++++++++++++++++++++---------------- net/rxrpc/proc.c | 12 +++++++++--- 4 files changed, 91 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 523cc9c5ab12..de82c25956a6 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -110,6 +110,8 @@ struct rxrpc_net { atomic_t stat_rx_acks[256]; atomic_t stat_why_req_ack[8]; + + atomic_t stat_io_loop; }; /* @@ -280,12 +282,14 @@ struct rxrpc_local { struct hlist_node link; struct socket *socket; /* my UDP socket */ struct work_struct processor; + struct task_struct *io_thread; struct list_head ack_tx_queue; /* List of ACKs that need sending */ spinlock_t ack_tx_lock; /* ACK list lock */ struct rxrpc_sock __rcu *service; /* Service(s) listening on this endpoint */ struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ struct sk_buff_head reject_queue; /* packets awaiting rejection */ struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */ + struct sk_buff_head rx_queue; /* Received packets */ struct rb_root client_bundles; /* Client connection bundles by socket params */ spinlock_t client_bundles_lock; /* Lock for client_bundles */ spinlock_t lock; /* access lock */ @@ -954,6 +958,11 @@ void rxrpc_input_implicit_end_call(struct rxrpc_sock *, struct rxrpc_connection * io_thread.c */ int rxrpc_input_packet(struct sock *, struct sk_buff *); +int rxrpc_io_thread(void *data); +static inline void rxrpc_wake_up_io_thread(struct rxrpc_local *local) +{ + wake_up_process(local->io_thread); +} /* * insecure.c @@ -984,6 +993,7 @@ void rxrpc_put_local(struct rxrpc_local *, enum rxrpc_local_trace); struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *, enum rxrpc_local_trace); void rxrpc_unuse_local(struct rxrpc_local *, enum rxrpc_local_trace); void rxrpc_queue_local(struct rxrpc_local *); +void rxrpc_destroy_local(struct rxrpc_local *local); void rxrpc_destroy_all_locals(struct rxrpc_net *); static inline bool __rxrpc_unuse_local(struct rxrpc_local *local, diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index d2aaad5afa1d..0b3e096e3d50 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* RxRPC packet reception * - * Copyright (C) 2007, 2016 Red Hat, Inc. All Rights Reserved. + * Copyright (C) 2007, 2016, 2022 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ @@ -368,3 +368,52 @@ reject_packet: _leave(" [badmsg]"); return 0; } + +/* + * I/O and event handling thread. + */ +int rxrpc_io_thread(void *data) +{ + struct sk_buff_head rx_queue; + struct rxrpc_local *local = data; + struct sk_buff *skb; + + skb_queue_head_init(&rx_queue); + + set_user_nice(current, MIN_NICE); + + for (;;) { + rxrpc_inc_stat(local->rxnet, stat_io_loop); + + /* Process received packets and errors. */ + if ((skb = __skb_dequeue(&rx_queue))) { + // TODO: Input packet + rxrpc_free_skb(skb, rxrpc_skb_put_input); + continue; + } + + if (!skb_queue_empty(&local->rx_queue)) { + spin_lock_irq(&local->rx_queue.lock); + skb_queue_splice_tail_init(&local->rx_queue, &rx_queue); + spin_unlock_irq(&local->rx_queue.lock); + continue; + } + + set_current_state(TASK_INTERRUPTIBLE); + if (!skb_queue_empty(&local->rx_queue)) { + __set_current_state(TASK_RUNNING); + continue; + } + + if (kthread_should_stop()) + break; + schedule(); + } + + __set_current_state(TASK_RUNNING); + rxrpc_see_local(local, rxrpc_local_stop); + rxrpc_destroy_local(local); + local->io_thread = NULL; + rxrpc_see_local(local, rxrpc_local_stopped); + return 0; +} diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 1617ce651b9b..7c61349984e3 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -103,6 +103,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, init_rwsem(&local->defrag_sem); skb_queue_head_init(&local->reject_queue); skb_queue_head_init(&local->event_queue); + skb_queue_head_init(&local->rx_queue); local->client_bundles = RB_ROOT; spin_lock_init(&local->client_bundles_lock); spin_lock_init(&local->lock); @@ -126,6 +127,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) struct udp_tunnel_sock_cfg tuncfg = {NULL}; struct sockaddr_rxrpc *srx = &local->srx; struct udp_port_cfg udp_conf = {0}; + struct task_struct *io_thread; struct sock *usk; int ret; @@ -185,8 +187,23 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) BUG(); } + io_thread = kthread_run(rxrpc_io_thread, local, + "krxrpcio/%u", ntohs(udp_conf.local_udp_port)); + if (IS_ERR(io_thread)) { + ret = PTR_ERR(io_thread); + goto error_sock; + } + + local->io_thread = io_thread; _leave(" = 0"); return 0; + +error_sock: + kernel_sock_shutdown(local->socket, SHUT_RDWR); + local->socket->sk->sk_user_data = NULL; + sock_release(local->socket); + local->socket = NULL; + return ret; } /* @@ -360,19 +377,8 @@ struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *local, */ void rxrpc_unuse_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { - unsigned int debug_id; - int r, u; - - if (local) { - debug_id = local->debug_id; - r = refcount_read(&local->ref); - u = atomic_dec_return(&local->active_users); - trace_rxrpc_local(debug_id, why, r, u); - if (u == 0) { - rxrpc_get_local(local, rxrpc_local_get_queue); - rxrpc_queue_local(local); - } - } + if (local && __rxrpc_unuse_local(local, why)) + kthread_stop(local->io_thread); } /* @@ -382,7 +388,7 @@ void rxrpc_unuse_local(struct rxrpc_local *local, enum rxrpc_local_trace why) * Closing the socket cannot be done from bottom half context or RCU callback * context because it might sleep. */ -static void rxrpc_local_destroyer(struct rxrpc_local *local) +void rxrpc_destroy_local(struct rxrpc_local *local) { struct socket *socket = local->socket; struct rxrpc_net *rxnet = local->rxnet; @@ -411,6 +417,7 @@ static void rxrpc_local_destroyer(struct rxrpc_local *local) */ rxrpc_purge_queue(&local->reject_queue); rxrpc_purge_queue(&local->event_queue); + rxrpc_purge_queue(&local->rx_queue); } /* @@ -430,10 +437,8 @@ static void rxrpc_local_processor(struct work_struct *work) do { again = false; - if (!__rxrpc_use_local(local, rxrpc_local_use_work)) { - rxrpc_local_destroyer(local); + if (!__rxrpc_use_local(local, rxrpc_local_use_work)) break; - } if (!list_empty(&local->ack_tx_queue)) { rxrpc_transmit_ack_packets(local); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index d3a6d24cf871..35d5b43c677e 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -342,7 +342,7 @@ static int rxrpc_local_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) { seq_puts(seq, "Proto Local " - " Use Act\n"); + " Use Act RxQ\n"); return 0; } @@ -351,10 +351,11 @@ static int rxrpc_local_seq_show(struct seq_file *seq, void *v) sprintf(lbuff, "%pISpc", &local->srx.transport); seq_printf(seq, - "UDP %-47.47s %3u %3u\n", + "UDP %-47.47s %3u %3u %3u\n", lbuff, refcount_read(&local->ref), - atomic_read(&local->active_users)); + atomic_read(&local->active_users), + local->rx_queue.qlen); return 0; } @@ -463,6 +464,9 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) "Buffers : txb=%u rxb=%u\n", atomic_read(&rxrpc_nr_txbuf), atomic_read(&rxrpc_n_rx_skbs)); + seq_printf(seq, + "IO-thread: loops=%u\n", + atomic_read(&rxnet->stat_io_loop)); return 0; } @@ -492,5 +496,7 @@ int rxrpc_stats_clear(struct file *file, char *buf, size_t size) memset(&rxnet->stat_rx_acks, 0, sizeof(rxnet->stat_rx_acks)); memset(&rxnet->stat_why_req_ack, 0, sizeof(rxnet->stat_why_req_ack)); + + atomic_set(&rxnet->stat_io_loop, 0); return size; } -- cgit v1.2.3-70-g09d2 From 446b3e14525b477e441a6bb8ce56cea12512acc2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Oct 2022 10:55:24 +0100 Subject: rxrpc: Move packet reception processing into I/O thread Split the packet input handler to make the softirq side just dump the received packet into the local endpoint receive queue and then call the remainder of the input function from the I/O thread. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 3 ++- net/rxrpc/call_event.c | 4 ++-- net/rxrpc/call_object.c | 2 +- net/rxrpc/io_thread.c | 61 ++++++++++++++++++++++++++++++++---------------- net/rxrpc/local_object.c | 2 +- 5 files changed, 47 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index de82c25956a6..044815ba2b49 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -36,6 +36,7 @@ struct rxrpc_txbuf; * to pass supplementary information. */ enum rxrpc_skb_mark { + RXRPC_SKB_MARK_PACKET, /* Received packet */ RXRPC_SKB_MARK_REJECT_BUSY, /* Reject with BUSY */ RXRPC_SKB_MARK_REJECT_ABORT, /* Reject with ABORT (code in skb->priority) */ }; @@ -957,7 +958,7 @@ void rxrpc_input_implicit_end_call(struct rxrpc_sock *, struct rxrpc_connection /* * io_thread.c */ -int rxrpc_input_packet(struct sock *, struct sk_buff *); +int rxrpc_encap_rcv(struct sock *, struct sk_buff *); int rxrpc_io_thread(void *data); static inline void rxrpc_wake_up_io_thread(struct rxrpc_local *local) { diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 049b92b1c040..3925b55e2064 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -83,7 +83,7 @@ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, rxrpc_inc_stat(call->rxnet, stat_tx_acks[ack_reason]); txb = rxrpc_alloc_txbuf(call, RXRPC_PACKET_TYPE_ACK, - in_softirq() ? GFP_ATOMIC | __GFP_NOWARN : GFP_NOFS); + rcu_read_lock_held() ? GFP_ATOMIC | __GFP_NOWARN : GFP_NOFS); if (!txb) { kleave(" = -ENOMEM"); return; @@ -111,7 +111,7 @@ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, spin_unlock_bh(&local->ack_tx_lock); trace_rxrpc_send_ack(call, why, ack_reason, serial); - if (in_task()) { + if (!rcu_read_lock_held()) { rxrpc_transmit_ack_packets(call->peer->local); } else { rxrpc_get_local(local, rxrpc_local_get_queue); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 9cd7e0190ef4..57c8d4cc900a 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -632,7 +632,7 @@ void rxrpc_cleanup_call(struct rxrpc_call *call) del_timer_sync(&call->timer); cancel_work(&call->processor); - if (in_softirq() || work_busy(&call->processor)) + if (rcu_read_lock_held() || work_busy(&call->processor)) /* Can't use the rxrpc workqueue as we need to cancel/flush * something that may be running/waiting there. */ diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 0b3e096e3d50..ee2e36c46ae2 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -9,6 +9,34 @@ #include "ar-internal.h" +/* + * handle data received on the local endpoint + * - may be called in interrupt context + * + * [!] Note that as this is called from the encap_rcv hook, the socket is not + * held locked by the caller and nothing prevents sk_user_data on the UDP from + * being cleared in the middle of processing this function. + * + * Called with the RCU read lock held from the IP layer via UDP. + */ +int rxrpc_encap_rcv(struct sock *udp_sk, struct sk_buff *skb) +{ + struct rxrpc_local *local = rcu_dereference_sk_user_data(udp_sk); + + if (unlikely(!local)) { + kfree_skb(skb); + return 0; + } + if (skb->tstamp == 0) + skb->tstamp = ktime_get_real(); + + skb->mark = RXRPC_SKB_MARK_PACKET; + rxrpc_new_skb(skb, rxrpc_skb_new_encap_rcv); + skb_queue_tail(&local->rx_queue, skb); + rxrpc_wake_up_io_thread(local); + return 0; +} + /* * post connection-level events to the connection * - this includes challenges, responses, some aborts and call terminal packet @@ -98,18 +126,10 @@ static bool rxrpc_extract_abort(struct sk_buff *skb) } /* - * handle data received on the local endpoint - * - may be called in interrupt context - * - * [!] Note that as this is called from the encap_rcv hook, the socket is not - * held locked by the caller and nothing prevents sk_user_data on the UDP from - * being cleared in the middle of processing this function. - * - * Called with the RCU read lock held from the IP layer via UDP. + * Process packets received on the local endpoint */ -int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) +static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff *skb) { - struct rxrpc_local *local = rcu_dereference_sk_user_data(udp_sk); struct rxrpc_connection *conn; struct rxrpc_channel *chan; struct rxrpc_call *call = NULL; @@ -118,17 +138,9 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) struct rxrpc_sock *rx = NULL; unsigned int channel; - _enter("%p", udp_sk); - - if (unlikely(!local)) { - kfree_skb(skb); - return 0; - } if (skb->tstamp == 0) skb->tstamp = ktime_get_real(); - rxrpc_new_skb(skb, rxrpc_skb_new_encap_rcv); - skb_pull(skb, sizeof(struct udphdr)); /* The UDP protocol already released all skb resources; @@ -387,8 +399,17 @@ int rxrpc_io_thread(void *data) /* Process received packets and errors. */ if ((skb = __skb_dequeue(&rx_queue))) { - // TODO: Input packet - rxrpc_free_skb(skb, rxrpc_skb_put_input); + switch (skb->mark) { + case RXRPC_SKB_MARK_PACKET: + rcu_read_lock(); + rxrpc_input_packet(local, skb); + rcu_read_unlock(); + break; + default: + WARN_ON_ONCE(1); + rxrpc_free_skb(skb, rxrpc_skb_put_unknown); + break; + } continue; } diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 7c61349984e3..6b4d77219f36 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -154,7 +154,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) } tuncfg.encap_type = UDP_ENCAP_RXRPC; - tuncfg.encap_rcv = rxrpc_input_packet; + tuncfg.encap_rcv = rxrpc_encap_rcv; tuncfg.encap_err_rcv = rxrpc_encap_err_rcv; tuncfg.sk_user_data = local; setup_udp_tunnel_sock(net, local->socket, &tuncfg); -- cgit v1.2.3-70-g09d2 From ff7348254e704b6d0121970e311a6b699268e1ac Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Oct 2022 11:47:31 +0100 Subject: rxrpc: Move error processing into the local endpoint I/O thread Move the processing of error packets into the local endpoint I/O thread, leaving the handover from UDP to merely transfer them into the local endpoint queue. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 4 +++- net/rxrpc/io_thread.c | 29 +++++++++++++++++++++++++++++ net/rxrpc/peer_event.c | 41 ++++++----------------------------------- 3 files changed, 38 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 044815ba2b49..566377c64184 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -37,6 +37,7 @@ struct rxrpc_txbuf; */ enum rxrpc_skb_mark { RXRPC_SKB_MARK_PACKET, /* Received packet */ + RXRPC_SKB_MARK_ERROR, /* Error notification */ RXRPC_SKB_MARK_REJECT_BUSY, /* Reject with BUSY */ RXRPC_SKB_MARK_REJECT_ABORT, /* Reject with ABORT (code in skb->priority) */ }; @@ -959,6 +960,7 @@ void rxrpc_input_implicit_end_call(struct rxrpc_sock *, struct rxrpc_connection * io_thread.c */ int rxrpc_encap_rcv(struct sock *, struct sk_buff *); +void rxrpc_error_report(struct sock *); int rxrpc_io_thread(void *data); static inline void rxrpc_wake_up_io_thread(struct rxrpc_local *local) { @@ -1063,7 +1065,7 @@ void rxrpc_send_keepalive(struct rxrpc_peer *); /* * peer_event.c */ -void rxrpc_error_report(struct sock *); +void rxrpc_input_error(struct rxrpc_local *, struct sk_buff *); void rxrpc_peer_keepalive_worker(struct work_struct *); /* diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index ee2e36c46ae2..416c6101cf78 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -37,6 +37,31 @@ int rxrpc_encap_rcv(struct sock *udp_sk, struct sk_buff *skb) return 0; } +/* + * Handle an error received on the local endpoint. + */ +void rxrpc_error_report(struct sock *sk) +{ + struct rxrpc_local *local; + struct sk_buff *skb; + + rcu_read_lock(); + local = rcu_dereference_sk_user_data(sk); + if (unlikely(!local)) { + rcu_read_unlock(); + return; + } + + while ((skb = skb_dequeue(&sk->sk_error_queue))) { + skb->mark = RXRPC_SKB_MARK_ERROR; + rxrpc_new_skb(skb, rxrpc_skb_new_error_report); + skb_queue_tail(&local->rx_queue, skb); + } + + rxrpc_wake_up_io_thread(local); + rcu_read_unlock(); +} + /* * post connection-level events to the connection * - this includes challenges, responses, some aborts and call terminal packet @@ -405,6 +430,10 @@ int rxrpc_io_thread(void *data) rxrpc_input_packet(local, skb); rcu_read_unlock(); break; + case RXRPC_SKB_MARK_ERROR: + rxrpc_input_error(local, skb); + rxrpc_free_skb(skb, rxrpc_skb_put_error_report); + break; default: WARN_ON_ONCE(1); rxrpc_free_skb(skb, rxrpc_skb_put_unknown); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index f35cfc458dcf..94f63fb1bd67 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -131,51 +131,26 @@ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) /* * Handle an error received on the local endpoint. */ -void rxrpc_error_report(struct sock *sk) +void rxrpc_input_error(struct rxrpc_local *local, struct sk_buff *skb) { - struct sock_exterr_skb *serr; + struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); struct sockaddr_rxrpc srx; - struct rxrpc_local *local; struct rxrpc_peer *peer = NULL; - struct sk_buff *skb; - rcu_read_lock(); - local = rcu_dereference_sk_user_data(sk); - if (unlikely(!local)) { - rcu_read_unlock(); - return; - } - _enter("%p{%d}", sk, local->debug_id); + _enter("L=%x", local->debug_id); - /* Clear the outstanding error value on the socket so that it doesn't - * cause kernel_sendmsg() to return it later. - */ - sock_error(sk); - - skb = sock_dequeue_err_skb(sk); - if (!skb) { - rcu_read_unlock(); - _leave("UDP socket errqueue empty"); - return; - } - rxrpc_new_skb(skb, rxrpc_skb_new_error_report); - serr = SKB_EXT_ERR(skb); if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { _leave("UDP empty message"); - rcu_read_unlock(); - rxrpc_free_skb(skb, rxrpc_skb_put_error_report); return; } + rcu_read_lock(); peer = rxrpc_lookup_peer_local_rcu(local, skb, &srx); if (peer && !rxrpc_get_peer_maybe(peer, rxrpc_peer_get_input_error)) peer = NULL; - if (!peer) { - rcu_read_unlock(); - rxrpc_free_skb(skb, rxrpc_skb_put_error_report); - _leave(" [no peer]"); + rcu_read_unlock(); + if (!peer) return; - } trace_rxrpc_rx_icmp(peer, &serr->ee, &srx); @@ -188,11 +163,7 @@ void rxrpc_error_report(struct sock *sk) rxrpc_store_error(peer, serr); out: - rcu_read_unlock(); - rxrpc_free_skb(skb, rxrpc_skb_put_error_report); rxrpc_put_peer(peer, rxrpc_peer_put_input_error); - - _leave(""); } /* -- cgit v1.2.3-70-g09d2 From 4041a8ff653ec4e4d9e6395802cb3f4fca59f7f3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 23 Jan 2020 22:21:59 +0000 Subject: rxrpc: Remove call->input_lock Remove call->input_lock as it was only necessary to serialise access to the state stored in the rxrpc_call struct by simultaneous softirq handlers presenting received packets. They now dump the packets in a queue and a single process-context handler now processes them. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 1 - net/rxrpc/call_object.c | 1 - net/rxrpc/input.c | 22 +++++----------------- 3 files changed, 5 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 566377c64184..654e9dab107c 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -657,7 +657,6 @@ struct rxrpc_call { rxrpc_seq_t rx_consumed; /* Highest packet consumed */ rxrpc_serial_t rx_serial; /* Highest serial received for this call */ u8 rx_winsize; /* Size of Rx window */ - spinlock_t input_lock; /* Lock for packet input to this call */ /* TCP-style slow-start congestion control [RFC5681]. Since the SMSS * is fixed, we keep these numbers in terms of segments (ie. DATA diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 57c8d4cc900a..f6d1b3a6f8c6 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -143,7 +143,6 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, init_waitqueue_head(&call->waitq); spin_lock_init(&call->notify_lock); spin_lock_init(&call->tx_lock); - spin_lock_init(&call->input_lock); spin_lock_init(&call->acks_ack_lock); rwlock_init(&call->state_lock); refcount_set(&call->ref, 1); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index f4f6f3c62d03..13c52145a926 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -588,8 +588,6 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) } } - spin_lock(&call->input_lock); - /* Received data implicitly ACKs all of the request packets we sent * when we're acting as a client. */ @@ -607,8 +605,6 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) out: trace_rxrpc_notify_socket(call->debug_id, serial); rxrpc_notify_socket(call); - - spin_unlock(&call->input_lock); rxrpc_free_skb(skb, rxrpc_skb_put_input); _leave(" [queued]"); } @@ -811,7 +807,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) offset = sizeof(struct rxrpc_wire_header); if (skb_copy_bits(skb, offset, &ack, sizeof(ack)) < 0) { rxrpc_proto_abort("XAK", call, 0); - goto out_not_locked; + goto out; } offset += sizeof(ack); @@ -863,7 +859,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_is_client_call(call)) { rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, 0, -ENETRESET); - return; + goto out; } /* If we get an OUT_OF_SEQUENCE ACK from the server, that can also @@ -877,7 +873,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_is_client_call(call)) { rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, 0, -ENETRESET); - return; + goto out; } /* Discard any out-of-order or duplicate ACKs (outside lock). */ @@ -885,7 +881,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) trace_rxrpc_rx_discard_ack(call->debug_id, ack_serial, first_soft_ack, call->acks_first_seq, prev_pkt, call->acks_prev_seq); - goto out_not_locked; + goto out; } info.rxMTU = 0; @@ -893,14 +889,12 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (skb->len >= ioffset + sizeof(info) && skb_copy_bits(skb, ioffset, &info, sizeof(info)) < 0) { rxrpc_proto_abort("XAI", call, 0); - goto out_not_locked; + goto out; } if (nr_acks > 0) skb_condense(skb); - spin_lock(&call->input_lock); - /* Discard any out-of-order or duplicate ACKs (inside lock). */ if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) { trace_rxrpc_rx_discard_ack(call->debug_id, ack_serial, @@ -992,8 +986,6 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_congestion_management(call, skb, &summary, acked_serial); out: - spin_unlock(&call->input_lock); -out_not_locked: rxrpc_free_skb(skb_put, rxrpc_skb_put_input); rxrpc_free_skb(skb_old, rxrpc_skb_put_ack); } @@ -1005,12 +997,8 @@ static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_ack_summary summary = { 0 }; - spin_lock(&call->input_lock); - if (rxrpc_rotate_tx_window(call, call->tx_top, &summary)) rxrpc_end_tx_phase(call, false, "ETL"); - - spin_unlock(&call->input_lock); } /* -- cgit v1.2.3-70-g09d2 From 81f2e8adc0fd10847637873dafe8610f3fb4cdff Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Oct 2022 13:13:47 +0100 Subject: rxrpc: Don't use sk->sk_receive_queue.lock to guard socket state changes Don't use sk->sk_receive_queue.lock to guard socket state changes as the socket mutex is sufficient. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/af_rxrpc.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 7a0dc01741e7..8ad4d85acb0b 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -812,14 +812,12 @@ static int rxrpc_shutdown(struct socket *sock, int flags) lock_sock(sk); - spin_lock_bh(&sk->sk_receive_queue.lock); if (sk->sk_state < RXRPC_CLOSE) { sk->sk_state = RXRPC_CLOSE; sk->sk_shutdown = SHUTDOWN_MASK; } else { ret = -ESHUTDOWN; } - spin_unlock_bh(&sk->sk_receive_queue.lock); rxrpc_discard_prealloc(rx); @@ -872,9 +870,7 @@ static int rxrpc_release_sock(struct sock *sk) break; } - spin_lock_bh(&sk->sk_receive_queue.lock); sk->sk_state = RXRPC_CLOSE; - spin_unlock_bh(&sk->sk_receive_queue.lock); if (rx->local && rcu_access_pointer(rx->local->service) == rx) { write_lock(&rx->local->services_lock); -- cgit v1.2.3-70-g09d2 From 15f661dc95daec9b38e8e4cc931c95afe0ae0cef Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 10 Oct 2022 15:51:39 +0100 Subject: rxrpc: Implement a mechanism to send an event notification to a call Provide a means by which an event notification can be sent to a call such that the I/O thread can process it rather than it being done in a separate workqueue. This will allow a lot of locking to be removed. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 52 ++++++++++++++++++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 5 ++++- net/rxrpc/call_object.c | 24 ++++++++++++++++++++ net/rxrpc/input.c | 3 +-- net/rxrpc/io_thread.c | 20 +++++++++++++++-- net/rxrpc/local_object.c | 1 + 6 files changed, 100 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 44a9be9836f9..0b12d96c7921 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -16,6 +16,13 @@ /* * Declare tracing information enums and their string mappings for display. */ +#define rxrpc_call_poke_traces \ + EM(rxrpc_call_poke_error, "Error") \ + EM(rxrpc_call_poke_idle, "Idle") \ + EM(rxrpc_call_poke_start, "Start") \ + EM(rxrpc_call_poke_timer, "Timer") \ + E_(rxrpc_call_poke_timer_now, "Timer-now") + #define rxrpc_skb_traces \ EM(rxrpc_skb_eaten_by_unshare, "ETN unshare ") \ EM(rxrpc_skb_eaten_by_unshare_nomem, "ETN unshar-nm") \ @@ -150,6 +157,7 @@ EM(rxrpc_call_get_input, "GET input ") \ EM(rxrpc_call_get_kernel_service, "GET krnl-srv") \ EM(rxrpc_call_get_notify_socket, "GET notify ") \ + EM(rxrpc_call_get_poke, "GET poke ") \ EM(rxrpc_call_get_recvmsg, "GET recvmsg ") \ EM(rxrpc_call_get_release_sock, "GET rel-sock") \ EM(rxrpc_call_get_sendmsg, "GET sendmsg ") \ @@ -160,6 +168,7 @@ EM(rxrpc_call_put_discard_prealloc, "PUT disc-pre") \ EM(rxrpc_call_put_input, "PUT input ") \ EM(rxrpc_call_put_kernel, "PUT kernel ") \ + EM(rxrpc_call_put_poke, "PUT poke ") \ EM(rxrpc_call_put_recvmsg, "PUT recvmsg ") \ EM(rxrpc_call_put_release_sock, "PUT rls-sock") \ EM(rxrpc_call_put_release_sock_tba, "PUT rls-sk-a") \ @@ -378,6 +387,7 @@ #define E_(a, b) a enum rxrpc_bundle_trace { rxrpc_bundle_traces } __mode(byte); +enum rxrpc_call_poke_trace { rxrpc_call_poke_traces } __mode(byte); enum rxrpc_call_trace { rxrpc_call_traces } __mode(byte); enum rxrpc_client_trace { rxrpc_client_traces } __mode(byte); enum rxrpc_congest_change { rxrpc_congest_changes } __mode(byte); @@ -408,6 +418,7 @@ enum rxrpc_txqueue_trace { rxrpc_txqueue_traces } __mode(byte); #define E_(a, b) TRACE_DEFINE_ENUM(a); rxrpc_bundle_traces; +rxrpc_call_poke_traces; rxrpc_call_traces; rxrpc_client_traces; rxrpc_congest_changes; @@ -1747,6 +1758,47 @@ TRACE_EVENT(rxrpc_txbuf, __entry->ref) ); +TRACE_EVENT(rxrpc_poke_call, + TP_PROTO(struct rxrpc_call *call, bool busy, + enum rxrpc_call_poke_trace what), + + TP_ARGS(call, busy, what), + + TP_STRUCT__entry( + __field(unsigned int, call_debug_id ) + __field(bool, busy ) + __field(enum rxrpc_call_poke_trace, what ) + ), + + TP_fast_assign( + __entry->call_debug_id = call->debug_id; + __entry->busy = busy; + __entry->what = what; + ), + + TP_printk("c=%08x %s%s", + __entry->call_debug_id, + __print_symbolic(__entry->what, rxrpc_call_poke_traces), + __entry->busy ? "!" : "") + ); + +TRACE_EVENT(rxrpc_call_poked, + TP_PROTO(struct rxrpc_call *call), + + TP_ARGS(call), + + TP_STRUCT__entry( + __field(unsigned int, call_debug_id ) + ), + + TP_fast_assign( + __entry->call_debug_id = call->debug_id; + ), + + TP_printk("c=%08x", + __entry->call_debug_id) + ); + #undef EM #undef E_ #endif /* _TRACE_RXRPC_H */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 654e9dab107c..a80655fa9dfb 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -292,6 +292,7 @@ struct rxrpc_local { struct sk_buff_head reject_queue; /* packets awaiting rejection */ struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */ struct sk_buff_head rx_queue; /* Received packets */ + struct list_head call_attend_q; /* Calls requiring immediate attention */ struct rb_root client_bundles; /* Client connection bundles by socket params */ spinlock_t client_bundles_lock; /* Lock for client_bundles */ spinlock_t lock; /* access lock */ @@ -616,6 +617,7 @@ struct rxrpc_call { struct list_head recvmsg_link; /* Link in rx->recvmsg_q */ struct list_head sock_link; /* Link in rx->sock_calls */ struct rb_node sock_node; /* Node in rx->calls */ + struct list_head attend_link; /* Link in local->call_attend_q */ struct rxrpc_txbuf *tx_pending; /* Tx buffer being filled */ wait_queue_head_t waitq; /* Wait queue for channel or Tx */ s64 tx_total_len; /* Total length left to be transmitted (or -1) */ @@ -843,6 +845,7 @@ extern const char *const rxrpc_call_states[]; extern const char *const rxrpc_call_completions[]; extern struct kmem_cache *rxrpc_call_jar; +void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what); struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long); struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t, unsigned int); struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, @@ -951,7 +954,7 @@ void rxrpc_unpublish_service_conn(struct rxrpc_connection *); /* * input.c */ -void rxrpc_input_call_packet(struct rxrpc_call *, struct sk_buff *); +void rxrpc_input_call_event(struct rxrpc_call *, struct sk_buff *); void rxrpc_input_implicit_end_call(struct rxrpc_sock *, struct rxrpc_connection *, struct rxrpc_call *); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index f6d1b3a6f8c6..997641e3d1c8 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -45,6 +45,29 @@ static struct semaphore rxrpc_call_limiter = static struct semaphore rxrpc_kernel_call_limiter = __SEMAPHORE_INITIALIZER(rxrpc_kernel_call_limiter, 1000); +void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what) +{ + struct rxrpc_local *local; + struct rxrpc_peer *peer = call->peer; + bool busy; + + if (WARN_ON_ONCE(!peer)) + return; + local = peer->local; + + if (call->state < RXRPC_CALL_COMPLETE) { + spin_lock_bh(&local->lock); + busy = !list_empty(&call->attend_link); + trace_rxrpc_poke_call(call, busy, what); + if (!busy) { + rxrpc_get_call(call, rxrpc_call_get_poke); + list_add_tail(&call->attend_link, &local->call_attend_q); + } + spin_unlock_bh(&local->lock); + rxrpc_wake_up_io_thread(local); + } +} + static void rxrpc_call_timer_expired(struct timer_list *t) { struct rxrpc_call *call = from_timer(call, t, timer); @@ -137,6 +160,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, INIT_LIST_HEAD(&call->accept_link); INIT_LIST_HEAD(&call->recvmsg_link); INIT_LIST_HEAD(&call->sock_link); + INIT_LIST_HEAD(&call->attend_link); INIT_LIST_HEAD(&call->tx_buffer); skb_queue_head_init(&call->recvmsg_queue); skb_queue_head_init(&call->rx_oos_queue); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 13c52145a926..036f02371051 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1017,8 +1017,7 @@ static void rxrpc_input_abort(struct rxrpc_call *call, struct sk_buff *skb) /* * Process an incoming call packet. */ -void rxrpc_input_call_packet(struct rxrpc_call *call, - struct sk_buff *skb) +void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); unsigned long timo; diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 416c6101cf78..cc249bc6b8cd 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -366,7 +366,7 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff *skb) /* Process a call packet; this either discards or passes on the ref * elsewhere. */ - rxrpc_input_call_packet(call, skb); + rxrpc_input_call_event(call, skb); goto out; discard: @@ -413,6 +413,7 @@ int rxrpc_io_thread(void *data) { struct sk_buff_head rx_queue; struct rxrpc_local *local = data; + struct rxrpc_call *call; struct sk_buff *skb; skb_queue_head_init(&rx_queue); @@ -422,6 +423,20 @@ int rxrpc_io_thread(void *data) for (;;) { rxrpc_inc_stat(local->rxnet, stat_io_loop); + /* Deal with calls that want immediate attention. */ + if ((call = list_first_entry_or_null(&local->call_attend_q, + struct rxrpc_call, + attend_link))) { + spin_lock_bh(&local->lock); + list_del_init(&call->attend_link); + spin_unlock_bh(&local->lock); + + trace_rxrpc_call_poked(call); + rxrpc_input_call_event(call, NULL); + rxrpc_put_call(call, rxrpc_call_put_poke); + continue; + } + /* Process received packets and errors. */ if ((skb = __skb_dequeue(&rx_queue))) { switch (skb->mark) { @@ -450,7 +465,8 @@ int rxrpc_io_thread(void *data) } set_current_state(TASK_INTERRUPTIBLE); - if (!skb_queue_empty(&local->rx_queue)) { + if (!skb_queue_empty(&local->rx_queue) || + !list_empty(&local->call_attend_q)) { __set_current_state(TASK_RUNNING); continue; } diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 6b4d77219f36..03f491cc23ef 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -104,6 +104,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, skb_queue_head_init(&local->reject_queue); skb_queue_head_init(&local->event_queue); skb_queue_head_init(&local->rx_queue); + INIT_LIST_HEAD(&local->call_attend_q); local->client_bundles = RB_ROOT; spin_lock_init(&local->client_bundles_lock); spin_lock_init(&local->lock); -- cgit v1.2.3-70-g09d2 From f3441d4125fc98995858550a5521b8d7daf0504a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Oct 2022 21:58:36 +0100 Subject: rxrpc: Copy client call parameters into rxrpc_call earlier Copy client call parameters into rxrpc_call earlier so that that can be used to convey them to the connection code - which can then be offloaded to the I/O thread. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 3 +++ net/rxrpc/ar-internal.h | 7 ++++++- net/rxrpc/call_accept.c | 2 ++ net/rxrpc/call_object.c | 50 +++++++++++++++++++++++++++----------------- net/rxrpc/conn_client.c | 2 +- net/rxrpc/io_thread.c | 4 ++-- net/rxrpc/output.c | 2 +- net/rxrpc/proc.c | 25 +++++++--------------- net/rxrpc/recvmsg.c | 8 +++---- net/rxrpc/security.c | 30 ++++++++++++++++++++++++++ net/rxrpc/txbuf.c | 2 +- 11 files changed, 87 insertions(+), 48 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 0b12d96c7921..8bd48358f757 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -52,6 +52,7 @@ #define rxrpc_local_traces \ EM(rxrpc_local_free, "FREE ") \ + EM(rxrpc_local_get_call, "GET call ") \ EM(rxrpc_local_get_client_conn, "GET conn-cln") \ EM(rxrpc_local_get_for_use, "GET for-use ") \ EM(rxrpc_local_get_peer, "GET peer ") \ @@ -61,6 +62,7 @@ EM(rxrpc_local_processing, "PROCESSING ") \ EM(rxrpc_local_put_already_queued, "PUT alreadyq") \ EM(rxrpc_local_put_bind, "PUT bind ") \ + EM(rxrpc_local_put_call, "PUT call ") \ EM(rxrpc_local_put_for_use, "PUT for-use ") \ EM(rxrpc_local_put_kill_conn, "PUT conn-kil") \ EM(rxrpc_local_put_peer, "PUT peer ") \ @@ -166,6 +168,7 @@ EM(rxrpc_call_new_client, "NEW client ") \ EM(rxrpc_call_new_prealloc_service, "NEW prealloc") \ EM(rxrpc_call_put_discard_prealloc, "PUT disc-pre") \ + EM(rxrpc_call_put_discard_error, "PUT disc-err") \ EM(rxrpc_call_put_input, "PUT input ") \ EM(rxrpc_call_put_kernel, "PUT kernel ") \ EM(rxrpc_call_put_poke, "PUT poke ") \ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index a80655fa9dfb..3bd6a5eb2fb7 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -530,6 +530,7 @@ enum rxrpc_call_flag { RXRPC_CALL_UPGRADE, /* Service upgrade was requested for the call */ RXRPC_CALL_DELAY_ACK_PENDING, /* DELAY ACK generation is pending */ RXRPC_CALL_IDLE_ACK_PENDING, /* IDLE ACK generation is pending */ + RXRPC_CALL_EXCLUSIVE, /* The call uses a once-only connection */ }; /* @@ -592,10 +593,13 @@ struct rxrpc_call { struct rcu_head rcu; struct rxrpc_connection *conn; /* connection carrying call */ struct rxrpc_peer *peer; /* Peer record for remote address */ + struct rxrpc_local *local; /* Representation of local endpoint */ struct rxrpc_sock __rcu *socket; /* socket responsible */ struct rxrpc_net *rxnet; /* Network namespace to which call belongs */ + struct key *key; /* Security details */ const struct rxrpc_security *security; /* applied security module */ struct mutex user_mutex; /* User access mutex */ + struct sockaddr_rxrpc dest_srx; /* Destination address */ unsigned long delay_ack_at; /* When DELAY ACK needs to happen */ unsigned long ack_lost_at; /* When ACK is figured as lost */ unsigned long resend_at; /* When next resend needs to happen */ @@ -631,11 +635,11 @@ struct rxrpc_call { enum rxrpc_call_state state; /* current state of call */ enum rxrpc_call_completion completion; /* Call completion condition */ refcount_t ref; - u16 service_id; /* service ID */ u8 security_ix; /* Security type */ enum rxrpc_interruptibility interruptibility; /* At what point call may be interrupted */ u32 call_id; /* call ID on connection */ u32 cid; /* connection ID plus channel index */ + u32 security_level; /* Security level selected */ int debug_id; /* debug ID for printks */ unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ unsigned short rx_pkt_len; /* Current recvmsg packet len */ @@ -1147,6 +1151,7 @@ extern const struct rxrpc_security rxkad; int __init rxrpc_init_security(void); const struct rxrpc_security *rxrpc_security_lookup(u8); void rxrpc_exit_security(void); +int rxrpc_init_client_call_security(struct rxrpc_call *); int rxrpc_init_client_conn_security(struct rxrpc_connection *); const struct rxrpc_security *rxrpc_get_incoming_security(struct rxrpc_sock *, struct sk_buff *); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 8d106b626aa3..8bc327aa2beb 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -318,10 +318,12 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, (call_tail + 1) & (RXRPC_BACKLOG_MAX - 1)); rxrpc_see_call(call, rxrpc_call_see_accept); + call->local = rxrpc_get_local(conn->local, rxrpc_local_get_call); call->conn = conn; call->security = conn->security; call->security_ix = conn->security_ix; call->peer = rxrpc_get_peer(conn->peer, rxrpc_peer_get_accept); + call->dest_srx = peer->srx; call->cong_ssthresh = call->peer->cong_ssthresh; call->tx_last_sent = ktime_get_real(); return call; diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 997641e3d1c8..2622d06bb0d6 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -47,14 +47,9 @@ static struct semaphore rxrpc_kernel_call_limiter = void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what) { - struct rxrpc_local *local; - struct rxrpc_peer *peer = call->peer; + struct rxrpc_local *local = call->local; bool busy; - if (WARN_ON_ONCE(!peer)) - return; - local = peer->local; - if (call->state < RXRPC_CALL_COMPLETE) { spin_lock_bh(&local->lock); busy = !list_empty(&call->attend_link); @@ -200,22 +195,45 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, */ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, struct sockaddr_rxrpc *srx, + struct rxrpc_conn_parameters *cp, + struct rxrpc_call_params *p, gfp_t gfp, unsigned int debug_id) { struct rxrpc_call *call; ktime_t now; + int ret; _enter(""); call = rxrpc_alloc_call(rx, gfp, debug_id); if (!call) return ERR_PTR(-ENOMEM); - call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; - call->service_id = srx->srx_service; now = ktime_get_real(); - call->acks_latest_ts = now; - call->cong_tstamp = now; + call->acks_latest_ts = now; + call->cong_tstamp = now; + call->state = RXRPC_CALL_CLIENT_AWAIT_CONN; + call->dest_srx = *srx; + call->interruptibility = p->interruptibility; + call->tx_total_len = p->tx_total_len; + call->key = key_get(cp->key); + call->local = rxrpc_get_local(cp->local, rxrpc_local_get_call); + if (p->kernel) + __set_bit(RXRPC_CALL_KERNEL, &call->flags); + if (cp->upgrade) + __set_bit(RXRPC_CALL_UPGRADE, &call->flags); + if (cp->exclusive) + __set_bit(RXRPC_CALL_EXCLUSIVE, &call->flags); + + ret = rxrpc_init_client_call_security(call); + if (ret < 0) { + __rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, 0, ret); + rxrpc_put_call(call, rxrpc_call_put_discard_error); + return ERR_PTR(ret); + } + + trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), + p->user_call_ID, rxrpc_call_new_client); _leave(" = %p", call); return call; @@ -295,7 +313,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, return ERR_PTR(-ERESTARTSYS); } - call = rxrpc_alloc_client_call(rx, srx, gfp, debug_id); + call = rxrpc_alloc_client_call(rx, srx, cp, p, gfp, debug_id); if (IS_ERR(call)) { release_sock(&rx->sk); up(limiter); @@ -303,13 +321,6 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, return call; } - call->interruptibility = p->interruptibility; - call->tx_total_len = p->tx_total_len; - trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), - p->user_call_ID, rxrpc_call_new_client); - if (p->kernel) - __set_bit(RXRPC_CALL_KERNEL, &call->flags); - /* We need to protect a partially set up call against the user as we * will be acting outside the socket lock. */ @@ -413,7 +424,7 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, rcu_assign_pointer(call->socket, rx); call->call_id = sp->hdr.callNumber; - call->service_id = sp->hdr.serviceId; + call->dest_srx.srx_service = sp->hdr.serviceId; call->cid = sp->hdr.cid; call->state = RXRPC_CALL_SERVER_SECURING; call->cong_tstamp = skb->tstamp; @@ -639,6 +650,7 @@ static void rxrpc_destroy_call(struct work_struct *work) rxrpc_free_skb(call->acks_soft_tbl, rxrpc_skb_put_ack); rxrpc_put_connection(call->conn, rxrpc_conn_put_call); rxrpc_put_peer(call->peer, rxrpc_peer_put_call); + rxrpc_put_local(call->local, rxrpc_local_put_call); call_rcu(&call->rcu, rxrpc_rcu_free_call); } diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 9485a3d18f29..ab3dd22fadc0 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -553,7 +553,7 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, call->call_id = call_id; call->security = conn->security; call->security_ix = conn->security_ix; - call->service_id = conn->service_id; + call->dest_srx.srx_service = conn->service_id; trace_rxrpc_connect_call(call); diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index cc249bc6b8cd..2119941b6d6c 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -343,8 +343,8 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff *skb) } if (call) { - if (sp->hdr.serviceId != call->service_id) - call->service_id = sp->hdr.serviceId; + if (sp->hdr.serviceId != call->dest_srx.srx_service) + call->dest_srx.srx_service = sp->hdr.serviceId; if ((int)sp->hdr.serial - (int)call->rx_serial > 0) call->rx_serial = sp->hdr.serial; if (!test_bit(RXRPC_CALL_RX_HEARD, &call->flags)) diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 131c7a76fb06..e2ce7dadbb7a 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -357,7 +357,7 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) pkt.whdr.userStatus = 0; pkt.whdr.securityIndex = call->security_ix; pkt.whdr._rsvd = 0; - pkt.whdr.serviceId = htons(call->service_id); + pkt.whdr.serviceId = htons(call->dest_srx.srx_service); pkt.abort_code = htonl(call->abort_code); iov[0].iov_base = &pkt; diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 35d5b43c677e..5af7c8ee4b1a 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -49,8 +49,6 @@ static void rxrpc_call_seq_stop(struct seq_file *seq, void *v) static int rxrpc_call_seq_show(struct seq_file *seq, void *v) { struct rxrpc_local *local; - struct rxrpc_sock *rx; - struct rxrpc_peer *peer; struct rxrpc_call *call; struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); unsigned long timeout = 0; @@ -69,22 +67,13 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) call = list_entry(v, struct rxrpc_call, link); - rx = rcu_dereference(call->socket); - if (rx) { - local = READ_ONCE(rx->local); - if (local) - sprintf(lbuff, "%pISpc", &local->srx.transport); - else - strcpy(lbuff, "no_local"); - } else { - strcpy(lbuff, "no_socket"); - } - - peer = call->peer; - if (peer) - sprintf(rbuff, "%pISpc", &peer->srx.transport); + local = call->local; + if (local) + sprintf(lbuff, "%pISpc", &local->srx.transport); else - strcpy(rbuff, "no_connection"); + strcpy(lbuff, "no_local"); + + sprintf(rbuff, "%pISpc", &call->dest_srx.transport); if (call->state != RXRPC_CALL_SERVER_PREALLOC) { timeout = READ_ONCE(call->expect_rx_by); @@ -98,7 +87,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) " %-8.8s %08x %08x %08x %02x %08x %02x %08x %06lx\n", lbuff, rbuff, - call->service_id, + call->dest_srx.srx_service, call->cid, call->call_id, rxrpc_is_service_call(call) ? "Svc" : "Clt", diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index bfac9e09347e..5df7f468abed 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -490,11 +490,9 @@ try_again: } if (msg->msg_name && call->peer) { - struct sockaddr_rxrpc *srx = msg->msg_name; - size_t len = sizeof(call->peer->srx); + size_t len = sizeof(call->dest_srx); - memcpy(msg->msg_name, &call->peer->srx, len); - srx->srx_service = call->service_id; + memcpy(msg->msg_name, &call->dest_srx, len); msg->msg_namelen = len; } @@ -639,7 +637,7 @@ read_phase_complete: out: rxrpc_transmit_ack_packets(call->peer->local); if (_service) - *_service = call->service_id; + *_service = call->dest_srx.srx_service; mutex_unlock(&call->user_mutex); _leave(" = %d [%zu,%d]", ret, iov_iter_count(iter), *_abort); return ret; diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index e6ddac9b3732..209f2c25a0da 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -62,6 +62,36 @@ const struct rxrpc_security *rxrpc_security_lookup(u8 security_index) return rxrpc_security_types[security_index]; } +/* + * Initialise the security on a client call. + */ +int rxrpc_init_client_call_security(struct rxrpc_call *call) +{ + const struct rxrpc_security *sec; + struct rxrpc_key_token *token; + struct key *key = call->key; + int ret; + + if (!key) + return 0; + + ret = key_validate(key); + if (ret < 0) + return ret; + + for (token = key->payload.data[0]; token; token = token->next) { + sec = rxrpc_security_lookup(token->security_index); + if (sec) + goto found; + } + return -EKEYREJECTED; + +found: + call->security = sec; + _leave(" = 0"); + return 0; +} + /* * initialise the security on a client connection */ diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index f93dc666a3a0..90ff00c340cd 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -44,7 +44,7 @@ struct rxrpc_txbuf *rxrpc_alloc_txbuf(struct rxrpc_call *call, u8 packet_type, txb->wire.userStatus = 0; txb->wire.securityIndex = call->security_ix; txb->wire._rsvd = 0; - txb->wire.serviceId = htons(call->service_id); + txb->wire.serviceId = htons(call->dest_srx.srx_service); trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 1, -- cgit v1.2.3-70-g09d2 From cf37b5987508878647161ec3cdba0bb00a1b607a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 31 Mar 2022 23:55:08 +0100 Subject: rxrpc: Move DATA transmission into call processor work item Move DATA transmission into the call processor work item. In a future patch, this will be called from the I/O thread rather than being itsown work item. This will allow DATA transmission to be driven directly by incoming ACKs, pokes and timers as those are processed. The Tx queue is also split: The queue of packets prepared by sendmsg is now places in call->tx_sendmsg and the packet dispatcher decants the packets into call->tx_buffer as space becomes available in the transmission window. This allows sendmsg to run ahead of the available space to try and prevent an underflow in transmission. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 6 +++- net/rxrpc/ar-internal.h | 5 ++- net/rxrpc/call_event.c | 83 ++++++++++++++++++++++++++++++++++++++++---- net/rxrpc/call_object.c | 6 ++++ net/rxrpc/output.c | 48 +++++++++++++++++++++++++ net/rxrpc/sendmsg.c | 83 +++++++------------------------------------- net/rxrpc/txbuf.c | 10 ++++-- 7 files changed, 161 insertions(+), 80 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 8bd48358f757..c3043fbea0e6 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -183,6 +183,7 @@ EM(rxrpc_call_queue_requeue, "QUE requeue ") \ EM(rxrpc_call_queue_resend, "QUE resend ") \ EM(rxrpc_call_queue_timer, "QUE timer ") \ + EM(rxrpc_call_queue_tx_data, "QUE tx-data ") \ EM(rxrpc_call_see_accept, "SEE accept ") \ EM(rxrpc_call_see_activate_client, "SEE act-clnt") \ EM(rxrpc_call_see_connect_failed, "SEE con-fail") \ @@ -738,6 +739,7 @@ TRACE_EVENT(rxrpc_txqueue, __field(rxrpc_seq_t, acks_hard_ack ) __field(rxrpc_seq_t, tx_bottom ) __field(rxrpc_seq_t, tx_top ) + __field(rxrpc_seq_t, tx_prepared ) __field(int, tx_winsize ) ), @@ -747,16 +749,18 @@ TRACE_EVENT(rxrpc_txqueue, __entry->acks_hard_ack = call->acks_hard_ack; __entry->tx_bottom = call->tx_bottom; __entry->tx_top = call->tx_top; + __entry->tx_prepared = call->tx_prepared; __entry->tx_winsize = call->tx_winsize; ), - TP_printk("c=%08x %s f=%08x h=%08x n=%u/%u/%u", + TP_printk("c=%08x %s f=%08x h=%08x n=%u/%u/%u/%u", __entry->call, __print_symbolic(__entry->why, rxrpc_txqueue_traces), __entry->tx_bottom, __entry->acks_hard_ack, __entry->tx_top - __entry->tx_bottom, __entry->tx_top - __entry->acks_hard_ack, + __entry->tx_prepared - __entry->tx_bottom, __entry->tx_winsize) ); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 3bd6a5eb2fb7..6af7298af39b 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -646,9 +646,11 @@ struct rxrpc_call { /* Transmitted data tracking. */ spinlock_t tx_lock; /* Transmit queue lock */ + struct list_head tx_sendmsg; /* Sendmsg prepared packets */ struct list_head tx_buffer; /* Buffer of transmissible packets */ rxrpc_seq_t tx_bottom; /* First packet in buffer */ rxrpc_seq_t tx_transmitted; /* Highest packet transmitted */ + rxrpc_seq_t tx_prepared; /* Highest Tx slot prepared. */ rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */ u16 tx_backoff; /* Delay to insert due to Tx failure */ u8 tx_winsize; /* Maximum size of Tx window */ @@ -766,7 +768,7 @@ struct rxrpc_send_params { */ struct rxrpc_txbuf { struct rcu_head rcu; - struct list_head call_link; /* Link in call->tx_queue */ + struct list_head call_link; /* Link in call->tx_sendmsg/tx_buffer */ struct list_head tx_link; /* Link in live Enc queue or Tx queue */ struct rxrpc_call *call; /* Call to which belongs */ ktime_t last_sent; /* Time at which last transmitted */ @@ -1067,6 +1069,7 @@ int rxrpc_send_abort_packet(struct rxrpc_call *); int rxrpc_send_data_packet(struct rxrpc_call *, struct rxrpc_txbuf *); void rxrpc_reject_packets(struct rxrpc_local *); void rxrpc_send_keepalive(struct rxrpc_peer *); +void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb); /* * peer_event.c diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 3925b55e2064..c9f835292f7b 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -291,6 +291,72 @@ out: _leave(""); } +static bool rxrpc_tx_window_has_space(struct rxrpc_call *call) +{ + unsigned int winsize = min_t(unsigned int, call->tx_winsize, + call->cong_cwnd + call->cong_extra); + rxrpc_seq_t window = call->acks_hard_ack, wtop = window + winsize; + rxrpc_seq_t tx_top = call->tx_top; + int space; + + space = wtop - tx_top; + return space > 0; +} + +/* + * Decant some if the sendmsg prepared queue into the transmission buffer. + */ +static void rxrpc_decant_prepared_tx(struct rxrpc_call *call) +{ + struct rxrpc_txbuf *txb; + + if (rxrpc_is_client_call(call) && + !test_bit(RXRPC_CALL_EXPOSED, &call->flags)) + rxrpc_expose_client_call(call); + + while ((txb = list_first_entry_or_null(&call->tx_sendmsg, + struct rxrpc_txbuf, call_link))) { + spin_lock(&call->tx_lock); + list_del(&txb->call_link); + spin_unlock(&call->tx_lock); + + call->tx_top = txb->seq; + list_add_tail(&txb->call_link, &call->tx_buffer); + + rxrpc_transmit_one(call, txb); + + // TODO: Drain the transmission buffers. Do this somewhere better + if (after(call->acks_hard_ack, call->tx_bottom + 16)) + rxrpc_shrink_call_tx_buffer(call); + + if (!rxrpc_tx_window_has_space(call)) + break; + } +} + +static void rxrpc_transmit_some_data(struct rxrpc_call *call) +{ + switch (call->state) { + case RXRPC_CALL_SERVER_ACK_REQUEST: + if (list_empty(&call->tx_sendmsg)) + return; + fallthrough; + + case RXRPC_CALL_SERVER_SEND_REPLY: + case RXRPC_CALL_SERVER_AWAIT_ACK: + case RXRPC_CALL_CLIENT_SEND_REQUEST: + case RXRPC_CALL_CLIENT_AWAIT_REPLY: + if (!rxrpc_tx_window_has_space(call)) + return; + if (list_empty(&call->tx_sendmsg)) + return; + rxrpc_decant_prepared_tx(call); + break; + default: + return; + } +} + /* * Handle retransmission and deferred ACK/abort generation. */ @@ -309,19 +375,22 @@ void rxrpc_process_call(struct work_struct *work) call->debug_id, rxrpc_call_states[call->state], call->events); recheck_state: + if (call->acks_hard_ack != call->tx_bottom) + rxrpc_shrink_call_tx_buffer(call); + /* Limit the number of times we do this before returning to the manager */ - iterations++; - if (iterations > 5) - goto requeue; + if (!rxrpc_tx_window_has_space(call) || + list_empty(&call->tx_sendmsg)) { + iterations++; + if (iterations > 5) + goto requeue; + } if (test_and_clear_bit(RXRPC_CALL_EV_ABORT, &call->events)) { rxrpc_send_abort_packet(call); goto recheck_state; } - if (READ_ONCE(call->acks_hard_ack) != call->tx_bottom) - rxrpc_shrink_call_tx_buffer(call); - if (call->state == RXRPC_CALL_COMPLETE) { del_timer_sync(&call->timer); goto out; @@ -387,6 +456,8 @@ recheck_state: set_bit(RXRPC_CALL_EV_RESEND, &call->events); } + rxrpc_transmit_some_data(call); + /* Process events */ if (test_and_clear_bit(RXRPC_CALL_EV_EXPIRED, &call->events)) { if (test_bit(RXRPC_CALL_RX_HEARD, &call->flags) && diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 2622d06bb0d6..96a7edd3a842 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -156,6 +156,7 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, INIT_LIST_HEAD(&call->recvmsg_link); INIT_LIST_HEAD(&call->sock_link); INIT_LIST_HEAD(&call->attend_link); + INIT_LIST_HEAD(&call->tx_sendmsg); INIT_LIST_HEAD(&call->tx_buffer); skb_queue_head_init(&call->recvmsg_queue); skb_queue_head_init(&call->rx_oos_queue); @@ -641,6 +642,11 @@ static void rxrpc_destroy_call(struct work_struct *work) del_timer_sync(&call->timer); rxrpc_cleanup_ring(call); + while ((txb = list_first_entry_or_null(&call->tx_sendmsg, + struct rxrpc_txbuf, call_link))) { + list_del(&txb->call_link); + rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned); + } while ((txb = list_first_entry_or_null(&call->tx_buffer, struct rxrpc_txbuf, call_link))) { list_del(&txb->call_link); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index e2ce7dadbb7a..c8147e50060b 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -465,6 +465,14 @@ dont_set_request_ack: trace_rxrpc_tx_data(call, txb->seq, serial, txb->wire.flags, test_bit(RXRPC_TXBUF_RESENT, &txb->flags), false); + + /* Track what we've attempted to transmit at least once so that the + * retransmission algorithm doesn't try to resend what we haven't sent + * yet. However, this can race as we can receive an ACK before we get + * to this point. But, OTOH, if we won't get an ACK mentioning this + * packet unless the far side received it (though it could have + * discarded it anyway and NAK'd it). + */ cmpxchg(&call->tx_transmitted, txb->seq - 1, txb->seq); /* send the packet with the don't fragment bit set if we currently @@ -712,3 +720,43 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer) peer->last_tx_at = ktime_get_seconds(); _leave(""); } + +/* + * Schedule an instant Tx resend. + */ +static inline void rxrpc_instant_resend(struct rxrpc_call *call, + struct rxrpc_txbuf *txb) +{ + if (call->state < RXRPC_CALL_COMPLETE) + kdebug("resend"); +} + +/* + * Transmit one packet. + */ +void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +{ + int ret; + + ret = rxrpc_send_data_packet(call, txb); + if (ret < 0) { + switch (ret) { + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, + 0, ret); + break; + default: + _debug("need instant resend %d", ret); + rxrpc_instant_resend(call, txb); + } + } else { + unsigned long now = jiffies; + unsigned long resend_at = now + call->peer->rto_j; + + WRITE_ONCE(call->resend_at, resend_at); + rxrpc_reduce_call_timer(call, resend_at, now, + rxrpc_timer_set_for_send); + } +} diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 76b1e2e89c1e..11af37275d5b 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -22,30 +22,9 @@ */ static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win) { - unsigned int win_size; - rxrpc_seq_t tx_win = smp_load_acquire(&call->acks_hard_ack); - - /* If we haven't transmitted anything for >1RTT, we should reset the - * congestion management state. - */ - if (ktime_before(ktime_add_us(call->tx_last_sent, - call->peer->srtt_us >> 3), - ktime_get_real())) { - if (RXRPC_TX_SMSS > 2190) - win_size = 2; - else if (RXRPC_TX_SMSS > 1095) - win_size = 3; - else - win_size = 4; - win_size += call->cong_extra; - } else { - win_size = min_t(unsigned int, call->tx_winsize, - call->cong_cwnd + call->cong_extra); - } - if (_tx_win) - *_tx_win = tx_win; - return call->tx_top - tx_win < win_size; + *_tx_win = call->tx_bottom; + return call->tx_prepared - call->tx_bottom < 256; } /* @@ -66,11 +45,6 @@ static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx, if (signal_pending(current)) return sock_intr_errno(*timeo); - if (READ_ONCE(call->acks_hard_ack) != call->tx_bottom) { - rxrpc_shrink_call_tx_buffer(call); - continue; - } - trace_rxrpc_txqueue(call, rxrpc_txqueue_wait); *timeo = schedule_timeout(*timeo); } @@ -107,11 +81,6 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx, tx_win == tx_start && signal_pending(current)) return -EINTR; - if (READ_ONCE(call->acks_hard_ack) != call->tx_bottom) { - rxrpc_shrink_call_tx_buffer(call); - continue; - } - if (tx_win != tx_start) { timeout = rtt; tx_start = tx_win; @@ -137,11 +106,6 @@ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx, if (call->state >= RXRPC_CALL_COMPLETE) return call->error; - if (READ_ONCE(call->acks_hard_ack) != call->tx_bottom) { - rxrpc_shrink_call_tx_buffer(call); - continue; - } - trace_rxrpc_txqueue(call, rxrpc_txqueue_wait); *timeo = schedule_timeout(*timeo); } @@ -207,29 +171,27 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, unsigned long now; rxrpc_seq_t seq = txb->seq; bool last = test_bit(RXRPC_TXBUF_LAST, &txb->flags); - int ret; rxrpc_inc_stat(call->rxnet, stat_tx_data); - ASSERTCMP(seq, ==, call->tx_top + 1); + ASSERTCMP(txb->seq, ==, call->tx_prepared + 1); /* We have to set the timestamp before queueing as the retransmit * algorithm can see the packet as soon as we queue it. */ txb->last_sent = ktime_get_real(); - /* Add the packet to the call's output buffer */ - rxrpc_get_txbuf(txb, rxrpc_txbuf_get_buffer); - spin_lock(&call->tx_lock); - list_add_tail(&txb->call_link, &call->tx_buffer); - call->tx_top = seq; - spin_unlock(&call->tx_lock); - if (last) trace_rxrpc_txqueue(call, rxrpc_txqueue_queue_last); else trace_rxrpc_txqueue(call, rxrpc_txqueue_queue); + /* Add the packet to the call's output buffer */ + spin_lock(&call->tx_lock); + list_add_tail(&txb->call_link, &call->tx_sendmsg); + call->tx_prepared = seq; + spin_unlock(&call->tx_lock); + if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) { _debug("________awaiting reply/ACK__________"); write_lock_bh(&call->state_lock); @@ -258,30 +220,11 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, write_unlock_bh(&call->state_lock); } - if (seq == 1 && rxrpc_is_client_call(call)) - rxrpc_expose_client_call(call); - - ret = rxrpc_send_data_packet(call, txb); - if (ret < 0) { - switch (ret) { - case -ENETUNREACH: - case -EHOSTUNREACH: - case -ECONNREFUSED: - rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, - 0, ret); - goto out; - } - } else { - unsigned long now = jiffies; - unsigned long resend_at = now + call->peer->rto_j; - WRITE_ONCE(call->resend_at, resend_at); - rxrpc_reduce_call_timer(call, resend_at, now, - rxrpc_timer_set_for_send); - } - -out: - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_trans); + /* Stick the packet on the crypto queue or the transmission queue as + * appropriate. + */ + rxrpc_queue_call(call, rxrpc_call_queue_tx_data); } /* diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index 90ff00c340cd..a5054389dfbb 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -34,7 +34,7 @@ struct rxrpc_txbuf *rxrpc_alloc_txbuf(struct rxrpc_call *call, u8 packet_type, txb->offset = 0; txb->flags = 0; txb->ack_why = 0; - txb->seq = call->tx_top + 1; + txb->seq = call->tx_prepared + 1; txb->wire.epoch = htonl(call->conn->proto.epoch); txb->wire.cid = htonl(call->cid); txb->wire.callNumber = htonl(call->call_id); @@ -107,6 +107,7 @@ void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *call) { struct rxrpc_txbuf *txb; rxrpc_seq_t hard_ack = smp_load_acquire(&call->acks_hard_ack); + bool wake = false; _enter("%x/%x/%x", call->tx_bottom, call->acks_hard_ack, call->tx_top); @@ -123,7 +124,7 @@ void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *call) if (txb->seq != call->tx_bottom + 1) rxrpc_see_txbuf(txb, rxrpc_txbuf_see_out_of_step); ASSERTCMP(txb->seq, ==, call->tx_bottom + 1); - call->tx_bottom++; + smp_store_release(&call->tx_bottom, call->tx_bottom + 1); list_del_rcu(&txb->call_link); trace_rxrpc_txqueue(call, rxrpc_txqueue_dequeue); @@ -131,7 +132,12 @@ void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *call) spin_unlock(&call->tx_lock); rxrpc_put_txbuf(txb, rxrpc_txbuf_put_rotated); + if (after(call->acks_hard_ack, call->tx_bottom + 128)) + wake = true; } spin_unlock(&call->tx_lock); + + if (wake) + wake_up(&call->waitq); } -- cgit v1.2.3-70-g09d2 From 29fb4ec385f18db98d9188c2173a0b07d2de6917 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Oct 2022 15:42:06 +0100 Subject: rxrpc: Remove RCU from peer->error_targets list Remove the RCU requirements from the peer's list of error targets so that the error distributor can call sleeping functions. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/call_accept.c | 6 ++++++ net/rxrpc/call_object.c | 2 +- net/rxrpc/conn_client.c | 4 ++++ net/rxrpc/conn_object.c | 6 +++--- net/rxrpc/output.c | 6 ------ net/rxrpc/peer_event.c | 15 ++++++++++++++- 6 files changed, 28 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 8bc327aa2beb..5f978b0b2404 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -433,6 +433,12 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, */ rxrpc_put_call(call, rxrpc_call_put_discard_prealloc); + if (hlist_unhashed(&call->error_link)) { + spin_lock(&call->peer->lock); + hlist_add_head(&call->error_link, &call->peer->error_targets); + spin_unlock(&call->peer->lock); + } + _leave(" = %p{%d}", call, call->debug_id); return call; diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 96a7edd3a842..7570b4e67bc5 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -442,7 +442,7 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, rcu_assign_pointer(conn->channels[chan].call, call); spin_lock(&conn->peer->lock); - hlist_add_head_rcu(&call->error_link, &conn->peer->error_targets); + hlist_add_head(&call->error_link, &conn->peer->error_targets); spin_unlock(&conn->peer->lock); rxrpc_start_call_timer(call); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index ab3dd22fadc0..3c7b1bdec0db 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -786,6 +786,10 @@ void rxrpc_expose_client_call(struct rxrpc_call *call) if (chan->call_counter >= INT_MAX) set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); trace_rxrpc_client(conn, channel, rxrpc_client_exposed); + + spin_lock(&call->peer->lock); + hlist_add_head(&call->error_link, &call->peer->error_targets); + spin_unlock(&call->peer->lock); } } diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index c2e05ea29f12..5a39255ea014 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -215,9 +215,9 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) call->peer->cong_ssthresh = call->cong_ssthresh; if (!hlist_unhashed(&call->error_link)) { - spin_lock_bh(&call->peer->lock); - hlist_del_rcu(&call->error_link); - spin_unlock_bh(&call->peer->lock); + spin_lock(&call->peer->lock); + hlist_del_init(&call->error_link); + spin_unlock(&call->peer->lock); } if (rxrpc_is_client_call(call)) diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index c8147e50060b..71963b4523be 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -394,12 +394,6 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) _enter("%x,{%d}", txb->seq, txb->len); - if (hlist_unhashed(&call->error_link)) { - spin_lock_bh(&call->peer->lock); - hlist_add_head_rcu(&call->error_link, &call->peer->error_targets); - spin_unlock_bh(&call->peer->lock); - } - /* Each transmission of a Tx packet needs a new serial number */ serial = atomic_inc_return(&conn->serial); txb->wire.serial = htonl(serial); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 94f63fb1bd67..97d017ca3dc4 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -207,11 +207,24 @@ static void rxrpc_distribute_error(struct rxrpc_peer *peer, int error, enum rxrpc_call_completion compl) { struct rxrpc_call *call; + HLIST_HEAD(error_targets); + + spin_lock(&peer->lock); + hlist_move_list(&peer->error_targets, &error_targets); + + while (!hlist_empty(&error_targets)) { + call = hlist_entry(error_targets.first, + struct rxrpc_call, error_link); + hlist_del_init(&call->error_link); + spin_unlock(&peer->lock); - hlist_for_each_entry_rcu(call, &peer->error_targets, error_link) { rxrpc_see_call(call, rxrpc_call_see_distribute_error); rxrpc_set_call_completion(call, compl, 0, -error); + + spin_lock(&peer->lock); } + + spin_unlock(&peer->lock); } /* -- cgit v1.2.3-70-g09d2 From 2d1faf7a0ca3c0b327cf064c80e4e775532c9319 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 6 Oct 2022 15:43:51 +0100 Subject: rxrpc: Simplify skbuff accounting in receive path A received skbuff needs a ref when it gets put on a call data queue or conn packet queue, and rxrpc_input_packet() and co. jump through a lot of hoops to avoid double-dropping the skbuff ref so that we can avoid getting a ref when we queue the packet. Change this so that the skbuff ref is unconditionally dropped by the caller of rxrpc_input_packet(). An additional ref is then taken on the packet if it is pushed onto a queue. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 3 +- net/rxrpc/input.c | 45 ++++++++++++++-------------- net/rxrpc/io_thread.c | 70 ++++++++++++++++++++------------------------ 3 files changed, 56 insertions(+), 62 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index c3043fbea0e6..82b1327c2ba6 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -28,6 +28,8 @@ EM(rxrpc_skb_eaten_by_unshare_nomem, "ETN unshar-nm") \ EM(rxrpc_skb_get_ack, "GET ack ") \ EM(rxrpc_skb_get_conn_work, "GET conn-work") \ + EM(rxrpc_skb_get_local_work, "GET locl-work") \ + EM(rxrpc_skb_get_reject_work, "GET rej-work ") \ EM(rxrpc_skb_get_to_recvmsg, "GET to-recv ") \ EM(rxrpc_skb_get_to_recvmsg_oos, "GET to-recv-o") \ EM(rxrpc_skb_new_encap_rcv, "NEW encap-rcv") \ @@ -39,7 +41,6 @@ EM(rxrpc_skb_put_error_report, "PUT error-rep") \ EM(rxrpc_skb_put_input, "PUT input ") \ EM(rxrpc_skb_put_jumbo_subpacket, "PUT jumbo-sub") \ - EM(rxrpc_skb_put_lose, "PUT lose ") \ EM(rxrpc_skb_put_purge, "PUT purge ") \ EM(rxrpc_skb_put_rotate, "PUT rotate ") \ EM(rxrpc_skb_put_unknown, "PUT unknown ") \ diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 036f02371051..42addbcf59f9 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -338,7 +338,8 @@ static void rxrpc_input_queue_data(struct rxrpc_call *call, struct sk_buff *skb, /* * Process a DATA packet. */ -static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb) +static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, + bool *_notify) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct sk_buff *oos; @@ -361,7 +362,7 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb) if (test_and_set_bit(RXRPC_CALL_RX_LAST, &call->flags) && seq + 1 != wtop) { rxrpc_proto_abort("LSN", call, seq); - goto err_free; + return; } } else { if (test_bit(RXRPC_CALL_RX_LAST, &call->flags) && @@ -369,7 +370,7 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb) pr_warn("Packet beyond last: c=%x q=%x window=%x-%x wlimit=%x\n", call->debug_id, seq, window, wtop, wlimit); rxrpc_proto_abort("LSA", call, seq); - goto err_free; + return; } } @@ -402,9 +403,11 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb) if (after(window, wtop)) wtop = window; + rxrpc_get_skb(skb, rxrpc_skb_get_to_recvmsg); + spin_lock(&call->recvmsg_queue.lock); rxrpc_input_queue_data(call, skb, window, wtop, rxrpc_receive_queue); - skb = NULL; + *_notify = true; while ((oos = skb_peek(&call->rx_oos_queue))) { struct rxrpc_skb_priv *osp = rxrpc_skb(oos); @@ -456,16 +459,17 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb) struct rxrpc_skb_priv *osp = rxrpc_skb(oos); if (after(osp->hdr.seq, seq)) { + rxrpc_get_skb(skb, rxrpc_skb_get_to_recvmsg_oos); __skb_queue_before(&call->rx_oos_queue, oos, skb); goto oos_queued; } } + rxrpc_get_skb(skb, rxrpc_skb_get_to_recvmsg_oos); __skb_queue_tail(&call->rx_oos_queue, skb); oos_queued: trace_rxrpc_receive(call, last ? rxrpc_receive_oos_last : rxrpc_receive_oos, sp->hdr.serial, sp->hdr.seq); - skb = NULL; } send_ack: @@ -483,9 +487,6 @@ send_ack: else rxrpc_propose_delay_ACK(call, serial, rxrpc_propose_ack_input_data); - -err_free: - rxrpc_free_skb(skb, rxrpc_skb_put_input); } /* @@ -498,6 +499,7 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb struct sk_buff *jskb; unsigned int offset = sizeof(struct rxrpc_wire_header); unsigned int len = skb->len - offset; + bool notify = false; while (sp->hdr.flags & RXRPC_JUMBO_PACKET) { if (len < RXRPC_JUMBO_SUBPKTLEN) @@ -517,7 +519,8 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb jsp = rxrpc_skb(jskb); jsp->offset = offset; jsp->len = RXRPC_JUMBO_DATALEN; - rxrpc_input_data_one(call, jskb); + rxrpc_input_data_one(call, jskb, ¬ify); + rxrpc_free_skb(jskb, rxrpc_skb_put_jumbo_subpacket); sp->hdr.flags = jhdr.flags; sp->hdr._rsvd = ntohs(jhdr._rsvd); @@ -529,7 +532,11 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb sp->offset = offset; sp->len = len; - rxrpc_input_data_one(call, skb); + rxrpc_input_data_one(call, skb, ¬ify); + if (notify) { + trace_rxrpc_notify_socket(call->debug_id, sp->hdr.serial); + rxrpc_notify_socket(call); + } return true; protocol_error: @@ -552,10 +559,8 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) skb->len, seq0); state = READ_ONCE(call->state); - if (state >= RXRPC_CALL_COMPLETE) { - rxrpc_free_skb(skb, rxrpc_skb_put_input); + if (state >= RXRPC_CALL_COMPLETE) return; - } /* Unshare the packet so that it can be modified for in-place * decryption. @@ -605,7 +610,6 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) out: trace_rxrpc_notify_socket(call->debug_id, serial); rxrpc_notify_socket(call); - rxrpc_free_skb(skb, rxrpc_skb_put_input); _leave(" [queued]"); } @@ -797,7 +801,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) struct rxrpc_ackpacket ack; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_ackinfo info; - struct sk_buff *skb_old = NULL, *skb_put = skb; + struct sk_buff *skb_old = NULL; rxrpc_serial_t ack_serial, acked_serial; rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt; int nr_acks, offset, ioffset; @@ -963,6 +967,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) goto out; } + rxrpc_get_skb(skb, rxrpc_skb_get_ack); spin_lock(&call->acks_ack_lock); skb_old = call->acks_soft_tbl; call->acks_soft_tbl = skb; @@ -970,7 +975,6 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_input_soft_acks(call, skb->data + offset, first_soft_ack, nr_acks, &summary); - skb_put = NULL; } else if (call->acks_soft_tbl) { spin_lock(&call->acks_ack_lock); skb_old = call->acks_soft_tbl; @@ -986,7 +990,6 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_congestion_management(call, skb, &summary, acked_serial); out: - rxrpc_free_skb(skb_put, rxrpc_skb_put_input); rxrpc_free_skb(skb_old, rxrpc_skb_put_ack); } @@ -1037,11 +1040,11 @@ void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_DATA: rxrpc_input_data(call, skb); - goto no_free; + break; case RXRPC_PACKET_TYPE_ACK: rxrpc_input_ack(call, skb); - goto no_free; + break; case RXRPC_PACKET_TYPE_BUSY: /* Just ignore BUSY packets from the server; the retry and @@ -1061,10 +1064,6 @@ void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) default: break; } - - rxrpc_free_skb(skb, rxrpc_skb_put_input); -no_free: - _leave(""); } /* diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 2119941b6d6c..91b8ba5b90db 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -72,6 +72,7 @@ static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, { _enter("%p,%p", conn, skb); + rxrpc_get_skb(skb, rxrpc_skb_get_conn_work); skb_queue_tail(&conn->rx_queue, skb); rxrpc_queue_conn(conn, rxrpc_conn_queue_rx_work); } @@ -86,10 +87,9 @@ static void rxrpc_post_packet_to_local(struct rxrpc_local *local, _enter("%p,%p", local, skb); if (rxrpc_get_local_maybe(local, rxrpc_local_get_queue)) { + rxrpc_get_skb(skb, rxrpc_skb_get_local_work); skb_queue_tail(&local->event_queue, skb); rxrpc_queue_local(local); - } else { - rxrpc_free_skb(skb, rxrpc_skb_put_input); } } @@ -99,10 +99,9 @@ static void rxrpc_post_packet_to_local(struct rxrpc_local *local, static void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) { if (rxrpc_get_local_maybe(local, rxrpc_local_get_queue)) { + rxrpc_get_skb(skb, rxrpc_skb_get_reject_work); skb_queue_tail(&local->reject_queue, skb); rxrpc_queue_local(local); - } else { - rxrpc_free_skb(skb, rxrpc_skb_put_input); } } @@ -153,7 +152,7 @@ static bool rxrpc_extract_abort(struct sk_buff *skb) /* * Process packets received on the local endpoint */ -static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff *skb) +static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) { struct rxrpc_connection *conn; struct rxrpc_channel *chan; @@ -161,6 +160,7 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff *skb) struct rxrpc_skb_priv *sp; struct rxrpc_peer *peer = NULL; struct rxrpc_sock *rx = NULL; + struct sk_buff *skb = *_skb; unsigned int channel; if (skb->tstamp == 0) @@ -181,7 +181,6 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff *skb) static int lose; if ((lose++ & 7) == 7) { trace_rxrpc_rx_lose(sp); - rxrpc_free_skb(skb, rxrpc_skb_put_lose); return 0; } } @@ -193,13 +192,13 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff *skb) switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_VERSION: if (rxrpc_to_client(sp)) - goto discard; + return 0; rxrpc_post_packet_to_local(local, skb); - goto out; + return 0; case RXRPC_PACKET_TYPE_BUSY: if (rxrpc_to_server(sp)) - goto discard; + return 0; fallthrough; case RXRPC_PACKET_TYPE_ACK: case RXRPC_PACKET_TYPE_ACKALL: @@ -208,7 +207,7 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff *skb) break; case RXRPC_PACKET_TYPE_ABORT: if (!rxrpc_extract_abort(skb)) - return true; /* Just discard if malformed */ + return 0; /* Just discard if malformed */ break; case RXRPC_PACKET_TYPE_DATA: @@ -220,15 +219,16 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff *skb) * decryption. */ if (sp->hdr.securityIndex != 0) { - struct sk_buff *nskb = skb_unshare(skb, GFP_ATOMIC); - if (!nskb) { - rxrpc_eaten_skb(skb, rxrpc_skb_eaten_by_unshare_nomem); - goto out; + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) { + rxrpc_eaten_skb(*_skb, rxrpc_skb_eaten_by_unshare_nomem); + *_skb = NULL; + return 0; } - if (nskb != skb) { - rxrpc_eaten_skb(skb, rxrpc_skb_eaten_by_unshare); - skb = nskb; + if (skb != *_skb) { + rxrpc_eaten_skb(*_skb, rxrpc_skb_eaten_by_unshare); + *_skb = skb; rxrpc_new_skb(skb, rxrpc_skb_new_unshared); sp = rxrpc_skb(skb); } @@ -237,18 +237,18 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff *skb) case RXRPC_PACKET_TYPE_CHALLENGE: if (rxrpc_to_server(sp)) - goto discard; + return 0; break; case RXRPC_PACKET_TYPE_RESPONSE: if (rxrpc_to_client(sp)) - goto discard; + return 0; break; /* Packet types 9-11 should just be ignored. */ case RXRPC_PACKET_TYPE_PARAMS: case RXRPC_PACKET_TYPE_10: case RXRPC_PACKET_TYPE_11: - goto discard; + return 0; default: goto bad_message; @@ -268,7 +268,7 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff *skb) if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && sp->hdr.seq == 1) goto unsupported_service; - goto discard; + return 0; } } @@ -294,7 +294,7 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff *skb) /* Connection-level packet */ _debug("CONN %p {%d}", conn, conn->debug_id); rxrpc_post_packet_to_conn(conn, skb); - goto out; + return 0; } if ((int)sp->hdr.serial - (int)conn->hi_serial > 0) @@ -306,19 +306,19 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff *skb) /* Ignore really old calls */ if (sp->hdr.callNumber < chan->last_call) - goto discard; + return 0; if (sp->hdr.callNumber == chan->last_call) { if (chan->call || sp->hdr.type == RXRPC_PACKET_TYPE_ABORT) - goto discard; + return 0; /* For the previous service call, if completed * successfully, we discard all further packets. */ if (rxrpc_conn_is_service(conn) && chan->last_type == RXRPC_PACKET_TYPE_ACK) - goto discard; + return 0; /* But otherwise we need to retransmit the final packet * from data cached in the connection record. @@ -329,7 +329,7 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff *skb) sp->hdr.serial, sp->hdr.flags); rxrpc_post_packet_to_conn(conn, skb); - goto out; + return 0; } call = rcu_dereference(chan->call); @@ -357,21 +357,14 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff *skb) sp->hdr.type != RXRPC_PACKET_TYPE_DATA) goto bad_message; if (sp->hdr.seq != 1) - goto discard; + return 0; call = rxrpc_new_incoming_call(local, rx, skb); if (!call) goto reject_packet; } - /* Process a call packet; this either discards or passes on the ref - * elsewhere. - */ + /* Process a call packet. */ rxrpc_input_call_event(call, skb); - goto out; - -discard: - rxrpc_free_skb(skb, rxrpc_skb_put_input); -out: trace_rxrpc_rx_done(0, 0); return 0; @@ -400,9 +393,7 @@ protocol_error: post_abort: skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; reject_packet: - trace_rxrpc_rx_done(skb->mark, skb->priority); rxrpc_reject_packet(local, skb); - _leave(" [badmsg]"); return 0; } @@ -441,9 +432,12 @@ int rxrpc_io_thread(void *data) if ((skb = __skb_dequeue(&rx_queue))) { switch (skb->mark) { case RXRPC_SKB_MARK_PACKET: + skb->priority = 0; rcu_read_lock(); - rxrpc_input_packet(local, skb); + rxrpc_input_packet(local, &skb); rcu_read_unlock(); + trace_rxrpc_rx_done(skb->mark, skb->priority); + rxrpc_free_skb(skb, rxrpc_skb_put_input); break; case RXRPC_SKB_MARK_ERROR: rxrpc_input_error(local, skb); -- cgit v1.2.3-70-g09d2 From cd21effb0552d666b2f8609560be764a1a56adbe Mon Sep 17 00:00:00 2001 From: David Howells Date: Sat, 8 Oct 2022 14:33:50 +0100 Subject: rxrpc: Reduce the use of RCU in packet input Shrink the region of rxrpc_input_packet() that is covered by the RCU read lock so that it only covers the connection and call lookup. This means that the bits now outside of that can call sleepable functions such as kmalloc and sendmsg. Also take a ref on the conn or call we're going to use before we drop the RCU read lock. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 3 +-- net/rxrpc/call_accept.c | 13 +++------- net/rxrpc/input.c | 7 +++-- net/rxrpc/io_thread.c | 68 +++++++++++++++++++++++++++++++++++++------------ 4 files changed, 59 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 6af7298af39b..cfd16f1e5c83 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -961,8 +961,7 @@ void rxrpc_unpublish_service_conn(struct rxrpc_connection *); * input.c */ void rxrpc_input_call_event(struct rxrpc_call *, struct sk_buff *); -void rxrpc_input_implicit_end_call(struct rxrpc_sock *, struct rxrpc_connection *, - struct rxrpc_call *); +void rxrpc_input_implicit_end_call(struct rxrpc_connection *, struct rxrpc_call *); /* * io_thread.c diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 5f978b0b2404..beb8efa2e7a9 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -336,13 +336,13 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, * If this is for a kernel service, when we allocate the call, it will have * three refs on it: (1) the kernel service, (2) the user_call_ID tree, (3) the * retainer ref obtained from the backlog buffer. Prealloc calls for userspace - * services only have the ref from the backlog buffer. We want to pass this - * ref to non-BH context to dispose of. + * services only have the ref from the backlog buffer. We pass this ref to the + * caller. * * If we want to report an error, we mark the skb with the packet type and * abort code and return NULL. * - * The call is returned with the user access mutex held. + * The call is returned with the user access mutex held and a ref on it. */ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, struct rxrpc_sock *rx, @@ -426,13 +426,6 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, rxrpc_send_ping(call, skb); - /* We have to discard the prealloc queue's ref here and rely on a - * combination of the RCU read lock and refs held either by the socket - * (recvmsg queue, to-be-accepted queue or user ID tree) or the kernel - * service to prevent the call from being deallocated too early. - */ - rxrpc_put_call(call, rxrpc_call_put_discard_prealloc); - if (hlist_unhashed(&call->error_link)) { spin_lock(&call->peer->lock); hlist_add_head(&call->error_link, &call->peer->error_targets); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 42addbcf59f9..01d32f817a7a 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1072,8 +1072,7 @@ void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) * * TODO: If callNumber > call_id + 1, renegotiate security. */ -void rxrpc_input_implicit_end_call(struct rxrpc_sock *rx, - struct rxrpc_connection *conn, +void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn, struct rxrpc_call *call) { switch (READ_ONCE(call->state)) { @@ -1091,7 +1090,7 @@ void rxrpc_input_implicit_end_call(struct rxrpc_sock *rx, break; } - spin_lock(&rx->incoming_lock); + spin_lock(&conn->bundle->channel_lock); __rxrpc_disconnect_call(conn, call); - spin_unlock(&rx->incoming_lock); + spin_unlock(&conn->bundle->channel_lock); } diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 91b8ba5b90db..3b6927610677 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -257,6 +257,8 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) if (sp->hdr.serviceId == 0) goto bad_message; + rcu_read_lock(); + if (rxrpc_to_server(sp)) { /* Weed out packets to services we're not offering. Packets * that would begin a call are explicitly rejected and the rest @@ -264,7 +266,9 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) */ rx = rcu_dereference(local->service); if (!rx || (sp->hdr.serviceId != rx->srx.srx_service && - sp->hdr.serviceId != rx->second_service)) { + sp->hdr.serviceId != rx->second_service) + ) { + rcu_read_unlock(); if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && sp->hdr.seq == 1) goto unsupported_service; @@ -293,7 +297,12 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) if (sp->hdr.callNumber == 0) { /* Connection-level packet */ _debug("CONN %p {%d}", conn, conn->debug_id); - rxrpc_post_packet_to_conn(conn, skb); + conn = rxrpc_get_connection_maybe(conn, rxrpc_conn_get_conn_input); + rcu_read_unlock(); + if (conn) { + rxrpc_post_packet_to_conn(conn, skb); + rxrpc_put_connection(conn, rxrpc_conn_put_conn_input); + } return 0; } @@ -305,20 +314,26 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) chan = &conn->channels[channel]; /* Ignore really old calls */ - if (sp->hdr.callNumber < chan->last_call) + if (sp->hdr.callNumber < chan->last_call) { + rcu_read_unlock(); return 0; + } if (sp->hdr.callNumber == chan->last_call) { if (chan->call || - sp->hdr.type == RXRPC_PACKET_TYPE_ABORT) + sp->hdr.type == RXRPC_PACKET_TYPE_ABORT) { + rcu_read_unlock(); return 0; + } /* For the previous service call, if completed * successfully, we discard all further packets. */ if (rxrpc_conn_is_service(conn) && - chan->last_type == RXRPC_PACKET_TYPE_ACK) + chan->last_type == RXRPC_PACKET_TYPE_ACK) { + rcu_read_unlock(); return 0; + } /* But otherwise we need to retransmit the final packet * from data cached in the connection record. @@ -328,20 +343,32 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) sp->hdr.seq, sp->hdr.serial, sp->hdr.flags); - rxrpc_post_packet_to_conn(conn, skb); + conn = rxrpc_get_connection_maybe(conn, rxrpc_conn_get_call_input); + rcu_read_unlock(); + if (conn) { + rxrpc_post_packet_to_conn(conn, skb); + rxrpc_put_connection(conn, rxrpc_conn_put_call_input); + } return 0; } call = rcu_dereference(chan->call); if (sp->hdr.callNumber > chan->call_id) { - if (rxrpc_to_client(sp)) + if (rxrpc_to_client(sp)) { + rcu_read_unlock(); goto reject_packet; - if (call) - rxrpc_input_implicit_end_call(rx, conn, call); - call = NULL; + } + if (call) { + rxrpc_input_implicit_end_call(conn, call); + chan->call = NULL; + call = NULL; + } } + if (call && !rxrpc_try_get_call(call, rxrpc_call_get_input)) + call = NULL; + if (call) { if (sp->hdr.serviceId != call->dest_srx.srx_service) call->dest_srx.srx_service = sp->hdr.serviceId; @@ -352,23 +379,33 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) } } - if (!call || refcount_read(&call->ref) == 0) { + if (!call) { if (rxrpc_to_client(sp) || - sp->hdr.type != RXRPC_PACKET_TYPE_DATA) + sp->hdr.type != RXRPC_PACKET_TYPE_DATA) { + rcu_read_unlock(); goto bad_message; - if (sp->hdr.seq != 1) + } + if (sp->hdr.seq != 1) { + rcu_read_unlock(); return 0; + } call = rxrpc_new_incoming_call(local, rx, skb); - if (!call) + if (!call) { + rcu_read_unlock(); goto reject_packet; + } } + rcu_read_unlock(); + /* Process a call packet. */ rxrpc_input_call_event(call, skb); + rxrpc_put_call(call, rxrpc_call_put_input); trace_rxrpc_rx_done(0, 0); return 0; wrong_security: + rcu_read_unlock(); trace_rxrpc_abort(0, "SEC", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RXKADINCONSISTENCY, EBADMSG); skb->priority = RXKADINCONSISTENCY; @@ -381,6 +418,7 @@ unsupported_service: goto post_abort; reupgrade: + rcu_read_unlock(); trace_rxrpc_abort(0, "UPG", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_PROTOCOL_ERROR, EBADMSG); goto protocol_error; @@ -433,9 +471,7 @@ int rxrpc_io_thread(void *data) switch (skb->mark) { case RXRPC_SKB_MARK_PACKET: skb->priority = 0; - rcu_read_lock(); rxrpc_input_packet(local, &skb); - rcu_read_unlock(); trace_rxrpc_rx_done(skb->mark, skb->priority); rxrpc_free_skb(skb, rxrpc_skb_put_input); break; -- cgit v1.2.3-70-g09d2 From 393a2a2007d13df7ae54c94328b45b6c2269b6a9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 20 Oct 2022 22:36:20 +0100 Subject: rxrpc: Extract the peer address from an incoming packet earlier Extract the peer address from an incoming packet earlier, at the beginning of rxrpc_input_packet() and thence pass a pointer to it to various functions that use it as part of the lookup rather than doing it on several separate paths. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 2 ++ net/rxrpc/call_accept.c | 10 ++++++---- net/rxrpc/conn_object.c | 29 ++++++++--------------------- net/rxrpc/io_thread.c | 17 +++++++++++++++-- 4 files changed, 31 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index cfd16f1e5c83..c3c915a05627 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -824,6 +824,7 @@ int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t); void rxrpc_discard_prealloc(struct rxrpc_sock *); struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *, struct rxrpc_sock *, + struct sockaddr_rxrpc *, struct sk_buff *); void rxrpc_accept_incoming_calls(struct rxrpc_local *); int rxrpc_user_charge_accept(struct rxrpc_sock *, unsigned long); @@ -916,6 +917,7 @@ extern unsigned int rxrpc_closed_conn_expiry; struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *, gfp_t); struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *, + struct sockaddr_rxrpc *, struct sk_buff *, struct rxrpc_peer **); void __rxrpc_disconnect_call(struct rxrpc_connection *, struct rxrpc_call *); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index beb8efa2e7a9..11134b7cec17 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -258,6 +258,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, struct rxrpc_peer *peer, struct rxrpc_connection *conn, const struct rxrpc_security *sec, + struct sockaddr_rxrpc *peer_srx, struct sk_buff *skb) { struct rxrpc_backlog *b = rx->backlog; @@ -287,8 +288,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, peer = NULL; if (!peer) { peer = b->peer_backlog[peer_tail]; - if (rxrpc_extract_addr_from_skb(&peer->srx, skb) < 0) - return NULL; + peer->srx = *peer_srx; b->peer_backlog[peer_tail] = NULL; smp_store_release(&b->peer_backlog_tail, (peer_tail + 1) & @@ -346,6 +346,7 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, */ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, struct rxrpc_sock *rx, + struct sockaddr_rxrpc *peer_srx, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); @@ -371,7 +372,7 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, * we have to recheck the routing. However, we're now holding * rx->incoming_lock, so the values should remain stable. */ - conn = rxrpc_find_connection_rcu(local, skb, &peer); + conn = rxrpc_find_connection_rcu(local, peer_srx, skb, &peer); if (!conn) { sec = rxrpc_get_incoming_security(rx, skb); @@ -379,7 +380,8 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, goto no_call; } - call = rxrpc_alloc_incoming_call(rx, local, peer, conn, sec, skb); + call = rxrpc_alloc_incoming_call(rx, local, peer, conn, sec, peer_srx, + skb); if (!call) { skb->mark = RXRPC_SKB_MARK_REJECT_BUSY; goto no_call; diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 5a39255ea014..98e49646ca1d 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -73,29 +73,17 @@ struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *rxnet, * The caller must be holding the RCU read lock. */ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, + struct sockaddr_rxrpc *srx, struct sk_buff *skb, struct rxrpc_peer **_peer) { struct rxrpc_connection *conn; struct rxrpc_conn_proto k; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct sockaddr_rxrpc srx; struct rxrpc_peer *peer; _enter(",%x", sp->hdr.cid & RXRPC_CIDMASK); - if (rxrpc_extract_addr_from_skb(&srx, skb) < 0) - goto not_found; - - if (srx.transport.family != local->srx.transport.family && - (srx.transport.family == AF_INET && - local->srx.transport.family != AF_INET6)) { - pr_warn_ratelimited("AF_RXRPC: Protocol mismatch %u not %u\n", - srx.transport.family, - local->srx.transport.family); - goto not_found; - } - k.epoch = sp->hdr.epoch; k.cid = sp->hdr.cid & RXRPC_CIDMASK; @@ -104,7 +92,7 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, * parameter set. We look up the peer first as an intermediate * step and then the connection from the peer's tree. */ - peer = rxrpc_lookup_peer_rcu(local, &srx); + peer = rxrpc_lookup_peer_rcu(local, srx); if (!peer) goto not_found; *_peer = peer; @@ -117,8 +105,7 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, /* Look up client connections by connection ID alone as their * IDs are unique for this machine. */ - conn = idr_find(&rxrpc_client_conn_ids, - sp->hdr.cid >> RXRPC_CIDSHIFT); + conn = idr_find(&rxrpc_client_conn_ids, sp->hdr.cid >> RXRPC_CIDSHIFT); if (!conn || refcount_read(&conn->ref) == 0) { _debug("no conn"); goto not_found; @@ -129,20 +116,20 @@ struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, goto not_found; peer = conn->peer; - switch (srx.transport.family) { + switch (srx->transport.family) { case AF_INET: if (peer->srx.transport.sin.sin_port != - srx.transport.sin.sin_port || + srx->transport.sin.sin_port || peer->srx.transport.sin.sin_addr.s_addr != - srx.transport.sin.sin_addr.s_addr) + srx->transport.sin.sin_addr.s_addr) goto not_found; break; #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: if (peer->srx.transport.sin6.sin6_port != - srx.transport.sin6.sin6_port || + srx->transport.sin6.sin6_port || memcmp(&peer->srx.transport.sin6.sin6_addr, - &srx.transport.sin6.sin6_addr, + &srx->transport.sin6.sin6_addr, sizeof(struct in6_addr)) != 0) goto not_found; break; diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 3b6927610677..bc65d83fab88 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -155,6 +155,7 @@ static bool rxrpc_extract_abort(struct sk_buff *skb) static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) { struct rxrpc_connection *conn; + struct sockaddr_rxrpc peer_srx; struct rxrpc_channel *chan; struct rxrpc_call *call = NULL; struct rxrpc_skb_priv *sp; @@ -257,6 +258,18 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) if (sp->hdr.serviceId == 0) goto bad_message; + if (WARN_ON_ONCE(rxrpc_extract_addr_from_skb(&peer_srx, skb) < 0)) + return 0; /* Unsupported address type - discard. */ + + if (peer_srx.transport.family != local->srx.transport.family && + (peer_srx.transport.family == AF_INET && + local->srx.transport.family != AF_INET6)) { + pr_warn_ratelimited("AF_RXRPC: Protocol mismatch %u not %u\n", + peer_srx.transport.family, + local->srx.transport.family); + return 0; /* Wrong address type - discard. */ + } + rcu_read_lock(); if (rxrpc_to_server(sp)) { @@ -276,7 +289,7 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) } } - conn = rxrpc_find_connection_rcu(local, skb, &peer); + conn = rxrpc_find_connection_rcu(local, &peer_srx, skb, &peer); if (conn) { if (sp->hdr.securityIndex != conn->security_ix) goto wrong_security; @@ -389,7 +402,7 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) rcu_read_unlock(); return 0; } - call = rxrpc_new_incoming_call(local, rx, skb); + call = rxrpc_new_incoming_call(local, rx, &peer_srx, skb); if (!call) { rcu_read_unlock(); goto reject_packet; -- cgit v1.2.3-70-g09d2 From 5e6ef4f1017c7f844e305283bbd8875af475e2fc Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 23 Jan 2020 13:13:41 +0000 Subject: rxrpc: Make the I/O thread take over the call and local processor work Move the functions from the call->processor and local->processor work items into the domain of the I/O thread. The call event processor, now called from the I/O thread, then takes over the job of cranking the call state machine, processing incoming packets and transmitting DATA, ACK and ABORT packets. In a future patch, rxrpc_send_ACK() will transmit the ACK on the spot rather than queuing it for later transmission. The call event processor becomes purely received-skb driven. It only transmits things in response to events. We use "pokes" to queue a dummy skb to make it do things like start/resume transmitting data. Timer expiry also results in pokes. The connection event processor, becomes similar, though crypto events, such as dealing with CHALLENGE and RESPONSE packets is offloaded to a work item to avoid doing crypto in the I/O thread. The local event processor is removed and VERSION response packets are generated directly from the packet parser. Similarly, ABORTs generated in response to protocol errors will be transmitted immediately rather than being pushed onto a queue for later transmission. Changes: ======== ver #2) - Fix a couple of introduced lock context imbalances. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 42 +++--- net/rxrpc/ar-internal.h | 50 +++---- net/rxrpc/call_accept.c | 126 ++++++++--------- net/rxrpc/call_event.c | 171 ++++++++++------------- net/rxrpc/call_object.c | 56 +++++--- net/rxrpc/conn_event.c | 60 ++++++++ net/rxrpc/conn_object.c | 93 +++++-------- net/rxrpc/input.c | 167 ++++++---------------- net/rxrpc/io_thread.c | 319 ++++++++++++++++++++----------------------- net/rxrpc/local_event.c | 43 +----- net/rxrpc/local_object.c | 69 +--------- net/rxrpc/output.c | 92 +++++-------- net/rxrpc/peer_event.c | 29 ++-- net/rxrpc/recvmsg.c | 9 +- net/rxrpc/sendmsg.c | 10 +- 15 files changed, 545 insertions(+), 791 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 82b1327c2ba6..c49b0c233594 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -26,7 +26,6 @@ #define rxrpc_skb_traces \ EM(rxrpc_skb_eaten_by_unshare, "ETN unshare ") \ EM(rxrpc_skb_eaten_by_unshare_nomem, "ETN unshar-nm") \ - EM(rxrpc_skb_get_ack, "GET ack ") \ EM(rxrpc_skb_get_conn_work, "GET conn-work") \ EM(rxrpc_skb_get_local_work, "GET locl-work") \ EM(rxrpc_skb_get_reject_work, "GET rej-work ") \ @@ -36,7 +35,6 @@ EM(rxrpc_skb_new_error_report, "NEW error-rpt") \ EM(rxrpc_skb_new_jumbo_subpacket, "NEW jumbo-sub") \ EM(rxrpc_skb_new_unshared, "NEW unshared ") \ - EM(rxrpc_skb_put_ack, "PUT ack ") \ EM(rxrpc_skb_put_conn_work, "PUT conn-work") \ EM(rxrpc_skb_put_error_report, "PUT error-rep") \ EM(rxrpc_skb_put_input, "PUT input ") \ @@ -45,7 +43,6 @@ EM(rxrpc_skb_put_rotate, "PUT rotate ") \ EM(rxrpc_skb_put_unknown, "PUT unknown ") \ EM(rxrpc_skb_see_conn_work, "SEE conn-work") \ - EM(rxrpc_skb_see_local_work, "SEE locl-work") \ EM(rxrpc_skb_see_recvmsg, "SEE recvmsg ") \ EM(rxrpc_skb_see_reject, "SEE reject ") \ EM(rxrpc_skb_see_rotate, "SEE rotate ") \ @@ -58,10 +55,7 @@ EM(rxrpc_local_get_for_use, "GET for-use ") \ EM(rxrpc_local_get_peer, "GET peer ") \ EM(rxrpc_local_get_prealloc_conn, "GET conn-pre") \ - EM(rxrpc_local_get_queue, "GET queue ") \ EM(rxrpc_local_new, "NEW ") \ - EM(rxrpc_local_processing, "PROCESSING ") \ - EM(rxrpc_local_put_already_queued, "PUT alreadyq") \ EM(rxrpc_local_put_bind, "PUT bind ") \ EM(rxrpc_local_put_call, "PUT call ") \ EM(rxrpc_local_put_for_use, "PUT for-use ") \ @@ -69,8 +63,6 @@ EM(rxrpc_local_put_peer, "PUT peer ") \ EM(rxrpc_local_put_prealloc_conn, "PUT conn-pre") \ EM(rxrpc_local_put_release_sock, "PUT rel-sock") \ - EM(rxrpc_local_put_queue, "PUT queue ") \ - EM(rxrpc_local_queued, "QUEUED ") \ EM(rxrpc_local_see_tx_ack, "SEE tx-ack ") \ EM(rxrpc_local_stop, "STOP ") \ EM(rxrpc_local_stopped, "STOPPED ") \ @@ -78,11 +70,9 @@ EM(rxrpc_local_unuse_conn_work, "UNU conn-wrk") \ EM(rxrpc_local_unuse_peer_keepalive, "UNU peer-kpa") \ EM(rxrpc_local_unuse_release_sock, "UNU rel-sock") \ - EM(rxrpc_local_unuse_work, "UNU work ") \ EM(rxrpc_local_use_conn_work, "USE conn-wrk") \ EM(rxrpc_local_use_lookup, "USE lookup ") \ - EM(rxrpc_local_use_peer_keepalive, "USE peer-kpa") \ - E_(rxrpc_local_use_work, "USE work ") + E_(rxrpc_local_use_peer_keepalive, "USE peer-kpa") #define rxrpc_peer_traces \ EM(rxrpc_peer_free, "FREE ") \ @@ -90,6 +80,7 @@ EM(rxrpc_peer_get_activate_call, "GET act-call") \ EM(rxrpc_peer_get_bundle, "GET bundle ") \ EM(rxrpc_peer_get_client_conn, "GET cln-conn") \ + EM(rxrpc_peer_get_input, "GET input ") \ EM(rxrpc_peer_get_input_error, "GET inpt-err") \ EM(rxrpc_peer_get_keepalive, "GET keepaliv") \ EM(rxrpc_peer_get_lookup_client, "GET look-cln") \ @@ -100,6 +91,7 @@ EM(rxrpc_peer_put_call, "PUT call ") \ EM(rxrpc_peer_put_conn, "PUT conn ") \ EM(rxrpc_peer_put_discard_tmp, "PUT disc-tmp") \ + EM(rxrpc_peer_put_input, "PUT input ") \ EM(rxrpc_peer_put_input_error, "PUT inpt-err") \ E_(rxrpc_peer_put_keepalive, "PUT keepaliv") @@ -180,11 +172,6 @@ EM(rxrpc_call_put_sendmsg, "PUT sendmsg ") \ EM(rxrpc_call_put_unnotify, "PUT unnotify") \ EM(rxrpc_call_put_userid_exists, "PUT u-exists") \ - EM(rxrpc_call_queue_abort, "QUE abort ") \ - EM(rxrpc_call_queue_requeue, "QUE requeue ") \ - EM(rxrpc_call_queue_resend, "QUE resend ") \ - EM(rxrpc_call_queue_timer, "QUE timer ") \ - EM(rxrpc_call_queue_tx_data, "QUE tx-data ") \ EM(rxrpc_call_see_accept, "SEE accept ") \ EM(rxrpc_call_see_activate_client, "SEE act-clnt") \ EM(rxrpc_call_see_connect_failed, "SEE con-fail") \ @@ -282,6 +269,7 @@ EM(rxrpc_propose_ack_respond_to_ping, "Rsp2Png") \ EM(rxrpc_propose_ack_retry_tx, "RetryTx") \ EM(rxrpc_propose_ack_rotate_rx, "RxAck ") \ + EM(rxrpc_propose_ack_rx_idle, "RxIdle ") \ E_(rxrpc_propose_ack_terminal_ack, "ClTerm ") #define rxrpc_congest_modes \ @@ -1532,6 +1520,7 @@ TRACE_EVENT(rxrpc_connect_call, __field(unsigned long, user_call_ID ) __field(u32, cid ) __field(u32, call_id ) + __field_struct(struct sockaddr_rxrpc, srx ) ), TP_fast_assign( @@ -1539,33 +1528,42 @@ TRACE_EVENT(rxrpc_connect_call, __entry->user_call_ID = call->user_call_ID; __entry->cid = call->cid; __entry->call_id = call->call_id; + __entry->srx = call->dest_srx; ), - TP_printk("c=%08x u=%p %08x:%08x", + TP_printk("c=%08x u=%p %08x:%08x dst=%pISp", __entry->call, (void *)__entry->user_call_ID, __entry->cid, - __entry->call_id) + __entry->call_id, + &__entry->srx.transport) ); TRACE_EVENT(rxrpc_resend, - TP_PROTO(struct rxrpc_call *call), + TP_PROTO(struct rxrpc_call *call, struct sk_buff *ack), - TP_ARGS(call), + TP_ARGS(call, ack), TP_STRUCT__entry( __field(unsigned int, call ) __field(rxrpc_seq_t, seq ) + __field(rxrpc_seq_t, transmitted ) + __field(rxrpc_serial_t, ack_serial ) ), TP_fast_assign( + struct rxrpc_skb_priv *sp = ack ? rxrpc_skb(ack) : NULL; __entry->call = call->debug_id; __entry->seq = call->acks_hard_ack; + __entry->transmitted = call->tx_transmitted; + __entry->ack_serial = sp ? sp->hdr.serial : 0; ), - TP_printk("c=%08x q=%x", + TP_printk("c=%08x r=%x q=%x tq=%x", __entry->call, - __entry->seq) + __entry->ack_serial, + __entry->seq, + __entry->transmitted) ); TRACE_EVENT(rxrpc_rx_icmp, diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index c3c915a05627..6b993a3d4186 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -283,14 +283,11 @@ struct rxrpc_local { struct rxrpc_net *rxnet; /* The network ns in which this resides */ struct hlist_node link; struct socket *socket; /* my UDP socket */ - struct work_struct processor; struct task_struct *io_thread; struct list_head ack_tx_queue; /* List of ACKs that need sending */ spinlock_t ack_tx_lock; /* ACK list lock */ struct rxrpc_sock __rcu *service; /* Service(s) listening on this endpoint */ struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ - struct sk_buff_head reject_queue; /* packets awaiting rejection */ - struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */ struct sk_buff_head rx_queue; /* Received packets */ struct list_head call_attend_q; /* Calls requiring immediate attention */ struct rb_root client_bundles; /* Client connection bundles by socket params */ @@ -524,23 +521,19 @@ enum rxrpc_call_flag { RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */ RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */ - RXRPC_CALL_RX_UNDERRUN, /* Got data underrun */ RXRPC_CALL_DISCONNECTED, /* The call has been disconnected */ RXRPC_CALL_KERNEL, /* The call was made by the kernel */ RXRPC_CALL_UPGRADE, /* Service upgrade was requested for the call */ - RXRPC_CALL_DELAY_ACK_PENDING, /* DELAY ACK generation is pending */ - RXRPC_CALL_IDLE_ACK_PENDING, /* IDLE ACK generation is pending */ RXRPC_CALL_EXCLUSIVE, /* The call uses a once-only connection */ + RXRPC_CALL_RX_IS_IDLE, /* Reception is idle - send an ACK */ }; /* * Events that can be raised on a call. */ enum rxrpc_call_event { - RXRPC_CALL_EV_ABORT, /* need to generate abort */ - RXRPC_CALL_EV_RESEND, /* Tx resend required */ - RXRPC_CALL_EV_EXPIRED, /* Expiry occurred */ RXRPC_CALL_EV_ACK_LOST, /* ACK may be lost, send ping */ + RXRPC_CALL_EV_INITIAL_PING, /* Send initial ping for a new service call */ }; /* @@ -611,7 +604,6 @@ struct rxrpc_call { u32 next_rx_timo; /* Timeout for next Rx packet (jif) */ u32 next_req_timo; /* Timeout for next Rx request packet (jif) */ struct timer_list timer; /* Combined event timer */ - struct work_struct processor; /* Event processor */ struct work_struct destroyer; /* In-process-context destroyer */ rxrpc_notify_rx_t notify_rx; /* kernel service Rx notification function */ struct list_head link; /* link in master call list */ @@ -705,11 +697,7 @@ struct rxrpc_call { rxrpc_seq_t acks_prev_seq; /* Highest previousPacket received */ rxrpc_seq_t acks_hard_ack; /* Latest hard-ack point */ rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ - rxrpc_seq_t acks_lost_top; /* tx_top at the time lost-ack ping sent */ - rxrpc_serial_t acks_lost_ping; /* Serial number of probe ACK */ rxrpc_serial_t acks_highest_serial; /* Highest serial number ACK'd */ - struct sk_buff *acks_soft_tbl; /* The last ACK packet with NAKs in it */ - spinlock_t acks_ack_lock; /* Access to ->acks_last_ack */ }; /* @@ -822,10 +810,9 @@ extern struct workqueue_struct *rxrpc_workqueue; */ int rxrpc_service_prealloc(struct rxrpc_sock *, gfp_t); void rxrpc_discard_prealloc(struct rxrpc_sock *); -struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *, - struct rxrpc_sock *, - struct sockaddr_rxrpc *, - struct sk_buff *); +bool rxrpc_new_incoming_call(struct rxrpc_local *, struct rxrpc_peer *, + struct rxrpc_connection *, struct sockaddr_rxrpc *, + struct sk_buff *); void rxrpc_accept_incoming_calls(struct rxrpc_local *); int rxrpc_user_charge_accept(struct rxrpc_sock *, unsigned long); @@ -838,13 +825,15 @@ void rxrpc_send_ACK(struct rxrpc_call *, u8, rxrpc_serial_t, enum rxrpc_propose_ void rxrpc_propose_delay_ACK(struct rxrpc_call *, rxrpc_serial_t, enum rxrpc_propose_ack_trace); void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *); -void rxrpc_process_call(struct work_struct *); +void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb); void rxrpc_reduce_call_timer(struct rxrpc_call *call, unsigned long expire_at, unsigned long now, enum rxrpc_timer_trace why); +void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb); + /* * call_object.c */ @@ -864,9 +853,8 @@ void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *, struct sk_buff *); void rxrpc_release_call(struct rxrpc_sock *, struct rxrpc_call *); void rxrpc_release_calls_on_socket(struct rxrpc_sock *); -void rxrpc_queue_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_see_call(struct rxrpc_call *, enum rxrpc_call_trace); -bool rxrpc_try_get_call(struct rxrpc_call *, enum rxrpc_call_trace); +struct rxrpc_call *rxrpc_try_get_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_get_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_put_call(struct rxrpc_call *, enum rxrpc_call_trace); void rxrpc_cleanup_call(struct rxrpc_call *); @@ -908,6 +896,7 @@ void rxrpc_clean_up_local_conns(struct rxrpc_local *); */ void rxrpc_process_connection(struct work_struct *); void rxrpc_process_delayed_final_acks(struct rxrpc_connection *, bool); +int rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb); /* * conn_object.c @@ -916,10 +905,9 @@ extern unsigned int rxrpc_connection_expiry; extern unsigned int rxrpc_closed_conn_expiry; struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *, gfp_t); -struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *, - struct sockaddr_rxrpc *, - struct sk_buff *, - struct rxrpc_peer **); +struct rxrpc_connection *rxrpc_find_client_connection_rcu(struct rxrpc_local *, + struct sockaddr_rxrpc *, + struct sk_buff *); void __rxrpc_disconnect_call(struct rxrpc_connection *, struct rxrpc_call *); void rxrpc_disconnect_call(struct rxrpc_call *); void rxrpc_kill_client_conn(struct rxrpc_connection *); @@ -962,8 +950,8 @@ void rxrpc_unpublish_service_conn(struct rxrpc_connection *); /* * input.c */ -void rxrpc_input_call_event(struct rxrpc_call *, struct sk_buff *); -void rxrpc_input_implicit_end_call(struct rxrpc_connection *, struct rxrpc_call *); +void rxrpc_input_call_packet(struct rxrpc_call *, struct sk_buff *); +void rxrpc_implicit_end_call(struct rxrpc_call *, struct sk_buff *); /* * io_thread.c @@ -993,7 +981,9 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time64_t, /* * local_event.c */ -extern void rxrpc_process_local_events(struct rxrpc_local *); +void rxrpc_send_version_request(struct rxrpc_local *local, + struct rxrpc_host_header *hdr, + struct sk_buff *skb); /* * local_object.c @@ -1004,7 +994,6 @@ struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *, enum rxrpc_local void rxrpc_put_local(struct rxrpc_local *, enum rxrpc_local_trace); struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *, enum rxrpc_local_trace); void rxrpc_unuse_local(struct rxrpc_local *, enum rxrpc_local_trace); -void rxrpc_queue_local(struct rxrpc_local *); void rxrpc_destroy_local(struct rxrpc_local *local); void rxrpc_destroy_all_locals(struct rxrpc_net *); @@ -1068,7 +1057,7 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net) void rxrpc_transmit_ack_packets(struct rxrpc_local *); int rxrpc_send_abort_packet(struct rxrpc_call *); int rxrpc_send_data_packet(struct rxrpc_call *, struct rxrpc_txbuf *); -void rxrpc_reject_packets(struct rxrpc_local *); +void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb); void rxrpc_send_keepalive(struct rxrpc_peer *); void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb); @@ -1178,7 +1167,6 @@ int rxrpc_server_keyring(struct rxrpc_sock *, sockptr_t, int); * skbuff.c */ void rxrpc_kernel_data_consumed(struct rxrpc_call *, struct sk_buff *); -void rxrpc_packet_destructor(struct sk_buff *); void rxrpc_new_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_see_skb(struct sk_buff *, enum rxrpc_skb_trace); void rxrpc_eaten_skb(struct sk_buff *, enum rxrpc_skb_trace); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 11134b7cec17..87b46c2a1985 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -100,6 +100,7 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, return -ENOMEM; call->flags |= (1 << RXRPC_CALL_IS_SERVICE); call->state = RXRPC_CALL_SERVER_PREALLOC; + __set_bit(RXRPC_CALL_EV_INITIAL_PING, &call->events); trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), user_call_ID, rxrpc_call_new_prealloc_service); @@ -234,21 +235,6 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) kfree(b); } -/* - * Ping the other end to fill our RTT cache and to retrieve the rwind - * and MTU parameters. - */ -static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb) -{ - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - ktime_t now = skb->tstamp; - - if (call->peer->rtt_count < 3 || - ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now)) - rxrpc_send_ACK(call, RXRPC_ACK_PING, sp->hdr.serial, - rxrpc_propose_ack_ping_for_params); -} - /* * Allocate a new incoming call from the prealloc pool, along with a connection * and a peer as necessary. @@ -330,33 +316,56 @@ static struct rxrpc_call *rxrpc_alloc_incoming_call(struct rxrpc_sock *rx, } /* - * Set up a new incoming call. Called in BH context with the RCU read lock - * held. + * Set up a new incoming call. Called from the I/O thread. * * If this is for a kernel service, when we allocate the call, it will have * three refs on it: (1) the kernel service, (2) the user_call_ID tree, (3) the * retainer ref obtained from the backlog buffer. Prealloc calls for userspace - * services only have the ref from the backlog buffer. We pass this ref to the - * caller. + * services only have the ref from the backlog buffer. * * If we want to report an error, we mark the skb with the packet type and - * abort code and return NULL. - * - * The call is returned with the user access mutex held and a ref on it. + * abort code and return false. */ -struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, - struct rxrpc_sock *rx, - struct sockaddr_rxrpc *peer_srx, - struct sk_buff *skb) +bool rxrpc_new_incoming_call(struct rxrpc_local *local, + struct rxrpc_peer *peer, + struct rxrpc_connection *conn, + struct sockaddr_rxrpc *peer_srx, + struct sk_buff *skb) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); const struct rxrpc_security *sec = NULL; - struct rxrpc_connection *conn; - struct rxrpc_peer *peer = NULL; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_call *call = NULL; + struct rxrpc_sock *rx; _enter(""); + /* Don't set up a call for anything other than the first DATA packet. */ + if (sp->hdr.seq != 1 || + sp->hdr.type != RXRPC_PACKET_TYPE_DATA) + return true; /* Just discard */ + + rcu_read_lock(); + + /* Weed out packets to services we're not offering. Packets that would + * begin a call are explicitly rejected and the rest are just + * discarded. + */ + rx = rcu_dereference(local->service); + if (!rx || (sp->hdr.serviceId != rx->srx.srx_service && + sp->hdr.serviceId != rx->second_service) + ) { + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && + sp->hdr.seq == 1) + goto unsupported_service; + goto discard; + } + + if (!conn) { + sec = rxrpc_get_incoming_security(rx, skb); + if (!sec) + goto reject; + } + spin_lock(&rx->incoming_lock); if (rx->sk.sk_state == RXRPC_SERVER_LISTEN_DISABLED || rx->sk.sk_state == RXRPC_CLOSE) { @@ -367,19 +376,6 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, goto no_call; } - /* The peer, connection and call may all have sprung into existence due - * to a duplicate packet being handled on another CPU in parallel, so - * we have to recheck the routing. However, we're now holding - * rx->incoming_lock, so the values should remain stable. - */ - conn = rxrpc_find_connection_rcu(local, peer_srx, skb, &peer); - - if (!conn) { - sec = rxrpc_get_incoming_security(rx, skb); - if (!sec) - goto no_call; - } - call = rxrpc_alloc_incoming_call(rx, local, peer, conn, sec, peer_srx, skb); if (!call) { @@ -398,35 +394,15 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, rx->notify_new_call(&rx->sk, call, call->user_call_ID); spin_lock(&conn->state_lock); - switch (conn->state) { - case RXRPC_CONN_SERVICE_UNSECURED: + if (conn->state == RXRPC_CONN_SERVICE_UNSECURED) { conn->state = RXRPC_CONN_SERVICE_CHALLENGING; set_bit(RXRPC_CONN_EV_CHALLENGE, &call->conn->events); rxrpc_queue_conn(call->conn, rxrpc_conn_queue_challenge); - break; - - case RXRPC_CONN_SERVICE: - write_lock(&call->state_lock); - if (call->state < RXRPC_CALL_COMPLETE) - call->state = RXRPC_CALL_SERVER_RECV_REQUEST; - write_unlock(&call->state_lock); - break; - - case RXRPC_CONN_REMOTELY_ABORTED: - rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, - conn->abort_code, conn->error); - break; - case RXRPC_CONN_LOCALLY_ABORTED: - rxrpc_abort_call("CON", call, sp->hdr.seq, - conn->abort_code, conn->error); - break; - default: - BUG(); } spin_unlock(&conn->state_lock); - spin_unlock(&rx->incoming_lock); - rxrpc_send_ping(call, skb); + spin_unlock(&rx->incoming_lock); + rcu_read_unlock(); if (hlist_unhashed(&call->error_link)) { spin_lock(&call->peer->lock); @@ -435,12 +411,24 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, } _leave(" = %p{%d}", call, call->debug_id); - return call; - + rxrpc_input_call_event(call, skb); + rxrpc_put_call(call, rxrpc_call_put_input); + return true; + +unsupported_service: + trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RX_INVALID_OPERATION, EOPNOTSUPP); + skb->priority = RX_INVALID_OPERATION; + goto reject; no_call: spin_unlock(&rx->incoming_lock); - _leave(" = NULL [%u]", skb->mark); - return NULL; +reject: + rcu_read_unlock(); + _leave(" = f [%u]", skb->mark); + return false; +discard: + rcu_read_unlock(); + return true; } /* diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index c9f835292f7b..9db62fa55c62 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -74,11 +74,6 @@ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) return; - if (ack_reason == RXRPC_ACK_DELAY && - test_and_set_bit(RXRPC_CALL_DELAY_ACK_PENDING, &call->flags)) { - trace_rxrpc_drop_ack(call, why, ack_reason, serial, false); - return; - } rxrpc_inc_stat(call->rxnet, stat_tx_acks[ack_reason]); @@ -111,12 +106,7 @@ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, spin_unlock_bh(&local->ack_tx_lock); trace_rxrpc_send_ack(call, why, ack_reason, serial); - if (!rcu_read_lock_held()) { - rxrpc_transmit_ack_packets(call->peer->local); - } else { - rxrpc_get_local(local, rxrpc_local_get_queue); - rxrpc_queue_local(local); - } + rxrpc_wake_up_io_thread(local); } /* @@ -130,11 +120,10 @@ static void rxrpc_congestion_timeout(struct rxrpc_call *call) /* * Perform retransmission of NAK'd and unack'd packets. */ -static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) +void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb) { struct rxrpc_ackpacket *ack = NULL; struct rxrpc_txbuf *txb; - struct sk_buff *ack_skb = NULL; unsigned long resend_at; rxrpc_seq_t transmitted = READ_ONCE(call->tx_transmitted); ktime_t now, max_age, oldest, ack_ts; @@ -148,32 +137,21 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) max_age = ktime_sub_us(now, jiffies_to_usecs(call->peer->rto_j)); oldest = now; - /* See if there's an ACK saved with a soft-ACK table in it. */ - if (call->acks_soft_tbl) { - spin_lock_bh(&call->acks_ack_lock); - ack_skb = call->acks_soft_tbl; - if (ack_skb) { - rxrpc_get_skb(ack_skb, rxrpc_skb_get_ack); - ack = (void *)ack_skb->data + sizeof(struct rxrpc_wire_header); - } - spin_unlock_bh(&call->acks_ack_lock); - } - if (list_empty(&call->tx_buffer)) goto no_resend; - spin_lock(&call->tx_lock); - if (list_empty(&call->tx_buffer)) goto no_further_resend; - trace_rxrpc_resend(call); + trace_rxrpc_resend(call, ack_skb); txb = list_first_entry(&call->tx_buffer, struct rxrpc_txbuf, call_link); /* Scan the soft ACK table without dropping the lock and resend any * explicitly NAK'd packets. */ - if (ack) { + if (ack_skb) { + ack = (void *)ack_skb->data + sizeof(struct rxrpc_wire_header); + for (i = 0; i < ack->nAcks; i++) { rxrpc_seq_t seq; @@ -197,7 +175,6 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) rxrpc_see_txbuf(txb, rxrpc_txbuf_see_unacked); if (list_empty(&txb->tx_link)) { - rxrpc_get_txbuf(txb, rxrpc_txbuf_get_retrans); list_add_tail(&txb->tx_link, &retrans_queue); set_bit(RXRPC_TXBUF_RESENT, &txb->flags); } @@ -241,7 +218,6 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) do_resend: unacked = true; if (list_empty(&txb->tx_link)) { - rxrpc_get_txbuf(txb, rxrpc_txbuf_get_retrans); list_add_tail(&txb->tx_link, &retrans_queue); set_bit(RXRPC_TXBUF_RESENT, &txb->flags); rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans); @@ -249,10 +225,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) } no_further_resend: - spin_unlock(&call->tx_lock); no_resend: - rxrpc_free_skb(ack_skb, rxrpc_skb_put_ack); - resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(now, oldest))); resend_at += jiffies + rxrpc_get_rto_backoff(call->peer, !list_empty(&retrans_queue)); @@ -266,7 +239,7 @@ no_resend: * retransmitting data. */ if (list_empty(&retrans_queue)) { - rxrpc_reduce_call_timer(call, resend_at, now_j, + rxrpc_reduce_call_timer(call, resend_at, jiffies, rxrpc_timer_set_for_resend); ack_ts = ktime_sub(now, call->acks_latest_ts); if (ktime_to_us(ack_ts) < (call->peer->srtt_us >> 3)) @@ -276,15 +249,11 @@ no_resend: goto out; } + /* Retransmit the queue */ while ((txb = list_first_entry_or_null(&retrans_queue, struct rxrpc_txbuf, tx_link))) { list_del_init(&txb->tx_link); - rxrpc_send_data_packet(call, txb); - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_trans); - - trace_rxrpc_retransmit(call, txb->seq, - ktime_to_ns(ktime_sub(txb->last_sent, - max_age))); + rxrpc_transmit_one(call, txb); } out: @@ -357,16 +326,27 @@ static void rxrpc_transmit_some_data(struct rxrpc_call *call) } } +/* + * Ping the other end to fill our RTT cache and to retrieve the rwind + * and MTU parameters. + */ +static void rxrpc_send_initial_ping(struct rxrpc_call *call) +{ + if (call->peer->rtt_count < 3 || + ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), + ktime_get_real())) + rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, + rxrpc_propose_ack_ping_for_params); +} + /* * Handle retransmission and deferred ACK/abort generation. */ -void rxrpc_process_call(struct work_struct *work) +void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) { - struct rxrpc_call *call = - container_of(work, struct rxrpc_call, processor); unsigned long now, next, t; - unsigned int iterations = 0; rxrpc_serial_t ackr_serial; + bool resend = false, expired = false; rxrpc_see_call(call, rxrpc_call_see_input); @@ -374,47 +354,31 @@ void rxrpc_process_call(struct work_struct *work) _enter("{%d,%s,%lx}", call->debug_id, rxrpc_call_states[call->state], call->events); -recheck_state: - if (call->acks_hard_ack != call->tx_bottom) - rxrpc_shrink_call_tx_buffer(call); - - /* Limit the number of times we do this before returning to the manager */ - if (!rxrpc_tx_window_has_space(call) || - list_empty(&call->tx_sendmsg)) { - iterations++; - if (iterations > 5) - goto requeue; - } - - if (test_and_clear_bit(RXRPC_CALL_EV_ABORT, &call->events)) { - rxrpc_send_abort_packet(call); - goto recheck_state; - } + if (call->state == RXRPC_CALL_COMPLETE) + goto out; - if (call->state == RXRPC_CALL_COMPLETE) { - del_timer_sync(&call->timer); + if (skb && skb->mark == RXRPC_SKB_MARK_ERROR) goto out; - } - /* Work out if any timeouts tripped */ + /* If we see our async-event poke, check for timeout trippage. */ now = jiffies; t = READ_ONCE(call->expect_rx_by); if (time_after_eq(now, t)) { trace_rxrpc_timer(call, rxrpc_timer_exp_normal, now); - set_bit(RXRPC_CALL_EV_EXPIRED, &call->events); + expired = true; } t = READ_ONCE(call->expect_req_by); if (call->state == RXRPC_CALL_SERVER_RECV_REQUEST && time_after_eq(now, t)) { trace_rxrpc_timer(call, rxrpc_timer_exp_idle, now); - set_bit(RXRPC_CALL_EV_EXPIRED, &call->events); + expired = true; } t = READ_ONCE(call->expect_term_by); if (time_after_eq(now, t)) { trace_rxrpc_timer(call, rxrpc_timer_exp_hard, now); - set_bit(RXRPC_CALL_EV_EXPIRED, &call->events); + expired = true; } t = READ_ONCE(call->delay_ack_at); @@ -453,13 +417,19 @@ recheck_state: if (time_after_eq(now, t)) { trace_rxrpc_timer(call, rxrpc_timer_exp_resend, now); cmpxchg(&call->resend_at, t, now + MAX_JIFFY_OFFSET); - set_bit(RXRPC_CALL_EV_RESEND, &call->events); + resend = true; } + if (skb) + rxrpc_input_call_packet(call, skb); + rxrpc_transmit_some_data(call); + if (test_and_clear_bit(RXRPC_CALL_EV_INITIAL_PING, &call->events)) + rxrpc_send_initial_ping(call); + /* Process events */ - if (test_and_clear_bit(RXRPC_CALL_EV_EXPIRED, &call->events)) { + if (expired) { if (test_bit(RXRPC_CALL_RX_HEARD, &call->flags) && (int)call->conn->hi_serial - (int)call->rx_serial > 0) { trace_rxrpc_call_reset(call); @@ -467,51 +437,50 @@ recheck_state: } else { rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, -ETIME); } - set_bit(RXRPC_CALL_EV_ABORT, &call->events); - goto recheck_state; + rxrpc_send_abort_packet(call); + goto out; } - if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) { - call->acks_lost_top = call->tx_top; + if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_lost_ack); - } - if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events) && - call->state != RXRPC_CALL_CLIENT_RECV_REPLY) { - rxrpc_resend(call, now); - goto recheck_state; - } + if (resend && call->state != RXRPC_CALL_CLIENT_RECV_REPLY) + rxrpc_resend(call, NULL); + + if (test_and_clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags)) + rxrpc_send_ACK(call, RXRPC_ACK_IDLE, 0, + rxrpc_propose_ack_rx_idle); + + if (atomic_read(&call->ackr_nr_unacked) > 2) + rxrpc_send_ACK(call, RXRPC_ACK_IDLE, 0, + rxrpc_propose_ack_input_data); /* Make sure the timer is restarted */ - next = call->expect_rx_by; + if (call->state != RXRPC_CALL_COMPLETE) { + next = call->expect_rx_by; #define set(T) { t = READ_ONCE(T); if (time_before(t, next)) next = t; } - set(call->expect_req_by); - set(call->expect_term_by); - set(call->delay_ack_at); - set(call->ack_lost_at); - set(call->resend_at); - set(call->keepalive_at); - set(call->ping_at); - - now = jiffies; - if (time_after_eq(now, next)) - goto recheck_state; + set(call->expect_req_by); + set(call->expect_term_by); + set(call->delay_ack_at); + set(call->ack_lost_at); + set(call->resend_at); + set(call->keepalive_at); + set(call->ping_at); - rxrpc_reduce_call_timer(call, next, now, rxrpc_timer_restart); + now = jiffies; + if (time_after_eq(now, next)) + rxrpc_poke_call(call, rxrpc_call_poke_timer_now); - /* other events may have been raised since we started checking */ - if (call->events) - goto requeue; + rxrpc_reduce_call_timer(call, next, now, rxrpc_timer_restart); + } out: + if (call->state == RXRPC_CALL_COMPLETE) + del_timer_sync(&call->timer); + if (call->acks_hard_ack != call->tx_bottom) + rxrpc_shrink_call_tx_buffer(call); _leave(""); - return; - -requeue: - if (call->state < RXRPC_CALL_COMPLETE) - rxrpc_queue_call(call, rxrpc_call_queue_requeue); - goto out; } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 7570b4e67bc5..d441a715d988 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -71,7 +71,7 @@ static void rxrpc_call_timer_expired(struct timer_list *t) if (call->state < RXRPC_CALL_COMPLETE) { trace_rxrpc_timer_expired(call, jiffies); - rxrpc_queue_call(call, rxrpc_call_queue_timer); + rxrpc_poke_call(call, rxrpc_call_poke_timer); } } @@ -148,7 +148,6 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, &rxrpc_call_user_mutex_lock_class_key); timer_setup(&call->timer, rxrpc_call_timer_expired, 0); - INIT_WORK(&call->processor, rxrpc_process_call); INIT_WORK(&call->destroyer, rxrpc_destroy_call); INIT_LIST_HEAD(&call->link); INIT_LIST_HEAD(&call->chan_wait_link); @@ -163,7 +162,6 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, init_waitqueue_head(&call->waitq); spin_lock_init(&call->notify_lock); spin_lock_init(&call->tx_lock); - spin_lock_init(&call->acks_ack_lock); rwlock_init(&call->state_lock); refcount_set(&call->ref, 1); call->debug_id = debug_id; @@ -252,6 +250,7 @@ static void rxrpc_start_call_timer(struct rxrpc_call *call) call->ack_lost_at = j; call->resend_at = j; call->ping_at = j; + call->keepalive_at = j; call->expect_rx_by = j; call->expect_req_by = j; call->expect_term_by = j; @@ -430,6 +429,29 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, call->state = RXRPC_CALL_SERVER_SECURING; call->cong_tstamp = skb->tstamp; + spin_lock(&conn->state_lock); + + switch (conn->state) { + case RXRPC_CONN_SERVICE_UNSECURED: + case RXRPC_CONN_SERVICE_CHALLENGING: + call->state = RXRPC_CALL_SERVER_SECURING; + break; + case RXRPC_CONN_SERVICE: + call->state = RXRPC_CALL_SERVER_RECV_REQUEST; + break; + + case RXRPC_CONN_REMOTELY_ABORTED: + __rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, + conn->abort_code, conn->error); + break; + case RXRPC_CONN_LOCALLY_ABORTED: + __rxrpc_abort_call("CON", call, 1, + conn->abort_code, conn->error); + break; + default: + BUG(); + } + /* Set the channel for this call. We don't get channel_lock as we're * only defending against the data_ready handler (which we're called * from) and the RESPONSE packet parser (which is only really @@ -440,6 +462,7 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, conn->channels[chan].call_counter = call->call_id; conn->channels[chan].call_id = call->call_id; rcu_assign_pointer(conn->channels[chan].call, call); + spin_unlock(&conn->state_lock); spin_lock(&conn->peer->lock); hlist_add_head(&call->error_link, &conn->peer->error_targets); @@ -449,15 +472,6 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, _leave(""); } -/* - * Queue a call's work processor. - */ -void rxrpc_queue_call(struct rxrpc_call *call, enum rxrpc_call_trace why) -{ - if (rxrpc_queue_work(&call->processor)) - trace_rxrpc_call(call->debug_id, refcount_read(&call->ref), 0, why); -} - /* * Note the re-emergence of a call. */ @@ -470,14 +484,15 @@ void rxrpc_see_call(struct rxrpc_call *call, enum rxrpc_call_trace why) } } -bool rxrpc_try_get_call(struct rxrpc_call *call, enum rxrpc_call_trace why) +struct rxrpc_call *rxrpc_try_get_call(struct rxrpc_call *call, + enum rxrpc_call_trace why) { int r; - if (!__refcount_inc_not_zero(&call->ref, &r)) - return false; + if (!call || !__refcount_inc_not_zero(&call->ref, &r)) + return NULL; trace_rxrpc_call(call->debug_id, r + 1, 0, why); - return true; + return call; } /* @@ -637,8 +652,6 @@ static void rxrpc_destroy_call(struct work_struct *work) struct rxrpc_call *call = container_of(work, struct rxrpc_call, destroyer); struct rxrpc_txbuf *txb; - del_timer_sync(&call->timer); - cancel_work_sync(&call->processor); /* The processor may restart the timer */ del_timer_sync(&call->timer); rxrpc_cleanup_ring(call); @@ -652,8 +665,8 @@ static void rxrpc_destroy_call(struct work_struct *work) list_del(&txb->call_link); rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned); } + rxrpc_put_txbuf(call->tx_pending, rxrpc_txbuf_put_cleaned); - rxrpc_free_skb(call->acks_soft_tbl, rxrpc_skb_put_ack); rxrpc_put_connection(call->conn, rxrpc_conn_put_call); rxrpc_put_peer(call->peer, rxrpc_peer_put_call); rxrpc_put_local(call->local, rxrpc_local_put_call); @@ -670,10 +683,9 @@ void rxrpc_cleanup_call(struct rxrpc_call *call) ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags)); - del_timer_sync(&call->timer); - cancel_work(&call->processor); + del_timer(&call->timer); - if (rcu_read_lock_held() || work_busy(&call->processor)) + if (rcu_read_lock_held()) /* Can't use the rxrpc workqueue as we need to cancel/flush * something that may be running/waiting there. */ diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 23a74e35052d..643a56322224 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -479,3 +479,63 @@ void rxrpc_process_connection(struct work_struct *work) rxrpc_unuse_local(conn->local, rxrpc_local_unuse_conn_work); } } + +/* + * post connection-level events to the connection + * - this includes challenges, responses, some aborts and call terminal packet + * retransmission. + */ +static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + _enter("%p,%p", conn, skb); + + rxrpc_get_skb(skb, rxrpc_skb_get_conn_work); + skb_queue_tail(&conn->rx_queue, skb); + rxrpc_queue_conn(conn, rxrpc_conn_queue_rx_work); +} + +/* + * Input a connection-level packet. + */ +int rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { + _leave(" = -ECONNABORTED [%u]", conn->state); + return -ECONNABORTED; + } + + _enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, sp->hdr.serial); + + switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_DATA: + case RXRPC_PACKET_TYPE_ACK: + rxrpc_conn_retransmit_call(conn, skb, + sp->hdr.cid & RXRPC_CHANNELMASK); + return 0; + + case RXRPC_PACKET_TYPE_BUSY: + /* Just ignore BUSY packets for now. */ + return 0; + + case RXRPC_PACKET_TYPE_ABORT: + conn->error = -ECONNABORTED; + conn->abort_code = skb->priority; + conn->state = RXRPC_CONN_REMOTELY_ABORTED; + set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); + rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, sp->hdr.serial); + return -ECONNABORTED; + + case RXRPC_PACKET_TYPE_CHALLENGE: + case RXRPC_PACKET_TYPE_RESPONSE: + rxrpc_post_packet_to_conn(conn, skb); + return 0; + + default: + trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, + tracepoint_string("bad_conn_pkt")); + return -EPROTO; + } +} diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 98e49646ca1d..3c8f83dacb2b 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -72,76 +72,55 @@ struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *rxnet, * * The caller must be holding the RCU read lock. */ -struct rxrpc_connection *rxrpc_find_connection_rcu(struct rxrpc_local *local, - struct sockaddr_rxrpc *srx, - struct sk_buff *skb, - struct rxrpc_peer **_peer) +struct rxrpc_connection *rxrpc_find_client_connection_rcu(struct rxrpc_local *local, + struct sockaddr_rxrpc *srx, + struct sk_buff *skb) { struct rxrpc_connection *conn; - struct rxrpc_conn_proto k; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_peer *peer; _enter(",%x", sp->hdr.cid & RXRPC_CIDMASK); - k.epoch = sp->hdr.epoch; - k.cid = sp->hdr.cid & RXRPC_CIDMASK; - - if (rxrpc_to_server(sp)) { - /* We need to look up service connections by the full protocol - * parameter set. We look up the peer first as an intermediate - * step and then the connection from the peer's tree. - */ - peer = rxrpc_lookup_peer_rcu(local, srx); - if (!peer) - goto not_found; - *_peer = peer; - conn = rxrpc_find_service_conn_rcu(peer, skb); - if (!conn || refcount_read(&conn->ref) == 0) - goto not_found; - _leave(" = %p", conn); - return conn; - } else { - /* Look up client connections by connection ID alone as their - * IDs are unique for this machine. - */ - conn = idr_find(&rxrpc_client_conn_ids, sp->hdr.cid >> RXRPC_CIDSHIFT); - if (!conn || refcount_read(&conn->ref) == 0) { - _debug("no conn"); - goto not_found; - } + /* Look up client connections by connection ID alone as their IDs are + * unique for this machine. + */ + conn = idr_find(&rxrpc_client_conn_ids, sp->hdr.cid >> RXRPC_CIDSHIFT); + if (!conn || refcount_read(&conn->ref) == 0) { + _debug("no conn"); + goto not_found; + } - if (conn->proto.epoch != k.epoch || - conn->local != local) + if (conn->proto.epoch != sp->hdr.epoch || + conn->local != local) + goto not_found; + + peer = conn->peer; + switch (srx->transport.family) { + case AF_INET: + if (peer->srx.transport.sin.sin_port != + srx->transport.sin.sin_port || + peer->srx.transport.sin.sin_addr.s_addr != + srx->transport.sin.sin_addr.s_addr) goto not_found; - - peer = conn->peer; - switch (srx->transport.family) { - case AF_INET: - if (peer->srx.transport.sin.sin_port != - srx->transport.sin.sin_port || - peer->srx.transport.sin.sin_addr.s_addr != - srx->transport.sin.sin_addr.s_addr) - goto not_found; - break; + break; #ifdef CONFIG_AF_RXRPC_IPV6 - case AF_INET6: - if (peer->srx.transport.sin6.sin6_port != - srx->transport.sin6.sin6_port || - memcmp(&peer->srx.transport.sin6.sin6_addr, - &srx->transport.sin6.sin6_addr, - sizeof(struct in6_addr)) != 0) - goto not_found; - break; + case AF_INET6: + if (peer->srx.transport.sin6.sin6_port != + srx->transport.sin6.sin6_port || + memcmp(&peer->srx.transport.sin6.sin6_addr, + &srx->transport.sin6.sin6_addr, + sizeof(struct in6_addr)) != 0) + goto not_found; + break; #endif - default: - BUG(); - } - - _leave(" = %p", conn); - return conn; + default: + BUG(); } + _leave(" = %p", conn); + return conn; + not_found: _leave(" = NULL"); return NULL; diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 01d32f817a7a..7ae7046f0b03 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -12,10 +12,8 @@ static void rxrpc_proto_abort(const char *why, struct rxrpc_call *call, rxrpc_seq_t seq) { - if (rxrpc_abort_call(why, call, seq, RX_PROTOCOL_ERROR, -EBADMSG)) { - set_bit(RXRPC_CALL_EV_ABORT, &call->events); - rxrpc_queue_call(call, rxrpc_call_queue_abort); - } + if (rxrpc_abort_call(why, call, seq, RX_PROTOCOL_ERROR, -EBADMSG)) + rxrpc_send_abort_packet(call); } /* @@ -174,8 +172,8 @@ out_no_clear_ca: call->cong_cwnd = cwnd; call->cong_cumul_acks = cumulative_acks; trace_rxrpc_congest(call, summary, acked_serial, change); - if (resend && !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) - rxrpc_queue_call(call, rxrpc_call_queue_resend); + if (resend) + rxrpc_resend(call, skb); return; packet_loss_detected: @@ -398,6 +396,8 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, /* Send an immediate ACK if we fill in a hole */ else if (!skb_queue_empty(&call->rx_oos_queue)) ack_reason = RXRPC_ACK_DELAY; + else + atomic_inc_return(&call->ackr_nr_unacked); window++; if (after(window, wtop)) @@ -473,14 +473,6 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, } send_ack: - if (ack_reason < 0 && - atomic_inc_return(&call->ackr_nr_unacked) > 2 && - test_and_set_bit(RXRPC_CALL_IDLE_ACK_PENDING, &call->flags)) { - ack_reason = RXRPC_ACK_IDLE; - } else if (ack_reason >= 0) { - set_bit(RXRPC_CALL_IDLE_ACK_PENDING, &call->flags); - } - if (ack_reason >= 0) rxrpc_send_ACK(call, ack_reason, serial, rxrpc_propose_ack_input_data); @@ -510,7 +502,7 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb &jhdr, sizeof(jhdr)) < 0) goto protocol_error; - jskb = skb_clone(skb, GFP_ATOMIC); + jskb = skb_clone(skb, GFP_NOFS); if (!jskb) { kdebug("couldn't clone"); return false; @@ -562,24 +554,6 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) if (state >= RXRPC_CALL_COMPLETE) return; - /* Unshare the packet so that it can be modified for in-place - * decryption. - */ - if (sp->hdr.securityIndex != 0) { - struct sk_buff *nskb = skb_unshare(skb, GFP_ATOMIC); - if (!nskb) { - rxrpc_eaten_skb(skb, rxrpc_skb_eaten_by_unshare_nomem); - return; - } - - if (nskb != skb) { - rxrpc_eaten_skb(skb, rxrpc_skb_eaten_by_unshare); - skb = nskb; - rxrpc_new_skb(skb, rxrpc_skb_new_unshared); - sp = rxrpc_skb(skb); - } - } - if (state == RXRPC_CALL_SERVER_RECV_REQUEST) { unsigned long timo = READ_ONCE(call->next_req_timo); unsigned long now, expect_req_by; @@ -599,15 +573,15 @@ static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) if ((state == RXRPC_CALL_CLIENT_SEND_REQUEST || state == RXRPC_CALL_CLIENT_AWAIT_REPLY) && !rxrpc_receiving_reply(call)) - goto out; + goto out_notify; if (!rxrpc_input_split_jumbo(call, skb)) { rxrpc_proto_abort("VLD", call, sp->hdr.seq); - goto out; + goto out_notify; } skb = NULL; -out: +out_notify: trace_rxrpc_notify_socket(call->debug_id, serial); rxrpc_notify_socket(call); _leave(" [queued]"); @@ -667,32 +641,6 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call, trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_lost, 9, 0, acked_serial, 0, 0); } -/* - * Process the response to a ping that we sent to find out if we lost an ACK. - * - * If we got back a ping response that indicates a lower tx_top than what we - * had at the time of the ping transmission, we adjudge all the DATA packets - * sent between the response tx_top and the ping-time tx_top to have been lost. - */ -static void rxrpc_input_check_for_lost_ack(struct rxrpc_call *call) -{ - if (after(call->acks_lost_top, call->acks_prev_seq) && - !test_and_set_bit(RXRPC_CALL_EV_RESEND, &call->events)) - rxrpc_queue_call(call, rxrpc_call_queue_resend); -} - -/* - * Process a ping response. - */ -static void rxrpc_input_ping_response(struct rxrpc_call *call, - ktime_t resp_time, - rxrpc_serial_t acked_serial, - rxrpc_serial_t ack_serial) -{ - if (acked_serial == call->acks_lost_ping) - rxrpc_input_check_for_lost_ack(call); -} - /* * Process the extra information that may be appended to an ACK packet */ @@ -801,7 +749,6 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) struct rxrpc_ackpacket ack; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_ackinfo info; - struct sk_buff *skb_old = NULL; rxrpc_serial_t ack_serial, acked_serial; rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt; int nr_acks, offset, ioffset; @@ -809,10 +756,8 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) _enter(""); offset = sizeof(struct rxrpc_wire_header); - if (skb_copy_bits(skb, offset, &ack, sizeof(ack)) < 0) { - rxrpc_proto_abort("XAK", call, 0); - goto out; - } + if (skb_copy_bits(skb, offset, &ack, sizeof(ack)) < 0) + return rxrpc_proto_abort("XAK", call, 0); offset += sizeof(ack); ack_serial = sp->hdr.serial; @@ -863,7 +808,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_is_client_call(call)) { rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, 0, -ENETRESET); - goto out; + return; } /* If we get an OUT_OF_SEQUENCE ACK from the server, that can also @@ -877,7 +822,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_is_client_call(call)) { rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, 0, -ENETRESET); - goto out; + return; } /* Discard any out-of-order or duplicate ACKs (outside lock). */ @@ -885,39 +830,25 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) trace_rxrpc_rx_discard_ack(call->debug_id, ack_serial, first_soft_ack, call->acks_first_seq, prev_pkt, call->acks_prev_seq); - goto out; + return; } info.rxMTU = 0; ioffset = offset + nr_acks + 3; if (skb->len >= ioffset + sizeof(info) && - skb_copy_bits(skb, ioffset, &info, sizeof(info)) < 0) { - rxrpc_proto_abort("XAI", call, 0); - goto out; - } + skb_copy_bits(skb, ioffset, &info, sizeof(info)) < 0) + return rxrpc_proto_abort("XAI", call, 0); if (nr_acks > 0) skb_condense(skb); - /* Discard any out-of-order or duplicate ACKs (inside lock). */ - if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) { - trace_rxrpc_rx_discard_ack(call->debug_id, ack_serial, - first_soft_ack, call->acks_first_seq, - prev_pkt, call->acks_prev_seq); - goto out; - } call->acks_latest_ts = skb->tstamp; - call->acks_first_seq = first_soft_ack; call->acks_prev_seq = prev_pkt; switch (ack.reason) { case RXRPC_ACK_PING: break; - case RXRPC_ACK_PING_RESPONSE: - rxrpc_input_ping_response(call, skb->tstamp, acked_serial, - ack_serial); - fallthrough; default: if (after(acked_serial, call->acks_highest_serial)) call->acks_highest_serial = acked_serial; @@ -928,10 +859,8 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (info.rxMTU) rxrpc_input_ackinfo(call, skb, &info); - if (first_soft_ack == 0) { - rxrpc_proto_abort("AK0", call, 0); - goto out; - } + if (first_soft_ack == 0) + return rxrpc_proto_abort("AK0", call, 0); /* Ignore ACKs unless we are or have just been transmitting. */ switch (READ_ONCE(call->state)) { @@ -941,45 +870,27 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) case RXRPC_CALL_SERVER_AWAIT_ACK: break; default: - goto out; + return; } if (before(hard_ack, call->acks_hard_ack) || - after(hard_ack, call->tx_top)) { - rxrpc_proto_abort("AKW", call, 0); - goto out; - } - if (nr_acks > call->tx_top - hard_ack) { - rxrpc_proto_abort("AKN", call, 0); - goto out; - } + after(hard_ack, call->tx_top)) + return rxrpc_proto_abort("AKW", call, 0); + if (nr_acks > call->tx_top - hard_ack) + return rxrpc_proto_abort("AKN", call, 0); if (after(hard_ack, call->acks_hard_ack)) { if (rxrpc_rotate_tx_window(call, hard_ack, &summary)) { rxrpc_end_tx_phase(call, false, "ETA"); - goto out; + return; } } if (nr_acks > 0) { - if (offset > (int)skb->len - nr_acks) { - rxrpc_proto_abort("XSA", call, 0); - goto out; - } - - rxrpc_get_skb(skb, rxrpc_skb_get_ack); - spin_lock(&call->acks_ack_lock); - skb_old = call->acks_soft_tbl; - call->acks_soft_tbl = skb; - spin_unlock(&call->acks_ack_lock); - + if (offset > (int)skb->len - nr_acks) + return rxrpc_proto_abort("XSA", call, 0); rxrpc_input_soft_acks(call, skb->data + offset, first_soft_ack, nr_acks, &summary); - } else if (call->acks_soft_tbl) { - spin_lock(&call->acks_ack_lock); - skb_old = call->acks_soft_tbl; - call->acks_soft_tbl = NULL; - spin_unlock(&call->acks_ack_lock); } if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) && @@ -989,8 +900,6 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_propose_ack_ping_for_lost_reply); rxrpc_congestion_management(call, skb, &summary, acked_serial); -out: - rxrpc_free_skb(skb_old, rxrpc_skb_put_ack); } /* @@ -1020,13 +929,20 @@ static void rxrpc_input_abort(struct rxrpc_call *call, struct sk_buff *skb) /* * Process an incoming call packet. */ -void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) +void rxrpc_input_call_packet(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); unsigned long timo; _enter("%p,%p", call, skb); + if (sp->hdr.serviceId != call->dest_srx.srx_service) + call->dest_srx.srx_service = sp->hdr.serviceId; + if ((int)sp->hdr.serial - (int)call->rx_serial > 0) + call->rx_serial = sp->hdr.serial; + if (!test_bit(RXRPC_CALL_RX_HEARD, &call->flags)) + set_bit(RXRPC_CALL_RX_HEARD, &call->flags); + timo = READ_ONCE(call->next_rx_timo); if (timo) { unsigned long now = jiffies, expect_rx_by; @@ -1072,9 +988,10 @@ void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) * * TODO: If callNumber > call_id + 1, renegotiate security. */ -void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn, - struct rxrpc_call *call) +void rxrpc_implicit_end_call(struct rxrpc_call *call, struct sk_buff *skb) { + struct rxrpc_connection *conn = call->conn; + switch (READ_ONCE(call->state)) { case RXRPC_CALL_SERVER_AWAIT_ACK: rxrpc_call_completed(call); @@ -1082,14 +999,14 @@ void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn, case RXRPC_CALL_COMPLETE: break; default: - if (rxrpc_abort_call("IMP", call, 0, RX_CALL_DEAD, -ESHUTDOWN)) { - set_bit(RXRPC_CALL_EV_ABORT, &call->events); - rxrpc_queue_call(call, rxrpc_call_queue_abort); - } + if (rxrpc_abort_call("IMP", call, 0, RX_CALL_DEAD, -ESHUTDOWN)) + rxrpc_send_abort_packet(call); trace_rxrpc_improper_term(call); break; } + rxrpc_input_call_event(call, skb); + spin_lock(&conn->bundle->channel_lock); __rxrpc_disconnect_call(conn, call); spin_unlock(&conn->bundle->channel_lock); diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index bc65d83fab88..19aa315eddf5 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -9,6 +9,10 @@ #include "ar-internal.h" +static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, + struct sockaddr_rxrpc *peer_srx, + struct sk_buff *skb); + /* * handle data received on the local endpoint * - may be called in interrupt context @@ -63,45 +67,19 @@ void rxrpc_error_report(struct sock *sk) } /* - * post connection-level events to the connection - * - this includes challenges, responses, some aborts and call terminal packet - * retransmission. + * Process event packets targeted at a local endpoint. */ -static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, - struct sk_buff *skb) +static void rxrpc_input_version(struct rxrpc_local *local, struct sk_buff *skb) { - _enter("%p,%p", conn, skb); - - rxrpc_get_skb(skb, rxrpc_skb_get_conn_work); - skb_queue_tail(&conn->rx_queue, skb); - rxrpc_queue_conn(conn, rxrpc_conn_queue_rx_work); -} + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + char v; -/* - * post endpoint-level events to the local endpoint - * - this includes debug and version messages - */ -static void rxrpc_post_packet_to_local(struct rxrpc_local *local, - struct sk_buff *skb) -{ - _enter("%p,%p", local, skb); + _enter(""); - if (rxrpc_get_local_maybe(local, rxrpc_local_get_queue)) { - rxrpc_get_skb(skb, rxrpc_skb_get_local_work); - skb_queue_tail(&local->event_queue, skb); - rxrpc_queue_local(local); - } -} - -/* - * put a packet up for transport-level abort - */ -static void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) -{ - if (rxrpc_get_local_maybe(local, rxrpc_local_get_queue)) { - rxrpc_get_skb(skb, rxrpc_skb_get_reject_work); - skb_queue_tail(&local->reject_queue, skb); - rxrpc_queue_local(local); + rxrpc_see_skb(skb, rxrpc_skb_see_version); + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), &v, 1) >= 0) { + if (v == 0) + rxrpc_send_version_request(local, &sp->hdr, skb); } } @@ -156,22 +134,13 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) { struct rxrpc_connection *conn; struct sockaddr_rxrpc peer_srx; - struct rxrpc_channel *chan; - struct rxrpc_call *call = NULL; struct rxrpc_skb_priv *sp; struct rxrpc_peer *peer = NULL; - struct rxrpc_sock *rx = NULL; struct sk_buff *skb = *_skb; - unsigned int channel; - - if (skb->tstamp == 0) - skb->tstamp = ktime_get_real(); + int ret = 0; skb_pull(skb, sizeof(struct udphdr)); - /* The UDP protocol already released all skb resources; - * we are free to add our own data there. - */ sp = rxrpc_skb(skb); /* dig out the RxRPC connection details */ @@ -186,15 +155,13 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) } } - if (skb->tstamp == 0) - skb->tstamp = ktime_get_real(); trace_rxrpc_rx_packet(sp); switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_VERSION: if (rxrpc_to_client(sp)) return 0; - rxrpc_post_packet_to_local(local, skb); + rxrpc_input_version(local, skb); return 0; case RXRPC_PACKET_TYPE_BUSY: @@ -259,7 +226,7 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) goto bad_message; if (WARN_ON_ONCE(rxrpc_extract_addr_from_skb(&peer_srx, skb) < 0)) - return 0; /* Unsupported address type - discard. */ + return true; /* Unsupported address type - discard. */ if (peer_srx.transport.family != local->srx.transport.family && (peer_srx.transport.family == AF_INET && @@ -267,171 +234,172 @@ static int rxrpc_input_packet(struct rxrpc_local *local, struct sk_buff **_skb) pr_warn_ratelimited("AF_RXRPC: Protocol mismatch %u not %u\n", peer_srx.transport.family, local->srx.transport.family); - return 0; /* Wrong address type - discard. */ + return true; /* Wrong address type - discard. */ + } + + if (rxrpc_to_client(sp)) { + rcu_read_lock(); + conn = rxrpc_find_client_connection_rcu(local, &peer_srx, skb); + conn = rxrpc_get_connection_maybe(conn, rxrpc_conn_get_call_input); + rcu_read_unlock(); + if (!conn) { + trace_rxrpc_abort(0, "NCC", sp->hdr.cid, + sp->hdr.callNumber, sp->hdr.seq, + RXKADINCONSISTENCY, EBADMSG); + goto protocol_error; + } + + ret = rxrpc_input_packet_on_conn(conn, &peer_srx, skb); + rxrpc_put_connection(conn, rxrpc_conn_put_call_input); + return ret; } + /* We need to look up service connections by the full protocol + * parameter set. We look up the peer first as an intermediate step + * and then the connection from the peer's tree. + */ rcu_read_lock(); - if (rxrpc_to_server(sp)) { - /* Weed out packets to services we're not offering. Packets - * that would begin a call are explicitly rejected and the rest - * are just discarded. - */ - rx = rcu_dereference(local->service); - if (!rx || (sp->hdr.serviceId != rx->srx.srx_service && - sp->hdr.serviceId != rx->second_service) - ) { - rcu_read_unlock(); - if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && - sp->hdr.seq == 1) - goto unsupported_service; - return 0; - } + peer = rxrpc_lookup_peer_rcu(local, &peer_srx); + if (!peer) { + rcu_read_unlock(); + return rxrpc_new_incoming_call(local, NULL, NULL, &peer_srx, skb); } - conn = rxrpc_find_connection_rcu(local, &peer_srx, skb, &peer); + conn = rxrpc_find_service_conn_rcu(peer, skb); + conn = rxrpc_get_connection_maybe(conn, rxrpc_conn_get_call_input); if (conn) { - if (sp->hdr.securityIndex != conn->security_ix) - goto wrong_security; + rcu_read_unlock(); + ret = rxrpc_input_packet_on_conn(conn, &peer_srx, skb); + rxrpc_put_connection(conn, rxrpc_conn_put_call_input); + return ret; + } - if (sp->hdr.serviceId != conn->service_id) { - int old_id; + peer = rxrpc_get_peer_maybe(peer, rxrpc_peer_get_input); + rcu_read_unlock(); - if (!test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags)) - goto reupgrade; - old_id = cmpxchg(&conn->service_id, conn->orig_service_id, - sp->hdr.serviceId); + ret = rxrpc_new_incoming_call(local, peer, NULL, &peer_srx, skb); + rxrpc_put_peer(peer, rxrpc_peer_put_input); + if (ret < 0) + goto reject_packet; + return 0; - if (old_id != conn->orig_service_id && - old_id != sp->hdr.serviceId) - goto reupgrade; - } +bad_message: + trace_rxrpc_abort(0, "BAD", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, + RX_PROTOCOL_ERROR, EBADMSG); +protocol_error: + skb->priority = RX_PROTOCOL_ERROR; + skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; +reject_packet: + rxrpc_reject_packet(local, skb); + return ret; +} - if (sp->hdr.callNumber == 0) { - /* Connection-level packet */ - _debug("CONN %p {%d}", conn, conn->debug_id); - conn = rxrpc_get_connection_maybe(conn, rxrpc_conn_get_conn_input); - rcu_read_unlock(); - if (conn) { - rxrpc_post_packet_to_conn(conn, skb); - rxrpc_put_connection(conn, rxrpc_conn_put_conn_input); - } - return 0; - } +/* + * Deal with a packet that's associated with an extant connection. + */ +static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, + struct sockaddr_rxrpc *peer_srx, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_channel *chan; + struct rxrpc_call *call = NULL; + unsigned int channel; - if ((int)sp->hdr.serial - (int)conn->hi_serial > 0) - conn->hi_serial = sp->hdr.serial; + if (sp->hdr.securityIndex != conn->security_ix) + goto wrong_security; - /* Call-bound packets are routed by connection channel. */ - channel = sp->hdr.cid & RXRPC_CHANNELMASK; - chan = &conn->channels[channel]; + if (sp->hdr.serviceId != conn->service_id) { + int old_id; - /* Ignore really old calls */ - if (sp->hdr.callNumber < chan->last_call) { - rcu_read_unlock(); - return 0; - } + if (!test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags)) + goto reupgrade; + old_id = cmpxchg(&conn->service_id, conn->orig_service_id, + sp->hdr.serviceId); - if (sp->hdr.callNumber == chan->last_call) { - if (chan->call || - sp->hdr.type == RXRPC_PACKET_TYPE_ABORT) { - rcu_read_unlock(); - return 0; - } + if (old_id != conn->orig_service_id && + old_id != sp->hdr.serviceId) + goto reupgrade; + } - /* For the previous service call, if completed - * successfully, we discard all further packets. - */ - if (rxrpc_conn_is_service(conn) && - chan->last_type == RXRPC_PACKET_TYPE_ACK) { - rcu_read_unlock(); - return 0; - } + if (after(sp->hdr.serial, conn->hi_serial)) + conn->hi_serial = sp->hdr.serial; - /* But otherwise we need to retransmit the final packet - * from data cached in the connection record. - */ - if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) - trace_rxrpc_rx_data(chan->call_debug_id, - sp->hdr.seq, - sp->hdr.serial, - sp->hdr.flags); - conn = rxrpc_get_connection_maybe(conn, rxrpc_conn_get_call_input); - rcu_read_unlock(); - if (conn) { - rxrpc_post_packet_to_conn(conn, skb); - rxrpc_put_connection(conn, rxrpc_conn_put_call_input); - } + /* It's a connection-level packet if the call number is 0. */ + if (sp->hdr.callNumber == 0) + return rxrpc_input_conn_packet(conn, skb); + + /* Call-bound packets are routed by connection channel. */ + channel = sp->hdr.cid & RXRPC_CHANNELMASK; + chan = &conn->channels[channel]; + + /* Ignore really old calls */ + if (sp->hdr.callNumber < chan->last_call) + return 0; + + if (sp->hdr.callNumber == chan->last_call) { + if (chan->call || + sp->hdr.type == RXRPC_PACKET_TYPE_ABORT) return 0; - } - call = rcu_dereference(chan->call); + /* For the previous service call, if completed successfully, we + * discard all further packets. + */ + if (rxrpc_conn_is_service(conn) && + chan->last_type == RXRPC_PACKET_TYPE_ACK) + return 0; - if (sp->hdr.callNumber > chan->call_id) { - if (rxrpc_to_client(sp)) { - rcu_read_unlock(); - goto reject_packet; - } - if (call) { - rxrpc_input_implicit_end_call(conn, call); - chan->call = NULL; - call = NULL; - } - } + /* But otherwise we need to retransmit the final packet from + * data cached in the connection record. + */ + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA) + trace_rxrpc_rx_data(chan->call_debug_id, + sp->hdr.seq, + sp->hdr.serial, + sp->hdr.flags); + rxrpc_input_conn_packet(conn, skb); + return 0; + } - if (call && !rxrpc_try_get_call(call, rxrpc_call_get_input)) - call = NULL; + rcu_read_lock(); + call = rxrpc_try_get_call(rcu_dereference(chan->call), + rxrpc_call_get_input); + rcu_read_unlock(); + + if (sp->hdr.callNumber > chan->call_id) { + if (rxrpc_to_client(sp)) { + rxrpc_put_call(call, rxrpc_call_put_input); + goto reject_packet; + } if (call) { - if (sp->hdr.serviceId != call->dest_srx.srx_service) - call->dest_srx.srx_service = sp->hdr.serviceId; - if ((int)sp->hdr.serial - (int)call->rx_serial > 0) - call->rx_serial = sp->hdr.serial; - if (!test_bit(RXRPC_CALL_RX_HEARD, &call->flags)) - set_bit(RXRPC_CALL_RX_HEARD, &call->flags); + rxrpc_implicit_end_call(call, skb); + rxrpc_put_call(call, rxrpc_call_put_input); + call = NULL; } } if (!call) { - if (rxrpc_to_client(sp) || - sp->hdr.type != RXRPC_PACKET_TYPE_DATA) { - rcu_read_unlock(); + if (rxrpc_to_client(sp)) goto bad_message; - } - if (sp->hdr.seq != 1) { - rcu_read_unlock(); + if (rxrpc_new_incoming_call(conn->local, conn->peer, conn, + peer_srx, skb)) return 0; - } - call = rxrpc_new_incoming_call(local, rx, &peer_srx, skb); - if (!call) { - rcu_read_unlock(); - goto reject_packet; - } + goto reject_packet; } - rcu_read_unlock(); - - /* Process a call packet. */ rxrpc_input_call_event(call, skb); rxrpc_put_call(call, rxrpc_call_put_input); - trace_rxrpc_rx_done(0, 0); return 0; wrong_security: - rcu_read_unlock(); trace_rxrpc_abort(0, "SEC", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RXKADINCONSISTENCY, EBADMSG); skb->priority = RXKADINCONSISTENCY; goto post_abort; -unsupported_service: - trace_rxrpc_abort(0, "INV", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, - RX_INVALID_OPERATION, EOPNOTSUPP); - skb->priority = RX_INVALID_OPERATION; - goto post_abort; - reupgrade: - rcu_read_unlock(); trace_rxrpc_abort(0, "UPG", sp->hdr.cid, sp->hdr.callNumber, sp->hdr.seq, RX_PROTOCOL_ERROR, EBADMSG); goto protocol_error; @@ -444,7 +412,7 @@ protocol_error: post_abort: skb->mark = RXRPC_SKB_MARK_REJECT_ABORT; reject_packet: - rxrpc_reject_packet(local, skb); + rxrpc_reject_packet(conn->local, skb); return 0; } @@ -479,6 +447,11 @@ int rxrpc_io_thread(void *data) continue; } + if (!list_empty(&local->ack_tx_queue)) { + rxrpc_transmit_ack_packets(local); + continue; + } + /* Process received packets and errors. */ if ((skb = __skb_dequeue(&rx_queue))) { switch (skb->mark) { diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c index c344383a20b2..5e69ea6b233d 100644 --- a/net/rxrpc/local_event.c +++ b/net/rxrpc/local_event.c @@ -21,9 +21,9 @@ static const char rxrpc_version_string[65] = "linux-" UTS_RELEASE " AF_RXRPC"; /* * Reply to a version request */ -static void rxrpc_send_version_request(struct rxrpc_local *local, - struct rxrpc_host_header *hdr, - struct sk_buff *skb) +void rxrpc_send_version_request(struct rxrpc_local *local, + struct rxrpc_host_header *hdr, + struct sk_buff *skb) { struct rxrpc_wire_header whdr; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); @@ -73,40 +73,3 @@ static void rxrpc_send_version_request(struct rxrpc_local *local, _leave(""); } - -/* - * Process event packets targeted at a local endpoint. - */ -void rxrpc_process_local_events(struct rxrpc_local *local) -{ - struct sk_buff *skb; - char v; - - _enter(""); - - skb = skb_dequeue(&local->event_queue); - if (skb) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - rxrpc_see_skb(skb, rxrpc_skb_see_local_work); - _debug("{%d},{%u}", local->debug_id, sp->hdr.type); - - switch (sp->hdr.type) { - case RXRPC_PACKET_TYPE_VERSION: - if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), - &v, 1) < 0) - return; - if (v == 0) - rxrpc_send_version_request(local, &sp->hdr, skb); - break; - - default: - /* Just ignore anything we don't understand */ - break; - } - - rxrpc_free_skb(skb, rxrpc_skb_put_input); - } - - _leave(""); -} diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 03f491cc23ef..c73a5a1bc088 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -20,7 +20,6 @@ #include #include "ar-internal.h" -static void rxrpc_local_processor(struct work_struct *); static void rxrpc_local_rcu(struct rcu_head *); /* @@ -97,12 +96,9 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, atomic_set(&local->active_users, 1); local->rxnet = rxnet; INIT_HLIST_NODE(&local->link); - INIT_WORK(&local->processor, rxrpc_local_processor); INIT_LIST_HEAD(&local->ack_tx_queue); spin_lock_init(&local->ack_tx_lock); init_rwsem(&local->defrag_sem); - skb_queue_head_init(&local->reject_queue); - skb_queue_head_init(&local->event_queue); skb_queue_head_init(&local->rx_queue); INIT_LIST_HEAD(&local->call_attend_q); local->client_bundles = RB_ROOT; @@ -318,21 +314,6 @@ struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local, return NULL; } -/* - * Queue a local endpoint and pass the caller's reference to the work item. - */ -void rxrpc_queue_local(struct rxrpc_local *local) -{ - unsigned int debug_id = local->debug_id; - int r = refcount_read(&local->ref); - int u = atomic_read(&local->active_users); - - if (rxrpc_queue_work(&local->processor)) - trace_rxrpc_local(debug_id, rxrpc_local_queued, r, u); - else - rxrpc_put_local(local, rxrpc_local_put_already_queued); -} - /* * Drop a ref on a local endpoint. */ @@ -374,7 +355,7 @@ struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *local, /* * Cease using a local endpoint. Once the number of active users reaches 0, we - * start the closure of the transport in the work processor. + * start the closure of the transport in the I/O thread.. */ void rxrpc_unuse_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { @@ -416,52 +397,9 @@ void rxrpc_destroy_local(struct rxrpc_local *local) /* At this point, there should be no more packets coming in to the * local endpoint. */ - rxrpc_purge_queue(&local->reject_queue); - rxrpc_purge_queue(&local->event_queue); rxrpc_purge_queue(&local->rx_queue); } -/* - * Process events on an endpoint. The work item carries a ref which - * we must release. - */ -static void rxrpc_local_processor(struct work_struct *work) -{ - struct rxrpc_local *local = - container_of(work, struct rxrpc_local, processor); - bool again; - - if (local->dead) - return; - - rxrpc_see_local(local, rxrpc_local_processing); - - do { - again = false; - if (!__rxrpc_use_local(local, rxrpc_local_use_work)) - break; - - if (!list_empty(&local->ack_tx_queue)) { - rxrpc_transmit_ack_packets(local); - again = true; - } - - if (!skb_queue_empty(&local->reject_queue)) { - rxrpc_reject_packets(local); - again = true; - } - - if (!skb_queue_empty(&local->event_queue)) { - rxrpc_process_local_events(local); - again = true; - } - - __rxrpc_unuse_local(local, rxrpc_local_unuse_work); - } while (again); - - rxrpc_put_local(local, rxrpc_local_put_queue); -} - /* * Destroy a local endpoint after the RCU grace period expires. */ @@ -469,13 +407,8 @@ static void rxrpc_local_rcu(struct rcu_head *rcu) { struct rxrpc_local *local = container_of(rcu, struct rxrpc_local, rcu); - _enter("%d", local->debug_id); - - ASSERT(!work_pending(&local->processor)); - rxrpc_see_local(local, rxrpc_local_free); kfree(local); - _leave(""); } /* diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 71963b4523be..2ea1fa1b8a6f 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -229,11 +229,6 @@ static int rxrpc_send_ack_packet(struct rxrpc_local *local, struct rxrpc_txbuf * if (txb->ack.reason == RXRPC_ACK_PING) txb->wire.flags |= RXRPC_REQUEST_ACK; - if (txb->ack.reason == RXRPC_ACK_DELAY) - clear_bit(RXRPC_CALL_DELAY_ACK_PENDING, &call->flags); - if (txb->ack.reason == RXRPC_ACK_IDLE) - clear_bit(RXRPC_CALL_IDLE_ACK_PENDING, &call->flags); - n = rxrpc_fill_out_ack(conn, call, txb); if (n == 0) return 0; @@ -247,8 +242,6 @@ static int rxrpc_send_ack_packet(struct rxrpc_local *local, struct rxrpc_txbuf * trace_rxrpc_tx_ack(call->debug_id, serial, ntohl(txb->ack.firstPacket), ntohl(txb->ack.serial), txb->ack.reason, txb->ack.nAcks); - if (txb->ack_why == rxrpc_propose_ack_ping_for_lost_ack) - call->acks_lost_ping = serial; if (txb->ack.reason == RXRPC_ACK_PING) rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_ping); @@ -588,21 +581,20 @@ send_fragmentable: } /* - * reject packets through the local endpoint + * Reject a packet through the local endpoint. */ -void rxrpc_reject_packets(struct rxrpc_local *local) +void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) { - struct sockaddr_rxrpc srx; - struct rxrpc_skb_priv *sp; struct rxrpc_wire_header whdr; - struct sk_buff *skb; + struct sockaddr_rxrpc srx; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct msghdr msg; struct kvec iov[2]; size_t size; __be32 code; int ret, ioc; - _enter("%d", local->debug_id); + rxrpc_see_skb(skb, rxrpc_skb_see_reject); iov[0].iov_base = &whdr; iov[0].iov_len = sizeof(whdr); @@ -616,52 +608,42 @@ void rxrpc_reject_packets(struct rxrpc_local *local) memset(&whdr, 0, sizeof(whdr)); - while ((skb = skb_dequeue(&local->reject_queue))) { - rxrpc_see_skb(skb, rxrpc_skb_see_reject); - sp = rxrpc_skb(skb); + switch (skb->mark) { + case RXRPC_SKB_MARK_REJECT_BUSY: + whdr.type = RXRPC_PACKET_TYPE_BUSY; + size = sizeof(whdr); + ioc = 1; + break; + case RXRPC_SKB_MARK_REJECT_ABORT: + whdr.type = RXRPC_PACKET_TYPE_ABORT; + code = htonl(skb->priority); + size = sizeof(whdr) + sizeof(code); + ioc = 2; + break; + default: + return; + } - switch (skb->mark) { - case RXRPC_SKB_MARK_REJECT_BUSY: - whdr.type = RXRPC_PACKET_TYPE_BUSY; - size = sizeof(whdr); - ioc = 1; - break; - case RXRPC_SKB_MARK_REJECT_ABORT: - whdr.type = RXRPC_PACKET_TYPE_ABORT; - code = htonl(skb->priority); - size = sizeof(whdr) + sizeof(code); - ioc = 2; - break; - default: - rxrpc_free_skb(skb, rxrpc_skb_put_input); - continue; - } + if (rxrpc_extract_addr_from_skb(&srx, skb) == 0) { + msg.msg_namelen = srx.transport_len; - if (rxrpc_extract_addr_from_skb(&srx, skb) == 0) { - msg.msg_namelen = srx.transport_len; - - whdr.epoch = htonl(sp->hdr.epoch); - whdr.cid = htonl(sp->hdr.cid); - whdr.callNumber = htonl(sp->hdr.callNumber); - whdr.serviceId = htons(sp->hdr.serviceId); - whdr.flags = sp->hdr.flags; - whdr.flags ^= RXRPC_CLIENT_INITIATED; - whdr.flags &= RXRPC_CLIENT_INITIATED; - - iov_iter_kvec(&msg.msg_iter, WRITE, iov, ioc, size); - ret = do_udp_sendmsg(local->socket, &msg, size); - if (ret < 0) - trace_rxrpc_tx_fail(local->debug_id, 0, ret, - rxrpc_tx_point_reject); - else - trace_rxrpc_tx_packet(local->debug_id, &whdr, - rxrpc_tx_point_reject); - } + whdr.epoch = htonl(sp->hdr.epoch); + whdr.cid = htonl(sp->hdr.cid); + whdr.callNumber = htonl(sp->hdr.callNumber); + whdr.serviceId = htons(sp->hdr.serviceId); + whdr.flags = sp->hdr.flags; + whdr.flags ^= RXRPC_CLIENT_INITIATED; + whdr.flags &= RXRPC_CLIENT_INITIATED; - rxrpc_free_skb(skb, rxrpc_skb_put_input); + iov_iter_kvec(&msg.msg_iter, WRITE, iov, ioc, size); + ret = do_udp_sendmsg(local->socket, &msg, size); + if (ret < 0) + trace_rxrpc_tx_fail(local->debug_id, 0, ret, + rxrpc_tx_point_reject); + else + trace_rxrpc_tx_packet(local->debug_id, &whdr, + rxrpc_tx_point_reject); } - - _leave(""); } /* diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 97d017ca3dc4..fb8096e93d2c 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -18,9 +18,9 @@ #include #include "ar-internal.h" -static void rxrpc_store_error(struct rxrpc_peer *, struct sock_exterr_skb *); -static void rxrpc_distribute_error(struct rxrpc_peer *, int, - enum rxrpc_call_completion); +static void rxrpc_store_error(struct rxrpc_peer *, struct sk_buff *); +static void rxrpc_distribute_error(struct rxrpc_peer *, struct sk_buff *, + enum rxrpc_call_completion, int); /* * Find the peer associated with a local error. @@ -161,7 +161,7 @@ void rxrpc_input_error(struct rxrpc_local *local, struct sk_buff *skb) goto out; } - rxrpc_store_error(peer, serr); + rxrpc_store_error(peer, skb); out: rxrpc_put_peer(peer, rxrpc_peer_put_input_error); } @@ -169,19 +169,15 @@ out: /* * Map an error report to error codes on the peer record. */ -static void rxrpc_store_error(struct rxrpc_peer *peer, - struct sock_exterr_skb *serr) +static void rxrpc_store_error(struct rxrpc_peer *peer, struct sk_buff *skb) { enum rxrpc_call_completion compl = RXRPC_CALL_NETWORK_ERROR; - struct sock_extended_err *ee; - int err; + struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + struct sock_extended_err *ee = &serr->ee; + int err = ee->ee_errno; _enter(""); - ee = &serr->ee; - - err = ee->ee_errno; - switch (ee->ee_origin) { case SO_EE_ORIGIN_NONE: case SO_EE_ORIGIN_LOCAL: @@ -197,14 +193,14 @@ static void rxrpc_store_error(struct rxrpc_peer *peer, break; } - rxrpc_distribute_error(peer, err, compl); + rxrpc_distribute_error(peer, skb, compl, err); } /* * Distribute an error that occurred on a peer. */ -static void rxrpc_distribute_error(struct rxrpc_peer *peer, int error, - enum rxrpc_call_completion compl) +static void rxrpc_distribute_error(struct rxrpc_peer *peer, struct sk_buff *skb, + enum rxrpc_call_completion compl, int err) { struct rxrpc_call *call; HLIST_HEAD(error_targets); @@ -219,7 +215,8 @@ static void rxrpc_distribute_error(struct rxrpc_peer *peer, int error, spin_unlock(&peer->lock); rxrpc_see_call(call, rxrpc_call_see_distribute_error); - rxrpc_set_call_completion(call, compl, 0, -error); + rxrpc_set_call_completion(call, compl, 0, -err); + rxrpc_input_call_event(call, skb); spin_lock(&peer->lock); } diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 5df7f468abed..77d03b9e4c4c 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -253,11 +253,8 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) acked = atomic_add_return(call->rx_consumed - old_consumed, &call->ackr_nr_consumed); if (acked > 2 && - !test_and_set_bit(RXRPC_CALL_IDLE_ACK_PENDING, &call->flags)) { - rxrpc_send_ACK(call, RXRPC_ACK_IDLE, serial, - rxrpc_propose_ack_rotate_rx); - rxrpc_transmit_ack_packets(call->peer->local); - } + !test_and_set_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags)) + rxrpc_poke_call(call, rxrpc_call_poke_idle); } /* @@ -377,7 +374,7 @@ done: trace_rxrpc_recvdata(call, rxrpc_recvmsg_data_return, seq, rx_pkt_offset, rx_pkt_len, ret); if (ret == -EAGAIN) - set_bit(RXRPC_CALL_RX_UNDERRUN, &call->flags); + set_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags); return ret; } diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 11af37275d5b..58e0a36f6aa9 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -170,7 +170,7 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, { unsigned long now; rxrpc_seq_t seq = txb->seq; - bool last = test_bit(RXRPC_TXBUF_LAST, &txb->flags); + bool last = test_bit(RXRPC_TXBUF_LAST, &txb->flags), poke; rxrpc_inc_stat(call->rxnet, stat_tx_data); @@ -188,6 +188,7 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, /* Add the packet to the call's output buffer */ spin_lock(&call->tx_lock); + poke = list_empty(&call->tx_sendmsg); list_add_tail(&txb->call_link, &call->tx_sendmsg); call->tx_prepared = seq; spin_unlock(&call->tx_lock); @@ -220,11 +221,8 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, write_unlock_bh(&call->state_lock); } - - /* Stick the packet on the crypto queue or the transmission queue as - * appropriate. - */ - rxrpc_queue_call(call, rxrpc_call_queue_tx_data); + if (poke) + rxrpc_poke_call(call, rxrpc_call_poke_start); } /* -- cgit v1.2.3-70-g09d2 From 3dd9c8b5f09fd24652729a3da5c5efa3ec2c4590 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 Jan 2020 10:21:15 +0000 Subject: rxrpc: Remove the _bh annotation from all the spinlocks None of the spinlocks in rxrpc need a _bh annotation now as the RCU callback routines no longer take spinlocks and the bulk of the packet wrangling code is now run in the I/O thread, not softirq context. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/af_rxrpc.c | 4 ++-- net/rxrpc/call_accept.c | 8 ++++---- net/rxrpc/call_event.c | 4 ++-- net/rxrpc/call_object.c | 20 ++++++++++---------- net/rxrpc/conn_client.c | 4 ++-- net/rxrpc/conn_event.c | 16 ++++++++-------- net/rxrpc/conn_service.c | 10 +++++----- net/rxrpc/input.c | 4 ++-- net/rxrpc/output.c | 8 ++++---- net/rxrpc/peer_event.c | 16 ++++++++-------- net/rxrpc/peer_object.c | 8 ++++---- net/rxrpc/recvmsg.c | 36 ++++++++++++++++++------------------ net/rxrpc/sendmsg.c | 12 ++++++------ 13 files changed, 75 insertions(+), 75 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 8ad4d85acb0b..7ea576f6ba4b 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -359,9 +359,9 @@ void rxrpc_kernel_end_call(struct socket *sock, struct rxrpc_call *call) /* Make sure we're not going to call back into a kernel service */ if (call->notify_rx) { - spin_lock_bh(&call->notify_lock); + spin_lock(&call->notify_lock); call->notify_rx = rxrpc_dummy_notify_rx; - spin_unlock_bh(&call->notify_lock); + spin_unlock(&call->notify_lock); } mutex_unlock(&call->user_mutex); diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 87b46c2a1985..d1850863507f 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -138,9 +138,9 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, write_unlock(&rx->call_lock); rxnet = call->rxnet; - spin_lock_bh(&rxnet->call_lock); + spin_lock(&rxnet->call_lock); list_add_tail_rcu(&call->link, &rxnet->calls); - spin_unlock_bh(&rxnet->call_lock); + spin_unlock(&rxnet->call_lock); b->call_backlog[call_head] = call; smp_store_release(&b->call_backlog_head, (call_head + 1) & (size - 1)); @@ -188,8 +188,8 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) /* Make sure that there aren't any incoming calls in progress before we * clear the preallocation buffers. */ - spin_lock_bh(&rx->incoming_lock); - spin_unlock_bh(&rx->incoming_lock); + spin_lock(&rx->incoming_lock); + spin_unlock(&rx->incoming_lock); head = b->peer_backlog_head; tail = b->peer_backlog_tail; diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 9db62fa55c62..18591f9ecc6a 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -101,9 +101,9 @@ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, return; } - spin_lock_bh(&local->ack_tx_lock); + spin_lock(&local->ack_tx_lock); list_add_tail(&txb->tx_link, &local->ack_tx_queue); - spin_unlock_bh(&local->ack_tx_lock); + spin_unlock(&local->ack_tx_lock); trace_rxrpc_send_ack(call, why, ack_reason, serial); rxrpc_wake_up_io_thread(local); diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index d441a715d988..be5eb8cdf549 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -354,9 +354,9 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, write_unlock(&rx->call_lock); rxnet = call->rxnet; - spin_lock_bh(&rxnet->call_lock); + spin_lock(&rxnet->call_lock); list_add_tail_rcu(&call->link, &rxnet->calls); - spin_unlock_bh(&rxnet->call_lock); + spin_unlock(&rxnet->call_lock); /* From this point on, the call is protected by its own lock. */ release_sock(&rx->sk); @@ -537,7 +537,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) del_timer_sync(&call->timer); /* Make sure we don't get any more notifications */ - write_lock_bh(&rx->recvmsg_lock); + write_lock(&rx->recvmsg_lock); if (!list_empty(&call->recvmsg_link)) { _debug("unlinking once-pending call %p { e=%lx f=%lx }", @@ -550,7 +550,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) call->recvmsg_link.next = NULL; call->recvmsg_link.prev = NULL; - write_unlock_bh(&rx->recvmsg_lock); + write_unlock(&rx->recvmsg_lock); if (put) rxrpc_put_call(call, rxrpc_call_put_unnotify); @@ -622,9 +622,9 @@ void rxrpc_put_call(struct rxrpc_call *call, enum rxrpc_call_trace why) ASSERTCMP(call->state, ==, RXRPC_CALL_COMPLETE); if (!list_empty(&call->link)) { - spin_lock_bh(&rxnet->call_lock); + spin_lock(&rxnet->call_lock); list_del_init(&call->link); - spin_unlock_bh(&rxnet->call_lock); + spin_unlock(&rxnet->call_lock); } rxrpc_cleanup_call(call); @@ -706,7 +706,7 @@ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet) _enter(""); if (!list_empty(&rxnet->calls)) { - spin_lock_bh(&rxnet->call_lock); + spin_lock(&rxnet->call_lock); while (!list_empty(&rxnet->calls)) { call = list_entry(rxnet->calls.next, @@ -721,12 +721,12 @@ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet) rxrpc_call_states[call->state], call->flags, call->events); - spin_unlock_bh(&rxnet->call_lock); + spin_unlock(&rxnet->call_lock); cond_resched(); - spin_lock_bh(&rxnet->call_lock); + spin_lock(&rxnet->call_lock); } - spin_unlock_bh(&rxnet->call_lock); + spin_unlock(&rxnet->call_lock); } atomic_dec(&rxnet->nr_calls); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 3c7b1bdec0db..a08e33c9e54b 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -557,9 +557,9 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, trace_rxrpc_connect_call(call); - write_lock_bh(&call->state_lock); + write_lock(&call->state_lock); call->state = RXRPC_CALL_CLIENT_SEND_REQUEST; - write_unlock_bh(&call->state_lock); + write_unlock(&call->state_lock); /* Paired with the read barrier in rxrpc_connect_call(). This orders * cid and epoch in the connection wrt to call_id without the need to diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 643a56322224..480364bcbf85 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -198,9 +198,9 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, _enter("%d,,%u,%u", conn->debug_id, error, abort_code); /* generate a connection-level abort */ - spin_lock_bh(&conn->state_lock); + spin_lock(&conn->state_lock); if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) { - spin_unlock_bh(&conn->state_lock); + spin_unlock(&conn->state_lock); _leave(" = 0 [already dead]"); return 0; } @@ -209,7 +209,7 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn, conn->abort_code = abort_code; conn->state = RXRPC_CONN_LOCALLY_ABORTED; set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); - spin_unlock_bh(&conn->state_lock); + spin_unlock(&conn->state_lock); msg.msg_name = &conn->peer->srx.transport; msg.msg_namelen = conn->peer->srx.transport_len; @@ -265,12 +265,12 @@ static void rxrpc_call_is_secure(struct rxrpc_call *call) { _enter("%p", call); if (call) { - write_lock_bh(&call->state_lock); + write_lock(&call->state_lock); if (call->state == RXRPC_CALL_SERVER_SECURING) { call->state = RXRPC_CALL_SERVER_RECV_REQUEST; rxrpc_notify_socket(call); } - write_unlock_bh(&call->state_lock); + write_unlock(&call->state_lock); } } @@ -325,18 +325,18 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, return ret; spin_lock(&conn->bundle->channel_lock); - spin_lock_bh(&conn->state_lock); + spin_lock(&conn->state_lock); if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) { conn->state = RXRPC_CONN_SERVICE; - spin_unlock_bh(&conn->state_lock); + spin_unlock(&conn->state_lock); for (loop = 0; loop < RXRPC_MAXCALLS; loop++) rxrpc_call_is_secure( rcu_dereference_protected( conn->channels[loop].call, lockdep_is_held(&conn->bundle->channel_lock))); } else { - spin_unlock_bh(&conn->state_lock); + spin_unlock(&conn->state_lock); } spin_unlock(&conn->bundle->channel_lock); diff --git a/net/rxrpc/conn_service.c b/net/rxrpc/conn_service.c index b5ae7c753fc3..2a55a88b2a5b 100644 --- a/net/rxrpc/conn_service.c +++ b/net/rxrpc/conn_service.c @@ -73,7 +73,7 @@ static void rxrpc_publish_service_conn(struct rxrpc_peer *peer, struct rxrpc_conn_proto k = conn->proto; struct rb_node **pp, *parent; - write_seqlock_bh(&peer->service_conn_lock); + write_seqlock(&peer->service_conn_lock); pp = &peer->service_conns.rb_node; parent = NULL; @@ -94,14 +94,14 @@ static void rxrpc_publish_service_conn(struct rxrpc_peer *peer, rb_insert_color(&conn->service_node, &peer->service_conns); conn_published: set_bit(RXRPC_CONN_IN_SERVICE_CONNS, &conn->flags); - write_sequnlock_bh(&peer->service_conn_lock); + write_sequnlock(&peer->service_conn_lock); _leave(" = %d [new]", conn->debug_id); return; found_extant_conn: if (refcount_read(&cursor->ref) == 0) goto replace_old_connection; - write_sequnlock_bh(&peer->service_conn_lock); + write_sequnlock(&peer->service_conn_lock); /* We should not be able to get here. rxrpc_incoming_connection() is * called in a non-reentrant context, so there can't be a race to * insert a new connection. @@ -195,8 +195,8 @@ void rxrpc_unpublish_service_conn(struct rxrpc_connection *conn) { struct rxrpc_peer *peer = conn->peer; - write_seqlock_bh(&peer->service_conn_lock); + write_seqlock(&peer->service_conn_lock); if (test_and_clear_bit(RXRPC_CONN_IN_SERVICE_CONNS, &conn->flags)) rb_erase(&conn->service_node, &peer->service_conns); - write_sequnlock_bh(&peer->service_conn_lock); + write_sequnlock(&peer->service_conn_lock); } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 7ae7046f0b03..dd2ac5d55e1c 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -669,10 +669,10 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, peer = call->peer; if (mtu < peer->maxdata) { - spin_lock_bh(&peer->lock); + spin_lock(&peer->lock); peer->maxdata = mtu; peer->mtu = mtu + peer->hdrsize; - spin_unlock_bh(&peer->lock); + spin_unlock(&peer->lock); } if (wake) diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 2ea1fa1b8a6f..e5d715b855fc 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -286,9 +286,9 @@ void rxrpc_transmit_ack_packets(struct rxrpc_local *local) if (list_empty(&local->ack_tx_queue)) return; - spin_lock_bh(&local->ack_tx_lock); + spin_lock(&local->ack_tx_lock); list_splice_tail_init(&local->ack_tx_queue, &queue); - spin_unlock_bh(&local->ack_tx_lock); + spin_unlock(&local->ack_tx_lock); while (!list_empty(&queue)) { struct rxrpc_txbuf *txb = @@ -296,9 +296,9 @@ void rxrpc_transmit_ack_packets(struct rxrpc_local *local) ret = rxrpc_send_ack_packet(local, txb); if (ret < 0 && ret != -ECONNRESET) { - spin_lock_bh(&local->ack_tx_lock); + spin_lock(&local->ack_tx_lock); list_splice_init(&queue, &local->ack_tx_queue); - spin_unlock_bh(&local->ack_tx_lock); + spin_unlock(&local->ack_tx_lock); break; } diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index fb8096e93d2c..6685bf917aa6 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -121,10 +121,10 @@ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) } if (mtu < peer->mtu) { - spin_lock_bh(&peer->lock); + spin_lock(&peer->lock); peer->mtu = mtu; peer->maxdata = peer->mtu - peer->hdrsize; - spin_unlock_bh(&peer->lock); + spin_unlock(&peer->lock); } } @@ -237,7 +237,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, time64_t keepalive_at; int slot; - spin_lock_bh(&rxnet->peer_hash_lock); + spin_lock(&rxnet->peer_hash_lock); while (!list_empty(collector)) { peer = list_entry(collector->next, @@ -248,7 +248,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, continue; if (__rxrpc_use_local(peer->local, rxrpc_local_use_peer_keepalive)) { - spin_unlock_bh(&rxnet->peer_hash_lock); + spin_unlock(&rxnet->peer_hash_lock); keepalive_at = peer->last_tx_at + RXRPC_KEEPALIVE_TIME; slot = keepalive_at - base; @@ -267,7 +267,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, */ slot += cursor; slot &= mask; - spin_lock_bh(&rxnet->peer_hash_lock); + spin_lock(&rxnet->peer_hash_lock); list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive[slot & mask]); rxrpc_unuse_local(peer->local, rxrpc_local_unuse_peer_keepalive); @@ -275,7 +275,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, rxrpc_put_peer_locked(peer, rxrpc_peer_put_keepalive); } - spin_unlock_bh(&rxnet->peer_hash_lock); + spin_unlock(&rxnet->peer_hash_lock); } /* @@ -305,7 +305,7 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work) * second; the bucket at cursor + 1 goes at now + 1s and so * on... */ - spin_lock_bh(&rxnet->peer_hash_lock); + spin_lock(&rxnet->peer_hash_lock); list_splice_init(&rxnet->peer_keepalive_new, &collector); stop = cursor + ARRAY_SIZE(rxnet->peer_keepalive); @@ -317,7 +317,7 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work) } base = now; - spin_unlock_bh(&rxnet->peer_hash_lock); + spin_unlock(&rxnet->peer_hash_lock); rxnet->peer_keepalive_base = base; rxnet->peer_keepalive_cursor = cursor; diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 9e682a60a800..608946dcc505 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -349,7 +349,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *rx, return NULL; } - spin_lock_bh(&rxnet->peer_hash_lock); + spin_lock(&rxnet->peer_hash_lock); /* Need to check that we aren't racing with someone else */ peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); @@ -362,7 +362,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_sock *rx, &rxnet->peer_keepalive_new); } - spin_unlock_bh(&rxnet->peer_hash_lock); + spin_unlock(&rxnet->peer_hash_lock); if (peer) rxrpc_free_peer(candidate); @@ -412,10 +412,10 @@ static void __rxrpc_put_peer(struct rxrpc_peer *peer) ASSERT(hlist_empty(&peer->error_targets)); - spin_lock_bh(&rxnet->peer_hash_lock); + spin_lock(&rxnet->peer_hash_lock); hash_del_rcu(&peer->hash_link); list_del_init(&peer->keepalive_link); - spin_unlock_bh(&rxnet->peer_hash_lock); + spin_unlock(&rxnet->peer_hash_lock); rxrpc_free_peer(peer); } diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 77d03b9e4c4c..3a8576e9daf3 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -36,16 +36,16 @@ void rxrpc_notify_socket(struct rxrpc_call *call) sk = &rx->sk; if (rx && sk->sk_state < RXRPC_CLOSE) { if (call->notify_rx) { - spin_lock_bh(&call->notify_lock); + spin_lock(&call->notify_lock); call->notify_rx(sk, call, call->user_call_ID); - spin_unlock_bh(&call->notify_lock); + spin_unlock(&call->notify_lock); } else { - write_lock_bh(&rx->recvmsg_lock); + write_lock(&rx->recvmsg_lock); if (list_empty(&call->recvmsg_link)) { rxrpc_get_call(call, rxrpc_call_get_notify_socket); list_add_tail(&call->recvmsg_link, &rx->recvmsg_q); } - write_unlock_bh(&rx->recvmsg_lock); + write_unlock(&rx->recvmsg_lock); if (!sock_flag(sk, SOCK_DEAD)) { _debug("call %ps", sk->sk_data_ready); @@ -87,9 +87,9 @@ bool rxrpc_set_call_completion(struct rxrpc_call *call, bool ret = false; if (call->state < RXRPC_CALL_COMPLETE) { - write_lock_bh(&call->state_lock); + write_lock(&call->state_lock); ret = __rxrpc_set_call_completion(call, compl, abort_code, error); - write_unlock_bh(&call->state_lock); + write_unlock(&call->state_lock); } return ret; } @@ -107,9 +107,9 @@ bool rxrpc_call_completed(struct rxrpc_call *call) bool ret = false; if (call->state < RXRPC_CALL_COMPLETE) { - write_lock_bh(&call->state_lock); + write_lock(&call->state_lock); ret = __rxrpc_call_completed(call); - write_unlock_bh(&call->state_lock); + write_unlock(&call->state_lock); } return ret; } @@ -131,9 +131,9 @@ bool rxrpc_abort_call(const char *why, struct rxrpc_call *call, { bool ret; - write_lock_bh(&call->state_lock); + write_lock(&call->state_lock); ret = __rxrpc_abort_call(why, call, seq, abort_code, error); - write_unlock_bh(&call->state_lock); + write_unlock(&call->state_lock); return ret; } @@ -193,23 +193,23 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) rxrpc_propose_delay_ACK(call, serial, rxrpc_propose_ack_terminal_ack); - write_lock_bh(&call->state_lock); + write_lock(&call->state_lock); switch (call->state) { case RXRPC_CALL_CLIENT_RECV_REPLY: __rxrpc_call_completed(call); - write_unlock_bh(&call->state_lock); + write_unlock(&call->state_lock); break; case RXRPC_CALL_SERVER_RECV_REQUEST: call->state = RXRPC_CALL_SERVER_ACK_REQUEST; call->expect_req_by = jiffies + MAX_JIFFY_OFFSET; - write_unlock_bh(&call->state_lock); + write_unlock(&call->state_lock); rxrpc_propose_delay_ACK(call, serial, rxrpc_propose_ack_processing_op); break; default: - write_unlock_bh(&call->state_lock); + write_unlock(&call->state_lock); break; } } @@ -442,14 +442,14 @@ try_again: /* Find the next call and dequeue it if we're not just peeking. If we * do dequeue it, that comes with a ref that we will need to release. */ - write_lock_bh(&rx->recvmsg_lock); + write_lock(&rx->recvmsg_lock); l = rx->recvmsg_q.next; call = list_entry(l, struct rxrpc_call, recvmsg_link); if (!(flags & MSG_PEEK)) list_del_init(&call->recvmsg_link); else rxrpc_get_call(call, rxrpc_call_get_recvmsg); - write_unlock_bh(&rx->recvmsg_lock); + write_unlock(&rx->recvmsg_lock); trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0); @@ -538,9 +538,9 @@ error_unlock_call: error_requeue_call: if (!(flags & MSG_PEEK)) { - write_lock_bh(&rx->recvmsg_lock); + write_lock(&rx->recvmsg_lock); list_add(&call->recvmsg_link, &rx->recvmsg_q); - write_unlock_bh(&rx->recvmsg_lock); + write_unlock(&rx->recvmsg_lock); trace_rxrpc_recvmsg(call, rxrpc_recvmsg_requeue, 0); } else { rxrpc_put_call(call, rxrpc_call_put_recvmsg); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 58e0a36f6aa9..2c861c55ed70 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -195,7 +195,7 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, if (last || call->state == RXRPC_CALL_SERVER_ACK_REQUEST) { _debug("________awaiting reply/ACK__________"); - write_lock_bh(&call->state_lock); + write_lock(&call->state_lock); switch (call->state) { case RXRPC_CALL_CLIENT_SEND_REQUEST: call->state = RXRPC_CALL_CLIENT_AWAIT_REPLY; @@ -218,7 +218,7 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, default: break; } - write_unlock_bh(&call->state_lock); + write_unlock(&call->state_lock); } if (poke) @@ -357,10 +357,10 @@ reload: success: ret = copied; if (READ_ONCE(call->state) == RXRPC_CALL_COMPLETE) { - read_lock_bh(&call->state_lock); + read_lock(&call->state_lock); if (call->error < 0) ret = call->error; - read_unlock_bh(&call->state_lock); + read_unlock(&call->state_lock); } out: call->tx_pending = txb; @@ -725,9 +725,9 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, notify_end_tx, &dropped_lock); break; case RXRPC_CALL_COMPLETE: - read_lock_bh(&call->state_lock); + read_lock(&call->state_lock); ret = call->error; - read_unlock_bh(&call->state_lock); + read_unlock(&call->state_lock); break; default: /* Request phase complete for this client call */ -- cgit v1.2.3-70-g09d2 From 32cf8edb079a6a687a2b5dba39a813a0bbd0ddf9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Nov 2022 13:47:35 +0000 Subject: rxrpc: Trace/count transmission underflows and cwnd resets Add a tracepoint to log when a cwnd reset occurs due to lack of transmission on a call. Add stat counters to count transmission underflows (ie. when we have tx window space, but sendmsg doesn't manage to keep up), cwnd resets and transmission failures. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 38 ++++++++++++++++++++++++++++++++++++++ net/rxrpc/ar-internal.h | 3 +++ net/rxrpc/call_event.c | 4 +++- net/rxrpc/input.c | 7 +++++-- net/rxrpc/output.c | 2 ++ net/rxrpc/proc.c | 14 ++++++++++---- 6 files changed, 61 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index c49b0c233594..b41e913ae78a 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -1446,6 +1446,44 @@ TRACE_EVENT(rxrpc_congest, __entry->sum.retrans_timeo ? " rTxTo" : "") ); +TRACE_EVENT(rxrpc_reset_cwnd, + TP_PROTO(struct rxrpc_call *call, ktime_t now), + + TP_ARGS(call, now), + + TP_STRUCT__entry( + __field(unsigned int, call ) + __field(enum rxrpc_congest_mode, mode ) + __field(unsigned short, cwnd ) + __field(unsigned short, extra ) + __field(rxrpc_seq_t, hard_ack ) + __field(rxrpc_seq_t, prepared ) + __field(ktime_t, since_last_tx ) + __field(bool, has_data ) + ), + + TP_fast_assign( + __entry->call = call->debug_id; + __entry->mode = call->cong_mode; + __entry->cwnd = call->cong_cwnd; + __entry->extra = call->cong_extra; + __entry->hard_ack = call->acks_hard_ack; + __entry->prepared = call->tx_prepared - call->tx_bottom; + __entry->since_last_tx = ktime_sub(now, call->tx_last_sent); + __entry->has_data = !list_empty(&call->tx_sendmsg); + ), + + TP_printk("c=%08x q=%08x %s cw=%u+%u pr=%u tm=%llu d=%u", + __entry->call, + __entry->hard_ack, + __print_symbolic(__entry->mode, rxrpc_congest_modes), + __entry->cwnd, + __entry->extra, + __entry->prepared, + ktime_to_ns(__entry->since_last_tx), + __entry->has_data) + ); + TRACE_EVENT(rxrpc_disconnect_call, TP_PROTO(struct rxrpc_call *call), diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 6b993a3d4186..6cfe366ee224 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -101,6 +101,9 @@ struct rxrpc_net { atomic_t stat_tx_data_retrans; atomic_t stat_tx_data_send; atomic_t stat_tx_data_send_frag; + atomic_t stat_tx_data_send_fail; + atomic_t stat_tx_data_underflow; + atomic_t stat_tx_data_cwnd_reset; atomic_t stat_rx_data; atomic_t stat_rx_data_reqack; atomic_t stat_rx_data_jumbo; diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 18591f9ecc6a..9f1e490ab976 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -317,8 +317,10 @@ static void rxrpc_transmit_some_data(struct rxrpc_call *call) case RXRPC_CALL_CLIENT_AWAIT_REPLY: if (!rxrpc_tx_window_has_space(call)) return; - if (list_empty(&call->tx_sendmsg)) + if (list_empty(&call->tx_sendmsg)) { + rxrpc_inc_stat(call->rxnet, stat_tx_data_underflow); return; + } rxrpc_decant_prepared_tx(call); break; default: diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index dd2ac5d55e1c..2988e3d0c1f6 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -27,6 +27,7 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, enum rxrpc_congest_change change = rxrpc_cong_no_change; unsigned int cumulative_acks = call->cong_cumul_acks; unsigned int cwnd = call->cong_cwnd; + ktime_t now; bool resend = false; summary->flight_size = @@ -59,13 +60,15 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, /* If we haven't transmitted anything for >1RTT, we should reset the * congestion management state. */ + now = ktime_get_real(); if ((call->cong_mode == RXRPC_CALL_SLOW_START || call->cong_mode == RXRPC_CALL_CONGEST_AVOIDANCE) && ktime_before(ktime_add_us(call->tx_last_sent, - call->peer->srtt_us >> 3), - ktime_get_real()) + call->peer->srtt_us >> 3), now) ) { + trace_rxrpc_reset_cwnd(call, now); change = rxrpc_cong_idle_reset; + rxrpc_inc_stat(call->rxnet, stat_tx_data_cwnd_reset); summary->mode = RXRPC_CALL_SLOW_START; if (RXRPC_TX_SMSS > 2190) summary->cwnd = 2; diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index e5d715b855fc..8147a47d1702 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -485,6 +485,7 @@ dont_set_request_ack: up_read(&conn->local->defrag_sem); if (ret < 0) { + rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail); rxrpc_cancel_rtt_probe(call, serial, rtt_slot); trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_data_nofrag); @@ -567,6 +568,7 @@ send_fragmentable: } if (ret < 0) { + rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail); rxrpc_cancel_rtt_probe(call, serial, rtt_slot); trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_data_frag); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 5af7c8ee4b1a..6816934cb4cf 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -398,13 +398,16 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) struct rxrpc_net *rxnet = rxrpc_net(seq_file_single_net(seq)); seq_printf(seq, - "Data : send=%u sendf=%u\n", + "Data : send=%u sendf=%u fail=%u\n", atomic_read(&rxnet->stat_tx_data_send), - atomic_read(&rxnet->stat_tx_data_send_frag)); + atomic_read(&rxnet->stat_tx_data_send_frag), + atomic_read(&rxnet->stat_tx_data_send_fail)); seq_printf(seq, - "Data-Tx : nr=%u retrans=%u\n", + "Data-Tx : nr=%u retrans=%u uf=%u cwr=%u\n", atomic_read(&rxnet->stat_tx_data), - atomic_read(&rxnet->stat_tx_data_retrans)); + atomic_read(&rxnet->stat_tx_data_retrans), + atomic_read(&rxnet->stat_tx_data_underflow), + atomic_read(&rxnet->stat_tx_data_cwnd_reset)); seq_printf(seq, "Data-Rx : nr=%u reqack=%u jumbo=%u\n", atomic_read(&rxnet->stat_rx_data), @@ -472,8 +475,11 @@ int rxrpc_stats_clear(struct file *file, char *buf, size_t size) atomic_set(&rxnet->stat_tx_data, 0); atomic_set(&rxnet->stat_tx_data_retrans, 0); + atomic_set(&rxnet->stat_tx_data_underflow, 0); + atomic_set(&rxnet->stat_tx_data_cwnd_reset, 0); atomic_set(&rxnet->stat_tx_data_send, 0); atomic_set(&rxnet->stat_tx_data_send_frag, 0); + atomic_set(&rxnet->stat_tx_data_send_fail, 0); atomic_set(&rxnet->stat_rx_data, 0); atomic_set(&rxnet->stat_rx_data_reqack, 0); atomic_set(&rxnet->stat_rx_data_jumbo, 0); -- cgit v1.2.3-70-g09d2 From 5086d9a9dfec4866806da303115489b0606decb7 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 11 Nov 2022 13:47:35 +0000 Subject: rxrpc: Move the cwnd degradation after transmitting packets When we've gone for >1RTT without transmitting a packet, we should reduce the ssthresh and cut the cwnd by half (as suggested in RFC2861 sec 3.1). However, we may receive ACK packets in a batch and the first of these may cut the cwnd, preventing further transmission, and each subsequent one cuts the cwnd yet further, reducing it to the floor and killing performance. Fix this by moving the cwnd reset to after doing the transmission and resetting the base time such that we don't cut the cwnd by half again for at least another RTT. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 2 ++ net/rxrpc/call_event.c | 7 +++++++ net/rxrpc/input.c | 49 +++++++++++++++++++++++++++---------------------- net/rxrpc/proc.c | 5 +++-- 4 files changed, 39 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 6cfe366ee224..785cd0dd1eea 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -666,6 +666,7 @@ struct rxrpc_call { * packets) rather than bytes. */ #define RXRPC_TX_SMSS RXRPC_JUMBO_DATALEN +#define RXRPC_MIN_CWND (RXRPC_TX_SMSS > 2190 ? 2 : RXRPC_TX_SMSS > 1095 ? 3 : 4) u8 cong_cwnd; /* Congestion window size */ u8 cong_extra; /* Extra to send for congestion management */ u8 cong_ssthresh; /* Slow-start threshold */ @@ -953,6 +954,7 @@ void rxrpc_unpublish_service_conn(struct rxrpc_connection *); /* * input.c */ +void rxrpc_congestion_degrade(struct rxrpc_call *); void rxrpc_input_call_packet(struct rxrpc_call *, struct sk_buff *); void rxrpc_implicit_end_call(struct rxrpc_call *, struct sk_buff *); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 9f1e490ab976..fd122e3726bd 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -427,6 +427,13 @@ void rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_transmit_some_data(call); + if (skb) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK) + rxrpc_congestion_degrade(call); + } + if (test_and_clear_bit(RXRPC_CALL_EV_INITIAL_PING, &call->events)) rxrpc_send_initial_ping(call); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 2988e3d0c1f6..d0e20e946e48 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -27,7 +27,6 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, enum rxrpc_congest_change change = rxrpc_cong_no_change; unsigned int cumulative_acks = call->cong_cumul_acks; unsigned int cwnd = call->cong_cwnd; - ktime_t now; bool resend = false; summary->flight_size = @@ -57,27 +56,6 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, summary->cumulative_acks = cumulative_acks; summary->dup_acks = call->cong_dup_acks; - /* If we haven't transmitted anything for >1RTT, we should reset the - * congestion management state. - */ - now = ktime_get_real(); - if ((call->cong_mode == RXRPC_CALL_SLOW_START || - call->cong_mode == RXRPC_CALL_CONGEST_AVOIDANCE) && - ktime_before(ktime_add_us(call->tx_last_sent, - call->peer->srtt_us >> 3), now) - ) { - trace_rxrpc_reset_cwnd(call, now); - change = rxrpc_cong_idle_reset; - rxrpc_inc_stat(call->rxnet, stat_tx_data_cwnd_reset); - summary->mode = RXRPC_CALL_SLOW_START; - if (RXRPC_TX_SMSS > 2190) - summary->cwnd = 2; - else if (RXRPC_TX_SMSS > 1095) - summary->cwnd = 3; - else - summary->cwnd = 4; - } - switch (call->cong_mode) { case RXRPC_CALL_SLOW_START: if (summary->saw_nacks) @@ -197,6 +175,33 @@ send_extra_data: goto out_no_clear_ca; } +/* + * Degrade the congestion window if we haven't transmitted a packet for >1RTT. + */ +void rxrpc_congestion_degrade(struct rxrpc_call *call) +{ + ktime_t rtt, now; + + if (call->cong_mode != RXRPC_CALL_SLOW_START && + call->cong_mode != RXRPC_CALL_CONGEST_AVOIDANCE) + return; + if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY) + return; + + rtt = ns_to_ktime(call->peer->srtt_us * (1000 / 8)); + now = ktime_get_real(); + if (!ktime_before(ktime_add(call->tx_last_sent, rtt), now)) + return; + + trace_rxrpc_reset_cwnd(call, now); + rxrpc_inc_stat(call->rxnet, stat_tx_data_cwnd_reset); + call->tx_last_sent = now; + call->cong_mode = RXRPC_CALL_SLOW_START; + call->cong_ssthresh = max_t(unsigned int, call->cong_ssthresh, + call->cong_cwnd * 3 / 4); + call->cong_cwnd = max_t(unsigned int, call->cong_cwnd / 2, RXRPC_MIN_CWND); +} + /* * Apply a hard ACK by advancing the Tx window. */ diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 6816934cb4cf..3a59591ec061 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -61,7 +61,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) "Proto Local " " Remote " " SvID ConnID CallID End Use State Abort " - " DebugId TxSeq TW RxSeq RW RxSerial RxTimo\n"); + " DebugId TxSeq TW RxSeq RW RxSerial CW RxTimo\n"); return 0; } @@ -84,7 +84,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) wtmp = atomic64_read_acquire(&call->ackr_window); seq_printf(seq, "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u" - " %-8.8s %08x %08x %08x %02x %08x %02x %08x %06lx\n", + " %-8.8s %08x %08x %08x %02x %08x %02x %08x %02x %06lx\n", lbuff, rbuff, call->dest_srx.srx_service, @@ -98,6 +98,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) acks_hard_ack, READ_ONCE(call->tx_top) - acks_hard_ack, lower_32_bits(wtmp), upper_32_bits(wtmp) - lower_32_bits(wtmp), call->rx_serial, + call->cong_cwnd, timeout); return 0; -- cgit v1.2.3-70-g09d2 From a2cf3264f331acfeb7e463ad7b7fe1ac647a829d Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 16 Nov 2022 12:02:22 +0000 Subject: rxrpc: Fold __rxrpc_unuse_local() into rxrpc_unuse_local() Fold __rxrpc_unuse_local() into rxrpc_unuse_local() as the latter is now the only user of the former. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- net/rxrpc/ar-internal.h | 12 ------------ net/rxrpc/local_object.c | 12 ++++++++++-- 2 files changed, 10 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 785cd0dd1eea..2a4928249a64 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -1002,18 +1002,6 @@ void rxrpc_unuse_local(struct rxrpc_local *, enum rxrpc_local_trace); void rxrpc_destroy_local(struct rxrpc_local *local); void rxrpc_destroy_all_locals(struct rxrpc_net *); -static inline bool __rxrpc_unuse_local(struct rxrpc_local *local, - enum rxrpc_local_trace why) -{ - unsigned int debug_id = local->debug_id; - int r, u; - - r = refcount_read(&local->ref); - u = atomic_dec_return(&local->active_users); - trace_rxrpc_local(debug_id, why, r, u); - return u == 0; -} - static inline bool __rxrpc_use_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index c73a5a1bc088..1e994a83db2b 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -359,8 +359,16 @@ struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *local, */ void rxrpc_unuse_local(struct rxrpc_local *local, enum rxrpc_local_trace why) { - if (local && __rxrpc_unuse_local(local, why)) - kthread_stop(local->io_thread); + unsigned int debug_id = local->debug_id; + int r, u; + + if (local) { + r = refcount_read(&local->ref); + u = atomic_dec_return(&local->active_users); + trace_rxrpc_local(debug_id, why, r, u); + if (u == 0) + kthread_stop(local->io_thread); + } } /* -- cgit v1.2.3-70-g09d2 From b0346843b1076b34a0278ff601f8f287535cb064 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 30 Jan 2020 21:48:13 +0000 Subject: rxrpc: Transmit ACKs at the point of generation For ACKs generated inside the I/O thread, transmit the ACK at the point of generation. Where the ACK is generated outside of the I/O thread, it's offloaded to the I/O thread to transmit it. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/trace/events/rxrpc.h | 3 --- net/rxrpc/ar-internal.h | 5 +---- net/rxrpc/call_event.c | 17 ++--------------- net/rxrpc/io_thread.c | 5 ----- net/rxrpc/local_object.c | 2 -- net/rxrpc/output.c | 42 ++---------------------------------------- net/rxrpc/recvmsg.c | 3 --- net/rxrpc/sendmsg.c | 2 -- net/rxrpc/txbuf.c | 1 - 9 files changed, 5 insertions(+), 75 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index b41e913ae78a..049b52e7aa6a 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -63,7 +63,6 @@ EM(rxrpc_local_put_peer, "PUT peer ") \ EM(rxrpc_local_put_prealloc_conn, "PUT conn-pre") \ EM(rxrpc_local_put_release_sock, "PUT rel-sock") \ - EM(rxrpc_local_see_tx_ack, "SEE tx-ack ") \ EM(rxrpc_local_stop, "STOP ") \ EM(rxrpc_local_stopped, "STOPPED ") \ EM(rxrpc_local_unuse_bind, "UNU bind ") \ @@ -156,7 +155,6 @@ EM(rxrpc_call_get_recvmsg, "GET recvmsg ") \ EM(rxrpc_call_get_release_sock, "GET rel-sock") \ EM(rxrpc_call_get_sendmsg, "GET sendmsg ") \ - EM(rxrpc_call_get_send_ack, "GET send-ack") \ EM(rxrpc_call_get_userid, "GET user-id ") \ EM(rxrpc_call_new_client, "NEW client ") \ EM(rxrpc_call_new_prealloc_service, "NEW prealloc") \ @@ -168,7 +166,6 @@ EM(rxrpc_call_put_recvmsg, "PUT recvmsg ") \ EM(rxrpc_call_put_release_sock, "PUT rls-sock") \ EM(rxrpc_call_put_release_sock_tba, "PUT rls-sk-a") \ - EM(rxrpc_call_put_send_ack, "PUT send-ack") \ EM(rxrpc_call_put_sendmsg, "PUT sendmsg ") \ EM(rxrpc_call_put_unnotify, "PUT unnotify") \ EM(rxrpc_call_put_userid_exists, "PUT u-exists") \ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 2a4928249a64..e7dccab7b741 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -287,8 +287,6 @@ struct rxrpc_local { struct hlist_node link; struct socket *socket; /* my UDP socket */ struct task_struct *io_thread; - struct list_head ack_tx_queue; /* List of ACKs that need sending */ - spinlock_t ack_tx_lock; /* ACK list lock */ struct rxrpc_sock __rcu *service; /* Service(s) listening on this endpoint */ struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ struct sk_buff_head rx_queue; /* Received packets */ @@ -762,7 +760,6 @@ struct rxrpc_txbuf { struct rcu_head rcu; struct list_head call_link; /* Link in call->tx_sendmsg/tx_buffer */ struct list_head tx_link; /* Link in live Enc queue or Tx queue */ - struct rxrpc_call *call; /* Call to which belongs */ ktime_t last_sent; /* Time at which last transmitted */ refcount_t ref; rxrpc_seq_t seq; /* Sequence number of this packet */ @@ -1047,7 +1044,7 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net) /* * output.c */ -void rxrpc_transmit_ack_packets(struct rxrpc_local *); +int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb); int rxrpc_send_abort_packet(struct rxrpc_call *); int rxrpc_send_data_packet(struct rxrpc_call *, struct rxrpc_txbuf *); void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index fd122e3726bd..b2cf448fb02c 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -69,7 +69,6 @@ void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial, void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why) { - struct rxrpc_local *local = call->conn->local; struct rxrpc_txbuf *txb; if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) @@ -96,17 +95,9 @@ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, txb->ack.reason = ack_reason; txb->ack.nAcks = 0; - if (!rxrpc_try_get_call(call, rxrpc_call_get_send_ack)) { - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_nomem); - return; - } - - spin_lock(&local->ack_tx_lock); - list_add_tail(&txb->tx_link, &local->ack_tx_queue); - spin_unlock(&local->ack_tx_lock); trace_rxrpc_send_ack(call, why, ack_reason, serial); - - rxrpc_wake_up_io_thread(local); + rxrpc_send_ack_packet(call, txb); + rxrpc_put_txbuf(txb, rxrpc_txbuf_put_ack_tx); } /* @@ -294,10 +285,6 @@ static void rxrpc_decant_prepared_tx(struct rxrpc_call *call) rxrpc_transmit_one(call, txb); - // TODO: Drain the transmission buffers. Do this somewhere better - if (after(call->acks_hard_ack, call->tx_bottom + 16)) - rxrpc_shrink_call_tx_buffer(call); - if (!rxrpc_tx_window_has_space(call)) break; } diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 19aa315eddf5..d83ae3193032 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -447,11 +447,6 @@ int rxrpc_io_thread(void *data) continue; } - if (!list_empty(&local->ack_tx_queue)) { - rxrpc_transmit_ack_packets(local); - continue; - } - /* Process received packets and errors. */ if ((skb = __skb_dequeue(&rx_queue))) { switch (skb->mark) { diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 1e994a83db2b..44222923c0d1 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -96,8 +96,6 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, atomic_set(&local->active_users, 1); local->rxnet = rxnet; INIT_HLIST_NODE(&local->link); - INIT_LIST_HEAD(&local->ack_tx_queue); - spin_lock_init(&local->ack_tx_lock); init_rwsem(&local->defrag_sem); skb_queue_head_init(&local->rx_queue); INIT_LIST_HEAD(&local->call_attend_q); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 8147a47d1702..3d8c9f830ee0 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -203,12 +203,11 @@ static void rxrpc_cancel_rtt_probe(struct rxrpc_call *call, } /* - * Send an ACK call packet. + * Transmit an ACK packet. */ -static int rxrpc_send_ack_packet(struct rxrpc_local *local, struct rxrpc_txbuf *txb) +int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) { struct rxrpc_connection *conn; - struct rxrpc_call *call = txb->call; struct msghdr msg; struct kvec iov[1]; rxrpc_serial_t serial; @@ -271,43 +270,6 @@ static int rxrpc_send_ack_packet(struct rxrpc_local *local, struct rxrpc_txbuf * return ret; } -/* - * ACK transmitter for a local endpoint. The UDP socket locks around each - * transmission, so we can only transmit one packet at a time, ACK, DATA or - * otherwise. - */ -void rxrpc_transmit_ack_packets(struct rxrpc_local *local) -{ - LIST_HEAD(queue); - int ret; - - rxrpc_see_local(local, rxrpc_local_see_tx_ack); - - if (list_empty(&local->ack_tx_queue)) - return; - - spin_lock(&local->ack_tx_lock); - list_splice_tail_init(&local->ack_tx_queue, &queue); - spin_unlock(&local->ack_tx_lock); - - while (!list_empty(&queue)) { - struct rxrpc_txbuf *txb = - list_entry(queue.next, struct rxrpc_txbuf, tx_link); - - ret = rxrpc_send_ack_packet(local, txb); - if (ret < 0 && ret != -ECONNRESET) { - spin_lock(&local->ack_tx_lock); - list_splice_init(&queue, &local->ack_tx_queue); - spin_unlock(&local->ack_tx_lock); - break; - } - - list_del_init(&txb->tx_link); - rxrpc_put_call(txb->call, rxrpc_call_put_send_ack); - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_ack_tx); - } -} - /* * Send an ABORT call packet. */ diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 3a8576e9daf3..36b25d003cf0 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -320,7 +320,6 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, ret = ret2; goto out; } - rxrpc_transmit_ack_packets(call->peer->local); } else { trace_rxrpc_recvdata(call, rxrpc_recvmsg_cont, seq, rx_pkt_offset, rx_pkt_len, 0); @@ -502,7 +501,6 @@ try_again: if (ret == -EAGAIN) ret = 0; - rxrpc_transmit_ack_packets(call->peer->local); if (!skb_queue_empty(&call->recvmsg_queue)) rxrpc_notify_socket(call); break; @@ -632,7 +630,6 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, read_phase_complete: ret = 1; out: - rxrpc_transmit_ack_packets(call->peer->local); if (_service) *_service = call->dest_srx.srx_service; mutex_unlock(&call->user_mutex); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 2c861c55ed70..9fa7e37f7155 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -276,8 +276,6 @@ reload: rxrpc_see_txbuf(txb, rxrpc_txbuf_see_send_more); do { - rxrpc_transmit_ack_packets(call->peer->local); - if (!txb) { size_t remain, bufsize, chunk, offset; diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index a5054389dfbb..d2cf2aac3adb 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -26,7 +26,6 @@ struct rxrpc_txbuf *rxrpc_alloc_txbuf(struct rxrpc_call *call, u8 packet_type, INIT_LIST_HEAD(&txb->call_link); INIT_LIST_HEAD(&txb->tx_link); refcount_set(&txb->ref, 1); - txb->call = call; txb->call_debug_id = call->debug_id; txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids); txb->space = sizeof(txb->data); -- cgit v1.2.3-70-g09d2 From 09d838a457a89883a926b8b0104d575158fd4b92 Mon Sep 17 00:00:00 2001 From: Íñigo Huguet Date: Fri, 11 Nov 2022 16:36:22 +0100 Subject: wifi: mac80211: fix maybe-unused warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In ieee80211_lookup_key, the variable named `local` is unused if compiled without lockdep, getting this warning: net/mac80211/cfg.c: In function ‘ieee80211_lookup_key’: net/mac80211/cfg.c:542:26: error: unused variable ‘local’ [-Werror=unused-variable] struct ieee80211_local *local = sdata->local; ^~~~~ Fix it with __maybe_unused. Fixes: 8cbf0c2ab6df ("wifi: mac80211: refactor some key code") Signed-off-by: Íñigo Huguet Link: https://lore.kernel.org/r/20221111153622.29016-1-ihuguet@redhat.com Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index c848fe04dd44..8f9a2ab502b3 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -576,7 +576,7 @@ static struct ieee80211_key * ieee80211_lookup_key(struct ieee80211_sub_if_data *sdata, int link_id, u8 key_idx, bool pairwise, const u8 *mac_addr) { - struct ieee80211_local *local = sdata->local; + struct ieee80211_local *local __maybe_unused = sdata->local; struct ieee80211_link_data *link = &sdata->deflink; struct ieee80211_key *key; -- cgit v1.2.3-70-g09d2 From c1d3214d61d93eb4a3959c6b402230988d0f362d Mon Sep 17 00:00:00 2001 From: JUN-KYU SHIN Date: Fri, 11 Nov 2022 11:33:04 +0900 Subject: wifi: cfg80211: fix comparison of BSS frequencies If the "channel->freq_offset" comparison is omitted in cmp_bss(), BSS with different kHz units cannot be distinguished in the S1G Band. So "freq_offset" should also be included in the comparison. Signed-off-by: JUN-KYU SHIN Link: https://lore.kernel.org/r/20221111023301.6395-1-jk.shin@newratek.com Signed-off-by: Johannes Berg --- net/wireless/scan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/scan.c b/net/wireless/scan.c index e70302a30013..a39c93753213 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1289,7 +1289,8 @@ static int cmp_bss(struct cfg80211_bss *a, int i, r; if (a->channel != b->channel) - return b->channel->center_freq - a->channel->center_freq; + return (b->channel->center_freq * 1000 + b->channel->freq_offset) - + (a->channel->center_freq * 1000 + a->channel->freq_offset); a_ies = rcu_access_pointer(a->ies); if (!a_ies) -- cgit v1.2.3-70-g09d2 From 833a9fd28c9b7ccb39a334721379e992dc1c0c89 Mon Sep 17 00:00:00 2001 From: Chen Zhongjin Date: Wed, 9 Nov 2022 17:02:37 +0800 Subject: wifi: cfg80211: Fix not unregister reg_pdev when load_builtin_regdb_keys() fails In regulatory_init_db(), when it's going to return a error, reg_pdev should be unregistered. When load_builtin_regdb_keys() fails it doesn't do it and makes cfg80211 can't be reload with report: sysfs: cannot create duplicate filename '/devices/platform/regulatory.0' ... dump_stack_lvl+0x79/0x9b sysfs_warn_dup.cold+0x1c/0x29 sysfs_create_dir_ns+0x22d/0x290 kobject_add_internal+0x247/0x800 kobject_add+0x135/0x1b0 device_add+0x389/0x1be0 platform_device_add+0x28f/0x790 platform_device_register_full+0x376/0x4b0 regulatory_init+0x9a/0x4b2 [cfg80211] cfg80211_init+0x84/0x113 [cfg80211] ... Fixes: 90a53e4432b1 ("cfg80211: implement regdb signature checking") Signed-off-by: Chen Zhongjin Link: https://lore.kernel.org/r/20221109090237.214127-1-chenzhongjin@huawei.com Signed-off-by: Johannes Berg --- net/wireless/reg.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index c3d950d29432..4f3f31244e8b 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -4311,8 +4311,10 @@ static int __init regulatory_init_db(void) return -EINVAL; err = load_builtin_regdb_keys(); - if (err) + if (err) { + platform_device_unregister(reg_pdev); return err; + } /* We always try to get an update for the static regdomain */ err = regulatory_hint_core(cfg80211_world_regdom->alpha2); -- cgit v1.2.3-70-g09d2 From 9445096319206814e462b904915b645c4f2bf514 Mon Sep 17 00:00:00 2001 From: Alexander Wetzel Date: Mon, 7 Nov 2022 17:13:28 +0100 Subject: wifi: mac80211: Drop not needed check for NULL ieee80211_get_txq() can only be called with vif != NULL. Remove not needed NULL test in function. Signed-off-by: Alexander Wetzel Reported-by: kernel test robot Reported-by: Dan Carpenter Link: https://lore.kernel.org/r/20221107161328.2883-1-alexander@wetzel-home.de Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 165ac0711d71..3ed5ca4a2bae 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1343,7 +1343,7 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, return NULL; txq = sta->sta.txq[tid]; - } else if (vif) { + } else { txq = vif->txq; } -- cgit v1.2.3-70-g09d2 From b2ddde566de409a3dbf708fe89ecbe114dd14cc3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 2 Nov 2022 09:47:47 +0100 Subject: wifi: mac80211: remove unnecessary synchronize_net() The call to ieee80211_do_stop() right after will also do synchronize_rcu() to ensure the SDATA_STATE_RUNNING bit is cleared, so we don't need to synchronize_net() here. Change-Id: Id9f9ffcf195002013e5d9fde288877d219780864 Signed-off-by: Johannes Berg --- net/mac80211/iface.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index a5782927a93b..7b2843df3813 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1849,8 +1849,7 @@ static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata, ieee80211_stop_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_IFTYPE_CHANGE); - synchronize_net(); - + /* do_stop will synchronize_rcu() first thing */ ieee80211_do_stop(sdata, false); ieee80211_teardown_sdata(sdata); -- cgit v1.2.3-70-g09d2 From 61e41e5dfcc22e5e65b6537453fd2f03ac768b82 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 17 Oct 2022 09:03:48 +0200 Subject: wifi: cfg80211: use bss_from_pub() instead of container_of() There's no need to open-code container_of() when we have bss_from_pub(). Use it. Change-Id: I074723717909ba211a40e6499f0c36df0e2ba4be Signed-off-by: Johannes Berg --- net/wireless/scan.c | 41 +++++++++++------------------------------ 1 file changed, 11 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/wireless/scan.c b/net/wireless/scan.c index a39c93753213..8cf405776dde 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -158,9 +158,8 @@ static inline void bss_ref_put(struct cfg80211_registered_device *rdev, if (bss->pub.hidden_beacon_bss) { struct cfg80211_internal_bss *hbss; - hbss = container_of(bss->pub.hidden_beacon_bss, - struct cfg80211_internal_bss, - pub); + + hbss = bss_from_pub(bss->pub.hidden_beacon_bss); hbss->refcount--; if (hbss->refcount == 0) bss_free(hbss); @@ -169,9 +168,7 @@ static inline void bss_ref_put(struct cfg80211_registered_device *rdev, if (bss->pub.transmitted_bss) { struct cfg80211_internal_bss *tbss; - tbss = container_of(bss->pub.transmitted_bss, - struct cfg80211_internal_bss, - pub); + tbss = bss_from_pub(bss->pub.transmitted_bss); tbss->refcount--; if (tbss->refcount == 0) bss_free(tbss); @@ -1791,13 +1788,8 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, /* This must be before the call to bss_ref_get */ if (tmp->pub.transmitted_bss) { - struct cfg80211_internal_bss *pbss = - container_of(tmp->pub.transmitted_bss, - struct cfg80211_internal_bss, - pub); - new->pub.transmitted_bss = tmp->pub.transmitted_bss; - bss_ref_get(rdev, pbss); + bss_ref_get(rdev, bss_from_pub(tmp->pub.transmitted_bss)); } list_add_tail(&new->list, &rdev->bss_list); @@ -2570,15 +2562,12 @@ EXPORT_SYMBOL(cfg80211_inform_bss_frame_data); void cfg80211_ref_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); - struct cfg80211_internal_bss *bss; if (!pub) return; - bss = container_of(pub, struct cfg80211_internal_bss, pub); - spin_lock_bh(&rdev->bss_lock); - bss_ref_get(rdev, bss); + bss_ref_get(rdev, bss_from_pub(pub)); spin_unlock_bh(&rdev->bss_lock); } EXPORT_SYMBOL(cfg80211_ref_bss); @@ -2586,15 +2575,12 @@ EXPORT_SYMBOL(cfg80211_ref_bss); void cfg80211_put_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); - struct cfg80211_internal_bss *bss; if (!pub) return; - bss = container_of(pub, struct cfg80211_internal_bss, pub); - spin_lock_bh(&rdev->bss_lock); - bss_ref_put(rdev, bss); + bss_ref_put(rdev, bss_from_pub(pub)); spin_unlock_bh(&rdev->bss_lock); } EXPORT_SYMBOL(cfg80211_put_bss); @@ -2608,7 +2594,7 @@ void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) if (WARN_ON(!pub)) return; - bss = container_of(pub, struct cfg80211_internal_bss, pub); + bss = bss_from_pub(pub); spin_lock_bh(&rdev->bss_lock); if (list_empty(&bss->list)) @@ -2617,8 +2603,7 @@ void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) list_for_each_entry_safe(nontrans_bss, tmp, &pub->nontrans_list, nontrans_list) { - tmp1 = container_of(nontrans_bss, - struct cfg80211_internal_bss, pub); + tmp1 = bss_from_pub(nontrans_bss); if (__cfg80211_unlink_bss(rdev, tmp1)) rdev->bss_generation++; } @@ -2675,9 +2660,7 @@ void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev, /* use transmitting bss */ if (cbss->pub.transmitted_bss) - cbss = container_of(cbss->pub.transmitted_bss, - struct cfg80211_internal_bss, - pub); + cbss = bss_from_pub(cbss->pub.transmitted_bss); cbss->pub.channel = chan; @@ -2706,8 +2689,7 @@ void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev, list_for_each_entry_safe(nontrans_bss, tmp, &new->pub.nontrans_list, nontrans_list) { - bss = container_of(nontrans_bss, - struct cfg80211_internal_bss, pub); + bss = bss_from_pub(nontrans_bss); if (__cfg80211_unlink_bss(rdev, bss)) rdev->bss_generation++; } @@ -2724,8 +2706,7 @@ void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev, list_for_each_entry_safe(nontrans_bss, tmp, &cbss->pub.nontrans_list, nontrans_list) { - bss = container_of(nontrans_bss, - struct cfg80211_internal_bss, pub); + bss = bss_from_pub(nontrans_bss); bss->pub.channel = chan; rb_erase(&bss->rbn, &rdev->bss_tree); rb_insert_bss(rdev, bss); -- cgit v1.2.3-70-g09d2 From 8950b5988a9a2957b4c2c6f20143dfc2a0230a74 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 14 Oct 2022 18:56:11 +0200 Subject: wifi: mac80211: don't parse multi-BSSID in assoc resp It's not valid to have the multiple BSSID element in the association response (per 802.11 REVme D1.0), so don't try to parse it there, but only in the fallback beacon elements if needed. The other case that was parsing association requests was already changed in a previous commit. Change-Id: I659d2ef1253e079cc71c46a017044e116e31c024 Signed-off-by: Johannes Berg --- net/mac80211/mlme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index a804e0220ed7..0aee2392dd29 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3932,7 +3932,6 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, struct ieee80211_elems_parse_params parse_params = { .start = elem_start, .len = elem_len, - .bss = cbss, .link_id = link_id == assoc_data->assoc_link_id ? -1 : link_id, .from_ap = true, }; @@ -4017,6 +4016,7 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, parse_params.start = bss_ies->data; parse_params.len = bss_ies->len; + parse_params.bss = cbss; bss_elems = ieee802_11_parse_elems_full(&parse_params); if (!bss_elems) { ret = false; -- cgit v1.2.3-70-g09d2 From 209d70d34a7fd32e6fb6784f107913b0d18fe501 Mon Sep 17 00:00:00 2001 From: Kieran Frewen Date: Mon, 7 Nov 2022 11:16:02 +1300 Subject: wifi: mac80211: update TIM for S1G specification changes Updates to the TIM information element to match changes made in the IEEE Std 802.11ah-2020. Signed-off-by: Kieran Frewen Co-developed-by: Gilad Itzkovitch Signed-off-by: Gilad Itzkovitch Link: https://lore.kernel.org/r/20221106221602.25714-1-gilad.itzkovitch@morsemicro.com [use skb_put_data/skb_put_u8] Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 3ed5ca4a2bae..774e62860c45 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -4774,9 +4774,9 @@ static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, ps->dtim_count--; } - tim = pos = skb_put(skb, 6); + tim = pos = skb_put(skb, 5); *pos++ = WLAN_EID_TIM; - *pos++ = 4; + *pos++ = 3; *pos++ = ps->dtim_count; *pos++ = link_conf->dtim_period; @@ -4807,13 +4807,17 @@ static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, /* Bitmap control */ *pos++ = n1 | aid0; /* Part Virt Bitmap */ - skb_put(skb, n2 - n1); - memcpy(pos, ps->tim + n1, n2 - n1 + 1); + skb_put_data(skb, ps->tim + n1, n2 - n1 + 1); tim[1] = n2 - n1 + 4; } else { *pos++ = aid0; /* Bitmap control */ - *pos++ = 0; /* Part Virt Bitmap */ + + if (ieee80211_get_link_sband(link)->band != NL80211_BAND_S1GHZ) { + tim[1] = 4; + /* Part Virt Bitmap */ + skb_put_u8(skb, 0); + } } } -- cgit v1.2.3-70-g09d2 From 7d360f6061db01830adfdb1eaa3977b19db0c30b Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 10 Oct 2022 11:43:38 +0200 Subject: wifi: mac80211: add support for restricting netdev features per vif This can be used to selectively disable feature flags for checksum offload, scatter/gather or GSO by changing vif->netdev_features. Removing features from vif->netdev_features does not affect the netdev features themselves, but instead fixes up skbs in the tx path so that the offloads are not needed in the driver. Aside from making it easier to deal with vif type based hardware limitations, this also makes it possible to optimize performance on hardware without native GSO support by declaring GSO support in hw->netdev_features and removing it from vif->netdev_features. This allows mac80211 to handle GSO segmentation after the sta lookup, but before itxq enqueue, thus reducing the number of unnecessary sta lookups, as well as some other per-packet processing. Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20221010094338.78070-1-nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/fq_impl.h | 16 +-- include/net/mac80211.h | 5 + net/mac80211/iface.c | 1 + net/mac80211/tx.c | 283 ++++++++++++++++++++++++++++++++----------------- 4 files changed, 202 insertions(+), 103 deletions(-) (limited to 'net') diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h index 524b510f1c68..9467e33dfb36 100644 --- a/include/net/fq_impl.h +++ b/include/net/fq_impl.h @@ -200,6 +200,7 @@ static void fq_tin_enqueue(struct fq *fq, fq_skb_free_t free_func) { struct fq_flow *flow; + struct sk_buff *next; bool oom; lockdep_assert_held(&fq->lock); @@ -214,11 +215,15 @@ static void fq_tin_enqueue(struct fq *fq, } flow->tin = tin; - flow->backlog += skb->len; - tin->backlog_bytes += skb->len; - tin->backlog_packets++; - fq->memory_usage += skb->truesize; - fq->backlog++; + skb_list_walk_safe(skb, skb, next) { + skb_mark_not_on_list(skb); + flow->backlog += skb->len; + tin->backlog_bytes += skb->len; + tin->backlog_packets++; + fq->memory_usage += skb->truesize; + fq->backlog++; + __skb_queue_tail(&flow->queue, skb); + } if (list_empty(&flow->flowchain)) { flow->deficit = fq->quantum; @@ -226,7 +231,6 @@ static void fq_tin_enqueue(struct fq *fq, &tin->new_flows); } - __skb_queue_tail(&flow->queue, skb); oom = (fq->memory_usage > fq->memory_limit); while (fq->backlog > fq->limit || oom) { flow = fq_find_fattest_flow(fq); diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 721c450a9ccd..689da327ce2e 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1807,6 +1807,10 @@ struct ieee80211_vif_cfg { * @addr: address of this interface * @p2p: indicates whether this AP or STA interface is a p2p * interface, i.e. a GO or p2p-sta respectively + * @netdev_features: tx netdev features supported by the hardware for this + * vif. mac80211 initializes this to hw->netdev_features, and the driver + * can mask out specific tx features. mac80211 will handle software fixup + * for masked offloads (GSO, CSUM) * @driver_flags: flags/capabilities the driver has for this interface, * these need to be set (or cleared) when the interface is added * or, if supported by the driver, the interface type is changed @@ -1848,6 +1852,7 @@ struct ieee80211_vif { struct ieee80211_txq *txq; + netdev_features_t netdev_features; u32 driver_flags; u32 offload_flags; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 7b2843df3813..d49a5906a943 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -2178,6 +2178,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; ndev->hw_features |= ndev->features & MAC80211_SUPPORTED_FEATURES_TX; + sdata->vif.netdev_features = local->hw.netdev_features; netdev_set_default_ethtool_ops(ndev, &ieee80211_ethtool_ops); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 774e62860c45..2171cd1ca807 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1355,7 +1355,11 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb) { - IEEE80211_SKB_CB(skb)->control.enqueue_time = codel_get_time(); + struct sk_buff *next; + codel_time_t now = codel_get_time(); + + skb_list_walk_safe(skb, skb, next) + IEEE80211_SKB_CB(skb)->control.enqueue_time = now; } static u32 codel_skb_len_func(const struct sk_buff *skb) @@ -3578,55 +3582,79 @@ ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata, return TX_CONTINUE; } -static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, - struct ieee80211_fast_tx *fast_tx, - struct sk_buff *skb) +static netdev_features_t +ieee80211_sdata_netdev_features(struct ieee80211_sub_if_data *sdata) { - struct ieee80211_local *local = sdata->local; - u16 ethertype = (skb->data[12] << 8) | skb->data[13]; - int extra_head = fast_tx->hdr_len - (ETH_HLEN - 2); - int hw_headroom = sdata->local->hw.extra_tx_headroom; - struct ethhdr eth; - struct ieee80211_tx_info *info; - struct ieee80211_hdr *hdr = (void *)fast_tx->hdr; - struct ieee80211_tx_data tx; - ieee80211_tx_result r; - struct tid_ampdu_tx *tid_tx = NULL; - u8 tid = IEEE80211_NUM_TIDS; + if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN) + return sdata->vif.netdev_features; - /* control port protocol needs a lot of special handling */ - if (cpu_to_be16(ethertype) == sdata->control_port_protocol) - return false; + if (!sdata->bss) + return 0; - /* only RFC 1042 SNAP */ - if (ethertype < ETH_P_802_3_MIN) - return false; + sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); + return sdata->vif.netdev_features; +} - /* don't handle TX status request here either */ - if (skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) - return false; +static struct sk_buff * +ieee80211_tx_skb_fixup(struct sk_buff *skb, netdev_features_t features) +{ + if (skb_is_gso(skb)) { + struct sk_buff *segs; - if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { - tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; - tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); - if (tid_tx) { - if (!test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) - return false; - if (tid_tx->timeout) - tid_tx->last_tx = jiffies; - } + segs = skb_gso_segment(skb, features); + if (!segs) + return skb; + if (IS_ERR(segs)) + goto free; + + consume_skb(skb); + return segs; } - /* after this point (skb is modified) we cannot return false */ + if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) + goto free; + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + int ofs = skb_checksum_start_offset(skb); + + if (skb->encapsulation) + skb_set_inner_transport_header(skb, ofs); + else + skb_set_transport_header(skb, ofs); + + if (skb_csum_hwoffload_help(skb, features)) + goto free; + } + + skb_mark_not_on_list(skb); + return skb; + +free: + kfree_skb(skb); + return NULL; +} + +static void __ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct ieee80211_fast_tx *fast_tx, + struct sk_buff *skb, u8 tid, bool ampdu) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_hdr *hdr = (void *)fast_tx->hdr; + struct ieee80211_tx_info *info; + struct ieee80211_tx_data tx; + ieee80211_tx_result r; + int hw_headroom = sdata->local->hw.extra_tx_headroom; + int extra_head = fast_tx->hdr_len - (ETH_HLEN - 2); + struct ethhdr eth; skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) - return true; + return; if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) && ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb)) - return true; + return; /* will not be crypto-handled beyond what we do here, so use false * as the may-encrypt argument for the resize to not account for @@ -3635,10 +3663,8 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, if (unlikely(ieee80211_skb_resize(sdata, skb, max_t(int, extra_head + hw_headroom - skb_headroom(skb), 0), - ENCRYPT_NO))) { - kfree_skb(skb); - return true; - } + ENCRYPT_NO))) + goto free; memcpy(ð, skb->data, ETH_HLEN - 2); hdr = skb_push(skb, extra_head); @@ -3652,7 +3678,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, info->control.vif = &sdata->vif; info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT | IEEE80211_TX_CTL_DONTFRAG | - (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0); + (ampdu ? IEEE80211_TX_CTL_AMPDU : 0); info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT | u32_encode_bits(IEEE80211_LINK_UNSPECIFIED, IEEE80211_TX_CTRL_MLO_LINK); @@ -3676,16 +3702,14 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, tx.key = fast_tx->key; if (ieee80211_queue_skb(local, sdata, sta, skb)) - return true; + return; tx.skb = skb; r = ieee80211_xmit_fast_finish(sdata, sta, fast_tx->pn_offs, fast_tx->key, &tx); tx.skb = NULL; - if (r == TX_DROP) { - kfree_skb(skb); - return true; - } + if (r == TX_DROP) + goto free; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, @@ -3693,6 +3717,56 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, __skb_queue_tail(&tx.skbs, skb); ieee80211_tx_frags(local, &sdata->vif, sta, &tx.skbs, false); + return; + +free: + kfree_skb(skb); +} + +static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct ieee80211_fast_tx *fast_tx, + struct sk_buff *skb) +{ + u16 ethertype = (skb->data[12] << 8) | skb->data[13]; + struct ieee80211_hdr *hdr = (void *)fast_tx->hdr; + struct tid_ampdu_tx *tid_tx = NULL; + struct sk_buff *next; + u8 tid = IEEE80211_NUM_TIDS; + + /* control port protocol needs a lot of special handling */ + if (cpu_to_be16(ethertype) == sdata->control_port_protocol) + return false; + + /* only RFC 1042 SNAP */ + if (ethertype < ETH_P_802_3_MIN) + return false; + + /* don't handle TX status request here either */ + if (skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) + return false; + + if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { + tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; + tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); + if (tid_tx) { + if (!test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) + return false; + if (tid_tx->timeout) + tid_tx->last_tx = jiffies; + } + } + + /* after this point (skb is modified) we cannot return false */ + skb = ieee80211_tx_skb_fixup(skb, ieee80211_sdata_netdev_features(sdata)); + if (!skb) + return true; + + skb_list_walk_safe(skb, skb, next) { + skb_mark_not_on_list(skb); + __ieee80211_xmit_fast(sdata, sta, fast_tx, skb, tid, tid_tx); + } + return true; } @@ -4192,31 +4266,14 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, goto out; } - if (skb_is_gso(skb)) { - struct sk_buff *segs; - - segs = skb_gso_segment(skb, 0); - if (IS_ERR(segs)) { - goto out_free; - } else if (segs) { - consume_skb(skb); - skb = segs; - } - } else { - /* we cannot process non-linear frames on this path */ - if (skb_linearize(skb)) - goto out_free; - - /* the frame could be fragmented, software-encrypted, and other - * things so we cannot really handle checksum offload with it - - * fix it up in software before we handle anything else. - */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - skb_set_transport_header(skb, - skb_checksum_start_offset(skb)); - if (skb_checksum_help(skb)) - goto out_free; - } + /* the frame could be fragmented, software-encrypted, and other + * things so we cannot really handle checksum or GSO offload. + * fix it up in software before we handle anything else. + */ + skb = ieee80211_tx_skb_fixup(skb, 0); + if (!skb) { + len = 0; + goto out; } skb_list_walk_safe(skb, skb, next) { @@ -4434,9 +4491,11 @@ normal: return NETDEV_TX_OK; } -static bool ieee80211_tx_8023(struct ieee80211_sub_if_data *sdata, - struct sk_buff *skb, struct sta_info *sta, - bool txpending) + + +static bool __ieee80211_tx_8023(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, struct sta_info *sta, + bool txpending) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_control control = {}; @@ -4445,14 +4504,6 @@ static bool ieee80211_tx_8023(struct ieee80211_sub_if_data *sdata, unsigned long flags; int q = info->hw_queue; - if (sta) - sk_pacing_shift_update(skb->sk, local->hw.tx_sk_pacing_shift); - - ieee80211_tpt_led_trig_tx(local, skb->len); - - if (ieee80211_queue_skb(local, sdata, sta, skb)) - return true; - spin_lock_irqsave(&local->queue_stop_reason_lock, flags); if (local->queue_stop_reasons[q] || @@ -4479,6 +4530,26 @@ static bool ieee80211_tx_8023(struct ieee80211_sub_if_data *sdata, return true; } +static bool ieee80211_tx_8023(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, struct sta_info *sta, + bool txpending) +{ + struct ieee80211_local *local = sdata->local; + struct sk_buff *next; + bool ret = true; + + if (ieee80211_queue_skb(local, sdata, sta, skb)) + return true; + + skb_list_walk_safe(skb, skb, next) { + skb_mark_not_on_list(skb); + if (!__ieee80211_tx_8023(sdata, skb, sta, txpending)) + ret = false; + } + + return ret; +} + static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, struct net_device *dev, struct sta_info *sta, struct ieee80211_key *key, struct sk_buff *skb) @@ -4486,9 +4557,13 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_info *info; struct ieee80211_local *local = sdata->local; struct tid_ampdu_tx *tid_tx; + struct sk_buff *seg, *next; + unsigned int skbs = 0, len = 0; + u16 queue; u8 tid; - skb_set_queue_mapping(skb, ieee80211_select_queue(sdata, sta, skb)); + queue = ieee80211_select_queue(sdata, sta, skb); + skb_set_queue_mapping(skb, queue); if (unlikely(test_bit(SCAN_SW_SCANNING, &local->scanning)) && test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state)) @@ -4498,9 +4573,6 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, if (unlikely(!skb)) return; - info = IEEE80211_SKB_CB(skb); - memset(info, 0, sizeof(*info)); - ieee80211_aggr_check(sdata, sta, skb); tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; @@ -4514,22 +4586,20 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, return; } - info->flags |= IEEE80211_TX_CTL_AMPDU; if (tid_tx->timeout) tid_tx->last_tx = jiffies; } - if (unlikely(skb->sk && - skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) - info->ack_frame_id = ieee80211_store_ack_skb(local, skb, - &info->flags, NULL); - - info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; + skb = ieee80211_tx_skb_fixup(skb, ieee80211_sdata_netdev_features(sdata)); + if (!skb) + return; - dev_sw_netstats_tx_add(dev, 1, skb->len); + info = IEEE80211_SKB_CB(skb); + memset(info, 0, sizeof(*info)); + if (tid_tx) + info->flags |= IEEE80211_TX_CTL_AMPDU; - sta->deflink.tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len; - sta->deflink.tx_stats.packets[skb_get_queue_mapping(skb)]++; + info->hw_queue = sdata->vif.hw_queue[queue]; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, @@ -4541,6 +4611,24 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, if (key) info->control.hw_key = &key->conf; + skb_list_walk_safe(skb, seg, next) { + skbs++; + len += seg->len; + if (seg != skb) + memcpy(IEEE80211_SKB_CB(seg), info, sizeof(*info)); + } + + if (unlikely(skb->sk && + skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) + info->ack_frame_id = ieee80211_store_ack_skb(local, skb, + &info->flags, NULL); + + dev_sw_netstats_tx_add(dev, skbs, len); + sta->deflink.tx_stats.packets[queue] += skbs; + sta->deflink.tx_stats.bytes[queue] += len; + + ieee80211_tpt_led_trig_tx(local, len); + ieee80211_tx_8023(sdata, skb, sta, false); return; @@ -4582,6 +4670,7 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, key->conf.cipher == WLAN_CIPHER_SUITE_TKIP)) goto skip_offload; + sk_pacing_shift_update(skb->sk, sdata->local->hw.tx_sk_pacing_shift); ieee80211_8023_xmit(sdata, dev, sta, key, skb); goto out; -- cgit v1.2.3-70-g09d2 From 94b9b9de05b62ac54d8766caa9865fb4d82cc47e Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Thu, 1 Dec 2022 14:57:30 +0100 Subject: wifi: mac80211: fix and simplify unencrypted drop check for mesh ieee80211_drop_unencrypted is called from ieee80211_rx_h_mesh_fwding and ieee80211_frame_allowed. Since ieee80211_rx_h_mesh_fwding can forward packets for other mesh nodes and is called earlier, it needs to check the decryptions status and if the packet is using the control protocol on its own, instead of deferring to the later call from ieee80211_frame_allowed. Because of that, ieee80211_drop_unencrypted has a mesh specific check that skips over the mesh header in order to check the payload protocol. This code is invalid when called from ieee80211_frame_allowed, since that happens after the 802.11->802.3 conversion. Fix this by moving the mesh specific check directly into ieee80211_rx_h_mesh_fwding. Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20221201135730.19723-1-nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 38 ++++++++++---------------------------- 1 file changed, 10 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index c28c6fbf786e..7e3ab6e1b28f 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2403,7 +2403,6 @@ static int ieee80211_802_1x_port_control(struct ieee80211_rx_data *rx) static int ieee80211_drop_unencrypted(struct ieee80211_rx_data *rx, __le16 fc) { - struct ieee80211_hdr *hdr = (void *)rx->skb->data; struct sk_buff *skb = rx->skb; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); @@ -2414,31 +2413,6 @@ static int ieee80211_drop_unencrypted(struct ieee80211_rx_data *rx, __le16 fc) if (status->flag & RX_FLAG_DECRYPTED) return 0; - /* check mesh EAPOL frames first */ - if (unlikely(rx->sta && ieee80211_vif_is_mesh(&rx->sdata->vif) && - ieee80211_is_data(fc))) { - struct ieee80211s_hdr *mesh_hdr; - u16 hdr_len = ieee80211_hdrlen(fc); - u16 ethertype_offset; - __be16 ethertype; - - if (!ether_addr_equal(hdr->addr1, rx->sdata->vif.addr)) - goto drop_check; - - /* make sure fixed part of mesh header is there, also checks skb len */ - if (!pskb_may_pull(rx->skb, hdr_len + 6)) - goto drop_check; - - mesh_hdr = (struct ieee80211s_hdr *)(skb->data + hdr_len); - ethertype_offset = hdr_len + ieee80211_get_mesh_hdrlen(mesh_hdr) + - sizeof(rfc1042_header); - - if (skb_copy_bits(rx->skb, ethertype_offset, ðertype, 2) == 0 && - ethertype == rx->sdata->control_port_protocol) - return 0; - } - -drop_check: /* Drop unencrypted frames if key is set. */ if (unlikely(!ieee80211_has_protected(fc) && !ieee80211_is_any_nullfunc(fc) && @@ -2892,8 +2866,16 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) hdr = (struct ieee80211_hdr *) skb->data; mesh_hdr = (struct ieee80211s_hdr *) (skb->data + hdrlen); - if (ieee80211_drop_unencrypted(rx, hdr->frame_control)) - return RX_DROP_MONITOR; + if (ieee80211_drop_unencrypted(rx, hdr->frame_control)) { + int offset = hdrlen + ieee80211_get_mesh_hdrlen(mesh_hdr) + + sizeof(rfc1042_header); + __be16 ethertype; + + if (!ether_addr_equal(hdr->addr1, rx->sdata->vif.addr) || + skb_copy_bits(rx->skb, offset, ðertype, 2) != 0 || + ethertype != rx->sdata->control_port_protocol) + return RX_DROP_MONITOR; + } /* frame is in RMC, don't forward */ if (ieee80211_is_data(hdr->frame_control) && -- cgit v1.2.3-70-g09d2 From f62c7517ffa1378cc60cb5646567fa98e4b388cd Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Wed, 23 Nov 2022 17:38:56 +0000 Subject: net/tcp: Separate tcp_md5sig_info allocation into tcp_md5sig_info_add() Add a helper to allocate tcp_md5sig_info, that will help later to do/allocate things when info allocated, once per socket. Signed-off-by: Dmitry Safonov Reviewed-by: Eric Dumazet Acked-by: Jakub Kicinski Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ipv4.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1215fa4c1b9f..c72e53835397 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1161,6 +1161,24 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, } EXPORT_SYMBOL(tcp_v4_md5_lookup); +static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_md5sig_info *md5sig; + + if (rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) + return 0; + + md5sig = kmalloc(sizeof(*md5sig), gfp); + if (!md5sig) + return -ENOMEM; + + sk_gso_disable(sk); + INIT_HLIST_HEAD(&md5sig->head); + rcu_assign_pointer(tp->md5sig_info, md5sig); + return 0; +} + /* This can be called on a newly created socket, from other files */ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags, @@ -1191,17 +1209,11 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, return 0; } + if (tcp_md5sig_info_add(sk, gfp)) + return -ENOMEM; + md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); - if (!md5sig) { - md5sig = kmalloc(sizeof(*md5sig), gfp); - if (!md5sig) - return -ENOMEM; - - sk_gso_disable(sk); - INIT_HLIST_HEAD(&md5sig->head); - rcu_assign_pointer(tp->md5sig_info, md5sig); - } key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO); if (!key) -- cgit v1.2.3-70-g09d2 From 459837b522f7dff3b6681f534d8fff4eca19b7d1 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Wed, 23 Nov 2022 17:38:57 +0000 Subject: net/tcp: Disable TCP-MD5 static key on tcp_md5sig_info destruction To do that, separate two scenarios: - where it's the first MD5 key on the system, which means that enabling of the static key may need to sleep; - copying of an existing key from a listening socket to the request socket upon receiving a signed TCP segment, where static key was already enabled (when the key was added to the listening socket). Now the life-time of the static branch for TCP-MD5 is until: - last tcp_md5sig_info is destroyed - last socket in time-wait state with MD5 key is closed. Which means that after all sockets with TCP-MD5 keys are gone, the system gets back the performance of disabled md5-key static branch. While at here, provide static_key_fast_inc() helper that does ref counter increment in atomic fashion (without grabbing cpus_read_lock() on CONFIG_JUMP_LABEL=y). This is needed to add a new user for a static_key when the caller controls the lifetime of another user. Signed-off-by: Dmitry Safonov Acked-by: Jakub Kicinski Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 10 +++++-- net/ipv4/tcp.c | 5 +--- net/ipv4/tcp_ipv4.c | 71 +++++++++++++++++++++++++++++++++++++++--------- net/ipv4/tcp_minisocks.c | 16 ++++++++--- net/ipv4/tcp_output.c | 4 +-- net/ipv6/tcp_ipv6.c | 10 +++---- 6 files changed, 84 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 6b814e788f00..f925377066fe 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1675,7 +1675,11 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sock *sk, const struct sk_buff *skb); int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags, - const u8 *newkey, u8 newkeylen, gfp_t gfp); + const u8 *newkey, u8 newkeylen); +int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, + int family, u8 prefixlen, int l3index, + struct tcp_md5sig_key *key); + int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags); struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, @@ -1683,7 +1687,7 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, #ifdef CONFIG_TCP_MD5SIG #include -extern struct static_key_false tcp_md5_needed; +extern struct static_key_false_deferred tcp_md5_needed; struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family); @@ -1691,7 +1695,7 @@ static inline struct tcp_md5sig_key * tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family) { - if (!static_branch_unlikely(&tcp_md5_needed)) + if (!static_branch_unlikely(&tcp_md5_needed.key)) return NULL; return __tcp_md5_do_lookup(sk, l3index, addr, family); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 24602a5184b0..001947136b0a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4464,11 +4464,8 @@ bool tcp_alloc_md5sig_pool(void) if (unlikely(!READ_ONCE(tcp_md5sig_pool_populated))) { mutex_lock(&tcp_md5sig_mutex); - if (!tcp_md5sig_pool_populated) { + if (!tcp_md5sig_pool_populated) __tcp_alloc_md5sig_pool(); - if (tcp_md5sig_pool_populated) - static_branch_inc(&tcp_md5_needed); - } mutex_unlock(&tcp_md5sig_mutex); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c72e53835397..5d83a332f1dd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1053,7 +1053,7 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) * We need to maintain these in the sk structure. */ -DEFINE_STATIC_KEY_FALSE(tcp_md5_needed); +DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ); EXPORT_SYMBOL(tcp_md5_needed); static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) @@ -1166,9 +1166,6 @@ static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_info *md5sig; - if (rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) - return 0; - md5sig = kmalloc(sizeof(*md5sig), gfp); if (!md5sig) return -ENOMEM; @@ -1180,9 +1177,9 @@ static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) } /* This can be called on a newly created socket, from other files */ -int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, - int family, u8 prefixlen, int l3index, u8 flags, - const u8 *newkey, u8 newkeylen, gfp_t gfp) +static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, + int family, u8 prefixlen, int l3index, u8 flags, + const u8 *newkey, u8 newkeylen, gfp_t gfp) { /* Add Key to the list */ struct tcp_md5sig_key *key; @@ -1209,9 +1206,6 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, return 0; } - if (tcp_md5sig_info_add(sk, gfp)) - return -ENOMEM; - md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); @@ -1235,8 +1229,59 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, hlist_add_head_rcu(&key->node, &md5sig->head); return 0; } + +int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, + int family, u8 prefixlen, int l3index, u8 flags, + const u8 *newkey, u8 newkeylen) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { + if (tcp_md5sig_info_add(sk, GFP_KERNEL)) + return -ENOMEM; + + if (!static_branch_inc(&tcp_md5_needed.key)) { + struct tcp_md5sig_info *md5sig; + + md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); + rcu_assign_pointer(tp->md5sig_info, NULL); + kfree_rcu(md5sig); + return -EUSERS; + } + } + + return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags, + newkey, newkeylen, GFP_KERNEL); +} EXPORT_SYMBOL(tcp_md5_do_add); +int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, + int family, u8 prefixlen, int l3index, + struct tcp_md5sig_key *key) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { + if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) + return -ENOMEM; + + if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) { + struct tcp_md5sig_info *md5sig; + + md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); + net_warn_ratelimited("Too many TCP-MD5 keys in the system\n"); + rcu_assign_pointer(tp->md5sig_info, NULL); + kfree_rcu(md5sig); + return -EUSERS; + } + } + + return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, + key->flags, key->key, key->keylen, + sk_gfp_mask(sk, GFP_ATOMIC)); +} +EXPORT_SYMBOL(tcp_md5_key_copy); + int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags) { @@ -1323,7 +1368,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, return -EINVAL; return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, - cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); + cmd.tcpm_key, cmd.tcpm_keylen); } static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, @@ -1580,8 +1625,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, * memory, then we end up not copying the key * across. Shucks. */ - tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags, - key->key, key->keylen, GFP_ATOMIC); + tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key); sk_gso_disable(newsk); } #endif @@ -2273,6 +2317,7 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_clear_md5_list(sk); kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); tp->md5sig_info = NULL; + static_branch_slow_dec_deferred(&tcp_md5_needed); } #endif diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index c375f603a16c..6908812d50d3 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -291,13 +291,19 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) */ do { tcptw->tw_md5_key = NULL; - if (static_branch_unlikely(&tcp_md5_needed)) { + if (static_branch_unlikely(&tcp_md5_needed.key)) { struct tcp_md5sig_key *key; key = tp->af_specific->md5_lookup(sk, sk); if (key) { tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); - BUG_ON(tcptw->tw_md5_key && !tcp_alloc_md5sig_pool()); + if (!tcptw->tw_md5_key) + break; + BUG_ON(!tcp_alloc_md5sig_pool()); + if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) { + kfree(tcptw->tw_md5_key); + tcptw->tw_md5_key = NULL; + } } } } while (0); @@ -337,11 +343,13 @@ EXPORT_SYMBOL(tcp_time_wait); void tcp_twsk_destructor(struct sock *sk) { #ifdef CONFIG_TCP_MD5SIG - if (static_branch_unlikely(&tcp_md5_needed)) { + if (static_branch_unlikely(&tcp_md5_needed.key)) { struct tcp_timewait_sock *twsk = tcp_twsk(sk); - if (twsk->tw_md5_key) + if (twsk->tw_md5_key) { kfree_rcu(twsk->tw_md5_key, rcu); + static_branch_slow_dec_deferred(&tcp_md5_needed); + } } #endif } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 894410dc9293..71d01cf3c13e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -766,7 +766,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, *md5 = NULL; #ifdef CONFIG_TCP_MD5SIG - if (static_branch_unlikely(&tcp_md5_needed) && + if (static_branch_unlikely(&tcp_md5_needed.key) && rcu_access_pointer(tp->md5sig_info)) { *md5 = tp->af_specific->md5_lookup(sk, sk); if (*md5) { @@ -922,7 +922,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb *md5 = NULL; #ifdef CONFIG_TCP_MD5SIG - if (static_branch_unlikely(&tcp_md5_needed) && + if (static_branch_unlikely(&tcp_md5_needed.key) && rcu_access_pointer(tp->md5sig_info)) { *md5 = tp->af_specific->md5_lookup(sk, sk); if (*md5) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f52b6f271a24..80df917ced5f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -665,12 +665,11 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, if (ipv6_addr_v4mapped(&sin6->sin6_addr)) return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], AF_INET, prefixlen, l3index, flags, - cmd.tcpm_key, cmd.tcpm_keylen, - GFP_KERNEL); + cmd.tcpm_key, cmd.tcpm_keylen); return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr, AF_INET6, prefixlen, l3index, flags, - cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); + cmd.tcpm_key, cmd.tcpm_keylen); } static int tcp_v6_md5_hash_headers(struct tcp_md5sig_pool *hp, @@ -1370,9 +1369,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * * memory, then we end up not copying the key * across. Shucks. */ - tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newsk->sk_v6_daddr, - AF_INET6, 128, l3index, key->flags, key->key, key->keylen, - sk_gfp_mask(sk, GFP_ATOMIC)); + tcp_md5_key_copy(newsk, (union tcp_md5_addr *)&newsk->sk_v6_daddr, + AF_INET6, 128, l3index, key); } #endif -- cgit v1.2.3-70-g09d2 From b389d1affc2cc2dc8686cdab303a30b2ad3a81d4 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Wed, 23 Nov 2022 17:38:58 +0000 Subject: net/tcp: Do cleanup on tcp_md5_key_copy() failure If the kernel was short on (atomic) memory and failed to allocate it - don't proceed to creation of request socket. Otherwise the socket would be unsigned and userspace likely doesn't expect that the TCP is not MD5-signed anymore. Signed-off-by: Dmitry Safonov Acked-by: Jakub Kicinski Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ipv4.c | 9 ++------- net/ipv6/tcp_ipv6.c | 15 ++++++++------- 2 files changed, 10 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5d83a332f1dd..7fae586405cf 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1619,13 +1619,8 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, addr = (union tcp_md5_addr *)&newinet->inet_daddr; key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); if (key) { - /* - * We're using one, so create a matching key - * on the newsk structure. If we fail to get - * memory, then we end up not copying the key - * across. Shucks. - */ - tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key); + if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key)) + goto put_and_exit; sk_gso_disable(newsk); } #endif diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 80df917ced5f..11b736a76bd7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1364,13 +1364,14 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * /* Copy over the MD5 key from the original socket */ key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr, l3index); if (key) { - /* We're using one, so create a matching key - * on the newsk structure. If we fail to get - * memory, then we end up not copying the key - * across. Shucks. - */ - tcp_md5_key_copy(newsk, (union tcp_md5_addr *)&newsk->sk_v6_daddr, - AF_INET6, 128, l3index, key); + const union tcp_md5_addr *addr; + + addr = (union tcp_md5_addr *)&newsk->sk_v6_daddr; + if (tcp_md5_key_copy(newsk, addr, AF_INET6, 128, l3index, key)) { + inet_csk_prepare_forced_close(newsk); + tcp_done(newsk); + goto out; + } } #endif -- cgit v1.2.3-70-g09d2 From c5b8b515a211377e78bb7807fe3e6e7212626545 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Wed, 23 Nov 2022 17:38:59 +0000 Subject: net/tcp: Separate initialization of twsk Convert BUG_ON() to WARN_ON_ONCE() and warn as well for unlikely static key int overflow error-path. Signed-off-by: Dmitry Safonov Acked-by: Jakub Kicinski Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_minisocks.c | 61 +++++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 6908812d50d3..e002f2e1d4f2 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -240,6 +240,40 @@ kill: } EXPORT_SYMBOL(tcp_timewait_state_process); +static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw) +{ +#ifdef CONFIG_TCP_MD5SIG + const struct tcp_sock *tp = tcp_sk(sk); + struct tcp_md5sig_key *key; + + /* + * The timewait bucket does not have the key DB from the + * sock structure. We just make a quick copy of the + * md5 key being used (if indeed we are using one) + * so the timewait ack generating code has the key. + */ + tcptw->tw_md5_key = NULL; + if (!static_branch_unlikely(&tcp_md5_needed.key)) + return; + + key = tp->af_specific->md5_lookup(sk, sk); + if (key) { + tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); + if (!tcptw->tw_md5_key) + return; + if (!tcp_alloc_md5sig_pool()) + goto out_free; + if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) + goto out_free; + } + return; +out_free: + WARN_ON_ONCE(1); + kfree(tcptw->tw_md5_key); + tcptw->tw_md5_key = NULL; +#endif +} + /* * Move a socket to time-wait or dead fin-wait-2 state. */ @@ -282,32 +316,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) } #endif -#ifdef CONFIG_TCP_MD5SIG - /* - * The timewait bucket does not have the key DB from the - * sock structure. We just make a quick copy of the - * md5 key being used (if indeed we are using one) - * so the timewait ack generating code has the key. - */ - do { - tcptw->tw_md5_key = NULL; - if (static_branch_unlikely(&tcp_md5_needed.key)) { - struct tcp_md5sig_key *key; - - key = tp->af_specific->md5_lookup(sk, sk); - if (key) { - tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); - if (!tcptw->tw_md5_key) - break; - BUG_ON(!tcp_alloc_md5sig_pool()); - if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) { - kfree(tcptw->tw_md5_key); - tcptw->tw_md5_key = NULL; - } - } - } - } while (0); -#endif + tcp_time_wait_init(sk, tcptw); /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) -- cgit v1.2.3-70-g09d2 From f8c9dfbd875b17fee59c7f1aa35a4944d4e6d810 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Wed, 30 Nov 2022 15:06:28 +0100 Subject: mptcp: add pm listener events This patch adds two new MPTCP netlink event types for PM listening socket create and close, named MPTCP_EVENT_LISTENER_CREATED and MPTCP_EVENT_LISTENER_CLOSED. Add a new function mptcp_event_pm_listener() to push the new events with family, port and addr to userspace. Invoke mptcp_event_pm_listener() with MPTCP_EVENT_LISTENER_CREATED in mptcp_listen() and mptcp_pm_nl_create_listen_socket(), invoke it with MPTCP_EVENT_LISTENER_CLOSED in __mptcp_close_ssk(). Signed-off-by: Geliang Tang Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: Jakub Kicinski --- include/uapi/linux/mptcp.h | 9 ++++++++ net/mptcp/pm_netlink.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++ net/mptcp/protocol.c | 3 +++ net/mptcp/protocol.h | 2 ++ 4 files changed, 71 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h index dfe19bf13f4c..32af2d278cb4 100644 --- a/include/uapi/linux/mptcp.h +++ b/include/uapi/linux/mptcp.h @@ -160,6 +160,12 @@ struct mptcp_info { * daddr4 | daddr6, sport, dport, backup, if_idx * [, error] * The priority of a subflow has changed. 'error' should not be set. + * + * MPTCP_EVENT_LISTENER_CREATED: family, sport, saddr4 | saddr6 + * A new PM listener is created. + * + * MPTCP_EVENT_LISTENER_CLOSED: family, sport, saddr4 | saddr6 + * A PM listener is closed. */ enum mptcp_event_type { MPTCP_EVENT_UNSPEC = 0, @@ -174,6 +180,9 @@ enum mptcp_event_type { MPTCP_EVENT_SUB_CLOSED = 11, MPTCP_EVENT_SUB_PRIORITY = 13, + + MPTCP_EVENT_LISTENER_CREATED = 15, + MPTCP_EVENT_LISTENER_CLOSED = 16, }; enum mptcp_event_attr { diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index d66fbd558263..eef69d0e44ec 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1029,6 +1029,8 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, if (err) return err; + mptcp_event_pm_listener(ssock->sk, MPTCP_EVENT_LISTENER_CREATED); + return 0; } @@ -2152,6 +2154,58 @@ nla_put_failure: kfree_skb(skb); } +void mptcp_event_pm_listener(const struct sock *ssk, + enum mptcp_event_type event) +{ + const struct inet_sock *issk = inet_sk(ssk); + struct net *net = sock_net(ssk); + struct nlmsghdr *nlh; + struct sk_buff *skb; + + if (!genl_has_listeners(&mptcp_genl_family, net, MPTCP_PM_EV_GRP_OFFSET)) + return; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return; + + nlh = genlmsg_put(skb, 0, 0, &mptcp_genl_family, 0, event); + if (!nlh) + goto nla_put_failure; + + if (nla_put_u16(skb, MPTCP_ATTR_FAMILY, ssk->sk_family)) + goto nla_put_failure; + + if (nla_put_be16(skb, MPTCP_ATTR_SPORT, issk->inet_sport)) + goto nla_put_failure; + + switch (ssk->sk_family) { + case AF_INET: + if (nla_put_in_addr(skb, MPTCP_ATTR_SADDR4, issk->inet_saddr)) + goto nla_put_failure; + break; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + case AF_INET6: { + const struct ipv6_pinfo *np = inet6_sk(ssk); + + if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr)) + goto nla_put_failure; + break; + } +#endif + default: + WARN_ON_ONCE(1); + goto nla_put_failure; + } + + genlmsg_end(skb, nlh); + mptcp_nl_mcast_send(net, skb, GFP_KERNEL); + return; + +nla_put_failure: + kfree_skb(skb); +} + void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp) { @@ -2197,6 +2251,9 @@ void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, if (mptcp_event_sub_closed(skb, msk, ssk) < 0) goto nla_put_failure; break; + case MPTCP_EVENT_LISTENER_CREATED: + case MPTCP_EVENT_LISTENER_CLOSED: + break; } genlmsg_end(skb, nlh); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index b0d387be500a..f6f93957275b 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2355,6 +2355,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, tcp_set_state(ssk, TCP_CLOSE); mptcp_subflow_queue_clean(ssk); inet_csk_listen_stop(ssk); + mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED); } __tcp_close(ssk, 0); @@ -3647,6 +3648,8 @@ static int mptcp_listen(struct socket *sock, int backlog) if (!err) mptcp_copy_inaddrs(sock->sk, ssock->sk); + mptcp_event_pm_listener(ssock->sk, MPTCP_EVENT_LISTENER_CREATED); + unlock: release_sock(sock->sk); return err; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 8b4379a2cd85..955fb3d88eb3 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -839,6 +839,8 @@ void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); void mptcp_event_addr_announced(const struct sock *ssk, const struct mptcp_addr_info *info); void mptcp_event_addr_removed(const struct mptcp_sock *msk, u8 id); +void mptcp_event_pm_listener(const struct sock *ssk, + enum mptcp_event_type event); bool mptcp_userspace_pm_active(const struct mptcp_sock *msk); void mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, -- cgit v1.2.3-70-g09d2 From 7d802c8098c50fb7dcf5dfcb6466482e1f2b15e4 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 30 Nov 2022 18:04:31 -0500 Subject: sctp: delete free member from struct sctp_sched_ops After commit 9ed7bfc79542 ("sctp: fix memory leak in sctp_stream_outq_migrate()"), sctp_sched_set_sched() is the only place calling sched->free(), and it can actually be replaced by sched->free_sid() on each stream, and yet there's already a loop to traverse all streams in sctp_sched_set_sched(). This patch adds a function sctp_sched_free_sched() where it calls sched->free_sid() for each stream to replace sched->free() calls in sctp_sched_set_sched() and then deletes the unused free member from struct sctp_sched_ops. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Link: https://lore.kernel.org/r/e10aac150aca2686cb0bd0570299ec716da5a5c0.1669849471.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- include/net/sctp/stream_sched.h | 2 -- net/sctp/stream_sched.c | 38 ++++++++++++++++++++------------------ net/sctp/stream_sched_prio.c | 27 --------------------------- net/sctp/stream_sched_rr.c | 6 ------ 4 files changed, 20 insertions(+), 53 deletions(-) (limited to 'net') diff --git a/include/net/sctp/stream_sched.h b/include/net/sctp/stream_sched.h index 65058faea4db..fa00dc20a0d7 100644 --- a/include/net/sctp/stream_sched.h +++ b/include/net/sctp/stream_sched.h @@ -28,8 +28,6 @@ struct sctp_sched_ops { int (*init_sid)(struct sctp_stream *stream, __u16 sid, gfp_t gfp); /* free a stream */ void (*free_sid)(struct sctp_stream *stream, __u16 sid); - /* Frees the entire thing */ - void (*free)(struct sctp_stream *stream); /* Enqueue a chunk */ void (*enqueue)(struct sctp_outq *q, struct sctp_datamsg *msg); diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c index 7c8f9d89e16a..330067002deb 100644 --- a/net/sctp/stream_sched.c +++ b/net/sctp/stream_sched.c @@ -50,10 +50,6 @@ static void sctp_sched_fcfs_free_sid(struct sctp_stream *stream, __u16 sid) { } -static void sctp_sched_fcfs_free(struct sctp_stream *stream) -{ -} - static void sctp_sched_fcfs_enqueue(struct sctp_outq *q, struct sctp_datamsg *msg) { @@ -101,7 +97,6 @@ static struct sctp_sched_ops sctp_sched_fcfs = { .init = sctp_sched_fcfs_init, .init_sid = sctp_sched_fcfs_init_sid, .free_sid = sctp_sched_fcfs_free_sid, - .free = sctp_sched_fcfs_free, .enqueue = sctp_sched_fcfs_enqueue, .dequeue = sctp_sched_fcfs_dequeue, .dequeue_done = sctp_sched_fcfs_dequeue_done, @@ -131,6 +126,23 @@ void sctp_sched_ops_init(void) sctp_sched_ops_rr_init(); } +static void sctp_sched_free_sched(struct sctp_stream *stream) +{ + struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + struct sctp_stream_out_ext *soute; + int i; + + sched->unsched_all(stream); + for (i = 0; i < stream->outcnt; i++) { + soute = SCTP_SO(stream, i)->ext; + if (!soute) + continue; + sched->free_sid(stream, i); + /* Give the next scheduler a clean slate. */ + memset_after(soute, 0, outq); + } +} + int sctp_sched_set_sched(struct sctp_association *asoc, enum sctp_sched_type sched) { @@ -146,18 +158,8 @@ int sctp_sched_set_sched(struct sctp_association *asoc, if (sched > SCTP_SS_MAX) return -EINVAL; - if (old) { - old->free(&asoc->stream); - - /* Give the next scheduler a clean slate. */ - for (i = 0; i < asoc->stream.outcnt; i++) { - struct sctp_stream_out_ext *ext = SCTP_SO(&asoc->stream, i)->ext; - - if (!ext) - continue; - memset_after(ext, 0, outq); - } - } + if (old) + sctp_sched_free_sched(&asoc->stream); asoc->outqueue.sched = n; n->init(&asoc->stream); @@ -181,7 +183,7 @@ int sctp_sched_set_sched(struct sctp_association *asoc, return ret; err: - n->free(&asoc->stream); + sctp_sched_free_sched(&asoc->stream); asoc->outqueue.sched = &sctp_sched_fcfs; /* Always safe */ return ret; diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c index 4fc9f2923ed1..42d4800f263d 100644 --- a/net/sctp/stream_sched_prio.c +++ b/net/sctp/stream_sched_prio.c @@ -222,32 +222,6 @@ static void sctp_sched_prio_free_sid(struct sctp_stream *stream, __u16 sid) kfree(prio); } -static void sctp_sched_prio_free(struct sctp_stream *stream) -{ - struct sctp_stream_priorities *prio, *n; - LIST_HEAD(list); - int i; - - /* As we don't keep a list of priorities, to avoid multiple - * frees we have to do it in 3 steps: - * 1. unsched everyone, so the lists are free to use in 2. - * 2. build the list of the priorities - * 3. free the list - */ - sctp_sched_prio_unsched_all(stream); - for (i = 0; i < stream->outcnt; i++) { - if (!SCTP_SO(stream, i)->ext) - continue; - prio = SCTP_SO(stream, i)->ext->prio_head; - if (prio && list_empty(&prio->prio_sched)) - list_add(&prio->prio_sched, &list); - } - list_for_each_entry_safe(prio, n, &list, prio_sched) { - list_del_init(&prio->prio_sched); - kfree(prio); - } -} - static void sctp_sched_prio_enqueue(struct sctp_outq *q, struct sctp_datamsg *msg) { @@ -342,7 +316,6 @@ static struct sctp_sched_ops sctp_sched_prio = { .init = sctp_sched_prio_init, .init_sid = sctp_sched_prio_init_sid, .free_sid = sctp_sched_prio_free_sid, - .free = sctp_sched_prio_free, .enqueue = sctp_sched_prio_enqueue, .dequeue = sctp_sched_prio_dequeue, .dequeue_done = sctp_sched_prio_dequeue_done, diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c index cc444fe0d67c..1f235e7f643a 100644 --- a/net/sctp/stream_sched_rr.c +++ b/net/sctp/stream_sched_rr.c @@ -94,11 +94,6 @@ static void sctp_sched_rr_free_sid(struct sctp_stream *stream, __u16 sid) { } -static void sctp_sched_rr_free(struct sctp_stream *stream) -{ - sctp_sched_rr_unsched_all(stream); -} - static void sctp_sched_rr_enqueue(struct sctp_outq *q, struct sctp_datamsg *msg) { @@ -182,7 +177,6 @@ static struct sctp_sched_ops sctp_sched_rr = { .init = sctp_sched_rr_init, .init_sid = sctp_sched_rr_init_sid, .free_sid = sctp_sched_rr_free_sid, - .free = sctp_sched_rr_free, .enqueue = sctp_sched_rr_enqueue, .dequeue = sctp_sched_rr_dequeue, .dequeue_done = sctp_sched_rr_dequeue_done, -- cgit v1.2.3-70-g09d2 From e012764cebf6e33097f6833ff15a936fbe7b846c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 29 Nov 2022 17:48:08 +0100 Subject: Revert "net: hsr: use hlist_head instead of list_head for mac addresses" The hlist optimisation (which not only uses hlist_head instead of list_head but also splits hsr_priv::node_db into an array of 256 slots) does not consider the "node merge": Upon starting the hsr network (with three nodes) a packet that is sent from node1 to node3 will also be sent from node1 to node2 and then forwarded to node3. As a result node3 will receive 2 packets because it is not able to filter out the duplicate. Each packet received will create a new struct hsr_node with macaddress_A only set the MAC address it received from (the two MAC addesses from node1). At some point (early in the process) two supervision frames will be received from node1. They will be processed by hsr_handle_sup_frame() and one frame will leave early ("Node has already been merged") and does nothing. The other frame will be merged as portB and have its MAC address written to macaddress_B and the hsr_node (that was created for it as macaddress_A) will be removed. From now on HSR is able to identify a duplicate because both packets sent from one node will result in the same struct hsr_node because hsr_get_node() will find the MAC address either on macaddress_A or macaddress_B. Things get tricky with the optimisation: If sender's MAC address is saved as macaddress_A then the lookup will work as usual. If the MAC address has been merged into macaddress_B of another hsr_node then the lookup won't work because it is likely that the data structure is in another bucket. This results in creating a new struct hsr_node and not recognising a possible duplicate. A way around it would be to add another hsr_node::mac_list_B and attach it to the other bucket to ensure that this hsr_node will be looked up either via macaddress_A _or_ macaddress_B. I however prefer to revert it because it sounds like an academic problem rather than real life workload plus it adds complexity. I'm not an HSR expert with what is usual size of a network but I would guess 40 to 60 nodes. With 10.000 nodes and assuming 60us for pass-through (from node to node) then it would take almost 600ms for a packet to almost wrap around which sounds a lot. Revert the hash MAC addresses optimisation. Fixes: 4acc45db71158 ("net: hsr: use hlist_head instead of list_head for mac addresses") Cc: Juhee Kang Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Jakub Kicinski --- net/hsr/hsr_debugfs.c | 40 ++++------ net/hsr/hsr_device.c | 10 +-- net/hsr/hsr_forward.c | 7 +- net/hsr/hsr_framereg.c | 209 +++++++++++++++++++------------------------------ net/hsr/hsr_framereg.h | 14 +--- net/hsr/hsr_main.h | 9 +-- net/hsr/hsr_netlink.c | 4 +- 7 files changed, 106 insertions(+), 187 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c index de476a417631..1a195efc79cd 100644 --- a/net/hsr/hsr_debugfs.c +++ b/net/hsr/hsr_debugfs.c @@ -9,7 +9,6 @@ #include #include #include -#include #include "hsr_main.h" #include "hsr_framereg.h" @@ -21,7 +20,6 @@ hsr_node_table_show(struct seq_file *sfp, void *data) { struct hsr_priv *priv = (struct hsr_priv *)sfp->private; struct hsr_node *node; - int i; seq_printf(sfp, "Node Table entries for (%s) device\n", (priv->prot_version == PRP_V1 ? "PRP" : "HSR")); @@ -33,28 +31,22 @@ hsr_node_table_show(struct seq_file *sfp, void *data) seq_puts(sfp, "DAN-H\n"); rcu_read_lock(); - - for (i = 0 ; i < priv->hash_buckets; i++) { - hlist_for_each_entry_rcu(node, &priv->node_db[i], mac_list) { - /* skip self node */ - if (hsr_addr_is_self(priv, node->macaddress_A)) - continue; - seq_printf(sfp, "%pM ", &node->macaddress_A[0]); - seq_printf(sfp, "%pM ", &node->macaddress_B[0]); - seq_printf(sfp, "%10lx, ", - node->time_in[HSR_PT_SLAVE_A]); - seq_printf(sfp, "%10lx, ", - node->time_in[HSR_PT_SLAVE_B]); - seq_printf(sfp, "%14x, ", node->addr_B_port); - - if (priv->prot_version == PRP_V1) - seq_printf(sfp, "%5x, %5x, %5x\n", - node->san_a, node->san_b, - (node->san_a == 0 && - node->san_b == 0)); - else - seq_printf(sfp, "%5x\n", 1); - } + list_for_each_entry_rcu(node, &priv->node_db, mac_list) { + /* skip self node */ + if (hsr_addr_is_self(priv, node->macaddress_A)) + continue; + seq_printf(sfp, "%pM ", &node->macaddress_A[0]); + seq_printf(sfp, "%pM ", &node->macaddress_B[0]); + seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_A]); + seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_B]); + seq_printf(sfp, "%14x, ", node->addr_B_port); + + if (priv->prot_version == PRP_V1) + seq_printf(sfp, "%5x, %5x, %5x\n", + node->san_a, node->san_b, + (node->san_a == 0 && node->san_b == 0)); + else + seq_printf(sfp, "%5x\n", 1); } rcu_read_unlock(); return 0; diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 6ffef47e9be5..7518f7e93043 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -485,16 +485,12 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], { bool unregister = false; struct hsr_priv *hsr; - int res, i; + int res; hsr = netdev_priv(hsr_dev); INIT_LIST_HEAD(&hsr->ports); - INIT_HLIST_HEAD(&hsr->self_node_db); - hsr->hash_buckets = HSR_HSIZE; - get_random_bytes(&hsr->hash_seed, sizeof(hsr->hash_seed)); - for (i = 0; i < hsr->hash_buckets; i++) - INIT_HLIST_HEAD(&hsr->node_db[i]); - + INIT_LIST_HEAD(&hsr->node_db); + INIT_LIST_HEAD(&hsr->self_node_db); spin_lock_init(&hsr->list_lock); eth_hw_addr_set(hsr_dev, slave[0]->dev_addr); diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index 56bb27d67a2e..3e58d310b4d4 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -571,23 +571,20 @@ static int fill_frame_info(struct hsr_frame_info *frame, struct ethhdr *ethhdr; __be16 proto; int ret; - u32 hash; /* Check if skb contains ethhdr */ if (skb->mac_len < sizeof(struct ethhdr)) return -EINVAL; memset(frame, 0, sizeof(*frame)); - - ethhdr = (struct ethhdr *)skb_mac_header(skb); - hash = hsr_mac_hash(port->hsr, ethhdr->h_source); frame->is_supervision = is_supervision_frame(port->hsr, skb); - frame->node_src = hsr_get_node(port, &hsr->node_db[hash], skb, + frame->node_src = hsr_get_node(port, &hsr->node_db, skb, frame->is_supervision, port->type); if (!frame->node_src) return -1; /* Unknown node and !is_supervision, or no mem */ + ethhdr = (struct ethhdr *)skb_mac_header(skb); frame->is_vlan = false; proto = ethhdr->h_proto; diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 584e21788799..9b8eaebce254 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -15,37 +15,10 @@ #include #include #include -#include #include "hsr_main.h" #include "hsr_framereg.h" #include "hsr_netlink.h" -#ifdef CONFIG_LOCKDEP -int lockdep_hsr_is_held(spinlock_t *lock) -{ - return lockdep_is_held(lock); -} -#endif - -u32 hsr_mac_hash(struct hsr_priv *hsr, const unsigned char *addr) -{ - u32 hash = jhash(addr, ETH_ALEN, hsr->hash_seed); - - return reciprocal_scale(hash, hsr->hash_buckets); -} - -struct hsr_node *hsr_node_get_first(struct hlist_head *head, spinlock_t *lock) -{ - struct hlist_node *first; - - first = rcu_dereference_bh_check(hlist_first_rcu(head), - lockdep_hsr_is_held(lock)); - if (first) - return hlist_entry(first, struct hsr_node, mac_list); - - return NULL; -} - /* seq_nr_after(a, b) - return true if a is after (higher in sequence than) b, * false otherwise. */ @@ -67,7 +40,8 @@ bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr) { struct hsr_node *node; - node = hsr_node_get_first(&hsr->self_node_db, &hsr->list_lock); + node = list_first_or_null_rcu(&hsr->self_node_db, struct hsr_node, + mac_list); if (!node) { WARN_ONCE(1, "HSR: No self node\n"); return false; @@ -83,12 +57,12 @@ bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr) /* Search for mac entry. Caller must hold rcu read lock. */ -static struct hsr_node *find_node_by_addr_A(struct hlist_head *node_db, +static struct hsr_node *find_node_by_addr_A(struct list_head *node_db, const unsigned char addr[ETH_ALEN]) { struct hsr_node *node; - hlist_for_each_entry_rcu(node, node_db, mac_list) { + list_for_each_entry_rcu(node, node_db, mac_list) { if (ether_addr_equal(node->macaddress_A, addr)) return node; } @@ -103,7 +77,7 @@ int hsr_create_self_node(struct hsr_priv *hsr, const unsigned char addr_a[ETH_ALEN], const unsigned char addr_b[ETH_ALEN]) { - struct hlist_head *self_node_db = &hsr->self_node_db; + struct list_head *self_node_db = &hsr->self_node_db; struct hsr_node *node, *oldnode; node = kmalloc(sizeof(*node), GFP_KERNEL); @@ -114,13 +88,14 @@ int hsr_create_self_node(struct hsr_priv *hsr, ether_addr_copy(node->macaddress_B, addr_b); spin_lock_bh(&hsr->list_lock); - oldnode = hsr_node_get_first(self_node_db, &hsr->list_lock); + oldnode = list_first_or_null_rcu(self_node_db, + struct hsr_node, mac_list); if (oldnode) { - hlist_replace_rcu(&oldnode->mac_list, &node->mac_list); + list_replace_rcu(&oldnode->mac_list, &node->mac_list); spin_unlock_bh(&hsr->list_lock); kfree_rcu(oldnode, rcu_head); } else { - hlist_add_tail_rcu(&node->mac_list, self_node_db); + list_add_tail_rcu(&node->mac_list, self_node_db); spin_unlock_bh(&hsr->list_lock); } @@ -129,25 +104,25 @@ int hsr_create_self_node(struct hsr_priv *hsr, void hsr_del_self_node(struct hsr_priv *hsr) { - struct hlist_head *self_node_db = &hsr->self_node_db; + struct list_head *self_node_db = &hsr->self_node_db; struct hsr_node *node; spin_lock_bh(&hsr->list_lock); - node = hsr_node_get_first(self_node_db, &hsr->list_lock); + node = list_first_or_null_rcu(self_node_db, struct hsr_node, mac_list); if (node) { - hlist_del_rcu(&node->mac_list); + list_del_rcu(&node->mac_list); kfree_rcu(node, rcu_head); } spin_unlock_bh(&hsr->list_lock); } -void hsr_del_nodes(struct hlist_head *node_db) +void hsr_del_nodes(struct list_head *node_db) { struct hsr_node *node; - struct hlist_node *tmp; + struct hsr_node *tmp; - hlist_for_each_entry_safe(node, tmp, node_db, mac_list) - kfree_rcu(node, rcu_head); + list_for_each_entry_safe(node, tmp, node_db, mac_list) + kfree(node); } void prp_handle_san_frame(bool san, enum hsr_port_type port, @@ -168,7 +143,7 @@ void prp_handle_san_frame(bool san, enum hsr_port_type port, * originating from the newly added node. */ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, - struct hlist_head *node_db, + struct list_head *node_db, unsigned char addr[], u16 seq_out, bool san, enum hsr_port_type rx_port) @@ -198,14 +173,14 @@ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, hsr->proto_ops->handle_san_frame(san, rx_port, new_node); spin_lock_bh(&hsr->list_lock); - hlist_for_each_entry_rcu(node, node_db, mac_list, - lockdep_hsr_is_held(&hsr->list_lock)) { + list_for_each_entry_rcu(node, node_db, mac_list, + lockdep_is_held(&hsr->list_lock)) { if (ether_addr_equal(node->macaddress_A, addr)) goto out; if (ether_addr_equal(node->macaddress_B, addr)) goto out; } - hlist_add_tail_rcu(&new_node->mac_list, node_db); + list_add_tail_rcu(&new_node->mac_list, node_db); spin_unlock_bh(&hsr->list_lock); return new_node; out: @@ -225,7 +200,7 @@ void prp_update_san_info(struct hsr_node *node, bool is_sup) /* Get the hsr_node from which 'skb' was sent. */ -struct hsr_node *hsr_get_node(struct hsr_port *port, struct hlist_head *node_db, +struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db, struct sk_buff *skb, bool is_sup, enum hsr_port_type rx_port) { @@ -241,7 +216,7 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct hlist_head *node_db, ethhdr = (struct ethhdr *)skb_mac_header(skb); - hlist_for_each_entry_rcu(node, node_db, mac_list) { + list_for_each_entry_rcu(node, node_db, mac_list) { if (ether_addr_equal(node->macaddress_A, ethhdr->h_source)) { if (hsr->proto_ops->update_san_info) hsr->proto_ops->update_san_info(node, is_sup); @@ -291,12 +266,11 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame) struct hsr_sup_tlv *hsr_sup_tlv; struct hsr_node *node_real; struct sk_buff *skb = NULL; - struct hlist_head *node_db; + struct list_head *node_db; struct ethhdr *ethhdr; int i; unsigned int pull_size = 0; unsigned int total_pull_size = 0; - u32 hash; /* Here either frame->skb_hsr or frame->skb_prp should be * valid as supervision frame always will have protocol @@ -334,13 +308,11 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame) hsr_sp = (struct hsr_sup_payload *)skb->data; /* Merge node_curr (registered on macaddress_B) into node_real */ - node_db = port_rcv->hsr->node_db; - hash = hsr_mac_hash(hsr, hsr_sp->macaddress_A); - node_real = find_node_by_addr_A(&node_db[hash], hsr_sp->macaddress_A); + node_db = &port_rcv->hsr->node_db; + node_real = find_node_by_addr_A(node_db, hsr_sp->macaddress_A); if (!node_real) /* No frame received from AddrA of this node yet */ - node_real = hsr_add_node(hsr, &node_db[hash], - hsr_sp->macaddress_A, + node_real = hsr_add_node(hsr, node_db, hsr_sp->macaddress_A, HSR_SEQNR_START - 1, true, port_rcv->type); if (!node_real) @@ -374,8 +346,7 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame) hsr_sp = (struct hsr_sup_payload *)skb->data; /* Check if redbox mac and node mac are equal. */ - if (!ether_addr_equal(node_real->macaddress_A, - hsr_sp->macaddress_A)) { + if (!ether_addr_equal(node_real->macaddress_A, hsr_sp->macaddress_A)) { /* This is a redbox supervision frame for a VDAN! */ goto done; } @@ -395,7 +366,7 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame) node_real->addr_B_port = port_rcv->type; spin_lock_bh(&hsr->list_lock); - hlist_del_rcu(&node_curr->mac_list); + list_del_rcu(&node_curr->mac_list); spin_unlock_bh(&hsr->list_lock); kfree_rcu(node_curr, rcu_head); @@ -433,7 +404,6 @@ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb, struct hsr_port *port) { struct hsr_node *node_dst; - u32 hash; if (!skb_mac_header_was_set(skb)) { WARN_ONCE(1, "%s: Mac header not set\n", __func__); @@ -443,8 +413,7 @@ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb, if (!is_unicast_ether_addr(eth_hdr(skb)->h_dest)) return; - hash = hsr_mac_hash(port->hsr, eth_hdr(skb)->h_dest); - node_dst = find_node_by_addr_A(&port->hsr->node_db[hash], + node_dst = find_node_by_addr_A(&port->hsr->node_db, eth_hdr(skb)->h_dest); if (!node_dst) { if (net_ratelimit()) @@ -520,73 +489,59 @@ static struct hsr_port *get_late_port(struct hsr_priv *hsr, void hsr_prune_nodes(struct timer_list *t) { struct hsr_priv *hsr = from_timer(hsr, t, prune_timer); - struct hlist_node *tmp; struct hsr_node *node; + struct hsr_node *tmp; struct hsr_port *port; unsigned long timestamp; unsigned long time_a, time_b; - int i; spin_lock_bh(&hsr->list_lock); + list_for_each_entry_safe(node, tmp, &hsr->node_db, mac_list) { + /* Don't prune own node. Neither time_in[HSR_PT_SLAVE_A] + * nor time_in[HSR_PT_SLAVE_B], will ever be updated for + * the master port. Thus the master node will be repeatedly + * pruned leading to packet loss. + */ + if (hsr_addr_is_self(hsr, node->macaddress_A)) + continue; + + /* Shorthand */ + time_a = node->time_in[HSR_PT_SLAVE_A]; + time_b = node->time_in[HSR_PT_SLAVE_B]; + + /* Check for timestamps old enough to risk wrap-around */ + if (time_after(jiffies, time_a + MAX_JIFFY_OFFSET / 2)) + node->time_in_stale[HSR_PT_SLAVE_A] = true; + if (time_after(jiffies, time_b + MAX_JIFFY_OFFSET / 2)) + node->time_in_stale[HSR_PT_SLAVE_B] = true; + + /* Get age of newest frame from node. + * At least one time_in is OK here; nodes get pruned long + * before both time_ins can get stale + */ + timestamp = time_a; + if (node->time_in_stale[HSR_PT_SLAVE_A] || + (!node->time_in_stale[HSR_PT_SLAVE_B] && + time_after(time_b, time_a))) + timestamp = time_b; + + /* Warn of ring error only as long as we get frames at all */ + if (time_is_after_jiffies(timestamp + + msecs_to_jiffies(1.5 * MAX_SLAVE_DIFF))) { + rcu_read_lock(); + port = get_late_port(hsr, node); + if (port) + hsr_nl_ringerror(hsr, node->macaddress_A, port); + rcu_read_unlock(); + } - for (i = 0; i < hsr->hash_buckets; i++) { - hlist_for_each_entry_safe(node, tmp, &hsr->node_db[i], - mac_list) { - /* Don't prune own node. - * Neither time_in[HSR_PT_SLAVE_A] - * nor time_in[HSR_PT_SLAVE_B], will ever be updated - * for the master port. Thus the master node will be - * repeatedly pruned leading to packet loss. - */ - if (hsr_addr_is_self(hsr, node->macaddress_A)) - continue; - - /* Shorthand */ - time_a = node->time_in[HSR_PT_SLAVE_A]; - time_b = node->time_in[HSR_PT_SLAVE_B]; - - /* Check for timestamps old enough to - * risk wrap-around - */ - if (time_after(jiffies, time_a + MAX_JIFFY_OFFSET / 2)) - node->time_in_stale[HSR_PT_SLAVE_A] = true; - if (time_after(jiffies, time_b + MAX_JIFFY_OFFSET / 2)) - node->time_in_stale[HSR_PT_SLAVE_B] = true; - - /* Get age of newest frame from node. - * At least one time_in is OK here; nodes get pruned - * long before both time_ins can get stale - */ - timestamp = time_a; - if (node->time_in_stale[HSR_PT_SLAVE_A] || - (!node->time_in_stale[HSR_PT_SLAVE_B] && - time_after(time_b, time_a))) - timestamp = time_b; - - /* Warn of ring error only as long as we get - * frames at all - */ - if (time_is_after_jiffies(timestamp + - msecs_to_jiffies(1.5 * MAX_SLAVE_DIFF))) { - rcu_read_lock(); - port = get_late_port(hsr, node); - if (port) - hsr_nl_ringerror(hsr, - node->macaddress_A, - port); - rcu_read_unlock(); - } - - /* Prune old entries */ - if (time_is_before_jiffies(timestamp + - msecs_to_jiffies(HSR_NODE_FORGET_TIME))) { - hsr_nl_nodedown(hsr, node->macaddress_A); - hlist_del_rcu(&node->mac_list); - /* Note that we need to free this - * entry later: - */ - kfree_rcu(node, rcu_head); - } + /* Prune old entries */ + if (time_is_before_jiffies(timestamp + + msecs_to_jiffies(HSR_NODE_FORGET_TIME))) { + hsr_nl_nodedown(hsr, node->macaddress_A); + list_del_rcu(&node->mac_list); + /* Note that we need to free this entry later: */ + kfree_rcu(node, rcu_head); } } spin_unlock_bh(&hsr->list_lock); @@ -600,20 +555,17 @@ void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos, unsigned char addr[ETH_ALEN]) { struct hsr_node *node; - u32 hash; - - hash = hsr_mac_hash(hsr, addr); if (!_pos) { - node = hsr_node_get_first(&hsr->node_db[hash], - &hsr->list_lock); + node = list_first_or_null_rcu(&hsr->node_db, + struct hsr_node, mac_list); if (node) ether_addr_copy(addr, node->macaddress_A); return node; } node = _pos; - hlist_for_each_entry_continue_rcu(node, mac_list) { + list_for_each_entry_continue_rcu(node, &hsr->node_db, mac_list) { ether_addr_copy(addr, node->macaddress_A); return node; } @@ -633,11 +585,8 @@ int hsr_get_node_data(struct hsr_priv *hsr, struct hsr_node *node; struct hsr_port *port; unsigned long tdiff; - u32 hash; - - hash = hsr_mac_hash(hsr, addr); - node = find_node_by_addr_A(&hsr->node_db[hash], addr); + node = find_node_by_addr_A(&hsr->node_db, addr); if (!node) return -ENOENT; diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index f3762e9e42b5..bdbb8c822ba1 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -28,17 +28,9 @@ struct hsr_frame_info { bool is_from_san; }; -#ifdef CONFIG_LOCKDEP -int lockdep_hsr_is_held(spinlock_t *lock); -#else -#define lockdep_hsr_is_held(lock) 1 -#endif - -u32 hsr_mac_hash(struct hsr_priv *hsr, const unsigned char *addr); -struct hsr_node *hsr_node_get_first(struct hlist_head *head, spinlock_t *lock); void hsr_del_self_node(struct hsr_priv *hsr); -void hsr_del_nodes(struct hlist_head *node_db); -struct hsr_node *hsr_get_node(struct hsr_port *port, struct hlist_head *node_db, +void hsr_del_nodes(struct list_head *node_db); +struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db, struct sk_buff *skb, bool is_sup, enum hsr_port_type rx_port); void hsr_handle_sup_frame(struct hsr_frame_info *frame); @@ -76,7 +68,7 @@ void prp_handle_san_frame(bool san, enum hsr_port_type port, void prp_update_san_info(struct hsr_node *node, bool is_sup); struct hsr_node { - struct hlist_node mac_list; + struct list_head mac_list; unsigned char macaddress_A[ETH_ALEN]; unsigned char macaddress_B[ETH_ALEN]; /* Local slave through which AddrB frames are received from this node */ diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index b158ba409f9a..16ae9fb09ccd 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -47,9 +47,6 @@ #define HSR_V1_SUP_LSDUSIZE 52 -#define HSR_HSIZE_SHIFT 8 -#define HSR_HSIZE BIT(HSR_HSIZE_SHIFT) - /* The helper functions below assumes that 'path' occupies the 4 most * significant bits of the 16-bit field shared by 'path' and 'LSDU_size' (or * equivalently, the 4 most significant bits of HSR tag byte 14). @@ -188,8 +185,8 @@ struct hsr_proto_ops { struct hsr_priv { struct rcu_head rcu_head; struct list_head ports; - struct hlist_head node_db[HSR_HSIZE]; /* Known HSR nodes */ - struct hlist_head self_node_db; /* MACs of slaves */ + struct list_head node_db; /* Known HSR nodes */ + struct list_head self_node_db; /* MACs of slaves */ struct timer_list announce_timer; /* Supervision frame dispatch */ struct timer_list prune_timer; int announce_count; @@ -199,8 +196,6 @@ struct hsr_priv { spinlock_t seqnr_lock; /* locking for sequence_nr */ spinlock_t list_lock; /* locking for node list */ struct hsr_proto_ops *proto_ops; - u32 hash_buckets; - u32 hash_seed; #define PRP_LAN_ID 0x5 /* 0x1010 for A and 0x1011 for B. Bit 0 is set * based on SLAVE_A or SLAVE_B */ diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 7174a9092900..78fe40eb9f01 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -105,7 +105,6 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, static void hsr_dellink(struct net_device *dev, struct list_head *head) { struct hsr_priv *hsr = netdev_priv(dev); - int i; del_timer_sync(&hsr->prune_timer); del_timer_sync(&hsr->announce_timer); @@ -114,8 +113,7 @@ static void hsr_dellink(struct net_device *dev, struct list_head *head) hsr_del_ports(hsr); hsr_del_self_node(hsr); - for (i = 0; i < hsr->hash_buckets; i++) - hsr_del_nodes(&hsr->node_db[i]); + hsr_del_nodes(&hsr->node_db); unregister_netdevice_queue(dev, head); } -- cgit v1.2.3-70-g09d2 From 5aa2820177af650293b2f9f1873c1f6f8e4ad7a4 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 29 Nov 2022 17:48:09 +0100 Subject: hsr: Add a rcu-read lock to hsr_forward_skb(). hsr_forward_skb() a skb and keeps information in an on-stack hsr_frame_info. hsr_get_node() assigns hsr_frame_info::node_src which is from a RCU list. This pointer is used later in hsr_forward_do(). I don't see a reason why this pointer can't vanish midway since there is no guarantee that hsr_forward_skb() is invoked from an RCU read section. Use rcu_read_lock() to protect hsr_frame_info::node_src from its assignment until it is no longer used. Fixes: f266a683a4804 ("net/hsr: Better frame dispatch") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Jakub Kicinski --- net/hsr/hsr_forward.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index 3e58d310b4d4..a8befa35841e 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -614,11 +614,13 @@ void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port) { struct hsr_frame_info frame; + rcu_read_lock(); if (fill_frame_info(&frame, skb, port) < 0) goto out_drop; hsr_register_frame_in(frame.node_src, port, frame.sequence_nr); hsr_forward_do(&frame); + rcu_read_unlock(); /* Gets called for ingress frames as well as egress from master port. * So check and increment stats for master port only here. */ @@ -633,6 +635,7 @@ void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port) return; out_drop: + rcu_read_unlock(); port->dev->stats.tx_dropped++; kfree_skb(skb); } -- cgit v1.2.3-70-g09d2 From 0c74d9f79ec4299365bbe803baa736ae0068179e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 29 Nov 2022 17:48:10 +0100 Subject: hsr: Avoid double remove of a node. Due to the hashed-MAC optimisation one problem become visible: hsr_handle_sup_frame() walks over the list of available nodes and merges two node entries into one if based on the information in the supervision both MAC addresses belong to one node. The list-walk happens on a RCU protected list and delete operation happens under a lock. If the supervision arrives on both slave interfaces at the same time then this delete operation can occur simultaneously on two CPUs. The result is the first-CPU deletes the from the list and the second CPUs BUGs while attempting to dereference a poisoned list-entry. This happens more likely with the optimisation because a new node for the mac_B entry is created once a packet has been received and removed (merged) once the supervision frame has been received. Avoid removing/ cleaning up a hsr_node twice by adding a `removed' field which is set to true after the removal and checked before the removal. Fixes: f266a683a4804 ("net/hsr: Better frame dispatch") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Jakub Kicinski --- net/hsr/hsr_framereg.c | 16 +++++++++++----- net/hsr/hsr_framereg.h | 1 + 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 9b8eaebce254..f2dd846ff903 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -366,9 +366,12 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame) node_real->addr_B_port = port_rcv->type; spin_lock_bh(&hsr->list_lock); - list_del_rcu(&node_curr->mac_list); + if (!node_curr->removed) { + list_del_rcu(&node_curr->mac_list); + node_curr->removed = true; + kfree_rcu(node_curr, rcu_head); + } spin_unlock_bh(&hsr->list_lock); - kfree_rcu(node_curr, rcu_head); done: /* Push back here */ @@ -539,9 +542,12 @@ void hsr_prune_nodes(struct timer_list *t) if (time_is_before_jiffies(timestamp + msecs_to_jiffies(HSR_NODE_FORGET_TIME))) { hsr_nl_nodedown(hsr, node->macaddress_A); - list_del_rcu(&node->mac_list); - /* Note that we need to free this entry later: */ - kfree_rcu(node, rcu_head); + if (!node->removed) { + list_del_rcu(&node->mac_list); + node->removed = true; + /* Note that we need to free this entry later: */ + kfree_rcu(node, rcu_head); + } } } spin_unlock_bh(&hsr->list_lock); diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index bdbb8c822ba1..b5f902397bf1 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -80,6 +80,7 @@ struct hsr_node { bool san_a; bool san_b; u16 seq_out[HSR_PT_PORTS]; + bool removed; struct rcu_head rcu_head; }; -- cgit v1.2.3-70-g09d2 From d5c7652eb16fa203d82546e0285136d7b321ffa9 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 29 Nov 2022 17:48:11 +0100 Subject: hsr: Disable netpoll. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hsr device is a software device. Its net_device_ops::ndo_start_xmit() routine will process the packet and then pass the resulting skb to dev_queue_xmit(). During processing, hsr acquires a lock with spin_lock_bh() (hsr_add_node()) which needs to be promoted to the _irq() suffix in order to avoid a potential deadlock. Then there are the warnings in dev_queue_xmit() (due to local_bh_disable() with disabled interrupts) left. Instead trying to address those (there is qdisc and…) for netpoll sake, just disable netpoll on hsr. Disable netpoll on hsr and replace the _irqsave() locking with _bh(). Fixes: f421436a591d3 ("net/hsr: Add support for the High-availability Seamless Redundancy protocol (HSRv0)") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Jakub Kicinski --- net/hsr/hsr_device.c | 14 ++++++-------- net/hsr/hsr_forward.c | 5 ++--- 2 files changed, 8 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 7518f7e93043..84fba2a402a5 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -278,7 +278,6 @@ static void send_hsr_supervision_frame(struct hsr_port *master, __u8 type = HSR_TLV_LIFE_CHECK; struct hsr_sup_payload *hsr_sp; struct hsr_sup_tag *hsr_stag; - unsigned long irqflags; struct sk_buff *skb; *interval = msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL); @@ -299,7 +298,7 @@ static void send_hsr_supervision_frame(struct hsr_port *master, set_hsr_stag_HSR_ver(hsr_stag, hsr->prot_version); /* From HSRv1 on we have separate supervision sequence numbers. */ - spin_lock_irqsave(&master->hsr->seqnr_lock, irqflags); + spin_lock_bh(&hsr->seqnr_lock); if (hsr->prot_version > 0) { hsr_stag->sequence_nr = htons(hsr->sup_sequence_nr); hsr->sup_sequence_nr++; @@ -307,7 +306,7 @@ static void send_hsr_supervision_frame(struct hsr_port *master, hsr_stag->sequence_nr = htons(hsr->sequence_nr); hsr->sequence_nr++; } - spin_unlock_irqrestore(&master->hsr->seqnr_lock, irqflags); + spin_unlock_bh(&hsr->seqnr_lock); hsr_stag->tlv.HSR_TLV_type = type; /* TODO: Why 12 in HSRv0? */ @@ -332,7 +331,6 @@ static void send_prp_supervision_frame(struct hsr_port *master, struct hsr_priv *hsr = master->hsr; struct hsr_sup_payload *hsr_sp; struct hsr_sup_tag *hsr_stag; - unsigned long irqflags; struct sk_buff *skb; skb = hsr_init_skb(master); @@ -347,7 +345,7 @@ static void send_prp_supervision_frame(struct hsr_port *master, set_hsr_stag_HSR_ver(hsr_stag, (hsr->prot_version ? 1 : 0)); /* From HSRv1 on we have separate supervision sequence numbers. */ - spin_lock_irqsave(&master->hsr->seqnr_lock, irqflags); + spin_lock_bh(&hsr->seqnr_lock); hsr_stag->sequence_nr = htons(hsr->sup_sequence_nr); hsr->sup_sequence_nr++; hsr_stag->tlv.HSR_TLV_type = PRP_TLV_LIFE_CHECK_DD; @@ -358,11 +356,11 @@ static void send_prp_supervision_frame(struct hsr_port *master, ether_addr_copy(hsr_sp->macaddress_A, master->dev->dev_addr); if (skb_put_padto(skb, ETH_ZLEN)) { - spin_unlock_irqrestore(&master->hsr->seqnr_lock, irqflags); + spin_unlock_bh(&hsr->seqnr_lock); return; } - spin_unlock_irqrestore(&master->hsr->seqnr_lock, irqflags); + spin_unlock_bh(&hsr->seqnr_lock); hsr_forward_skb(skb, master); } @@ -444,7 +442,7 @@ void hsr_dev_setup(struct net_device *dev) dev->header_ops = &hsr_header_ops; dev->netdev_ops = &hsr_device_ops; SET_NETDEV_DEVTYPE(dev, &hsr_type); - dev->priv_flags |= IFF_NO_QUEUE; + dev->priv_flags |= IFF_NO_QUEUE | IFF_DISABLE_NETPOLL; dev->needs_free_netdev = true; diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index a8befa35841e..a828221335bd 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -500,7 +500,6 @@ static void handle_std_frame(struct sk_buff *skb, { struct hsr_port *port = frame->port_rcv; struct hsr_priv *hsr = port->hsr; - unsigned long irqflags; frame->skb_hsr = NULL; frame->skb_prp = NULL; @@ -510,10 +509,10 @@ static void handle_std_frame(struct sk_buff *skb, frame->is_from_san = true; } else { /* Sequence nr for the master node */ - spin_lock_irqsave(&hsr->seqnr_lock, irqflags); + spin_lock_bh(&hsr->seqnr_lock); frame->sequence_nr = hsr->sequence_nr; hsr->sequence_nr++; - spin_unlock_irqrestore(&hsr->seqnr_lock, irqflags); + spin_unlock_bh(&hsr->seqnr_lock); } } -- cgit v1.2.3-70-g09d2 From 06afd2c31d338fa762548580c1bf088703dd1e03 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 29 Nov 2022 17:48:12 +0100 Subject: hsr: Synchronize sending frames to have always incremented outgoing seq nr. Sending frames via the hsr (master) device requires a sequence number which is tracked in hsr_priv::sequence_nr and protected by hsr_priv::seqnr_lock. Each time a new frame is sent, it will obtain a new id and then send it via the slave devices. Each time a packet is sent (via hsr_forward_do()) the sequence number is checked via hsr_register_frame_out() to ensure that a frame is not handled twice. This make sense for the receiving side to ensure that the frame is not injected into the stack twice after it has been received from both slave ports. There is no locking to cover the sending path which means the following scenario is possible: CPU0 CPU1 hsr_dev_xmit(skb1) hsr_dev_xmit(skb2) fill_frame_info() fill_frame_info() hsr_fill_frame_info() hsr_fill_frame_info() handle_std_frame() handle_std_frame() skb1's sequence_nr = 1 skb2's sequence_nr = 2 hsr_forward_do() hsr_forward_do() hsr_register_frame_out(, 2) // okay, send) hsr_register_frame_out(, 1) // stop, lower seq duplicate Both skbs (or their struct hsr_frame_info) received an unique id. However since skb2 was sent before skb1, the higher sequence number was recorded in hsr_register_frame_out() and the late arriving skb1 was dropped and never sent. This scenario has been observed in a three node HSR setup, with node1 + node2 having ping and iperf running in parallel. From time to time ping reported a missing packet. Based on tracing that missing ping packet did not leave the system. It might be possible (didn't check) to drop the sequence number check on the sending side. But if the higher sequence number leaves on wire before the lower does and the destination receives them in that order and it will drop the packet with the lower sequence number and never inject into the stack. Therefore it seems the only way is to lock the whole path from obtaining the sequence number and sending via dev_queue_xmit() and assuming the packets leave on wire in the same order (and don't get reordered by the NIC). Cover the whole path for the master interface from obtaining the ID until after it has been forwarded via hsr_forward_skb() to ensure the skbs are sent to the NIC in the order of the assigned sequence numbers. Fixes: f421436a591d3 ("net/hsr: Add support for the High-availability Seamless Redundancy protocol (HSRv0)") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Jakub Kicinski --- net/hsr/hsr_device.c | 12 +++++++----- net/hsr/hsr_forward.c | 3 +-- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 84fba2a402a5..b1e86a7265b3 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -219,7 +219,9 @@ static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) skb->dev = master->dev; skb_reset_mac_header(skb); skb_reset_mac_len(skb); + spin_lock_bh(&hsr->seqnr_lock); hsr_forward_skb(skb, master); + spin_unlock_bh(&hsr->seqnr_lock); } else { dev_core_stats_tx_dropped_inc(dev); dev_kfree_skb_any(skb); @@ -306,7 +308,6 @@ static void send_hsr_supervision_frame(struct hsr_port *master, hsr_stag->sequence_nr = htons(hsr->sequence_nr); hsr->sequence_nr++; } - spin_unlock_bh(&hsr->seqnr_lock); hsr_stag->tlv.HSR_TLV_type = type; /* TODO: Why 12 in HSRv0? */ @@ -317,11 +318,13 @@ static void send_hsr_supervision_frame(struct hsr_port *master, hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); ether_addr_copy(hsr_sp->macaddress_A, master->dev->dev_addr); - if (skb_put_padto(skb, ETH_ZLEN)) + if (skb_put_padto(skb, ETH_ZLEN)) { + spin_unlock_bh(&hsr->seqnr_lock); return; + } hsr_forward_skb(skb, master); - + spin_unlock_bh(&hsr->seqnr_lock); return; } @@ -360,9 +363,8 @@ static void send_prp_supervision_frame(struct hsr_port *master, return; } - spin_unlock_bh(&hsr->seqnr_lock); - hsr_forward_skb(skb, master); + spin_unlock_bh(&hsr->seqnr_lock); } /* Announce (supervision frame) timer function diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index a828221335bd..629daacc9607 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -509,10 +509,9 @@ static void handle_std_frame(struct sk_buff *skb, frame->is_from_san = true; } else { /* Sequence nr for the master node */ - spin_lock_bh(&hsr->seqnr_lock); + lockdep_assert_held(&hsr->seqnr_lock); frame->sequence_nr = hsr->sequence_nr; hsr->sequence_nr++; - spin_unlock_bh(&hsr->seqnr_lock); } } -- cgit v1.2.3-70-g09d2 From 5c7aa13210c3abdd34fd421f62347665ec6eb551 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 29 Nov 2022 17:48:13 +0100 Subject: hsr: Synchronize sequence number updates. hsr_register_frame_out() compares new sequence_nr vs the old one recorded in hsr_node::seq_out and if the new sequence_nr is higher then it will be written to hsr_node::seq_out as the new value. This operation isn't locked so it is possible that two frames with the same sequence number arrive (via the two slave devices) and are fed to hsr_register_frame_out() at the same time. Both will pass the check and update the sequence counter later to the same value. As a result the content of the same packet is fed into the stack twice. This was noticed by running ping and observing DUP being reported from time to time. Instead of using the hsr_priv::seqnr_lock for the whole receive path (as it is for sending in the master node) add an additional lock that is only used for sequence number checks and updates. Add a per-node lock that is used during sequence number reads and updates. Fixes: f421436a591d3 ("net/hsr: Add support for the High-availability Seamless Redundancy protocol (HSRv0)") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Jakub Kicinski --- net/hsr/hsr_framereg.c | 9 ++++++++- net/hsr/hsr_framereg.h | 2 ++ 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index f2dd846ff903..39a6088080e9 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -157,6 +157,7 @@ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, return NULL; ether_addr_copy(new_node->macaddress_A, addr); + spin_lock_init(&new_node->seq_out_lock); /* We are only interested in time diffs here, so use current jiffies * as initialization. (0 could trigger an spurious ring error warning). @@ -353,6 +354,7 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame) } ether_addr_copy(node_real->macaddress_B, ethhdr->h_source); + spin_lock_bh(&node_real->seq_out_lock); for (i = 0; i < HSR_PT_PORTS; i++) { if (!node_curr->time_in_stale[i] && time_after(node_curr->time_in[i], node_real->time_in[i])) { @@ -363,6 +365,7 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame) if (seq_nr_after(node_curr->seq_out[i], node_real->seq_out[i])) node_real->seq_out[i] = node_curr->seq_out[i]; } + spin_unlock_bh(&node_real->seq_out_lock); node_real->addr_B_port = port_rcv->type; spin_lock_bh(&hsr->list_lock); @@ -456,13 +459,17 @@ void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port, int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node, u16 sequence_nr) { + spin_lock_bh(&node->seq_out_lock); if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type]) && time_is_after_jiffies(node->time_out[port->type] + - msecs_to_jiffies(HSR_ENTRY_FORGET_TIME))) + msecs_to_jiffies(HSR_ENTRY_FORGET_TIME))) { + spin_unlock_bh(&node->seq_out_lock); return 1; + } node->time_out[port->type] = jiffies; node->seq_out[port->type] = sequence_nr; + spin_unlock_bh(&node->seq_out_lock); return 0; } diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index b5f902397bf1..b23556251d62 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -69,6 +69,8 @@ void prp_update_san_info(struct hsr_node *node, bool is_sup); struct hsr_node { struct list_head mac_list; + /* Protect R/W access to seq_out */ + spinlock_t seq_out_lock; unsigned char macaddress_A[ETH_ALEN]; unsigned char macaddress_B[ETH_ALEN]; /* Local slave through which AddrB frames are received from this node */ -- cgit v1.2.3-70-g09d2 From 20d3c1e9b861b85e1a774e1876d6adeeb0251fc3 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 29 Nov 2022 17:48:14 +0100 Subject: hsr: Use a single struct for self_node. self_node_db is a list_head with one entry of struct hsr_node. The purpose is to hold the two MAC addresses of the node itself. It is convenient to recycle the structure. However having a list_head and fetching always the first entry is not really optimal. Created a new data strucure contaning the two MAC addresses named hsr_self_node. Access that structure like an RCU protected pointer so it can be replaced on the fly without blocking the reader. Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Kurt Kanzenbach Signed-off-by: Jakub Kicinski --- net/hsr/hsr_device.c | 1 - net/hsr/hsr_framereg.c | 63 ++++++++++++++++++++++---------------------------- net/hsr/hsr_main.h | 8 ++++++- 3 files changed, 35 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index b1e86a7265b3..5a236aae2366 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -490,7 +490,6 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], hsr = netdev_priv(hsr_dev); INIT_LIST_HEAD(&hsr->ports); INIT_LIST_HEAD(&hsr->node_db); - INIT_LIST_HEAD(&hsr->self_node_db); spin_lock_init(&hsr->list_lock); eth_hw_addr_set(hsr_dev, slave[0]->dev_addr); diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 39a6088080e9..00db74d96583 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -38,21 +38,22 @@ static bool seq_nr_after(u16 a, u16 b) bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr) { - struct hsr_node *node; + struct hsr_self_node *sn; + bool ret = false; - node = list_first_or_null_rcu(&hsr->self_node_db, struct hsr_node, - mac_list); - if (!node) { + rcu_read_lock(); + sn = rcu_dereference(hsr->self_node); + if (!sn) { WARN_ONCE(1, "HSR: No self node\n"); - return false; + goto out; } - if (ether_addr_equal(addr, node->macaddress_A)) - return true; - if (ether_addr_equal(addr, node->macaddress_B)) - return true; - - return false; + if (ether_addr_equal(addr, sn->macaddress_A) || + ether_addr_equal(addr, sn->macaddress_B)) + ret = true; +out: + rcu_read_unlock(); + return ret; } /* Search for mac entry. Caller must hold rcu read lock. @@ -70,50 +71,42 @@ static struct hsr_node *find_node_by_addr_A(struct list_head *node_db, return NULL; } -/* Helper for device init; the self_node_db is used in hsr_rcv() to recognize +/* Helper for device init; the self_node is used in hsr_rcv() to recognize * frames from self that's been looped over the HSR ring. */ int hsr_create_self_node(struct hsr_priv *hsr, const unsigned char addr_a[ETH_ALEN], const unsigned char addr_b[ETH_ALEN]) { - struct list_head *self_node_db = &hsr->self_node_db; - struct hsr_node *node, *oldnode; + struct hsr_self_node *sn, *old; - node = kmalloc(sizeof(*node), GFP_KERNEL); - if (!node) + sn = kmalloc(sizeof(*sn), GFP_KERNEL); + if (!sn) return -ENOMEM; - ether_addr_copy(node->macaddress_A, addr_a); - ether_addr_copy(node->macaddress_B, addr_b); + ether_addr_copy(sn->macaddress_A, addr_a); + ether_addr_copy(sn->macaddress_B, addr_b); spin_lock_bh(&hsr->list_lock); - oldnode = list_first_or_null_rcu(self_node_db, - struct hsr_node, mac_list); - if (oldnode) { - list_replace_rcu(&oldnode->mac_list, &node->mac_list); - spin_unlock_bh(&hsr->list_lock); - kfree_rcu(oldnode, rcu_head); - } else { - list_add_tail_rcu(&node->mac_list, self_node_db); - spin_unlock_bh(&hsr->list_lock); - } + old = rcu_replace_pointer(hsr->self_node, sn, + lockdep_is_held(&hsr->list_lock)); + spin_unlock_bh(&hsr->list_lock); + if (old) + kfree_rcu(old, rcu_head); return 0; } void hsr_del_self_node(struct hsr_priv *hsr) { - struct list_head *self_node_db = &hsr->self_node_db; - struct hsr_node *node; + struct hsr_self_node *old; spin_lock_bh(&hsr->list_lock); - node = list_first_or_null_rcu(self_node_db, struct hsr_node, mac_list); - if (node) { - list_del_rcu(&node->mac_list); - kfree_rcu(node, rcu_head); - } + old = rcu_replace_pointer(hsr->self_node, NULL, + lockdep_is_held(&hsr->list_lock)); spin_unlock_bh(&hsr->list_lock); + if (old) + kfree_rcu(old, rcu_head); } void hsr_del_nodes(struct list_head *node_db) diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 16ae9fb09ccd..5584c80a5c79 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -182,11 +182,17 @@ struct hsr_proto_ops { void (*update_san_info)(struct hsr_node *node, bool is_sup); }; +struct hsr_self_node { + unsigned char macaddress_A[ETH_ALEN]; + unsigned char macaddress_B[ETH_ALEN]; + struct rcu_head rcu_head; +}; + struct hsr_priv { struct rcu_head rcu_head; struct list_head ports; struct list_head node_db; /* Known HSR nodes */ - struct list_head self_node_db; /* MACs of slaves */ + struct hsr_self_node __rcu *self_node; /* MACs of slaves */ struct timer_list announce_timer; /* Supervision frame dispatch */ struct timer_list prune_timer; int announce_count; -- cgit v1.2.3-70-g09d2 From 47b438cc27254fa68b7360de153db4093c9259f4 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 30 Nov 2022 09:52:50 +0100 Subject: net: devlink: convert port_list into xarray Some devlink instances may contain thousands of ports. Storing them in linked list and looking them up is not scalable. Convert the linked list into xarray. Signed-off-by: Jiri Pirko Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/core/devlink.c | 57 ++++++++++++++++++++++++++---------------------------- 1 file changed, 27 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index fca3ebee97b0..907df7124157 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -41,7 +41,7 @@ struct devlink_dev_stats { struct devlink { u32 index; - struct list_head port_list; + struct xarray ports; struct list_head rate_list; struct list_head sb_list; struct list_head dpipe_table_list; @@ -382,19 +382,7 @@ static struct devlink *devlink_get_from_attrs(struct net *net, static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink, unsigned int port_index) { - struct devlink_port *devlink_port; - - list_for_each_entry(devlink_port, &devlink->port_list, list) { - if (devlink_port->index == port_index) - return devlink_port; - } - return NULL; -} - -static bool devlink_port_index_exists(struct devlink *devlink, - unsigned int port_index) -{ - return devlink_port_get_by_index(devlink, port_index); + return xa_load(&devlink->ports, port_index); } static struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink, @@ -1565,14 +1553,14 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, { struct devlink *devlink; struct devlink_port *devlink_port; + unsigned long index, port_index; int start = cb->args[0]; - unsigned long index; int idx = 0; int err; devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { devl_lock(devlink); - list_for_each_entry(devlink_port, &devlink->port_list, list) { + xa_for_each(&devlink->ports, port_index, devlink_port) { if (idx < start) { idx++; continue; @@ -2886,10 +2874,11 @@ static int __sb_port_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx, { struct devlink_port *devlink_port; u16 pool_count = devlink_sb_pool_count(devlink_sb); + unsigned long port_index; u16 pool_index; int err; - list_for_each_entry(devlink_port, &devlink->port_list, list) { + xa_for_each(&devlink->ports, port_index, devlink_port) { for (pool_index = 0; pool_index < pool_count; pool_index++) { if (*p_idx < start) { (*p_idx)++; @@ -3107,10 +3096,11 @@ static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, u32 portid, u32 seq) { struct devlink_port *devlink_port; + unsigned long port_index; u16 tc_index; int err; - list_for_each_entry(devlink_port, &devlink->port_list, list) { + xa_for_each(&devlink->ports, port_index, devlink_port) { for (tc_index = 0; tc_index < devlink_sb->ingress_tc_count; tc_index++) { if (*p_idx < start) { @@ -6207,6 +6197,7 @@ static int devlink_nl_cmd_region_get_devlink_dumpit(struct sk_buff *msg, { struct devlink_region *region; struct devlink_port *port; + unsigned long port_index; int err = 0; devl_lock(devlink); @@ -6225,7 +6216,7 @@ static int devlink_nl_cmd_region_get_devlink_dumpit(struct sk_buff *msg, (*idx)++; } - list_for_each_entry(port, &devlink->port_list, list) { + xa_for_each(&devlink->ports, port_index, port) { err = devlink_nl_cmd_region_get_port_dumpit(msg, cb, port, idx, start); if (err) @@ -8057,10 +8048,10 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { struct devlink_health_reporter *reporter; + unsigned long index, port_index; struct devlink_port *port; struct devlink *devlink; int start = cb->args[0]; - unsigned long index; int idx = 0; int err; @@ -8089,7 +8080,7 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, devlinks_xa_for_each_registered_get(sock_net(msg->sk), index, devlink) { devl_lock(devlink); - list_for_each_entry(port, &devlink->port_list, list) { + xa_for_each(&devlink->ports, port_index, port) { mutex_lock(&port->reporters_lock); list_for_each_entry(reporter, &port->reporter_list, list) { if (idx < start) { @@ -9808,9 +9799,9 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, devlink->dev = dev; devlink->ops = ops; + xa_init_flags(&devlink->ports, XA_FLAGS_ALLOC); xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC); write_pnet(&devlink->_net, net); - INIT_LIST_HEAD(&devlink->port_list); INIT_LIST_HEAD(&devlink->rate_list); INIT_LIST_HEAD(&devlink->linecard_list); INIT_LIST_HEAD(&devlink->sb_list); @@ -9862,12 +9853,13 @@ static void devlink_notify_register(struct devlink *devlink) struct devlink_linecard *linecard; struct devlink_rate *rate_node; struct devlink_region *region; + unsigned long port_index; devlink_notify(devlink, DEVLINK_CMD_NEW); list_for_each_entry(linecard, &devlink->linecard_list, list) devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW); - list_for_each_entry(devlink_port, &devlink->port_list, list) + xa_for_each(&devlink->ports, port_index, devlink_port) devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); list_for_each_entry(policer_item, &devlink->trap_policer_list, list) @@ -9901,6 +9893,7 @@ static void devlink_notify_unregister(struct devlink *devlink) struct devlink_port *devlink_port; struct devlink_rate *rate_node; struct devlink_region *region; + unsigned long port_index; list_for_each_entry_reverse(param_item, &devlink->param_list, list) devlink_param_notify(devlink, 0, param_item, @@ -9923,7 +9916,7 @@ static void devlink_notify_unregister(struct devlink *devlink) devlink_trap_policer_notify(devlink, policer_item, DEVLINK_CMD_TRAP_POLICER_DEL); - list_for_each_entry_reverse(devlink_port, &devlink->port_list, list) + xa_for_each(&devlink->ports, port_index, devlink_port) devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); devlink_notify(devlink, DEVLINK_CMD_DEL); } @@ -9987,9 +9980,10 @@ void devlink_free(struct devlink *devlink) WARN_ON(!list_empty(&devlink->sb_list)); WARN_ON(!list_empty(&devlink->rate_list)); WARN_ON(!list_empty(&devlink->linecard_list)); - WARN_ON(!list_empty(&devlink->port_list)); + WARN_ON(!xa_empty(&devlink->ports)); xa_destroy(&devlink->snapshot_ids); + xa_destroy(&devlink->ports); WARN_ON_ONCE(unregister_netdevice_notifier_net(devlink_net(devlink), &devlink->netdevice_nb)); @@ -10088,10 +10082,9 @@ int devl_port_register(struct devlink *devlink, struct devlink_port *devlink_port, unsigned int port_index) { - devl_assert_locked(devlink); + int err; - if (devlink_port_index_exists(devlink, port_index)) - return -EEXIST; + devl_assert_locked(devlink); ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); @@ -10101,7 +10094,11 @@ int devl_port_register(struct devlink *devlink, spin_lock_init(&devlink_port->type_lock); INIT_LIST_HEAD(&devlink_port->reporter_list); mutex_init(&devlink_port->reporters_lock); - list_add_tail(&devlink_port->list, &devlink->port_list); + err = xa_insert(&devlink->ports, port_index, devlink_port, GFP_KERNEL); + if (err) { + mutex_destroy(&devlink_port->reporters_lock); + return err; + } INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); devlink_port_type_warn_schedule(devlink_port); @@ -10150,7 +10147,7 @@ void devl_port_unregister(struct devlink_port *devlink_port) devlink_port_type_warn_cancel(devlink_port); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); - list_del(&devlink_port->list); + xa_erase(&devlink_port->devlink->ports, devlink_port->index); WARN_ON(!list_empty(&devlink_port->reporter_list)); mutex_destroy(&devlink_port->reporters_lock); devlink_port->registered = false; -- cgit v1.2.3-70-g09d2 From 55fb80d518c7323d05b71eda0c9f9d657b373816 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Dec 2022 05:28:47 +0000 Subject: tcp: use 2-arg optimal variant of kfree_rcu() kfree_rcu(1-arg) should be avoided as much as possible, since this is only possible from sleepable contexts, and incurr extra rcu barriers. I wish the 1-arg variant of kfree_rcu() would get a distinct name, like kfree_rcu_slow() to avoid it being abused. Fixes: 459837b522f7 ("net/tcp: Disable TCP-MD5 static key on tcp_md5sig_info destruction") Signed-off-by: Eric Dumazet Cc: Paul E. McKenney Reviewed-by: Pavan Chebbi Reviewed-by: Dmitry Safonov Link: https://lore.kernel.org/r/20221202052847.2623997-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ipv4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7fae586405cf..8320d0ecb13a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1245,7 +1245,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); rcu_assign_pointer(tp->md5sig_info, NULL); - kfree_rcu(md5sig); + kfree_rcu(md5sig, rcu); return -EUSERS; } } @@ -1271,7 +1271,7 @@ int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); net_warn_ratelimited("Too many TCP-MD5 keys in the system\n"); rcu_assign_pointer(tp->md5sig_info, NULL); - kfree_rcu(md5sig); + kfree_rcu(md5sig, rcu); return -EUSERS; } } -- cgit v1.2.3-70-g09d2 From d93607082e982223cf92750f2d9039ff365b9d24 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 30 Nov 2022 23:28:26 +0100 Subject: net: add netdev_sw_irq_coalesce_default_on() Add a helper for drivers wanting to set SW IRQ coalescing by default. The related sysfs attributes can be used to override the default values. Follow Jakub's suggestion and put this functionality into net core so that drivers wanting to use software interrupt coalescing per default don't have to open-code it. Note that this function needs to be called before the netdevice is registered. Suggested-by: Jakub Kicinski Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + net/core/dev.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5aa35c58c342..f78db610ada5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -78,6 +78,7 @@ struct xdp_buff; void synchronize_net(void); void netdev_set_default_ethtool_ops(struct net_device *dev, const struct ethtool_ops *ops); +void netdev_sw_irq_coalesce_default_on(struct net_device *dev); /* Backlog congestion levels */ #define NET_RX_SUCCESS 0 /* keep 'em coming, baby */ diff --git a/net/core/dev.c b/net/core/dev.c index 7627c475d991..b76fb37b381e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10517,6 +10517,22 @@ void netdev_set_default_ethtool_ops(struct net_device *dev, } EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); +/** + * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default + * @dev: netdev to enable the IRQ coalescing on + * + * Sets a conservative default for SW IRQ coalescing. Users can use + * sysfs attributes to override the default values. + */ +void netdev_sw_irq_coalesce_default_on(struct net_device *dev) +{ + WARN_ON(dev->reg_state == NETREG_REGISTERED); + + dev->gro_flush_timeout = 20000; + dev->napi_defer_hard_irqs = 1; +} +EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on); + void netdev_freemem(struct net_device *dev) { char *addr = (char *)dev - dev->padded; -- cgit v1.2.3-70-g09d2 From 578ce69ffda49d6c1a252490553290d1f27199f0 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Thu, 1 Dec 2022 13:39:39 +0100 Subject: bpf: Add dummy type reference to nf_conn___init to fix type deduplication MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bpf_ct_set_nat_info() kfunc is defined in the nf_nat.ko module, and takes as a parameter the nf_conn___init struct, which is allocated through the bpf_xdp_ct_alloc() helper defined in the nf_conntrack.ko module. However, because kernel modules can't deduplicate BTF types between each other, and the nf_conn___init struct is not referenced anywhere in vmlinux BTF, this leads to two distinct BTF IDs for the same type (one in each module). This confuses the verifier, as described here: https://lore.kernel.org/all/87leoh372s.fsf@toke.dk/ As a workaround, add an explicit BTF_TYPE_EMIT for the type in net/filter.c, so the type definition gets included in vmlinux BTF. This way, both modules can refer to the same type ID (as they both build on top of vmlinux BTF), and the verifier is no longer confused. v2: - Use BTF_TYPE_EMIT (which is a statement so it has to be inside a function definition; use xdp_func_proto() for this, since this is mostly xdp-related). Fixes: 820dc0523e05 ("net: netfilter: move bpf_ct_set_nat_info kfunc in nf_nat_bpf.c") Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20221201123939.696558-1-toke@redhat.com Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 37baaa6b8fc3..8607136b6e2c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -80,6 +80,7 @@ #include #include #include +#include static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id); @@ -7992,6 +7993,19 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) default: return bpf_sk_base_func_proto(func_id); } + +#if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) + /* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. The + * kfuncs are defined in two different modules, and we want to be able + * to use them interchangably with the same BTF type ID. Because modules + * can't de-duplicate BTF IDs between each other, we need the type to be + * referenced in the vmlinux BTF or the verifier will get confused about + * the different types. So we add this dummy type reference which will + * be included in vmlinux BTF, allowing both modules to refer to the + * same type ID. + */ + BTF_TYPE_EMIT(struct nf_conn___init); +#endif } const struct bpf_func_proto bpf_sock_map_update_proto __weak; -- cgit v1.2.3-70-g09d2 From 0a182f8d607464911756b4dbef5d6cad8de22469 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 2 Dec 2022 11:16:40 +0000 Subject: bpf, sockmap: fix race in sock_map_free() sock_map_free() calls release_sock(sk) without owning a reference on the socket. This can cause use-after-free as syzbot found [1] Jakub Sitnicki already took care of a similar issue in sock_hash_free() in commit 75e68e5bf2c7 ("bpf, sockhash: Synchronize delete from bucket list on map free") [1] refcount_t: decrement hit 0; leaking memory. WARNING: CPU: 0 PID: 3785 at lib/refcount.c:31 refcount_warn_saturate+0x17c/0x1a0 lib/refcount.c:31 Modules linked in: CPU: 0 PID: 3785 Comm: kworker/u4:6 Not tainted 6.1.0-rc7-syzkaller-00103-gef4d3ea40565 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 Workqueue: events_unbound bpf_map_free_deferred RIP: 0010:refcount_warn_saturate+0x17c/0x1a0 lib/refcount.c:31 Code: 68 8b 31 c0 e8 75 71 15 fd 0f 0b e9 64 ff ff ff e8 d9 6e 4e fd c6 05 62 9c 3d 0a 01 48 c7 c7 80 bb 68 8b 31 c0 e8 54 71 15 fd <0f> 0b e9 43 ff ff ff 89 d9 80 e1 07 80 c1 03 38 c1 0f 8c a2 fe ff RSP: 0018:ffffc9000456fb60 EFLAGS: 00010246 RAX: eae59bab72dcd700 RBX: 0000000000000004 RCX: ffff8880207057c0 RDX: 0000000000000000 RSI: 0000000000000201 RDI: 0000000000000000 RBP: 0000000000000004 R08: ffffffff816fdabd R09: fffff520008adee5 R10: fffff520008adee5 R11: 1ffff920008adee4 R12: 0000000000000004 R13: dffffc0000000000 R14: ffff88807b1c6c00 R15: 1ffff1100f638dcf FS: 0000000000000000(0000) GS:ffff8880b9800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b30c30000 CR3: 000000000d08e000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __refcount_dec include/linux/refcount.h:344 [inline] refcount_dec include/linux/refcount.h:359 [inline] __sock_put include/net/sock.h:779 [inline] tcp_release_cb+0x2d0/0x360 net/ipv4/tcp_output.c:1092 release_sock+0xaf/0x1c0 net/core/sock.c:3468 sock_map_free+0x219/0x2c0 net/core/sock_map.c:356 process_one_work+0x81c/0xd10 kernel/workqueue.c:2289 worker_thread+0xb14/0x1330 kernel/workqueue.c:2436 kthread+0x266/0x300 kernel/kthread.c:376 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:306 Fixes: 7e81a3530206 ("bpf: Sockmap, ensure sock lock held during tear down") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Jakub Sitnicki Cc: John Fastabend Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Song Liu Acked-by: John Fastabend Link: https://lore.kernel.org/r/20221202111640.2745533-1-edumazet@google.com Signed-off-by: Alexei Starovoitov --- net/core/sock_map.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 81beb16ab1eb..22fa2c5bc6ec 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -349,11 +349,13 @@ static void sock_map_free(struct bpf_map *map) sk = xchg(psk, NULL); if (sk) { + sock_hold(sk); lock_sock(sk); rcu_read_lock(); sock_map_unref(sk, psk); rcu_read_unlock(); release_sock(sk); + sock_put(sk); } } -- cgit v1.2.3-70-g09d2 From d14f28b8c1de668bab863bf5892a49c824cb110d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 20:41:27 +0200 Subject: xfrm: add new packet offload flag In the next patches, the xfrm core code will be extended to support new type of offload - packet offload. In that mode, both policy and state should be specially configured in order to perform whole offloaded data path. Full offload takes care of encryption, decryption, encapsulation and other operations with headers. As this mode is new for XFRM policy flow, we can "start fresh" with flag bits and release first and second bit for future use. Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 7 +++++++ include/uapi/linux/xfrm.h | 6 ++++++ net/xfrm/xfrm_device.c | 3 +++ net/xfrm/xfrm_user.c | 2 ++ 4 files changed, 18 insertions(+) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index e0cc6791c001..b39d24fa2ef0 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -131,12 +131,19 @@ enum { XFRM_DEV_OFFLOAD_OUT, }; +enum { + XFRM_DEV_OFFLOAD_UNSPECIFIED, + XFRM_DEV_OFFLOAD_CRYPTO, + XFRM_DEV_OFFLOAD_PACKET, +}; + struct xfrm_dev_offload { struct net_device *dev; netdevice_tracker dev_tracker; struct net_device *real_dev; unsigned long offload_handle; u8 dir : 2; + u8 type : 2; }; struct xfrm_mode { diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 4f84ea7ee14c..23543c33fee8 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -519,6 +519,12 @@ struct xfrm_user_offload { */ #define XFRM_OFFLOAD_IPV6 1 #define XFRM_OFFLOAD_INBOUND 2 +/* Two bits above are relevant for state path only, while + * offload is used for both policy and state flows. + * + * In policy offload mode, they are free and can be safely reused. + */ +#define XFRM_OFFLOAD_PACKET 4 struct xfrm_userpolicy_default { #define XFRM_USERPOLICY_UNSPEC 0 diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 21269e8f2db4..3b0c1ca8d4bb 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -291,12 +291,15 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, else xso->dir = XFRM_DEV_OFFLOAD_OUT; + xso->type = XFRM_DEV_OFFLOAD_CRYPTO; + err = dev->xfrmdev_ops->xdo_dev_state_add(x); if (err) { xso->dev = NULL; xso->dir = 0; xso->real_dev = NULL; netdev_put(dev, &xso->dev_tracker); + xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; if (err != -EOPNOTSUPP) { NL_SET_ERR_MSG(extack, "Device failed to offload this state"); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 0eb4696661c8..c3b8c1532718 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -956,6 +956,8 @@ static int copy_user_offload(struct xfrm_dev_offload *xso, struct sk_buff *skb) xuo->ifindex = xso->dev->ifindex; if (xso->dir == XFRM_DEV_OFFLOAD_IN) xuo->flags = XFRM_OFFLOAD_INBOUND; + if (xso->type == XFRM_DEV_OFFLOAD_PACKET) + xuo->flags |= XFRM_OFFLOAD_PACKET; return 0; } -- cgit v1.2.3-70-g09d2 From 62f6eca5de103c6823f6ca2abbf2ee242e132207 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 20:41:28 +0200 Subject: xfrm: allow state packet offload mode Allow users to configure xfrm states with packet offload mode. The packet mode must be requested both for policy and state, and such requires us to do not implement fallback. We explicitly return an error if requested packet mode can't be configured. Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- .../chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c | 4 ++++ drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 5 +++++ drivers/net/ethernet/intel/ixgbevf/ipsec.c | 5 +++++ .../ethernet/mellanox/mlx5/core/en_accel/ipsec.c | 4 ++++ drivers/net/ethernet/netronome/nfp/crypto/ipsec.c | 5 +++++ drivers/net/netdevsim/ipsec.c | 5 +++++ net/xfrm/xfrm_device.c | 24 +++++++++++++++++----- 7 files changed, 47 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c index 585590520076..ca21794281d6 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c @@ -283,6 +283,10 @@ static int ch_ipsec_xfrm_add_state(struct xfrm_state *x) pr_debug("Cannot offload xfrm states with geniv other than seqiv\n"); return -EINVAL; } + if (x->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) { + pr_debug("Unsupported xfrm offload\n"); + return -EINVAL; + } sa_entry = kzalloc(sizeof(*sa_entry), GFP_KERNEL); if (!sa_entry) { diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c index 774de63dd93a..53a969e34883 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c @@ -585,6 +585,11 @@ static int ixgbe_ipsec_add_sa(struct xfrm_state *xs) return -EINVAL; } + if (xs->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) { + netdev_err(dev, "Unsupported ipsec offload type\n"); + return -EINVAL; + } + if (xs->xso.dir == XFRM_DEV_OFFLOAD_IN) { struct rx_sa rsa; diff --git a/drivers/net/ethernet/intel/ixgbevf/ipsec.c b/drivers/net/ethernet/intel/ixgbevf/ipsec.c index 9984ebc62d78..c1cf540d162a 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ipsec.c +++ b/drivers/net/ethernet/intel/ixgbevf/ipsec.c @@ -280,6 +280,11 @@ static int ixgbevf_ipsec_add_sa(struct xfrm_state *xs) return -EINVAL; } + if (xs->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) { + netdev_err(dev, "Unsupported ipsec offload type\n"); + return -EINVAL; + } + if (xs->xso.dir == XFRM_DEV_OFFLOAD_IN) { struct rx_sa rsa; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c index 1b03ab03fc5a..e6411533f911 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c @@ -253,6 +253,10 @@ static inline int mlx5e_xfrm_validate_state(struct xfrm_state *x) netdev_info(netdev, "Cannot offload xfrm states with geniv other than seqiv\n"); return -EINVAL; } + if (x->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) { + netdev_info(netdev, "Unsupported xfrm offload type\n"); + return -EINVAL; + } return 0; } diff --git a/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c b/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c index 3728870d8e9c..4632268695cb 100644 --- a/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c +++ b/drivers/net/ethernet/netronome/nfp/crypto/ipsec.c @@ -302,6 +302,11 @@ static int nfp_net_xfrm_add_state(struct xfrm_state *x) return -EINVAL; } + if (x->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) { + nn_err(nn, "Unsupported xfrm offload tyoe\n"); + return -EINVAL; + } + cfg->spi = ntohl(x->id.spi); /* Hash/Authentication */ diff --git a/drivers/net/netdevsim/ipsec.c b/drivers/net/netdevsim/ipsec.c index 386336a38f34..b93baf5c8bee 100644 --- a/drivers/net/netdevsim/ipsec.c +++ b/drivers/net/netdevsim/ipsec.c @@ -149,6 +149,11 @@ static int nsim_ipsec_add_sa(struct xfrm_state *xs) return -EINVAL; } + if (xs->xso.type != XFRM_DEV_OFFLOAD_CRYPTO) { + netdev_err(dev, "Unsupported ipsec offload type\n"); + return -EINVAL; + } + /* find the first unused index */ ret = nsim_ipsec_find_empty_idx(ipsec); if (ret < 0) { diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 3b0c1ca8d4bb..3184b2c394b6 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -229,6 +229,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_dev_offload *xso = &x->xso; xfrm_address_t *saddr; xfrm_address_t *daddr; + bool is_packet_offload; if (!x->type_offload) { NL_SET_ERR_MSG(extack, "Type doesn't support offload"); @@ -241,11 +242,13 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, return -EINVAL; } - if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND)) { + if (xuo->flags & + ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND | XFRM_OFFLOAD_PACKET)) { NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request"); return -EINVAL; } + is_packet_offload = xuo->flags & XFRM_OFFLOAD_PACKET; dev = dev_get_by_index(net, xuo->ifindex); if (!dev) { if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) { @@ -260,7 +263,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, x->props.family, xfrm_smark_get(0, x)); if (IS_ERR(dst)) - return 0; + return (is_packet_offload) ? -EINVAL : 0; dev = dst->dev; @@ -271,7 +274,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_state_add) { xso->dev = NULL; dev_put(dev); - return 0; + return (is_packet_offload) ? -EINVAL : 0; } if (x->props.flags & XFRM_STATE_ESN && @@ -291,7 +294,10 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, else xso->dir = XFRM_DEV_OFFLOAD_OUT; - xso->type = XFRM_DEV_OFFLOAD_CRYPTO; + if (is_packet_offload) + xso->type = XFRM_DEV_OFFLOAD_PACKET; + else + xso->type = XFRM_DEV_OFFLOAD_CRYPTO; err = dev->xfrmdev_ops->xdo_dev_state_add(x); if (err) { @@ -301,7 +307,15 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, netdev_put(dev, &xso->dev_tracker); xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; - if (err != -EOPNOTSUPP) { + /* User explicitly requested packet offload mode and configured + * policy in addition to the XFRM state. So be civil to users, + * and return an error instead of taking fallback path. + * + * This WARN_ON() can be seen as a documentation for driver + * authors to do not return -EOPNOTSUPP in packet offload mode. + */ + WARN_ON(err == -EOPNOTSUPP && is_packet_offload); + if (err != -EOPNOTSUPP || is_packet_offload) { NL_SET_ERR_MSG(extack, "Device failed to offload this state"); return err; } -- cgit v1.2.3-70-g09d2 From 919e43fad5163a8ceb39826ecdee897a9f799351 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 20:41:29 +0200 Subject: xfrm: add an interface to offload policy Extend netlink interface to add and delete XFRM policy from the device. This functionality is a first step to implement packet IPsec offload solution. Signed-off-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/linux/netdevice.h | 3 +++ include/net/xfrm.h | 45 +++++++++++++++++++++++++++++++ net/xfrm/xfrm_device.c | 67 ++++++++++++++++++++++++++++++++++++++++++++- net/xfrm/xfrm_policy.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++ net/xfrm/xfrm_user.c | 18 +++++++++++++ 5 files changed, 201 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5aa35c58c342..4096e3fe8e4a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1040,6 +1040,9 @@ struct xfrmdev_ops { bool (*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); + int (*xdo_dev_policy_add) (struct xfrm_policy *x); + void (*xdo_dev_policy_delete) (struct xfrm_policy *x); + void (*xdo_dev_policy_free) (struct xfrm_policy *x); }; #endif diff --git a/include/net/xfrm.h b/include/net/xfrm.h index b39d24fa2ef0..6fea34cbdf48 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -129,6 +129,7 @@ struct xfrm_state_walk { enum { XFRM_DEV_OFFLOAD_IN = 1, XFRM_DEV_OFFLOAD_OUT, + XFRM_DEV_OFFLOAD_FWD, }; enum { @@ -541,6 +542,8 @@ struct xfrm_policy { struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH]; struct hlist_node bydst_inexact_list; struct rcu_head rcu; + + struct xfrm_dev_offload xdo; }; static inline struct net *xp_net(const struct xfrm_policy *xp) @@ -1585,6 +1588,8 @@ struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); int xfrm_state_delete(struct xfrm_state *x); int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); +int xfrm_dev_policy_flush(struct net *net, struct net_device *dev, + bool task_valid); void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si); void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); @@ -1899,6 +1904,9 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo, struct netlink_ext_ack *extack); +int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, + struct xfrm_user_offload *xuo, u8 dir, + struct netlink_ext_ack *extack); bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x) @@ -1947,6 +1955,28 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x) netdev_put(dev, &xso->dev_tracker); } } + +static inline void xfrm_dev_policy_delete(struct xfrm_policy *x) +{ + struct xfrm_dev_offload *xdo = &x->xdo; + struct net_device *dev = xdo->dev; + + if (dev && dev->xfrmdev_ops && dev->xfrmdev_ops->xdo_dev_policy_delete) + dev->xfrmdev_ops->xdo_dev_policy_delete(x); +} + +static inline void xfrm_dev_policy_free(struct xfrm_policy *x) +{ + struct xfrm_dev_offload *xdo = &x->xdo; + struct net_device *dev = xdo->dev; + + if (dev && dev->xfrmdev_ops) { + if (dev->xfrmdev_ops->xdo_dev_policy_free) + dev->xfrmdev_ops->xdo_dev_policy_free(x); + xdo->dev = NULL; + netdev_put(dev, &xdo->dev_tracker); + } +} #else static inline void xfrm_dev_resume(struct sk_buff *skb) { @@ -1974,6 +2004,21 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x) { } +static inline int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, + struct xfrm_user_offload *xuo, u8 dir, + struct netlink_ext_ack *extack) +{ + return 0; +} + +static inline void xfrm_dev_policy_delete(struct xfrm_policy *x) +{ +} + +static inline void xfrm_dev_policy_free(struct xfrm_policy *x) +{ +} + static inline bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) { return false; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 3184b2c394b6..04ae510dcc66 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -325,6 +325,69 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, } EXPORT_SYMBOL_GPL(xfrm_dev_state_add); +int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, + struct xfrm_user_offload *xuo, u8 dir, + struct netlink_ext_ack *extack) +{ + struct xfrm_dev_offload *xdo = &xp->xdo; + struct net_device *dev; + int err; + + if (!xuo->flags || xuo->flags & ~XFRM_OFFLOAD_PACKET) { + /* We support only packet offload mode and it means + * that user must set XFRM_OFFLOAD_PACKET bit. + */ + NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request"); + return -EINVAL; + } + + dev = dev_get_by_index(net, xuo->ifindex); + if (!dev) + return -EINVAL; + + if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_policy_add) { + xdo->dev = NULL; + dev_put(dev); + NL_SET_ERR_MSG(extack, "Policy offload is not supported"); + return -EINVAL; + } + + xdo->dev = dev; + netdev_tracker_alloc(dev, &xdo->dev_tracker, GFP_ATOMIC); + xdo->real_dev = dev; + xdo->type = XFRM_DEV_OFFLOAD_PACKET; + switch (dir) { + case XFRM_POLICY_IN: + xdo->dir = XFRM_DEV_OFFLOAD_IN; + break; + case XFRM_POLICY_OUT: + xdo->dir = XFRM_DEV_OFFLOAD_OUT; + break; + case XFRM_POLICY_FWD: + xdo->dir = XFRM_DEV_OFFLOAD_FWD; + break; + default: + xdo->dev = NULL; + dev_put(dev); + NL_SET_ERR_MSG(extack, "Unrecognized oflload direction"); + return -EINVAL; + } + + err = dev->xfrmdev_ops->xdo_dev_policy_add(xp); + if (err) { + xdo->dev = NULL; + xdo->real_dev = NULL; + xdo->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; + xdo->dir = 0; + netdev_put(dev, &xdo->dev_tracker); + NL_SET_ERR_MSG(extack, "Device failed to offload this policy"); + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(xfrm_dev_policy_add); + bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) { int mtu; @@ -427,8 +490,10 @@ static int xfrm_api_check(struct net_device *dev) static int xfrm_dev_down(struct net_device *dev) { - if (dev->features & NETIF_F_HW_ESP) + if (dev->features & NETIF_F_HW_ESP) { xfrm_dev_state_flush(dev_net(dev), dev, true); + xfrm_dev_policy_flush(dev_net(dev), dev, true); + } return NOTIFY_DONE; } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 9b9e2765363d..8b8760907563 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -425,6 +425,7 @@ void xfrm_policy_destroy(struct xfrm_policy *policy) if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer)) BUG(); + xfrm_dev_policy_free(policy); call_rcu(&policy->rcu, xfrm_policy_destroy_rcu); } EXPORT_SYMBOL(xfrm_policy_destroy); @@ -1769,12 +1770,41 @@ xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) } return err; } + +static inline int xfrm_dev_policy_flush_secctx_check(struct net *net, + struct net_device *dev, + bool task_valid) +{ + struct xfrm_policy *pol; + int err = 0; + + list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { + if (pol->walk.dead || + xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX || + pol->xdo.dev != dev) + continue; + + err = security_xfrm_policy_delete(pol->security); + if (err) { + xfrm_audit_policy_delete(pol, 0, task_valid); + return err; + } + } + return err; +} #else static inline int xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) { return 0; } + +static inline int xfrm_dev_policy_flush_secctx_check(struct net *net, + struct net_device *dev, + bool task_valid) +{ + return 0; +} #endif int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) @@ -1814,6 +1844,44 @@ out: } EXPORT_SYMBOL(xfrm_policy_flush); +int xfrm_dev_policy_flush(struct net *net, struct net_device *dev, + bool task_valid) +{ + int dir, err = 0, cnt = 0; + struct xfrm_policy *pol; + + spin_lock_bh(&net->xfrm.xfrm_policy_lock); + + err = xfrm_dev_policy_flush_secctx_check(net, dev, task_valid); + if (err) + goto out; + +again: + list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { + dir = xfrm_policy_id2dir(pol->index); + if (pol->walk.dead || + dir >= XFRM_POLICY_MAX || + pol->xdo.dev != dev) + continue; + + __xfrm_policy_unlink(pol, dir); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + cnt++; + xfrm_audit_policy_delete(pol, 1, task_valid); + xfrm_policy_kill(pol); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); + goto again; + } + if (cnt) + __xfrm_policy_inexact_flush(net); + else + err = -ESRCH; +out: + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + return err; +} +EXPORT_SYMBOL(xfrm_dev_policy_flush); + int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, int (*func)(struct xfrm_policy *, int, int, void*), void *data) @@ -2245,6 +2313,7 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir) pol = __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (pol) { + xfrm_dev_policy_delete(pol); xfrm_policy_kill(pol); return 0; } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index c3b8c1532718..cf5172d4ce68 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1892,6 +1892,15 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net, if (attrs[XFRMA_IF_ID]) xp->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + /* configure the hardware if offload is requested */ + if (attrs[XFRMA_OFFLOAD_DEV]) { + err = xfrm_dev_policy_add(net, xp, + nla_data(attrs[XFRMA_OFFLOAD_DEV]), + p->dir, extack); + if (err) + goto error; + } + return xp; error: *errp = err; @@ -1931,6 +1940,7 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, xfrm_audit_policy_add(xp, err ? 0 : 1, true); if (err) { + xfrm_dev_policy_delete(xp); security_xfrm_policy_free(xp->security); kfree(xp); return err; @@ -2043,6 +2053,8 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr err = xfrm_mark_put(skb, &xp->mark); if (!err) err = xfrm_if_id_put(skb, xp->if_id); + if (!err && xp->xdo.dev) + err = copy_user_offload(&xp->xdo, skb); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -3381,6 +3393,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x, err = xfrm_mark_put(skb, &xp->mark); if (!err) err = xfrm_if_id_put(skb, xp->if_id); + if (!err && xp->xdo.dev) + err = copy_user_offload(&xp->xdo, skb); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -3499,6 +3513,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp, err = xfrm_mark_put(skb, &xp->mark); if (!err) err = xfrm_if_id_put(skb, xp->if_id); + if (!err && xp->xdo.dev) + err = copy_user_offload(&xp->xdo, skb); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -3582,6 +3598,8 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, const struct km_e err = xfrm_mark_put(skb, &xp->mark); if (!err) err = xfrm_if_id_put(skb, xp->if_id); + if (!err && xp->xdo.dev) + err = copy_user_offload(&xp->xdo, skb); if (err) goto out_free_skb; -- cgit v1.2.3-70-g09d2 From f8a70afafc1759b1fca4baaa891625dde49c10b7 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 20:41:30 +0200 Subject: xfrm: add TX datapath support for IPsec packet offload mode In IPsec packet mode, the device is going to encrypt and encapsulate packets that are associated with offloaded policy. After successful policy lookup to indicate if packets should be offloaded or not, the stack forwards packets to the device to do the magic. Signed-off-by: Raed Salem Signed-off-by: Huy Nguyen Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_device.c | 15 +++++- net/xfrm/xfrm_output.c | 12 ++++- net/xfrm/xfrm_state.c | 121 +++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 142 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 04ae510dcc66..3e9e874522a8 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -132,6 +132,16 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur if (xo->flags & XFRM_GRO || x->xso.dir == XFRM_DEV_OFFLOAD_IN) return skb; + /* The packet was sent to HW IPsec packet offload engine, + * but to wrong device. Drop the packet, so it won't skip + * XFRM stack. + */ + if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET && x->xso.dev != dev) { + kfree_skb(skb); + dev_core_stats_tx_dropped_inc(dev); + return NULL; + } + /* This skb was already validated on the upper/virtual dev */ if ((x->xso.dev != dev) && (x->xso.real_dev == dev)) return skb; @@ -398,8 +408,9 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) if (!x->type_offload || x->encap) return false; - if ((!dev || (dev == xfrm_dst_path(dst)->dev)) && - (!xdst->child->xfrm)) { + if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET || + ((!dev || (dev == xfrm_dst_path(dst)->dev)) && + !xdst->child->xfrm)) { mtu = xfrm_state_mtu(x, xdst->child_mtu_cached); if (skb->len <= mtu) goto ok; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 78cb8d0a6a18..ff114d68cc43 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -492,7 +492,7 @@ static int xfrm_output_one(struct sk_buff *skb, int err) struct xfrm_state *x = dst->xfrm; struct net *net = xs_net(x); - if (err <= 0) + if (err <= 0 || x->xso.type == XFRM_DEV_OFFLOAD_PACKET) goto resume; do { @@ -717,6 +717,16 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) break; } + if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) { + if (!xfrm_dev_offload_ok(skb, x)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); + kfree_skb(skb); + return -EHOSTUNREACH; + } + + return xfrm_output_resume(sk, skb, 0); + } + secpath_reset(skb); if (xfrm_dev_offload_ok(skb, x)) { diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 9ec481fbfb63..4d315e1a88fa 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -951,6 +951,49 @@ xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl, x->props.family = tmpl->encap_family; } +static struct xfrm_state *__xfrm_state_lookup_all(struct net *net, u32 mark, + const xfrm_address_t *daddr, + __be32 spi, u8 proto, + unsigned short family, + struct xfrm_dev_offload *xdo) +{ + unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family); + struct xfrm_state *x; + + hlist_for_each_entry_rcu(x, net->xfrm.state_byspi + h, byspi) { +#ifdef CONFIG_XFRM_OFFLOAD + if (xdo->type == XFRM_DEV_OFFLOAD_PACKET) { + if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) + /* HW states are in the head of list, there is + * no need to iterate further. + */ + break; + + /* Packet offload: both policy and SA should + * have same device. + */ + if (xdo->dev != x->xso.dev) + continue; + } else if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) + /* Skip HW policy for SW lookups */ + continue; +#endif + if (x->props.family != family || + x->id.spi != spi || + x->id.proto != proto || + !xfrm_addr_equal(&x->id.daddr, daddr, family)) + continue; + + if ((mark & x->mark.m) != x->mark.v) + continue; + if (!xfrm_state_hold_rcu(x)) + continue; + return x; + } + + return NULL; +} + static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, @@ -1092,6 +1135,23 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, rcu_read_lock(); h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h, bydst) { +#ifdef CONFIG_XFRM_OFFLOAD + if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { + if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) + /* HW states are in the head of list, there is + * no need to iterate further. + */ + break; + + /* Packet offload: both policy and SA should + * have same device. + */ + if (pol->xdo.dev != x->xso.dev) + continue; + } else if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) + /* Skip HW policy for SW lookups */ + continue; +#endif if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && @@ -1109,6 +1169,23 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family); hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h_wildcard, bydst) { +#ifdef CONFIG_XFRM_OFFLOAD + if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { + if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) + /* HW states are in the head of list, there is + * no need to iterate further. + */ + break; + + /* Packet offload: both policy and SA should + * have same device. + */ + if (pol->xdo.dev != x->xso.dev) + continue; + } else if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) + /* Skip HW policy for SW lookups */ + continue; +#endif if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && @@ -1126,8 +1203,10 @@ found: x = best; if (!x && !error && !acquire_in_progress) { if (tmpl->id.spi && - (x0 = __xfrm_state_lookup(net, mark, daddr, tmpl->id.spi, - tmpl->id.proto, encap_family)) != NULL) { + (x0 = __xfrm_state_lookup_all(net, mark, daddr, + tmpl->id.spi, tmpl->id.proto, + encap_family, + &pol->xdo)) != NULL) { to_put = x0; error = -EEXIST; goto out; @@ -1161,7 +1240,31 @@ found: x = NULL; goto out; } - +#ifdef CONFIG_XFRM_OFFLOAD + if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { + struct xfrm_dev_offload *xdo = &pol->xdo; + struct xfrm_dev_offload *xso = &x->xso; + + xso->type = XFRM_DEV_OFFLOAD_PACKET; + xso->dir = xdo->dir; + xso->dev = xdo->dev; + xso->real_dev = xdo->real_dev; + netdev_tracker_alloc(xso->dev, &xso->dev_tracker, + GFP_ATOMIC); + error = xso->dev->xfrmdev_ops->xdo_dev_state_add(x); + if (error) { + xso->dir = 0; + netdev_put(xso->dev, &xso->dev_tracker); + xso->dev = NULL; + xso->real_dev = NULL; + xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; + x->km.state = XFRM_STATE_DEAD; + to_put = x; + x = NULL; + goto out; + } + } +#endif if (km_query(x, tmpl, pol) == 0) { spin_lock_bh(&net->xfrm.xfrm_state_lock); x->km.state = XFRM_STATE_ACQ; @@ -1185,6 +1288,18 @@ found: xfrm_hash_grow_check(net, x->bydst.next != NULL); spin_unlock_bh(&net->xfrm.xfrm_state_lock); } else { +#ifdef CONFIG_XFRM_OFFLOAD + struct xfrm_dev_offload *xso = &x->xso; + + if (xso->type == XFRM_DEV_OFFLOAD_PACKET) { + xso->dev->xfrmdev_ops->xdo_dev_state_delete(x); + xso->dir = 0; + netdev_put(xso->dev, &xso->dev_tracker); + xso->dev = NULL; + xso->real_dev = NULL; + xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; + } +#endif x->km.state = XFRM_STATE_DEAD; to_put = x; x = NULL; -- cgit v1.2.3-70-g09d2 From 3c611d40c6923c81e6a83a67156cd30a9503c155 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 20:41:32 +0200 Subject: xfrm: speed-up lookup of HW policies Devices that implement IPsec packet offload mode should offload SA and policies too. In RX path, it causes to the situation that HW will always have higher priority over any SW policies. It means that we don't need to perform any search of inexact policies and/or priority checks if HW policy was discovered. In such situation, the HW will catch the packets anyway and HW can still implement inexact lookups. In case specific policy is not found, we will continue with packet lookup and check for existence of HW policies in inexact list. HW policies are added to the head of SPD to ensure fast lookup, as XFRM iterates over all policies in the loop. The same solution of adding HW SAs at the begging of the list is applied to SA database too. However, we don't need to change lookups as they are sorted by insertion order and not priority. Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_policy.c | 16 ++++++++---- net/xfrm/xfrm_state.c | 66 ++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 62 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 8b8760907563..e9eb82c5457d 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -536,7 +536,7 @@ redo: __get_hash_thresh(net, pol->family, dir, &dbits, &sbits); h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr, pol->family, nhashmask, dbits, sbits); - if (!entry0) { + if (!entry0 || pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { hlist_del_rcu(&pol->bydst); hlist_add_head_rcu(&pol->bydst, ndsttable + h); h0 = h; @@ -867,7 +867,7 @@ static void xfrm_policy_inexact_list_reinsert(struct net *net, break; } - if (newpos) + if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) hlist_add_behind_rcu(&policy->bydst, newpos); else hlist_add_head_rcu(&policy->bydst, &n->hhead); @@ -1348,7 +1348,7 @@ static void xfrm_hash_rebuild(struct work_struct *work) else break; } - if (newpos) + if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) hlist_add_behind_rcu(&policy->bydst, newpos); else hlist_add_head_rcu(&policy->bydst, chain); @@ -1525,7 +1525,7 @@ static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, break; } - if (newpos) + if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos); else hlist_add_head_rcu(&policy->bydst_inexact_list, chain); @@ -1562,9 +1562,12 @@ static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain, break; } - if (newpos) + if (newpos && policy->xdo.type != XFRM_DEV_OFFLOAD_PACKET) hlist_add_behind_rcu(&policy->bydst, &newpos->bydst); else + /* Packet offload policies enter to the head + * to speed-up lookups. + */ hlist_add_head_rcu(&policy->bydst, chain); return delpol; @@ -2181,6 +2184,9 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, break; } } + if (ret && ret->xdo.type == XFRM_DEV_OFFLOAD_PACKET) + goto skip_inexact; + bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id); if (!bin || !xfrm_policy_find_inexact_candidates(&cand, bin, saddr, daddr)) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 4d315e1a88fa..2a190e85da80 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -84,6 +84,25 @@ static unsigned int xfrm_seq_hash(struct net *net, u32 seq) return __xfrm_seq_hash(seq, net->xfrm.state_hmask); } +#define XFRM_STATE_INSERT(by, _n, _h, _type) \ + { \ + struct xfrm_state *_x = NULL; \ + \ + if (_type != XFRM_DEV_OFFLOAD_PACKET) { \ + hlist_for_each_entry_rcu(_x, _h, by) { \ + if (_x->xso.type == XFRM_DEV_OFFLOAD_PACKET) \ + continue; \ + break; \ + } \ + } \ + \ + if (!_x || _x->xso.type == XFRM_DEV_OFFLOAD_PACKET) \ + /* SAD is empty or consist from HW SAs only */ \ + hlist_add_head_rcu(_n, _h); \ + else \ + hlist_add_before_rcu(_n, &_x->by); \ + } + static void xfrm_hash_transfer(struct hlist_head *list, struct hlist_head *ndsttable, struct hlist_head *nsrctable, @@ -100,23 +119,25 @@ static void xfrm_hash_transfer(struct hlist_head *list, h = __xfrm_dst_hash(&x->id.daddr, &x->props.saddr, x->props.reqid, x->props.family, nhashmask); - hlist_add_head_rcu(&x->bydst, ndsttable + h); + XFRM_STATE_INSERT(bydst, &x->bydst, ndsttable + h, x->xso.type); h = __xfrm_src_hash(&x->id.daddr, &x->props.saddr, x->props.family, nhashmask); - hlist_add_head_rcu(&x->bysrc, nsrctable + h); + XFRM_STATE_INSERT(bysrc, &x->bysrc, nsrctable + h, x->xso.type); if (x->id.spi) { h = __xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, x->props.family, nhashmask); - hlist_add_head_rcu(&x->byspi, nspitable + h); + XFRM_STATE_INSERT(byspi, &x->byspi, nspitable + h, + x->xso.type); } if (x->km.seq) { h = __xfrm_seq_hash(x->km.seq, nhashmask); - hlist_add_head_rcu(&x->byseq, nseqtable + h); + XFRM_STATE_INSERT(byseq, &x->byseq, nseqtable + h, + x->xso.type); } } } @@ -1269,16 +1290,24 @@ found: spin_lock_bh(&net->xfrm.xfrm_state_lock); x->km.state = XFRM_STATE_ACQ; list_add(&x->km.all, &net->xfrm.state_all); - hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h); + XFRM_STATE_INSERT(bydst, &x->bydst, + net->xfrm.state_bydst + h, + x->xso.type); h = xfrm_src_hash(net, daddr, saddr, encap_family); - hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h); + XFRM_STATE_INSERT(bysrc, &x->bysrc, + net->xfrm.state_bysrc + h, + x->xso.type); if (x->id.spi) { h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family); - hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h); + XFRM_STATE_INSERT(byspi, &x->byspi, + net->xfrm.state_byspi + h, + x->xso.type); } if (x->km.seq) { h = xfrm_seq_hash(net, x->km.seq); - hlist_add_head_rcu(&x->byseq, net->xfrm.state_byseq + h); + XFRM_STATE_INSERT(byseq, &x->byseq, + net->xfrm.state_byseq + h, + x->xso.type); } x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; hrtimer_start(&x->mtimer, @@ -1395,22 +1424,26 @@ static void __xfrm_state_insert(struct xfrm_state *x) h = xfrm_dst_hash(net, &x->id.daddr, &x->props.saddr, x->props.reqid, x->props.family); - hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h); + XFRM_STATE_INSERT(bydst, &x->bydst, net->xfrm.state_bydst + h, + x->xso.type); h = xfrm_src_hash(net, &x->id.daddr, &x->props.saddr, x->props.family); - hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h); + XFRM_STATE_INSERT(bysrc, &x->bysrc, net->xfrm.state_bysrc + h, + x->xso.type); if (x->id.spi) { h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family); - hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h); + XFRM_STATE_INSERT(byspi, &x->byspi, net->xfrm.state_byspi + h, + x->xso.type); } if (x->km.seq) { h = xfrm_seq_hash(net, x->km.seq); - hlist_add_head_rcu(&x->byseq, net->xfrm.state_byseq + h); + XFRM_STATE_INSERT(byseq, &x->byseq, net->xfrm.state_byseq + h, + x->xso.type); } hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); @@ -1524,9 +1557,11 @@ static struct xfrm_state *__find_acq_core(struct net *net, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL_SOFT); list_add(&x->km.all, &net->xfrm.state_all); - hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h); + XFRM_STATE_INSERT(bydst, &x->bydst, net->xfrm.state_bydst + h, + x->xso.type); h = xfrm_src_hash(net, daddr, saddr, family); - hlist_add_head_rcu(&x->bysrc, net->xfrm.state_bysrc + h); + XFRM_STATE_INSERT(bysrc, &x->bysrc, net->xfrm.state_bysrc + h, + x->xso.type); net->xfrm.state_num++; @@ -2209,7 +2244,8 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high, spin_lock_bh(&net->xfrm.xfrm_state_lock); x->id.spi = newspi; h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family); - hlist_add_head_rcu(&x->byspi, net->xfrm.state_byspi + h); + XFRM_STATE_INSERT(byspi, &x->byspi, net->xfrm.state_byspi + h, + x->xso.type); spin_unlock_bh(&net->xfrm.xfrm_state_lock); err = 0; -- cgit v1.2.3-70-g09d2 From f3da86dc2c8c9004445cfbb15ac086773622d853 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 20:41:33 +0200 Subject: xfrm: add support to HW update soft and hard limits Both in RX and TX, the traffic that performs IPsec packet offload transformation is accounted by HW. It is needed to properly handle hard limits that require to drop the packet. It means that XFRM core needs to update internal counters with the one that accounted by the HW, so new callbacks are introduced in this patch. In case of soft or hard limit is occurred, the driver should call to xfrm_state_check_expire() that will perform key rekeying exactly as done by XFRM core. Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/linux/netdevice.h | 1 + include/net/xfrm.h | 17 +++++++++++++++++ net/xfrm/xfrm_state.c | 4 ++++ 3 files changed, 22 insertions(+) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4096e3fe8e4a..29ae964e3b89 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1040,6 +1040,7 @@ struct xfrmdev_ops { bool (*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); + void (*xdo_dev_state_update_curlft) (struct xfrm_state *x); int (*xdo_dev_policy_add) (struct xfrm_policy *x); void (*xdo_dev_policy_delete) (struct xfrm_policy *x); void (*xdo_dev_policy_free) (struct xfrm_policy *x); diff --git a/include/net/xfrm.h b/include/net/xfrm.h index b6ee14991a32..5413cdd5ad62 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1571,6 +1571,23 @@ struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id, struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi, unsigned short family); int xfrm_state_check_expire(struct xfrm_state *x); +#ifdef CONFIG_XFRM_OFFLOAD +static inline void xfrm_dev_state_update_curlft(struct xfrm_state *x) +{ + struct xfrm_dev_offload *xdo = &x->xso; + struct net_device *dev = xdo->dev; + + if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) + return; + + if (dev && dev->xfrmdev_ops && + dev->xfrmdev_ops->xdo_dev_state_update_curlft) + dev->xfrmdev_ops->xdo_dev_state_update_curlft(x); + +} +#else +static inline void xfrm_dev_state_update_curlft(struct xfrm_state *x) {} +#endif void xfrm_state_insert(struct xfrm_state *x); int xfrm_state_add(struct xfrm_state *x); int xfrm_state_update(struct xfrm_state *x); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 2a190e85da80..cc1d0ea42672 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -570,6 +570,8 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) int err = 0; spin_lock(&x->lock); + xfrm_dev_state_update_curlft(x); + if (x->km.state == XFRM_STATE_DEAD) goto out; if (x->km.state == XFRM_STATE_EXPIRED) @@ -1936,6 +1938,8 @@ EXPORT_SYMBOL(xfrm_state_update); int xfrm_state_check_expire(struct xfrm_state *x) { + xfrm_dev_state_update_curlft(x); + if (!x->curlft.use_time) x->curlft.use_time = ktime_get_real_seconds(); -- cgit v1.2.3-70-g09d2 From 7112a04664bfc10ae4709b2079fe3991cbd1fe18 Mon Sep 17 00:00:00 2001 From: Sudheer Mogilappagari Date: Thu, 1 Dec 2022 16:25:55 -0800 Subject: ethtool: add netlink based get rss support Add netlink based support for "ethtool -x [context x]" command by implementing ETHTOOL_MSG_RSS_GET netlink message. This is equivalent to functionality provided via ETHTOOL_GRSSH in ioctl path. It sends RSS table, hash key and hash function of an interface to user space. This patch implements existing functionality available in ioctl path and enables addition of new RSS context based parameters in future. Signed-off-by: Sudheer Mogilappagari Link: https://lore.kernel.org/r/20221202002555.241580-1-sudheer.mogilappagari@intel.com Signed-off-by: Jakub Kicinski --- Documentation/networking/ethtool-netlink.rst | 31 +++++- include/uapi/linux/ethtool_netlink.h | 14 +++ net/ethtool/Makefile | 2 +- net/ethtool/netlink.c | 7 ++ net/ethtool/netlink.h | 2 + net/ethtool/rss.c | 153 +++++++++++++++++++++++++++ 6 files changed, 207 insertions(+), 2 deletions(-) create mode 100644 net/ethtool/rss.c (limited to 'net') diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index bede24ef44fd..f10f8eb44255 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -222,6 +222,7 @@ Userspace to kernel: ``ETHTOOL_MSG_MODULE_GET`` get transceiver module parameters ``ETHTOOL_MSG_PSE_SET`` set PSE parameters ``ETHTOOL_MSG_PSE_GET`` get PSE parameters + ``ETHTOOL_MSG_RSS_GET`` get RSS settings ===================================== ================================= Kernel to userspace: @@ -263,6 +264,7 @@ Kernel to userspace: ``ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY`` PHC virtual clocks info ``ETHTOOL_MSG_MODULE_GET_REPLY`` transceiver module parameters ``ETHTOOL_MSG_PSE_GET_REPLY`` PSE parameters + ``ETHTOOL_MSG_RSS_GET_REPLY`` RSS settings ======================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -1687,6 +1689,33 @@ to control PoDL PSE Admin functions. This option is implementing ``IEEE 802.3-2018`` 30.15.1.2.1 acPoDLPSEAdminControl. See ``ETHTOOL_A_PODL_PSE_ADMIN_STATE`` for supported values. +RSS_GET +======= + +Get indirection table, hash key and hash function info associated with a +RSS context of an interface similar to ``ETHTOOL_GRSSH`` ioctl request. + +Request contents: + +===================================== ====== ========================== + ``ETHTOOL_A_RSS_HEADER`` nested request header + ``ETHTOOL_A_RSS_CONTEXT`` u32 context number +===================================== ====== ========================== + +Kernel response contents: + +===================================== ====== ========================== + ``ETHTOOL_A_RSS_HEADER`` nested reply header + ``ETHTOOL_A_RSS_HFUNC`` u32 RSS hash func + ``ETHTOOL_A_RSS_INDIR`` binary Indir table bytes + ``ETHTOOL_A_RSS_HKEY`` binary Hash key bytes +===================================== ====== ========================== + +ETHTOOL_A_RSS_HFUNC attribute is bitmap indicating the hash function +being used. Current supported options are toeplitz, xor or crc32. +ETHTOOL_A_RSS_INDIR attribute returns RSS indrection table where each byte +indicates queue number. + Request translation =================== @@ -1768,7 +1797,7 @@ are netlink only. ``ETHTOOL_GMODULEEEPROM`` ``ETHTOOL_MSG_MODULE_EEPROM_GET`` ``ETHTOOL_GEEE`` ``ETHTOOL_MSG_EEE_GET`` ``ETHTOOL_SEEE`` ``ETHTOOL_MSG_EEE_SET`` - ``ETHTOOL_GRSSH`` n/a + ``ETHTOOL_GRSSH`` ``ETHTOOL_MSG_RSS_GET`` ``ETHTOOL_SRSSH`` n/a ``ETHTOOL_GTUNABLE`` n/a ``ETHTOOL_STUNABLE`` n/a diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index aaf7c6963d61..5799a9db034e 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -51,6 +51,7 @@ enum { ETHTOOL_MSG_MODULE_SET, ETHTOOL_MSG_PSE_GET, ETHTOOL_MSG_PSE_SET, + ETHTOOL_MSG_RSS_GET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -97,6 +98,7 @@ enum { ETHTOOL_MSG_MODULE_GET_REPLY, ETHTOOL_MSG_MODULE_NTF, ETHTOOL_MSG_PSE_GET_REPLY, + ETHTOOL_MSG_RSS_GET_REPLY, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -880,6 +882,18 @@ enum { ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1) }; +enum { + ETHTOOL_A_RSS_UNSPEC, + ETHTOOL_A_RSS_HEADER, + ETHTOOL_A_RSS_CONTEXT, /* u32 */ + ETHTOOL_A_RSS_HFUNC, /* u32 */ + ETHTOOL_A_RSS_INDIR, /* binary */ + ETHTOOL_A_RSS_HKEY, /* binary */ + + __ETHTOOL_A_RSS_CNT, + ETHTOOL_A_RSS_MAX = (__ETHTOOL_A_RSS_CNT - 1), +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 72ab0944262a..228f13df2e18 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -4,7 +4,7 @@ obj-y += ioctl.o common.o obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o -ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ +ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o rss.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ tunnels.o fec.o eeprom.o stats.o phc_vclocks.o module.o \ diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 1a4c11356c96..aee98be6237f 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -287,6 +287,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_PHC_VCLOCKS_GET] = ðnl_phc_vclocks_request_ops, [ETHTOOL_MSG_MODULE_GET] = ðnl_module_request_ops, [ETHTOOL_MSG_PSE_GET] = ðnl_pse_request_ops, + [ETHTOOL_MSG_RSS_GET] = ðnl_rss_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -1040,6 +1041,12 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_pse_set_policy, .maxattr = ARRAY_SIZE(ethnl_pse_set_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_RSS_GET, + .doit = ethnl_default_doit, + .policy = ethnl_rss_get_policy, + .maxattr = ARRAY_SIZE(ethnl_rss_get_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 1bfd374f9718..3753787ba233 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -346,6 +346,7 @@ extern const struct ethnl_request_ops ethnl_stats_request_ops; extern const struct ethnl_request_ops ethnl_phc_vclocks_request_ops; extern const struct ethnl_request_ops ethnl_module_request_ops; extern const struct ethnl_request_ops ethnl_pse_request_ops; +extern const struct ethnl_request_ops ethnl_rss_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -386,6 +387,7 @@ extern const struct nla_policy ethnl_module_get_policy[ETHTOOL_A_MODULE_HEADER + extern const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1]; extern const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1]; extern const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1]; +extern const struct nla_policy ethnl_rss_get_policy[ETHTOOL_A_RSS_CONTEXT + 1]; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c new file mode 100644 index 000000000000..ebe6145aed3f --- /dev/null +++ b/net/ethtool/rss.c @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" + +struct rss_req_info { + struct ethnl_req_info base; + u32 rss_context; +}; + +struct rss_reply_data { + struct ethnl_reply_data base; + u32 indir_size; + u32 hkey_size; + u32 hfunc; + u32 *indir_table; + u8 *hkey; +}; + +#define RSS_REQINFO(__req_base) \ + container_of(__req_base, struct rss_req_info, base) + +#define RSS_REPDATA(__reply_base) \ + container_of(__reply_base, struct rss_reply_data, base) + +const struct nla_policy ethnl_rss_get_policy[] = { + [ETHTOOL_A_RSS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_RSS_CONTEXT] = { .type = NLA_U32 }, +}; + +static int +rss_parse_request(struct ethnl_req_info *req_info, struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct rss_req_info *request = RSS_REQINFO(req_info); + + if (tb[ETHTOOL_A_RSS_CONTEXT]) + request->rss_context = nla_get_u32(tb[ETHTOOL_A_RSS_CONTEXT]); + + return 0; +} + +static int +rss_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, struct genl_info *info) +{ + struct rss_reply_data *data = RSS_REPDATA(reply_base); + struct rss_req_info *request = RSS_REQINFO(req_base); + struct net_device *dev = reply_base->dev; + const struct ethtool_ops *ops; + u32 total_size, indir_bytes; + u8 dev_hfunc = 0; + u8 *rss_config; + int ret; + + ops = dev->ethtool_ops; + if (!ops->get_rxfh) + return -EOPNOTSUPP; + + /* Some drivers don't handle rss_context */ + if (request->rss_context && !ops->get_rxfh_context) + return -EOPNOTSUPP; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + + data->indir_size = 0; + data->hkey_size = 0; + if (ops->get_rxfh_indir_size) + data->indir_size = ops->get_rxfh_indir_size(dev); + if (ops->get_rxfh_key_size) + data->hkey_size = ops->get_rxfh_key_size(dev); + + indir_bytes = data->indir_size * sizeof(u32); + total_size = indir_bytes + data->hkey_size; + rss_config = kzalloc(total_size, GFP_KERNEL); + if (!rss_config) { + ret = -ENOMEM; + goto out_ops; + } + + if (data->indir_size) + data->indir_table = (u32 *)rss_config; + + if (data->hkey_size) + data->hkey = rss_config + indir_bytes; + + if (request->rss_context) + ret = ops->get_rxfh_context(dev, data->indir_table, data->hkey, + &dev_hfunc, request->rss_context); + else + ret = ops->get_rxfh(dev, data->indir_table, data->hkey, + &dev_hfunc); + + if (ret) + goto out_ops; + + data->hfunc = dev_hfunc; +out_ops: + ethnl_ops_complete(dev); + return ret; +} + +static int +rss_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct rss_reply_data *data = RSS_REPDATA(reply_base); + int len; + + len = nla_total_size(sizeof(u32)) + /* _RSS_HFUNC */ + nla_total_size(sizeof(u32) * data->indir_size) + /* _RSS_INDIR */ + nla_total_size(data->hkey_size); /* _RSS_HKEY */ + + return len; +} + +static int +rss_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct rss_reply_data *data = RSS_REPDATA(reply_base); + + if (nla_put_u32(skb, ETHTOOL_A_RSS_HFUNC, data->hfunc) || + nla_put(skb, ETHTOOL_A_RSS_INDIR, + sizeof(u32) * data->indir_size, data->indir_table) || + nla_put(skb, ETHTOOL_A_RSS_HKEY, data->hkey_size, data->hkey)) + return -EMSGSIZE; + + return 0; +} + +static void rss_cleanup_data(struct ethnl_reply_data *reply_base) +{ + const struct rss_reply_data *data = RSS_REPDATA(reply_base); + + kfree(data->indir_table); +} + +const struct ethnl_request_ops ethnl_rss_request_ops = { + .request_cmd = ETHTOOL_MSG_RSS_GET, + .reply_cmd = ETHTOOL_MSG_RSS_GET_REPLY, + .hdr_attr = ETHTOOL_A_RSS_HEADER, + .req_info_size = sizeof(struct rss_req_info), + .reply_data_size = sizeof(struct rss_reply_data), + + .parse_request = rss_parse_request, + .prepare_data = rss_prepare_data, + .reply_size = rss_reply_size, + .fill_reply = rss_fill_reply, + .cleanup_data = rss_cleanup_data, +}; -- cgit v1.2.3-70-g09d2 From ee9a113ab63468137802898bcd2c598998c96938 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Sat, 3 Dec 2022 10:46:56 +0200 Subject: xfrm: interface: rename xfrm_interface.c to xfrm_interface_core.c This change allows adding additional files to the xfrm_interface module. Signed-off-by: Eyal Birger Link: https://lore.kernel.org/r/20221203084659.1837829-2-eyal.birger@gmail.com Signed-off-by: Martin KaFai Lau --- net/xfrm/Makefile | 2 + net/xfrm/xfrm_interface.c | 1198 ---------------------------------------- net/xfrm/xfrm_interface_core.c | 1198 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1200 insertions(+), 1198 deletions(-) delete mode 100644 net/xfrm/xfrm_interface.c create mode 100644 net/xfrm/xfrm_interface_core.c (limited to 'net') diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index 494aa744bfb9..08a2870fdd36 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -3,6 +3,8 @@ # Makefile for the XFRM subsystem. # +xfrm_interface-$(CONFIG_XFRM_INTERFACE) += xfrm_interface_core.o + obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \ xfrm_input.o xfrm_output.o \ xfrm_sysctl.o xfrm_replay.o xfrm_device.o diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c deleted file mode 100644 index 5a67b120c4db..000000000000 --- a/net/xfrm/xfrm_interface.c +++ /dev/null @@ -1,1198 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * XFRM virtual interface - * - * Copyright (C) 2018 secunet Security Networks AG - * - * Author: - * Steffen Klassert - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static int xfrmi_dev_init(struct net_device *dev); -static void xfrmi_dev_setup(struct net_device *dev); -static struct rtnl_link_ops xfrmi_link_ops __read_mostly; -static unsigned int xfrmi_net_id __read_mostly; -static const struct net_device_ops xfrmi_netdev_ops; - -#define XFRMI_HASH_BITS 8 -#define XFRMI_HASH_SIZE BIT(XFRMI_HASH_BITS) - -struct xfrmi_net { - /* lists for storing interfaces in use */ - struct xfrm_if __rcu *xfrmi[XFRMI_HASH_SIZE]; - struct xfrm_if __rcu *collect_md_xfrmi; -}; - -static const struct nla_policy xfrm_lwt_policy[LWT_XFRM_MAX + 1] = { - [LWT_XFRM_IF_ID] = NLA_POLICY_MIN(NLA_U32, 1), - [LWT_XFRM_LINK] = NLA_POLICY_MIN(NLA_U32, 1), -}; - -static void xfrmi_destroy_state(struct lwtunnel_state *lwt) -{ -} - -static int xfrmi_build_state(struct net *net, struct nlattr *nla, - unsigned int family, const void *cfg, - struct lwtunnel_state **ts, - struct netlink_ext_ack *extack) -{ - struct nlattr *tb[LWT_XFRM_MAX + 1]; - struct lwtunnel_state *new_state; - struct xfrm_md_info *info; - int ret; - - ret = nla_parse_nested(tb, LWT_XFRM_MAX, nla, xfrm_lwt_policy, extack); - if (ret < 0) - return ret; - - if (!tb[LWT_XFRM_IF_ID]) { - NL_SET_ERR_MSG(extack, "if_id must be set"); - return -EINVAL; - } - - new_state = lwtunnel_state_alloc(sizeof(*info)); - if (!new_state) { - NL_SET_ERR_MSG(extack, "failed to create encap info"); - return -ENOMEM; - } - - new_state->type = LWTUNNEL_ENCAP_XFRM; - - info = lwt_xfrm_info(new_state); - - info->if_id = nla_get_u32(tb[LWT_XFRM_IF_ID]); - - if (tb[LWT_XFRM_LINK]) - info->link = nla_get_u32(tb[LWT_XFRM_LINK]); - - *ts = new_state; - return 0; -} - -static int xfrmi_fill_encap_info(struct sk_buff *skb, - struct lwtunnel_state *lwt) -{ - struct xfrm_md_info *info = lwt_xfrm_info(lwt); - - if (nla_put_u32(skb, LWT_XFRM_IF_ID, info->if_id) || - (info->link && nla_put_u32(skb, LWT_XFRM_LINK, info->link))) - return -EMSGSIZE; - - return 0; -} - -static int xfrmi_encap_nlsize(struct lwtunnel_state *lwtstate) -{ - return nla_total_size(sizeof(u32)) + /* LWT_XFRM_IF_ID */ - nla_total_size(sizeof(u32)); /* LWT_XFRM_LINK */ -} - -static int xfrmi_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) -{ - struct xfrm_md_info *a_info = lwt_xfrm_info(a); - struct xfrm_md_info *b_info = lwt_xfrm_info(b); - - return memcmp(a_info, b_info, sizeof(*a_info)); -} - -static const struct lwtunnel_encap_ops xfrmi_encap_ops = { - .build_state = xfrmi_build_state, - .destroy_state = xfrmi_destroy_state, - .fill_encap = xfrmi_fill_encap_info, - .get_encap_size = xfrmi_encap_nlsize, - .cmp_encap = xfrmi_encap_cmp, - .owner = THIS_MODULE, -}; - -#define for_each_xfrmi_rcu(start, xi) \ - for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next)) - -static u32 xfrmi_hash(u32 if_id) -{ - return hash_32(if_id, XFRMI_HASH_BITS); -} - -static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x) -{ - struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); - struct xfrm_if *xi; - - for_each_xfrmi_rcu(xfrmn->xfrmi[xfrmi_hash(x->if_id)], xi) { - if (x->if_id == xi->p.if_id && - (xi->dev->flags & IFF_UP)) - return xi; - } - - xi = rcu_dereference(xfrmn->collect_md_xfrmi); - if (xi && (xi->dev->flags & IFF_UP)) - return xi; - - return NULL; -} - -static bool xfrmi_decode_session(struct sk_buff *skb, - unsigned short family, - struct xfrm_if_decode_session_result *res) -{ - struct net_device *dev; - struct xfrm_if *xi; - int ifindex = 0; - - if (!secpath_exists(skb) || !skb->dev) - return false; - - switch (family) { - case AF_INET6: - ifindex = inet6_sdif(skb); - break; - case AF_INET: - ifindex = inet_sdif(skb); - break; - } - - if (ifindex) { - struct net *net = xs_net(xfrm_input_state(skb)); - - dev = dev_get_by_index_rcu(net, ifindex); - } else { - dev = skb->dev; - } - - if (!dev || !(dev->flags & IFF_UP)) - return false; - if (dev->netdev_ops != &xfrmi_netdev_ops) - return false; - - xi = netdev_priv(dev); - res->net = xi->net; - - if (xi->p.collect_md) - res->if_id = xfrm_input_state(skb)->if_id; - else - res->if_id = xi->p.if_id; - return true; -} - -static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi) -{ - struct xfrm_if __rcu **xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; - - rcu_assign_pointer(xi->next , rtnl_dereference(*xip)); - rcu_assign_pointer(*xip, xi); -} - -static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi) -{ - struct xfrm_if __rcu **xip; - struct xfrm_if *iter; - - for (xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; - (iter = rtnl_dereference(*xip)) != NULL; - xip = &iter->next) { - if (xi == iter) { - rcu_assign_pointer(*xip, xi->next); - break; - } - } -} - -static void xfrmi_dev_free(struct net_device *dev) -{ - struct xfrm_if *xi = netdev_priv(dev); - - gro_cells_destroy(&xi->gro_cells); - free_percpu(dev->tstats); -} - -static int xfrmi_create(struct net_device *dev) -{ - struct xfrm_if *xi = netdev_priv(dev); - struct net *net = dev_net(dev); - struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); - int err; - - dev->rtnl_link_ops = &xfrmi_link_ops; - err = register_netdevice(dev); - if (err < 0) - goto out; - - if (xi->p.collect_md) - rcu_assign_pointer(xfrmn->collect_md_xfrmi, xi); - else - xfrmi_link(xfrmn, xi); - - return 0; - -out: - return err; -} - -static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p) -{ - struct xfrm_if __rcu **xip; - struct xfrm_if *xi; - struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); - - for (xip = &xfrmn->xfrmi[xfrmi_hash(p->if_id)]; - (xi = rtnl_dereference(*xip)) != NULL; - xip = &xi->next) - if (xi->p.if_id == p->if_id) - return xi; - - return NULL; -} - -static void xfrmi_dev_uninit(struct net_device *dev) -{ - struct xfrm_if *xi = netdev_priv(dev); - struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id); - - if (xi->p.collect_md) - RCU_INIT_POINTER(xfrmn->collect_md_xfrmi, NULL); - else - xfrmi_unlink(xfrmn, xi); -} - -static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) -{ - skb_clear_tstamp(skb); - skb->pkt_type = PACKET_HOST; - skb->skb_iif = 0; - skb->ignore_df = 0; - skb_dst_drop(skb); - nf_reset_ct(skb); - nf_reset_trace(skb); - - if (!xnet) - return; - - ipvs_reset(skb); - secpath_reset(skb); - skb_orphan(skb); - skb->mark = 0; -} - -static int xfrmi_rcv_cb(struct sk_buff *skb, int err) -{ - const struct xfrm_mode *inner_mode; - struct net_device *dev; - struct xfrm_state *x; - struct xfrm_if *xi; - bool xnet; - int link; - - if (err && !secpath_exists(skb)) - return 0; - - x = xfrm_input_state(skb); - - xi = xfrmi_lookup(xs_net(x), x); - if (!xi) - return 1; - - link = skb->dev->ifindex; - dev = xi->dev; - skb->dev = dev; - - if (err) { - dev->stats.rx_errors++; - dev->stats.rx_dropped++; - - return 0; - } - - xnet = !net_eq(xi->net, dev_net(skb->dev)); - - if (xnet) { - inner_mode = &x->inner_mode; - - if (x->sel.family == AF_UNSPEC) { - inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); - if (inner_mode == NULL) { - XFRM_INC_STATS(dev_net(skb->dev), - LINUX_MIB_XFRMINSTATEMODEERROR); - return -EINVAL; - } - } - - if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, - inner_mode->family)) - return -EPERM; - } - - xfrmi_scrub_packet(skb, xnet); - if (xi->p.collect_md) { - struct metadata_dst *md_dst; - - md_dst = metadata_dst_alloc(0, METADATA_XFRM, GFP_ATOMIC); - if (!md_dst) - return -ENOMEM; - - md_dst->u.xfrm_info.if_id = x->if_id; - md_dst->u.xfrm_info.link = link; - skb_dst_set(skb, (struct dst_entry *)md_dst); - } - dev_sw_netstats_rx_add(dev, skb->len); - - return 0; -} - -static int -xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) -{ - struct xfrm_if *xi = netdev_priv(dev); - struct net_device_stats *stats = &xi->dev->stats; - struct dst_entry *dst = skb_dst(skb); - unsigned int length = skb->len; - struct net_device *tdev; - struct xfrm_state *x; - int err = -1; - u32 if_id; - int mtu; - - if (xi->p.collect_md) { - struct xfrm_md_info *md_info = skb_xfrm_md_info(skb); - - if (unlikely(!md_info)) - return -EINVAL; - - if_id = md_info->if_id; - fl->flowi_oif = md_info->link; - } else { - if_id = xi->p.if_id; - } - - dst_hold(dst); - dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, if_id); - if (IS_ERR(dst)) { - err = PTR_ERR(dst); - dst = NULL; - goto tx_err_link_failure; - } - - x = dst->xfrm; - if (!x) - goto tx_err_link_failure; - - if (x->if_id != if_id) - goto tx_err_link_failure; - - tdev = dst->dev; - - if (tdev == dev) { - stats->collisions++; - net_warn_ratelimited("%s: Local routing loop detected!\n", - dev->name); - goto tx_err_dst_release; - } - - mtu = dst_mtu(dst); - if ((!skb_is_gso(skb) && skb->len > mtu) || - (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))) { - skb_dst_update_pmtu_no_confirm(skb, mtu); - - if (skb->protocol == htons(ETH_P_IPV6)) { - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; - - if (skb->len > 1280) - icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - else - goto xmit; - } else { - if (!(ip_hdr(skb)->frag_off & htons(IP_DF))) - goto xmit; - icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(mtu)); - } - - dst_release(dst); - return -EMSGSIZE; - } - -xmit: - xfrmi_scrub_packet(skb, !net_eq(xi->net, dev_net(dev))); - skb_dst_set(skb, dst); - skb->dev = tdev; - - err = dst_output(xi->net, skb->sk, skb); - if (net_xmit_eval(err) == 0) { - dev_sw_netstats_tx_add(dev, 1, length); - } else { - stats->tx_errors++; - stats->tx_aborted_errors++; - } - - return 0; -tx_err_link_failure: - stats->tx_carrier_errors++; - dst_link_failure(skb); -tx_err_dst_release: - dst_release(dst); - return err; -} - -static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) -{ - struct xfrm_if *xi = netdev_priv(dev); - struct net_device_stats *stats = &xi->dev->stats; - struct dst_entry *dst = skb_dst(skb); - struct flowi fl; - int ret; - - memset(&fl, 0, sizeof(fl)); - - switch (skb->protocol) { - case htons(ETH_P_IPV6): - xfrm_decode_session(skb, &fl, AF_INET6); - memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); - if (!dst) { - fl.u.ip6.flowi6_oif = dev->ifindex; - fl.u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; - dst = ip6_route_output(dev_net(dev), NULL, &fl.u.ip6); - if (dst->error) { - dst_release(dst); - stats->tx_carrier_errors++; - goto tx_err; - } - skb_dst_set(skb, dst); - } - break; - case htons(ETH_P_IP): - xfrm_decode_session(skb, &fl, AF_INET); - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - if (!dst) { - struct rtable *rt; - - fl.u.ip4.flowi4_oif = dev->ifindex; - fl.u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; - rt = __ip_route_output_key(dev_net(dev), &fl.u.ip4); - if (IS_ERR(rt)) { - stats->tx_carrier_errors++; - goto tx_err; - } - skb_dst_set(skb, &rt->dst); - } - break; - default: - goto tx_err; - } - - fl.flowi_oif = xi->p.link; - - ret = xfrmi_xmit2(skb, dev, &fl); - if (ret < 0) - goto tx_err; - - return NETDEV_TX_OK; - -tx_err: - stats->tx_errors++; - stats->tx_dropped++; - kfree_skb(skb); - return NETDEV_TX_OK; -} - -static int xfrmi4_err(struct sk_buff *skb, u32 info) -{ - const struct iphdr *iph = (const struct iphdr *)skb->data; - struct net *net = dev_net(skb->dev); - int protocol = iph->protocol; - struct ip_comp_hdr *ipch; - struct ip_esp_hdr *esph; - struct ip_auth_hdr *ah ; - struct xfrm_state *x; - struct xfrm_if *xi; - __be32 spi; - - switch (protocol) { - case IPPROTO_ESP: - esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); - spi = esph->spi; - break; - case IPPROTO_AH: - ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); - spi = ah->spi; - break; - case IPPROTO_COMP: - ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); - spi = htonl(ntohs(ipch->cpi)); - break; - default: - return 0; - } - - switch (icmp_hdr(skb)->type) { - case ICMP_DEST_UNREACH: - if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) - return 0; - break; - case ICMP_REDIRECT: - break; - default: - return 0; - } - - x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, - spi, protocol, AF_INET); - if (!x) - return 0; - - xi = xfrmi_lookup(net, x); - if (!xi) { - xfrm_state_put(x); - return -1; - } - - if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) - ipv4_update_pmtu(skb, net, info, 0, protocol); - else - ipv4_redirect(skb, net, 0, protocol); - xfrm_state_put(x); - - return 0; -} - -static int xfrmi6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, __be32 info) -{ - const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; - struct net *net = dev_net(skb->dev); - int protocol = iph->nexthdr; - struct ip_comp_hdr *ipch; - struct ip_esp_hdr *esph; - struct ip_auth_hdr *ah; - struct xfrm_state *x; - struct xfrm_if *xi; - __be32 spi; - - switch (protocol) { - case IPPROTO_ESP: - esph = (struct ip_esp_hdr *)(skb->data + offset); - spi = esph->spi; - break; - case IPPROTO_AH: - ah = (struct ip_auth_hdr *)(skb->data + offset); - spi = ah->spi; - break; - case IPPROTO_COMP: - ipch = (struct ip_comp_hdr *)(skb->data + offset); - spi = htonl(ntohs(ipch->cpi)); - break; - default: - return 0; - } - - if (type != ICMPV6_PKT_TOOBIG && - type != NDISC_REDIRECT) - return 0; - - x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, - spi, protocol, AF_INET6); - if (!x) - return 0; - - xi = xfrmi_lookup(net, x); - if (!xi) { - xfrm_state_put(x); - return -1; - } - - if (type == NDISC_REDIRECT) - ip6_redirect(skb, net, skb->dev->ifindex, 0, - sock_net_uid(net, NULL)); - else - ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); - xfrm_state_put(x); - - return 0; -} - -static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p) -{ - if (xi->p.link != p->link) - return -EINVAL; - - xi->p.if_id = p->if_id; - - return 0; -} - -static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p) -{ - struct net *net = xi->net; - struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); - int err; - - xfrmi_unlink(xfrmn, xi); - synchronize_net(); - err = xfrmi_change(xi, p); - xfrmi_link(xfrmn, xi); - netdev_state_change(xi->dev); - return err; -} - -static int xfrmi_get_iflink(const struct net_device *dev) -{ - struct xfrm_if *xi = netdev_priv(dev); - - return xi->p.link; -} - -static const struct net_device_ops xfrmi_netdev_ops = { - .ndo_init = xfrmi_dev_init, - .ndo_uninit = xfrmi_dev_uninit, - .ndo_start_xmit = xfrmi_xmit, - .ndo_get_stats64 = dev_get_tstats64, - .ndo_get_iflink = xfrmi_get_iflink, -}; - -static void xfrmi_dev_setup(struct net_device *dev) -{ - dev->netdev_ops = &xfrmi_netdev_ops; - dev->header_ops = &ip_tunnel_header_ops; - dev->type = ARPHRD_NONE; - dev->mtu = ETH_DATA_LEN; - dev->min_mtu = ETH_MIN_MTU; - dev->max_mtu = IP_MAX_MTU; - dev->flags = IFF_NOARP; - dev->needs_free_netdev = true; - dev->priv_destructor = xfrmi_dev_free; - netif_keep_dst(dev); - - eth_broadcast_addr(dev->broadcast); -} - -#define XFRMI_FEATURES (NETIF_F_SG | \ - NETIF_F_FRAGLIST | \ - NETIF_F_GSO_SOFTWARE | \ - NETIF_F_HW_CSUM) - -static int xfrmi_dev_init(struct net_device *dev) -{ - struct xfrm_if *xi = netdev_priv(dev); - struct net_device *phydev = __dev_get_by_index(xi->net, xi->p.link); - int err; - - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - - err = gro_cells_init(&xi->gro_cells, dev); - if (err) { - free_percpu(dev->tstats); - return err; - } - - dev->features |= NETIF_F_LLTX; - dev->features |= XFRMI_FEATURES; - dev->hw_features |= XFRMI_FEATURES; - - if (phydev) { - dev->needed_headroom = phydev->needed_headroom; - dev->needed_tailroom = phydev->needed_tailroom; - - if (is_zero_ether_addr(dev->dev_addr)) - eth_hw_addr_inherit(dev, phydev); - if (is_zero_ether_addr(dev->broadcast)) - memcpy(dev->broadcast, phydev->broadcast, - dev->addr_len); - } else { - eth_hw_addr_random(dev); - eth_broadcast_addr(dev->broadcast); - } - - return 0; -} - -static int xfrmi_validate(struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) -{ - return 0; -} - -static void xfrmi_netlink_parms(struct nlattr *data[], - struct xfrm_if_parms *parms) -{ - memset(parms, 0, sizeof(*parms)); - - if (!data) - return; - - if (data[IFLA_XFRM_LINK]) - parms->link = nla_get_u32(data[IFLA_XFRM_LINK]); - - if (data[IFLA_XFRM_IF_ID]) - parms->if_id = nla_get_u32(data[IFLA_XFRM_IF_ID]); - - if (data[IFLA_XFRM_COLLECT_METADATA]) - parms->collect_md = true; -} - -static int xfrmi_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) -{ - struct net *net = dev_net(dev); - struct xfrm_if_parms p = {}; - struct xfrm_if *xi; - int err; - - xfrmi_netlink_parms(data, &p); - if (p.collect_md) { - struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); - - if (p.link || p.if_id) { - NL_SET_ERR_MSG(extack, "link and if_id must be zero"); - return -EINVAL; - } - - if (rtnl_dereference(xfrmn->collect_md_xfrmi)) - return -EEXIST; - - } else { - if (!p.if_id) { - NL_SET_ERR_MSG(extack, "if_id must be non zero"); - return -EINVAL; - } - - xi = xfrmi_locate(net, &p); - if (xi) - return -EEXIST; - } - - xi = netdev_priv(dev); - xi->p = p; - xi->net = net; - xi->dev = dev; - - err = xfrmi_create(dev); - return err; -} - -static void xfrmi_dellink(struct net_device *dev, struct list_head *head) -{ - unregister_netdevice_queue(dev, head); -} - -static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], - struct nlattr *data[], - struct netlink_ext_ack *extack) -{ - struct xfrm_if *xi = netdev_priv(dev); - struct net *net = xi->net; - struct xfrm_if_parms p = {}; - - xfrmi_netlink_parms(data, &p); - if (!p.if_id) { - NL_SET_ERR_MSG(extack, "if_id must be non zero"); - return -EINVAL; - } - - if (p.collect_md) { - NL_SET_ERR_MSG(extack, "collect_md can't be changed"); - return -EINVAL; - } - - xi = xfrmi_locate(net, &p); - if (!xi) { - xi = netdev_priv(dev); - } else { - if (xi->dev != dev) - return -EEXIST; - if (xi->p.collect_md) { - NL_SET_ERR_MSG(extack, - "device can't be changed to collect_md"); - return -EINVAL; - } - } - - return xfrmi_update(xi, &p); -} - -static size_t xfrmi_get_size(const struct net_device *dev) -{ - return - /* IFLA_XFRM_LINK */ - nla_total_size(4) + - /* IFLA_XFRM_IF_ID */ - nla_total_size(4) + - /* IFLA_XFRM_COLLECT_METADATA */ - nla_total_size(0) + - 0; -} - -static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev) -{ - struct xfrm_if *xi = netdev_priv(dev); - struct xfrm_if_parms *parm = &xi->p; - - if (nla_put_u32(skb, IFLA_XFRM_LINK, parm->link) || - nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id) || - (xi->p.collect_md && nla_put_flag(skb, IFLA_XFRM_COLLECT_METADATA))) - goto nla_put_failure; - return 0; - -nla_put_failure: - return -EMSGSIZE; -} - -static struct net *xfrmi_get_link_net(const struct net_device *dev) -{ - struct xfrm_if *xi = netdev_priv(dev); - - return xi->net; -} - -static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = { - [IFLA_XFRM_UNSPEC] = { .strict_start_type = IFLA_XFRM_COLLECT_METADATA }, - [IFLA_XFRM_LINK] = { .type = NLA_U32 }, - [IFLA_XFRM_IF_ID] = { .type = NLA_U32 }, - [IFLA_XFRM_COLLECT_METADATA] = { .type = NLA_FLAG }, -}; - -static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { - .kind = "xfrm", - .maxtype = IFLA_XFRM_MAX, - .policy = xfrmi_policy, - .priv_size = sizeof(struct xfrm_if), - .setup = xfrmi_dev_setup, - .validate = xfrmi_validate, - .newlink = xfrmi_newlink, - .dellink = xfrmi_dellink, - .changelink = xfrmi_changelink, - .get_size = xfrmi_get_size, - .fill_info = xfrmi_fill_info, - .get_link_net = xfrmi_get_link_net, -}; - -static void __net_exit xfrmi_exit_batch_net(struct list_head *net_exit_list) -{ - struct net *net; - LIST_HEAD(list); - - rtnl_lock(); - list_for_each_entry(net, net_exit_list, exit_list) { - struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); - struct xfrm_if __rcu **xip; - struct xfrm_if *xi; - int i; - - for (i = 0; i < XFRMI_HASH_SIZE; i++) { - for (xip = &xfrmn->xfrmi[i]; - (xi = rtnl_dereference(*xip)) != NULL; - xip = &xi->next) - unregister_netdevice_queue(xi->dev, &list); - } - xi = rtnl_dereference(xfrmn->collect_md_xfrmi); - if (xi) - unregister_netdevice_queue(xi->dev, &list); - } - unregister_netdevice_many(&list); - rtnl_unlock(); -} - -static struct pernet_operations xfrmi_net_ops = { - .exit_batch = xfrmi_exit_batch_net, - .id = &xfrmi_net_id, - .size = sizeof(struct xfrmi_net), -}; - -static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { - .handler = xfrm6_rcv, - .input_handler = xfrm_input, - .cb_handler = xfrmi_rcv_cb, - .err_handler = xfrmi6_err, - .priority = 10, -}; - -static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = { - .handler = xfrm6_rcv, - .input_handler = xfrm_input, - .cb_handler = xfrmi_rcv_cb, - .err_handler = xfrmi6_err, - .priority = 10, -}; - -static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = { - .handler = xfrm6_rcv, - .input_handler = xfrm_input, - .cb_handler = xfrmi_rcv_cb, - .err_handler = xfrmi6_err, - .priority = 10, -}; - -#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) -static int xfrmi6_rcv_tunnel(struct sk_buff *skb) -{ - const xfrm_address_t *saddr; - __be32 spi; - - saddr = (const xfrm_address_t *)&ipv6_hdr(skb)->saddr; - spi = xfrm6_tunnel_spi_lookup(dev_net(skb->dev), saddr); - - return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); -} - -static struct xfrm6_tunnel xfrmi_ipv6_handler __read_mostly = { - .handler = xfrmi6_rcv_tunnel, - .cb_handler = xfrmi_rcv_cb, - .err_handler = xfrmi6_err, - .priority = 2, -}; - -static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = { - .handler = xfrmi6_rcv_tunnel, - .cb_handler = xfrmi_rcv_cb, - .err_handler = xfrmi6_err, - .priority = 2, -}; -#endif - -static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { - .handler = xfrm4_rcv, - .input_handler = xfrm_input, - .cb_handler = xfrmi_rcv_cb, - .err_handler = xfrmi4_err, - .priority = 10, -}; - -static struct xfrm4_protocol xfrmi_ah4_protocol __read_mostly = { - .handler = xfrm4_rcv, - .input_handler = xfrm_input, - .cb_handler = xfrmi_rcv_cb, - .err_handler = xfrmi4_err, - .priority = 10, -}; - -static struct xfrm4_protocol xfrmi_ipcomp4_protocol __read_mostly = { - .handler = xfrm4_rcv, - .input_handler = xfrm_input, - .cb_handler = xfrmi_rcv_cb, - .err_handler = xfrmi4_err, - .priority = 10, -}; - -#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) -static int xfrmi4_rcv_tunnel(struct sk_buff *skb) -{ - return xfrm4_rcv_spi(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr); -} - -static struct xfrm_tunnel xfrmi_ipip_handler __read_mostly = { - .handler = xfrmi4_rcv_tunnel, - .cb_handler = xfrmi_rcv_cb, - .err_handler = xfrmi4_err, - .priority = 3, -}; - -static struct xfrm_tunnel xfrmi_ipip6_handler __read_mostly = { - .handler = xfrmi4_rcv_tunnel, - .cb_handler = xfrmi_rcv_cb, - .err_handler = xfrmi4_err, - .priority = 2, -}; -#endif - -static int __init xfrmi4_init(void) -{ - int err; - - err = xfrm4_protocol_register(&xfrmi_esp4_protocol, IPPROTO_ESP); - if (err < 0) - goto xfrm_proto_esp_failed; - err = xfrm4_protocol_register(&xfrmi_ah4_protocol, IPPROTO_AH); - if (err < 0) - goto xfrm_proto_ah_failed; - err = xfrm4_protocol_register(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); - if (err < 0) - goto xfrm_proto_comp_failed; -#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) - err = xfrm4_tunnel_register(&xfrmi_ipip_handler, AF_INET); - if (err < 0) - goto xfrm_tunnel_ipip_failed; - err = xfrm4_tunnel_register(&xfrmi_ipip6_handler, AF_INET6); - if (err < 0) - goto xfrm_tunnel_ipip6_failed; -#endif - - return 0; - -#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) -xfrm_tunnel_ipip6_failed: - xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); -xfrm_tunnel_ipip_failed: - xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); -#endif -xfrm_proto_comp_failed: - xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); -xfrm_proto_ah_failed: - xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); -xfrm_proto_esp_failed: - return err; -} - -static void xfrmi4_fini(void) -{ -#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) - xfrm4_tunnel_deregister(&xfrmi_ipip6_handler, AF_INET6); - xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); -#endif - xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); - xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); - xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); -} - -static int __init xfrmi6_init(void) -{ - int err; - - err = xfrm6_protocol_register(&xfrmi_esp6_protocol, IPPROTO_ESP); - if (err < 0) - goto xfrm_proto_esp_failed; - err = xfrm6_protocol_register(&xfrmi_ah6_protocol, IPPROTO_AH); - if (err < 0) - goto xfrm_proto_ah_failed; - err = xfrm6_protocol_register(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); - if (err < 0) - goto xfrm_proto_comp_failed; -#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) - err = xfrm6_tunnel_register(&xfrmi_ipv6_handler, AF_INET6); - if (err < 0) - goto xfrm_tunnel_ipv6_failed; - err = xfrm6_tunnel_register(&xfrmi_ip6ip_handler, AF_INET); - if (err < 0) - goto xfrm_tunnel_ip6ip_failed; -#endif - - return 0; - -#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) -xfrm_tunnel_ip6ip_failed: - xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); -xfrm_tunnel_ipv6_failed: - xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); -#endif -xfrm_proto_comp_failed: - xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); -xfrm_proto_ah_failed: - xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); -xfrm_proto_esp_failed: - return err; -} - -static void xfrmi6_fini(void) -{ -#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) - xfrm6_tunnel_deregister(&xfrmi_ip6ip_handler, AF_INET); - xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); -#endif - xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); - xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); - xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); -} - -static const struct xfrm_if_cb xfrm_if_cb = { - .decode_session = xfrmi_decode_session, -}; - -static int __init xfrmi_init(void) -{ - const char *msg; - int err; - - pr_info("IPsec XFRM device driver\n"); - - msg = "tunnel device"; - err = register_pernet_device(&xfrmi_net_ops); - if (err < 0) - goto pernet_dev_failed; - - msg = "xfrm4 protocols"; - err = xfrmi4_init(); - if (err < 0) - goto xfrmi4_failed; - - msg = "xfrm6 protocols"; - err = xfrmi6_init(); - if (err < 0) - goto xfrmi6_failed; - - - msg = "netlink interface"; - err = rtnl_link_register(&xfrmi_link_ops); - if (err < 0) - goto rtnl_link_failed; - - lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); - - xfrm_if_register_cb(&xfrm_if_cb); - - return err; - -rtnl_link_failed: - xfrmi6_fini(); -xfrmi6_failed: - xfrmi4_fini(); -xfrmi4_failed: - unregister_pernet_device(&xfrmi_net_ops); -pernet_dev_failed: - pr_err("xfrmi init: failed to register %s\n", msg); - return err; -} - -static void __exit xfrmi_fini(void) -{ - xfrm_if_unregister_cb(); - lwtunnel_encap_del_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); - rtnl_link_unregister(&xfrmi_link_ops); - xfrmi4_fini(); - xfrmi6_fini(); - unregister_pernet_device(&xfrmi_net_ops); -} - -module_init(xfrmi_init); -module_exit(xfrmi_fini); -MODULE_LICENSE("GPL"); -MODULE_ALIAS_RTNL_LINK("xfrm"); -MODULE_ALIAS_NETDEV("xfrm0"); -MODULE_AUTHOR("Steffen Klassert"); -MODULE_DESCRIPTION("XFRM virtual interface"); diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c new file mode 100644 index 000000000000..5a67b120c4db --- /dev/null +++ b/net/xfrm/xfrm_interface_core.c @@ -0,0 +1,1198 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * XFRM virtual interface + * + * Copyright (C) 2018 secunet Security Networks AG + * + * Author: + * Steffen Klassert + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int xfrmi_dev_init(struct net_device *dev); +static void xfrmi_dev_setup(struct net_device *dev); +static struct rtnl_link_ops xfrmi_link_ops __read_mostly; +static unsigned int xfrmi_net_id __read_mostly; +static const struct net_device_ops xfrmi_netdev_ops; + +#define XFRMI_HASH_BITS 8 +#define XFRMI_HASH_SIZE BIT(XFRMI_HASH_BITS) + +struct xfrmi_net { + /* lists for storing interfaces in use */ + struct xfrm_if __rcu *xfrmi[XFRMI_HASH_SIZE]; + struct xfrm_if __rcu *collect_md_xfrmi; +}; + +static const struct nla_policy xfrm_lwt_policy[LWT_XFRM_MAX + 1] = { + [LWT_XFRM_IF_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [LWT_XFRM_LINK] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +static void xfrmi_destroy_state(struct lwtunnel_state *lwt) +{ +} + +static int xfrmi_build_state(struct net *net, struct nlattr *nla, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[LWT_XFRM_MAX + 1]; + struct lwtunnel_state *new_state; + struct xfrm_md_info *info; + int ret; + + ret = nla_parse_nested(tb, LWT_XFRM_MAX, nla, xfrm_lwt_policy, extack); + if (ret < 0) + return ret; + + if (!tb[LWT_XFRM_IF_ID]) { + NL_SET_ERR_MSG(extack, "if_id must be set"); + return -EINVAL; + } + + new_state = lwtunnel_state_alloc(sizeof(*info)); + if (!new_state) { + NL_SET_ERR_MSG(extack, "failed to create encap info"); + return -ENOMEM; + } + + new_state->type = LWTUNNEL_ENCAP_XFRM; + + info = lwt_xfrm_info(new_state); + + info->if_id = nla_get_u32(tb[LWT_XFRM_IF_ID]); + + if (tb[LWT_XFRM_LINK]) + info->link = nla_get_u32(tb[LWT_XFRM_LINK]); + + *ts = new_state; + return 0; +} + +static int xfrmi_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwt) +{ + struct xfrm_md_info *info = lwt_xfrm_info(lwt); + + if (nla_put_u32(skb, LWT_XFRM_IF_ID, info->if_id) || + (info->link && nla_put_u32(skb, LWT_XFRM_LINK, info->link))) + return -EMSGSIZE; + + return 0; +} + +static int xfrmi_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + return nla_total_size(sizeof(u32)) + /* LWT_XFRM_IF_ID */ + nla_total_size(sizeof(u32)); /* LWT_XFRM_LINK */ +} + +static int xfrmi_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct xfrm_md_info *a_info = lwt_xfrm_info(a); + struct xfrm_md_info *b_info = lwt_xfrm_info(b); + + return memcmp(a_info, b_info, sizeof(*a_info)); +} + +static const struct lwtunnel_encap_ops xfrmi_encap_ops = { + .build_state = xfrmi_build_state, + .destroy_state = xfrmi_destroy_state, + .fill_encap = xfrmi_fill_encap_info, + .get_encap_size = xfrmi_encap_nlsize, + .cmp_encap = xfrmi_encap_cmp, + .owner = THIS_MODULE, +}; + +#define for_each_xfrmi_rcu(start, xi) \ + for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next)) + +static u32 xfrmi_hash(u32 if_id) +{ + return hash_32(if_id, XFRMI_HASH_BITS); +} + +static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x) +{ + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + struct xfrm_if *xi; + + for_each_xfrmi_rcu(xfrmn->xfrmi[xfrmi_hash(x->if_id)], xi) { + if (x->if_id == xi->p.if_id && + (xi->dev->flags & IFF_UP)) + return xi; + } + + xi = rcu_dereference(xfrmn->collect_md_xfrmi); + if (xi && (xi->dev->flags & IFF_UP)) + return xi; + + return NULL; +} + +static bool xfrmi_decode_session(struct sk_buff *skb, + unsigned short family, + struct xfrm_if_decode_session_result *res) +{ + struct net_device *dev; + struct xfrm_if *xi; + int ifindex = 0; + + if (!secpath_exists(skb) || !skb->dev) + return false; + + switch (family) { + case AF_INET6: + ifindex = inet6_sdif(skb); + break; + case AF_INET: + ifindex = inet_sdif(skb); + break; + } + + if (ifindex) { + struct net *net = xs_net(xfrm_input_state(skb)); + + dev = dev_get_by_index_rcu(net, ifindex); + } else { + dev = skb->dev; + } + + if (!dev || !(dev->flags & IFF_UP)) + return false; + if (dev->netdev_ops != &xfrmi_netdev_ops) + return false; + + xi = netdev_priv(dev); + res->net = xi->net; + + if (xi->p.collect_md) + res->if_id = xfrm_input_state(skb)->if_id; + else + res->if_id = xi->p.if_id; + return true; +} + +static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi) +{ + struct xfrm_if __rcu **xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; + + rcu_assign_pointer(xi->next , rtnl_dereference(*xip)); + rcu_assign_pointer(*xip, xi); +} + +static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi) +{ + struct xfrm_if __rcu **xip; + struct xfrm_if *iter; + + for (xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; + (iter = rtnl_dereference(*xip)) != NULL; + xip = &iter->next) { + if (xi == iter) { + rcu_assign_pointer(*xip, xi->next); + break; + } + } +} + +static void xfrmi_dev_free(struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + + gro_cells_destroy(&xi->gro_cells); + free_percpu(dev->tstats); +} + +static int xfrmi_create(struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net *net = dev_net(dev); + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + int err; + + dev->rtnl_link_ops = &xfrmi_link_ops; + err = register_netdevice(dev); + if (err < 0) + goto out; + + if (xi->p.collect_md) + rcu_assign_pointer(xfrmn->collect_md_xfrmi, xi); + else + xfrmi_link(xfrmn, xi); + + return 0; + +out: + return err; +} + +static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p) +{ + struct xfrm_if __rcu **xip; + struct xfrm_if *xi; + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + + for (xip = &xfrmn->xfrmi[xfrmi_hash(p->if_id)]; + (xi = rtnl_dereference(*xip)) != NULL; + xip = &xi->next) + if (xi->p.if_id == p->if_id) + return xi; + + return NULL; +} + +static void xfrmi_dev_uninit(struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id); + + if (xi->p.collect_md) + RCU_INIT_POINTER(xfrmn->collect_md_xfrmi, NULL); + else + xfrmi_unlink(xfrmn, xi); +} + +static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) +{ + skb_clear_tstamp(skb); + skb->pkt_type = PACKET_HOST; + skb->skb_iif = 0; + skb->ignore_df = 0; + skb_dst_drop(skb); + nf_reset_ct(skb); + nf_reset_trace(skb); + + if (!xnet) + return; + + ipvs_reset(skb); + secpath_reset(skb); + skb_orphan(skb); + skb->mark = 0; +} + +static int xfrmi_rcv_cb(struct sk_buff *skb, int err) +{ + const struct xfrm_mode *inner_mode; + struct net_device *dev; + struct xfrm_state *x; + struct xfrm_if *xi; + bool xnet; + int link; + + if (err && !secpath_exists(skb)) + return 0; + + x = xfrm_input_state(skb); + + xi = xfrmi_lookup(xs_net(x), x); + if (!xi) + return 1; + + link = skb->dev->ifindex; + dev = xi->dev; + skb->dev = dev; + + if (err) { + dev->stats.rx_errors++; + dev->stats.rx_dropped++; + + return 0; + } + + xnet = !net_eq(xi->net, dev_net(skb->dev)); + + if (xnet) { + inner_mode = &x->inner_mode; + + if (x->sel.family == AF_UNSPEC) { + inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); + if (inner_mode == NULL) { + XFRM_INC_STATS(dev_net(skb->dev), + LINUX_MIB_XFRMINSTATEMODEERROR); + return -EINVAL; + } + } + + if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, + inner_mode->family)) + return -EPERM; + } + + xfrmi_scrub_packet(skb, xnet); + if (xi->p.collect_md) { + struct metadata_dst *md_dst; + + md_dst = metadata_dst_alloc(0, METADATA_XFRM, GFP_ATOMIC); + if (!md_dst) + return -ENOMEM; + + md_dst->u.xfrm_info.if_id = x->if_id; + md_dst->u.xfrm_info.link = link; + skb_dst_set(skb, (struct dst_entry *)md_dst); + } + dev_sw_netstats_rx_add(dev, skb->len); + + return 0; +} + +static int +xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net_device_stats *stats = &xi->dev->stats; + struct dst_entry *dst = skb_dst(skb); + unsigned int length = skb->len; + struct net_device *tdev; + struct xfrm_state *x; + int err = -1; + u32 if_id; + int mtu; + + if (xi->p.collect_md) { + struct xfrm_md_info *md_info = skb_xfrm_md_info(skb); + + if (unlikely(!md_info)) + return -EINVAL; + + if_id = md_info->if_id; + fl->flowi_oif = md_info->link; + } else { + if_id = xi->p.if_id; + } + + dst_hold(dst); + dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, if_id); + if (IS_ERR(dst)) { + err = PTR_ERR(dst); + dst = NULL; + goto tx_err_link_failure; + } + + x = dst->xfrm; + if (!x) + goto tx_err_link_failure; + + if (x->if_id != if_id) + goto tx_err_link_failure; + + tdev = dst->dev; + + if (tdev == dev) { + stats->collisions++; + net_warn_ratelimited("%s: Local routing loop detected!\n", + dev->name); + goto tx_err_dst_release; + } + + mtu = dst_mtu(dst); + if ((!skb_is_gso(skb) && skb->len > mtu) || + (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))) { + skb_dst_update_pmtu_no_confirm(skb, mtu); + + if (skb->protocol == htons(ETH_P_IPV6)) { + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + + if (skb->len > 1280) + icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + else + goto xmit; + } else { + if (!(ip_hdr(skb)->frag_off & htons(IP_DF))) + goto xmit; + icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + } + + dst_release(dst); + return -EMSGSIZE; + } + +xmit: + xfrmi_scrub_packet(skb, !net_eq(xi->net, dev_net(dev))); + skb_dst_set(skb, dst); + skb->dev = tdev; + + err = dst_output(xi->net, skb->sk, skb); + if (net_xmit_eval(err) == 0) { + dev_sw_netstats_tx_add(dev, 1, length); + } else { + stats->tx_errors++; + stats->tx_aborted_errors++; + } + + return 0; +tx_err_link_failure: + stats->tx_carrier_errors++; + dst_link_failure(skb); +tx_err_dst_release: + dst_release(dst); + return err; +} + +static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net_device_stats *stats = &xi->dev->stats; + struct dst_entry *dst = skb_dst(skb); + struct flowi fl; + int ret; + + memset(&fl, 0, sizeof(fl)); + + switch (skb->protocol) { + case htons(ETH_P_IPV6): + xfrm_decode_session(skb, &fl, AF_INET6); + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + if (!dst) { + fl.u.ip6.flowi6_oif = dev->ifindex; + fl.u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; + dst = ip6_route_output(dev_net(dev), NULL, &fl.u.ip6); + if (dst->error) { + dst_release(dst); + stats->tx_carrier_errors++; + goto tx_err; + } + skb_dst_set(skb, dst); + } + break; + case htons(ETH_P_IP): + xfrm_decode_session(skb, &fl, AF_INET); + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + if (!dst) { + struct rtable *rt; + + fl.u.ip4.flowi4_oif = dev->ifindex; + fl.u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; + rt = __ip_route_output_key(dev_net(dev), &fl.u.ip4); + if (IS_ERR(rt)) { + stats->tx_carrier_errors++; + goto tx_err; + } + skb_dst_set(skb, &rt->dst); + } + break; + default: + goto tx_err; + } + + fl.flowi_oif = xi->p.link; + + ret = xfrmi_xmit2(skb, dev, &fl); + if (ret < 0) + goto tx_err; + + return NETDEV_TX_OK; + +tx_err: + stats->tx_errors++; + stats->tx_dropped++; + kfree_skb(skb); + return NETDEV_TX_OK; +} + +static int xfrmi4_err(struct sk_buff *skb, u32 info) +{ + const struct iphdr *iph = (const struct iphdr *)skb->data; + struct net *net = dev_net(skb->dev); + int protocol = iph->protocol; + struct ip_comp_hdr *ipch; + struct ip_esp_hdr *esph; + struct ip_auth_hdr *ah ; + struct xfrm_state *x; + struct xfrm_if *xi; + __be32 spi; + + switch (protocol) { + case IPPROTO_ESP: + esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); + spi = esph->spi; + break; + case IPPROTO_AH: + ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); + spi = ah->spi; + break; + case IPPROTO_COMP: + ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); + spi = htonl(ntohs(ipch->cpi)); + break; + default: + return 0; + } + + switch (icmp_hdr(skb)->type) { + case ICMP_DEST_UNREACH: + if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) + return 0; + break; + case ICMP_REDIRECT: + break; + default: + return 0; + } + + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + spi, protocol, AF_INET); + if (!x) + return 0; + + xi = xfrmi_lookup(net, x); + if (!xi) { + xfrm_state_put(x); + return -1; + } + + if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) + ipv4_update_pmtu(skb, net, info, 0, protocol); + else + ipv4_redirect(skb, net, 0, protocol); + xfrm_state_put(x); + + return 0; +} + +static int xfrmi6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; + struct net *net = dev_net(skb->dev); + int protocol = iph->nexthdr; + struct ip_comp_hdr *ipch; + struct ip_esp_hdr *esph; + struct ip_auth_hdr *ah; + struct xfrm_state *x; + struct xfrm_if *xi; + __be32 spi; + + switch (protocol) { + case IPPROTO_ESP: + esph = (struct ip_esp_hdr *)(skb->data + offset); + spi = esph->spi; + break; + case IPPROTO_AH: + ah = (struct ip_auth_hdr *)(skb->data + offset); + spi = ah->spi; + break; + case IPPROTO_COMP: + ipch = (struct ip_comp_hdr *)(skb->data + offset); + spi = htonl(ntohs(ipch->cpi)); + break; + default: + return 0; + } + + if (type != ICMPV6_PKT_TOOBIG && + type != NDISC_REDIRECT) + return 0; + + x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, + spi, protocol, AF_INET6); + if (!x) + return 0; + + xi = xfrmi_lookup(net, x); + if (!xi) { + xfrm_state_put(x); + return -1; + } + + if (type == NDISC_REDIRECT) + ip6_redirect(skb, net, skb->dev->ifindex, 0, + sock_net_uid(net, NULL)); + else + ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); + xfrm_state_put(x); + + return 0; +} + +static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p) +{ + if (xi->p.link != p->link) + return -EINVAL; + + xi->p.if_id = p->if_id; + + return 0; +} + +static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p) +{ + struct net *net = xi->net; + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + int err; + + xfrmi_unlink(xfrmn, xi); + synchronize_net(); + err = xfrmi_change(xi, p); + xfrmi_link(xfrmn, xi); + netdev_state_change(xi->dev); + return err; +} + +static int xfrmi_get_iflink(const struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + + return xi->p.link; +} + +static const struct net_device_ops xfrmi_netdev_ops = { + .ndo_init = xfrmi_dev_init, + .ndo_uninit = xfrmi_dev_uninit, + .ndo_start_xmit = xfrmi_xmit, + .ndo_get_stats64 = dev_get_tstats64, + .ndo_get_iflink = xfrmi_get_iflink, +}; + +static void xfrmi_dev_setup(struct net_device *dev) +{ + dev->netdev_ops = &xfrmi_netdev_ops; + dev->header_ops = &ip_tunnel_header_ops; + dev->type = ARPHRD_NONE; + dev->mtu = ETH_DATA_LEN; + dev->min_mtu = ETH_MIN_MTU; + dev->max_mtu = IP_MAX_MTU; + dev->flags = IFF_NOARP; + dev->needs_free_netdev = true; + dev->priv_destructor = xfrmi_dev_free; + netif_keep_dst(dev); + + eth_broadcast_addr(dev->broadcast); +} + +#define XFRMI_FEATURES (NETIF_F_SG | \ + NETIF_F_FRAGLIST | \ + NETIF_F_GSO_SOFTWARE | \ + NETIF_F_HW_CSUM) + +static int xfrmi_dev_init(struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net_device *phydev = __dev_get_by_index(xi->net, xi->p.link); + int err; + + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); + if (!dev->tstats) + return -ENOMEM; + + err = gro_cells_init(&xi->gro_cells, dev); + if (err) { + free_percpu(dev->tstats); + return err; + } + + dev->features |= NETIF_F_LLTX; + dev->features |= XFRMI_FEATURES; + dev->hw_features |= XFRMI_FEATURES; + + if (phydev) { + dev->needed_headroom = phydev->needed_headroom; + dev->needed_tailroom = phydev->needed_tailroom; + + if (is_zero_ether_addr(dev->dev_addr)) + eth_hw_addr_inherit(dev, phydev); + if (is_zero_ether_addr(dev->broadcast)) + memcpy(dev->broadcast, phydev->broadcast, + dev->addr_len); + } else { + eth_hw_addr_random(dev); + eth_broadcast_addr(dev->broadcast); + } + + return 0; +} + +static int xfrmi_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + return 0; +} + +static void xfrmi_netlink_parms(struct nlattr *data[], + struct xfrm_if_parms *parms) +{ + memset(parms, 0, sizeof(*parms)); + + if (!data) + return; + + if (data[IFLA_XFRM_LINK]) + parms->link = nla_get_u32(data[IFLA_XFRM_LINK]); + + if (data[IFLA_XFRM_IF_ID]) + parms->if_id = nla_get_u32(data[IFLA_XFRM_IF_ID]); + + if (data[IFLA_XFRM_COLLECT_METADATA]) + parms->collect_md = true; +} + +static int xfrmi_newlink(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct net *net = dev_net(dev); + struct xfrm_if_parms p = {}; + struct xfrm_if *xi; + int err; + + xfrmi_netlink_parms(data, &p); + if (p.collect_md) { + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + + if (p.link || p.if_id) { + NL_SET_ERR_MSG(extack, "link and if_id must be zero"); + return -EINVAL; + } + + if (rtnl_dereference(xfrmn->collect_md_xfrmi)) + return -EEXIST; + + } else { + if (!p.if_id) { + NL_SET_ERR_MSG(extack, "if_id must be non zero"); + return -EINVAL; + } + + xi = xfrmi_locate(net, &p); + if (xi) + return -EEXIST; + } + + xi = netdev_priv(dev); + xi->p = p; + xi->net = net; + xi->dev = dev; + + err = xfrmi_create(dev); + return err; +} + +static void xfrmi_dellink(struct net_device *dev, struct list_head *head) +{ + unregister_netdevice_queue(dev, head); +} + +static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct net *net = xi->net; + struct xfrm_if_parms p = {}; + + xfrmi_netlink_parms(data, &p); + if (!p.if_id) { + NL_SET_ERR_MSG(extack, "if_id must be non zero"); + return -EINVAL; + } + + if (p.collect_md) { + NL_SET_ERR_MSG(extack, "collect_md can't be changed"); + return -EINVAL; + } + + xi = xfrmi_locate(net, &p); + if (!xi) { + xi = netdev_priv(dev); + } else { + if (xi->dev != dev) + return -EEXIST; + if (xi->p.collect_md) { + NL_SET_ERR_MSG(extack, + "device can't be changed to collect_md"); + return -EINVAL; + } + } + + return xfrmi_update(xi, &p); +} + +static size_t xfrmi_get_size(const struct net_device *dev) +{ + return + /* IFLA_XFRM_LINK */ + nla_total_size(4) + + /* IFLA_XFRM_IF_ID */ + nla_total_size(4) + + /* IFLA_XFRM_COLLECT_METADATA */ + nla_total_size(0) + + 0; +} + +static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + struct xfrm_if_parms *parm = &xi->p; + + if (nla_put_u32(skb, IFLA_XFRM_LINK, parm->link) || + nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id) || + (xi->p.collect_md && nla_put_flag(skb, IFLA_XFRM_COLLECT_METADATA))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static struct net *xfrmi_get_link_net(const struct net_device *dev) +{ + struct xfrm_if *xi = netdev_priv(dev); + + return xi->net; +} + +static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = { + [IFLA_XFRM_UNSPEC] = { .strict_start_type = IFLA_XFRM_COLLECT_METADATA }, + [IFLA_XFRM_LINK] = { .type = NLA_U32 }, + [IFLA_XFRM_IF_ID] = { .type = NLA_U32 }, + [IFLA_XFRM_COLLECT_METADATA] = { .type = NLA_FLAG }, +}; + +static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { + .kind = "xfrm", + .maxtype = IFLA_XFRM_MAX, + .policy = xfrmi_policy, + .priv_size = sizeof(struct xfrm_if), + .setup = xfrmi_dev_setup, + .validate = xfrmi_validate, + .newlink = xfrmi_newlink, + .dellink = xfrmi_dellink, + .changelink = xfrmi_changelink, + .get_size = xfrmi_get_size, + .fill_info = xfrmi_fill_info, + .get_link_net = xfrmi_get_link_net, +}; + +static void __net_exit xfrmi_exit_batch_net(struct list_head *net_exit_list) +{ + struct net *net; + LIST_HEAD(list); + + rtnl_lock(); + list_for_each_entry(net, net_exit_list, exit_list) { + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + struct xfrm_if __rcu **xip; + struct xfrm_if *xi; + int i; + + for (i = 0; i < XFRMI_HASH_SIZE; i++) { + for (xip = &xfrmn->xfrmi[i]; + (xi = rtnl_dereference(*xip)) != NULL; + xip = &xi->next) + unregister_netdevice_queue(xi->dev, &list); + } + xi = rtnl_dereference(xfrmn->collect_md_xfrmi); + if (xi) + unregister_netdevice_queue(xi->dev, &list); + } + unregister_netdevice_many(&list); + rtnl_unlock(); +} + +static struct pernet_operations xfrmi_net_ops = { + .exit_batch = xfrmi_exit_batch_net, + .id = &xfrmi_net_id, + .size = sizeof(struct xfrmi_net), +}; + +static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { + .handler = xfrm6_rcv, + .input_handler = xfrm_input, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi6_err, + .priority = 10, +}; + +static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = { + .handler = xfrm6_rcv, + .input_handler = xfrm_input, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi6_err, + .priority = 10, +}; + +static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = { + .handler = xfrm6_rcv, + .input_handler = xfrm_input, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi6_err, + .priority = 10, +}; + +#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) +static int xfrmi6_rcv_tunnel(struct sk_buff *skb) +{ + const xfrm_address_t *saddr; + __be32 spi; + + saddr = (const xfrm_address_t *)&ipv6_hdr(skb)->saddr; + spi = xfrm6_tunnel_spi_lookup(dev_net(skb->dev), saddr); + + return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); +} + +static struct xfrm6_tunnel xfrmi_ipv6_handler __read_mostly = { + .handler = xfrmi6_rcv_tunnel, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi6_err, + .priority = 2, +}; + +static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = { + .handler = xfrmi6_rcv_tunnel, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi6_err, + .priority = 2, +}; +#endif + +static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { + .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi4_err, + .priority = 10, +}; + +static struct xfrm4_protocol xfrmi_ah4_protocol __read_mostly = { + .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi4_err, + .priority = 10, +}; + +static struct xfrm4_protocol xfrmi_ipcomp4_protocol __read_mostly = { + .handler = xfrm4_rcv, + .input_handler = xfrm_input, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi4_err, + .priority = 10, +}; + +#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) +static int xfrmi4_rcv_tunnel(struct sk_buff *skb) +{ + return xfrm4_rcv_spi(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr); +} + +static struct xfrm_tunnel xfrmi_ipip_handler __read_mostly = { + .handler = xfrmi4_rcv_tunnel, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi4_err, + .priority = 3, +}; + +static struct xfrm_tunnel xfrmi_ipip6_handler __read_mostly = { + .handler = xfrmi4_rcv_tunnel, + .cb_handler = xfrmi_rcv_cb, + .err_handler = xfrmi4_err, + .priority = 2, +}; +#endif + +static int __init xfrmi4_init(void) +{ + int err; + + err = xfrm4_protocol_register(&xfrmi_esp4_protocol, IPPROTO_ESP); + if (err < 0) + goto xfrm_proto_esp_failed; + err = xfrm4_protocol_register(&xfrmi_ah4_protocol, IPPROTO_AH); + if (err < 0) + goto xfrm_proto_ah_failed; + err = xfrm4_protocol_register(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); + if (err < 0) + goto xfrm_proto_comp_failed; +#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) + err = xfrm4_tunnel_register(&xfrmi_ipip_handler, AF_INET); + if (err < 0) + goto xfrm_tunnel_ipip_failed; + err = xfrm4_tunnel_register(&xfrmi_ipip6_handler, AF_INET6); + if (err < 0) + goto xfrm_tunnel_ipip6_failed; +#endif + + return 0; + +#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) +xfrm_tunnel_ipip6_failed: + xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); +xfrm_tunnel_ipip_failed: + xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); +#endif +xfrm_proto_comp_failed: + xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); +xfrm_proto_ah_failed: + xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); +xfrm_proto_esp_failed: + return err; +} + +static void xfrmi4_fini(void) +{ +#if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) + xfrm4_tunnel_deregister(&xfrmi_ipip6_handler, AF_INET6); + xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); +#endif + xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); + xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); + xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); +} + +static int __init xfrmi6_init(void) +{ + int err; + + err = xfrm6_protocol_register(&xfrmi_esp6_protocol, IPPROTO_ESP); + if (err < 0) + goto xfrm_proto_esp_failed; + err = xfrm6_protocol_register(&xfrmi_ah6_protocol, IPPROTO_AH); + if (err < 0) + goto xfrm_proto_ah_failed; + err = xfrm6_protocol_register(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); + if (err < 0) + goto xfrm_proto_comp_failed; +#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) + err = xfrm6_tunnel_register(&xfrmi_ipv6_handler, AF_INET6); + if (err < 0) + goto xfrm_tunnel_ipv6_failed; + err = xfrm6_tunnel_register(&xfrmi_ip6ip_handler, AF_INET); + if (err < 0) + goto xfrm_tunnel_ip6ip_failed; +#endif + + return 0; + +#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) +xfrm_tunnel_ip6ip_failed: + xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); +xfrm_tunnel_ipv6_failed: + xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); +#endif +xfrm_proto_comp_failed: + xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); +xfrm_proto_ah_failed: + xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); +xfrm_proto_esp_failed: + return err; +} + +static void xfrmi6_fini(void) +{ +#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) + xfrm6_tunnel_deregister(&xfrmi_ip6ip_handler, AF_INET); + xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); +#endif + xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); + xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); + xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); +} + +static const struct xfrm_if_cb xfrm_if_cb = { + .decode_session = xfrmi_decode_session, +}; + +static int __init xfrmi_init(void) +{ + const char *msg; + int err; + + pr_info("IPsec XFRM device driver\n"); + + msg = "tunnel device"; + err = register_pernet_device(&xfrmi_net_ops); + if (err < 0) + goto pernet_dev_failed; + + msg = "xfrm4 protocols"; + err = xfrmi4_init(); + if (err < 0) + goto xfrmi4_failed; + + msg = "xfrm6 protocols"; + err = xfrmi6_init(); + if (err < 0) + goto xfrmi6_failed; + + + msg = "netlink interface"; + err = rtnl_link_register(&xfrmi_link_ops); + if (err < 0) + goto rtnl_link_failed; + + lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); + + xfrm_if_register_cb(&xfrm_if_cb); + + return err; + +rtnl_link_failed: + xfrmi6_fini(); +xfrmi6_failed: + xfrmi4_fini(); +xfrmi4_failed: + unregister_pernet_device(&xfrmi_net_ops); +pernet_dev_failed: + pr_err("xfrmi init: failed to register %s\n", msg); + return err; +} + +static void __exit xfrmi_fini(void) +{ + xfrm_if_unregister_cb(); + lwtunnel_encap_del_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); + rtnl_link_unregister(&xfrmi_link_ops); + xfrmi4_fini(); + xfrmi6_fini(); + unregister_pernet_device(&xfrmi_net_ops); +} + +module_init(xfrmi_init); +module_exit(xfrmi_fini); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_RTNL_LINK("xfrm"); +MODULE_ALIAS_NETDEV("xfrm0"); +MODULE_AUTHOR("Steffen Klassert"); +MODULE_DESCRIPTION("XFRM virtual interface"); -- cgit v1.2.3-70-g09d2 From 94151f5aa9667c562281abeaaa5e89b9d5c17729 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Sat, 3 Dec 2022 10:46:57 +0200 Subject: xfrm: interface: Add unstable helpers for setting/getting XFRM metadata from TC-BPF This change adds xfrm metadata helpers using the unstable kfunc call interface for the TC-BPF hooks. This allows steering traffic towards different IPsec connections based on logic implemented in bpf programs. This object is built based on the availability of BTF debug info. When setting the xfrm metadata, percpu metadata dsts are used in order to avoid allocating a metadata dst per packet. In order to guarantee safe module unload, the percpu dsts are allocated on first use and never freed. The percpu pointer is stored in net/core/filter.c so that it can be reused on module reload. The metadata percpu dsts take ownership of the original skb dsts so that they may be used as part of the xfrm transmission logic - e.g. for MTU calculations. Signed-off-by: Eyal Birger Link: https://lore.kernel.org/r/20221203084659.1837829-3-eyal.birger@gmail.com Signed-off-by: Martin KaFai Lau --- include/net/dst_metadata.h | 1 + include/net/xfrm.h | 17 ++++++ net/core/dst.c | 8 ++- net/core/filter.c | 9 ++++ net/xfrm/Makefile | 6 +++ net/xfrm/xfrm_interface_bpf.c | 115 +++++++++++++++++++++++++++++++++++++++++ net/xfrm/xfrm_interface_core.c | 14 +++++ 7 files changed, 168 insertions(+), 2 deletions(-) create mode 100644 net/xfrm/xfrm_interface_bpf.c (limited to 'net') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index a454cf4327fe..1b7fae4c6b24 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -26,6 +26,7 @@ struct macsec_info { struct xfrm_md_info { u32 if_id; int link; + struct dst_entry *dst_orig; }; struct metadata_dst { diff --git a/include/net/xfrm.h b/include/net/xfrm.h index e0cc6791c001..3707e6b34e67 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2086,4 +2086,21 @@ static inline bool xfrm6_local_dontfrag(const struct sock *sk) return false; } #endif + +#if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ + (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) + +extern struct metadata_dst __percpu *xfrm_bpf_md_dst; + +int register_xfrm_interface_bpf(void); + +#else + +static inline int register_xfrm_interface_bpf(void) +{ + return 0; +} + +#endif + #endif /* _NET_XFRM_H */ diff --git a/net/core/dst.c b/net/core/dst.c index bc9c9be4e080..bb14a0392388 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -316,6 +316,8 @@ void metadata_dst_free(struct metadata_dst *md_dst) if (md_dst->type == METADATA_IP_TUNNEL) dst_cache_destroy(&md_dst->u.tun_info.dst_cache); #endif + if (md_dst->type == METADATA_XFRM) + dst_release(md_dst->u.xfrm_info.dst_orig); kfree(md_dst); } EXPORT_SYMBOL_GPL(metadata_dst_free); @@ -340,16 +342,18 @@ EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst) { -#ifdef CONFIG_DST_CACHE int cpu; for_each_possible_cpu(cpu) { struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu); +#ifdef CONFIG_DST_CACHE if (one_md_dst->type == METADATA_IP_TUNNEL) dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache); - } #endif + if (one_md_dst->type == METADATA_XFRM) + dst_release(one_md_dst->u.xfrm_info.dst_orig); + } free_percpu(md_dst); } EXPORT_SYMBOL_GPL(metadata_dst_free_percpu); diff --git a/net/core/filter.c b/net/core/filter.c index 8607136b6e2c..929358677183 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5631,6 +5631,15 @@ static const struct bpf_func_proto bpf_bind_proto = { }; #ifdef CONFIG_XFRM + +#if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ + (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) + +struct metadata_dst __percpu *xfrm_bpf_md_dst; +EXPORT_SYMBOL_GPL(xfrm_bpf_md_dst); + +#endif + BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index, struct bpf_xfrm_state *, to, u32, size, u64, flags) { diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index 08a2870fdd36..cd47f88921f5 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -5,6 +5,12 @@ xfrm_interface-$(CONFIG_XFRM_INTERFACE) += xfrm_interface_core.o +ifeq ($(CONFIG_XFRM_INTERFACE),m) +xfrm_interface-$(CONFIG_DEBUG_INFO_BTF_MODULES) += xfrm_interface_bpf.o +else ifeq ($(CONFIG_XFRM_INTERFACE),y) +xfrm_interface-$(CONFIG_DEBUG_INFO_BTF) += xfrm_interface_bpf.o +endif + obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \ xfrm_input.o xfrm_output.o \ xfrm_sysctl.o xfrm_replay.o xfrm_device.o diff --git a/net/xfrm/xfrm_interface_bpf.c b/net/xfrm/xfrm_interface_bpf.c new file mode 100644 index 000000000000..1ef2162cebcf --- /dev/null +++ b/net/xfrm/xfrm_interface_bpf.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Unstable XFRM Helpers for TC-BPF hook + * + * These are called from SCHED_CLS BPF programs. Note that it is + * allowed to break compatibility for these functions since the interface they + * are exposed through to BPF programs is explicitly unstable. + */ + +#include +#include + +#include +#include + +/* bpf_xfrm_info - XFRM metadata information + * + * Members: + * @if_id - XFRM if_id: + * Transmit: if_id to be used in policy and state lookups + * Receive: if_id of the state matched for the incoming packet + * @link - Underlying device ifindex: + * Transmit: used as the underlying device in VRF routing + * Receive: the device on which the packet had been received + */ +struct bpf_xfrm_info { + u32 if_id; + int link; +}; + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in xfrm_interface BTF"); + +/* bpf_skb_get_xfrm_info - Get XFRM metadata + * + * Parameters: + * @skb_ctx - Pointer to ctx (__sk_buff) in TC program + * Cannot be NULL + * @to - Pointer to memory to which the metadata will be copied + * Cannot be NULL + */ +__used noinline +int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx, struct bpf_xfrm_info *to) +{ + struct sk_buff *skb = (struct sk_buff *)skb_ctx; + struct xfrm_md_info *info; + + info = skb_xfrm_md_info(skb); + if (!info) + return -EINVAL; + + to->if_id = info->if_id; + to->link = info->link; + return 0; +} + +/* bpf_skb_get_xfrm_info - Set XFRM metadata + * + * Parameters: + * @skb_ctx - Pointer to ctx (__sk_buff) in TC program + * Cannot be NULL + * @from - Pointer to memory from which the metadata will be copied + * Cannot be NULL + */ +__used noinline +int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx, + const struct bpf_xfrm_info *from) +{ + struct sk_buff *skb = (struct sk_buff *)skb_ctx; + struct metadata_dst *md_dst; + struct xfrm_md_info *info; + + if (unlikely(skb_metadata_dst(skb))) + return -EINVAL; + + if (!xfrm_bpf_md_dst) { + struct metadata_dst __percpu *tmp; + + tmp = metadata_dst_alloc_percpu(0, METADATA_XFRM, GFP_ATOMIC); + if (!tmp) + return -ENOMEM; + if (cmpxchg(&xfrm_bpf_md_dst, NULL, tmp)) + metadata_dst_free_percpu(tmp); + } + md_dst = this_cpu_ptr(xfrm_bpf_md_dst); + + info = &md_dst->u.xfrm_info; + + info->if_id = from->if_id; + info->link = from->link; + skb_dst_force(skb); + info->dst_orig = skb_dst(skb); + + dst_hold((struct dst_entry *)md_dst); + skb_dst_set(skb, (struct dst_entry *)md_dst); + return 0; +} + +__diag_pop() + +BTF_SET8_START(xfrm_ifc_kfunc_set) +BTF_ID_FLAGS(func, bpf_skb_get_xfrm_info) +BTF_ID_FLAGS(func, bpf_skb_set_xfrm_info) +BTF_SET8_END(xfrm_ifc_kfunc_set) + +static const struct btf_kfunc_id_set xfrm_interface_kfunc_set = { + .owner = THIS_MODULE, + .set = &xfrm_ifc_kfunc_set, +}; + +int __init register_xfrm_interface_bpf(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, + &xfrm_interface_kfunc_set); +} diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 5a67b120c4db..1f99dc469027 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -396,6 +396,14 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) if_id = md_info->if_id; fl->flowi_oif = md_info->link; + if (md_info->dst_orig) { + struct dst_entry *tmp_dst = dst; + + dst = md_info->dst_orig; + skb_dst_set(skb, dst); + md_info->dst_orig = NULL; + dst_release(tmp_dst); + } } else { if_id = xi->p.if_id; } @@ -1162,12 +1170,18 @@ static int __init xfrmi_init(void) if (err < 0) goto rtnl_link_failed; + err = register_xfrm_interface_bpf(); + if (err < 0) + goto kfunc_failed; + lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); xfrm_if_register_cb(&xfrm_if_cb); return err; +kfunc_failed: + rtnl_link_unregister(&xfrmi_link_ops); rtnl_link_failed: xfrmi6_fini(); xfrmi6_failed: -- cgit v1.2.3-70-g09d2 From b93884eea26f97a3dc4c1df8c64389cbb0673001 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 2 Dec 2022 13:24:22 -0800 Subject: net/ncsi: Silence runtime memcpy() false positive warning The memcpy() in ncsi_cmd_handler_oem deserializes nca->data into a flexible array structure that overlapping with non-flex-array members (mfr_id) intentionally. Since the mem_to_flex() API is not finished, temporarily silence this warning, since it is a false positive, using unsafe_memcpy(). Reported-by: Joel Stanley Link: https://lore.kernel.org/netdev/CACPK8Xdfi=OJKP0x0D1w87fQeFZ4A2DP2qzGCRcuVbpU-9=4sQ@mail.gmail.com/ Cc: Samuel Mendoza-Jonas Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221202212418.never.837-kees@kernel.org Signed-off-by: Jakub Kicinski --- net/ncsi/ncsi-cmd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c index dda8b76b7798..fd2236ee9a79 100644 --- a/net/ncsi/ncsi-cmd.c +++ b/net/ncsi/ncsi-cmd.c @@ -228,7 +228,8 @@ static int ncsi_cmd_handler_oem(struct sk_buff *skb, len += max(payload, padding_bytes); cmd = skb_put_zero(skb, len); - memcpy(&cmd->mfr_id, nca->data, nca->payload); + unsafe_memcpy(&cmd->mfr_id, nca->data, nca->payload, + /* skb allocated with enough to load the payload */); ncsi_cmd_build_header(&cmd->cmd.common, nca); return 0; -- cgit v1.2.3-70-g09d2 From 5b481acab4ce017fda8166fa9428511da41109e5 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 6 Dec 2022 15:59:32 +0100 Subject: bpf: do not rely on ALLOW_ERROR_INJECTION for fmod_ret The current way of expressing that a non-bpf kernel component is willing to accept that bpf programs can be attached to it and that they can change the return value is to abuse ALLOW_ERROR_INJECTION. This is debated in the link below, and the result is that it is not a reasonable thing to do. Reuse the kfunc declaration structure to also tag the kernel functions we want to be fmodret. This way we can control from any subsystem which functions are being modified by bpf without touching the verifier. Link: https://lore.kernel.org/all/20221121104403.1545f9b5@gandalf.local.home/ Suggested-by: Alexei Starovoitov Signed-off-by: Benjamin Tissoires Acked-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20221206145936.922196-2-benjamin.tissoires@redhat.com --- include/linux/btf.h | 2 ++ kernel/bpf/btf.c | 30 +++++++++++++++++++++++++----- kernel/bpf/verifier.c | 17 +++++++++++++++-- net/bpf/test_run.c | 14 +++++++++++--- 4 files changed, 53 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/linux/btf.h b/include/linux/btf.h index f9aababc5d78..6a0808e7845c 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -412,8 +412,10 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog); u32 *btf_kfunc_id_set_contains(const struct btf *btf, enum bpf_prog_type prog_type, u32 kfunc_btf_id); +u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id); int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, const struct btf_kfunc_id_set *s); +int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset); s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 35c07afac924..c1df506293e4 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -204,6 +204,7 @@ enum btf_kfunc_hook { BTF_KFUNC_HOOK_STRUCT_OPS, BTF_KFUNC_HOOK_TRACING, BTF_KFUNC_HOOK_SYSCALL, + BTF_KFUNC_HOOK_FMODRET, BTF_KFUNC_HOOK_MAX, }; @@ -7448,11 +7449,14 @@ u32 *btf_kfunc_id_set_contains(const struct btf *btf, return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id); } -/* This function must be invoked only from initcalls/module init functions */ -int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, - const struct btf_kfunc_id_set *kset) +u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id) +{ + return __btf_kfunc_id_set_contains(btf, BTF_KFUNC_HOOK_FMODRET, kfunc_btf_id); +} + +static int __register_btf_kfunc_id_set(enum btf_kfunc_hook hook, + const struct btf_kfunc_id_set *kset) { - enum btf_kfunc_hook hook; struct btf *btf; int ret; @@ -7471,13 +7475,29 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, if (IS_ERR(btf)) return PTR_ERR(btf); - hook = bpf_prog_type_to_kfunc_hook(prog_type); ret = btf_populate_kfunc_set(btf, hook, kset->set); btf_put(btf); return ret; } + +/* This function must be invoked only from initcalls/module init functions */ +int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, + const struct btf_kfunc_id_set *kset) +{ + enum btf_kfunc_hook hook; + + hook = bpf_prog_type_to_kfunc_hook(prog_type); + return __register_btf_kfunc_id_set(hook, kset); +} EXPORT_SYMBOL_GPL(register_btf_kfunc_id_set); +/* This function must be invoked only from initcalls/module init functions */ +int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset) +{ + return __register_btf_kfunc_id_set(BTF_KFUNC_HOOK_FMODRET, kset); +} +EXPORT_SYMBOL_GPL(register_btf_fmodret_id_set); + s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id) { struct btf_id_dtor_kfunc_tab *tab = btf->dtor_kfunc_tab; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 225666307bba..ec2e6ec7309e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -15021,12 +15021,22 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, ret = -EINVAL; switch (prog->type) { case BPF_PROG_TYPE_TRACING: - /* fentry/fexit/fmod_ret progs can be sleepable only if they are + + /* fentry/fexit/fmod_ret progs can be sleepable if they are * attached to ALLOW_ERROR_INJECTION and are not in denylist. */ if (!check_non_sleepable_error_inject(btf_id) && within_error_injection_list(addr)) ret = 0; + /* fentry/fexit/fmod_ret progs can also be sleepable if they are + * in the fmodret id set with the KF_SLEEPABLE flag. + */ + else { + u32 *flags = btf_kfunc_is_modify_return(btf, btf_id); + + if (flags && (*flags & KF_SLEEPABLE)) + ret = 0; + } break; case BPF_PROG_TYPE_LSM: /* LSM progs check that they are attached to bpf_lsm_*() funcs. @@ -15047,7 +15057,10 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, bpf_log(log, "can't modify return codes of BPF programs\n"); return -EINVAL; } - ret = check_attach_modify_return(addr, tname); + ret = -EINVAL; + if (btf_kfunc_is_modify_return(btf, btf_id) || + !check_attach_modify_return(addr, tname)) + ret = 0; if (ret) { bpf_log(log, "%s() is not modifiable\n", tname); return ret; diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 13d578ce2a09..5079fb089b5f 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -489,7 +489,6 @@ int noinline bpf_fentry_test1(int a) return a + 1; } EXPORT_SYMBOL_GPL(bpf_fentry_test1); -ALLOW_ERROR_INJECTION(bpf_fentry_test1, ERRNO); int noinline bpf_fentry_test2(int a, u64 b) { @@ -733,7 +732,15 @@ noinline void bpf_kfunc_call_test_destructive(void) __diag_pop(); -ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO); +BTF_SET8_START(bpf_test_modify_return_ids) +BTF_ID_FLAGS(func, bpf_modify_return_test) +BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE) +BTF_SET8_END(bpf_test_modify_return_ids) + +static const struct btf_kfunc_id_set bpf_test_modify_return_set = { + .owner = THIS_MODULE, + .set = &bpf_test_modify_return_ids, +}; BTF_SET8_START(test_sk_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_kfunc_call_test1) @@ -1668,7 +1675,8 @@ static int __init bpf_prog_test_run_init(void) }; int ret; - ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set); + ret = register_btf_fmodret_id_set(&bpf_test_modify_return_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_prog_test_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_prog_test_kfunc_set); return ret ?: register_btf_id_dtor_kfuncs(bpf_prog_test_dtor_kfunc, -- cgit v1.2.3-70-g09d2 From 16dc16d9f058dce7031ee8b850f10622b8b5fb14 Mon Sep 17 00:00:00 2001 From: ye xingchen Date: Mon, 5 Dec 2022 19:18:56 +0800 Subject: net: ethernet: use sysfs_emit() to instead of scnprintf() Follow the advice of the Documentation/filesystems/sysfs.rst and show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. Signed-off-by: ye xingchen Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/202212051918564721658@zte.com.cn Signed-off-by: Jakub Kicinski --- net/ethernet/eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index e02daa74e833..2edc8b796a4e 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -398,7 +398,7 @@ EXPORT_SYMBOL(alloc_etherdev_mqs); ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len) { - return scnprintf(buf, PAGE_SIZE, "%*phC\n", len, addr); + return sysfs_emit(buf, "%*phC\n", len, addr); } EXPORT_SYMBOL(sysfs_format_mac); -- cgit v1.2.3-70-g09d2 From cb453926865ea0dddda898c8184eebb595f3c53d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 6 Dec 2022 12:58:01 +0200 Subject: bridge: mcast: Centralize netlink attribute parsing Netlink attributes are currently passed deep in the MDB creation call chain, making it difficult to add new attributes. In addition, some validity checks are performed under the multicast lock although they can be performed before it is ever acquired. As a first step towards solving these issues, parse the RTM_{NEW,DEL}MDB messages into a configuration structure, relieving other functions from the need to handle raw netlink attributes. Subsequent patches will convert the MDB code to use this configuration structure. This is consistent with how other rtnetlink objects are handled, such as routes and nexthops. Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- net/bridge/br_mdb.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++++ net/bridge/br_private.h | 7 +++ 2 files changed, 127 insertions(+) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 321be94c445a..bd3a7d881d52 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -974,6 +974,116 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, return ret; } +static int br_mdb_config_attrs_init(struct nlattr *set_attrs, + struct br_mdb_config *cfg, + struct netlink_ext_ack *extack) +{ + struct nlattr *mdb_attrs[MDBE_ATTR_MAX + 1]; + int err; + + err = nla_parse_nested(mdb_attrs, MDBE_ATTR_MAX, set_attrs, + br_mdbe_attrs_pol, extack); + if (err) + return err; + + if (mdb_attrs[MDBE_ATTR_SOURCE] && + !is_valid_mdb_source(mdb_attrs[MDBE_ATTR_SOURCE], + cfg->entry->addr.proto, extack)) + return -EINVAL; + + __mdb_entry_to_br_ip(cfg->entry, &cfg->group, mdb_attrs); + + return 0; +} + +static int br_mdb_config_init(struct net *net, const struct nlmsghdr *nlh, + struct br_mdb_config *cfg, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[MDBA_SET_ENTRY_MAX + 1]; + struct br_port_msg *bpm; + struct net_device *dev; + int err; + + err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb, + MDBA_SET_ENTRY_MAX, NULL, extack); + if (err) + return err; + + memset(cfg, 0, sizeof(*cfg)); + + bpm = nlmsg_data(nlh); + if (!bpm->ifindex) { + NL_SET_ERR_MSG_MOD(extack, "Invalid bridge ifindex"); + return -EINVAL; + } + + dev = __dev_get_by_index(net, bpm->ifindex); + if (!dev) { + NL_SET_ERR_MSG_MOD(extack, "Bridge device doesn't exist"); + return -ENODEV; + } + + if (!netif_is_bridge_master(dev)) { + NL_SET_ERR_MSG_MOD(extack, "Device is not a bridge"); + return -EOPNOTSUPP; + } + + cfg->br = netdev_priv(dev); + + if (!netif_running(cfg->br->dev)) { + NL_SET_ERR_MSG_MOD(extack, "Bridge device is not running"); + return -EINVAL; + } + + if (!br_opt_get(cfg->br, BROPT_MULTICAST_ENABLED)) { + NL_SET_ERR_MSG_MOD(extack, "Bridge's multicast processing is disabled"); + return -EINVAL; + } + + if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_SET_ENTRY)) { + NL_SET_ERR_MSG_MOD(extack, "Missing MDBA_SET_ENTRY attribute"); + return -EINVAL; + } + if (nla_len(tb[MDBA_SET_ENTRY]) != sizeof(struct br_mdb_entry)) { + NL_SET_ERR_MSG_MOD(extack, "Invalid MDBA_SET_ENTRY attribute length"); + return -EINVAL; + } + + cfg->entry = nla_data(tb[MDBA_SET_ENTRY]); + if (!is_valid_mdb_entry(cfg->entry, extack)) + return -EINVAL; + + if (cfg->entry->ifindex != cfg->br->dev->ifindex) { + struct net_device *pdev; + + pdev = __dev_get_by_index(net, cfg->entry->ifindex); + if (!pdev) { + NL_SET_ERR_MSG_MOD(extack, "Port net device doesn't exist"); + return -ENODEV; + } + + cfg->p = br_port_get_rtnl(pdev); + if (!cfg->p) { + NL_SET_ERR_MSG_MOD(extack, "Net device is not a bridge port"); + return -EINVAL; + } + + if (cfg->p->br != cfg->br) { + NL_SET_ERR_MSG_MOD(extack, "Port belongs to a different bridge device"); + return -EINVAL; + } + } + + if (tb[MDBA_SET_ENTRY_ATTRS]) + return br_mdb_config_attrs_init(tb[MDBA_SET_ENTRY_ATTRS], cfg, + extack); + else + __mdb_entry_to_br_ip(cfg->entry, &cfg->group, NULL); + + return 0; +} + static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -984,9 +1094,14 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device *dev, *pdev; struct br_mdb_entry *entry; struct net_bridge_vlan *v; + struct br_mdb_config cfg; struct net_bridge *br; int err; + err = br_mdb_config_init(net, nlh, &cfg, extack); + if (err) + return err; + err = br_mdb_parse(skb, nlh, &dev, &entry, mdb_attrs, extack); if (err < 0) return err; @@ -1101,9 +1216,14 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device *dev, *pdev; struct br_mdb_entry *entry; struct net_bridge_vlan *v; + struct br_mdb_config cfg; struct net_bridge *br; int err; + err = br_mdb_config_init(net, nlh, &cfg, extack); + if (err) + return err; + err = br_mdb_parse(skb, nlh, &dev, &entry, mdb_attrs, extack); if (err < 0) return err; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 4c4fda930068..0a09f10966dc 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -92,6 +92,13 @@ struct bridge_mcast_stats { struct br_mcast_stats mstats; struct u64_stats_sync syncp; }; + +struct br_mdb_config { + struct net_bridge *br; + struct net_bridge_port *p; + struct br_mdb_entry *entry; + struct br_ip group; +}; #endif /* net_bridge_mcast_port must be always defined due to forwarding stubs */ -- cgit v1.2.3-70-g09d2 From 3866116815249be8a5ebfa37ddea9f1032c976a9 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 6 Dec 2022 12:58:02 +0200 Subject: bridge: mcast: Remove redundant checks These checks are now redundant as they are performed by br_mdb_config_init() while parsing the RTM_{NEW,DEL}MDB messages. Remove them. Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- net/bridge/br_mdb.c | 63 ++++++++--------------------------------------------- 1 file changed, 9 insertions(+), 54 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index bd3a7d881d52..c8d78e4ec94e 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -1090,11 +1090,10 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr *mdb_attrs[MDBE_ATTR_MAX + 1]; struct net *net = sock_net(skb->sk); struct net_bridge_vlan_group *vg; - struct net_bridge_port *p = NULL; - struct net_device *dev, *pdev; struct br_mdb_entry *entry; struct net_bridge_vlan *v; struct br_mdb_config cfg; + struct net_device *dev; struct net_bridge *br; int err; @@ -1108,38 +1107,12 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, br = netdev_priv(dev); - if (!netif_running(br->dev)) { - NL_SET_ERR_MSG_MOD(extack, "Bridge device is not running"); - return -EINVAL; - } - - if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) { - NL_SET_ERR_MSG_MOD(extack, "Bridge's multicast processing is disabled"); - return -EINVAL; - } - if (entry->ifindex != br->dev->ifindex) { - pdev = __dev_get_by_index(net, entry->ifindex); - if (!pdev) { - NL_SET_ERR_MSG_MOD(extack, "Port net device doesn't exist"); - return -ENODEV; - } - - p = br_port_get_rtnl(pdev); - if (!p) { - NL_SET_ERR_MSG_MOD(extack, "Net device is not a bridge port"); - return -EINVAL; - } - - if (p->br != br) { - NL_SET_ERR_MSG_MOD(extack, "Port belongs to a different bridge device"); - return -EINVAL; - } - if (p->state == BR_STATE_DISABLED && entry->state != MDB_PERMANENT) { + if (cfg.p->state == BR_STATE_DISABLED && entry->state != MDB_PERMANENT) { NL_SET_ERR_MSG_MOD(extack, "Port is in disabled state and entry is not permanent"); return -EINVAL; } - vg = nbp_vlan_group(p); + vg = nbp_vlan_group(cfg.p); } else { vg = br_vlan_group(br); } @@ -1150,12 +1123,12 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { entry->vid = v->vid; - err = __br_mdb_add(net, br, p, entry, mdb_attrs, extack); + err = __br_mdb_add(net, br, cfg.p, entry, mdb_attrs, extack); if (err) break; } } else { - err = __br_mdb_add(net, br, p, entry, mdb_attrs, extack); + err = __br_mdb_add(net, br, cfg.p, entry, mdb_attrs, extack); } return err; @@ -1170,9 +1143,6 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry, struct br_ip ip; int err = -EINVAL; - if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED)) - return -EINVAL; - __mdb_entry_to_br_ip(entry, &ip, mdb_attrs); spin_lock_bh(&br->multicast_lock); @@ -1212,11 +1182,10 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr *mdb_attrs[MDBE_ATTR_MAX + 1]; struct net *net = sock_net(skb->sk); struct net_bridge_vlan_group *vg; - struct net_bridge_port *p = NULL; - struct net_device *dev, *pdev; struct br_mdb_entry *entry; struct net_bridge_vlan *v; struct br_mdb_config cfg; + struct net_device *dev; struct net_bridge *br; int err; @@ -1230,24 +1199,10 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, br = netdev_priv(dev); - if (entry->ifindex != br->dev->ifindex) { - pdev = __dev_get_by_index(net, entry->ifindex); - if (!pdev) - return -ENODEV; - - p = br_port_get_rtnl(pdev); - if (!p) { - NL_SET_ERR_MSG_MOD(extack, "Net device is not a bridge port"); - return -EINVAL; - } - if (p->br != br) { - NL_SET_ERR_MSG_MOD(extack, "Port belongs to a different bridge device"); - return -EINVAL; - } - vg = nbp_vlan_group(p); - } else { + if (entry->ifindex != br->dev->ifindex) + vg = nbp_vlan_group(cfg.p); + else vg = br_vlan_group(br); - } /* If vlan filtering is enabled and VLAN is not specified * delete mdb entry on all vlans configured on the port. -- cgit v1.2.3-70-g09d2 From f2b5aac68117909c3ac9e660c3182fd59524242f Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 6 Dec 2022 12:58:03 +0200 Subject: bridge: mcast: Use MDB configuration structure where possible The MDB configuration structure (i.e., struct br_mdb_config) now includes all the necessary information from the parsed RTM_{NEW,DEL}MDB netlink messages, so use it. This will later allow us to delete the calls to br_mdb_parse() from br_mdb_add() and br_mdb_del(). No functional changes intended. Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- net/bridge/br_mdb.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index c8d78e4ec94e..080516a3ee9c 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -1094,7 +1094,6 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_bridge_vlan *v; struct br_mdb_config cfg; struct net_device *dev; - struct net_bridge *br; int err; err = br_mdb_config_init(net, nlh, &cfg, extack); @@ -1105,30 +1104,30 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; - br = netdev_priv(dev); - - if (entry->ifindex != br->dev->ifindex) { - if (cfg.p->state == BR_STATE_DISABLED && entry->state != MDB_PERMANENT) { + if (cfg.p) { + if (cfg.p->state == BR_STATE_DISABLED && cfg.entry->state != MDB_PERMANENT) { NL_SET_ERR_MSG_MOD(extack, "Port is in disabled state and entry is not permanent"); return -EINVAL; } vg = nbp_vlan_group(cfg.p); } else { - vg = br_vlan_group(br); + vg = br_vlan_group(cfg.br); } /* If vlan filtering is enabled and VLAN is not specified * install mdb entry on all vlans configured on the port. */ - if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) { + if (br_vlan_enabled(cfg.br->dev) && vg && cfg.entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { - entry->vid = v->vid; - err = __br_mdb_add(net, br, cfg.p, entry, mdb_attrs, extack); + cfg.entry->vid = v->vid; + err = __br_mdb_add(net, cfg.br, cfg.p, cfg.entry, + mdb_attrs, extack); if (err) break; } } else { - err = __br_mdb_add(net, br, cfg.p, entry, mdb_attrs, extack); + err = __br_mdb_add(net, cfg.br, cfg.p, cfg.entry, mdb_attrs, + extack); } return err; @@ -1186,7 +1185,6 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_bridge_vlan *v; struct br_mdb_config cfg; struct net_device *dev; - struct net_bridge *br; int err; err = br_mdb_config_init(net, nlh, &cfg, extack); @@ -1197,23 +1195,21 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; - br = netdev_priv(dev); - - if (entry->ifindex != br->dev->ifindex) + if (cfg.p) vg = nbp_vlan_group(cfg.p); else - vg = br_vlan_group(br); + vg = br_vlan_group(cfg.br); /* If vlan filtering is enabled and VLAN is not specified * delete mdb entry on all vlans configured on the port. */ - if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) { + if (br_vlan_enabled(cfg.br->dev) && vg && cfg.entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { - entry->vid = v->vid; - err = __br_mdb_del(br, entry, mdb_attrs); + cfg.entry->vid = v->vid; + err = __br_mdb_del(cfg.br, cfg.entry, mdb_attrs); } } else { - err = __br_mdb_del(br, entry, mdb_attrs); + err = __br_mdb_del(cfg.br, cfg.entry, mdb_attrs); } return err; -- cgit v1.2.3-70-g09d2 From 8bd9c08e324175051d6d8ad5b2acfeaccaa1047b Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 6 Dec 2022 12:58:04 +0200 Subject: bridge: mcast: Propagate MDB configuration structure further As an intermediate step towards only using the new MDB configuration structure, pass it further in the control path instead of passing individual attributes. No functional changes intended. Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- net/bridge/br_mdb.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 080516a3ee9c..6017bff8316a 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -959,17 +959,15 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, return 0; } -static int __br_mdb_add(struct net *net, struct net_bridge *br, - struct net_bridge_port *p, - struct br_mdb_entry *entry, +static int __br_mdb_add(const struct br_mdb_config *cfg, struct nlattr **mdb_attrs, struct netlink_ext_ack *extack) { int ret; - spin_lock_bh(&br->multicast_lock); - ret = br_mdb_add_group(br, p, entry, mdb_attrs, extack); - spin_unlock_bh(&br->multicast_lock); + spin_lock_bh(&cfg->br->multicast_lock); + ret = br_mdb_add_group(cfg->br, cfg->p, cfg->entry, mdb_attrs, extack); + spin_unlock_bh(&cfg->br->multicast_lock); return ret; } @@ -1120,22 +1118,22 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (br_vlan_enabled(cfg.br->dev) && vg && cfg.entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { cfg.entry->vid = v->vid; - err = __br_mdb_add(net, cfg.br, cfg.p, cfg.entry, - mdb_attrs, extack); + err = __br_mdb_add(&cfg, mdb_attrs, extack); if (err) break; } } else { - err = __br_mdb_add(net, cfg.br, cfg.p, cfg.entry, mdb_attrs, - extack); + err = __br_mdb_add(&cfg, mdb_attrs, extack); } return err; } -static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry, +static int __br_mdb_del(const struct br_mdb_config *cfg, struct nlattr **mdb_attrs) { + struct br_mdb_entry *entry = cfg->entry; + struct net_bridge *br = cfg->br; struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; @@ -1206,10 +1204,10 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, if (br_vlan_enabled(cfg.br->dev) && vg && cfg.entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { cfg.entry->vid = v->vid; - err = __br_mdb_del(cfg.br, cfg.entry, mdb_attrs); + err = __br_mdb_del(&cfg, mdb_attrs); } } else { - err = __br_mdb_del(cfg.br, cfg.entry, mdb_attrs); + err = __br_mdb_del(&cfg, mdb_attrs); } return err; -- cgit v1.2.3-70-g09d2 From 9f52a51429791737eaf35c6fb5273d178ec005d9 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 6 Dec 2022 12:58:05 +0200 Subject: bridge: mcast: Use MDB group key from configuration structure The MDB group key (i.e., {source, destination, protocol, VID}) is currently determined under the multicast lock from the netlink attributes. Instead, use the group key from the MDB configuration structure that was prepared before acquiring the lock. No functional changes intended. Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- net/bridge/br_mdb.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 6017bff8316a..b459886af675 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -855,20 +855,19 @@ out: static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, struct br_mdb_entry *entry, - struct nlattr **mdb_attrs, + const struct br_mdb_config *cfg, struct netlink_ext_ack *extack) { struct net_bridge_mdb_entry *mp, *star_mp; struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; struct net_bridge_mcast *brmctx; - struct br_ip group, star_group; + struct br_ip group = cfg->group; unsigned long now = jiffies; unsigned char flags = 0; + struct br_ip star_group; u8 filter_mode; - __mdb_entry_to_br_ip(entry, &group, mdb_attrs); - brmctx = __br_mdb_choose_context(br, entry, extack); if (!brmctx) return -EINVAL; @@ -966,7 +965,7 @@ static int __br_mdb_add(const struct br_mdb_config *cfg, int ret; spin_lock_bh(&cfg->br->multicast_lock); - ret = br_mdb_add_group(cfg->br, cfg->p, cfg->entry, mdb_attrs, extack); + ret = br_mdb_add_group(cfg->br, cfg->p, cfg->entry, cfg, extack); spin_unlock_bh(&cfg->br->multicast_lock); return ret; @@ -1118,6 +1117,7 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (br_vlan_enabled(cfg.br->dev) && vg && cfg.entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { cfg.entry->vid = v->vid; + cfg.group.vid = v->vid; err = __br_mdb_add(&cfg, mdb_attrs, extack); if (err) break; @@ -1137,11 +1137,9 @@ static int __br_mdb_del(const struct br_mdb_config *cfg, struct net_bridge_mdb_entry *mp; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; - struct br_ip ip; + struct br_ip ip = cfg->group; int err = -EINVAL; - __mdb_entry_to_br_ip(entry, &ip, mdb_attrs); - spin_lock_bh(&br->multicast_lock); mp = br_mdb_ip_get(br, &ip); if (!mp) @@ -1204,6 +1202,7 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, if (br_vlan_enabled(cfg.br->dev) && vg && cfg.entry->vid == 0) { list_for_each_entry(v, &vg->vlan_list, vlist) { cfg.entry->vid = v->vid; + cfg.group.vid = v->vid; err = __br_mdb_del(&cfg, mdb_attrs); } } else { -- cgit v1.2.3-70-g09d2 From 3ee5662345f26c3d694e25f3e3b577deaaf31c0f Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 6 Dec 2022 12:58:06 +0200 Subject: bridge: mcast: Remove br_mdb_parse() The parsing of the netlink messages and the validity checks are now performed in br_mdb_config_init() so we can remove br_mdb_parse(). This finally allows us to stop passing netlink attributes deep in the MDB control path and only use the MDB configuration structure. Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- net/bridge/br_mdb.c | 93 +++-------------------------------------------------- 1 file changed, 5 insertions(+), 88 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index b459886af675..d0e018628f5d 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -754,73 +754,6 @@ static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = { sizeof(struct in6_addr)), }; -static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, - struct net_device **pdev, struct br_mdb_entry **pentry, - struct nlattr **mdb_attrs, struct netlink_ext_ack *extack) -{ - struct net *net = sock_net(skb->sk); - struct br_mdb_entry *entry; - struct br_port_msg *bpm; - struct nlattr *tb[MDBA_SET_ENTRY_MAX+1]; - struct net_device *dev; - int err; - - err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb, - MDBA_SET_ENTRY_MAX, NULL, NULL); - if (err < 0) - return err; - - bpm = nlmsg_data(nlh); - if (bpm->ifindex == 0) { - NL_SET_ERR_MSG_MOD(extack, "Invalid bridge ifindex"); - return -EINVAL; - } - - dev = __dev_get_by_index(net, bpm->ifindex); - if (dev == NULL) { - NL_SET_ERR_MSG_MOD(extack, "Bridge device doesn't exist"); - return -ENODEV; - } - - if (!netif_is_bridge_master(dev)) { - NL_SET_ERR_MSG_MOD(extack, "Device is not a bridge"); - return -EOPNOTSUPP; - } - - *pdev = dev; - - if (!tb[MDBA_SET_ENTRY]) { - NL_SET_ERR_MSG_MOD(extack, "Missing MDBA_SET_ENTRY attribute"); - return -EINVAL; - } - if (nla_len(tb[MDBA_SET_ENTRY]) != sizeof(struct br_mdb_entry)) { - NL_SET_ERR_MSG_MOD(extack, "Invalid MDBA_SET_ENTRY attribute length"); - return -EINVAL; - } - - entry = nla_data(tb[MDBA_SET_ENTRY]); - if (!is_valid_mdb_entry(entry, extack)) - return -EINVAL; - *pentry = entry; - - if (tb[MDBA_SET_ENTRY_ATTRS]) { - err = nla_parse_nested(mdb_attrs, MDBE_ATTR_MAX, - tb[MDBA_SET_ENTRY_ATTRS], - br_mdbe_attrs_pol, extack); - if (err) - return err; - if (mdb_attrs[MDBE_ATTR_SOURCE] && - !is_valid_mdb_source(mdb_attrs[MDBE_ATTR_SOURCE], - entry->addr.proto, extack)) - return -EINVAL; - } else { - memset(mdb_attrs, 0, - sizeof(struct nlattr *) * (MDBE_ATTR_MAX + 1)); - } - - return 0; -} - static struct net_bridge_mcast * __br_mdb_choose_context(struct net_bridge *br, const struct br_mdb_entry *entry, @@ -959,7 +892,6 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, } static int __br_mdb_add(const struct br_mdb_config *cfg, - struct nlattr **mdb_attrs, struct netlink_ext_ack *extack) { int ret; @@ -1084,23 +1016,16 @@ static int br_mdb_config_init(struct net *net, const struct nlmsghdr *nlh, static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { - struct nlattr *mdb_attrs[MDBE_ATTR_MAX + 1]; struct net *net = sock_net(skb->sk); struct net_bridge_vlan_group *vg; - struct br_mdb_entry *entry; struct net_bridge_vlan *v; struct br_mdb_config cfg; - struct net_device *dev; int err; err = br_mdb_config_init(net, nlh, &cfg, extack); if (err) return err; - err = br_mdb_parse(skb, nlh, &dev, &entry, mdb_attrs, extack); - if (err < 0) - return err; - if (cfg.p) { if (cfg.p->state == BR_STATE_DISABLED && cfg.entry->state != MDB_PERMANENT) { NL_SET_ERR_MSG_MOD(extack, "Port is in disabled state and entry is not permanent"); @@ -1118,19 +1043,18 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, list_for_each_entry(v, &vg->vlan_list, vlist) { cfg.entry->vid = v->vid; cfg.group.vid = v->vid; - err = __br_mdb_add(&cfg, mdb_attrs, extack); + err = __br_mdb_add(&cfg, extack); if (err) break; } } else { - err = __br_mdb_add(&cfg, mdb_attrs, extack); + err = __br_mdb_add(&cfg, extack); } return err; } -static int __br_mdb_del(const struct br_mdb_config *cfg, - struct nlattr **mdb_attrs) +static int __br_mdb_del(const struct br_mdb_config *cfg) { struct br_mdb_entry *entry = cfg->entry; struct net_bridge *br = cfg->br; @@ -1174,23 +1098,16 @@ unlock: static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { - struct nlattr *mdb_attrs[MDBE_ATTR_MAX + 1]; struct net *net = sock_net(skb->sk); struct net_bridge_vlan_group *vg; - struct br_mdb_entry *entry; struct net_bridge_vlan *v; struct br_mdb_config cfg; - struct net_device *dev; int err; err = br_mdb_config_init(net, nlh, &cfg, extack); if (err) return err; - err = br_mdb_parse(skb, nlh, &dev, &entry, mdb_attrs, extack); - if (err < 0) - return err; - if (cfg.p) vg = nbp_vlan_group(cfg.p); else @@ -1203,10 +1120,10 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, list_for_each_entry(v, &vg->vlan_list, vlist) { cfg.entry->vid = v->vid; cfg.group.vid = v->vid; - err = __br_mdb_del(&cfg, mdb_attrs); + err = __br_mdb_del(&cfg); } } else { - err = __br_mdb_del(&cfg, mdb_attrs); + err = __br_mdb_del(&cfg); } return err; -- cgit v1.2.3-70-g09d2 From 4c1ebc6c1f2196c11cff78832c2604bbcec7d088 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 6 Dec 2022 12:58:07 +0200 Subject: bridge: mcast: Move checks out of critical section The checks only require information parsed from the RTM_NEWMDB netlink message and do not rely on any state stored in the bridge driver. Therefore, there is no need to perform the checks in the critical section under the multicast lock. Move the checks out of the critical section. Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- net/bridge/br_mdb.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index d0e018628f5d..d954d8f7cb0a 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -805,24 +805,6 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, if (!brmctx) return -EINVAL; - /* host join errors which can happen before creating the group */ - if (!port && !br_group_is_l2(&group)) { - /* don't allow any flags for host-joined IP groups */ - if (entry->state) { - NL_SET_ERR_MSG_MOD(extack, "Flags are not allowed for host groups"); - return -EINVAL; - } - if (!br_multicast_is_star_g(&group)) { - NL_SET_ERR_MSG_MOD(extack, "Groups with sources cannot be manually host joined"); - return -EINVAL; - } - } - - if (br_group_is_l2(&group) && entry->state != MDB_PERMANENT) { - NL_SET_ERR_MSG_MOD(extack, "Only permanent L2 entries allowed"); - return -EINVAL; - } - mp = br_multicast_new_group(br, &group); if (IS_ERR(mp)) return PTR_ERR(mp); @@ -1026,6 +1008,24 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) return err; + /* host join errors which can happen before creating the group */ + if (!cfg.p && !br_group_is_l2(&cfg.group)) { + /* don't allow any flags for host-joined IP groups */ + if (cfg.entry->state) { + NL_SET_ERR_MSG_MOD(extack, "Flags are not allowed for host groups"); + return -EINVAL; + } + if (!br_multicast_is_star_g(&cfg.group)) { + NL_SET_ERR_MSG_MOD(extack, "Groups with sources cannot be manually host joined"); + return -EINVAL; + } + } + + if (br_group_is_l2(&cfg.group) && cfg.entry->state != MDB_PERMANENT) { + NL_SET_ERR_MSG_MOD(extack, "Only permanent L2 entries allowed"); + return -EINVAL; + } + if (cfg.p) { if (cfg.p->state == BR_STATE_DISABLED && cfg.entry->state != MDB_PERMANENT) { NL_SET_ERR_MSG_MOD(extack, "Port is in disabled state and entry is not permanent"); -- cgit v1.2.3-70-g09d2 From 090149eaf3911ce588775107b25c725d287752ef Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 6 Dec 2022 12:58:08 +0200 Subject: bridge: mcast: Remove redundant function arguments Drop the first three arguments and instead extract them from the MDB configuration structure. Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- net/bridge/br_mdb.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index d954d8f7cb0a..ae7d93c08880 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -786,13 +786,14 @@ out: return brmctx; } -static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, - struct br_mdb_entry *entry, - const struct br_mdb_config *cfg, +static int br_mdb_add_group(const struct br_mdb_config *cfg, struct netlink_ext_ack *extack) { struct net_bridge_mdb_entry *mp, *star_mp; struct net_bridge_port_group __rcu **pp; + struct br_mdb_entry *entry = cfg->entry; + struct net_bridge_port *port = cfg->p; + struct net_bridge *br = cfg->br; struct net_bridge_port_group *p; struct net_bridge_mcast *brmctx; struct br_ip group = cfg->group; @@ -879,7 +880,7 @@ static int __br_mdb_add(const struct br_mdb_config *cfg, int ret; spin_lock_bh(&cfg->br->multicast_lock); - ret = br_mdb_add_group(cfg->br, cfg->p, cfg->entry, cfg, extack); + ret = br_mdb_add_group(cfg, extack); spin_unlock_bh(&cfg->br->multicast_lock); return ret; -- cgit v1.2.3-70-g09d2 From f86c3e2c1b5ea5c959ef176541c2f831231fa631 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 6 Dec 2022 12:58:09 +0200 Subject: bridge: mcast: Constify 'group' argument in br_multicast_new_port_group() The 'group' argument is not modified, so mark it as 'const'. It will allow us to constify arguments of the callers of this function in future patches. Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- net/bridge/br_multicast.c | 2 +- net/bridge/br_private.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 5e988f0ed2c0..db4c3900ae95 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1273,7 +1273,7 @@ br_multicast_new_group_src(struct net_bridge_port_group *pg, struct br_ip *src_i struct net_bridge_port_group *br_multicast_new_port_group( struct net_bridge_port *port, - struct br_ip *group, + const struct br_ip *group, struct net_bridge_port_group __rcu *next, unsigned char flags, const unsigned char *src, diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 0a09f10966dc..3997e16c15fc 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -941,7 +941,8 @@ br_mdb_ip_get(struct net_bridge *br, struct br_ip *dst); struct net_bridge_mdb_entry * br_multicast_new_group(struct net_bridge *br, struct br_ip *group); struct net_bridge_port_group * -br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group, +br_multicast_new_port_group(struct net_bridge_port *port, + const struct br_ip *group, struct net_bridge_port_group __rcu *next, unsigned char flags, const unsigned char *src, u8 filter_mode, u8 rt_protocol); -- cgit v1.2.3-70-g09d2 From c0bea69d1ca7974a6a387dbc8d9ca15345e2779f Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 6 Dec 2022 20:51:13 +0200 Subject: devlink: Validate port function request In order to avoid partial request processing, validate the request before processing it. Signed-off-by: Shay Drory Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index 907df7124157..035249c5dd17 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1632,11 +1632,6 @@ static int devlink_port_function_hw_addr_set(struct devlink_port *port, } } - if (!ops->port_function_hw_addr_set) { - NL_SET_ERR_MSG_MOD(extack, "Port doesn't support function attributes"); - return -EOPNOTSUPP; - } - return ops->port_function_hw_addr_set(port, hw_addr, hw_addr_len, extack); } @@ -1650,12 +1645,27 @@ static int devlink_port_fn_state_set(struct devlink_port *port, state = nla_get_u8(attr); ops = port->devlink->ops; - if (!ops->port_fn_state_set) { - NL_SET_ERR_MSG_MOD(extack, - "Function does not support state setting"); + return ops->port_fn_state_set(port, state, extack); +} + +static int devlink_port_function_validate(struct devlink_port *devlink_port, + struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + const struct devlink_ops *ops = devlink_port->devlink->ops; + + if (tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] && + !ops->port_function_hw_addr_set) { + NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR], + "Port doesn't support function attributes"); return -EOPNOTSUPP; } - return ops->port_fn_state_set(port, state, extack); + if (tb[DEVLINK_PORT_FN_ATTR_STATE] && !ops->port_fn_state_set) { + NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR], + "Function does not support state setting"); + return -EOPNOTSUPP; + } + return 0; } static int devlink_port_function_set(struct devlink_port *port, @@ -1672,6 +1682,10 @@ static int devlink_port_function_set(struct devlink_port *port, return err; } + err = devlink_port_function_validate(port, tb, extack); + if (err) + return err; + attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]; if (attr) { err = devlink_port_function_hw_addr_set(port, attr, extack); -- cgit v1.2.3-70-g09d2 From da65e9ff3bf614d2836e38e1d405c7073e6ba3b7 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 6 Dec 2022 20:51:15 +0200 Subject: devlink: Expose port function commands to control RoCE Expose port function commands to enable / disable RoCE, this is used to control the port RoCE device capabilities. When RoCE is disabled for a function of the port, function cannot create any RoCE specific resources (e.g GID table). It also saves system memory utilization. For example disabling RoCE enable a VF/SF saves 1 Mbytes of system memory per function. Example of a PCI VF port which supports function configuration: Set RoCE of the VF's port function. $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 roce enable $ devlink port function set pci/0000:06:00.0/2 roce disable $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 roce disable Signed-off-by: Shay Drory Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- Documentation/networking/devlink/devlink-port.rst | 34 ++++++- include/net/devlink.h | 18 ++++ include/uapi/linux/devlink.h | 10 ++ net/core/devlink.c | 113 ++++++++++++++++++++++ 4 files changed, 174 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-port.rst b/Documentation/networking/devlink/devlink-port.rst index 2c637f4aae8e..c3302d23e480 100644 --- a/Documentation/networking/devlink/devlink-port.rst +++ b/Documentation/networking/devlink/devlink-port.rst @@ -110,7 +110,7 @@ devlink ports for both the controllers. Function configuration ====================== -A user can configure the function attribute before enumerating the PCI +Users can configure one or more function attributes before enumerating the PCI function. Usually it means, user should configure function attribute before a bus specific device for the function is created. However, when SRIOV is enabled, virtual function devices are created on the PCI bus. @@ -122,6 +122,9 @@ A user may set the hardware address of the function using `devlink port function set hw_addr` command. For Ethernet port function this means a MAC address. +Users may also set the RoCE capability of the function using +`devlink port function set roce` command. + Function attributes =================== @@ -162,6 +165,35 @@ device created for the PCI VF/SF. function: hw_addr 00:00:00:00:88:88 +RoCE capability setup +--------------------- +Not all PCI VFs/SFs require RoCE capability. + +When RoCE capability is disabled, it saves system memory per PCI VF/SF. + +When user disables RoCE capability for a VF/SF, user application cannot send or +receive any RoCE packets through this VF/SF and RoCE GID table for this PCI +will be empty. + +When RoCE capability is disabled in the device using port function attribute, +VF/SF driver cannot override it. + +- Get RoCE capability of the VF device:: + + $ devlink port show pci/0000:06:00.0/2 + pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 + function: + hw_addr 00:00:00:00:00:00 roce enable + +- Set RoCE capability of the VF device:: + + $ devlink port function set pci/0000:06:00.0/2 roce disable + + $ devlink port show pci/0000:06:00.0/2 + pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 + function: + hw_addr 00:00:00:00:00:00 roce disable + Subfunction ============ diff --git a/include/net/devlink.h b/include/net/devlink.h index 5f6eca5e4a40..ce4c65d2f2e7 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1451,6 +1451,24 @@ struct devlink_ops { int (*port_function_hw_addr_set)(struct devlink_port *port, const u8 *hw_addr, int hw_addr_len, struct netlink_ext_ack *extack); + /** + * @port_fn_roce_get: Port function's roce get function. + * + * Query RoCE state of a function managed by the devlink port. + * Return -EOPNOTSUPP if port function RoCE handling is not supported. + */ + int (*port_fn_roce_get)(struct devlink_port *devlink_port, + bool *is_enable, + struct netlink_ext_ack *extack); + /** + * @port_fn_roce_set: Port function's roce set function. + * + * Enable/Disable the RoCE state of a function managed by the devlink + * port. + * Return -EOPNOTSUPP if port function RoCE handling is not supported. + */ + int (*port_fn_roce_set)(struct devlink_port *devlink_port, + bool enable, struct netlink_ext_ack *extack); /** * port_new() - Add a new port function of a specified flavor * @devlink: Devlink instance diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 70191d96af89..6cc2925bd478 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -658,11 +658,21 @@ enum devlink_resource_unit { DEVLINK_RESOURCE_UNIT_ENTRY, }; +enum devlink_port_fn_attr_cap { + DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT, + + /* Add new caps above */ + __DEVLINK_PORT_FN_ATTR_CAPS_MAX, +}; + +#define DEVLINK_PORT_FN_CAP_ROCE _BITUL(DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT) + enum devlink_port_function_attr { DEVLINK_PORT_FUNCTION_ATTR_UNSPEC, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, /* binary */ DEVLINK_PORT_FN_ATTR_STATE, /* u8 */ DEVLINK_PORT_FN_ATTR_OPSTATE, /* u8 */ + DEVLINK_PORT_FN_ATTR_CAPS, /* bitfield32 */ __DEVLINK_PORT_FUNCTION_ATTR_MAX, DEVLINK_PORT_FUNCTION_ATTR_MAX = __DEVLINK_PORT_FUNCTION_ATTR_MAX - 1 diff --git a/net/core/devlink.c b/net/core/devlink.c index 035249c5dd17..8c0ad52431c5 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -195,11 +195,16 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report); +#define DEVLINK_PORT_FN_CAPS_VALID_MASK \ + (_BITUL(__DEVLINK_PORT_FN_ATTR_CAPS_MAX) - 1) + static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = { [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY }, [DEVLINK_PORT_FN_ATTR_STATE] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE, DEVLINK_PORT_FN_STATE_ACTIVE), + [DEVLINK_PORT_FN_ATTR_CAPS] = + NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK), }; static const struct nla_policy devlink_selftest_nl_policy[DEVLINK_ATTR_SELFTEST_ID_MAX + 1] = { @@ -680,6 +685,60 @@ devlink_sb_tc_index_get_from_attrs(struct devlink_sb *devlink_sb, return 0; } +static void devlink_port_fn_cap_fill(struct nla_bitfield32 *caps, + u32 cap, bool is_enable) +{ + caps->selector |= cap; + if (is_enable) + caps->value |= cap; +} + +static int devlink_port_fn_roce_fill(const struct devlink_ops *ops, + struct devlink_port *devlink_port, + struct nla_bitfield32 *caps, + struct netlink_ext_ack *extack) +{ + bool is_enable; + int err; + + if (!ops->port_fn_roce_get) + return 0; + + err = ops->port_fn_roce_get(devlink_port, &is_enable, extack); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_ROCE, is_enable); + return 0; +} + +static int devlink_port_fn_caps_fill(const struct devlink_ops *ops, + struct devlink_port *devlink_port, + struct sk_buff *msg, + struct netlink_ext_ack *extack, + bool *msg_updated) +{ + struct nla_bitfield32 caps = {}; + int err; + + err = devlink_port_fn_roce_fill(ops, devlink_port, &caps, extack); + if (err) + return err; + + if (!caps.selector) + return 0; + err = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value, + caps.selector); + if (err) + return err; + + *msg_updated = true; + return 0; +} + static int devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb, struct genl_info *info, @@ -1263,6 +1322,35 @@ static int devlink_port_fn_state_fill(const struct devlink_ops *ops, return 0; } +static int +devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable, + struct netlink_ext_ack *extack) +{ + const struct devlink_ops *ops = devlink_port->devlink->ops; + + return ops->port_fn_roce_set(devlink_port, enable, extack); +} + +static int devlink_port_fn_caps_set(struct devlink_port *devlink_port, + const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nla_bitfield32 caps; + u32 caps_value; + int err; + + caps = nla_get_bitfield32(attr); + caps_value = caps.value & caps.selector; + if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE) { + err = devlink_port_fn_roce_set(devlink_port, + caps_value & DEVLINK_PORT_FN_CAP_ROCE, + extack); + if (err) + return err; + } + return 0; +} + static int devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, struct netlink_ext_ack *extack) @@ -1281,6 +1369,10 @@ devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *por &msg_updated); if (err) goto out; + err = devlink_port_fn_caps_fill(ops, port, msg, extack, + &msg_updated); + if (err) + goto out; err = devlink_port_fn_state_fill(ops, port, msg, extack, &msg_updated); out: if (err || !msg_updated) @@ -1653,6 +1745,7 @@ static int devlink_port_function_validate(struct devlink_port *devlink_port, struct netlink_ext_ack *extack) { const struct devlink_ops *ops = devlink_port->devlink->ops; + struct nlattr *attr; if (tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] && !ops->port_function_hw_addr_set) { @@ -1665,6 +1758,18 @@ static int devlink_port_function_validate(struct devlink_port *devlink_port, "Function does not support state setting"); return -EOPNOTSUPP; } + attr = tb[DEVLINK_PORT_FN_ATTR_CAPS]; + if (attr) { + struct nla_bitfield32 caps; + + caps = nla_get_bitfield32(attr); + if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE && + !ops->port_fn_roce_set) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "Port doesn't support RoCE function attribute"); + return -EOPNOTSUPP; + } + } return 0; } @@ -1692,6 +1797,14 @@ static int devlink_port_function_set(struct devlink_port *port, if (err) return err; } + + attr = tb[DEVLINK_PORT_FN_ATTR_CAPS]; + if (attr) { + err = devlink_port_fn_caps_set(port, attr, extack); + if (err) + return err; + } + /* Keep this as the last function attribute set, so that when * multiple port function attributes are set along with state, * Those can be applied first before activating the state. -- cgit v1.2.3-70-g09d2 From a8ce7b26a51efc4d7753b23d639ae092878a6193 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 6 Dec 2022 20:51:18 +0200 Subject: devlink: Expose port function commands to control migratable Expose port function commands to enable / disable migratable capability, this is used to set the port function as migratable. Live migration is the process of transferring a live virtual machine from one physical host to another without disrupting its normal operation. In order for a VM to be able to perform LM, all the VM components must be able to perform migration. e.g.: to be migratable. In order for VF to be migratable, VF must be bound to VFIO driver with migration support. When migratable capability is enabled for a function of the port, the device is making the necessary preparations for the function to be migratable, which might include disabling features which cannot be migrated. Example of LM with migratable function configuration: Set migratable of the VF's port function. $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 migratable disable $ devlink port function set pci/0000:06:00.0/2 migratable enable $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 migratable enable Bind VF to VFIO driver with migration support: $ echo > /sys/bus/pci/devices/0000:08:00.0/driver/unbind $ echo mlx5_vfio_pci > /sys/bus/pci/devices/0000:08:00.0/driver_override $ echo > /sys/bus/pci/devices/0000:08:00.0/driver/bind Attach VF to the VM. Start the VM. Perform LM. Signed-off-by: Shay Drory Reviewed-by: Jiri Pirko Acked-by: Shannon Nelson Signed-off-by: Jakub Kicinski --- Documentation/networking/devlink/devlink-port.rst | 46 +++++++++++++++++++ include/net/devlink.h | 21 +++++++++ include/uapi/linux/devlink.h | 3 ++ net/core/devlink.c | 55 +++++++++++++++++++++++ 4 files changed, 125 insertions(+) (limited to 'net') diff --git a/Documentation/networking/devlink/devlink-port.rst b/Documentation/networking/devlink/devlink-port.rst index c3302d23e480..3da590953ce8 100644 --- a/Documentation/networking/devlink/devlink-port.rst +++ b/Documentation/networking/devlink/devlink-port.rst @@ -125,6 +125,9 @@ this means a MAC address. Users may also set the RoCE capability of the function using `devlink port function set roce` command. +Users may also set the function as migratable using +'devlink port function set migratable' command. + Function attributes =================== @@ -194,6 +197,49 @@ VF/SF driver cannot override it. function: hw_addr 00:00:00:00:00:00 roce disable +migratable capability setup +--------------------------- +Live migration is the process of transferring a live virtual machine +from one physical host to another without disrupting its normal +operation. + +User who want PCI VFs to be able to perform live migration need to +explicitly enable the VF migratable capability. + +When user enables migratable capability for a VF, and the HV binds the VF to VFIO driver +with migration support, the user can migrate the VM with this VF from one HV to a +different one. + +However, when migratable capability is enable, device will disable features which cannot +be migrated. Thus migratable cap can impose limitations on a VF so let the user decide. + +Example of LM with migratable function configuration: +- Get migratable capability of the VF device:: + + $ devlink port show pci/0000:06:00.0/2 + pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 + function: + hw_addr 00:00:00:00:00:00 migratable disable + +- Set migratable capability of the VF device:: + + $ devlink port function set pci/0000:06:00.0/2 migratable enable + + $ devlink port show pci/0000:06:00.0/2 + pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 + function: + hw_addr 00:00:00:00:00:00 migratable enable + +- Bind VF to VFIO driver with migration support:: + + $ echo > /sys/bus/pci/devices/0000:08:00.0/driver/unbind + $ echo mlx5_vfio_pci > /sys/bus/pci/devices/0000:08:00.0/driver_override + $ echo > /sys/bus/pci/devices/0000:08:00.0/driver/bind + +Attach VF to the VM. +Start the VM. +Perform live migration. + Subfunction ============ diff --git a/include/net/devlink.h b/include/net/devlink.h index ce4c65d2f2e7..0f376a28b9c4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1469,6 +1469,27 @@ struct devlink_ops { */ int (*port_fn_roce_set)(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack); + /** + * @port_fn_migratable_get: Port function's migratable get function. + * + * Query migratable state of a function managed by the devlink port. + * Return -EOPNOTSUPP if port function migratable handling is not + * supported. + */ + int (*port_fn_migratable_get)(struct devlink_port *devlink_port, + bool *is_enable, + struct netlink_ext_ack *extack); + /** + * @port_fn_migratable_set: Port function's migratable set function. + * + * Enable/Disable migratable state of a function managed by the devlink + * port. + * Return -EOPNOTSUPP if port function migratable handling is not + * supported. + */ + int (*port_fn_migratable_set)(struct devlink_port *devlink_port, + bool enable, + struct netlink_ext_ack *extack); /** * port_new() - Add a new port function of a specified flavor * @devlink: Devlink instance diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 6cc2925bd478..3782d4219ac9 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -660,12 +660,15 @@ enum devlink_resource_unit { enum devlink_port_fn_attr_cap { DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT, + DEVLINK_PORT_FN_ATTR_CAP_MIGRATABLE_BIT, /* Add new caps above */ __DEVLINK_PORT_FN_ATTR_CAPS_MAX, }; #define DEVLINK_PORT_FN_CAP_ROCE _BITUL(DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT) +#define DEVLINK_PORT_FN_CAP_MIGRATABLE \ + _BITUL(DEVLINK_PORT_FN_ATTR_CAP_MIGRATABLE_BIT) enum devlink_port_function_attr { DEVLINK_PORT_FUNCTION_ATTR_UNSPEC, diff --git a/net/core/devlink.c b/net/core/devlink.c index 8c0ad52431c5..ab40ebcb4aea 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -715,6 +715,29 @@ static int devlink_port_fn_roce_fill(const struct devlink_ops *ops, return 0; } +static int devlink_port_fn_migratable_fill(const struct devlink_ops *ops, + struct devlink_port *devlink_port, + struct nla_bitfield32 *caps, + struct netlink_ext_ack *extack) +{ + bool is_enable; + int err; + + if (!ops->port_fn_migratable_get || + devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) + return 0; + + err = ops->port_fn_migratable_get(devlink_port, &is_enable, extack); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_MIGRATABLE, is_enable); + return 0; +} + static int devlink_port_fn_caps_fill(const struct devlink_ops *ops, struct devlink_port *devlink_port, struct sk_buff *msg, @@ -728,6 +751,10 @@ static int devlink_port_fn_caps_fill(const struct devlink_ops *ops, if (err) return err; + err = devlink_port_fn_migratable_fill(ops, devlink_port, &caps, extack); + if (err) + return err; + if (!caps.selector) return 0; err = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value, @@ -1322,6 +1349,15 @@ static int devlink_port_fn_state_fill(const struct devlink_ops *ops, return 0; } +static int +devlink_port_fn_mig_set(struct devlink_port *devlink_port, bool enable, + struct netlink_ext_ack *extack) +{ + const struct devlink_ops *ops = devlink_port->devlink->ops; + + return ops->port_fn_migratable_set(devlink_port, enable, extack); +} + static int devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) @@ -1348,6 +1384,13 @@ static int devlink_port_fn_caps_set(struct devlink_port *devlink_port, if (err) return err; } + if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) { + err = devlink_port_fn_mig_set(devlink_port, caps_value & + DEVLINK_PORT_FN_CAP_MIGRATABLE, + extack); + if (err) + return err; + } return 0; } @@ -1769,6 +1812,18 @@ static int devlink_port_function_validate(struct devlink_port *devlink_port, "Port doesn't support RoCE function attribute"); return -EOPNOTSUPP; } + if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) { + if (!ops->port_fn_migratable_set) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "Port doesn't support migratable function attribute"); + return -EOPNOTSUPP; + } + if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "migratable function attribute supported for VFs only"); + return -EOPNOTSUPP; + } + } } return 0; } -- cgit v1.2.3-70-g09d2 From 895fa59647cd64da99eebd1199cf27ecce08c17c Mon Sep 17 00:00:00 2001 From: Li Qiong Date: Tue, 6 Dec 2022 15:44:14 +0800 Subject: netfilter: flowtable: add a 'default' case to flowtable datapath Add a 'default' case in case return a uninitialized value of ret, this should not ever happen since the follow transmit path types: - FLOW_OFFLOAD_XMIT_UNSPEC - FLOW_OFFLOAD_XMIT_TC are never observed from this path. Add this check for safety reasons. Signed-off-by: Li Qiong Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index b350fe9d00b0..19efba1e51ef 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -421,6 +421,10 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (ret == NF_DROP) flow_offload_teardown(flow); break; + default: + WARN_ON_ONCE(1); + ret = NF_DROP; + break; } return ret; @@ -682,6 +686,10 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, if (ret == NF_DROP) flow_offload_teardown(flow); break; + default: + WARN_ON_ONCE(1); + ret = NF_DROP; + break; } return ret; -- cgit v1.2.3-70-g09d2 From b534dc46c8ae0165b1b2509be24dbea4fa9c4011 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 7 Dec 2022 09:37:01 -0500 Subject: net_tstamp: add SOF_TIMESTAMPING_OPT_ID_TCP Add an option to initialize SOF_TIMESTAMPING_OPT_ID for TCP from write_seq sockets instead of snd_una. This should have been the behavior from the start. Because processes may now exist that rely on the established behavior, do not change behavior of the existing option, but add the right behavior with a new flag. It is encouraged to always set SOF_TIMESTAMPING_OPT_ID_TCP on stream sockets along with the existing SOF_TIMESTAMPING_OPT_ID. Intuitively the contract is that the counter is zero after the setsockopt, so that the next write N results in a notification for the last byte N - 1. On idle sockets snd_una == write_seq and this holds for both. But on sockets with data in transmission, snd_una records the unacked offset in the stream. This depends on the ACK response from the peer. A process cannot learn this in a race free manner (ioctl SIOCOUTQ is one racy approach). write_seq records the offset at the last byte written by the process. This is a better starting point. It matches the intuitive contract in all circumstances, unaffected by external behavior. The new timestamp flag necessitates increasing sk_tsflags to 32 bits. Move the field in struct sock to avoid growing the socket (for some common CONFIG variants). The UAPI interface so_timestamping.flags is already int, so 32 bits wide. Reported-by: Sotirios Delimanolis Signed-off-by: Willem de Bruijn Link: https://lore.kernel.org/r/20221207143701.29861-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- Documentation/networking/timestamping.rst | 32 ++++++++++++++++++++++++++++++- include/net/sock.h | 6 +++--- include/uapi/linux/net_tstamp.h | 3 ++- net/core/sock.c | 9 ++++++++- net/ethtool/common.c | 1 + 5 files changed, 45 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/Documentation/networking/timestamping.rst b/Documentation/networking/timestamping.rst index be4eb1242057..f17c01834a12 100644 --- a/Documentation/networking/timestamping.rst +++ b/Documentation/networking/timestamping.rst @@ -179,7 +179,8 @@ SOF_TIMESTAMPING_OPT_ID: identifier and returns that along with the timestamp. The identifier is derived from a per-socket u32 counter (that wraps). For datagram sockets, the counter increments with each sent packet. For stream - sockets, it increments with every byte. + sockets, it increments with every byte. For stream sockets, also set + SOF_TIMESTAMPING_OPT_ID_TCP, see the section below. The counter starts at zero. It is initialized the first time that the socket option is enabled. It is reset each time the option is @@ -192,6 +193,35 @@ SOF_TIMESTAMPING_OPT_ID: among all possibly concurrently outstanding timestamp requests for that socket. +SOF_TIMESTAMPING_OPT_ID_TCP: + Pass this modifier along with SOF_TIMESTAMPING_OPT_ID for new TCP + timestamping applications. SOF_TIMESTAMPING_OPT_ID defines how the + counter increments for stream sockets, but its starting point is + not entirely trivial. This option fixes that. + + For stream sockets, if SOF_TIMESTAMPING_OPT_ID is set, this should + always be set too. On datagram sockets the option has no effect. + + A reasonable expectation is that the counter is reset to zero with + the system call, so that a subsequent write() of N bytes generates + a timestamp with counter N-1. SOF_TIMESTAMPING_OPT_ID_TCP + implements this behavior under all conditions. + + SOF_TIMESTAMPING_OPT_ID without modifier often reports the same, + especially when the socket option is set when no data is in + transmission. If data is being transmitted, it may be off by the + length of the output queue (SIOCOUTQ). + + The difference is due to being based on snd_una versus write_seq. + snd_una is the offset in the stream acknowledged by the peer. This + depends on factors outside of process control, such as network RTT. + write_seq is the last byte written by the process. This offset is + not affected by external inputs. + + The difference is subtle and unlikely to be noticed when configured + at initial socket creation, when no data is queued or sent. But + SOF_TIMESTAMPING_OPT_ID_TCP behavior is more robust regardless of + when the socket option is set. SOF_TIMESTAMPING_OPT_CMSG: Support recv() cmsg for all timestamped packets. Control messages diff --git a/include/net/sock.h b/include/net/sock.h index 6d207e7c4ad0..ecea3dcc2217 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -503,10 +503,10 @@ struct sock { #if BITS_PER_LONG==32 seqlock_t sk_stamp_seq; #endif - u16 sk_tsflags; - u8 sk_shutdown; atomic_t sk_tskey; atomic_t sk_zckey; + u32 sk_tsflags; + u8 sk_shutdown; u8 sk_clockid; u8 sk_txtime_deadline_mode : 1, @@ -1899,7 +1899,7 @@ static inline void sock_replace_proto(struct sock *sk, struct proto *proto) struct sockcm_cookie { u64 transmit_time; u32 mark; - u16 tsflags; + u32 tsflags; }; static inline void sockcm_init(struct sockcm_cookie *sockc, diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 55501e5e7ac8..a2c66b3d7f0f 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -31,8 +31,9 @@ enum { SOF_TIMESTAMPING_OPT_PKTINFO = (1<<13), SOF_TIMESTAMPING_OPT_TX_SWHW = (1<<14), SOF_TIMESTAMPING_BIND_PHC = (1 << 15), + SOF_TIMESTAMPING_OPT_ID_TCP = (1 << 16), - SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_BIND_PHC, + SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_ID_TCP, SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) | SOF_TIMESTAMPING_LAST }; diff --git a/net/core/sock.c b/net/core/sock.c index 4571914a4aa8..b0ab841e0aed 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -901,13 +901,20 @@ int sock_set_timestamping(struct sock *sk, int optname, if (val & ~SOF_TIMESTAMPING_MASK) return -EINVAL; + if (val & SOF_TIMESTAMPING_OPT_ID_TCP && + !(val & SOF_TIMESTAMPING_OPT_ID)) + return -EINVAL; + if (val & SOF_TIMESTAMPING_OPT_ID && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { if (sk_is_tcp(sk)) { if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) return -EINVAL; - atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); + if (val & SOF_TIMESTAMPING_OPT_ID_TCP) + atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); + else + atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); } else { atomic_set(&sk->sk_tskey, 0); } diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 21cfe8557205..6f399afc2ff2 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -417,6 +417,7 @@ const char sof_timestamping_names[][ETH_GSTRING_LEN] = { [const_ilog2(SOF_TIMESTAMPING_OPT_PKTINFO)] = "option-pktinfo", [const_ilog2(SOF_TIMESTAMPING_OPT_TX_SWHW)] = "option-tx-swhw", [const_ilog2(SOF_TIMESTAMPING_BIND_PHC)] = "bind-phc", + [const_ilog2(SOF_TIMESTAMPING_OPT_ID_TCP)] = "option-id-tcp", }; static_assert(ARRAY_SIZE(sof_timestamping_names) == __SOF_TIMESTAMPING_CNT); -- cgit v1.2.3-70-g09d2 From abe2343d37c2b4361547d5d31e17340ff9ec7356 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 7 Dec 2022 09:23:14 +0000 Subject: xfrm: Fix spelling mistake "oflload" -> "offload" There is a spelling mistake in a NL_SET_ERR_MSG message. Fix it. Signed-off-by: Colin Ian King Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 3e9e874522a8..4aff76c6f12e 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -379,7 +379,7 @@ int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, default: xdo->dev = NULL; dev_put(dev); - NL_SET_ERR_MSG(extack, "Unrecognized oflload direction"); + NL_SET_ERR_MSG(extack, "Unrecognized offload direction"); return -EINVAL; } -- cgit v1.2.3-70-g09d2 From 44aa5a6dba8283bfda28b1517af4de711c5652a4 Mon Sep 17 00:00:00 2001 From: Artem Chernyshev Date: Tue, 6 Dec 2022 09:58:34 +0300 Subject: net: vmw_vsock: vmci: Check memcpy_from_msg() vmci_transport_dgram_enqueue() does not check the return value of memcpy_from_msg(). If memcpy_from_msg() fails, it is possible that uninitialized memory contents are sent unintentionally instead of user's message in the datagram to the destination. Return with an error if memcpy_from_msg() fails. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: 0f7db23a07af ("vmci_transport: switch ->enqeue_dgram, ->enqueue_stream and ->dequeue_stream to msghdr") Signed-off-by: Artem Chernyshev Reviewed-by: Stefano Garzarella Reviewed-by: Vishnu Dasa Signed-off-by: David S. Miller --- net/vmw_vsock/vmci_transport.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 842c94286d31..36eb16a40745 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1711,7 +1711,11 @@ static int vmci_transport_dgram_enqueue( if (!dg) return -ENOMEM; - memcpy_from_msg(VMCI_DG_PAYLOAD(dg), msg, len); + err = memcpy_from_msg(VMCI_DG_PAYLOAD(dg), msg, len); + if (err) { + kfree(dg); + return err; + } dg->dst = vmci_make_handle(remote_addr->svm_cid, remote_addr->svm_port); -- cgit v1.2.3-70-g09d2 From 7f0e810220e2d985338ecdd907c1598404db251d Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Tue, 6 Dec 2022 10:55:11 -0300 Subject: net/sched: add retpoline wrapper for tc On kernels using retpoline as a spectrev2 mitigation, optimize actions and filters that are compiled as built-ins into a direct call. On subsequent patches we expose the classifiers and actions functions and wire up the wrapper into tc. Signed-off-by: Pedro Tammela Reviewed-by: Jamal Hadi Salim Reviewed-by: Victor Nogueira Signed-off-by: David S. Miller --- include/net/tc_wrapper.h | 251 +++++++++++++++++++++++++++++++++++++++++++++++ net/sched/sch_api.c | 5 + 2 files changed, 256 insertions(+) create mode 100644 include/net/tc_wrapper.h (limited to 'net') diff --git a/include/net/tc_wrapper.h b/include/net/tc_wrapper.h new file mode 100644 index 000000000000..ceed2fc089ff --- /dev/null +++ b/include/net/tc_wrapper.h @@ -0,0 +1,251 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NET_TC_WRAPPER_H +#define __NET_TC_WRAPPER_H + +#include + +#if IS_ENABLED(CONFIG_RETPOLINE) + +#include +#include +#include + +#define TC_INDIRECT_SCOPE + +extern struct static_key_false tc_skip_wrapper; + +/* TC Actions */ +#ifdef CONFIG_NET_CLS_ACT + +#define TC_INDIRECT_ACTION_DECLARE(fname) \ + INDIRECT_CALLABLE_DECLARE(int fname(struct sk_buff *skb, \ + const struct tc_action *a, \ + struct tcf_result *res)) + +TC_INDIRECT_ACTION_DECLARE(tcf_bpf_act); +TC_INDIRECT_ACTION_DECLARE(tcf_connmark_act); +TC_INDIRECT_ACTION_DECLARE(tcf_csum_act); +TC_INDIRECT_ACTION_DECLARE(tcf_ct_act); +TC_INDIRECT_ACTION_DECLARE(tcf_ctinfo_act); +TC_INDIRECT_ACTION_DECLARE(tcf_gact_act); +TC_INDIRECT_ACTION_DECLARE(tcf_gate_act); +TC_INDIRECT_ACTION_DECLARE(tcf_ife_act); +TC_INDIRECT_ACTION_DECLARE(tcf_ipt_act); +TC_INDIRECT_ACTION_DECLARE(tcf_mirred_act); +TC_INDIRECT_ACTION_DECLARE(tcf_mpls_act); +TC_INDIRECT_ACTION_DECLARE(tcf_nat_act); +TC_INDIRECT_ACTION_DECLARE(tcf_pedit_act); +TC_INDIRECT_ACTION_DECLARE(tcf_police_act); +TC_INDIRECT_ACTION_DECLARE(tcf_sample_act); +TC_INDIRECT_ACTION_DECLARE(tcf_simp_act); +TC_INDIRECT_ACTION_DECLARE(tcf_skbedit_act); +TC_INDIRECT_ACTION_DECLARE(tcf_skbmod_act); +TC_INDIRECT_ACTION_DECLARE(tcf_vlan_act); +TC_INDIRECT_ACTION_DECLARE(tunnel_key_act); + +static inline int tc_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + if (static_branch_likely(&tc_skip_wrapper)) + goto skip; + +#if IS_BUILTIN(CONFIG_NET_ACT_GACT) + if (a->ops->act == tcf_gact_act) + return tcf_gact_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_MIRRED) + if (a->ops->act == tcf_mirred_act) + return tcf_mirred_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_PEDIT) + if (a->ops->act == tcf_pedit_act) + return tcf_pedit_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_SKBEDIT) + if (a->ops->act == tcf_skbedit_act) + return tcf_skbedit_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_SKBMOD) + if (a->ops->act == tcf_skbmod_act) + return tcf_skbmod_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_POLICE) + if (a->ops->act == tcf_police_act) + return tcf_police_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_BPF) + if (a->ops->act == tcf_bpf_act) + return tcf_bpf_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_CONNMARK) + if (a->ops->act == tcf_connmark_act) + return tcf_connmark_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_CSUM) + if (a->ops->act == tcf_csum_act) + return tcf_csum_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_CT) + if (a->ops->act == tcf_ct_act) + return tcf_ct_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_CTINFO) + if (a->ops->act == tcf_ctinfo_act) + return tcf_ctinfo_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_GATE) + if (a->ops->act == tcf_gate_act) + return tcf_gate_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_MPLS) + if (a->ops->act == tcf_mpls_act) + return tcf_mpls_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_NAT) + if (a->ops->act == tcf_nat_act) + return tcf_nat_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_TUNNEL_KEY) + if (a->ops->act == tunnel_key_act) + return tunnel_key_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_VLAN) + if (a->ops->act == tcf_vlan_act) + return tcf_vlan_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_IFE) + if (a->ops->act == tcf_ife_act) + return tcf_ife_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_IPT) + if (a->ops->act == tcf_ipt_act) + return tcf_ipt_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_SIMP) + if (a->ops->act == tcf_simp_act) + return tcf_simp_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_SAMPLE) + if (a->ops->act == tcf_sample_act) + return tcf_sample_act(skb, a, res); +#endif + +skip: + return a->ops->act(skb, a, res); +} + +#endif /* CONFIG_NET_CLS_ACT */ + +/* TC Filters */ +#ifdef CONFIG_NET_CLS + +#define TC_INDIRECT_FILTER_DECLARE(fname) \ + INDIRECT_CALLABLE_DECLARE(int fname(struct sk_buff *skb, \ + const struct tcf_proto *tp, \ + struct tcf_result *res)) + +TC_INDIRECT_FILTER_DECLARE(basic_classify); +TC_INDIRECT_FILTER_DECLARE(cls_bpf_classify); +TC_INDIRECT_FILTER_DECLARE(cls_cgroup_classify); +TC_INDIRECT_FILTER_DECLARE(fl_classify); +TC_INDIRECT_FILTER_DECLARE(flow_classify); +TC_INDIRECT_FILTER_DECLARE(fw_classify); +TC_INDIRECT_FILTER_DECLARE(mall_classify); +TC_INDIRECT_FILTER_DECLARE(route4_classify); +TC_INDIRECT_FILTER_DECLARE(rsvp_classify); +TC_INDIRECT_FILTER_DECLARE(rsvp6_classify); +TC_INDIRECT_FILTER_DECLARE(tcindex_classify); +TC_INDIRECT_FILTER_DECLARE(u32_classify); + +static inline int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + if (static_branch_likely(&tc_skip_wrapper)) + goto skip; + +#if IS_BUILTIN(CONFIG_NET_CLS_BPF) + if (tp->classify == cls_bpf_classify) + return cls_bpf_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_U32) + if (tp->classify == u32_classify) + return u32_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_FLOWER) + if (tp->classify == fl_classify) + return fl_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_FW) + if (tp->classify == fw_classify) + return fw_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_MATCHALL) + if (tp->classify == mall_classify) + return mall_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_BASIC) + if (tp->classify == basic_classify) + return basic_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_CGROUP) + if (tp->classify == cls_cgroup_classify) + return cls_cgroup_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_FLOW) + if (tp->classify == flow_classify) + return flow_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_ROUTE4) + if (tp->classify == route4_classify) + return route4_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_RSVP) + if (tp->classify == rsvp_classify) + return rsvp_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_RSVP6) + if (tp->classify == rsvp6_classify) + return rsvp6_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_TCINDEX) + if (tp->classify == tcindex_classify) + return tcindex_classify(skb, tp, res); +#endif + +skip: + return tp->classify(skb, tp, res); +} + +static inline void tc_wrapper_init(void) +{ +#ifdef CONFIG_X86 + if (!cpu_feature_enabled(X86_FEATURE_RETPOLINE)) + static_branch_enable(&tc_skip_wrapper); +#endif +} + +#endif /* CONFIG_NET_CLS */ + +#else + +#define TC_INDIRECT_SCOPE static + +static inline int tc_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + return a->ops->act(skb, a, res); +} + +static inline int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + return tp->classify(skb, tp, res); +} + +static inline void tc_wrapper_init(void) +{ +} + +#endif + +#endif /* __NET_TC_WRAPPER_H */ diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 4a27dfb1ba0f..2317db02c764 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -2273,6 +2274,8 @@ static struct pernet_operations psched_net_ops = { .exit = psched_net_exit, }; +DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper); + static int __init pktsched_init(void) { int err; @@ -2300,6 +2303,8 @@ static int __init pktsched_init(void) rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, 0); + tc_wrapper_init(); + return 0; } -- cgit v1.2.3-70-g09d2 From 871cf386dd16705b1e08942efd02c58801293d01 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Tue, 6 Dec 2022 10:55:12 -0300 Subject: net/sched: avoid indirect act functions on retpoline kernels Expose the necessary tc act functions and wire up act_api to use direct calls in retpoline kernels. Signed-off-by: Pedro Tammela Reviewed-by: Jamal Hadi Salim Reviewed-by: Victor Nogueira Signed-off-by: David S. Miller --- net/sched/act_api.c | 3 ++- net/sched/act_bpf.c | 6 ++++-- net/sched/act_connmark.c | 6 ++++-- net/sched/act_csum.c | 6 ++++-- net/sched/act_ct.c | 5 +++-- net/sched/act_ctinfo.c | 6 ++++-- net/sched/act_gact.c | 6 ++++-- net/sched/act_gate.c | 6 ++++-- net/sched/act_ife.c | 6 ++++-- net/sched/act_ipt.c | 6 ++++-- net/sched/act_mirred.c | 6 ++++-- net/sched/act_mpls.c | 6 ++++-- net/sched/act_nat.c | 7 ++++--- net/sched/act_pedit.c | 6 ++++-- net/sched/act_police.c | 6 ++++-- net/sched/act_sample.c | 6 ++++-- net/sched/act_simple.c | 6 ++++-- net/sched/act_skbedit.c | 6 ++++-- net/sched/act_skbmod.c | 6 ++++-- net/sched/act_tunnel_key.c | 6 ++++-- net/sched/act_vlan.c | 6 ++++-- 21 files changed, 81 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 9b31a10cc639..5b3c0ac495be 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -23,6 +23,7 @@ #include #include #include +#include #ifdef CONFIG_INET DEFINE_STATIC_KEY_FALSE(tcf_frag_xmit_count); @@ -1080,7 +1081,7 @@ restart_act_graph: repeat_ttl = 32; repeat: - ret = a->ops->act(skb, a, res); + ret = tc_act(skb, a, res); if (unlikely(ret == TC_ACT_REPEAT)) { if (--repeat_ttl != 0) goto repeat; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index b79eee44e24e..b0455fda7d0b 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -18,6 +18,7 @@ #include #include +#include #define ACT_BPF_NAME_LEN 256 @@ -31,8 +32,9 @@ struct tcf_bpf_cfg { static struct tc_action_ops act_bpf_ops; -static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_bpf_act(struct sk_buff *skb, + const struct tc_action *act, + struct tcf_result *res) { bool at_ingress = skb_at_tc_ingress(skb); struct tcf_bpf *prog = to_bpf(act); diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index d41002e4613f..7e63ff7e3ed7 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -27,8 +28,9 @@ static struct tc_action_ops act_connmark_ops; -static int tcf_connmark_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_connmark_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { const struct nf_conntrack_tuple_hash *thash; struct nf_conntrack_tuple tuple; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 1366adf9b909..95e9304024b7 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -32,6 +32,7 @@ #include #include +#include static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), }, @@ -563,8 +564,9 @@ fail: return 0; } -static int tcf_csum_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_csum_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_csum *p = to_tcf_csum(a); bool orig_vlan_tag_present = false; diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index dd5ae7551956..f6df0168c91f 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -1038,8 +1039,8 @@ static int tcf_ct_act_nat(struct sk_buff *skb, #endif } -static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) { struct net *net = dev_net(skb->dev); enum ip_conntrack_info ctinfo; diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index eaa02f098d1c..4b1b59da5c0b 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -75,8 +76,9 @@ static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca, skb->mark = READ_ONCE(ct->mark) & cp->cpmarkmask; } -static int tcf_ctinfo_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_ctinfo_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { const struct nf_conntrack_tuple_hash *thash = NULL; struct tcf_ctinfo *ca = to_ctinfo(a); diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 62d682b96b88..54f1b13b2360 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -18,6 +18,7 @@ #include #include #include +#include static struct tc_action_ops act_gact_ops; @@ -145,8 +146,9 @@ release_idr: return err; } -static int tcf_gact_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_gact_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_gact *gact = to_gact(a); int action = READ_ONCE(gact->tcf_action); diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index 3049878e7315..9b8def0be41e 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -14,6 +14,7 @@ #include #include #include +#include static struct tc_action_ops act_gate_ops; @@ -113,8 +114,9 @@ static enum hrtimer_restart gate_timer_func(struct hrtimer *timer) return HRTIMER_RESTART; } -static int tcf_gate_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_gate_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_gate *gact = to_gate(a); diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 41d63b33461d..bc7611b0744c 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -29,6 +29,7 @@ #include #include #include +#include static int max_metacnt = IFE_META_MAX + 1; static struct tc_action_ops act_ife_ops; @@ -861,8 +862,9 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, return action; } -static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_ife_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_ife_info *ife = to_ife(a); struct tcf_ife_params *p; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 1625e1037416..5d96ffebd40f 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -216,8 +217,9 @@ static int tcf_xt_init(struct net *net, struct nlattr *nla, a, &act_xt_ops, tp, flags); } -static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_ipt_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { int ret = 0, result = 0; struct tcf_ipt *ipt = to_ipt(a); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index b8ad6ae282c0..7284bcea7b0b 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -24,6 +24,7 @@ #include #include #include +#include static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); @@ -217,8 +218,9 @@ static int tcf_mirred_forward(bool want_ingress, struct sk_buff *skb) return err; } -static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_mirred *m = to_mirred(a); struct sk_buff *skb2 = skb; diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index 8ad25cc8ccd5..ff47ce4d3968 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -14,6 +14,7 @@ #include #include #include +#include static struct tc_action_ops act_mpls_ops; @@ -49,8 +50,9 @@ static __be32 tcf_mpls_get_lse(struct mpls_shim_hdr *lse, return cpu_to_be32(new_lse); } -static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_mpls_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_mpls *m = to_mpls(a); struct tcf_mpls_params *p; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 9265145f1040..74c74be33048 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -24,7 +24,7 @@ #include #include #include - +#include static struct tc_action_ops act_nat_ops; @@ -98,8 +98,9 @@ release_idr: return err; } -static int tcf_nat_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_nat_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_nat *p = to_tcf_nat(a); struct iphdr *iph; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 94ed5857ce67..a0378e9f0121 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -20,6 +20,7 @@ #include #include #include +#include static struct tc_action_ops act_pedit_ops; @@ -319,8 +320,9 @@ static int pedit_skb_hdr_offset(struct sk_buff *skb, return ret; } -static int tcf_pedit_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_pedit *p = to_pedit(a); u32 max_offset; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 0adb26e366a7..227cba58ce9f 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -19,6 +19,7 @@ #include #include #include +#include /* Each policer is serialized by its individual spinlock */ @@ -242,8 +243,9 @@ static bool tcf_police_mtu_check(struct sk_buff *skb, u32 limit) return len <= limit; } -static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_police_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_police *police = to_police(a); s64 now, toks, ppstoks = 0, ptoks = 0; diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 7a25477f5d99..98dea08c1764 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -153,8 +154,9 @@ static bool tcf_sample_dev_ok_push(struct net_device *dev) } } -static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_sample_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_sample *s = to_sample(a); struct psample_group *psample_group; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 18d376135461..4b84514534f3 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -21,8 +22,9 @@ static struct tc_action_ops act_simp_ops; #define SIMP_MAX_DATA 32 -static int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_simp_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_defact *d = to_defact(a); diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 1710780c908a..ce7008cf291c 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -36,8 +37,9 @@ static u16 tcf_skbedit_hash(struct tcf_skbedit_params *params, return netdev_cap_txqueue(skb->dev, queue_mapping); } -static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_skbedit_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_skbedit *d = to_skbedit(a); struct tcf_skbedit_params *params; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index d98758a63934..dffa990a9629 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -15,14 +15,16 @@ #include #include #include +#include #include #include static struct tc_action_ops act_skbmod_ops; -static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_skbmod_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_skbmod *d = to_skbmod(a); int action, max_edit_len, err; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 2691a3d8e451..2d12d2626415 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -16,14 +16,16 @@ #include #include #include +#include #include #include static struct tc_action_ops act_tunnel_key_ops; -static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tunnel_key_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 7b24e898a3e6..0251442f5f29 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -12,14 +12,16 @@ #include #include #include +#include #include #include static struct tc_action_ops act_vlan_ops; -static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcf_vlan_act(struct sk_buff *skb, + const struct tc_action *a, + struct tcf_result *res) { struct tcf_vlan *v = to_vlan(a); struct tcf_vlan_params *p; -- cgit v1.2.3-70-g09d2 From 9f3101dca3a7c69027c65770ac28803768efefa5 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Tue, 6 Dec 2022 10:55:13 -0300 Subject: net/sched: avoid indirect classify functions on retpoline kernels Expose the necessary tc classifier functions and wire up cls_api to use direct calls in retpoline kernels. Signed-off-by: Pedro Tammela Reviewed-by: Jamal Hadi Salim Reviewed-by: Victor Nogueira Signed-off-by: David S. Miller --- net/sched/cls_api.c | 3 ++- net/sched/cls_basic.c | 6 ++++-- net/sched/cls_bpf.c | 6 ++++-- net/sched/cls_cgroup.c | 6 ++++-- net/sched/cls_flow.c | 6 ++++-- net/sched/cls_flower.c | 6 ++++-- net/sched/cls_fw.c | 6 ++++-- net/sched/cls_matchall.c | 6 ++++-- net/sched/cls_route.c | 6 ++++-- net/sched/cls_rsvp.c | 2 ++ net/sched/cls_rsvp.h | 6 +++--- net/sched/cls_rsvp6.c | 2 ++ net/sched/cls_tcindex.c | 7 ++++--- net/sched/cls_u32.c | 6 ++++-- 14 files changed, 49 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 23d1cfa4f58c..668130f08903 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -40,6 +40,7 @@ #include #include #include +#include extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; @@ -1564,7 +1565,7 @@ reclassify: tp->protocol != htons(ETH_P_ALL)) continue; - err = tp->classify(skb, tp, res); + err = tc_classify(skb, tp, res); #ifdef CONFIG_NET_CLS_ACT if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) { first_tp = orig_tp; diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index d229ce99e554..1b92c33b5f81 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -18,6 +18,7 @@ #include #include #include +#include struct basic_head { struct list_head flist; @@ -36,8 +37,9 @@ struct basic_filter { struct rcu_work rwork; }; -static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) +TC_INDIRECT_SCOPE int basic_classify(struct sk_buff *skb, + const struct tcf_proto *tp, + struct tcf_result *res) { int r; struct basic_head *head = rcu_dereference_bh(tp->root); diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index bc317b3eac12..466c26df853a 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -19,6 +19,7 @@ #include #include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Daniel Borkmann "); @@ -77,8 +78,9 @@ static int cls_bpf_exec_opcode(int code) } } -static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) +TC_INDIRECT_SCOPE int cls_bpf_classify(struct sk_buff *skb, + const struct tcf_proto *tp, + struct tcf_result *res) { struct cls_bpf_head *head = rcu_dereference_bh(tp->root); bool at_ingress = skb_at_tc_ingress(skb); diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index ed00001b528a..bd9322d71910 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -13,6 +13,7 @@ #include #include #include +#include struct cls_cgroup_head { u32 handle; @@ -22,8 +23,9 @@ struct cls_cgroup_head { struct rcu_work rwork; }; -static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) +TC_INDIRECT_SCOPE int cls_cgroup_classify(struct sk_buff *skb, + const struct tcf_proto *tp, + struct tcf_result *res) { struct cls_cgroup_head *head = rcu_dereference_bh(tp->root); u32 classid = task_get_classid(skb); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 014cd3de7b5d..535668e1f748 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -24,6 +24,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include @@ -292,8 +293,9 @@ static u32 flow_key_get(struct sk_buff *skb, int key, struct flow_keys *flow) (1 << FLOW_KEY_NFCT_PROTO_SRC) | \ (1 << FLOW_KEY_NFCT_PROTO_DST)) -static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) +TC_INDIRECT_SCOPE int flow_classify(struct sk_buff *skb, + const struct tcf_proto *tp, + struct tcf_result *res) { struct flow_head *head = rcu_dereference_bh(tp->root); struct flow_filter *f; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 25bc57ee6ea1..0b15698b3531 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -305,8 +306,9 @@ static u16 fl_ct_info_to_flower_map[] = { TCA_FLOWER_KEY_CT_FLAGS_NEW, }; -static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) +TC_INDIRECT_SCOPE int fl_classify(struct sk_buff *skb, + const struct tcf_proto *tp, + struct tcf_result *res) { struct cls_fl_head *head = rcu_dereference_bh(tp->root); bool post_ct = tc_skb_cb(skb)->post_ct; diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index a32351da968c..ae9439a6c56c 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -21,6 +21,7 @@ #include #include #include +#include #define HTSIZE 256 @@ -47,8 +48,9 @@ static u32 fw_hash(u32 handle) return handle % HTSIZE; } -static int fw_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) +TC_INDIRECT_SCOPE int fw_classify(struct sk_buff *skb, + const struct tcf_proto *tp, + struct tcf_result *res) { struct fw_head *head = rcu_dereference_bh(tp->root); struct fw_filter *f; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 39a5d9c170de..705f63da2c21 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -12,6 +12,7 @@ #include #include +#include struct cls_mall_head { struct tcf_exts exts; @@ -24,8 +25,9 @@ struct cls_mall_head { bool deleting; }; -static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) +TC_INDIRECT_SCOPE int mall_classify(struct sk_buff *skb, + const struct tcf_proto *tp, + struct tcf_result *res) { struct cls_mall_head *head = rcu_dereference_bh(tp->root); diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 9e43b929d4ca..d0c53724d3e8 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -17,6 +17,7 @@ #include #include #include +#include /* * 1. For now we assume that route tags < 256. @@ -121,8 +122,9 @@ static inline int route4_hash_wild(void) return 0; \ } -static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) +TC_INDIRECT_SCOPE int route4_classify(struct sk_buff *skb, + const struct tcf_proto *tp, + struct tcf_result *res) { struct route4_head *head = rcu_dereference_bh(tp->root); struct dst_entry *dst; diff --git a/net/sched/cls_rsvp.c b/net/sched/cls_rsvp.c index de1c1d4da597..03d8619bd9c6 100644 --- a/net/sched/cls_rsvp.c +++ b/net/sched/cls_rsvp.c @@ -15,10 +15,12 @@ #include #include #include +#include #define RSVP_DST_LEN 1 #define RSVP_ID "rsvp" #define RSVP_OPS cls_rsvp_ops +#define RSVP_CLS rsvp_classify #include "cls_rsvp.h" MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index b00a7dbd0587..869efba9f834 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -124,8 +124,8 @@ static inline unsigned int hash_src(__be32 *src) return r; \ } -static int rsvp_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) +TC_INDIRECT_SCOPE int RSVP_CLS(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) { struct rsvp_head *head = rcu_dereference_bh(tp->root); struct rsvp_session *s; @@ -738,7 +738,7 @@ static void rsvp_bind_class(void *fh, u32 classid, unsigned long cl, void *q, static struct tcf_proto_ops RSVP_OPS __read_mostly = { .kind = RSVP_ID, - .classify = rsvp_classify, + .classify = RSVP_CLS, .init = rsvp_init, .destroy = rsvp_destroy, .get = rsvp_get, diff --git a/net/sched/cls_rsvp6.c b/net/sched/cls_rsvp6.c index 64078846000e..e627cc32d633 100644 --- a/net/sched/cls_rsvp6.c +++ b/net/sched/cls_rsvp6.c @@ -15,10 +15,12 @@ #include #include #include +#include #define RSVP_DST_LEN 4 #define RSVP_ID "rsvp6" #define RSVP_OPS cls_rsvp6_ops +#define RSVP_CLS rsvp6_classify #include "cls_rsvp.h" MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 1c9eeb98d826..eb0e9458e722 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -16,6 +16,7 @@ #include #include #include +#include /* * Passing parameters to the root seems to be done more awkwardly than really @@ -98,9 +99,9 @@ static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p, return NULL; } - -static int tcindex_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) +TC_INDIRECT_SCOPE int tcindex_classify(struct sk_buff *skb, + const struct tcf_proto *tp, + struct tcf_result *res) { struct tcindex_data *p = rcu_dereference_bh(tp->root); struct tcindex_filter_result *f; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 34d25f7a0687..4e2e269f121f 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -39,6 +39,7 @@ #include #include #include +#include struct tc_u_knode { struct tc_u_knode __rcu *next; @@ -100,8 +101,9 @@ static inline unsigned int u32_hash_fold(__be32 key, return h; } -static int u32_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) +TC_INDIRECT_SCOPE int u32_classify(struct sk_buff *skb, + const struct tcf_proto *tp, + struct tcf_result *res) { struct { struct tc_u_knode *knode; -- cgit v1.2.3-70-g09d2 From 1933ea365aa7a48ce26bea2ea09c9f7cc48cc668 Mon Sep 17 00:00:00 2001 From: wangchuanlei Date: Tue, 6 Dec 2022 20:38:57 -0500 Subject: net: openvswitch: Add support to count upcall packets Add support to count upall packets, when kmod of openvswitch upcall to count the number of packets for upcall succeed and failed, which is a better way to see how many packets upcalled on every interfaces. Signed-off-by: wangchuanlei Acked-by: Eelco Chaudron Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 14 +++++++++++ net/openvswitch/datapath.c | 41 ++++++++++++++++++++++++++++++++ net/openvswitch/vport.c | 50 ++++++++++++++++++++++++++++++++++++++++ net/openvswitch/vport.h | 16 +++++++++++++ 4 files changed, 121 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 94066f87e9ee..c5d62ee82567 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -277,11 +277,25 @@ enum ovs_vport_attr { OVS_VPORT_ATTR_PAD, OVS_VPORT_ATTR_IFINDEX, OVS_VPORT_ATTR_NETNSID, + OVS_VPORT_ATTR_UPCALL_STATS, __OVS_VPORT_ATTR_MAX }; #define OVS_VPORT_ATTR_MAX (__OVS_VPORT_ATTR_MAX - 1) +/** + * enum ovs_vport_upcall_attr - attributes for %OVS_VPORT_UPCALL* commands + * @OVS_VPORT_UPCALL_SUCCESS: 64-bit upcall success packets. + * @OVS_VPORT_UPCALL_FAIL: 64-bit upcall fail packets. + */ +enum ovs_vport_upcall_attr { + OVS_VPORT_UPCALL_ATTR_SUCCESS, + OVS_VPORT_UPCALL_ATTR_FAIL, + __OVS_VPORT_UPCALL_ATTR_MAX +}; + +#define OVS_VPORT_UPCALL_ATTR_MAX (__OVS_VPORT_UPCALL_ATTR_MAX - 1) + enum { OVS_VXLAN_EXT_UNSPEC, OVS_VXLAN_EXT_GBP, /* Flag or __u32 */ diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 861dfb8daf4a..932bcf766d63 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -209,6 +209,26 @@ static struct vport *new_vport(const struct vport_parms *parms) return vport; } +static void ovs_vport_update_upcall_stats(struct sk_buff *skb, + const struct dp_upcall_info *upcall_info, + bool upcall_result) +{ + struct vport *p = OVS_CB(skb)->input_vport; + struct vport_upcall_stats_percpu *stats; + + if (upcall_info->cmd != OVS_PACKET_CMD_MISS && + upcall_info->cmd != OVS_PACKET_CMD_ACTION) + return; + + stats = this_cpu_ptr(p->upcall_stats); + u64_stats_update_begin(&stats->syncp); + if (upcall_result) + u64_stats_inc(&stats->n_success); + else + u64_stats_inc(&stats->n_fail); + u64_stats_update_end(&stats->syncp); +} + void ovs_dp_detach_port(struct vport *p) { ASSERT_OVSL(); @@ -216,6 +236,9 @@ void ovs_dp_detach_port(struct vport *p) /* First drop references to device. */ hlist_del_rcu(&p->dp_hash_node); + /* Free percpu memory */ + free_percpu(p->upcall_stats); + /* Then destroy it. */ ovs_vport_del(p); } @@ -305,6 +328,8 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); else err = queue_gso_packets(dp, skb, key, upcall_info, cutlen); + + ovs_vport_update_upcall_stats(skb, upcall_info, !err); if (err) goto err; @@ -1826,6 +1851,12 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) goto err_destroy_portids; } + vport->upcall_stats = netdev_alloc_pcpu_stats(struct vport_upcall_stats_percpu); + if (!vport->upcall_stats) { + err = -ENOMEM; + goto err_destroy_portids; + } + err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, info->snd_seq, 0, OVS_DP_CMD_NEW); BUG_ON(err < 0); @@ -2098,6 +2129,9 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, OVS_VPORT_ATTR_PAD)) goto nla_put_failure; + if (ovs_vport_get_upcall_stats(vport, skb)) + goto nla_put_failure; + if (ovs_vport_get_upcall_portids(vport, skb)) goto nla_put_failure; @@ -2279,6 +2313,12 @@ restart: goto exit_unlock_free; } + vport->upcall_stats = netdev_alloc_pcpu_stats(struct vport_upcall_stats_percpu); + if (!vport->upcall_stats) { + err = -ENOMEM; + goto exit_unlock_free; + } + err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, OVS_VPORT_CMD_NEW, GFP_KERNEL); @@ -2508,6 +2548,7 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, [OVS_VPORT_ATTR_IFINDEX] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 }, + [OVS_VPORT_ATTR_UPCALL_STATS] = { .type = NLA_NESTED }, }; static const struct genl_small_ops dp_vport_genl_ops[] = { diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 82a74f998966..7e0f5c45b512 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -284,6 +284,56 @@ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) stats->tx_packets = dev_stats->tx_packets; } +/** + * ovs_vport_get_upcall_stats - retrieve upcall stats + * + * @vport: vport from which to retrieve the stats. + * @skb: sk_buff where upcall stats should be appended. + * + * Retrieves upcall stats for the given device. + * + * Must be called with ovs_mutex or rcu_read_lock. + */ +int ovs_vport_get_upcall_stats(struct vport *vport, struct sk_buff *skb) +{ + struct nlattr *nla; + int i; + + __u64 tx_success = 0; + __u64 tx_fail = 0; + + for_each_possible_cpu(i) { + const struct vport_upcall_stats_percpu *stats; + unsigned int start; + + stats = per_cpu_ptr(vport->upcall_stats, i); + do { + start = u64_stats_fetch_begin(&stats->syncp); + tx_success += u64_stats_read(&stats->n_success); + tx_fail += u64_stats_read(&stats->n_fail); + } while (u64_stats_fetch_retry(&stats->syncp, start)); + } + + nla = nla_nest_start_noflag(skb, OVS_VPORT_ATTR_UPCALL_STATS); + if (!nla) + return -EMSGSIZE; + + if (nla_put_u64_64bit(skb, OVS_VPORT_UPCALL_ATTR_SUCCESS, tx_success, + OVS_VPORT_ATTR_PAD)) { + nla_nest_cancel(skb, nla); + return -EMSGSIZE; + } + + if (nla_put_u64_64bit(skb, OVS_VPORT_UPCALL_ATTR_FAIL, tx_fail, + OVS_VPORT_ATTR_PAD)) { + nla_nest_cancel(skb, nla); + return -EMSGSIZE; + } + nla_nest_end(skb, nla); + + return 0; +} + /** * ovs_vport_get_options - retrieve device options * diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 6ff45e8a0868..3e71ca8ad8a7 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -32,6 +32,8 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name); void ovs_vport_get_stats(struct vport *, struct ovs_vport_stats *); +int ovs_vport_get_upcall_stats(struct vport *vport, struct sk_buff *skb); + int ovs_vport_set_options(struct vport *, struct nlattr *options); int ovs_vport_get_options(const struct vport *, struct sk_buff *); @@ -65,6 +67,7 @@ struct vport_portids { * @hash_node: Element in @dev_table hash table in vport.c. * @dp_hash_node: Element in @datapath->ports hash table in datapath.c. * @ops: Class structure. + * @upcall_stats: Upcall stats of every ports. * @detach_list: list used for detaching vport in net-exit call. * @rcu: RCU callback head for deferred destruction. */ @@ -78,6 +81,7 @@ struct vport { struct hlist_node hash_node; struct hlist_node dp_hash_node; const struct vport_ops *ops; + struct vport_upcall_stats_percpu __percpu *upcall_stats; struct list_head detach_list; struct rcu_head rcu; @@ -137,6 +141,18 @@ struct vport_ops { struct list_head list; }; +/** + * struct vport_upcall_stats_percpu - per-cpu packet upcall statistics for + * a given vport. + * @n_success: Number of packets that upcall to userspace succeed. + * @n_fail: Number of packets that upcall to userspace failed. + */ +struct vport_upcall_stats_percpu { + struct u64_stats_sync syncp; + u64_stats_t n_success; + u64_stats_t n_fail; +}; + struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *, const struct vport_parms *); void ovs_vport_free(struct vport *); -- cgit v1.2.3-70-g09d2 From 8b34b52c1794932a16308631121b758830349ae9 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 8 Dec 2022 16:44:30 -0800 Subject: mptcp: use nlmsg_free instead of kfree_skb Use nlmsg_free() instead of kfree_skb() in pm_netlink.c. The SKB's have been created by nlmsg_new(). The proper cleaning way should then be done with nlmsg_free(). For the moment, nlmsg_free() is simply calling kfree_skb() so we don't change the behaviour here. Suggested-by: Jakub Kicinski Reviewed-by: Matthieu Baerts Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index eef69d0e44ec..08c65f3e70a3 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -2094,7 +2094,7 @@ void mptcp_event_addr_removed(const struct mptcp_sock *msk, uint8_t id) return; nla_put_failure: - kfree_skb(skb); + nlmsg_free(skb); } void mptcp_event_addr_announced(const struct sock *ssk, @@ -2151,7 +2151,7 @@ void mptcp_event_addr_announced(const struct sock *ssk, return; nla_put_failure: - kfree_skb(skb); + nlmsg_free(skb); } void mptcp_event_pm_listener(const struct sock *ssk, @@ -2203,7 +2203,7 @@ void mptcp_event_pm_listener(const struct sock *ssk, return; nla_put_failure: - kfree_skb(skb); + nlmsg_free(skb); } void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, @@ -2261,7 +2261,7 @@ void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, return; nla_put_failure: - kfree_skb(skb); + nlmsg_free(skb); } static const struct genl_small_ops mptcp_pm_ops[] = { -- cgit v1.2.3-70-g09d2 From 03e7d28cd25ee6ff731386fb154d03f76dede1cc Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Thu, 8 Dec 2022 16:44:31 -0800 Subject: mptcp: return 0 instead of 'err' var When 'err' is 0, it looks clearer to return '0' instead of the variable called 'err'. The behaviour is then not modified, just a clearer code. By doing this, we can also avoid false positive smatch warnings like this one: net/mptcp/pm_netlink.c:1169 mptcp_pm_parse_pm_addr_attr() warn: missing error code? 'err' Reported-by: kernel test robot Reported-by: Dan Carpenter Suggested-by: Mat Martineau Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 4 ++-- net/mptcp/sockopt.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 08c65f3e70a3..2ea7eae43bdb 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1190,7 +1190,7 @@ static int mptcp_pm_parse_pm_addr_attr(struct nlattr *tb[], if (!tb[MPTCP_PM_ADDR_ATTR_FAMILY]) { if (!require_family) - return err; + return 0; NL_SET_ERR_MSG_ATTR(info->extack, attr, "missing family"); @@ -1224,7 +1224,7 @@ static int mptcp_pm_parse_pm_addr_attr(struct nlattr *tb[], if (tb[MPTCP_PM_ADDR_ATTR_PORT]) addr->port = htons(nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_PORT])); - return err; + return 0; } int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index a47423ebb33a..d4b1e6ec1b36 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -740,7 +740,7 @@ static int mptcp_setsockopt_v4_set_tos(struct mptcp_sock *msk, int optname, } release_sock(sk); - return err; + return 0; } static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, -- cgit v1.2.3-70-g09d2 From ce098da1497c6dee9589fce2c61d1910f4fcf0e7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 7 Dec 2022 22:02:59 -0800 Subject: skbuff: Introduce slab_build_skb() syzkaller reported: BUG: KASAN: slab-out-of-bounds in __build_skb_around+0x235/0x340 net/core/skbuff.c:294 Write of size 32 at addr ffff88802aa172c0 by task syz-executor413/5295 For bpf_prog_test_run_skb(), which uses a kmalloc()ed buffer passed to build_skb(). When build_skb() is passed a frag_size of 0, it means the buffer came from kmalloc. In these cases, ksize() is used to find its actual size, but since the allocation may not have been made to that size, actually perform the krealloc() call so that all the associated buffer size checking will be correctly notified (and use the "new" pointer so that compiler hinting works correctly). Split this logic out into a new interface, slab_build_skb(), but leave the original 0 checking for now to catch any stragglers. Reported-by: syzbot+fda18eaa8c12534ccb3b@syzkaller.appspotmail.com Link: https://groups.google.com/g/syzkaller-bugs/c/UnIKxTtU5-0/m/-wbXinkgAQAJ Fixes: 38931d8989b5 ("mm: Make ksize() a reporting-only function") Cc: Pavel Begunkov Cc: pepsipu Cc: syzbot+fda18eaa8c12534ccb3b@syzkaller.appspotmail.com Cc: Vlastimil Babka Cc: kasan-dev Cc: Andrii Nakryiko Cc: ast@kernel.org Cc: Daniel Borkmann Cc: Hao Luo Cc: Jesper Dangaard Brouer Cc: John Fastabend Cc: jolsa@kernel.org Cc: KP Singh Cc: martin.lau@linux.dev Cc: Stanislav Fomichev Cc: song@kernel.org Cc: Yonghong Song Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221208060256.give.994-kees@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnx2.c | 2 +- drivers/net/ethernet/qlogic/qed/qed_ll2.c | 2 +- include/linux/skbuff.h | 1 + net/bpf/test_run.c | 2 +- net/core/skbuff.c | 70 +++++++++++++++++++++++++++---- 5 files changed, 66 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c index dbe310144780..9f473854b0f4 100644 --- a/drivers/net/ethernet/broadcom/bnx2.c +++ b/drivers/net/ethernet/broadcom/bnx2.c @@ -3045,7 +3045,7 @@ error: dma_unmap_single(&bp->pdev->dev, dma_addr, bp->rx_buf_use_size, DMA_FROM_DEVICE); - skb = build_skb(data, 0); + skb = slab_build_skb(data); if (!skb) { kfree(data); goto error; diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c index ed274f033626..e5116a86cfbc 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -200,7 +200,7 @@ static void qed_ll2b_complete_rx_packet(void *cxt, dma_unmap_single(&cdev->pdev->dev, buffer->phys_addr, cdev->ll2->rx_size, DMA_FROM_DEVICE); - skb = build_skb(buffer->data, 0); + skb = slab_build_skb(buffer->data); if (!skb) { DP_INFO(cdev, "Failed to build SKB\n"); kfree(buffer->data); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4e464a27adaf..4c8492401a10 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1255,6 +1255,7 @@ struct sk_buff *build_skb_around(struct sk_buff *skb, void skb_attempt_defer_free(struct sk_buff *skb); struct sk_buff *napi_build_skb(void *data, unsigned int frag_size); +struct sk_buff *slab_build_skb(void *data); /** * alloc_skb - allocate a network buffer diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 6094ef7cffcd..c9bfd263dcef 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -1128,7 +1128,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, } sock_init_data(NULL, sk); - skb = build_skb(data, 0); + skb = slab_build_skb(data); if (!skb) { kfree(data); kfree(ctx); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 4bf95e36ed16..3cbba7099c0f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -270,12 +270,10 @@ static struct sk_buff *napi_skb_cache_get(void) return skb; } -/* Caller must provide SKB that is memset cleared */ -static void __build_skb_around(struct sk_buff *skb, void *data, - unsigned int frag_size) +static inline void __finalize_skb_around(struct sk_buff *skb, void *data, + unsigned int size) { struct skb_shared_info *shinfo; - unsigned int size = frag_size ? : ksize(data); size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); @@ -297,15 +295,71 @@ static void __build_skb_around(struct sk_buff *skb, void *data, skb_set_kcov_handle(skb, kcov_common_handle()); } +static inline void *__slab_build_skb(struct sk_buff *skb, void *data, + unsigned int *size) +{ + void *resized; + + /* Must find the allocation size (and grow it to match). */ + *size = ksize(data); + /* krealloc() will immediately return "data" when + * "ksize(data)" is requested: it is the existing upper + * bounds. As a result, GFP_ATOMIC will be ignored. Note + * that this "new" pointer needs to be passed back to the + * caller for use so the __alloc_size hinting will be + * tracked correctly. + */ + resized = krealloc(data, *size, GFP_ATOMIC); + WARN_ON_ONCE(resized != data); + return resized; +} + +/* build_skb() variant which can operate on slab buffers. + * Note that this should be used sparingly as slab buffers + * cannot be combined efficiently by GRO! + */ +struct sk_buff *slab_build_skb(void *data) +{ + struct sk_buff *skb; + unsigned int size; + + skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC); + if (unlikely(!skb)) + return NULL; + + memset(skb, 0, offsetof(struct sk_buff, tail)); + data = __slab_build_skb(skb, data, &size); + __finalize_skb_around(skb, data, size); + + return skb; +} +EXPORT_SYMBOL(slab_build_skb); + +/* Caller must provide SKB that is memset cleared */ +static void __build_skb_around(struct sk_buff *skb, void *data, + unsigned int frag_size) +{ + unsigned int size = frag_size; + + /* frag_size == 0 is considered deprecated now. Callers + * using slab buffer should use slab_build_skb() instead. + */ + if (WARN_ONCE(size == 0, "Use slab_build_skb() instead")) + data = __slab_build_skb(skb, data, &size); + + __finalize_skb_around(skb, data, size); +} + /** * __build_skb - build a network buffer * @data: data buffer provided by caller - * @frag_size: size of data, or 0 if head was kmalloced + * @frag_size: size of data (must not be 0) * * Allocate a new &sk_buff. Caller provides space holding head and - * skb_shared_info. @data must have been allocated by kmalloc() only if - * @frag_size is 0, otherwise data should come from the page allocator - * or vmalloc() + * skb_shared_info. @data must have been allocated from the page + * allocator or vmalloc(). (A @frag_size of 0 to indicate a kmalloc() + * allocation is deprecated, and callers should use slab_build_skb() + * instead.) * The return is the new skb buffer. * On a failure the return is %NULL, and @data is not freed. * Notes : -- cgit v1.2.3-70-g09d2 From 5fc11a401a8dc491b326d2c916b07d22e7ac8833 Mon Sep 17 00:00:00 2001 From: Gavrilov Ilia Date: Thu, 8 Dec 2022 08:31:39 +0000 Subject: net: devlink: Add missing error check to devlink_resource_put() When the resource size changes, the return value of the 'nla_put_u64_64bit' function is not checked. That has been fixed to avoid rechecking at the next step. Found by InfoTeCS on behalf of Linux Verification Center (linuxtesting.org) with SVACE. Note that this is harmless, we'd error out at the next put(). Signed-off-by: Ilia.Gavrilov Link: https://lore.kernel.org/r/20221208082821.3927937-1-Ilia.Gavrilov@infotecs.ru Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/devlink.c b/net/core/devlink.c index ab40ebcb4aea..6004bd0ccee4 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4441,9 +4441,10 @@ static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb, nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id, DEVLINK_ATTR_PAD)) goto nla_put_failure; - if (resource->size != resource->size_new) - nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW, - resource->size_new, DEVLINK_ATTR_PAD); + if (resource->size != resource->size_new && + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW, + resource->size_new, DEVLINK_ATTR_PAD)) + goto nla_put_failure; if (devlink_resource_occ_put(resource, skb)) goto nla_put_failure; if (devlink_resource_size_params_put(resource, skb)) -- cgit v1.2.3-70-g09d2 From 5df7d714d8cbcce7642936cc0f6532f0c4c3d197 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Nov 2022 18:45:59 +0200 Subject: ipvs: add rcu protection to stats In preparation to using RCU locking for the list with estimators, make sure the struct ip_vs_stats are released after RCU grace period by using RCU callbacks. This affects ipvs->tot_stats where we can not use RCU callbacks for ipvs, so we use allocated struct ip_vs_stats_rcu. For services and dests we force RCU callbacks for all cases. Signed-off-by: Julian Anastasov Cc: yunhong-cgl jiang Cc: "dust.li" Reviewed-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 8 +++++- net/netfilter/ipvs/ip_vs_core.c | 10 +++++-- net/netfilter/ipvs/ip_vs_ctl.c | 64 +++++++++++++++++++++++++++-------------- 3 files changed, 57 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index ff1804a0c469..bd8ae137e43b 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -405,6 +405,11 @@ struct ip_vs_stats { struct ip_vs_kstats kstats0; /* reset values */ }; +struct ip_vs_stats_rcu { + struct ip_vs_stats s; + struct rcu_head rcu_head; +}; + struct dst_entry; struct iphdr; struct ip_vs_conn; @@ -688,6 +693,7 @@ struct ip_vs_dest { union nf_inet_addr vaddr; /* virtual IP address */ __u32 vfwmark; /* firewall mark of service */ + struct rcu_head rcu_head; struct list_head t_list; /* in dest_trash */ unsigned int in_rs_table:1; /* we are in rs_table */ }; @@ -869,7 +875,7 @@ struct netns_ipvs { atomic_t conn_count; /* connection counter */ /* ip_vs_ctl */ - struct ip_vs_stats tot_stats; /* Statistics & est. */ + struct ip_vs_stats_rcu *tot_stats; /* Statistics & est. */ int num_services; /* no of virtual services */ int num_services6; /* IPv6 virtual services */ diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 51ad557a525b..fcdaef1fcccf 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -143,7 +143,7 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s->cnt.inbytes += skb->len; u64_stats_update_end(&s->syncp); - s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); s->cnt.inpkts++; s->cnt.inbytes += skb->len; @@ -179,7 +179,7 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s->cnt.outbytes += skb->len; u64_stats_update_end(&s->syncp); - s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); s->cnt.outpkts++; s->cnt.outbytes += skb->len; @@ -208,7 +208,7 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) s->cnt.conns++; u64_stats_update_end(&s->syncp); - s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); s->cnt.conns++; u64_stats_update_end(&s->syncp); @@ -2448,6 +2448,10 @@ static void __exit ip_vs_cleanup(void) ip_vs_conn_cleanup(); ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); + /* common rcu_barrier() used by: + * - ip_vs_control_cleanup() + */ + rcu_barrier(); pr_info("ipvs unloaded.\n"); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 4d62059a6021..9016b641ae52 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -483,17 +483,14 @@ static void ip_vs_service_rcu_free(struct rcu_head *head) ip_vs_service_free(svc); } -static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay) +static void __ip_vs_svc_put(struct ip_vs_service *svc) { if (atomic_dec_and_test(&svc->refcnt)) { IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", svc->fwmark, IP_VS_DBG_ADDR(svc->af, &svc->addr), ntohs(svc->port)); - if (do_delay) - call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); - else - ip_vs_service_free(svc); + call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); } } @@ -780,14 +777,22 @@ out: return dest; } +static void ip_vs_dest_rcu_free(struct rcu_head *head) +{ + struct ip_vs_dest *dest; + + dest = container_of(head, struct ip_vs_dest, rcu_head); + free_percpu(dest->stats.cpustats); + ip_vs_dest_put_and_free(dest); +} + static void ip_vs_dest_free(struct ip_vs_dest *dest) { struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1); __ip_vs_dst_cache_reset(dest); - __ip_vs_svc_put(svc, false); - free_percpu(dest->stats.cpustats); - ip_vs_dest_put_and_free(dest); + __ip_vs_svc_put(svc); + call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free); } /* @@ -811,6 +816,16 @@ static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs) } } +static void ip_vs_stats_rcu_free(struct rcu_head *head) +{ + struct ip_vs_stats_rcu *rs = container_of(head, + struct ip_vs_stats_rcu, + rcu_head); + + free_percpu(rs->s.cpustats); + kfree(rs); +} + static void ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src) { @@ -923,7 +938,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, if (old_svc != svc) { ip_vs_zero_stats(&dest->stats); __ip_vs_bind_svc(dest, svc); - __ip_vs_svc_put(old_svc, true); + __ip_vs_svc_put(old_svc); } } @@ -1571,7 +1586,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) /* * Free the service if nobody refers to it */ - __ip_vs_svc_put(svc, true); + __ip_vs_svc_put(svc); /* decrease the module use count */ ip_vs_use_count_dec(); @@ -1761,7 +1776,7 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs) } } - ip_vs_zero_stats(&ipvs->tot_stats); + ip_vs_zero_stats(&ipvs->tot_stats->s); return 0; } @@ -2255,7 +2270,7 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v) seq_puts(seq, " Conns Packets Packets Bytes Bytes\n"); - ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats); + ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats->s); seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n", (unsigned long long)show.conns, (unsigned long long)show.inpkts, @@ -2279,7 +2294,7 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v) static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) { struct net *net = seq_file_single_net(seq); - struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats; + struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats->s; struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; struct ip_vs_kstats kstats; int i; @@ -4107,7 +4122,6 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) kfree(tbl); return -ENOMEM; } - ip_vs_start_estimator(ipvs, &ipvs->tot_stats); ipvs->sysctl_tbl = tbl; /* Schedule defense work */ INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); @@ -4118,6 +4132,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work, expire_nodest_conn_handler); + ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s); return 0; } @@ -4129,7 +4144,7 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); - ip_vs_stop_estimator(ipvs, &ipvs->tot_stats); + ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); if (!net_eq(net, &init_net)) kfree(ipvs->sysctl_tbl); @@ -4165,17 +4180,20 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) atomic_set(&ipvs->conn_out_counter, 0); /* procfs stats */ - ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (!ipvs->tot_stats.cpustats) + ipvs->tot_stats = kzalloc(sizeof(*ipvs->tot_stats), GFP_KERNEL); + if (!ipvs->tot_stats) return -ENOMEM; + ipvs->tot_stats->s.cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!ipvs->tot_stats->s.cpustats) + goto err_tot_stats; for_each_possible_cpu(i) { struct ip_vs_cpu_stats *ipvs_tot_stats; - ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i); + ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats->s.cpustats, i); u64_stats_init(&ipvs_tot_stats->syncp); } - spin_lock_init(&ipvs->tot_stats.lock); + spin_lock_init(&ipvs->tot_stats->s.lock); #ifdef CONFIG_PROC_FS if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net, @@ -4207,7 +4225,10 @@ err_stats: err_vs: #endif - free_percpu(ipvs->tot_stats.cpustats); + free_percpu(ipvs->tot_stats->s.cpustats); + +err_tot_stats: + kfree(ipvs->tot_stats); return -ENOMEM; } @@ -4220,7 +4241,7 @@ void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); remove_proc_entry("ip_vs", ipvs->net->proc_net); #endif - free_percpu(ipvs->tot_stats.cpustats); + call_rcu(&ipvs->tot_stats->rcu_head, ip_vs_stats_rcu_free); } int __init ip_vs_register_nl_ioctl(void) @@ -4280,5 +4301,6 @@ void ip_vs_control_cleanup(void) { EnterFunction(2); unregister_netdevice_notifier(&ip_vs_dst_notifier); + /* relying on common rcu_barrier() in ip_vs_cleanup() */ LeaveFunction(2); } -- cgit v1.2.3-70-g09d2 From de39afb3d811ba2c028de8662adafedb4899327b Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Nov 2022 18:46:00 +0200 Subject: ipvs: use common functions for stats allocation Move alloc_percpu/free_percpu logic in new functions Signed-off-by: Julian Anastasov Cc: yunhong-cgl jiang Cc: "dust.li" Reviewed-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 5 +++ net/netfilter/ipvs/ip_vs_ctl.c | 96 ++++++++++++++++++++++++------------------ 2 files changed, 60 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index bd8ae137e43b..e5582c01a4a3 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -410,6 +410,11 @@ struct ip_vs_stats_rcu { struct rcu_head rcu_head; }; +int ip_vs_stats_init_alloc(struct ip_vs_stats *s); +struct ip_vs_stats *ip_vs_stats_alloc(void); +void ip_vs_stats_release(struct ip_vs_stats *stats); +void ip_vs_stats_free(struct ip_vs_stats *stats); + struct dst_entry; struct iphdr; struct ip_vs_conn; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 9016b641ae52..ec6db864ac36 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -471,7 +471,7 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) static void ip_vs_service_free(struct ip_vs_service *svc) { - free_percpu(svc->stats.cpustats); + ip_vs_stats_release(&svc->stats); kfree(svc); } @@ -782,7 +782,7 @@ static void ip_vs_dest_rcu_free(struct rcu_head *head) struct ip_vs_dest *dest; dest = container_of(head, struct ip_vs_dest, rcu_head); - free_percpu(dest->stats.cpustats); + ip_vs_stats_release(&dest->stats); ip_vs_dest_put_and_free(dest); } @@ -822,7 +822,7 @@ static void ip_vs_stats_rcu_free(struct rcu_head *head) struct ip_vs_stats_rcu, rcu_head); - free_percpu(rs->s.cpustats); + ip_vs_stats_release(&rs->s); kfree(rs); } @@ -879,6 +879,47 @@ ip_vs_zero_stats(struct ip_vs_stats *stats) spin_unlock_bh(&stats->lock); } +/* Allocate fields after kzalloc */ +int ip_vs_stats_init_alloc(struct ip_vs_stats *s) +{ + int i; + + spin_lock_init(&s->lock); + s->cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!s->cpustats) + return -ENOMEM; + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *cs = per_cpu_ptr(s->cpustats, i); + + u64_stats_init(&cs->syncp); + } + return 0; +} + +struct ip_vs_stats *ip_vs_stats_alloc(void) +{ + struct ip_vs_stats *s = kzalloc(sizeof(*s), GFP_KERNEL); + + if (s && ip_vs_stats_init_alloc(s) >= 0) + return s; + kfree(s); + return NULL; +} + +void ip_vs_stats_release(struct ip_vs_stats *stats) +{ + free_percpu(stats->cpustats); +} + +void ip_vs_stats_free(struct ip_vs_stats *stats) +{ + if (stats) { + ip_vs_stats_release(stats); + kfree(stats); + } +} + /* * Update a destination in the given service */ @@ -978,14 +1019,13 @@ static int ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; - unsigned int atype, i; + unsigned int atype; + int ret; EnterFunction(2); #ifdef CONFIG_IP_VS_IPV6 if (udest->af == AF_INET6) { - int ret; - atype = ipv6_addr_type(&udest->addr.in6); if ((!(atype & IPV6_ADDR_UNICAST) || atype & IPV6_ADDR_LINKLOCAL) && @@ -1007,16 +1047,10 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) if (dest == NULL) return -ENOMEM; - dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (!dest->stats.cpustats) + ret = ip_vs_stats_init_alloc(&dest->stats); + if (ret < 0) goto err_alloc; - for_each_possible_cpu(i) { - struct ip_vs_cpu_stats *ip_vs_dest_stats; - ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i); - u64_stats_init(&ip_vs_dest_stats->syncp); - } - dest->af = udest->af; dest->protocol = svc->protocol; dest->vaddr = svc->addr; @@ -1032,7 +1066,6 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) INIT_HLIST_NODE(&dest->d_list); spin_lock_init(&dest->dst_lock); - spin_lock_init(&dest->stats.lock); __ip_vs_update_dest(svc, dest, udest, 1); LeaveFunction(2); @@ -1040,7 +1073,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) err_alloc: kfree(dest); - return -ENOMEM; + return ret; } @@ -1299,7 +1332,7 @@ static int ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_service **svc_p) { - int ret = 0, i; + int ret = 0; struct ip_vs_scheduler *sched = NULL; struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; @@ -1359,18 +1392,9 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, ret = -ENOMEM; goto out_err; } - svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (!svc->stats.cpustats) { - ret = -ENOMEM; + ret = ip_vs_stats_init_alloc(&svc->stats); + if (ret < 0) goto out_err; - } - - for_each_possible_cpu(i) { - struct ip_vs_cpu_stats *ip_vs_stats; - ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i); - u64_stats_init(&ip_vs_stats->syncp); - } - /* I'm the first user of the service */ atomic_set(&svc->refcnt, 0); @@ -1387,7 +1411,6 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, INIT_LIST_HEAD(&svc->destinations); spin_lock_init(&svc->sched_lock); - spin_lock_init(&svc->stats.lock); /* Bind the scheduler */ if (sched) { @@ -4166,7 +4189,7 @@ static struct notifier_block ip_vs_dst_notifier = { int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) { - int i, idx; + int idx; /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) @@ -4183,18 +4206,9 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) ipvs->tot_stats = kzalloc(sizeof(*ipvs->tot_stats), GFP_KERNEL); if (!ipvs->tot_stats) return -ENOMEM; - ipvs->tot_stats->s.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (!ipvs->tot_stats->s.cpustats) + if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0) goto err_tot_stats; - for_each_possible_cpu(i) { - struct ip_vs_cpu_stats *ipvs_tot_stats; - ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats->s.cpustats, i); - u64_stats_init(&ipvs_tot_stats->syncp); - } - - spin_lock_init(&ipvs->tot_stats->s.lock); - #ifdef CONFIG_PROC_FS if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net, &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter))) @@ -4225,7 +4239,7 @@ err_stats: err_vs: #endif - free_percpu(ipvs->tot_stats->s.cpustats); + ip_vs_stats_release(&ipvs->tot_stats->s); err_tot_stats: kfree(ipvs->tot_stats); -- cgit v1.2.3-70-g09d2 From 1dbd8d9a82e3f26b9d063292d47ece673f48fce2 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Nov 2022 18:46:01 +0200 Subject: ipvs: use u64_stats_t for the per-cpu counters Use the provided u64_stats_t type to avoid load/store tearing. Fixes: 316580b69d0a ("u64_stats: provide u64_stats_t type") Signed-off-by: Julian Anastasov Cc: yunhong-cgl jiang Cc: "dust.li" Reviewed-by: Jiri Wiesner Tested-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 10 +++++----- net/netfilter/ipvs/ip_vs_core.c | 30 +++++++++++++++--------------- net/netfilter/ipvs/ip_vs_ctl.c | 10 +++++----- net/netfilter/ipvs/ip_vs_est.c | 20 ++++++++++---------- 4 files changed, 35 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index e5582c01a4a3..a4d44138c2a8 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -351,11 +351,11 @@ struct ip_vs_seq { /* counters per cpu */ struct ip_vs_counters { - __u64 conns; /* connections scheduled */ - __u64 inpkts; /* incoming packets */ - __u64 outpkts; /* outgoing packets */ - __u64 inbytes; /* incoming bytes */ - __u64 outbytes; /* outgoing bytes */ + u64_stats_t conns; /* connections scheduled */ + u64_stats_t inpkts; /* incoming packets */ + u64_stats_t outpkts; /* outgoing packets */ + u64_stats_t inbytes; /* incoming bytes */ + u64_stats_t outbytes; /* outgoing bytes */ }; /* Stats per cpu */ struct ip_vs_cpu_stats { diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index fcdaef1fcccf..2fcc26507d69 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -132,21 +132,21 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s = this_cpu_ptr(dest->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.inpkts++; - s->cnt.inbytes += skb->len; + u64_stats_inc(&s->cnt.inpkts); + u64_stats_add(&s->cnt.inbytes, skb->len); u64_stats_update_end(&s->syncp); svc = rcu_dereference(dest->svc); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.inpkts++; - s->cnt.inbytes += skb->len; + u64_stats_inc(&s->cnt.inpkts); + u64_stats_add(&s->cnt.inbytes, skb->len); u64_stats_update_end(&s->syncp); s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.inpkts++; - s->cnt.inbytes += skb->len; + u64_stats_inc(&s->cnt.inpkts); + u64_stats_add(&s->cnt.inbytes, skb->len); u64_stats_update_end(&s->syncp); local_bh_enable(); @@ -168,21 +168,21 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s = this_cpu_ptr(dest->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.outpkts++; - s->cnt.outbytes += skb->len; + u64_stats_inc(&s->cnt.outpkts); + u64_stats_add(&s->cnt.outbytes, skb->len); u64_stats_update_end(&s->syncp); svc = rcu_dereference(dest->svc); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.outpkts++; - s->cnt.outbytes += skb->len; + u64_stats_inc(&s->cnt.outpkts); + u64_stats_add(&s->cnt.outbytes, skb->len); u64_stats_update_end(&s->syncp); s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.outpkts++; - s->cnt.outbytes += skb->len; + u64_stats_inc(&s->cnt.outpkts); + u64_stats_add(&s->cnt.outbytes, skb->len); u64_stats_update_end(&s->syncp); local_bh_enable(); @@ -200,17 +200,17 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) s = this_cpu_ptr(cp->dest->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.conns++; + u64_stats_inc(&s->cnt.conns); u64_stats_update_end(&s->syncp); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.conns++; + u64_stats_inc(&s->cnt.conns); u64_stats_update_end(&s->syncp); s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.conns++; + u64_stats_inc(&s->cnt.conns); u64_stats_update_end(&s->syncp); local_bh_enable(); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index ec6db864ac36..5f9cc2e7ba71 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2335,11 +2335,11 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) do { start = u64_stats_fetch_begin(&u->syncp); - conns = u->cnt.conns; - inpkts = u->cnt.inpkts; - outpkts = u->cnt.outpkts; - inbytes = u->cnt.inbytes; - outbytes = u->cnt.outbytes; + conns = u64_stats_read(&u->cnt.conns); + inpkts = u64_stats_read(&u->cnt.inpkts); + outpkts = u64_stats_read(&u->cnt.outpkts); + inbytes = u64_stats_read(&u->cnt.inbytes); + outbytes = u64_stats_read(&u->cnt.outbytes); } while (u64_stats_fetch_retry(&u->syncp, start)); seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n", diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 9a1a7af6a186..f53150d82a92 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -67,11 +67,11 @@ static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum, if (add) { do { start = u64_stats_fetch_begin(&s->syncp); - conns = s->cnt.conns; - inpkts = s->cnt.inpkts; - outpkts = s->cnt.outpkts; - inbytes = s->cnt.inbytes; - outbytes = s->cnt.outbytes; + conns = u64_stats_read(&s->cnt.conns); + inpkts = u64_stats_read(&s->cnt.inpkts); + outpkts = u64_stats_read(&s->cnt.outpkts); + inbytes = u64_stats_read(&s->cnt.inbytes); + outbytes = u64_stats_read(&s->cnt.outbytes); } while (u64_stats_fetch_retry(&s->syncp, start)); sum->conns += conns; sum->inpkts += inpkts; @@ -82,11 +82,11 @@ static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum, add = true; do { start = u64_stats_fetch_begin(&s->syncp); - sum->conns = s->cnt.conns; - sum->inpkts = s->cnt.inpkts; - sum->outpkts = s->cnt.outpkts; - sum->inbytes = s->cnt.inbytes; - sum->outbytes = s->cnt.outbytes; + sum->conns = u64_stats_read(&s->cnt.conns); + sum->inpkts = u64_stats_read(&s->cnt.inpkts); + sum->outpkts = u64_stats_read(&s->cnt.outpkts); + sum->inbytes = u64_stats_read(&s->cnt.inbytes); + sum->outbytes = u64_stats_read(&s->cnt.outbytes); } while (u64_stats_fetch_retry(&s->syncp, start)); } } -- cgit v1.2.3-70-g09d2 From 705dd34440812735ece298eb5bc153fde9544d42 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Nov 2022 18:46:02 +0200 Subject: ipvs: use kthreads for stats estimation Estimating all entries in single list in timer context by single CPU causes large latency with multiple IPVS rules as reported in [1], [2], [3]. Spread the estimator structures in multiple chains and use kthread(s) for the estimation. The chains are processed in multiple (50) timer ticks to ensure the 2-second interval between estimations with some accuracy. Every chain is processed under RCU lock. Every kthread works over its own data structure and all such contexts are attached to array. The contexts can be preserved while the kthread tasks are stopped or restarted. When estimators are removed, unused kthread contexts are released and the slots in array are left empty. First kthread determines parameters to use, eg. maximum number of estimators to process per kthread based on chain's length (chain_max), allowing sub-100us cond_resched rate and estimation taking up to 1/8 of the CPU capacity to avoid any problems if chain_max is not correctly calculated. chain_max is calculated taking into account factors such as CPU speed and memory/cache speed where the cache_factor (4) is selected from real tests with current generation of CPU/NUMA configurations to correct the difference in CPU usage between cached (during calc phase) and non-cached (working) state of the estimated per-cpu data. First kthread also plays the role of distributor of added estimators to all kthreads, keeping low the time to add estimators. The optimization is based on the fact that newly added estimator should be estimated after 2 seconds, so we have the time to offload the adding to chain from controlling process to kthread 0. The allocated kthread context may grow from 1 to 50 allocated structures for timer ticks which saves memory for setups with small number of estimators. We also add delayed work est_reload_work that will make sure the kthread tasks are properly started/stopped. ip_vs_start_estimator() is changed to report errors which allows to safely store the estimators in allocated structures. Many thanks to Jiri Wiesner for his valuable comments and for spending a lot of time reviewing and testing the changes on different platforms with 48-256 CPUs and 1-8 NUMA nodes under different cpufreq governors. [1] Report from Yunhong Jiang: https://lore.kernel.org/netdev/D25792C1-1B89-45DE-9F10-EC350DC04ADC@gmail.com/ [2] https://marc.info/?l=linux-virtual-server&m=159679809118027&w=2 [3] Report from Dust: https://archive.linuxvirtualserver.org/html/lvs-devel/2020-12/msg00000.html Signed-off-by: Julian Anastasov Cc: yunhong-cgl jiang Cc: "dust.li" Reviewed-by: Jiri Wiesner Tested-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 88 ++++- net/netfilter/ipvs/ip_vs_ctl.c | 126 ++++-- net/netfilter/ipvs/ip_vs_est.c | 876 +++++++++++++++++++++++++++++++++++++---- 3 files changed, 990 insertions(+), 100 deletions(-) (limited to 'net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index a4d44138c2a8..04960dc6228f 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -42,6 +42,8 @@ static inline struct netns_ipvs *net_ipvs(struct net* net) /* Connections' size value needed by ip_vs_ctl.c */ extern int ip_vs_conn_tab_size; +extern struct mutex __ip_vs_mutex; + struct ip_vs_iphdr { int hdr_flags; /* ipvs flags */ __u32 off; /* Where IP or IPv4 header starts */ @@ -365,7 +367,7 @@ struct ip_vs_cpu_stats { /* IPVS statistics objects */ struct ip_vs_estimator { - struct list_head list; + struct hlist_node list; u64 last_inbytes; u64 last_outbytes; @@ -378,6 +380,10 @@ struct ip_vs_estimator { u64 outpps; u64 inbps; u64 outbps; + + s32 ktid:16, /* kthread ID, -1=temp list */ + ktrow:8, /* row/tick ID for kthread */ + ktcid:8; /* chain ID for kthread tick */ }; /* @@ -415,6 +421,66 @@ struct ip_vs_stats *ip_vs_stats_alloc(void); void ip_vs_stats_release(struct ip_vs_stats *stats); void ip_vs_stats_free(struct ip_vs_stats *stats); +/* Process estimators in multiple timer ticks (20/50/100, see ktrow) */ +#define IPVS_EST_NTICKS 50 +/* Estimation uses a 2-second period containing ticks (in jiffies) */ +#define IPVS_EST_TICK ((2 * HZ) / IPVS_EST_NTICKS) + +/* Limit of CPU load per kthread (8 for 12.5%), ratio of CPU capacity (1/C). + * Value of 4 and above ensures kthreads will take work without exceeding + * the CPU capacity under different circumstances. + */ +#define IPVS_EST_LOAD_DIVISOR 8 + +/* Kthreads should not have work that exceeds the CPU load above 50% */ +#define IPVS_EST_CPU_KTHREADS (IPVS_EST_LOAD_DIVISOR / 2) + +/* Desired number of chains per timer tick (chain load factor in 100us units), + * 48=4.8ms of 40ms tick (12% CPU usage): + * 2 sec * 1000 ms in sec * 10 (100us in ms) / 8 (12.5%) / 50 + */ +#define IPVS_EST_CHAIN_FACTOR \ + ALIGN_DOWN(2 * 1000 * 10 / IPVS_EST_LOAD_DIVISOR / IPVS_EST_NTICKS, 8) + +/* Compiled number of chains per tick + * The defines should match cond_resched_rcu + */ +#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) +#define IPVS_EST_TICK_CHAINS IPVS_EST_CHAIN_FACTOR +#else +#define IPVS_EST_TICK_CHAINS 1 +#endif + +#if IPVS_EST_NTICKS > 127 +#error Too many timer ticks for ktrow +#endif + +/* Multiple chains processed in same tick */ +struct ip_vs_est_tick_data { + struct hlist_head chains[IPVS_EST_TICK_CHAINS]; + DECLARE_BITMAP(present, IPVS_EST_TICK_CHAINS); + DECLARE_BITMAP(full, IPVS_EST_TICK_CHAINS); + int chain_len[IPVS_EST_TICK_CHAINS]; +}; + +/* Context for estimation kthread */ +struct ip_vs_est_kt_data { + struct netns_ipvs *ipvs; + struct task_struct *task; /* task if running */ + struct ip_vs_est_tick_data __rcu *ticks[IPVS_EST_NTICKS]; + DECLARE_BITMAP(avail, IPVS_EST_NTICKS); /* tick has space for ests */ + unsigned long est_timer; /* estimation timer (jiffies) */ + struct ip_vs_stats *calc_stats; /* Used for calculation */ + int tick_len[IPVS_EST_NTICKS]; /* est count */ + int id; /* ktid per netns */ + int chain_max; /* max ests per tick chain */ + int tick_max; /* max ests per tick */ + int est_count; /* attached ests to kthread */ + int est_max_count; /* max ests per kthread */ + int add_row; /* row for new ests */ + int est_row; /* estimated row */ +}; + struct dst_entry; struct iphdr; struct ip_vs_conn; @@ -953,9 +1019,17 @@ struct netns_ipvs { struct ctl_table_header *lblcr_ctl_header; struct ctl_table *lblcr_ctl_table; /* ip_vs_est */ - struct list_head est_list; /* estimator list */ - spinlock_t est_lock; - struct timer_list est_timer; /* Estimation timer */ + struct delayed_work est_reload_work;/* Reload kthread tasks */ + struct mutex est_mutex; /* protect kthread tasks */ + struct hlist_head est_temp_list; /* Ests during calc phase */ + struct ip_vs_est_kt_data **est_kt_arr; /* Array of kthread data ptrs */ + unsigned long est_max_threads;/* Hard limit of kthreads */ + int est_calc_phase; /* Calculation phase */ + int est_chain_max; /* Calculated chain_max */ + int est_kt_count; /* Allocated ptrs */ + int est_add_ktid; /* ktid where to add ests */ + atomic_t est_genid; /* kthreads reload genid */ + atomic_t est_genid_done; /* applied genid */ /* ip_vs_sync */ spinlock_t sync_lock; struct ipvs_master_sync_state *ms; @@ -1486,10 +1560,14 @@ int stop_sync_thread(struct netns_ipvs *ipvs, int state); void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts); /* IPVS rate estimator prototypes (from ip_vs_est.c) */ -void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); +int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_zero_estimator(struct ip_vs_stats *stats); void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats); +void ip_vs_est_reload_start(struct netns_ipvs *ipvs); +int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, + struct ip_vs_est_kt_data *kd); +void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd); /* Various IPVS packet transmitters (from ip_vs_xmit.c) */ int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 5f9cc2e7ba71..c41a5392edc9 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -49,8 +49,7 @@ MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME); -/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ -static DEFINE_MUTEX(__ip_vs_mutex); +DEFINE_MUTEX(__ip_vs_mutex); /* Serialize configuration with sockopt/netlink */ /* sysctl variables */ @@ -241,6 +240,47 @@ static void defense_work_handler(struct work_struct *work) } #endif +static void est_reload_work_handler(struct work_struct *work) +{ + struct netns_ipvs *ipvs = + container_of(work, struct netns_ipvs, est_reload_work.work); + int genid_done = atomic_read(&ipvs->est_genid_done); + unsigned long delay = HZ / 10; /* repeat startups after failure */ + bool repeat = false; + int genid; + int id; + + mutex_lock(&ipvs->est_mutex); + genid = atomic_read(&ipvs->est_genid); + for (id = 0; id < ipvs->est_kt_count; id++) { + struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id]; + + /* netns clean up started, abort delayed work */ + if (!ipvs->enable) + goto unlock; + if (!kd) + continue; + /* New config ? Stop kthread tasks */ + if (genid != genid_done) + ip_vs_est_kthread_stop(kd); + if (!kd->task) { + /* Do not start kthreads above 0 in calc phase */ + if ((!id || !ipvs->est_calc_phase) && + ip_vs_est_kthread_start(ipvs, kd) < 0) + repeat = true; + } + } + + atomic_set(&ipvs->est_genid_done, genid); + + if (repeat) + queue_delayed_work(system_long_wq, &ipvs->est_reload_work, + delay); + +unlock: + mutex_unlock(&ipvs->est_mutex); +} + int ip_vs_use_count_inc(void) { @@ -831,7 +871,7 @@ ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src) { #define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c - spin_lock_bh(&src->lock); + spin_lock(&src->lock); IP_VS_SHOW_STATS_COUNTER(conns); IP_VS_SHOW_STATS_COUNTER(inpkts); @@ -841,7 +881,7 @@ ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src) ip_vs_read_estimator(dst, src); - spin_unlock_bh(&src->lock); + spin_unlock(&src->lock); } static void @@ -862,7 +902,7 @@ ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src) static void ip_vs_zero_stats(struct ip_vs_stats *stats) { - spin_lock_bh(&stats->lock); + spin_lock(&stats->lock); /* get current counters as zero point, rates are zeroed */ @@ -876,7 +916,7 @@ ip_vs_zero_stats(struct ip_vs_stats *stats) ip_vs_zero_estimator(stats); - spin_unlock_bh(&stats->lock); + spin_unlock(&stats->lock); } /* Allocate fields after kzalloc */ @@ -998,7 +1038,6 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, spin_unlock_bh(&dest->dst_lock); if (add) { - ip_vs_start_estimator(svc->ipvs, &dest->stats); list_add_rcu(&dest->n_list, &svc->destinations); svc->num_dests++; sched = rcu_dereference_protected(svc->scheduler, 1); @@ -1051,6 +1090,10 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) if (ret < 0) goto err_alloc; + ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); + if (ret < 0) + goto err_stats; + dest->af = udest->af; dest->protocol = svc->protocol; dest->vaddr = svc->addr; @@ -1071,6 +1114,9 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) LeaveFunction(2); return 0; +err_stats: + ip_vs_stats_release(&dest->stats); + err_alloc: kfree(dest); return ret; @@ -1135,14 +1181,18 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) IP_VS_DBG_ADDR(svc->af, &dest->vaddr), ntohs(dest->vport)); + ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); + if (ret < 0) + goto err; __ip_vs_update_dest(svc, dest, udest, 1); - ret = 0; } else { /* * Allocate and initialize the dest structure */ ret = ip_vs_new_dest(svc, udest); } + +err: LeaveFunction(2); return ret; @@ -1420,6 +1470,10 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, sched = NULL; } + ret = ip_vs_start_estimator(ipvs, &svc->stats); + if (ret < 0) + goto out_err; + /* Bind the ct retriever */ RCU_INIT_POINTER(svc->pe, pe); pe = NULL; @@ -1432,8 +1486,6 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, if (svc->pe && svc->pe->conn_out) atomic_inc(&ipvs->conn_out_counter); - ip_vs_start_estimator(ipvs, &svc->stats); - /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) ipvs->num_services++; @@ -1444,8 +1496,15 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, ip_vs_svc_hash(svc); *svc_p = svc; - /* Now there is a service - full throttle */ - ipvs->enable = 1; + + if (!ipvs->enable) { + /* Now there is a service - full throttle */ + ipvs->enable = 1; + + /* Start estimation for first time */ + ip_vs_est_reload_start(ipvs); + } + return 0; @@ -4065,13 +4124,16 @@ static void ip_vs_genl_unregister(void) static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { struct net *net = ipvs->net; - int idx; struct ctl_table *tbl; + int idx, ret; atomic_set(&ipvs->dropentry, 0); spin_lock_init(&ipvs->dropentry_lock); spin_lock_init(&ipvs->droppacket_lock); spin_lock_init(&ipvs->securetcp_lock); + INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); + INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work, + expire_nodest_conn_handler); if (!net_eq(net, &init_net)) { tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); @@ -4139,24 +4201,27 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl[idx++].mode = 0444; #endif + ret = -ENOMEM; ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); - if (ipvs->sysctl_hdr == NULL) { - if (!net_eq(net, &init_net)) - kfree(tbl); - return -ENOMEM; - } + if (!ipvs->sysctl_hdr) + goto err; ipvs->sysctl_tbl = tbl; + + ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s); + if (ret < 0) + goto err; + /* Schedule defense work */ - INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); queue_delayed_work(system_long_wq, &ipvs->defense_work, DEFENSE_TIMER_PERIOD); - /* Init delayed work for expiring no dest conn */ - INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work, - expire_nodest_conn_handler); - - ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s); return 0; + +err: + unregister_net_sysctl_table(ipvs->sysctl_hdr); + if (!net_eq(net, &init_net)) + kfree(tbl); + return ret; } static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) @@ -4189,6 +4254,7 @@ static struct notifier_block ip_vs_dst_notifier = { int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) { + int ret = -ENOMEM; int idx; /* Initialize rs_table */ @@ -4202,10 +4268,12 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) atomic_set(&ipvs->nullsvc_counter, 0); atomic_set(&ipvs->conn_out_counter, 0); + INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler); + /* procfs stats */ ipvs->tot_stats = kzalloc(sizeof(*ipvs->tot_stats), GFP_KERNEL); if (!ipvs->tot_stats) - return -ENOMEM; + goto out; if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0) goto err_tot_stats; @@ -4222,7 +4290,8 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) goto err_percpu; #endif - if (ip_vs_control_net_init_sysctl(ipvs)) + ret = ip_vs_control_net_init_sysctl(ipvs); + if (ret < 0) goto err; return 0; @@ -4243,13 +4312,16 @@ err_vs: err_tot_stats: kfree(ipvs->tot_stats); - return -ENOMEM; + +out: + return ret; } void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) { ip_vs_trash_cleanup(ipvs); ip_vs_control_net_cleanup_sysctl(ipvs); + cancel_delayed_work_sync(&ipvs->est_reload_work); #ifdef CONFIG_PROC_FS remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index f53150d82a92..2fb6c097437c 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -30,9 +30,6 @@ long interval, it is easy to implement a user level daemon which periodically reads those statistical counters and measure rate. - Currently, the measurement is activated by slow timer handler. Hope - this measurement will not introduce too much load. - We measure rate during the last 8 seconds every 2 seconds: avgrate = avgrate*(1-W) + rate*W @@ -47,68 +44,76 @@ to 32-bit values for conns, packets, bps, cps and pps. * A lot of code is taken from net/core/gen_estimator.c - */ - -/* - * Make a summary from each cpu + KEY POINTS: + - cpustats counters are updated per-cpu in SoftIRQ context with BH disabled + - kthreads read the cpustats to update the estimators (svcs, dests, total) + - the states of estimators can be read (get stats) or modified (zero stats) + from processes + + KTHREADS: + - estimators are added initially to est_temp_list and later kthread 0 + distributes them to one or many kthreads for estimation + - kthread contexts are created and attached to array + - the kthread tasks are started when first service is added, before that + the total stats are not estimated + - the kthread context holds lists with estimators (chains) which are + processed every 2 seconds + - as estimators can be added dynamically and in bursts, we try to spread + them to multiple chains which are estimated at different time + - on start, kthread 0 enters calculation phase to determine the chain limits + and the limit of estimators per kthread + - est_add_ktid: ktid where to add new ests, can point to empty slot where + we should add kt data */ -static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum, - struct ip_vs_cpu_stats __percpu *stats) -{ - int i; - bool add = false; - - for_each_possible_cpu(i) { - struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i); - unsigned int start; - u64 conns, inpkts, outpkts, inbytes, outbytes; - if (add) { - do { - start = u64_stats_fetch_begin(&s->syncp); - conns = u64_stats_read(&s->cnt.conns); - inpkts = u64_stats_read(&s->cnt.inpkts); - outpkts = u64_stats_read(&s->cnt.outpkts); - inbytes = u64_stats_read(&s->cnt.inbytes); - outbytes = u64_stats_read(&s->cnt.outbytes); - } while (u64_stats_fetch_retry(&s->syncp, start)); - sum->conns += conns; - sum->inpkts += inpkts; - sum->outpkts += outpkts; - sum->inbytes += inbytes; - sum->outbytes += outbytes; - } else { - add = true; - do { - start = u64_stats_fetch_begin(&s->syncp); - sum->conns = u64_stats_read(&s->cnt.conns); - sum->inpkts = u64_stats_read(&s->cnt.inpkts); - sum->outpkts = u64_stats_read(&s->cnt.outpkts); - sum->inbytes = u64_stats_read(&s->cnt.inbytes); - sum->outbytes = u64_stats_read(&s->cnt.outbytes); - } while (u64_stats_fetch_retry(&s->syncp, start)); - } - } -} +static struct lock_class_key __ipvs_est_key; +static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs); +static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs); -static void estimation_timer(struct timer_list *t) +static void ip_vs_chain_estimation(struct hlist_head *chain) { struct ip_vs_estimator *e; + struct ip_vs_cpu_stats *c; struct ip_vs_stats *s; u64 rate; - struct netns_ipvs *ipvs = from_timer(ipvs, t, est_timer); - if (!sysctl_run_estimation(ipvs)) - goto skip; + hlist_for_each_entry_rcu(e, chain, list) { + u64 conns, inpkts, outpkts, inbytes, outbytes; + u64 kconns = 0, kinpkts = 0, koutpkts = 0; + u64 kinbytes = 0, koutbytes = 0; + unsigned int start; + int i; + + if (kthread_should_stop()) + break; - spin_lock(&ipvs->est_lock); - list_for_each_entry(e, &ipvs->est_list, list) { s = container_of(e, struct ip_vs_stats, est); + for_each_possible_cpu(i) { + c = per_cpu_ptr(s->cpustats, i); + do { + start = u64_stats_fetch_begin(&c->syncp); + conns = u64_stats_read(&c->cnt.conns); + inpkts = u64_stats_read(&c->cnt.inpkts); + outpkts = u64_stats_read(&c->cnt.outpkts); + inbytes = u64_stats_read(&c->cnt.inbytes); + outbytes = u64_stats_read(&c->cnt.outbytes); + } while (u64_stats_fetch_retry(&c->syncp, start)); + kconns += conns; + kinpkts += inpkts; + koutpkts += outpkts; + kinbytes += inbytes; + koutbytes += outbytes; + } spin_lock(&s->lock); - ip_vs_read_cpu_stats(&s->kstats, s->cpustats); + + s->kstats.conns = kconns; + s->kstats.inpkts = kinpkts; + s->kstats.outpkts = koutpkts; + s->kstats.inbytes = kinbytes; + s->kstats.outbytes = koutbytes; /* scaled by 2^10, but divided 2 seconds */ rate = (s->kstats.conns - e->last_conns) << 9; @@ -133,30 +138,754 @@ static void estimation_timer(struct timer_list *t) e->outbps += ((s64)rate - (s64)e->outbps) >> 2; spin_unlock(&s->lock); } - spin_unlock(&ipvs->est_lock); +} -skip: - mod_timer(&ipvs->est_timer, jiffies + 2*HZ); +static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row) +{ + struct ip_vs_est_tick_data *td; + int cid; + + rcu_read_lock(); + td = rcu_dereference(kd->ticks[row]); + if (!td) + goto out; + for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) { + if (kthread_should_stop()) + break; + ip_vs_chain_estimation(&td->chains[cid]); + cond_resched_rcu(); + td = rcu_dereference(kd->ticks[row]); + if (!td) + break; + } + +out: + rcu_read_unlock(); } -void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) +static int ip_vs_estimation_kthread(void *data) { - struct ip_vs_estimator *est = &stats->est; + struct ip_vs_est_kt_data *kd = data; + struct netns_ipvs *ipvs = kd->ipvs; + int row = kd->est_row; + unsigned long now; + int id = kd->id; + long gap; + + if (id > 0) { + if (!ipvs->est_chain_max) + return 0; + } else { + if (!ipvs->est_chain_max) { + ipvs->est_calc_phase = 1; + /* commit est_calc_phase before reading est_genid */ + smp_mb(); + } + + /* kthread 0 will handle the calc phase */ + if (ipvs->est_calc_phase) + ip_vs_est_calc_phase(ipvs); + } + + while (1) { + if (!id && !hlist_empty(&ipvs->est_temp_list)) + ip_vs_est_drain_temp_list(ipvs); + set_current_state(TASK_IDLE); + if (kthread_should_stop()) + break; + + /* before estimation, check if we should sleep */ + now = jiffies; + gap = kd->est_timer - now; + if (gap > 0) { + if (gap > IPVS_EST_TICK) { + kd->est_timer = now - IPVS_EST_TICK; + gap = IPVS_EST_TICK; + } + schedule_timeout(gap); + } else { + __set_current_state(TASK_RUNNING); + if (gap < -8 * IPVS_EST_TICK) + kd->est_timer = now; + } + + if (sysctl_run_estimation(ipvs) && kd->tick_len[row]) + ip_vs_tick_estimation(kd, row); + + row++; + if (row >= IPVS_EST_NTICKS) + row = 0; + WRITE_ONCE(kd->est_row, row); + kd->est_timer += IPVS_EST_TICK; + } + __set_current_state(TASK_RUNNING); + + return 0; +} + +/* Schedule stop/start for kthread tasks */ +void ip_vs_est_reload_start(struct netns_ipvs *ipvs) +{ + /* Ignore reloads before first service is added */ + if (!ipvs->enable) + return; + /* Bump the kthread configuration genid */ + atomic_inc(&ipvs->est_genid); + queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0); +} + +/* Start kthread task with current configuration */ +int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, + struct ip_vs_est_kt_data *kd) +{ + unsigned long now; + int ret = 0; + long gap; + + lockdep_assert_held(&ipvs->est_mutex); + + if (kd->task) + goto out; + now = jiffies; + gap = kd->est_timer - now; + /* Sync est_timer if task is starting later */ + if (abs(gap) > 4 * IPVS_EST_TICK) + kd->est_timer = now; + kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d", + ipvs->gen, kd->id); + if (IS_ERR(kd->task)) { + ret = PTR_ERR(kd->task); + kd->task = NULL; + goto out; + } + + pr_info("starting estimator thread %d...\n", kd->id); + wake_up_process(kd->task); + +out: + return ret; +} + +void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd) +{ + if (kd->task) { + pr_info("stopping estimator thread %d...\n", kd->id); + kthread_stop(kd->task); + kd->task = NULL; + } +} + +/* Apply parameters to kthread */ +static void ip_vs_est_set_params(struct netns_ipvs *ipvs, + struct ip_vs_est_kt_data *kd) +{ + kd->chain_max = ipvs->est_chain_max; + /* We are using single chain on RCU preemption */ + if (IPVS_EST_TICK_CHAINS == 1) + kd->chain_max *= IPVS_EST_CHAIN_FACTOR; + kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max; + kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max; +} + +/* Create and start estimation kthread in a free or new array slot */ +static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) +{ + struct ip_vs_est_kt_data *kd = NULL; + int id = ipvs->est_kt_count; + int ret = -ENOMEM; + void *arr = NULL; + int i; + + if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads && + ipvs->enable && ipvs->est_max_threads) + return -EINVAL; + + mutex_lock(&ipvs->est_mutex); + + for (i = 0; i < id; i++) { + if (!ipvs->est_kt_arr[i]) + break; + } + if (i >= id) { + arr = krealloc_array(ipvs->est_kt_arr, id + 1, + sizeof(struct ip_vs_est_kt_data *), + GFP_KERNEL); + if (!arr) + goto out; + ipvs->est_kt_arr = arr; + } else { + id = i; + } + + kd = kzalloc(sizeof(*kd), GFP_KERNEL); + if (!kd) + goto out; + kd->ipvs = ipvs; + bitmap_fill(kd->avail, IPVS_EST_NTICKS); + kd->est_timer = jiffies; + kd->id = id; + ip_vs_est_set_params(ipvs, kd); + + /* Pre-allocate stats used in calc phase */ + if (!id && !kd->calc_stats) { + kd->calc_stats = ip_vs_stats_alloc(); + if (!kd->calc_stats) + goto out; + } + + /* Start kthread tasks only when services are present */ + if (ipvs->enable) { + ret = ip_vs_est_kthread_start(ipvs, kd); + if (ret < 0) + goto out; + } + + if (arr) + ipvs->est_kt_count++; + ipvs->est_kt_arr[id] = kd; + kd = NULL; + /* Use most recent kthread for new ests */ + ipvs->est_add_ktid = id; + ret = 0; + +out: + mutex_unlock(&ipvs->est_mutex); + if (kd) { + ip_vs_stats_free(kd->calc_stats); + kfree(kd); + } + + return ret; +} + +/* Select ktid where to add new ests: available, unused or new slot */ +static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs) +{ + int ktid, best = ipvs->est_kt_count; + struct ip_vs_est_kt_data *kd; + + for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) { + kd = ipvs->est_kt_arr[ktid]; + if (kd) { + if (kd->est_count < kd->est_max_count) { + best = ktid; + break; + } + } else if (ktid < best) { + best = ktid; + } + } + ipvs->est_add_ktid = best; +} + +/* Add estimator to current kthread (est_add_ktid) */ +static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs, + struct ip_vs_estimator *est) +{ + struct ip_vs_est_kt_data *kd = NULL; + struct ip_vs_est_tick_data *td; + int ktid, row, crow, cid, ret; + int delay = est->ktrow; + + BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127, + "Too many chains for ktcid"); + + if (ipvs->est_add_ktid < ipvs->est_kt_count) { + kd = ipvs->est_kt_arr[ipvs->est_add_ktid]; + if (kd) + goto add_est; + } + + ret = ip_vs_est_add_kthread(ipvs); + if (ret < 0) + goto out; + kd = ipvs->est_kt_arr[ipvs->est_add_ktid]; + +add_est: + ktid = kd->id; + /* For small number of estimators prefer to use few ticks, + * otherwise try to add into the last estimated row. + * est_row and add_row point after the row we should use + */ + if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1) + crow = READ_ONCE(kd->est_row); + else + crow = kd->add_row; + crow += delay; + if (crow >= IPVS_EST_NTICKS) + crow -= IPVS_EST_NTICKS; + /* Assume initial delay ? */ + if (delay >= IPVS_EST_NTICKS - 1) { + /* Preserve initial delay or decrease it if no space in tick */ + row = crow; + if (crow < IPVS_EST_NTICKS - 1) { + crow++; + row = find_last_bit(kd->avail, crow); + } + if (row >= crow) + row = find_last_bit(kd->avail, IPVS_EST_NTICKS); + } else { + /* Preserve delay or increase it if no space in tick */ + row = IPVS_EST_NTICKS; + if (crow > 0) + row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow); + if (row >= IPVS_EST_NTICKS) + row = find_first_bit(kd->avail, IPVS_EST_NTICKS); + } + + td = rcu_dereference_protected(kd->ticks[row], 1); + if (!td) { + td = kzalloc(sizeof(*td), GFP_KERNEL); + if (!td) { + ret = -ENOMEM; + goto out; + } + rcu_assign_pointer(kd->ticks[row], td); + } + + cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS); + + kd->est_count++; + kd->tick_len[row]++; + if (!td->chain_len[cid]) + __set_bit(cid, td->present); + td->chain_len[cid]++; + est->ktid = ktid; + est->ktrow = row; + est->ktcid = cid; + hlist_add_head_rcu(&est->list, &td->chains[cid]); + + if (td->chain_len[cid] >= kd->chain_max) { + __set_bit(cid, td->full); + if (kd->tick_len[row] >= kd->tick_max) + __clear_bit(row, kd->avail); + } + + /* Update est_add_ktid to point to first available/empty kt slot */ + if (kd->est_count == kd->est_max_count) + ip_vs_est_update_ktid(ipvs); + + ret = 0; + +out: + return ret; +} - INIT_LIST_HEAD(&est->list); +/* Start estimation for stats */ +int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est = &stats->est; + int ret; + + if (!ipvs->est_max_threads && ipvs->enable) + ipvs->est_max_threads = IPVS_EST_CPU_KTHREADS * + num_possible_cpus(); + + est->ktid = -1; + est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */ + + /* We prefer this code to be short, kthread 0 will requeue the + * estimator to available chain. If tasks are disabled, we + * will not allocate much memory, just for kt 0. + */ + ret = 0; + if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0]) + ret = ip_vs_est_add_kthread(ipvs); + if (ret >= 0) + hlist_add_head(&est->list, &ipvs->est_temp_list); + else + INIT_HLIST_NODE(&est->list); + return ret; +} - spin_lock_bh(&ipvs->est_lock); - list_add(&est->list, &ipvs->est_list); - spin_unlock_bh(&ipvs->est_lock); +static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd) +{ + if (kd) { + if (kd->task) { + pr_info("stop unused estimator thread %d...\n", kd->id); + kthread_stop(kd->task); + } + ip_vs_stats_free(kd->calc_stats); + kfree(kd); + } } +/* Unlink estimator from chain */ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; + struct ip_vs_est_tick_data *td; + struct ip_vs_est_kt_data *kd; + int ktid = est->ktid; + int row = est->ktrow; + int cid = est->ktcid; + + /* Failed to add to chain ? */ + if (hlist_unhashed(&est->list)) + return; + + /* On return, estimator can be freed, dequeue it now */ + + /* In est_temp_list ? */ + if (ktid < 0) { + hlist_del(&est->list); + goto end_kt0; + } + + hlist_del_rcu(&est->list); + kd = ipvs->est_kt_arr[ktid]; + td = rcu_dereference_protected(kd->ticks[row], 1); + __clear_bit(cid, td->full); + td->chain_len[cid]--; + if (!td->chain_len[cid]) + __clear_bit(cid, td->present); + kd->tick_len[row]--; + __set_bit(row, kd->avail); + if (!kd->tick_len[row]) { + RCU_INIT_POINTER(kd->ticks[row], NULL); + kfree_rcu(td); + } + kd->est_count--; + if (kd->est_count) { + /* This kt slot can become available just now, prefer it */ + if (ktid < ipvs->est_add_ktid) + ipvs->est_add_ktid = ktid; + return; + } - spin_lock_bh(&ipvs->est_lock); - list_del(&est->list); - spin_unlock_bh(&ipvs->est_lock); + if (ktid > 0) { + mutex_lock(&ipvs->est_mutex); + ip_vs_est_kthread_destroy(kd); + ipvs->est_kt_arr[ktid] = NULL; + if (ktid == ipvs->est_kt_count - 1) { + ipvs->est_kt_count--; + while (ipvs->est_kt_count > 1 && + !ipvs->est_kt_arr[ipvs->est_kt_count - 1]) + ipvs->est_kt_count--; + } + mutex_unlock(&ipvs->est_mutex); + + /* This slot is now empty, prefer another available kt slot */ + if (ktid == ipvs->est_add_ktid) + ip_vs_est_update_ktid(ipvs); + } + +end_kt0: + /* kt 0 is freed after all other kthreads and chains are empty */ + if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) { + kd = ipvs->est_kt_arr[0]; + if (!kd || !kd->est_count) { + mutex_lock(&ipvs->est_mutex); + if (kd) { + ip_vs_est_kthread_destroy(kd); + ipvs->est_kt_arr[0] = NULL; + } + ipvs->est_kt_count--; + mutex_unlock(&ipvs->est_mutex); + ipvs->est_add_ktid = 0; + } + } +} + +/* Register all ests from est_temp_list to kthreads */ +static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs) +{ + struct ip_vs_estimator *est; + + while (1) { + int max = 16; + + mutex_lock(&__ip_vs_mutex); + + while (max-- > 0) { + est = hlist_entry_safe(ipvs->est_temp_list.first, + struct ip_vs_estimator, list); + if (est) { + if (kthread_should_stop()) + goto unlock; + hlist_del_init(&est->list); + if (ip_vs_enqueue_estimator(ipvs, est) >= 0) + continue; + est->ktid = -1; + hlist_add_head(&est->list, + &ipvs->est_temp_list); + /* Abort, some entries will not be estimated + * until next attempt + */ + } + goto unlock; + } + mutex_unlock(&__ip_vs_mutex); + cond_resched(); + } + +unlock: + mutex_unlock(&__ip_vs_mutex); +} + +/* Calculate limits for all kthreads */ +static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max) +{ + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + struct ip_vs_est_kt_data *kd; + struct hlist_head chain; + struct ip_vs_stats *s; + int cache_factor = 4; + int i, loops, ntest; + s32 min_est = 0; + ktime_t t1, t2; + s64 diff, val; + int max = 8; + int ret = 1; + + INIT_HLIST_HEAD(&chain); + mutex_lock(&__ip_vs_mutex); + kd = ipvs->est_kt_arr[0]; + mutex_unlock(&__ip_vs_mutex); + s = kd ? kd->calc_stats : NULL; + if (!s) + goto out; + hlist_add_head(&s->est.list, &chain); + + loops = 1; + /* Get best result from many tests */ + for (ntest = 0; ntest < 12; ntest++) { + if (!(ntest & 3)) { + /* Wait for cpufreq frequency transition */ + wait_event_idle_timeout(wq, kthread_should_stop(), + HZ / 50); + if (!ipvs->enable || kthread_should_stop()) + goto stop; + } + + local_bh_disable(); + rcu_read_lock(); + + /* Put stats in cache */ + ip_vs_chain_estimation(&chain); + + t1 = ktime_get(); + for (i = loops * cache_factor; i > 0; i--) + ip_vs_chain_estimation(&chain); + t2 = ktime_get(); + + rcu_read_unlock(); + local_bh_enable(); + + if (!ipvs->enable || kthread_should_stop()) + goto stop; + cond_resched(); + + diff = ktime_to_ns(ktime_sub(t2, t1)); + if (diff <= 1 * NSEC_PER_USEC) { + /* Do more loops on low time resolution */ + loops *= 2; + continue; + } + if (diff >= NSEC_PER_SEC) + continue; + val = diff; + do_div(val, loops); + if (!min_est || val < min_est) { + min_est = val; + /* goal: 95usec per chain */ + val = 95 * NSEC_PER_USEC; + if (val >= min_est) { + do_div(val, min_est); + max = (int)val; + } else { + max = 1; + } + } + } + +out: + if (s) + hlist_del_init(&s->est.list); + *chain_max = max; + return ret; + +stop: + ret = 0; + goto out; +} + +/* Calculate the parameters and apply them in context of kt #0 + * ECP: est_calc_phase + * ECM: est_chain_max + * ECP ECM Insert Chain enable Description + * --------------------------------------------------------------------------- + * 0 0 est_temp_list 0 create kt #0 context + * 0 0 est_temp_list 0->1 service added, start kthread #0 task + * 0->1 0 est_temp_list 1 kt task #0 started, enters calc phase + * 1 0 est_temp_list 1 kt #0: determine est_chain_max, + * stop tasks, move ests to est_temp_list + * and free kd for kthreads 1..last + * 1->0 0->N kt chains 1 ests can go to kthreads + * 0 N kt chains 1 drain est_temp_list, create new kthread + * contexts, start tasks, estimate + */ +static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) +{ + int genid = atomic_read(&ipvs->est_genid); + struct ip_vs_est_tick_data *td; + struct ip_vs_est_kt_data *kd; + struct ip_vs_estimator *est; + struct ip_vs_stats *stats; + int id, row, cid, delay; + bool last, last_td; + int chain_max; + int step; + + if (!ip_vs_est_calc_limits(ipvs, &chain_max)) + return; + + mutex_lock(&__ip_vs_mutex); + + /* Stop all other tasks, so that we can immediately move the + * estimators to est_temp_list without RCU grace period + */ + mutex_lock(&ipvs->est_mutex); + for (id = 1; id < ipvs->est_kt_count; id++) { + /* netns clean up started, abort */ + if (!ipvs->enable) + goto unlock2; + kd = ipvs->est_kt_arr[id]; + if (!kd) + continue; + ip_vs_est_kthread_stop(kd); + } + mutex_unlock(&ipvs->est_mutex); + + /* Move all estimators to est_temp_list but carefully, + * all estimators and kthread data can be released while + * we reschedule. Even for kthread 0. + */ + step = 0; + + /* Order entries in est_temp_list in ascending delay, so now + * walk delay(desc), id(desc), cid(asc) + */ + delay = IPVS_EST_NTICKS; + +next_delay: + delay--; + if (delay < 0) + goto end_dequeue; + +last_kt: + /* Destroy contexts backwards */ + id = ipvs->est_kt_count; + +next_kt: + if (!ipvs->enable || kthread_should_stop()) + goto unlock; + id--; + if (id < 0) + goto next_delay; + kd = ipvs->est_kt_arr[id]; + if (!kd) + goto next_kt; + /* kt 0 can exist with empty chains */ + if (!id && kd->est_count <= 1) + goto next_delay; + + row = kd->est_row + delay; + if (row >= IPVS_EST_NTICKS) + row -= IPVS_EST_NTICKS; + td = rcu_dereference_protected(kd->ticks[row], 1); + if (!td) + goto next_kt; + + cid = 0; + +walk_chain: + if (kthread_should_stop()) + goto unlock; + step++; + if (!(step & 63)) { + /* Give chance estimators to be added (to est_temp_list) + * and deleted (releasing kthread contexts) + */ + mutex_unlock(&__ip_vs_mutex); + cond_resched(); + mutex_lock(&__ip_vs_mutex); + + /* Current kt released ? */ + if (id >= ipvs->est_kt_count) + goto last_kt; + if (kd != ipvs->est_kt_arr[id]) + goto next_kt; + /* Current td released ? */ + if (td != rcu_dereference_protected(kd->ticks[row], 1)) + goto next_kt; + /* No fatal changes on the current kd and td */ + } + est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator, + list); + if (!est) { + cid++; + if (cid >= IPVS_EST_TICK_CHAINS) + goto next_kt; + goto walk_chain; + } + /* We can cheat and increase est_count to protect kt 0 context + * from release but we prefer to keep the last estimator + */ + last = kd->est_count <= 1; + /* Do not free kt #0 data */ + if (!id && last) + goto next_delay; + last_td = kd->tick_len[row] <= 1; + stats = container_of(est, struct ip_vs_stats, est); + ip_vs_stop_estimator(ipvs, stats); + /* Tasks are stopped, move without RCU grace period */ + est->ktid = -1; + est->ktrow = row - kd->est_row; + if (est->ktrow < 0) + est->ktrow += IPVS_EST_NTICKS; + hlist_add_head(&est->list, &ipvs->est_temp_list); + /* kd freed ? */ + if (last) + goto next_kt; + /* td freed ? */ + if (last_td) + goto next_kt; + goto walk_chain; + +end_dequeue: + /* All estimators removed while calculating ? */ + if (!ipvs->est_kt_count) + goto unlock; + kd = ipvs->est_kt_arr[0]; + if (!kd) + goto unlock; + kd->add_row = kd->est_row; + ipvs->est_chain_max = chain_max; + ip_vs_est_set_params(ipvs, kd); + + pr_info("using max %d ests per chain, %d per kthread\n", + kd->chain_max, kd->est_max_count); + + /* Try to keep tot_stats in kt0, enqueue it early */ + if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) && + ipvs->tot_stats->s.est.ktid == -1) { + hlist_del(&ipvs->tot_stats->s.est.list); + hlist_add_head(&ipvs->tot_stats->s.est.list, + &ipvs->est_temp_list); + } + + mutex_lock(&ipvs->est_mutex); + + /* We completed the calc phase, new calc phase not requested */ + if (genid == atomic_read(&ipvs->est_genid)) + ipvs->est_calc_phase = 0; + +unlock2: + mutex_unlock(&ipvs->est_mutex); + +unlock: + mutex_unlock(&__ip_vs_mutex); } void ip_vs_zero_estimator(struct ip_vs_stats *stats) @@ -191,14 +920,25 @@ void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats) int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs) { - INIT_LIST_HEAD(&ipvs->est_list); - spin_lock_init(&ipvs->est_lock); - timer_setup(&ipvs->est_timer, estimation_timer, 0); - mod_timer(&ipvs->est_timer, jiffies + 2 * HZ); + INIT_HLIST_HEAD(&ipvs->est_temp_list); + ipvs->est_kt_arr = NULL; + ipvs->est_max_threads = 0; + ipvs->est_calc_phase = 0; + ipvs->est_chain_max = 0; + ipvs->est_kt_count = 0; + ipvs->est_add_ktid = 0; + atomic_set(&ipvs->est_genid, 0); + atomic_set(&ipvs->est_genid_done, 0); + __mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key); return 0; } void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs) { - del_timer_sync(&ipvs->est_timer); + int i; + + for (i = 0; i < ipvs->est_kt_count; i++) + ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]); + kfree(ipvs->est_kt_arr); + mutex_destroy(&ipvs->est_mutex); } -- cgit v1.2.3-70-g09d2 From f0be83d5421718ead31707b6ece34cf77d411c00 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Nov 2022 18:46:03 +0200 Subject: ipvs: add est_cpulist and est_nice sysctl vars Allow the kthreads for stats to be configured for specific cpulist (isolation) and niceness (scheduling priority). Signed-off-by: Julian Anastasov Cc: yunhong-cgl jiang Cc: "dust.li" Reviewed-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- Documentation/networking/ipvs-sysctl.rst | 20 +++++ include/net/ip_vs.h | 58 +++++++++++++ net/netfilter/ipvs/ip_vs_ctl.c | 143 ++++++++++++++++++++++++++++++- net/netfilter/ipvs/ip_vs_est.c | 12 ++- 4 files changed, 229 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ipvs-sysctl.rst b/Documentation/networking/ipvs-sysctl.rst index 387fda80f05f..1b778705d706 100644 --- a/Documentation/networking/ipvs-sysctl.rst +++ b/Documentation/networking/ipvs-sysctl.rst @@ -129,6 +129,26 @@ drop_packet - INTEGER threshold. When the mode 3 is set, the always mode drop rate is controlled by the /proc/sys/net/ipv4/vs/am_droprate. +est_cpulist - CPULIST + Allowed CPUs for estimation kthreads + + Syntax: standard cpulist format + empty list - stop kthread tasks and estimation + default - the system's housekeeping CPUs for kthreads + + Example: + "all": all possible CPUs + "0-N": all possible CPUs, N denotes last CPU number + "0,1-N:1/2": first and all CPUs with odd number + "": empty list + +est_nice - INTEGER + default 0 + Valid range: -20 (more favorable) .. 19 (less favorable) + + Niceness value to use for the estimation kthreads (scheduling + priority) + expire_nodest_conn - BOOLEAN - 0 - disabled (default) - not 0 - enabled diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 04960dc6228f..dc51b5497cf7 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -29,6 +29,7 @@ #include #endif #include /* Netw namespace */ +#include #define IP_VS_HDR_INVERSE 1 #define IP_VS_HDR_ICMP 2 @@ -365,6 +366,9 @@ struct ip_vs_cpu_stats { struct u64_stats_sync syncp; }; +/* Default nice for estimator kthreads */ +#define IPVS_EST_NICE 0 + /* IPVS statistics objects */ struct ip_vs_estimator { struct hlist_node list; @@ -1009,6 +1013,12 @@ struct netns_ipvs { int sysctl_schedule_icmp; int sysctl_ignore_tunneled; int sysctl_run_estimation; +#ifdef CONFIG_SYSCTL + cpumask_var_t sysctl_est_cpulist; /* kthread cpumask */ + int est_cpulist_valid; /* cpulist set */ + int sysctl_est_nice; /* kthread nice */ + int est_stopped; /* stop tasks */ +#endif /* ip_vs_lblc */ int sysctl_lblc_expiration; @@ -1162,6 +1172,19 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) return ipvs->sysctl_run_estimation; } +static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) +{ + if (ipvs->est_cpulist_valid) + return ipvs->sysctl_est_cpulist; + else + return housekeeping_cpumask(HK_TYPE_KTHREAD); +} + +static inline int sysctl_est_nice(struct netns_ipvs *ipvs) +{ + return ipvs->sysctl_est_nice; +} + #else static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) @@ -1259,6 +1282,16 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) return 1; } +static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) +{ + return housekeeping_cpumask(HK_TYPE_KTHREAD); +} + +static inline int sysctl_est_nice(struct netns_ipvs *ipvs) +{ + return IPVS_EST_NICE; +} + #endif /* IPVS core functions @@ -1569,6 +1602,31 @@ int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, struct ip_vs_est_kt_data *kd); void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd); +static inline void ip_vs_est_stopped_recalc(struct netns_ipvs *ipvs) +{ +#ifdef CONFIG_SYSCTL + ipvs->est_stopped = ipvs->est_cpulist_valid && + cpumask_empty(sysctl_est_cpulist(ipvs)); +#endif +} + +static inline bool ip_vs_est_stopped(struct netns_ipvs *ipvs) +{ +#ifdef CONFIG_SYSCTL + return ipvs->est_stopped; +#else + return false; +#endif +} + +static inline int ip_vs_est_max_threads(struct netns_ipvs *ipvs) +{ + unsigned int limit = IPVS_EST_CPU_KTHREADS * + cpumask_weight(sysctl_est_cpulist(ipvs)); + + return max(1U, limit); +} + /* Various IPVS packet transmitters (from ip_vs_xmit.c) */ int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c41a5392edc9..38df3ee655ed 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -263,7 +263,7 @@ static void est_reload_work_handler(struct work_struct *work) /* New config ? Stop kthread tasks */ if (genid != genid_done) ip_vs_est_kthread_stop(kd); - if (!kd->task) { + if (!kd->task && !ip_vs_est_stopped(ipvs)) { /* Do not start kthreads above 0 in calc phase */ if ((!id || !ipvs->est_calc_phase) && ip_vs_est_kthread_start(ipvs, kd) < 0) @@ -1940,6 +1940,122 @@ proc_do_sync_ports(struct ctl_table *table, int write, return rc; } +static int ipvs_proc_est_cpumask_set(struct ctl_table *table, void *buffer) +{ + struct netns_ipvs *ipvs = table->extra2; + cpumask_var_t *valp = table->data; + cpumask_var_t newmask; + int ret; + + if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) + return -ENOMEM; + + ret = cpulist_parse(buffer, newmask); + if (ret) + goto out; + + mutex_lock(&ipvs->est_mutex); + + if (!ipvs->est_cpulist_valid) { + if (!zalloc_cpumask_var(valp, GFP_KERNEL)) { + ret = -ENOMEM; + goto unlock; + } + ipvs->est_cpulist_valid = 1; + } + cpumask_and(newmask, newmask, ¤t->cpus_mask); + cpumask_copy(*valp, newmask); + /* est_max_threads may depend on cpulist size */ + ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); + ipvs->est_calc_phase = 1; + ip_vs_est_reload_start(ipvs); + +unlock: + mutex_unlock(&ipvs->est_mutex); + +out: + free_cpumask_var(newmask); + return ret; +} + +static int ipvs_proc_est_cpumask_get(struct ctl_table *table, void *buffer, + size_t size) +{ + struct netns_ipvs *ipvs = table->extra2; + cpumask_var_t *valp = table->data; + struct cpumask *mask; + int ret; + + mutex_lock(&ipvs->est_mutex); + + if (ipvs->est_cpulist_valid) + mask = *valp; + else + mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD); + ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask)); + + mutex_unlock(&ipvs->est_mutex); + + return ret; +} + +static int ipvs_proc_est_cpulist(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + /* Ignore both read and write(append) if *ppos not 0 */ + if (*ppos || !*lenp) { + *lenp = 0; + return 0; + } + if (write) { + /* proc_sys_call_handler() appends terminator */ + ret = ipvs_proc_est_cpumask_set(table, buffer); + if (ret >= 0) + *ppos += *lenp; + } else { + /* proc_sys_call_handler() allocates 1 byte for terminator */ + ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1); + if (ret >= 0) { + *lenp = ret; + *ppos += *lenp; + ret = 0; + } + } + return ret; +} + +static int ipvs_proc_est_nice(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct netns_ipvs *ipvs = table->extra2; + int *valp = table->data; + int val = *valp; + int ret; + + struct ctl_table tmp_table = { + .data = &val, + .maxlen = sizeof(int), + .mode = table->mode, + }; + + ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); + if (write && ret >= 0) { + if (val < MIN_NICE || val > MAX_NICE) { + ret = -EINVAL; + } else { + mutex_lock(&ipvs->est_mutex); + if (*valp != val) { + *valp = val; + ip_vs_est_reload_start(ipvs); + } + mutex_unlock(&ipvs->est_mutex); + } + } + return ret; +} + /* * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) * Do not change order or insert new entries without @@ -2116,6 +2232,18 @@ static struct ctl_table vs_vars[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "est_cpulist", + .maxlen = NR_CPUS, /* unused */ + .mode = 0644, + .proc_handler = ipvs_proc_est_cpulist, + }, + { + .procname = "est_nice", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ipvs_proc_est_nice, + }, #ifdef CONFIG_IP_VS_DEBUG { .procname = "debug_level", @@ -4134,6 +4262,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work, expire_nodest_conn_handler); + ipvs->est_stopped = 0; if (!net_eq(net, &init_net)) { tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); @@ -4195,6 +4324,15 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; ipvs->sysctl_run_estimation = 1; tbl[idx++].data = &ipvs->sysctl_run_estimation; + + ipvs->est_cpulist_valid = 0; + tbl[idx].extra2 = ipvs; + tbl[idx++].data = &ipvs->sysctl_est_cpulist; + + ipvs->sysctl_est_nice = IPVS_EST_NICE; + tbl[idx].extra2 = ipvs; + tbl[idx++].data = &ipvs->sysctl_est_nice; + #ifdef CONFIG_IP_VS_DEBUG /* Global sysctls must be ro in non-init netns */ if (!net_eq(net, &init_net)) @@ -4234,6 +4372,9 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) unregister_net_sysctl_table(ipvs->sysctl_hdr); ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); + if (ipvs->est_cpulist_valid) + free_cpumask_var(ipvs->sysctl_est_cpulist); + if (!net_eq(net, &init_net)) kfree(ipvs->sysctl_tbl); } diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 2fb6c097437c..e0f5f5da5b6d 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -57,6 +57,9 @@ - kthread contexts are created and attached to array - the kthread tasks are started when first service is added, before that the total stats are not estimated + - when configuration (cpulist/nice) is changed, the tasks are restarted + by work (est_reload_work) + - kthread tasks are stopped while the cpulist is empty - the kthread context holds lists with estimators (chains) which are processed every 2 seconds - as estimators can be added dynamically and in bursts, we try to spread @@ -229,6 +232,7 @@ void ip_vs_est_reload_start(struct netns_ipvs *ipvs) /* Ignore reloads before first service is added */ if (!ipvs->enable) return; + ip_vs_est_stopped_recalc(ipvs); /* Bump the kthread configuration genid */ atomic_inc(&ipvs->est_genid); queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0); @@ -259,6 +263,9 @@ int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, goto out; } + set_user_nice(kd->task, sysctl_est_nice(ipvs)); + set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs)); + pr_info("starting estimator thread %d...\n", kd->id); wake_up_process(kd->task); @@ -334,7 +341,7 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) } /* Start kthread tasks only when services are present */ - if (ipvs->enable) { + if (ipvs->enable && !ip_vs_est_stopped(ipvs)) { ret = ip_vs_est_kthread_start(ipvs, kd); if (ret < 0) goto out; @@ -478,8 +485,7 @@ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) int ret; if (!ipvs->est_max_threads && ipvs->enable) - ipvs->est_max_threads = IPVS_EST_CPU_KTHREADS * - num_possible_cpus(); + ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); est->ktid = -1; est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */ -- cgit v1.2.3-70-g09d2 From 144361c1949f227df9244302da02c258a363b674 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Nov 2022 18:46:04 +0200 Subject: ipvs: run_estimation should control the kthread tasks Change the run_estimation flag to start/stop the kthread tasks. Signed-off-by: Julian Anastasov Cc: yunhong-cgl jiang Cc: "dust.li" Reviewed-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- Documentation/networking/ipvs-sysctl.rst | 4 ++-- include/net/ip_vs.h | 6 ++++-- net/netfilter/ipvs/ip_vs_ctl.c | 29 ++++++++++++++++++++++++++++- net/netfilter/ipvs/ip_vs_est.c | 2 +- 4 files changed, 35 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ipvs-sysctl.rst b/Documentation/networking/ipvs-sysctl.rst index 1b778705d706..3fb5fa142eef 100644 --- a/Documentation/networking/ipvs-sysctl.rst +++ b/Documentation/networking/ipvs-sysctl.rst @@ -324,8 +324,8 @@ run_estimation - BOOLEAN 0 - disabled not 0 - enabled (default) - If disabled, the estimation will be stop, and you can't see - any update on speed estimation data. + If disabled, the estimation will be suspended and kthread tasks + stopped. You can always re-enable estimation by setting this value to 1. But be careful, the first estimation after re-enable is not diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index dc51b5497cf7..c6c61100d244 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1605,8 +1605,10 @@ void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd); static inline void ip_vs_est_stopped_recalc(struct netns_ipvs *ipvs) { #ifdef CONFIG_SYSCTL - ipvs->est_stopped = ipvs->est_cpulist_valid && - cpumask_empty(sysctl_est_cpulist(ipvs)); + /* Stop tasks while cpulist is empty or if disabled with flag */ + ipvs->est_stopped = !sysctl_run_estimation(ipvs) || + (ipvs->est_cpulist_valid && + cpumask_empty(sysctl_est_cpulist(ipvs))); #endif } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 38df3ee655ed..c9f598505642 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2056,6 +2056,32 @@ static int ipvs_proc_est_nice(struct ctl_table *table, int write, return ret; } +static int ipvs_proc_run_estimation(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct netns_ipvs *ipvs = table->extra2; + int *valp = table->data; + int val = *valp; + int ret; + + struct ctl_table tmp_table = { + .data = &val, + .maxlen = sizeof(int), + .mode = table->mode, + }; + + ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); + if (write && ret >= 0) { + mutex_lock(&ipvs->est_mutex); + if (*valp != val) { + *valp = val; + ip_vs_est_reload_start(ipvs); + } + mutex_unlock(&ipvs->est_mutex); + } + return ret; +} + /* * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) * Do not change order or insert new entries without @@ -2230,7 +2256,7 @@ static struct ctl_table vs_vars[] = { .procname = "run_estimation", .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = ipvs_proc_run_estimation, }, { .procname = "est_cpulist", @@ -4323,6 +4349,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl[idx++].data = &ipvs->sysctl_schedule_icmp; tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; ipvs->sysctl_run_estimation = 1; + tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_run_estimation; ipvs->est_cpulist_valid = 0; diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index e0f5f5da5b6d..df56073bb282 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -212,7 +212,7 @@ static int ip_vs_estimation_kthread(void *data) kd->est_timer = now; } - if (sysctl_run_estimation(ipvs) && kd->tick_len[row]) + if (kd->tick_len[row]) ip_vs_tick_estimation(kd, row); row++; -- cgit v1.2.3-70-g09d2 From 1fd54773c26787b9aea80e2f62c7d0780ea444d0 Mon Sep 17 00:00:00 2001 From: Andrew Melnychenko Date: Wed, 7 Dec 2022 13:35:53 +0200 Subject: udp: allow header check for dodgy GSO_UDP_L4 packets. Allow UDP_L4 for robust packets. Signed-off-by: Jason Wang Signed-off-by: Andrew Melnychenko Signed-off-by: David S. Miller --- net/ipv4/udp_offload.c | 3 ++- net/ipv6/udp_offload.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index aedde65e2268..1f01e15ca24f 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -387,7 +387,8 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto out; - if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && + !skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) return __udp_gso_segment(skb, features, false); mss = skb_shinfo(skb)->gso_size; diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index e0e10f6bcdc1..c39c1e32f980 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -42,7 +42,8 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto out; - if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && + !skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) return __udp_gso_segment(skb, features, true); mss = skb_shinfo(skb)->gso_size; -- cgit v1.2.3-70-g09d2 From 3cf7203ca620682165706f70a1b12b5194607dce Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 8 Dec 2022 20:04:52 +0800 Subject: net/tunnel: wait until all sk_user_data reader finish before releasing the sock There is a race condition in vxlan that when deleting a vxlan device during receiving packets, there is a possibility that the sock is released after getting vxlan_sock vs from sk_user_data. Then in later vxlan_ecn_decapsulate(), vxlan_get_sk_family() we will got NULL pointer dereference. e.g. #0 [ffffa25ec6978a38] machine_kexec at ffffffff8c669757 #1 [ffffa25ec6978a90] __crash_kexec at ffffffff8c7c0a4d #2 [ffffa25ec6978b58] crash_kexec at ffffffff8c7c1c48 #3 [ffffa25ec6978b60] oops_end at ffffffff8c627f2b #4 [ffffa25ec6978b80] page_fault_oops at ffffffff8c678fcb #5 [ffffa25ec6978bd8] exc_page_fault at ffffffff8d109542 #6 [ffffa25ec6978c00] asm_exc_page_fault at ffffffff8d200b62 [exception RIP: vxlan_ecn_decapsulate+0x3b] RIP: ffffffffc1014e7b RSP: ffffa25ec6978cb0 RFLAGS: 00010246 RAX: 0000000000000008 RBX: ffff8aa000888000 RCX: 0000000000000000 RDX: 000000000000000e RSI: ffff8a9fc7ab803e RDI: ffff8a9fd1168700 RBP: ffff8a9fc7ab803e R8: 0000000000700000 R9: 00000000000010ae R10: ffff8a9fcb748980 R11: 0000000000000000 R12: ffff8a9fd1168700 R13: ffff8aa000888000 R14: 00000000002a0000 R15: 00000000000010ae ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #7 [ffffa25ec6978ce8] vxlan_rcv at ffffffffc10189cd [vxlan] #8 [ffffa25ec6978d90] udp_queue_rcv_one_skb at ffffffff8cfb6507 #9 [ffffa25ec6978dc0] udp_unicast_rcv_skb at ffffffff8cfb6e45 #10 [ffffa25ec6978dc8] __udp4_lib_rcv at ffffffff8cfb8807 #11 [ffffa25ec6978e20] ip_protocol_deliver_rcu at ffffffff8cf76951 #12 [ffffa25ec6978e48] ip_local_deliver at ffffffff8cf76bde #13 [ffffa25ec6978ea0] __netif_receive_skb_one_core at ffffffff8cecde9b #14 [ffffa25ec6978ec8] process_backlog at ffffffff8cece139 #15 [ffffa25ec6978f00] __napi_poll at ffffffff8ceced1a #16 [ffffa25ec6978f28] net_rx_action at ffffffff8cecf1f3 #17 [ffffa25ec6978fa0] __softirqentry_text_start at ffffffff8d4000ca #18 [ffffa25ec6978ff0] do_softirq at ffffffff8c6fbdc3 Reproducer: https://github.com/Mellanox/ovs-tests/blob/master/test-ovs-vxlan-remove-tunnel-during-traffic.sh Fix this by waiting for all sk_user_data reader to finish before releasing the sock. Reported-by: Jianlin Shi Suggested-by: Jakub Sitnicki Fixes: 6a93cc905274 ("udp-tunnel: Add a few more UDP tunnel APIs") Signed-off-by: Hangbin Liu Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- net/ipv4/udp_tunnel_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 8242c8947340..5f8104cf082d 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -176,6 +176,7 @@ EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); void udp_tunnel_sock_release(struct socket *sock) { rcu_assign_sk_user_data(sock->sk, NULL); + synchronize_rcu(); kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); } -- cgit v1.2.3-70-g09d2 From 526682b458b1b56d2e0db027df535cb5cdcfde59 Mon Sep 17 00:00:00 2001 From: Richard Gobert Date: Thu, 8 Dec 2022 15:54:46 +0100 Subject: net: setsockopt: fix IPV6_UNICAST_IF option for connected sockets Change the behaviour of ip6_datagram_connect to consider the interface set by the IPV6_UNICAST_IF socket option, similarly to udpv6_sendmsg. This change is the IPv6 counterpart of the fix for IP_UNICAST_IF. The tests introduced by that patch showed that the incorrect behavior is present in IPv6 as well. This patch fixes the broken test. Reported-by: kernel test robot Link: https://lore.kernel.org/r/202210062117.c7eef1a3-oliver.sang@intel.com Fixes: 0e4d354762ce ("net-next: Fix IP_UNICAST_IF option behavior for connected sockets") Signed-off-by: Richard Gobert Reviewed-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/datagram.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 5ecb56522f9d..ba28aeb7cade 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -42,24 +42,29 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); + int oif = sk->sk_bound_dev_if; memset(fl6, 0, sizeof(*fl6)); fl6->flowi6_proto = sk->sk_protocol; fl6->daddr = sk->sk_v6_daddr; fl6->saddr = np->saddr; - fl6->flowi6_oif = sk->sk_bound_dev_if; fl6->flowi6_mark = sk->sk_mark; fl6->fl6_dport = inet->inet_dport; fl6->fl6_sport = inet->inet_sport; fl6->flowlabel = np->flow_label; fl6->flowi6_uid = sk->sk_uid; - if (!fl6->flowi6_oif) - fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; + if (!oif) + oif = np->sticky_pktinfo.ipi6_ifindex; - if (!fl6->flowi6_oif && ipv6_addr_is_multicast(&fl6->daddr)) - fl6->flowi6_oif = np->mcast_oif; + if (!oif) { + if (ipv6_addr_is_multicast(&fl6->daddr)) + oif = np->mcast_oif; + else + oif = np->ucast_oif; + } + fl6->flowi6_oif = oif; security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); } -- cgit v1.2.3-70-g09d2 From 73e341e0281a35274629e9be27eae2f9b1b492bf Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 8 Dec 2022 23:01:58 +0800 Subject: af_unix: call proto_unregister() in the error path in af_unix_init() If register unix_stream_proto returns error, unix_dgram_proto needs be unregistered. Fixes: 94531cfcbe79 ("af_unix: Add unix_stream_proto for sockmap") Signed-off-by: Yang Yingliang Reviewed-by: Simon Horman Reviewed-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/unix/af_unix.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index b3545fc68097..ede2b2a140a4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3738,6 +3738,7 @@ static int __init af_unix_init(void) rc = proto_register(&unix_stream_proto, 1); if (rc != 0) { pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); + proto_unregister(&unix_dgram_proto); goto out; } -- cgit v1.2.3-70-g09d2 From bf14f4923d516d77320500461c0692c9d4480c30 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 8 Dec 2022 11:56:08 -0500 Subject: openvswitch: delete the unncessary skb_pull_rcsum call in ovs_ct_nat_execute The calls to ovs_ct_nat_execute() are as below: ovs_ct_execute() ovs_ct_lookup() __ovs_ct_lookup() ovs_ct_nat() ovs_ct_nat_execute() ovs_ct_commit() __ovs_ct_lookup() ovs_ct_nat() ovs_ct_nat_execute() and since skb_pull_rcsum() and skb_push_rcsum() are already called in ovs_ct_execute(), there's no need to do it again in ovs_ct_nat_execute(). Reviewed-by: Saeed Mahameed Acked-by: Aaron Conole Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index d78f0fc4337d..dff093a10d6d 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -735,10 +735,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, struct sw_flow_key *key) { - int hooknum, nh_off, err = NF_ACCEPT; - - nh_off = skb_network_offset(skb); - skb_pull_rcsum(skb, nh_off); + int hooknum, err = NF_ACCEPT; /* See HOOK2MANIP(). */ if (maniptype == NF_NAT_MANIP_SRC) @@ -755,7 +752,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, hooknum)) err = NF_DROP; - goto push; + goto out; } else if (IS_ENABLED(CONFIG_IPV6) && skb->protocol == htons(ETH_P_IPV6)) { __be16 frag_off; @@ -770,7 +767,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, hooknum, hdrlen)) err = NF_DROP; - goto push; + goto out; } } /* Non-ICMP, fall thru to initialize if needed. */ @@ -788,7 +785,7 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, ? nf_nat_setup_info(ct, range, maniptype) : nf_nat_alloc_null_binding(ct, hooknum); if (err != NF_ACCEPT) - goto push; + goto out; } break; @@ -798,13 +795,11 @@ static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, default: err = NF_DROP; - goto push; + goto out; } err = nf_nat_packet(ct, ctinfo, hooknum, skb); -push: - skb_push_rcsum(skb, nh_off); - +out: /* Update the flow key if NAT successful. */ if (err == NF_ACCEPT) ovs_nat_update_key(key, skb, maniptype); -- cgit v1.2.3-70-g09d2 From 7795928921332fdd52c33eab73f1280d5e58678a Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 8 Dec 2022 11:56:09 -0500 Subject: openvswitch: return NF_ACCEPT when OVS_CT_NAT is not set in info nat Either OVS_CT_SRC_NAT or OVS_CT_DST_NAT is set, OVS_CT_NAT must be set in info->nat. Thus, if OVS_CT_NAT is not set in info->nat, it will definitely not do NAT but returns NF_ACCEPT in ovs_ct_nat(). This patch changes nothing funcational but only makes this return earlier in ovs_ct_nat() to keep consistent with TC's processing in tcf_ct_act_nat(). Reviewed-by: Saeed Mahameed Acked-by: Aaron Conole Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index dff093a10d6d..5ea74270da46 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -816,6 +816,9 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, enum nf_nat_manip_type maniptype; int err; + if (!(info->nat & OVS_CT_NAT)) + return NF_ACCEPT; + /* Add NAT extension if not confirmed yet. */ if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct)) return NF_ACCEPT; /* Can't NAT. */ @@ -825,8 +828,7 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, * Make sure new expected connections (IP_CT_RELATED) are NATted only * when committing. */ - if (info->nat & OVS_CT_NAT && ctinfo != IP_CT_NEW && - ct->status & IPS_NAT_MASK && + if (ctinfo != IP_CT_NEW && ct->status & IPS_NAT_MASK && (ctinfo != IP_CT_RELATED || info->commit)) { /* NAT an established or related connection like before. */ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) -- cgit v1.2.3-70-g09d2 From 2b85144ab36e0e870f59b5ae55e299179eb8cdb8 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 8 Dec 2022 11:56:10 -0500 Subject: openvswitch: return NF_DROP when fails to add nat ext in ovs_ct_nat When it fails to allocate nat ext, the packet should be dropped, like the memory allocation failures in other places in ovs_ct_nat(). This patch changes to return NF_DROP when fails to add nat ext before doing NAT in ovs_ct_nat(), also it would keep consistent with tc action ct' processing in tcf_ct_act_nat(). Signed-off-by: Xin Long Acked-by: Aaron Conole Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 5ea74270da46..58c9f0edc3c4 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -821,7 +821,7 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, /* Add NAT extension if not confirmed yet. */ if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct)) - return NF_ACCEPT; /* Can't NAT. */ + return NF_DROP; /* Can't NAT. */ /* Determine NAT type. * Check if the NAT type can be deduced from the tracked connection. -- cgit v1.2.3-70-g09d2 From 0564c3e51bc7bb200e76d0cad2d7067cc77cb83e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 8 Dec 2022 11:56:11 -0500 Subject: net: sched: update the nat flag for icmp error packets in ct_nat_execute In ovs_ct_nat_execute(), the packet flow key nat flags are updated when it processes ICMP(v6) error packets translation successfully. In ct_nat_execute() when processing ICMP(v6) error packets translation successfully, it should have done the same in ct_nat_execute() to set post_ct_s/dnat flag, which will be used to update flow key nat flags in OVS module later. Reviewed-by: Saeed Mahameed Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sched/act_ct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index f6df0168c91f..eac4e07eb56b 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -937,13 +937,13 @@ static int ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, } err = nf_nat_packet(ct, ctinfo, hooknum, skb); +out: if (err == NF_ACCEPT) { if (maniptype == NF_NAT_MANIP_SRC) tc_skb_cb(skb)->post_ct_snat = 1; if (maniptype == NF_NAT_MANIP_DST) tc_skb_cb(skb)->post_ct_dnat = 1; } -out: return err; } #endif /* CONFIG_NF_NAT */ -- cgit v1.2.3-70-g09d2 From ebddb1404900657b7f03a56ee4c34a9d218c4030 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 8 Dec 2022 11:56:12 -0500 Subject: net: move the nat function to nf_nat_ovs for ovs and tc There are two nat functions are nearly the same in both OVS and TC code, (ovs_)ct_nat_execute() and ovs_ct_nat/tcf_ct_act_nat(). This patch creates nf_nat_ovs.c under netfilter and moves them there then exports nf_ct_nat() so that it can be shared by both OVS and TC, and keeps the nat (type) check and nat flag update in OVS and TC's own place, as these parts are different between OVS and TC. Note that in OVS nat function it was using skb->protocol to get the proto as it already skips vlans in key_extract(), while it doesn't in TC, and TC has to call skb_protocol() to get proto. So in nf_ct_nat_execute(), we keep using skb_protocol() which works for both OVS and TC contrack. Signed-off-by: Xin Long Acked-by: Aaron Conole Acked-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/netfilter/nf_nat.h | 4 ++ net/netfilter/Kconfig | 3 + net/netfilter/Makefile | 1 + net/netfilter/nf_nat_ovs.c | 135 ++++++++++++++++++++++++++++++++++++++++ net/openvswitch/Kconfig | 1 + net/openvswitch/conntrack.c | 137 +++-------------------------------------- net/sched/Kconfig | 1 + net/sched/act_ct.c | 136 ++++------------------------------------ 8 files changed, 166 insertions(+), 252 deletions(-) create mode 100644 net/netfilter/nf_nat_ovs.c (limited to 'net') diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index e9eb01e99d2f..9877f064548a 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -104,6 +104,10 @@ unsigned int nf_nat_inet_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); +int nf_ct_nat(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, int *action, + const struct nf_nat_range2 *range, bool commit); + static inline int nf_nat_initialized(const struct nf_conn *ct, enum nf_nat_manip_type manip) { diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 0846bd75b1da..f71b41c7ce2f 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -459,6 +459,9 @@ config NF_NAT_REDIRECT config NF_NAT_MASQUERADE bool +config NF_NAT_OVS + bool + config NETFILTER_SYNPROXY tristate diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 1d4db1943936..3754eb06fb41 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -59,6 +59,7 @@ obj-$(CONFIG_NF_LOG_SYSLOG) += nf_log_syslog.o obj-$(CONFIG_NF_NAT) += nf_nat.o nf_nat-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o nf_nat-$(CONFIG_NF_NAT_MASQUERADE) += nf_nat_masquerade.o +nf_nat-$(CONFIG_NF_NAT_OVS) += nf_nat_ovs.o ifeq ($(CONFIG_NF_NAT),m) nf_nat-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_nat_bpf.o diff --git a/net/netfilter/nf_nat_ovs.c b/net/netfilter/nf_nat_ovs.c new file mode 100644 index 000000000000..551abd2da614 --- /dev/null +++ b/net/netfilter/nf_nat_ovs.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Support nat functions for openvswitch and used by OVS and TC conntrack. */ + +#include + +/* Modelled after nf_nat_ipv[46]_fn(). + * range is only used for new, uninitialized NAT state. + * Returns either NF_ACCEPT or NF_DROP. + */ +static int nf_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, int *action, + const struct nf_nat_range2 *range, + enum nf_nat_manip_type maniptype) +{ + __be16 proto = skb_protocol(skb, true); + int hooknum, err = NF_ACCEPT; + + /* See HOOK2MANIP(). */ + if (maniptype == NF_NAT_MANIP_SRC) + hooknum = NF_INET_LOCAL_IN; /* Source NAT */ + else + hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */ + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + if (proto == htons(ETH_P_IP) && + ip_hdr(skb)->protocol == IPPROTO_ICMP) { + if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, + hooknum)) + err = NF_DROP; + goto out; + } else if (IS_ENABLED(CONFIG_IPV6) && proto == htons(ETH_P_IPV6)) { + __be16 frag_off; + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + int hdrlen = ipv6_skip_exthdr(skb, + sizeof(struct ipv6hdr), + &nexthdr, &frag_off); + + if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { + if (!nf_nat_icmpv6_reply_translation(skb, ct, + ctinfo, + hooknum, + hdrlen)) + err = NF_DROP; + goto out; + } + } + /* Non-ICMP, fall thru to initialize if needed. */ + fallthrough; + case IP_CT_NEW: + /* Seen it before? This can happen for loopback, retrans, + * or local packets. + */ + if (!nf_nat_initialized(ct, maniptype)) { + /* Initialize according to the NAT action. */ + err = (range && range->flags & NF_NAT_RANGE_MAP_IPS) + /* Action is set up to establish a new + * mapping. + */ + ? nf_nat_setup_info(ct, range, maniptype) + : nf_nat_alloc_null_binding(ct, hooknum); + if (err != NF_ACCEPT) + goto out; + } + break; + + case IP_CT_ESTABLISHED: + case IP_CT_ESTABLISHED_REPLY: + break; + + default: + err = NF_DROP; + goto out; + } + + err = nf_nat_packet(ct, ctinfo, hooknum, skb); + if (err == NF_ACCEPT) + *action |= BIT(maniptype); +out: + return err; +} + +int nf_ct_nat(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, int *action, + const struct nf_nat_range2 *range, bool commit) +{ + enum nf_nat_manip_type maniptype; + int err, ct_action = *action; + + *action = 0; + + /* Add NAT extension if not confirmed yet. */ + if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct)) + return NF_DROP; /* Can't NAT. */ + + if (ctinfo != IP_CT_NEW && (ct->status & IPS_NAT_MASK) && + (ctinfo != IP_CT_RELATED || commit)) { + /* NAT an established or related connection like before. */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) + /* This is the REPLY direction for a connection + * for which NAT was applied in the forward + * direction. Do the reverse NAT. + */ + maniptype = ct->status & IPS_SRC_NAT + ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC; + else + maniptype = ct->status & IPS_SRC_NAT + ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST; + } else if (ct_action & BIT(NF_NAT_MANIP_SRC)) { + maniptype = NF_NAT_MANIP_SRC; + } else if (ct_action & BIT(NF_NAT_MANIP_DST)) { + maniptype = NF_NAT_MANIP_DST; + } else { + return NF_ACCEPT; + } + + err = nf_ct_nat_execute(skb, ct, ctinfo, action, range, maniptype); + if (err == NF_ACCEPT && ct->status & IPS_DST_NAT) { + if (ct->status & IPS_SRC_NAT) { + if (maniptype == NF_NAT_MANIP_SRC) + maniptype = NF_NAT_MANIP_DST; + else + maniptype = NF_NAT_MANIP_SRC; + + err = nf_ct_nat_execute(skb, ct, ctinfo, action, range, + maniptype); + } else if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + err = nf_ct_nat_execute(skb, ct, ctinfo, action, NULL, + NF_NAT_MANIP_SRC); + } + } + return err; +} +EXPORT_SYMBOL_GPL(nf_ct_nat); diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 15bd287f5cbd..747d537a3f06 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -15,6 +15,7 @@ config OPENVSWITCH select NET_MPLS_GSO select DST_CACHE select NET_NSH + select NF_NAT_OVS if NF_NAT help Open vSwitch is a multilayer Ethernet switch targeted at virtualized environments. In addition to supporting a variety of features diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 58c9f0edc3c4..c8b137649ca4 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -726,144 +726,27 @@ static void ovs_nat_update_key(struct sw_flow_key *key, } } -/* Modelled after nf_nat_ipv[46]_fn(). - * range is only used for new, uninitialized NAT state. - * Returns either NF_ACCEPT or NF_DROP. - */ -static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - const struct nf_nat_range2 *range, - enum nf_nat_manip_type maniptype, struct sw_flow_key *key) -{ - int hooknum, err = NF_ACCEPT; - - /* See HOOK2MANIP(). */ - if (maniptype == NF_NAT_MANIP_SRC) - hooknum = NF_INET_LOCAL_IN; /* Source NAT */ - else - hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */ - - switch (ctinfo) { - case IP_CT_RELATED: - case IP_CT_RELATED_REPLY: - if (IS_ENABLED(CONFIG_NF_NAT) && - skb->protocol == htons(ETH_P_IP) && - ip_hdr(skb)->protocol == IPPROTO_ICMP) { - if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, - hooknum)) - err = NF_DROP; - goto out; - } else if (IS_ENABLED(CONFIG_IPV6) && - skb->protocol == htons(ETH_P_IPV6)) { - __be16 frag_off; - u8 nexthdr = ipv6_hdr(skb)->nexthdr; - int hdrlen = ipv6_skip_exthdr(skb, - sizeof(struct ipv6hdr), - &nexthdr, &frag_off); - - if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { - if (!nf_nat_icmpv6_reply_translation(skb, ct, - ctinfo, - hooknum, - hdrlen)) - err = NF_DROP; - goto out; - } - } - /* Non-ICMP, fall thru to initialize if needed. */ - fallthrough; - case IP_CT_NEW: - /* Seen it before? This can happen for loopback, retrans, - * or local packets. - */ - if (!nf_nat_initialized(ct, maniptype)) { - /* Initialize according to the NAT action. */ - err = (range && range->flags & NF_NAT_RANGE_MAP_IPS) - /* Action is set up to establish a new - * mapping. - */ - ? nf_nat_setup_info(ct, range, maniptype) - : nf_nat_alloc_null_binding(ct, hooknum); - if (err != NF_ACCEPT) - goto out; - } - break; - - case IP_CT_ESTABLISHED: - case IP_CT_ESTABLISHED_REPLY: - break; - - default: - err = NF_DROP; - goto out; - } - - err = nf_nat_packet(ct, ctinfo, hooknum, skb); -out: - /* Update the flow key if NAT successful. */ - if (err == NF_ACCEPT) - ovs_nat_update_key(key, skb, maniptype); - - return err; -} - /* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. */ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, const struct ovs_conntrack_info *info, struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { - enum nf_nat_manip_type maniptype; - int err; + int err, action = 0; if (!(info->nat & OVS_CT_NAT)) return NF_ACCEPT; + if (info->nat & OVS_CT_SRC_NAT) + action |= BIT(NF_NAT_MANIP_SRC); + if (info->nat & OVS_CT_DST_NAT) + action |= BIT(NF_NAT_MANIP_DST); - /* Add NAT extension if not confirmed yet. */ - if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct)) - return NF_DROP; /* Can't NAT. */ + err = nf_ct_nat(skb, ct, ctinfo, &action, &info->range, info->commit); - /* Determine NAT type. - * Check if the NAT type can be deduced from the tracked connection. - * Make sure new expected connections (IP_CT_RELATED) are NATted only - * when committing. - */ - if (ctinfo != IP_CT_NEW && ct->status & IPS_NAT_MASK && - (ctinfo != IP_CT_RELATED || info->commit)) { - /* NAT an established or related connection like before. */ - if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) - /* This is the REPLY direction for a connection - * for which NAT was applied in the forward - * direction. Do the reverse NAT. - */ - maniptype = ct->status & IPS_SRC_NAT - ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC; - else - maniptype = ct->status & IPS_SRC_NAT - ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST; - } else if (info->nat & OVS_CT_SRC_NAT) { - maniptype = NF_NAT_MANIP_SRC; - } else if (info->nat & OVS_CT_DST_NAT) { - maniptype = NF_NAT_MANIP_DST; - } else { - return NF_ACCEPT; /* Connection is not NATed. */ - } - err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype, key); - - if (err == NF_ACCEPT && ct->status & IPS_DST_NAT) { - if (ct->status & IPS_SRC_NAT) { - if (maniptype == NF_NAT_MANIP_SRC) - maniptype = NF_NAT_MANIP_DST; - else - maniptype = NF_NAT_MANIP_SRC; - - err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, - maniptype, key); - } else if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { - err = ovs_ct_nat_execute(skb, ct, ctinfo, NULL, - NF_NAT_MANIP_SRC, key); - } - } + if (action & BIT(NF_NAT_MANIP_SRC)) + ovs_nat_update_key(key, skb, NF_NAT_MANIP_SRC); + if (action & BIT(NF_NAT_MANIP_DST)) + ovs_nat_update_key(key, skb, NF_NAT_MANIP_DST); return err; } diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 4662a6ce8a7e..777d6b50505c 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -977,6 +977,7 @@ config NET_ACT_TUNNEL_KEY config NET_ACT_CT tristate "connection tracking tc action" depends on NET_CLS_ACT && NF_CONNTRACK && (!NF_NAT || NF_NAT) && NF_FLOW_TABLE + select NF_NAT_OVS if NF_NAT help Say Y here to allow sending the packets to conntrack module. diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index eac4e07eb56b..0ca2bb8ed026 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -864,90 +864,6 @@ static void tcf_ct_params_free_rcu(struct rcu_head *head) tcf_ct_params_free(params); } -#if IS_ENABLED(CONFIG_NF_NAT) -/* Modelled after nf_nat_ipv[46]_fn(). - * range is only used for new, uninitialized NAT state. - * Returns either NF_ACCEPT or NF_DROP. - */ -static int ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - const struct nf_nat_range2 *range, - enum nf_nat_manip_type maniptype) -{ - __be16 proto = skb_protocol(skb, true); - int hooknum, err = NF_ACCEPT; - - /* See HOOK2MANIP(). */ - if (maniptype == NF_NAT_MANIP_SRC) - hooknum = NF_INET_LOCAL_IN; /* Source NAT */ - else - hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */ - - switch (ctinfo) { - case IP_CT_RELATED: - case IP_CT_RELATED_REPLY: - if (proto == htons(ETH_P_IP) && - ip_hdr(skb)->protocol == IPPROTO_ICMP) { - if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, - hooknum)) - err = NF_DROP; - goto out; - } else if (IS_ENABLED(CONFIG_IPV6) && proto == htons(ETH_P_IPV6)) { - __be16 frag_off; - u8 nexthdr = ipv6_hdr(skb)->nexthdr; - int hdrlen = ipv6_skip_exthdr(skb, - sizeof(struct ipv6hdr), - &nexthdr, &frag_off); - - if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { - if (!nf_nat_icmpv6_reply_translation(skb, ct, - ctinfo, - hooknum, - hdrlen)) - err = NF_DROP; - goto out; - } - } - /* Non-ICMP, fall thru to initialize if needed. */ - fallthrough; - case IP_CT_NEW: - /* Seen it before? This can happen for loopback, retrans, - * or local packets. - */ - if (!nf_nat_initialized(ct, maniptype)) { - /* Initialize according to the NAT action. */ - err = (range && range->flags & NF_NAT_RANGE_MAP_IPS) - /* Action is set up to establish a new - * mapping. - */ - ? nf_nat_setup_info(ct, range, maniptype) - : nf_nat_alloc_null_binding(ct, hooknum); - if (err != NF_ACCEPT) - goto out; - } - break; - - case IP_CT_ESTABLISHED: - case IP_CT_ESTABLISHED_REPLY: - break; - - default: - err = NF_DROP; - goto out; - } - - err = nf_nat_packet(ct, ctinfo, hooknum, skb); -out: - if (err == NF_ACCEPT) { - if (maniptype == NF_NAT_MANIP_SRC) - tc_skb_cb(skb)->post_ct_snat = 1; - if (maniptype == NF_NAT_MANIP_DST) - tc_skb_cb(skb)->post_ct_dnat = 1; - } - return err; -} -#endif /* CONFIG_NF_NAT */ - static void tcf_ct_act_set_mark(struct nf_conn *ct, u32 mark, u32 mask) { #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) @@ -987,52 +903,22 @@ static int tcf_ct_act_nat(struct sk_buff *skb, bool commit) { #if IS_ENABLED(CONFIG_NF_NAT) - int err; - enum nf_nat_manip_type maniptype; + int err, action = 0; if (!(ct_action & TCA_CT_ACT_NAT)) return NF_ACCEPT; + if (ct_action & TCA_CT_ACT_NAT_SRC) + action |= BIT(NF_NAT_MANIP_SRC); + if (ct_action & TCA_CT_ACT_NAT_DST) + action |= BIT(NF_NAT_MANIP_DST); - /* Add NAT extension if not confirmed yet. */ - if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct)) - return NF_DROP; /* Can't NAT. */ - - if (ctinfo != IP_CT_NEW && (ct->status & IPS_NAT_MASK) && - (ctinfo != IP_CT_RELATED || commit)) { - /* NAT an established or related connection like before. */ - if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) - /* This is the REPLY direction for a connection - * for which NAT was applied in the forward - * direction. Do the reverse NAT. - */ - maniptype = ct->status & IPS_SRC_NAT - ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC; - else - maniptype = ct->status & IPS_SRC_NAT - ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST; - } else if (ct_action & TCA_CT_ACT_NAT_SRC) { - maniptype = NF_NAT_MANIP_SRC; - } else if (ct_action & TCA_CT_ACT_NAT_DST) { - maniptype = NF_NAT_MANIP_DST; - } else { - return NF_ACCEPT; - } + err = nf_ct_nat(skb, ct, ctinfo, &action, range, commit); + + if (action & BIT(NF_NAT_MANIP_SRC)) + tc_skb_cb(skb)->post_ct_snat = 1; + if (action & BIT(NF_NAT_MANIP_DST)) + tc_skb_cb(skb)->post_ct_dnat = 1; - err = ct_nat_execute(skb, ct, ctinfo, range, maniptype); - if (err == NF_ACCEPT && ct->status & IPS_DST_NAT) { - if (ct->status & IPS_SRC_NAT) { - if (maniptype == NF_NAT_MANIP_SRC) - maniptype = NF_NAT_MANIP_DST; - else - maniptype = NF_NAT_MANIP_SRC; - - err = ct_nat_execute(skb, ct, ctinfo, range, - maniptype); - } else if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { - err = ct_nat_execute(skb, ct, ctinfo, NULL, - NF_NAT_MANIP_SRC); - } - } return err; #else return NF_ACCEPT; -- cgit v1.2.3-70-g09d2 From f793458bba544e29657efbfa53b62dc511c6f22c Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 8 Dec 2022 17:09:40 +0800 Subject: net: af_can: remove useless parameter 'err' in 'can_rx_register()' Since commit bdfb5765e45b remove NULL-ptr checks from users of can_dev_rcv_lists_find(). 'err' parameter is useless, so remove it. Signed-off-by: Ye Bin Link: https://lore.kernel.org/all/20221208090940.3695670-1-yebin@huaweicloud.com Signed-off-by: Marc Kleine-Budde --- net/can/af_can.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/can/af_can.c b/net/can/af_can.c index c69168f11e44..7343fd487dbe 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -446,7 +446,6 @@ int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id, struct hlist_head *rcv_list; struct can_dev_rcv_lists *dev_rcv_lists; struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; - int err = 0; /* insert new receiver (dev,canid,mask) -> (func,data) */ @@ -481,7 +480,7 @@ int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id, rcv_lists_stats->rcv_entries); spin_unlock_bh(&net->can.rcvlists_lock); - return err; + return 0; } EXPORT_SYMBOL(can_rx_register); -- cgit v1.2.3-70-g09d2 From 0826e82b8a32e646b7b32ba8b68ba30812028e47 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Fri, 9 Dec 2022 10:10:08 +0100 Subject: can: raw: add support for SO_MARK Add support for SO_MARK to the CAN_RAW protocol. This makes it possible to add traffic control filters based on the fwmark. Link: https://lore.kernel.org/all/20221210113653.170346-1-mkl@pengutronix.de Acked-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- net/can/raw.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/can/raw.c b/net/can/raw.c index 3eb7d3e2b541..81071cdb0301 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -857,6 +857,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) skb->dev = dev; skb->priority = sk->sk_priority; + skb->mark = sk->sk_mark; skb->tstamp = sockc.transmit_time; skb_setup_tx_timestamp(skb, sockc.tsflags); -- cgit v1.2.3-70-g09d2 From da05cecc4939c0410d56c29e252998b192756318 Mon Sep 17 00:00:00 2001 From: Firo Yang Date: Fri, 9 Dec 2022 13:48:54 +0800 Subject: sctp: sysctl: make extra pointers netns aware Recently, a customer reported that from their container whose net namespace is different to the host's init_net, they can't set the container's net.sctp.rto_max to any value smaller than init_net.sctp.rto_min. For instance, Host: sudo sysctl net.sctp.rto_min net.sctp.rto_min = 1000 Container: echo 100 > /mnt/proc-net/sctp/rto_min echo 400 > /mnt/proc-net/sctp/rto_max echo: write error: Invalid argument This is caused by the check made from this'commit 4f3fdf3bc59c ("sctp: add check rto_min and rto_max in sysctl")' When validating the input value, it's always referring the boundary value set for the init_net namespace. Having container's rto_max smaller than host's init_net.sctp.rto_min does make sense. Consider that the rto between two containers on the same host is very likely smaller than it for two hosts. So to fix this problem, as suggested by Marcelo, this patch makes the extra pointers of rto_min, rto_max, pf_retrans, and ps_retrans point to the corresponding variables from the newly created net namespace while the new net namespace is being registered in sctp_sysctl_net_register. Fixes: 4f3fdf3bc59c ("sctp: add check rto_min and rto_max in sysctl") Reviewed-by: Marcelo Ricardo Leitner Reviewed-by: Jakub Kicinski Acked-by: Marcelo Ricardo Leitner Signed-off-by: Firo Yang Link: https://lore.kernel.org/r/20221209054854.23889-1-firo.yang@suse.com Signed-off-by: Jakub Kicinski --- net/sctp/sysctl.c | 73 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 7f40ed117fc7..a7a9136198fd 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -84,17 +84,18 @@ static struct ctl_table sctp_table[] = { { /* sentinel */ } }; +/* The following index defines are used in sctp_sysctl_net_register(). + * If you add new items to the sctp_net_table, please ensure that + * the index values of these defines hold the same meaning indicated by + * their macro names when they appear in sctp_net_table. + */ +#define SCTP_RTO_MIN_IDX 0 +#define SCTP_RTO_MAX_IDX 1 +#define SCTP_PF_RETRANS_IDX 2 +#define SCTP_PS_RETRANS_IDX 3 + static struct ctl_table sctp_net_table[] = { - { - .procname = "rto_initial", - .data = &init_net.sctp.rto_initial, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = &timer_max - }, - { + [SCTP_RTO_MIN_IDX] = { .procname = "rto_min", .data = &init_net.sctp.rto_min, .maxlen = sizeof(unsigned int), @@ -103,7 +104,7 @@ static struct ctl_table sctp_net_table[] = { .extra1 = SYSCTL_ONE, .extra2 = &init_net.sctp.rto_max }, - { + [SCTP_RTO_MAX_IDX] = { .procname = "rto_max", .data = &init_net.sctp.rto_max, .maxlen = sizeof(unsigned int), @@ -112,6 +113,33 @@ static struct ctl_table sctp_net_table[] = { .extra1 = &init_net.sctp.rto_min, .extra2 = &timer_max }, + [SCTP_PF_RETRANS_IDX] = { + .procname = "pf_retrans", + .data = &init_net.sctp.pf_retrans, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &init_net.sctp.ps_retrans, + }, + [SCTP_PS_RETRANS_IDX] = { + .procname = "ps_retrans", + .data = &init_net.sctp.ps_retrans, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &init_net.sctp.pf_retrans, + .extra2 = &ps_retrans_max, + }, + { + .procname = "rto_initial", + .data = &init_net.sctp.rto_initial, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = &timer_max + }, { .procname = "rto_alpha_exp_divisor", .data = &init_net.sctp.rto_alpha, @@ -207,24 +235,6 @@ static struct ctl_table sctp_net_table[] = { .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_INT_MAX, }, - { - .procname = "pf_retrans", - .data = &init_net.sctp.pf_retrans, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &init_net.sctp.ps_retrans, - }, - { - .procname = "ps_retrans", - .data = &init_net.sctp.ps_retrans, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &init_net.sctp.pf_retrans, - .extra2 = &ps_retrans_max, - }, { .procname = "sndbuf_policy", .data = &init_net.sctp.sndbuf_policy, @@ -597,6 +607,11 @@ int sctp_sysctl_net_register(struct net *net) for (i = 0; table[i].data; i++) table[i].data += (char *)(&net->sctp) - (char *)&init_net.sctp; + table[SCTP_RTO_MIN_IDX].extra2 = &net->sctp.rto_max; + table[SCTP_RTO_MAX_IDX].extra1 = &net->sctp.rto_min; + table[SCTP_PF_RETRANS_IDX].extra2 = &net->sctp.ps_retrans; + table[SCTP_PS_RETRANS_IDX].extra1 = &net->sctp.pf_retrans; + net->sctp.sysctl_header = register_net_sysctl(net, "net/sctp", table); if (net->sctp.sysctl_header == NULL) { kfree(table); -- cgit v1.2.3-70-g09d2 From e0fe1123ab2b07d2cd5475660bd0b4e6993ffaa7 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 9 Dec 2022 16:28:07 -0800 Subject: mptcp: netlink: fix some error return code Fix to return negative error code -EINVAL from some error handling case instead of 0, as done elsewhere in those functions. Fixes: 9ab4807c84a4 ("mptcp: netlink: Add MPTCP_PM_CMD_ANNOUNCE") Fixes: 702c2f646d42 ("mptcp: netlink: allow userspace-driven subflow establishment") Cc: stable@vger.kernel.org Reviewed-by: Matthieu Baerts Signed-off-by: Wei Yongjun Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/pm_userspace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 9e82250cbb70..0430415357ba 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -156,6 +156,7 @@ int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info) if (addr_val.addr.id == 0 || !(addr_val.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { GENL_SET_ERR_MSG(info, "invalid addr id or flags"); + err = -EINVAL; goto announce_err; } @@ -282,6 +283,7 @@ int mptcp_nl_cmd_sf_create(struct sk_buff *skb, struct genl_info *info) if (addr_l.id == 0) { NL_SET_ERR_MSG_ATTR(info->extack, laddr, "missing local addr id"); + err = -EINVAL; goto create_err; } @@ -395,11 +397,13 @@ int mptcp_nl_cmd_sf_destroy(struct sk_buff *skb, struct genl_info *info) if (addr_l.family != addr_r.family) { GENL_SET_ERR_MSG(info, "address families do not match"); + err = -EINVAL; goto destroy_err; } if (!addr_l.port || !addr_r.port) { GENL_SET_ERR_MSG(info, "missing local or remote port"); + err = -EINVAL; goto destroy_err; } -- cgit v1.2.3-70-g09d2 From 3fff88186f047627bb128d65155f42517f8e448f Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Fri, 9 Dec 2022 16:28:08 -0800 Subject: mptcp: remove MPTCP 'ifdef' in TCP SYN cookies To ease the maintenance, it is often recommended to avoid having #ifdef preprocessor conditions. Here the section related to CONFIG_MPTCP was quite short but the next commit needs to add more code around. It is then cleaner to move specific MPTCP code to functions located in net/mptcp directory. Now that mptcp_subflow_request_sock_ops structure can be static, it can also be marked as "read only after init". Suggested-by: Paolo Abeni Reviewed-by: Mat Martineau Cc: stable@vger.kernel.org Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- include/net/mptcp.h | 12 ++++++++++-- net/ipv4/syncookies.c | 7 +++---- net/mptcp/subflow.c | 12 +++++++++++- 3 files changed, 24 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 412479ebf5ad..3c5c68618fcc 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -97,8 +97,6 @@ struct mptcp_out_options { }; #ifdef CONFIG_MPTCP -extern struct request_sock_ops mptcp_subflow_request_sock_ops; - void mptcp_init(void); static inline bool sk_is_mptcp(const struct sock *sk) @@ -188,6 +186,9 @@ void mptcp_seq_show(struct seq_file *seq); int mptcp_subflow_init_cookie_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb); +struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, + struct sock *sk_listener, + bool attach_listener); __be32 mptcp_get_reset_option(const struct sk_buff *skb); @@ -274,6 +275,13 @@ static inline int mptcp_subflow_init_cookie_req(struct request_sock *req, return 0; /* TCP fallback */ } +static inline struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, + struct sock *sk_listener, + bool attach_listener) +{ + return NULL; +} + static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { return htonl(0u); } #endif /* CONFIG_MPTCP */ diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 942d2dfa1115..26fb97d1d4d9 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -288,12 +288,11 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, struct tcp_request_sock *treq; struct request_sock *req; -#ifdef CONFIG_MPTCP if (sk_is_mptcp(sk)) - ops = &mptcp_subflow_request_sock_ops; -#endif + req = mptcp_subflow_reqsk_alloc(ops, sk, false); + else + req = inet_reqsk_alloc(ops, sk, false); - req = inet_reqsk_alloc(ops, sk, false); if (!req) return NULL; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 2159b5f9988f..3f670f2d5c5c 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -529,7 +529,7 @@ static int subflow_v6_rebuild_header(struct sock *sk) } #endif -struct request_sock_ops mptcp_subflow_request_sock_ops; +static struct request_sock_ops mptcp_subflow_request_sock_ops __ro_after_init; static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops __ro_after_init; static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) @@ -582,6 +582,16 @@ drop: } #endif +struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, + struct sock *sk_listener, + bool attach_listener) +{ + ops = &mptcp_subflow_request_sock_ops; + + return inet_reqsk_alloc(ops, sk_listener, attach_listener); +} +EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc); + /* validate hmac received in third ACK */ static bool subflow_hmac_valid(const struct request_sock *req, const struct mptcp_options_received *mp_opt) -- cgit v1.2.3-70-g09d2 From 34b21d1ddc8ace77a8fa35c1b1e06377209e0dae Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Fri, 9 Dec 2022 16:28:09 -0800 Subject: mptcp: dedicated request sock for subflow in v6 tcp_request_sock_ops structure is specific to IPv4. It should then not be used with MPTCP subflows on top of IPv6. For example, it contains the 'family' field, initialised to AF_INET. This 'family' field is used by TCP FastOpen code to generate the cookie but also by TCP Metrics, SELinux and SYN Cookies. Using the wrong family will not lead to crashes but displaying/using/checking wrong things. Note that 'send_reset' callback from request_sock_ops structure is used in some error paths. It is then also important to use the correct one for IPv4 or IPv6. The slab name can also be different in IPv4 and IPv6, it will be used when printing some log messages. The slab pointer will anyway be the same because the object size is the same for both v4 and v6. A BUILD_BUG_ON() has also been added to make sure this size is the same. Fixes: cec37a6e41aa ("mptcp: Handle MP_CAPABLE options for outgoing connections") Reviewed-by: Mat Martineau Cc: stable@vger.kernel.org Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 3f670f2d5c5c..30524dd7d0ec 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -529,7 +529,7 @@ static int subflow_v6_rebuild_header(struct sock *sk) } #endif -static struct request_sock_ops mptcp_subflow_request_sock_ops __ro_after_init; +static struct request_sock_ops mptcp_subflow_v4_request_sock_ops __ro_after_init; static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops __ro_after_init; static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) @@ -542,7 +542,7 @@ static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) goto drop; - return tcp_conn_request(&mptcp_subflow_request_sock_ops, + return tcp_conn_request(&mptcp_subflow_v4_request_sock_ops, &subflow_request_sock_ipv4_ops, sk, skb); drop: @@ -551,6 +551,7 @@ drop: } #if IS_ENABLED(CONFIG_MPTCP_IPV6) +static struct request_sock_ops mptcp_subflow_v6_request_sock_ops __ro_after_init; static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init; static struct inet_connection_sock_af_ops subflow_v6_specific __ro_after_init; static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init; @@ -573,7 +574,7 @@ static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb) return 0; } - return tcp_conn_request(&mptcp_subflow_request_sock_ops, + return tcp_conn_request(&mptcp_subflow_v6_request_sock_ops, &subflow_request_sock_ipv6_ops, sk, skb); drop: @@ -586,7 +587,12 @@ struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *op struct sock *sk_listener, bool attach_listener) { - ops = &mptcp_subflow_request_sock_ops; + if (ops->family == AF_INET) + ops = &mptcp_subflow_v4_request_sock_ops; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (ops->family == AF_INET6) + ops = &mptcp_subflow_v6_request_sock_ops; +#endif return inet_reqsk_alloc(ops, sk_listener, attach_listener); } @@ -1914,7 +1920,6 @@ static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { static int subflow_ops_init(struct request_sock_ops *subflow_ops) { subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock); - subflow_ops->slab_name = "request_sock_subflow"; subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name, subflow_ops->obj_size, 0, @@ -1931,9 +1936,10 @@ static int subflow_ops_init(struct request_sock_ops *subflow_ops) void __init mptcp_subflow_init(void) { - mptcp_subflow_request_sock_ops = tcp_request_sock_ops; - if (subflow_ops_init(&mptcp_subflow_request_sock_ops) != 0) - panic("MPTCP: failed to init subflow request sock ops\n"); + mptcp_subflow_v4_request_sock_ops = tcp_request_sock_ops; + mptcp_subflow_v4_request_sock_ops.slab_name = "request_sock_subflow_v4"; + if (subflow_ops_init(&mptcp_subflow_v4_request_sock_ops) != 0) + panic("MPTCP: failed to init subflow v4 request sock ops\n"); subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops; subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req; @@ -1948,6 +1954,18 @@ void __init mptcp_subflow_init(void) tcp_prot_override.release_cb = tcp_release_cb_override; #if IS_ENABLED(CONFIG_MPTCP_IPV6) + /* In struct mptcp_subflow_request_sock, we assume the TCP request sock + * structures for v4 and v6 have the same size. It should not changed in + * the future but better to make sure to be warned if it is no longer + * the case. + */ + BUILD_BUG_ON(sizeof(struct tcp_request_sock) != sizeof(struct tcp6_request_sock)); + + mptcp_subflow_v6_request_sock_ops = tcp6_request_sock_ops; + mptcp_subflow_v6_request_sock_ops.slab_name = "request_sock_subflow_v6"; + if (subflow_ops_init(&mptcp_subflow_v6_request_sock_ops) != 0) + panic("MPTCP: failed to init subflow v6 request sock ops\n"); + subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops; subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req; -- cgit v1.2.3-70-g09d2 From d3295fee3c756ece33ac0d935e172e68c0a4161b Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Fri, 9 Dec 2022 16:28:10 -0800 Subject: mptcp: use proper req destructor for IPv6 Before, only the destructor from TCP request sock in IPv4 was called even if the subflow was IPv6. It is important to use the right destructor to avoid memory leaks with some advanced IPv6 features, e.g. when the request socks contain specific IPv6 options. Fixes: 79c0949e9a09 ("mptcp: Add key generation and token tree") Reviewed-by: Mat Martineau Cc: stable@vger.kernel.org Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/subflow.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 30524dd7d0ec..613f515fedf0 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -45,7 +45,6 @@ static void subflow_req_destructor(struct request_sock *req) sock_put((struct sock *)subflow_req->msk); mptcp_token_destroy_request(req); - tcp_request_sock_ops.destructor(req); } static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2, @@ -550,6 +549,12 @@ drop: return 0; } +static void subflow_v4_req_destructor(struct request_sock *req) +{ + subflow_req_destructor(req); + tcp_request_sock_ops.destructor(req); +} + #if IS_ENABLED(CONFIG_MPTCP_IPV6) static struct request_sock_ops mptcp_subflow_v6_request_sock_ops __ro_after_init; static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init; @@ -581,6 +586,12 @@ drop: tcp_listendrop(sk); return 0; /* don't send reset */ } + +static void subflow_v6_req_destructor(struct request_sock *req) +{ + subflow_req_destructor(req); + tcp6_request_sock_ops.destructor(req); +} #endif struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, @@ -1929,8 +1940,6 @@ static int subflow_ops_init(struct request_sock_ops *subflow_ops) if (!subflow_ops->slab) return -ENOMEM; - subflow_ops->destructor = subflow_req_destructor; - return 0; } @@ -1938,6 +1947,8 @@ void __init mptcp_subflow_init(void) { mptcp_subflow_v4_request_sock_ops = tcp_request_sock_ops; mptcp_subflow_v4_request_sock_ops.slab_name = "request_sock_subflow_v4"; + mptcp_subflow_v4_request_sock_ops.destructor = subflow_v4_req_destructor; + if (subflow_ops_init(&mptcp_subflow_v4_request_sock_ops) != 0) panic("MPTCP: failed to init subflow v4 request sock ops\n"); @@ -1963,6 +1974,8 @@ void __init mptcp_subflow_init(void) mptcp_subflow_v6_request_sock_ops = tcp6_request_sock_ops; mptcp_subflow_v6_request_sock_ops.slab_name = "request_sock_subflow_v6"; + mptcp_subflow_v6_request_sock_ops.destructor = subflow_v6_req_destructor; + if (subflow_ops_init(&mptcp_subflow_v6_request_sock_ops) != 0) panic("MPTCP: failed to init subflow v6 request sock ops\n"); -- cgit v1.2.3-70-g09d2 From e411443c32554b4db157eb49f199d9cfebda2817 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 7 Oct 2022 13:25:38 -0700 Subject: Bluetooth: hci_sync: Fix not setting static address This attempts to program the address stored in hdev->static_addr after the init sequence has been complete: @ MGMT Command: Set Static A.. (0x002b) plen 6 Address: C0:55:44:33:22:11 (Static) @ MGMT Event: Command Complete (0x0001) plen 7 Set Static Address (0x002b) plen 4 Status: Success (0x00) Current settings: 0x00008200 Low Energy Static Address @ MGMT Event: New Settings (0x0006) plen 4 Current settings: 0x00008200 Low Energy Static Address < HCI Command: LE Set Random.. (0x08|0x0005) plen 6 Address: C0:55:44:33:22:11 (Static) > HCI Event: Command Complete (0x0e) plen 4 LE Set Random Address (0x08|0x0005) ncmd 1 Status: Success (0x00) @ MGMT Event: Command Complete (0x0001) plen 7 Set Powered (0x0005) plen 4 Status: Success (0x00) Current settings: 0x00008201 Powered Low Energy Static Address @ MGMT Event: New Settings (0x0006) plen 4 Current settings: 0x00008201 Powered Low Energy Static Address Signed-off-by: Luiz Augusto von Dentz Tested-by: Brian Gix --- net/bluetooth/hci_sync.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 1fc693122a47..1cd05072dbb7 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -3055,6 +3055,7 @@ int hci_update_name_sync(struct hci_dev *hdev) * Enable Authentication * lmp_bredr_capable(Set Fast Connectable -> Set Scan Type -> Set Class -> * Set Name -> Set EIR) + * HCI_FORCE_STATIC_ADDR | BDADDR_ANY && !HCI_BREDR_ENABLED (Set Static Address) */ int hci_powered_update_sync(struct hci_dev *hdev) { @@ -3094,6 +3095,23 @@ int hci_powered_update_sync(struct hci_dev *hdev) hci_update_eir_sync(hdev); } + /* If forcing static address is in use or there is no public + * address use the static address as random address (but skip + * the HCI command if the current random address is already the + * static one. + * + * In case BR/EDR has been disabled on a dual-mode controller + * and a static address has been configured, then use that + * address instead of the public BR/EDR address. + */ + if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) || + (!bacmp(&hdev->bdaddr, BDADDR_ANY) && + !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))) { + if (bacmp(&hdev->static_addr, BDADDR_ANY)) + return hci_set_random_addr_sync(hdev, + &hdev->static_addr); + } + return 0; } -- cgit v1.2.3-70-g09d2 From eeb1aafe97fa6da558157d2eb18cce25878b8656 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 7 Oct 2022 18:08:43 -0700 Subject: Bluetooth: hci_sync: Fix not able to set force_static_address force_static_address shall be writable while hdev is initing but is not considered powered yet since the static address is written only when powered. Signed-off-by: Luiz Augusto von Dentz Tested-by: Brian Gix --- net/bluetooth/hci_debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 3f401ec5bb0c..b7f682922a16 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -757,7 +757,7 @@ static ssize_t force_static_address_write(struct file *file, bool enable; int err; - if (test_bit(HCI_UP, &hdev->flags)) + if (hdev_is_powered(hdev)) return -EBUSY; err = kstrtobool_from_user(user_buf, count, &enable); -- cgit v1.2.3-70-g09d2 From 97dfaf073f5881c624856ef293be307b6166115c Mon Sep 17 00:00:00 2001 From: Archie Pusaka Date: Wed, 5 Oct 2022 15:09:47 +0800 Subject: Bluetooth: hci_sync: cancel cmd_timer if hci_open failed If a command is already sent, we take care of freeing it, but we also need to cancel the timeout as well. Signed-off-by: Archie Pusaka Reviewed-by: Abhishek Pandit-Subedi Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 1cd05072dbb7..4debe205d4af 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -4721,6 +4721,7 @@ int hci_dev_open_sync(struct hci_dev *hdev) hdev->flush(hdev); if (hdev->sent_cmd) { + cancel_delayed_work_sync(&hdev->cmd_timer); kfree_skb(hdev->sent_cmd); hdev->sent_cmd = NULL; } -- cgit v1.2.3-70-g09d2 From d11ab690c300becca17223531f9ca4ae29284c52 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Tue, 11 Oct 2022 22:25:34 +0300 Subject: Bluetooth: hci_conn: use HCI dst_type values also for BIS For ISO BIS related functions in hci_conn.c, make dst_type values be HCI address type values, not ISO socket address type values. This makes it consistent with CIS functions. Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 6 ------ net/bluetooth/iso.c | 6 ++++-- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index a6c12863a253..df914f521c9d 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -2052,12 +2052,6 @@ int hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, return -ENOMEM; } - /* Convert from ISO socket address type to HCI address type */ - if (dst_type == BDADDR_LE_PUBLIC) - dst_type = ADDR_LE_DEV_PUBLIC; - else - dst_type = ADDR_LE_DEV_RANDOM; - memset(cp, 0, sizeof(*cp)); cp->sid = sid; cp->addr_type = dst_type; diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 26db929b97c4..e23aabf4e0cf 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -270,7 +270,8 @@ static int iso_connect_bis(struct sock *sk) goto done; } - hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst, iso_pi(sk)->dst_type, + hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst, + le_addr_type(iso_pi(sk)->dst_type), &iso_pi(sk)->qos, iso_pi(sk)->base_len, iso_pi(sk)->base); if (IS_ERR(hcon)) { @@ -875,7 +876,8 @@ static int iso_listen_bis(struct sock *sk) hci_dev_lock(hdev); - err = hci_pa_create_sync(hdev, &iso_pi(sk)->dst, iso_pi(sk)->dst_type, + err = hci_pa_create_sync(hdev, &iso_pi(sk)->dst, + le_addr_type(iso_pi(sk)->dst_type), iso_pi(sk)->bc_sid); hci_dev_unlock(hdev); -- cgit v1.2.3-70-g09d2 From 37224a290853a69f7e8e8c2269c9caea6e2bcb4b Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Mon, 17 Oct 2022 13:47:13 +0800 Subject: Bluetooth: Use kzalloc instead of kmalloc/memset Use kzalloc rather than duplicating its implementation, which makes code simple and easy to understand. ./net/bluetooth/hci_conn.c:2038:6-13: WARNING: kzalloc should be used for cp, instead of kmalloc/memset. Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=2406 Reported-by: Abaci Robot Signed-off-by: Jiapeng Chong Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index df914f521c9d..07ddc1ef1aff 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -2046,13 +2046,12 @@ int hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, if (hci_dev_test_and_set_flag(hdev, HCI_PA_SYNC)) return -EBUSY; - cp = kmalloc(sizeof(*cp), GFP_KERNEL); + cp = kzalloc(sizeof(*cp), GFP_KERNEL); if (!cp) { hci_dev_clear_flag(hdev, HCI_PA_SYNC); return -ENOMEM; } - memset(cp, 0, sizeof(*cp)); cp->sid = sid; cp->addr_type = dst_type; bacpy(&cp->addr, dst); -- cgit v1.2.3-70-g09d2 From 0d75da38e060d21f948b3df5f5e349c962cf1ed2 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 20 Oct 2022 10:16:56 +0800 Subject: Bluetooth: hci_core: fix error handling in hci_register_dev() If hci_register_suspend_notifier() returns error, the hdev and rfkill are leaked. We could disregard the error and print a warning message instead to avoid leaks, as it just means we won't be handing suspend requests. Fixes: 9952d90ea288 ("Bluetooth: Handle PM_SUSPEND_PREPARE and PM_POST_SUSPEND") Signed-off-by: Yang Yingliang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index d97fac4f7130..56f8569ace86 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2660,7 +2660,7 @@ int hci_register_dev(struct hci_dev *hdev) error = hci_register_suspend_notifier(hdev); if (error) - goto err_wqueue; + BT_WARN("register suspend notifier failed error:%d\n", error); queue_work(hdev->req_workqueue, &hdev->power_on); -- cgit v1.2.3-70-g09d2 From 3b1c7c00b8c22b3cb79532252c59eb0b287bb86d Mon Sep 17 00:00:00 2001 From: Inga Stotland Date: Fri, 21 Oct 2022 17:48:56 -0700 Subject: Bluetooth: MGMT: Fix error report for ADD_EXT_ADV_PARAMS When validating the parameter length for MGMT_OP_ADD_EXT_ADV_PARAMS command, use the correct op code in error status report: was MGMT_OP_ADD_ADVERTISING, changed to MGMT_OP_ADD_EXT_ADV_PARAMS. Fixes: 12410572833a2 ("Bluetooth: Break add adv into two mgmt commands") Signed-off-by: Inga Stotland Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index a92e7e485feb..0dd30a3beb77 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -8859,7 +8859,7 @@ static int add_ext_adv_params(struct sock *sk, struct hci_dev *hdev, * extra parameters we don't know about will be ignored in this request. */ if (data_len < MGMT_ADD_EXT_ADV_PARAMS_MIN_SIZE) - return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS, MGMT_STATUS_INVALID_PARAMS); flags = __le32_to_cpu(cp->flags); -- cgit v1.2.3-70-g09d2 From 462fcd53924ccc21596d30edd0673fa7c939b392 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Thu, 27 Oct 2022 16:18:04 -0700 Subject: Bluetooth: Add CONFIG_BT_LE_L2CAP_ECRED This adds CONFIG_BT_LE_L2CAP_ECRED which can be used to enable L2CAP Enhanced Credit Flow Control Mode by default, previously it was only possible to set it via module parameter (e.g. bluetooth.enable_ecred=1). Since L2CAP ECRED mode is required by the likes of EATT which is recommended for LE Audio this enables it by default. Signed-off-by: Luiz Augusto von Dentz Tested-By: Tedd Ho-Jeong An --- net/bluetooth/Kconfig | 11 +++++++++++ net/bluetooth/l2cap_core.c | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index ae3bdc6dfc92..da7cac0a1b71 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -78,6 +78,17 @@ config BT_LE Bluetooth Low Energy includes support low-energy physical layer available with Bluetooth version 4.0 or later. +config BT_LE_L2CAP_ECRED + bool "Bluetooth L2CAP Enhanced Credit Flow Control" + depends on BT_LE + default y + help + Bluetooth Low Energy L2CAP Enhanced Credit Flow Control available with + Bluetooth version 5.2 or later. + + This can be overridden by passing bluetooth.enable_ecred=[1|0] + on the kernel commandline. + config BT_6LOWPAN tristate "Bluetooth 6LoWPAN support" depends on BT_LE && 6LOWPAN diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 9fdede5fe71c..a3e0dc6a6e73 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -45,7 +45,7 @@ #define LE_FLOWCTL_MAX_CREDITS 65535 bool disable_ertm; -bool enable_ecred; +bool enable_ecred = IS_ENABLED(CONFIG_BT_LE_L2CAP_ECRED); static u32 l2cap_feat_mask = L2CAP_FEAT_FIXED_CHAN | L2CAP_FEAT_UCD; -- cgit v1.2.3-70-g09d2 From 63db780a93eb802ece1bbf61ab5894ad8827b56e Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 30 Oct 2022 08:00:03 +0100 Subject: Bluetooth: Fix EALREADY and ELOOP cases in bt_status() 'err' is known to be <0 at this point. So, some cases can not be reached because of a missing "-". Add it. Fixes: ca2045e059c3 ("Bluetooth: Add bt_status") Signed-off-by: Christophe JAILLET Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/lib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c index 469a0c95b6e8..53a796ac078c 100644 --- a/net/bluetooth/lib.c +++ b/net/bluetooth/lib.c @@ -170,7 +170,7 @@ __u8 bt_status(int err) case -EMLINK: return 0x09; - case EALREADY: + case -EALREADY: return 0x0b; case -EBUSY: @@ -191,7 +191,7 @@ __u8 bt_status(int err) case -ECONNABORTED: return 0x16; - case ELOOP: + case -ELOOP: return 0x17; case -EPROTONOSUPPORT: -- cgit v1.2.3-70-g09d2 From 3958e87783e746a2ce78fbb30f24231569f03543 Mon Sep 17 00:00:00 2001 From: Kang Minchul Date: Mon, 31 Oct 2022 03:17:22 +0900 Subject: Bluetooth: Use kzalloc instead of kmalloc/memset Replace kmalloc+memset by kzalloc for better readability and simplicity. This addresses the cocci warning below: WARNING: kzalloc should be used for d, instead of kmalloc/memset Signed-off-by: Kang Minchul Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 07ddc1ef1aff..3287b2ca789e 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -824,11 +824,10 @@ static int hci_le_terminate_big(struct hci_dev *hdev, u8 big, u8 bis) bt_dev_dbg(hdev, "big 0x%2.2x bis 0x%2.2x", big, bis); - d = kmalloc(sizeof(*d), GFP_KERNEL); + d = kzalloc(sizeof(*d), GFP_KERNEL); if (!d) return -ENOMEM; - memset(d, 0, sizeof(*d)); d->big = big; d->bis = bis; @@ -861,11 +860,10 @@ static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, u16 sync_handle) bt_dev_dbg(hdev, "big 0x%2.2x sync_handle 0x%4.4x", big, sync_handle); - d = kmalloc(sizeof(*d), GFP_KERNEL); + d = kzalloc(sizeof(*d), GFP_KERNEL); if (!d) return -ENOMEM; - memset(d, 0, sizeof(*d)); d->big = big; d->sync_handle = sync_handle; -- cgit v1.2.3-70-g09d2 From ad38e55e1c89384aecee1bb0425bf1bf21ec86fd Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Fri, 4 Nov 2022 22:13:00 +0100 Subject: Bluetooth: hci_event: Ignore reserved bits in LE Extended Adv Report Broadcom controllers present on Apple Silicon devices use the upper 8 bits of the event type in the LE Extended Advertising Report for the channel on which the frame has been received. These bits are reserved according to the Bluetooth spec anyway such that we can just drop them to ensure that the advertising results are parsed correctly. The following excerpt from a btmon trace shows a report received on channel 37 by these controllers: > HCI Event: LE Meta Event (0x3e) plen 55 LE Extended Advertising Report (0x0d) Num reports: 1 Entry 0 Event type: 0x2513 Props: 0x0013 Connectable Scannable Use legacy advertising PDUs Data status: Complete Reserved (0x2500) Legacy PDU Type: Reserved (0x2513) Address type: Public (0x00) Address: XX:XX:XX:XX:XX:XX (Shenzhen Jingxun Software [...]) Primary PHY: LE 1M Secondary PHY: No packets SID: no ADI field (0xff) TX power: 127 dBm RSSI: -76 dBm (0xb4) Periodic advertising interval: 0.00 msec (0x0000) Direct address type: Public (0x00) Direct address: 00:00:00:00:00:00 (OUI 00-00-00) Data length: 0x1d [...] Flags: 0x18 Simultaneous LE and BR/EDR (Controller) Simultaneous LE and BR/EDR (Host) Company: Harman International Industries, Inc. (87) Data: [...] Service Data (UUID 0xfddf): Name (complete): JBL Flip 5 Signed-off-by: Sven Peter Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 + net/bluetooth/hci_event.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 684f1cd28730..a035ff6055da 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -2590,6 +2590,7 @@ struct hci_ev_le_conn_complete { #define LE_EXT_ADV_DIRECT_IND 0x0004 #define LE_EXT_ADV_SCAN_RSP 0x0008 #define LE_EXT_ADV_LEGACY_PDU 0x0010 +#define LE_EXT_ADV_EVT_TYPE_MASK 0x007f #define ADDR_LE_DEV_PUBLIC 0x00 #define ADDR_LE_DEV_RANDOM 0x01 diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index faca701bce2a..ade2628aae0d 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6494,7 +6494,7 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, void *data, info->length)) break; - evt_type = __le16_to_cpu(info->type); + evt_type = __le16_to_cpu(info->type) & LE_EXT_ADV_EVT_TYPE_MASK; legacy_evt_type = ext_evt_type_to_legacy(hdev, evt_type); if (legacy_evt_type != LE_ADV_INVALID) { process_adv_report(hdev, legacy_evt_type, &info->bdaddr, -- cgit v1.2.3-70-g09d2 From ffcb0a445ec2d5753751437706aa0a7ea8351099 Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Fri, 4 Nov 2022 22:13:02 +0100 Subject: Bluetooth: Add quirk to disable MWS Transport Configuration Broadcom 4378/4387 controllers found in Apple Silicon Macs claim to support getting MWS Transport Layer Configuration, < HCI Command: Read Local Supported... (0x04|0x0002) plen 0 > HCI Event: Command Complete (0x0e) plen 68 Read Local Supported Commands (0x04|0x0002) ncmd 1 Status: Success (0x00) [...] Get MWS Transport Layer Configuration (Octet 30 - Bit 3)] [...] , but then don't actually allow the required command: > HCI Event: Command Complete (0x0e) plen 15 Get MWS Transport Layer Configuration (0x05|0x000c) ncmd 1 Status: Command Disallowed (0x0c) Number of transports: 0 Baud rate list: 0 entries 00 00 00 00 00 00 00 00 00 00 Signed-off-by: Sven Peter Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 10 ++++++++++ include/net/bluetooth/hci_core.h | 3 +++ net/bluetooth/hci_sync.c | 2 +- 3 files changed, 14 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 4cf6bc363e4f..8d773b042c85 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -284,6 +284,16 @@ enum { * during the hdev->setup vendor callback. */ HCI_QUIRK_BROKEN_EXT_SCAN, + + /* + * When this quirk is set, the HCI_OP_GET_MWS_TRANSPORT_CONFIG command is + * disabled. This is required for some Broadcom controllers which + * erroneously claim to support MWS Transport Layer Configuration. + * + * This quirk can be set before hci_register_dev is called or + * during the hdev->setup vendor callback. + */ + HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, }; /* HCI device flags */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 1113d74e1f9f..7254edfba4c9 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1720,6 +1720,9 @@ void hci_conn_del_sysfs(struct hci_conn *conn); ((dev)->le_features[3] & HCI_LE_CIS_PERIPHERAL) #define bis_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_BROADCASTER) +#define mws_transport_config_capable(dev) (((dev)->commands[30] & 0x08) && \ + (!test_bit(HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, &(dev)->quirks))) + /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 4debe205d4af..9e2d7e4b850c 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -4279,7 +4279,7 @@ static int hci_read_local_pairing_opts_sync(struct hci_dev *hdev) /* Get MWS transport configuration if the HCI command is supported */ static int hci_get_mws_transport_config_sync(struct hci_dev *hdev) { - if (!(hdev->commands[30] & 0x08)) + if (!mws_transport_config_capable(hdev)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_GET_MWS_TRANSPORT_CONFIG, -- cgit v1.2.3-70-g09d2 From 50757a259ba78c4e938b5735e76ffec6cd0c942e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 5 Dec 2022 17:11:57 -0800 Subject: Bluetooth: hci_conn: Fix crash on hci_create_cis_sync When attempting to connect multiple ISO sockets without using DEFER_SETUP may result in the following crash: BUG: KASAN: null-ptr-deref in hci_create_cis_sync+0x18b/0x2b0 Read of size 2 at addr 0000000000000036 by task kworker/u3:1/50 CPU: 0 PID: 50 Comm: kworker/u3:1 Not tainted 6.0.0-rc7-02243-gb84a13ff4eda #4373 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.0-1.fc36 04/01/2014 Workqueue: hci0 hci_cmd_sync_work Call Trace: dump_stack_lvl+0x19/0x27 kasan_report+0xbc/0xf0 ? hci_create_cis_sync+0x18b/0x2b0 hci_create_cis_sync+0x18b/0x2b0 ? get_link_mode+0xd0/0xd0 ? __ww_mutex_lock_slowpath+0x10/0x10 ? mutex_lock+0xe0/0xe0 ? get_link_mode+0xd0/0xd0 hci_cmd_sync_work+0x111/0x190 process_one_work+0x427/0x650 worker_thread+0x87/0x750 ? process_one_work+0x650/0x650 kthread+0x14e/0x180 ? kthread_exit+0x50/0x50 ret_from_fork+0x22/0x30 Fixes: 26afbd826ee3 ("Bluetooth: Add initial implementation of CIS connections") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 3287b2ca789e..d3e542c2fc3e 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1879,7 +1879,7 @@ static int hci_create_cis_sync(struct hci_dev *hdev, void *data) continue; /* Check if all CIS(s) belonging to a CIG are ready */ - if (conn->link->state != BT_CONNECTED || + if (!conn->link || conn->link->state != BT_CONNECTED || conn->state != BT_CONNECT) { cmd.cp.num_cis = 0; break; -- cgit v1.2.3-70-g09d2 From 39c1eb6fcbae8ce9bb71b2ac5cb609355a2b181b Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Wed, 7 Dec 2022 10:18:34 +0800 Subject: Bluetooth: hci_core: don't call kfree_skb() under spin_lock_irqsave() It is not allowed to call kfree_skb() from hardware interrupt context or with interrupts being disabled. So replace kfree_skb() with dev_kfree_skb_irq() under spin_lock_irqsave(). Fixes: 9238f36a5a50 ("Bluetooth: Add request cmd_complete and cmd_status functions") Signed-off-by: Yang Yingliang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 56f8569ace86..b65c3aabcd53 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3985,7 +3985,7 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, *req_complete_skb = bt_cb(skb)->hci.req_complete_skb; else *req_complete = bt_cb(skb)->hci.req_complete; - kfree_skb(skb); + dev_kfree_skb_irq(skb); } spin_unlock_irqrestore(&hdev->cmd_q.lock, flags); } -- cgit v1.2.3-70-g09d2 From 0ba18967d4544955b2eff2fbc4f2a8750c4df90a Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Wed, 7 Dec 2022 10:18:35 +0800 Subject: Bluetooth: RFCOMM: don't call kfree_skb() under spin_lock_irqsave() It is not allowed to call kfree_skb() from hardware interrupt context or with interrupts being disabled. So replace kfree_skb() with dev_kfree_skb_irq() under spin_lock_irqsave(). Fixes: 81be03e026dc ("Bluetooth: RFCOMM: Replace use of memcpy_from_msg with bt_skb_sendmmsg") Signed-off-by: Yang Yingliang Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/rfcomm/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 7324764384b6..8d6fce9005bd 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -590,7 +590,7 @@ int rfcomm_dlc_send(struct rfcomm_dlc *d, struct sk_buff *skb) ret = rfcomm_dlc_send_frag(d, frag); if (ret < 0) { - kfree_skb(frag); + dev_kfree_skb_irq(frag); goto unlock; } -- cgit v1.2.3-70-g09d2 From 241f51931c35085449502c10f64fb3ecd6e02171 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 6 Dec 2022 16:34:42 -0800 Subject: Bluetooth: ISO: Avoid circular locking dependency This attempts to avoid circular locking dependency between sock_lock and hdev_lock: WARNING: possible circular locking dependency detected 6.0.0-rc7-03728-g18dd8ab0a783 #3 Not tainted ------------------------------------------------------ kworker/u3:2/53 is trying to acquire lock: ffff888000254130 (sk_lock-AF_BLUETOOTH-BTPROTO_ISO){+.+.}-{0:0}, at: iso_conn_del+0xbd/0x1d0 but task is already holding lock: ffffffff9f39a080 (hci_cb_list_lock){+.+.}-{3:3}, at: hci_le_cis_estabilished_evt+0x1b5/0x500 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (hci_cb_list_lock){+.+.}-{3:3}: __mutex_lock+0x10e/0xfe0 hci_le_remote_feat_complete_evt+0x17f/0x320 hci_event_packet+0x39c/0x7d0 hci_rx_work+0x2bf/0x950 process_one_work+0x569/0x980 worker_thread+0x2a3/0x6f0 kthread+0x153/0x180 ret_from_fork+0x22/0x30 -> #1 (&hdev->lock){+.+.}-{3:3}: __mutex_lock+0x10e/0xfe0 iso_connect_cis+0x6f/0x5a0 iso_sock_connect+0x1af/0x710 __sys_connect+0x17e/0x1b0 __x64_sys_connect+0x37/0x50 do_syscall_64+0x43/0x90 entry_SYSCALL_64_after_hwframe+0x62/0xcc -> #0 (sk_lock-AF_BLUETOOTH-BTPROTO_ISO){+.+.}-{0:0}: __lock_acquire+0x1b51/0x33d0 lock_acquire+0x16f/0x3b0 lock_sock_nested+0x32/0x80 iso_conn_del+0xbd/0x1d0 iso_connect_cfm+0x226/0x680 hci_le_cis_estabilished_evt+0x1ed/0x500 hci_event_packet+0x39c/0x7d0 hci_rx_work+0x2bf/0x950 process_one_work+0x569/0x980 worker_thread+0x2a3/0x6f0 kthread+0x153/0x180 ret_from_fork+0x22/0x30 other info that might help us debug this: Chain exists of: sk_lock-AF_BLUETOOTH-BTPROTO_ISO --> &hdev->lock --> hci_cb_list_lock Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(hci_cb_list_lock); lock(&hdev->lock); lock(hci_cb_list_lock); lock(sk_lock-AF_BLUETOOTH-BTPROTO_ISO); *** DEADLOCK *** 4 locks held by kworker/u3:2/53: #0: ffff8880021d9130 ((wq_completion)hci0#2){+.+.}-{0:0}, at: process_one_work+0x4ad/0x980 #1: ffff888002387de0 ((work_completion)(&hdev->rx_work)){+.+.}-{0:0}, at: process_one_work+0x4ad/0x980 #2: ffff888001ac0070 (&hdev->lock){+.+.}-{3:3}, at: hci_le_cis_estabilished_evt+0xc3/0x500 #3: ffffffff9f39a080 (hci_cb_list_lock){+.+.}-{3:3}, at: hci_le_cis_estabilished_evt+0x1b5/0x500 Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 61 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index e23aabf4e0cf..035bb5d25f85 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -261,13 +261,13 @@ static int iso_connect_bis(struct sock *sk) if (!bis_capable(hdev)) { err = -EOPNOTSUPP; - goto done; + goto unlock; } /* Fail if out PHYs are marked as disabled */ if (!iso_pi(sk)->qos.out.phy) { err = -EINVAL; - goto done; + goto unlock; } hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst, @@ -276,22 +276,27 @@ static int iso_connect_bis(struct sock *sk) iso_pi(sk)->base); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); - goto done; + goto unlock; } conn = iso_conn_add(hcon); if (!conn) { hci_conn_drop(hcon); err = -ENOMEM; - goto done; + goto unlock; } + hci_dev_unlock(hdev); + hci_dev_put(hdev); + + lock_sock(sk); + /* Update source addr of the socket */ bacpy(&iso_pi(sk)->src, &hcon->src); err = iso_chan_add(conn, sk, NULL); if (err) - goto done; + goto release; if (hcon->state == BT_CONNECTED) { iso_sock_clear_timer(sk); @@ -301,7 +306,11 @@ static int iso_connect_bis(struct sock *sk) iso_sock_set_timer(sk, sk->sk_sndtimeo); } -done: +release: + release_sock(sk); + return err; + +unlock: hci_dev_unlock(hdev); hci_dev_put(hdev); return err; @@ -325,13 +334,13 @@ static int iso_connect_cis(struct sock *sk) if (!cis_central_capable(hdev)) { err = -EOPNOTSUPP; - goto done; + goto unlock; } /* Fail if either PHYs are marked as disabled */ if (!iso_pi(sk)->qos.in.phy && !iso_pi(sk)->qos.out.phy) { err = -EINVAL; - goto done; + goto unlock; } /* Just bind if DEFER_SETUP has been set */ @@ -341,7 +350,7 @@ static int iso_connect_cis(struct sock *sk) &iso_pi(sk)->qos); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); - goto done; + goto unlock; } } else { hcon = hci_connect_cis(hdev, &iso_pi(sk)->dst, @@ -349,7 +358,7 @@ static int iso_connect_cis(struct sock *sk) &iso_pi(sk)->qos); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); - goto done; + goto unlock; } } @@ -357,15 +366,20 @@ static int iso_connect_cis(struct sock *sk) if (!conn) { hci_conn_drop(hcon); err = -ENOMEM; - goto done; + goto unlock; } + hci_dev_unlock(hdev); + hci_dev_put(hdev); + + lock_sock(sk); + /* Update source addr of the socket */ bacpy(&iso_pi(sk)->src, &hcon->src); err = iso_chan_add(conn, sk, NULL); if (err) - goto done; + goto release; if (hcon->state == BT_CONNECTED) { iso_sock_clear_timer(sk); @@ -378,7 +392,11 @@ static int iso_connect_cis(struct sock *sk) iso_sock_set_timer(sk, sk->sk_sndtimeo); } -done: +release: + release_sock(sk); + return err; + +unlock: hci_dev_unlock(hdev); hci_dev_put(hdev); return err; @@ -832,20 +850,23 @@ static int iso_sock_connect(struct socket *sock, struct sockaddr *addr, bacpy(&iso_pi(sk)->dst, &sa->iso_bdaddr); iso_pi(sk)->dst_type = sa->iso_bdaddr_type; + release_sock(sk); + if (bacmp(&iso_pi(sk)->dst, BDADDR_ANY)) err = iso_connect_cis(sk); else err = iso_connect_bis(sk); if (err) - goto done; + return err; + + lock_sock(sk); if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { err = bt_sock_wait_state(sk, BT_CONNECTED, sock_sndtimeo(sk, flags & O_NONBLOCK)); } -done: release_sock(sk); return err; } @@ -1101,28 +1122,22 @@ static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg, { struct sock *sk = sock->sk; struct iso_pinfo *pi = iso_pi(sk); - int err; BT_DBG("sk %p", sk); - lock_sock(sk); - if (test_and_clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { switch (sk->sk_state) { case BT_CONNECT2: + lock_sock(sk); iso_conn_defer_accept(pi->conn->hcon); sk->sk_state = BT_CONFIG; release_sock(sk); return 0; case BT_CONNECT: - err = iso_connect_cis(sk); - release_sock(sk); - return err; + return iso_connect_cis(sk); } } - release_sock(sk); - return bt_sock_recvmsg(sock, msg, len, flags); } -- cgit v1.2.3-70-g09d2 From 7aca0ac4792e6cb0f35ef97bfcb39b1663a92fb7 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 7 Dec 2022 11:56:57 -0800 Subject: Bluetooth: Wait for HCI_OP_WRITE_AUTH_PAYLOAD_TO to complete This make sure HCI_OP_WRITE_AUTH_PAYLOAD_TO completes before notifying the encryption change just as is done with HCI_OP_READ_ENC_KEY_SIZE. Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index ade2628aae0d..0594af4e37ca 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -801,9 +801,6 @@ static u8 hci_cc_write_auth_payload_timeout(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "status 0x%2.2x", rp->status); - if (rp->status) - return rp->status; - sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_AUTH_PAYLOAD_TO); if (!sent) return rp->status; @@ -811,9 +808,17 @@ static u8 hci_cc_write_auth_payload_timeout(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle)); - if (conn) + if (!conn) { + rp->status = 0xff; + goto unlock; + } + + if (!rp->status) conn->auth_payload_timeout = get_unaligned_le16(sent + 2); + hci_encrypt_cfm(conn, 0); + +unlock: hci_dev_unlock(hdev); return rp->status; @@ -3680,8 +3685,13 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, void *data, cp.handle = cpu_to_le16(conn->handle); cp.timeout = cpu_to_le16(hdev->auth_payload_timeout); - hci_send_cmd(conn->hdev, HCI_OP_WRITE_AUTH_PAYLOAD_TO, - sizeof(cp), &cp); + if (hci_send_cmd(conn->hdev, HCI_OP_WRITE_AUTH_PAYLOAD_TO, + sizeof(cp), &cp)) { + bt_dev_err(hdev, "write auth payload timeout failed"); + goto notify; + } + + goto unlock; } notify: -- cgit v1.2.3-70-g09d2 From 8f18655c49eb6abfe7fc3711d32d23b311fbc6a6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 9 Dec 2022 19:58:40 +0200 Subject: net: dsa: don't call ptp_classify_raw() if switch doesn't provide RX timestamping ptp_classify_raw() is not exactly cheap, since it invokes a BPF program for every skb in the receive path. For switches which do not provide ds->ops->port_rxtstamp(), running ptp_classify_raw() provides precisely nothing, so check for the presence of the function pointer first, since that is much cheaper. Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Reviewed-by: Kurt Kanzenbach Link: https://lore.kernel.org/r/20221209175840.390707-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/tag.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/dsa/tag.c b/net/dsa/tag.c index 383721e167d6..b2fba1a003ce 100644 --- a/net/dsa/tag.c +++ b/net/dsa/tag.c @@ -33,6 +33,9 @@ static bool dsa_skb_defer_rx_timestamp(struct dsa_slave_priv *p, struct dsa_switch *ds = p->dp->ds; unsigned int type; + if (!ds->ops->port_rxtstamp) + return false; + if (skb_headroom(skb) < ETH_HLEN) return false; @@ -45,10 +48,7 @@ static bool dsa_skb_defer_rx_timestamp(struct dsa_slave_priv *p, if (type == PTP_CLASS_NONE) return false; - if (likely(ds->ops->port_rxtstamp)) - return ds->ops->port_rxtstamp(ds, p->dp->index, skb, type); - - return false; + return ds->ops->port_rxtstamp(ds, p->dp->index, skb, type); } static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, -- cgit v1.2.3-70-g09d2 From d7b061b80ee6f91aa0b89daa3069802d7ea4c57f Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Mon, 12 Dec 2022 11:24:26 +0800 Subject: net: tso: inline tso_count_descs() tso_count_descs() is a small function doing simple calculation, and tso_count_descs() is used in fast path, so inline it to reduce the overhead of calls. Signed-off-by: Yunsheng Lin Link: https://lore.kernel.org/r/20221212032426.16050-1-linyunsheng@huawei.com Signed-off-by: Jakub Kicinski --- include/net/tso.h | 8 +++++++- net/core/tso.c | 8 -------- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/tso.h b/include/net/tso.h index 62c98a9c60f1..e7e157ae0526 100644 --- a/include/net/tso.h +++ b/include/net/tso.h @@ -2,6 +2,7 @@ #ifndef _TSO_H #define _TSO_H +#include #include #define TSO_HEADER_SIZE 256 @@ -16,7 +17,12 @@ struct tso_t { u32 tcp_seq; }; -int tso_count_descs(const struct sk_buff *skb); +/* Calculate the worst case buffer count */ +static inline int tso_count_descs(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags; +} + void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso, int size, bool is_last); void tso_build_data(const struct sk_buff *skb, struct tso_t *tso, int size); diff --git a/net/core/tso.c b/net/core/tso.c index 4148f6d48953..e00796e3b146 100644 --- a/net/core/tso.c +++ b/net/core/tso.c @@ -5,14 +5,6 @@ #include #include -/* Calculate expected number of TX descriptors */ -int tso_count_descs(const struct sk_buff *skb) -{ - /* The Marvell Way */ - return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags; -} -EXPORT_SYMBOL(tso_count_descs); - void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso, int size, bool is_last) { -- cgit v1.2.3-70-g09d2 From 8a321cf7becc6c065ae595b837b826a2a81036b9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 9 Dec 2022 10:21:38 -0500 Subject: net: add IFF_NO_ADDRCONF and use it in bonding to prevent ipv6 addrconf Currently, in bonding it reused the IFF_SLAVE flag and checked it in ipv6 addrconf to prevent ipv6 addrconf. However, it is not a proper flag to use for no ipv6 addrconf, for bonding it has to move IFF_SLAVE flag setting ahead of dev_open() in bond_enslave(). Also, IFF_MASTER/SLAVE are historical flags used in bonding and eql, as Jiri mentioned, the new devices like Team, Failover do not use this flag. So as Jiri suggested, this patch adds IFF_NO_ADDRCONF in priv_flags of the device to indicate no ipv6 addconf, and uses it in bonding and moves IFF_SLAVE flag setting back to its original place. Signed-off-by: Xin Long Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 18 +++++++++++++----- include/linux/netdevice.h | 3 ++- net/ipv6/addrconf.c | 4 ++-- 3 files changed, 17 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 4048876f842c..f7767afe116b 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1632,13 +1632,19 @@ static int bond_master_upper_dev_link(struct bonding *bond, struct slave *slave, { struct netdev_lag_upper_info lag_upper_info; enum netdev_lag_tx_type type; + int err; type = bond_lag_tx_type(bond); lag_upper_info.tx_type = type; lag_upper_info.hash_type = bond_lag_hash_type(bond, type); - return netdev_master_upper_dev_link(slave->dev, bond->dev, slave, - &lag_upper_info, extack); + err = netdev_master_upper_dev_link(slave->dev, bond->dev, slave, + &lag_upper_info, extack); + if (err) + return err; + + slave->dev->flags |= IFF_SLAVE; + return 0; } static void bond_upper_dev_unlink(struct bonding *bond, struct slave *slave) @@ -1950,8 +1956,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, } } - /* set slave flag before open to prevent IPv6 addrconf */ - slave_dev->flags |= IFF_SLAVE; + /* set no_addrconf flag before open to prevent IPv6 addrconf */ + slave_dev->priv_flags |= IFF_NO_ADDRCONF; /* open the slave since the application closed it */ res = dev_open(slave_dev, extack); @@ -2254,7 +2260,7 @@ err_close: dev_close(slave_dev); err_restore_mac: - slave_dev->flags &= ~IFF_SLAVE; + slave_dev->priv_flags &= ~IFF_NO_ADDRCONF; if (!bond->params.fail_over_mac || BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) { /* XXX TODO - fom follow mode needs to change master's @@ -2446,6 +2452,8 @@ static int __bond_release_one(struct net_device *bond_dev, /* close slave before restoring its mac address */ dev_close(slave_dev); + slave_dev->priv_flags &= ~IFF_NO_ADDRCONF; + if (bond->params.fail_over_mac != BOND_FOM_ACTIVE || BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) { /* restore original ("permanent") mac address */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2287cb8eb9e4..aad12a179e54 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1662,6 +1662,7 @@ struct net_device_ops { * @IFF_FAILOVER: device is a failover master device * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device + * @IFF_NO_ADDRCONF: prevent ipv6 addrconf * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with * skb_headlen(skb) == 0 (data starts from frag0) * @IFF_CHANGE_PROTO_DOWN: device supports setting carrier via IFLA_PROTO_DOWN @@ -1697,7 +1698,7 @@ enum netdev_priv_flags { IFF_FAILOVER = 1<<27, IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, - /* was IFF_LIVE_RENAME_OK */ + IFF_NO_ADDRCONF = BIT_ULL(30), IFF_TX_SKB_NO_LINEAR = BIT_ULL(31), IFF_CHANGE_PROTO_DOWN = BIT_ULL(32), }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 9c3f5202a97b..c338dfb05d2c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3320,7 +3320,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route) return; /* no link local addresses on devices flagged as slaves */ - if (idev->dev->flags & IFF_SLAVE) + if (idev->dev->priv_flags & IFF_NO_ADDRCONF) return; ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); @@ -3560,7 +3560,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, if (idev && idev->cnf.disable_ipv6) break; - if (dev->flags & IFF_SLAVE) { + if (dev->priv_flags & IFF_NO_ADDRCONF) { if (event == NETDEV_UP && !IS_ERR_OR_NULL(idev) && dev->flags & IFF_UP && dev->flags & IFF_MULTICAST) ipv6_mc_up(idev); -- cgit v1.2.3-70-g09d2 From cb54d392279dd450e65f6fa3c3f66db8cbdbcc0e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 9 Dec 2022 10:21:40 -0500 Subject: net: failover: use IFF_NO_ADDRCONF flag to prevent ipv6 addrconf Similar to Bonding and Team, to prevent ipv6 addrconf with IFF_NO_ADDRCONF in slave_dev->priv_flags for slave ports is also needed in net failover. Note that dev_open(slave_dev) is called in .slave_register, which is called after the IFF_NO_ADDRCONF flag is set in failover_slave_register(). Signed-off-by: Xin Long Signed-off-by: Jakub Kicinski --- net/core/failover.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/failover.c b/net/core/failover.c index 655411c4ca51..2a140b3ea669 100644 --- a/net/core/failover.c +++ b/net/core/failover.c @@ -80,14 +80,14 @@ static int failover_slave_register(struct net_device *slave_dev) goto err_upper_link; } - slave_dev->priv_flags |= IFF_FAILOVER_SLAVE; + slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); if (fops && fops->slave_register && !fops->slave_register(slave_dev, failover_dev)) return NOTIFY_OK; netdev_upper_dev_unlink(slave_dev, failover_dev); - slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE; + slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); err_upper_link: netdev_rx_handler_unregister(slave_dev); done: @@ -121,7 +121,7 @@ int failover_slave_unregister(struct net_device *slave_dev) netdev_rx_handler_unregister(slave_dev); netdev_upper_dev_unlink(slave_dev, failover_dev); - slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE; + slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF); if (fops && fops->slave_unregister && !fops->slave_unregister(slave_dev, failover_dev)) -- cgit v1.2.3-70-g09d2 From e095493091e850d5292ad01d8fbf5cde1d89ac53 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Sat, 10 Dec 2022 01:52:42 +0200 Subject: net: dsa: tag_8021q: avoid leaking ctx on dsa_tag_8021q_register() error path If dsa_tag_8021q_setup() fails, for example due to the inability of the device to install a VLAN, the tag_8021q context of the switch will leak. Make sure it is freed on the error path. Fixes: 328621f6131f ("net: dsa: tag_8021q: absorb dsa_8021q_setup into dsa_tag_8021q_{,un}register") Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20221209235242.480344-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/tag_8021q.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 34e5ec5d3e23..89371b16416e 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -398,6 +398,7 @@ static void dsa_tag_8021q_teardown(struct dsa_switch *ds) int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto) { struct dsa_8021q_context *ctx; + int err; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -410,7 +411,15 @@ int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto) ds->tag_8021q_ctx = ctx; - return dsa_tag_8021q_setup(ds); + err = dsa_tag_8021q_setup(ds); + if (err) + goto err_free; + + return 0; + +err_free: + kfree(ctx); + return err; } EXPORT_SYMBOL_GPL(dsa_tag_8021q_register); -- cgit v1.2.3-70-g09d2 From b63e30651c59bdef89ec158879d146e8d89cd5e1 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 10 Dec 2022 16:56:20 +0200 Subject: bridge: mcast: Do not derive entry type from its filter mode Currently, the filter mode (i.e., INCLUDE / EXCLUDE) of MDB entries cannot be set from user space. Instead, it is set by the kernel according to the entry type: (*, G) entries are treated as EXCLUDE and (S, G) entries are treated as INCLUDE. This allows the kernel to derive the entry type from its filter mode. Subsequent patches will allow user space to set the filter mode of (*, G) entries, making the current assumption incorrect. As a preparation, remove the current assumption and instead determine the entry type from its key, which is a more direct way. Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- net/bridge/br_mdb.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index ae7d93c08880..2b6921dbdc02 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -857,17 +857,14 @@ static int br_mdb_add_group(const struct br_mdb_config *cfg, * added to it for proper replication */ if (br_multicast_should_handle_mode(brmctx, group.proto)) { - switch (filter_mode) { - case MCAST_EXCLUDE: - br_multicast_star_g_handle_mode(p, MCAST_EXCLUDE); - break; - case MCAST_INCLUDE: + if (br_multicast_is_star_g(&group)) { + br_multicast_star_g_handle_mode(p, filter_mode); + } else { star_group = p->key.addr; memset(&star_group.src, 0, sizeof(star_group.src)); star_mp = br_mdb_ip_get(br, &star_group); if (star_mp) br_multicast_sg_add_exclude_ports(star_mp, p); - break; } } -- cgit v1.2.3-70-g09d2 From 6ff1e68eb21501042ebf8226d500398fd07350f3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 10 Dec 2022 16:56:21 +0200 Subject: bridge: mcast: Split (*, G) and (S, G) addition into different functions When the bridge is using IGMP version 3 or MLD version 2, it handles the addition of (*, G) and (S, G) entries differently. When a new (S, G) port group entry is added, all the (*, G) EXCLUDE ports need to be added to the port group of the new entry. Similarly, when a new (*, G) EXCLUDE port group entry is added, the port needs to be added to the port group of all the matching (S, G) entries. Subsequent patches will create more differences between both entry types. Namely, filter mode and source list can only be specified for (*, G) entries. Given the current and future differences between both entry types, handle the addition of each entry type in a different function, thereby avoiding the creation of one complex function. Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- net/bridge/br_mdb.c | 145 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 96 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 2b6921dbdc02..e3bd2122d559 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -786,21 +786,107 @@ out: return brmctx; } +static int br_mdb_add_group_sg(const struct br_mdb_config *cfg, + struct net_bridge_mdb_entry *mp, + struct net_bridge_mcast *brmctx, + unsigned char flags, + struct netlink_ext_ack *extack) +{ + struct net_bridge_port_group __rcu **pp; + struct net_bridge_port_group *p; + unsigned long now = jiffies; + + for (pp = &mp->ports; + (p = mlock_dereference(*pp, cfg->br)) != NULL; + pp = &p->next) { + if (p->key.port == cfg->p) { + NL_SET_ERR_MSG_MOD(extack, "(S, G) group is already joined by port"); + return -EEXIST; + } + if ((unsigned long)p->key.port < (unsigned long)cfg->p) + break; + } + + p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL, + MCAST_INCLUDE, RTPROT_STATIC); + if (unlikely(!p)) { + NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new (S, G) port group"); + return -ENOMEM; + } + rcu_assign_pointer(*pp, p); + if (!(flags & MDB_PG_FLAGS_PERMANENT)) + mod_timer(&p->timer, + now + brmctx->multicast_membership_interval); + br_mdb_notify(cfg->br->dev, mp, p, RTM_NEWMDB); + + /* All of (*, G) EXCLUDE ports need to be added to the new (S, G) for + * proper replication. + */ + if (br_multicast_should_handle_mode(brmctx, cfg->group.proto)) { + struct net_bridge_mdb_entry *star_mp; + struct br_ip star_group; + + star_group = p->key.addr; + memset(&star_group.src, 0, sizeof(star_group.src)); + star_mp = br_mdb_ip_get(cfg->br, &star_group); + if (star_mp) + br_multicast_sg_add_exclude_ports(star_mp, p); + } + + return 0; +} + +static int br_mdb_add_group_star_g(const struct br_mdb_config *cfg, + struct net_bridge_mdb_entry *mp, + struct net_bridge_mcast *brmctx, + unsigned char flags, + struct netlink_ext_ack *extack) +{ + struct net_bridge_port_group __rcu **pp; + struct net_bridge_port_group *p; + unsigned long now = jiffies; + + for (pp = &mp->ports; + (p = mlock_dereference(*pp, cfg->br)) != NULL; + pp = &p->next) { + if (p->key.port == cfg->p) { + NL_SET_ERR_MSG_MOD(extack, "(*, G) group is already joined by port"); + return -EEXIST; + } + if ((unsigned long)p->key.port < (unsigned long)cfg->p) + break; + } + + p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL, + MCAST_EXCLUDE, RTPROT_STATIC); + if (unlikely(!p)) { + NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new (*, G) port group"); + return -ENOMEM; + } + rcu_assign_pointer(*pp, p); + if (!(flags & MDB_PG_FLAGS_PERMANENT)) + mod_timer(&p->timer, + now + brmctx->multicast_membership_interval); + br_mdb_notify(cfg->br->dev, mp, p, RTM_NEWMDB); + /* If we are adding a new EXCLUDE port group (*, G), it needs to be + * also added to all (S, G) entries for proper replication. + */ + if (br_multicast_should_handle_mode(brmctx, cfg->group.proto)) + br_multicast_star_g_handle_mode(p, MCAST_EXCLUDE); + + return 0; +} + static int br_mdb_add_group(const struct br_mdb_config *cfg, struct netlink_ext_ack *extack) { - struct net_bridge_mdb_entry *mp, *star_mp; - struct net_bridge_port_group __rcu **pp; struct br_mdb_entry *entry = cfg->entry; struct net_bridge_port *port = cfg->p; + struct net_bridge_mdb_entry *mp; struct net_bridge *br = cfg->br; - struct net_bridge_port_group *p; struct net_bridge_mcast *brmctx; struct br_ip group = cfg->group; - unsigned long now = jiffies; unsigned char flags = 0; - struct br_ip star_group; - u8 filter_mode; brmctx = __br_mdb_choose_context(br, entry, extack); if (!brmctx) @@ -823,52 +909,13 @@ static int br_mdb_add_group(const struct br_mdb_config *cfg, return 0; } - for (pp = &mp->ports; - (p = mlock_dereference(*pp, br)) != NULL; - pp = &p->next) { - if (p->key.port == port) { - NL_SET_ERR_MSG_MOD(extack, "Group is already joined by port"); - return -EEXIST; - } - if ((unsigned long)p->key.port < (unsigned long)port) - break; - } - - filter_mode = br_multicast_is_star_g(&group) ? MCAST_EXCLUDE : - MCAST_INCLUDE; - if (entry->state == MDB_PERMANENT) flags |= MDB_PG_FLAGS_PERMANENT; - p = br_multicast_new_port_group(port, &group, *pp, flags, NULL, - filter_mode, RTPROT_STATIC); - if (unlikely(!p)) { - NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new port group"); - return -ENOMEM; - } - rcu_assign_pointer(*pp, p); - if (entry->state == MDB_TEMPORARY) - mod_timer(&p->timer, - now + brmctx->multicast_membership_interval); - br_mdb_notify(br->dev, mp, p, RTM_NEWMDB); - /* if we are adding a new EXCLUDE port group (*,G) it needs to be also - * added to all S,G entries for proper replication, if we are adding - * a new INCLUDE port (S,G) then all of *,G EXCLUDE ports need to be - * added to it for proper replication - */ - if (br_multicast_should_handle_mode(brmctx, group.proto)) { - if (br_multicast_is_star_g(&group)) { - br_multicast_star_g_handle_mode(p, filter_mode); - } else { - star_group = p->key.addr; - memset(&star_group.src, 0, sizeof(star_group.src)); - star_mp = br_mdb_ip_get(br, &star_group); - if (star_mp) - br_multicast_sg_add_exclude_ports(star_mp, p); - } - } - - return 0; + if (br_multicast_is_star_g(&group)) + return br_mdb_add_group_star_g(cfg, mp, brmctx, flags, extack); + else + return br_mdb_add_group_sg(cfg, mp, brmctx, flags, extack); } static int __br_mdb_add(const struct br_mdb_config *cfg, -- cgit v1.2.3-70-g09d2 From 1870a2d35abb6f4d8ff2213d50bbd082b1f8cde5 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 10 Dec 2022 16:56:22 +0200 Subject: bridge: mcast: Place netlink policy before validation functions Subsequent patches are going to add additional validation functions and netlink policies. Some of these functions will need to perform parsing using nla_parse_nested() and the new policies. In order to keep all the policies next to each other, move the current policy to before the validation functions. Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- net/bridge/br_mdb.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index e3bd2122d559..fcdd464cf997 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -663,6 +663,12 @@ errout: rtnl_set_sk_err(net, RTNLGRP_MDB, err); } +static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = { + [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY, + sizeof(struct in_addr), + sizeof(struct in6_addr)), +}; + static bool is_valid_mdb_entry(struct br_mdb_entry *entry, struct netlink_ext_ack *extack) { @@ -748,12 +754,6 @@ static bool is_valid_mdb_source(struct nlattr *attr, __be16 proto, return true; } -static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = { - [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY, - sizeof(struct in_addr), - sizeof(struct in6_addr)), -}; - static struct net_bridge_mcast * __br_mdb_choose_context(struct net_bridge *br, const struct br_mdb_entry *entry, -- cgit v1.2.3-70-g09d2 From 160dd93114ddd31ed2b6290a7495d53717b79cf8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 10 Dec 2022 16:56:23 +0200 Subject: bridge: mcast: Add a centralized error path Subsequent patches will add memory allocations in br_mdb_config_init() as the MDB configuration structure will include a linked list of source entries. This memory will need to be freed regardless if br_mdb_add() succeeded or failed. As a preparation for this change, add a centralized error path where the memory will be freed. Note that br_mdb_del() already has one error path and therefore does not require any changes. Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- net/bridge/br_mdb.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index fcdd464cf997..95780652cdbf 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -1053,28 +1053,29 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) return err; + err = -EINVAL; /* host join errors which can happen before creating the group */ if (!cfg.p && !br_group_is_l2(&cfg.group)) { /* don't allow any flags for host-joined IP groups */ if (cfg.entry->state) { NL_SET_ERR_MSG_MOD(extack, "Flags are not allowed for host groups"); - return -EINVAL; + goto out; } if (!br_multicast_is_star_g(&cfg.group)) { NL_SET_ERR_MSG_MOD(extack, "Groups with sources cannot be manually host joined"); - return -EINVAL; + goto out; } } if (br_group_is_l2(&cfg.group) && cfg.entry->state != MDB_PERMANENT) { NL_SET_ERR_MSG_MOD(extack, "Only permanent L2 entries allowed"); - return -EINVAL; + goto out; } if (cfg.p) { if (cfg.p->state == BR_STATE_DISABLED && cfg.entry->state != MDB_PERMANENT) { NL_SET_ERR_MSG_MOD(extack, "Port is in disabled state and entry is not permanent"); - return -EINVAL; + goto out; } vg = nbp_vlan_group(cfg.p); } else { @@ -1096,6 +1097,7 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, err = __br_mdb_add(&cfg, extack); } +out: return err; } -- cgit v1.2.3-70-g09d2 From fd0c696164cf13ae0128f14209e2dbfcd86584b8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 10 Dec 2022 16:56:24 +0200 Subject: bridge: mcast: Expose br_multicast_new_group_src() Currently, new group source entries are only created in response to received Membership Reports. Subsequent patches are going to allow user space to install (*, G) entries with a source list. As a preparatory step, expose br_multicast_new_group_src() so that it could later be invoked from the MDB code (i.e., br_mdb.c) that handles RTM_NEWMDB messages. Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- net/bridge/br_multicast.c | 2 +- net/bridge/br_private.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index db4c3900ae95..b2bc23fdcee5 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1232,7 +1232,7 @@ br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip) return NULL; } -static struct net_bridge_group_src * +struct net_bridge_group_src * br_multicast_new_group_src(struct net_bridge_port_group *pg, struct br_ip *src_ip) { struct net_bridge_group_src *grp_src; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 3997e16c15fc..183de6c57d72 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -974,6 +974,9 @@ void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, struct net_bridge_port_group *sg); struct net_bridge_group_src * br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip); +struct net_bridge_group_src * +br_multicast_new_group_src(struct net_bridge_port_group *pg, + struct br_ip *src_ip); void br_multicast_del_group_src(struct net_bridge_group_src *src, bool fastleave); void br_multicast_ctx_init(struct net_bridge *br, -- cgit v1.2.3-70-g09d2 From 083e353482b4c9b727846643ad6ca7b784dd486b Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 10 Dec 2022 16:56:25 +0200 Subject: bridge: mcast: Expose __br_multicast_del_group_src() Expose __br_multicast_del_group_src() which is symmetric to br_multicast_new_group_src() and does not remove the installed {S, G} forwarding entry, unlike br_multicast_del_group_src(). The function will be used in the error path when user space was able to add a new source entry, but failed to install a corresponding forwarding entry. Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- net/bridge/br_multicast.c | 11 ++++++++--- net/bridge/br_private.h | 1 + 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index b2bc23fdcee5..8432b4ea7f28 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -650,18 +650,23 @@ static void br_multicast_destroy_group_src(struct net_bridge_mcast_gc *gc) kfree_rcu(src, rcu); } -void br_multicast_del_group_src(struct net_bridge_group_src *src, - bool fastleave) +void __br_multicast_del_group_src(struct net_bridge_group_src *src) { struct net_bridge *br = src->pg->key.port->br; - br_multicast_fwd_src_remove(src, fastleave); hlist_del_init_rcu(&src->node); src->pg->src_ents--; hlist_add_head(&src->mcast_gc.gc_node, &br->mcast_gc_list); queue_work(system_long_wq, &br->mcast_gc_work); } +void br_multicast_del_group_src(struct net_bridge_group_src *src, + bool fastleave) +{ + br_multicast_fwd_src_remove(src, fastleave); + __br_multicast_del_group_src(src); +} + static void br_multicast_destroy_port_group(struct net_bridge_mcast_gc *gc) { struct net_bridge_port_group *pg; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 183de6c57d72..a3db99d79a3d 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -977,6 +977,7 @@ br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip); struct net_bridge_group_src * br_multicast_new_group_src(struct net_bridge_port_group *pg, struct br_ip *src_ip); +void __br_multicast_del_group_src(struct net_bridge_group_src *src); void br_multicast_del_group_src(struct net_bridge_group_src *src, bool fastleave); void br_multicast_ctx_init(struct net_bridge *br, -- cgit v1.2.3-70-g09d2 From a01ecb1712ddbcd41360ad0c554b460adbac0528 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 10 Dec 2022 16:56:26 +0200 Subject: bridge: mcast: Add a flag for user installed source entries There are a few places where the bridge driver differentiates between (S, G) entries installed by the kernel (in response to Membership Reports) and those installed by user space. One of them is when deleting an (S, G) entry corresponding to a source entry that is being deleted. While user space cannot currently add a source entry to a (*, G), it can add an (S, G) entry that later corresponds to a source entry created by the reception of a Membership Report. If this source entry is later deleted because its source timer expired or because the (*, G) entry is being deleted, the bridge driver will not delete the corresponding (S, G) entry if it was added by user space as permanent. This is going to be a problem when the ability to install a (*, G) with a source list is exposed to user space. In this case, when user space installs the (*, G) as permanent, then all the (S, G) entries corresponding to its source list will also be installed as permanent. When user space deletes the (*, G), all the source entries will be deleted and the expectation is that the corresponding (S, G) entries will be deleted as well. Solve this by introducing a new source entry flag denoting that the entry was installed by user space. When the entry is deleted, delete the corresponding (S, G) entry even if it was installed by user space as permanent, as the flag tells us that it was installed in response to the source entry being created. The flag will be set in a subsequent patch where source entries are created in response to user requests. Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- net/bridge/br_multicast.c | 3 ++- net/bridge/br_private.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 8432b4ea7f28..48170bd3785e 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -552,7 +552,8 @@ static void br_multicast_fwd_src_remove(struct net_bridge_group_src *src, continue; if (p->rt_protocol != RTPROT_KERNEL && - (p->flags & MDB_PG_FLAGS_PERMANENT)) + (p->flags & MDB_PG_FLAGS_PERMANENT) && + !(src->flags & BR_SGRP_F_USER_ADDED)) break; if (fastleave) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index a3db99d79a3d..74f17b56c9eb 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -300,6 +300,7 @@ struct net_bridge_fdb_flush_desc { #define BR_SGRP_F_DELETE BIT(0) #define BR_SGRP_F_SEND BIT(1) #define BR_SGRP_F_INSTALLED BIT(2) +#define BR_SGRP_F_USER_ADDED BIT(3) struct net_bridge_mcast_gc { struct hlist_node gc_node; -- cgit v1.2.3-70-g09d2 From 079afd66161bbbde14ed9f207f7fe6170f5b37b3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 10 Dec 2022 16:56:27 +0200 Subject: bridge: mcast: Avoid arming group timer when (S, G) corresponds to a source User space will soon be able to install a (*, G) with a source list, prompting the creation of a (S, G) entry for each source. In this case, the group timer of the (S, G) entry should never be set. Solve this by adding a new field to the MDB configuration structure that denotes whether the (S, G) corresponds to a source or not. The field will be set in a subsequent patch where br_mdb_add_group_sg() is called in order to create a (S, G) entry for each user provided source. Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- net/bridge/br_mdb.c | 2 +- net/bridge/br_private.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 95780652cdbf..7cda9d1c5c93 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -814,7 +814,7 @@ static int br_mdb_add_group_sg(const struct br_mdb_config *cfg, return -ENOMEM; } rcu_assign_pointer(*pp, p); - if (!(flags & MDB_PG_FLAGS_PERMANENT)) + if (!(flags & MDB_PG_FLAGS_PERMANENT) && !cfg->src_entry) mod_timer(&p->timer, now + brmctx->multicast_membership_interval); br_mdb_notify(cfg->br->dev, mp, p, RTM_NEWMDB); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 74f17b56c9eb..e98bfe3c02e1 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -98,6 +98,7 @@ struct br_mdb_config { struct net_bridge_port *p; struct br_mdb_entry *entry; struct br_ip group; + bool src_entry; }; #endif -- cgit v1.2.3-70-g09d2 From b1c8fec8d459fb49b2033c014256477e51913e2e Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 10 Dec 2022 16:56:28 +0200 Subject: bridge: mcast: Add support for (*, G) with a source list and filter mode In preparation for allowing user space to add (*, G) entries with a source list and associated filter mode, add the necessary plumbing to handle such requests. Extend the MDB configuration structure with a currently empty source array and filter mode that is currently hard coded to EXCLUDE. Add the source entries and the corresponding (S, G) entries before making the new (*, G) port group entry visible to the data path. Handle the creation of each source entry in a similar fashion to how it is created from the data path in response to received Membership Reports: Create the source entry, arm the source timer (if needed), add a corresponding (S, G) forwarding entry and finally mark the source entry as installed (by user space). Add the (S, G) entry by populating an MDB configuration structure and calling br_mdb_add_group_sg() as if a new entry is created by user space, with the sole difference that the 'src_entry' field is set to make sure that the group timer of such entries is never armed. Note that it is not currently possible to add more than 32 source entries to a port group entry. If this proves to be a problem we can either increase 'PG_SRC_ENT_LIMIT' or avoid forcing a limit on entries created by user space. Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- net/bridge/br_mdb.c | 128 ++++++++++++++++++++++++++++++++++++++++++++++-- net/bridge/br_private.h | 7 +++ 2 files changed, 132 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 7cda9d1c5c93..e9a4b7e247e7 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -836,6 +836,114 @@ static int br_mdb_add_group_sg(const struct br_mdb_config *cfg, return 0; } +static int br_mdb_add_group_src_fwd(const struct br_mdb_config *cfg, + struct br_ip *src_ip, + struct net_bridge_mcast *brmctx, + struct netlink_ext_ack *extack) +{ + struct net_bridge_mdb_entry *sgmp; + struct br_mdb_config sg_cfg; + struct br_ip sg_ip; + u8 flags = 0; + + sg_ip = cfg->group; + sg_ip.src = src_ip->src; + sgmp = br_multicast_new_group(cfg->br, &sg_ip); + if (IS_ERR(sgmp)) { + NL_SET_ERR_MSG_MOD(extack, "Failed to add (S, G) MDB entry"); + return PTR_ERR(sgmp); + } + + if (cfg->entry->state == MDB_PERMANENT) + flags |= MDB_PG_FLAGS_PERMANENT; + if (cfg->filter_mode == MCAST_EXCLUDE) + flags |= MDB_PG_FLAGS_BLOCKED; + + memset(&sg_cfg, 0, sizeof(sg_cfg)); + sg_cfg.br = cfg->br; + sg_cfg.p = cfg->p; + sg_cfg.entry = cfg->entry; + sg_cfg.group = sg_ip; + sg_cfg.src_entry = true; + sg_cfg.filter_mode = MCAST_INCLUDE; + return br_mdb_add_group_sg(&sg_cfg, sgmp, brmctx, flags, extack); +} + +static int br_mdb_add_group_src(const struct br_mdb_config *cfg, + struct net_bridge_port_group *pg, + struct net_bridge_mcast *brmctx, + struct br_mdb_src_entry *src, + struct netlink_ext_ack *extack) +{ + struct net_bridge_group_src *ent; + unsigned long now = jiffies; + int err; + + ent = br_multicast_find_group_src(pg, &src->addr); + if (!ent) { + ent = br_multicast_new_group_src(pg, &src->addr); + if (!ent) { + NL_SET_ERR_MSG_MOD(extack, "Failed to add new source entry"); + return -ENOSPC; + } + } else { + NL_SET_ERR_MSG_MOD(extack, "Source entry already exists"); + return -EEXIST; + } + + if (cfg->filter_mode == MCAST_INCLUDE && + cfg->entry->state == MDB_TEMPORARY) + mod_timer(&ent->timer, now + br_multicast_gmi(brmctx)); + else + del_timer(&ent->timer); + + /* Install a (S, G) forwarding entry for the source. */ + err = br_mdb_add_group_src_fwd(cfg, &src->addr, brmctx, extack); + if (err) + goto err_del_sg; + + ent->flags = BR_SGRP_F_INSTALLED | BR_SGRP_F_USER_ADDED; + + return 0; + +err_del_sg: + __br_multicast_del_group_src(ent); + return err; +} + +static void br_mdb_del_group_src(struct net_bridge_port_group *pg, + struct br_mdb_src_entry *src) +{ + struct net_bridge_group_src *ent; + + ent = br_multicast_find_group_src(pg, &src->addr); + if (WARN_ON_ONCE(!ent)) + return; + br_multicast_del_group_src(ent, false); +} + +static int br_mdb_add_group_srcs(const struct br_mdb_config *cfg, + struct net_bridge_port_group *pg, + struct net_bridge_mcast *brmctx, + struct netlink_ext_ack *extack) +{ + int i, err; + + for (i = 0; i < cfg->num_src_entries; i++) { + err = br_mdb_add_group_src(cfg, pg, brmctx, + &cfg->src_entries[i], extack); + if (err) + goto err_del_group_srcs; + } + + return 0; + +err_del_group_srcs: + for (i--; i >= 0; i--) + br_mdb_del_group_src(pg, &cfg->src_entries[i]); + return err; +} + static int br_mdb_add_group_star_g(const struct br_mdb_config *cfg, struct net_bridge_mdb_entry *mp, struct net_bridge_mcast *brmctx, @@ -845,6 +953,7 @@ static int br_mdb_add_group_star_g(const struct br_mdb_config *cfg, struct net_bridge_port_group __rcu **pp; struct net_bridge_port_group *p; unsigned long now = jiffies; + int err; for (pp = &mp->ports; (p = mlock_dereference(*pp, cfg->br)) != NULL; @@ -858,23 +967,35 @@ static int br_mdb_add_group_star_g(const struct br_mdb_config *cfg, } p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL, - MCAST_EXCLUDE, RTPROT_STATIC); + cfg->filter_mode, RTPROT_STATIC); if (unlikely(!p)) { NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new (*, G) port group"); return -ENOMEM; } + + err = br_mdb_add_group_srcs(cfg, p, brmctx, extack); + if (err) + goto err_del_port_group; + rcu_assign_pointer(*pp, p); - if (!(flags & MDB_PG_FLAGS_PERMANENT)) + if (!(flags & MDB_PG_FLAGS_PERMANENT) && + cfg->filter_mode == MCAST_EXCLUDE) mod_timer(&p->timer, now + brmctx->multicast_membership_interval); br_mdb_notify(cfg->br->dev, mp, p, RTM_NEWMDB); /* If we are adding a new EXCLUDE port group (*, G), it needs to be * also added to all (S, G) entries for proper replication. */ - if (br_multicast_should_handle_mode(brmctx, cfg->group.proto)) + if (br_multicast_should_handle_mode(brmctx, cfg->group.proto) && + cfg->filter_mode == MCAST_EXCLUDE) br_multicast_star_g_handle_mode(p, MCAST_EXCLUDE); return 0; + +err_del_port_group: + hlist_del_init(&p->mglist); + kfree(p); + return err; } static int br_mdb_add_group(const struct br_mdb_config *cfg, @@ -967,6 +1088,7 @@ static int br_mdb_config_init(struct net *net, const struct nlmsghdr *nlh, return err; memset(cfg, 0, sizeof(*cfg)); + cfg->filter_mode = MCAST_EXCLUDE; bpm = nlmsg_data(nlh); if (!bpm->ifindex) { diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index e98bfe3c02e1..368f5f6fa42b 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -93,12 +93,19 @@ struct bridge_mcast_stats { struct u64_stats_sync syncp; }; +struct br_mdb_src_entry { + struct br_ip addr; +}; + struct br_mdb_config { struct net_bridge *br; struct net_bridge_port *p; struct br_mdb_entry *entry; struct br_ip group; bool src_entry; + u8 filter_mode; + struct br_mdb_src_entry *src_entries; + int num_src_entries; }; #endif -- cgit v1.2.3-70-g09d2 From 6afaae6d12f54accf194e73ece617d1f456e438d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 10 Dec 2022 16:56:29 +0200 Subject: bridge: mcast: Allow user space to add (*, G) with a source list and filter mode Add new netlink attributes to the RTM_NEWMDB request that allow user space to add (*, G) with a source list and filter mode. The RTM_NEWMDB message can already dump such entries (created by the kernel) so there is no need to add dump support. However, the message contains a different set of attributes depending if it is a request or a response. The naming and structure of the new attributes try to follow the existing ones used in the response. Request: [ struct nlmsghdr ] [ struct br_port_msg ] [ MDBA_SET_ENTRY ] struct br_mdb_entry [ MDBA_SET_ENTRY_ATTRS ] [ MDBE_ATTR_SOURCE ] struct in_addr / struct in6_addr [ MDBE_ATTR_SRC_LIST ] // new [ MDBE_SRC_LIST_ENTRY ] [ MDBE_SRCATTR_ADDRESS ] struct in_addr / struct in6_addr [ ...] [ MDBE_ATTR_GROUP_MODE ] // new u8 Response: [ struct nlmsghdr ] [ struct br_port_msg ] [ MDBA_MDB ] [ MDBA_MDB_ENTRY ] [ MDBA_MDB_ENTRY_INFO ] struct br_mdb_entry [ MDBA_MDB_EATTR_TIMER ] u32 [ MDBA_MDB_EATTR_SOURCE ] struct in_addr / struct in6_addr [ MDBA_MDB_EATTR_RTPROT ] u8 [ MDBA_MDB_EATTR_SRC_LIST ] [ MDBA_MDB_SRCLIST_ENTRY ] [ MDBA_MDB_SRCATTR_ADDRESS ] struct in_addr / struct in6_addr [ MDBA_MDB_SRCATTR_TIMER ] u8 [...] [ MDBA_MDB_EATTR_GROUP_MODE ] u8 Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 20 +++++++ net/bridge/br_mdb.c | 130 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 150 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index a86a7e7b811f..0d9fe73fc48c 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -723,10 +723,30 @@ enum { enum { MDBE_ATTR_UNSPEC, MDBE_ATTR_SOURCE, + MDBE_ATTR_SRC_LIST, + MDBE_ATTR_GROUP_MODE, __MDBE_ATTR_MAX, }; #define MDBE_ATTR_MAX (__MDBE_ATTR_MAX - 1) +/* per mdb entry source */ +enum { + MDBE_SRC_LIST_UNSPEC, + MDBE_SRC_LIST_ENTRY, + __MDBE_SRC_LIST_MAX, +}; +#define MDBE_SRC_LIST_MAX (__MDBE_SRC_LIST_MAX - 1) + +/* per mdb entry per source attributes + * these are embedded in MDBE_SRC_LIST_ENTRY + */ +enum { + MDBE_SRCATTR_UNSPEC, + MDBE_SRCATTR_ADDRESS, + __MDBE_SRCATTR_MAX, +}; +#define MDBE_SRCATTR_MAX (__MDBE_SRCATTR_MAX - 1) + /* Embedded inside LINK_XSTATS_TYPE_BRIDGE */ enum { BRIDGE_XSTATS_UNSPEC, diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index e9a4b7e247e7..61d46b0a31b6 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -663,10 +663,25 @@ errout: rtnl_set_sk_err(net, RTNLGRP_MDB, err); } +static const struct nla_policy +br_mdbe_src_list_entry_pol[MDBE_SRCATTR_MAX + 1] = { + [MDBE_SRCATTR_ADDRESS] = NLA_POLICY_RANGE(NLA_BINARY, + sizeof(struct in_addr), + sizeof(struct in6_addr)), +}; + +static const struct nla_policy +br_mdbe_src_list_pol[MDBE_SRC_LIST_MAX + 1] = { + [MDBE_SRC_LIST_ENTRY] = NLA_POLICY_NESTED(br_mdbe_src_list_entry_pol), +}; + static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = { [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY, sizeof(struct in_addr), sizeof(struct in6_addr)), + [MDBE_ATTR_GROUP_MODE] = NLA_POLICY_RANGE(NLA_U8, MCAST_EXCLUDE, + MCAST_INCLUDE), + [MDBE_ATTR_SRC_LIST] = NLA_POLICY_NESTED(br_mdbe_src_list_pol), }; static bool is_valid_mdb_entry(struct br_mdb_entry *entry, @@ -1051,6 +1066,76 @@ static int __br_mdb_add(const struct br_mdb_config *cfg, return ret; } +static int br_mdb_config_src_entry_init(struct nlattr *src_entry, + struct br_mdb_src_entry *src, + __be16 proto, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[MDBE_SRCATTR_MAX + 1]; + int err; + + err = nla_parse_nested(tb, MDBE_SRCATTR_MAX, src_entry, + br_mdbe_src_list_entry_pol, extack); + if (err) + return err; + + if (NL_REQ_ATTR_CHECK(extack, src_entry, tb, MDBE_SRCATTR_ADDRESS)) + return -EINVAL; + + if (!is_valid_mdb_source(tb[MDBE_SRCATTR_ADDRESS], proto, extack)) + return -EINVAL; + + src->addr.proto = proto; + nla_memcpy(&src->addr.src, tb[MDBE_SRCATTR_ADDRESS], + nla_len(tb[MDBE_SRCATTR_ADDRESS])); + + return 0; +} + +static int br_mdb_config_src_list_init(struct nlattr *src_list, + struct br_mdb_config *cfg, + struct netlink_ext_ack *extack) +{ + struct nlattr *src_entry; + int rem, err; + int i = 0; + + nla_for_each_nested(src_entry, src_list, rem) + cfg->num_src_entries++; + + if (cfg->num_src_entries >= PG_SRC_ENT_LIMIT) { + NL_SET_ERR_MSG_FMT_MOD(extack, "Exceeded maximum number of source entries (%u)", + PG_SRC_ENT_LIMIT - 1); + return -EINVAL; + } + + cfg->src_entries = kcalloc(cfg->num_src_entries, + sizeof(struct br_mdb_src_entry), GFP_KERNEL); + if (!cfg->src_entries) + return -ENOMEM; + + nla_for_each_nested(src_entry, src_list, rem) { + err = br_mdb_config_src_entry_init(src_entry, + &cfg->src_entries[i], + cfg->entry->addr.proto, + extack); + if (err) + goto err_src_entry_init; + i++; + } + + return 0; + +err_src_entry_init: + kfree(cfg->src_entries); + return err; +} + +static void br_mdb_config_src_list_fini(struct br_mdb_config *cfg) +{ + kfree(cfg->src_entries); +} + static int br_mdb_config_attrs_init(struct nlattr *set_attrs, struct br_mdb_config *cfg, struct netlink_ext_ack *extack) @@ -1070,6 +1155,44 @@ static int br_mdb_config_attrs_init(struct nlattr *set_attrs, __mdb_entry_to_br_ip(cfg->entry, &cfg->group, mdb_attrs); + if (mdb_attrs[MDBE_ATTR_GROUP_MODE]) { + if (!cfg->p) { + NL_SET_ERR_MSG_MOD(extack, "Filter mode cannot be set for host groups"); + return -EINVAL; + } + if (!br_multicast_is_star_g(&cfg->group)) { + NL_SET_ERR_MSG_MOD(extack, "Filter mode can only be set for (*, G) entries"); + return -EINVAL; + } + cfg->filter_mode = nla_get_u8(mdb_attrs[MDBE_ATTR_GROUP_MODE]); + } else { + cfg->filter_mode = MCAST_EXCLUDE; + } + + if (mdb_attrs[MDBE_ATTR_SRC_LIST]) { + if (!cfg->p) { + NL_SET_ERR_MSG_MOD(extack, "Source list cannot be set for host groups"); + return -EINVAL; + } + if (!br_multicast_is_star_g(&cfg->group)) { + NL_SET_ERR_MSG_MOD(extack, "Source list can only be set for (*, G) entries"); + return -EINVAL; + } + if (!mdb_attrs[MDBE_ATTR_GROUP_MODE]) { + NL_SET_ERR_MSG_MOD(extack, "Source list cannot be set without filter mode"); + return -EINVAL; + } + err = br_mdb_config_src_list_init(mdb_attrs[MDBE_ATTR_SRC_LIST], + cfg, extack); + if (err) + return err; + } + + if (!cfg->num_src_entries && cfg->filter_mode == MCAST_INCLUDE) { + NL_SET_ERR_MSG_MOD(extack, "Cannot add (*, G) INCLUDE with an empty source list"); + return -EINVAL; + } + return 0; } @@ -1162,6 +1285,11 @@ static int br_mdb_config_init(struct net *net, const struct nlmsghdr *nlh, return 0; } +static void br_mdb_config_fini(struct br_mdb_config *cfg) +{ + br_mdb_config_src_list_fini(cfg); +} + static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -1220,6 +1348,7 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, } out: + br_mdb_config_fini(&cfg); return err; } @@ -1295,6 +1424,7 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, err = __br_mdb_del(&cfg); } + br_mdb_config_fini(&cfg); return err; } -- cgit v1.2.3-70-g09d2 From 1d7b66a7d9754c229d12c53ad6a1effe88c16ca4 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 10 Dec 2022 16:56:30 +0200 Subject: bridge: mcast: Allow user space to specify MDB entry routing protocol Add the 'MDBE_ATTR_RTPORT' attribute to allow user space to specify the routing protocol of the MDB port group entry. Enforce a minimum value of 'RTPROT_STATIC' to prevent user space from using protocol values that should only be set by the kernel (e.g., 'RTPROT_KERNEL'). Maintain backward compatibility by defaulting to 'RTPROT_STATIC'. The protocol is already visible to user space in RTM_NEWMDB responses and notifications via the 'MDBA_MDB_EATTR_RTPROT' attribute. The routing protocol allows a routing daemon to distinguish between entries configured by it and those configured by the administrator. Once MDB flush is supported, the protocol can be used as a criterion according to which the flush is performed. Examples: # bridge mdb add dev br0 port dummy10 grp 239.1.1.1 permanent proto kernel Error: integer out of range. # bridge mdb add dev br0 port dummy10 grp 239.1.1.1 permanent proto static # bridge mdb add dev br0 port dummy10 grp 239.1.1.1 src 192.0.2.1 permanent proto zebra # bridge mdb add dev br0 port dummy10 grp 239.1.1.2 permanent source_list 198.51.100.1,198.51.100.2 filter_mode include proto 250 # bridge -d mdb show dev br0 port dummy10 grp 239.1.1.2 src 198.51.100.2 permanent filter_mode include proto 250 dev br0 port dummy10 grp 239.1.1.2 src 198.51.100.1 permanent filter_mode include proto 250 dev br0 port dummy10 grp 239.1.1.2 permanent filter_mode include source_list 198.51.100.2/0.00,198.51.100.1/0.00 proto 250 dev br0 port dummy10 grp 239.1.1.1 src 192.0.2.1 permanent filter_mode include proto zebra dev br0 port dummy10 grp 239.1.1.1 permanent filter_mode exclude proto static Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_mdb.c | 15 +++++++++++++-- net/bridge/br_private.h | 1 + 3 files changed, 15 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index 0d9fe73fc48c..d9de241d90f9 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -725,6 +725,7 @@ enum { MDBE_ATTR_SOURCE, MDBE_ATTR_SRC_LIST, MDBE_ATTR_GROUP_MODE, + MDBE_ATTR_RTPROT, __MDBE_ATTR_MAX, }; #define MDBE_ATTR_MAX (__MDBE_ATTR_MAX - 1) diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 61d46b0a31b6..72d4e53193e5 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -682,6 +682,7 @@ static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = { [MDBE_ATTR_GROUP_MODE] = NLA_POLICY_RANGE(NLA_U8, MCAST_EXCLUDE, MCAST_INCLUDE), [MDBE_ATTR_SRC_LIST] = NLA_POLICY_NESTED(br_mdbe_src_list_pol), + [MDBE_ATTR_RTPROT] = NLA_POLICY_MIN(NLA_U8, RTPROT_STATIC), }; static bool is_valid_mdb_entry(struct br_mdb_entry *entry, @@ -823,7 +824,7 @@ static int br_mdb_add_group_sg(const struct br_mdb_config *cfg, } p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL, - MCAST_INCLUDE, RTPROT_STATIC); + MCAST_INCLUDE, cfg->rt_protocol); if (unlikely(!p)) { NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new (S, G) port group"); return -ENOMEM; @@ -881,6 +882,7 @@ static int br_mdb_add_group_src_fwd(const struct br_mdb_config *cfg, sg_cfg.group = sg_ip; sg_cfg.src_entry = true; sg_cfg.filter_mode = MCAST_INCLUDE; + sg_cfg.rt_protocol = cfg->rt_protocol; return br_mdb_add_group_sg(&sg_cfg, sgmp, brmctx, flags, extack); } @@ -982,7 +984,7 @@ static int br_mdb_add_group_star_g(const struct br_mdb_config *cfg, } p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL, - cfg->filter_mode, RTPROT_STATIC); + cfg->filter_mode, cfg->rt_protocol); if (unlikely(!p)) { NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new (*, G) port group"); return -ENOMEM; @@ -1193,6 +1195,14 @@ static int br_mdb_config_attrs_init(struct nlattr *set_attrs, return -EINVAL; } + if (mdb_attrs[MDBE_ATTR_RTPROT]) { + if (!cfg->p) { + NL_SET_ERR_MSG_MOD(extack, "Protocol cannot be set for host groups"); + return -EINVAL; + } + cfg->rt_protocol = nla_get_u8(mdb_attrs[MDBE_ATTR_RTPROT]); + } + return 0; } @@ -1212,6 +1222,7 @@ static int br_mdb_config_init(struct net *net, const struct nlmsghdr *nlh, memset(cfg, 0, sizeof(*cfg)); cfg->filter_mode = MCAST_EXCLUDE; + cfg->rt_protocol = RTPROT_STATIC; bpm = nlmsg_data(nlh); if (!bpm->ifindex) { diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 368f5f6fa42b..cdc9e040f1f6 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -106,6 +106,7 @@ struct br_mdb_config { u8 filter_mode; struct br_mdb_src_entry *src_entries; int num_src_entries; + u8 rt_protocol; }; #endif -- cgit v1.2.3-70-g09d2 From 61f2183512a72c28674717b1ba706ed2749938d1 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 10 Dec 2022 16:56:31 +0200 Subject: bridge: mcast: Support replacement of MDB port group entries Now that user space can specify additional attributes of port group entries such as filter mode and source list, it makes sense to allow user space to atomically modify these attributes by replacing entries instead of forcing user space to delete the entries and add them back. Replace MDB port group entries when the 'NLM_F_REPLACE' flag is specified in the netlink message header. When a (*, G) entry is replaced, update the following attributes: Source list, state, filter mode, protocol and flags. If the entry is temporary and in EXCLUDE mode, reset the group timer to the group membership interval. If the entry is temporary and in INCLUDE mode, reset the source timers of associated sources to the group membership interval. Examples: # bridge mdb replace dev br0 port dummy10 grp 239.1.1.1 permanent source_list 192.0.2.1,192.0.2.2 filter_mode include # bridge -d -s mdb show dev br0 port dummy10 grp 239.1.1.1 src 192.0.2.2 permanent filter_mode include proto static 0.00 dev br0 port dummy10 grp 239.1.1.1 src 192.0.2.1 permanent filter_mode include proto static 0.00 dev br0 port dummy10 grp 239.1.1.1 permanent filter_mode include source_list 192.0.2.2/0.00,192.0.2.1/0.00 proto static 0.00 # bridge mdb replace dev br0 port dummy10 grp 239.1.1.1 permanent source_list 192.0.2.1,192.0.2.3 filter_mode exclude proto zebra # bridge -d -s mdb show dev br0 port dummy10 grp 239.1.1.1 src 192.0.2.3 permanent filter_mode include proto zebra blocked 0.00 dev br0 port dummy10 grp 239.1.1.1 src 192.0.2.1 permanent filter_mode include proto zebra blocked 0.00 dev br0 port dummy10 grp 239.1.1.1 permanent filter_mode exclude source_list 192.0.2.3/0.00,192.0.2.1/0.00 proto zebra 0.00 # bridge mdb replace dev br0 port dummy10 grp 239.1.1.1 temp source_list 192.0.2.4,192.0.2.3 filter_mode include proto bgp # bridge -d -s mdb show dev br0 port dummy10 grp 239.1.1.1 src 192.0.2.4 temp filter_mode include proto bgp 0.00 dev br0 port dummy10 grp 239.1.1.1 src 192.0.2.3 temp filter_mode include proto bgp 0.00 dev br0 port dummy10 grp 239.1.1.1 temp filter_mode include source_list 192.0.2.4/259.44,192.0.2.3/259.44 proto bgp 0.00 Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- net/bridge/br_mdb.c | 102 +++++++++++++++++++++++++++++++++++++++++++++--- net/bridge/br_private.h | 1 + 2 files changed, 98 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 72d4e53193e5..00e5743647b0 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -802,6 +802,27 @@ out: return brmctx; } +static int br_mdb_replace_group_sg(const struct br_mdb_config *cfg, + struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg, + struct net_bridge_mcast *brmctx, + unsigned char flags) +{ + unsigned long now = jiffies; + + pg->flags = flags; + pg->rt_protocol = cfg->rt_protocol; + if (!(flags & MDB_PG_FLAGS_PERMANENT) && !cfg->src_entry) + mod_timer(&pg->timer, + now + brmctx->multicast_membership_interval); + else + del_timer(&pg->timer); + + br_mdb_notify(cfg->br->dev, mp, pg, RTM_NEWMDB); + + return 0; +} + static int br_mdb_add_group_sg(const struct br_mdb_config *cfg, struct net_bridge_mdb_entry *mp, struct net_bridge_mcast *brmctx, @@ -816,8 +837,12 @@ static int br_mdb_add_group_sg(const struct br_mdb_config *cfg, (p = mlock_dereference(*pp, cfg->br)) != NULL; pp = &p->next) { if (p->key.port == cfg->p) { - NL_SET_ERR_MSG_MOD(extack, "(S, G) group is already joined by port"); - return -EEXIST; + if (!(cfg->nlflags & NLM_F_REPLACE)) { + NL_SET_ERR_MSG_MOD(extack, "(S, G) group is already joined by port"); + return -EEXIST; + } + return br_mdb_replace_group_sg(cfg, mp, p, brmctx, + flags); } if ((unsigned long)p->key.port < (unsigned long)cfg->p) break; @@ -883,6 +908,7 @@ static int br_mdb_add_group_src_fwd(const struct br_mdb_config *cfg, sg_cfg.src_entry = true; sg_cfg.filter_mode = MCAST_INCLUDE; sg_cfg.rt_protocol = cfg->rt_protocol; + sg_cfg.nlflags = cfg->nlflags; return br_mdb_add_group_sg(&sg_cfg, sgmp, brmctx, flags, extack); } @@ -903,7 +929,7 @@ static int br_mdb_add_group_src(const struct br_mdb_config *cfg, NL_SET_ERR_MSG_MOD(extack, "Failed to add new source entry"); return -ENOSPC; } - } else { + } else if (!(cfg->nlflags & NLM_F_REPLACE)) { NL_SET_ERR_MSG_MOD(extack, "Source entry already exists"); return -EEXIST; } @@ -961,6 +987,67 @@ err_del_group_srcs: return err; } +static int br_mdb_replace_group_srcs(const struct br_mdb_config *cfg, + struct net_bridge_port_group *pg, + struct net_bridge_mcast *brmctx, + struct netlink_ext_ack *extack) +{ + struct net_bridge_group_src *ent; + struct hlist_node *tmp; + int err; + + hlist_for_each_entry(ent, &pg->src_list, node) + ent->flags |= BR_SGRP_F_DELETE; + + err = br_mdb_add_group_srcs(cfg, pg, brmctx, extack); + if (err) + goto err_clear_delete; + + hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node) { + if (ent->flags & BR_SGRP_F_DELETE) + br_multicast_del_group_src(ent, false); + } + + return 0; + +err_clear_delete: + hlist_for_each_entry(ent, &pg->src_list, node) + ent->flags &= ~BR_SGRP_F_DELETE; + return err; +} + +static int br_mdb_replace_group_star_g(const struct br_mdb_config *cfg, + struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg, + struct net_bridge_mcast *brmctx, + unsigned char flags, + struct netlink_ext_ack *extack) +{ + unsigned long now = jiffies; + int err; + + err = br_mdb_replace_group_srcs(cfg, pg, brmctx, extack); + if (err) + return err; + + pg->flags = flags; + pg->filter_mode = cfg->filter_mode; + pg->rt_protocol = cfg->rt_protocol; + if (!(flags & MDB_PG_FLAGS_PERMANENT) && + cfg->filter_mode == MCAST_EXCLUDE) + mod_timer(&pg->timer, + now + brmctx->multicast_membership_interval); + else + del_timer(&pg->timer); + + br_mdb_notify(cfg->br->dev, mp, pg, RTM_NEWMDB); + + if (br_multicast_should_handle_mode(brmctx, cfg->group.proto)) + br_multicast_star_g_handle_mode(pg, cfg->filter_mode); + + return 0; +} + static int br_mdb_add_group_star_g(const struct br_mdb_config *cfg, struct net_bridge_mdb_entry *mp, struct net_bridge_mcast *brmctx, @@ -976,8 +1063,12 @@ static int br_mdb_add_group_star_g(const struct br_mdb_config *cfg, (p = mlock_dereference(*pp, cfg->br)) != NULL; pp = &p->next) { if (p->key.port == cfg->p) { - NL_SET_ERR_MSG_MOD(extack, "(*, G) group is already joined by port"); - return -EEXIST; + if (!(cfg->nlflags & NLM_F_REPLACE)) { + NL_SET_ERR_MSG_MOD(extack, "(*, G) group is already joined by port"); + return -EEXIST; + } + return br_mdb_replace_group_star_g(cfg, mp, p, brmctx, + flags, extack); } if ((unsigned long)p->key.port < (unsigned long)cfg->p) break; @@ -1223,6 +1314,7 @@ static int br_mdb_config_init(struct net *net, const struct nlmsghdr *nlh, memset(cfg, 0, sizeof(*cfg)); cfg->filter_mode = MCAST_EXCLUDE; cfg->rt_protocol = RTPROT_STATIC; + cfg->nlflags = nlh->nlmsg_flags; bpm = nlmsg_data(nlh); if (!bpm->ifindex) { diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index cdc9e040f1f6..15ef7fd508ee 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -104,6 +104,7 @@ struct br_mdb_config { struct br_ip group; bool src_entry; u8 filter_mode; + u16 nlflags; struct br_mdb_src_entry *src_entries; int num_src_entries; u8 rt_protocol; -- cgit v1.2.3-70-g09d2 From 89300468e2b2ec216c7827ba04ac45c129794403 Mon Sep 17 00:00:00 2001 From: Coco Li Date: Sat, 10 Dec 2022 04:16:45 +0000 Subject: IPv6/GRO: generic helper to remove temporary HBH/jumbo header in driver IPv6/TCP and GRO stacks can build big TCP packets with an added temporary Hop By Hop header. Is GSO is not involved, then the temporary header needs to be removed in the driver. This patch provides a generic helper for drivers that need to modify their headers in place. Tested: Compiled and ran with ethtool -K eth1 tso off Could send Big TCP packets Signed-off-by: Coco Li Link: https://lore.kernel.org/r/20221210041646.3587757-1-lixiaoyan@google.com Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 33 +++++++++++++++++++++++++++++++++ net/ipv6/ip6_offload.c | 27 ++++----------------------- 2 files changed, 37 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index d383c895592a..03f3af02a9a6 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -500,6 +500,39 @@ static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb) return jhdr->nexthdr; } +/* Return 0 if HBH header is successfully removed + * Or if HBH removal is unnecessary (packet is not big TCP) + * Return error to indicate dropping the packet + */ +static inline int ipv6_hopopt_jumbo_remove(struct sk_buff *skb) +{ + const int hophdr_len = sizeof(struct hop_jumbo_hdr); + int nexthdr = ipv6_has_hopopt_jumbo(skb); + struct ipv6hdr *h6; + + if (!nexthdr) + return 0; + + if (skb_cow_head(skb, 0)) + return -1; + + /* Remove the HBH header. + * Layout: [Ethernet header][IPv6 header][HBH][L4 Header] + */ + memmove(skb_mac_header(skb) + hophdr_len, skb_mac_header(skb), + skb_network_header(skb) - skb_mac_header(skb) + + sizeof(struct ipv6hdr)); + + __skb_pull(skb, hophdr_len); + skb->network_header += hophdr_len; + skb->mac_header += hophdr_len; + + h6 = ipv6_hdr(skb); + h6->nexthdr = nexthdr; + + return 0; +} + static inline bool ipv6_accept_ra(struct inet6_dev *idev) { /* If forwarding is enabled, RA are not accepted unless the special diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 3ee345672849..00dc2e3b0184 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -77,7 +77,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, struct sk_buff *segs = ERR_PTR(-EINVAL); struct ipv6hdr *ipv6h; const struct net_offload *ops; - int proto, nexthdr; + int proto, err; struct frag_hdr *fptr; unsigned int payload_len; u8 *prevhdr; @@ -87,28 +87,9 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, bool gso_partial; skb_reset_network_header(skb); - nexthdr = ipv6_has_hopopt_jumbo(skb); - if (nexthdr) { - const int hophdr_len = sizeof(struct hop_jumbo_hdr); - int err; - - err = skb_cow_head(skb, 0); - if (err < 0) - return ERR_PTR(err); - - /* remove the HBH header. - * Layout: [Ethernet header][IPv6 header][HBH][TCP header] - */ - memmove(skb_mac_header(skb) + hophdr_len, - skb_mac_header(skb), - ETH_HLEN + sizeof(struct ipv6hdr)); - skb->data += hophdr_len; - skb->len -= hophdr_len; - skb->network_header += hophdr_len; - skb->mac_header += hophdr_len; - ipv6h = (struct ipv6hdr *)skb->data; - ipv6h->nexthdr = nexthdr; - } + err = ipv6_hopopt_jumbo_remove(skb); + if (err) + return ERR_PTR(err); nhoff = skb_network_header(skb) - skb_mac_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) goto out; -- cgit v1.2.3-70-g09d2 From 7c4a6309e27f411743817fe74a832ec2d2798a4b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 12 Dec 2022 19:20:37 -0800 Subject: ipvs: fix type warning in do_div() on 32 bit 32 bit platforms without 64bit div generate the following warning: net/netfilter/ipvs/ip_vs_est.c: In function 'ip_vs_est_calc_limits': include/asm-generic/div64.h:222:35: warning: comparison of distinct pointer types lacks a cast 222 | (void)(((typeof((n)) *)0) == ((uint64_t *)0)); \ | ^~ net/netfilter/ipvs/ip_vs_est.c:694:17: note: in expansion of macro 'do_div' 694 | do_div(val, loops); | ^~~~~~ include/asm-generic/div64.h:222:35: warning: comparison of distinct pointer types lacks a cast 222 | (void)(((typeof((n)) *)0) == ((uint64_t *)0)); \ | ^~ net/netfilter/ipvs/ip_vs_est.c:700:33: note: in expansion of macro 'do_div' 700 | do_div(val, min_est); | ^~~~~~ first argument of do_div() should be unsigned. We can't just cast as do_div() updates it as well, so we need an lval. Make val unsigned in the first place, all paths check that the value they assign to this variables are non-negative already. Fixes: 705dd3444081 ("ipvs: use kthreads for stats estimation") Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20221213032037.844517-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/netfilter/ipvs/ip_vs_est.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index df56073bb282..ce2a1549b304 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -640,9 +640,10 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max) int i, loops, ntest; s32 min_est = 0; ktime_t t1, t2; - s64 diff, val; int max = 8; int ret = 1; + s64 diff; + u64 val; INIT_HLIST_HEAD(&chain); mutex_lock(&__ip_vs_mutex); -- cgit v1.2.3-70-g09d2